From 72b9787e2806694c5bc5609dd7c2e5dbbfaa4b2f Mon Sep 17 00:00:00 2001 From: =?utf8?q?J=C3=A9r=C3=B4me=20Benoit?= Date: Tue, 25 Nov 2014 20:00:18 +0100 Subject: [PATCH] Imported Upstream version 1.4 --- .hg_archival.txt | 4 + .hgignore | 11 + .hgtags | 17 + COPYING | 343 + ChangeLog | 55752 ++++++++++++++++ build/README.txt | 70 + build/icl32/build-all.bat | 14 + build/icl32/make-makefile.bat | 15 + build/icl64/build-all.bat | 14 + build/icl64/make-makefile.bat | 17 + build/linux/make-Makefiles.bash | 3 + build/msys/make-Makefiles.sh | 3 + .../msys/make-x86_64-w64-mingw32-Makefiles.sh | 8 + build/msys/toolchain-x86_64-w64-mingw32.cmake | 6 + build/vc10-x86/build-all.bat | 14 + build/vc10-x86/make-solutions.bat | 6 + build/vc10-x86_64/build-all.bat | 14 + build/vc10-x86_64/make-solutions.bat | 6 + build/vc11-x86/build-all.bat | 14 + build/vc11-x86/make-solutions.bat | 6 + build/vc11-x86_64/build-all.bat | 14 + build/vc11-x86_64/make-solutions.bat | 6 + build/vc12-x86/build-all.bat | 14 + build/vc12-x86/make-solutions.bat | 6 + build/vc12-x86_64/build-all.bat | 14 + build/vc12-x86_64/make-solutions.bat | 6 + build/vc9-x86/build-all.bat | 14 + build/vc9-x86/make-solutions.bat | 6 + build/vc9-x86_64/build-all.bat | 14 + build/vc9-x86_64/make-solutions.bat | 6 + build/xcode/make-project.sh | 2 + doc/intra/intra-16x16.txt | 561 + doc/intra/intra-32x32.txt | 1089 + doc/intra/intra-4x4.txt | 166 + doc/intra/intra-8x8.txt | 298 + doc/reST/Makefile | 97 + doc/reST/api.rst | 340 + doc/reST/cli.rst | 1243 + doc/reST/conf.py | 17 + doc/reST/index.rst | 11 + doc/reST/introduction.rst | 82 + doc/reST/lossless.rst | 162 + doc/reST/presets.rst | 103 + doc/reST/threading.rst | 241 + doc/uncrustify/codingstyle.cfg | 232 + source/CMakeLists.txt | 380 + source/PPA/CMakeLists.txt | 1 + source/PPA/ppa.cpp | 145 + source/PPA/ppa.h | 71 + source/PPA/ppaApi.h | 59 + source/PPA/ppaCPUEvents.h | 25 + source/cmake/CMakeASM_YASMInformation.cmake | 48 + .../CMakeDetermineASM_YASMCompiler.cmake | 
5 + source/cmake/CMakeTestASM_YASMCompiler.cmake | 3 + source/cmake/FindVLD.cmake | 123 + source/cmake/FindYasm.cmake | 25 + source/cmake/clean-generated.cmake | 10 + source/cmake/cmake_uninstall.cmake.in | 19 + source/cmake/version.cmake | 90 + source/common/CMakeLists.txt | 110 + source/common/bitstream.cpp | 125 + source/common/bitstream.h | 158 + source/common/common.cpp | 208 + source/common/common.h | 403 + source/common/constants.cpp | 503 + source/common/constants.h | 104 + source/common/contexts.h | 309 + source/common/cpu.cpp | 374 + source/common/cpu.h | 59 + source/common/cudata.cpp | 2139 + source/common/cudata.h | 304 + source/common/dct.cpp | 893 + source/common/deblock.cpp | 647 + source/common/deblock.h | 75 + source/common/frame.cpp | 101 + source/common/frame.h | 79 + source/common/framedata.cpp | 69 + source/common/framedata.h | 100 + source/common/intrapred.cpp | 307 + source/common/ipfilter.cpp | 518 + source/common/loopfilter.cpp | 53 + source/common/lowres.cpp | 168 + source/common/lowres.h | 148 + source/common/md5.cpp | 268 + source/common/md5.h | 79 + source/common/mv.h | 104 + source/common/param.cpp | 1341 + source/common/param.h | 45 + source/common/piclist.cpp | 151 + source/common/piclist.h | 79 + source/common/picyuv.cpp | 397 + source/common/picyuv.h | 94 + source/common/pixel.cpp | 1387 + source/common/predict.cpp | 1060 + source/common/predict.h | 137 + source/common/primitives.cpp | 242 + source/common/primitives.h | 319 + source/common/quant.cpp | 1124 + source/common/quant.h | 136 + source/common/scalinglist.cpp | 379 + source/common/scalinglist.h | 80 + source/common/shortyuv.cpp | 120 + source/common/shortyuv.h | 93 + source/common/slice.cpp | 204 + source/common/slice.h | 361 + source/common/threading.cpp | 106 + source/common/threading.h | 476 + source/common/threadpool.cpp | 465 + source/common/threadpool.h | 111 + source/common/vec/dct-sse3.cpp | 1572 + source/common/vec/dct-sse41.cpp | 118 + 
source/common/vec/dct-ssse3.cpp | 1108 + source/common/vec/vec-primitives.cpp | 84 + source/common/version.cpp | 93 + source/common/wavefront.cpp | 139 + source/common/wavefront.h | 102 + source/common/winxp.cpp | 130 + source/common/winxp.h | 91 + source/common/x86/README.txt | 14 + source/common/x86/asm-primitives.cpp | 1853 + source/common/x86/blockcopy8.asm | 4925 ++ source/common/x86/blockcopy8.h | 216 + source/common/x86/const-a.asm | 111 + source/common/x86/cpu-a.asm | 197 + source/common/x86/dct8.asm | 2684 + source/common/x86/dct8.h | 45 + source/common/x86/intrapred.h | 164 + source/common/x86/intrapred16.asm | 12780 ++++ source/common/x86/intrapred8.asm | 31997 +++++++++ source/common/x86/ipfilter16.asm | 2894 + source/common/x86/ipfilter8.asm | 5599 ++ source/common/x86/ipfilter8.h | 629 + source/common/x86/loopfilter.asm | 85 + source/common/x86/loopfilter.h | 29 + source/common/x86/mc-a.asm | 3722 ++ source/common/x86/mc-a2.asm | 1133 + source/common/x86/mc.h | 69 + source/common/x86/pixel-32.asm | 420 + source/common/x86/pixel-a.asm | 6581 ++ source/common/x86/pixel-util.h | 130 + source/common/x86/pixel-util8.asm | 5001 ++ source/common/x86/pixel.h | 227 + source/common/x86/pixeladd8.asm | 740 + source/common/x86/sad-a.asm | 3712 + source/common/x86/sad16-a.asm | 833 + source/common/x86/ssd-a.asm | 2595 + source/common/x86/x86inc.asm | 1494 + source/common/x86/x86util.asm | 893 + source/common/yuv.cpp | 184 + source/common/yuv.h | 109 + source/compat/getopt/LGPL.txt | 504 + source/compat/getopt/getopt.c | 1066 + source/compat/getopt/getopt.h | 182 + source/compat/msvc/stdint.h | 24 + source/encoder/CMakeLists.txt | 25 + source/encoder/analysis.cpp | 1867 + source/encoder/analysis.h | 132 + source/encoder/api.cpp | 249 + source/encoder/bitcost.cpp | 91 + source/encoder/bitcost.h | 93 + source/encoder/dpb.cpp | 298 + source/encoder/dpb.h | 78 + source/encoder/encoder.cpp | 1492 + source/encoder/encoder.h | 175 + source/encoder/entropy.cpp | 2172 + 
source/encoder/entropy.h | 246 + source/encoder/frameencoder.cpp | 1142 + source/encoder/frameencoder.h | 216 + source/encoder/framefilter.cpp | 491 + source/encoder/framefilter.h | 75 + source/encoder/level.cpp | 397 + source/encoder/level.h | 39 + source/encoder/motion.cpp | 1169 + source/encoder/motion.h | 111 + source/encoder/nal.cpp | 218 + source/encoder/nal.h | 64 + source/encoder/ratecontrol.cpp | 2382 + source/encoder/ratecontrol.h | 269 + source/encoder/rdcost.h | 125 + source/encoder/reference.cpp | 118 + source/encoder/reference.h | 56 + source/encoder/sao.cpp | 1498 + source/encoder/sao.h | 151 + source/encoder/search.cpp | 3249 + source/encoder/search.h | 267 + source/encoder/sei.cpp | 74 + source/encoder/sei.h | 281 + source/encoder/slicetype.cpp | 1743 + source/encoder/slicetype.h | 189 + source/encoder/weightPrediction.cpp | 534 + source/filters/filters.cpp | 79 + source/filters/filters.h | 31 + source/input/input.cpp | 38 + source/input/input.h | 83 + source/input/y4m.cpp | 466 + source/input/y4m.h | 94 + source/input/yuv.cpp | 239 + source/input/yuv.h | 86 + source/output/output.cpp | 38 + source/output/output.h | 55 + source/output/y4m.cpp | 117 + source/output/y4m.h | 69 + source/output/yuv.cpp | 109 + source/output/yuv.h | 69 + source/test/CMakeLists.txt | 28 + source/test/checkasm-a.asm | 221 + source/test/intrapredharness.cpp | 304 + source/test/intrapredharness.h | 60 + source/test/ipfilterharness.cpp | 778 + source/test/ipfilterharness.h | 77 + source/test/mbdstharness.cpp | 509 + source/test/mbdstharness.h | 86 + source/test/pixelharness.cpp | 1781 + source/test/pixelharness.h | 111 + source/test/testbench.cpp | 236 + source/test/testharness.h | 173 + source/test/testpool.cpp | 238 + source/x265.cpp | 1172 + source/x265.def.in | 24 + source/x265.h | 1151 + source/x265.pc.in | 11 + source/x265.rc.in | 32 + source/x265_config.h.in | 34 + 223 files changed, 207050 insertions(+) create mode 100644 .hg_archival.txt create mode 100644 .hgignore 
create mode 100644 .hgtags create mode 100644 COPYING create mode 100644 ChangeLog create mode 100644 build/README.txt create mode 100644 build/icl32/build-all.bat create mode 100644 build/icl32/make-makefile.bat create mode 100644 build/icl64/build-all.bat create mode 100644 build/icl64/make-makefile.bat create mode 100755 build/linux/make-Makefiles.bash create mode 100644 build/msys/make-Makefiles.sh create mode 100644 build/msys/make-x86_64-w64-mingw32-Makefiles.sh create mode 100644 build/msys/toolchain-x86_64-w64-mingw32.cmake create mode 100644 build/vc10-x86/build-all.bat create mode 100644 build/vc10-x86/make-solutions.bat create mode 100644 build/vc10-x86_64/build-all.bat create mode 100644 build/vc10-x86_64/make-solutions.bat create mode 100644 build/vc11-x86/build-all.bat create mode 100644 build/vc11-x86/make-solutions.bat create mode 100644 build/vc11-x86_64/build-all.bat create mode 100644 build/vc11-x86_64/make-solutions.bat create mode 100644 build/vc12-x86/build-all.bat create mode 100644 build/vc12-x86/make-solutions.bat create mode 100644 build/vc12-x86_64/build-all.bat create mode 100644 build/vc12-x86_64/make-solutions.bat create mode 100644 build/vc9-x86/build-all.bat create mode 100644 build/vc9-x86/make-solutions.bat create mode 100644 build/vc9-x86_64/build-all.bat create mode 100644 build/vc9-x86_64/make-solutions.bat create mode 100755 build/xcode/make-project.sh create mode 100644 doc/intra/intra-16x16.txt create mode 100644 doc/intra/intra-32x32.txt create mode 100644 doc/intra/intra-4x4.txt create mode 100644 doc/intra/intra-8x8.txt create mode 100644 doc/reST/Makefile create mode 100644 doc/reST/api.rst create mode 100644 doc/reST/cli.rst create mode 100644 doc/reST/conf.py create mode 100644 doc/reST/index.rst create mode 100644 doc/reST/introduction.rst create mode 100644 doc/reST/lossless.rst create mode 100644 doc/reST/presets.rst create mode 100644 doc/reST/threading.rst create mode 100644 doc/uncrustify/codingstyle.cfg create 
mode 100644 source/CMakeLists.txt create mode 100644 source/PPA/CMakeLists.txt create mode 100644 source/PPA/ppa.cpp create mode 100644 source/PPA/ppa.h create mode 100644 source/PPA/ppaApi.h create mode 100644 source/PPA/ppaCPUEvents.h create mode 100644 source/cmake/CMakeASM_YASMInformation.cmake create mode 100644 source/cmake/CMakeDetermineASM_YASMCompiler.cmake create mode 100644 source/cmake/CMakeTestASM_YASMCompiler.cmake create mode 100644 source/cmake/FindVLD.cmake create mode 100644 source/cmake/FindYasm.cmake create mode 100644 source/cmake/clean-generated.cmake create mode 100644 source/cmake/cmake_uninstall.cmake.in create mode 100644 source/cmake/version.cmake create mode 100644 source/common/CMakeLists.txt create mode 100644 source/common/bitstream.cpp create mode 100644 source/common/bitstream.h create mode 100644 source/common/common.cpp create mode 100644 source/common/common.h create mode 100644 source/common/constants.cpp create mode 100644 source/common/constants.h create mode 100644 source/common/contexts.h create mode 100644 source/common/cpu.cpp create mode 100644 source/common/cpu.h create mode 100644 source/common/cudata.cpp create mode 100644 source/common/cudata.h create mode 100644 source/common/dct.cpp create mode 100644 source/common/deblock.cpp create mode 100644 source/common/deblock.h create mode 100644 source/common/frame.cpp create mode 100644 source/common/frame.h create mode 100644 source/common/framedata.cpp create mode 100644 source/common/framedata.h create mode 100644 source/common/intrapred.cpp create mode 100644 source/common/ipfilter.cpp create mode 100644 source/common/loopfilter.cpp create mode 100644 source/common/lowres.cpp create mode 100644 source/common/lowres.h create mode 100644 source/common/md5.cpp create mode 100644 source/common/md5.h create mode 100644 source/common/mv.h create mode 100644 source/common/param.cpp create mode 100644 source/common/param.h create mode 100644 source/common/piclist.cpp create 
mode 100644 source/common/piclist.h create mode 100644 source/common/picyuv.cpp create mode 100644 source/common/picyuv.h create mode 100644 source/common/pixel.cpp create mode 100644 source/common/predict.cpp create mode 100644 source/common/predict.h create mode 100644 source/common/primitives.cpp create mode 100644 source/common/primitives.h create mode 100644 source/common/quant.cpp create mode 100644 source/common/quant.h create mode 100644 source/common/scalinglist.cpp create mode 100644 source/common/scalinglist.h create mode 100644 source/common/shortyuv.cpp create mode 100644 source/common/shortyuv.h create mode 100644 source/common/slice.cpp create mode 100644 source/common/slice.h create mode 100644 source/common/threading.cpp create mode 100644 source/common/threading.h create mode 100644 source/common/threadpool.cpp create mode 100644 source/common/threadpool.h create mode 100644 source/common/vec/dct-sse3.cpp create mode 100644 source/common/vec/dct-sse41.cpp create mode 100644 source/common/vec/dct-ssse3.cpp create mode 100644 source/common/vec/vec-primitives.cpp create mode 100644 source/common/version.cpp create mode 100644 source/common/wavefront.cpp create mode 100644 source/common/wavefront.h create mode 100644 source/common/winxp.cpp create mode 100644 source/common/winxp.h create mode 100644 source/common/x86/README.txt create mode 100644 source/common/x86/asm-primitives.cpp create mode 100644 source/common/x86/blockcopy8.asm create mode 100644 source/common/x86/blockcopy8.h create mode 100644 source/common/x86/const-a.asm create mode 100644 source/common/x86/cpu-a.asm create mode 100644 source/common/x86/dct8.asm create mode 100644 source/common/x86/dct8.h create mode 100644 source/common/x86/intrapred.h create mode 100644 source/common/x86/intrapred16.asm create mode 100644 source/common/x86/intrapred8.asm create mode 100644 source/common/x86/ipfilter16.asm create mode 100644 source/common/x86/ipfilter8.asm create mode 100644 
source/common/x86/ipfilter8.h create mode 100644 source/common/x86/loopfilter.asm create mode 100644 source/common/x86/loopfilter.h create mode 100644 source/common/x86/mc-a.asm create mode 100644 source/common/x86/mc-a2.asm create mode 100644 source/common/x86/mc.h create mode 100644 source/common/x86/pixel-32.asm create mode 100644 source/common/x86/pixel-a.asm create mode 100644 source/common/x86/pixel-util.h create mode 100644 source/common/x86/pixel-util8.asm create mode 100644 source/common/x86/pixel.h create mode 100644 source/common/x86/pixeladd8.asm create mode 100644 source/common/x86/sad-a.asm create mode 100644 source/common/x86/sad16-a.asm create mode 100644 source/common/x86/ssd-a.asm create mode 100644 source/common/x86/x86inc.asm create mode 100644 source/common/x86/x86util.asm create mode 100644 source/common/yuv.cpp create mode 100644 source/common/yuv.h create mode 100644 source/compat/getopt/LGPL.txt create mode 100644 source/compat/getopt/getopt.c create mode 100644 source/compat/getopt/getopt.h create mode 100644 source/compat/msvc/stdint.h create mode 100644 source/encoder/CMakeLists.txt create mode 100644 source/encoder/analysis.cpp create mode 100644 source/encoder/analysis.h create mode 100644 source/encoder/api.cpp create mode 100644 source/encoder/bitcost.cpp create mode 100644 source/encoder/bitcost.h create mode 100644 source/encoder/dpb.cpp create mode 100644 source/encoder/dpb.h create mode 100644 source/encoder/encoder.cpp create mode 100644 source/encoder/encoder.h create mode 100644 source/encoder/entropy.cpp create mode 100644 source/encoder/entropy.h create mode 100644 source/encoder/frameencoder.cpp create mode 100644 source/encoder/frameencoder.h create mode 100644 source/encoder/framefilter.cpp create mode 100644 source/encoder/framefilter.h create mode 100644 source/encoder/level.cpp create mode 100644 source/encoder/level.h create mode 100644 source/encoder/motion.cpp create mode 100644 source/encoder/motion.h create mode 
100644 source/encoder/nal.cpp create mode 100644 source/encoder/nal.h create mode 100644 source/encoder/ratecontrol.cpp create mode 100644 source/encoder/ratecontrol.h create mode 100644 source/encoder/rdcost.h create mode 100644 source/encoder/reference.cpp create mode 100644 source/encoder/reference.h create mode 100644 source/encoder/sao.cpp create mode 100644 source/encoder/sao.h create mode 100644 source/encoder/search.cpp create mode 100644 source/encoder/search.h create mode 100644 source/encoder/sei.cpp create mode 100644 source/encoder/sei.h create mode 100644 source/encoder/slicetype.cpp create mode 100644 source/encoder/slicetype.h create mode 100644 source/encoder/weightPrediction.cpp create mode 100644 source/filters/filters.cpp create mode 100644 source/filters/filters.h create mode 100644 source/input/input.cpp create mode 100644 source/input/input.h create mode 100644 source/input/y4m.cpp create mode 100644 source/input/y4m.h create mode 100644 source/input/yuv.cpp create mode 100644 source/input/yuv.h create mode 100644 source/output/output.cpp create mode 100644 source/output/output.h create mode 100644 source/output/y4m.cpp create mode 100644 source/output/y4m.h create mode 100644 source/output/yuv.cpp create mode 100644 source/output/yuv.h create mode 100644 source/test/CMakeLists.txt create mode 100644 source/test/checkasm-a.asm create mode 100644 source/test/intrapredharness.cpp create mode 100644 source/test/intrapredharness.h create mode 100644 source/test/ipfilterharness.cpp create mode 100644 source/test/ipfilterharness.h create mode 100644 source/test/mbdstharness.cpp create mode 100644 source/test/mbdstharness.h create mode 100644 source/test/pixelharness.cpp create mode 100644 source/test/pixelharness.h create mode 100644 source/test/testbench.cpp create mode 100644 source/test/testharness.h create mode 100644 source/test/testpool.cpp create mode 100644 source/x265.cpp create mode 100644 source/x265.def.in create mode 100644 
source/x265.h create mode 100644 source/x265.pc.in create mode 100644 source/x265.rc.in create mode 100644 source/x265_config.h.in diff --git a/.hg_archival.txt b/.hg_archival.txt new file mode 100644 index 0000000..39aab44 --- /dev/null +++ b/.hg_archival.txt @@ -0,0 +1,4 @@ +repo: 09fe40627f03a0f9c3e6ac78b22ac93da23f9fdf +node: 5e604833c5aa605d0b6efbe5234492b5e7d8ac61 +branch: stable +tag: 1.4 diff --git a/.hgignore b/.hgignore new file mode 100644 index 0000000..87f1042 --- /dev/null +++ b/.hgignore @@ -0,0 +1,11 @@ +syntax: glob +doc/uncrustify/uncrustify.exe +build/ +**.rej +**.orig +**.hevc +**.yuv +**.y4m +**.out +**.swp +.DS_Store diff --git a/.hgtags b/.hgtags new file mode 100644 index 0000000..42d4ebd --- /dev/null +++ b/.hgtags @@ -0,0 +1,17 @@ +681eabf8a086faea6141f9c1f5a72c9897ed8b29 LASTKNOWNGOOD1 +3ec4837e6f6c7159f438e1f537dff117c93ee139 LASTKNOWNGOOD2 +9a6800e84295db446fdce2e7f27059ec8ae838a7 LASTKNOWNGOOD +99fab2ef92be051cd3b3b2d817064cead282b42c 0.1 +b3471d9009f5cd487b23c8c61a6bfff8980e54f2 0.2 +3767fbfa970ff4b2dc2e8647db0274168727147e 0.3 +2ba6ec553f218d2b06ad803b87d6ec751fd639f7 0.4 +93707bc4fccdaa89a1f2da11db8808ca912a691c 0.4.1 +69acb3cb777f977f5edde908069ac565915dd366 0.5 +b970ffbdd696e3ce45c93b315902eb6366ff085e 0.6 +d24e2a8c4326b0cd01bfa6c414c5378481af9018 0.7 +527d03c56d6860dc979ddea1196f7e94d13d3e82 0.8 +82bbd2bf3b49ba086be0f0922f91fe0084896351 0.9 +cea97c4d79456842e00ade6be6fd5ec34610e5f8 1.0 +ae9609aeebdc3271114168ece003679e9b1dca1b 1.1 +d6257335c5370ee54317a0426a12c1f0724b18b9 1.2 +c1e4fc0162c14fdb84f5c3bd404fb28cfe10a17f 1.3 diff --git a/COPYING b/COPYING new file mode 100644 index 0000000..d5457c9 --- /dev/null +++ b/COPYING @@ -0,0 +1,343 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. 
+ + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. 
If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. 
You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. 
If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. 
(This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. 
Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. 
+ +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. 
If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. 
+ + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. 
Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. + +This program is also available under a commercial proprietary license. +For more information, contact us at license @ x265.com. diff --git a/ChangeLog b/ChangeLog new file mode 100644 index 0000000..80323fb --- /dev/null +++ b/ChangeLog @@ -0,0 +1,55752 @@ +2014-10-31 Steve Borho + + * source/encoder/encoder.cpp: + encoder: emit an Active Parameter Sets SEI in stream headers if + interlaced + + The APS is technically required if we're going to be emitting + picture timing SEI which we do for HRD and for interlaced inputs. + The lack of APS for interlaced content was causing a warning from + the HM decoder and on Windows the decoder would later crash. 
+ [5e604833c5aa] [1.4] + +2014-10-30 Steve Borho + + * source/encoder/slicetype.cpp: + slicetype: fix an msvc warning + [c369e6a12710] + +2014-10-30 Min Chen + + * source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter8.asm: + asm: AVX2 version luma_vpp[4x4], improve 391c -> 302c + [f0f073deb207] + + * source/common/x86/ipfilter8.asm: + asm: replace constant table tab_c_512 by pw_512 + [b39313659e71] + + * source/common/x86/ipfilter8.asm: + asm: replace constant table tab_c_128 by pb_128 + [7bb7f03d3e6b] + +2014-10-30 Nicolas Morey-Chaisemartin + + * source/common/lowres.cpp, source/common/lowres.h, + source/encoder/slicetype.cpp: + lowres: save intra mode in lowres + [7047fec7140a] + +2014-10-30 Steve Borho + + * doc/reST/api.rst: + doc: make a note about how to ensure the version number is accurate + [f807d346663e] + + * source/encoder/encoder.cpp: + encoder: fix some obviously incorrect comments + [de28d1b07e6f] + + * doc/reST/cli.rst, source/common/param.cpp, + source/encoder/encoder.cpp, source/x265.cpp, source/x265.h: + Merge with stable + [0f14e29eceb1] + + * source/encoder/encoder.cpp: + encoder: give more warnings when features are automatically disabled + + and add comments describing why the combinations are prevented. Some + of them are simply impossible, the option would have no affect and + so it is best not to pretend it is enabled. Some will not be useful + (have a negative impact on performance with no compression + improvement). And others are just currently broken and not typically + used. 
+ [ba3193adff60] + + * doc/reST/cli.rst, source/common/param.cpp, source/x265.cpp, + source/x265.h: + api: allow --psy-rdoq values up to 50; it can be beneficial for film + grain + [73c243602b07] + + * source/encoder/analysis.cpp: + analysis: remove TODO comment, I've given up on the idea + [31d648740464] + +2014-10-29 Gopu Govindaswamy + + * source/encoder/analysis.cpp, source/encoder/search.cpp, + source/encoder/search.h: + search: move m_bestME[] from search to Mode structure + [a147b3b6c2f7] + +2014-10-29 Steve Borho + + * doc/reST/cli.rst, doc/reST/threading.rst, source/x265.h: + docs: improve --pmode documentation, the feature is fully functional + [9b73a4d2210a] + + * source/encoder/analysis.cpp, source/encoder/analysis.h: + analysis: inline checkBestMode(), improve comments + [86ca1de606e3] + + * source/encoder/analysis.cpp: + Merge with stable + [2b7d08c60105] + + * source/test/CMakeLists.txt, source/test/testpool.cpp: + cmake: remove obsolete pool test + [393eb6c95e7c] + + * source/encoder/analysis.cpp, source/encoder/analysis.h, + source/encoder/frameencoder.cpp, source/encoder/search.cpp, + source/encoder/search.h: + search: nits - pull Mode out of Search class, remove unused + NUM_LAYERS + [2bcf4e77b4bf] + + * source/encoder/analysis.cpp: + analysis: do not allow top-skip and depth earlyout in --pmode with + --rd 5/6 + + Now outputs match or are better than those without --pmode + [476acb7a4088] + +2014-10-29 Min Chen + + * source/common/x86/asm-primitives.cpp: + asm: correct wrong index name + [3995c5e0f313] + +2014-10-29 Steve Borho + + * source/encoder/analysis.cpp: + analysis: clarify --rd 1 + [2a719b6e07ee] + + * source/encoder/analysis.cpp: + analysis: add #if to make pmode exactly match non-pmode + + This switch will throw away the hard work of some worker thread, so + it should only be used for debugging. + + With the flag enabled, pmode matches non-pmode output exactly for RD + levels 2, 3 and 4. But RD 5 and 6 still have problems. 
+ [e92170188568] + + * doc/reST/cli.rst, doc/reST/presets.rst: + Merge with stable + [31ed48cdbefe] + + * doc/reST/cli.rst, doc/reST/lossless.rst, doc/reST/presets.rst: + docs: update preset table and various command line options + [812ce345a14d] + + * source/encoder/analysis.cpp, source/encoder/search.cpp: + Merge with stable + [210967feb8c3] + +2014-10-30 Satoshi Nakagawa + + * source/common/predict.cpp, source/common/predict.h, + source/encoder/analysis.cpp, source/encoder/search.cpp: + fix rd=0,1 + [41220fab15c1] + +2014-10-29 Ashok Kumar Mishra + + * source/encoder/search.cpp: + search: nit + [27827068b3ec] + +2014-10-29 Steve Borho + + * doc/reST/cli.rst, source/CMakeLists.txt, source/common/param.cpp, + source/encoder/encoder.cpp, source/x265.cpp, source/x265.h: + api: expose deblocking filter offsets as public params, deprecate + --[no-]lft + + It was always a bit unfortunate to base the deblock param name on + loop filter when HEVC has two (deblock and SAO). + + Now we support --deblock=: and similar options as x264 + does + [f58a1cc0126a] + + * source/x265.h: + api: improve comment for bEnableCbfFastMode + [f834d2e8195f] + + * source/encoder/analysis.cpp, source/encoder/analysis.h, + source/encoder/search.cpp, source/encoder/search.h: + search: relocate intra in inter functions to search.cpp + [fe3b88e11f44] + + * source/encoder/analysis.cpp, source/encoder/search.cpp: + Merge with stable + [f5603998be03] + + * source/encoder/analysis.cpp: + analysis: fix for --rd 0, do not assume a CU is only inter or intra + [3aec7242d0be] + + * source/encoder/analysis.cpp, source/encoder/search.cpp, + source/encoder/search.h: + search: split generateCoeffRecon() into its two callers, improve + comments + + The function consisted of a big if (inter) else (intra) expression + and it had only two callers, one which knew it was inter and one + which knew it was intra. 
+ [7c2b831e52fb] + +2014-10-29 gopi jayaraman + + * source/encoder/analysis.cpp: + analysis: rect cost compare typo fix for pmode + [da5ba239bf59] + +2014-10-28 Steve Borho + + * source/encoder/search.cpp: + search: ensure RDOQ entropy state is always initialized for chroma + intra + + Fixes non-determinism seen in --preset slow and lower + [afb216f71318] + + * doc/uncrustify/apply-to-all-source.py, doc/uncrustify/drag- + uncrustify.bat, source/Lib/COPYING.HM, source/Lib/README.txt, + source/Lib/TLibCommon/CommonDef.h, + source/Lib/TLibCommon/ContextTables.h, + source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComMotionInfo.cpp, + source/Lib/TLibCommon/TComMotionInfo.h, + source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComPattern.h, + source/Lib/TLibCommon/TComPicSym.cpp, + source/Lib/TLibCommon/TComPicSym.h, + source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuv.h, + source/Lib/TLibCommon/TComPicYuvMD5.cpp, + source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.h, + source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/Lib/TLibCommon/TComWeightPrediction.h, + source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TComYuv.h, + source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/common/vec/blockcopy- + sse3.cpp, source/encoder/cturow.cpp, source/encoder/cturow.h, + source/encoder/predict.cpp, source/encoder/predict.h: + Merge with default (prep for 1.4 release) + [24b4177ea1ec] + + * source/encoder/search.cpp: + search: trModeC -> tuDepthC + [7cfc1edb083f] + + * source/encoder/entropy.cpp: + entropy: simplify loadIntraDirModeLuma + [42566b53b96d] + + * source/encoder/analysis.cpp, 
source/encoder/search.cpp, + source/encoder/search.h: + search: make getIntraRemModeBits() const + [c561b0e99684] + + * source/encoder/analysis.cpp, source/encoder/entropy.cpp, + source/encoder/entropy.h, source/encoder/search.cpp: + search: add a fast method for estimating non-MPM intra mode signal + cost + [9cdc7c61d3fb] + + * source/encoder/analysis.cpp, source/encoder/search.cpp: + search: move getIntraDirLumaPredictor() into getIntraRemModeBits() + [9cc367aa2b40] + + * source/encoder/analysis.cpp, source/encoder/search.cpp, + source/encoder/search.h: + search: use fast-path to get mpm mode signal cost + + inline single caller of getIntraModeBits + [5b1d67874dd3] + + * source/encoder/entropy.cpp, source/encoder/entropy.h: + entropy: make a fast const method for getting MPM mode signal cost + [7400828ccd0e] + + * source/encoder/search.cpp: + search: simplify checks for 2x2 chroma blocks + + "log2TrSize == 2 && m_csp != X265_CSP_I444" essentially means that + the chroma transform would be 2x2, aka log2TrSizeC == 1. + + In offsetSubTUCBFs(), the chroma tu size is not calculated but + implied. 
We should be able to skip the X265_CSP_I444 check since the + function should only be called by 4:2:2 encodes that code two half- + sized chroma blocks per luma block + [90e1b515a364] + + * source/encoder/search.cpp: + search: remove redundant work from residualTransformQuantInter() + [252f886f4871] + + * source/common/cudata.cpp, source/common/cudata.h, + source/encoder/analysis.cpp, source/encoder/entropy.cpp, + source/encoder/search.cpp: + cudata: split getQuadtreeTULog2MinSizeInCU() into intra/inter + functions + + The caller usually knows what the CU prediction mode is + [98573a12738d] + + * source/encoder/search.cpp: + search: remove redundant logic + [689e105ae41f] + + * doc/uncrustify/apply-to-all-source.py, doc/uncrustify/drag- + uncrustify.bat: + doc: remove uncrustify helper scripts + + I don't expect a lot of whole-new file development or wholesale + style enforcement. Leave the config script in place for new + developers. + [f91c01f6ca83] + +2014-10-28 Ashok Kumar Mishra + + * source/encoder/search.cpp: + [OUTPUT CHANGED for 422] made loops for chroma components in + xEstimateResidualQT() + + The output change for 422 is valid. Initially the no. of bits(cbf + and coeff.) were calculated per block and per chroma component. Now + the no. of bits are calculated per chroma component. 
+ [554dd4aad4a0] + +2014-10-27 Steve Borho + + * source/encoder/search.h: + search: sync up argument names between source and header + [fa79ec52c34d] + + * source/encoder/search.cpp, source/encoder/search.h: + search: remove x prefixes from inter residual coding functions + [da3191896381] + + * source/encoder/search.cpp, source/encoder/search.h: + search: use Cost instances to accumulate costs in + xEstimateResidualQT + [efe17882bca5] + + * source/encoder/search.cpp: + search: nits + [4ad4ba77a339] + + * source/common/cudata.cpp, source/common/cudata.h: + cudata: coding style nits + + * reorder arguments so outputs are listed first + * pass const by reference + * return single integer output rather than pass by reference + * A == 0 ? B : C => A ? C : B; + * standardized variable names (puIdx, absPartIdx, etc) + [4afcdb09550e] + + * source/common/cudata.cpp, source/common/cudata.h: + cudata: perform MV scaling directly within POC distance function + + this avoids some code duplication and is also a bit more efficient + [59df6b4fe1d7] + + * source/encoder/encoder.cpp, source/encoder/slicetype.cpp, + source/encoder/slicetype.h: + slicetype: remove top-level Encoder pointer from Slicetype + + Move the slice type assignment earlier to happen even before the + picture is given to the lookahead + [3ccb20b6c022] + + * source/encoder/encoder.cpp: + encoder: nit cleanup of code copying data from input picture + [84f4cb50fe46] + + * source/common/cudata.cpp, source/common/deblock.cpp, + source/common/frame.h, source/common/predict.cpp, + source/common/predict.h, source/common/quant.cpp, + source/encoder/analysis.cpp, source/encoder/dpb.cpp, + source/encoder/encoder.cpp, source/encoder/entropy.cpp, + source/encoder/entropy.h, source/encoder/frameencoder.cpp, + source/encoder/framefilter.cpp, source/encoder/ratecontrol.cpp, + source/encoder/sao.cpp, source/encoder/slicetype.cpp, + source/encoder/slicetype.h: + fix some header include loops + [fd95b6a592ee] + + * 
source/common/frame.cpp, source/common/frame.h, + source/encoder/encoder.cpp: + frame: initialize and re-initialize m_bChromaExtended + + fixes some non-determinism + [0fc522bebda5] + +2014-10-27 Nicolas Morey-Chaisemartin + + * source/common/param.cpp, source/encoder/encoder.cpp, + source/encoder/search.cpp: + Set tuQTMaxLog2Size to MIN(5, maxLog2CUSize) + + This allows 32x32 TUs even when --ctu 32 is used, for instance + [14388f2a7a88] + + * doc/reST/cli.rst, source/common/param.cpp, source/x265.cpp: + Add CLI option to enable/disable Temporal MVP + [f6f662559bde] + + * source/CMakeLists.txt, source/common/cudata.cpp, + source/common/param.cpp, source/common/slice.h, + source/encoder/encoder.cpp, source/encoder/entropy.cpp, + source/x265.h: + Add flag to enable/disable temporal MVP + [ef27c0eb2fd6] + +2014-10-27 Min Chen + + * source/common/x86/asm-primitives.cpp, source/common/x86/dct8.asm, + source/common/x86/ipfilter8.asm, source/common/x86/ipfilter8.h: + asm: AVX2 version of luma_pp[4x4], improve 320c -> 188c + [52ba1fb2227e] + +2014-10-27 Steve Borho + + * source/encoder/analysis.cpp: + analysis: simplify logic slightly, combine conditionals + [d62838b20805] + + * source/encoder/analysis.cpp: + analysis: fix a typo in --rd 5 --rect + [685127dfd466] + + * source/encoder/analysis.cpp: + analysis: allow AMP to work at 64x64 with --pmode and --rd 5 + [0f71dcb02c59] + + * source/encoder/search.cpp: + search: leave a helpful comment to avoid unfortunate reordering + [26e1574a5424] + + * source/common/predict.cpp, source/common/predict.h, + source/encoder/search.cpp: + predict: cache color space dimension shifts, use in search.cpp + [c36cfbbb7133] + + * source/encoder/analysis.cpp: + analysis: in RD 5/6 ignore redundant merge candidates + [f5f26b4b6487] + + * source/encoder/analysis.cpp: + analysis: use same MV range check in merge functions + [07482bd07946] + + * source/encoder/analysis.cpp, source/encoder/search.cpp: + analysis: defer broadcast sets of merge mv 
field data until final + selection + + The MC / encode functions do not need for the ME sub-parts to be + set. This is only needed by later neighbor parts + [3d5b73b500d4] + + * source/encoder/search.cpp: + search: use member variable from Predict for csp + + There's no need to derefernece the CU for the color space + [14ad8ed3792e] + + * source/encoder/entropy.cpp: + entropy: nit + [3abbf77f3c5a] + + * source/encoder/search.cpp, source/encoder/search.h: + search: simplify handling of TU size edge case in + extractIntraResultChromaQT + [bea22ebe6af1] + + * source/encoder/search.cpp, source/encoder/search.h: + search: fixes for chroma tskip coding with placebo preset + + Move the tskip decision after the special logic that deals with 4x4 + luma, etc A couple of places where tskip == 0 was implied needed to + actually set the flag + [453d131f974b] + + * source/encoder/search.cpp: + search: trMode -> tuDepth, improve clarity + [93a1d11c0a6e] + + * source/common/cudata.h: + cudata: remove unused setDepthSubParts + [9e74e80f2192] + + * source/common/cudata.h, source/encoder/search.cpp: + cudata: setTrIdxSubParts() -> setTUDepthSubParts() + [a2d70700d53b] + + * source/common/cudata.cpp, source/common/cudata.h, + source/common/deblock.cpp, source/common/quant.cpp, + source/encoder/analysis.cpp, source/encoder/entropy.cpp, + source/encoder/frameencoder.cpp, source/encoder/framefilter.cpp, + source/encoder/search.cpp: + cudata: renames (m_depth, m_trIdx) -> (m_cuDepth, m_tuDepth) + + No behavior changes + [288db4430fcd] + + * doc/reST/cli.rst: + docs: update --tu-intra-depth / --tu-inter-depth docs (refs #71) + [654ea07f93de] + + * source/encoder/entropy.cpp: + entropy: improve check strings and comments + [97ca46e10144] + + * source/encoder/search.cpp: + search: further --rd-penalty 2 fixes (refs #71) + [b0eaff8d1d1a] + + * source/encoder/analysis.cpp: + analysis: invalidate slave entropy contexts, to help find bugs + + this function has no effect in non-checked builds + 
[8fd59b57e5e0] + + * source/encoder/analysis.cpp: + analysis: checkIntraInInter_rd0_4() needs the entropy context to be + initialized + [5a3c867cdf57] + +2014-10-25 Steve Borho + + * source/encoder/analysis.cpp: + analysis: further work on --pmode for --rd 5/6 + + Still needs debugging. The results with --pmode enabled have worse + compression than without, when used with --rd 5. That should be un- + possible. + [69277ae804d0] + +2014-10-24 gopi jayaraman + + * source/encoder/analysis.cpp: + analysis: adding support for --rd 5/6 in compressInterCU_dist() and + parallelModeAnalysis() + [9e221c8530bc] + +2014-10-26 Steve Borho + + * source/encoder/entropy.cpp: + entropy: readability nits + [78af768201c7] + + * source/common/common.h: + common: add support for debug breaks in Xcode + [3e4aee59e21d] + + * source/common/param.cpp: + param: fix w-s spacing in log messages for --rdpenalty and chroma + offsets + + and some nit fixes with braces while I was in the area + [d9ce2644307a] + +2014-10-27 Deepthi Nandakumar + + * source/encoder/analysis.cpp: + analysis: motionCompensation(chroma) needs to be performed for + asymmetric merge also + + This fixes a bug introduced in commit (372a8230110a) for rd2 + rect/amp options + [1b555ddd7667] + + * source/encoder/motion.cpp: + motion: this emms is required + [d32f1ad8d061] + + * source/encoder/analysis.cpp: + analysis: cleanup + [5fd8b40075a6] + +2014-10-27 Satoshi Nakagawa + + * source/common/cudata.h, source/encoder/frameencoder.cpp: + cugeom: fix uninitialized reported by valgrind + [3a8f6f685436] + +2014-10-25 Steve Borho + + * source/common/cudata.cpp: + cudata: validate subCU size against current analysis depth + [67d73bffd1fd] + + * source/common/cudata.cpp, source/common/cudata.h: + cudata: make sure per-part buffers are allocate for g_maxCUSize + + There are memsets which rely on the data being consecutive, and for + this to work they have to be the correct length. 
This fixes some + X265_CHECK failures and possibly some odd behavior with --ctu 32 or + --ctu 16 + [d3a3d6cad8db] + + * source/encoder/search.cpp: + search: improve comments in mergeEstimation() + [5186635c0536] + + * source/encoder/search.cpp: + search: turn some redundant clears of tskip flags into runtime + checks + [72f2b87c86eb] + + * source/encoder/search.cpp: + search: cleanup residualQTIntraChroma + + There was a bug where it was reading tskip before setting it to + zero, but fortunately we never allow analysis to set tskip anyway. + [4d3797830500] + + * doc/reST/cli.rst: + docs: update --tskip and --cu-lossless docs + [f81a2cec4183] + + * source/encoder/encoder.cpp: + encoder: issue warning and disable --pmode if rdlevel < 2 + [4e7f9bca6f39] + + * source/encoder/analysis.cpp, source/encoder/search.cpp, + source/encoder/search.h: + search: remove resiYuv from Mode, keep tmpResiYuv in m_rqt[] + + The residual buffer is always very short lived; there is no reason + to keep a copy of it per mode. + [08be12894acd] + + * source/encoder/search.h: + search: updateCandList() can be a static method + [4e8edad1f2e6] + + * source/encoder/search.cpp, source/encoder/search.h: + search: inline updateModeCost + [e69a8546897a] + + * source/encoder/encoder.cpp: + encoder: issue warnings and explicitly disable tskip or culossless + if rd < 3 + + the analysis code is quite incapable of making these RDO decisions + at these RD levels. 
It's best that these tools never appear to be + enabled at these RD RD levels, and to explain why + [b2aa1fd68ffa] + + * source/encoder/search.cpp: + search: prevent warnings about unused bCheckSplit value + [5e8e0e5fb760] + + * source/common/pixel.cpp, source/common/primitives.h, + source/common/x86/asm-primitives.cpp, source/test/pixelharness.cpp, + source/test/pixelharness.h: + primitives: remove unused calcrecon primitive (assembly needs + cleanup) + [daa0e77083a7] + + * source/encoder/analysis.cpp, source/encoder/frameencoder.cpp, + source/encoder/motion.cpp, source/encoder/search.cpp, + source/encoder/slicetype.cpp, source/encoder/weightPrediction.cpp: + trim x265_emms(), try to only use prior to floating point operations + [64bb88dc7cb6] + + * source/encoder/search.cpp: + search: avoid a context save at the last recursion depth + [d7fbf10efe61] + + * source/encoder/search.cpp: + search: improve comments and readability of + residualTransformQuantIntra + [58545ea1f6af] + + * source/encoder/search.cpp, source/encoder/search.h: + search: keep recon QT in pixels, instead of shorts + + This changes outputs, apparently because SSE is now comparing fenc + against the clipped recon instead of the un-clipped recon. This was + punishing residuals which were close to the pixel dynamic range + limits. The user never sees un- clipped pixels, and external + distortion metrics always use clipped recon, so it makes sense to do + the same here (never mind the obvious perf benefits) + [567491c02bf7] + + * source/encoder/search.cpp, source/encoder/search.h: + search: remove tskip analysis out of luma chroma normal path + [1ea467c9bb22] + + * source/encoder/search.cpp: + search: nit. 
splitted is not a word + [794bf8c060d4] + + * source/encoder/analysis.cpp, source/encoder/search.cpp, + source/encoder/search.h: + search: rename methods that read coeff and recon from RQT struct at + final depths + + also reorder arguments and pass reconYuv as a reference + [69ee86fd7284] + + * source/encoder/search.cpp: + search: give offsetSubTUCBFs a basic comment + [67ae716977fd] + +2014-10-24 Steve Borho + + * source/encoder/search.cpp, source/encoder/search.h: + search: simplify RDO chroma intra coding, changes tskip outputs + + Since the TU layers above tskip's 4x4 are not encoding their + residual (they only need distortion, not RD cost) there is no reason + to try to preserve the entropy coder state. This gives slightly + better compression than before, when tskip is enabled, and I believe + it makes the code a lot more maintainable. + [2261ad40ffe8] + +2014-10-25 Steve Borho + + * source/encoder/search.cpp: + search: simplify initTrDepth + [ddafaee9bf39] + + * source/encoder/search.cpp: + search: reconYuv as ref + [f97c6f14a975] + + * source/encoder/search.cpp: + search: improve a variable name + [b51aceca9bd8] + + * source/encoder/search.cpp, source/encoder/search.h: + search: rename a couple chroma intra helper methods + [6f964d4cc8ef] + +2014-10-24 Steve Borho + + * source/encoder/search.cpp, source/encoder/search.h: + search: rename tmpCoeff to coeffRQT, tmpShortYuv to reconQtYuv / + resiQtYuv + + Explain why these buffers are allocated to max CU size at every + layer, fix a few nits + [847c45521c19] + + * source/encoder/search.cpp: + search: fix 4:2:2 chroma tskip bit-cost estimation + [0fc9c36d0c92] + + * source/common/yuv.cpp, source/common/yuv.h: + yuv: add copyPartToPart* methods for recon RQT finalization + + We're switching reconQt to be kept in pixels rather than shorts + [d918b786a3e6] + + * source/common/shortyuv.cpp, source/common/shortyuv.h: + shortyuv: use absPartIdx for CU/TU part offset like everywhere else + [0922d96a74a6] + 
+2014-10-24 Praveen Tiwari + + * source/common/x86/pixel-util8.asm: + weight_sp: pshufd to handle width 6 for SSE version of asm code + + Backout of 2cb8cdaa7df5 + [1a07740f85f5] + +2014-10-24 Ashok Kumar Mishra + + * source/encoder/entropy.cpp, source/encoder/entropy.h, + source/encoder/search.cpp: + search: refactored xEstimateResidualQT() to remove cbf flag settings + [759c6cbf54fa] + + * source/encoder/search.cpp: + search: remove redundant cbf flags setting in xEstimateResidualQT() + [363bd8ef6c6b] + +2014-10-24 Praveen Tiwari + + * source/common/quant.cpp: + quant.cpp: nits + [5f0838850cb5] + +2014-10-24 Ashok Kumar Mishra + + * source/encoder/search.cpp: + search: remove unnecessary set of cbf flags in xEstimateResidualQT() + [a0c07b8e583b] + +2014-10-23 Steve Borho + + * source/CMakeLists.txt: + cmake: add -Wno-array-bounds if the compiler supports it + + These warnings have proven to always be spurious + [e3a3d17b821c] + +2014-10-23 Ashok Kumar Mishra + + * source/encoder/search.cpp: + [OUTPUT CHANGED for 444] : considering cbf bits for best cost + estimation for 444 format + [cfa3750f72a5] + +2014-10-23 Steve Borho + + * source/encoder/analysis.cpp: + analysis: cleanup checkInter functions + [daed2d3f67ba] + + * source/encoder/analysis.cpp: + analysis: remove unnecessary set of skip flags in checkInter_rd5_6() + + initSubCU() does this already, and the pred's cu is not being reused + [79f0d5f296ef] + + * source/common/quant.cpp, source/common/quant.h, + source/encoder/analysis.cpp, source/encoder/encoder.cpp, + source/encoder/search.cpp, source/encoder/search.h: + search: large mostly mechanical change to pass cu by reference + [b2005914aeb7] + + * source/common/cudata.h: + cudata: remove unused method + [260eee4634a5] + + * source/common/predict.cpp, source/common/predict.h, + source/encoder/analysis.cpp, source/encoder/search.cpp: + predict: enforce calling conventions, fix wrong side-effects + + use references and consts where possible, order arguments 
to follow + the convention of memcpy (dest, src) + + This exposed a bug in addWeightBi() and addWeightUni(), they were + modifying the PU size variables directly instead of making chroma + versions. This explains why it seemed to best necessary at times to + make seemingly redundant calls to prepMotionCompensation. + + as a side-effect, this commit also removes the 1k 'avg' buffer that + bidir allocated on the stack and instead uses the existing + tmpPredYuv + [ff804d8ab03d] + + * source/common/predict.cpp, source/common/predict.h, + source/encoder/analysis.cpp, source/encoder/search.cpp: + predict: rename members for clarity, save work in + singleMotionEstimation() + + The first thing singleMotionEstimation() did was call + getPartIndexAndSize() to get the PU part index and dimensions. Then + it called prepMotionCompensation() which did the exact same thing, + storing its outputs into member variables. (after predInterSearch() + had already done it twice as well) + + Now singleMotionEstimation() and predInterSearch() both directly use + the variables initialized by prepMotionCompensation(). Now when the + master thread calls its own singleMotionEstimation(), there is much + less redundant work + [c942de89cbed] + + * source/encoder/search.cpp: + search: fix a change of outputs from f3bd6e5a880a, always zero + unused refs + + it's not clear why this affects outputs, but it seems better to err + on the side of the data being initialized. 
+ [8ac590040e8c] + + * source/encoder/analysis.cpp, source/encoder/search.cpp, + source/encoder/search.h: + search: re-combine --pme with --no-pme code paths + [f8ee24fbbede] + + * source/common/cudata.cpp, source/common/cudata.h: + cudata: cache numPartInCUSize as a class static + + The obliviates a lot of pointer dereferences in some key functions + [2763d49b2e23] + + * source/encoder/entropy.cpp: + entropy: drop last use of g_winUnitX, g_winUnitY + [77210e81c4ad] + + * source/common/cudata.cpp, source/common/deblock.cpp, + source/common/framedata.cpp, source/common/framedata.h, + source/common/predict.cpp, source/common/slice.h, + source/encoder/encoder.cpp: + slice: move numPartitions and numPartInCUSize from FrameData to SPS + + these fields never change, so it made no sense to have copies in + every FrameData they are based on CTU size, so SPS made sense + [17c5d2cc1335] + + * source/common/cudata.h: + cudata: remove default arguments for getPUAboveRightAdi(), + getPUBelowLeftAdi() + [077015265a08] + + * source/common/cudata.cpp, source/common/cudata.h, + source/common/framedata.cpp, source/encoder/analysis.cpp: + cudata: simplify allocation / initialization interfaces + + the callers shouldn't need to know details about partitions or coeff + buffer sizes + [ebaeb6aa5dda] + + * source/encoder/search.cpp: + search: use intptr_t for picture stride variables + [fa3e1744f125] + + * source/common/cudata.cpp, source/common/cudata.h, + source/encoder/analysis.cpp, source/encoder/frameencoder.cpp: + cudata: push more data type casts out to callers + [bb5814a49de5] + + * source/common/cudata.cpp, source/common/cudata.h, + source/encoder/analysis.cpp: + cudata: use static array of absolute depth broadcast set functions + + this commit changed the value arguments to these set functions to + match the data type of their array, forcing one cast in analysis.cpp + to avoid a warning. 
+ [f593e0455cbc]
+
+2014-10-22 Steve Borho
+
+ * source/encoder/analysis.cpp:
+ analysis: encodeResidue rewrite, much improved --rd 0
+
+ it's not clear --rd 0 is always correct, but I can encode long clips
+ without hash mistakes and at reasonable bitrates (compared to
+ previous --rd 0). I suspect there are still problems with passing in
+ residual to residualTransformQuantInter() and getting it back in the
+ same ShortYuv instance
+ [bd865dd464bc]
+
+ * source/common/frame.cpp, source/common/frame.h,
+ source/common/framedata.h, source/common/quant.cpp,
+ source/common/quant.h, source/encoder/analysis.cpp,
+ source/encoder/analysis.h, source/encoder/encoder.cpp,
+ source/encoder/frameencoder.cpp, source/encoder/search.cpp:
+ nr: move noise reduction arrays to Quant, simplify its upkeep
+
+ This cleans up a number of layering violations and makes the array
+ management more robust
+ [ce304756a6e4]
+
+ * source/CMakeLists.txt, source/Lib/COPYING.HM, source/Lib/README.txt,
+ source/Lib/TLibCommon/ContextTables.h, source/common/CMakeLists.txt,
+ source/common/contexts.h, source/common/quant.cpp,
+ source/common/quant.h, source/encoder/entropy.cpp,
+ source/encoder/entropy.h:
+ cleanup enough of the context tables to bring them into common/
+
+ This was the last file under Lib/ so the whole folder is now
+ removed. 
+ [fd03d43c1a97] + + * source/Lib/TLibCommon/CommonDef.h, + source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibCommon/TypeDef.h, source/common/CMakeLists.txt, + source/common/common.cpp, source/common/common.h, + source/common/constants.cpp, source/common/constants.h, + source/common/cudata.cpp, source/common/cudata.h, + source/common/dct.cpp, source/common/deblock.h, + source/common/intrapred.cpp, source/common/ipfilter.cpp, + source/common/loopfilter.cpp, source/common/param.cpp, + source/common/picyuv.h, source/common/pixel.cpp, + source/common/primitives.cpp, source/common/quant.h, + source/common/scalinglist.cpp, source/common/shortyuv.h, + source/common/slice.h, source/common/vec/dct-sse3.cpp, + source/common/vec/dct-sse41.cpp, source/common/vec/dct-ssse3.cpp, + source/common/x86/asm-primitives.cpp, source/common/yuv.h, + source/encoder/bitcost.cpp, source/encoder/encoder.cpp, + source/encoder/entropy.cpp, source/encoder/motion.cpp, + source/encoder/search.cpp, source/encoder/slicetype.cpp, + source/test/ipfilterharness.cpp, source/test/mbdstharness.cpp: + pull TComRom into common/constants. bring CommonDef and TypeDefs + into various + + A number of enums were pulled into cudata.h, one went to slice.h. 
+ All the defines went into common.h + [bddf8ccf4c94] + + * source/common/cudata.cpp: + cudata: nits, use m_encData directly + [63cb0c68d0c0] + + * source/encoder/search.h: + search: use proper allocation size for m_rqt, fix --preset placebo + + The entropy contexts need to be addressed from 0..4 (full depth) + even if the buffers are only allocated to CU depth 0..3 + [a09b45ead8e0] + + * source/encoder/analysis.cpp: + analysis: fix --rd 1,2 behavior (broken by 84933c3136ec) + + Also includes some fixes for --rd 0 + [372a8230110a] + + * source/common/cudata.cpp: + cudata: avoid 'char subscript' warnings from GCC + [6716ce0bb043] + + * source/common/framedata.cpp, source/common/framedata.h, + source/encoder/encoder.cpp: + framedata: param is now unused + [01e865efd595] + + * source/common/frame.cpp, source/common/framedata.cpp, + source/common/framedata.h, source/common/slice.h, + source/encoder/encoder.cpp, source/encoder/framefilter.cpp, + source/encoder/ratecontrol.cpp: + slice: move numCUsInFrame from FrameData to SPS + [9b55f47f2043] + + * source/encoder/framefilter.cpp, source/encoder/framefilter.h: + framefilter: optimize row-height operations + [a485e3377861] + + * source/common/cudata.cpp, source/common/frame.cpp, + source/common/frame.h, source/common/picyuv.cpp, + source/common/picyuv.h, source/common/slice.cpp, + source/common/slice.h, source/encoder/dpb.cpp, + source/encoder/encoder.cpp, source/encoder/entropy.cpp, + source/encoder/entropy.h, source/encoder/frameencoder.cpp, + source/encoder/framefilter.cpp, source/encoder/framefilter.h, + source/encoder/ratecontrol.cpp, source/encoder/reference.cpp, + source/encoder/sao.cpp, source/encoder/search.cpp, + source/encoder/slicetype.cpp, source/encoder/weightPrediction.cpp: + slice: move numCuInWidth, numCuInHeight to SPS, remove Frame pointer + + Since the SPS defines the picture size and CTU size, it makes sense + to keep these values there. 
It's a bit annoying removing m_frame + from Slice, but all of its users didn't really want the frame + itself, except in a couple instances (most already had the frame + pointer) + [fe8200af773f] + + * source/common/cudata.cpp, source/common/cudata.h, + source/common/deblock.cpp, source/common/predict.cpp, + source/encoder/framefilter.cpp: + cudata: keep a FrameData pointer instead of a Frame pointer + + 90% of the dereferences of m_frame were to access + m_frame->m_encData. For good reason, all of the encode data needed + by the CU is in m_encData. A few places need to get to the orig pic. + [57f9b1f41b0c] + + * source/common/cudata.cpp, source/common/cudata.h, + source/encoder/frameencoder.cpp: + cudata: pass picWidth and picHeight to calcCTUGeoms() + + this was the only method which used m_frame for anything other than + accessing m_encData + [5d4828d1b706] + + * source/common/cudata.cpp, source/common/cudata.h, + source/encoder/analysis.cpp: + cudata: simplify setQPSubCUs() + [7b87661b8fef] + + * source/common/cudata.cpp, source/common/cudata.h, + source/encoder/analysis.cpp, source/encoder/search.cpp: + cudata: consistent names for consistent functionality + [19f79fa18526] + + * source/common/cudata.cpp, source/common/cudata.h, + source/encoder/analysis.cpp, source/encoder/search.cpp: + cudata: no need to pass part size to setAll*, it can look it up + itself + + It only needed the part size in the past when these were + TComCUMvField methods and they had no access to the part size array + [c55083482d74] + + * source/encoder/search.cpp: + search: use helper function to set transform depth + [583484b4a5e7] + + * source/common/cudata.cpp, source/common/cudata.h, + source/encoder/search.cpp: + cudata: simplify clearCbf() + [59a39ac95108] + + * source/common/cudata.cpp, source/common/cudata.h: + cudata: hoist a number of trivial functions to cudata.h for inlining + [3a4708b0116c] + + * source/common/common.h, source/common/cudata.h, + source/common/frame.h: + 
cudata: clean up a couple defines + [fa10e7328429] + + * source/encoder/search.cpp: + search: remove unnecessary parens + [2c42168b58b8] + + * source/common/cudata.cpp, source/common/cudata.h, + source/encoder/analysis.cpp, source/encoder/search.cpp: + cudata: remove depth argument to setInterDirSubParts() + + Inter dir is coded at the PU level, so it is always at the current + depth + [9430c148eb87] + + * source/Lib/TLibCommon/TComMotionInfo.cpp, + source/Lib/TLibCommon/TComMotionInfo.h, + source/common/CMakeLists.txt, source/common/cudata.cpp, + source/common/cudata.h, source/common/deblock.cpp, + source/common/framedata.cpp, source/common/framedata.h, + source/common/predict.cpp, source/common/predict.h, + source/encoder/analysis.cpp, source/encoder/analysis.h, + source/encoder/entropy.cpp, source/encoder/search.cpp, + source/encoder/search.h: + cudata: merge motion field data directly into the CUData + + TComCUMvField didn't add anything except overhead + [dbfa9c03d8a3] + + * source/Lib/TLibCommon/TComMotionInfo.h, source/common/cudata.cpp, + source/common/cudata.h, source/encoder/analysis.cpp, + source/encoder/search.h: + mvfield: move TComMvField into cudata as MVField + + The structure is only used for collecting merge candidates now, and + does not need any constructor + [b7ca971f2a57] + + * source/Lib/TLibCommon/TComMotionInfo.cpp, + source/Lib/TLibCommon/TComMotionInfo.h: + mvfield: combine mv buffers into a single allocation + [bbe34d78a627] + + * source/Lib/TLibCommon/TComMotionInfo.cpp, + source/Lib/TLibCommon/TComMotionInfo.h, source/encoder/analysis.cpp, + source/encoder/search.cpp: + mvfield: remove unnecessary depth arguments to set functions + + mv and refidx are CU level data, not TU level, so there is never any + need to set the values for a depth other than 0. 
Remove support for
+ NxN parts from the switch statement, made the template function
+ protected
+ [62f736e7ca47]
+
+ * source/encoder/search.cpp:
+ search: nits
+ [6da134a66e0f]
+
+ * source/Lib/TLibCommon/TComMotionInfo.cpp,
+ source/Lib/TLibCommon/TComMotionInfo.h, source/encoder/analysis.cpp,
+ source/encoder/search.cpp:
+ mvfield: remove setAllMvField wrapper
+
+ When you see how the code unwound, certain optimizations may become
+ possible
+ [cdbb818ed024]
+
+ * source/Lib/TLibCommon/TComMotionInfo.h, source/common/cudata.cpp:
+ mvfield: remove setMvField() method
+ [c1453d40d1cf]
+
+ * source/Lib/TLibCommon/TComMotionInfo.cpp,
+ source/Lib/TLibCommon/TComMotionInfo.h, source/common/cudata.cpp,
+ source/common/cudata.h, source/common/framedata.cpp,
+ source/encoder/analysis.cpp:
+ mvfield: pass objects by reference, const when possible
+ [d5d7033e1a04]
+
+ * source/Lib/TLibCommon/TComMotionInfo.cpp,
+ source/Lib/TLibCommon/TComMotionInfo.h, source/common/cudata.cpp:
+ cudata: replace clearMvField() with m_partSet calls
+ [4831aa891f00]
+
+ * source/common/cudata.cpp:
+ cudata: copy mvfield data from CTU in copyFromPic (more --rd 0
+ fixes)
+ [b0c6b3a9ccf4]
+
+ * source/encoder/entropy.cpp:
+ entropy: remove only reference to TComCUMvField outside of cudata.h
+ [564f07d41a6e]
+
+ * source/encoder/search.cpp:
+ search: avoid unnecessary memcopies in inter prediction
+
+ Calling setAllMvField() twice is wasteful. At least one, perhaps
+ both, of them will be overwritten by the final prediction. Better to
+ set the REF_NOT_VALID manually for uni-prediction on the 'other'
+ list. 
+ [f3bd6e5a880a]
+
+ * source/Lib/TLibCommon/CommonDef.h,
+ source/Lib/TLibCommon/TComMotionInfo.cpp,
+ source/Lib/TLibCommon/TComMotionInfo.h, source/common/cudata.cpp,
+ source/encoder/search.cpp:
+ defs: remove last traces of REF_PIC_LIST enums, NOT_VALID ->
+ REF_NOT_VALID
+ [ed57e2e5c2b6]
+
+ * source/Lib/TLibCommon/TComMotionInfo.h, source/common/cudata.cpp:
+ mvfield: remove getRefIdx and getMvd access methods
+ [7b94b7de0af5]
+
+ * source/Lib/TLibCommon/TComMotionInfo.h, source/common/cudata.cpp,
+ source/common/deblock.cpp, source/common/predict.cpp:
+ mvfield: remove getMv access method
+ [eec157891d46]
+
+ * source/Lib/TLibCommon/TComMotionInfo.cpp,
+ source/Lib/TLibCommon/TComMotionInfo.h, source/encoder/analysis.cpp:
+ mvfield: remove default arguments, use consistent variable names,
+ cleanup
+ [a453285d756d]
+
+ * source/Lib/TLibCommon/TComMotionInfo.cpp,
+ source/Lib/TLibCommon/TComMotionInfo.h, source/encoder/entropy.cpp,
+ source/encoder/search.cpp:
+ mvfield: class to struct, remove setMvd method
+ [7b17ecc90937]
+
+ * doc/reST/cli.rst:
+ docs: improve documentation for tskip options
+ [8aa71d43db99]
+
+ * source/encoder/search.cpp:
+ search: fix --rdpenalty 2, make logic explicit (closes #71)
+ [ca0090c8fc69]
+
+2014-10-21 Murugan Vairavel
+
+ * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-
+ util.h, source/common/x86/pixel-util8.asm:
+ asm: avx2 asm code for 8bpp and 16bpp version of scale1D_128to64
+ module
+ [47095aafe91a]
+
+2014-10-21 Praveen Tiwari
+
+ * source/common/x86/pixel-util8.asm:
+ weight_sp: sse version of asm code optimization
+ [2cb8cdaa7df5]
+
+ * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-
+ util.h, source/common/x86/pixel-util8.asm:
+ weight_pp: avx2 asm code as per new interface
+ [b6bd42615b37]
+
+2014-10-21 Steve Borho
+
+ * source/encoder/analysis.cpp, source/encoder/search.cpp:
+ fix checked build errors
+ [61ce6f790f25]
+
+ * source/common/picyuv.cpp, source/common/picyuv.h,
+ 
source/encoder/encoder.cpp, source/encoder/encoder.h: + picyuv: cache offset arrays in the top-level encoder + + All PicYuv generated for a given encoder would generate the same + offset arrays, so they might as well all point to the same memory + [3465ef1eb7fc] + + * source/common/cudata.cpp, source/common/cudata.h, + source/encoder/analysis.cpp: + cudata: add a helper function for not-present CUs + [f8c3748eb24d] + + * source/common/cudata.h: + cudata: comment nits + [7a2b895d7577] + + * source/encoder/search.cpp: + search: move auto var initialization to avoid goto warning + [884bb04709a4] + + * source/common/cudata.cpp, source/common/cudata.h, + source/common/deblock.cpp, source/common/quant.cpp, + source/encoder/analysis.cpp, source/encoder/entropy.cpp, + source/encoder/frameencoder.cpp, source/encoder/framefilter.cpp, + source/encoder/search.cpp: + cudata: consistent naming rules for part data (singular, no b prefix + for flags) + [943ebf4f2cf4] + + * source/common/cudata.cpp: + cudata: cleanup copy methods, fill in missing copies in + copyFromPic() + [70cb93a0fc26] + + * source/common/cudata.h: + cudata: reorder and white-space nits + [ea81e3999545] + + * source/common/cudata.cpp, source/common/cudata.h: + cudata: inline single caller of getPartPosition + [2e5df75752c4] + + * source/common/cudata.h: + cudata: consistent use of absPartIdx + [bdffb50ff2fd] + + * source/common/cudata.cpp, source/common/cudata.h, + source/encoder/analysis.cpp: + cudata: remove unused setCUTransquantBypassSubParts + [708b65ea888d] + + * source/common/cudata.cpp, source/common/cudata.h, + source/encoder/analysis.cpp, source/encoder/framefilter.cpp: + cudata: m_cuTransquantBypass will not be set without lossless being + enabled + [19baf4ddfe2f] + + * source/common/cudata.cpp: + cudata: nits + [e4fcdc4f802a] + + * source/common/cudata.cpp, source/common/cudata.h, + source/encoder/analysis.cpp, source/encoder/search.cpp: + cudata: define copy and broadcast set methods + + Pruned a 
couple of functions which were only called once and were
+ redundant or could have been done more simply.
+ [196c2544685b]
+
+ * source/common/cudata.cpp, source/common/cudata.h,
+ source/encoder/analysis.cpp, source/encoder/search.cpp:
+ cudata: simplify setPartSizeSubParts and friends
+
+ These fields are always broadcast-set to all sub-parts of the CU
+ [668dbdd70654]
+
+ * source/Lib/TLibCommon/TComDataCU.cpp,
+ source/Lib/TLibCommon/TComDataCU.h, source/common/CMakeLists.txt,
+ source/common/cudata.cpp, source/common/cudata.h,
+ source/common/deblock.cpp, source/common/deblock.h,
+ source/common/framedata.cpp, source/common/framedata.h,
+ source/common/predict.cpp, source/common/predict.h,
+ source/common/quant.cpp, source/common/quant.h,
+ source/encoder/analysis.cpp, source/encoder/analysis.h,
+ source/encoder/entropy.cpp, source/encoder/entropy.h,
+ source/encoder/frameencoder.cpp, source/encoder/frameencoder.h,
+ source/encoder/framefilter.cpp, source/encoder/sao.cpp,
+ source/encoder/search.cpp, source/encoder/search.h:
+ bring TComDataCU into common/ as CUData
+ [a560d44d2cbd]
+
+ * source/Lib/TLibCommon/TComDataCU.cpp,
+ source/Lib/TLibCommon/TComDataCU.h, source/common/predict.cpp,
+ source/common/predict.h, source/encoder/analysis.cpp,
+ source/encoder/analysis.h, source/encoder/entropy.cpp,
+ source/encoder/entropy.h, source/encoder/frameencoder.cpp,
+ source/encoder/frameencoder.h, source/encoder/framefilter.cpp,
+ source/encoder/search.cpp, source/encoder/search.h:
+ rebrand CU/cuData as CUGeom/cuGeom
+ [b7503f180eb4]
+
+ * source/encoder/encoder.cpp:
+ encoder: improve comment, frame encoders to not use the worker pool
+ [34c830359f33]
+
+ * source/Lib/TLibCommon/TComDataCU.cpp,
+ source/Lib/TLibCommon/TComDataCU.h, source/encoder/encoder.cpp,
+ source/encoder/frameencoder.cpp, source/encoder/frameencoder.h:
+ frameencoder: pre-calculate the set of unique geoms for the picture
+ size
+
+ Now the geom sets only need to be calculated once, and they 
occupy a + minimum amount of memory. + [fe480c4b66be] + + * source/encoder/frameencoder.h: + frameencoder: group pointer members together for better alignment + [ca1be14a7d81] + + * source/Lib/TLibCommon/TComDataCU.cpp: + cu: nits + [d76b2094d5e6] + + * source/Lib/TLibCommon/TComDataCU.cpp: + cu: use memset in setQPSubParts() + [0f452547704c] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h: + cu: inline single use of setSubPart into setInterDirSubParts + [50d30aaa9823] + + * source/Lib/TLibCommon/TComDataCU.h: + cu: remove unused enum NDBFBlockBorderTag + [902fdb066b70] + + * source/Lib/TLibCommon/TComDataCU.cpp: + cu: repair original intent of getLastCodedQP(), fix 73c6c9086577 + [8ea4c8a389cd] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, source/encoder/analysis.cpp, + source/encoder/analysis.h, source/encoder/entropy.cpp, + source/encoder/entropy.h, source/encoder/frameencoder.cpp: + cu: remove m_cuLocalData from CU, use child offsets instead of + absolute indices + + This commit also removes a redundant + 'tld.analysis.m_quant.setQPforQuant()' from frameencoder.cpp. This + belonged in compressCTU() and was actually one of the first things + it does. + [411149951603] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, source/encoder/frameencoder.cpp: + cu: pass m_cuLocalData to loadCTUData() + + This is prep-work for removing m_cuLocalData from TComDataCU and + having only a few instances of the array. 
+ [13d410a45434] + + * source/Lib/TLibCommon/TComDataCU.cpp: + cu: style nits, no behavior change + [8ab3817fcf97] + + * source/encoder/search.cpp, source/encoder/search.h: + search: m_qtTempCoeff[ttype][qtLayer] -> + m_rqt[qtLayer].tmpCoeff[ttype] + [f7f4d9b59430] + + * source/encoder/search.cpp, source/encoder/search.h: + search: m_qtTempShortYuv[qtlayer] -> m_rqt[layer].tmpShortYuv + + one less malloc to fail + [5a3e8a4a51de] + + * source/encoder/search.cpp, source/encoder/search.h: + search: move inter search temp Yuv buffers into RQTData + + This makes their allocation (and stride) per depth, which is + hopefully a little more cache friendly + [062c06517722] + + * source/encoder/analysis.cpp, source/encoder/search.cpp, + source/encoder/search.h: + search: rename m_rdContexts to m_rqt since its purpose has expanded + [1d3b861ff5bb] + +2014-10-21 gopi jayaraman + + * source/encoder/search.cpp: + search: --pme bug fixes with slave threads tie up case + [f5fc662b07cb] + +2014-10-20 Steve Borho + + * source/encoder/search.cpp: + search: move the destroy() method to after initSearch() + [60633acf5a3a] + + * source/common/frame.cpp: + frame: initialize m_reconPicYuv pointer + [4cff05a46557] + +2014-10-21 Deepthi Nandakumar + + * source/encoder/search.cpp: + search: make split choice logic cleaner + [e66f78a6df4f] + + * source/encoder/search.cpp: + search: rename variables to mightSplit and mightNotSplit + [b507a636f7e6] + +2014-10-20 Steve Borho + + * source/encoder/analysis.cpp, source/encoder/search.cpp, + source/encoder/search.h: + search: use small struct to accumulate costs + [535b9ca17dd3] + + * source/encoder/search.cpp: + search: remove shadow variable + [06f5d1594eca] + + * source/encoder/search.cpp: + search: improve comments + [dad2c503c21e] + + * source/encoder/search.cpp: + search: avoid redundant work in typical path + [0dd4b62f7331] + + * source/encoder/search.cpp, source/encoder/search.h: + search: inline single caller of xGetIntraBitsLuma, avoid 
extra + copies + [ce76838b769f] + + * source/encoder/search.cpp, source/encoder/search.h: + search: inline single call of calcIntraLumaRecon + + This allows some much needed clarity and to avoid some redundant + work + [791d9a3ad651] + + * source/encoder/analysis.cpp, source/encoder/search.cpp, + source/encoder/search.h: + search: rename xRecurIntraCodingQT to codeIntraLumaQT, save a bit of + work + [2cb88ddefc43] + + * source/encoder/search.cpp: + search: combine xRecurIntraCodingQT() bCheckFull sections + [8eae86316959] + + * source/encoder/search.cpp: + search: add tmpBits to xRecurIntraCodingQT + [4a0498664e3f] + + * source/encoder/search.cpp: + search: remove remnants of tqbypass from xRecurIntraCodingQT, + improve var names + [f81c1e3a8788] + + * source/encoder/search.cpp, source/encoder/search.h: + search: create a per-depth temp buffer for temporary recon blocks + [9642b0dc0798] + + * source/encoder/search.cpp: + search: fix --rdpenalty=2 at --rd 0,1 + + It needed the same fix for not skipping 32x32 if TU recursion is too + restricted + [0995d74b9b9c] + + * source/encoder/search.cpp: + search: use member variable access to current slice and frame, not + the cu's + [8d170703b186] + + * doc/reST/cli.rst, source/encoder/search.cpp: + search: clarify --rdpenalty + [d0725a830b8d] + + * source/encoder/search.cpp: + search: simplify checkTransformSkip logic + + drop !!qp condition, this doesn't appear necessary. we already check + for TQ bypass + [6df8c27c184a] + + * source/encoder/encoder.cpp: + encoder: make assignments to Frame::m_intraData and m_interData + unconditional + + The check doesn't help anyone. If pic_in->analysisData.intraData was + NULL, the Frame fields are entirely uninitialized. 
+ [b874ac8f5427] + + * source/encoder/search.cpp: + search: remove redundant clearCbf call + + The logic would not get here if cu->getQtRootCbf(0) didn't return + zero, which implies all three CBF flags are already zero + [5f15e0a7ce9a] + + * source/encoder/analysis.cpp, source/encoder/analysis.h, + source/encoder/search.cpp, source/encoder/search.h: + search: move checkIntra from Analysis to Search + + This function didn't belong in Analysis + [42bc2e852217] + + * source/encoder/search.cpp, source/encoder/search.h: + search: rename xSetResidualQTData to saveResidualQTData and simplify + [7c5713ba1712] + + * source/encoder/search.cpp, source/encoder/search.h: + search: do not return CBF=0 distortion from xEstimateResidualQT() + + it can be calculated easily if needed + [1882fb0d0d53] + + * source/encoder/analysis.cpp: + analysis: nit + [3fad706c702e] + + * doc/reST/cli.rst: + doc: update description of --cu-lossless + [e16426ff0679] + + * source/encoder/analysis.cpp: + analysis: do not try lossless if best mode had no distortion + [eb284bc32580] + + * source/encoder/analysis.cpp: + analysis: copy inter prediction when evaluating lossless + [60bf2a917454] + +2014-10-20 Aarthi Thirumalai + + * source/encoder/ratecontrol.cpp: + rc: fix error in vbv due to access of unreferenced refFrame data + [6d1b8b8c0d7a] + +2014-10-20 Praveen Tiwari + + * source/common/pixel.cpp, source/common/primitives.h, + source/common/x86/pixel-util.h, source/common/x86/pixel-util8.asm, + source/encoder/reference.cpp, source/encoder/slicetype.cpp, + source/encoder/weightPrediction.cpp, source/test/pixelharness.cpp: + weighted prediction pixel, interface simplification + [b2f534e54325] + +2014-10-20 Gopu Govindaswamy + + * source/encoder/analysis.cpp: + analysis: share the depth, best modes and partitions based on cuAddr + - bug fix + [87515a42e79c] + +2014-10-20 Steve Borho + + * source/Lib/TLibCommon/TComDataCU.cpp: + cu: clear transform skip flags in lossless CU copy + [18d9c6d2f212] + + * 
source/Lib/TLibCommon/TComDataCU.h:
+ cu: re-order part data defines to match allocation order (nit)
+ [562f844b8f5a]
+
+ * source/Lib/TLibCommon/TComDataCU.cpp,
+ source/Lib/TLibCommon/TComDataCU.h, source/encoder/analysis.cpp,
+ source/encoder/analysis.h, source/encoder/search.cpp:
+ analysis: try lossless encode of only the best mode for each CU
+
+ This involved some subtle changes in behavior. TComDataCU::initCTU
+ and initSubCU now initialize m_cuTransquantBypass to the global
+ --lossless flag, so in the first pass of analysis it will always be
+ all non-lossless (typical) or all lossless (when --lossless is
+ enabled).
+
+ Before this change, when "--cu-lossless and !--lossless" the encoder
+ would try both lossless and non-lossless coding of every option
+ evaluated for RDO. This roughly doubled the work performed by the
+ encoder, and because the same TComDataCU instance was being used for
+ both, it was fragile
+
+ After this change, the encoder will only try lossless once per CU
+ using the best mode found during non-lossless RDO, using a separate
+ TComDataCU instance.
+ [cf127f22ef3b]
+
+ * source/encoder/analysis.cpp, source/encoder/search.cpp,
+ source/encoder/search.h:
+ search: --pme needs master instance to be passed to slave function
+
+ The slave was reading its own m_listSelBits instead of those pre-
+ computed by the master, and then even worse it was updating its own
+ m_bestME (holding its own output lock) instead of the master's. 
+ [5179bb833cea] + + * source/encoder/analysis.cpp: + analysis: nits + [c02dda2693b9] + + * source/encoder/analysis.cpp: + analysis: only call checkDQP once per CU, use a consistent reconPic + write policy + [089e256fb32a] + +2014-10-20 Deepthi Nandakumar + + * source/encoder/search.cpp: + search: cleanup + [7eab67ffff81] + +2014-10-20 Steve Borho + + * source/common/framedata.cpp, source/common/framedata.h: + framedata: add pointer to active param structure for that picture + + today, this is always the single global param. in the future it may + be different + [c15bb6a0d01f] + +2014-10-19 Steve Borho + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, source/common/framedata.h, + source/encoder/analysis.cpp, source/encoder/frameencoder.cpp: + cu: move more statistics into FrameData + [32b5ab08cb51] + + * source/encoder/search.cpp: + search: nit + [1e09d0395826] + + * source/encoder/analysis.cpp: + analysis: fix for --rd 0, always generate chroma prediction for best + merge + [75e42b9db526] + +2014-10-20 Deepthi Nandakumar + + * source/common/common.h, source/common/pixel.cpp: + pixel: add signed accumulators, these were causing errors in new + primitives + [d24d7c1a43f5] + + * source/encoder/analysis.cpp: + analysis: split flag cost is added only if the depth is less than + max depth + + removes (some) CHECKED_BUILD errors. + [d1cd4a753d9e] + +2014-10-19 Steve Borho + + * source/encoder/analysis.cpp: + analysis: add emergency intra check for --pmode + + if all other choices somehow fail, we should always be able to code + intra + [9da983792c7d] + + * source/encoder/frameencoder.cpp: + frameencoder: handle non-present CUs correctly when collecting stats + [b575d5bf01e8] + + * source/encoder/analysis.cpp: + analysis: handle no-valid-merge more cleanly, simplify + compressInterCU_rd0_4 + + checkMerge2Nx2N_rd0_4() will set md.bestMode if any of them were + valid, else it will be left NULL. 
Fixes some rare decoder asserts + when no valid modes is coded + [84933c3136ec] + +2014-10-18 Steve Borho + + * source/encoder/ratecontrol.cpp: + rc: fix error message + [a480618a85a3] + + * source/encoder/ratecontrol.cpp: + rc: use curEncData.m_numCUsInFrame + + Note that the CU count was wrongly calculated before as height * + height, broken by me a couple days ago in 14d5345e257a + [426a279be2b8] + + * source/common/framedata.cpp, source/common/framedata.h, + source/encoder/frameencoder.cpp, source/encoder/ratecontrol.cpp, + source/encoder/slicetype.cpp: + framedata: change 'structure of arrays' to 'array of structures' for + RC stats + + This uses fewer mallocs, fewer memsets, and generally better data + locality. It doesn't hurt that the code is more readable. + [6a5d1543b769] + + * source/common/frame.cpp, source/common/frame.h, + source/common/framedata.cpp, source/common/framedata.h, + source/encoder/encoder.cpp, source/encoder/frameencoder.cpp, + source/encoder/ratecontrol.cpp, source/encoder/slicetype.cpp: + frame: move rate control data to FrameData, cleanup variable names + [16cf099c8a78] + + * source/common/framedata.cpp: + framedata: fix license header + [3810e534830c] + + * source/encoder/framefilter.cpp, source/encoder/sao.cpp, + source/encoder/sao.h: + sao: move lossless sample restoration functions to framefilter.cpp + + These are only called by the framefilter code, and are not directly + related to SAO itself. 
+ [99fccc2a5063] + +2014-10-15 Satoshi Nakagawa + + * source/common/common.h, source/encoder/frameencoder.cpp, + source/encoder/sao.cpp, source/encoder/sao.h: + sao: refine sao merge mode + [f38d218c62d4] + +2014-10-17 Ashok Kumar Mishra + + * source/encoder/search.cpp: + search: removed redundant context store + [0a94dccf7c9a] + + * source/encoder/entropy.cpp, source/encoder/entropy.h, + source/encoder/search.cpp: + search: removed redundant getAllowedChromaDir function call + [15a7b8d0698e] + +2014-10-18 Steve Borho + + * doc/reST/cli.rst, source/common/param.cpp, + source/encoder/analysis.cpp: + analysis: allow --b-intra to work with RD levels < 5 (no behavior + change) + + This changes no existing presets. This flag used to default to be + on, but it was ignored at RD levels 0..4. Now it defaults to off, + but is enabled by all the presets that use RD 5 and 6. This commit + re-orders the tools log line to place similar options near each + other + [b9e3cec471c7] + + * source/encoder/CMakeLists.txt: + cmake: use -Wno-uninitialized, to avoid confusing clang or older GCC + [66687fc129ff] + +2014-10-17 Steve Borho + + * source/common/param.cpp: + param: do not enable fast CBF at faster presets (no behavior change) + + This flag has never had any effect at the RD levels these presets + use, so this never had any effect, except showing 'cfm' in the + logged tools list. 
+ [6e7567637dcb] + + * source/encoder/analysis.cpp: + analysis: fix --pmode crash, do not look at intra results if not + P_SLICE + [0123cf455f17] + + * doc/reST/cli.rst, source/encoder/analysis.cpp, + source/encoder/analysis.h: + analysis: allow --amp to actually work with RD levels < 5 + + This changes none of the presets or defaults, it just allows + --preset medium --rect --amp to perform as the user would expect + [ff8fb2e06847] + + * source/encoder/analysis.cpp, source/encoder/analysis.h, + source/encoder/encoder.cpp: + analysis: don't pass global variables to create methods + [03f5fcca39be] + + * source/encoder/analysis.cpp: + analysis: nits + [8bc599ad83ea] + + * source/encoder/analysis.cpp: + analysis: consistent use of subPartIdx + [e089d34046d5] + + * source/encoder/analysis.cpp: + analysis: streamline parallelModeAnalysis + [807ef6c02d64] + + * source/common/predict.cpp, source/common/predict.h: + predict: keep ref indices, rather than mvField pointers + [3f100d527ab3] + + * source/common/CMakeLists.txt: + cmake: remove source group for TLibCommon/ in MSVC + + Allow the remaining files to be grouped with the rest of common/ for + now + [73ed4c1d2387] + + * source/test/CMakeLists.txt, source/test/testharness.h: + cmake: newer GCC/MinGW define __rdtsc and do not like our + redefinition + [b6eb92d35ccb] + + * source/encoder/CMakeLists.txt: + cmake: quiet -Wmaybe-uninitialized warnings in encoder/ + + gcc >= 4.8 is catching false positives from Clip3 + (std::min<>(std::max<>)) when the arguments are all known at compile + time. Just shut-up, please. 
+ [c818548cf7c1] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComPicSym.cpp, + source/Lib/TLibCommon/TComPicSym.h, source/common/CMakeLists.txt, + source/common/deblock.cpp, source/common/frame.cpp, + source/common/frame.h, source/common/framedata.cpp, + source/common/framedata.h, source/common/piclist.cpp, + source/common/predict.cpp, source/common/slice.cpp, + source/encoder/analysis.cpp, source/encoder/dpb.cpp, + source/encoder/dpb.h, source/encoder/encoder.cpp, + source/encoder/entropy.cpp, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h, source/encoder/framefilter.cpp, + source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h, + source/encoder/sao.cpp, source/encoder/search.cpp, + source/encoder/slicetype.cpp, source/encoder/weightPrediction.cpp: + pull TComPicSym into common/ as FrameData + + Document the nature in how these are reused by multiple pictures. + Frame::m_picSym was renamed to Frame::m_encData to make the + relationship more clear. + [f3ede27baeee] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComPicSym.cpp, + source/Lib/TLibCommon/TComPicSym.h, source/common/deblock.cpp, + source/common/lowres.h, source/common/picyuv.cpp, + source/common/picyuv.h, source/common/pixel.cpp, + source/common/predict.cpp, source/common/predict.h, + source/common/primitives.h, source/common/slice.cpp, + source/encoder/analysis.cpp, source/encoder/encoder.cpp, + source/encoder/entropy.cpp, source/encoder/frameencoder.cpp, + source/encoder/framefilter.cpp, source/encoder/ratecontrol.cpp, + source/encoder/sao.cpp, source/encoder/slicetype.cpp, + source/encoder/weightPrediction.cpp: + picsym: remove trivial access methods, cleanup + + Remove redundant data, use intptr_t more consistently for picture + strides. Yuv stride can stay as int, the strides are never more than + 64 bytes. 
Use uint32_t for block counters + + Rename getCU() as getPicCTU() for clarity + [14d5345e257a] + + * source/common/picyuv.cpp: + picyuv: fix 16bpp warning + [7cccfdf67502] + +2014-10-17 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel- + util8.asm: + asm: avx2 assembly code for 16bpp version of transpose(8, 16, 32 and + 64) + [f590933a138a] + +2014-10-17 Steve Borho + + * source/encoder/analysis.cpp: + analysis: actually count numRefs, combine depth counters + [53f8765fadbd] + + * source/encoder/analysis.cpp: + analysis: further simplify top-skip math + + By multiplying both sides by (cuData.numPartitions >> 2) the dynamic + range of the algorithm is increased by up to 8 bits. Removing the + intermediate value allow the combination of the two conditional + expressions + [0829d64dd762] + + * source/encoder/analysis.cpp: + analysis: fix msvc warnings + [5eb2916cc8ed] + + * source/encoder/analysis.cpp: + analysis: prevent redundant recon work for --rd 3 and 4 + [daf4e836f261] + + * source/Lib/TLibCommon/TComDataCU.cpp: + cu: cleanup getPartIndexAndSize() + [56fe32e3b7eb] + +2014-10-16 Steve Borho + + * source/encoder/analysis.cpp: + analysis: fix for --rd 0, it does not generate recon (just + prediction) + [6b0b2b6cd3b5] + + * source/encoder/analysis.cpp, source/encoder/analysis.h: + analysis: split --pmode into its own recursive analysis function + + This simplifies compressInterCU_rd0_4() and allows us to limit + --pmode to sane rd levels 2, 3, and 4. 
+ [90fffd9a337f] + + * source/encoder/analysis.cpp: + analysis: it should only be necessary to copy recon when not split + [bad798574b22] + + * source/encoder/analysis.cpp: + analysis: combine stat collections at the end of CU analysis + [c53f9f7df61e] + + * source/encoder/analysis.cpp, source/encoder/analysis.h: + analysis: move recursion early-out logic to a helper function + [6e772638b9e3] + + * source/encoder/analysis.cpp, source/encoder/analysis.h: + analysis: move topSkip into a helper function + + This will allow us to break up compressInterCU_rd0_4 + [774ab320f9c9] + + * source/encoder/analysis.cpp: + analysis: simplify addSplitFlagCost slightly + [35c589d92f51] + + * source/encoder/analysis.cpp: + analysis: fix typo in addSplitFlagCost + [d5777c2b4179] + + * source/Lib/TLibCommon/TComDataCU.h, source/encoder/entropy.cpp: + cu: remove trivial getCtxInterDir() + [f33aad115fa1] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h: + cu: cleanup more copy and init methods + [79a8b694ae25] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, source/encoder/frameencoder.cpp: + cu: streamline initCTU and initSubCU + [cb5875b1e89f] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, source/encoder/frameencoder.cpp: + cu: rename initCU() to initCTU() + [d7496c92aa33] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, source/encoder/frameencoder.cpp: + cu: pass initial QP to initCU + [cfe4c070c59d] + + * source/Lib/TLibCommon/TComDataCU.h, source/encoder/analysis.cpp, + source/encoder/entropy.cpp, source/encoder/search.cpp: + cu: remove merge index wrappers, document how mvp idx is reused + [2febbee05401] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h: + cu: remove unused isFirstAbsZorderIdxInDepth + [b0c8b1ab5603] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, 
source/encoder/entropy.cpp, + source/encoder/search.cpp: + cu: remove m_mvpIdx access methods + [d790ac5a2ed9] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h: + cu: store each CU's data sequentially in memory + + This should be more cache friendly + [9d489eb535d3] + + * source/encoder/analysis.cpp, source/encoder/search.cpp, + source/encoder/search.h: + search: use consistent absPartIdx for PU offset + [44d321c78d21] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, source/encoder/analysis.cpp, + source/encoder/entropy.cpp, source/encoder/frameencoder.cpp, + source/encoder/search.cpp: + cu: remove m_lumaIntraDir access methods + [776da39f9999] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, source/encoder/entropy.cpp, + source/encoder/search.cpp: + cu: remove m_chromaIntraDir access methods + [71d6ff5b0e09] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, source/encoder/entropy.cpp: + cu: remove interDir access methods + [c46d2e9e43ef] + + * source/Lib/TLibCommon/TComDataCU.h: + cu: white-space cleanups + [c9978e9f94dd] + + * source/Lib/TLibCommon/TComDataCU.h: + cu: make setSubPart protected, group getSCUAddr() with encode helper + functions + [ff8a34e35d8d] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, source/encoder/analysis.cpp, + source/encoder/entropy.cpp, source/encoder/search.cpp: + cu: remove m_bMergeFlags access methods + [72b5cc8ea5f7] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, source/encoder/analysis.cpp, + source/encoder/search.cpp: + cu: remove m_cbf access methods + [2cd6eec80775] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, source/common/deblock.cpp, + source/common/quant.cpp, source/encoder/entropy.cpp, + source/encoder/search.cpp: + cu: remove m_trIdx access methods + [404a8abfbad5] + + * 
source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, source/common/deblock.cpp, + source/common/predict.cpp, source/encoder/analysis.cpp, + source/encoder/entropy.cpp, source/encoder/search.cpp: + cu: remove m_log2CUSize access methods + [fc42e88982af] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, source/common/quant.cpp, + source/encoder/analysis.cpp, source/encoder/entropy.cpp, + source/encoder/frameencoder.cpp, source/encoder/search.cpp: + cu: remove m_predModes access methods + [9f51ef2afdc6] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, source/encoder/analysis.cpp: + cu: remove m_skipFlag access methods + [99315bb8142c] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, source/encoder/entropy.cpp, + source/encoder/search.cpp: + cu: remove m_transformSkip access methods + [cdeaea376021] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, source/common/deblock.cpp, + source/common/quant.cpp, source/encoder/analysis.cpp, + source/encoder/entropy.cpp, source/encoder/search.cpp: + cu: remove m_cuTransquantBypass access methods + [b8938daa0e69] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, source/common/deblock.cpp, + source/encoder/analysis.cpp, source/encoder/entropy.cpp, + source/encoder/frameencoder.cpp, source/encoder/search.cpp: + cu: remove m_partSizes access methods, rename copyCodedToPic() to + updatePic() + [263c8b73b7aa] + + * source/Lib/TLibCommon/TComDataCU.h: + cu: remove wrong or useless comments + [e44ce1d54da3] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h: + cu: remove x prefixes + [c16837999997] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, source/common/deblock.cpp, + source/common/quant.cpp, source/encoder/analysis.cpp, + source/encoder/entropy.cpp, source/encoder/search.cpp: + cu: 
remove m_qp access methods + [f1064a18fd1d] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, source/common/deblock.cpp, + source/encoder/analysis.cpp, source/encoder/entropy.cpp, + source/encoder/frameencoder.cpp, source/encoder/sao.cpp, + source/encoder/search.cpp: + cu: remove m_depth access methods + [ba3862977dfc] + + * source/Lib/TLibCommon/TComDataCU.h, source/encoder/analysis.cpp, + source/encoder/search.cpp: + cu: remove getCUMvField + [6c7bee958e2a] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, source/encoder/search.cpp: + cu: remove trivial access methods for m_trCoeff + [47ea715be755] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, source/common/deblock.cpp, + source/encoder/search.cpp: + cu: unify array data types to uint8_t, group like methods together + [a17e618c20cc] + + * source/encoder/search.cpp: + search: use pre-calculated partSize + [bedbad51e3de] + + * source/encoder/rdcost.h, source/encoder/search.cpp, + source/encoder/search.h: + Merge + [59096255eafe] + + * source/encoder/analysis.cpp, source/encoder/search.cpp, + source/encoder/search.h: + search: add sa8dBits field to avoid conflict with RDO's totalBits + [7db432da4269] + + * source/encoder/frameencoder.cpp: + frameEncoder: fix eoln + [15fd6c844410] + + * source/encoder/search.cpp, source/encoder/search.h: + search: getBlkBits uses no data members, could be a static method + [323876f98fbf] + + * source/encoder/search.cpp: + analysis: rename variables for consistency + [78a112dc65fd] + + * source/encoder/analysis.cpp: + analysis: fix cut-paste bug in early-out code + [71173ff67162] + + * source/common/frame.cpp: + frame: m_totalBitsPerCTU must be allocated even if AQ is disabled + [9d4b9867a685] + + * source/encoder/analysis.cpp: + analysis: fix for --rd 0 check failure + [643c6dc10f99] + +2014-10-15 Steve Borho + + * source/encoder/encoder.cpp: + Merge + [96c144bf9b3f] + + * 
source/encoder/analysis.cpp: + analysis: minimal documentation for RD levels + [8fe07f3a2311] + + * source/encoder/analysis.cpp: + analysis: --rd 1 needs reconYuv to be copied from sub-cu + [abc464814a26] + + * source/encoder/analysis.cpp: + analysis: --rd 1 needs reconYuv to be copied to reconPic + [9e623a96246a] + + * source/encoder/analysis.cpp: + analysis: move temp array out of conditional + + Keeping a pointer to a stack array is not well defined behavior. + Renamed buf_trans to bufTrans, also + [3930ee35b182] + + * source/encoder/analysis.cpp: + analysis: only --rd 0 should use encodeResidue() + [ea7d008b9457] + + * source/encoder/analysis.cpp: + analysis: at --rd 1, split and best modes only have sa8d cost + [6861366e673f] + + * source/encoder/analysis.cpp: + analysis: do not call checkDQP() for --rd 0, the CU is not encoded + [ec1595bc8a76] + + * source/encoder/analysis.cpp: + analysis: fix hash mistakes at --rd 2 + + Do not re-encode merge blocks, as we learned with --rd 6 this does + not work without re-initializing skip flags. But since the CU was + already coded once there is no need to repeat the work. 
+ [6d8bb90381fd] + + * source/common/shortyuv.h, source/common/yuv.h: + yuv: short descriptions + [3d8d0dcd1ef9] + + * source/common/yuv.cpp, source/common/yuv.h: + yuv: consistent variable naming for clarity + + absPartIdx is always the part index (in zorder) of a CU/PU/TU within + a CTU + [e6892e7c73bc] + + * source/common/yuv.cpp, source/common/yuv.h: + yuv: inline addClip sub-functions + [632305ea202e] + + * source/common/yuv.h: + yuv: nit + [2f1d633afec4] + + * source/common/shortyuv.h: + shortyuv: remove width argument to getChromaAddrOffset + [a66f34e6bcd7] + + * source/common/predict.cpp, source/common/shortyuv.cpp, + source/common/shortyuv.h, source/common/yuv.cpp, + source/encoder/analysis.cpp, source/encoder/search.cpp: + shortyuv: take only a single size (width == height) + [bfd27a43b034] + + * source/common/shortyuv.cpp, source/common/shortyuv.h: + shortyuv: remove m_height and m_cheight + [d60c862b9ae1] + + * source/common/yuv.h: + yuv: nits + [84c610604cc0] + + * source/common/yuv.cpp, source/common/yuv.h: + yuv: remove width argument to getChromaAddrOffset + [469c92ffccd3] + + * source/common/predict.cpp, source/common/shortyuv.cpp, + source/common/yuv.cpp, source/common/yuv.h, + source/encoder/analysis.cpp, source/encoder/search.cpp: + yuv: take only single size (width == height) + [171267587546] + + * source/encoder/encoder.cpp: + encoder: ensure TLD.nr is initialized + [829ebca8ff0c] + + * source/common/yuv.cpp, source/common/yuv.h: + yuv: remove m_height and m_cheight + [058fb0238cb5] + + * source/common/yuv.cpp, source/common/yuv.h: + yuv: remove unused clear() method + [8a1563987a01] + + * source/common/frame.cpp, source/common/frame.h, + source/encoder/frameencoder.cpp, source/encoder/ratecontrol.cpp: + rc: fix for ABR behavior - we need frame->m_totalBitsPerCTU to + replace CU stats + + m_totalBitsPerCTU always needs to be allocated and filled in with + the total bit size of each CU, for VBV and non-VBV modes to function + correctly + 
[ff1123105f64] + + * source/common/common.h, source/common/frame.cpp, + source/common/frame.h, source/common/quant.cpp, + source/encoder/analysis.cpp, source/encoder/analysis.h, + source/encoder/encoder.cpp, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h, source/encoder/search.cpp, + source/encoder/search.h: + Merge + [999815f31962] + +2014-10-13 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel- + util.h, source/common/x86/pixel-util8.asm: + asm: avx2 assembly code for 8bpp version of transpose(8, 16, 32 and + 64) + [51416472d14c] + +2014-10-14 Steve Borho + + * source/encoder/search.cpp: + search: add a couple TODO comments + [0161608f481a] + + * source/encoder/search.cpp: + search: make encodeResAndCalcRdInterCU() a bit more readable + [7bb09720f59c] + + * source/encoder/search.cpp: + search: fix one obviously wrong psy-rd check + + If measuring psy cost of CBF=0, you measure fenc against pred, not + against zero + [ae8bbb881b9e] + + * source/encoder/search.cpp: + search: remove I slice checks from encodeResAndCalcRdInterCU + + We do not call this function for I slices, and it already checks for + it + [b78070d1eb47] + + * source/encoder/search.cpp: + search: fix initial value of bCodeDQP in encodeResAndCalcRdInterCU + [6c9b28899c8f] + + * source/encoder/search.cpp, source/encoder/search.h: + search: inline single caller of getInterSymbolBits() + + the code now uses temp vars instead of modifying the mode costs + directly, since the final mode costs are set at the end of this + function + [9d435b9a76ad] + + * source/encoder/search.cpp, source/encoder/search.h: + search: inline single caller of xLoadIntraResultChromaQT + [47761a49a3cf] + + * source/encoder/search.cpp, source/encoder/search.h: + search: inline single caller of xLoadIntraResultQT + [1c9566ae8dae] + + * source/encoder/search.cpp, source/encoder/search.h: + search: inline single caller of xEncIntraHeaderLuma + [27b996ccb80b] + + * 
source/encoder/search.cpp, source/encoder/search.h: + search: inline single caller of xEncIntraHeaderChroma + [532a9a6f4713] + + * source/encoder/search.cpp: + search: reorder xRecurIntraChromaCodingQT() for clarity + [bcc8b4f42f53] + + * source/encoder/search.cpp, source/encoder/search.h: + search: inline single callers of xGetIntraBitsChroma and + xGetIntraBitsQTChroma + [983e96789f5e] + + * source/encoder/sao.cpp: + sao: fix eoln + [d27721f55ea5] + + * source/common/common.h: + common: fix eoln, fix merge bug + [a30d82d4d2e7] + + * Merge + [badc6dec6b34] + +2014-10-14 Deepthi Nandakumar + + * source/Lib/TLibCommon/TComRom.h, source/common/common.h, + source/encoder/frameencoder.cpp: + noiseReduction: replace magic values with readable names + [961240f1ac16] + +2014-10-13 Deepthi Nandakumar + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComPicSym.cpp, source/encoder/analysis.cpp, + source/encoder/sao.cpp, source/encoder/search.cpp, + source/encoder/search.h: + cu-lossless: remove redundant lossless buffer + + The pixel values in lossless mode can be obtained from the original + yuv buffer itself. + [207d1e432240] + +2014-10-14 Steve Borho + + * source/encoder/analysis.cpp: + analysis: match old merge analysis behavior at --rd 5,6 + [8ecb949326f9] + + * source/encoder/analysis.h: + analysis: reorder prediction modes for easier initialization at RD + <= 4 + [2542439de2e6] + + * source/Lib/TLibCommon/TComDataCU.h, source/common/deblock.cpp, + source/encoder/analysis.cpp, source/encoder/analysis.h, + source/encoder/frameencoder.cpp: + cu: make CU's frame pointer const, so it's clearly read-only to CU + logic + + Required passing a non-const Frame pointer to compressCTU, which is + fine since the frame encoder by definition owns the non-const + object. 
+ [82f22df1bb64] + + * source/Lib/TLibCommon/TComDataCU.h, source/common/deblock.cpp, + source/common/predict.cpp, source/common/predict.h, + source/encoder/entropy.cpp, source/encoder/search.cpp: + cu: make CU's slice pointer const, so it's clearly read-only to CU + logic + [fb8b65d7812f] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, source/encoder/analysis.cpp, + source/encoder/search.cpp: + cu: return merge candidate count from getInterMergeCandidates() + + do not set by reference + [18dbef3d92ea] + + * source/encoder/analysis.cpp: + analysis: fix --rd 6 hash mistakes, clear skip flags between merge + candidates + [12a5dfb5bfb9] + + * source/encoder/analysis.cpp, source/encoder/analysis.h: + analysis: inline deriveTestModeAMP + [b0df34295958] + + * source/encoder/analysis.cpp: + analysis: uhm, try that again + [f5c3121f88b4] + + * source/encoder/analysis.cpp: + analysis: checkIntra() is used by all slice types now + [6ed24d5e4a64] + + * source/encoder/search.cpp: + search: minor cleanups + [842c664c64ad] + +2014-10-13 Steve Borho + + * source/encoder/analysis.cpp: + analysis: simplify addSplitFlagCost(), encode split flag with mode's + context + [c025b698a2d5] + + * source/encoder/analysis.cpp: + analysis: add split flag cost to the split prediction (--rd 5/6) + [4ec03ab5a463] + + * source/encoder/search.cpp: + search: nits, remove debug memsets + [609cbaffdd9b] + + * source/encoder/analysis.cpp, source/encoder/search.cpp: + nits + [2e6bccf560f5] + + * source/encoder/analysis.cpp: + analysis: split out more logic for rdlevel 0 & 1 + + Eventually rdlevels 0 and 1 would have a different compress function + from rdlevels 2, 3 and 4 since they have much different logic around + split and cost decisions. 
But the first step should be to pull the + topSkip and avgCost into helper functions that could be used by both + functions (to avoid duplicating those features, which I expect will + get lots of attention in the coming weeks) + [898b47f1082f] + + * source/encoder/analysis.cpp: + analysis: re-introduce RD level 0 intra hack + + It didn't have a comment before, so I didn't understand its purpose. + Placing up a level and adding a comment makes it more clear + [64e516e6d865] + +2014-10-11 Steve Borho + + * source/encoder/analysis.cpp, source/encoder/analysis.h: + analysis: add a helper function to deal with split flag bits + + we can reasonably do something smart based on RD level in a + consistent manner. + * for high RD levels, we can account for split flag context states + * For low RD levels, we just increment bit counters + + this fixes the fact that the skip flag must be considered in the + mvBits stat since all other bits are considered coeff bits. This + should help 2-pass slightly + [ff2142620928] + + * source/encoder/analysis.h: + analysis: remove unused m_initialContexts + [9d2b67812d19] + + * source/encoder/analysis.h, source/encoder/frameencoder.h: + analysis: move StatisticLog to frameencoder.h + [817192eda811] + + * source/encoder/analysis.cpp, source/encoder/analysis.h: + analysis: add attributions for myself + [0024124be46a] + + * source/encoder/analysis.cpp: + analysis: remove redundant pointer check + [5461beda34e4] + + * source/encoder/search.cpp: + search: fix variable name shadow warnings + [4be77cb8bbea] + + * source/common/slice.h, source/encoder/entropy.cpp, + source/encoder/entropy.h, source/encoder/sao.cpp, + source/encoder/sao.h, source/encoder/search.h: + Merge + [20644db535dd] + + * source/encoder/analysis.cpp, source/encoder/analysis.h, + source/encoder/encoder.cpp, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h: + analysis: clarify and fix CTU stats logging, move to frame encoders + [60bf1877612d] + + *
source/encoder/analysis.cpp: + analysis: if avoiding skip analysis for lossless encodes, set max + rdcost + [825cd3ca0495] + + * source/encoder/analysis.cpp: + analysis: consistent naming of MC PU part index, try not to use temp + CU pointers + + Pointers to 'bestCU' tend to cause bugs; best to dereference + md.bestMode->cu directly in case the pointer moves. The compiler is + good about caching derefs + [c12b3abd8d92] + + * source/encoder/analysis.cpp: + analysis: merge only ever has one part, within checkMerge2Nx2N_rd0_4 + [b3fdd6dcf8b6] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, source/encoder/analysis.cpp, + source/encoder/analysis.h: + cu: fix sub-cu pel positioning + + The cuData.encodeIdx already included the sub-part offset during + splits (there is one cuData instance per sub-part, not per depth). + So it was wrong to pass the partUnitIdx to the next recursion depth. + + It was only being used at the next depth to pass to initSubCU() + which used it to calculate the relative pixel offsets from the + parent CU. But in the new code the parent CU is now the parent CTU, + and so the pel offsets must be calculated relative from the CTU pel + positions. This is what the g_zscanToPel? tables are for. + [c50ae2a64cd9] + + * source/encoder/analysis.cpp: + analysis: remove a temp variables used in only one place + [1192a7e45953] + + * source/Lib/TLibCommon/TComDataCU.cpp: + cu: remove useless numPartition variable from copyPartFrom() + [75e389fd9b3e] + + * source/encoder/analysis.cpp: + analysis: simplify handling of not-present sub-cus + [f8c0d8f0cc71] + + * source/encoder/analysis.cpp, source/encoder/analysis.h: + analysis: m_bEncodeDQP should never have been a member variable + + The value should always be initialized to m_slice->m_pps->bUseDQP + prior to calling m_entropyCoder.codeCoeff(), and then allow that + function to modify it as it recurses. 
+ [d08c691b11ad] + + * source/common/frame.cpp: + frame: fix uninitialized pointer + [06235f8119ca] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, source/encoder/analysis.cpp: + cu: make TComDataCU::copyPartFrom() take a const ref, cleanup + + this function did not need to copy m_cuAbove and friends + [69ae969b5d04] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, source/encoder/sao.cpp: + cu: remove getLumaOrigYuv() access methods + [88cec6cb806f] + + * source/encoder/analysis.cpp, source/encoder/analysis.h, + source/encoder/search.cpp, source/encoder/search.h: + analysis: more ref religion + [44dacb1a44de] + + * source/encoder/analysis.cpp: + analysis: use m_slice consistently, not the pointer in the CU + [5b15e657ace9] + + * source/encoder/analysis.cpp: + analysis: simplify encodeIntraInInter + [ee7eb64c1c9c] + + * source/encoder/search.cpp: + search: minor cleanups and comment improvements + [2e3c07bd52db] + + * source/Lib/TLibCommon/TComDataCU.cpp: + cu: white space and comment nits, initSubCU() no longer inits CTU + costs + [e2eea276e6b5] + + * source/encoder/analysis.cpp: + analysis: give the splitCU the correct partition index (0..3) + + this doesn't currently affect outputs, but it is definitely more + correct + [c7561cbb7b0f] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, source/common/frame.cpp, + source/common/frame.h, source/encoder/analysis.cpp, + source/encoder/analysis.h, source/encoder/frameencoder.cpp, + source/encoder/ratecontrol.cpp, source/encoder/search.cpp, + source/encoder/search.h: + cu: move cost variables from TComDataCU to Mode + + we use md.bestMode != NULL to determine if no best mode has been + found yet, so we do not have to initialize costs to MAX_INT64 to + simulate that condition. Hopefully this is more robust. 
+ + this changes outputs, in a very good way, so I think this fixed a + hidden bug or two + [6bf0d3f1c93d] + + * source/common/yuv.h: + yuv: nit + [7b2c95729183] + + * source/encoder/analysis.cpp, source/encoder/analysis.h, + source/encoder/search.cpp, source/encoder/search.h: + analysis: settle on fencYuv for 'frame being encoded' or 'original + pixels' + + there was no point in analysis and search having different names for + the same pixels. this patch moves responsibility of filling fencYuv + to the caller in a consistent basis + [6f7f54438424] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, source/encoder/analysis.cpp: + cu: remove access methods for neighbor CTUs + [37c229d6f8cc] + + * source/encoder/analysis.cpp: + analysis: fixes for --pmode and --rect with --rd 0..4 + [c8b4c9be2559] + + * source/encoder/analysis.cpp: + analysis: initCU() is only necessary for picSym CTUs, not analysis + CUs + [c218dd4a4c71] + +2014-10-10 Steve Borho + + * source/encoder/analysis.cpp, source/encoder/analysis.h, + source/encoder/search.cpp, source/encoder/search.h: + analysis: move fillOrigYUVBuffer to Search + + it is not really an analysis function, it is a coding function + [d1f8ae7710d4] + + * source/encoder/analysis.cpp: + analysis: fixes for merge/skip behavior at --rd 6, improve clarity + [f44a21cae332] + + * source/encoder/analysis.cpp: + analysis: fix precedence problem in split logic + [8778bf23678b] + + * source/encoder/analysis.cpp: + analysis: fix a couple of obvious --rd 6 bugs + [4728d78c0f4d] + + * source/encoder/analysis.cpp: + analysis: fix --rd 2 crash, only check intra for P slices + [633056237fb2] + + * source/encoder/analysis.cpp, source/encoder/analysis.h: + analysis: greatly simplify checkMerge2Nx2N_rd5_6 + [aef32e8b25c1] + + * source/Lib/TLibCommon/TComDataCU.h: + cu: fix comments + [d82691c50f7a] + + * source/encoder/analysis.cpp, source/encoder/analysis.h: + analysis: pass merge and skip modes to merge analysis 
functions + + It was a layering violation for these functions to access md.pred[] + themselves + [e573d484cd33] + + * source/encoder/analysis.h: + analysis: PRED_NxN is no more + [f4aba985d261] + + * source/encoder/analysis.cpp, source/encoder/analysis.h: + analysis: nit rename + [083138d1dce1] + + * source/encoder/analysis.cpp, source/encoder/analysis.h: + analysis: most merge shared intra path into the main path + + The overhead for the nominal I frame path is one NULL pointer check + per CU + [8f77003f3f33] + + * source/encoder/analysis.cpp, source/encoder/analysis.h: + analysis: remove checkIntraInInter_rd5_6(), it is the same as + checkIntra now + [e5d3c1eba674] + + * source/encoder/analysis.cpp, source/encoder/analysis.h, + source/encoder/search.cpp, source/encoder/search.h: + analysis: move parallel ME from Analysis to Search + [489614317ca8] + +2014-10-09 Steve Borho + + * source/encoder/search.cpp, source/encoder/search.h: + search: pass mode to xIntraCodingLumaBlk + [f397acd9d822] + + * source/common/CMakeLists.txt, source/common/predict.cpp, + source/common/predict.h, source/encoder/CMakeLists.txt, + source/encoder/predict.cpp, source/encoder/predict.h: + move predict.cpp and predict.h to common/ + + they don't need any encoder structures + [49c3dd1cad1e] + + * source/encoder/search.cpp, source/encoder/search.h: + search: reorder arguments to xIntraCodingChromaBlk for clarity + [4698e9d56cc4] + + * source/encoder/search.cpp, source/encoder/search.h: + search: pass mode to getBestIntraModeChroma + [d178113b3111] + + * source/encoder/search.cpp, source/encoder/search.h: + search: do not pass fencYuv to residualTransformQuantIntra + [6abf5ed381d7] + + * source/encoder/search.cpp, source/encoder/search.h: + search: do not pass fencYuv to xIntraCodingChromaBlk + [dc3be2051459] + + * source/encoder/search.cpp, source/encoder/search.h: + search: do not pass fencYuv to xRecurIntraChromaCodingQT + [2b49010309c5] + + * source/encoder/search.cpp, 
source/encoder/search.h: + search: do not pass fencYuv to xEstimateResidualQT() + [2f0020df9464] + + * source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComPattern.h, + source/Lib/TLibCommon/TComRom.h, source/common/CMakeLists.txt, + source/encoder/analysis.cpp, source/encoder/predict.cpp, + source/encoder/predict.h, source/encoder/search.cpp, + source/encoder/search.h, source/test/intrapredharness.cpp: + predict: merge intra prediction helper routines into Predict + [7505862f3220] + + * source/encoder/analysis.cpp, source/encoder/search.cpp, + source/encoder/search.h: + search: do not pass fencYuv to xRecurIntraCodingQT() + [031eeb5ab33f] + + * source/encoder/analysis.cpp, source/encoder/search.cpp, + source/encoder/search.h: + search: return distortion from main intra coding functions + + This is much cleaner than magically incrementing the CU variables + within these functions. Leave some comments for future work + [2c9b47e8f5ba] + + * source/Lib/TLibCommon/TComDataCU.h: + cu: fix a comment + [bbd23323e465] + + * source/encoder/search.cpp: + search: use absPartIndex for sub-CU indexing + [c57651640ca8] + + * source/encoder/analysis.cpp: + analysis: nits + [8419f83fe8e2] + + * source/encoder/analysis.cpp, source/encoder/search.cpp, + source/encoder/search.h: + search: merge sharedEstIntraPredQT with estIntraPredQT + [e90faaac6e4a] + + * source/encoder/analysis.cpp, source/encoder/search.cpp, + source/encoder/search.h: + search: do not pass fencYuv to sharedEstIntraPredQT() and + estIntraPredChromaQT() + [d9e76daff312] + + * source/encoder/analysis.cpp, source/encoder/search.cpp, + source/encoder/search.h: + search: do not pass fencYuv to estIntraPredQT() + [4adb8b6c440b] + + * source/encoder/analysis.cpp, source/encoder/search.cpp, + source/encoder/search.h: + search: do not pass fencYuv to encodeResAndCalcRdSkipCU() + [7cc8d78543e2] + + * source/encoder/analysis.cpp, source/encoder/search.cpp, + 
source/encoder/search.h: + search: do not pass fencYuv to encodeResAndCalcRdInterCU() + [615a8dc3d8db] + + * source/encoder/analysis.cpp, source/encoder/search.cpp, + source/encoder/search.h: + search: do not pass fencYuv to generateCoeffRecon() + [875b4c959919] + + * source/encoder/analysis.cpp, source/encoder/search.cpp, + source/encoder/search.h: + search: do not pass fencYuv to residualTransformQuantInter + [be855ecfb805] + + * source/encoder/search.cpp, source/encoder/search.h: + search: make fencYuv pointers const + + in preparation of using mode.origYuv directly. this requires using + const_cast when getting pixel pointers since our performance + primitives do not have const declarations for pointers which are + unharmed. + [c7616e988752] + + * source/encoder/analysis.cpp, source/encoder/analysis.h, + source/encoder/search.cpp, source/encoder/search.h: + analysis: move temp residual buffer into Search structure + + It is only used by Search methods; the analysis functions do not + need to be aware of it + [d0d3188303b8] + + * source/encoder/analysis.cpp, source/encoder/search.h: + analysis: give each Mode a const pointer to its origYuv + + This is for convenience, and to save in function parameter overhead + [d1b3de579557] + + * source/encoder/analysis.cpp, source/encoder/entropy.cpp, + source/encoder/ratecontrol.cpp: + Merge + [7d67da2fc327] + +2014-10-09 Deepthi Nandakumar + + * source/encoder/analysis.cpp, source/encoder/entropy.cpp: + analysis: remove Inter Part_NxN analysis and encode. This condition + will never be hit. 
+ + Also remove check for intra slice in compressInterCUrd5_6 + [67aefaf69a6b] + +2014-10-08 Aarthi Thirumalai + + * source/encoder/ratecontrol.cpp: + rc: vbv fix for scene changes + + Also, some spacing nits + [d3ee2f362116] + +2014-10-09 Steve Borho + + * source/encoder/analysis.cpp: + analysis: parentCTU is now always a reference to the output picSym + CTU + [9b66c54258c2] + +2014-10-08 Steve Borho + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, source/common/quant.cpp, + source/common/quant.h, source/encoder/analysis.cpp, + source/encoder/analysis.h, source/encoder/frameencoder.cpp, + source/encoder/rdcost.h, source/encoder/search.cpp, + source/encoder/search.h: + analysis: pass parentCTU as const reference + [a12edeca1cb8] + +2014-10-07 Steve Borho + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComPattern.cpp, source/common/deblock.cpp, + source/common/frame.cpp, source/common/frame.h, + source/common/piclist.cpp, source/common/slice.cpp, + source/common/slice.h, source/encoder/analysis.cpp, + source/encoder/dpb.cpp, source/encoder/encoder.cpp, + source/encoder/entropy.cpp, source/encoder/frameencoder.cpp, + source/encoder/framefilter.cpp, source/encoder/predict.cpp, + source/encoder/ratecontrol.cpp, source/encoder/sao.cpp, + source/encoder/search.cpp, source/encoder/slicetype.cpp, + source/encoder/weightPrediction.cpp: + frame: remove trivial access methods, major cleanups + [aa90924a8619] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComPicSym.cpp, + source/Lib/TLibCommon/TComPicSym.h, + source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuv.h, + source/Lib/TLibCommon/TComPicYuvMD5.cpp, + source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TComYuv.h, + source/common/CMakeLists.txt, source/common/deblock.cpp, + source/common/frame.cpp, source/common/frame.h, + 
source/common/lowres.cpp, source/common/lowres.h, + source/common/picyuv.cpp, source/common/picyuv.h, + source/common/quant.cpp, source/common/shortyuv.cpp, + source/common/shortyuv.h, source/common/yuv.cpp, + source/common/yuv.h, source/encoder/analysis.cpp, + source/encoder/analysis.h, source/encoder/encoder.cpp, + source/encoder/frameencoder.cpp, source/encoder/frameencoder.h, + source/encoder/framefilter.cpp, source/encoder/predict.cpp, + source/encoder/predict.h, source/encoder/ratecontrol.cpp, + source/encoder/ratecontrol.h, source/encoder/reference.cpp, + source/encoder/reference.h, source/encoder/sao.cpp, + source/encoder/sao.h, source/encoder/search.cpp, + source/encoder/search.h, source/encoder/slicetype.cpp, + source/encoder/weightPrediction.cpp: + yuv: bring Yuv, PicYuv classes into common/ + + Standardized variable naming scheme + + pixel *foo Yuv *fooYuv PicYuv *fooPic Frame *fooFrame + [89c388e280e4] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComPattern.h, + source/Lib/TLibCommon/TComYuv.h, source/common/scalinglist.cpp, + source/common/scalinglist.h, source/common/slice.cpp, + source/common/slice.h, source/encoder/analysis.cpp, + source/encoder/analysis.h, source/encoder/encoder.cpp, + source/encoder/entropy.cpp, source/encoder/entropy.h, + source/encoder/frameencoder.cpp, source/encoder/predict.cpp, + source/encoder/predict.h, source/encoder/sao.cpp, + source/encoder/search.cpp, source/encoder/search.h: + encoder: pass cu and cudata by const reference when they are not + modified + + This is the beginning round of a sweeping refactor to enforce some + const discipline. The goal is to always pass objects as const + references when a NULL pointer is never allowed and when the object + is unharmed. un-const refs should be used when NULL is not allowed + but the object is modified by the function. 
+ + As we add more and more parallelism, we need to enlist the + compiler's help in figuring out which functions are safe to call + without side-effects + [639abb765708] + +2014-10-06 Steve Borho + + * source/encoder/analysis.cpp, source/encoder/search.cpp, + source/encoder/search.h: + search: pass mode to more methods, write directly to mode.contexts + + rename RDContexts.temp to rqtTemp; only used RQT analysis now + + There were a number of cut-paste bugs fixed by removing this extra + step after calling encodeResAndCalcRdInterCU. Nearly half of them + were wrong. + [fc22f7534def] + +2014-10-08 Steve Borho + + * source/Lib/TLibCommon/TComDataCU.cpp: + cu: use memset to initialize + [82ee45403840] + +2014-10-06 Steve Borho + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComMotionInfo.cpp, + source/Lib/TLibCommon/TComMotionInfo.h, + source/Lib/TLibCommon/TComPicSym.cpp, + source/Lib/TLibCommon/TComPicSym.h, source/encoder/analysis.cpp, + source/encoder/analysis.h: + cu: break pool objects out of TComDataCU + [90c59faddf95] + +2014-10-08 Steve Borho + + * source/encoder/analysis.cpp, source/encoder/analysis.h: + analysis: revisit the minDepth logic, outputs much closer to + original now + + Bitrates are actually a bit lower than the old version, with + slightly lower SSIM + [be17e2daf0aa] + +2014-10-06 Steve Borho + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h: + cu: remove m_ prefix from tqBypassYuvMemBlock + [36270e784afa] + + * source/encoder/analysis.cpp: + analysis: use md auto-var in parallelAnalysisJob() + [561fd92417f0] + +2014-10-08 Steve Borho + + * source/encoder/analysis.cpp: + analysis: use correct 'nextContext' from split iterations for split + prediction + [affb40a174da] + + * source/encoder/analysis.cpp: + analysis: fixes for recursion from Min Chen + + When a part is not present, we must call + splitCU->copyPartFrom(splitCU, ... 
just to setup the appropriate + depth fields in splitCU, which is horrid but it works for now. This + hack fixes DQP behavior. + + The rest of the changes ensure we always pass a valid CABAC context + to the next coded CU (important in the presence of uncoded blocks on + the edges) + + With these fixes, I slices match the behavior of the previous + design. + [ce64a6330c5f] + + * source/encoder/predict.cpp, source/encoder/predict.h: + pred: rename Prediction::m_slice to Prediction::m_predSlice + + to avoid conflict with Analysis::m_slice + [b26d0774e682] + + * source/encoder/analysis.cpp, source/encoder/analysis.h: + analysis: pass partUnitIdx to intra functions for initSubCU + [4b888ab917a3] + + * source/encoder/analysis.cpp, source/encoder/analysis.h, + source/encoder/frameencoder.cpp: + analysis: remove m_log member variable; an auto-var is sufficient + [400fa7c74b6b] + + * source/encoder/analysis.cpp: + analysis: implement bidir MVP refine for --pme + + This was the last deliberate difference in output between --pme and + --no-pme + [2876b18af448] + + * source/encoder/analysis.cpp: + analysis: handle situation in compressInterCU_rd0_4() where only + split is tried + [9b0eda20ac43] + + * source/encoder/analysis.cpp: + analysis: bring compressInterCU_rd5_6() up to date + [5fffec35339e] + + * source/encoder/analysis.cpp, source/encoder/analysis.h: + analysis: update split logic in compressInterCU_rd0_4(), simplify + [c2d4a2a72ae1] + +2014-10-07 Steve Borho + + * source/encoder/analysis.cpp: + analysis: continue to optimize split logic + [677cdc979e6d] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, source/encoder/analysis.cpp, + source/encoder/analysis.h: + analysis: pass depth via cuData, add support for split CU in + checkDQP + [7657c461a1b1] + + * source/encoder/entropy.cpp: + nit + [cdab6a9393b1] + +2014-10-06 Steve Borho + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, 
source/encoder/analysis.cpp: + cu: don't pass cuData to copyPartFrom(), it only needs numPartitions + + This change came from Min + [46c7f7f15226] + + * source/Lib/TLibCommon/TComDataCU.cpp: + cu: nits + [7eb3ef2e19bc] + + * source/encoder/analysis.cpp, source/encoder/analysis.h: + analysis: add obviously missing logic + [790d3127dd85] + + * source/encoder/analysis.cpp: + analysis: remove best mode logic from checkMerge2Nx2N_rd0_4() + + It is always done by the caller based on rd level + [ef3e21a93f3b] + +2014-10-06 gopi jayaraman + + * source/encoder/analysis.cpp, source/encoder/search.cpp, + source/encoder/search.h: + search: bring mode structure from analysis for intra functions + [47edb2892258] + + * source/encoder/analysis.cpp, source/encoder/analysis.h, + source/encoder/search.cpp, source/encoder/search.h: + search: bring mode structure from analysis + [96fcb12ba04f] + +2014-10-04 Steve Borho + + * source/encoder/analysis.cpp, source/encoder/analysis.h, + source/encoder/search.cpp, source/encoder/search.h: + analysis: major refactor of analysis data structures; prep for + --pmode --rd 5 + [2d0d40208a59] + +2014-10-16 Deepthi Nandakumar + + * source/common/pixel.cpp, source/encoder/rdcost.h: + pixel: resolve build errors at high bit depth + [b7eeae24aae6] + + * source/encoder/search.cpp, source/encoder/search.h: + psy-rd: modify psy-rd cost decisions in search::xEstimateResidualQT + + This function looks to minimizing rate/distortion in the "residual". + Rate is bits taken to encode residual, and distortion is + ssd(original residual, reconstructed residual). It is wrong to + calculate psyCost with psy-energy = E(src, recon) here. psyCost is + now modified as E(orig residual, recon residual). 
+ [d8f1f982eeb7] + + * source/common/pixel.cpp, source/common/primitives.h, + source/encoder/rdcost.h: + primitives: add support for 16-bit psyCost calculations + [20946a10c348] + +2014-10-15 Steve Borho + + * source/encoder/encoder.cpp: + encoder: ensure nr pointer is initialized + [79702581ec82] + + * source/encoder/analysis.h, source/encoder/encoder.cpp, + source/encoder/frameencoder.cpp, source/encoder/frameencoder.h: + nr nits + [7127239d9404] + +2014-10-15 Deepthi Nandakumar + + * source/encoder/search.cpp, source/encoder/search.h: + search: rename intra functions that generate recon + + Remove wrong comments + [56a04b7c1af0] + +2014-10-14 Deepthi Nandakumar + + * source/common/common.h, source/common/frame.cpp, + source/common/frame.h, source/common/quant.cpp, + source/encoder/analysis.cpp, source/encoder/analysis.h, + source/encoder/encoder.cpp, source/encoder/encoder.h, + source/encoder/frameencoder.cpp, source/encoder/frameencoder.h: + noiseReduction: make noiseReduction deterministic for a given number + of frameEncoders. 
+ [f8e0aae88dc8] + +2014-10-13 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel- + util.h, source/common/x86/pixel-util8.asm: + asm: avx2 assembly code for 8bpp version of transpose(8, 16, 32 and + 64) + [02ff8eaad632] + +2014-10-14 Deepthi Nandakumar + + * source/Lib/TLibCommon/TComRom.h, source/common/common.h, + source/encoder/frameencoder.cpp: + noiseReduction: replace magic values with readable names + [38b5733cc629] + +2014-10-13 Deepthi Nandakumar + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComPicSym.cpp, source/encoder/analysis.cpp: + TComDataCU: remove redundant arguments + [f26e81eb555a] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, source/encoder/analysis.cpp, + source/encoder/analysis.h, source/encoder/sao.cpp: + cu-lossless: remove redundant lossless buffer + + The pixel values in lossless mode can be obtained from the original + yuv buffer itself. + [e7682fc6e2aa] + +2014-10-10 Satoshi Nakagawa + + * doc/reST/cli.rst, source/common/slice.h, source/encoder/entropy.cpp, + source/encoder/level.cpp: + ptl: RExt profiles + [57a30b1c13ea] + +2014-10-09 Satoshi Nakagawa + + * source/common/common.h, source/encoder/entropy.cpp, + source/encoder/entropy.h, source/encoder/sao.cpp, + source/encoder/sao.h: + sao: refine, fix sao-non-deblock [CHANGES OUTPUT (RExt, sao-non- + deblock)] + [1e8ba81a5ee3] + +2014-10-09 Deepthi Nandakumar + + * source/encoder/analysis.cpp: + Backed out changeset: f231820645fe + [4495af3b30bb] + + * source/encoder/analysis.cpp, source/encoder/entropy.cpp: + analysis: remove Inter Part_NxN analysis and encode. This condition + will never be hit. 
+ + Also remove check for intra slice in compressInterCUrd5_6 + [c880eced4a30] + +2014-10-08 Aarthi Thirumalai + + * source/encoder/ratecontrol.cpp: + rc: vbv fix for scene changes + + Also, some spacing nits + [96609efaa877] + +2014-10-08 Steve Borho + + * doc/reST/cli.rst, doc/reST/threading.rst: + docs: try to explain behavior of related thread pool options + [4b6a71d53f83] + +2014-10-08 Gopu Govindaswamy + + * source/encoder/analysis.cpp: + analysis: fix for csp:444 decoder crash - CABAC context state + handling + [f231820645fe] + +2014-10-08 Steve Borho + + * source/common/param.cpp: + param: combine CTU size and RQT depth log lines (just saves space) + [0a18adcecd7d] + + * source/common/param.cpp, source/common/threadpool.cpp, + source/encoder/api.cpp, source/encoder/encoder.cpp: + encoder: move thread pool initialization to create(), handle pool + options better + + Previously WPP essentially owned the thread pool and the two + configurations were tied together (disabling WPP disabled the thread + pool and vice-versa). Now we have three features which use the + thread pool and they can be enabled or disabled independently of + WPP. + + After this commit, if all the pool features (WPP, PMODE, PME) are + disabled then the thread pool is not created. If --threads 1 is + specified, then no pool is created. When no pool is present, all + pool options are disabled. + + Reporting for all the pool options has been moved into the one log + line: x265 [info]: WPP streams / frame threads / pool : 8 / 2 / 4 / + pme / pmode + + They didn't really belong on the 'tools' log line since they are not + coding tools like SAO or AMP. 
+ + This commit also fixes CTU stats logging in the absence of WPP + [0a0f686606c9] + + * source/common/wavefront.cpp, source/encoder/frameencoder.cpp: + frameencoder: fix handling of no thread pool being present + + The encoder should run properly without a thread pool instance + [35f422e7e1a1] + +2014-10-08 Gopu Govindaswamy + + * source/encoder/frameencoder.cpp: + frameencoder: allocate memory for SEIPictureTiming when + interlaceMode is enabled + [52677ba0c694] + +2014-10-06 Steve Borho + + * source/encoder/analysis.cpp, source/encoder/predict.cpp, + source/encoder/predict.h, source/encoder/search.cpp, + source/encoder/search.h: + analysis: make CU object used in --pme a const object + [46c4b98d92ec] + +2014-10-07 Steve Borho + + * source/encoder/encoder.cpp, source/encoder/frameencoder.cpp: + encoder: fix handling of --no-wpp and either --pmode or --pme + + in --no-wpp mode, we need to allocate TLD for workers and frame + encoders + [f40aff61e42c] + +2014-10-06 Steve Borho + + * doc/reST/cli.rst: + doc: make it clear fast cbf only affects rdlevel 5 and 6 + [2c65d39c989e] + +2014-10-05 Aarthi Thirumalai + + * source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h: + rc: use a sliding window to calculate moving avg SatdCost for ABR + Reset logic + + contains improvements for detection of scene changes within Rate + Control to stabilize qp and prevent extreme spikes in bitrate. + Removes the blockiness or distortions in the frames that a streak of + low-detailed frames and prevents vbv from overreacting at the points + of scene cuts. 
+ [4b7c473c3ef4] + +2014-10-05 Steve Borho + + * source/Lib/TLibCommon/TComDataCU.h: + cu: nits + [dad10cdb3573] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, source/encoder/analysis.cpp, + source/encoder/search.cpp: + cu: drop m_totalPsyCost, just use m_totalRDCost whether or not psy + is enabled + [d33bc21e3bae] + + * source/encoder/analysis.cpp: + analysis: fix CABAC context state handling after splits [CHANGES + OUTPUTS] + + In RDlevel<=4, if split is chosen then copy depth+1 next to depth + next + + This fixes a long standing bug in presets slow and above, and + improves compression efficiency. + [d07fbd3bdecc] + + * source/encoder/analysis.cpp, source/encoder/entropy.cpp, + source/encoder/entropy.h, source/encoder/search.cpp, + source/encoder/search.h: + entropy: add a mechanism to detect reads without writes in checked + builds + [ead3d26c7747] + + * source/Lib/TLibCommon/TypeDef.h, source/encoder/analysis.cpp, + source/encoder/analysis.h, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h, source/encoder/framefilter.cpp, + source/encoder/sao.cpp, source/encoder/sao.h, + source/encoder/search.cpp, source/encoder/search.h: + entropy: give each Search instance its own set of RD contexts + + This gives each ThreadLocalData a complete set of working contexts + so each thread can measure RD cost (for the same row) independent of + one other. There were content problems with the 'temp' and 'rqtRoot' + and 'rqtTest' contexts. + + For this to work we have to sync the 'cur' context to the slave + prior to it performing any RD measurements. + + This commit finally removes the CI_IDX enums and uses a simple + struct to hold the contexts per depth; and the member variables were + renamed from "m_rdEntropyCoders" to "m_rdContexts" since these + coders are only ever used to save and restore CABAC state (never to + code with) + + This change exposed a bug. 
The next patch adds some tools to catch + this class of bug and the patch after that fixes it. + [5420f2a29522] + + * source/common/threadpool.cpp: + threadpool: nit + [ed5b9320afca] + + * source/common/threading.cpp: + threading: nits + [1867fb89298c] + + * source/CMakeLists.txt: + cmake: nit + [fc856c00d49b] + + * source/encoder/entropy.cpp, source/encoder/entropy.h: + entropy: make copy methods properly const + [997b210ab94a] + + * source/CMakeLists.txt: + cmake: bump X265_BUILD for new parallelism params + [3c0b9a637349] + +2014-10-04 Steve Borho + + * source/encoder/encoder.cpp: + encoder: prevent broken combinations of options + + with --no-wpp; something in the slave state is not being initialized + correctly causing crashes in motion estimation + [f312deb51d55] + + * source/encoder/analysis.cpp, source/encoder/analysis.h: + analysis: encodeIntraInInter() must write directly to given cabac + context + + writing to m_rdEntropyCoders[depth][CI_TEMP_BEST] was not thread + save since the slave thread is using the same m_rdEntropyCoders + objects as the master thread + [45ef431c1490] + +2014-10-03 Steve Borho + + * source/encoder/analysis.cpp, source/encoder/analysis.h: + analysis: use enums to enumerate prediction buffers + + value 4 was never used, so this reduces the buffer count by one + [c61dca79ea0f] + + * source/encoder/analysis.cpp: + analysis: add a hack to try and match --pmode with --no-pmode + [39e5b26733b8] + + * source/encoder/analysis.cpp, source/encoder/analysis.h: + analysis: cleanups + [9872cc99362e] + + * source/encoder/analysis.cpp, source/encoder/analysis.h: + analysis: measure best pmode intra RD cost in worker thread + + it required adding storage for the best intra recon and entropy + state includes prep work for supporting --pmode with --rdlevel > 4 + [51f689bede6a] + + * source/encoder/analysis.cpp, source/encoder/analysis.h: + analysis: do not pass member vars to checkMerge2Nx2N_rd0_4 as + pointer references + [5e4aa3b6d136] + + * 
source/common/param.cpp: + param: show when pmode and pme are enabled + [5849804cd0c3] + + * source/encoder/analysis.cpp, source/encoder/analysis.h: + analysis: remove default argument for compressInterCU_rd5_6 + [2351a963a676] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComYuv.h, source/common/deblock.cpp, + source/common/quant.cpp, source/common/quant.h, + source/common/shortyuv.h, source/encoder/analysis.cpp, + source/encoder/entropy.cpp, source/encoder/predict.cpp, + source/encoder/sao.cpp, source/encoder/search.cpp: + TComDataCU: make most get methods const, remove some trivial access + methods + [2e6163426c95] + + * source/encoder/analysis.cpp: + analysis: use slave instance for MVP eval for --pme + + this avoids a race hazard with Predict::m_immedVals + [15dff1469408] + + * source/encoder/analysis.cpp: + analysis: replace prepMotionCompensation() calls in + parallelInterSearch() + [71de0b881801] + + * source/encoder/analysis.cpp: + analysis: --pme workers do not need m_origYuv in slave instance + [a6b9e8e235d7] + + * source/encoder/analysis.cpp: + analysis: cleanup variable names + [ec0cb8779f84] + +2014-10-02 Steve Borho + + * source/encoder/analysis.cpp: + analysis: prevent race hazard in parallel ME state variables + + the inserted comment should explain the risk, which manifested in a + deadlock + [70c5681f56b9] + + * source/encoder/analysis.cpp, source/encoder/analysis.h: + analysis: parallel ME can get partsize and depth from ME CU + [cf722336b836] + + * source/encoder/analysis.cpp, source/encoder/analysis.h: + analysis: support --pme for all rd levels + [dd2acbbd545c] + + * source/common/param.cpp, source/encoder/analysis.cpp, + source/x265.cpp, source/x265.h: + api: add --pme to enable parallel motion estimation + [bc99dfbed4b7] + + * source/encoder/analysis.cpp, source/encoder/analysis.h: + analysis: further parallelME progress + [4cdc0528e0aa] + 
+2014-10-01 Steve Borho + + * source/common/param.cpp, source/encoder/analysis.cpp, + source/x265.cpp, source/x265.h: + api: add --pmode to enable parallel mode decision + [57f93ba0df23] + +2014-10-02 Steve Borho + + * source/encoder/encoder.cpp: + encoder: correct logging of number of WPP streams + [b6d49505b179] + + * source/encoder/search.cpp: + search: remove redundant calls to prepMotionCompensation() + [50490cd35e57] + + * source/encoder/search.cpp: + search: use sad cost directly to pick MVP + + there's no point in use SAD RD cost if all the candidates have the + same estimated bit cost. + [bd6e2cdd5938] + + * source/encoder/rdcost.h, source/encoder/search.cpp, + source/encoder/search.h: + search: make some helper methods const + [b17293bb0f19] + +2014-10-01 Steve Borho + + * source/encoder/analysis.cpp: + analysis: remove unused variables, fixes warnings + [898a2546aff1] + +2014-10-02 Aarthi Thirumalai + + * source/encoder/ratecontrol.cpp: + rc: correct the threshold for resetABR function + [0212e9832ce7] + + * source/encoder/ratecontrol.cpp: + rc : correct max AU size for first frame + [4579fc590099] + +2014-10-02 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, source/encoder/analysis.cpp, + source/encoder/analysis.h, source/encoder/entropy.cpp, + source/encoder/entropy.h: + fix bug in 73c6c9086577 for rdLevel=0 + [b57c63127527] + +2014-10-01 Steve Borho + + * source/encoder/analysis.cpp: + analysis: further work on parallel ME + [c0bbd8d01257] + + * source/encoder/analysis.cpp: + analysis: nit, remove obviously wrong comment + [bd3046c4bb36] + + * source/encoder/analysis.cpp: + analysis: use source buffer for source stride + + it was always a coincidence that the output stride matched + [61e028b5a04e] + + * source/encoder/analysis.cpp: + analysis: initialize job counters + [7daea9e6c5ae] + + * source/encoder/analysis.cpp, source/encoder/analysis.h: + analysis: remove bMergeOnly argument to 
checkInter_rd0_4, always + false + [589d4d7e5a72] + + * source/encoder/analysis.cpp, source/encoder/analysis.h: + analysis: remove PartSize argument to checkIntraInInter_rd0_4 + [0c6fe4a39a32] + +2014-10-01 Deepthi Nandakumar + + * source/encoder/ratecontrol.cpp: + ratecontrol: fix float absolute check + [2a55baeb89cf] + + * source/encoder/ratecontrol.cpp: + ratecontrol: replace an imprecise comparison with a more precise + check to ensure consistency. + [f9922ce58a20] + + * source/common/slice.h: + slice: better structure packing + [d0fa09e9cca5] + +2014-09-30 Steve Borho + + * source/encoder/analysis.cpp: + analysis: fixup + [3bd852b225b5] + + * source/encoder/analysis.cpp: + analysis: move non-distributed path into else clause + + this is done in a second patch since it touches a lot of code + trivially so it + [b17ddb5d71f4] + +2014-09-27 Steve Borho + + * source/encoder/analysis.cpp, source/encoder/analysis.h, + source/encoder/encoder.cpp, source/encoder/frameencoder.h: + stub in framework for parallel mode analysis and parallel ME + [5c1a4804c42d] + +2014-09-30 Steve Borho + + * source/encoder/analysis.cpp, source/encoder/analysis.h: + analysis: remove default arguments to checkInter_rd5_6 and + checkInter_rd0_4 + [4d9ff684c80f] + + * source/x265.cpp: + cli: display param->bSaoNonDeblocked as bool in CLI help + [1af64c8c2d28] + + * source/encoder/analysis.cpp: + analysis: nit + [a4859c266a59] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, source/common/deblock.cpp, + source/common/loopfilter.cpp, source/common/primitives.h, + source/encoder/analysis.h, source/encoder/sao.cpp: + replace lcu with ctu in variable names + [ab92196f8b7b] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComRom.h, source/encoder/entropy.cpp, + source/encoder/frameencoder.cpp, source/encoder/sao.h: + replace LCU with CTU globally in comments + [382384729d60] + + * source/encoder/sao.h: + sao: nits + [8b89e74d848d] + + * 
source/encoder/sao.cpp, source/encoder/sao.h: + sao: rename resetLcuPart to resetCtuPart + [f2b5e4d8da59] + + * source/common/common.h, source/encoder/frameencoder.cpp, + source/encoder/framefilter.cpp, source/encoder/sao.cpp: + sao: rename saoLcuParam to ctuParam + [ca1e3afddc4f] + + * source/common/common.h, source/encoder/entropy.cpp, + source/encoder/entropy.h, source/encoder/sao.cpp, + source/encoder/sao.h: + sao: rename SaoLcuParam to SaoCtuParam + [7692ab2a28d3] + + * doc/reST/cli.rst, source/CMakeLists.txt, source/common/common.h, + source/common/param.cpp, source/encoder/frameencoder.cpp, + source/encoder/sao.cpp, source/x265.cpp, source/x265.h: + api: rename --sao-lcu-bounds to --sao-non-deblock + + The acronym LCU doesn't appear anywhere else in our param interface + [5a06c0462363] + + * source/encoder/search.cpp, source/encoder/search.h: + search: make bidir temp YUVs Search members + + I can't believe we've been allocing them every predInterSearch() + call for the last 2 months. 
+ [2486149ff9de] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, source/encoder/analysis.cpp, + source/encoder/entropy.cpp: + datacu: coding style rename of m_DataCUMemPool and m_CULocalData + [832ad99093a0] + + * source/Lib/TLibCommon/TComMotionInfo.cpp, + source/Lib/TLibCommon/TComMotionInfo.h: + motioninfo: coding style rename of m_MVFieldMemPool + [ec91821451f3] + + * source/common/common.h: + common: break into debugger when check fails in debug build + [a5103ad3df8b] + + * source/Lib/TLibCommon/TComDataCU.cpp: + TComData: do not leave m_tqBypassOrigYuv uninitialized + [a360de22f48b] + +2014-09-30 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp, source/common/x86/dct8.asm, + source/common/x86/dct8.h: + asm: avx2 assembly code for idct4x4 + [49aaafeb707a] + +2014-09-30 Praveen Tiwari + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm, source/common/x86/blockcopy8.h: + blockfill_s_32x32 avx2 asm code: performance improved from 1354.05 + cycles to 705.81 cycles, over sse version of asm code + [154aa8bfe042] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm, source/common/x86/blockcopy8.h: + blockfill_s_16x16 avx2 asm code: performance improved from 389.21 + cycles to 204.38 cycles, over sse version of asm code + [8be59e140210] + +2014-09-29 Aarthi Thirumalai + + * source/common/slice.h, source/encoder/level.cpp, + source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h: + rc: apply maxAU size restrictions while encoding each frame + [8740d938dbb7] + +2014-09-30 Santhoshini Sekar + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, source/encoder/analysis.cpp, + source/encoder/search.cpp, source/encoder/search.h: + TComDataCU: replace getTotalNumPart() with CU structure details + [1706e30042ef] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComPattern.cpp, + 
source/Lib/TLibCommon/TComPattern.h, source/encoder/analysis.cpp, + source/encoder/analysis.h, source/encoder/predict.cpp, + source/encoder/predict.h, source/encoder/search.cpp, + source/encoder/search.h: + TComDataCU: replace getZorderIdxInCU() with encodeIdx of CU + structure + [73c6c9086577] + +2014-09-30 Steve Borho + + * source/encoder/frameencoder.cpp, source/encoder/frameencoder.h, + source/encoder/rdcost.h, source/encoder/search.cpp, + source/encoder/search.h: + rd: move lambda and analysis qp init to rdcost.h + + This will make it possible for Search instances to copy QP data + between each other + [a40ff330a525] + +2014-09-30 Satoshi Nakagawa + + * doc/reST/cli.rst, doc/reST/threading.rst, source/common/common.h, + source/common/param.cpp, source/encoder/encoder.cpp, + source/encoder/frameencoder.cpp, source/encoder/framefilter.cpp, + source/encoder/sao.cpp, source/encoder/sao.h, source/x265.cpp, + source/x265.h: + sao: remove frame-based SAO + [28db2410d5de] + +2014-09-29 Steve Borho + + * source/Lib/TLibCommon/TComMotionInfo.h: + TComMvField: class to struct, white-space cleanups + [5a6845566d14] + +2014-09-25 Steve Borho + + * source/encoder/search.cpp: + search: reorder nits, no effect + + do allocations after simple configurations + [0c4e39a2965b] + + * source/common/quant.cpp, source/common/quant.h, + source/encoder/search.cpp: + quant: pass entropy instance through init function + [a8fca9f05102] + + * source/encoder/search.cpp: + search: white-space nits + [991527f1a9c6] + + * source/encoder/analysis.cpp, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h, source/encoder/search.cpp, + source/encoder/search.h: + search: give each Search instance an Entropy encoder (no output + changes) + + This essentially relocates the "active" entropy coder used during + all analysis from CTURow to ThreadLocalData. 
This actually reduces + the number of Entropy instances in the encoder, and solves the + problem of sharing the entropy coder between worker threads + cooperating on the same CTU. + [a4d27e19ed09] + +2014-09-26 David T Yuen + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/encoder/analysis.cpp, source/encoder/analysis.h, + source/encoder/frameencoder.cpp: + Changes for loadCTUData + + Replaced getDepthScanIdx() with table g_depthScanIdx Moved + Analysis::loadCTUData to TComDataCU::loadCTUData since it only works + with TComDataCU fields Replaced CU.offsets[2] with local variables + in loadCTUData since that is the only place it was set and used + minor changes to reduce the number of local variables in loadCTUData + [0d0558b82a9c] + +2014-09-26 Steve Borho + + * source/encoder/analysis.cpp, source/encoder/encoder.cpp: + nits + [32f50df7fa76] + +2014-09-26 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp, source/common/x86/dct8.asm, + source/common/x86/dct8.h: + asm: avx2 assembly code for idct32x32 + [4b18a27b52ac] + +2014-09-25 David T Yuen + + * source/encoder/frameencoder.cpp: + Removed unnecessary call to loadCTUData + [8119b3d8d260] + +2014-09-25 Steve Borho + + * source/encoder/analysis.cpp: + analysis: more style nits, code simplifications. no behavior change + [bd0e23d7d394] + + * source/encoder/analysis.cpp: + analysis: remove #define conditionals for control flow + + The non-default paths are not being tested (or even compiled) and + are thus assumed broken. The defines simply make the code harder to + read. 
+ [f5f7c23fedd6] + + * source/encoder/analysis.cpp: + analysis: coding style and comment nits + [e6cc918fb18e] + + * source/encoder/analysis.cpp: + analysis: remove unused LAMBDA_PARTITION_SELECT + [391282b02731] + + * source/encoder/analysis.cpp: + analysis: hoist local function into anonymous namespace (file local) + [d26780e43a87] + +2014-09-25 Yuvaraj Venkatesh + + * source/common/x86/asm-primitives.cpp, source/common/x86/dct8.asm, + source/common/x86/dct8.h: + asm: avx2 assembly code for idct8x8 + [8492a3250fef] + +2014-09-25 Santhoshini Sekar + + * source/Lib/TLibCommon/TComDataCU.cpp, source/common/deblock.cpp, + source/common/frame.h, source/common/slice.cpp, + source/encoder/analysis.cpp, source/encoder/encoder.cpp, + source/encoder/entropy.cpp, source/encoder/frameencoder.cpp, + source/encoder/sao.cpp, source/encoder/search.cpp: + remove getNumPartInCU() and replace it with macro + [37f33ab176fa] + +2014-09-26 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComPicYuv.h, + source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/common/deblock.cpp, source/common/deblock.h, + source/encoder/frameencoder.cpp, source/encoder/frameencoder.h, + source/encoder/framefilter.cpp, source/encoder/framefilter.h: + refine deblocking filter + [b47d794a0372] + +2014-09-24 Steve Borho + + * source/common/slice.h, source/encoder/predict.cpp, + source/encoder/predict.h: + predict: split weighted prediction values from WeightParam + + The arguments passed to addWeightBi() and addWeightUni() are just + the "w, o, offset, shift, round" integers. They don't need the + fields which were signaled in the slice header or vice-versa. 
+ [7dccbbed0349] + + * source/encoder/predict.cpp, source/encoder/predict.h, + source/encoder/search.cpp: + predict: combine and check allocations and return failures + [982040e91112] + + * source/encoder/predict.cpp, source/encoder/predict.h, + source/encoder/search.cpp: + predict: remove check for reallocations, comment nits + + we don't do this anywhere else; there would be huge leaks if the + Search object were initialized multiple times. there's no reason to + check here. + [7c88fb6128cf] + + * source/encoder/analysis.cpp, source/encoder/predict.cpp, + source/encoder/predict.h, source/encoder/search.cpp: + predict: inline predInterUni(), getWpScaling() and simplify + motionCompensation() + + After this refactor, motionCompensation no longer needs the cu + parameter. It was only used to pass to another member function to + gain access to cu->m_slice which is now a member variable. + + This refactor removed a number of arguments to addWeightBi and + addWeightUni which were always member variables. + [a961728f906c] + + * source/encoder/predict.cpp, source/encoder/predict.h: + predict: inline single call of predInterBi() + [b6c9a51d9201] + + * source/encoder/encoder.cpp, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h: + encoder: rename cuCoder to analysis for better clarity + + the data type of cuCoder changed from TEncCu to Analysis weeks ago + [a3f952bcada5] + +2014-09-25 Deepthi Nandakumar + + * source/Lib/TLibCommon/TComPicYuv.h, source/common/deblock.cpp, + source/common/deblock.h: + Backed out changeset: 940cec3bf0b4 + + This commit causes hash mismatches in vc11 x86_64 Release mode + consistently, when lft is enabled. Stack/heap corruption likely. 
+ [0d330611fa97] + +2014-09-24 Steve Borho + + * source/common/vec/dct-sse3.cpp: + Backed out changeset: eb011fa1d2d8 + [e47e127da779] + +2014-09-24 David T Yuen + + * source/encoder/encoder.cpp, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h: + Changed FrameEncoder::m_tld to a pointer and set it to one of + Encoder's ThreadLocalData instances. + + This uses less memory since m_tld isn't used in --wpp and Encoder's + ThreadLocalData instances are not used in --no-wpp Also there was a + small performance increase on my system + [3e1bfb2e4592] + +2014-09-24 Steve Borho + + * source/common/vec/dct-sse3.cpp, source/common/vec/dct-ssse3.cpp: + vec: make a note for why we keep some of the remaining vector + routines + [f6a0b0a97a5b] + + * source/common/vec/dct-sse3.cpp: + vec: remove idct8, we have SSSE3 assembly for it + [eb011fa1d2d8] + + * source/common/CMakeLists.txt, source/common/vec/blockcopy-sse3.cpp, + source/common/vec/vec-primitives.cpp: + cmake: remove blockcopy-sse3.cpp + [c79590d89389] + + * source/common/pixel.cpp, source/common/primitives.h, + source/test/pixelharness.cpp, source/test/pixelharness.h: + primitives: remove unused block copy primitives + [63b7cb39e9f1] + +2014-09-24 Praveen Tiwari + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm, source/common/x86/blockcopy8.h: + bloccopy_pp avx asm code: 32x32, 32x48, 32x64 improved by 803.69 -> + 514.90, 1126.36 -> 655.24, 1454.09 -> 835.76 cycles + [3fe7e7975eae] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm, source/common/x86/blockcopy8.h: + blockcopy_pp_32x24: avx asm code, improved 621.84 cycles -> 371.94 + [fe901487b7cc] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm, source/common/x86/blockcopy8.h: + blockcopy_pp_32x16: avx asm code, improved 477.74 cycles -> 309.99 + [b51e34a4b828] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm, 
source/common/x86/blockcopy8.h: + blockcopy_pp_32x8: avx asm code, improved 281.20 cycles -> 165.47 + [2d8adf9a4ab0] + +2014-09-24 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComPicYuv.h, source/common/deblock.cpp, + source/common/deblock.h: + refine deblocking filter + [940cec3bf0b4] + +2014-09-20 Steve Borho + + * source/encoder/predict.cpp, source/encoder/predict.h: + predict: remove checkIdenticalMotion() + + We will not insert the same reference picture into L1 and L0 at the + same time, so this check is utterly redundant. + [532d0266e333] + + * source/encoder/analysis.cpp, source/encoder/predict.cpp, + source/encoder/predict.h, source/encoder/search.cpp: + predict: remove list argument from motionCompensation(), always + REF_PIC_LIST_X + [cf90338bbc87] + + * source/encoder/predict.cpp: + predict: streamline getWpScaling() + [e26ce61cd2e3] + + * source/encoder/predict.cpp: + predict: use faster unidir prediction for B frames when weighting + not enabled + [30dd73bb8a93] + + * source/encoder/predict.cpp, source/encoder/predict.h: + predict: combine redundant logic paths in predInterBi() + + removes weightedPredictionBi(), which is no longer called + [3f1681901fb4] + +2014-09-24 Deepthi Nandakumar + + * source/test/testbench.cpp: + Backed out changeset: fa2f1aa1456e + + This commit allocated the harness instances on the heap, thus no + longer respecting __declspec(align) directives for the member + fields. We could probably circumvent this by overloading operator + new with aligned_malloc, but I'm not sure this is good practice. 
+ [b2b7072ddbf7] + +2014-09-23 Sagar Kotecha + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm, source/common/x86/blockcopy8.h: + blockcopy_ss: 64x16, 64x32, 64x48, 64x64 AVX version of asm code, + approx double speedup comapre to SSE + [e2b577330c9b] + +2014-09-23 Yuvaraj Venkatesh + + * source/common/x86/asm-primitives.cpp, source/common/x86/dct8.asm, + source/common/x86/dct8.h: + asm: avx2 code for dct8x8 + [271e5eb1e396] + +2014-09-23 Min Chen + + * source/common/x86/dct8.asm: + asm: replace mova by movu to avoid AVX2 testbench crash in dct16, + dct32, denoise_dct, its same speed on Haswell + [02253e0800ea] + +2014-09-23 Sagar Kotecha + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm, source/common/x86/blockcopy8.h: + add avx version for chroma_copy_ss 16x4, 16x8, 16x12, 16x16, 16x24, + 16x32, 16x64 based on csp, approx 1.5x-2x speedup over SSE + [1f5ffdc453ee] + +2014-09-22 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComPattern.h, source/encoder/predict.cpp: + simplify intra filter (with fix for da61cf406f16) + [ee76b64fd051] + +2014-09-22 Deepthi Nandakumar + + * source/encoder/ratecontrol.cpp: + Backed out changeset: 25dde1ffab66 + + This commit needs more investigation, with specific VBV use cases + like 1-sec GOPs. 
+ [82bab5587bf1] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComPicSym.cpp, + source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuv.h, + source/Lib/TLibCommon/TComPicYuvMD5.cpp, + source/Lib/TLibCommon/TComRom.cpp, source/common/deblock.h, + source/common/quant.cpp, source/encoder/api.cpp, + source/encoder/entropy.cpp, source/encoder/sao.cpp, + source/encoder/search.cpp: + nits: use parentheses to improve readability in shifts + [fd435504f15e] + +2014-09-20 Steve Borho + + * source/encoder/predict.cpp: + predict: don't bother keeping refidx as an array + + it is always indexed explicitly + [1c172c1822e4] + + * source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/Lib/TLibCommon/TComWeightPrediction.h, + source/common/CMakeLists.txt, source/encoder/predict.cpp, + source/encoder/predict.h: + predict: merge TComWeightPrediction functions into Predict + + * TComWeightPrediction had no member vars, the constructor was + useless + * half of the functions were not used, they were dropped + * default arguments were removed, none were actually required + * x prefixes removed from method names + * comments were cleaned up + [0be03e280b3d] + + * source/Lib/TLibCommon/TComWeightPrediction.cpp: + TComWeightPrediction: combine duplicate inline functions (refs #80) + [c7cc07fd21a7] + + * source/encoder/encoder.cpp: + encoder: use %u to sprintf unsigned ints (refs #80) + [a58aea624122] + + * source/encoder/entropy.cpp: + entropy: fix SAO enable detection (refs #80) + + Apparently our analysis never toggles luma separately from chroma + because this bug has not resulted in any bad bitstreams, that I know + of. 
This bug was found via static analysis + [c39538f0c59b] + + * source/common/bitstream.cpp: + bitstream: add paren to avoid ambiguous precedence in X265_CHECK + [2599fd87b72e] + + * source/Lib/TLibCommon/TComPicSym.cpp, source/common/frame.cpp, + source/encoder/api.cpp, source/encoder/encoder.cpp: + nits: do not check for NULL from new operations + + By the C++ spec, new is incapable of returning NULL. If an + allocation failure actually occurs, an exception is issued (which we + do not catch) Long term, all of these new operations need to be + replaced by malloc and explicit initialization and destruction. In + the short term, these return value checks are redundant. + [6e450860475a] + + * source/encoder/bitcost.h: + bitcost: use enums for special constants rather than static const + ints + + enums require no storage + [3fd2d7acb6bb] + + * source/encoder/motion.cpp: + motion: avoid extra iterations when no subpel motion found + + subsequent iterations would have also returned zero, which would be + pointless. this is an adaption of a patch by Sheva Xu. 
+ [2c1d4c7d85ba] + +2014-09-22 Deepthi Nandakumar + + * source/encoder/search.cpp, source/encoder/search.h: + search: clean xRecurIntraCodingQT + [d1ffc125f0a3] + + * source/encoder/analysis.cpp: + analysis: nits + [39d0ba6012d5] + + * source/encoder/analysis.cpp, source/encoder/search.cpp, + source/encoder/search.h: + psy-rd: fix bug in chroma psyEnergy for intra 4x4 + + Also add TODO, for all psyCost calculations + [d1c2b82de4db] + +2014-09-21 Deepthi Nandakumar + + * source/encoder/analysis.cpp, source/encoder/analysis.h: + analysis: remove CheckBestMode from CheckIntra + [817abe294c8b] + + * source/encoder/analysis.cpp, source/encoder/analysis.h: + analysis: remove redundant variables, cleanup variable names + [6334cc645407] + +2014-09-20 Steve Borho + + * source/common/param.cpp: + param: do not allow VBV without WPP + + VBV row restarts cannot function correctly without WPP (per-row + CABAC starts) + [c8f53398f8ce] + +2014-09-18 Gopu Govindaswamy + + * source/encoder/search.cpp: + search: simplify and remove redundant variables in + getBestIntraModeChroma + [9b9986cc084b] + + * source/encoder/search.cpp: + search: remove redundant local variables in + encodeResAndCalcRdSkipCU + [5c067b643591] + + * source/encoder/search.cpp: + search: cleanup and remove redundant variable in checkintra + [7c1e793722f9] + +2014-09-19 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/common/intrapred.cpp, source/common/primitives.h, + source/common/x86/asm-primitives.cpp, source/common/x86/intrapred.h, + source/encoder/analysis.cpp, source/encoder/predict.cpp, + source/encoder/search.cpp, source/encoder/slicetype.cpp, + source/test/intrapredharness.cpp, source/test/intrapredharness.h: + primitives: intra_pred[4][35] => intra_pred[35][4] (avoid *35) + [da61cf406f16] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComPicYuv.cpp, + 
source/Lib/TLibCommon/TComPicYuv.h, source/common/deblock.cpp, + source/common/frame.cpp, source/common/frame.h, + source/encoder/analysis.cpp, source/encoder/analysis.h, + source/encoder/encoder.cpp, source/encoder/encoder.h, + source/encoder/entropy.cpp, source/encoder/entropy.h, + source/encoder/motion.h, source/encoder/predict.cpp, + source/encoder/predict.h, source/encoder/search.cpp: + inline simple functions + [c07038ca0e07] + +2014-09-17 Steve Borho + + * source/test/testbench.cpp: + testbench: allocate test harnesses on heap, for better valgrind + coverage + [fa2f1aa1456e] + +2014-09-18 Praveen Tiwari + + * source/test/mbdstharness.cpp: + denoiseDct: align performance data while reporting speedup + [4680ab4f92b8] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm: + copy_cnt_32: avx2 asm code, improved 1521.17 cycles -> 934.46 cycles + [6908388bf26f] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm, source/common/x86/const-a.asm: + copy_cnt_16: avx2 asm code, improved 514.32 cycles -> 313.66 cycles + [9b672a7b3ea9] + + * source/common/x86/asm-primitives.cpp, source/common/x86/dct8.asm: + denoise_dct: avx2 asm code + [e83cc4a15dc9] + + * source/common/x86/asm-primitives.cpp, source/common/x86/dct8.asm, + source/common/x86/dct8.h: + denoise_dct asm code: SSE version + [d6759701fdd7] + +2014-09-18 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp, source/common/x86/dct8.asm, + source/common/x86/dct8.h: + asm: avx2 assembly code for idct16x16 + [7e82d0abf6fb] + +2014-09-18 Aarthi Thirumalai + + * source/encoder/ratecontrol.cpp: + rc: improvements for cbr + [25dde1ffab66] + +2014-09-17 Praveen Tiwari + + * source/common/x86/dct8.h: + denoiseDct: nit unused asm function declarations + [54ad38a84a69] + + * source/common/x86/asm-primitives.cpp, source/common/x86/dct8.asm: + denoiseDct asm code: nit faulty code, need a new SSE version + [55a50a362def] + + * source/test/mbdstharness.cpp: + 
denoiseDct unit test code: fixed bound value problem + [b162185198fe] + +2014-09-11 Praveen Tiwari + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm: + copy_cnt_4 avx2 asm code: nit, same speedup by sse version + [123db3a255a7] + +2014-09-17 Aarthi Thirumalai + + * source/encoder/analysis.cpp: + rc: fix bugs in using boundary condition for cu while encoding each + frame. + + fixes the binary mismatch in 2 pass completely. + [4e17a5c3ed64] + +2014-09-17 Steve Borho + + * source/encoder/frameencoder.cpp: + frameencoder: fix VBV row resets when SAO is disabled + + When SAO is disabled, the row bitstream is generated as CTU analysis + progresses. We must reset the row bitstream after a restart to avoid + coding errors. The CAABAC state is already reset correctly when CTU + col 0 is re-coded. + [86686bd153db] + +2014-09-16 Sagar Kotecha + + * source/common/param.cpp, source/common/param.h, source/x265.cpp: + add fanout validation module to check param compatibility + [199e8f2e0d54] + +2014-09-16 Gopu Govindaswamy + + * source/encoder/api.cpp: + api: do not reuse the analysisData buffer for more then one picture, + set it NULL + [d71d363c0dbb] + +2014-09-16 Santhoshini Sekar + + * source/encoder/analysis.cpp, source/encoder/entropy.cpp, + source/encoder/entropy.h, source/encoder/frameencoder.cpp: + analysis: add CU specific details to encodeCU() + [06bac60ee4cf] + +2014-09-16 Steve Borho + + * source/encoder/analysis.cpp: + analysis: nits + [b276d567d771] + +2014-09-16 Gopu Govindaswamy + + * source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/encoder/analysis.cpp, source/encoder/analysis.h, + source/encoder/search.cpp, source/encoder/search.h: + analysis: intra picture estimation (mode and split + decision)information sharing + + when --analysis-mode=save - the encoder runs a full encode and dump + the best split and mode decisions into x265_analysis.dat(default + file name if file name is not provided) file when 
--analysis- + mode=load - the encoder reads the best split and mode decisions from + x265_analysis.dat and bypass the actual split and mode decisions, + and therefore perform a much faster encode + [7784ad03d6d4] + +2014-09-16 Praveen Tiwari + + * source/test/mbdstharness.cpp, source/test/mbdstharness.h, + source/test/testharness.h: + denoiseDct: test bench code + [63a78516630c] + +2014-09-15 Steve Borho + + * source/encoder/search.cpp: + search: save a few cycles + [a1fc4e9bba51] + +2014-09-15 Aarthi Thirumalai + + * source/encoder/encoder.cpp, source/encoder/ratecontrol.cpp: + rc: fixes for 2 pass + vbv to calculate frameSizePlanned accurately. + [7c1aba99f40d] + +2014-09-15 Steve Borho + + * doc/reST/presets.rst, source/common/param.cpp: + param: preset tuning changes + + 1. disable SAO in superfast + + Recent changes have made --no-sao substantially faster than SAO, + which has made ultrafast preset much much faster than superfast. By + disabling SAO in superfast, it is now roughly half-way between + ultrafast and veryfast again. + + 2. Enable weighted prediction for B slices in slower, veryslow, and + placebo + + Weighted prediction for B can sometimes be beneficial, so turn it on + for slower encodes. 
+ [1de67321275e] + + * doc/reST/threading.rst: + doc: describe performance impact of SAO + [dff0cd55b520] + + * doc/reST/threading.rst: + doc: fix typo and nit in threading page + [76240da72c38] + +2014-09-12 Satoshi Nakagawa + + * source/common/common.h, source/common/x86/loopfilter.asm, + source/encoder/entropy.cpp, source/encoder/sao.cpp, + source/encoder/sao.h: + sao: some cleanups + [098a00de4a72] + +2014-09-15 Steve Borho + + * source/Lib/TLibCommon/CommonDef.h, source/encoder/search.h: + search: header cleanups, no functional change + [db063839c8fe] + +2014-09-10 Steve Borho + + * source/encoder/search.cpp, source/encoder/search.h: + search: measure RDO of intra modes within 12% of least cost [CHANGES + OUTPUTS] + + This version adaps the number of RD measured modes by param.rdLevel + (aka preset) and by depth. This gives a non-trivial speedup to the + very fast presets which use frequent keyframes and helps improve + compression in slower presets. + + all presets use this function to encode I slices, so every encode is + affected. + + Previous behavior: RD measure top N least sa8d cost intra modes and + all most probable modes where N was depth-based: intraModeNumFast[] + = { 8, 8, 3, 3, 3 }; // 4x4, 8x8, etc + + New behavior: RD measure up to N modes that are within 12% of best + sa8d cost or are most probable. where N if a function of rd-level + and depth + + The new behavior may measure fewer modes than before may skip some + most-probable modes if there are plenty of other modes which are + near the best cost. Since mode signal cost is included already, this + seems ok. + + The general idea is that if 1-2 modes have much better sa8d cost + than all the others, then we are likely wasting our time RD + measuring 8-11 modes. We're betting that sa8d cost is a somewhat + decent predictor of RD cost. + + Note that I initially tried without a limit (measure all within 12% + or MPM) but for some clips this was a horrible perf trade-off. 
In + some situations all the intra modes might measure close together + (flat source block) and we would end up measuring most or all of the + intra modes for very little gain. So this version re-introduces a + "top N candidate list" but does not bother trying to keep the list + sorted since it is small + [02353d20f051] + +2014-09-15 Steve Borho + + * source/encoder/search.cpp: + search: comment nits + [017ceb9d2b06] + + * source/encoder/ratecontrol.cpp: + Merge with stable + [70c836fef6d9] + +2014-09-15 Aarthi Thirumalai + + * source/encoder/ratecontrol.cpp: + rc: bug fix for 2 pass when bframes = 0. fixes Issue #77 + [e6a80fb007e8] + + * source/encoder/ratecontrol.cpp: + rc: check for changes in scenecut input between multiple passes. + + wpp/no-wpp doesn't affect slice type decisions. they can differ + between the passes in multipass encode. + [67ee212bbf78] + + * source/encoder/ratecontrol.cpp: + rc: bug fix for 2 pass when bframes = 0. fixes Issue #77 + [9107dc4a2632] + +2014-09-10 Ashok Kumar Mishra + + * source/encoder/analysis.cpp, source/encoder/search.cpp, + source/encoder/search.h: + Search: remove redundant encode coefficients in intra for + performance + [8972169f252d] + +2014-09-15 Murugan Vairavel + + * source/common/x86/dct8.asm: + asm: fix mismatch due to dct32 avx2 assembly code + [b5fb734517c0] + +2014-09-12 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp, source/common/x86/dct8.asm, + source/common/x86/dct8.h: + asm: avx2 assembly code for dct32x32 + [184e56afa951] + +2014-09-13 Deepthi Nandakumar + + * source/x265.h: + x265: add missing typedefs + [2fb61cc75152] + +2014-09-11 David T Yuen + + * source/x265.cpp: + Resolved gcc compiler error of mismatched type + [cd8fd0afd4e8] + +2014-09-12 Deepthi Nandakumar + + * source/common/x86/asm-primitives.cpp: + Merge with stable + [fda32ff40246] + + * source/common/x86/asm-primitives.cpp: + asm: disable buggy denoise primitives until the bugs are fixed + [d522e7662111] + +2014-09-11 Sagar 
Kotecha + + * doc/reST/cli.rst, source/common/common.h, source/common/param.cpp, + source/x265.cpp: + cli: add cli options analysis-mode and analysis-file + + analysis-mode: save|1 - Dump analysis buffers into file, load|2 - + read analysis buffers from the file analysis-file: Specify file name + used for either dumping or reading analysis data + [7e29b10982d2] + + * source/common/frame.cpp, source/common/frame.h, + source/encoder/analysis.cpp, source/encoder/encoder.cpp: + store analysis information in buffers + [b0d006337801] + + * doc/reST/api.rst, source/CMakeLists.txt, source/encoder/api.cpp, + source/x265.def.in, source/x265.h: + api: introduce methods to allocate and free analysis buffers + [baf07b965909] + + * source/x265.h: + api: add analysis data structures and param options + [e5a24e5ba46e] + +2014-09-11 Steve Borho + + * source/encoder/analysis.cpp: + analysis: minor comment and code cleanups, no behavior change + [3d6cc40ebbf7] + + * source/encoder/search.cpp, source/encoder/search.h: + search: remove x prefixes from ME helper functions + [e9e71ece1344] + + * source/encoder/dpb.cpp: + dpb: does not need to include frameencoder.h + [6bb0d3a25b08] + +2014-09-10 Steve Borho + + * source/encoder/analysis.h: + analysis: nit + [012f315d3eda] + + * source/encoder/analysis.h, source/encoder/encoder.cpp, + source/encoder/encoder.h, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h, source/encoder/search.cpp, + source/encoder/search.h: + search: don't pass top-level encoder to initSearch() + + removes three Encoder members that were only used to communicate + data to initSearch() + [0bc83ba57dad] + + * source/encoder/search.cpp: + search: store rd costs in first pass through intra modes + + no behavior change, this is prep work for further refactors + [ff2fd6923f1a] + + * source/encoder/dpb.cpp, source/encoder/dpb.h, + source/encoder/framefilter.h: + cleanup header dependencies; dpb should not need encoder.h + [7eddb265f512] + +2014-09-11 Deepthi 
Nandakumar + + * source/common/x86/asm-primitives.cpp: + asm: enable copy_cnt8 + [a522a2549cac] + +2014-09-10 Yuvaraj Venkatesh + + * source/common/x86/asm-primitives.cpp, source/common/x86/dct8.asm, + source/common/x86/dct8.h: + asm: avx2 assembly code for dct16 + [18d0461eb4d0] + +2014-09-10 Praveen Tiwari + + * source/common/x86/blockcopy8.asm: + copy_cnt_8, AVX2 asm code as per new interface, performance improved + from 5.13x to 7.59x on HASWELL-I5 + [0d27befcc874] + +2014-09-10 Steve Borho + + * source/encoder/encoder.cpp: + encoder: nits + [81c9f704ae38] + +2014-09-09 Steve Borho + + * source/encoder/search.cpp: + search: re-enable chroma tskip + + this passed regression testing + [bdd477050097] + +2014-09-09 Aarthi Thirumalai + + * source/encoder/encoder.cpp, source/encoder/ratecontrol.cpp: + rc: use m_frameDuration instead of rce->frameDuration to derive + complexity for each frame in 2nd pass. + + don't store and use frameDuration from stats file per frame - losing + precision. + [139b41632c64] + +2014-09-10 Deepthi Nandakumar + + * source/common/param.cpp, source/x265.h: + param: apply missing default values to param, mostly zero and + reorder + [3fc141aa74b5] + +2014-09-09 Steve Borho + + * source/encoder/search.cpp, source/encoder/search.h: + search: reverse meaning of bCheckFirst to bAllowRQTSplit + + I find this much more readable + [408e2e6f0f70] + + * source/encoder/analysis.cpp, source/encoder/search.cpp: + search: !a ? b : c; -> a ? 
c : b; + [f45f3ed38951] + + * source/encoder/analysis.cpp, source/encoder/search.cpp, + source/encoder/search.h: + search: rename some helper functions without x prefixes + [719284d910b6] + + * source/common/x86/blockcopy8.asm: + backout 0dc2cbc36ee5 to 331ef5121676 + [491e74c58e51] + +2014-09-09 Ashok Kumar Mishra + + * source/encoder/analysis.cpp, source/encoder/analysis.h: + analysis: modified compressInterCU_rd0_4() with CU-specific + information + [2d9eb8cebb71] + +2014-09-09 Steve Borho + + * source/encoder/search.cpp, source/encoder/search.h: + search: fix camel case of residualQTIntraChroma + [d85792b9f373] + + * source/encoder/search.cpp: + search: don't pass a zeroDistortion pointer if you don't want the + answer + [84b1d287333f] + + * source/encoder/search.cpp, source/encoder/search.h: + search: return distortion from xEstimateResidualQT + [7d8e4935c1ca] + + * source/encoder/search.cpp, source/encoder/search.h: + search: pass depthRange uniformly as uint32_t depthRange[2] + + effectively the same as uint32_t but compilers and debuggers can + often do more with the length info. 
plus it just makes the code more + readable + [cead9fe7ff30] + + * source/encoder/analysis.cpp, source/encoder/search.cpp, + source/encoder/search.h: + search: return distortion from xRecurIntraCodingQT + [68ac5ca5d676] + + * source/encoder/search.cpp, source/encoder/search.h: + search: return distortion from xIntraCodingChromaBlk, do not pass by + ref + [62f6924be843] + + * source/encoder/search.cpp, source/encoder/search.h: + search: return distortion from xRecurIntraChromaCodingQT, do not + pass by ref + [b0a018562d29] + + * source/encoder/search.cpp, source/encoder/search.h: + search: return distortion from xIntraCodingLumaBlk, do not pass by + reference + [a7f4f750e9d4] + +2014-09-09 Praveen Tiwari + + * source/common/x86/blockcopy8.asm: + copy_cnt_8 AVX2 asm code, as per new interface + [331ef5121676] + + * source/common/x86/blockcopy8.asm: + copy_cnt_4: faster AVX2 code + [f7f8206a70bd] + + * source/common/x86/blockcopy8.asm: + copy_cnt_4: combine mova and paddb to reduce code size, same speedup + [5edcbcbb338f] + + * source/common/x86/blockcopy8.asm: + copy_cnt_4: enable fast non zero coefficient count path + [0dc2cbc36ee5] + +2014-09-09 Steve Borho + + * source/encoder/frameencoder.cpp: + frameencoder: use simple shifts to scale 2-pass CU type counters + + the cu type counters are summed at the end and turned into + percentages, so it doesn't matter what base unit is used, only that + each depth has 4x the value as depth+1 + [ebd5a0cac758] + + * source/encoder/frameencoder.cpp: + frameencoder: use x265_emms() prior to double QP clipping for VBV + [a414ca1c9067] + +2014-09-09 Deepthi Nandakumar + + * source/encoder/search.cpp: + search: remove warning from MS compiler + [44cb33846e0e] + +2014-09-08 Min Chen + + * source/common/dct.cpp, source/common/x86/asm-primitives.cpp, + source/common/x86/const-a.asm, source/common/x86/pixel-util.h, + source/common/x86/pixel-util8.asm: + asm: avx2 version of quant, improve 16.6k cycles -> 8.4k cycles + [c4fb044c901b] + 
+ * source/common/x86/pixel-util8.asm: + asm: improve quant by replace variant shift to fixed shift, 19k + cycles -> 16.6k cycles + [277c1e05c247] + + * source/common/dct.cpp, source/test/mbdstharness.cpp: + testbench(quant): the qBits value must be more than or equal to 8 + [5dbf9e8f4028] + + * source/test/mbdstharness.cpp: + testbench(quant): the Round value must be less than (2 ^ qbits) + [53e0969c605f] + +2014-09-08 Steve Borho + + * source/encoder/search.cpp: + search: prune more unnecessary work from estIntraPredQT() + [b5f81a839403] + + * source/encoder/search.cpp: + search: remove some redundant work from estIntraPredQT + [033299d2bd00] + + * source/encoder/search.cpp, source/encoder/search.h: + search: remove m_qtTempTrIdx + [00b86119f0ad] + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/encoder/CMakeLists.txt, + source/encoder/analysis.h, source/encoder/search.cpp, + source/encoder/search.h: + pull search class into encoder/ (TLibEncoder is no more) + + TEncSearch -> Search use consistent comment style and argument lists + make destructor no longer virtual (no more vtable) moving + StatisticLog into x265 namespace came for free + [f2688d840261] + + * source/encoder/api.cpp: + api: remove include of frameencoder + [796cc966e2fe] + + * source/Lib/TLibCommon/TComMotionInfo.h, source/encoder/analysis.h: + nits + [f2da9f3db824] + +2014-09-08 Ashok Kumar Mishra + + * source/encoder/analysis.cpp, source/encoder/analysis.h: + analysis: modified compressInterCU_rd5_6() with CU-specific + information + [27581134f442] + + * source/encoder/analysis.cpp, source/encoder/analysis.h: + Analysis: compressIntraCU clean up + [5b377a411463] + +2014-09-08 Steve Borho + + * source/encoder/framefilter.cpp, source/encoder/sao.cpp, + source/encoder/sao.h: + sao: move frame/slice initialization into SAO::startSlice + [80b2e91156d3] + + * source/encoder/sao.cpp: + sao: minor cleanups, no behavior change + [b9ed1dcacf9e] + + * 
source/encoder/frameencoder.cpp: + frameencoder: avoid another call to resetEntropy(), they are + expensive + [cfe197e3044d] + + * source/encoder/frameencoder.cpp: + frameencoder: combine some conditional expressions + [cb67f6f65577] + + * source/encoder/frameencoder.h, source/encoder/ratecontrol.cpp, + source/encoder/ratecontrol.h: + rc: move FrameStats to ratecontrol.h + + rate control shouldn't need to include frameencoder.h + [89e682182a7a] + + * source/encoder/frameencoder.cpp, source/encoder/frameencoder.h, + source/encoder/ratecontrol.cpp: + frameencoder: rename percent fields for clarity + [9581a45d4344] + + * source/encoder/frameencoder.cpp: + frameencoder: do more CU stat math as integer + [406d92c860d5] + + * source/encoder/entropy.cpp, source/encoder/entropy.h, + source/encoder/frameencoder.cpp, source/encoder/frameencoder.h: + frameencoder: merge more of encodeSlice() into processCU + + this commit fixes no-WPP after the previous change. the per-row or + per-frame (+- WPP) bistreams are flushed as they are finished (and + cache hot) and the per CU stats are summed per row and then + summarized all in one place. + [60289c638600] + +2014-09-05 Steve Borho + + * source/encoder/frameencoder.cpp: + frameencoder: remove second encodeCU() pass over CTUs when SAO is + disabled + + This is a performance optimization, it allows the encoder to + generate the final bitstream of each CTU as it is compressed and + cache hot. + + When SAO is enabled, SAO analysis must be performed and coded at the + start of the CTU but SAO analysis currently requires surrounding + CTUs to be encoded making the second pass unavoidable. + + Note that this commit changes the way non-WPP encodes are performed, + for the better. Now it always uses row 0's CI_CURR_BEST entropy + coder instance to communicate entropy state between all CTUs and + between rows. 
This better models how encodeSlice() works and makes + RDO work better + [a117564df3ef] + +2014-09-08 Steve Borho + + * source/encoder/frameencoder.cpp: + frameencoder: remove redundant clear of frame stats + + they were being zero'd in the constructor, init(), and in + compressCTURows. technically only the last is truly necessary, but + I'm leaving the memset in the constructor. + [a7465d789c64] + + * source/encoder/entropy.h, source/encoder/frameencoder.cpp: + nits + [a29aa966336e] + + * source/common/x86/x86inc.asm: + Merge with correct x86inc.asm patch + [c55d69561948] + +2014-09-05 Min Chen + + * source/common/x86/x86inc.asm: + x86inc.asm: fix vpbroadcastd bug on Mac platform + [51930084e148] + +2014-09-08 Deepthi Nandakumar + + * source/encoder/entropy.cpp, source/encoder/entropy.h, + source/encoder/frameencoder.cpp: + entropy: change top-level encode to encodeCTU + [de5614144bce] + +2014-09-07 Satoshi Nakagawa + + * source/encoder/sao.cpp: + fix sao + [8cbfec8d6b4d] + + * source/common/dct.cpp, source/encoder/analysis.cpp: + fix CHECKED_BUILD + [845e82c5d607] + +2014-09-05 Min Chen + + * source/common/x86/asm-primitives.cpp, source/common/x86/const-a.asm, + source/common/x86/pixel-util.h, source/common/x86/pixel-util8.asm: + asm: AVX2 version of dequant_normal, improve 9.3k Cycles -> 4.2k + Cycles + [ed4c9acafc11] + + * source/common/dct.cpp, source/common/x86/pixel-util8.asm: + asm: reduce number of movd in dequant_normal + [bc9025648270] + + * source/common/x86/x86inc.asm: + x86inc.asm: fix vpbroadcastd bug on Mac platform + [27364e9f97e4] + +2014-09-05 Steve Borho + + * source/encoder/frameencoder.cpp: + frameencoder: remove unnecessary call to resetBits(), improve + comment + [795878af3973] + + * source/encoder/frameencoder.h: + frameencoder: cleanup CTURow::init + + According to my understanding, only rdEntropyCoders[0][CI_CURR_BEST] + really needs to be initialized, but this changes outputs if I do + this. 
It tells me there is likely a bug in the entropy state + management + [b05e3141c766] + + * source/encoder/encoder.cpp, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h: + frameencoder: CTURow class -> struct, remove m_ prefixes + + Still no output changes in this patch series + [5d9433930735] + + * source/encoder/frameencoder.cpp, source/encoder/frameencoder.h, + source/encoder/framefilter.cpp: + tld: remove m_ prefixes from struct members + [85b93c9b6f0d] + + * source/encoder/CMakeLists.txt, source/encoder/cturow.cpp, + source/encoder/cturow.h, source/encoder/encoder.cpp, + source/encoder/frameencoder.cpp, source/encoder/frameencoder.h: + cturow: merge CTURow and TLD objects into frameencoder.h + [1b69af135b48] + + * source/encoder/frameencoder.cpp: + frameencoder: remove unused variable, nit + [6d84a175723a] + + * source/encoder/cturow.cpp, source/encoder/cturow.h, + source/encoder/frameencoder.cpp: + cturow: inline processCU() + + The function is no longer complicated enough to justify having a + separate function, given that the arguments mainly dealt with the + wavefront cabac propogation. The CTU class is about to be further + simplified. + [0c97aad4038e] + + * source/CMakeLists.txt: + cmake: enable MACOSX_RPATH on Mac shared libraries + + http://www.cmake.org/cmake/help/v3.0/prop_tgt/MACOSX_RPATH.html + + Setting the policy CMP0042 to NEW makes MACOSX_RPATH=1 the default, + but we go ahead and set it manually anyway so cmake versions older + than 3.1 behave the same way + [01ba626b850d] + +2014-09-05 Praveen Tiwari + + * source/common/x86/blockcopy8.asm: + copy_cnt 4x4 AVX2 asm code, as per new interface + [f30c13d8143a] + +2014-09-04 Min Chen + + * source/common/x86/x86util.asm: + asm: reenable IACA support, it remove by 'inappropriate + instruction...' 
patch + [d5034e68aa40] + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel- + util.h, source/common/x86/pixel-util8.asm: + asm: avx2 version of nquant(), improve 9.8k cycles -> 5.3k cycles + [0f1f5f8a4981] + + * source/common/dct.cpp, source/common/x86/pixel-util8.asm: + asm: optimize nquant by PSIGND, improve 11k cycles -> 9.8k cycles + [dd04a9ec73a8] + +2014-09-05 Praveen Tiwari + + * source/common/x86/pixel-util8.asm: + count_nonzero asm code, reduceded code size by combining mova and + packsswb + [fdfe5c83a2d7] + + * source/common/x86/blockcopy8.asm: + copy_cnt: nits + [7bb07b6e1b0f] + + * source/common/x86/blockcopy8.asm: + copy_cnt 4x4, eliminated move instructions, +1x improvement + [d5efb8daf975] + +2014-09-05 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComDataCU.cpp: + fix getQuadtreeTULog2MinSizeInCU() + [efd6178a07f5] + +2014-09-04 Min Chen + + * source/common/x86/asm-primitives.cpp: + asm: enable SSE2 version of pixel_ssd_ss[] + [fd1e285675eb] + + * source/common/x86/ssd-a.asm: + asm: fix output mistake in pixel_ssd_ss_4xN + [7909696bf148] + +2014-09-05 Satoshi Nakagawa + + * source/Lib/TLibCommon/ContextTables.h: + fix cbf context + [5cd9f797d6f4] + +2014-09-04 Min Chen + + * source/common/x86/ssd-a.asm: + asm: replace ssse3 instruction in pixel_ssd_ss_*_sse2 + [93db2f53fe57] + +2014-09-04 Steve Borho + + * source/common/x86/pixel-a.asm: + asm: fix SSSE3 in SSE2 build warning in unused primitive + + Bug fix back-ported from x264. 
The function was not connected to any + primitive pointer in x264 or x265, so this is really just quieting + the warning + [1833be32ff21] + + * source/common/quant.cpp: + quant: fix check macro + [1f07a2a99eef] + + * source/common/quant.cpp: + quant: nits + [7ae1ffa141c1] + +2014-09-02 Praveen Tiwari + + * source/common/common.h, source/common/dct.cpp, + source/common/primitives.h, source/common/quant.cpp, + source/common/quant.h, source/encoder/entropy.cpp: + quant path cleanup + [735ba376211e] + + * source/common/dct.cpp, source/common/primitives.h, + source/common/quant.cpp, source/common/x86/pixel-util.h, + source/common/x86/pixel-util8.asm, source/test/mbdstharness.cpp: + quant_c optimization, downscaling qCoef from int32_t* to int16_t* + [43e127b048da] + + * source/common/quant.cpp: + optimize cvt32to16_shl by replacing copy_shl + [7a5073a95658] + + * source/common/pixel.cpp, source/common/primitives.h, + source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm, source/common/x86/blockcopy8.h, + source/test/pixelharness.cpp, source/test/pixelharness.h: + added copy_shl primitive + [811d242433eb] + + * source/common/pixel.cpp, source/common/primitives.h, + source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm, source/common/x86/blockcopy8.h, + source/test/pixelharness.cpp, source/test/pixelharness.h: + added copy_shr primitive + [ec631987a40d] + + * source/common/x86/asm-primitives.cpp: + temporarily disable avx2 version of copy_cnt primitive, need to + update as per new interface + [6d2a546f52ce] + + * source/common/dct.cpp, source/common/primitives.h, + source/common/quant.cpp, source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm, source/common/x86/blockcopy8.h, + source/test/pixelharness.cpp, source/test/pixelharness.h: + conv16to32_count renamed to copy_count as per new interface + [75da68b8a8f1] + + * source/common/dct.cpp, source/common/primitives.h, + source/common/quant.cpp, 
source/common/x86/blockcopy8.h, + source/test/pixelharness.cpp: + conv16to32_count C interface modification, downscaling coeff from + int32_t* to int16_t* + [91e5a6476b1b] + +2014-09-01 Praveen Tiwari + + * source/common/dct.cpp, source/common/x86/blockcopy8.asm, + source/common/x86/const-a.asm, source/test/pixelharness.cpp: + cvt16to32_cnt optimization + [55039043fa39] + +2014-08-25 Praveen Tiwari + + * source/common/quant.cpp, source/common/quant.h: + signBitHidingHDQ optimization, downscaling coeff from int32_t* to + int16_t* + [aea5cf3ff1b2] + + * source/common/quant.cpp, source/common/quant.h: + rdoQuant optimization, downscaling dstCoeff fron int32_t* to + int16_t* + [dc9652f407f9] + +2014-09-02 Praveen Tiwari + + * source/common/dct.cpp, source/common/primitives.h, + source/common/quant.cpp, source/common/x86/pixel-util.h, + source/common/x86/pixel-util8.asm, source/test/mbdstharness.cpp: + nquant optimization, downscaling qCoef from int32_t* to int16_t* + [422184fd72a7] + +2014-08-25 Praveen Tiwari + + * source/common/quant.cpp: + quant.cpp, cleaned redundant code + [edc1675f4414] + +2014-09-02 Praveen Tiwari + + * source/common/dct.cpp, source/common/primitives.h, + source/common/quant.cpp, source/common/vec/dct-sse41.cpp, + source/test/mbdstharness.cpp: + dequant_scaling optimization, downscaling quantCoef from int32_t* to + int16_t* + [e85f4a9235e3] + + * source/common/x86/pixel-util8.asm: + dequant_normal asm code optimization as per new interface + [ac73ca63b9f3] + + * source/common/dct.cpp, source/common/primitives.h, + source/common/quant.cpp, source/common/x86/pixel-util.h, + source/test/mbdstharness.cpp: + dequant_normal optimization, downscaling quantCoef from int32_t* to + int16_t* + [ebfd59fb6351] + +2014-08-25 Praveen Tiwari + + * source/common/dct.cpp, source/common/primitives.h, + source/common/quant.cpp, source/common/x86/pixel-util.h, + source/common/x86/pixel-util8.asm, source/encoder/entropy.cpp, + source/test/mbdstharness.cpp: + 
count_nonzero primitive optimization, downscaling quantCoef from + int32_t* to int16_t* + [dc7c61a01000] + +2014-09-04 Anton Mitrofanov + + * source/common/x86/x86inc.asm: + x86asm: warn when inappropriate instruction used in function with + specified cpuflags + [cff4428396ac] + +2014-09-02 Ashok Kumar Mishra + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/encoder/analysis.cpp, + source/encoder/entropy.cpp, source/encoder/entropy.h: + TComDataCU: Reduced repeated function call to calculate depth range + [73450cfe71bd] + +2014-09-03 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuv.h: + change index of m_buOffsetY[] from raster to zscan + [95eaa1854b78] + +2014-09-03 David T Yuen + + * source/Lib/TLibCommon/TComDataCU.cpp: + Cleaned up TComDataCU::getQuadtreeTULog2MinSizeInCU for clarity and + a bit of performance + [e1c4b45b9749] + +2014-09-01 Dnyaneshwar G + + * source/common/x86/asm-primitives.cpp, source/common/x86/dct8.asm, + source/common/x86/dct8.h: + asm: avx2 asm code for dct4 + [1e4dfa1b1a04] + +2014-09-04 Deepthi Nandakumar + + * Merge + [b686cb0abd71] + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/entropy.cpp, + source/encoder/entropy.h: + entropy: cleanup codeQtRootCbf + [7d81a06a1281] + + * source/Lib/TLibCommon/TComDataCU.h, source/common/quant.cpp: + quant: use table for ctxCbf + [99754bc87944] + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/analysis.cpp, + source/encoder/entropy.cpp, source/encoder/entropy.h: + entropy: cleanup codePredMode + [0c50af24fdeb] + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/analysis.cpp, + source/encoder/entropy.cpp, source/encoder/entropy.h: + entropy: cleanup codeCUTransQuantBypassFlag + [7b364065ee45] + +2014-09-01 Ashok Kumar Mishra + + * source/Lib/TLibCommon/ContextTables.h, + 
source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/entropy.cpp, + source/encoder/entropy.h: + Entropy: Replaced getCtxQtCbf() with table + [8e4a0aeed2ca] + +2014-09-04 Steve Borho + + * Merge with stable + [a184fed3e30c] + + * source/common/version.cpp: + version: detect OpenBSD for version string (closes #76) + + Fix from Brad Smith + [821c2eef4d52] + + * source/CMakeLists.txt: + cmake: fix BSD link, only link with librt if it exists (closes #75) + [139d6b2a1b19] + +2014-09-03 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp: + asm: enable 16bpp primitives of cvt32to16 and cvt16to32 for all + block sizes + [d122845b9cc8] + +2014-09-03 Min Chen + + * source/test/mbdstharness.cpp: + testbench(nquant): the Round value must be less than (2 ^ qbits) + [cf9245ee6a72] + +2014-09-03 David T Yuen + + * source/common/common.h, source/encoder/analysis.cpp: + Resolve gcc warnings + + * more parenthesis for macro + * changed signed to unsigned int + [0b13c852f029] + +2014-09-03 Ashok Kumar Mishra + + * source/encoder/analysis.cpp: + fix: hash/binary mismatch for new CU structure holds CU-specific + info + [00c381bf6158] + +2014-08-28 Ashok Kumar Mishra + + * source/Lib/TLibCommon/TComDataCU.h, source/common/common.h, + source/encoder/analysis.cpp, source/encoder/analysis.h: + analysis: CU structure now holds CU-specific information, + + Member fields include location inside CTU, boundary flags, offsets + from CTU origin. This will help replace the soon-to-be-gone initCU + and initSubCU functions. + [62c4779fb0bb] + +2014-09-01 Deepthi Nandakumar + + * source/encoder/ratecontrol.cpp: + Merge with stable + [c5624effb73c] + + * source/encoder/ratecontrol.cpp: + Backed out changeset: 35b2d9e774c8 + + This patch disrupted CBR runs where we were unable to hit the target + bitrate. It is logically correct, but likely incomplete. 
+ [dde992b96623] + +2014-08-30 Steve Borho + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/common/x86/pixel- + util8.asm, source/encoder/encoder.cpp: + Merge with stable + [44b95661db56] + +2014-08-25 Steve Borho + + * source/encoder/encoder.cpp: + encoder: re-enable --cu-lossless + [269ba0a6ce8c] + +2014-08-25 Min Chen + + * source/Lib/TLibEncoder/TEncSearch.cpp: + search: fix decoder intra crash with --cu-lossless + [572988a922a1] + +2014-08-26 Deepthi Nandakumar + + * source/Lib/TLibEncoder/TEncSearch.cpp: + cu-lossless: fix inter hash mistake + + The CU needs to be re-encoded if lossless is chosen as the best + mode. + [5a9e01a195a0] + +2014-08-30 Satoshi Nakagawa + + * source/common/dct.cpp, source/common/x86/pixel-util8.asm: + asm: fix dequant_normal + [3c309e5d9c8f] + +2014-08-29 Steve Borho + + * source/encoder/sao.cpp: + sao: fix signed loop bounds bug + [4e2d9ac6d489] + +2014-08-25 Steve Borho + + * source/encoder/encoder.cpp: + encoder: re-enable --cu-lossless + [c21ddfe6a6d9] + +2014-08-26 Deepthi Nandakumar + + * source/Lib/TLibEncoder/TEncSearch.cpp: + cu-lossless: fix inter hash mistake + + The CU needs to be re-encoded if lossless is chosen as the best + mode. 
+ [07291dff4048] + +2014-08-27 Steve Borho + + * source/test/mbdstharness.cpp: + mbdst: cleanup dequant test + [b18ae1fe86b8] + + * source/test/mbdstharness.h: + mbdst: make buffers nice even size + [2c70fa36659e] + + * source/test/ipfilterharness.cpp, source/test/testharness.h: + test: fix MSVC warnings + [dd540cb2c6b6] + + * source/test/mbdstharness.h: + mbdst: align buffers to 32byte boundary + [6f620106312e] + + * source/test/mbdstharness.cpp, source/test/mbdstharness.h: + mbdst: fix memset lengths, change MEM_CMP_SIZE to MAX_TU_SIZE + [bebafa8cf718] + + * source/test/mbdstharness.cpp, source/test/mbdstharness.h: + mbdst: reshuffle varnames + [58323bef4ae1] + + * source/test/mbdstharness.cpp, source/test/mbdstharness.h: + mbdst: remove redundant mintbuf buffers + [1d3ba60370be] + + * source/test/mbdstharness.cpp, source/test/mbdstharness.h: + mbdst: remove unused mbuf4, rename other short output buffers + [e807dcd89eae] + + * source/test/mbdstharness.cpp, source/test/mbdstharness.h: + mbdstharness: remove completely unused arrays + [8a20115abc04] + + * source/test/pixelharness.cpp, source/test/pixelharness.h: + pixel: use fixed buffer allocations + [cc0887e5a958] + + * source/test/ipfilterharness.cpp, source/test/ipfilterharness.h: + ipfilter: use fixed buffer allocations + [d626070bc8c6] + + * source/test/testbench.cpp: + testbench: move test benches from stack allocations to global + [8a247934f2d1] + + * source/test/intrapredharness.cpp, source/test/intrapredharness.h: + intra: use fixed buffer allocations + [01f3a8e5fcc2] + + * source/encoder/ratecontrol.cpp: + Merge with stable + [5426270aee62] + + * doc/reST/cli.rst: + docs: fix a typo + [ce614aebb395] + + * source/test/mbdstharness.cpp: + mbdst: add missing test of dequant_scaling + [d471b6de26c9] + + * source/test/mbdstharness.cpp, source/test/mbdstharness.h: + mbdst: use fixed buffer allocations, cleanup some tests + [b5642a14c51f] + +2014-08-27 Ashok Kumar Mishra + + * 
source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: remove redundant function call + [5d246ebff000] + +2014-08-27 Aarthi Thirumalai + + * source/encoder/ratecontrol.cpp: + rc: fix bug in predicting B frame bits in vbv + [35b2d9e774c8] + +2014-08-27 Deepthi Nandakumar + + * source/encoder/ratecontrol.cpp: + Merge with stable + [77fe0cc583e8] + +2014-08-27 Aarthi Thirumalai + + * source/encoder/ratecontrol.cpp: + rc: fix bitrate accuracy for vbv in fast presets. + [a6c318bd69ce] + + * source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h: + rc: rename m_bframes to m_leadingBframes + [c34b059dff1c] + +2014-08-26 Steve Borho + + * source/Lib/TLibCommon/CommonDef.h, + source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/common/common.h, source/common/deblock.cpp, + source/common/param.cpp, source/encoder/frameencoder.cpp, + source/encoder/framefilter.cpp, source/encoder/ratecontrol.cpp: + common: rename QP range macros to be consistent with x264 + + I find QP_MAX_SPEC to be a lot more self-explanatory than MAX_QP + [32891b95f669] + + * source/Lib/TLibCommon/TypeDef.h: + types: remove generic TEXT_CHROMA enum, no longer used + [14fae9208078] + + * source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComPattern.h: + pattern: use isLuma instead of restricted TextType range + [45359413afe6] + +2014-08-25 Steve Borho + + * source/encoder/sao.cpp, source/encoder/sao.h: + sao: don't pass member variables to functions + [2d386372d543] + +2014-08-26 Steve Borho + + * source/encoder/frameencoder.cpp, source/encoder/framefilter.cpp, + source/encoder/framefilter.h: + framefilter: move SAO init logic into the frame filter + [78804e5e360c] + +2014-08-26 Min Chen + + * source/common/x86/x86inc.asm: + x86inc: Make INIT_CPUFLAGS support an arbitrary number of cpuflags + + Ported from Henrik Gramner's recent commit to x264 + [090480360cb9] + + * source/common/x86/pixel-util8.asm: + asm: Minor pixel_ssim_end4 improvements + + Reduce the number of 
vector registers used from 7 to 5. Eliminate + some moves in the AVX implementation. Avoid bypass delays for + transitioning between int and float domains. + + Ported from Henrik Gramner's recent commit to x264 + [391e1fbb92cf] + +2014-08-26 Satoshi Nakagawa + + * source/encoder/frameencoder.cpp: + fix m_initSliceContext (uninitialised m_sliceQp) + [863faab1a004] + +2014-08-25 Min Chen + + * source/Lib/TLibEncoder/TEncSearch.cpp: + search: fix decoder intra crash with --cu-lossless + [a0028e5b6177] + +2014-08-25 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibCommon/TComYuv.h, source/common/deblock.cpp, + source/common/param.cpp, source/common/shortyuv.h, + source/common/slice.cpp, source/common/slice.h, + source/encoder/analysis.cpp, source/encoder/encoder.cpp, + source/encoder/entropy.cpp, source/encoder/sao.cpp: + replace g_rasterToPelX[g_zscanToRaster[idx]] by g_zscanToPelX[idx] + [5acfb12ec5d1] + +2014-08-25 Steve Borho + + * source/common/quant.cpp: + Merge with stable + [44433a2d65dd] + +2014-08-25 Aarthi Thirumalai + + * source/x265.cpp: + rc: don't read slicetypes from qpfile in 2nd pass. + + slicetype has to be taken from the stats file from prev pass. + [5fe473327183] + + * source/encoder/ratecontrol.cpp: + rc: write I/i slice in stats file based on whether openGop is + enabled or not. 
+ [f0de8275ed4d] + +2014-08-23 Satoshi Nakagawa + + * source/common/quant.cpp: + fix lossless + [ad31cbb8c754] + +2014-08-23 Steve Borho + + * source/encoder/sao.cpp: + sao: nits + [e1f1e836e833] + +2014-08-22 Steve Borho + + * source/encoder/sao.h: + sao: remove unused m_qp member + [e95c628b90ab] + + * source/encoder/sao.cpp: + sao: nit + [a683fffa5245] + + * source/encoder/frameencoder.cpp, source/encoder/frameencoder.h, + source/encoder/framefilter.cpp, source/encoder/framefilter.h, + source/encoder/sao.cpp, source/encoder/sao.h: + sao: simplify entropy coder initialization [CHANGES OUTPUTS] + + Remove the hack from the HM that preserves fract_bits from the first + row to all rows; it never made any sense and we don't pretend to be + anywhere near close to bit accurate with the HM anymore. + + This commit removes the Entrppy coder in FrameFilter since it was + only used by SAO and moves it into the SAO class. The startSaoEnc() + method is removed entirely and another call to resetEntropy() (an + expensive method) is replaced with a load of the frame encoder's + m_initSliceContext + [f2fc8b1e1c3f] + + * source/encoder/sao.cpp: + sao: fix inf loop bounds + [6e6756f94b27] + + * source/encoder/framefilter.cpp, source/encoder/sao.cpp: + sao: fix MSVC warnings + [c9cf8ae5f7a0] + +2014-08-21 Steve Borho + + * source/encoder/sao.cpp: + sao: minor logic simplifications and cleanups + [bad70432b1d3] + + * source/encoder/sao.cpp, source/encoder/sao.h: + sao: use typedefs to simplify multi-dimensional mallocs + [3881d3af896f] + + * source/encoder/sao.cpp: + sao: sChroma should have been isChroma + [ff799bffbaef] + + * source/encoder/sao.cpp: + sao: handle malloc failures more cleanly + [e95ae26c6afa] + + * source/encoder/sao.cpp, source/encoder/sao.h: + sao: rename shared table to m_tableBo + [e02ad51211b6] + + * source/encoder/sao.cpp, source/encoder/sao.h: + sao: share m_lumaTableBo with chroma + [47ab66abd2f2] + + * source/encoder/framefilter.cpp, 
source/encoder/sao.cpp, + source/encoder/sao.h: + sao: use checked mallocs, disable SAO if any mallocs fail + [e93cf3c94bc4] + + * source/common/common.h: + common: define a CHECKED_MALLOC_ZERO method to auto-reset allocated + memory + + This is mostly useful when allocating buffers containing pointers, + to ensure the pointers are safe to free regardless of later malloc + failures + [36079a3dce67] + + * source/Lib/TLibCommon/TComPicYuv.h, source/encoder/sao.cpp, + source/encoder/sao.h: + sao: move getPicYuvAddr() to TComPicYuv and simplify + [eaf5eb7ae2d6] + + * source/encoder/sao.h: + sao: remove unused m_tmpYuv and an obsolete comment + [87c08d2aaf56] + + * source/encoder/sao.cpp, source/encoder/sao.h: + sao: luma and chroma can share a clip table + [50542271caba] + + * source/encoder/sao.cpp: + sao: unify signOf + [d70e2bac56b5] + + * source/encoder/sao.cpp, source/encoder/sao.h: + sao: use more compile-time values + [7fc8855656e2] + + * source/encoder/sao.cpp, source/encoder/sao.h: + sao: we don't support different bit depths between luma and chroma + [d36617a8512d] + + * source/encoder/sao.cpp, source/encoder/sao.h: + sao: remove vars which are dups of params + [5d67f4e4a351] + + * source/encoder/sao.cpp, source/encoder/sao.h: + sao: fix warnings + [02ec546246ad] + + * source/encoder/encoder.cpp: + Merge with stable + [2e17cb106b17] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + search: nit + [282c2c6f5ed1] + + * source/encoder/frameencoder.cpp, source/encoder/framefilter.cpp: + nits + [9cb0dfeac129] + + * source/encoder/sao.cpp, source/encoder/sao.h: + sao: convertLevelRowCol2Idx did not need to be a member function + [577e65afdb8d] + + * source/encoder/framefilter.cpp, source/encoder/sao.cpp, + source/encoder/sao.h: + sao: merge create/destroy methods + [ed9d78aa312a] + + * source/encoder/framefilter.cpp, source/encoder/framefilter.h, + source/encoder/sao.cpp, source/encoder/sao.h: + sao: use m_param directly for user-configurables + [11d427f9ef26] + + * 
source/encoder/frameencoder.cpp, source/encoder/framefilter.cpp, + source/encoder/sao.cpp, source/encoder/sao.h: + sao: cleanup member prefixes, remove createPicSaoInfo and endSaoEnc + [a2e74591f674] + + * source/Lib/TLibCommon/TComPicSym.cpp, + source/Lib/TLibCommon/TComPicSym.h, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.h, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.h, + source/common/CMakeLists.txt, source/common/common.h, + source/encoder/CMakeLists.txt, source/encoder/dpb.cpp, + source/encoder/entropy.cpp, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h, source/encoder/framefilter.cpp, + source/encoder/framefilter.h, source/encoder/sao.cpp, + source/encoder/sao.h: + pull SAO code into encoder/ and begin cleanup + [6a8df5dcb412] + + * .hgtags: + Added tag 1.3 for changeset c1e4fc0162c1 + [2e7d1569845e] + + * source/encoder/encoder.cpp: + encoder: disable --cu-lossless until coding errors can be fixed + (refs #73) + [c1e4fc0162c1] [1.3] + +2014-08-21 Satoshi Nakagawa + + * source/Lib/TLibCommon/CommonDef.h, + source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComMotionInfo.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/encoder/bitcost.h: + simplify AMVPInfo to MV[2] + [97ea21754381] + +2014-08-20 Steve Borho + + * source/encoder/encoder.cpp: + encoder: nit, remove obsolete comment + [9461fc801cd2] + + * source/encoder/encoder.cpp: + Merge with stable + [2585ce839546] + + * source/encoder/encoder.cpp: + encoder: do not allow scaling lists to be used with 4:4:4 color + space (refs #72) + + It causes hash mistakes + [3badda1ceda6] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + search: nit + [ed7509b29be2] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + Merge with stable + [b4f833d41667] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + 
search: do not skip intra tu 32x32 unless TU intra depth is deep + (refs #71) + + This may not be the best workaround for the bug, and needs a proper + long term fix. + [203c87c55bb3] + +2014-08-19 David T Yuen + + * source/Lib/TLibEncoder/TEncSearch.cpp: + Removed code that will never be executed. + [4f2556521155] + +2014-08-19 Min Chen + + * source/encoder/frameencoder.cpp: + fix single thread encode issue + [80129b1b6260] + + * source/encoder/entropy.cpp: + cleanup: remove reduce condition check + [5a174ca360f3] + +2014-08-19 Steve Borho + + * source/encoder/frameencoder.cpp: + Merge with stable + [63cd8539e7d4] + + * source/encoder/frameencoder.cpp: + frameencoder: do not allocate noise reduction struct unless NR is + enabled + + This fixes output mismatch between asm / no-asm for 4:4:4 encodes. + Why? I am not entirely sure yet, but this is an obvious bug fix. + [c97a9a62bce8] + +2014-08-19 Deepthi Nandakumar + + * source/Lib/TLibEncoder/TEncSearch.cpp: + search: nit + [b137f5c97c46] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + search: cleanup variable names and lossless comments + [c2fc2a087d4a] + + * source/encoder/encoder.cpp: + Merge with stable + [75d01da6df07] + + * source/encoder/encoder.cpp: + psy-rdoq: eliminate dependency between psy-rd, psy-rdoq and rdoq. + + RDOQ is turned on by default in rdLevels 4 and above. Psy-rdoq can + be turned on if rdoq is enabled at that preset. Psy-rd can be turned + on with or without rdoq/ psy-rdoq. 
+ [549c61d0436f] + +2014-08-18 Steve Borho + + * source/encoder/encoder.cpp: + param: correctly detect adaptive quant usage when checking --psnr + [dca792ce9786] + + * source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/encoder/encoder.cpp, source/encoder/frameencoder.cpp: + Merge with stable + [d15367360097] + +2014-08-18 Min Chen + + * source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.h, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp: + sao: move m_upBuff* to stack, fixes (#69) + [c979b32c8b51] + + * source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.h: + cleanup: remove unused m_upBuff2 + [8320283a7fe0] + +2014-08-18 Steve Borho + + * source/encoder/encoder.cpp, source/encoder/frameencoder.cpp: + encoder: fix deadlocks/crashes when picture is a single CTU row + + 1. if only one row per frame, disable WPP it is pointless 2. if only + one row per frame, frame parallelism is (mostly) impossible 3. 
clamp + rateControlUpdateStats() to be within row count range + [eb2b3e34ea0d] + + * source/common/param.cpp: + Merge with stable + [529ccfbed281] + + * source/common/param.cpp: + param: typo + [b8356365135e] + +2014-08-17 Steve Borho + + * source/common/quant.cpp: + quant: combine psy-rdoq scale and lambda into one variable + + Reduces the number of 64bit multiplies required per psy-value + evaluation + [21506c40f704] + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/entropy.cpp, + source/encoder/entropy.h: + entropy: pass bIsLuma to estBit() instead of limited TextType range + + if the only options were TEXT_LUMA or TEXT_CHROMA, it is better to + use a bool argument and avoid ambiguity about TEXT_CHROMA_U or + TEXT_CHROMA_V + [92d07932c93c] + + * source/common/quant.cpp, source/encoder/entropy.cpp, + source/encoder/entropy.h: + entropy: simplify EstBitsSbac + + There was no reason for significantBits to be a different type than + all the other fix15 bit costs. And NUM_QT_ROOT_CBF_CTX is 1 + [b37e0a8471ca] + +2014-08-18 Steve Borho + + * source/common/param.cpp: + Merge with stable + [13651e65b820] + + * source/encoder/encoder.cpp, source/encoder/encoder.h: + encoder: remove m_maxCuDQPDepth, simplify dqp logic + [9f038f7a2eb6] + +2014-08-18 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComPicSym.cpp, + source/Lib/TLibCommon/TComPicSym.h, + source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuv.h, + source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/deblock.cpp, + source/common/deblock.h, source/common/frame.cpp, + source/common/frame.h, source/common/param.cpp, + source/common/slice.h, source/encoder/analysis.cpp, + source/encoder/analysis.h, source/encoder/cturow.cpp, + source/encoder/cturow.h, 
source/encoder/encoder.cpp, + source/encoder/entropy.cpp, source/encoder/frameencoder.cpp: + refine depth related. + + maxCUDepth: CU depth maxFullDepth: CU+TU or CU+PU depth unitSize: + always 4 + [0eab29da63d3] + +2014-08-18 Steve Borho + + * source/x265.cpp: + cli: fix help syntax + [6a1b41049f71] + + * source/common/param.cpp: + param: add psy-rd and psy-rdoq to param2string (for info SEI and + stats file) + [0e4558e42ccc] + + * source/x265.cpp: + cli: update psy-rdoq rangw in CLI help + [cda8e5c21a9b] + +2014-08-16 Steve Borho + + * source/encoder/cturow.h, source/encoder/entropy.h, + source/encoder/frameencoder.cpp, source/encoder/frameencoder.h: + ctu: optimize away per-row bitstream coders, no output changes + + They were really only being used to hold the proper initial state + for each row. This avoids an entropy load/save per CU in + encodeSlice() and while I was in the area I removed another pile of + resetEntropy() calls. The initial slice entropy state is calculated + once and then kept in the frame encoder for all future uses. + [dd594e7362e2] + + * source/encoder/analysis.cpp, source/encoder/analysis.h, + source/encoder/cturow.cpp, source/encoder/entropy.cpp, + source/encoder/entropy.h, source/encoder/frameencoder.cpp: + analysis: move encodeCU() methods and friends to Entropy + + This function only encodes a fully finished CTU; it is performing no + analysis and it needs none of the features or members of the + Analysis class. It is much more clear as a member of the Entropy + class. + + Note: the way bEncodeDQP is communicated is a huge mess and needs to + be cleaned up soon. 
And finishCU() can also likely be hugely + simplified + [bb6ea1337d82] + +2014-08-18 Deepthi Nandakumar + + * Merge with stable + [2c68dc7e3827] + + * source/encoder/analysis.cpp: + analysis: missing braces caused a bug in CABAC context save for rd=2 + [5d5a4c09182d] + +2014-08-18 Tom Vaughan + + * doc/reST/introduction.rst: + introduction.rst edited online with Bitbucket + [9a0d24274357] + + * doc/reST/introduction.rst: + introduction.rst edited online with Bitbucket - added link to MPEG- + LA + [bff42bfd0652] + +2014-08-16 Steve Borho + + * doc/reST/cli.rst: + rest: fix typo, document default psy-rdoq value + [87400d5c90aa] + + * build/regression/01build-and-smoke-test.bat, + build/regression/02perftest.bat, build/regression/commandlines- + example.txt, build/regression/config-example.txt, build/regression + /email-csv.py, build/regression/run.bat, + source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComLoopFilter.h, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibEncoder/SyntaxElementWriter.cpp, + source/Lib/TLibEncoder/SyntaxElementWriter.h, + source/Lib/TLibEncoder/TEncBinCoderCABAC.cpp, + source/Lib/TLibEncoder/TEncBinCoderCABAC.h, + source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncEntropy.cpp, + source/Lib/TLibEncoder/TEncEntropy.h, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSbac.h, source/encoder/compress.cpp: + merge with default, feature freeze for 1.3 + [24a79f4d6f72] + + * doc/reST/cli.rst, doc/reST/presets.rst, source/common/param.cpp, + source/x265.cpp: + param: enable --fast-intra for all fast modes and turbo first-pass, + document + [866f21378d94] + + * source/encoder/slicetype.cpp: + slicetype: use fast intra scan unconditionally in lookahead + + We 
don't want the lookahead output to be variable depending on + --fast-intra since this would make two-pass turbo mode more + difficult. + [c01a334a1043] + +2014-08-15 Steve Borho + + * doc/reST/cli.rst: + rest: improve documentation for psycho-visual options + [184748d40276] + +2014-08-16 Steve Borho + + * source/encoder/analysis.cpp: + analysis: nits + [6b14a488d586] + + * source/common/quant.cpp, source/common/quant.h, + source/encoder/cturow.cpp, source/encoder/frameencoder.cpp: + quant: set lambdas at the same time quant QP is configured + [eb23702aacd8] + +2014-08-15 Steve Borho + + * source/common/param.cpp: + param: re-enable psy-rdoq range check for [0..10], print with two + digits + [1710b46949f2] + + * source/encoder/rdcost.h: + rdcost: add arbitray scale to make --psy-rd 0..2 range give + acceptable quality + [8d48e5e726cd] + + * source/common/quant.cpp, source/common/quant.h, + source/encoder/frameencoder.cpp: + quant: scale psy-rdoq by lambda + + Using L1 lambda in RDO is unorthodox, but we already do the same + thing with psy-rd energy and it seems to work decently well here + weighting the reconstructed DCT coeff against the RD cost in psy- + rdoq. + + The goal is for psy-rdoq 1 to be a safe and sane default + [810151a75aed] + +2014-08-14 Steve Borho + + * source/encoder/analysis.cpp: + analysis: avoid MSVC uninitialized variable warning + [4c09d2ef5f1b] + + * source/common/quant.cpp: + quant: use optimized primitive for transquant bypass in + invTransformNxN + [ef25a0e7de3b] + + * doc/reST/cli.rst, source/common/param.cpp: + param: don't allow turbo mode to increase rd-level, improve docs + [c4f0bbad98cf] + + * source/encoder/analysis.cpp: + analysis: use macro and for-loop to simplify fast-intra + + this changes behavior a bit; it's trying both +/-1 offsets instead + of just one. 
and it has to do one extra check at the end since mode + 34 isn't reached by the other previous loops + [07138e6ac952] + +2014-08-14 David T Yuen + + * source/encoder/analysis.cpp: + Added fast intra search option to Analysis::checkIntraInInter_rd0_4 + [213f17c1492c] + +2014-08-14 Steve Borho + + * source/encoder/analysis.cpp: + nit + [08e9d25e2c72] + + * source/common/x86/asm-primitives.cpp: + asm: fix HIGH_BIT_DEPTH build again + [7da2b731de30] + + * source/encoder/analysis.cpp: + analysis: white-space nits, possibly exposing a logic bug + [e5083cf64df0] + + * source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/Lib/TLibCommon/TComYuv.cpp: + remove predict.h includes from TLibCommon/ + [e66dbb50403b] + + * source/common/quant.cpp: + quant: + has higher precedence than ?; use parens to fix + unquantShift logic + [e62f74712a16] + +2014-08-14 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/common/common.h, source/common/intrapred.cpp, + source/common/x86/asm-primitives.cpp, source/encoder/predict.cpp: + replace IntraFilterType[][] to IntraFilterFlags[] (aboid *35) + [5ccdcc853810] + +2014-08-14 Deepthi Nandakumar + + * source/encoder/slicetype.cpp: + slicetype: initialise lowmode to 4. + [6b741cce14ac] + +2014-08-13 Steve Borho + + * source/common/common.h, source/common/x86/asm-primitives.cpp, + source/common/x86/pixel-util.h, source/encoder/predict.h: + fix HIGH_BIT_DEPTH compile on Windows + + something included by predict.h breaks pixel-util.h for MSVC; so + move the filter table to common.h. It wasn't quite proper for asm- + primitives.cpp to include a header from encoder/ anyway. 
+ [3d3bc732b9e7] + + * source/common/quant.cpp: + quant: typo + [889f69d56b7e] + + * source/common/quant.cpp: + quant: update comment for UNQUANT macro + [7c7d35fe1a14] + + * source/common/quant.cpp: + quant: simplify UNQUANT() variables + [9ad7668756f7] + +2014-08-13 Deepthi Nandakumar + + * source/common/quant.cpp, source/common/scalinglist.cpp: + quant: remove extra upshift by 4 in s_invQuantScales + + Adding 4 to unQuantPer was a bug, since unQuantScale was already + upshifted by 4. + [1921583cf797] + +2014-08-13 Min Chen + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/common/quant.cpp, + source/common/quant.h, source/encoder/cturow.cpp, + source/encoder/entropy.h, source/encoder/frameencoder.cpp: + move m_estBitsSbac from Quant to Entropy + [d66e257ace32] + + * source/common/common.h, source/common/dct.cpp, + source/common/primitives.h, source/common/quant.cpp, + source/common/x86/asm-primitives.cpp, source/common/x86/dct8.asm, + source/common/x86/dct8.h, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h: + asm: integrate denoise_dct, malloc NoiseReduction for alignment + [da6077b713c9] + +2014-08-13 Steve Borho + + * source/common/scalinglist.cpp, source/common/scalinglist.h: + scaling: remove array of rarely used trivially calculated values + [ebf2d550938c] + + * source/common/scalinglist.cpp, source/common/scalinglist.h, + source/encoder/entropy.cpp: + scaling: remove s_numListsAtSize[], it is 6 at every size + [a2368a224ab3] + + * source/encoder/predict.cpp, source/encoder/predict.h: + predict: nits + [0a9d6594872d] + + * source/encoder/predict.cpp: + predict: prevent MSVC warning about int to bool conversion + [c87f14ecb186] + +2014-08-13 Aarthi Thirumalai + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, + source/encoder/frameencoder.cpp: + refactor: cleanup setting up of lambda + + move initialising qp and lambda manipulations in one place. remove + TEncSearch::setQP - misleading function name. 
+ [0f9e8b2c0ece] + + * doc/reST/cli.rst, source/x265.cpp: + cli: nits. fix typo + [740b794a19fe] + + * source/encoder/frameencoder.cpp: + rc: bug fix - m_avgQpAq can be from 0-69. + + It is already set within RateControl + [1dc741719013] + +2014-08-13 Steve Borho + + * source/x265.cpp: + cli: free param structures on early-outs + [03ea314a6e9d] + + * source/common/x86/asm-primitives.cpp: + asm: fix HIGH_BIT_DEPTH link problems + [4ac29f78756c] + + * source/encoder/slicetype.cpp: + slicetype: adjust variable scoping to avoid GCC warnings + [12c265f0e413] + + * source/encoder/slicetype.cpp: + slicetype: nit + [d43e9a6a7cce] + + * source/CMakeLists.txt: + cmake: bump build number + [44b784c61455] + +2014-08-12 David T Yuen + + * source/common/param.cpp, source/encoder/slicetype.cpp, + source/encoder/slicetype.h, source/x265.cpp, source/x265.h: + Added fast intra search option + + This version calls intra_pred_allangs to create the predictions then + the faster search with satd + [d22dfb9df30f] + + * source/Lib/TLibCommon/TComPattern.cpp, source/common/intrapred.cpp, + source/encoder/predict.cpp, source/encoder/predict.h: + Moved IntraFilterType from intrapred.cpp to predict.h to make it + available + + and replace Predict::filteringIntraReferenceSamples + [f47c3e4be76c] + +2014-08-12 Sumalatha Polureddy + + * source/encoder/analysis.cpp: + analysis: check for proper cost + + Throws error when psyrd is enabled since cost is stored in + m_totalPsyCost and m_totalRDCost has MAX_INT64 + [91ccff0ca74b] + +2014-08-12 Steve Borho + + * source/common/quant.h: + quant: initialize qp to impossible value so first set always works + + A bug spotted by valgrind + [8a7f4bb1d1be] + +2014-08-12 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/common/param.cpp, source/encoder/encoder.cpp: + replace g_convertToBit[] to g_log2Size[] const table + [945e071f491f] + +2014-08-11 Min Chen + + * 
source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm, source/common/x86/blockcopy8.h: + asm: AVX2 version cvt32to16_shl + [23d58a1819c7] + +2014-08-10 Satoshi Nakagawa + + * source/common/dct.cpp, source/common/primitives.h, + source/common/quant.cpp, source/common/x86/pixel-util.h, + source/common/x86/pixel-util8.asm, source/test/mbdstharness.cpp: + quant: remove scaledCoeff from nquant() + [2bdcfcc1bb33] + +2014-08-10 Aarthi Thirumalai + + * source/encoder/ratecontrol.cpp: + rc: nits; add LF in frame stats of 2 pass + [94d9a0d50837] + +2014-08-10 Steve Borho + + * source/encoder/level.cpp: + level: signal profile/level NONE if non-compliant configuration is + found + + Once you get above a certain resolution, the CTU size must be at + least 32. There is no level that supports a smaller CTU, so signal + level NONE. It's my understanding that you cannot signal a profile + if the level is NONE, so we reset profile to NONE as well. + + The same is true if NumPocTotalCurr is greater than 8; there are no + levels which support values that high. Instead of signaling the + closest level, we should signal profile/level NONE + [7965aacd35ae] + +2014-08-09 Steve Borho + + * source/common/quant.cpp: + quant: improve flow and comments for last non-zero refinement + [6e4eb8542203] + + * source/common/quant.cpp: + quant: comment improvements + [c9dd47a21b48] + + * source/common/quant.cpp, source/common/quant.h: + quant: remove floating point operations from RDOQ [CHANGES OUTPUTS] + + The output changes are minor. On modern CPUs the performance benefit + of this change is negligable since SSE double operations are similar + in performance to int64 operations. As a future optimization, we + need to figure out how to multiply lambda2 (FIX8 24bits) by signal + cost (FIX15 24bits) using 32-bit integers since 32bit multiply is + significantly cheaper than 64bit integer multiply. 
+ + Similarly, unquantAbsLevel can be larger than 16bits so multiplation + is done with int64. Note that we use signed int64 because with psy- + rdoq the costs could go negative. + [4f1ce079b4a4] + + * source/common/quant.cpp, source/common/quant.h: + quant: cleanup chroma QP function + + With a unique function name, protected access, and only called from + one location, the ttype check could be removed. + [5132c37cdb38] + + * source/common/quant.h: + quant: header cleanups, no functional change + [33c6c661905c] + + * source/common/quant.cpp: + quant: use standard rd cost formula for sign-hiding [CHANGES + OUTPUTS] + + The previous RD formula was simply inexplicable, though it did work + fairly well. + + Old approach: deltaU[blkPos] = (scaledCoeff[blkPos] - ((int)level << + qbits)) >> (qbits - 8); int64_t invQuant = + ScalingList::s_invQuantScales[rem] << per; int64_t rdFactor = + (int64_t)((invQuant * invQuant) / (lambda2 * 16) + 0.5); costUp = + rdFactor * (-deltaU[blkPos]) + rateIncUp[blkPos]; + - wat? 
- + + New approach: int d = abs(signCoef) - UNQUANT(absLevel + 1); costUp + = (((uint64_t)(d * d)) << scaleBits) + lambda2 * rateIncUp[blkPos]; + + Using this approach the results are nearly the same (they appear to + be slightly better) but now we can probably add psycho-visual + tunings to the sign hiding feature + [e18b85eeb6c5] + + * source/common/quant.cpp: + quant: do not check CG bitmap for implied-present coeff groups + [220e217152cf] + + * source/common/quant.cpp: + quant: reduce conditional expression depths (mostly for readability) + [95b1d7535af8] + +2014-08-08 Steve Borho + + * source/common/quant.cpp: + quant: avoid an extra shift by adjusting the unquant coeff shift + [4003cbf60782] + + * source/common/quant.cpp: + quant: improve variable names and comments (no behavior change) + [d6723db1e8ec] + + * source/encoder/entropy.cpp: + entropy: simplify sign hide flag + [84acc8eb8d9c] + +2014-08-07 Min Chen + + * source/common/pixel.cpp, source/common/primitives.h, + source/common/quant.cpp, source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm, source/common/x86/blockcopy8.h, + source/test/pixelharness.cpp, source/test/pixelharness.h: + asm: cvt32to16_shl[*] for TSkip + [091a63164c41] + + * source/common/pixel.cpp, source/common/primitives.h, + source/common/quant.cpp, source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm, source/common/x86/blockcopy8.h, + source/test/pixelharness.cpp, source/test/pixelharness.h: + asm: cvt16to32_shr[*] for TSkip + [8cd2e8c9a3ba] + +2014-08-08 Satoshi Nakagawa + + * source/common/quant.cpp: + denoise: fix numCoeff (bug from 42b1d7c17510) + [ef2602935c59] + +2014-08-07 Steve Borho + + * source/common/x86/asm-primitives.cpp, source/test/testbench.cpp: + main10: create a hybrid all-angs primitve for 16bpp compiles + + The all-angs primitive is highly optimized assembly code that avoids + a lot of redundant work. 
The all-angs C ref is horribly slow, doing + redundant work to mimic the output of the all-angs assembly code. + Since we have no high bit depth assembly for these functions, we'll + use a shim C function that works very similar to the C ref but it at + least uses optimized primitives. + + intra_allangs4x4 3.64x 6619.54 24097.30 intra_allangs8x8 5.66x + 13722.49 77694.97 intra_allangs32x32 4.57x 246943.81 1129159.50 + + before: encoded 1253 frames in 104.37s (12.01 fps), 366.08 kb/s, + SSIM Mean Y: 0.9889624 (19.571 dB) + + after: encoded 1253 frames in 95.62s (13.10 fps), 366.08 kb/s, SSIM + Mean Y: 0.9889624 (19.571 dB) + [33702c567e50] + + * source/encoder/entropy.cpp: + entropy: remove implicit memset from constructor + + Before we do further refactors, we want Entropy instances allocated + on the stack to not perform any needless initialization work + [49b593197330] + + * source/encoder/entropy.cpp, source/encoder/entropy.h: + entropy: pad size of context array to 32 * 5 bytes + [f6e38749049c] + + * source/Lib/TLibCommon/ContextTables.h, source/encoder/entropy.cpp, + source/encoder/entropy.h: + entropy: remove ContextModel structure, use uint8_t directly + [04567c40dae5] + + * source/Lib/TLibCommon/ContextTables.h, source/encoder/entropy.cpp: + entropy: remove bBinsCoded from ContextModel (no more users) + [4297617da24c] + + * source/common/slice.h, source/encoder/encoder.cpp, + source/encoder/entropy.cpp, source/encoder/entropy.h, + source/encoder/frameencoder.cpp: + entropy: disable signaling of CABAC init state + + This flag, which was already disabled when frame parallelism is in + use (which is nearly always) was of limited utility. It did not + improve compression efficiency by any measurable amount, and it was + expensive to compute. But the quality which made it expendable was + that it was the only user of the bBinsCoded flag in the + ContextModel; forcing us to copy twice as much data every time we + copy a context. 
+ + With this feature removed, the context model can be reduced to a + single uint8_t state variable. + [3fdb78507aea] + + * source/encoder/frameencoder.cpp: + frameencoder: nit + [b89417dfa782] + + * source/encoder/frameencoder.cpp: + frameencoder: avoid redundant calls to resetEntropy() + + All of the entropy coders need to be reset to the same state at the + start of the frame's analysis. There is no point in re-calculating + this initial state repeatedly for each row + [83880abea807] + + * source/common/quant.cpp: + quant: avoid runtime check of transform shift size + [8e68a1db7c04] + +2014-08-07 Ashok Kumar Mishra + + * source/encoder/analysis.cpp: + analysis: cleanup + [8e45fc7c5521] + +2014-08-07 Aarthi Thirumalai + + * source/common/param.cpp: + rc: set rdlevel to 2 in fast first pass for multipass encode. + + increases speed of the first pass by over 70% in the slower presets + with almost the same quality in the final pass. + [5a0e2a7f9ad3] + +2014-08-07 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TComYuv.h, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/pixel.cpp, + source/common/primitives.cpp, source/common/primitives.h, + source/common/shortyuv.cpp, source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter16.asm, source/common/x86/pixel-util.h, + source/common/x86/pixel-util8.asm, source/common/x86/pixeladd8.asm, + source/common/x86/x86util.asm, source/encoder/analysis.cpp, + source/test/pixelharness.cpp: + asm: cleanup unused sub_ps and add_ps + + sub_ps and add_ps are used by CU or TU level, not PU level. 
+ [c29e37317d46] + +2014-08-07 Steve Borho + + * source/Lib/TLibCommon/ContextTables.h, source/encoder/entropy.cpp: + entropy: remove m_ prefix from ContextModel.state, other nits + [029563495b6e] + +2014-08-06 Steve Borho + + * source/common/param.cpp: + param: log noise reduction use/strength + [619633a933f6] + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/common/slice.h, + source/encoder/frameencoder.cpp: + refactor: move motion reference pointers from TEncSearch to Slice + + The actual storage for the MotionReference instances lives in the + FrameEncoder, to save memory, but we don't want the analysis code to + be aware of the FE so we give it a pointer to this array. Having + that pointer live in TEncSearch forces us to update this pointer + each time a worker thread starts a new job. + + Having the pointer in Slice means we only have to set it once, when + the slice first starts to be encoded. And having this pointer next + to the RPS data in the slice header makes this more coherent. + [03de9eef4244] + + * source/common/common.h, source/common/quant.cpp, + source/encoder/frameencoder.cpp: + denoise: further cleanups + + x264 used buffer pointers so that they could swap between normal + denoise and "emergency denoise" when the QP became very high. We do + not have an emergency denoise and thus we don't need these pointers + at this time. This simplifies initialization and update logic. 
+ [94cefc095d2f] + +2014-08-06 Praveen Tiwari + + * source/common/common.h, source/encoder/frameencoder.cpp: + noise reduction, cleaned unused buffer space + [ba6729e34f77] + + * source/common/quant.cpp, source/encoder/frameencoder.cpp: + noise reduction, use all eight categories of transform for inter + blocks + [e0d08da20e7f] + +2014-08-06 Aarthi Thirumalai + + * source/encoder/ratecontrol.cpp: + rc : nit + [02d805ee3d38] + +2014-08-05 Min Chen + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm, source/common/x86/const-a.asm: + asm: cvt16to32_cnt[32x32] for TSkip + [ca70276334d2] + +2014-08-05 Satoshi Nakagawa + + * source/common/pixel.cpp, source/common/primitives.h, + source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm, source/common/x86/blockcopy8.h, + source/common/x86/ipfilter16.asm, source/common/x86/ipfilter8.asm, + source/common/x86/ipfilter8.h, source/common/x86/mc-a.asm, + source/common/x86/pixel-util.h, source/common/x86/pixel-util8.asm, + source/common/x86/pixel.h, source/common/x86/pixeladd8.asm, + source/test/ipfilterharness.cpp, source/test/pixelharness.cpp, + source/test/testbench.cpp, source/test/testharness.h: + primitives for RExt + [c473f49e2818] + +2014-08-05 Santhoshini Sekar + + * source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComPattern.h: + TComPattern: remove redundant functions used for CIP + [56eaac5d6410] + +2014-08-05 Steve Borho + + * source/Lib/TLibEncoder/TEncSearch.cpp: + me: clip motion search area to signaled motion vector length limits + [0d4723a0080c] + +2014-08-04 Steve Borho + + * source/common/cpu.cpp, source/common/x86/const-a.asm, + source/common/x86/cpu-a.asm, source/common/x86/mc-a.asm, + source/common/x86/mc-a2.asm, source/common/x86/pixel-a.asm, + source/common/x86/pixel.h, source/common/x86/sad-a.asm, + source/common/x86/ssd-a.asm, source/common/x86/x86inc.asm: + asm: asm header updates + [22b1b01b95aa] + +2014-08-04 Min Chen + + * 
source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm, source/common/x86/const-a.asm: + asm: cvt16to32_cnt[16x16] for TSkip + [1760c267c1e9] + +2014-08-04 Steve Borho + + * source/common/quant.cpp, source/common/scalinglist.cpp, + source/common/scalinglist.h: + quant: change how RDOQ measures distortion [CHANGES OUTPUTS] + + RDOQ, as it was written in the HM, expects scaled level values to be + output by quant; these are the output levels multiplied by the + quantizing coeffificient but without the rounding factor and without + the downshift. It would then measure distortion as the difference + between this scaled level and level << qbits (a rough unquant). To + make this math work, it was pre-calculating an error scale factor + (per block position, since the quantization coefficients can vary) + which divided the result by the squared scale factor and upshifting + to simultaneously account for the FIX15 nature of the signaling + costs and the uniform scaling of the forward transform. To roughly + summarize: + + errScale = (1 << (15 - 2 * transformShift)) / (quantCoeff[i] * + quantCoeff[i]) levelScaled = level * quantCoeff[i] distortion = + levelScaled - (level << qbits); cost = distortion * distortion * + errScale + lambda2 * bitsFix15 + + It was forced to use floating point math for the errScale and + distortion calculations, and thus did not bother with fixed point + math for lambda2. + + This commit changes the distortion measurement to be the difference + between the original (pre-quantization) DCT coefficient and the + unquantized level. + + unquantAbsLevel = (level * quantCoeff[i] + pad) >> shift; + distortion = unquantAbsLevel - abs(signCoef); distScale = 1 << (15 - + 2 * transformShift); cost = distortion * distortion << distScale + + lambda2 * bitsFix15 + + Note that the same scale factor is still required to account for the + FIX15 bit cost and the forward DCT scale but now it is a simple + shift operation. 
+ + This commit does not change the data types; that will be a later + commit once the dynamic ranges have been properly evaluated. And + deltaU[], used by sign hiding, is left using the scaled level cost + basis for now. + [da57b1e8ac58] + + * source/common/quant.cpp: + psy-rdoq: fix unquant shift factors + + dequant coefficients are made with s_invQuantScales[rem] << 4, so to + perform an unquant we must remove those four bits from dequantCoeff + [08304a298065] + + * source/common/quant.cpp: + quant: remove TODO comment + + Yes, there is a reason to check maxAbsLevel < 3 here, diffLevel + below can only be 0, 1, or 2. + [ae7c5f4a842d] + +2014-08-01 Aarthi Thirumalai + + * doc/reST/cli.rst, source/CMakeLists.txt, source/common/param.cpp, + source/common/param.h, source/encoder/api.cpp, source/x265.cpp, + source/x265.h: + rc: add slow first pass as option. set turbo first pass as default + in 2 pass + [c5f2a20e6f4c] + +2014-08-04 Santhoshini Sekar + + * source/encoder/encoder.cpp: + bugfix: disable aq only when both aq-strength and cu-tree is off + + AQ can be on when cutree is on even if aq-strength is 0.Do not force + aq to be off whenever aq strength is 0. + [44eb5e05423a] + +2014-08-04 Steve Borho + + * source/common/quant.cpp: + psy-rdoq: include psy-cost in uncoded coefficient distortion + + Without this adjustment, uncoded coefficients were possibly being + penalized over coded ones with less psy value. 
+ [8edb2a5f3379] + +2014-08-03 Steve Borho + + * source/test/pixelharness.cpp: + pixelharness: fix MSVC warning about intptr_t conversion to int + [a2095baa3cbd] + + * source/common/quant.cpp: + quant: nit + [774dc8b6a535] + + * source/encoder/entropy.cpp: + entropy: cleanup calcCost, add EMMS for float safety + [b2ee9de29995] + + * source/common/quant.cpp: + quant: assign a signal cost of zero to zero coeff after lastNZ + + This stack variable was uninitialized otherwised, and could have + been used during the optimization of lastNZ + [7f550011c40f] + + * source/common/quant.cpp: + quant: rdStats -> cgRdStats + [4addf282bf76] + + * source/common/quant.cpp: + quant: cleanup coeff group zero-check code + [545d1ae4375b] + + * source/common/quant.cpp: + quant: sign hide nits + [59398cdff016] + + * source/common/quant.cpp: + quant: denoise nits + [42b1d7c17510] + + * source/common/quant.cpp, source/common/quant.h: + quant: don't pass member variable to class method + [7d727cd9bc63] + + * source/common/quant.cpp, source/common/quant.h: + quant: inline the quant method, only called from transformNxN + [f55d750331aa] + + * source/common/quant.cpp, source/common/quant.h: + quant: disallow psy-rdoq if transformSkip was used + + psy-rdoq is not possible without a transform. 
Or said another way, + we would need a different psy-rdoq approach for transform skip + [ede3a4679d98] + + * source/common/quant.cpp: + quant: comment nits + [c97672cee4c3] + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/common/quant.cpp, + source/common/quant.h: + quant: remove curUseRDOQ argument from transformNxN, always true + [c01f8fd0985f] + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/encoder/analysis.cpp: + search: remove curUseRDOQ arguments from two more methods + + residualTransformQuantInter and xEstimateResidualQT were always + called with curUseRDOQ = true, and the flag was never changed + [02ac2e9324b5] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + search: hard-code curUseRDOQ + [2ea423991b4a] + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/encoder/analysis.cpp: + search: remove curUseRDOQ argument from encodeResAndCalcRdInterCU, + always true + [dcf8948beac2] + + * source/common/quant.cpp, source/common/quant.h: + quant: hoist IEP_RATE out of getICRateCost + [01116c890510] + +2014-08-01 Min Chen + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm, source/common/x86/blockcopy8.h, + source/common/x86/const-a.asm: + asm: cvt16to32_cnt[8x8] for TSkip + [49bab9bdf2a3] + + * source/common/dct.cpp, source/common/primitives.h, + source/common/quant.cpp, source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm, source/common/x86/blockcopy8.h, + source/test/pixelharness.cpp, source/test/pixelharness.h: + asm: cvt16to32_cnt[4x4] for TSkip + [6f502ab94357] + + * source/common/x86/x86inc.asm: + update header and support Intel IACA marker + [e6184896aa7b] + +2014-08-02 Steve Borho + + * source/common/quant.cpp: + quant: change lastCG into a bool, use isOne flag to avoid abs() + calls + [69beab744475] + + * source/common/quant.cpp: + quant: consistent comment style, improve comments + [b12ac8919761] + + * source/common/quant.cpp: + 
quant: levelDouble -> levelScaled + + This always confused the heck out of me. The level was not doubled, + it was not a double, and it wasn't squared. It was just the level + scaled by the quant scale factor + [28c35f8e4f43] + + * source/common/quant.cpp: + quant: rename sigCost to codedSigBits, comment nit + [a28d5ae1b52a] + + * source/common/quant.cpp: + quant: RDO_CODED_LEVEL macro can now be inlined for easier debugging + [9bb93a267300] + + * source/common/quant.cpp: + quant: re-order rdoq logic so only one RDO_CODED_LEVEL() call is + required + [30f1f1d739db] + +2014-08-01 Steve Borho + + * source/common/quant.cpp: + quant: more readability nits - no output changes + [ed49f875ab20] + + * source/common/quant.cpp: + quant: improve comments for trailing zero coeff + [f14d233107d4] + + * source/common/quant.cpp: + quant: remove redundant level intialization + [d341acd13af2] + + * source/common/quant.cpp: + quant: blockUncodedCost -> totalUncodedCost, improve comments + [3b8853b12d9c] + + * source/common/quant.cpp: + quant: correct rounding factor for unquant + [253ad3eafaa2] + + * source/common/quant.cpp: + quant: clarify last-nz optimization loop + [11a3a69d3e29] + + * source/common/quant.cpp: + quant: rename costCoeff0 to costUncoded, add docs + [1c9a6a976e5d] + + * source/common/quant.cpp: + quant: support scaling lists in psy-rdoq + [8767ddb686af] + + * source/common/quant.h: + quant: make IEP_RATE an anonymous enum, it doesn't need storage + [be69e059808a] + + * source/common/quant.cpp, source/common/quant.h: + quant: m_lambda2 no longer needs to be a member variable + + it is only used in rdoQuant() and can be declared on the stack + [287d37822825] + + * source/common/quant.cpp, source/common/quant.h: + quant: convert getCodedLevel() into a macro, remove m_transformShift + hack + [ae8c153ee91d] + + * source/common/quant.cpp: + quant: simplify minAbsLevel + [db62272d284c] + + * source/common/quant.cpp: + quant: hoist some calculations out of the loop + 
[32b4aa0eb4fb] + + * source/common/quant.cpp: + quant: delay err3, err4 calculation until/if necessary + [244ba5fa80d4] + + * source/common/quant.cpp: + quant: apply scale factor in just one place + [2a7315a37d67] + + * source/encoder/dpb.cpp: + dpb: cleanup decodingRefreshMarking() + [6b1753638790] + + * source/common/slice.h, source/encoder/dpb.cpp: + dpb: remove checks for slice types we do not emit + [963b8e7b1dff] + + * source/encoder/dpb.cpp: + dpb: style nits + [5d1bd6097113] + + * source/encoder/dpb.cpp: + dpb: getNalUnitType() cannot return NAL_UNIT_CODED_SLICE_IDR_N_LP + [b911b02737c8] + + * source/encoder/dpb.cpp: + dpb: remove redundant call to getNalUnitType(), output will not + change + [fb24f965eade] + +2014-08-01 Aarthi Thirumalai + + * source/encoder/ratecontrol.cpp: + rc: enable abr reset in the first pass of two pass encode. + + observe this improves second pass results in ultrafast presets for + some videos. + [a9a7f0933ecc] + +2014-08-01 Santhoshini Sekar + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/encoder/predict.cpp, + source/encoder/predict.h: + cleanup: move m_predYuv and m_predTempYuv from predict to TEncSearch + [a74b24444ae8] + +2014-08-01 Deepthi Nandakumar + + * source/encoder/predict.h: + predict: whitespace nits + [3db5fda6abf0] + +2014-07-31 Santhoshini Sekar + + * source/common/lowres.cpp, source/common/lowres.h, + source/encoder/ratecontrol.cpp, source/encoder/slicetype.cpp: + rc: update vbv for all b frames + + HEVC is complex (and slow) enough that we can afford to update/plan + the VBV buffer states for all frames, not just I and P. The leads to + smoother rate control, particularly when there are large stretches + of B frames, and less (over) compensation is necessary for P frames. 
+ [e85b0aaa64e4] + +2014-07-31 Deepthi Nandakumar + + * source/encoder/predict.h: + predict: nits, cleanup, add TODO comments + [88a18a365d56] + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/predict.cpp, + source/encoder/predict.h: + predict: save clipped MVs + [323e8e87f903] + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/analysis.cpp, + source/encoder/predict.cpp, source/encoder/predict.h: + predict: prepare motionCompensation sets base class fields + [8f30d3659f82] + +2014-07-31 Steve Borho + + * source/encoder/dpb.cpp: + dpb: fix picture re-use bug introduced in bc53f6dcda7f + [29ca05751777] + +2014-07-30 Steve Borho + + * source/encoder/entropy.cpp, source/encoder/entropy.h: + entropy: remove unused flush function + [674fb6e4d24c] + + * source/encoder/entropy.cpp: + entropy: remove redundant !isIntra() check + [b21a9eeec1f8] + + * source/common/quant.cpp: + quant: use bisLuma + [8587668cf279] + + * source/common/quant.cpp, source/common/quant.h: + quant: use x264 style lambda naming scheme + [77655ea856fe] + +2014-07-31 Steve Borho + + * source/encoder/dpb.cpp, source/encoder/dpb.h: + dpb: cleanup prepareEncode() + [565603cfe031] + +2014-07-30 Steve Borho + + * source/encoder/rdcost.h: + rdcost: settle on x264 style lambda naming scheme + [639bef366e3e] + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/rdcost.h: + rdcost: change input type of setC*DistortionWeight to uint16_t + + the inputs were shorts, converted to doubles, passed to the + function, then floor() and casted to uint64_t + [5e7a5c1c3446] + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/encoder/rdcost.h: + rdcost: move zeroPel buffer to TEncSearch, its only user + [79d976fdc263] + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/analysis.cpp, + source/encoder/rdcost.h: + rdcost: keep psy-rd scale factor as an int (0..512), remove + psyRdEnabled() + [fdb6bb3b2763] + +2014-07-31 Steve Borho + + * 
source/common/common.h: + common: remove unused SCALING_LIST_PARAMETER enum + [5d6c2ddd5620] + + * source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h: + TComRom: nits + [cbf7a9c4be88] + +2014-07-30 Steve Borho + + * source/encoder/entropy.cpp: + entropy: nits + [667a38aaccff] + + * source/encoder/entropy.cpp: + entropy: use bIsLuma in codeCoeffNxN() + [a800a5ca5db7] + + * source/common/slice.h, source/encoder/entropy.cpp: + slice: allow loop filter boundary flag to be analyzed + [89316a3d04b6] + + * source/common/quant.cpp, source/common/quant.h, + source/encoder/entropy.cpp, source/encoder/entropy.h: + quant: use bIsLuma args instead of using a subset of TextType + [32bc57acf918] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/common/CMakeLists.txt, + source/common/quant.cpp, source/common/quant.h, + source/encoder/analysis.cpp, source/encoder/analysis.h, + source/encoder/entropy.cpp, source/encoder/frameencoder.cpp, + source/encoder/predict.h: + TComTrQuant to Quant + [3c659845e4dc] + + * source/common/scalinglist.h, source/encoder/entropy.cpp: + scaling: use anonymous enums for class constants + + these didn't need to be integers, they didn't need storage. They're + only used to declare arrays and enforce loop bounds. This fixes a + link error in debug compiles on Linux. While I was at it I improved + the comments and moved one definition to entropy.cpp since that was + the only place it was used. 
+ [0b0d398f35ca] + +2014-07-30 Deepthi Nandakumar + + * source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/Lib/TLibCommon/TComYuv.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/common/CMakeLists.txt, + source/common/intrapred.cpp, source/encoder/CMakeLists.txt, + source/encoder/analysis.h, source/encoder/predict.cpp, + source/encoder/predict.h: + predict: predict.cpp/.h defined, hungarian function names cleaned + up, warnings removed + [c32a86702b68] + +2014-07-30 Steve Borho + + * source/common/param.cpp: + param: disable range checks for psy-rdoq while we tune it + [3d814fd1268b] + + * source/Lib/TLibCommon/CommonDef.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/common/scalinglist.cpp: + defs: remove DISTORTION_PRECISION_ADJUSTMENT, fix bug in + 2e22ea6ec4bc + + FULL_NBIT was disabling this macro for even 16bpp builds, but I + accidentally enabled it. Since it was previously disabled for every + build, and is only even present in SAO and quant, it is best to just + remove it completely. + [38349967645f] + +2014-07-25 Sumalatha Polureddy + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/encoder/analysis.cpp, + source/encoder/encoder.cpp: + psy-rdoq: implementation of psy-rdoq (highly experimental) + + This initial version is storing a temp variable in TComTrQuant to + avoid adding even more parameters to getCodedLevel() and it is + ignoring scaling lists in the unquant operation. Currently, you may + need large psy-rdoq scale values to have any real effect. It needs + lots of testing. 
+ [06dcd7c5df6e] + +2014-07-30 Steve Borho + + * source/Lib/TLibCommon/TComTrQuant.cpp: + quant: minor cleanups + [df8314a1d3cb] + + * source/Lib/TLibCommon/TComTrQuant.cpp: + quant: rename absLevel to level, remove diffLevel + [09ae268bb0ce] + + * source/Lib/TLibCommon/TComTrQuant.cpp: + quant: readability nit + [ddef8e2d88fd] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h: + quant: pass curCostSig to getCodedLevel as an integer + [4cb71a283ae3] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h: + quant: return signal cost from getRateLast(), do not include lambda + [e0320502f9ea] + + * source/Lib/TLibCommon/TComTrQuant.cpp: + quant: don't bother with extra temp variables + [5210fca67553] + +2014-07-29 Min Chen + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: fix output non-determination bug + + When temporal candidates end up as MVP candidates, they can reach + beyond the available region of valid pixels in the reference + picture. In that case, we should not try to measure distortion + there, it only introduces non-determinism. Just take the other MVP + without contest. If both MVP were out of range, we default to mvpIdx + = 0. + + Note that there was never any risk of a decode error since when ME + runs it clamps all motion vectors to the range of available pixels. + The setting of this mvmin/mvmax range is assigned just after picking + the best of the two MVP candidates. 
+ [05132ebe8413] + +2014-07-29 Steve Borho + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h: + quant: getICRate() and getICRateCost() do not need to be class + methods + [bb9b2fcf75ef] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h: + quant: hoist lambda multiplication out of getICRateCost() + [e9e5f177a698] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h: + quant: inline getRateSigCoef(), getRateSigCoeffGroup() and + getICost() + [f174808a7df4] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h: + quant: improve documentation and code clarity, no logic change + [4012d5a6ff93] + + * source/common/scalinglist.cpp: + scaling: factor prec into scalingBits + + This makes errScale clearly function to remove quantCoeff^2, and add + this cost scale factor (which is just a bit shift). By factoring it + this way, we can probably remove the need for doubles in the near + future by not using the scaled coefficients in the stupid first + place. + [664eff34ef25] + + * source/common/scalinglist.cpp: + scaling: cleanup + [71bce9ae1072] + + * source/Lib/TLibCommon/CommonDef.h, source/Lib/TLibCommon/TComRom.h: + move quant defines from TComRom.h to CommonDef.h; they are not table + sizes + [eeeb96ecf0c7] + + * source/Lib/TLibCommon/CommonDef.h, + source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/deblock.cpp, + source/encoder/entropy.cpp: + typedef: remove redundant ChromaFormat enums + [57a8320024ed] + + * source/Lib/TLibCommon/CommonDef.h, source/Lib/TLibCommon/TypeDef.h, + source/encoder/entropy.cpp: + defs: cleanup CommonDef.h and TypeDef.h again + + TypeDef.h only defines enums, CommonDef.h only defines constants + (grouped together logically). 
Remove doxygenness + [b5339cbf1764] + + * source/common/scalinglist.cpp: + scaling: further simplify scaling list calculation + + It does more operations with ints, but mainly I just find this more + readable + [7c8fc08dfbd6] + + * source/Lib/TLibCommon/CommonDef.h: + defs: remove FULL_NBIT, no change to logic + [2e22ea6ec4bc] + + * source/common/scalinglist.cpp: + scaling: replace pow(2.0, -2.0 * transformShift) with shift + adjustment + + Result is the same + [e51017789090] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, source/encoder/entropy.cpp: + quant: use ScanType enum as type for scanType, other cleanups + [8d1b7dc5b832] + + * source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibCommon/TypeDef.h, source/encoder/entropy.cpp: + quant: move enums for scan order to TComRom together with the tables + [fecc01d4c27f] + +2014-07-29 Aarthi Thirumalai + + * source/encoder/encoder.cpp, source/encoder/ratecontrol.cpp: + rc: fix crash in 2 pass for faster preset (ultrafast - faster) + [a9678988dda2] + +2014-07-29 Steve Borho + + * source/encoder/encoder.cpp, source/encoder/sei.h: + hrd: add a comment about when an APS is required, update SEI + [2263fe1c05f1] + + * source/encoder/encoder.cpp, source/encoder/frameencoder.cpp: + sei: fix byte alignment following HRD SEI + [dbec825c30ad] + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/encoder/CMakeLists.txt, source/encoder/motion.cpp, + source/encoder/motion.h: + cmake: enable full warnings for remaining TLibEncoder files + [3a1245fb52a4] + +2014-07-28 Steve Borho + + * source/Lib/TLibCommon/TComTrQuant.cpp: + quant: delay getTUEntropyCodingParameters() until known necessary + [2834687bcb51] + +2014-07-29 Albert Wang + + * source/encoder/frameencoder.cpp: + frameencoder: fix for crash in frameencoder when AccessUnitDelimiter + is on + + The m_entropyCoder need to set 
the bitstream first before calling + codeAUD, otherwise a crash will happen. + [47e6a51b943f] + +2014-07-29 Steve Borho + + * source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/encoder/CMakeLists.txt: + remove sign compare warnings from TLibEncoder, cleanup SAO + [a021b541a2ef] + + * source/encoder/encoder.cpp, source/encoder/ratecontrol.cpp, + source/encoder/slicetype.cpp, source/encoder/weightPrediction.cpp: + rc,weight: nits + [0308c9e187be] + +2014-07-28 Steve Borho + + * source/encoder/encoder.cpp, source/encoder/frameencoder.cpp, + source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h, + source/encoder/slicetype.cpp, source/encoder/slicetype.h: + rc: call getEstimatedPictureCost() from API thread, prevent race + hazards + [2bc34d32e6cc] + + * source/encoder/dpb.cpp: + dpb: cleanup + [bc53f6dcda7f] + + * source/Lib/TLibCommon/TComPicSym.h, source/common/slice.h, + source/encoder/dpb.cpp, source/encoder/encoder.cpp, + source/encoder/frameencoder.cpp, source/encoder/frameencoder.h, + source/encoder/ratecontrol.cpp: + slice: move m_bReferenced to picsym and rename for clarity + + This flag was being used for two purposes and introducing race + hazards: + + 1. determining if a picture in the DPB must be signaled in an RPS 2. + determining if the slice type is referenced (non-b) + + The m_bHasReferences variable on the picsym is used for the first + purpose. The macro IS_REFERENCED(slice) is used for the second + purpose. + + The m_isReferenced member variable on the frame encoder was removed + and it just checks the slice type directly. + [3c50a7e516ed] + + * source/encoder/frameencoder.cpp: + frameencoder: nits + [e84b28132be1] + + * source/encoder/frameencoder.cpp, source/encoder/ratecontrol.cpp, + source/encoder/ratecontrol.h: + frameencoder: re-order compressFrame() to avoid dependency problem + with HRD + + 1. if --hrd is enabled, the buffer period SEI needs data set by + rateControlStart 2. 
rateControlStart may block, so get compute heavy + tasks done first, like weighted prediction analysis 3. hoist + rateControlStart and QP setup back out of compressCTURows() and move + other code into it that belonged there in the first place 4. move + framefilter setup logic together 5. don't measure rateControlEnd() + in frame wall time + [71c023799f3d] + +2014-07-28 Satoshi Nakagawa + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/encoder/analysis.cpp: + search: separate bSkipRes == true path + [7eab256b60f0] + +2014-07-28 Aarthi Thirumalai + + * source/encoder/slicetype.cpp: + slicetype: compute frame satd cost for zero latency + [eba786bbe4b5] + +2014-07-28 Steve Borho + + * source/common/scalinglist.cpp, source/common/scalinglist.h, + source/encoder/entropy.cpp: + scaling: remove m_refMatrixId; it was only used to return an int + + Only one value of the matrix was ever used at a time, so we can + return that index directly from checkPredMode + [236df356402d] + + * source/common/scalinglist.cpp, source/common/scalinglist.h, + source/encoder/entropy.cpp: + scaling: use simple ints for small counters and int32_t for scales + and coeffs + + This fixes a gcc warning and mostly makes the code more readable and + avoids some (int) casts + [d66a3bd865e3] + + * source/common/CMakeLists.txt: + cmake: ignore msvc warnings about forcing value to bool 'true' or + 'false' + [8bab5275baed] + +2014-07-27 Steve Borho + + * source/Lib/TLibCommon/TComTrQuant.cpp: + quant: fix for tskip + [6b286f66cfab] + + * source/Lib/TLibCommon/TComTrQuant.cpp: + quant: nits + [ea63a67164e5] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h: + quant: inline transformSkip + [67e62a4d520f] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h: + quant: inline invTransformSkip + [ec8f8d6e59d9] + + * source/Lib/TLibCommon/TComTrQuant.cpp: + quant: simplify dequant, remove redundant 
invQuantScales array + [28c7868814de] + + * source/encoder/level.cpp: + level: signal level None if lossless coding - we have no bitrate + control + [74a5481eceb5] + + * source/common/scalinglist.cpp, source/common/scalinglist.h, + source/encoder/entropy.cpp: + scaling: remove duplicate square-block size enums + [0d2eb9d6d7e0] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/common/scalinglist.cpp, source/common/scalinglist.h, + source/encoder/entropy.cpp: + scaling: move global arrays into class statics, rename + [9ad7abfab9d1] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/common/scalinglist.cpp, source/common/scalinglist.h, + source/encoder/encoder.cpp: + quant: move quant/dequant/errScale buffer maintenance to scaling + list class + + This was a wasteful duplication of compute and memory. It was always + strange that TComTrQuant had so many internal references to the + ScalingList class, just to build these arrays itself. + + As part of this refactor, ScalingList now has checked mallocs. 
+ [57b57ec43838] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h: + quant: remove x prefixes from remaining methods; no logic change + [82bd32283dd1] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h: + quant: replace xGetIEPRate() with a constant + [96925cfd342d] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h: + quant: remove x prefixes from scaling list methods + [991d931d9db4] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h: + quant: use rem consistently for variables holding qp % 6 + [1b39a252efac] + + * source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibEncoder/TEncSearch.cpp: + quant: remove default curUseRDOQ argument value + [250deb63f2ea] + + * source/Lib/TLibCommon/TComTrQuant.h: + quant: remove default useTransformSkip argument value + [2f52f5c4a913] + + * source/Lib/TLibCommon/TComTrQuant.cpp: + quant: more coding style cleanup, update argument list in + xGetCodedLevel comment + [262af2694656] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h: + quant: QpParam class to struct + [e4008f081c33] + + * source/Lib/TLibCommon/TComTrQuant.h: + quant: re-order header and improve comments + [5758998128e1] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibEncoder/TEncSearch.cpp: + quant: remove get/set methods for scaling list enable flag + [7435c0f166f6] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h: + quant: remove access methods for the class's own members + [0625f303cd49] + + * doc/reST/cli.rst, source/CMakeLists.txt, + source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/common/param.cpp, source/common/scalinglist.cpp, + source/common/scalinglist.h, source/encoder/encoder.cpp, + source/x265.cpp, source/x265.h: + api: expose scaling lists and psy-rdoq as a CLI/API options + + 
psy-rdoq is not yet functional - option is ignored move scaling list + related globals to scalinglist.cpp from TComRom + [93a434014f5a] + +2014-07-26 Steve Borho + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h: + quant: don't pass m_resiDctCoeff to member methods + [00c127bd42e7] + +2014-07-25 Steve Borho + + * source/common/param.cpp, source/encoder/encoder.cpp: + param: move all consistency param tweaks into Encoder::configure() + + x265_check_params() shouldn't be modifying the param structure; its + role is to spot invalid configurations and prevent crashes in + encoder creation. + [a86d9aaaa7d7] + +2014-07-26 Steve Borho + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibEncoder/TEncSearch.cpp: + quant: cleanup initialization; catch malloc failures. rename + m_tmpCoeff + [db5642f2e4b7] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h: + quant: nits + [a28a01820e30] + +2014-07-25 Steve Borho + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h: + quant: cleanups + [b60d35a06d3a] + + * source/Lib/TLibCommon/TComTrQuant.cpp: + quant: cleanup signBitHidingHDQ() + [50db97f20e61] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h: + quant: simplify setQPforQuant + [3f205a75877b] + + * source/Lib/TLibCommon/TComTrQuant.cpp: + quant: remove dead comment + [08de42d0b4a0] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, source/encoder/entropy.cpp, + source/encoder/entropy.h: + quant: do not malloc the EstBitsSbac structure, pass by reference + [82279c006a70] + + * source/Lib/TLibEncoder/TEncSearch.h, + source/encoder/frameencoder.cpp, source/encoder/framefilter.cpp: + white-space and brace nits + [fef8314f730b] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + search: simplify verbiage, fix white-space, no logic change + [7a8e7a87c9a2] + + * 
source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/encoder/analysis.cpp, + source/encoder/analysis.h, source/encoder/cturow.cpp: + search: move all init code into TEncSearch::initSearch() + + TEncSearch owns all the objects being initialized + [915397565c05] + + * source/encoder/framefilter.cpp: + framefilter: initialize saoRowDelay just once + [6ca9f09455ca] + + * source/Lib/TLibCommon/TComPicSym.cpp, source/encoder/encoder.cpp, + source/encoder/frameencoder.cpp, source/encoder/frameencoder.h, + source/encoder/framefilter.cpp: + frameencoder: remove getSAO() access method; delay allocation of + SAOParam + [606b6ebed3b8] + + * source/encoder/dpb.cpp, source/encoder/encoder.cpp, + source/encoder/frameencoder.cpp, source/encoder/frameencoder.h: + move slice setup together into one function, reorder functions for + clarity + [509fe705ed8d] + + * source/common/slice.h, source/encoder/frameencoder.cpp: + slice: remove redundant initSlice(), m_numRefIdx[] is set by DPB + unconditionally + [b573c299fb31] + + * source/common/slice.h, source/encoder/dpb.cpp: + slice: move coloc/tmvp flag setting all to one place + [d65c665f15f8] + + * source/x265.h: + api: improve documentation of x265_encoder_encode + [f73c0902079f] + +2014-07-26 Steve Borho + + * source/encoder/analysis.cpp: + analysis: cleanup early-skip + [dc595d2bfa36] + + * source/Lib/TLibCommon/TComDataCU.cpp: + cudata: nits + [50e015720392] + + * source/encoder/entropy.cpp: + entropy: nit + [a581b22e70e4] + + * source/Lib/TLibCommon/TComSampleAdaptiveOffset.h, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.h, + source/encoder/encoder.cpp, source/encoder/encoder.h, + source/encoder/framefilter.cpp: + encoder: remove unused m_maxRefPicNum and m_maxNumOffsetsPerPic + [e6ff719bd703] + +2014-07-25 Steve Borho + + * doc/reST/cli.rst: + rest: better document configuration options of --vbv-init + [66ed81577483] + +2014-07-25 Aarthi Thirumalai + + * source/encoder/ratecontrol.cpp: + rc: 
fix crashes in vbv with 2 pass + [d3e2e487a488] + +2014-07-25 Steve Borho + + * source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuv.h, source/encoder/encoder.cpp, + source/encoder/encoder.h, source/encoder/framefilter.cpp: + encoder: remove redundant m_pad array + [8aa4132d3097] + + * source/encoder/encoder.cpp, source/encoder/encoder.h: + encoder: fix uninitialized variable use reported by valgrind + + m_conformanceWindow was not initialized at all; and this was causing + uninitialized values to be written to the SPS if padding was + necessary. + + Remove unnecessary m_defaultDisplayWindow in the process + [c5aa92fff04a] + + * source/encoder/encoder.cpp, source/encoder/entropy.cpp: + nits + [d8a2892b8044] + + * source/common/param.cpp, source/common/slice.h, + source/encoder/encoder.cpp, source/encoder/level.cpp: + level: move all profile set/determine logic into one file, fixes for + MSP + [239cb2ba252e] + +2014-07-24 Steve Borho + + * source/x265.cpp: + cli: call x265_param_apply_profile() after applying file-reader + params + + the Y4M headers could change param->internalCsp, so applying the + profile before the reader makes changes allows bugs. The CLI was + allowing an encode of: + + x265 foo_422.y4m o.bin --profile main + + Even though --profile main was trying to enforce 4:2:0; the color + space change happened afterward the profile was enforced and thus + invalidated the specified profile. 
After this fix the encode + properly fails with: + + x265 [error]: Unsupported color space (2) on input aborted at input + frame 1, output frame 0 + [cb6f58ca056e] + + * source/x265.cpp: + cli: nit + [88afb4a4bcee] + +2014-07-24 Satoshi Nakagawa + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/encoder/analysis.cpp: + qtLayer in reverse order + [11f8f6e4be20] + +2014-07-22 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TComYuv.h, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/pixel.cpp, + source/common/primitives.cpp, source/common/primitives.h, + source/common/shortyuv.cpp, source/common/shortyuv.h, + source/encoder/analysis.cpp, source/test/testbench.cpp: + refine partition size related + + - reorder LumaPartitions to simplify partitionFromLog2Size() + - remove unused + [64a3b661b79a] + +2014-07-24 Steve Borho + + * source/common/param.cpp: + param: log multi-pass state + [670e330e4120] + + * source/encoder/ratecontrol.cpp: + rc: nits + [f54d5368590a] + + * source/encoder/ratecontrol.cpp: + rc: instrument all error conditions in initPass2() + + it's not helpful to abort the encode with no hint as to why + [650c2d3c5194] + + * source/encoder/bitcost.cpp: + bitcost: nits + [74630a9af6c9] + + * doc/reST/cli.rst: + rest: better document that -I N -i N does not disable scenecut + detection + [63a4b5c790ca] + + * Merge with stable + [8b59410a70be] + + * source/encoder/bitcost.cpp, source/encoder/bitcost.h: + bitcost: increase size of cost arrays, for pessimal situations + + MVD is MV minus MVP. MV could be BC_MAX_MV and MVP could be + -BC_MAX_MV + [a12920e08700] + + * source/encoder/bitcost.h: + bitcost: fix bitcost() function, broken 13 months ago by + 07015bbe306b + + when bitcost() was first introduced, we had two competing table + formats for the s_bitsizes[] array. Eventually the x264 style array + won out, but this function was not updated. 
+ + This bug was almost harmless, bitcost() could still evaluate + differences in bit costs correctly, but the upshift caused read + bound exceptions when the motion vectors became close to the HEVC + max of 4k. + [49ea5391828c] + + * source/x265.cpp: + cli: reorder argument help for clarity, add more sections + [b78d4dc01e2c] + +2014-07-24 Aarthi Thirumalai + + * source/common/param.cpp, source/x265.cpp: + rc: add cli options for multi-pass rate control + [5955c949ef8c] + +2014-07-24 Gopu Govindaswamy + + * source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComLoopFilter.h, + source/common/CMakeLists.txt, source/common/deblock.cpp, + source/common/deblock.h, source/common/loopfilter.cpp, + source/encoder/framefilter.cpp, source/encoder/framefilter.h: + port TComLoopFilter Functions into /common/deblock + + 1. Rename TComLoopFilter to Deblock 2. Remove hungarian notation + function names and variables 3. Remove unused or empty functions + [6c7a31f1b029] + +2014-07-23 Steve Borho + + * source/encoder/entropy.cpp: + entropy: reorder methods in general encode order for clarity, no + logic change + [47407360120a] + + * source/common/slice.h, source/encoder/encoder.cpp, + source/encoder/entropy.cpp: + entropy: remove ambiguity about timing info signaling + [5b22512f13d1] + + * source/encoder/entropy.cpp: + entropy: nits + [2debe3d86181] + + * source/encoder/encoder.cpp, source/encoder/entropy.cpp, + source/encoder/entropy.h: + entropy: do not code a scaling list in PPS + + The scaling list in PPS is only necessary to override the scaling + list in the SPS. 
We will only ever have at most one scaling list (at + this time) + [ab0c9b6cc30f] + +2014-07-24 Aarthi Thirumalai + + * source/Lib/TLibEncoder/TEncSearch.cpp: + rc: calculate mvBits, coefBits per Cu for 2 pass in skip mode as + well + [6c0929567805] + + * source/encoder/ratecontrol.cpp: + rc: compute average of qpaRc, qpAq per frame even in 2 pass + [7dd13d919265] + +2014-07-23 Steve Borho + + * source/Lib/TLibCommon/TComPrediction.h, source/encoder/analysis.cpp: + pred: remove default motionCompensation argument values + + bugs waiting to happen; they've bit us multiple times in the past + [342d72f0b613] + + * source/encoder/analysis.cpp: + analysis: eoln fixup + [4f9ff4eeb7e3] + +2014-07-23 Deepthi Nandakumar + + * source/encoder/analysis.cpp: + TComPrediction: fix MC error caused by previous patch + [db6b7046d4f0] + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/analysis.cpp: + TComPrediction: remove unused parameter warning + [6919c9f2432a] + + * source/encoder/analysis.cpp: + analysis: whitespace nit + [379c44100fff] + +2014-07-23 Steve Borho + + * doc/reST/cli.rst: + rest: tpyo + [9a1c64cc7f28] + + * doc/reST/cli.rst: + rest: move profile, level, and tier into their own section + + They really didn't belong combined with the slice-decision options + [8a8495c750ad] + +2014-07-23 Deepthi Nandakumar + + * source/encoder/analysis.cpp: + analysis: add in extra X265_CHECK + [2ee1444e8a00] + +2014-07-23 Gopu Govindaswamy + + * source/encoder/analysis.cpp: + analysis: setQPforQuant in checkIntraInter to fix the hash mismatch + at rd=5&6 + [c1823bb20eed] + +2014-07-23 Deepthi Nandakumar + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h: + TComPrediction: white space nits, remove isPSlice (next series) + [5fdc394bc5e0] + +2014-07-22 Deepthi Nandakumar + + * source/Lib/TLibCommon/TComPrediction.cpp, + 
source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibEncoder/TEncSearch.cpp: + TComPrediction: remove redundant colorspace information + + m_csp is sufficient inside the TEncSearch, TComPrediction structures + [30f41c3ef7d3] + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/analysis.cpp: + TComPrediction: save CU data (partAddr, width, height) as member + fields + + Before motion compensation, save CU related data inside the + TComPrediction structure + [a5422a41c85a] + +2014-07-21 Deepthi Nandakumar + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibEncoder/TEncSearch.cpp: + TComPrediction: remove TComDataCU as pointer to private functions + [d8d26a695cf6] + +2014-07-22 Steve Borho + + * source/common/param.cpp: + param: re-order code to avoid unreachable code warnings in 10bit + builds + [e3ad03b7c485] + + * build/regression/01build-and-smoke-test.bat, + build/regression/02perftest.bat, build/regression/commandlines- + example.txt, build/regression/config-example.txt, build/regression + /email-csv.py, build/regression/run.bat: + remove very obsolete regression batch files + [fe3f3ec46306] + + * source/encoder/level.cpp: + level: fix GCC signed/unsigned comparison + [345bc6231fa3] + +2014-07-21 Steve Borho + + * doc/reST/cli.rst: + rest: update --profile, --level, --high-tier docs + [08da00a7e719] + + * source/CMakeLists.txt, source/common/param.cpp, + source/encoder/api.cpp, source/encoder/level.cpp, + source/encoder/level.h, source/x265.cpp, source/x265.h: + level: add --high-tier and auto-configure VBV if --crf N --level M + (refs #61) + + This patch is a major overhaul of the level enforcement logic. The + first obvious difference is that the user may specify the tier. The + second difference is that x265 will no longer run any configuration + that might generate non-compliant bitstreams. 
+ + Any of these conditions will cause the encode to abort if a minimum + decoder level was specified: + + * picture size is too large + * frame rate is too high + * constant QP rate control is configured + * the specified level does not exist + + Further, if CRF was specified, we now configure VBV using the + maximum CPB size and bitrate for their level/tier (and issue a + warning that the output may now be non-deterministic). + + Note that the encoder will still encode the minimum decoder level + which covers the encode parameters. So even if you specify --level + 5.1, we may signal the stream as level 4.0-High if the stream should + be decodable at that level. + + This further fixes the CLI to allow --level-idc or the shortened + --level, just as it now also supports --high-tier or the shortened + --high. + [5510d559c2bd] + +2014-07-22 Aarthi Thirumalai + + * source/encoder/ratecontrol.cpp: + rc: bug fixes in 2 pass ABR + + ratecontrol - fix duplication of bits in calculating m_totalBits + frameencoder - don't amortize totalBits in 2 pass ABR + [b85dbec30cc5] + +2014-07-21 Steve Borho + + * source/encoder/ratecontrol.cpp: + rc: prevent gcc shadow warning about int terminate and terminate() + [d303b4d860e9] + + * source/encoder/analysis.cpp: + analysis: don't initialize members of base class in constructor + [eeea411e197a] + + * source/common/param.cpp: + param: fix profile warnings in x265_param_apply_profile() + + This function was likely never used since it was not supported by + the CLI till now. When the user specifies --profile Main10, they + intend that the output stream is decodable by a Main10 decoder. This + is true of any Main encode. + + The reverse is not true for --profile Main and a Main10 encode. We + must return an error and abort the encode if --profile Main is + specified and the encoder was compiled for Main10 (HIGH_BIT_DEPTH). 
+ [5835fa3a8281] + + * source/x265.cpp: + cli: expose x265_param_apply_profile() via --profile + + This is repairing an egregious oversight + [56e9d8a66527] + + * source/encoder/level.cpp: + level: nit + [a2506d62d4af] + + * source/encoder/api.cpp, source/encoder/encoder.cpp, + source/encoder/level.cpp, source/encoder/level.h: + level: move m_vps.maxDecPicBuffering logic into one function + + This prevents duplicate logic from existing in two separate files + [72641fecf86a] + + * source/Lib/TLibCommon/TypeDef.h, source/common/slice.h: + ptl: move enums from TypeDef.h to slice.h + [5f64abf3a20a] + + * source/common/slice.h, source/encoder/api.cpp, + source/encoder/encoder.cpp, source/encoder/encoder.h, + source/encoder/entropy.cpp, source/encoder/entropy.h, + source/encoder/level.cpp, source/encoder/level.h: + slice: add ProfileTierLevel instance to VPS + + The PTL data is coded in the VPS (first) and thus belongs with that + data structure, as it allows us to simplify arguments to the level + set and determination functions. 
+ [7621ec03e9c6] + + * source/encoder/level.cpp: + level: fix typo + [eed3099a90ea] + +2014-07-17 Steve Borho + + * source/encoder/level.cpp: + fix for GCC warning about loop bounds + [bf78ebd06ed0] + +2014-07-21 Steve Borho + + * source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/encoder/analysis.cpp, + source/encoder/analysis.h, source/encoder/api.cpp, + source/encoder/cturow.cpp, source/encoder/cturow.h, + source/encoder/encoder.cpp, source/encoder/encoder.h, + source/encoder/entropy.cpp, source/encoder/entropy.h, + source/encoder/frameencoder.cpp, source/encoder/frameencoder.h, + source/encoder/framefilter.cpp, source/encoder/framefilter.h: + entropy: settle on Entropy class name and member names + + Through all these various refactors the SBAC/CABAC/entropy classes + have all merged together and the member variables were a scattered + mix + [ca02fa285ac0] + + * source/CMakeLists.txt: + cmake: tabs to spaces + [26458b8dcba4] + + * source/CMakeLists.txt: + cmake: only create SONAME shared lib on POSIX systems (closes #62) + [80c1d35e5517] + + * source/encoder/encoder.cpp, source/encoder/ratecontrol.cpp, + source/encoder/ratecontrol.h: + rc: prevent deadlock on encoder flush or close + + 1. add a terminate() function to unblock any blocked rate control + methods 2. never block if the order ordinal is already above the + required ordinal 3. 
simulate start events when encoder is flushing + + Without these changes, closing the encoder without flushing (CTRL+C + in the CLI) or encoding fewer frames than frame encoders (-F8 -f4) + could cause deadlocks + [a2fd8a71de61] + +2014-07-21 Aarthi Thirumalai + + * source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h: + rc: fix warnings in ratecontrol + [3801142d080d] + +2014-07-18 Aarthi Thirumalai + + * source/encoder/encoder.cpp, source/encoder/ratecontrol.cpp, + source/encoder/ratecontrol.h: + rc: generate cutree offsets for 2nd pass from stats + [2f87f3c24b4a] + + * source/encoder/encoder.cpp, source/encoder/ratecontrol.cpp, + source/encoder/ratecontrol.h, source/encoder/slicetype.cpp, + source/encoder/slicetype.h: + rc: fix sliceType of 2nd pass from prev pass stats + + avoid doing sliceAnalyse in lookahead for 2nd pass + [1b2fbf3208ca] + + * source/encoder/encoder.cpp, source/encoder/encoder.h, + source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h: + rc: add 2 pass logic in rateEstimateQscale + [0461e091a7b5] + +2014-07-21 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/analysis.cpp: + trquant: store QpParam for each component + [4d2c3d09e836] + +2014-07-19 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibCommon/TypeDef.h, source/encoder/entropy.cpp: + scan order tables + [88310701f857] + +2014-07-21 Deepthi Nandakumar + + * source/Lib/TLibCommon/TComWeightPrediction.cpp: + TComWeightPrediction: initialising pointers to NULL + + getWPScaling will assign appropriate weight pred tables to these. 
+ [aa651ea5673c] + + * source/Lib/TLibCommon/TComWeightPrediction.cpp: + TComWeightPrediction: add braces to prevent errors for an optional + else statement + [a30e3bd53959] + +2014-07-17 Michel Zou + + * source/CMakeLists.txt: + cmake: fix dll installation directory on Windows + + On Windows the shared library (.dll) is considered RUNTIME and the + import library (.dll.a on MinGW or .lib on MSVC) as LIBRARY. + [eb983d29c11a] + +2014-07-17 Min Chen + + * source/common/x86/ssd-a.asm: + use macro HADDD to improve AMD performance + [7e9de0923541] + +2014-07-18 Ashok Kumar Mishra + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h: + TComDataCU: replaced functions with tables + [477a0a6bf89b] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h: + TComDataCU: remove redundant functions + [6df47cdd3f47] + +2014-07-18 Santhoshini Sekar + + * source/encoder/analysis.cpp, source/encoder/encoder.cpp, + source/encoder/encoder.h, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h, source/encoder/ratecontrol.cpp, + source/encoder/ratecontrol.h: + rc: update ratecontrol stats in every frame, avoid frame parallelism + lag in abr + + RateControl statistics are updated for every frame when refLagRows + number of rows are completed in processRowEncoder. With this updated + data rateControl predicts more accurate QP + + This removes the previous hack which disabled frame parallelism for + the first half-second of video and replaces it with a more fine- + grained rate control call timing to achieve a similar ABR + improvement with less effect to frame parallelism. 
+ [0a8ecd8a6cf9] + +2014-07-18 Steve Borho + + * source/Lib/TLibCommon/TComTrQuant.h, source/common/CMakeLists.txt, + source/common/scalinglist.cpp, source/common/scalinglist.h, + source/common/slice.cpp, source/common/slice.h, + source/encoder/encoder.h, source/encoder/entropy.cpp, + source/encoder/entropy.h: + split ScalingList class into its own header and source file + + The PPS and SPS headers no longer have scaling list pointers, so + there is no need for it to be declared in the same header + [eef6867b9c53] + + * source/encoder/dpb.cpp, source/encoder/ratecontrol.cpp: + a copy of POC is cached in the Frame struct + [e4199c04d78d] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/frame.h, + source/encoder/analysis.cpp, source/encoder/dpb.cpp, + source/encoder/encoder.cpp, source/encoder/entropy.cpp, + source/encoder/frameencoder.cpp, source/encoder/framefilter.cpp, + source/encoder/ratecontrol.cpp, source/encoder/slicetype.cpp: + frame,cu: remove getSlice() + [ea61fd838115] + + * source/Lib/TLibCommon/TComPicSym.h, source/common/frame.h, + source/encoder/dpb.cpp: + picsym: remove getSlice() + [821c875b2d39] + + * source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComPicSym.cpp, + source/Lib/TLibCommon/TComPicSym.h, + source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + 
source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/common/CMakeLists.txt, source/common/common.cpp, + source/common/frame.h, source/common/param.cpp, + source/common/slice.cpp, source/common/slice.h, + source/encoder/analysis.cpp, source/encoder/cturow.h, + source/encoder/dpb.cpp, source/encoder/dpb.h, + source/encoder/encoder.cpp, source/encoder/encoder.h, + source/encoder/entropy.cpp, source/encoder/entropy.h, + source/encoder/frameencoder.cpp, source/encoder/ratecontrol.cpp, + source/encoder/ratecontrol.h, source/encoder/reference.cpp, + source/encoder/sei.cpp, source/encoder/sei.h, + source/encoder/weightPrediction.cpp: + move PPS, SPS, Slice and ScalingList to common/ + [f04c98f33ca5] + +2014-07-17 Steve Borho + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h: + slice: remove unused initWpScaling() + [9cbd9fc1710a] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/encoder/entropy.cpp: + slice: remove getWpScaling() + [601e61fb185f] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, source/common/piclist.cpp, + source/common/piclist.h: + slice: make xGetRefPic a PicList method + [b4643bc7ac7d] + + * source/Lib/TLibCommon/TComSlice.h: + slice: do not initialize fields that are always written + unconditionally + [eb6178f8449e] + + * source/Lib/TLibCommon/TComSlice.h, source/encoder/encoder.cpp, + source/encoder/frameencoder.cpp: + encoder: do some slice initializations only once + [37ea4eb500cf] + + * source/Lib/TLibCommon/TComSlice.h, source/encoder/entropy.cpp: + entropy: slice->m_colFromL0Flag is a bool + [1bf5e0390f7d] + + * source/encoder/dpb.cpp: + dpb: simplify check for m_bCheckLDC; make note for future + optimization + [f7624977ec9a] + + * source/encoder/dpb.cpp: + dpb: remove check for B frame without L1 + + our lookahead won't emit B frames without a leading P frame + [fb9681ba75ac] + + * 
source/Lib/TLibCommon/TComSlice.h, source/encoder/dpb.cpp, + source/encoder/entropy.cpp: + slice: remove m_bLMvdL1Zero + + This feature, which signals L1 MVD is zero for an entire slice, was + never enabled because we never use the same frame list in L0 and L1. + [82c1fec5da66] + + * source/encoder/entropy.cpp: + entropy: fix warning reported by llvm + + /Users/steve/repos/x265/source/encoder/entropy.cpp:321:44: warning: + equality comparison with extraneous parentheses[-Wparentheses- + equality] if ((cu->getSlice()->m_numRefIdx[list] == 1)) + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~ + [8eed1fac9ccb] + + * source/encoder/entropy.cpp, source/encoder/entropy.h, + source/encoder/frameencoder.cpp, source/encoder/nal.cpp, + source/encoder/nal.h: + wpp: generate stream size array as coded (in bytes) and pre- + calculate max offset + + removes another per-frame alloc/free and redundant work + [93ab6ed75b01] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, source/encoder/entropy.cpp, + source/encoder/entropy.h, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h: + slice: remove substream array, move to frame encoder + + \0/ the slice structure no longer allocates memory + [5cbdfc4532d9] + + * source/Lib/TLibCommon/TComPrediction.cpp: + nit + [57f8e7264e9e] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComPrediction.cpp, source/encoder/dpb.cpp: + replace slice->m_refPicList[][]->getPOC() with + slice->m_refPOCList[][] + [4c14b90fad83] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, source/encoder/dpb.cpp: + slice: merge setRefPOCList() into setRefPicList() + + Now both lists are always aligned so it is always safe to use the + POC table + [51caccd3b359] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComSlice.h, source/encoder/dpb.cpp, + source/encoder/encoder.cpp, source/encoder/slicetype.cpp: + slice: remove getRefPOC + [eb2278d72afa] + + * 
source/encoder/dpb.cpp: + dpb: perform setRefPOCList() directly after setRefPicList() + + A precursor to combining the two functions + [c1acfb981082] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/analysis.cpp, + source/encoder/dpb.cpp, source/encoder/frameencoder.cpp, + source/encoder/ratecontrol.cpp, source/encoder/slicetype.cpp, + source/encoder/weightPrediction.cpp: + slice: remove getRefPic(l,r) + [e2f90ba1de4a] + + * source/Lib/TLibCommon/TComSlice.h, source/encoder/dpb.cpp, + source/encoder/entropy.cpp: + slice: remove setNumRefIdx + [ce65e2bd2f12] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComSlice.h, source/encoder/analysis.cpp, + source/encoder/dpb.cpp, source/encoder/entropy.cpp, + source/encoder/frameencoder.cpp: + slice: remove set/get for m_colRefIdx and m_sliceCurEndCUAddr + [4701e921b33d] + + * source/Lib/TLibCommon/TComSlice.h, source/encoder/dpb.cpp, + source/encoder/entropy.cpp: + slice: remove getMvdL1ZeroFlag/setMvdL1ZeroFlag + [b2ac589c522e] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComSlice.h, source/encoder/dpb.cpp, + source/encoder/entropy.cpp: + slice: remove getColFromL0Flag() + [1268011090f0] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/analysis.cpp, + source/encoder/dpb.cpp, source/encoder/encoder.cpp, + source/encoder/entropy.cpp, source/encoder/frameencoder.cpp, + source/encoder/weightPrediction.cpp: + slice: remove getNumRefIdx() access method + [785217b5ed5c] + + * source/Lib/TLibCommon/TComSlice.h: + slice: removed unused setRefPic() and setRefPOC() methods + [c7846351fc9f] + + * source/Lib/TLibCommon/TComSlice.cpp, + 
source/Lib/TLibCommon/TComSlice.h: + slice: remove unused getNumRpsCurrTempList() method + [0f5cfda6a764] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComSlice.h, source/encoder/dpb.cpp, + source/encoder/encoder.cpp, source/encoder/entropy.cpp, + source/encoder/frameencoder.cpp, source/encoder/ratecontrol.cpp: + slice: remove get/set methods for slice QP and lastIDR + [3f024a5a40f0] + + * source/Lib/TLibCommon/TComSlice.h, source/encoder/dpb.cpp, + source/encoder/encoder.cpp, source/encoder/frameencoder.cpp, + source/encoder/ratecontrol.cpp: + slice: remove get/set methods for m_bReferenced + [aa785acc0d5f] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/analysis.cpp, + source/encoder/dpb.cpp, source/encoder/encoder.cpp, + source/encoder/entropy.cpp, source/encoder/frameencoder.cpp, + source/encoder/ratecontrol.cpp, source/encoder/slicetype.cpp: + slice: remove get/set methods for m_sliceType + [880285f10294] + + * source/Lib/TLibCommon/TComSlice.h: + nit + [95a35fd1c0b7] + + * source/Lib/TLibCommon/TComSlice.h, source/encoder/entropy.cpp: + slice: remove m_cabacInitFlag, it was write-only + [39f9cb10090a] + + * source/Lib/TLibCommon/TComSlice.h, source/encoder/entropy.cpp: + slice: remove m_numEntryPointOffsets, it was write-only + [3a6a0307f6c6] + + * source/Lib/TLibCommon/TComSlice.h, source/encoder/analysis.cpp, + source/encoder/cturow.cpp, source/encoder/frameencoder.cpp: + slice: remove slice bit counter + [79c839303e2a] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, source/encoder/dpb.cpp, + source/encoder/encoder.cpp, source/encoder/entropy.cpp, + source/encoder/frameencoder.cpp, source/encoder/ratecontrol.cpp, + source/encoder/slicetype.cpp, source/encoder/weightPrediction.cpp: + slice: remove get/set methods for m_poc + 
[98b7f7a776b1] + + * source/Lib/TLibCommon/TComSlice.h: + nits + [67c9a0c5ec56] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComSlice.h, source/encoder/analysis.cpp, + source/encoder/entropy.cpp, source/encoder/frameencoder.cpp: + slice: remove m_sliceSegmentBits and get/set methods for max-merge + [bf060c902d24] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, source/encoder/entropy.cpp, + source/encoder/frameencoder.cpp: + slice: remove m_saoEnabledFlag, m_saoEnabledFlagChroma - use + saoParams + + SBac::encodeSliceHeader() was already using saoParams for chroma but + not for luma (inexplicably) + [fd4297676f76] + + * source/encoder/frameencoder.cpp: + sao: combine if(){} expressions + [a5d5d329daf1] + + * source/encoder/frameencoder.cpp: + sao: (bool == 1? ? true : false -> wat? + [0233b8965906] + + * source/encoder/ratecontrol.cpp: + rc: fix vbvBufferSize calculation + [930003de2009] + + * source/Lib/TLibCommon/TComSlice.h: + slice: nits + [901252fe8a16] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, source/encoder/dpb.cpp, + source/encoder/frameencoder.cpp: + slice: cleanup m_nalUnitType and helper methods + [4c49302d8b19] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComLoopFilter.h, + source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/analysis.cpp, + source/encoder/entropy.cpp, source/encoder/frameencoder.cpp, + source/encoder/weightPrediction.cpp: + slice: remove get/set methods for m_pic + + This cleaned up TComDataCU::getPic() as a side-effect. 
+ [72e3bd69cb70] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, source/encoder/dpb.cpp, + source/encoder/entropy.cpp: + slice: temporal MVP was never disabled + [6e45eff6d2b3] + + * source/encoder/encoder.cpp: + rc: HRD must be initialized prior to calling init() + [c06b0b9d3501] + + * source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComSlice.h, source/encoder/entropy.cpp: + slice: remove unused deblock data, directly use status/offsets from + PPS + + nits: reorder TComSlice for better clarity and remove initializer + list from constructor + [ba028e8d4115] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/encoder/frameencoder.cpp: + slice: remove unused slice QP deltas + [b5abd96ccc90] + + * source/encoder/dpb.cpp, source/encoder/entropy.cpp: + nits + [134406df4863] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, source/encoder/dpb.cpp, + source/encoder/entropy.cpp: + slice: we only use a slice-local RPS (none signaled in SPS) + [f5352780ad35] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, source/encoder/dpb.cpp, + source/encoder/dpb.h, source/encoder/entropy.cpp, + source/encoder/entropy.h: + slice: TComReferencePictureSet -> RPS + [e6a8280a3c1a] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h: + slice: hoist constructor to header (it will be removed in later + commits) + [08232a102124] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, 
source/encoder/analysis.cpp, + source/encoder/dpb.cpp, source/encoder/encoder.cpp, + source/encoder/entropy.cpp, source/encoder/entropy.h, + source/encoder/frameencoder.cpp, source/encoder/framefilter.cpp, + source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h, + source/encoder/sei.cpp, source/encoder/sei.h: + slice: remove get/set methods for SPS and PPS, make pointers const + + Nearly everywhere, SPS and PPS are used read-only, so making the + pointers const enforces this at compile time and allows the compiler + to make some optimizations. The few places that initialize or change + the headers are passed the original objects belonging to the top- + level encoder. + [419d1a1b0d39] + + * source/encoder/frameencoder.cpp: + nit + [675a22d901af] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/Lib/TLibCommon/TComWeightPrediction.h, + source/encoder/entropy.cpp, source/encoder/frameencoder.cpp, + source/encoder/reference.cpp, source/encoder/reference.h, + source/encoder/slicetype.cpp, source/encoder/slicetype.h, + source/encoder/weightPrediction.cpp: + slice: rename wpScalingParam -> WeightParam + [575d4a0d849b] + + * source/Lib/TLibCommon/TComSlice.h, source/encoder/encoder.cpp, + source/encoder/encoder.h, source/encoder/entropy.cpp, + source/encoder/frameencoder.cpp: + pps: simplify deblocking signaling, disable slice override + + Disable slice override of deblock-enable state and offsets - we do + no such thing and attempts at using these in the past have not been + effective. + [3e4616cffc90] + + * source/encoder/encoder.h: + nits + [6d0c8efbe3ed] + + * source/encoder/api.cpp, source/encoder/encoder.cpp, + source/encoder/encoder.h, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h: + encoder: make SPS and PPS top-level encoder singletons + + This fixes a subtle bug. 
If level requirements forced the max + references to be lowered, VPS and SPS maxDecPicBuffering was never + adjusted. Now, they are not configured until after level enforcement + has set them to final values. + [f0d157944c55] + +2014-07-16 Steve Borho + + * source/Lib/TLibCommon/TComSlice.h, source/encoder/encoder.cpp, + source/encoder/entropy.cpp: + pps: we never very chroma QP per picture, so remove + bSliceChromaQpFlag + [b59d09ca6c1b] + + * source/encoder/encoder.cpp, source/encoder/entropy.cpp: + pps: move deblocking logic together, fix a couple signaling bugs + [bb5e9a11868c] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/analysis.cpp, + source/encoder/encoder.cpp, source/encoder/entropy.cpp, + source/encoder/frameencoder.cpp, source/encoder/framefilter.cpp: + pps: class to struct, remove m_ prefix, unify naming scheme + [677547b73129] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/analysis.cpp, + source/encoder/encoder.cpp, source/encoder/entropy.cpp, + source/encoder/frameencoder.cpp, source/encoder/framefilter.cpp: + pps: remove set/get methods and hard-coded members + [31d11036bf9c] + + * source/Lib/TLibCommon/TComSlice.h: + slice: white-space nits + [febec529cc80] + + * 
source/Lib/TLibCommon/TComSlice.h: + vps: remove constructor, those values are always initialized + [c64bca550377] + + * source/Lib/TLibCommon/TComTrQuant.cpp: + sps: fix check statement + [8193ac4172fe] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/analysis.cpp, + source/encoder/dpb.cpp, source/encoder/encoder.cpp, + source/encoder/entropy.cpp, source/encoder/frameencoder.cpp, + source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h, + source/encoder/sei.h: + sps: class to struct, remove m_ prefixes + [59782b2bc59c] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/common.h, + source/encoder/analysis.cpp, source/encoder/dpb.cpp, + source/encoder/encoder.cpp, source/encoder/entropy.cpp, + source/encoder/frameencoder.cpp, source/encoder/ratecontrol.cpp, + source/encoder/sei.h: + sps: remove set/get methods, remove members with hard-coded values + + Note all the SPS members that could be replaced by direct access to + param. 
Removes m_vps pointer from slice, it was never set or + referenced + [e288df705bf3] + + * source/Lib/TLibCommon/TComSlice.h, source/encoder/encoder.cpp, + source/encoder/entropy.cpp: + vps: class to struct and other nits + [9f906f877ba8] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/analysis.cpp, + source/encoder/encoder.cpp, source/encoder/encoder.h, + source/encoder/entropy.cpp, source/encoder/frameencoder.cpp: + pps: remove hard-coded getLog2ParallelMergeLevelMinus2(0) + + and snuck in some other nits + [b9bb54fd5478] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, source/encoder/entropy.cpp: + remove hard-coded pps->getSliceHeaderExtensionPresentFlag() + [ba3146812b42] + + * source/Lib/TLibCommon/TComSlice.h, source/encoder/frameencoder.cpp: + nits + [92f32f4492f0] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, source/encoder/encoder.cpp, + source/encoder/entropy.cpp, source/encoder/frameencoder.cpp: + slice: remove redundant/unused scaling lists from SPS, PPS, and + Slice + + The SPS and PPS were allocating their own scaling list instances and + never using them; with the recent refactors only their status flags + were coded. TComSlice had a pointer to the top-level scaling list, + but never used it. + + "There can be only one!" 
+ [8969598e1d91] + + * source/Lib/TLibCommon/TComSlice.cpp: + slice: nits + [891f975f1ba5] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibCommon/TComTrQuant.cpp, source/common/common.h, + source/encoder/encoder.cpp, source/encoder/encoder.h, + source/encoder/entropy.cpp, source/encoder/frameencoder.cpp: + slice: remove trivial set/get methods, move all relavent functions + to the class + [164b43bfdf7a] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, source/encoder/dpb.cpp: + slice: remove unsignaled m_temporalLayerNonReferenceFlag + [b3db2d76500b] + + * doc/reST/cli.rst: + rest: fix crf min/max option names + [9213f5dea023] + + * source/Lib/TLibCommon/TComSlice.h: + slice: start to clean up scaling list struct + [04601befb930] + + * source/Lib/TLibCommon/TComSlice.h, source/encoder/encoder.cpp, + source/encoder/entropy.cpp: + slice: convert Window to struct, simplify names + [26f94fc58d5a] + + * doc/reST/cli.rst: + rest: fix documented default AQ mode + + As noted by Alex Ramsey + [c45eda4d780f] + + * source/Lib/TLibCommon/TComSlice.h, source/common/bitstream.h, + source/encoder/entropy.cpp, source/encoder/sei.h: + bitstream: change writeFlag() to take a bool to avoid int + conversions + [2737d0b05b72] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, source/encoder/encoder.cpp, + source/encoder/encoder.h, source/encoder/entropy.cpp, + source/encoder/frameencoder.cpp, source/encoder/ratecontrol.cpp, + source/encoder/sei.h: + vui: prune set/get methods and unsignaled fields + [cb5f30c19ccc] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h: + slice: finish cleanup of TComReferencePictureSet + [ab3e3c2b806f] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h: + slice: remove unused PPS ID + [06ceed0047c1] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComSlice.cpp, 
source/encoder/dpb.cpp: + brace nits + [f28129d63336] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, source/common/frame.cpp, + source/common/frame.h, source/encoder/dpb.cpp, source/encoder/dpb.h, + source/encoder/entropy.cpp, source/encoder/entropy.h, + source/encoder/frameencoder.cpp: + slice: Cleanup TComReferencePictureSet, prune the notion of long- + term-pictures + [afcfff7f654c] + +2014-07-15 Steve Borho + + * source/encoder/entropy.cpp: + entropy: nits + [d850cbf81e0f] + +2014-07-15 Deepthi Nandakumar + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncSearch.h, source/encoder/CMakeLists.txt, + source/encoder/analysis.cpp, source/encoder/analysis.h, + source/encoder/compress.cpp, source/encoder/cturow.h, + source/encoder/frameencoder.h: + port TEncCU functions into analysis structure + + 1. Rename TEncCU to Analysis 2. Remove warnings (/wd4244 /wd4389 + /wd4018) from TEncCU 3. Roll over compress.cpp into analysis.cpp 4. + Rename TEncCU to Analysis 5. Remove hungarian notation function + names + [76eb82b31c37] + +2014-07-15 Steve Borho + + * source/Lib/TLibEncoder/TEncSearch.cpp: + search: assert to X265_CHECK + [6cf3a2414461] + +2014-07-15 Min Chen + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/common/pixel.cpp, + source/common/primitives.h, source/common/x86/asm-primitives.cpp, + source/common/x86/pixel.h, source/common/x86/ssd-a.asm, + source/test/pixelharness.cpp, source/test/pixelharness.h: + replace sse_sp(residual, ZERO) by ssd_s(residual) + [7e6ac3a85073] + +2014-07-15 Santhoshini Sekar + + * source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h: + rc: define function for updating rate control statistics in + processRowEncoder + + Rate control updates its statistics like bits in RateControlEnd. 
+ With frame parallelsim enabled and N parallel frames running, the + feedback given to rate control is delayed until rateControlStart of + N frames are called. To avoid this delay, update rate control + statistics for every frame after encode of few frames are done in + processRowEncoder. By updating statistics for every frame we make + ABR to function more accurately (predicts more accurate QP) making + use of latest data rather than stale values. + + Frame parallelism lag (the wait for reconstructed pixels) already + forces this delay between frames, this change will simply take + advantage of the progress made by referenced frames before each + frame is allowed to begin coding by having more accurate rate + control data. + [27a27d733e1e] + +2014-07-15 Sumalatha Polureddy + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, source/encoder/encoder.cpp, + source/encoder/encoder.h: + fix for crash when scalingList feature is enabled + [919fd9de54dc] + +2014-07-15 Steve Borho + + * source/encoder/entropy.cpp: + entropy: remove obsolete check macro + [3499aa4fa20c] + + * source/Lib/TLibCommon/TComSlice.h: + slice: fix warning + [32aeb47c2d36] + + * source/Lib/TLibCommon/TComSlice.h: + slice: remove unused m_useDF + [6e0def026364] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, source/encoder/encoder.cpp, + source/encoder/entropy.cpp, source/encoder/frameencoder.cpp: + slice: remove more hard-coded fields + [3b0e141c3a2f] + + * source/Lib/TLibCommon/CommonDef.h, + source/Lib/TLibCommon/TComMotionInfo.h, source/encoder/encoder.cpp: + defs: remove more dead definitions + [105778f37879] + + * source/Lib/TLibCommon/TComSlice.h, source/encoder/entropy.cpp, + source/encoder/frameencoder.cpp, source/encoder/ratecontrol.cpp, + source/encoder/sei.h: + hrd: remove *Minus1 from member names and semantics, remove m_ + prefixes + + Note that hrd->cpbSizeValue stored a "minus 1" value but was not + named as such + + This 
commit also fixed a math bug (late up-conversion) in cpbSize + calculation in ratecontrol.cpp + [3bc86d0fbdc7] + + * source/Lib/TLibCommon/TComSlice.h, source/encoder/entropy.cpp, + source/encoder/frameencoder.cpp, source/encoder/ratecontrol.cpp, + source/encoder/sei.h: + hrd: remove set/get methods of TComHRD, remove unused members + [d9921a247139] + + * source/Lib/TLibCommon/TComSlice.h: + hrd: remove four unused fields + [e2003ed17684] + + * source/Lib/TLibCommon/TComSlice.h, source/encoder/entropy.cpp, + source/encoder/ratecontrol.cpp, source/encoder/sei.h: + hrd: CPB count is always 1 + [00b91817e3e4] + + * source/Lib/TLibCommon/TComSlice.h, source/encoder/entropy.cpp, + source/encoder/ratecontrol.cpp: + hrd: inline HrdLayerInfo, remove second array dimension (was for + nal/vcl) + [b9c3a067f89e] + + * source/Lib/TLibCommon/TComSlice.h, source/encoder/encoder.cpp, + source/encoder/entropy.cpp, source/encoder/frameencoder.cpp, + source/encoder/ratecontrol.cpp, source/encoder/sei.h: + hrd: we always signal NAL HRD but not VCL and not sub-pic + + Remove redundant fields and methods + [27c28450e098] + + * source/Lib/TLibCommon/TComSlice.h, source/encoder/encoder.cpp, + source/encoder/encoder.h, source/encoder/entropy.cpp, + source/encoder/ratecontrol.cpp: + slice: simplify TimingInfo to a struct + [9b807ca6313a] + + * source/Lib/TLibCommon/TComSlice.h: + slice: nits + [b0bc72c359d1] + + * source/Lib/TLibCommon/TComSlice.h, source/encoder/encoder.cpp, + source/encoder/entropy.cpp: + change ProfileTierLevel to a struct, no m_ prefixes + [758cb5de97b6] + + * source/Lib/TLibCommon/TComSlice.h, source/encoder/entropy.cpp: + ptl: remove hard-coded m_profileSpace + [bc9bbe6bfdb1] + + * source/encoder/entropy.cpp: + entropy: de-hungarian codePredWeightTable(), fix comments (remove + redundants) + [84478006fe0f] + + * source/encoder/entropy.cpp: + entropy: nits + [dfd2bd5c7cfa] + + * source/Lib/TLibCommon/TComSlice.h, source/encoder/entropy.cpp: + vps: remove 
m_maxLatencyIncrease, it was always 0 + [969a6a5de916] + + * source/Lib/TLibCommon/TComSlice.h, source/encoder/encoder.cpp, + source/encoder/entropy.cpp: + vps: remove remaining set/get methods + [d7b673cdbc78] + + * source/Lib/TLibCommon/TComSlice.h: + vps: remove now unused m_layerIdIncludedFlag + [bf28c2e0a8ce] + + * source/Lib/TLibCommon/TComSlice.h, source/encoder/entropy.cpp: + vps: remove unused m_maxNuhReservedZeroLayerId, m_numOpSets + + The values were all hard-coded + [00fafd0b078c] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, source/encoder/entropy.cpp, + source/encoder/frameencoder.cpp: + sps: hard-code an SPS ID of 0, we only ever use one SPS + [fe9aee4225a1] + + * source/Lib/TLibCommon/TComSlice.h, source/encoder/entropy.cpp, + source/encoder/frameencoder.cpp: + vps: remove m_VPSId, signal hard-coded 0 + [f7c168558bf0] + + * source/encoder/entropy.cpp: + entropy: remove broken X265_CHECK + [1dd0b14a8e9c] + + * source/encoder/frameencoder.cpp: + frameencoder: nits + [e68eedbd6795] + + * source/Lib/TLibCommon/CommonDef.h, + source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, source/encoder/entropy.cpp, + source/encoder/entropy.h: + remove unused multiple HRD parameter sets from VPS class + [27307f7547c0] + + * source/Lib/TLibCommon/CommonDef.h: + defs: remove some dead definitions, cleanup a comment + [ddb6dc16a64a] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, source/encoder/dpb.cpp: + slice: remove unused m_interRPSPrediction from + TComReferencePictureSet + [6ac88ce5d670] + + * source/Lib/TLibCommon/TComSlice.h, source/encoder/encoder.cpp: + slice: simplify max AMP level determination + [20ca3447ed96] + + * source/encoder/encoder.cpp, source/encoder/encoder.h: + encoder: remove redundant member variables + [12de88f4e110] + + * source/Lib/TLibCommon/CommonDef.h, + source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, source/encoder/dpb.cpp, 
+ source/encoder/encoder.cpp, source/encoder/encoder.h, + source/encoder/entropy.cpp, source/encoder/entropy.h, + source/encoder/frameencoder.cpp, source/encoder/ratecontrol.cpp, + source/encoder/sei.h: + Remove MAX_TLAYER and stop pretending we can code multiple temporal + layers + + This simplifies a great many things; enabling further simplications + to come. This commit leaves a sprinkling of TODOs for further + cleanups + [e07686925819] + + * source/encoder/entropy.cpp: + entropy: nits + [7fac1eb8252f] + +2014-07-14 Sumalatha Polureddy + + * source/encoder/compress.cpp: + compress: fix for sa8dcost storage in rdlevel 1, 0 + [c923f4a94946] + +2014-07-15 Deepthi Nandakumar + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h: + TEncCU: remove useless field + + There's a global variable for this. + [863d969e3ab9] + +2014-07-15 Steve Borho + + * source/Lib/TLibCommon/TComSlice.h, source/encoder/entropy.cpp, + source/encoder/entropy.h: + entropy: hoist a couple of trivial functions, brace nits + [5c5183eeacb5] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, source/encoder/encoder.cpp, + source/encoder/encoder.h, source/encoder/entropy.cpp, + source/encoder/entropy.h, source/encoder/frameencoder.cpp: + encoder: keep ProfileTierLevel singleton in the top-level encoder + + Remove the copies in the SPS and VPS and remove the TComPTL object + entirely, there is only one level. 
+ [83b498cd9aef] + + * source/encoder/entropy.cpp, source/encoder/entropy.h: + entropy: simplify PTL coding, there is only one layer + [8542b057d33e] + + * source/encoder/entropy.cpp, source/encoder/entropy.h: + entropy: remove profilePresentFlag argument from codePTL, always 1 + [5995d7bbfa1a] + + * source/Lib/TLibCommon/TComSlice.h, source/encoder/encoder.cpp, + source/encoder/entropy.cpp: + slice: Remove get/set methods from TComPTL + [c52a6321b845] + + * source/Lib/TLibCommon/TComSlice.h, source/encoder/encoder.cpp, + source/encoder/encoder.h, source/encoder/entropy.cpp: + slice: remove set/get methods from ProfileTierLevel, and dead + encoder vars + [fa4fc98233e0] + + * doc/reST/cli.rst, source/CMakeLists.txt, source/common/param.cpp, + source/encoder/frameencoder.cpp, source/x265.cpp, source/x265.h: + api: add an option to disable the informational SEI message + + For regression testing, or comparing outputs between compilers or + platforms, this header is quite unhelpful, so make it optional. 
+ [6f51bf4ba665] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, source/encoder/entropy.cpp, + source/encoder/entropy.h: + slice: remove RPS list class and relevant SPS encoding logic + + x265 only codes a reference picture set in the slice header, none in + the SPS and never an "inter-RPS" + [59855812b4ef] + + * source/common/param.cpp: + param: fix cut-paste bug + [836c08981029] + +2014-07-14 Steve Borho + + * source/Lib/TLibCommon/TComSlice.h, source/common/param.cpp, + source/encoder/entropy.cpp: + slice: white-space cleanups, remove dead funcdefs and forward defs + [2454ef2a083e] + + * source/Lib/TLibCommon/TComSlice.h: + slice: remove virtual destructors for slice classes + + we do not want or need vtables for any of these classes + [e3efe14f4e6a] + + * source/common/param.cpp, source/x265.cpp: + param: move --input-res parsing to x265_param_parse (closes #63) + [67c7bfda7c28] + + * source/CMakeLists.txt: + cmake: set SONAME even if no revision number or tag is found (closes + #58) + [51b3ff5e88e2] + + * source/Lib/TLibCommon/TComPicSym.cpp: + picsym: check m_cuData before trying to destroy array instances + [539e94e7aa34] + +2014-07-14 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComPattern.h, + source/Lib/TLibCommon/TComPicSym.cpp, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/frame.cpp, + source/common/intrapred.cpp, source/common/param.cpp, + source/common/primitives.h, source/common/shortyuv.cpp, + source/common/shortyuv.h, source/encoder/compress.cpp, + source/encoder/cturow.h, 
source/encoder/encoder.cpp, + source/encoder/entropy.cpp, source/encoder/entropy.h, + source/encoder/slicetype.cpp: + refine block size related, use more log2 domain. + [0af3c5a1782d] + +2014-07-14 Praveen Tiwari + + * source/encoder/encoder.cpp: + encoder.cpp, clean-up + [f2195700e1e4] + +2014-07-14 Steve Borho + + * source/Lib/TLibEncoder/TEncCu.cpp: + cu: white-space cleanups + [e7037f5914bc] + + * doc/reST/cli.rst: + rest: fix lossless white-space, improve --sign-hide + [baa6b00f7491] + +2014-07-14 Deepthi Nandakumar + + * source/encoder/compress.cpp: + compress: fix sa8dCost comparisons. + + After this patch, all sa8d costs are stored in m_sa8dCost, and RD + costs in m_totalRDCost + [6fdae4c60b1f] + + * source/encoder/encoder.cpp: + encoder: insert missing comma + [7a41bc257044] + +2014-07-13 Steve Borho + + * doc/reST/cli.rst, doc/reST/index.rst, doc/reST/lossless.rst, + doc/reST/presets.rst: + rest: document lossless coding theory and behavior + [d9e1f9c77ceb] + +2014-07-13 Deepthi Nandakumar + + * source/encoder/compress.cpp: + compress: save intra costs in rd <= 2 also in m_sa8dCost. + + Basically, all sa8d costs are always saved in m_sa8dCost. 
+ [6601a3d01974] + + * source/encoder/compress.cpp: + compress: save inter sa8d costs also in m_sa8dCost + [5bb9ccedb0de] + + * source/encoder/compress.cpp: + compress: Fixes fudging in merge costs, stores sa8d costs for + bestMergeCU in m_sa8dCost + [5051f1566664] + +2014-07-14 Deepthi Nandakumar + + * source/Lib/TLibEncoder/SyntaxElementWriter.cpp, + source/Lib/TLibEncoder/SyntaxElementWriter.h, + source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncSearch.h, source/encoder/cturow.cpp, + source/encoder/cturow.h, source/encoder/entropy.cpp, + source/encoder/entropy.h, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h, source/encoder/framefilter.cpp, + source/encoder/framefilter.h: + Merge, discard accidental push + [6055baa75085] + + * source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.h, + source/Lib/TLibEncoder/TEncSearch.h, source/encoder/cturow.cpp, + source/encoder/cturow.h, source/encoder/entropy.cpp, + source/encoder/entropy.h, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h, source/encoder/framefilter.cpp, + source/encoder/framefilter.h: + Backed out changeset: cff4f6482385 + + This head was accidentally created. 
+ [406879d25112] + +2014-07-13 Steve Borho + + * source/encoder/frameencoder.cpp: + frameencoder: explicit size conversion to avoid MSVC warning + [454a2fc37fee] + +2014-07-10 Steve Borho + + * source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.h, + source/Lib/TLibEncoder/TEncSearch.h, source/encoder/cturow.cpp, + source/encoder/cturow.h, source/encoder/entropy.cpp, + source/encoder/entropy.h, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h, source/encoder/framefilter.cpp, + source/encoder/framefilter.h: + entropy: rename combined CABAC/SBac class to Entropy + [cff4f6482385] + +2014-07-12 Steve Borho + + * source/encoder/encoder.cpp: + encoder: nits + [9d3683ab096b] + + * source/encoder/encoder.h: + encoder: remove unused variables + [7a9933d5df9e] + +2014-07-11 Steve Borho + + * source/encoder/frameencoder.cpp, source/encoder/sei.cpp, + source/encoder/sei.h: + sei: emit SEI describing the encoder and encode options + + example: % x265 ../sintel_trailer_2k_480p24.y4m o.bin % strings + o.bin | head x265 (build 26) - 1.2+74-a5024bfc0b50:[Mac OS X][clang + 5.1.0][64 bit] 8bpp - H.265/HEVC codec - Copyright 2013-2014 (c) + Multicoreware Inc - http://x265.org + - options: 856x480 fps=24/1 bitdepth=8 wpp ctu=64 tu-intra-depth=1 tu- + inter-depth=1 me=1 subme=2 merange=57 no-rect no-amp max-merge=2 no- + early-skip no-fast-cbf rdpenalty=0 no-tskip no-tskip-fast strong- + intra-smoothing no-lossless no-cu-lossless no-constrained-intra + open-gop interlace=0 keyint=250 min-keyint=24 scenecut=40 rc- + lookahead=20 bframes=4 bframe-bias=0 b-adapt=2 ref=3 weightp no- + weightb aq-mode=2 aq-strength=1.00 cbqpoffs=0 crqpoffs=0 rd=3 + signhide lft sao sao-lcu-bounds=0 sao-lcu-opt=1 b-pyramid cutree + rc=crf crf=28.0 qcomp=0.60 qpmin=0 qpmax=51 qpstep=4 ipratio=1.40 + pbratio=1.30 + [6af56f7c8703] + + * source/encoder/frameencoder.cpp: + frameencoder: nit + [7e3c96e01ca9] + + * 
source/common/param.cpp: + param: fix typo in d1d5b377294e + [6e116afd68e7] + + * source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h: + rc: nits + [fd87a7e5b1e3] + + * source/CMakeLists.txt: + cmake: nit - about target arch strings + [a5024bfc0b50] + +2014-07-11 Santhoshini Sekar + + * source/encoder/frameencoder.cpp, source/encoder/frameencoder.h: + frameencoder: store reflagrows as a member variable + [a1c553d36746] + +2014-07-11 Aarthi Thirumalai + + * source/encoder/encoder.cpp, source/encoder/ratecontrol.cpp, + source/encoder/ratecontrol.h: + rc: init RC data for second pass in a multi pass encode + [03164c7ddcbb] + + * source/CMakeLists.txt, source/x265.cpp, source/x265.h: + param: keep the total frame count of the input video in param. + + to be used in 2 pass. In case this is not available, the value can + remain as 0. + [aed58d050ff9] + + * source/encoder/frameencoder.cpp, source/encoder/frameencoder.h: + rc: store final cu counts in frameStats as a percentage of total # + cus + + there exists a mismatch in the total no. of 16x16 blocks calculated + between encoder and RC. rate control counts no. of 16x16 blocks + without considering the border extension of source pic. keeping cu + stats as percentage avoid this problem when used later in RC's 2nd + pass. + [07654693159b] + +2014-07-11 Steve Borho + + * source/common/param.cpp: + param: use dashes consistently in param2string + [d1d5b377294e] + +2014-07-11 Aarthi Thirumalai + + * source/common/param.cpp: + param: add more param options to print as string + [f3223737009e] + + * source/common/common.cpp, source/common/common.h: + common: introduce x265_slurp_file, reads data from file to string + buffer. 
+ [b4ee6251307e] + +2014-07-11 Steve Borho + + * source/encoder/level.cpp: + level: in CRF/CQP encodes, allow user specifed level to increase + stream level + + See the comment + [474c45db6a2c] + +2014-07-11 Deepthi Nandakumar + + * source/Lib/TLibEncoder/TEncCu.cpp, source/encoder/compress.cpp: + cu: fixing int-to-bool compile warnings + [29ae3f84c3ea] + +2014-07-11 Steve Borho + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/encoder/compress.cpp, source/encoder/encoder.cpp, + source/encoder/encoder.h: + cu: directly use param fields for lossless coding options + + Remove m_CUTransquantBypass, m_TransquantBypassEnableFlag + [e171ad8bc8a3] + + * source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp: + cu: nits + [2d0fd0241025] + + * source/x265.cpp: + cli: retrieve params from the encoder, in case logging flags have + changed + + For instance, if the user asked for --lossless --ssim, the encoder + will disable SSIM for lossless encodes and we don't want the CLI to + report SSIM stats that were never measured. + [c9a7be09cdc0] + +2014-07-11 Satoshi Nakagawa + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/rdcost.h: + zero stride for zeroPel[] + [a96d3ed11aaf] + +2014-07-10 Steve Borho + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/common/common.h, + source/encoder/compress.cpp, source/encoder/cturow.cpp, + source/encoder/cturow.h, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h: + search: make RDCost and TComTrQuant member instances of TEncSearch + + Both of these structs are required by TEncSearch, it cannot operate + without them There is always a 1::1 correlation with each. There is + no reason at all to allocated them separately in TLD and then link + them all together. Much simpler for them to be members of the + TEncSearch class. 
+ [77aeba71695e] + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/encoder/compress.cpp, + source/encoder/cturow.cpp, source/encoder/cturow.h, + source/encoder/frameencoder.cpp: + derive TEncCu from TEncSearch + + TEncSearch methods are only callable from TEncCu methods. and there + is much duplication in their data members. This removes a lot of + redundancy and removes a lot of pointer dereferences + [25e9c85aea1f] + +2014-07-11 Deepthi Nandakumar + + * source/encoder/compress.cpp: + compress: save CABAC context for rd = 2 after inter/intra encodes + [1c8573c886b1] + + * source/encoder/compress.cpp: + compress: save CABAC context after intra decision. + + Since CABAC encode happens only for rd > 2 + [65d4f5b4fd3a] + + * source/encoder/compress.cpp: + compress: save CABAC context after inter/merge decision + [b728ca41433f] + + * source/encoder/compress.cpp: + compress: save CABAC context in merge modes for lower RD levels. + [52cda492abf5] + +2014-07-10 Steve Borho + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/encoder/cturow.cpp, source/encoder/cturow.h, + source/encoder/entropy.cpp, source/encoder/entropy.h, + source/encoder/frameencoder.cpp, source/encoder/framefilter.cpp, + source/encoder/framefilter.h: + entropy: clarify the SBac's bit counting mode + + The SBac class has always had the ability to be a bit counter + without any other external data structures. With this change, the + SBac defaults to being a bit counting SBac until it is given a + Bitstream object to write into. The class no longer accepts a + BitCounter object, since it would only add more overhead to the bit + counting. + + TEncCu no longer needs o be told whether it is writing into a bit + counting SBac or not, it can ask the SBac to find out. 
+ + The BitCounting class is only used for SEI writing, and may + disappear in order to remove the vtable from the critical path of + entropy coding. + [e3e077965c39] + + * source/encoder/framefilter.cpp, source/encoder/framefilter.h: + framefilter: rename row0 coder + [e658be3fe5a3] + + * source/encoder/cturow.cpp, source/encoder/cturow.h, + source/encoder/frameencoder.cpp: + cturow: rename m_rdGoOnSbacCoder to m_sbacCoder; there is only one + [9e50b8b7503b] + + * source/encoder/frameencoder.h: + frameencoder: remove unnecessary include of TEncSearch.h + [7649ffe940e1] + + * source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/PPA/ppaCPUEvents.h, + source/output/y4m.cpp, source/output/yuv.cpp: + ppa: cleanup event names and remove uninteresting events + [0a0fe4d52711] + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/encoder/compress.cpp, + source/encoder/cturow.cpp: + cu, search: remove redundant m_rdGoOnSbacCoder pointer + + m_rdGoOnSbacCoder was always either pointing to the same SBac as + m_sbacCoder (for calls to compressCU()) or it was supposed to be + unused (for calls to encodeCU) + [d6c423c66e4d] + + * source/common/bitstream.cpp, source/common/bitstream.h: + bitstream: hoist trivial methods to the header so they may be easier + inlined + [57a4c1c2274e] + + * source/common/bitstream.h: + bitstream: convert macros into inline members for better compile- + time checking + [b96714bdda85] + + * source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibEncoder/SyntaxElementWriter.cpp, + source/Lib/TLibEncoder/SyntaxElementWriter.h, + source/common/bitstream.cpp, source/common/bitstream.h, + source/encoder/CMakeLists.txt, source/encoder/encoder.cpp, + source/encoder/entropy.cpp, source/encoder/frameencoder.cpp, + source/encoder/sei.cpp, source/encoder/sei.h: + bring SyntaxElementWriter into 
common/, remove trace file macros + + File tracing has been essentially broken since frame parallelism was + introduced (not to mention multiple encoder instances in a process) + and is going to be soon even more impossible when the second + encodeCU() pass is removed. It's best to remove this old HM feature + so people don't get stuck trying to use it. + + Stream analyzers like Parabola and Elecard are now mature enough + that these trace files are not very helpful. If you *really* want + the old style trace file, enable tracing in the HM decoder and + decode your bitstream with it. + + I left the strings in the WRITE_* macros in place because they make + those calls reasonably self-documenting. The compiler will throw + them away harmlessly. + [64dc40d52519] + + * source/encoder/entropy.cpp: + entropy: nits + [cbfe2ac89d41] + + * source/encoder/entropy.h: + entropy: re-order methods again for better clarity, make many + private + [dd179bdba7fd] + + * source/Lib/TLibCommon/TComSampleAdaptiveOffset.h, + source/encoder/entropy.h: + entropy: remove include of TComSampleAdaptiveOffset.h + [6c95abc53d26] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, source/encoder/entropy.cpp, + source/encoder/entropy.h: + TComTrQuant: rename estBitsSbacStruct to EstBitsSbac + [4fcd24d23994] + + * source/encoder/entropy.cpp: + entropy: nits + [8084d123f14d] + + * source/encoder/entropy.cpp, source/encoder/entropy.h: + entropy: hoist trivial functions for later optimizations + [898fc94ae9e9] + + * source/encoder/cturow.h, source/encoder/entropy.cpp, + source/encoder/entropy.h, source/encoder/framefilter.cpp: + merge CABAC into SBac class + [ef1b1da7264c] + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/entropy.cpp, + source/encoder/entropy.h: + entropy: convert initSection into a constructor, since it was used + as such + [ac568c7796b8] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: nits + [9914e8d8e2e4] + + * 
source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/encoder/entropy.cpp, + source/encoder/entropy.h: + entropy: replace SBac and TEncSearch methods with TURecurse methods + + Both classes had nearly the exact same methods for managing this + external structure. how wierd? + [657bfd3423a5] + +2014-07-09 Steve Borho + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/entropy.cpp, + source/encoder/entropy.h: + entropy: remove m_ prefix from struct TURecurse members + [e776ead24445] + +2014-07-09 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/encoder/compress.cpp, source/encoder/framefilter.cpp: + use std::swap() for readability + [194432db28b9] + +2014-07-09 Steve Borho + + * source/Lib/TLibEncoder/TEncSearch.cpp: + search: remove braces around single line expressions, improve code + flow + [30e20a7b6fdd] + + * Merge with stable + [56c4719ae735] + + * .hgtags: + Added tag 1.2 for changeset d6257335c537 + [7ea0ba364367] + +2014-07-03 Steve Borho + + * source/encoder/encoder.cpp, source/encoder/encoder.h: + csv: add slice-type stats to encode summary, avoid dup copies of + summary string + [5c7da1e88682] + +2014-07-09 Steve Borho + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/entropy.h: + entropy: remove temporary helper function + [b7934e58ea40] + + * source/encoder/entropy.cpp: + entropy: nits + [a86cd034e254] + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/encoder/compress.cpp: + cu: remove get/set methods for its own member variable + [b1d077221c25] + + * source/Lib/TLibEncoder/TEncCu.h, source/encoder/cturow.cpp: + cu: remove more set methods + [cd93637ce951] + + * source/Lib/TLibEncoder/TEncCu.cpp, source/encoder/cturow.cpp: + cu: nit + [c78b7cca188a] + + * source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncSearch.h, 
source/encoder/cturow.cpp: + cu: assign m_rdGoOnSbacCoder without helper methods + [a44b2c12fe04] + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/encoder/cturow.cpp, source/encoder/frameencoder.cpp: + cu: pass bit counting flag as argument to encodeCU + [4c98884a7729] + + * source/encoder/cturow.cpp, source/encoder/cturow.h, + source/encoder/frameencoder.cpp: + ctu: inline setThreadLocalData() for better clarity + [6c3a4a5498f1] + + * source/encoder/frameencoder.cpp, source/encoder/frameencoder.h: + frameencoder: remove rarely used getRowCoder() and getBufferSBac() + [22ac140e0da8] + + * source/encoder/frameencoder.cpp, source/encoder/frameencoder.h: + frameencoder: remove barely use helper function getRDGoOnSbacCoder + [ddf49ea5073c] + + * source/encoder/cturow.h, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h: + ctu: rename m_sbacCoder to m_rowEntropyCoder + [6a1370596ede] + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.h, + source/Lib/TLibEncoder/TEncSearch.h, source/encoder/compress.cpp, + source/encoder/cturow.cpp, source/encoder/cturow.h, + source/encoder/entropy.cpp, source/encoder/entropy.h, + source/encoder/frameencoder.cpp: + entropy: allocate RDO coders as dual-dimensioned array + + In order to make this change even less intrusive as it was going to + be, I simultaneously changed the load/store functions to take + references instead of pointers (since NULL pointers are not + supported). 
+ + This further simplified setup/tear-down code and should improve data + locality + [c49b61c5e6bb] + + * source/encoder/framefilter.cpp, source/encoder/framefilter.h: + framefilter: rename m_rdGoOnSbacCoder to m_sbacCoder - there is only + one + [878e6f228fe6] + + * source/Lib/TLibEncoder/TEncBinCoderCABAC.cpp, + source/Lib/TLibEncoder/TEncBinCoderCABAC.h, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.h, + source/encoder/CMakeLists.txt, source/encoder/cturow.cpp, + source/encoder/cturow.h, source/encoder/entropy.cpp, + source/encoder/entropy.h, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h, source/encoder/framefilter.cpp, + source/encoder/framefilter.h: + entropy: make m_cabac member of SBac class a non-pointer instance + + This forces a 1::1 correspondence between SBac and CABAC instances, + which cleans up a mess of confusing allocations and assignments. + + This moves the CABAC class into entropy.h and it is no longer + referenced outside of this header and entropy.cpp, allowing us to + further combine them. + [f60a2f9a88ad] + + * source/encoder/entropy.h: + entropy: remove now unused Entropy class + [461e3e940b0c] + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/encoder/compress.cpp, + source/encoder/cturow.cpp, source/encoder/cturow.h, + source/encoder/frameencoder.cpp: + remove Entropy from TEncCu, TEncSearch, and CTURow + + Instead, give each an SBac pointer that they are supposed to use to + code the bitstream. 
+ [ff987b44de91] + + * source/encoder/frameencoder.cpp, source/encoder/frameencoder.h: + frameencoder: remove most traces of the Encoder class + [02f8c0af20f2] + + * source/encoder/frameencoder.cpp, source/encoder/frameencoder.h: + frameencoder: do not pass member variable to its own function + + We were still passing an array of Bitstream objects to encodeSlice() + even though the array was made a member of the class so it was + available via m_outStreams. Within the function is was using both + pointers, unhelfpully. + [6589396373de] + + * source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/encoder/cturow.cpp, source/encoder/cturow.h, + source/encoder/entropy.cpp, source/encoder/entropy.h, + source/encoder/frameencoder.cpp, source/encoder/framefilter.cpp: + entropy: remove m_slice member from SBac class + + The slice object was only needed in resetEntropy() and + determineCabacInitIdx() and could be passed to those two methods + easily. + + It was also being used in two functions to gain access to the + scaling list, but this was actually a problem for us because headers + can be queuried before any slice objects are initialized so it was + better to pass in the scaling lists directly from the top Encoder. + Note this fixes a bug in scaling list support - we were previously + incapable of signaling them properly in our stream headers. + + Removing this member variable lets us clean up a lot of redundant + setSlice() calls and other cruft + [576bb702f286] + + * source/encoder/frameencoder.cpp: + frameencoder: do not use Entropy object to code stream headers + [919c77a86b29] + + * source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp: + sao: remove redundant braces and other nits + [1429920a7420] + + * source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.h, + source/encoder/framefilter.cpp, source/encoder/framefilter.h: + remove m_entropyCoder from FrameFilter and SAO code + + Lo and Behold! 
the SAO code suddenly makes a tad more sense when + this useless redirection is removed + [767a10ce1a64] + + * source/encoder/entropy.cpp, source/encoder/entropy.h: + entropy: remove unnecessary API differences between Entropy and SBac + + These would only trip us up as we try to remove Entropy entirely + [ca2b4259c44f] + +2014-07-08 Steve Borho + + * source/encoder/entropy.h: + entropy: reorder SBac methods for more clarity + [01e20a16e890] + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/entropy.h: + entropy: remove default argument to encodeIntraDirModeLuma() + [436bd8207469] + + * source/encoder/entropy.cpp: + entropy: fix msvc warnings about signed/unsigned operations + [a25a3782e51c] + + * source/encoder/entropy.cpp, source/encoder/entropy.h: + entropy: hoist all non-trivial Entropy methods into SBac + + Now Entropy only exists in entropy.h, making further cleanups + possible + [e4a21cfa8206] + + * source/encoder/entropy.cpp, source/encoder/entropy.h: + entropy: remove all data members except m_entropyCoder + + The three variables were only used to maintain state through + recursive calls to encodeTransform() and could be just as easily + kept on the stack. + + The class is now a pure reflector to the SBac class and can be merge + into it piece-wise. 
+ [5321d02d9703] + + * source/Lib/TLibCommon/ContextTables.h, + source/Lib/TLibCommon/TComSlice.h, source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncEntropy.cpp, + source/Lib/TLibEncoder/TEncEntropy.h, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.h, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSbac.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/common/primitives.cpp, + source/encoder/CMakeLists.txt, source/encoder/cturow.cpp, + source/encoder/cturow.h, source/encoder/entropy.cpp, + source/encoder/entropy.h, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h, source/encoder/framefilter.cpp, + source/encoder/framefilter.h: + port TEncEntropy and TEncSbac into internal Entropy and SBac classes + + The Entropy class will be short-lived + [53fcddea0959] + +2014-07-09 Deepthi Nandakumar + + * Merge with stable + [644773b85329] + + * source/Lib/TLibEncoder/TEncCu.cpp: + TEncCU: bug fix introduced in commit 8a5b02464813 + + Remove the xCheckBestMode call, add in a ccontext save. 
+ [d6257335c537] [1.2] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + Merge with stable + [a1e46d813642] + +2014-07-09 Min Chen + + * source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp: + TEncCU: save CABAC context missing after merge + + Also removed an extra context save + [8a5b02464813] + + * source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp: + TEncCU: save CABAC context missing after merge + + Also removed an extra context save + [c269f73b94c9] + +2014-07-07 Satoshi Nakagawa + + * source/Lib/TLibCommon/CommonDef.h, + source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/dct.cpp, + source/common/primitives.h, source/common/x86/pixel-util.h, + source/common/x86/pixel-util8.asm, source/encoder/encoder.cpp, + source/test/mbdstharness.cpp: + quant: returns numSig instead of absSum and lastPos + [65ac66dc89b6] + +2014-07-07 Steve Borho + + * source/Lib/TLibCommon/NAL.h, source/Lib/TLibCommon/SEI.h, + source/Lib/TLibCommon/TComBitCounter.h, + source/Lib/TLibCommon/TComBitStream.cpp, + source/Lib/TLibCommon/TComBitStream.h, + source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/Lib/TLibEncoder/NALwrite.cpp, + source/Lib/TLibEncoder/NALwrite.h, + source/Lib/TLibEncoder/SEIwrite.cpp, + source/Lib/TLibEncoder/SEIwrite.h: + Merge with default (prepare for 1.2) + [6623f1195baa] + + * source/encoder/frameencoder.cpp: + frameencoder: nits + [6325261d393d] + + * source/Lib/TLibCommon/TComDataCU.cpp: + cu: do not copy a QP buffer into itself - prevents valgrind warning + [6aa084ad45a0] + + * source/Lib/TLibCommon/TComDataCU.cpp: + cu: cleanup and simplify initCU + [c60560dcae1c] + + * source/encoder/frameencoder.cpp, source/encoder/ratecontrol.cpp, + source/encoder/ratecontrol.h, source/encoder/sei.h: + rc: allocate 
HRD SEI structures on demand, fix RCE vtable/memset bug + [82963e72231b] + +2014-07-03 Min Chen + + * source/encoder/frameencoder.cpp: + avoid VS2008 and MinGW ambiguous pow() build warning + [11c808e562b8] + +2014-07-03 Deepthi Nandakumar + + * source/encoder/frameencoder.cpp: + frameencoder: remove warning about ambiguous pow function + [e3f9acd4ff88] + + * source/common/param.cpp: + lambda-file: file close before return + [8620deb17a19] + +2014-07-03 Steve Borho + + * source/input/y4m.cpp: + y4m: avoid implicit size_t to int conversion, avoid more MSVC + warnings + [1dc27824bde1] + + * source/encoder/encoder.h, source/encoder/frameencoder.cpp: + encoder: avoid int64_t to int conversions, avoid more MSVC warnings + [50291ad2cfe8] + + * source/common/param.cpp: + param: avoid spurious MSVC warning about buf being possibly + uninitialized + [e8ebfa3cf395] + + * source/encoder/ratecontrol.cpp: + ratecontrol: fix cast of cutree offset, avoids MSVC warning + [85dce645d0fc] + + * source/encoder/frameencoder.cpp, source/encoder/frameencoder.h: + frameencoder: match up integer argument types to avoid MSVC warning + + The prototype in the WaveFront base class was: virtual void + processRow(int row, int threadId) = 0; + + Some versions of MSVC flag this as a warning + [c41c7a78e46a] + + * source/common/param.cpp: + param: avoid spurious gcc warning about toksave possibly being + uninitialized + [eaee58b6515b] + +2014-07-02 Steve Borho + + * source/common/param.cpp, source/common/param.h, + source/encoder/encoder.cpp: + lamba-file: make improper number of constants a fatal error + + Having an invalid set of lambdas is likelty to be completely useless + [1e94a2b12d15] + + * source/common/param.cpp: + lambda-file: detect and report if lamda file containts too many + values + + It seems likely that this is an unintended error the user would want + to be made aware of. 
+ [0013dbcac349] + +2014-07-02 Min Chen + + * source/Lib/TLibEncoder/TEncCu.cpp: + simplify: getLumaIntraDir()[x] -> getLumaIntraDir(x) + [959ff37cdd31] + +2014-07-02 Steve Borho + + * doc/reST/cli.rst, source/CMakeLists.txt, + source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/common/param.cpp, source/common/param.h, + source/encoder/encoder.cpp, source/x265.cpp, source/x265.h: + api: allow lambda tables to be user-specified via a text file + + This change allows easy experimentation with the lambda tables. One + can cut-paste the existing tables from TComRom.cpp into a text file + and hash(#) comment the C constructs (variable names and braces) and + arrive at a functional lambda file, then edit to taste. + [44dc246b7835] + + * source/encoder/frameencoder.cpp, source/encoder/ratecontrol.cpp, + source/encoder/ratecontrol.h: + rc: fix gcc warnings + [b90fdc3ffb96] + +2014-06-30 Aarthi Thirumalai + + * source/encoder/encoder.cpp, source/encoder/ratecontrol.cpp, + source/encoder/ratecontrol.h: + rc: write frame stats and cu stats to file in first pass + [887081b5f694] + +2014-06-26 Aarthi Thirumalai + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/encoder/cturow.h, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h: + rc: compute inter, intra and skip cus per frame for the first pass + [5bee122bc183] + +2014-07-01 Aarthi Thirumalai + + * source/common/frame.cpp, source/common/frame.h, + source/encoder/frameencoder.cpp, source/encoder/frameencoder.h: + rc: move structure FrameStats to FrameEncoder class + + Stats are not needed post encode, moving it to FrameEncoder. 
+ [8f76f88c7dbf] + +2014-07-02 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComTrQuant.cpp, source/common/dct.cpp, + source/common/primitives.h, source/common/x86/asm-primitives.cpp, + source/common/x86/pixel-util.h, source/common/x86/pixel-util8.asm, + source/test/mbdstharness.cpp, source/test/mbdstharness.h: + add primitives.nquant for RDOQ + [5bfd621a58b9] + +2014-07-02 Kavitha Sampath + + * source/common/frame.cpp: + frame: initialize recon to avoid SAO read of uninitialized pixels + beyond picture end + [cbe0999934b7] + +2014-07-02 Steve Borho + + * source/encoder/encoder.cpp, source/encoder/encoder.h, + source/encoder/frameencoder.cpp, source/encoder/frameencoder.h, + source/encoder/framefilter.cpp, source/encoder/framefilter.h: + encoder: fix --no-wpp behavior, keep TLD selection logic in one + place + + do not use static m_threadLocalData, this would break if multiple + encoders were allocated in a single process + + pass the selected TLD to row processes, so they do not need to be + aware of the WPP/NO-WPP details. + + Cleanup frameencoder.h, move non-trivial processRow() to the cpp + file + [0743791a8245] + +2014-07-01 Steve Borho + + * source/input/y4m.cpp, source/input/y4m.h, source/input/yuv.cpp, + source/input/yuv.h: + input: streamline control logic of threaded file readers + + These files were written before ThreadSafeInteger and this caused + the control logic to be over-complicated. Now they can be greatly + simplified and their control flows can be re-unified to be more like + each other. + + Also, drop PPA events. File reading is pretty uninteresting when + profiling. 
+ [99b8e4d69e0f] + +2014-07-02 Steve Borho + + * source/Lib/TLibEncoder/TEncEntropy.cpp, + source/Lib/TLibEncoder/TEncEntropy.h: + TEncEntropy: inline the only one caller of encodeInterDirPU + [f483344d276f] + + * source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncEntropy.cpp, + source/Lib/TLibEncoder/TEncEntropy.h, + source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/compress.cpp: + TEncEntropy: hoist encodePredMode() after moving I-slice check to + callers + [ca17915c0176] + + * source/Lib/TLibEncoder/TEncEntropy.h: + TEncEntropy: reorder nit + [a4cf2c474fb8] + + * source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncEntropy.cpp, + source/Lib/TLibEncoder/TEncEntropy.h, + source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/compress.cpp: + TEncEntropy: hoist encodeSkipFlag() after moving I-slice check to + callers + [8208c49e9aa4] + +2014-07-01 Steve Borho + + * source/Lib/TLibEncoder/TEncEntropy.cpp, + source/Lib/TLibEncoder/TEncEntropy.h, + source/encoder/frameencoder.cpp: + TEncEntropy: hoist encodeSliceHeader() to header after simplifying + [719ec54347ab] + + * source/Lib/TLibEncoder/TEncEntropy.cpp: + TEncEntropy: nit + [166b7ddace37] + + * source/Lib/TLibEncoder/TEncEntropy.cpp, + source/Lib/TLibEncoder/TEncEntropy.h: + TEncEntropy: hoist trivial functions to header for possible inlining + + And remove useless comments + [a15e58e38501] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: nit + [a18972fd05b1] + + * source/Lib/TLibCommon/TComDataCU.cpp: + TComDataCU: remove redundant refIdx check from fillMvpCand + + fillMvpCand is only called from one place and the refIdx will always + be positive + [7006c14a6149] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + Merge with stable + [976dc15ea5ea] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: limit AMVP loop bounds in xCheckBestMVP + + entries in the table beyond m_num were not initialized + [4f7be97ebb9d] + +2014-07-01 Ashok Kumar Mishra + + * 
source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibEncoder/TEncCu.cpp, source/encoder/compress.cpp: + remove redundant memset + [28a17ce0bf5c] + +2014-07-01 Satoshi Nakagawa + + * source/Lib/TLibEncoder/TEncEntropy.cpp, + source/Lib/TLibEncoder/TEncEntropy.h, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSbac.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + split rate calculation functions to luma and chroma to simplify luma + path + [fb54c0d470ed] + +2014-06-27 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp: + fix emms: move selectLambda() into xRateDistOptQuant() and issue + emms before it + [b3ff3f436bc9] + +2014-06-25 Aarthi Thirumalai + + * source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h: + rc: rename texBits to coeffBits in RateControlEntry structure to + maintain uniformity. + [f18febf8cd3a] + +2014-06-30 Steve Borho + + * source/encoder/encoder.cpp: + encoder: RDOQ is only applicable at rd levels 4, 5, and 6 + [38da32f28481] + +2014-06-27 Aarthi Thirumalai + + * source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/compress.cpp, + source/encoder/frameencoder.cpp: + rc: accumulate mv bits, coeff bits per frame + [3d4b66c1bc88] + +2014-06-30 Aarthi Thirumalai + + * source/Lib/TLibCommon/TComDataCU.cpp: + TComDataCU: remove warnings for shadowed declaration + [5a37c8198035] + +2014-06-30 Albert Wang + + * source/encoder/frameencoder.cpp: + frameencoder: fix for error in VPS when AccessUnitDelimeter is + turned on + + The bitstream needs to be reset before start encode VPS, since if + AU_Delimeter is turned on, there is one byte of data left in the + bitstream that will be written wrongly into the VPS. 
+ [8eb8200a3449] + +2014-06-30 Steve Borho + + * doc/reST/cli.rst: + rest: document ipratio and pbratio parameters + [4e565eb62d0a] + +2014-06-25 Satoshi Nakagawa + + * source/common/param.cpp, source/x265.cpp: + cli: add --ipratio and --pbratio + [c80f9f3fba25] + +2014-06-21 Steve Borho + + * source/x265.cpp: + cli: use consistent bool naming convention + [8b6b3e05946f] + + * source/Lib/TLibEncoder/TEncEntropy.cpp: + entropy: nit + [3a71cbcaee1f] + +2014-06-21 Satoshi Nakagawa + + * source/input/y4m.cpp, source/input/yuv.cpp: + input: fix race condition + [8253fe375772] + +2014-06-26 Min Chen + + * source/Lib/TLibEncoder/TEncSbac.cpp: + reduce condition jmp in codeCoeffNxN + [32aa6cc3cf4d] + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel- + util.h, source/common/x86/pixel-util8.asm: + improve count_nonzero by SSSE3 + [e5d8a2fa6bca] + +2014-06-27 Sumalatha Polureddy + + * source/Lib/TLibEncoder/TEncSearch.cpp: + psyrd: fix for inconsistent output + + maximum buffer size for zeropel is MAX_CU_SIZExMAX_CU_SIZE. since + stride was wrong, it was accessing out of boundary memory which was + different for each run, so inconsistent output + [a765a34425f0] + +2014-06-27 Deepthi Nandakumar + + * source/encoder/framefilter.cpp: + framefilter: disable warning + [98a2bfed9ed4] + +2014-06-23 Min Chen + + * source/encoder/cturow.h, source/encoder/encoder.cpp, + source/encoder/encoder.h, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h, source/encoder/framefilter.cpp, + source/encoder/framefilter.h: + pass TLD into class FrameFilter + [66c701cb5500] + +2014-06-24 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TComYuv.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/common/shortyuv.cpp, + source/common/shortyuv.h: + refine intra tskip related. 
+ [b9bc64443ee4] + +2014-06-25 Aarthi Thirumalai + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, source/common/frame.cpp, + source/common/frame.h: + rc: initialize states to hold frame statistics used in two pass + + frame stats includes mv bits, DC coeff bits and number of Intra, + Inter and Skip Cus per frame. + [1b669c33ff3a] + +2014-06-25 Sumalatha Polureddy + + * source/encoder/compress.cpp, source/encoder/encoder.cpp: + psy-rd: implement psy-rd in rdlevel=4,3 and 2 + [e2ed009d296a] + +2014-06-25 Ashok Kumar Mishra + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComPicSym.cpp, + source/Lib/TLibCommon/TComPicSym.h, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.h, + source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibEncoder/TEncBinCoderCABAC.cpp, + source/Lib/TLibEncoder/TEncBinCoderCABAC.h, + source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncEntropy.cpp, + source/Lib/TLibEncoder/TEncEntropy.h, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSbac.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/common/frame.cpp, + source/encoder/compress.cpp, source/encoder/encoder.cpp, + source/encoder/encoder.h, source/encoder/frameencoder.cpp, + source/encoder/framefilter.cpp: + remove PCM mode support + [5797d6a8197c] + +2014-06-24 Satoshi Nakagawa + + * source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp: + fix xGetIntraBitsQTChroma() for 4:2:2 [CHANGES OUTPUT 4:2:2 with + tskip] + [09450ac6dc7d] + +2014-06-24 Deepthi Nandakumar + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/rdcost.h: + psy-rd chroma: Adjust chroma lambda based on QP offsets. 
+ + This hurts PSNR but improves visual quality + [613bfe5cd169] + + * source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/common/common.h: + psy-rd chroma: add chroma offset tables for lambda calculations + [20611feb4a45] + + * source/encoder/encoder.cpp: + Chroma QP Offset: increase chroma QP when psy-rd is enabled. + [812dc9f61549] + +2014-06-23 Min Chen + + * source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.h, + source/encoder/frameencoder.cpp, source/encoder/framefilter.cpp: + move calcSaoStatsRowCus_BeforeDblk into encode loop + [18f936182df5] + +2014-06-23 Ashok Kumar Mishra + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComMotionInfo.cpp, + source/Lib/TLibCommon/TComMotionInfo.h, + source/Lib/TLibCommon/TComPicSym.cpp, + source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h: + TEncCU:fix for memory leak + [25a1b89a8efb] + +2014-06-20 Praveen Tiwari + + * source/encoder/compress.cpp: + TEncCu: [CHANGES OUTPUT] xComputeCostIntraInInter, fix bug + [b5fbe9e2a10a] + +2014-06-21 Ashok Kumar Mishra + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComMotionInfo.cpp, + source/Lib/TLibCommon/TComMotionInfo.h, + source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h: + TEncCU:fix for large memory consumption + [da4aa721bf2f] + +2014-06-20 Min Chen + + * source/Lib/TLibCommon/TComPicSym.cpp, + source/Lib/TLibCommon/TComPicSym.h: + alloc concatenation memory for m_cuData + [fe370292c232] + +2014-06-20 Steve Borho + + * source/Lib/TLibEncoder/TEncEntropy.cpp, + source/Lib/TLibEncoder/TEncEntropy.h, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSbac.h, source/encoder/frameencoder.cpp, + source/encoder/framefilter.cpp: + entropy: remove TEncEntropyIf abstract class + + Only one class implemented the interface, so it 
served no purpose + (except adding a vtable where we didn't need one). Renamed some + member variables to make it all more explicit: m_entropyCoderIf -> + m_entropyCoder, m_binIf -> m_cabac + [960ea9018a7b] + + * source/Lib/TLibEncoder/TEncEntropy.cpp: + TEncEntropy: nit + [bf4fd0756cbb] + + * source/encoder/frameencoder.cpp: + frameencoder: remove more redundant checks + + slice->getSaoEnabledFlag() is the same as saoParam->bSaoFlag[0] + slice->getSaoEnabledFlagChroma() is the same as + saoParam->bSaoFlag[1] + [6111f474ef99] + + * source/encoder/frameencoder.cpp: + frameencoder: simplify cabac context save + [d0b71172d7da] + + * source/Lib/TLibCommon/TComSampleAdaptiveOffset.h, + source/encoder/frameencoder.cpp: + frameencoder: simplify SAO reset when not enabled on slice + [06b03c4647d6] + + * source/encoder/frameencoder.cpp: + frameencoder: further simplify SAO coding + [8e37416993db] + + * source/encoder/frameencoder.cpp: + frameencoder: simplify SAO coding logic + [f24ac9e50e4d] + + * source/encoder/frameencoder.cpp: + frameencoder: simplify loop bounds + [ce47fb939f69] + + * source/encoder/frameencoder.cpp: + frameencoder: further simplify the logic which sync's CABAC with + upper-right LCU + [613f9a60a5d3] + + * source/encoder/frameencoder.cpp: + frameencoder: remove redundant widthInCU variable + [4857a503d709] + + * source/encoder/frameencoder.cpp: + frameencoder: simplify logic slightly, numSubstreams can be > 1 only + with WPP + [0e6200c402cf] + + * source/encoder/frameencoder.cpp: + frameencoder: nits + [4455746792b0] + + * source/encoder/frameencoder.cpp: + frameencoder: move trace code for clarity, remove dead + bWaveFrontsynchro + [bb6c9f097ba2] + + * source/encoder/frameencoder.cpp: + frameencoder: use m_param->bEnableWavefront directly + [8911196c3bf1] + + * source/encoder/frameencoder.cpp: + frameencoder: simplify setup for encodeSlice() + [847cb38b409e] + + * source/encoder/frameencoder.cpp, source/encoder/frameencoder.h: + frameencoder: rename 
m_pic to m_frame + [0aab15ebff0d] + + * source/encoder/frameencoder.cpp: + frameencoder: remove unread bitsOriginallyInSubstreams + [5fde40742246] + +2014-06-20 Aarthi Thirumalai + + * doc/reST/cli.rst, source/CMakeLists.txt, + source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/common/param.cpp, source/encoder/encoder.cpp, + source/encoder/frameencoder.cpp, source/x265.cpp, source/x265.h: + make cu level stats logging run time configurable + + two-pass encodes will need these statistics, so they need to always + be compiled into libx265. + [c9f36b715b3f] + +2014-06-20 Steve Borho + + * source/x265.h: + api: comment nits + [868e35687ac7] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComLoopFilter.h, + source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComPattern.h, + source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.h, + source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComWeightPrediction.h, + source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncEntropy.h, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.h, + source/common/CMakeLists.txt, source/common/frame.cpp, + source/common/frame.h, source/common/lowres.cpp, + source/common/param.cpp, source/common/piclist.cpp, + source/common/piclist.h, source/encoder/compress.cpp, + source/encoder/cturow.h, source/encoder/dpb.cpp, + source/encoder/dpb.h, source/encoder/encoder.cpp, + source/encoder/encoder.h, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h, source/encoder/framefilter.cpp, + source/encoder/framefilter.h, 
source/encoder/ratecontrol.cpp, + source/encoder/ratecontrol.h, source/encoder/slicetype.cpp, + source/encoder/slicetype.h, source/encoder/weightPrediction.cpp: + replace TComPic with Frame class in common/frame.h + + The new class name clashes with old variables like pic or m_pic, but + I would like to pull in TComPicYuv in the future as a Picture class, + and TComYUV as Yuv class. + [6a11b4e683e1] + + * source/encoder/frameencoder.cpp: + frameencoder: fix --no-wpp, broken since 854f5e3072a0 + [2317cb9af53e] + +2014-06-20 Sumalatha Polureddy + + * source/common/param.cpp, source/encoder/encoder.cpp: + encoder: move psy-rd logic together + [c2ebebb66bf4] + +2014-06-20 Steve Borho + + * source/encoder/encoder.cpp: + encoder: use free() to release strdup'd string + [36f6a06df2d5] + +2014-06-19 Steve Borho + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, + source/encoder/frameencoder.cpp: + TEncSearch: change pointer type of m_mref, to avoid repeated pointer + setup + [e89fb21a4e06] + + * source/Lib/TLibEncoder/TEncSearch.h, source/encoder/cturow.cpp, + source/encoder/cturow.h, source/encoder/encoder.cpp, + source/encoder/encoder.h, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h: + cturow: use thread local storage for TEncCU and TEncSearch + + This saves about 10% of memory by reducing redundancy, and lays the + ground work for more fine grained work distribution. 
+ [854f5e3072a0] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComLoopFilter.h, + source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, source/encoder/encoder.cpp, + source/encoder/encoder.h, source/encoder/framefilter.cpp: + cu: remove m_loopFilterAcrossTilesEnabledFlag and getPU* default + arguments + [f3ad033af809] + +2014-06-20 Steve Borho + + * source/encoder/frameencoder.cpp: + frameencoder: comment nits + [d68c638e1469] + +2014-06-20 Sumalatha Polureddy + + * source/encoder/encoder.cpp: + encoder:[CHANGES OUTPUT] disable rdoq when psyrd is enabled + [3780de35ba2a] + +2014-06-20 Steve Borho + + * source/encoder/nal.cpp: + nal: include the start code bytes in the bounds check + [cc0beb898def] + + * source/encoder/nal.cpp: + nal: nits + [57f26a8b7ecb] + + * source/encoder/frameencoder.cpp, source/encoder/nal.cpp, + source/encoder/nal.h: + nal: allow the concatenated substream buffer to be re-used from + frame-to-frame + + this removes another malloc/free from the processing of every frame. + This commit also moves the NALList constructor to nal.cpp since it + is no longer quite so trivial. Lastly, it adds a check to prevent + crashes in case one or more of the WPP substreams had a malloc + failure. 
+ [51dd3c429cdb] + + * source/encoder/nal.cpp: + fix checked build warning + [91fef2cf2e08] + + * source/common/bitstream.cpp, source/encoder/cturow.cpp: + fix msvc build warnings + [73ece35100df] + +2014-06-19 Steve Borho + + * source/Lib/TLibCommon/TComBitStream.cpp, + source/Lib/TLibCommon/TComBitStream.h, + source/Lib/TLibEncoder/SyntaxElementWriter.h, + source/Lib/TLibEncoder/TEncBinCoderCABAC.cpp, + source/Lib/TLibEncoder/TEncBinCoderCABAC.h, + source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncEntropy.h, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSbac.h, source/common/CMakeLists.txt, + source/common/bitstream.cpp, source/common/bitstream.h, + source/encoder/cturow.cpp, source/encoder/cturow.h, + source/encoder/encoder.cpp, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h, source/encoder/framefilter.h, + source/encoder/nal.cpp, source/encoder/nal.h, + source/encoder/sei.cpp, source/encoder/sei.h: + move bitstream implementations into common + [5091ffc86a42] + + * source/Lib/TLibCommon/TComBitStream.cpp, + source/Lib/TLibCommon/TComBitStream.h, + source/encoder/frameencoder.cpp: + bitstream: unify resetBits() and clear() methods + + There's no point in having a virtual resetBits() method in the + abstract class and then not implement it in the derived class but make + a second method with identical semantics. What the heck? + [0d77026a11ac] + + * source/encoder/api.cpp, source/encoder/encoder.cpp, + source/encoder/encoder.h: + encoder: remove redundant flush argument + [0b4a50730f21] + + * source/encoder/encoder.cpp, source/encoder/encoder.h: + encoder: cleanup obsolete NAL data fields + [aecb3b2a98c1] + +2014-06-20 Satoshi Nakagawa + + * source/encoder/api.cpp, source/encoder/encoder.cpp, + source/encoder/encoder.h, source/encoder/slicetype.cpp: + x265_encoder_encode: don't return 0 while flushing. 
+ [ced3a726d515] + +2014-06-18 Deepthi Nandakumar + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, + source/encoder/frameencoder.cpp: + lambda: [CHANGES OUTPUT] Change the distortion weights applied to + chroma. + + Inside R-D, a single lambda is applied for both luma and chroma. To + account for the different QPs, (as defined by the standard) the + chroma distortion was weighted by a factor proportional to the + difference in QPs. This patch eliminates the extra weighting given + to the chroma distortion (since it is visually less perceptible). + [25d2c596dd28] + +2014-06-19 Steve Borho + + * source/encoder/ratecontrol.cpp: + rc: disable MSVC warnings about using POSIX function names like + unlink() + [dfaf67c21c32] + +2014-06-16 Ashok Kumar Mishra + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComMotionInfo.cpp, + source/Lib/TLibCommon/TComMotionInfo.h, + source/Lib/TLibCommon/TComPicSym.cpp, + source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h: + TComDataCU: refactor structure for better data locality + [62d1d9dd760c] + +2014-06-19 Steve Borho + + * source/Lib/TLibCommon/TComDataCU.h: + cu: nits + [c72093672ad1] + +2014-06-19 Min Chen + + * source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComLoopFilter.h, + source/encoder/framefilter.cpp: + move m_blockingStrength to local + [8c875d7341b7] + + * source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComLoopFilter.h: + static m_bLFCrossTileBoundary since all of LCU use same value + [565818adf640] + + * source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComLoopFilter.h: + remove reduce code since trHeight always equal to trWidth + [c91e624c4502] + +2014-06-17 Aarthi Thirumalai + + * source/encoder/encoder.cpp, source/encoder/ratecontrol.cpp, + source/encoder/ratecontrol.h: + rc: initialize the 2 pass states in rc + [60178ece9879] + + * source/common/param.cpp: 
+ param: log rc states of m_param in x265_param2string + [ff6e2349cff3] + + * source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h: + rc: add 2 pass states in RateControl + [ed85651a2840] + + * source/common/param.cpp, source/encoder/slicetype.cpp: + rc: define default setting and validations for 2 pass states + [c50f3d2aeb4d] + + * source/CMakeLists.txt, source/x265.h: + api: introduce param variables for two-pass encodes + [5063065d7037] + +2014-06-19 Steve Borho + + * source/encoder/encoder.h: + encoder: remove useless comment + [82ca012854ea] + + * source/common/common.h, source/encoder/api.cpp, + source/encoder/encoder.cpp, source/encoder/encoder.h, + source/encoder/frameencoder.cpp, source/encoder/frameencoder.h, + source/encoder/nal.cpp, source/encoder/nal.h: + nal: refactor nal code, marshall packets directly into output buffer + + This removes two malloc/free for every NAL unit and removes yet + another set of memcopies at the end of the each frame encode. We're + now writing the escaped NAL packets directly into the buffer handed + back to the user. + + We preserve the max size of this output buffer to prevent having to + do any reallocations once the encoder is running. + [ba9c58a4bee0] + +2014-06-19 Satoshi Nakagawa + + * source/encoder/encoder.cpp, source/encoder/encoder.h: + keep TComPic until next call if recpic is exported as pic_out + + We must prevent the recon YUV from being recycled until the next + picture is encoded. + [ecccd5401d27] + +2014-06-19 Gopu Govindaswamy + + * source/encoder/encoder.h: + encoder: remove m_freeList from encoder class, the m_freeList moved + into dpb + [d2a13e8541f4] + + * source/encoder/dpb.cpp: + dpb: destroy and delete m_reconPicYuv from picSymFreeList + [d86fea3cea9c] + +2014-06-19 Deepthi Nandakumar + + * source/Lib/TLibCommon/TComTrQuant.cpp: + quant: correct QP range clipping for 10-bit input depths. 
+ + Fixes a bad hash error and artifacts introduced by commit + c8973aec5e70 + [59a6891dff51] + +2014-06-18 Steve Borho + + * source/common/pixel.cpp: + pixel: fixup zeroBug stride for satd_4x4 in psyCost (spotted by + MinGW) + + This fixes a bug introduced in f3fb2d2b0ba6 + [f5be40ace2cc] + + * source/Lib/TLibCommon/TComPic.cpp: + pic: reorder initializations after moving m_reconPicYuv in + e3418f7497e9 + + Fixes a gcc warning + [5db614f31b85] + + * source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/Lib/TLibCommon/TComPicSym.h, source/encoder/dpb.cpp, + source/encoder/encoder.cpp: + pic: keep the recon TComPicYuv in the pool with the TComPicSym - + another 10% + [e3418f7497e9] + + * source/encoder/encoder.cpp, source/encoder/frameencoder.h, + source/encoder/framefilter.cpp: + frameencoder: make m_ssimCnt an unsigned int + [cd63ddf0e935] + + * source/encoder/encoder.h: + encoder: move EncStats into x265 namespace + [a43a223a1294] + +2014-06-17 Steve Borho + + * source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/Lib/TLibCommon/TComPicSym.cpp, + source/Lib/TLibCommon/TComPicSym.h, source/encoder/dpb.cpp, + source/encoder/dpb.h, source/encoder/encoder.cpp, + source/encoder/encoder.h, source/encoder/slicetype.cpp: + pic: keep a pool of TComPicSym, allocate only for encode process + + This commit moves the picture freelist to the DPB, it seemed to + belong there. This change reduced memory in a medium encode by more + than 40%. 
+ [cbed0913df50] + + * source/Lib/TLibCommon/TComPic.h: + pic: nit + [da81f3741282] + + * source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/encoder/encoder.cpp: + pic: split picsym allocation from main create function, rename + reinit() + [526d79dabf1a] + + * source/Lib/TLibCommon/TComPic.h: + pic: destroy() no longer needs to be virtual, compress white-space + [0e9a74223938] + + * source/Lib/TLibCommon/TComPic.h: + pic: improve comments + [7a5bec950123] + + * source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/Lib/TLibCommon/TComSlice.cpp: + pic: remove unread m_bCheckLTMSB + [8fb41a7f4301] + + * source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/encoder/dpb.cpp: + pic: remove unread m_bUsedByCurr + [3a458e3f674e] + + * source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/encoder/compress.cpp: + TEncSearch: keep param pointer instead of top encoder pointer + + This required cacheing some top encoder fields in TEncSearch + [5a0f8d5377da] + + * source/Lib/TLibEncoder/TEncSearch.h: + TEncSearch: nits + [9a0dde9e5fa6] + + * source/encoder/frameencoder.cpp, source/encoder/frameencoder.h, + source/encoder/framefilter.cpp, source/encoder/framefilter.h: + framefilter: replace top pointer with param pointer + [7259c8a2d1b1] + + * source/encoder/frameencoder.cpp, source/encoder/frameencoder.h: + frameencoder: replace m_cfg with m_param pointer, use m_top for + leftovers + [c53f4e4669c2] + + * source/encoder/frameencoder.h: + frameencoder: remove unused bitcounter + [785e2b88c134] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibEncoder/TEncCu.cpp: + TComDataCU: remove hungarian prefixes from CU vars + [ca032ae0fd67] + + * source/Lib/TLibCommon/TComPicSym.h: + sym: do not return pointer reference from getCU() + [bf0aa61ceaab] + + * source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + 
source/encoder/encoder.cpp, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h: + pic: move m_elapsedCompressTime and m_frameTime to frame encoder + + This data only needs to exist when the picture is being encoded + [f73c1d866741] + + * source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/encoder/encoder.cpp, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h, source/encoder/framefilter.cpp, + source/encoder/framefilter.h: + pic: move SSIM, PSNR and hash to frame encoder + + This data only needs to exist when the picture is being encoded. + This changes the SSIM scores slightly, even though the bitstream and + recon does not change, because it is resetting the SSIM sum and + counter each frame. When these fields were on the TComPic, they were + never cleared. + [bf112abf087c] + + * source/Lib/TLibCommon/TComPic.h, source/encoder/frameencoder.cpp, + source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h: + pic: move timing structures to RateControlEntry + + This saves a bit of memory, since this data only needs to exist when + the picture is being encoded. 
+ [cb4d408c048e] + + * source/encoder/sei.h: + sei: remove unused RESERVED hash method, avoid compiler warnings + [51aa9d1542ef] + + * source/Lib/TLibCommon/TComPic.h, source/encoder/frameencoder.cpp, + source/encoder/ratecontrol.cpp: + pic: better name for picture timing SEI member + [d3ac1cbdc9c3] + + * source/Lib/TLibEncoder/SyntaxElementWriter.cpp, + source/Lib/TLibEncoder/SyntaxElementWriter.h: + writer: xConvertToUInt did not need to be a class member + [a2d63126b37c] + + * source/encoder/sei.h: + sei: allow SEI classes to implement write() directly to avoid two- + pass encode + [778385cde253] + + * source/encoder/encoder.cpp, source/encoder/frameencoder.cpp, + source/encoder/sei.h: + sei: add m_ prefix to members of SEIDecodedPictureHash for + consistency + [bb872c51a453] + + * source/encoder/frameencoder.cpp: + sei: repair decoded picture hash SEI + [2be51b34e033] + + * source/common/pixel.cpp: + psyrd: use zero stride in psyCost C reference, minor optimization + [f3fb2d2b0ba6] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: simplify square block dim logic + + this works around a bizarre MSVC warning on those lines. '<<' : + result of 32-bit shift implicitly converted to 64 bits (was 64-bit + shift intended?) + + I have no idea why it was implicitly making the result 64bits. 
+ [0c3d33212ebd] + +2014-06-16 Satoshi Nakagawa + + * source/Lib/TLibEncoder/TEncCu.h, source/encoder/compress.cpp: + fix: TOPSKIP refers outside of picture [OUTPUT CHANGE] + [7c6654c332c5] + +2014-06-17 Deepthi Nandakumar + + * source/Lib/TLibCommon/TComTrQuant.cpp: + quant: Enforce QP range after adding bit-depth offsets + [c8973aec5e70] + +2014-06-17 Kavitha Sampath + + * source/encoder/ratecontrol.cpp: + sei: disable HRD with warning when vbv is off + [b6f8e0ce8c81] + +2014-06-17 Sumalatha Polureddy + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + psyrd: use psyrdcost for PU/TU decision for inter and intra + [2b514f3f6e1f] + +2014-06-17 Steve Borho + + * source/Lib/TLibCommon/SEI.h, source/Lib/TLibCommon/TComPic.cpp, + source/Lib/TLibCommon/TComPic.h, + source/Lib/TLibEncoder/SEIwrite.cpp, + source/Lib/TLibEncoder/SEIwrite.h, source/common/CMakeLists.txt, + source/encoder/CMakeLists.txt, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h, source/encoder/ratecontrol.cpp, + source/encoder/sei.cpp, source/encoder/sei.h: + sei: first step towards simplifying SEI writing + [9d43b41eb529] + + * source/Lib/TLibCommon/TComBitStream.h: + bitstream: give bit counter a useful constructor + [f9fef6da4da6] + +2014-06-17 Deepthi Nandakumar + + * source/encoder/ratecontrol.cpp: + ratecontrol: [CHANGES OUTPUT for 10-bit CRF] Remove QP_BD_OFFSET + from Ratecontrol + + This offset is added inside Quant (setQPforQuant) + [3a19a9fdb103] + + * source/common/param.cpp, source/encoder/ratecontrol.cpp: + ratecontrol: move validate of CRF params to x265_check_param + [53fab23c6e5b] + +2014-06-17 Steve Borho + + * source/Lib/TLibEncoder/SyntaxElementWriter.h: + writer: nits + [ab8124028030] + +2014-06-17 Min Chen + + * source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComLoopFilter.h: + cleanup unused TComLoopFilter::loopFilterPic + [55ba291c2c5b] + + * source/Lib/TLibCommon/TComLoopFilter.cpp, + 
source/Lib/TLibCommon/TComLoopFilter.h, + source/encoder/framefilter.cpp: + move m_bEdgeFilter to local + [a99fb0ffe53a] + + * source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComLoopFilter.h: + move lfcuParam to local + [4c204b14ad0e] + +2014-06-16 Steve Borho + + * source/Lib/TLibCommon/TComBitStream.cpp, + source/Lib/TLibCommon/TComBitStream.h, + source/encoder/frameencoder.cpp, source/encoder/nal.cpp, + source/encoder/nal.h: + nal: concatenate and escape row data while building row-start array + for header + + This way we only need to do a single pass over each slice NAL + looking for start code emulations; and there is one less memcopy. + countStartCodeEmulations() and appendSubstream() methods could be + removed. + [9198ff2e0125] + + * source/common/pixel.cpp: + pixel: prevent msvc warning + [f25ed8618509] + +2014-06-15 Satoshi Nakagawa + + * source/Lib/TLibEncoder/TEncEntropy.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + refine tskip related + [6d8d8c18ba28] + +2014-06-14 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/encoder/cturow.cpp, + source/encoder/frameencoder.cpp: + TComTrQuant: lambda for each Cb and Cr + [0957164296c1] + +2014-06-16 Sumalatha Polureddy + + * source/common/pixel.cpp, source/common/primitives.cpp, + source/common/primitives.h, source/encoder/rdcost.h: + psyRd: Change psy-rd energy measurement + + psy-rd energy is measured by summing up the absolute differences of + the AC energy in each 8x8 block in series + [cf8b4506e94e] + +2014-06-12 Steve Borho + + * source/Lib/TLibCommon/TComBitStream.cpp: + bitstream: simplify appendSubstream + + This function is only called from one place, and we can clearly see + that both the source and dest bitstreams are byte-aligned. 
+ [e69a427e461f] + + * source/Lib/TLibEncoder/TEncSbac.cpp: + sbac: simplify codeSliceHeader, x265 never codes dependent slices + [a15044282290] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/encoder/frameencoder.cpp: + slice: remove unused m_nextSlice and m_dependentSliceSegmentFlag + [b3070d28b792] + + * source/encoder/frameencoder.cpp, source/encoder/frameencoder.h: + frameencoder: inline resetEntropy(), which was only called once + [54d6bb746c04] + + * source/encoder/frameencoder.cpp, source/encoder/framefilter.cpp, + source/encoder/framefilter.h: + framefilter: remove empty end method + [e254c0d81b5d] + + * source/encoder/frameencoder.cpp: + frameencoder: use numSubstreams directly + [2fdc26545855] + + * source/encoder/frameencoder.cpp: + frameencoder: remove useless setBitstream call + [33853b3f694d] + + * source/encoder/frameencoder.cpp: + frameencoder: remove redundant function calls and obsolete comments + [5b42f824f19c] + + * source/encoder/frameencoder.cpp: + frameencoder: explain why SAO is being called after compression + [35bfd27a3c82] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, source/encoder/frameencoder.cpp: + slice: remove unused m_tileOffstForMultES + [df0ee2403067] + + * source/encoder/frameencoder.cpp: + frameencoder: remove need for intermediate bitstreamRedirect + [602b2c2506a9] + + * source/encoder/frameencoder.cpp: + frameencoder: remove sliceSegment, it was always false + [067b68fd94cc] + + * source/encoder/frameencoder.cpp: + frameencoder: remove unnecessary set of nextSlice() + + This flag is only read in one place, when encoding the slice header. + We do not support multiple slices, so there's no point in setting + this multiple times. 
+ [51618487cb7d] + + * source/encoder/encoder.cpp, source/encoder/encoder.h: + encoder: remove unused m_bPCMInputBitDepthFlag + [635f159b4d0f] + + * source/Lib/TLibCommon/SEI.h, source/Lib/TLibEncoder/SEIwrite.cpp, + source/Lib/TLibEncoder/SEIwrite.h, source/encoder/encoder.cpp, + source/encoder/encoder.h, source/encoder/frameencoder.cpp: + remove unused display orientation SEI + [4f87da44e4db] + + * source/encoder/frameencoder.cpp, source/encoder/frameencoder.h: + frameencoder: make per-row output streams member variables + + Avoid repeated malloc/free and subsequent reallocs and frees each + frame + [97be21ff81bb] + + * source/encoder/frameencoder.cpp: + frameencoder: nit, unify comment style in this function + [8644321a206e] + + * source/encoder/frameencoder.cpp: + frameencoder: remove redundant sets of SAO lambdas, reorder for + clarity + [8e51b0bfdd30] + + * source/encoder/frameencoder.cpp: + frameencoder: nit + [edbb60c0ac0c] + + * source/encoder/frameencoder.cpp: + frameencoder: move QP/lambda initialization together + [9afd674721a8] + + * source/encoder/frameencoder.cpp, source/encoder/frameencoder.h: + frameencoder: inline slice bounds determination, do not perform + twice + [4ac8602fb0af] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, source/encoder/frameencoder.cpp: + slice: remove unused finalized flag + [e8aee2613bf6] + + * source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp: + sao: remove unnecessary shim function + [957a5e3d0d7b] + + * source/encoder/frameencoder.cpp: + frameencoder: remove obsolete comment + [d55abb8e9106] + + * source/encoder/dpb.cpp: + dpb: no need to reset next slice in init function + [1c2ee741e5b1] + + * source/encoder/frameencoder.cpp: + frameencoder: declare bitstreamRedirect on stack, do not malloc. 
+ cleanups + [2b866266f884] + + * source/Lib/TLibCommon/SEI.h, source/Lib/TLibEncoder/SEIwrite.cpp, + source/Lib/TLibEncoder/SEIwrite.h, source/encoder/encoder.cpp, + source/encoder/encoder.h, source/encoder/frameencoder.cpp: + remove unused gradual decoding refresh info SEI + [9283279c721c] + + * source/encoder/frameencoder.cpp: + frameencoder: move prefix SEI generation all together, before + compressCTURows + [ff47e8fd3751] + + * source/encoder/frameencoder.h: + frameencoder: fix header case + + Reported by Elyonta + [01a0982d2e4b] + +2014-06-12 Aarthi Thirumalai + + * source/encoder/ratecontrol.cpp: + rc: set isCbr flag to true only in vbv mode and maxrate equals + bitrate + [a302daa9abd0] + +2014-06-12 Steve Borho + + * source/encoder/compress.cpp: + compress: add EMMS at return of xCompressInterCu + + Removed the EMMS from xComputeCostMerge2Nx2N since it is only called + from the CU compress function and it is not using floats + [5b975609dc0a] + +2014-06-05 Steve Borho + + * source/encoder/frameencoder.cpp, source/encoder/frameencoder.h: + frameencoder: re-use a single bitstream object for all NAL and SEI + + This saves on malloc/free/realloc/memcpy calls. 
+ [c5015669b6dc] + +2014-06-11 Steve Borho + + * source/encoder/compress.cpp: + compress: nit comment spacing + [e8df9b57eb09] + + * source/encoder/weightPrediction.cpp: + weight: ensure weight table is initialized on failure, simplify + weightAnalyse() + [10a4c2d42d3f] + + * source/encoder/compress.cpp: + compress: remove floating point math to avoid needing to use EMMS + [8da75c4dbbc0] + + * source/Lib/TLibCommon/TComDataCU.h: + TComDataCU: nits + [ebe26cc5fdd8] + + * source/encoder/frameencoder.cpp: + frameencoder: remove obsolete check + [ea6f7da090ef] + + * source/encoder/nal.cpp: + nal: msvc and its integer conversion warnings + [b76c4e415cdd] + + * source/encoder/nal.cpp: + nal: greatly simplify NAL header generation, do not use + TComOutputBitstream + + This saves a malloc/free and a great deal of needless overhead + [7868d22e535d] + +2014-06-10 Steve Borho + + * source/encoder/encoder.cpp: + encoder: set m_totalFrameThreads at the same time as + param->frameNumThreads + [20d74192e097] + + * doc/reST/cli.rst, source/common/param.cpp, + source/encoder/encoder.cpp: + param: clip bframe bias range as x264 does + [d0bacf50eb95] + +2014-06-10 Kavitha Sampath + + * source/Lib/TLibCommon/TComRom.cpp: + TComRom: fix fraction part variation in x265_lambda2_tab for higher + QPs + [a89e0703e724] + +2014-06-09 Deepthi Nandakumar + + * source/common/param.cpp: + preset: improve the speed settings for fast preset. 
+ [0cbc7320c9f2] + +2014-06-05 Steve Borho + + * source/encoder/ratecontrol.cpp: + Merge with stable + [e5656f1e1904] + + * source/encoder/ratecontrol.cpp: + rc: fix potential build warning + [7a3214c9622f] + + * source/common/common.h: + common: include cctype for isdigit + [25b8aa22e4c8] + + * source/Lib/TLibCommon/TComBitStream.cpp: + bitstream: fast-path for appending bitstreams + [47fdb8041e9d] + + * source/encoder/nal.cpp: + nal: add back comments now that member variables are gone + [420122606888] + + * source/Lib/TLibCommon/TComBitStream.cpp, + source/Lib/TLibCommon/TComBitStream.h, source/common/common.h, + source/encoder/frameencoder.cpp, source/encoder/nal.cpp: + bitstream: simplify TComBitStream interface and code + [467d0e89a52f] + + * source/Lib/TLibCommon/TComSlice.cpp: + TComSlice: remove layer violation include + + files in common/ shouldn't include encoder/ headers + [9f6dca97e01f] + + * source/Lib/TLibCommon/CommonDef.h, + source/Lib/TLibCommon/TComBitStream.h, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComLoopFilter.h, + source/Lib/TLibCommon/TComMotionInfo.h, + source/Lib/TLibCommon/TComPattern.h, + source/Lib/TLibCommon/TComPic.h, source/Lib/TLibCommon/TComPicSym.h, + source/Lib/TLibCommon/TComPicYuv.h, + source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.h, + source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TComYuv.h, + source/Lib/TLibEncoder/SyntaxElementWriter.cpp, + source/Lib/TLibEncoder/SyntaxElementWriter.h, + source/Lib/TLibEncoder/TEncCu.h, source/common/common.cpp, + source/common/common.h, source/encoder/level.cpp, + source/encoder/level.h, source/test/testharness.h: + always include CommonDef.h via common.h, avoid include loops + + Remove some spurious stdlib includes not in common.h + [0cdfd02d6b1a] + + * 
source/Lib/TLibCommon/TComBitCounter.h, + source/Lib/TLibCommon/TComBitStream.h, + source/Lib/TLibEncoder/SEIwrite.cpp, + source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.h, + source/common/CMakeLists.txt, source/encoder/cturow.h, + source/encoder/frameencoder.h: + merge TComBitCounter into TComBitStream.h + [ac044bf2f850] + + * source/encoder/nal.cpp: + nal: add more padding for corner cases + [ecbe8796febc] + +2014-06-05 Kavitha Sampath + + * source/encoder/frameencoder.cpp: + weightb: reinitialize weight table when weightb is off + [9dddc48aecc7] + +2014-06-05 Deepthi Nandakumar + + * source/encoder/ratecontrol.h: + ratecontrol: change type of m_bframeBits to int64_t + [495874699b0d] + +2014-06-04 Steve Borho + + * source/encoder/frameencoder.cpp, source/encoder/ratecontrol.cpp: + Merge with stable + [c11920cb6f04] + +2014-06-04 Aarthi Thirumalai + + * source/encoder/frameencoder.cpp, source/encoder/ratecontrol.cpp: + rc: fixes to improve quality in vbv + [93921c0afac5] + +2014-06-05 Aarthi Thirumalai + + * source/encoder/ratecontrol.cpp: + rc: use actual bits to update vbv buffer + [4df4e48d24a0] + +2014-06-04 Aarthi Thirumalai + + * source/encoder/ratecontrol.cpp: + rc: set pbFactor to default with cutree + [a076f4fc8d36] + +2014-06-04 Steve Borho + + * source/Lib/TLibCommon/NAL.h, source/Lib/TLibEncoder/NALwrite.cpp, + source/Lib/TLibEncoder/NALwrite.h, source/common/CMakeLists.txt, + source/encoder/CMakeLists.txt, source/encoder/api.cpp, + source/encoder/encoder.cpp, source/encoder/encoder.h, + source/encoder/frameencoder.cpp, source/encoder/frameencoder.h, + source/encoder/nal.cpp, source/encoder/nal.h: + combine NAL output semantics into one clean(er) interface + [4a146c01fcab] + + * source/Lib/TLibEncoder/NALwrite.cpp: + nal: fix checked build + [7186f76fe62c] + + * source/encoder/encoder.cpp: + encoder: no longer needs NALwrite.h + [e4b3061770b9] + + * source/Lib/TLibCommon/TComBitStream.h: + TComBitstream: nits + 
[d94a29bb9ce7] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/cturow.cpp, + source/encoder/encoder.cpp, source/encoder/encoder.h: + remove Encoder::bEnableRDOQTS, we did not allow separate config for + it + + Either RDOQ and RDOQTS were enabled at once, or both were disabled, + or TSKIP was disabled and it didn't matter how RDOQTS was set. + + Removed RDOQ_CHROMA as well, which allowed the removal of a warning + disable + [eca52aac2ec5] + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/cturow.cpp, + source/encoder/encoder.cpp, source/encoder/encoder.h: + encoder: bEnableRDOQ->m_bEnableRDOQ for consistency + [2a228a28395c] + + * source/Lib/TLibCommon/TComPic.cpp, + source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/api.cpp, + source/encoder/cturow.cpp, source/encoder/dpb.h, + source/encoder/encoder.cpp, source/encoder/encoder.h, + source/encoder/frameencoder.cpp, source/encoder/framefilter.cpp, + source/encoder/slicetype.cpp: + encoder: param->m_param for consistency + [c02dacf1a13b] + + * source/encoder/encoder.h: + encoder: remove m_useLossless + [65b385fa9989] + + * source/encoder/encoder.cpp, source/encoder/encoder.h, + source/encoder/ratecontrol.h, source/encoder/slicetype.cpp, + source/encoder/slicetype.h: + slicetype: convert lookahead "structs with methods" to classes + [72a00c40366a] + + * source/encoder/ratecontrol.cpp: + rc: nit + [969811ce69da] + + * source/encoder/encoder.cpp, source/encoder/encoder.h, + source/encoder/frameencoder.cpp, source/encoder/ratecontrol.cpp, + source/encoder/ratecontrol.h: + rc: convert RateControl to a class + + The "struct with methods" metaphor was too stretched here. The + methods are too complicated to keep track of which variables were + members; variable shadowing was very common. 
+ [d3b1d2d3f104] + + * source/Lib/TLibEncoder/NALwrite.cpp: + nal: check padded buffer size + [522e419ed55d] + + * source/encoder/api.cpp: + api: must match X265_ALLOC with X265_FREE for m_nalUnitData + + This was only an issue on Windows + [e3a5aa8afeae] + + * source/encoder/ratecontrol.cpp: + rc: fix type of actualBits + [30e6d62afa3d] + + * source/Lib/TLibEncoder/NALwrite.cpp: + nalwrite: simplify write method; single allocation with no reallocs + [c3a4e0ab37c2] + + * source/encoder/encoder.cpp, source/encoder/ratecontrol.cpp, + source/encoder/ratecontrol.h: + rc: only pass param to rc constructor + [5c9a0aa10472] + + * source/encoder/ratecontrol.cpp: + rc: further improvements to code clarity + [794b1c67c829] + + * source/encoder/ratecontrol.cpp: + rc: nit + [9f44990be4d4] + + * source/encoder/ratecontrol.cpp: + rc: use member isCbr rather than making a shadow variable + [cca69754c32e] + + * source/encoder/ratecontrol.cpp: + rc: move isCbr initialization to constructor, to avoid init() race + [4cdc53eb185b] + + * source/encoder/encoder.cpp: + Merge with stable + [8d9ea2d89fbf] + +2014-06-04 Aarthi Thirumalai + + * source/encoder/frameencoder.cpp, source/encoder/ratecontrol.cpp: + rc: fixes to improve quality in vbv + [cb025d96fac3] + + * source/encoder/ratecontrol.cpp: + rc: use actual bits to update vbv buffer + [5eca4ea111f7] + + * source/encoder/ratecontrol.cpp: + rc: set pbFactor to default with cutree + [0f68adb0e190] + +2014-06-04 Kavitha Sampath + + * source/Lib/TLibCommon/TComSlice.cpp, source/encoder/encoder.cpp, + source/encoder/encoder.h, source/encoder/frameencoder.cpp, + source/encoder/ratecontrol.cpp: + SEI: fix bus error, cleanup m_decodingUnitInfoSEIEnabled + + Remove setHrdParameters function as all the necessary HRD + parameters are already computed and set in the initHrd function. 
+ [fe25d256f0e1] + +2014-06-04 Steve Borho + + * source/Lib/TLibCommon/SEI.h: + sei: white-space nits + [00f286a1f5e3] + + * source/Lib/TLibCommon/SEI.h, source/Lib/TLibEncoder/SEIwrite.cpp, + source/encoder/frameencoder.cpp: + sei: remove SEIPictureTiming fields we do not signal, cleanup + + Completely remove constructor and destructor + [1b93336d68d4] + +2014-06-04 Satoshi Nakagawa + + * source/Lib/TLibEncoder/NALwrite.cpp, source/encoder/encoder.cpp: + fix: uninitialized read m_totalFrameThreads + [63ac1a0aa81d] + +2014-06-04 Steve Borho + + * doc/reST/cli.rst: + rest: reorganize analysis options into their own section + [d40f60dae2ce] + +2014-06-04 Kavitha Sampath + + * source/encoder/ratecontrol.cpp: + fix crash in hrd when init function is called with null pointer + [61a02a9c78eb] + +2014-06-04 Deepthi Nandakumar + + * Merge with stable + [9b81591718d5] + + * source/Lib/TLibEncoder/TEncCu.cpp: + TEncCU: fix bug in no-intra, intra-analysis should still happen for + P-frames + [5cc4502256a7] + + * source/common/param.cpp: + Merge with stable + [1fe42453d2e3] + + * source/common/param.cpp: + preset: correct ultrafast settings (in line with doc) by disabling + loop-filter. 
+ [9f59dad6eea6] + +2014-06-03 Steve Borho + + * doc/reST/cli.rst, source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/param.cpp, + source/encoder/encoder.cpp, source/encoder/ratecontrol.cpp, + source/x265.cpp: + Merge with stable + [f2479eb454b0] + + * .hgtags: + Added tag 1.1 for changeset ae9609aeebdc + [2650fdd1a7f6] + + * source/common/param.cpp: + param: only log psy-rd if it is enabled + [ae9609aeebdc] [1.1] + + * source/encoder/ratecontrol.cpp: + lossless: do not allow CQP to adjust QP for I/B when encoding + lossless + [07e7a54ae1cb] + + * source/encoder/encoder.cpp: + lossless: report compression ratio, disable distortion metrics + [d9facf9895f7] + + * doc/reST/cli.rst, source/encoder/encoder.cpp: + lossless: disable rate control, force QP=4 + + strangely enough, using QP=4 gives better lambda for lossless ME and + RD decisions + [12d35ab8148d] + +2014-06-03 Praveen Tiwari + + * source/x265.h: + x265.h, data declaration alignment + [6f3149d95bd9] + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/x265.h: + whitespace clean-up + [8c66560a3e23] + +2014-06-03 Steve Borho + + * source/Lib/TLibCommon/TComSlice.h: + TComSlice: nit - use bools to initialize bools + [8b5dadb1454e] + + * source/encoder/frameencoder.cpp: + SEI: only setup SEIPictureTiming fields which will be coded + [3356ce23cad2] + +2014-06-03 Kavitha Sampath + + * doc/reST/cli.rst, source/CMakeLists.txt, + source/Lib/TLibCommon/SEI.h, source/Lib/TLibCommon/TComPic.h, + source/Lib/TLibCommon/TComSlice.h, source/common/param.cpp, + source/encoder/encoder.cpp, source/encoder/encoder.h, + source/encoder/frameencoder.cpp, source/encoder/ratecontrol.cpp, + source/encoder/ratecontrol.h, source/x265.cpp, source/x265.h: + SEI: Insert buffering period and picture timing SEI messages + + The buffering period SEI message is inserted for every key frame and + the picture timing SEI is inserted for every frame. 
The commit also + computes the HRD parameters as well as HRD Timing parameters that + are calculated using the delay components(cpb removal delay and dpb + output delay) carried in these SEI messages. HRD parameters can be + signalled by enabling --hrd. + [f9f553c8bd6a] + +2014-06-03 Deepthi Nandakumar + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/encoder.cpp: + qp: remove Encoder::m_useLossless. + + This patch removes confusion between Encoder::m_useLossless and + x265_param::bUseLossless. m_useLossless was set whenever qp = 0, and + used in TComSPS. qp = 0 is still a valid encoder option. + [b4e87638e5cc] + +2014-06-03 Steve Borho + + * source/common/winxp.cpp: + xp: avoid compiler warning from GCC-MinGW + + MSVC gives a warning at link time if this file is empty, so I added + a useless variable to the file. GCC-MinGW issues a compile time + warning about this unused variable. + [d315d1430f5c] + + * source/x265.cpp: + cli: separate new lossless options, make --lossless description more + concise + + try to avoid having the --lossless line wrap + [cfab9513e27c] + +2014-06-03 Deepthi Nandakumar + + * Merge with stable + [d78f38b707ba] + + * doc/reST/presets.rst, source/x265.h: + rest: correct documentation on AQ + [f66ad1400874] + + * Merge with stable + [d3f55d391314] + + * source/common/param.cpp: + param: initialise lossless/cu-lossless fields to zero + [7153905d56e6] + +2014-06-02 Steve Borho + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: rename variable to avoid shadowing an earlier 'part' + [92ef2e02f653] + + * source/common/primitives.cpp, source/common/x86/asm-primitives.cpp: + primitives: move more aliasing to Setup_Alias_Primitives + [31f93f0d024f] + +2014-06-02 Satoshi Nakagawa + + * source/Lib/TLibEncoder/TEncEntropy.cpp, + source/Lib/TLibEncoder/TEncEntropy.h, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, 
source/common/primitives.cpp, + source/common/primitives.h, source/common/x86/asm-primitives.cpp: + refine cbf==0 path: remove clearing coeff and resi + [b46dd1095ed8] + +2014-06-02 Steve Borho + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: remove unnecessary pragma + [bc38a1637220] + + * source/Lib/TLibCommon/TComRdCost.h: + Merge with default (prepare for 1.1) + [108996798e78] + + * source/encoder/rdcost.h: + psy: use m_lambdaSAD multiplier for psy cost, m_lambdaSSE for bits + + in x264, m_lambdaSSE = lambda2, m_lambdaSAD = lambda + [5b6c9cda191b] + + * COPYING, source/PPA/ppa.cpp, source/PPA/ppa.h, source/PPA/ppaApi.h, + source/common/bitstream.h, source/common/common.cpp, + source/common/common.h, source/common/cpu.cpp, source/common/cpu.h, + source/common/dct.cpp, source/common/intrapred.cpp, + source/common/ipfilter.cpp, source/common/loopfilter.cpp, + source/common/lowres.cpp, source/common/lowres.h, + source/common/mv.h, source/common/param.cpp, source/common/param.h, + source/common/piclist.cpp, source/common/piclist.h, + source/common/pixel.cpp, source/common/primitives.cpp, + source/common/primitives.h, source/common/shortyuv.cpp, + source/common/shortyuv.h, source/common/threading.cpp, + source/common/threading.h, source/common/threadpool.cpp, + source/common/threadpool.h, source/common/vec/blockcopy-sse3.cpp, + source/common/vec/dct-sse3.cpp, source/common/vec/dct-sse41.cpp, + source/common/vec/dct-ssse3.cpp, source/common/vec/vec- + primitives.cpp, source/common/version.cpp, + source/common/wavefront.cpp, source/common/wavefront.h, + source/common/winxp.cpp, source/common/winxp.h, source/common/x86 + /asm-primitives.cpp, source/common/x86/blockcopy8.asm, + source/common/x86/blockcopy8.h, source/common/x86/const-a.asm, + source/common/x86/cpu-a.asm, source/common/x86/dct8.asm, + source/common/x86/dct8.h, source/common/x86/intrapred.h, + source/common/x86/intrapred16.asm, source/common/x86/intrapred8.asm, + source/common/x86/ipfilter16.asm, 
source/common/x86/ipfilter8.asm, + source/common/x86/ipfilter8.h, source/common/x86/loopfilter.asm, + source/common/x86/loopfilter.h, source/common/x86/mc-a.asm, + source/common/x86/mc-a2.asm, source/common/x86/mc.h, + source/common/x86/pixel-32.asm, source/common/x86/pixel-a.asm, + source/common/x86/pixel-util.h, source/common/x86/pixel-util8.asm, + source/common/x86/pixel.h, source/common/x86/pixeladd8.asm, + source/common/x86/sad-a.asm, source/common/x86/sad16-a.asm, + source/common/x86/ssd-a.asm, source/common/x86/x86util.asm, + source/encoder/api.cpp, source/encoder/bitcost.cpp, + source/encoder/bitcost.h, source/encoder/compress.cpp, + source/encoder/cturow.cpp, source/encoder/cturow.h, + source/encoder/dpb.cpp, source/encoder/dpb.h, + source/encoder/encoder.cpp, source/encoder/encoder.h, + source/encoder/frameencoder.cpp, source/encoder/frameencoder.h, + source/encoder/framefilter.cpp, source/encoder/framefilter.h, + source/encoder/level.cpp, source/encoder/level.h, + source/encoder/motion.cpp, source/encoder/motion.h, + source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h, + source/encoder/rdcost.h, source/encoder/reference.cpp, + source/encoder/reference.h, source/encoder/slicetype.cpp, + source/encoder/slicetype.h, source/encoder/weightPrediction.cpp, + source/filters/filters.cpp, source/filters/filters.h, + source/input/input.cpp, source/input/input.h, source/input/y4m.cpp, + source/input/y4m.h, source/input/yuv.cpp, source/input/yuv.h, + source/output/output.cpp, source/output/output.h, + source/output/y4m.cpp, source/output/y4m.h, source/output/yuv.cpp, + source/output/yuv.h, source/test/checkasm-a.asm, + source/test/intrapredharness.cpp, source/test/intrapredharness.h, + source/test/ipfilterharness.cpp, source/test/ipfilterharness.h, + source/test/mbdstharness.cpp, source/test/mbdstharness.h, + source/test/pixelharness.cpp, source/test/pixelharness.h, + source/test/testbench.cpp, source/test/testharness.h, + source/test/testpool.cpp, 
source/x265.cpp, source/x265.h, + source/x265_config.h.in: + change license email globally + [9c21227bb5bf] + +2014-05-30 Ashok Kumar Mishra + + * doc/reST/cli.rst, source/CMakeLists.txt, + source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/param.cpp, + source/encoder/compress.cpp, source/encoder/encoder.cpp, + source/x265.cpp, source/x265.h: + add support for lossless encode + + --lossless forces full lossless coding of every frame and every CU. + + --cu-lossless forces the encoder to perform RD cost analysis between + lossy and lossless modes and choose the least cost mode for each CU. + [bebc88f64d99] + +2014-06-02 Steve Borho + + * source/Lib/TLibEncoder/TEncCu.cpp: + TEncCu: nits + [ac763df220b6] + +2014-05-30 Ashok Kumar Mishra + + * source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncCu.cpp: + fix to support multiple color space format in + TComSampleAdaptiveOffset and TEncCu structure + [dbd7cabc5ee6] + +2014-06-02 Aarthi Thirumalai + + * source/encoder/ratecontrol.cpp: + rc: improve visual quality in high bit depth encodes. 
+ [1c65cf8f068e] + +2014-06-02 Steve Borho + + * source/encoder/CMakeLists.txt: + cmake: nit + [39ddc8c12309] + + * source/encoder/CMakeLists.txt: + cmake: encoder.cpp no longer needs -Wno-unused-parameter + [d3a89fd010dd] + + * source/CMakeLists.txt, source/common/CMakeLists.txt: + cmake: move all "GCC-like" logic together, use consistent var names + [734956446ac1] + +2014-05-30 Den Scherbin + + * source/CMakeLists.txt: + Disable exceptions for all compilers which support fno-exceptions + [f21ce9406d4d] + +2014-05-30 Aarthi Thirumalai + + * source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/encoder/compress.cpp, + source/encoder/encoder.cpp, source/encoder/encoder.h: + TEncSearch: use actual frame thread count to decide motion search + limits + [223a9bbf52f4] + +2014-06-02 Deepthi Nandakumar + + * doc/reST/presets.rst, source/common/param.cpp: + preset: set superfast, veryfast, faster and fast presets to rd 2. + [a5998df9b12e] + + * doc/reST/presets.rst, source/common/param.cpp: + preset: For ultrafast preset, turn off lft, change rdLevel from 3 to + 2. + + This adds about 20-25% more performance for ultrafast presets at + little efficiency cost. + [d957ff8312d3] + +2014-05-29 Den Scherbin + + * source/CMakeLists.txt: + Disable exceptions. Fixes Clang linker errors. + [592ef184549e] + +2014-05-29 Steve Borho + + * source/encoder/ratecontrol.cpp: + rc: cleanups for clarity + [58420e834424] + +2014-05-29 Aarthi Thirumalai + + * source/encoder/encoder.cpp, source/encoder/encoder.h, + source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h: + ratecontrol: improve visual quality and bitrate savings in ABR. + + Try to prevent ABR over-compensation after I frames by amortizing + the cost over the next few frames; + + Improve ABR quality with frame parallelism - enable frame + parallelism only after first few P frames to prevent excessive qp + fluctuations. + + Fix initial I frame qp. 
when cu tree is enabled, the qp decided + arbitrarily is too low. This causes a huge qp spike in immediate P + frames.Tuned cplxrSum for intial I frame so that a more appropriate + qp is chosen. + [7cbe7e7a75c4] + +2014-05-29 Steve Borho + + * source/cmake/CMakeASM_YASMInformation.cmake: + cmake: 32-bit macho compiles need _ prefixes on exported functions + [d4932f6e964b] + +2014-05-28 Steve Borho + + * doc/reST/cli.rst, doc/reST/presets.rst, source/common/param.cpp: + preset: disable rectangular and AMP partitions at medium preset + + This gives a considerable speed-up (50-70%) at the default preset + for not much loss in compression efficiency (2-3%). AMP is rarely + useful, at non-RDO presets we were only considering AMP merge + anyway, and rect can be approximated by a split and two merges. + [b42827682323] + +2014-05-28 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComPattern.h, + source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuv.h, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TComYuv.h, + source/Lib/TLibEncoder/TEncEntropy.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/common/shortyuv.cpp, + source/common/shortyuv.h, source/encoder/reference.cpp: + refine YUV and coeff buffer + [260752f39b27] + +2014-05-28 Steve Borho + + * source/Lib/TLibEncoder/TEncSearch.cpp: + nits + [e9776dfd1471] + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/encoder/frameencoder.cpp: + nits + [eb236aec3757] + + * source/common/threadpool.cpp, source/common/threadpool.h, + source/common/wavefront.cpp, source/common/wavefront.h, + source/encoder/frameencoder.cpp, source/encoder/frameencoder.h, + source/encoder/slicetype.cpp, source/encoder/slicetype.h, + source/test/testpool.cpp: + pool: allow thread private data structures + + 
Pass worker's threadId to JobProvider::findJob() and allow job + providers to use this ID as they see fit to keep thread local data. + No behavior changes, this is just laying the plumbing for future + optimizations. + [77f788046989] + + * doc/reST/cli.rst: + rest: add missing --no-repeat-headers option + [6df1a5bb11fc] + + * source/Lib/TLibEncoder/TEncCu.cpp: + TEncCu: nit + [e6ba953dcb1a] + +2014-05-28 Deepthi Nandakumar + + * source/Lib/TLibEncoder/TEncCu.cpp: + psy-rd: bug fix in merge mode, use psyCosts for all decisions + whenever psy-rd is enabled + [306d3e6b5185] + + * source/Lib/TLibEncoder/TEncCu.cpp: + TEncCU: inserting runtime checking for m_totalPsyCost + [503a359e874c] + +2014-05-27 Kavitha Sampath + + * source/encoder/frameencoder.cpp: + SEI: fix bug in picture timing SEI when interlace encoding is + enabled + + When the interlace mode is 1(top field first), the picstruct is + 2(bottom field) for all frames. But the picStruct for frames in + display order should be 1(top field), 2(bottom field), 1, 2,..etc + [807ee7f1597b] + +2014-05-27 Deepthi Nandakumar + + * source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp: + psyRD: psyRD costs are saved into m_totalPsyCost. 
+ [acd166cf9836] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp: + TComDataCU: introduce new variable m_psyEnergy + [1bd3da64eab9] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h: + TComDataCU: introduce new variable m_totalPsyCost + [d84f5d1fbb10] + + * source/Lib/TLibCommon/TComDataCU.cpp: + TComDataCU: whitespace nits + [08766d706869] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/compress.cpp: + TComDataCU: renaming m_totalCost to m_totalRDCost + [812615048da0] + +2014-05-25 Satoshi Nakagawa + + * source/CMakeLists.txt: + add -D__STDC_LIMIT_MACROS=1 + [af78d190e57b] + +2014-05-25 Steve Borho + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp: + nits + [80e6a349a9fb] + +2014-05-23 Steve Borho + + * source/common/shortyuv.cpp: + assert to optional runtime check + [5e8cce428457] + +2014-05-23 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComBitStream.cpp, + source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibCommon/TComYuv.cpp, + source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncEntropy.cpp, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/common/pixel.cpp, + source/common/primitives.cpp, source/common/primitives.h, + source/common/shortyuv.cpp, source/common/vec/blockcopy-sse3.cpp, + source/encoder/compress.cpp, source/encoder/slicetype.cpp: + refine block size related + [74f8aa42020f] + +2014-05-24 Satoshi Nakagawa + + * source/encoder/rdcost.h: + rdcost: overflow check by integer + [91330e7dddd7] + +2014-05-23 Steve Borho + + * source/compat/msvc/stdint.h: + vc9: define UINT64_MAX in 
our hacked stdint.h + [a3f5a7b9f0fb] + + * source/common/CMakeLists.txt: + cmake: re-split TLibCommon headers to fix VC9 builds + [ade75817d892] + +2014-05-22 Steve Borho + + * source/encoder/rdcost.h: + rdcost: fix energy cost check, dc is already subtracted from the + energy + [5134e76aa729] + + * source/encoder/rdcost.h: + rdcost: use floating point math to calculate psy rd cost + [0c05e3b298a5] + + * source/encoder/rdcost.h: + rdcost: fix dc portion of psy energy calculation + + now flat blocks have energy of 0, as expected + [8a8b04982694] + +2014-05-15 Steve Borho + + * doc/reST/cli.rst, source/CMakeLists.txt, source/common/param.cpp, + source/encoder/api.cpp, source/encoder/level.cpp, + source/encoder/level.h, source/x265.cpp, source/x265.h: + api: allow minimum decoder level to be specified + [29cc8b928868] + +2014-05-22 Steve Borho + + * doc/reST/presets.rst, source/common/param.cpp: + param: introduce warnings about measurements with psy-opts enabled + + Disable psy-rd when tuning for psnr or ssim, and validate various + combinations + [d66773c5f275] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: readability nits + [73e8188a48ca] + +2014-05-19 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/encoder/compress.cpp: + cleanup bReusePred, unify absTUPartIdx to absPartIdx + [b7e6b21a5906] + +2014-05-22 Steve Borho + + * source/encoder/encoder.cpp: + rc: simplify updateVbvPlan() + [34187fb932d3] + +2014-05-22 Deepthi Nandakumar + + * doc/reST/cli.rst: + rest: refine rdLevels description. + [5548d3e6915d] + + * doc/reST/cli.rst, source/encoder/cturow.cpp: + psyrd: is enabled only at rdLevels 5 and 6. + + Only rdLevels 5 and 6 use full-RDO based decisions. These are used + in presets slower, veryslow and placebo. 
+ [ceb4ff55f0e8] + +2014-05-22 Santhoshini Sekar + + * source/encoder/encoder.cpp: + rc: always use frameSizeEstimted for bits in updateVbvPlan + [9cf0134982f8] + + * source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h: + rc: rename variable vbvMinRate to isCbr + [8f097477d74c] + +2014-05-22 Satoshi Nakagawa + + * source/encoder/slicetype.cpp: + fix: segmentation fault --tune zerolatency + [6a3a53543a89] + +2014-05-21 Steve Borho + + * source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/encoder/CMakeLists.txt: + cmake: re-enable wd4800 for TLibEncoder + [dadb646a7266] + + * source/common/CMakeLists.txt: + cmake: further cleanups + [bc26992efaff] + + * source/common/CMakeLists.txt: + cmake: re-enable wd4244 and wd4127 in common/vec + [8a26aa7f81e3] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, source/common/CMakeLists.txt, + source/encoder/cturow.cpp, source/encoder/encoder.h: + cmake: re-enable wd4800 for TLibCommon, fix remaining warnings + [5bfeba9f7ae2] + + * source/encoder/frameencoder.cpp: + frameencoder: fixup comment following variable rename + [2ed0de7f2ea7] + + * source/common/lowres.cpp, source/common/lowres.h, + source/encoder/frameencoder.cpp, source/encoder/ratecontrol.cpp, + source/encoder/slicetype.cpp: + lowres: use a better name for cuTree offset buffer + [7e92e9718b37] + + * source/common/CMakeLists.txt: + cmake: merge TLibCommon source groups + [163ea3a0388b] + + * source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComRdCost.h, + source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/common/CMakeLists.txt, + source/encoder/CMakeLists.txt, source/encoder/compress.cpp, + source/encoder/cturow.h, source/encoder/rdcost.h: + move TComRdCost.h to encoder/rdcost.h, cleanup 
refactor + + The RDCost class has no resemblance to the original TComRdCost + class. + [0c9e5ebbb636] + +2014-05-21 Praveen Tiwari + + * source/Lib/TLibCommon/TComRom.h: + TComRom.h, removed unused tables + [5d0658f9c539] + + * doc/reST/cli.rst, doc/reST/threading.rst, source/CMakeLists.txt, + source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, source/common/common.h, + source/common/param.cpp, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h, source/x265.cpp, source/x265.h: + noise reduction ported from x264 + [9ba6575aaac1] + +2014-05-21 Steve Borho + + * source/Lib/TLibCommon/TComTrQuant.h: + TComTrQuant: nits + [479e34a05002] + +2014-05-21 Santhoshini Sekar + + * source/encoder/ratecontrol.cpp: + aq: fine tune aq logic to distribute bits even better + [9812db027903] + +2014-05-21 Den Scherbin + + * source/cmake/version.cmake: + cmake: detect mercurial source installs on Windows, fixes #56 + [a1c87ac44989] + +2014-05-20 Steve Borho + + * doc/reST/cli.rst, source/x265.cpp: + psy: mention the fact that psy-rd requires rdo + [f39484bb3eec] + + * source/Lib/TLibCommon/TComTrQuant.cpp: + TComTrQuant: nits + [a13534aa7c6b] + + * source/common/param.cpp: + param: fix check of minimum picture dimensions + [66bb56dac4f7] + + * source/common/winxp.cpp: + xp: hack to avoid linker warnings + [a4ddd39763f2] + + * doc/reST/presets.rst: + rest: update fastdecode to mention no intra in B + [10d5af0889aa] + +2014-05-20 Yuvaraj Venkatesh + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm: + asm: fix bug for invalid read in sa8d + + fix invalid read when different stride value given for two input + buffer and enabled 16x16 and 8x8 sa8d primitives. 
+ [0d2ec86fa28b] + +2014-05-20 Steve Borho + + * source/common/param.cpp: + param: tabs to spaces + [a815df2b313f] + +2014-05-20 Deepthi Nandakumar + + * source/common/param.cpp, source/x265.cpp: + param: initialise crf-max and crf-min values in the param structure. + [279f72586069] + + * source/x265.cpp: + help: removing the default QP value in the help message + + Since CQP is not the default rate control mode, a default QP + confuses readers. + [2c722169215c] + +2014-05-19 Ashok Kumar Mishra + + * source/Lib/TLibEncoder/TEncSearch.cpp: + fix : square chroma transform expected error message + [d050fe5f8f28] + +2014-05-18 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/encoder/compress.cpp: + refine merge related + [b35a5d8f012b] + +2014-05-16 Satoshi Nakagawa + + * source/Lib/TLibEncoder/TEncEntropy.cpp, + source/Lib/TLibEncoder/TEncEntropy.h, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSbac.h, + source/Lib/TLibEncoder/TEncSearch.cpp: + encodeCoeffNxN(): call only if non-zero coeff exists + [ba2a9f61ea06] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncEntropy.cpp, + source/Lib/TLibEncoder/TEncEntropy.h, + source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/compress.cpp: + cleanup unused arg + [f9f132b3e36e] + +2014-05-19 Steve Borho + + * source/Lib/TLibCommon/TComRdCost.h: + psyrd: use EMMS before checking calcPsyRdCost + [c78bed701db4] + +2014-05-16 Steve Borho + + * source/Lib/TLibEncoder/TEncCu.cpp: + psyrd: use actual recon pixels for intra predictions (bug spotted by + valgrind) + [f3f3eecd28f1] + + * doc/reST/cli.rst: + rest: improve --rd docs + [4739c02a62c0] + + * 
source/encoder/ratecontrol.cpp: + ratecontrol: initialize singleFrameVbv (bug spotted by valgrind) + [2d5dc4e63fb9] + + * source/encoder/slicetype.cpp: + slicetype: initialize bReady (bug spotted by valgrind) + [8b6c5d7ff4c0] + +2014-05-16 Sumalatha Polureddy + + * doc/reST/cli.rst, source/Lib/TLibEncoder/TEncCu.cpp, + source/common/param.cpp, source/x265.cpp, source/x265.h: + cli: introduce --[no]-b-intra which enables/disables intra modes in + B frames + [7d11f60c5dba] + +2014-05-16 Steve Borho + + * source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp: + nits + [0bd90aaaa3a7] + + * source/CMakeLists.txt, source/Lib/TLibCommon/TComRdCost.h, + source/common/common.h, source/encoder/CMakeLists.txt, + source/encoder/encoder.cpp: + common: make a global X265_LL macro for printing uint64_t + [5167067ed452] + + * source/Lib/TLibCommon/TComRdCost.h, source/common/primitives.h, + source/common/x86/asm-primitives.cpp: + psy-rd: add sad_square primitive array to optimize psy-rd + + This fixes the DC component calculation at the same time making the + calculation more efficient. + [48af10fff12b] + + * source/encoder/slicetype.cpp: + slicetype: fix unintended assignment + [7533425d5060] + +2014-05-08 Sumalatha Polureddy + + * doc/reST/cli.rst, source/CMakeLists.txt, + source/Lib/TLibCommon/TComRdCost.h, + source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/param.cpp, + source/encoder/cturow.cpp, source/x265.cpp, source/x265.h: + adapt psy-rd from x264 + + In this initial implementation, we only use sa8d to estimate the + energy of the source and reconstructed blocks. 
psy-rd is disabled by + default, to evaluate use + --preset veryslow --psy-rd 1.0 + [d95ad61c8abc] + +2014-05-15 Gopu Govindaswamy + + * source/encoder/slicetype.cpp: + cutree: removed logLevel check from if loop + [63f4a60cb268] + + * source/encoder/slicetype.cpp: + cutree: adjust the rowSatd with qpoffset only for reference frames + [3212839ec987] + +2014-05-15 Steve Borho + + * doc/reST/api.rst, source/encoder/api.cpp, + source/encoder/encoder.cpp, source/x265.def.in, source/x265.h: + api: introduce x265_encoder_parameters(), copy param earlier to + avoid mods + + in x265_encoder_open(), make a copy of the provided param structure + much earlier to avoid making any changes to the param strucutre + provided by the user, in case they want to use that same param again + to allocate another encoder. + [4da6be0eca52] + + * source/common/param.cpp, source/x265.cpp: + cli: remove default strings for preset and tune, modify default AQ + mode to match + + The cli has used --tune ssim as the default since we've had an ssim + tune, but API users were getting the default aq-mode of 1 instead. 
+ This commit removes the default tune (and preset) strings and fixes + the default param structure to match the previous CLI default + behavior (aq-mode 2) + [b03ef02dc2a2] + + * source/common/common.h: + checked: enable check macros in debug builds + [a9c403e19928] + + * source/common/common.h: + check: disable MSVC warnings about constant conditionals with + checked builds + [bbc026370c52] + + * source/Lib/TLibCommon/TComRdCost.h: + check: fix MSVC warnings and check behavior + [4fe655fb3f24] + +2014-05-14 Steve Borho + + * source/Lib/TLibCommon/TComBitStream.cpp, + source/Lib/TLibCommon/TComBitStream.h, + source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComMotionInfo.cpp, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/Lib/TLibCommon/TComYuv.cpp, + source/Lib/TLibEncoder/SEIwrite.cpp, + source/Lib/TLibEncoder/SyntaxElementWriter.cpp, + source/Lib/TLibEncoder/TEncBinCoderCABAC.cpp, + source/Lib/TLibEncoder/TEncBinCoderCABAC.h, + source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncEntropy.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSbac.cpp: + check: another pile of assert->X265_CHECK conversions + [d5b42a9fe43b] + +2014-05-14 Satoshi Nakagawa + + * source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp: + cleanup clear() + [890b34705c95] + +2014-05-14 Steve Borho + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: remove unused (and shadowed) variable + [8ae4d97f3913] + + * source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/cpu.cpp, + source/common/dct.cpp, source/common/piclist.cpp, + source/common/pixel.cpp, source/common/primitives.h, + source/common/threadpool.cpp, 
source/common/vec/dct-sse41.cpp, + source/common/wavefront.cpp, source/encoder/compress.cpp, + source/encoder/dpb.cpp, source/encoder/frameencoder.cpp, + source/encoder/framefilter.cpp, source/encoder/motion.cpp, + source/output/y4m.cpp, source/output/yuv.cpp: + checked: convert a pile of asserts() into optional runtime checks + [190a21bd77a3] + +2014-05-11 Satoshi Nakagawa + + * source/Lib/TLibEncoder/TEncSearch.cpp: + fix 4:2:2 coeff rate calc + [a4d0d5679c28] + +2014-05-10 Satoshi Nakagawa + + * source/Lib/TLibCommon/CommonDef.h, + source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComPattern.h, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/compress.cpp: + refine intra reference samples + [e0a3b7e79d4d] + +2014-05-12 Steve Borho + + * source/encoder/ratecontrol.cpp: + ratecontrol: do not low-clip first I frame with ABR + [4fe2705adb5b] + +2014-05-09 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/encoder/compress.cpp: + reduce xModeBitsIntra() call + [dd1635326993] + +2014-05-12 Steve Borho + + * doc/reST/api.rst: + rest: fix typo + [8b75aa2cc06d] + +2014-05-11 Steve Borho + + * doc/reST/api.rst, doc/reST/cli.rst, doc/reST/index.rst, + doc/reST/presets.rst: + rest: add API docs with links to appropriate sections + [e7f11c87d7db] + +2014-05-10 Steve Borho + + * doc/reST/presets.rst: + rest: add a note to the tune documentation about using --tune + ssim/psnr + [7985392f3e55] + + * source/x265.h: + api: even moar + [deb01d38340c] + + * source/x265.h: + api: moar tpyo + [bac6ad6472b6] + + * source/x265.h: + api: tpyo + [c52aa51c82c7] + + * source/x265.h: + api: remove mention of x265_encoder_reconfig(), which doesn't yet + exist + [32e587a601db] + + * source/x265.h: + api: remove _t 
suffix from x265_param in comments + [d70f0afba883] + +2014-05-08 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp: + reduce malloc call to avoid fragment and overhead + [d0acf82a77f9] + +2014-05-08 Steve Borho + + * source/Lib/TLibCommon/TComRdCost.h: + rdcost: fix comments for lambda fix8 variables, refs b77ca886ef3b + [129bfec3144f] + +2014-05-09 Min Chen + + * source/common/x86/intrapred8.asm: + reduce code size on intra_pred_ang8_* + [f7827ea458e9] + +2014-05-08 Steve Borho + + * source/Lib/TLibCommon/TComPic.h: + TComPic: nits + [c665b4904e1f] + + * doc/reST/cli.rst: + rest: fix indent of min-crf + [4a36a281e77c] + + * doc/reST/cli.rst, source/x265.cpp: + cli: expose param.bRepeatHeaders to the CLI, reorder help for more + clarity + [e50bd438514f] + + * source/encoder/slicetype.cpp: + slicetype: avoid implicit integer conversions, fix MSVC warning + + intraCost is an int array, so there was no purpose to downcasting to + uint16_t + [c8ffacb54563] + +2014-05-08 Aarthi Thirumalai + + * source/encoder/ratecontrol.cpp: + rc: add minor changes in flow when singleFrameVbv, rateFactorMax + values are set. 
+ [87524d63e543] + + * source/encoder/ratecontrol.cpp: + rc: update logic in clipQScale for vbv, when vbv lookahead is done, + [96290a38459f] + + * source/encoder/ratecontrol.cpp: + rc: clamp qp for first I frame only in rateEstimateQscale (changes + output) + [5546cc82e1ce] + +2014-05-08 Gopu Govindaswamy + + * source/encoder/ratecontrol.cpp: + rc: qpvbv is between qpMin and prevRowQp when specifies the + rfConstantMin + [0a53062845be] + +2014-05-08 Aarthi Thirumalai + + * source/encoder/slicetype.cpp: + slicetype: update the lowresCosts and intraCosts when aq/cutree is + enabled + [59a43a5cc704] + +2014-05-08 Kavitha Sampath + + * source/encoder/encoder.cpp, source/encoder/encoder.h, + source/encoder/frameencoder.cpp: + SEI: enable generation of recovery point SEI message for keyframes + + SEI recovery points are inserted for every keyframe which tells the + decoder an identifier of the recovery point from which perfectly + valid pictures can be displayed no matter what the starting point of + decoding is. The SEI specifies recovery_poc_cnt that counts the + number of frames after which perfect video can be displayed to the + user. Pictures encoded after the CRA but precede it in display + order(leading) are ignored by the decoder and pictures following CRA + in POC order(trailing) do not reference pictures prior to the recent + CRA and are guaranteed to be displayable. Hence recovery_poc_cnt is + zero. 
+ [8e64aa56d635] + +2014-05-08 Steve Borho + + * source/CMakeLists.txt: + cmake: bump X265_BUILD for qpfile and crf-min + [fdce542171cb] + +2014-05-08 Gopu Govindaswamy + + * doc/reST/cli.rst, source/common/param.cpp, + source/encoder/encoder.cpp, source/encoder/ratecontrol.cpp, + source/encoder/ratecontrol.h, source/x265.cpp, source/x265.h: + x265: implemented crf-min that specifies a minimum rate factor value + for encode + [6871636e99cb] + +2014-05-07 Steve Borho + + * doc/reST/cli.rst: + rest: document --qpfile + [6494bb2e6425] + +2014-05-07 Gopu Govindaswamy + + * source/Lib/TLibCommon/TComPic.h, source/encoder/api.cpp, + source/encoder/encoder.cpp, source/encoder/ratecontrol.cpp, + source/x265.cpp, source/x265.h: + cli: added qpfile feature to force frame qp and slicetype using file + [7773ee321539] + +2014-05-06 Steve Borho + + * Merge with stable + [8963bc3aa2e1] + +2014-05-07 Tom Vaughan + + * doc/reST/presets.rst: + presets.rst edited online with Bitbucket - Correct spelling + [607384b3312e] + +2014-05-05 Satoshi Nakagawa + + * source/Lib/TLibEncoder/TEncEntropy.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/compress.cpp: + fix: residualQTIntrachroma() for 4:2:2 + [0e22bd1dfe6c] + +2014-05-03 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TComYuv.h, + source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/encoder/compress.cpp: + refine picture boundary check + + - cleanup slice end check + - fix split bits for TOPSKIP + [8c07f39ef5c5] + +2014-05-06 Steve Borho + + * source/common/param.cpp: + log: remove hyphen from signhide, so it matches param name + + This is just to avoid confusion + [608267a4f634] + + * Merge with stable + [ec4917459326] + + * doc/reST/index.rst, doc/reST/presets.rst: + rest: add a page describing presets + [110993a5ef10] + +2014-05-06 Deepthi Nandakumar + + * source/encoder/ratecontrol.cpp: + ratecontrol: remove repeat initialisations. 
+ [3974eafe528d] + + * source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h: + ratecontrol: remove RateControl->qpNoVbv. + + Less confusion, and we have whatever info we need in rce->qpNoVbv + anyway + [5d9220ec8c81] + + * source/encoder/ratecontrol.cpp: + ratecontrol: avoid clipping QP multiple times when VBV is not + enabled. + [7d557d8819e9] + +2014-05-06 Aarthi Thirumalai + + * source/encoder/slicetype.cpp: + slicetype: avoid redundant call. intraSatd costs are updated in + frameEncoder already + [532c961b16f1] + +2014-05-06 Gopu Govindaswamy + + * source/Lib/TLibCommon/TComPic.h, source/encoder/encoder.cpp, + source/encoder/ratecontrol.cpp: + log: add the RateFactor for each frame to the csv file when using + --crf mode + [94803b73ba8a] + +2014-05-06 Steve Borho + + * source/Lib/TLibCommon/TComRdCost.h: + check: add checks for integer overflow in rdcost functions + [2577eb2cbd20] + +2014-05-05 Steve Borho + + * source/CMakeLists.txt, source/common/common.h, + source/common/version.cpp: + cmake: add CHECKED_BUILD option so we can add optional runtime + checking + [5c590a31734b] + + * .hgignore: + ignore .DS_Store files (Mac pocket lint) + [075705aa41a9] + + * source/input/y4m.cpp, source/input/y4m.h: + y4m: support variable bit depth via CXXXpDD Y4MPEG header; ie: + C420p10 + + ffmpeg's support for this is non-standard, so you must use -strict + -1, aka: + + ffmpeg -i vid.avi -pix_fmt yuv420p10le -strict -1 -f yuv4mpegpipe - + | ./x265 - --y4m o.hevc + + closes issue #53 + [c4adcaef8d1d] + +2014-05-04 Steve Borho + + * source/x265.cpp: + cli: report input bit depth + [5ca8717a3ec2] + +2014-05-05 Steve Borho + + * doc/reST/introduction.rst: + rest: update introduction page, add more detail about software + patents + [d0b17e3ef9db] + +2014-05-03 Steve Borho + + * Merge with stable + [dcf74ea39e31] + + * source/Lib/TLibCommon/TComRdCost.h: + rdcost: explicit cast of return value, to prevent MSVC warnings + [04e91f38854f] + +2014-05-01 Steve Borho + + * 
source/CMakeLists.txt, source/cmake/clean-generated.cmake: + cmake: add a clean-generated Makefile rule + + 'make clean-generated' will remove all the machine generated files + in the build folder so that they will be re-generated with more up- + to-date version info the next time you run 'make' + + The "easy" workaround for this problem is to just nuke the build + folder and start a new one. + [d72770a77ff8] + + * source/input/y4m.cpp, source/input/yuv.cpp: + use fseekg() to skip frames for 64bit builds + + This is orders of magnitude faster, at least on Windows, than + repeated calls to ignore(framesize) + [61ad93af167c] + +2014-05-02 Steve Borho + + * Merge with stable + [81aad858a830] + + * .hgtags: + Added tag 1.0 for changeset cea97c4d7945 + [d3d47e3ef9c2] + +2014-05-02 Deepthi Nandakumar + + * source/encoder/frameencoder.cpp: + interlace: set sourceScanType to 0 to indicate interlaced video + [cea97c4d7945] [1.0] + +2014-05-02 Aarthi Thirumalai + + * source/Lib/TLibCommon/TComRom.cpp: + fix g_chromaScale for 420 videos accordingly to the HEVC spec. + [9b66012c93bb] + +2014-05-02 Steve Borho + + * source/Lib/TLibCommon/TComRdCost.h: + rdcost: use less fractional bits for lambda scale, and larger ints + for chroma + + This fixes really bad mode decisions made with some Main10 encodes + that were apparently caused by integer overflow + [b77ca886ef3b] + +2014-05-02 Deepthi Nandakumar + + * source/encoder/frameencoder.cpp: + interlace: set sourceScanType to 0 to indicate interlaced video + [f3585fd81c3b] + +2014-05-01 Steve Borho + + * Merge with stable + [22eda589d8ca] + + * source/encoder/slicetype.cpp: + slicetype: pre-calculate cost estimates for B slices, simplify + callback + + rate control was always calling back for B slice estimates but since + we were only pre-calculating them if VBV was enabled we were forced + to make the estimates at that time (withing the context of the API + thread). 
+ + With this patch, we estimate B costs unless CQP is in use, and this + allows us to simplify getEstimatedPictureCost() + [1ed0bd2dbfd1] + +2014-04-30 Ashok Kumar Mishra + + * source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/Lib/TLibCommon/TComYuv.h, source/common/shortyuv.h, + source/encoder/encoder.cpp: + fix WP issue for 422 and 444 format + [20551ab7ff00] + +2014-04-29 Steve Borho + + * doc/reST/cli.rst, doc/reST/index.rst, doc/reST/threading.rst: + rest: initial documentation for threading details and considerations + [6d83eaf6cf24] + + * source/encoder/encoder.cpp: + csv: always write summary info if file handle is open + + fixes behavior of --csv FILE --log full + [148c3b9c11fd] + + * Merge with default (code freeze for 1.0) + [c6ca14a4f2fa] + + * source/Lib/TLibCommon/NAL.h, source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComMotionInfo.h, + source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComPic.cpp, + source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TComYuv.h, + source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncEntropy.h, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSbac.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/common/bitstream.h, + source/common/common.cpp, source/common/common.h, + source/common/loopfilter.cpp, source/common/param.cpp, + source/common/param.h, source/common/pixel.cpp, + source/common/primitives.h, source/common/shortyuv.cpp, + source/common/shortyuv.h, 
source/common/threading.h, + source/common/wavefront.cpp, source/common/winxp.cpp, + source/common/winxp.h, source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.h, source/common/x86/ipfilter8.h, + source/encoder/compress.cpp, source/encoder/cturow.cpp, + source/encoder/encoder.cpp, source/encoder/encoder.h, + source/encoder/frameencoder.cpp, source/encoder/level.cpp, + source/encoder/level.h, source/encoder/ratecontrol.cpp, + source/encoder/slicetype.cpp, source/encoder/slicetype.h, + source/encoder/weightPrediction.cpp, source/filters/filters.cpp, + source/input/y4m.cpp, source/input/yuv.cpp, + source/test/ipfilterharness.cpp, source/test/mbdstharness.cpp, + source/test/pixelharness.cpp, source/test/pixelharness.h, + source/test/testharness.h, source/x265.cpp, source/x265.h: + uncrustify - enforce coding style mechanically + [a25fb61a7326] + + * source/encoder/slicetype.cpp: + slicetype: nit + [e6ba3c2320e8] + +2014-04-28 Steve Borho + + * source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h: + TComRom: hard-code the lambda tables + [4f7658b3c78a] + +2014-04-28 Aarthi Thirumalai + + * source/encoder/frameencoder.cpp, source/encoder/slicetype.cpp, + source/encoder/slicetype.h: + remove list data from lowres costs prior to use + [84d31cb2aeab] + + * source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h: + fix g_chromaScale to be full length + [6e233b6777c0] + +2014-04-25 Satoshi Nakagawa + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/compress.cpp: + fix: 4:2:2 rdLevel <= 1 + [f799f8079b87] + +2014-04-28 Ashok Kumar Mishra + + * source/Lib/TLibCommon/TComLoopFilter.cpp: + fix hash mismatch for 422 format with HM 14.0_RExt decoder + [56b1d4a44798] + +2014-04-27 Steve Borho + + * source/encoder/slicetype.cpp: + slicetype: remove unused auto var + [36e53135da57] + + * source/encoder/slicetype.cpp, source/encoder/slicetype.h: + slicetype: rename rowsCompleted to bFrameCompleted + + one would expect rowsCompleted to 
be a counter, when in fact it is a + bool value signaling completion of the whole frame. + [563273f5772f] + +2014-04-27 Aarthi Thirumalai + + * source/encoder/slicetype.cpp: + slicetype: fix incorrect initialization of fenc->rowSatds when wpp + is enabled. + [c8bff937eee0] + +2014-04-27 Steve Borho + + * source/common/x86/mc-a.asm, source/encoder/slicetype.cpp, + source/test/checkasm-a.asm, source/test/testharness.h: + asm: adjust x264_ prefixes + [e519b32b70d0] + +2014-04-25 Min Chen + + * source/test/checkasm-a.asm, source/test/pixelharness.cpp, + source/test/testharness.h: + testbench: support float ret value + [7baf8b8ecfdc] + + * source/common/x86/intrapred8.asm: + correct register num in intrapred8.asm + [52d812d0fc48] + + * source/common/x86/pixel-util8.asm: + fix pixel_ssim_end4, the 3rd is dword + [2900e858e30a] + +2014-04-24 Steve Borho + + * source/test/pixelharness.cpp: + pixelharness: stack-check ssim_end4_t, this seems to expose an + argument read bug + [c630b0b393ee] + + * source/common/x86/ipfilter8.asm: + ipfilter: fix register stack allocation in luma horizontal filter + [4a91c42d2a72] + + * source/test/mbdstharness.cpp: + dct: fix MSVC warnings + [10de7b0a228c] + + * source/common/intrapred.cpp, source/common/primitives.h, + source/common/x86/intrapred.h: + primitives: pass bool value as int to assembly + + bool type is too ambiguous + [cdbcd835b849] + + * doc/reST/cli.rst: + rest: move quality metrics section up higher + [fcad3bb9943d] + + * doc/reST/cli.rst: + rest: move --repeat-headers to bitstream options section + [afec492ea679] + + * doc/reST/cli.rst: + rest: loop filters, plural + [6f259296bfe1] + + * doc/reST/cli.rst: + rest: move --hash into bitream options section, document error- + recovery aspects + [1662240073e2] + + * source/test/mbdstharness.cpp, source/test/mbdstharness.h: + dct: stack-check asm transform/quant primitives + [06be236c9ece] + + * source/test/ipfilterharness.cpp: + ipfilter: remove utterly useless comments + 
[17dd496b46ee] + + * source/test/ipfilterharness.cpp: + ipfilter: stack-check asm filter primitives + [a61ceb6f80c2] + + * source/test/intrapredharness.cpp: + intrapred: stack-check asm intra primitives + + prune some extra memsets that were used during development + [a332c6667dc9] + +2014-04-24 Min Chen + + * source/common/x86/pixel-a.asm: + correct register number in pixel-a.asm + [791790391fb6] + +2014-04-24 Steve Borho + + * source/Lib/TLibCommon/TComMotionInfo.h: + TComMotionInfo: nit + [e9f64036fbee] + + * source/test/CMakeLists.txt, source/test/pixelharness.cpp: + fixups for testbench build on Windows + [904b96e1c436] + +2014-04-17 Steve Borho + + * source/cmake/CMakeASM_YASMInformation.cmake, source/common/common.h, + source/common/threading.cpp: + align the stack for GCC x86_32 builds + + For all threads x265 creates, align the stack immediately in the + call to threadMain(). + + The API calls do not appear to require an aligned stack; all + primitives that require stack aligned buffers are called from frame + encoder or worker pool threads + [4c341edb4cf8] + +2014-04-24 Steve Borho + + * source/common/CMakeLists.txt, source/common/x86/checkasm-a.asm, + source/test/CMakeLists.txt, source/test/checkasm-a.asm: + move checkasm-a.asm to the test/ folder (fixes linux link) + [6dca7a1de7f7] + +2014-04-23 Steve Borho + + * source/CMakeLists.txt, source/cmake/CMakeASM_YASMInformation.cmake, + source/common/CMakeLists.txt: + cmake: export YASM_FLAGS from CMakeASM_YASMInformation.cmake + + This avoids duplicate logic in other cmake scripts + [c24c7b2d0eff] + +2014-04-23 Min Chen + + * source/common/x86/blockcopy8.asm: + improve by LEA to ADD in cvt32to16_shr + [f8b68aa1877d] + +2014-04-23 Steve Borho + + * source/common/primitives.h: + primitives: nit + [1c9b3a48ea13] + +2014-04-23 Min Chen + + * source/common/x86/x86inc.asm: + update x86inc.asm + [cdd79be8cff9] + + * source/test/pixelharness.cpp: + pixelharness: stack-check all asm pixel primitives + [56a1c114d3be] 
+ + * source/common/CMakeLists.txt, source/common/x86/checkasm-a.asm, + source/test/testharness.h: + testbench: port x264 stack & register check code + [91685d4e6c1a] + +2014-04-23 Steve Borho + + * source/common/threadpool.cpp: + threadpool: increase reference count if pool was allocated while + blocked + [ea597d46f30e] + +2014-04-23 Min Chen + + * source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter16.asm, source/common/x86/ipfilter8.h: + fix wrong cpu type on interp_4tap_vert_x_4x2 + [aa8442a28aa4] + +2014-04-22 Steve Borho + + * source/common/x86/x86inc.asm: + Backed out changeset: ce11d3617899 + [53712f218a1a] + + * source/CMakeLists.txt: + cmake: do not prefix absolute library paths with -l in pc file + + fixes static linking on Mac OS X in particular + [d11c90310c8a] + +2014-04-22 Gopu Govindaswamy + + * source/common/pixel.cpp, source/common/primitives.h, + source/encoder/slicetype.cpp, source/encoder/slicetype.h: + pixel: Added C Primitives for estimateCUPropagateCost and removed + from lookahead + [4ed2953f358e] + +2014-04-22 Murugan Vairavel + + * source/test/pixelharness.cpp: + testbench: ensure randomly generated width and height are >= 16 + + Fixes a crash in testbench caused by upshift and downshift when + height equal to 1 + [d916025046f7] + +2014-04-22 Min Chen + + * source/common/x86/x86inc.asm: + update x86inc.asm + [ce11d3617899] + +2014-04-22 Deepthi Nandakumar + + * source/Lib/TLibEncoder/TEncSearch.cpp: + me: backout previous patch; add 1 to account for temporal MV + candidate + [7ff92e09900e] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + me: stack allocated pointer moved to heap + [54e73d05a5b1] + +2014-04-21 Steve Borho + + * source/encoder/slicetype.h: + slicetype: rowsCompleted variable should be volatile + + Don't allow a compiler to optimize out the reads, it would cause a + deadlock + [84315557c97f] + + * source/common/param.cpp: + log: do not report weightb twice + [5f49217b2489] + +2014-04-22 Den Scherbin + + * 
source/x265.cpp: + cli: fix bitrate output when encoding + [3acc99098339] + +2014-04-21 Steve Borho + + * doc/reST/cli.rst: + rest: improve VUI docs + [8c1e721209f4] + + * doc/reST/cli.rst: + rest: remove redundant `in` + [f625e0cba6d6] + + * doc/reST/cli.rst: + rest: improve description of constrained intra + [c1b4a55fbb10] + + * doc/reST/cli.rst: + rest: update --qp docs to mention behavior of --qp 0 + [cbdce05c6959] + +2014-04-21 Ashok Kumar Mishra + + * source/Lib/TLibCommon/TComYuv.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp: + fix preset placebo option crash for 444 color format + [a30c81796c22] + +2014-04-12 Steve Borho + + * source/encoder/encoder.cpp, source/encoder/slicetype.cpp, + source/encoder/slicetype.h: + slicetype: use a worker thread for slicetypeDecide when it may help + (closes #17) + + If slicetype/scenecut analysis is enabled and the user has a thread + pool of at least 4 threads, use a worker thread to run + slicetypeDecide. + + Improves performance in presets that were bottlenecked by b-adapt 2 + style lookahead complexity. + [c7f3b7487f60] + +2014-03-27 Steve Borho + + * source/Lib/TLibCommon/TComRdCost.h, + source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/encoder/bitcost.cpp, + source/encoder/frameencoder.cpp, source/encoder/slicetype.h, + source/encoder/weightPrediction.cpp: + adapt x264 style lambda tables [CHANGES OUTPUTS] + + initLambda() in TComRom.cpp is passed the scale factor that is used + to adjust lambda2 values. It uses 0.85 right now, which is similar + to x264's value (0.9), and seems to be optimal for HEVC based on our + tests. + + These lambda tables seem to help most at low QPs, allowing the + encoder to get nearly lossless quality at high bitrates. 
+ [a4ae35d04777] + +2014-04-17 Steve Borho + + * source/Lib/TLibCommon/TComDataCU.cpp: + TComDataCU: nits, assume sizeof(char) == sizeof(uint8_t) + + cleanup white-space and hungarian prefixes + [e45e5ca4608e] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibEncoder/TEncSearch.cpp: + me: build a larger list of motion candidates for ME (closes #14) + [fdccd5524ff7] + +2014-04-21 Min Chen + + * source/common/x86/blockcopy8.asm: + fix asm cvt32to16_s wrong XMM register number + [ccfd0819b5c3] + +2014-04-21 Deepthi Nandakumar + + * source/Lib/TLibCommon/TComDataCU.cpp: + TComDataCU: initCU was missing this initialisation, possible causing + errors in row restarts. + [e6e78f2fb192] + +2014-04-17 Steve Borho + + * source/encoder/ratecontrol.h: + rc: remove unused keyFrameInterval + [78c1f43f12f5] + + * source/encoder/ratecontrol.cpp: + rc: if --qp 0 is specified, use zero quant for all slice types + + This is not the same as lossless. For true lossless we must enable + transform skip in the PPS and force all blocks to be coded with + transform skip enabled. + [07156a0e74e5] + +2014-04-16 Steve Borho + + * source/common/threadpool.cpp: + threadpool: use a mutex to protect pool allocation + + We only want to allow one thread to create the singleton thread + pool. 
Rename the static instance variable to s_instance as per our + coding style + [85c95672ccf0] + + * source/encoder/frameencoder.cpp, source/encoder/frameencoder.h: + Merge with stable + [2fc309678785] + +2014-04-03 Gopu Govindaswamy + + * source/encoder/dpb.cpp, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h: + frameencoder: store the reference state of the picture in + FrameEncoder + + We find that reference state of the reference frame changed during + the encode when we use frame-thread > 1 this cause the CU level QP + for the frame is non-deterministic, this is leading the non- + deterministic encoded output for the frame, to avoid this store the + reference state of the frame to FrameEncoder->m_isReferenced and + when the QP is calculate for CU, refer the reference state of the + frame from FrameEncoder->m_isReferenced this stat will never change + during the encode + + Moved slice reference state initialization from dpb to FrameEncoder + initSlice() + [aacd6919d173] + +2014-04-16 Min Chen + + * source/common/x86/dct8.asm: + align DCT8's stack to 64-bytes to avoid crash and improve cache + performance + [024ca523052f] + +2014-04-16 Steve Borho + + * source/Lib/TLibCommon/TComSlice.cpp: + TComSlice: initialize m_vps pointer + [818a591c3a6e] + + * source/encoder/frameencoder.cpp: + encoder: singleton m_vps nits + [7fd1df6f4db8] + + * source/cmake/CMakeASM_YASMInformation.cmake: + cmake: nit + [41ef5053e04c] + + * source/encoder/frameencoder.cpp: + Merge with stable + [436c63dd2d24] + +2014-04-03 Steve Borho + + * source/encoder/frameencoder.cpp: + frameencoder: use m_isReferenced when configuring SAO in + compressFrame() + + In some pessimal situations, the slice's reference state could even + be changed by the time compressFrame() starts. This prevents any + race hazard. 
+ [5746582ff4a6] + +2014-04-15 Aarthi Thirumalai + + * source/encoder/frameencoder.cpp: + vbv: clear row diagonal and cu SATD costs after vbv row reset was + triggered + + refs #45 + [03525a77d640] + +2014-04-16 Steve Borho + + * source/cmake/CMakeASM_YASMInformation.cmake: + cmake: use HAVE_ALIGNED_STACK=0 for x86_32 builds, even for GCC + + In order to enable HAVE_ALIGNED_STACK for 32bit builds, we would + need to align our stack internally at all thread entry points and + all API entry points that might use primitives. 32bit performance is + not a high priority for us at the moment. + + This fixes a number of reported crashes on 32bit builds + [cfb1bb58d4fe] + + * source/encoder/motion.cpp: + motion: always include the mvcost returned by motionEstimate + [CHANGES OUTPUTS] + + This was a rather subtle bug that has been in the code base for some + time. The caller of motionEstimate() will often want to remove the + mvcost from the returned cost value, and in this circumstance it + would go negative, and since the returned value is unsigned it + became very large, causing the encoder to actually discard a zero- + residual match. + + If the stars were perfectly aligned and all of the reference ME + costs became exactly -1, *all* possible ME candidates were discarded + which could lead to crashes. 
+ [bf40ab3af59a] + +2014-04-16 Ashok Kumar Mishra + + * source/Lib/TLibCommon/TComYuv.cpp, source/common/ipfilter.cpp, + source/common/pixel.cpp, source/common/primitives.h, + source/common/shortyuv.cpp: + fix compile warning in pixel.cpp for 422 primitive setup + [24e8bac645a3] + +2014-04-15 Steve Borho + + * source/x265.cpp: + cli: help nits (closes #47) + [0b696c7f46f2] + +2014-04-15 Aarthi Thirumalai + + * source/encoder/frameencoder.cpp: + vbv: clear row diagonal and cu SATD costs after vbv row reset was + triggered + + refs #45 + [166e4d6e819e] + +2014-04-15 Satoshi Nakagawa + + * source/Lib/TLibEncoder/TEncSearch.cpp: + fix: rdLevel <= 1 + [9735b037b461] + +2014-04-14 Steve Borho + + * doc/reST/cli.rst, source/CMakeLists.txt, source/common/param.cpp, + source/encoder/encoder.cpp, source/x265.cpp, source/x265.h: + vui: emit VUI w/ timing info unconditionally, remove timing info + from VPS + + Since many tools seem to only look in the VUI for timing info, it + seems to be best to output that info there, and remove it from the + VPS (to avoid sending redundant data in our headers) + [8ebadea05bfe] + +2014-04-15 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComPattern.cpp: + fix: constrained intra + [0a95a6bb0f8e] + +2014-04-14 Steve Borho + + * source/encoder/ratecontrol.cpp: + ratecontrol: fix indentations + [1cf67a7b362d] + + * source/common/pixel.cpp: + pixel: remove out-of-range chroma copy function assignments + [08d64a70594e] + + * source/Lib/TLibCommon/TComDataCU.cpp: + TComDataCU: white-space nits + [488b81a7f1c6] + + * source/common/param.cpp: + param: reintroduce range checks of internalCsp + [e62e96cf506d] + + * doc/reST/cli.rst: + rest: update colorspace docs + [b89d85f10bc4] + +2014-04-14 Deepthi Nandakumar + + * source/encoder/encoder.cpp: + encoder: nits + [9f0f011294bd] + + * source/encoder/encoder.cpp: + encoder: weightP not supported yet for 444 and 422. 
+ [6d4f0673c386] + + * source/x265.cpp: + x265: adding support for i422 in the help command + [ae8f99512774] + +2014-04-12 ashok + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, source/common/ipfilter.cpp, + source/common/param.cpp, source/common/pixel.cpp, + source/encoder/ratecontrol.cpp, source/x265.cpp: + Modify TComDataCU structure to support 422 color space format + [3a9c1fc2b742] + + * source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComPattern.h: + Modify TComLoopFilter structure to support 422 color space format + [74ab24517d0e] + + * source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h: + Modify TComRom and TComPattern structure to support 422 color space + format + [abf05dab1844] + + * source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TComYuv.h, + source/Lib/TLibCommon/TypeDef.h, source/common/shortyuv.cpp, + source/common/shortyuv.h: + Modify TComYuv and TShortYuv structure to support 422 color space + format + [ff3892c00318] + + * source/Lib/TLibEncoder/TEncEntropy.cpp, + source/Lib/TLibEncoder/TEncEntropy.h, + source/Lib/TLibEncoder/TEncSbac.cpp: + Modify TEncSbac and TEncEntropy structure to support 422 color space + format + [aa0c458ede27] + + * source/Lib/TLibEncoder/TEncSbac.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + Modify TEncSearch structure to support 422 color space format + [0454675dbaf7] + +2014-04-12 Deepthi Nandakumar + + * source/encoder/ratecontrol.cpp: + ratecontrol: Fix initial I-slice QP for 10-bit CRF mode. 
+ [a622c4bfc688] + +2014-04-11 Steve Borho + + * source/common/winxp.cpp, source/common/winxp.h: + xp: fix build with XP headers + [483e5077a6b7] + + * source/CMakeLists.txt: + cmake: allow icpc to use vectorization if it likes + [850ebca2f7c8] + +2014-04-09 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComPattern.h: + cip: add is*AvailableCIP() + + simplify no CIP case. + [13fa6d4d4fe4] + +2014-04-11 Steve Borho + + * source/Lib/TLibCommon/TComTrQuant.cpp: + TComTrQuant: nits + [a5aa9e730323] + + * source/Lib/TLibCommon/TComTrQuant.cpp: + TComTrQuant: comparing >=0 of unsigned variable is always true + [85f977830d9d] + +2014-04-10 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp: + asm-primitives: Enable missing XOP primitives in HIGH_BIT_DEPTH + [bd5e59fc917d] + + * source/common/x86/x86util.asm: + asm: HADDD optimization for XOP + [78ab76f203a9] + +2014-04-08 Aarthi Thirumalai + + * source/encoder/ratecontrol.cpp: + vbv: log frame-average QPs for VBV; even when AQ is disabled. + [55981f89b398] + +2014-04-10 Deepthi Nandakumar + + * source/encoder/ratecontrol.cpp: + ratecontrol: prevent one layer of non-zero I-slice qp clipping + [CHANGES OUTPUT]. + + This seems to have been an error importing RC code from x264. It + could be responsible for error reports on I-slices (after POC 0) + having much higher QP/low visual quality. Needs review. + [1c0f1a4ce209] + + * source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h: + ratecontrol: rename reInit to init. This method is called in the + constructor and when ABR history is reset. + [83ccf2f1453f] + +2014-04-09 Steve Borho + + * source/encoder/level.cpp: + level: fix MinGW warning + [51c627e235bc] + +2014-04-10 Deepthi Nandakumar + + * source/Lib/TLibCommon/CommonDef.h, source/encoder/ratecontrol.cpp, + source/encoder/ratecontrol.h: + ratecontrol: cleanup, replace (unnecessary) constant arrays with + constants. 
+ [db15b6bee7f4] + +2014-04-09 Deepthi Nandakumar + + * source/encoder/ratecontrol.cpp: + ratecontrol: more dead code removed + [e2df2309e6f1] + +2014-04-09 Steve Borho + + * source/encoder/level.cpp: + level: fix MSVC compile error, give sqrt() argument unambiguous data + type + [bd987db26d5d] + +2014-04-09 Deepthi Nandakumar + + * source/encoder/ratecontrol.cpp: + ratecontrol: partial backout of 6284b30a0fdd, get rid of debugging + notes. + [bdca492dc1d7] + + * source/encoder/level.cpp: + level: remove conditional expression warning + [7f62dfabe319] + + * source/encoder/ratecontrol.cpp: + ratecontrol: remove dead code + + Imported from x264; likely a part of 2-pass. In any case, not + relevant here. + [6284b30a0fdd] + +2014-04-07 Steve Borho + + * source/x265.rc.in: + rc: update the default build output filename for our DLL + [15471ecb08d8] + + * source/common/common.cpp, source/common/common.h, + source/encoder/CMakeLists.txt, source/encoder/api.cpp, + source/encoder/encoder.cpp, source/encoder/encoder.h, + source/encoder/level.cpp, source/encoder/level.h: + level: move decoder level logic to level.cpp, make table based + + the new function enforces more limits defined in Annex A - widths + and heights are capped to avoid extreme rectangular dimensions DPB + size is limited based on resolution at higher resolutions, the min + CTU size is 32 + + this commit also lays the groundwork for adding --level 5.1 --tier + HIGH params + [23b509a26449] + +2014-04-08 Deepthi Nandakumar + + * source/Lib/TLibCommon/TComPic.cpp: + TComPic: change pointer initial values to NULL (instead of 0). 
+ [b5caca9954f3] + +2014-04-07 Satoshi Nakagawa + + * source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncEntropy.cpp, + source/Lib/TLibEncoder/TEncEntropy.h, + source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/compress.cpp: + TEncEntropy: cleanup bRD arg, fix 4:4:4 intra chroma rate [OUTPUT + CHANGE 4:4:4] + [aec57ffb18de] + +2014-04-06 Rafaël Carré + + * source/common/CMakeLists.txt, source/common/param.cpp: + strtok_r: fix detection on Windows + + * Make sure HAVE_STRTOK_R is always defined, to 0 if absent + * Fix typo in #if + * mingw-w64's pthread.h #defines strtok_r, make sure to undef it + before we use our own definition. + [a4cb624267f3] + +2014-04-05 Steve Borho + + * doc/reST/cli.rst, source/CMakeLists.txt, source/common/param.cpp, + source/encoder/encoder.cpp, source/x265.cpp, source/x265.h: + api: drop param.vui.bEnableVuiParametersPresentFlag and + bEnableAspectRatioIdc + + These two params can be implied from other parameters, and having + them present only makes the VUI configuration more complicated than + it needs to be. 
+ + This commit also drops the debugging --vui CLI option + [c1300ae4e7ba] + +2014-04-04 Steve Borho + + * doc/reST/cli.rst: + rest: we don't abbreviate integer anywhere else + [ac1bcf04c144] + + * doc/reST/cli.rst: + rest: fix the straw-man + + The two-argument command line doesn't work for YUV since you need to + provide the resolution and frame rate + [aa1a86a1bd35] + + * source/common/param.cpp: + param: force bRepeatHeaders for the still picture profile + [4bb0073e3d15] + + * source/encoder/api.cpp, source/x265.cpp: + log: move logging of version and build info into libx265 + + Now this data will be reported in the logs even when x265 is used as + a shared or static library + [9f77f0228735] + +2014-04-03 Steve Borho + + * source/test/pixelharness.cpp: + testbench: prevent 0 height in plane copy tests + + The ASM functions handle the last row specially and cannot handle + height of 0, it causes testbench crashes when rand() rolls the wrong + dice. + [ac9e57296fa8] + + * doc/reST/cli.rst, source/CMakeLists.txt, + source/Lib/TLibEncoder/TEncEntropy.cpp, + source/Lib/TLibEncoder/TEncEntropy.h, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSbac.h, source/common/param.cpp, + source/encoder/frameencoder.cpp, source/x265.cpp, source/x265.h: + api: add support for access unit delimiters (--aud) + [6327400944ee] + + * source/x265.cpp: + cli: add missing --no-dither option to getopt list + [8273932bc5b7] + + * source/Lib/TLibEncoder/TEncEntropy.h: + TEncEntropy: nit + [1d2ab46f13d2] + + * doc/reST/cli.rst: + rest: nit + [d73898ae3e47] + + * source/common/param.cpp: + param: nit reorder of RC mode checks to avoid uncrustify problems + + uncrustify was seeing FOO(bar < x || bar > y) and assuming this was + a template instantiation and changing it to FOO(bar y). 
+ Reordering the two comparisons avoids this problem + [2e376af3a467] + + * source/common/param.cpp: + param: enforce a minimum picture size + + Do not allow the user to configure a picture smaller than at least + one CTU + [343414c96b01] + + * source/encoder/frameencoder.cpp: + frameencoder: use m_isReferenced when configuring SAO in + compressFrame() + + In some pessimal situations, the slice's reference state could even + be changed by the time compressFrame() starts. This prevents any + race hazard. + [8c946aca5824] + + * source/encoder/frameencoder.cpp: + frameencoder: comment nit + [9c1cc2aa053a] + + * Merge with stable + [eef9a0050728] + +2014-04-03 Gopu Govindaswamy + + * source/encoder/dpb.cpp, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h: + frameencoder: store the reference state of the picture in + FrameEncoder + + We find that reference state of the reference frame changed during + the encode when we use frame-thread > 1 this cause the CU level QP + for the frame is non-deterministic, this is leading the non- + deterministic encoded output for the frame, to avoid this store the + reference state of the frame to FrameEncoder->m_isReferenced and + when the QP is calculate for CU, refer the reference state of the + frame from FrameEncoder->m_isReferenced this stat will never change + during the encode + + Moved slice reference state initialization from dpb to FrameEncoder + initSlice() + [36a66ea7a27e] + +2014-04-03 Steve Borho + + * .hgtags: + Added tag 0.9 for changeset 82bbd2bf3b49 + [640f9177eeb0] + +2014-04-03 Murugan Vairavel + + * source/common/x86/pixel-a.asm: + asm: fix invalid read in upShift routine + [82bbd2bf3b49] [0.9] + +2014-04-02 Steve Borho + + * Merge with stable + [c0362b478e23] + +2014-04-02 Gopu Govindaswamy + + * source/encoder/dpb.h: + dpb: Allow two L1 refs when b-pyramid is enabled [CHANGES OUTPUTS] + + Consider this common case: if we have 5 consecutive (display order + frames) that are determined to be 
P1-B1-B2-B3-P2 by the lookahead. + When b-pyramid is enabled, the middle B will be encoded first and + used as a reference by the two following B frames (in encode order); + P1-P2-B2ref-B1-B3 + + frame L0 L1 P1 P2 P1 B2ref P1 P2 B1 P1 P2 B2ref B3 B2ref P1 P2 + + When B1 is encoded, both B2ref and P2 should be available as L1 + references, this will improve the encode compression efficiency when + b-pyramid is enabled (closes #12) + [d815c4a8fa74] + +2014-04-02 Min Chen + + * source/test/pixelharness.cpp: + testbench: use different stride on calcrecon + [89af57686794] + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/common/pixel.cpp, + source/common/primitives.h, source/common/x86/pixel-util.h, + source/common/x86/pixel-util8.asm, source/test/pixelharness.cpp: + remove unused parameter *recon from assembly code + [fdfad9734231] + +2014-04-02 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, source/encoder/compress.cpp: + cleanup m_cuColocated[] + [ccb2b7c26bb6] + +2014-04-02 Steve Borho + + * source/encoder/weightPrediction.cpp: + weight: properly reset weights when no-residual early-out is taken + + This fixes a hash mismatch seen with a Main10 encode of sintel-480p + [e03388e98ecc] + + * source/encoder/frameencoder.cpp: + frameencoder: fix white-space nit, add comment + [67c0aa70a125] + +2014-04-02 Deepthi Nandakumar + + * source/encoder/encoder.cpp: + param: fix typo in if-check. 
+ [261b3c2e788e] + +2014-04-02 Aarthi Thirumalai + + * source/encoder/encoder.cpp: + param: set aq strength to 0 in CQP + [dc887415f6df] + +2014-04-02 Deepthi Nandakumar + + * source/encoder/frameencoder.cpp: + Merge from stable + [3f27daf35506] + + * source/encoder/frameencoder.cpp: + frameencoder: removing assign qp inconsistencies which were + triggered for unreferenced P frames + [606da0b6bc58] + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/common/pixel.cpp, + source/common/primitives.h, source/common/x86/pixel-util.h, + source/common/x86/pixel-util8.asm, source/test/pixelharness.cpp: + Backed out changeset: a6930bfbd908 + + This changeset causes crashes. Needs to be re-examined. + [d0b5ea32525b] + + * source/encoder/frameencoder.cpp, source/encoder/frameencoder.h: + calcQpForCU: remove m_pic input parameter. + [03bad90e94ad] + +2014-04-01 Min Chen + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/common/pixel.cpp, + source/common/primitives.h, source/common/x86/pixel-util.h, + source/common/x86/pixel-util8.asm, source/test/pixelharness.cpp: + remove unused recon[] from assembly code + [a6930bfbd908] + +2014-04-01 Steve Borho + + * source/Lib/TLibEncoder/TEncCfg.h, source/common/TShortYUV.cpp, + source/common/TShortYUV.h, source/dllmain.cpp: + Merge with default (feature freeze for 0.9) + [1fc0fda2b08b] + +2014-04-01 Kavitha Sampath + + * doc/reST/cli.rst, source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/common/param.cpp, source/encoder/encoder.cpp, + source/encoder/encoder.h, source/encoder/frameencoder.cpp, + source/encoder/ratecontrol.cpp, source/encoder/slicetype.cpp, + source/encoder/weightPrediction.cpp, source/x265.cpp, source/x265.h: + weightb: add CLI options for explicit weightb [CHANGES OUTPUTS] + + weightB will be disabled by default for this release. 
This patch + fixes the hash mistakes and other fetaure bugs (the reasons it had + been disabled) + + HEVC only signals a single denom for all references so the L1 + reference must use the same denom as the L0 reference or no weight + at all. A more exhaustive search might be added later for slower + presets. Also, the lookahead will need to be modified to model + weightB behavior so it can be more effective. + + This patch changes how the unweighted references are coded, so even + weightP outputs change slightly. + [0206822d9fea] + +2014-04-01 Steve Borho + + * source/encoder/frameencoder.cpp: + frameencoder: lower log level of restart indicators + [f8aa296d60c3] + +2014-03-31 Steve Borho + + * source/encoder/slicetype.cpp: + slicetype: nits + [6edc3ed24643] + +2014-04-01 Nabajit Deka + + * source/common/x86/pixel-a.asm: + asm: fix build error caused by usage of 64-bit dependent register in + Win32 versions + [dd189fd26f47] + +2014-03-31 Steve Borho + + * source/encoder/slicetype.cpp: + slicetype: do not assume frames[] array has been initialized + + slicetypeAnalyse() may not have been called, so use picture list[] + instead + [7ce180ca05b3] + +2014-03-31 Min Chen + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/common/pixel.cpp, + source/common/primitives.h, source/common/x86/pixel-util8.asm, + source/test/pixelharness.cpp: + remove macro NEW_CALCRECON + [5d607fd4531f] + +2014-03-31 Murugan Vairavel + + * source/test/pixelharness.cpp, source/test/pixelharness.h: + testbench: code for testing input pixel upShift/downShift primitives + [dc9a6a87db56] + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm, + source/common/x86/pixel.h: + asm: code for input pixel upShift/downShift + [c4ea6cffe2b3] + + * source/Lib/TLibCommon/TComPicYuv.cpp, source/common/pixel.cpp, + source/common/primitives.h: + primitives: added C primitives for upShift/downShift of input pixels + [a30786caa6c3] + +2014-03-31 Selvakumar Nithiyaruban + + * doc/reST/cli.rst, 
source/x265.cpp: + cli: add cli option for dither + [b521f535442a] + + * source/CMakeLists.txt, source/common/common.cpp, + source/common/common.h, source/filters/filters.cpp, + source/filters/filters.h: + dither: port dither related functions from x264 + [106fc00d4eab] + +2014-03-31 Steve Borho + + * source/common/winxp.cpp, source/common/winxp.h: + xp: fix header guards for XP support, fixes MinGW build + + If no _WIN32_WINNT version is specified, MinGW defaults to XP SP3. + Our include guards were checking for <= XP instead of checking for + any version before Vista which was the version that introduced + native CONDITION_VARIABLE support + [ae07405973b7] + + * source/encoder/frameencoder.cpp: + frameencoder: prevent deadlock in non-wpp mode + [f3c97e82ab04] + + * source/CMakeLists.txt: + cmake: pick the old policy for MACOSX_RPATH (only applicable for + cmake 3.0) + [4564298c30f6] + +2014-03-29 Steve Borho + + * source/CMakeLists.txt: + cmake: allow MinGW to target XP by default + + This makes MinGW to use our workaround CONDITION_VARIABLE + implementation but it seems to be on average better than asking + MinGW to compile for Vista. + [6f7b323061dc] + +2014-03-11 Aarthi Thirumalai + + * source/common/wavefront.cpp, source/common/wavefront.h, + source/encoder/cturow.h, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h: + vbv: enable row restarts when mid-frame qp adjustemets are + inadequate + [1410caf09a39] + +2014-03-29 Steve Borho + + * source/CMakeLists.txt, source/common/CMakeLists.txt, + source/common/threading.h, source/common/winxp.cpp, + source/common/winxp.h: + restore WINXP_SUPPORT build option, workaround for + CONDITION_VARIABLE on XP + + This adapts x264's code for an XP-safe pthread_cond_t to make an XP- + safe CONDITION_VARIABLE (which was introduced in Windows Vista) + + x265 will use native CONDITION_VARIABLE unless the WINXP_SUPPORT + cmake option is enabled. 
It forces _WIN32_WINNT=_WIN32_WINNT_VISTA + for MinGW for this purpose. + [3f78e639d9ce] + +2014-03-27 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComLoopFilter.h, + source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/Lib/TLibCommon/TComPicSym.cpp, + source/Lib/TLibCommon/TComPicSym.h, + source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuv.h, + source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncEntropy.cpp, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/param.cpp: + refine CUSize related + + rename to clarify. division to shift. + [5fb4400d76c2] + + * source/Lib/TLibCommon/TComTrQuant.cpp: + sbh: early continue for all zero coeff group + [0705bb47bfcc] + +2014-03-27 Min Chen + + * source/Lib/TLibEncoder/TEncSearch.cpp: + move buffer outside parenthesis + [9b378e860ddb] + +2014-03-28 Steve Borho + + * source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/Lib/TLibCommon/TComYuv.cpp, source/common/common.h, + source/common/pixel.cpp: + common: combine duplicate ClipY and ClipC templates, return pixel + type + + and fix some hungarian prefixes I encountered + [b9a62f4491a7] + +2014-03-27 Steve Borho + + * source/Lib/TLibCommon/TComRom.h, source/common/common.h: + common: move clip templates to common.h + [c2fdc391d226] + + * doc/reST/cli.rst: + rest: fix tpyo + [5ec673a3e635] + + * source/x265.cpp: + cli: add link to documentation to end of `x265 --help` output + [e3b1fb25c1d5] + + * doc/reST/cli.rst: + rest: consistent capitalization of page name + [49528dce611c] + + * doc/reST/cli.rst: + rest: improve --ssim, --psnr, recon, and VUI 
descriptions + [ec97d22b021c] + + * source/Lib/TLibCommon/TComTrQuant.h: + TComTrQuant: fix compiler warning in debug builds + [0692d586def7] + + * doc/reST/cli.rst: + rest: improve rate control documentation + [6a139bbaddb8] + + * doc/reST/cli.rst: + rest: cleanup --input description + [3c366c8973e0] + +2014-03-26 Steve Borho + + * source/CMakeLists.txt, source/encoder/api.cpp, + source/encoder/encoder.cpp, source/encoder/encoder.h, + source/x265.cpp, source/x265.h: + api: change x265_encoder_headers() to return byte count on success + [8ce32bd20ec7] + +2014-03-27 Steve Borho + + * doc/reST/cli.rst: + rest: improve docs for --me and --early-skip + [6e25dfa0cc9e] + +2014-03-26 Steve Borho + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp: + nits + [8e0c2d24fc45] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibEncoder/TEncEntropy.cpp, + source/Lib/TLibEncoder/TEncEntropy.h, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSbac.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/common/common.h, + source/common/vec/dct-sse3.cpp, source/common/vec/dct-sse41.cpp: + replace TCoeff with coeff_t + [ca35d9b58b55] + + * source/Lib/TLibCommon/TypeDef.h, source/common/common.h: + common: move X265_DEPTH definition to common.h + [9fd80b32a24e] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComLoopFilter.h, + source/Lib/TLibCommon/TComPicYuv.h, + source/Lib/TLibCommon/TComPicYuvMD5.cpp, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/Lib/TLibCommon/TypeDef.h, source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, 
source/encoder/compress.cpp: + replace Pel with pixel + [9c0419c1781b] + + * source/Lib/TLibCommon/SEI.h, + source/Lib/TLibCommon/TComBitStream.cpp, + source/Lib/TLibCommon/TComBitStream.h, + source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComLoopFilter.h, + source/Lib/TLibCommon/TComPic.h, source/Lib/TLibCommon/TComPicSym.h, + source/Lib/TLibCommon/TComPicYuv.h, + source/Lib/TLibCommon/TComPicYuvMD5.cpp, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TypeDef.h, source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncSbac.cpp, source/encoder/compress.cpp, + source/encoder/cturow.cpp: + replace UChar with uint8_t + [bf2075b6e879] + +2014-03-24 Steve Borho + + * source/CMakeLists.txt: + cmake: move assembly source into ASM source group in VS + [29efd2537b56] + +2014-03-26 Steve Borho + + * doc/reST/cli.rst: + rest: this sentence was perhaps redundant + [7d37c06c80f4] + + * doc/reST/introduction.rst: + rest: make life slightly more difficult for spambots + [2d41a9d48e3b] + +2014-03-24 Satoshi Nakagawa + + * source/encoder/frameencoder.cpp: + fix chroma lambda weighting + [d38335a9375a] + +2014-03-26 Steve Borho + + * doc/reST/cli.rst: + rest: nits + [e6862130b35b] + +2014-03-25 Steve Borho + + * source/common/param.cpp: + param: ignore leading double-slash in names passed to + x265_param_parse + + It might be ambiguous to API users whether these are required, + looking at our documentation, so ignore them if they do. 
+ [2cda667fd786] + + * doc/reST/cli.rst: + rest: improve --output, add documentation for --repeat-headers + [c82b0f98e3fb] + + * doc/reST/cli.rst: + rest: improve --tu-*-depth, --ref, and --tskip descriptions + [2404a23fb957] + + * doc/reST/cli.rst: + rest: improve the description of --rect --amp + [0de64941eec8] + + * doc/reST/cli.rst: + rest: document the behavior of --subme + [939de245ea5f] + + * doc/reST/introduction.rst: + rest: update licene email address + [82e2254874b1] + + * source/x265.cpp: + cli: remove dead options from getopt long-options list + [9f5870075f72] + + * source/x265.h: + api: fix documented defaults for cuTree and AQ + [f482d57a6ab5] + + * doc/reST/cli.rst: + rest: CRF options are doubles + [455e4c8a5b3d] + + * doc/reST/cli.rst, source/CMakeLists.txt, source/common/param.cpp, + source/encoder/encoder.cpp, source/x265.cpp, source/x265.h: + api: drop VUI options which are not fully implemented + + It is better to not have to document that these are unfinished + [8f770a36875d] + + * doc/reST/cli.rst: + rest: add a couple of missing options, improve descriptions of + others + [02a2a5bd4330] + + * doc/reST/cli.rst: + rest: improve CLI docs, fix some wrong descriptions, moar better + + I've moved --weightp and --refs to the ME section since they have no + bearing on slice decision and lots of impact on motion search. 
Some + new options are still missing, will add those in a later commit + [11d301883894] + + * doc/reST/cli.rst: + rest: large cleanup of CLI opt docs + [8181f224ec27] + + * doc/reST/cli.rst: + rest: fix csv logging descripion, move debug options to the end + [6ece6e038ca8] + + * doc/reST/cli.rst: + rest: use comma to separate short and long options + + This looks better, and fixes cross referencing + [9ee05bffbc68] + + * doc/reST/Makefile, doc/reST/conf.py, doc/reST/index.rst: + rest: add Makefile for generating HTML from reST, cleanup some nits + [c52c2f8c3e32] + + * Merge with stable + [fd5e313eca45] + +2014-03-25 Sagar Kotecha + + * doc/reST/cli.rst, doc/reST/conf.py, doc/reST/index.rst, + doc/reST/introduction.rst: + Add reST script to generate x265 user manual + [a1d9b54e57d2] + +2014-03-25 Steve Borho + + * source/CMakeLists.txt, source/common/common.h, + source/common/param.cpp, source/encoder/api.cpp, + source/encoder/frameencoder.cpp, source/x265.cpp, source/x265.h: + api: add param.bRepeatHeaders - insert stream headers in each + keyframe NAL + + This is apparently useful for raw stream formats + [e739e579609a] + +2014-03-24 Min Chen + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/TEncSbac.cpp: + cleanup on TComTrQuant::getTUEntropyCodingParameters + [4318d47d9348] + + * source/Lib/TLibCommon/TComTrQuant.cpp: + faster sign(X) and N^2 on TComTrQuant::xRateDistOptQuant + [10e614dca6d4] + + * source/Lib/TLibEncoder/TEncSbac.cpp: + improvement by replace SHIFT to MASK_AND + [33617683915d] + + * source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibEncoder/TEncSbac.cpp: + optimize: replace g_groupIdx[] by getGroupIdx() + [b39d26118f09] + + * source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h: + reduce g_minInGroup from uint32_t to uint8_t + [f09130afa3dd] + + * 
source/Lib/TLibEncoder/TEncBinCoderCABAC.cpp: + improvement TEncBinCABAC::writeOut by mask operator and local + variant + [88c66aece128] + + * source/Lib/TLibEncoder/TEncBinCoderCABAC.cpp: + improvement TEncBinCABAC::encodeBin by temporary variant and reduce + AND operator + [5c5cb411263e] + +2014-03-24 Steve Borho + + * source/x265.cpp: + cli: fix missing reference to superfast preset + [aa08b7f2d420] + +2014-03-21 Steve Borho + + * source/Lib/TLibCommon/TComTrQuant.cpp: + TComTrQuant: nits (no change) + [fdd7c6168cf4] + + * source/encoder/frameencoder.cpp, source/encoder/ratecontrol.cpp: + Merge with stable + [07670cfdc215] + +2014-03-21 Aarthi Thirumalai + + * source/encoder/frameencoder.cpp, source/encoder/ratecontrol.cpp: + vbv: fix race condition in processRowEncoder, store row qp directly + in m_pic->m_rowDiagQp. + [21eb4a43e02f] + +2014-03-18 Min Chen + + * source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibCommon/TComTrQuant.cpp: + use mask operator to avoid branch + [79b76dcaacd8] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h: + replace parameters (oneCtx, absCtx) by pointer m_estBitsSbac->.. + [da5f379974c0] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h: + optimize: rewrite TComTrQuant::xGetCodedLevel + [190f1b500219] + +2014-03-20 Steve Borho + + * source/encoder/encoder.cpp: + encoder: auto-disable weightp with 4:4:4 inputs, until it is fixed + [fe3fcd9838c0] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: validate unidirectional MVPs prior to trying MV0 bidir + (closes #34) + + This should fix some rare non-determinism as well as prevent the + crash seen in issue #34. 
[CHANGES OUTPUTS] + [e06f2a068622] + + * source/x265.pc.in: + pkgconfig: allow user-supplied lib folder (closes #40) + [8830c8bf15cb] + +2014-03-19 Steve Borho + + * source/CMakeLists.txt, source/common/common.h, + source/common/param.cpp, source/encoder/encoder.cpp, + source/encoder/encoder.h, source/encoder/frameencoder.cpp, + source/x265.cpp, source/x265.h: + add experimental support for interlaced content (field encoding) + (refs #15) + + * adds param.interlaceMode + * removes VUI params that are now handled automatically + * adds --no-interlace --interlace=tff|bff|prog|false CLI options + * signals interlace source flag, clears progressive source flag + + This initial implementation requires the user to provide fields + (half-height) in the correct temporal order; so not very useful for + exisitng Y4M or YUV interlaced input files. + + When interlacing is enabled, the encoder emits PictureTiming SEI + messages that indicate top or bottom field for the decoder. + [e35c8c03ee9f] + +2014-03-20 Aarthi Thirumalai + + * source/encoder/frameencoder.cpp, source/encoder/ratecontrol.cpp: + vbv: fix race condition in processRowEncoder, store row qp directly + in m_pic->m_rowDiagQp. + [5638c2311653] + +2014-03-19 Deepthi Nandakumar + + * source/Lib/TLibEncoder/TEncCu.cpp: + TEncCU: more QP cleanup for subCUs, inter modes. + [27e0620327e5] + +2014-03-19 Steve Borho + + * source/encoder/motion.cpp: + motion: add missing parens in COST_MV() macro (fixes #41) + [221d8aee9aa1] + + * source/encoder/motion.cpp: + motion: UMH fix to match x264 behavior + [fcb916ec982f] + + * source/input/yuv.cpp, source/input/yuv.h: + yuv: use ThreadSafeInteger to manage ring buffer + [96d1d690d2ab] + + * source/encoder/api.cpp, source/encoder/encoder.cpp: + encoder: keep copy of input parameters (refs #43) + + We do not want to allow the user to change the param used by the + encoder during the encode, nor do we want to suffer from cleanup + ordering issues. 
We can't crash if the user released the param + struct they allocated prior to closing the encoder + [56a382eb1b72] + + * source/input/y4m.cpp, source/input/y4m.h: + y4m: use ThreadSafeInteger to manage ring buffer + [bbf6010764be] + +2014-03-19 Gopu Govindaswamy + + * source/Lib/TLibEncoder/TEncBinCoderCABAC.cpp: + TEncBinCoderCABAC: fix MingGW build "declaration of 'byte' shadows a + global declaration" + [7bb421429129] + +2014-03-19 Steve Borho + + * source/CMakeLists.txt: + cmake: tabs to spaces + [459c96998a62] + + * source/CMakeLists.txt: + cmake: pkgconfig file must be installed in user-specified lib folder + (refs #40) + [ab9c23ebb527] + + * source/common/threading.h, source/encoder/framefilter.cpp: + thread: add an incr() method to ThreadSafeInteger to avoid redundant + locks + [1b57d57ff2d9] + +2014-03-19 Deepthi Nandakumar + + * source/Lib/TLibCommon/TComDataCU.cpp: + TEncCU: further refine qp offsets + [148553629f2a] + + * source/Lib/TLibEncoder/TEncCu.cpp: + TEncCU: all SUbCUs will now copy QP array directly from parent CU, + instead of resetting to qp(0). + [648db5e86622] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h: + TComDataCU: add QP-independent initSubCU + [de4eb4d4bd01] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h: + TComDataCU: add QP-independent initEstData + [d566947b5493] + +2014-03-18 Steve Borho + + * source/CMakeLists.txt: + cmake: icpc is unable to link cli to shared lib + [f58d9c3840a3] + + * source/CMakeLists.txt: + cmake: fix MinGW build by forcing Vista as minimum target O/S (fixes + #44) + + This removes the WINXP_SUPPORT option; XP cannot be supported until + someone contributes a version of ThreadSafeInteger that is + functional on XP + [45af76338c8e] + +2014-03-19 Deepthi Nandakumar + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/encoder/frameencoder.cpp: + TComDataCU: QP for a CU always stored in TComDataCU::m_qp. 
+ [591bceb1b87a] + +2014-03-18 Deepthi Nandakumar + + * source/Lib/TLibEncoder/TEncSearch.cpp: + intraPred: remove qp resets + [d8d41f4cf354] + + * source/encoder/frameencoder.cpp: + frameencoder: clean up slice qp clipping + [9167d2441670] + +2014-03-18 Steve Borho + + * source/CMakeLists.txt: + cmake: remove /Oi build flag for Visual C++ builds + + This flag was actually never necessary for use of SIMD intrinsics, + but it was forcing VC to use intrinsic versions of math functions, + and for some versions of VC this caused odd stack corruption in + Win32 debug builds. + + https://connect.microsoft.com/VisualStudio/feedback/details/776885 + /possible-bug-in-visual-studio-2012-c-compiler-related-to-intrinsic- + math-functions + [2a63de6cbb11] + + * source/CMakeLists.txt: + cmake: make lib and bin install folders user configurable (closes + #40) + [a02a4ba9629e] + + * source/cmake/cmake_uninstall.cmake.in: + cmake: fix uninstall target warnings from shell quote escaping + issues + [6a77b52eddcb] + + * source/CMakeLists.txt: + cmake: link cli app to shared library on POSIX systems (closes #37) + + If you want the CLI to link against the static library on POSIX, + just disable the generation of the shared library. + [f91b930aea7e] + + * source/x265.h: + api: declare x265_cli_csps as static const (closes #39) + + Fixes link errors when x265.h is included by multiple C files. 
+ [dc700298419d] + +2014-03-18 Deepthi Nandakumar + + * source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/encoder.cpp: + encoder, sbac, quant: use only macro QP_BD_OFFSET, remove + unnecessary get methods + [4c8ea265d55e] + + * source/encoder/encoder.cpp: + sps: reuse encoder macro QP_BD_OFFSET + [551956198184] + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/encoder/frameencoder.cpp: + frameencoder: clean up qp clipping + [2952312a52e5] + +2014-03-18 Steve Borho + + * source/encoder/encoder.cpp: + encoder: fix for VERBOSE_RATE (leftover HM feature) (closes #42) + [0ac7e8729d90] + +2014-03-18 Deepthi Nandakumar + + * source/Lib/TLibEncoder/TEncSearch.cpp: + encodeResandCalcInterCU: cleanup, no logic change + [32c9e30aee7a] + +2014-03-16 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComTrQuant.cpp: + DC only for HIGH_BIT_DEPTH + [7b86d42683be] + +2014-03-17 Min Chen + + * source/Lib/TLibCommon/ContextTables.h, + source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibEncoder/TEncBinCoderCABAC.cpp, + source/Lib/TLibEncoder/TEncBinCoderCABAC.h, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSbac.h: + optimize: rewrite TEncBinCABAC::encodeBin + [3bbcf9f8a701] + +2014-03-17 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComPattern.h, + source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/compress.cpp: + cleanup m_pattern + [d600c8f8f036] + +2014-03-16 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComDataCU.h: + cleanup unused m_mvField{A,B,C} + [7dbae9022757] + +2014-03-17 Steve Borho + + * source/encoder/encoder.h: + encoder: cleanup nits + [6627c821ca1f] + + * source/input/y4m.cpp: + y4m: fix copy-paste bug in range checks + [c688c11a0f12] + + * source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/common/threading.h, source/encoder/dpb.cpp, + source/encoder/encoder.cpp, 
source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h, source/encoder/framefilter.cpp: + threading: introduce ThreadSafeInteger class + + This class uses a condition variable to implement a + producer/consumer access protocol with a single writer and multiple + readers for safe multi-core synchronization + [c0155c7bb6ca] + +2014-03-18 Deepthi Nandakumar + + * source/Lib/TLibEncoder/TEncCu.cpp: + TEncCU: fix previous bad patch import + [8dbcfae4dffc] + +2014-03-17 Deepthi Nandakumar + + * source/Lib/TLibEncoder/TEncCu.h: + TEncCU: cleanup + [cf92f28e5f93] + +2014-03-17 Steve Borho + + * source/encoder/frameencoder.cpp, source/encoder/frameencoder.h, + source/encoder/framefilter.cpp: + Merge with stable + [8d5deb7cafd8] + +2014-03-16 Steve Borho + + * source/Lib/TLibCommon/TComSlice.h: + TComSlice: nits + [e7e150e4166d] + +2014-03-14 Min Chen + + * source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSbac.h: + optimize: rewrite TEncSbac::xWriteCoefRemainExGolomb + [b2617cb09a1a] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibEncoder/TEncSbac.cpp: + optimize: improvement TComTrQuant::getSigCtxInc, avoid shift by mask + [9e9bdc0dd2c5] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h: + optimize: rewrite TComTrQuant::xGetICRateCost + [b8460fba2783] + + * source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, source/common/threading.h: + optimize: rewrite TComTrQuant::xGetICRate + [b6954c4f480f] + +2014-03-13 Deepthi Nandakumar + + * source/Lib/TLibEncoder/TEncCu.cpp: + encode: avoid repetitive statements; no logic change + [c1ecc3eb288d] + + * source/Lib/TLibEncoder/TEncCu.cpp, source/encoder/compress.cpp: + compress/TEncCU: no reason why mode decision should reset the dqp + flags. 
+ [6c64fbd96968] + + * source/encoder/encoder.cpp: + encoder: Adding a TODO comment on the final goal. + [b7e392e2b720] + + * source/encoder/encoder.cpp: + vbv: set DQP as true if VBV is enabled (and AQ disabled). + + Unless this is set, the different QP's for each CU wont be encoded. + This worked thankfully until now, since VBV was always used at high + quality (AQ on) settings. + [b82c87d0a896] + + * source/Lib/TLibEncoder/TEncCu.cpp: + TEncCU: set dqpflag as true in the CU encoder if aqmode enabled + [d72b7a5c8176] + +2014-03-17 Steve Borho + + * source/encoder/frameencoder.cpp, source/encoder/frameencoder.h, + source/encoder/framefilter.cpp: + prevent deadlocks from frame dependencies on Linux + [eba8844609f2] + +2014-03-14 Steve Borho + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: mvpIdx shares storage with mergeIdx, do not set for + merge CUs + [ba3ddc1848ff] + +2014-03-14 Wenju He + + * source/Lib/TLibEncoder/TEncCu.cpp: + TEncCu: initialize variables, handle malloc failures more cleanly + [93ea767e7df0] + +2014-03-14 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp, source/common/x86/dct8.asm, + source/common/x86/dct8.h, source/test/mbdstharness.cpp: + asm: 8bpp and 10bpp code for idct8x8 module + [a4cb4fbff864] + +2014-03-14 Yuvaraj Venkatesh + + * source/common/x86/asm-primitives.cpp, source/common/x86/sad16-a.asm: + added asm primitives for 10bpp sad functions + [f36c9130de66] + +2014-03-14 Steve Borho + + * source/encoder/frameencoder.cpp: + Merge with stable + [d5a4296dbfe7] + +2014-03-14 Santhoshini Sekar + + * source/encoder/frameencoder.cpp: + vbv: bugfix-calculate intraCuCostPerRow for vbv + [394481c40cf9] + +2014-03-14 Deepthi Nandakumar + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: init best AMVP candidate to zero. 
+ [ed48f84e541b] + +2014-03-13 Steve Borho + + * source/encoder/motion.cpp: + motion: remove unused file static array + [7b5699e6bb75] + + * source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + TEncSearch: remove redundant temp buffer + + TComPrediction (which TEncSearch derives from) already has + m_predTempYuv + [bb1ecd4f154d] + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + TEncSearch: keep merge data together in a struct + + This is for clarity, convenience, and to avoid some redundant work + [ebf86c054d05] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: combine motion candidate search into AMVP search loop + [32eaed85f7c0] + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + TEncSearch: inline MVP selection from AMVP candidates + [81911e5df59c] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: cleanup xGetBlkBits + [dc4af8a48dc8] + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + TEncSearch: inline xRestrictBipredMergeCand + [80952375aff1] + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + TEncSearch: inline xGetTemplateCost() + [387471b8ec63] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: be pedantic about setting cu fields [CHANGES OUTPUTS] + + It is somewhat unfortunate that this changes outputs, it means I had + introduced a bug in an earlier commit + [ebb3a25a7ad8] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: recombine merge-only path, now it can be accomplished + with continue + [879151f65962] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: remove redundant EMMS instrutions + + predInterSearch uses no float operations, so a single EMMS at the + end is sufficient + [675837ac633f] + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + TEncSearch: merge 
xGetInterPredictionError into xMergeEstimation + [3a2f801dd535] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: refactor predInterSearch to avoid redundant work + [a166b8d0a43e] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: switch xGetInterPredictionError() to use SATD [CHANGES + OUTPUTS] + + predInterSearch() was having to remeasure the unidir/bidir + prediction just because it wanted to measure merge with sa8d. By + switching to satd, a number of further simplifications can be made. + [ea5f4ab96610] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + Backed out changeset: 524fdbaed8d0 + + bits0 and bits1 include list[N].bits and thus they did include the + list selection bit costs. + [716dc1c8391a] + +2014-03-13 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TComYuv.h, + source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/shortyuv.cpp, + source/common/shortyuv.h, source/encoder/compress.cpp: + remove unused trUnitIdx from subtract() and addClip() + [deb3d531790f] + +2014-03-13 Deepthi Nandakumar + + * source/encoder/frameencoder.cpp: + frameencoder: set QP to all parts of CU. No logic change. + [665355c7e4d9] + +2014-03-12 Min Chen + + * source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSbac.h: + cleanup: remove c++ ref parameter on xWriteCoefRemainExGolomb + [47edf909f518] + +2014-03-12 Steve Borho + + * source/Lib/TLibCommon/TComPicYuv.cpp: + TComPicYuv: disambiguate variable names + + Now with 75% less insanity! 
+ [5328eec59554] + +2014-03-12 Min Chen + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibEncoder/TEncSbac.cpp: + optimize: reduce memory and improvement performance by replace + sigCoeffGroupFlag[] to sigCoeffGroupFlag64 + [68601cdea577] + + * source/Lib/TLibEncoder/TEncSbac.cpp: + optimize: simplify operators on loop sigCoeffGroupFlag + [d3e218ecc33f] + +2014-03-12 Steve Borho + + * source/Lib/TLibCommon/TypeDef.h: + TypeDef: explicit 32bit type for TCoeff + [892b8325ed6c] + + * source/Lib/TLibCommon/TypeDef.h: + TypeDef: UChar to uint8_t + [661f6a49b956] + + * source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h: + TComRom: UChar to uint8_t + [f88f6543ca26] + +2014-03-12 Min Chen + + * source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h: + optimize: use UChar on g_convertToBit to avoid signed extend + [41ea3d34f224] + + * source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/TEncSbac.cpp: + optimize: reduce scan table size by uint32_t -> uint16_t + [aedad0264282] + +2014-03-12 Steve Borho + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + TEncSearch: replace UChar with uint8_t + [999e3120deda] + + * source/Lib/TLibEncoder/TEncSearch.h: + TEncSearch: nit + [9e554cec74ab] + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + TEncSearch: use HEVC MVP candidates as motion candidates [CHANGES + OUTPUTS] + [99441b4af293] + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/encoder/encoder.cpp, + source/encoder/encoder.h, source/encoder/frameencoder.cpp: + TEncSearch: remove unused adaptive search range array + [87ad1232188f] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: MV0 bidir bitcost not based on list costs, so no need to + remove them + [524fdbaed8d0] + + * source/Lib/TLibEncoder/TEncSearch.cpp: 
+ TEncSearch: refactor predInterSearch to use simple structure + + This removes the need for keep every possible MVP and MVP idx + around, and it hopefully makes the code a lot more readable. + [7c8b9445bec6] + +2014-03-12 Deepthi Nandakumar + + * Merge + [4fdcea7426f1] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibEncoder/TEncCu.cpp: + TComDataCU: cleanup + [cc3d77015d1e] + + * source/Lib/TLibEncoder/TEncEntropy.cpp, + source/Lib/TLibEncoder/TEncEntropy.h: + encodeQP: cleanup + [94d3dfe14558] + +2014-03-12 Steve Borho + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: initialize MotionEstimate function pointers before merge + estimation + [8a69cfea0304] + +2014-03-11 Steve Borho + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/common/common.h, + source/encoder/api.cpp: + common: remove mostly unused cycle count macros + [e4cf59adfb84] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: break ties in favor of uni-directional MC [CHANGES + OUTPUTS] + + If bidir has the same cost as unidir, pick the unidir prediction. + Bidir will average the two directional prediction and thus tends to + generate blurred predictions. Uni-directional prediction is better + when costs are the equal. 
+ [c24eda418b5c] + + * source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/common/common.cpp, source/common/common.h, + source/encoder/encoder.cpp, source/encoder/ratecontrol.cpp, + source/encoder/ratecontrol.h: + Merge with stable + [bbd35c26c4e4] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: dehungarians and other nits + [83d649aaf7ce] + +2014-03-10 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/TEncSbac.cpp: + revert getSigCtxInc() to 0.7, this version was faster + [87638be9bc6b] + +2014-03-11 Aarthi Thirumalai + + * source/encoder/ratecontrol.cpp: + vbv: bug fix - down shifting of satdcost for 10 bit depth. + [c06ee069df5e] + + * source/common/common.cpp, source/common/common.h: + rc: move qp2qscale and qscale2qp function to common + [561adb9b21f6] + + * source/encoder/encoder.cpp, source/encoder/ratecontrol.cpp, + source/encoder/ratecontrol.h: + vbv: bug fixes - move row Predictors, frameSizeEstimated to Frame + Encoder scope. + + Improves quality and controls buffer underflow when Frame + parallelism is enabled. + [1b84d0748a08] + + * source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/encoder/ratecontrol.cpp: + vbv: use row wise IntraSatdCost to predict vbv intra frame bits. + + If P/B slice is predominatly intra coded, use intra SatdCost to + predict the row bits in vbv. + [380c4bf5ff75] + + * source/encoder/ratecontrol.cpp: + vbv: bug fix - down shifting of satdcost for 10 bit depth. 
+ [7832cb8323dd] + +2014-03-10 Steve Borho + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: fix some comments + [e40524d079b4] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: reorder code for more clarity + [504c81462e30] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: host bMergeOnly logic to the top of predInterSearch + + This makes merge-only easier to follow and more efficient, and makes + the full search code easier to read and easier to maintain. This + should have no change in outputs + [c3156f2b7c97] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: localize scope of bidir MV and ref idx arrays + [1f1aa83d20ed] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: hoist bMergeOnly check to only be performed once + [0574550353e6] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: remove list1 hacks + [0d4182301313] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: localize scope, rename, and remove defaults for + listSelBits array + [85e8a21a261b] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: remove mvTemp[][] array, just use outmv stack var + [d546b400d156] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: localize scope and rename listMv + [f91dc85070fe] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: localize scope of mvzero + [de73109c06d5] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: localize scope and rename listRefIdx + [dc977fbf3ed1] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: localize scope of costTemp + [132393836a2e] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: simplify and localize scope of bitstemp + [c095646a2ef0] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: rename bits to listbits[] for clarity + [7495c029daf6] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: replace costbi with listCost[2] + [12841ab3ce3f] + + * source/Lib/TLibEncoder/TEncSearch.cpp, + 
source/Lib/TLibEncoder/TEncSearch.h: + TEncSearch: remove useless distBiP argument from xEstimateMvPredAMVP + [621f9c82eb14] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: cleanups in predInterSearch(), no logic change + [275edc95de11] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, source/encoder/encoder.cpp, + source/encoder/encoder.h, source/encoder/frameencoder.cpp, + source/encoder/weightPrediction.cpp: + weight: match x264 weight analysis and logging [OUTPUT CHANGE] + + This only weights the first reference, and will only weight chroma + if luma was weighted. Further work remains, we must add an + unweighted dup of the weighted reference into the P slice's L0 + reference list. + [f3988b70f4aa] + +2014-03-10 Murugan Vairavel + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/common/shortyuv.cpp, + source/common/shortyuv.h: + shortyuv: integrated asm primitives for blockcopy + [1417ba917211] + +2014-03-11 Aarthi Thirumalai + + * source/common/common.cpp, source/common/common.h, + source/encoder/ratecontrol.cpp: + rc: move qp2qscale and qscale2qp function to common + [5fb584576fc9] + + * source/encoder/encoder.cpp, source/encoder/ratecontrol.cpp, + source/encoder/ratecontrol.h: + vbv: bug fixes - move row Predictors, frameSizeEstimated to Frame + Encoder scope. + + Improves quality and controls buffer underflow when Frame + parallelism is enabled. + [925b4b3a01c0] + + * source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/encoder/ratecontrol.cpp: + vbv: use row wise IntraSatdCost to predict vbv intra frame bits. + + If P/B slice is predominatly intra coded, use intra SatdCost to + predict the row bits in vbv. 
+ [092bc3753097] + +2014-03-10 Gopu Govindaswamy + + * source/Lib/TLibCommon/TComMotionInfo.h: + tcommotion: cleanup removed unused methods + [37a33cbf5b57] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComMotionInfo.cpp, + source/Lib/TLibCommon/TComMotionInfo.h: + tcommotion: use Checked malloc to allocate memory + [728df17d1e5f] + + * source/Lib/TLibEncoder/TEncSbac.cpp: + tencsbac: remove unused SCALING_LIST_OUTPUT_RESULT macro + [9c57962437b6] + + * source/Lib/TLibEncoder/TEncSbac.cpp: + tencsbac: subLayerOrderingInfoPresentFlag always true, the if() loop + will never executed + + vps_sub_layer_ordering_info_present_flag is always set to true, not + required to have extra storage for this, removed + subLayerOrderingInfoPresentFlag + [752bdc470b06] + +2014-03-10 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TComYuv.h, + source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/encoder/compress.cpp: + cleanup unused intra !bLumaOnly path + [ee6e6fd867eb] + +2014-03-10 Steve Borho + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: fix bMergeOnly override check, did not intend to change + behavior + [505d4a83704a] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: fix gcc warning, remove redundant assignemnt + [f76a92a2b00b] + +2014-03-09 Steve Borho + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + TEncSearch: rename bUseMRG to bMergeOnly for clarity + [50d7910ddd61] + + * source/Lib/TLibEncoder/TEncCu.cpp: + TEncCu: disable TMVP range check if frame threading disabled, code + style nits + [82a66870fc8b] + + * source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/encoder/compress.cpp: + TEncSearch: remove default argument from encodeResAndCalcRdInterCU() + [74d4709422f6] + + * 
source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/encoder/compress.cpp: + TEncSearch: remove confusing default arguments from + predInterSearch() + + And completely remove bLuma which was always true + [f8926fb14c42] + +2014-03-08 Steve Borho + + * source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/encoder/compress.cpp: + me: skip TMVP merge candidates that are beyond vertical search range + + Because of frame parallelism, pixels more than search range below + coincident may not be available in references. Sometimes these merge + candidates happened to be the best cost, resulting in hash + mismatches. + + Because xMergeEstimation() can now return with no valid merge modes, + the caller must deal with the potential of no merge mode. + predInterSearch() is sometimes called to only measure merge modes + (skipping ME for AMP partitions for some presets), so now + predInterSearch() can also possibly exit without generating a valid + prediction. 
+ [f29f0ba91dfa] + +2014-03-09 Steve Borho + + * source/Lib/TLibEncoder/TEncCu.cpp, source/encoder/compress.cpp: + TEncCu: nits, no logic change + [1ff43004128f] + +2014-03-07 David T Yuen + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/encoder/compress.cpp, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h, source/encoder/framefilter.cpp, + source/encoder/framefilter.h, source/encoder/ratecontrol.cpp, + source/encoder/ratecontrol.h, source/encoder/slicetype.cpp, + source/encoder/slicetype.h: + Replaced Encoder* with x265_param* as member of several classes + + Replaced Encoder* with x265_param* as member of TEncCu, framefilter, + ratecontrol, CostEstimate and Lookahead Added + m_CUTransquantBypassFlagValue to TEncCu since it's part of Encoder + and not x265_param framefilter's processRow and processRowPost + needed Encoder* to run Encoder::signalReconRowCompleted + [0bd2465e3d4a] + + * source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/common/lowres.cpp, source/common/lowres.h, + source/encoder/dpb.cpp, source/encoder/dpb.h, + source/encoder/encoder.cpp, source/encoder/slicetype.cpp: + Removed Encoder* as member of DPB and replaced it with field + m_bOpenGOP + + Also added int bframes to Lowres since only param->bframes is passed + to its methods thus eliminating one of DPB's uses of Encoder* + [13e2c2fe091d] + +2014-03-08 Satoshi Nakagawa + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/common/pixel.cpp, + source/common/primitives.h, source/common/x86/pixel-util8.asm, + source/test/pixelharness.cpp: + cleanup m_sharedPredTransformSkip[] + + NEW_CALCRECON macro is TODO mark for asm experts, to optimize + register assignment. 
+ [d3bfe4152e67] + +2014-03-07 Steve Borho + + * source/encoder/encoder.cpp: + Merge with stable + [93861c42b879] + + * source/encoder/encoder.cpp: + encoder: back-port VUI content fixes from default branch + + Take values from param structure, which is where these fields are + actually configured. + [b7e2854cf1c4] + +2014-03-07 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm, source/common/x86/blockcopy8.h: + asm: added code for blockcopy_ss and cleaned up asm primitives of + blockcopy + [2bf727dca27d] + +2014-03-06 Steve Borho + + * source/Lib/TLibEncoder/TEncSbac.cpp: + TEncSBac: remove unused define + [33b67a53b6de] + + * source/Lib/TLibCommon/TComSlice.h: + TComSlice: nits + [d049b579ff18] + +2014-03-06 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h: + share m_mvpIdx[0] as m_mergeIdx + [51f8673f8b35] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/Lib/TLibCommon/TComPicSym.cpp, + source/Lib/TLibCommon/TComPicSym.h, + source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncEntropy.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/param.cpp, + source/encoder/compress.cpp, source/encoder/cturow.cpp, + source/encoder/encoder.cpp, source/encoder/frameencoder.cpp, + source/encoder/framefilter.cpp, source/encoder/reference.cpp: + CU is square + + unify TComDataCU::m_width and m_height to m_cuSize + [5c502cce2a1f] + +2014-03-06 
Steve Borho + + * source/common/param.cpp: + param: print vbv-init as a float + [7209562f4cc8] + +2014-03-06 Murugan Vairavel + + * source/test/pixelharness.cpp, source/test/pixelharness.h: + Testbench code for blockcopy_ss + [d46917d3ac2d] + + * source/common/pixel.cpp, source/common/primitives.h: + C primitive for blockcopy_ss + [c078ccf260b2] + +2014-03-06 Yuvaraj Venkatesh + + * source/common/x86/intrapred16.asm, source/common/x86/intrapred8.asm, + source/common/x86/ipfilter16.asm, source/common/x86/ipfilter8.asm, + source/common/x86/mc-a.asm, source/common/x86/pixel-util8.asm, + source/common/x86/pixeladd8.asm, source/common/x86/sad-a.asm, + source/common/x86/ssd-a.asm: + cleanup the labels in assembly code with trailing colon + [c97fc1244d0b] + +2014-03-06 Steve Borho + + * source/Lib/TLibCommon/ContextTables.h, source/Lib/TLibCommon/SEI.h, + source/Lib/TLibCommon/TComBitStream.cpp, + source/Lib/TLibCommon/TComBitStream.h, + source/Lib/TLibCommon/TComMotionInfo.cpp, + source/Lib/TLibCommon/TComMotionInfo.h, + source/Lib/TLibCommon/TComPattern.h, + source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComRdCost.h, + source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/NALwrite.cpp, + source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/common.cpp, + source/common/common.h, source/common/cpu.cpp, source/common/cpu.h, + source/common/dct.cpp, source/common/intrapred.cpp, + source/common/ipfilter.cpp, source/common/md5.cpp, + source/common/md5.h, source/common/mv.h, source/common/param.cpp, + source/common/primitives.cpp, source/common/primitives.h, + source/common/shortyuv.cpp, 
source/common/threading.h, + source/common/threadpool.cpp, source/common/threadpool.h, + source/common/vec/blockcopy-sse3.cpp, source/common/vec/dct- + sse3.cpp, source/common/vec/dct-sse41.cpp, source/common/vec/dct- + ssse3.cpp, source/common/wavefront.cpp, source/common/wavefront.h, + source/encoder/bitcost.cpp, source/encoder/bitcost.h, + source/encoder/compress.cpp, source/encoder/encoder.cpp, + source/encoder/frameencoder.cpp, source/encoder/motion.cpp, + source/encoder/ratecontrol.cpp, source/input/input.cpp, + source/input/y4m.cpp, source/input/yuv.cpp, + source/output/output.cpp, source/output/y4m.cpp, + source/output/yuv.cpp, source/output/yuv.h, + source/test/intrapredharness.cpp, source/test/ipfilterharness.cpp, + source/test/mbdstharness.cpp, source/test/pixelharness.cpp, + source/test/testbench.cpp, source/test/testharness.h, + source/test/testpool.cpp, source/x265.cpp: + common: consolidate system header includes into common.h + + Hopefully this will prevent random compile failures in the future + from missed cmath, climits, or memory.h includes. 
+ [ef83ccb3d34d] + + * source/common/param.cpp: + param: spelling nit + [102ad4518b1c] + + * source/common/param.cpp, source/x265.cpp: + clarify valid values for --vbv-init + [889edfd2c4c3] + +2014-03-05 Steve Borho + + * source/Lib/TLibCommon/TComSlice.cpp: + TComSlice: replace magic numbers with appropriate macro + [a1d82df4d45e] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h: + TComSlice: remove unused checkNumPocTotalCurr argument to + setRefPicList() + [a35934b41412] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h: + TComSlice: remove unused checkCRA function + [f1f5303613fc] + + * source/encoder/dpb.cpp: + dpb: remove commented code + [1b8ebfd190a4] + + * source/common/param.cpp, source/x265.cpp, source/x265.h: + cli: add --crf-max option for param.rc.rfConstantMax + [ba92d0695116] + + * source/x265.cpp: + cli: show compiled assembly status and detected SIMD arches on x265 + --version + [37309f09aaa1] + + * source/x265.cpp: + cli: invalid preset or tune is a non-recoverabe error + [228c0004d401] + + * source/x265.cpp: + cli: show condensed error message if no arguments are given + [105fe9b2d189] + + * source/x265.cpp: + cli: reorder help for better clarity + [64e76dcaf172] + + * source/x265.cpp: + cli: add --no- options to some VUI fields + + This is a little pedantic since they default to false and likely + won't change at any time. + [3ffbee76c1f2] + + * source/x265.cpp: + cli: fix AQ CLI help. 0, 1, and 2 are valid options + [bbaab3feef92] + + * source/x265.cpp: + cli: improve help text by describing command argument data types + [f1dd1c83f3c6] + + * source/x265.cpp: + cli: improve CLI help for --overscan + [f38f72976bcd] + + * source/common/param.cpp, source/common/param.h, source/x265.cpp: + cli: change --log option to --log-level, allow strings or ints + + getopt allows partial matches, so --log N will still work like it + always has. 
x265_parse_param() will accept "log" or "log-level" + + Since the first entry in the name list corresponds to -1, the string + list is not placed in x265.h, where all indices are 0 based. + [50a04c4cf0ff] + + * source/x265.cpp: + cli: move "fps" between resolution and frame rate, same as input + file line + [01547efd182d] + + * source/common/param.cpp: + param: add VBV info to log data + [081efde65d54] + + * source/CMakeLists.txt: + cmake: use x86 alias list rather than repeated string compares (nit) + [e7d3835fe752] + +2014-03-05 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp: + primitives: cleaned up asm_primitives of intra_pred_ang + [3272416ac9b0] + +2014-03-05 Steve Borho + + * source/encoder/encoder.cpp, source/output/y4m.cpp, + source/output/yuv.cpp: + encoder: assign internal colorspace to output pic.colorSpace + [992b1b9e5cff] + + * source/CMakeLists.txt, source/Lib/TLibCommon/TComPicYuv.cpp, + source/encoder/encoder.cpp, source/input/y4m.cpp, + source/input/yuv.cpp, source/output/y4m.cpp, source/output/yuv.cpp, + source/x265.h: + api: change meaning of pic.stride to be in bytes rather than pixels + (fixes #35) + + x264's pic.plane pointer is a uint8_t* so their input strides are + byte based, ffmpeg is currently assuming our input strides are byte + based. This commit will make that assumption correct. 
+ + This fixes non-4:2:0 YUV file read at the same time, and bumps + X265_BUILD + [eadec14402d6] + + * source/CMakeLists.txt: + cmake: add i686 to x86 alias list (closes #33) + [6d55869ed5e2] + +2014-03-05 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp: + asm: enable 10bit chroma_vpp and chroma_vps interpolation filters + [c40c379f283c] + +2014-03-04 David T Yuen + + * source/encoder/encoder.cpp, source/encoder/encoder.h, + source/encoder/framefilter.cpp: + Removed Encoder::m_csp and replaced it with x265_param::internalCsp + [91936aab5ae9] + +2014-03-04 Steve Borho + + * source/encoder/weightPrediction.cpp: + weight: ensure chroma weights are initialized when skipped + [5cad3652bee8] + + * source/common/primitives.cpp: + primitives: prevent multiple setup calls from reiniting primitive + table + + We don't want the function table to change in the middle of an + encode. Improve logging so that --asm which does not + result in any bitmap matches still reports "none!" + [1a3d37f0a072] + + * source/encoder/weightPrediction.cpp: + weight: simplify and remove redundant logic + [92c2f95b230b] + + * source/encoder/weightPrediction.cpp: + weight: do not attempt chroma weights if luma was not weighted + [CHANGES OUTPUT] + + This heuristic seems to improve a number of clips, particularly when + chroma distortion is not considered in mode decision. This could + cause some odd colors to be encoded, particularly noticeable with + large blocks. + [864ed3e0d82e] + + * source/encoder/reference.cpp, source/encoder/slicetype.cpp, + source/encoder/weightPrediction.cpp: + weight: use correct round value for denom 0 [CHANGES OUTPUT] + + The denom -without the intermediate bit-depth factor applied- must + be compared with zero, else the output of weight_pp will not match + the real output of pixel weighting. 
This was done wrong in two + different ways in three different places + [cc64612d8e5e] + + * source/CMakeLists.txt: + cmake: bump X265_BUILD to 8, many API changes + + These changes should be mostly harmless, I don't expect many API + users have used the VUI fields yet, so a recompile against the new + x265.h should be sufficient. + [1bd1e6dbce28] + + * source/CMakeLists.txt, source/dllmain.cpp: + cmake: dllmain is no longer necessary since use of object libraries + [6a6bf84ad146] + + * source/x265.cpp: + cli: do not call x265_setup_primitives() from printVersion() + + It now leads to redundant console output and it was interfering with + --asm command line argument functionality + [5fd56621f2fe] + + * source/common/param.cpp, source/encoder/api.cpp, source/x265.cpp, + source/x265.h: + api: move cpuid to x265_param, same as x264 + [e0bdcfc9e20b] + + * source/common/common.cpp, source/common/common.h, + source/encoder/encoder.cpp, source/x265.cpp, source/x265.h: + api: move x265_ssim() to be an internal function, give a more + descriptive name + + This function really shouldn't have been a public function, and it + was never added to the exports list. 
+ [8e1c69c0d4f1] + + * source/common/param.cpp, source/encoder/encoder.cpp, + source/x265.cpp, source/x265.h: + api: move VUI settings into vui sub-struct of x265_param + [3825b0059d6c] + + * source/encoder/CMakeLists.txt: + cmake: TEncCfg.h is caput + [58bbbdd0a699] + +2014-03-03 David T Yuen + + * source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibEncoder/SEIwrite.cpp, + source/Lib/TLibEncoder/TEncCfg.h, source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/common/param.cpp, + source/encoder/api.cpp, source/encoder/compress.cpp, + source/encoder/cturow.cpp, source/encoder/dpb.cpp, + source/encoder/dpb.h, source/encoder/encoder.cpp, + source/encoder/encoder.h, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h, source/encoder/framefilter.cpp, + source/encoder/framefilter.h, source/encoder/ratecontrol.cpp, + source/encoder/ratecontrol.h, source/encoder/slicetype.cpp, + source/encoder/slicetype.h, source/x265.cpp, source/x265.h: + Merged TEncCfg into Encoder. A few VUI tweaks and fixes. 
+ [1020e2ac9890] + +2014-03-04 Steve Borho + + * Merge with stable + [31731a78d994] + + * .hgtags: + Added tag 0.8 for changeset 527d03c56d68 + [91ada5e78ee1] + +2014-03-03 Steve Borho + + * source/CMakeLists.txt, source/common/CMakeLists.txt, + source/common/param.cpp: + cmake: detect strtok_r, use workaround when not present + [3cbde0b893e3] + + * source/encoder/frameencoder.cpp, source/encoder/slicetype.cpp: + Merge with stable + [7fbecd22a0cd] + +2014-03-03 Gopu Govindaswamy + + * source/Lib/TLibCommon/TComSampleAdaptiveOffset.h, + source/encoder/framefilter.cpp: + tcomsao: only get method is unused, set lcuboundary is required + + references 742641e0f796 + [62fe3f905981] + + * source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.h: + tcomsao: replaced All Pel type into pixel type + [f3125b431ee5] + + * source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.h: + tcomsao: use X265_MALLOC and X265_FREE in create() and Pel replaced + with pixel + [459b108299e0] + + * source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.h, + source/encoder/framefilter.cpp: + tcomsaq: remove unused get and set saolcuboundary methods + [776983eda6ba] + + * source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp: + tencsao: Pel replaced with pixel type + [0a2fcf8690b8] + + * source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp: + tencsao: new and delete replaced with X265_MALLOC and X265_FREE + [02f953896795] + + * source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.h: + tencsao: remove unused calcSaoStatsBlock + [2de527d2826d] + +2014-03-03 Steve Borho + + * source/Lib/TLibEncoder/TEncEntropy.cpp: + TEncEntropy: fix gcc warnings + [e34503ff6627] + +2014-03-03 Satoshi Nakagawa + + * source/Lib/TLibCommon/CommonDef.h, + source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + 
source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/TEncEntropy.cpp, + source/Lib/TLibEncoder/TEncEntropy.h, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSbac.h, + source/Lib/TLibEncoder/TEncSearch.cpp: + residual coding unit is always square + [684781cad358] + +2014-03-03 Nabajit Deka + + * source/test/ipfilterharness.cpp, source/test/ipfilterharness.h: + test bench : Modify chroma_p2s test function to handle csp. + [eb7eb10568fb] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter8.asm, source/common/x86/ipfilter8.h: + asm : remove chroma_p2s_i444, can be replaced by luma_p2s + [e686e589f3ca] + +2014-03-04 Aarthi Thirumalai + + * source/encoder/frameencoder.cpp, source/encoder/slicetype.cpp: + vbv: bug fixes in --no-cutree flow. Ignore intracosts in calculating + satdcost per cu. + [527d03c56d68] [0.8] + +2014-03-03 Nabajit Deka + + * source/test/mbdstharness.cpp: + test bench : fix for test bench failure, caused by redundant malloc. 
+ [6662df480e39] + +2014-03-02 Steve Borho + + * source/common/shortyuv.cpp: + shortyuv: use optimized primitives where available + [288a83d7e289] + + * source/common/shortyuv.cpp, source/common/shortyuv.h: + shortyuv: combine add/subtract methods together + [ab05e3d951e1] + + * source/Lib/TLibCommon/TComPicYuv.h, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/Lib/TLibCommon/TComWeightPrediction.h, + source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TComYuv.h, + source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/common/shortyuv.cpp, + source/common/shortyuv.h, source/encoder/compress.cpp: + rename TShortYuv to ShortYuv + [0352ee7f3c5a] + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/common/shortyuv.cpp, source/common/shortyuv.h, + source/encoder/cturow.cpp: + use checked malloc in TShortYuv::create and TEncCu::create + + And replace unsigned int with uint32_t in TShortYuv.cpp + [a9bbdfaf2a59] + + * source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/common/CMakeLists.txt, source/common/TShortYUV.cpp, + source/common/TShortYUV.h, source/common/shortyuv.cpp, + source/common/shortyuv.h: + rename TShortYUV.* to shortyuv.* + [5f2d25407800] + + * source/common/param.cpp: + param: ignore assignments within conditionals in param.cpp + [54e2dcf770c4] + + * source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComPattern.h: + TComPattern: remove hungarian prefixes + [5a7c6e8536ac] + + * source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TComYuv.h: + TComYuv: switch from Pel to pixel type + [47a0575aad04] + + * source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TComYuv.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + 
source/Lib/TLibEncoder/TEncSearch.h, source/encoder/cturow.cpp, + source/encoder/cturow.h, source/encoder/encoder.cpp, + source/encoder/frameencoder.cpp, source/encoder/frameencoder.h: + used checked malloc in TComYuv and TEncSearch, cleanly catch malloc + failure + [f6ae34250453] + + * source/test/intrapredharness.cpp, source/test/ipfilterharness.cpp, + source/test/mbdstharness.cpp, source/test/pixelharness.cpp: + testbench: use CHECKED_MALLOC to cleanup init functions + [d5269597c860] + + * source/common/param.cpp: + param: MSVC build fixes + [fad6fba7cf2c] + +2014-03-01 Steve Borho + + * source/common/param.cpp, source/x265.cpp, source/x265.h: + api: add a fastdecode tune option, improve tune and preset CLI help + + To match x264's tune names, removed "-" from zerolatency in x265.h + but allowed either string to match internally. Same with fast- + decode. + [dc82d1805136] + + * source/test/testbench.cpp: + testbench: improve CLI help and error handling, use parseCpuName + [46a799d747ae] + + * source/common/primitives.cpp: + primitives: update CPU name print logic from x264 + [93ba16e4f140] + + * source/common/param.cpp, source/x265.cpp: + param: fix handling of --no-scenecut within x265_param_parse(), add + --no-b-adapt + [fcbe9a361b5a] + + * source/common/common.h, source/common/param.cpp, + source/common/param.h, source/common/primitives.cpp, + source/x265.cpp: + cli: replace --cpuid with --[no-]asm arguments + + The vector class library is long gone, so it is past-due to adopt + x264's logic for specifying CPU architectures, or disabling ASM + altogether. + [26bd96a193d4] + +2014-02-28 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TComYuv.h, + source/Lib/TLibEncoder/TEncCu.cpp, source/encoder/compress.cpp: + copy m_origYuv[depth] from m_origYuv[0] + [ac6edd2ffb65] + +2014-03-02 Deepthi Nandakumar + + * source/encoder/weightPrediction.cpp: + weightp: implicit cast warning on vc10-x86. + + Is lambda intended to be int? 
+ [4f2cbc5a981a] + +2014-02-28 Praveen Tiwari + + * source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.h, + source/common/CMakeLists.txt, source/common/loopfilter.cpp, + source/common/primitives.cpp, source/common/primitives.h, + source/common/x86/asm-primitives.cpp, + source/common/x86/loopfilter.asm, source/common/x86/loopfilter.h, + source/test/pixelharness.cpp, source/test/pixelharness.h: + asm: split SAO_EO_0 into separate primitive func + + added assembly code and testbench support added loopfilter.cpp, + loopfilter.h, loopfilter.asm files + [000f86d72337] + +2014-02-28 Kavitha Sampath + + * source/common/CMakeLists.txt, source/common/bitstream.h, + source/encoder/weightPrediction.cpp: + weightp: use struct to cache data for reuse, refactor MC of + reference planes + + * do not consider intra/mv cost during MC phase + * unconditionally motion-compensate luma and chroma blocks + * include slice header cost estimate in weight analysis + * weightCost() needed different paths for luma, chroma, and chroma444 + * pass a single stride to weightCost() + [518313140b03] + +2014-02-28 Steve Borho + + * source/common/param.cpp, source/encoder/encoder.cpp, source/x265.h: + Merge with stable + [5e9559d366b3] + +2014-02-28 Gopu Govindaswamy + + * source/Lib/TLibCommon/TComPicYuv.cpp: + tcompicyuv: initialize NULL to cu and bu offset buffers. 
+ [f6d079ad85bc] + +2014-02-27 Steve Borho + + * source/common/common.cpp, source/common/param.cpp, + source/encoder/encoder.cpp, source/encoder/weightPrediction.cpp, + source/x265.h: + api: make log-level 4 semi-official and expose in public API + [994f046a8111] + +2014-02-28 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibEncoder/TEncSearch.cpp: + square transform only + [9b43c262124e] + + * source/common/x86/asm-primitives.cpp: + asm: enable count_nonzero for HIGH_BIT_DEPTH + [df831b319c08] + +2014-02-27 Steve Borho + + * source/common/param.cpp: + tune: ensure lookahead is disabled for zero-latency + [61c752e11424] + + * source/common/primitives.cpp, source/common/x86/asm-primitives.cpp: + asm: cleanup 444 chroma primitive setup, and other primitve reuse + [8189f9e9a39f] + + * source/encoder/encoder.cpp, source/encoder/slicetype.cpp, + source/encoder/slicetype.h: + log: report consecutive B-frame histogram, as x264 does + [b70a88e210f1] + + * source/common/common.h, source/x265.h: + api: expose X265_BFRAME_MAX to the user + [83859780b174] + + * Merge with stable + [a892e66d5738] + +2014-02-27 Sagar Kotecha + + * source/common/common.h, source/common/param.cpp, source/x265.cpp: + param: add more validation checks to prevent encoder crashes + [013589124615] + +2014-02-27 Nabajit Deka + + * source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter16.asm, source/common/x86/ipfilter8.h: + asm: 10bpp code for vertical luma interpolation filters. + [89a2d8780835] + +2014-02-27 Aarthi Thirumalai + + * source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h: + vbv: bug fixes to stop mid frame qp adjustments to increase + erratically. + + maintain bufferFill for each FrameEncoder context and use that in + rowWise qp adjustments for vbv. also fixes bugs in predicting row + wise bits from satd costs. 
+ [875566aed8b1] + +2014-02-27 Steve Borho + + * Merge with stable + [8a84c10e5116] + + * source/common/threading.h: + threading: on linux gettimeofday() requires <sys/time.h>, UINT32_MAX + is C only + [bbe3f479c41f] + +2014-02-27 Min Chen + + * source/CMakeLists.txt: + fix Issue #30, disable stack execute on gcc + [8e8328cad1be] + + * source/common/x86/ipfilter16.asm: + fix typo problem on HIGH_BIT_DEPTH with non-PIC mode + [52a47362c5c3] + +2014-02-27 Praveen Tiwari + + * source/common/x86/intrapred8.asm: + all_angs_pred_32x32, asm code improvement + [63aadc802f18] + +2014-02-27 Deepthi Nandakumar + + * Merge from stable + [b92f341688c4] + + * source/encoder/ratecontrol.cpp: + ratecontrol: fix bitrate issue in 10-bit CRF mode + [2fb85daef8af] + +2014-02-27 Steve Borho + + * source/Lib/TLibEncoder/WeightPredAnalysis.cpp, + source/Lib/TLibEncoder/WeightPredAnalysis.h, source/common/vec + /intra-sse41.cpp, source/common/vec/intra-ssse3.cpp, + source/common/vec/ipfilter-sse41.cpp, + source/encoder/weightPrediction.h: + Merge with default (prep for 0.8) + [0a6dd816d2e2] + +2014-02-26 Steve Borho + + * source/common/threading.h: + threading: add a timedWait() method to Event class + [c9a0802b64ac] + + * source/common/threading.cpp, source/common/threading.h: + threading: use a pthread conditional variable instead of semaphore + + This allows more control over the exact behavior, removes the global + nature of the semaphore on some systems, and will allow the addition + of a timed wait to the Event class. It appears to resolve the + deadlocks reproduced in the pool test on Mac OS X; hopefully on + Linux as well. 
+ [8ac1e112f3ea] + +2014-02-26 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter16.asm, source/common/x86/ipfilter8.h, + source/test/ipfilterharness.cpp: + asm: 10bpp code for chroma interpolation filters + [d317026aa0ad] + +2014-02-26 Yuvaraj Venkatesh + + * source/common/x86/asm-primitives.cpp, + source/common/x86/intrapred16.asm: + asm: 16bpp assembly code for intra_pred_ang16 - all modes + [41bc98a92b49] + +2014-02-07 Steve Borho + + * source/test/CMakeLists.txt, source/test/testpool.cpp: + reintroduce thread pool unit test + + It is reproducing a deadlock on POSIX roughly once per 1M runs + [d36764800215] + +2014-02-26 Sagar Kotecha + + * source/common/param.cpp: + param: add some more validation checks + [fa23612e2850] + +2014-02-26 Santhoshini Sekar + + * source/encoder/frameencoder.cpp: + rc:bug fix-store average Qp as decided by AQ only if aq is enabled + [9b0c9b76d902] + +2014-02-26 Praveen Tiwari + + * source/common/x86/intrapred8.asm: + all_angs_pred_32x32, asm code improvement + [0b9c77b41599] + +2014-02-26 Satoshi Nakagawa + + * source/encoder/slicetype.cpp: + fix: uninitialized variable + [483e699a9527] + +2014-02-26 Steve Borho + + * source/encoder/encoder.cpp: + encoder: tweak slice stats to more closely match x264 outputs + [46207f6f5c8c] + +2014-02-25 Steve Borho + + * source/common/threading.cpp, source/common/threading.h: + threading: use atomic increment when building semaphore object + + Required moving atomic defines to the top of threading.h, removing + the need for potentially redundant includes of unistd.h + [948626475a46] + +2014-02-26 Aarthi Thirumalai + + * source/encoder/encoder.cpp: + encoder: log avg Qp decided by AQ for each slice type + [b47fc23c75df] + + * source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/encoder/frameencoder.cpp, source/encoder/ratecontrol.cpp: + TComPic: store average Qp (as decided by AQ) used per frame. 
+ [298133a7bd53] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, source/encoder/ratecontrol.cpp: + remove redundant state m_avgQpRc from TComSlice. + + This has been moved to TComPic already. + [3bfd7b2a54ed] + + * source/encoder/slicetype.cpp: + lookahead: bug fix for b-adapt 0 cost estimates (closes #26) + + don't overwrite lastNonB reference with current frame, 'lastNonB' + state was already replaced with current frame's lowres at this + point. + + it mostly affected b-adapt=0 but could have affected any of the + modes if they had not already estimated this particular P frame cost + before this point. + [9eea995a2347] + +2014-02-23 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComMotionInfo.cpp, + source/Lib/TLibCommon/TComMotionInfo.h, + source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/compress.cpp: + reduce unnecessary CU data set/clear (no neighbor access) + + mvd, mvpIdx, mergeIndex + [f37c84ffe293] + +2014-02-24 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/compress.cpp: + refine MC - cleanup unnecessary code paths, reduce copies for non-WP + uni-pred + [5241a8efb684] + +2014-02-25 Nabajit Deka + + * source/common/x86/asm-primitives.cpp, + source/test/ipfilterharness.cpp: + asm: enable 10bit asm filter functions, fix test harness + [0b63fa220e19] + + * source/common/CMakeLists.txt, source/common/x86/const-a.asm, + source/common/x86/ipfilter16.asm: + asm: add new file for 10bpp asm filter functions + [a8e822a95fad] + +2014-02-25 Steve Borho + + * source/encoder/api.cpp: + api: check for NULL param pointer in x265_encoder_open + [b7a1a636b77b] + +2014-02-25 Sagar Kotecha + + * source/common/common.cpp, source/encoder/CMakeLists.txt, + source/encoder/api.cpp, 
source/encoder/encoder.cpp: + move public encoder and picture functions to api.cpp + [5fde224bab34] + +2014-02-25 Steve Borho + + * source/common/CMakeLists.txt, source/common/vec/intra-sse41.cpp, + source/common/vec/vec-primitives.cpp: + vec: remove intra-sse41.cpp, full asm coverage for intra prediction + [b456566376ad] + +2014-02-25 Praveen Tiwari + + * source/common/x86/asm-primitives.cpp, source/common/x86/intrapred.h, + source/common/x86/intrapred8.asm: + all_angs_pred_32x32, asm code + [ea1c03303b16] + +2014-02-25 Steve Borho + + * source/input/y4m.cpp, source/input/y4m.h, source/input/yuv.cpp, + source/x265.cpp: + input: workaround MSVC runtime bugs in older compilers (fixes #27) + + Prior to VC11, ifstream::pos_type could not handle 64bit sizes + correctly + [a4c676d671d6] + + * source/encoder/weightPrediction.cpp: + weight: bump weight analysis logging to --log 4, reduce spew + [1c467346ece8] + + * source/encoder/encoder.cpp, source/encoder/encoder.h: + encoder: log the average QP used per frame, per I slices, P, and B + slices + [a41b6a5491ed] + + * source/encoder/encoder.cpp: + encoder: rename _param parameter to p for readability + [cfc5647635e0] + + * source/test/mbdstharness.cpp: + mbdstharness: nits + [9b48a2847d97] + + * source/test/mbdstharness.cpp, source/test/pixelharness.cpp: + testbench: remove redundant casts of X265_MALLOC() output + [62da491b8b22] + +2014-02-25 Aarthi Thirumalai + + * source/encoder/ratecontrol.cpp: + rc: store qp in TComPic::avgQpRc in CQP. 
+ [b86a5ccd3474] + +2014-02-24 Steve Borho + + * source/common/param.cpp, source/common/param.h, + source/input/input.cpp, source/input/input.h, source/input/y4m.cpp, + source/input/y4m.h, source/input/yuv.cpp, source/input/yuv.h, + source/x265.cpp: + cli: refactor handling of input file parameters + + * allow y4m header to provide aspect ratio + * allow user to override Y4M aspect ratio or frame rate + * allow user to provide data possibly missing from Y4M header + * do not clamp framesToBeEncoded to predicted file size (stop at EOF) + [651c0bc9e280] + + * source/encoder/encoder.cpp: + encoder: enable timing info in VPS + [5e375c097de4] + +2014-02-25 Dnyaneshwar G + + * source/common/x86/const-a.asm, source/common/x86/mc-a.asm, + source/test/pixelharness.cpp, source/test/pixelharness.h: + asm: fixed invalid testbench input for addAvg primitive, fixed + addition overflow for some block sizes. + [7d009bc2953b] + +2014-02-24 David T Yuen + + * source/common/param.cpp, source/x265.cpp, source/x265.h: + Updated x265_param_parse to use parseName & a few nit fixes + [1f6311b0b2a7] + +2014-02-25 Steve Borho + + * source/common/threading.h: + threading: recover include of unistd.h within #if __GNUC__ section + + Fixes MinGW build + [a36a669d09e8] + +2014-02-24 Steve Borho + + * source/Lib/TLibCommon/TComSlice.h: + TComSlice: nits + [6214d2609ea8] + + * source/common/param.cpp: + param: simplify checks of max ctu size, improve abort message + [b8dab8ef744c] + + * source/common/threading.h: + threading: include process id in name passed to sem_open() and + sem_unlink() + + This should prevent race conditions between x265 processes spinning + up on the same CPUs, and generally make name conflicts very rare. 
+ [1a93191f0c23] + + * source/x265.cpp: + cli: remove undocumented and unimplemented --depth CLI option + [18894c99e1a7] + + * source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibCommon/TypeDef.h, source/common/param.cpp, + source/test/intrapredharness.cpp, source/test/mbdstharness.cpp, + source/test/pixelharness.cpp, source/test/testbench.cpp: + Remove g_bitDepth; internal bit depth is always known at compile + time + + Simplify param->internalBitDepth checks, remove test bench hacks + [e0af601defb0] + +2014-02-24 Sagar Kotecha + + * source/common/CMakeLists.txt, source/common/common.cpp, + source/common/common.h, source/common/param.cpp, + source/common/param.h, source/encoder/encoder.cpp, source/x265.cpp: + consolidate param functionality into param.cpp + [6d584320108c] + +2014-02-24 Aarthi Thirumalai + + * source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h: + rc: implement abr reset for single pass ABR + VBV + [ebaa34c8f651] + + * source/encoder/frameencoder.cpp: + vbv: refactor, implement row wise qp updates only if vbv is enabled. + [d0aea0cfd263] + + * source/encoder/ratecontrol.cpp: + rc: bug fix - clip qp before setting into TComPic:m_avgQpRc. + + This fix resolves Encoder crash caused due to invalid qp being used + in each CU. + [acaed9dbaae2] + +2014-02-20 Gopu Govindaswamy + + * source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h: + tcomrom: remove unused g_sigLastScan8x8 and g_sigLastScanCG32x32 + variables + [57ce7f0f4f4c] + +2014-02-23 Aarthi Thirumalai + + * source/encoder/ratecontrol.cpp: + vbv: bug fix in clipQScale + [80caa9f00d7c] + +2014-02-22 Steve Borho + + * source/encoder/encoder.cpp: + encoder: use more frame threads when WPP is disabled + + Use up to one frame thread per CPU, but cap the frame threads at + half the number of CTU rows. Any more frame threads than that is + generally counter- productive. 
+ [d1cd52bb3461] + +2014-02-23 Steve Borho + + * source/common/common.cpp: + common: c is a char pointer + [07537439599b] + +2014-02-24 Deepthi Nandakumar + + * source/Lib/TLibCommon/NAL.h: + NAL: Define a default constructor for base class to prevent warning + [6e7c2f5b7b22] + + * source/common/common.cpp: + common: assignment inside conditional expression + [39e2c8bda975] + +2014-02-22 Steve Borho + + * source/common/threadpool.cpp, source/common/threadpool.h, + source/encoder/encoder.cpp: + encoder: use cpu count, rather than pool size, to auto-detect frame + threads + + Normally the pool size is the same as the detected cpu count, but + when WPP is disabled, the thread count is 1, and this was breaking + auto-detection of frame threads. Now it will properly use -F4 on + desktops when WPP is disabled. + [734f106295df] + + * source/output/output.h, source/output/y4m.h, source/output/yuv.h, + source/x265.cpp: + cli: add a line of logging describing reconstructed image file + [8ec8aba042cc] + + * source/Lib/TLibEncoder/TEncSbac.cpp: + TEncSbac: nits + [a640cde93d87] + + * source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSbac.h: + TEncSbac: cleanup header + [c61a1bf68c57] + + * source/Lib/TLibCommon/TComBitStream.cpp: + TComBitstream: nits + [7fb5a8e1465a] + + * source/Lib/TLibCommon/NAL.h, source/Lib/TLibEncoder/NALwrite.h: + NAL: further simplifications + [d8d61736f2fc] + + * source/Lib/TLibCommon/NAL.h, + source/Lib/TLibCommon/TComBitStream.cpp, + source/Lib/TLibCommon/TComBitStream.h, + source/Lib/TLibEncoder/NALwrite.cpp, + source/Lib/TLibEncoder/NALwrite.h: + NAL: cleanup write method + + This whole class hierarchy is way over-engineered + [44a42cd7f186] + + * source/Lib/TLibEncoder/NALwrite.h, source/encoder/frameencoder.cpp: + NAL: use explicit reset method instead of copy constructor + [155fdcd113fb] + + * source/common/common.cpp: + common: avoid calling atobool() on non-boolean strings + [f641d88f95dd] + + * source/CMakeLists.txt: + 
cmake: make /WX optional for MSVC + [050273a10519] + + * source/common/common.cpp: + common: refactor logic to avoid compiler warnings + [a6cf678f7981] + +2014-02-21 Steve Borho + + * source/encoder/encoder.cpp: + encoder: warn about the incomplete nature of the range extensions + [82b05314cc4c] + + * source/common/common.cpp, source/x265.h: + api: add an enum for extended SAR + [9d550a10215b] + + * source/common/common.cpp: + common: only validate sarWidth and sarHeight when aspectRatioIdx == + 255 + [06d3b23ef3b6] + + * source/common/common.cpp: + common: fix --extended-sar CLI option atobool() was setting + bError=true + [58ba28e0c9a5] + + * source/common/common.cpp: + common: disallow SAR numerator or denominator of 0 + [f4bbbae743de] + + * source/common/common.cpp: + common: improve error detection and handling in x265_param_parse + + * check return code of sscanf + * check int parameters are ints, bools are bools, etc + + API users (not using the CLI and getopt) + * allow no-foo and nofoo + * allow foo-bar or foo_bar + * no-wpp=1 works as expected + [0c3cb8fadb6f] + + * source/Lib/TLibCommon/TComPic.cpp, source/common/lowres.cpp, + source/common/lowres.h: + lowres: no need to disable AQ on malloc failure, encoder will abort + + And thus there's no need to pass aqMode as an int pointer + [ff3306fae207] + + * source/Lib/TLibCommon/ContextTables.h, + source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComPicYuv.cpp, source/common/common.cpp, + source/encoder/frameencoder.cpp, source/encoder/ratecontrol.cpp, + source/test/ipfilterharness.cpp, source/test/mbdstharness.cpp, + source/test/pixelharness.cpp, source/x265.cpp: + coding style - hand cleanup sections better than uncrustify wanted + to + [66c56fc5dfb9] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncSearch.cpp, 
source/common/common.cpp, + source/common/primitives.cpp, source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter8.h, source/encoder/compress.cpp, + source/encoder/encoder.h, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h, source/encoder/ratecontrol.cpp, + source/encoder/ratecontrol.h, source/encoder/slicetype.cpp, + source/encoder/weightPrediction.cpp, source/test/mbdstharness.h, + source/test/pixelharness.cpp, source/x265.cpp, source/x265.h: + mechanically enforce coding style (uncrustify) + [e118b38a5ef8] + + * source/common/common.cpp: + common: g_convertToBit cannot be indexed with UINT_MAX either + [047cdb785e84] + +2014-02-21 Sumalatha Polureddy + + * source/common/common.cpp: + common: validate maximum ctusize + [ee58b2bb6f3c] + + * source/common/common.cpp: + common: validate "bframe" and "maxCUSize" for positive values + [b8346a9f7020] + +2014-02-21 Satoshi Nakagawa + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + m_tempPel as static const + [b4d6162139f5] + +2014-02-20 Aarthi Thirumalai + + * source/encoder/frameencoder.cpp, source/encoder/frameencoder.h: + vbv: integrate row level vbv ratecontrol at each major row diagonal. + [2115a7771ce1] + +2014-02-21 Aarthi Thirumalai + + * source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h: + vbv: implement row wise vbvRateControl at each row diagonal + [fea3b6fa376b] + +2014-02-20 Aarthi Thirumalai + + * source/encoder/encoder.cpp, source/encoder/ratecontrol.cpp, + source/encoder/ratecontrol.h: + vbv: fix bugs in vbv flow with single pass ABR + [dd1bca7091d5] + + * source/encoder/encoder.cpp, source/encoder/ratecontrol.cpp, + source/encoder/ratecontrol.h: + vbv: Add row predictors, rc states for vbv. + [b89687d958f8] + + * source/encoder/slicetype.cpp: + vbv: enable vbvLookahead for Keyframes; accumulate frame rowSatds + from lowres rowSatds. 
+ [34774abb8677] + +2014-02-21 Aarthi Thirumalai + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/common/lowres.h, source/encoder/encoder.cpp, + source/encoder/encoder.h, source/x265.h: + vbv: Introduce states to hold row data for row level VBV + ratecontrol. + [00de531690c9] + +2014-02-21 Steve Borho + + * source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibEncoder/TEncSearch.cpp: + Backed out changeset: cb3a2795a60e + + This was found to change output bitstreams, so the vectors must not + have beeen always zero + [5e2043f89aa1] + + * source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TComYuv.h, + source/Lib/TLibEncoder/TEncSearch.cpp: + TComYuv: combine copyPartToPartLuma and copyPartToPartChroma + + The new function tries to ensure the proper chroma block is copied, + when necessary. + [13f73b241382] + +2014-02-20 Steve Borho + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + TEncSearch: remove redundant m_hChromaShift, m_vChromaShift + + TComPrediction has these members, and TEncSearch derives from + TComPrediction + [406abea93732] + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + TEncSearch: use X265_MALLOC for UChar buffers, and uint8_t type + [6b476469c212] + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + TEncSearch: use X265_MALLOC to allocate pixel buffers, and pixel + type + [fc3db1cfef2a] + +2014-02-21 Satoshi Nakagawa + + * source/common/x86/pixel-util8.asm, source/test/mbdstharness.cpp, + source/test/mbdstharness.h: + asm: update count_nonzero, add testbench + [0c19c44af2d3] + +2014-02-20 David T Yuen + + * source/common/common.cpp: + Added vui parameter checking + [6ade3939ef2d] + +2014-02-20 Min Chen + + * source/Lib/TLibEncoder/TEncCu.cpp: + temp fix for out-of-range tmvp (prevents hash mismatch with slower + presets) + + 
we're not sure why this is necessary, so it must be investigated + further. + [a78935e92bf9] + +2014-02-20 Steve Borho + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + TEncSearch: rename m_qtTempTransformSkipTComYuv to + m_qtTempTransformSkipYuv + [894bde574bc1] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: nits + [f32bbe3ab4f8] + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + TEncSearch: rename m_qtTempTComYuv to m_qtTempShortYuv + + It was always stupid to have the member variable include the full + type name, and it has been the wrong type name for more than 10 + months. + [6107faa8f8a6] + + * source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TComYuv.h: + TComYuv: merge copyPartToPartLuma + [5b08cf274fd7] + + * source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TComYuv.h: + TComYuv: merge copyPartToPartChroma() with copyPartToPartYuv + [1ed4cd65af19] + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h: + TComPrediction: remove unused m_predAllAngsBuf, properly init + pointers + [125459e883c3] + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h, source/encoder/compress.cpp: + TComPrediction: remove trivial access methods, use pixel types + [3bb5337cc665] + + * source/Lib/TLibCommon/TComPrediction.h: + TComPrediction: remove default argument values for internal + functions + [744553cfdaca] + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h: + TComPrediction: do not pass output pointer as deference + + The function cannot change the pointer + [a090acec64cd] + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/compress.cpp: + TComPrediction: add missing m_ prefixes to member variables + [8a79a2a0433c] + + * source/Lib/TLibCommon/TComYuv.cpp, 
source/Lib/TLibCommon/TComYuv.h: + TComYuv: remove unnecessary dup pointer checks + [034b21f14834] + + * source/Lib/TLibCommon/TComYuv.h: + TComYuv: remove more default argument values + [01dfbf18f791] + + * source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TComYuv.h: + TComYuv: merge methods together for clarity + [873a5a72d85e] + + * source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TComYuv.h, + source/encoder/compress.cpp: + TComYuv: remove default argument values + [9c79a713e9af] + + * source/Lib/TLibCommon/TComYuv.h: + TComYuv: make some methods private, for future optimizations + [7afb7b6815c0] + + * source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TComYuv.h: + TComYuv: remove hungarian prefixes + [3d89b437f7f7] + + * source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TComYuv.h: + TComYuv: remove unused copyPartToPartChroma() method + [549f5bf10211] + + * source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TComYuv.h, + source/Lib/TLibEncoder/TEncSearch.cpp: + TComYuv: fix size of chroma partition copy (resolves placebo heap + corruption) + [a6eee111fd6f] + +2014-02-20 Gopu Govindaswamy + + * source/Lib/TLibCommon/TComRom.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp: + tcomrom: move ADI intraModeNumFast table to estIntraPredQT(), no + need for global variable + [3993d404361b] + +2014-02-20 Deepthi Nandakumar + + * source/encoder/ratecontrol.cpp: + ratecontrol: fix bug in acEnergyVar with multiple color spaces + [fec3cab87043] + +2014-02-20 Gopu Govindaswamy + + * source/Lib/TLibCommon/CommonDef.h, + source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h: + tcomrom: scaning order table g_sigLastScan replaced with g_scanOrder + + Scaning order table initialization moved into initRom() and same + scaning order table can be used for both 444 and 420 + [d4669065e692] + +2014-02-20 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibEncoder/TEncSearch.cpp: + cleanup useless getMvPred*(): always 
zero mv + [cb3a2795a60e] + +2014-02-20 Dnyaneshwar G + + * source/common/x86/dct8.asm: + asm: correct improper macro + [ce20bff2027e] + +2014-02-20 Satoshi Nakagawa + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + remove unused code + [2cede5a7c1bf] + +2014-02-20 Murugan Vairavel + + * source/common/x86/pixel-util8.asm: + asm: modified the range of scale value in dequant + [613fbb0687c9] + +2014-02-20 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibEncoder/TEncSearch.cpp: + reduce addClip + [fe90b71dc775] + +2014-02-20 Deepthi Nandakumar + + * source/CMakeLists.txt: + api: bump up X265_BUILD to 7 (refer previous commit) + [6cb88b30deef] + +2014-02-19 David T Yuen + + * source/Lib/TLibCommon/TComSlice.h, source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncSbac.cpp, source/common/common.cpp, + source/encoder/encoder.cpp, source/x265.cpp, source/x265.h: + Added command line options to generate a VUI and add it to the coded + bitstream + [04a72988d48f] + +2014-02-19 Dnyaneshwar G + + * source/common/x86/asm-primitives.cpp, source/common/x86/const-a.asm, + source/common/x86/intrapred16.asm, source/common/x86/mc-a.asm, + source/test/pixelharness.cpp: + asm-16bpp: code for addAvg luma and chroma all sizes + [4670fed41ec7] + +2014-02-19 Steve Borho + + * source/encoder/ratecontrol.cpp: + ratecontrol: recover CRF behavior following fps changeset + a6c9a476b205 + [3389061b75a4] + + * source/encoder/weightPrediction.cpp: + weightp: do not log weights if none are enabled + [d2ebbbbdbe75] + + * source/encoder/frameencoder.cpp, + source/encoder/weightPrediction.cpp: + weightp: don't allow the PPS weightp flag to be changed by weight + analysis + [16602968225f] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: white-space nits + [e465b8c259f7] + + * source/Lib/TLibEncoder/TEncSbac.cpp: + TEncSbac: white-space nits + [f60c76257811] + +2014-02-19 David T Yuen + + * source/Lib/TLibCommon/TComRom.h, 
+ source/Lib/TLibEncoder/SyntaxElementWriter.cpp, + source/Lib/TLibEncoder/TEncBinCoderCABAC.cpp: + Fixed ENC_DEC_TRACE warnings and errors + [1c78bd13a14f] + +2014-02-19 ashok + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibEncoder/TEncSbac.cpp: + fix for 420 binary mismatch for --preset=slower option + [591ca91f0501] + +2014-02-19 Dnyaneshwar G + + * source/common/x86/asm-primitives.cpp, source/common/x86/const-a.asm, + source/common/x86/dct8.asm, source/test/mbdstharness.cpp: + asm: added 16bpp support for dct[4x4, 8x8], idct4x4, dst4x4 and + idst4x4 primitives + [96e94ea91f58] + +2014-02-19 Steve Borho + + * source/common/common.cpp, source/encoder/encoder.cpp: + prevent implicit type conversion warnings from MSVC + [9ddc919f678a] + +2014-02-18 Steve Borho + + * source/CMakeLists.txt, source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, source/common/common.cpp, + source/encoder/encoder.cpp, source/encoder/frameencoder.cpp, + source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h, + source/encoder/slicetype.cpp, source/input/input.h, + source/input/y4m.h, source/input/yuv.h, source/output/output.cpp, + source/output/output.h, source/output/y4m.cpp, source/output/y4m.h, + source/x265.cpp, source/x265.h: + api: add support for float or rational FPS [API CHANGE] + + Since x265_param was changing anyway, I went ahead and changed + inputBitDepth to internalBitDepth, which has always been its real + function. + + X265_BUILD is bumped to 6 + [a6c9a476b205] + + * source/x265.cpp: + cli: add x264's 'seek', leave HM's 'frame-skip' as undocumented + alias + + We want to support x264's CLI options as much as possible, but don't + want to break any existing scripts + [1a0d5b456b19] + +2014-02-17 Sumalatha Polureddy + + * source/encoder/compress.cpp: + compress: improve rd 1 performance + + a. Always allow intra mode for mode decision b. 
increase more skips + at each depth(do encode of best merge candidates initially and if + the merge is skipped, dont do inter modes) + [9f4d3ef34e5a] + +2014-02-18 Deepthi Nandakumar + + * source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h: + ratecontrol: change RateControl::lastSatd to currentSatd, add + comments + [757d3e3db046] + +2014-02-18 Satoshi Nakagawa + + * source/Lib/TLibCommon/ContextTables.h, + source/Lib/TLibEncoder/TEncSbac.cpp, source/common/primitives.cpp: + nextState table + [f1951bb4c2ae] + +2014-02-18 Steve Borho + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h: + TComPrediction: remove unused alloc and stride + [8571d160aedb] + + * source/Lib/TLibCommon/TComWeightPrediction.cpp: + backout unintended commit + [1be6b8c8b9ed] + + * source/encoder/encoder.cpp: + encoder: show per-slice type SSIM as dB + [720768692efe] + +2014-02-17 Steve Borho + + * source/Lib/TLibCommon/TComRom.cpp: + TComRom: use x265_malloc/free for globals + [7b5b3a5475a7] + + * source/Lib/TLibCommon/TComRom.cpp, + source/Lib/TLibCommon/TComWeightPrediction.cpp: + TComRom: prevent multiple allocations or frees of globals + [d2a61d579c30] + +2014-02-17 Nabajit Deka + + * source/test/ipfilterharness.cpp: + testbench : test bench correction for chroma_p2s + [3505b249397d] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter8.asm, source/common/x86/ipfilter8.h: + asm : asm routine for chroma_p2s for 4:4:4 color space format + [df79cdee5d46] + +2014-02-17 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel- + util8.asm: + asm: 16bpp code for quant and dequant_normal + [765d6225b252] + + * source/test/mbdstharness.cpp, source/test/mbdstharness.h: + testbench: adding seperate input buffer for idct and updated qp + value for quant + [797e8bb43887] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter8.asm, source/common/x86/ipfilter8.h: + asm: fix for illegal 
instruction usage in ipfilter + [46a9e97caaba] + +2014-02-17 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + cleanup unused variables + [a3dc5ba8cc95] + + * source/Lib/TLibEncoder/TEncEntropy.cpp, + source/Lib/TLibEncoder/TEncEntropy.h, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/dct.cpp, + source/common/primitives.h, source/common/x86/asm-primitives.cpp, + source/common/x86/pixel-util.h, source/common/x86/pixel-util8.asm: + primitives: add count_nonzero + [12d752ac7c4f] + +2014-02-17 Steve Borho + + * source/encoder/weightPrediction.cpp: + weightp: fix V plane debug output + [5bf042e30d30] + + * source/Lib/TLibCommon/TComSlice.cpp: + TComSlice: nit + [4a06d2485b45] + + * source/Lib/TLibCommon/TComSlice.cpp: + TComSlice: remove dup line + [4ba139f9df57] + + * source/Lib/TLibCommon/TComSlice.h, source/common/lowres.h, + source/encoder/reference.h: + weight: remove odd struct renaming, remove unused wpACDCParam + [30edc6f7475e] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h: + TComSlice: remove unused m_weightACDCParam and methods + [d2312e05014a] + + * source/encoder/weightPrediction.cpp: + weightp: add debug logging for weight analysis + [a1047dfeebb6] + + * source/encoder/encoder.cpp: + encoder: include the hash output into the debug output string + [393f6ef8b7bd] + +2014-02-16 Steve Borho + + * source/common/common.cpp, source/encoder/encoder.cpp, + source/x265.cpp, source/x265.h: + api: change name back to inputBitDepth, to avoid breaking ffmpeg + [ce96cdb390fe] + +2014-02-15 Steve Borho + + * source/CMakeLists.txt, source/common/cpu.cpp, + source/common/primitives.cpp: + non-x86 build fixes + [291b3a358a22] + +2014-02-14 Steve Borho + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, 
+ source/Lib/TLibCommon/TComPattern.h, + source/Lib/TLibCommon/TComPicSym.cpp, + source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibCommon/TComYuv.h, source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncEntropy.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/common.cpp, + source/common/cpu.cpp, source/common/lowres.cpp, + source/common/lowres.h, source/common/pixel.cpp, + source/common/primitives.cpp, source/common/threading.h, + source/common/threadpool.cpp, source/common/x86/asm-primitives.cpp, + source/common/x86/pixel.h, source/encoder/compress.cpp, + source/encoder/frameencoder.cpp, source/encoder/ratecontrol.cpp, + source/encoder/slicetype.cpp, source/encoder/weightPrediction.cpp, + source/test/ipfilterharness.cpp, source/test/mbdstharness.cpp, + source/test/pixelharness.cpp, source/x265.h: + uncrustify all source - mechanically apply coding style + [39ecb3aa82ee] + + * source/Lib/TLibCommon/TComPicYuv.cpp, source/common/common.cpp, + source/x265.cpp, source/x265.h: + enforce new color space requirements, improve help + [76f13355e520] + + * source/Lib/TLibCommon/TComPicYuv.cpp, source/common/common.cpp, + source/encoder/encoder.cpp, source/x265.cpp, source/x265.h: + decouple input depth from internal depth; allow input depth != + internal depth + + Shift and mask input samples as necessary to reach internal depth. + The input depth is no longer in x265_param, where it never really + belonged since pic.bitDepth was the only number that mattered. + + Add validations for input color space and bit depths. 
No CLI option + was added for internal bit depth since it is not currently runtime + configurable. + [b1f5fd61883a] + + * build/msys/make-x86_64-w64-mingw32-Makefiles.sh, build/msys + /toolchain-x86_64-w64-mingw32.cmake: + build: add example cross-compile script for Win64 on MinGW32 + [0911885e0f28] + +2014-02-15 Deepthi Nandakumar + + * source/test/ipfilterharness.cpp: + testbench: disable chroma_p2s. + [289b4ef4ecee] + + * source/test/ipfilterharness.cpp: + ipfilterharness: resolve compile errors with CSP + [482aa2e636f9] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/encoder.cpp, + source/encoder/frameencoder.cpp: + Merge with 444 changes. + [8662459da60b] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibEncoder/TEncSearch.cpp: + Backed out changeset: a3a9e0fb1a87 + + Conflicts with the 444 patches. Will resolve merge and import it in + a seperate patch. 
+ [98f5088cff19] + +2014-02-14 ashok + + * source/Lib/TLibCommon/TComLoopFilter.cpp: + Modify TComLoopFilter structure to support multiple color space + formats + [c731d494e9ca] + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComYuv.h, + source/Lib/TLibEncoder/TEncSbac.cpp, source/common/TShortYUV.h, + source/common/common.cpp, source/common/ipfilter.cpp, + source/common/pixel.cpp, source/common/primitives.h, + source/common/x86/asm-primitives.cpp, source/encoder/encoder.cpp: + Modify asm primitives functions to support multiple color space + formats + [deb0cffa87fa] + + * source/Lib/TLibEncoder/TEncEntropy.cpp: + Modify TEncEntropy structure to support multiple color space formats + [a65cb22fcf10] + + * source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.h, + source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/encoder/framefilter.cpp, source/encoder/framefilter.h: + Modify FrameFilter structure to support multiple color space formats + [6abd54cbff84] + + * source/Lib/TLibCommon/TComDataCU.cpp: + Modify TComDataCU structure to support multiple color space formats + [1e8fbc073d4b] + + * source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibEncoder/TEncSbac.cpp: + Modify TEncSbac structure to support multiple color space formats + [19532171622a] + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h: + Modify TComPrediction structure to support multiple color space + formats + [ad8a991d626e] + + * source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComPattern.h, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibEncoder/TEncSearch.cpp: + Modify TComPattern structure to support multiple color space formats + [503e3794098a] + + * 
source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + Modify TEncSearch structure to support multiple color space formats + [fb14a98a97e4] + + * source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/encoder/frameencoder.cpp: + Modify ChromaScale table to support multiple color space formats + [57156005d32f] + + * source/Lib/TLibCommon/CommonDef.h, + source/Lib/TLibCommon/ContextTables.h, + source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibCommon/TypeDef.h: + Modify TComRom structure to support multiple color space formats + [3572d9d04ff7] + +2014-02-14 Steve Borho + + * source/Lib/TLibCommon/TComPicYuv.cpp: + TComPicYuv: use pixel type instead of Pel type + [d77a549b8061] + + * source/common/version.cpp: + version: today, our max supported input bit depth is 10 + [6b197e8561ec] + + * source/input/y4m.cpp, source/input/yuv.cpp: + input: always set depth and colorspace fields + [d329e698d420] + + * source/encoder/encoder.cpp: + encoder: use x265_log for per-frame debug log, use dB for SSIM + [da7f6bebdd30] + + * source/encoder/encoder.cpp: + encoder: now only one QP per row in CSV output + [a661d9eef931] + +2014-02-14 Deepthi Nandakumar + + * source/encoder/ratecontrol.cpp: + ratecontrol: cleanup, repetitive code + [5d76e8a19fed] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, source/encoder/encoder.cpp, + source/encoder/frameencoder.cpp, source/encoder/ratecontrol.cpp: + ratecontrol: cleanup, remove sliceQPBase. + + Always a duplicate of sliceQp. 
+ [303634977dc4] + + * source/encoder/ratecontrol.cpp: + ratecontrol: cleanup + [4906f1fe7d06] + + * source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h: + ratecontrol: clean up struct; remove baseQp + [8d4fd1d98337] + +2014-02-14 Steve Borho + + * build/msys/make-Makefiles.sh: + build: change eoln of msys bash script to unix + [ad21da9b607c] + +2014-02-14 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibEncoder/TEncSearch.cpp: + square transform only + [a3a9e0fb1a87] + +2014-02-14 Nabajit Deka + + * source/common/x86/pixeladd8.asm: + asm: Clean up and minor modifications in pixel_add_ps 16bpp asm + functions(4xN) + [248b665970e8] + +2014-02-14 Steve Borho + + * source/Lib/TLibEncoder/WeightPredAnalysis.cpp, + source/Lib/TLibEncoder/WeightPredAnalysis.h, + source/encoder/CMakeLists.txt, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h: + remove unused HM WeightPredAnalysis files + [ed310b17ff66] + + * source/encoder/encoder.cpp: + encoder: do not generate digest string if we are not going to print + it + [d43e8e0c950d] + + * source/encoder/encoder.cpp, source/encoder/encoder.h: + encoder: report the hash digest from the correct frame encoder + [d6559298428a] + +2014-02-14 Sumalatha Polureddy + + * source/encoder/compress.cpp: + compress: Bug fix in rd2 + + the sa8d cost in rd2 for inter and intra are different for inter, + totalbits = 0, for intra, totalbits = cabac bits for now, making the + totalbits = 0 for both inter and intra + [d90a4adcb492] + + * source/encoder/compress.cpp: + compress: missed few lines of code while applying previous patch + + 1. Increase the eraly skips in rd2 2. 
Sa8d cost is not calculated, + but used in the code + [11ffc3cfe0d8] + +2014-02-14 Steve Borho + + * source/encoder/reference.cpp: + reference: remove unnecessary duplicate variable + [0d033b5677da] + + * source/CMakeLists.txt: + cmake: on MSVC, CMAKE_CXX_IMPLICIT_LINK_LIBRARIES and PLATFORM_LIBS + may be empty + [f46c3f816fe7] + +2014-02-13 Steve Borho + + * source/CMakeLists.txt: + cmake: add a blacklist of libs to keep from x265.pc Libs.private + [757b127f8ede] + +2014-02-13 Tom Vaughan (tom vaughan + + * source/common/common.cpp: + Remove redundant settings from performance presets + [0265344d0727] + +2014-02-13 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/intrapred.cpp, + source/encoder/compress.cpp: + const tables + [2ce38565571e] + +2014-02-13 Murugan Vairavel + + * source/common/x86/ipfilter8.asm: + asm: cleanups for ipfilter functions to reduce register counts + [fcfe87ee36b7] + +2014-02-13 Tom Vaughan (tom vaughan + + * source/common/common.cpp: + Remove redundant settings from performance presets + [8093e808bfee] + +2014-02-13 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/TEncSearch.cpp: + fix bug on TSKIP + [402b11d9df80] + +2014-02-13 Steve Borho + + * source/encoder/compress.cpp: + compress: fix gcc warning, make array const + + c:/mcw/x265/source/encoder/compress.cpp:414:22: warning: variable + 'bestMergePred' set but not used [-Wunused-but-set-variable] + TComYuv* bestMergePred; + [d9e6f16baa29] + + * source/CMakeLists.txt: + cmake: more general fix for CFLAGS breaking resource compiles + + There are some versions of rc.exe that also balk at being 
given + cl.exe flags + [8fdcf1ba38f3] + + * source/common/common.cpp, source/x265.h: + api: improve documentation of x265_picture, properly initialize + pic.colorSpace + [1420cb397447] + +2014-02-13 Murugan Vairavel + + * source/test/ipfilterharness.cpp, source/test/mbdstharness.cpp, + source/test/mbdstharness.h: + testbench: added stress test cases for all functions in + mbdstharness.cpp + [5e4e4bda0e7e] + +2014-02-13 Deepthi Nandakumar + + * Merge + [5e104ed219c3] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, source/encoder/compress.cpp: + compress: improve rd 2 performance (Patch from Sumalatha) + + a. Always allow intra mode for mode decision b. increase more skips + at each depth(do encode of best merge candidates initially and if + the merge is skipped, dont do inter modes) + + Bitrate increase compared to rd 3 (4-6%) on most HD clips, quality + maintained. Performance impact to be measured in regression tests. + + Sintel 720p clip needs to be investigated further. Poor show here. 
+ [313e42bdd4ea] + +2014-02-13 Steve Borho + + * source/common/pixel.cpp: + weightp: add assert to protect assembly limitiation + [df4531a24c4d] + + * source/encoder/weightPrediction.cpp: + weightp: pad width passed to weight_pp(), it requires 16byte + alignment + [9dffa4473096] + + * source/common/threading.h: + threading: fix compile when XP support is enabled + [7c24e5a920b2] + + * source/CMakeLists.txt, source/cmake/version.cmake: + cmake: be quiet about missing optional packages + [f0d82bf20584] + + * source/CMakeLists.txt, source/cmake/version.cmake: + cmake: simplify version detection, use single set of variables + [e1088cae157c] + + * source/cmake/version.cmake: + cmake: use findGit.cmake, which was available in cmake 2.8.8, our + min version + [726323914cd2] + + * source/CMakeLists.txt: + cmake: workaround for MinGW windres + [0fcc87d05d10] + +2014-02-12 Steve Borho + + * source/CMakeLists.txt: + cmake: change name of MSVC built shared library to avoid filename + conflicts + + the shared library and CLI exe were both trying to write x265.pdb + and x265.ilk with unhelpful results. + [21832083908f] + + * source/CMakeLists.txt, source/x265.rc.in: + cmake: add resource file for Windows version info on DLL and EXE + + This sadly doesn't work when compiling under MinGW; cmake has an + unresolved bug that passes all CFLAGS to windres, which pukes on + them. 
+ [54189fefdc51] + + * source/x265.cpp: + cli: use real log commands + [218f8304978e] + +2014-02-12 Min Chen + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/TEncSearch.cpp: + fix bug on TSKIP + [d6774d83f39c] + +2014-02-12 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibEncoder/TEncBinCoderCABAC.cpp, + source/Lib/TLibEncoder/TEncBinCoderCABAC.h, + source/Lib/TLibEncoder/TEncEntropy.cpp, + source/Lib/TLibEncoder/TEncEntropy.h, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSbac.h: + cleanup TEncSbac + [d22564466556] + +2014-02-12 Steve Borho + + * source/encoder/encoder.cpp: + encoder: remove two completed TODOs + [21c2724dfcd1] + + * source/encoder/encoder.cpp, source/encoder/encoder.h: + encoder: remember abort condition and stop accepting input frames + [f894b457aca8] + +2014-02-12 Santhoshini Sekar + + * source/common/lowres.h, source/encoder/ratecontrol.cpp, + source/encoder/ratecontrol.h, source/encoder/slicetype.cpp, + source/encoder/slicetype.h: + vbv: lookahead + [adee518df8ab] + +2014-02-12 Dnyaneshwar G + + * source/common/x86/asm-primitives.cpp, + source/common/x86/intrapred16.asm: + asm-16bpp: assembly code for IntraAng32x32 all modes + [cca149489a6f] + +2014-02-12 Nabajit Deka + + * source/common/x86/pixel-util8.asm: + asm : Clean up and minor modifications in pixel_sub_ps asm + functions(2x4, 2x8, 6x8) + [d83f25c4ae1d] + + * source/test/ipfilterharness.cpp, source/test/ipfilterharness.h: + Test bench : Stress test cases for remaining filter functions. 
+ [3eb456f69e34] + +2014-02-12 Yuvaraj Venkatesh + + * source/common/x86/asm-primitives.cpp, source/common/x86/intrapred.h, + source/common/x86/intrapred16.asm: + asm: 16bpp asm code for intra_pred_ang8 - all modes + [5457f23edb5d] + +2014-02-13 Gopu Govindaswamy + + * source/encoder/ratecontrol.cpp: + ratecontrol: to set B-ref frame QP as Lower then B-Frame QP for RC + with CQP mode + [1894c1c35bac] + +2014-02-12 Kavitha Sampath + + * source/encoder/weightPrediction.cpp: + weightp: fix hash mismatch + + all references should have same luma and chroma denominator + [51c86499d3bd] + +2014-02-12 Steve Borho + + * source/x265.h: + api: we don't need stuttering header guards + [817f5bf391d2] + + * source/x265.h: + api: mark externed variables for import from Windows shared library + [c9fc8dab7b9a] + + * source/CMakeLists.txt, source/common/cpu.cpp: + cmake: a few fixes for building on ARM (Rapsbery Pi in particular) + + Next step will be bringing over ARM cpu detect assembly functions: + x265_cpu_neon_test() x265_cpu_fast_neon_mrc_test() + [5ddbdaefb783] + + * source/encoder/weightPrediction.cpp: + weightp: clip lowres MV before using for motion compensation + + Even the lowres MVs with very restricted merange can go beyond the + available pixels. This was causing memory access exceptions on some + clips. + [19d7752a4f9c] + + * source/encoder/slicetype.cpp: + slicetype: cast mvmin/mvmax indices to signed 16bit values + [a3df372438a4] + +2014-02-12 Min Chen + + * source/common/x86/blockcopy8.asm: + asm: fix bug in cvt16to32_shl + [eedfa574e07e] + +2014-02-11 Santhoshini Sekar + + * source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h: + vbv: set vbvminrate + [a8b2456aabef] + + * source/encoder/ratecontrol.cpp: + rc: Don't do resetABR in CRF. 
+ [c11f0459464c] + +2014-02-11 Nabajit Deka + + * source/common/x86/blockcopy8.asm: + asm : Optimisations in blockcopy_sp asm routines(2x4, 2x8, 6x8) + [bf2f60a2d425] + +2014-02-03 Nabajit Deka + + * source/test/ipfilterharness.cpp: + testbench: Added stress test cases for + check_IPFilterLuma_ps_primitive, check_IPFilterLuma_hps_primitive + and check_IPFilterLumaHV_primitive filter functions + [eb19b59558c0] + +2014-02-11 Murugan Vairavel + + * source/common/x86/ipfilter8.asm, source/test/ipfilterharness.cpp: + asm: Optimizations and cleaups on ipfilter functions + [4e5ab7003f23] + +2014-02-03 Nabajit Deka + + * source/test/ipfilterharness.cpp: + testbench: Added stress test cases for chroma_pp, chroma_ps and + chroma_hps filter functions + [43d6027b977b] + +2014-02-11 Steve Borho + + * source/encoder/encoder.cpp: + encoder: protect public APIs against NULL pointer arguments + [3dd1f72225e6] + + * source/cmake/version.cmake: + cmake: set X265_LATEST_TAG from latesttag: line of .hg_archival.txt + + The net effect of this is that non-tagged release bundles will be + capable of installing shared libraries with effective sonames. 
+ [d6fdfa9f4938] + + * source/x265.h: + api: improve documentation of x265_picture dts, pts + [973ad4575a27] + +2014-02-10 Steve Borho + + * source/encoder/slicetype.cpp: + slicetype: pass bools as bool literals + [07b5d6b82f5f] + +2014-02-11 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + cleanup AMVP related + [2316e8e33512] + +2014-02-10 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComPattern.h, + source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp: + cleanup unused + [b449d4d4f02d] + +2014-02-10 Steve Borho + + * source/Lib/TLibCommon/TComPicYuv.cpp: + pic: use C style comments + [01d0f7758171] + + * source/Lib/TLibCommon/TComPicYuv.cpp: + pic: mask impossible input bits + [8538c3383ade] + + * source/common/lowres.cpp: + lowres: nits + [96d16486e317] + + * source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuv.h, source/common/lowres.cpp, + source/common/pixel.cpp, source/common/primitives.h, + source/encoder/weightPrediction.cpp: + TComPicYuv: replace xExtendPicCompBorder with an optimized function + + We've had a primitive for the side borders for some time, it just + wasn't hooked up. This function never should have been a method of + that class. 
+ [47592ed6aa2c] + + * source/x265.cpp: + help: --cutree is a boolean flag + [d002f45eee16] + + * source/x265.cpp: + help: move --cutree just after aq options + [75f724dd6d4c] + + * source/CMakeLists.txt: + cmake: prevent extra errors when system has no C++ compiler + [6b3defda16c1] + + * source/encoder/weightPrediction.cpp: + weightp: avoid redundant chroma extensions + [ce7191f49948] + + * source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/encoder/dpb.cpp: + TComPic: add m_chromaPlanesExtended member var + [8d98425e0a0a] + +2014-02-10 Kavitha Sampath + + * source/encoder/weightPrediction.cpp: + weightp: extend chroma borders before mcChroma, remove redundant + checks + [62c760413522] + +2014-02-07 Steve Borho + + * source/encoder/CMakeLists.txt, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h, source/encoder/weightPrediction.cpp, + source/encoder/weightPrediction.h: + weightp rewrite without a class + + weightp analysis for the main encoder is performed with motion + compensation (using lowres motion vectors from lookahead) when + available. The lowres luma plane is used for luma analysis and the + chroma planes are analysed as-is. 
+ [4ec4065fc392] + +2014-02-10 Steve Borho + + * source/Lib/TLibCommon/TComPic.h: + TComPic: clean up data member ordering, add comments + [17847935873a] + +2014-02-08 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h: + improve getInterMergeCandidates() + [2fc9c0a08534] + +2014-02-09 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp: + cleanup MPM related + [0b44c6c07582] + +2014-02-09 Steve Borho + + * source/common/common.cpp, source/encoder/encoder.cpp, + source/x265.cpp, source/x265.h: + api: repair `--keyint -1` behavior; single keyframe at beginning of + stream + + Also, disable scene cut detection to save a few cycles in lookahead. + Fix and/or improve documentation for the keyint parameters + [60e6a7339027] + +2014-02-07 Murugan Vairavel + + * source/common/x86/ipfilter8.asm: + asm: cleanup unused registers interp_4tap_horiz_pp_2xN and 4xN + [fa9f7b56d4d8] + +2014-02-07 Steve Borho + + * source/common/CMakeLists.txt, source/common/vec/intra-ssse3.cpp, + source/common/vec/vec-primitives.cpp: + vec: remove intra-ssse3.cpp, full assembly coverage + [7da1a8d3bbbe] + +2014-02-07 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp, source/common/x86/intrapred.h, + source/common/x86/intrapred8.asm: + asm: intra_pred_ang16 code for all remaing modes + [990dbb374285] + +2014-02-07 Dnyaneshwar G + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm, source/common/x86/blockcopy8.h, + source/common/x86/mc-a.asm, source/common/x86/pixel.h: + asm: correction of function declaration to sse4 + [95fc15598e3e] + +2014-02-07 Yuvaraj Venkatesh + + * source/common/x86/sad-a.asm: + asm: fix sad_x4 stress case failure on AVX2 + [cfbe679e73dc] + +2014-02-07 Deepthi Nandakumar + + * source/encoder/slicetype.cpp, source/x265.h: + x265: remove X265_TYPE_KEYFRAME. 
+ + Not used, and not required. IDR/I-slice can be chosen at the outset + based on openGOP. + [c1cea0534e6b] + +2014-02-07 Steve Borho + + * source/common/dct.cpp: + dct: disable assertion for 10bit builds + + The assertion is there to protect 8bpp assembly + [d2d181f1881a] + + * source/encoder/weightPrediction.cpp: + weightp: avoid MSVC warnings about implicit bool to int casts + [da1dda5e762a] + +2014-02-06 Steve Borho + + * source/common/vec/intra-ssse3.cpp: + Backed out changeset: 5634d0322161 + [53c6acae9b0a] + + * source/common/vec/intra-ssse3.cpp: + vec: remove unused angAP array + [5634d0322161] + + * source/Lib/TLibCommon/TComBitStream.cpp, + source/Lib/TLibCommon/TComBitStream.h: + TComBitstream: simplify push_back(), fix clear(), give buffsize an + 'm_' prefix + + The size of the allocated buffer is not reset on clear. Issue an + error message if any mallocs fail, to at least indicate an encoder + in serious trouble. + [f0e14af85843] + + * source/Lib/TLibCommon/TComBitStream.cpp: + TComBitstream: fix check for NULL m_fifo member + + There was no point in checking for NULL in the if() expression + because m_fifo was used in the else. 
(identified by clang) + [436cf988b016] + + * source/Lib/TLibCommon/TComBitStream.cpp: + TComBitstream: remove hungarianness from parameters and auto-vars + [ddbbb1f92ce3] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/primitives.cpp, + source/encoder/encoder.cpp, source/encoder/ratecontrol.cpp, + source/encoder/slicetype.cpp, source/encoder/weightPrediction.cpp, + source/x265.cpp: + Fix problems found by clang static analyzser in Xcode + + These were mainly stores that were never read + [1a68f0dd9acb] + + * source/encoder/weightPrediction.cpp, + source/encoder/weightPrediction.h: + weightp: remove useless m_dstStride variable + [c54271b906da] + + * Merge + [21d808d834c4] + + * source/Lib/TLibCommon/TComPicYuv.h: + nit + [1776b9a58585] + + * source/encoder/weightPrediction.cpp: + weightp: don't use m_ prefix for non member variable + [8f025ee0a506] + + * source/encoder/weightPrediction.cpp: + weightp: do not blindly assume 4:2:0 chroma dimensions + [9bc4b7b1454e] + + * source/encoder/weightPrediction.cpp, + source/encoder/weightPrediction.h: + weightp: non-trivial constructors and destructors should not be in + headers + [d87b6e92c996] + + * source/common/vec/intra-ssse3.cpp: + vec: remove intraPredAng32x32, full asm coverage + [40bec5582eca] + +2014-02-06 Dnyaneshwar G + + * source/common/x86/intrapred8.asm: + asm: fix Intrapred_ang[32x32] mode 10 and 26 failure on Mac + [a079afc4e6c7] + +2014-02-06 Gopu Govindaswamy + + * source/encoder/slicetype.cpp: + slicetype: bug fix for cuTree, use int32_t for listamount and + propagate_amount to calculate valid propagate_cost + [6d5207b8b2ef] + +2014-02-06 Yuvaraj Venkatesh + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm, + 
source/common/x86/sad16-a.asm: + asm: modified satd and sad asm functions in 16bpp to avoid overflow + [ffe13a5eccb9] + +2014-02-06 Murugan Vairavel + + * source/test/pixelharness.cpp, source/test/pixelharness.h: + testbench: stress test support for all pixelharness functions + [b86a25eb7968] + +2014-02-06 Dnyaneshwar G + + * source/common/x86/asm-primitives.cpp, source/common/x86/intrapred.h, + source/common/x86/intrapred8.asm: + asm: assembly code for IntraAng32x32 all modes + [76fa0811c4e7] + +2014-02-06 Satoshi Nakagawa + + * source/Lib/TLibEncoder/TEncCu.cpp, source/encoder/compress.cpp: + remove unnecessary copyToPicLuma() call + [db0c1dfc3a11] + +2014-02-05 Steve Borho + + * source/common/common.cpp: + common: use ATOMIC_CAS32 to update an int + [634bc0b1c246] + + * source/common/threadpool.cpp: + threadpool: use aligned malloc to allocate sleep bitmap + + This kills three birds with one stone - it removes a source of + possible un- alignment, it removes the restriction of max 64 + threads, and it further simplifies pool start and stop + [53b8daed7df5] + + * source/encoder/slicetype.cpp: + slicetype: fix 10bpp intra pixel preparations - found by valgrind + [fc90c9b265fd] + + * source/common/vec/intra-ssse3.cpp: + vec: remove 4x4 and 8x8 intra mode prediction functions, asm + coverage + [8c9e1b3564e8] + + * source/common/vec/intra-sse41.cpp: + vec: remove 4x4, 8x8, and 16x16 allangs functions; covered by + assembly + [bf4dbea1e4f5] + + * source/common/x86/asm-primitives.cpp: + asm: remove redundant macro definition + [ea99e4d138cd] + +2014-02-05 Praveen Tiwari + + * source/common/x86/asm-primitives.cpp, source/common/x86/intrapred.h, + source/common/x86/intrapred8.asm: + all_angs_pred_16x16, asm code + [906d972bb4b7] + +2014-02-04 Yuvaraj Venkatesh + + * source/common/x86/asm-primitives.cpp, + source/common/x86/intrapred8.asm: + asm: intra_pred_ang8 asm code for all modes + [669000ad4a0d] + +2014-02-04 Murugan Vairavel + + * source/common/x86/intrapred8.asm: 
+ asm: Modifications to intrapred16 modes 3, 4, 32 and 33 such that it + uses TRANSPOSE_STORE macro of intrapred32 + [cd73618857c5] + +2014-02-05 Steve Borho + + * source/CMakeLists.txt: + cmake: only officially support 16bpp builds on x64 architectures + [2f54c7616ef8] + +2014-02-05 Aarthi Thirumalai + + * source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h: + abr: reset ABR to prevent high bitrate peaks in single pass ABR + + Long series of blank frames in video followed by detailed content + causes heavy ABR underflow and overall bitrates surges high for a + long while. This patch detects this condition in Single pass ABR + mode and resets ABR - to not consider history from blank frames and + continue from following scene-cut. + [fc86625df0d9] + +2014-02-05 Gopu Govindaswamy + + * source/common/lowres.cpp: + lowres: initialize weightedCostDelta to avoid Valgrind reporting + uninitialized memory + [8d9abc152370] + + * source/encoder/slicetype.cpp: + cuTree: bug fix for frameCostRecalculate + [0776a6722375] + +2014-02-05 Steve Borho + + * source/encoder/encoder.cpp: + encoder: try not to leak memory after malloc failures + [def14c0234f5] + + * source/common/lowres.cpp: + lowres: fix gcc/clang compile errors + + gcc doesn't like to jump to labels that bypass variable + initializations, even if they are not used passed the jump + [e05898ee63c0] + +2014-02-05 Deepthi Nandakumar + + * source/common/x86/asm-primitives.cpp: + Backed out changeset: a88c7bbfba61 + + The addAvg x86 versions have now been fixed + [1374f1168c5c] + +2014-02-05 Satoshi Nakagawa + + * source/common/x86/mc-a.asm: + fix addAvg + [d20c11f2775f] + +2014-02-04 Steve Borho + + * source/common/lowres.cpp, source/common/lowres.h: + lowres: there was no need to pass an int* as an int32_t* + [76be476dfed5] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/Lib/TLibCommon/TComPicSym.cpp, 
+ source/Lib/TLibCommon/TComPicSym.h, + source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuv.h, source/common/lowres.cpp, + source/common/lowres.h, source/encoder/encoder.cpp, source/x265.cpp: + use checked mallocs in TComPic::create() and in functions it calls + + This is the most obvious place memory allocation failures will + occur, at encoder startup. So it is best to catch them cleanly. + [6d0b6602e730] + + * source/test/pixelharness.cpp: + pixelharness: actually report chroma addAvg as addAvg, not add_ps + [bed6e485d6c1] + + * source/test/pixelharness.cpp: + pixelharness: fix luma_addAvg tests + [43966a9a13ba] + + * source/common/x86/asm-primitives.cpp: + asm: disable x86 versions of addAvg - they are broken + [a88c7bbfba61] + + * source/common/common.cpp: + log: move lookahead options all together, rate-control at the end + (nit) + [51011b224a7c] + + * source/common/lowres.cpp: + lowres: initialize downscale planes immediately after alloc + + Valgrind was reporting potential uninitialized memory being used by + the lowres weightp cost estimate function. It is weighting the + entire padded luma plane instead of weighting just the real pixels + and then extending them. The problem is that the buffer stride is + wider than (width + padw*2). We round up the stride to the nearest + multiple of 32 so the row starts are well aligned, and those pixels + at the very right edge of the buffer were never written to. They + should never be used by the encoder, but the weight_pp primitive + does try to weight them; and the last step of the weight function is + a clip, and that conditional move is what triggered the valgrind + warning. 
+ [8d2b20447b8a] + + * source/test/ipfilterharness.cpp: + ipfilterharness: fix loop bounds, caused crashes in 16bpp + [c16c7b8416ee] + + * source/encoder/slicetype.cpp, source/encoder/slicetype.h: + slicetype: weight the extended lowres frame prior to MC cost + estimate + + valgrind spotted this problem where the top and bottom padded areas + of the lowres reference frame were not weighted into the weightedRef + buffer, so any lowres MVs used for MC that referenced past the top + or bottom picture boundary would access uninitialized pixels. + [592675e50c29] + + * source/common/wavefront.cpp, source/common/wavefront.h: + wavefront: rename bitmaps for clarity, add more comments + [23d30a6d4db1] + + * source/Lib/TLibCommon/CommonDef.h, source/common/common.h: + common: move malloc/free macros to our header + [0bd4e7603ea1] + + * source/common/common.cpp, source/x265.cpp: + cli: change --keyint short option to capital I, add min-keyint, + [no-]shortcut + + All these options now match x264's command line features + [2beb0bfb9503] + + * source/Lib/TLibCommon/TComTrQuant.cpp: + TComTrQuant: avoid bogus warning from VC9/VC10 + [4be0ca7b4448] + +2014-02-04 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/encoder/compress.cpp: + cleanup unused variables + [b54aa2713111] + +2014-02-03 Steve Borho + + * source/Lib/TLibEncoder/NALwrite.cpp: + NALWrite: simplify emulation detection + [ff430d39d428] + + * source/input/y4m.cpp: + Merge with stable + [f121e16811be] + + * source/input/y4m.cpp: + y4m: better handling of eof during frame header reads + + It wasn't checking the stream state properly (a bug introduced when + ifs was made into a pointer for stdin) and doing a memcmp against + possibly uninitialized stack memory, which just happened to usually + have a proper frame header in it since the reader thread stack use + is very predictable. 
+ + This was the last warning reported by valgrind on the stable branch + [4b8901ae94ec] + + * source/encoder/slicetype.cpp: + slicetype: use x265 naming scheme for auto vars and parameters + [930b251ac6b7] + + * source/input/y4m.cpp: + y4m: reorder functions for clarity + [8de9b432ba15] + + * source/encoder/slicetype.cpp, source/encoder/slicetype.h: + slicetype: replace ints and long ints with bools where appropriate + [b0a594fe1867] + + * source/input/y4m.cpp: + y4m: simplify guessFrameCount + [c111f1efc9c2] + +2014-02-03 Satoshi Nakagawa + + * source/Lib/TLibCommon/ContextTables.h, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSbac.h: + reduce context + [d5326d9bf1b2] + +2014-02-03 Steve Borho + + * source/encoder/ratecontrol.cpp: + ratecontrol: backout c4e99fde0b and fix indentation + [6aa952372175] + +2014-01-21 Yuvaraj Venkatesh + + * source/common/x86/asm-primitives.cpp, + source/common/x86/intrapred8.asm: + assembly code for intra_pred_ang8_5 + [2297a3777658] + +2014-02-03 Deepthi Nandakumar + + * Merge + [169a7d7c51ef] + +2014-01-20 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp, source/common/x86/intrapred.h, + source/common/x86/intrapred8.asm: + asm: code for intra_Pred_Ang16x16 mode 32 + [3131a2ac8ec6] + +2014-01-21 Yuvaraj Venkatesh + + * source/common/x86/asm-primitives.cpp, + source/common/x86/intrapred8.asm: + assembly code for intra_pred_ang8_4. 
+ [dc2de7c4f6c1] + +2014-01-20 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp, + source/common/x86/intrapred8.asm: + asm: code for intra_Pred_Ang16x16 mode 4 + [c50f78691043] + + * source/common/x86/asm-primitives.cpp, source/common/x86/intrapred.h, + source/common/x86/intrapred8.asm: + asm: code for Intra_pred_Ang16x16 mode 3 and 33 + [e9867f0a16a2] + +2014-02-03 Steve Borho + + * source/Lib/TLibCommon/CommonDef.h, + source/Lib/TLibCommon/TComBitStream.cpp, + source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComYuv.cpp, + source/Lib/TLibEncoder/NALwrite.cpp, source/common/TShortYUV.cpp, + source/common/common.cpp, source/common/lowres.cpp, + source/encoder/frameencoder.cpp, source/encoder/motion.cpp, + source/encoder/reference.cpp, source/encoder/slicetype.h, + source/encoder/weightPrediction.cpp, + source/test/intrapredharness.cpp, source/test/ipfilterharness.cpp, + source/test/mbdstharness.cpp, source/test/pixelharness.cpp: + common: change X265_MALLOC macro to return typed pointer + + One less opportunity for a stupid mistake + [a260f55429e3] + + * source/CMakeLists.txt: + cmake: treat empty CMAKE_SYSTEM_PROCESSOR as x86 (fixes #25) + [900a13b0b50a] + + * source/encoder/encoder.cpp: + follow x264's keyframe-min logic [CHANGES OUTPUTS] (closes #24) + + If no --keyint-min is specified, default to max/10, and clamp the + value to between [1, max / 2 + 1]. 
This allows x265 to insert I + frames when scene cuts are detected between keyframe-min and + keyframe-max + [898ccce491e9] + +2014-02-03 Dnyaneshwar G + + * source/Lib/TLibCommon/TComYuv.cpp, source/common/pixel.cpp, + source/common/primitives.h, source/common/x86/asm-primitives.cpp, + source/common/x86/const-a.asm, source/common/x86/mc-a.asm, + source/common/x86/pixel.h, source/test/pixelharness.cpp: + asm: code for addAvg luma and chroma all sizes + [71841b07b8ee] + +2014-02-03 Deepthi Nandakumar + + * source/encoder/frameencoder.cpp: + frameencoder: use macro MAX_MAX_QP + [55b4d5135e06] + +2014-02-03 Satoshi Nakagawa + + * source/encoder/frameencoder.cpp: + fix + [c89f04114391] + +2014-02-02 Steve Borho + + * source/Lib/TLibEncoder/TEncSbac.cpp, source/encoder/CMakeLists.txt: + cmake: remove two MSVC warnings overrides, HM code has been somewhat + sanitized + [aab88ed13364] + + * source/Lib/TLibCommon/TComPicYuvMD5.cpp, + source/common/CMakeLists.txt: + cmake: remove two MSVC warnings overrides, HM code has been somewhat + sanitized + [8150374cb0d2] + + * source/CMakeLists.txt, source/common/CMakeLists.txt, + source/encoder/CMakeLists.txt: + cmake: when x86 arch is detected, set gcc -march=i686 globally + + This enables a number of cleanups in the internal cmake scripts + [eff52bc89e94] + +2014-02-01 Steve Borho + + * source/CMakeLists.txt: + cmake: prevent warnings from recent builds of cmake + + cmake really should ignore cmake_policy(SET foo) if foo is not yet + supported, so every cmake user doesn't have to check the exact + version which introduced that backward compatibility option.. but... 
+ [bb33ab0f4ef9] + + * source/common/x86/x86inc.asm: + x86inc: Make ym# behave the same way as xm# + + x264 commit 0997c288be10 + [81f2c587a0a7] + + * source/common/x86/x86inc.asm: + x86inc.asm: allow x64 output format + + x264 commit 3361d59a0a83d + [683361fd76c2] + + * source/common/x86/x86inc.asm: + asm: pull in pengvado's header patch to speed up yasm compiles + [15f1f927bcfe] + +2014-01-29 Satoshi Nakagawa + + * source/Lib/TLibCommon/ContextTables.h, + source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibEncoder/TEncSbac.cpp: + reduce unused context models + [33929c36a646] + +2014-02-01 Steve Borho + + * source/CMakeLists.txt: + cmake: fix warning + [737ceb148a27] + + * source/CMakeLists.txt: + cmake: use strlower on CMAKE_SYSTEM_PROCESSOR + [9e2b076968e1] + + * source/CMakeLists.txt: + cmake: only allow assembly to be enabled for X86, our only asm + platform + [2a7ff626383d] + + * source/CMakeLists.txt: + cmake: reorg main file for readability, no behavior changes + [2812a45ace5c] + + * source/CMakeLists.txt, source/common/cpu.cpp, source/x265.h: + cpu: port ARM cpu detection code from x264 + [0e734b111b1e] + + * source/CMakeLists.txt: + cmake: add two more system processor names that are synonyms of x86 + [68f2d08654b9] + + * source/CMakeLists.txt, source/common/cpu.cpp, + source/common/primitives.cpp: + cmake: improve handling of unknown system processor + [7f1d29a897c1] + + * source/encoder/weightPrediction.cpp: + weightp: add math include for POSIX systems + [389328343ccd] + +2014-01-31 Steve Borho + + * source/encoder/ratecontrol.cpp: + ratecontrol: use X265_DEPTH instead of g_bitDepth + + On 8 bit builds, bit depth is known at compile time, allowing the + compiler to optimize away a few of these operations. 
+ [413ad959a5c6] + + * source/Lib/TLibCommon/ContextTables.h, + source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComPic.h, source/Lib/TLibCommon/TComRom.cpp, + source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/common/TShortYUV.cpp, + source/common/TShortYUV.h, source/common/common.cpp, + source/common/cpu.cpp, source/common/intrapred.cpp, + source/common/ipfilter.cpp, source/common/lowres.h, + source/common/pixel.cpp, source/common/primitives.cpp, + source/common/primitives.h, source/common/threadpool.cpp, + source/common/vec/intra-sse41.cpp, source/common/vec/intra- + ssse3.cpp, source/common/vec/vec-primitives.cpp, source/common/x86 + /asm-primitives.cpp, source/encoder/compress.cpp, + source/encoder/cturow.h, source/encoder/encoder.cpp, + source/encoder/encoder.h, source/encoder/frameencoder.cpp, + source/encoder/ratecontrol.cpp, source/encoder/slicetype.cpp, + source/encoder/slicetype.h, source/encoder/weightPrediction.cpp, + source/input/y4m.cpp, source/input/yuv.cpp, + source/test/intrapredharness.cpp, source/test/ipfilterharness.cpp, + source/test/pixelharness.cpp, source/x265.h: + uncrustify source (mechanical coding style enforcement) + + A few changes that uncrustify wanted to make have been left out of + the commit for style reasons. 
+ [9d0abf80eeb1] + + * source/encoder/weightPrediction.cpp: + weightp: cleanups + [fb048ad78e78] + + * source/encoder/weightPrediction.cpp: + weightp: vc11-win32-debug workarounds + [461316bc1dd5] + + * source/encoder/ratecontrol.cpp: + ratecontrol: add missing braces + [c4e99fde0b0b] + + * source/encoder/ratecontrol.cpp: + ratecontrol: white-space nits + [4aed055bd1ed] + + * source/encoder/slicetype.cpp, source/encoder/slicetype.h: + Merge with stable + [65003e385629] + +2014-01-30 Steve Borho + + * source/CMakeLists.txt, source/common/CMakeLists.txt, + source/common/cpu.cpp, source/common/primitives.cpp, + source/common/vec/vec-primitives.cpp, source/encoder/CMakeLists.txt: + cmake: attempt to support non-x86 compile targets + [8769cd7b97ac] + +2014-01-31 Steve Borho + + * source/CMakeLists.txt, source/common/CMakeLists.txt: + cmake: white-space nits + [58cff481d6ed] + + * source/test/ipfilterharness.cpp: + testbench: fix signed/unsigned comparison warning + [24e448ed4341] + +2014-01-30 Nabajit Deka + + * source/test/ipfilterharness.cpp, source/test/ipfilterharness.h, + source/test/testharness.h: + testbench: add stress test case for luma_pp filter function + [897067ac23ac] + + * source/test/pixelharness.cpp: + testbench: fix for random test bench failure caused by pixeladd_ss + [8f066e4e48e9] + +2014-01-31 Steve Borho + + * .hgtags: + Added tag 0.7 for changeset d24e2a8c4326 + [edf64ac976ea] + + * source/encoder/slicetype.cpp: + slicetype: comment nits + + Remove a comment copied from x264 that has no bearing in x265, and + fix the alignment of another comment. + [d24e2a8c4326] [0.7] + + * source/encoder/slicetype.cpp, source/encoder/slicetype.h: + slicetype: alloc wpScalingParam instance as a struct member + + This is a workaround for VC11. When x265 was compiled for debug + targeting Win32 the stack was being reported as corrupted by + weightCostLuma(). No other compiler or build option would report any + problems (not even valgrind). 
In the VisualStudio debugger the stack + would be obviously garbaged once the function was entered. Moving + `w` off of the stack makes the VC11 debugger happy again. + [86081bfcacf9] + + * source/encoder/slicetype.cpp: + slicetype: use explicit float type constant + [e04f2b3dea39] + + * source/encoder/slicetype.cpp: + slicetype: prevent divide-by-zero and sqrtf(0) + [3bc0651c0f40] + +2014-01-31 Praveen Tiwari + + * source/common/x86/pixel-a.asm: + asm: fix for potential mismach between ASM and no-ASM outputs + [539d1b0561b1] + +2014-01-30 Steve Borho + + * source/Lib/TLibCommon/TComPic.h, source/common/common.h, + source/encoder/encoder.cpp, source/encoder/encoder.h, + source/encoder/frameencoder.cpp, source/encoder/frameencoder.h: + Merge with stable + [eb3713ab0641] + +2014-01-28 Steve Borho + + * source/common/threadpool.cpp: + threadpool: use a wait event per worker thread + + For simplicity, this patch caps the number of worker threads to 64. + The bitmap could be trivially extended if necessary. + + This removes the common wake event, which complicated startup and + shutdown and flush events. + [6fe8d1d519f7] + +2014-01-30 Steve Borho + + * source/encoder/cturow.h, source/encoder/frameencoder.cpp: + cturow: detect and prevent simultaneous row access + + Temporary workaround until we are certain the findJob() race hazards + are indeed resolved completely. + [564eefbb3812] + +2014-01-28 Steve Borho + + * source/Lib/TLibCommon/TComPic.h, source/encoder/encoder.cpp, + source/encoder/encoder.h, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h, source/encoder/framefilter.cpp, + source/encoder/framefilter.h: + encoder: refactor frame encoder recon row synchronization + + The previous approach depended on a common event (owned by TComPic) + being triggered multiple times for each row, one trigger per + referencing frame, but I believe this was fragile as one frame + encoder could steal notifications from another. 
+ + In the new scheme, each frame encoder waits on its own sync event + when it blocks for recon pixels. When a frame encoder finishes + reconstructing a CU row, it calls a top-level encoder function which + determines if any frame encoders are blocked on that POC and wakes + them up. + + This should prevent deadlocks from frame encoder synchronization + [4a4c4cbe9c67] + +2014-01-30 Steve Borho + + * source/common/wavefront.cpp: + wavefront: eliminate redundant reads of m_queuedBitmap + [6d5f2f61341a] + + * source/common/wavefront.cpp: + wavefront: use x265_malloc for bitmaps, to ensure alignment + [adf571b1bb94] + + * source/Lib/TLibCommon/CommonDef.h, source/common/common.h: + common: consolodate malloc/free funcdefs to common.h + [71f6479dc354] + +2014-01-30 Deepthi Nandakumar + + * source/common/x86/asm-primitives.cpp, source/encoder/encoder.cpp: + Merge bug fixes from stable. + [fffdf3dce410] + +2014-01-30 Yuvaraj Venkatesh + + * source/common/x86/sad-a.asm: + asm: modified pixel_sad asm function to avoid overflow + [b852f74bdd8c] + + * source/common/x86/intrapred16.asm, source/test/intrapredharness.cpp: + asm: fixed hash mismatch on 16bpp due to intra_pred_ang + [c0ec570c0105] + +2014-01-29 Yuvaraj Venkatesh + + * source/common/x86/pixel-a.asm: + asm: fix for 32-bit build satd overflow issue. 
+ [86743912a5b0] + +2014-01-30 Deepthi Nandakumar + + * source/encoder/encoder.cpp: + log: print Summary for per-frame logging + [e879873ce926] + + * source/encoder/encoder.cpp: + log: print ssim(dB) in per-frame csv logging + [46aa0de4a8da] + +2014-01-29 Yuvaraj Venkatesh + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm: + asm: fix overflow due to pixel_satd asm function for 64-bit build + [d6091cb46ae1] + +2014-01-28 Steve Borho + + * source/encoder/encoder.cpp: + nit: line up WPP log info with other config items + [4ec459e04f9e] + + * source/encoder/weightPrediction.cpp: + weightp: fix lowresMvCosts[] indexing, add comment for future work + [8552e8cc1a3c] + + * source/encoder/compress.cpp: + Merge with stable + [923edbb08a59] + +2014-01-28 Deepthi Nandakumar + + * source/encoder/compress.cpp: + compress: insert check for merge MV candidates. + [7f4537c4db7a] + +2014-01-28 Aarthi Thirumalai (aarthi + + * source/encoder/ratecontrol.cpp: + rc: bug fix in crf mode ;correct qscale set for all the frames. 
+ [854ff1616d38] + +2014-01-28 Kavitha Sampath + + * source/encoder/weightPrediction.cpp, + source/encoder/weightPrediction.h: + weightP: fix crash due to access of lowres array + + fix crash due to access of lowres array of references with invalid + mvs in weightCost + [728f31cc6eee] + +2014-01-28 Steve Borho + + * source/common/common.cpp, source/encoder/slicetype.cpp, + source/x265.cpp: + Merge with stable + [3568c1b19947] + + * source/encoder/slicetype.cpp: + slicetype: fix initial threshold passed to slicetypePathCost + [ddd4e4e328d2] + +2014-01-27 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp: + cleanup g_convertTxtTypeToIdx[] + [633421b8faf6] + +2014-01-28 Deepthi Nandakumar + + * source/encoder/weightPrediction.cpp: + weightP: when difPoc is large, prepare to avoid MC - part 2. Patch + from Kavitha + [cb4b0033c7fc] + + * source/encoder/weightPrediction.cpp: + weightP: when difPoc is large, prepare to avoid MC. Patch from + Kavitha broken up. + [3be5ca58d9d2] + + * source/encoder/frameencoder.cpp: + weightP: build error fix + + Inconsistency due to import from stable branch + [4ead340677bf] + + * source/x265.cpp: + x265: allow only 10-bit input depths in HIGH_BIT_DEPTh builds + [dd0ef09680fe] + +2014-01-27 Steve Borho + + * source/common/common.cpp: + common: prevent 8bit encodes with HIGH_BIT_DEPTH builds + + Some of the interpolation 16bpp assembly routines make assumptions + that the encode depth is 10bits, so HIGH_BIT_DEPTH builds will + generally cause decoder hash mismatches if they encode 8bpp streams. + Prevent this until we have a proper long term solution. 
+ [773b87c2855c] + + * source/common/common.cpp, source/encoder/frameencoder.cpp, + source/encoder/weightPrediction.cpp: + Merge with stable + [3c2441447f2a] + + * source/encoder/frameencoder.cpp: + frameencoder: remove commented function which no longer exists + + compressMotion() was removed 4 months ago + [26a5e720f290] + + * source/common/common.cpp, source/encoder/frameencoder.cpp: + me: add one more pixel lag for DIA search's relaxed search bounds + [dc4e57833aae] + + * source/common/common.cpp: + common: do not allow encodes other than 4:2:0 to start + [10fc60881bbf] + +2014-01-24 Kavitha Sampath + + * source/encoder/weightPrediction.cpp, + source/encoder/weightPrediction.h: + WeightPrediction: check difPoc <= bframes+1 to allow weight analysis + for valid references + [a02a028afea9] + +2014-01-27 Deepthi Nandakumar + + * source/common/wavefront.h: + wavefront: add missing initializer + [4fcfa56420fb] + +2014-01-24 Min Chen + + * source/common/x86/pixel-a.asm: + fix SATD32x32 16bits cumulate sum overflow (x86 version only) + [83767892376d] + +2014-01-27 Steve Borho + + * source/common/common.cpp, source/encoder/frameencoder.cpp: + Merge with stable + [b59b1e579f78] + +2014-01-25 Satoshi Nakagawa + + * source/Lib/TLibCommon/ContextTables.h, + source/Lib/TLibEncoder/TEncSbac.cpp: + HM: context table + [1ea8a52b4f54] + + * source/Lib/TLibCommon/TComMotionInfo.h, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + cleanup TComCUMvField::m_cAMVPInfo + [2fc36d0fd1b2] + +2014-01-27 Satoshi Nakagawa + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + magic number + [ed2e9fe1a732] + +2014-01-26 Satoshi Nakagawa + + * source/Lib/TLibCommon/ContextTables.h, + source/Lib/TLibEncoder/TEncSbac.cpp: + reduce unused context models + [e0c81b78e529] + 
+2014-01-27 Steve Borho + + * source/common/common.cpp, source/encoder/frameencoder.cpp: + me: add a two pixel pad to the max ME range when calculating + reference lag + + We must account for subpel refine search range when calculating how + many rows of reference frames must be encoded ahead of the current + frame. Without this we saw non-deterministic decoder hash mismatches + with some videos. + [b173809575c6] + +2014-01-24 Steve Borho + + * source/input/y4m.cpp, source/input/yuv.cpp: + input: add build flag to disable read thread for debug purposes + [1ac9148a3661] + + * Merge with stable + [237bf6667405] + + * source/input/yuv.cpp, source/input/yuv.h: + yuv: support colorspaces in YUV input files (closes #13) + [13dac38f54ac] + + * source/input/y4m.cpp, source/input/yuv.cpp: + input: use ifstream::good() instead of !ifstream::fail() + + good() implies bad, fail, and eof flags are all false. The fail() + flag does not include eof status or the bad bit for I/O errors. + [dba087c3613b] + + * source/input/y4m.cpp: + y4m: prevent infinite loop on malformed y4m frame headers + [9867ebc4b164] + +2014-01-24 Nabajit Deka + + * source/test/ipfilterharness.cpp: + asm : Fix for luma_vss test bench failure + [7f09cfcf176c] + +2014-01-24 Steve Borho + + * source/encoder/slicetype.cpp: + Merge with stable + [7cd3a3195598] + + * source/encoder/slicetype.cpp: + slicetype: prevent compiler warnings about uninitialized variables + [2a2e5711f63b] + + * source/common/lowres.cpp, source/encoder/slicetype.cpp, + source/encoder/slicetype.h: + Merge with stable + [047fd4670cbe] + +2014-01-24 Satoshi Nakagawa + + * source/encoder/slicetype.cpp: + fix b-pyramid for fixed GOP + [ffee1032eaed] + +2014-01-24 Steve Borho + + * source/encoder/reference.cpp: + reference: pad width of weighted region to multiple of 16 + [2ab2bb459d1a] + + * source/common/lowres.h, source/encoder/slicetype.cpp, + source/encoder/slicetype.h: + nits + [c02e59ab4ee9] + + * source/encoder/slicetype.cpp: + 
slicetype: fix bcost behavior with signed ints + [0431eb3404e7] + +2014-01-24 Deepthi Nandakumar + + * source/common/lowres.cpp, source/common/lowres.h, + source/encoder/ratecontrol.h, source/encoder/slicetype.cpp, + source/encoder/slicetype.h: + slicetype: change all costs to int64_t + [8dd9b7ac5b61] + + * source/common/common.h: + Merge with stable + [807495b7a9fc] + + * source/common/common.h: + lowres: fix msys compile error + [f45d9772cc40] + +2014-01-23 Steve Borho + + * source/encoder/slicetype.cpp: + nit + [23c65133c555] + + * source/common/lowres.cpp, source/encoder/slicetype.cpp: + white-space and other nits + [438a2258504d] + + * source/encoder/slicetype.cpp: + slicetype: repair I frame placement with lookahead disabled + + if ((true || foo) && bar) properly degrades to if (bar) + [f0ae3d8d2b03] + + * source/common/common.cpp: + preset: re-disable lookahead at ultrafast preset + [c2af7808ef8d] + + * source/encoder/slicetype.cpp, source/encoder/slicetype.h: + Merge with stable + [099a71435329] + + * source/common/lowres.cpp, source/common/lowres.h, + source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h, + source/encoder/slicetype.cpp, source/encoder/slicetype.h: + slicetype: upgrade frame cost variables to uint64_t + [629d0a685dcb] + +2014-01-24 Deepthi Nandakumar + + * source/test/pixelharness.cpp: + testbench: added the chroma_addAvg primitives. 
+ [5460e85ae178] + +2014-01-23 Steve Borho + + * source/encoder/slicetype.cpp, source/encoder/slicetype.h: + slicetype: remove unused numDecided member variable + [f2766083c252] + + * source/encoder/slicetype.cpp, source/encoder/slicetype.h: + Merge with stable + [416713ab5821] + +2014-01-21 Steve Borho + + * source/common/lowres.h, source/encoder/reference.cpp, + source/encoder/slicetype.cpp, source/encoder/slicetype.h: + slicetype: reorg to make frame cost estimates re-entrant (for RC and + WP) + + * moves frame cost calculations into a separate class deriving from + WaveFront + * frames[] array now always declared on stack for better re-entrant + behavior + * re-orders functions in slicetype.cpp for clarity + * internal methods were made protected, for documentation purposes + only + * fixes a few minor problems discovered during the reorg + * removes deprecated ReferencePlanes.unweightedFPelPlane + [e9ec7787cf5e] + +2014-01-22 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComPattern.h, + source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/compress.cpp: + cleanup initPattern() + [8732434ea913] + +2014-01-20 Satoshi Nakagawa + + * source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp: + reduce CI_NUM + [6a12fc6e6fdc] + +2014-01-23 Steve Borho + + * source/encoder/slicetype.cpp: + Merge with stable + [cbd2ea65f87d] + + * source/encoder/slicetype.cpp: + slicetype: issue EMMS before returning a thread to the work pool + [a42dd0dfe90d] + + * source/common/primitives.h: + primitives: move luma_addAvg close to other weight/avg functions + [7aa3ea411568] + + * source/common/primitives.h: + primitves: fix a comment + [5bf76ded5209] + + * source/common/primitives.h: + primitive: remove two dead funcdefs + [a4c0b6e35dd1] + + * source/common/ipfilter.cpp, source/common/primitives.h, + source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter8.asm, source/common/x86/ipfilter8.h, + 
source/test/ipfilterharness.cpp, source/test/ipfilterharness.h: + primitive: remove dead ipfilter_sp and ipfilter_ss + [ee36ffef7648] + + * source/common/ipfilter.cpp, source/common/primitives.h, + source/test/ipfilterharness.cpp, source/test/ipfilterharness.h: + primitive: remove dead ipfilter_ps + [688f3951c90c] + + * source/test/pixelharness.cpp: + pixelharness: remove dead primitive test + [177421068f53] + +2014-01-23 Nabajit Deka + + * source/Lib/TLibCommon/TComPrediction.cpp: + asm : Hook up chroma_vps and chroma_vss with the encoder. + [356ec7ff8a68] + + * source/Lib/TLibCommon/TComPrediction.cpp: + asm : Hook up luma_vps and luma_vss with the encoder. + [f5ab67ba11b7] + +2014-01-23 Deepthi Nandakumar + + * source/Lib/TLibCommon/TComYuv.cpp, source/common/pixel.cpp, + source/common/primitives.h: + asm-primitives: addAvg, fix bug pointed out by Ashok. + [f1bd676fd90f] + +2014-01-22 Steve Borho + + * source/common/CMakeLists.txt, source/common/ipfilter.cpp, + source/common/primitives.h, source/common/vec/ipfilter-sse41.cpp, + source/common/vec/vec-primitives.cpp, + source/test/ipfilterharness.cpp: + vec: drop unused vectorized chroma_vsp primitive, ipfilter-sse41.cpp + [29c1940c8acb] + +2014-01-22 Nabajit Deka + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/encoder/weightPrediction.cpp: + asm : Hook up chroma_vsp with the encoder. 
+ [d2bfe01c0f29] + +2014-01-22 Deepthi Nandakumar + + * Merge from stable + [d8a1e3bebdf8] + +2014-01-22 Kavitha Sampath + + * source/encoder/weightPrediction.cpp: + WeightPrediction: Do not use lowres MV/MVcost for invalid + MVs/MVcosts + [9497c55d7be2] + + * source/encoder/weightPrediction.cpp: + weightp: fix hash mismatch when --ref > 3 + [d56dd4d8e08b] + +2014-01-21 Steve Borho + + * source/encoder/slicetype.cpp: + Merge with stable + [0f0ad4c094bd] + + * source/encoder/slicetype.cpp: + slicetype: fix cuTree mv indexing (bug found by herman.chen@rock- + chips.com) + + This drops the bitrate almost 20% and the SSIM from 0.4-0.9 dB. I + believe this needs rebalancing. + [3cf5a75a8002] + + * source/input/yuv.cpp: + yuv: skip frames one at a time to prevent offset overflow + [e12bb1346bef] + +2014-01-21 Deepthi Nandakumar + + * Merge + [ce41ee0f5c8c] + +2014-01-17 Yuvaraj Venkatesh + + * source/common/x86/asm-primitives.cpp, + source/common/x86/intrapred8.asm: + assembly code for intra_pred_ang8_3 + [b51c1866363d] + +2014-01-20 Steve Borho + + * source/x265.cpp: + cli: tweak aq-strength CLI help + [950c9a864cb6] + + * source/encoder/slicetype.cpp: + slicetype: white-space fixes + [925e612b0591] + + * source/encoder/encoder.cpp: + encoder: fix the slicetype char table + [7bfd1b01953c] + + * Merge with stable + [b5b7d8e64024] + +2014-01-20 Gopu Govindaswamy + + * source/Lib/TLibEncoder/NALwrite.cpp: + Nalwrite: removed EMULATION_SIZE macro and calculate the + emulationSize from Encoded bitstream size + [21a5fb7ab965] + +2014-01-17 Aarthi Thirumalai + + * source/encoder/slicetype.cpp, source/encoder/slicetype.h: + rc: avoid issues from zero-residual lookahead blocks, introduce a + small bias + [ffb53cd1f953] + + * source/encoder/encoder.cpp, source/encoder/slicetype.cpp: + lookahead: call sliceTypeAnalyse when necessary + + call sliceTypeAnalyse even when cutree is on or lookaheadDepth or + scenecutThreashold > 0 + + performs lookahead when lookaheadDepth > 0, 
activates lookahead for + cutree when b-adapt/bframes =0 and cutree is set , also enables + scenecut for bframes = 0 cases. improves psnr/ssim by .5 dB. + [bca352c8689e] + +2014-01-20 Deepthi Nandakumar + + * source/common/common.h: + NALwrite: remove unused macro + [cf79f89c783c] + +2014-01-20 Gopu Govindaswamy + + * source/Lib/TLibEncoder/NALwrite.cpp: + Nalwrite: removed EMULATION_SIZE macro and calculate the + emulationSize from Encoded bitstream size + [356d91e22b25] + +2014-01-17 Min Chen + + * source/common/x86/asm-primitives.cpp, + source/common/x86/intrapred8.asm: + asm: IntraAng32x32 Mode[17] + [9f7fca027b41] + +2014-01-18 Dnyaneshwar G + + * source/test/pixelharness.cpp, source/test/pixelharness.h: + testbench support for addAvg primitive + [56ce4f7669c6] + +2014-01-17 Dnyaneshwar G + + * source/Lib/TLibCommon/TComYuv.cpp, source/common/pixel.cpp, + source/common/primitives.h: + primitive function for luma and chroma for loops in addAvg(). + [c88314c4a1a1] + +2014-01-09 Shazeb Nawaz Khan + + * source/encoder/frameencoder.cpp, source/encoder/frameencoder.h: + Re-enabling new weightp + [e90e39c3a035] + +2014-01-17 Steve Borho + + * source/common/common.cpp: + common: do not report ssim costs by default + + It costs CPU cycles to measure SSIM, do not do this unless the user + asks for them with --ssim + [385560ac328d] + + * source/common/common.cpp: + common: remove trailing white-space + [299bbf5f06c2] + + * source/encoder/encoder.cpp: + white-space fixes, reorder for clarity + [838e485c6365] + + * source/encoder/encoder.cpp: + update/fix comments + [c8c8a0273eff] + + * source/common/common.cpp, source/encoder/encoder.cpp: + move param fixups to Encoder::configure() + [7855cee45b8c] + +2014-01-15 Dnyaneshwar G + + * source/common/x86/asm-primitives.cpp, source/common/x86/intrapred.h, + source/common/x86/intrapred8.asm: + asm: code for intra_pred[BLOCK_32x32] mode 2 and 34 + [1d7ea03e1a38] + +2014-01-16 Steve Borho + + * source/encoder/encoder.cpp: + stats: 
simplify slice type lookup + [3d747041271f] + +2014-01-16 Xun Xu, PPLive Corporation + + * source/CMakeLists.txt, source/Lib/TLibCommon/TComPic.h, + source/encoder/encoder.cpp, source/encoder/encoder.h, + source/encoder/slicetype.cpp, source/x265.h: + add dts to x265_picture, handle same as x264 + [57b3238680c5] + +2014-01-16 Steve Borho + + * source/Lib/TLibEncoder/TEncSbac.cpp: + TEncSbac: remove hungarian prefixes from loop vars + [243b01e81109] + +2014-01-12 Satoshi Nakagawa + + * source/encoder/compress.cpp: + remove duplicate code + [188617e76d60] + +2014-01-15 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibEncoder/TEncSbac.cpp: + fix blockCbpBits[] + [04aae8fd88a0] + +2014-01-15 Min Chen + + * source/Lib/TLibCommon/TComDataCU.cpp: + cleanup initCU() + [37b4ca796088] + +2014-01-15 Steve Borho + + * source/x265.cpp: + x265: help nit + [e5d28e2c5a82] + +2013-12-17 Kavitha Sampath + + * source/common/common.cpp, source/encoder/dpb.cpp, + source/encoder/slicetype.cpp, source/x265.cpp, source/x265.h: + slicetype: remove --refresh and use --open-gop(default: enable) + [27c2dac98a3c] + +2014-01-14 Steve Borho + + * Merge with stable + [7af141be0e7a] + +2014-01-09 Satoshi Nakagawa + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp: + cleanup MVPNum + [e7624ab39cff] + +2014-01-14 Steve Borho + + * source/common/wavefront.cpp: + wavefront: bug fix + [9e923f539d89] + +2014-01-13 Steve Borho + + * source/common/wavefront.cpp: + wavefront: consider enabled bitmap status in + checkHigherPriorityRow() + [8e0fa5fcbf15] + + * source/Lib/TLibCommon/TComYuv.cpp: + TComYuv: pad chroma allocations, fix valgrind warnings + [aae31685d8c7] + + * source/Lib/TLibCommon/TComPicYuv.cpp: + TComPicYuv: add a row and col of padding for lowres interpolation + [8d3cdf1a846e] + + * source/CMakeLists.txt, source/common/CMakeLists.txt: + cmake: tweak 
order of compiles to improve parallel build times + + Move assembly and intrinsic files to front of the build + [3bc604fdd380] + + * Merge with stable + [5a607dd446ea] + +2014-01-13 Min Chen + + * source/Lib/TLibCommon/ContextTables.h: + HM: Fix for #576: Context table for CBF + [c9cefa67691c] + + * source/Lib/TLibCommon/ContextTables.h: + HM: Fix for #501: Decoding part_mode with inter_4x4 can use CNU + context + [de98453fa608] + +2014-01-09 Shazeb Nawaz Khan + + * source/Lib/TLibCommon/TComSlice.h, + source/encoder/weightPrediction.cpp, + source/encoder/weightPrediction.h: + fix for hash mismatch in new weightp + [cd6c34bb4172] + +2014-01-10 Steve Borho + + * Merge with stable + [b2e7d8da2838] + +2014-01-09 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp, source/common/x86/intrapred.h, + source/common/x86/intrapred8.asm: + asm: code for intra_pred[BLOCK_16x16] mode 2 and 34 + [c5aa7ae59fc7] + +2014-01-09 Yuvaraj Venkatesh + + * source/common/x86/asm-primitives.cpp, source/common/x86/intrapred.h, + source/common/x86/intrapred8.asm: + asm: intra_pred_ang8_2 asm code + [acbe568e7366] + +2014-01-07 Min Chen + + * source/common/x86/ipfilter8.asm: + improvement interpolate_H_pp + [b2a0cfe4837b] + +2013-12-28 Min Chen + + * source/Lib/TLibEncoder/TEncEntropy.cpp: + cleanup reduce condition check for getUseDQP() + [a03cc8c4d739] + +2014-01-08 Steve Borho + + * source/VectorClass/README.txt, source/VectorClass/instrset.h, + source/VectorClass/vectorclass.h, source/VectorClass/vectori128.h, + source/VectorClass/vectori256.h, source/VectorClass/vectori256e.h, + source/common/vec/ipfilter-ssse3.cpp, source/common/vec/pixel- + sse41.cpp, source/common/vec/pixel-ssse3.cpp, + source/common/vec/pixel16-sse41.cpp, + source/common/x86/intrapred.asm, source/common/x86/pixel-util.asm, + source/test/testpool.cpp: + Merge with default, prepare for 0.7 tag + [0d70188e80bc] + +2014-01-07 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel- + 
util8.asm: + asm: fix memory access violation due to scale2D_64to32 + [c4edab8dab65] + +2014-01-07 Min Chen + + * source/common/x86/ipfilter8.asm: + correct number of xmm register on interp_8tap_horiz* + [ca7bde495318] + +2014-01-06 Steve Borho + + * source/Lib/TLibCommon/TComPrediction.cpp: + TComPrediction: simplify luma intra prediction function + [4811da38078c] + + * source/Lib/TLibCommon/TComPrediction.cpp: + wtf? a useless comment and if()/else() with two identical + statements? + [c1cf926c20e0] + + * source/Lib/TLibCommon/TComBitStream.cpp: + TComBitStream: fix loop bounds so we do not check past end of buffer + [bd9b395c80c7] + + * .hgignore: + ignore vim swap files + [6d40ab7be379] + + * source/Lib/TLibCommon/TComBitStream.cpp: + TComBitstream: simplify and streamline start code checks + [e1ee0fc31e79] + + * source/Lib/TLibCommon/TComBitStream.cpp: + TComBitStream: rename variables for clarity + + There was no point making cnt an unsigned variable when the return + value is signed, this just adds more compiler warnings + [324d99e3d6ac] + + * source/encoder/motion.cpp: + motion: add early out for subpel refine if bcost is already zero + [63d6b04fe201] + + * source/encoder/slicetype.cpp: + slicetype: better prevention for compiler warnings and misbehaviors + [54835bf61c11] + + * source/common/x86/asm-primitives.cpp: + asm: disable x265_scale2D_64to32_ssse3, DUMA finds access violations + + I tried simple buffer padding workarounds, adding 16 bytes at the + start and end of bufScale, but it was still causing the access + violation. 
+ [d4bef967ae10] + +2014-01-01 Steve Borho + + * source/Lib/TLibCommon/TComSlice.h: + slice: nits + [abd4da45823c] + + * source/encoder/weightPrediction.cpp: + weight: clarify max denom adjustments + [75f7a9434289] + + * source/encoder/weightPrediction.cpp: + weight: nits + [89e57c446a81] + + * source/encoder/weightPrediction.cpp: + weight: alloc intermediate weight buffer once per plane + [c35d653ab515] + + * source/encoder/weightPrediction.cpp, + source/encoder/weightPrediction.h: + weight: use m_ prefix consistently for all member variables + [d809090f70ad] + + * source/encoder/weightPrediction.h: + weightPrediction: remove unused member variables, fix shadow warning + [0715d03808f3] + +2014-01-06 Min Chen + + * source/encoder/frameencoder.cpp: + fix every execute output different bitstream when SAO enabled + [99f28c405b5c] + +2014-01-03 Steve Borho + + * source/input/y4m.cpp: + y4m: use loop to skip frames, avoid 32bit size wrap problems + [f96c85f03b77] + +2014-01-02 Deepthi Nandakumar + + * source/encoder/frameencoder.cpp, source/encoder/frameencoder.h: + Backed out changeset: revert to HM-based weightP + [8137881d4cad] + +2013-12-30 Shazeb Nawaz Khan + + * source/encoder/frameencoder.cpp, source/encoder/frameencoder.h: + Integrating new weight analysis in encoder + [dcae0b69d9b3] + + * source/encoder/CMakeLists.txt, source/encoder/weightPrediction.cpp, + source/encoder/weightPrediction.h: + Importing x264 weight analysis to encoder + [affdfa4b5537] + + * source/encoder/slicetype.cpp, source/encoder/slicetype.h: + Moving macro to header + [f5427379b40d] + +2013-12-30 Aarthi Thirumalai + + * source/encoder/ratecontrol.cpp: + rc: bug fix to improve quality for the first I frame. + [c86f18d7eb2a] + +2013-12-30 Deepthi Nandakumar + + * source/common/common.cpp: + common: tune-ssim sets aqmode as AUTO_VARIANCE (2). Gives higher + ssim. 
+ [c561cd778ef5] + +2013-12-27 Aarthi Thirumalai + + * source/encoder/ratecontrol.cpp, source/encoder/slicetype.cpp: + cutree: bug fixes. correct the timescale used in getQScale() + [964e5bc90ad2] + +2013-12-27 Deepthi Nandakumar + + * source/common/common.cpp: + aq, cutree: completely turn of AQ, at fastest presets + + Prevent AQ from running unnecessarily with strength 0. + [8b5c5fe7fbc9] + + * source/common/common.cpp: + aq, cutree: preset changes + + 1. Disable AQ at ultrafast/superfast presets 2. Disable CUTree at + ultrafast/superfast/veryfast/faster presets (requires b-adapt to be + non-zero). 3. tune-psnr disables only AQ, CUtree stays enabled (at + appropriate presets). + [cb2a18cc1d14] + +2013-12-26 Murugan Vairavel + + * source/common/x86/blockcopy8.asm: + asm: fix for mismatch in 10bpp block copy + [0210e9c4a6f9] + +2013-12-24 Aarthi Thirumalai + + * source/common/common.cpp, source/encoder/ratecontrol.cpp, + source/encoder/ratecontrol.h, source/x265.h: + aq: Add AQ_AUTO_VARIANCE feature for performing adaptive + quantization. + [2f83ed50ed9a] + +2013-12-25 Deepthi Nandakumar + + * source/encoder/encoder.cpp, source/x265.cpp: + encoder: nits + [08bad990fd66] + +2013-12-24 Deepthi Nandakumar + + * source/encoder/encoder.cpp: + csv: formatting ssim output + [d74f2e0856b4] + + * source/common/common.cpp, source/x265.cpp: + common: change tune-ssim to default. When tune-psnr is enabled, AQ + and CUTree are turned off. + [cb48e00e1fd8] + +2013-12-23 Deepthi Nandakumar + + * source/encoder/encoder.cpp: + encoder: increase precision, add ssim db info to csv output. + [bb7057e7ed6d] + + * source/common/common.cpp: + common: [OUTPUT CHANGES]: change default ratecontrol mode to CRF-28. + AQ on (strength 1.0), CUTree ON. SSIM reporting On, PSNR off. 
+ [aa2856a374a1] + +2013-12-18 Deepthi Devaki + + * source/encoder/compress.cpp: + rd: optimize rd 0 + [3e98f0e66e0b] + + * source/encoder/compress.cpp: + rd: fix incorrect check for rdlevel + + Merge-skip check should be only for rdlevel 2 + [c69ef62da2b6] + + * source/encoder/compress.cpp: + rd: modify recon generation in rd 0 to have lesser mem copies + [edeccf4c6cdd] + + * source/Lib/TLibCommon/TComDataCU.cpp: + rd: remove unnecessary mem copies + [f766e7c3b165] + +2013-12-17 Deepthi Devaki + + * source/encoder/compress.cpp: + compress: clean up + [c1802fef73b4] + + * source/encoder/compress.cpp: + rd: move merge-skip check before other modes are checked + + in rd 1,0 merge-skip is chosen only based on sa8d threshols. + Checking other Inter/inter modes is unnecessary overhead. + [06ea3901bf35] + + * source/encoder/compress.cpp: + rd: fix error in merge-skip identfication in rd 1,0 + + Cost was not calculated if the best mode is merge-skip in rd 1,0 + [526c10b6a808] + + * source/encoder/compress.cpp: + compress: remove unused sbaccoder load + [ee272784fa6e] + + * source/encoder/compress.cpp: + compress: remove redundant initializations + + depth is already initialized in initsubcu + [3aa3ed0fefe8] + + * source/encoder/compress.cpp: + compress: remove unnecessary code + + reco and resi buffers is not used in this function + [8c90ad61a306] + + * source/encoder/compress.cpp: + compress: remove redundant initializations + + skipFlags are set to false during initialization. + [e476d497f3cd] + + * source/encoder/compress.cpp: + rd: check DQP before best cu is copied to the pic in rd 0 + [f9c427b44459] + + * source/Lib/TLibCommon/TComDataCU.cpp, source/encoder/compress.cpp: + rd: remove redundant checkDQP and add checkDQP for rd 0 + + checkDQP is called on bestCU, so no need to call it after each + encode. For rd0, call checkDQP after residual encode at depth 0. 
+ [5828f199d745] + +2013-12-19 Shazeb Nawaz Khan + + * source/encoder/ratecontrol.cpp: + Consider rounded width, height for ssd calculation + [8133378e2250] + + * source/Lib/TLibCommon/TComPicYuv.cpp: + Fix typo in extending height to round to multiple of 16 + [146051557e0d] + +2013-12-18 Gopu Govindaswamy + + * source/encoder/encoder.cpp, source/x265.cpp, source/x265.h: + encoder: print global ssim in db + [60c2a49909f2] + +2013-12-18 Rafaël Carré + + * source/CMakeLists.txt: + cmake: .pc file is useful as well when using static libs + [1c37c4401ea4] + + * source/cmake/version.cmake: + cmake: take revision from git if within a git repo + [54445c440ea0] + + * source/cmake/version.cmake: + cmake: verify existance of hg repo + [387cd1635278] + +2013-12-18 Deepthi Nandakumar + + * source/common/common.cpp: + common: tune-ssim turns on aq strength + [09d93b8d5994] + + * source/common/common.cpp: + common: adjust tabs + [7f1da6982201] + + * source/common/common.cpp: + common: update print params and defaults. + + By default, tune-psnr is turned on, which means CUTree is enabled + but AQ strength is zero. + [3055eacc88ae] + + * source/common/common.cpp: + common: this check taken care of in encoder::configure() + [7db25026df8e] + + * source/common/common.cpp: + common: enable CUtree by default. tune=psnr is the default setting + [06052d963404] + + * source/common/common.cpp: + common: remove meaningless warning + + aq=1 is required for cutree. Therefore, aq=1, aqstrength=0 is a + valid case for all configs that want cutree but do not want aq (eg, + tune-psnr). + [b9b1533b52d5] + + * source/encoder/encoder.cpp: + encoder: increase precision for SSIM printf + [381ff6bcade6] + +2013-12-18 Gopu Govindaswamy + + * source/common/common.cpp, source/encoder/ratecontrol.cpp, + source/encoder/slicetype.cpp, source/x265.cpp: + cutree: Enable CUTree for ratecontrol when tune=ssim and bug fix for + cutree + + 1. added cutree into cli option, default cutree is disabled 2. 
+ --tune=ssim then aq and cutree is enabled 3. Fixed bug in slicetype- + frameCostRecalculate() the qp offset for B and B-REF from lowres + qpAqOffset, for non-B from lowres qpoffset 4. Fixed bug in + Ratecontrol-getQScale - the clip duration should (frameduration * + timescale) when cutree is enabled + [6edecf959a30] + +2013-12-17 Deepthi Devaki + + * source/Lib/TLibEncoder/TEncCu.h, source/encoder/compress.cpp: + rd: remove unnecessary sbaccoder loads + [37e0b00adeec] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + rd: fix wrong chroma mode used for mode decision + [119d09b54b95] + +2013-12-16 Deepthi Devaki + + * source/Lib/TLibEncoder/TEncSearch.cpp: + rd level: use cu coeff array while computing Intra coefficients + instead of temp buffer + [37c57d47a9d4] + +2013-12-17 Min Chen + + * source/common/x86/intrapred8.asm: + fix yasm warning 'trailing garbage after expression' + [e1465ef2659c] + + * source/common/x86/pixel-util8.asm: + fix crash in pixel_var_16x16_internal, because + RateControl::acEnergyCu() call it with unalignment pointer + [c944aa8fbf2f] + +2013-12-17 Gopu Govindaswamy + + * source/encoder/ratecontrol.cpp: + rc: bug fix for variance data calculation for weighted prediction + when aq-mode is disable + [517eadc29cf6] + +2013-12-17 Deepthi Nandakumar + + * source/common/vec/vec-primitives.cpp: + vec-primitives: avoid warnings + [9ffeea38b119] + + * source/Lib/TLibCommon/TComSlice.cpp: + TComSlice: correct initialization list order (warnings on non-VS + compilers). 
+ [7474532db9ec] + +2013-12-16 Gopu Govindaswamy + + * source/encoder/ratecontrol.cpp: + rc: Calculate the variance data for weighted prediction if aq-mode + is disable + [baf811614a7a] + +2013-12-16 Sumalatha Polureddy + + * source/encoder/compress.cpp: + compress: remove unused variables + [91b9cd801cbb] + +2013-12-16 Deepthi Devaki + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/compress.cpp: + rd level: use cu coeff array while computing coefficients instead of + temp buffer + [34137673f778] + +2013-12-16 Sumalatha Polureddy + + * source/encoder/compress.cpp: + rd level: fix for output mismatch + + For boundary conditions, cost calculation was wrong. + [3dae450a06a4] + +2013-12-16 Deepthi Devaki + + * source/Lib/TLibEncoder/TEncSearch.cpp: + rd level: remove unnecessary calculations in intra residual encoding + + calcrecon calculates recon,reconQT and reconIpred which is + redundant. Use add and block copy instead of calcrecon. + [9bb16a023918] + +2013-12-16 Sumalatha Polureddy + + * source/Lib/TLibCommon/TComDataCU.cpp: + TComDataCU: fix gcc warning + [57efb99765cc] + + * source/Lib/TLibEncoder/TEncCu.h: + TEncCu: reintroduce a macro which was removed while merging + [9171e78ee412] + +2013-12-11 Praveen Tiwari + + * source/common/x86/asm-primitives.cpp, source/common/x86/intrapred.h, + source/common/x86/intrapred8.asm: + all_angs_pred_8x8, asm code + [43656ecbe66b] + +2013-12-16 Deepthi Devaki + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: fix gcc warning + + remove unused variables and shadowed declarations + [238d7f272d1e] + +2013-12-13 Deepthi Devaki + + * source/encoder/compress.cpp: + white space nits + [f157836c3713] + + * source/encoder/compress.cpp: + rd level: modify mode-decision logic for rd level 0 + + In rd level 0, sa8d cost used for mode decision across depths. + dct,quant is not done during mode-decision and no recon is + generated. Hence Intra has to use original pixels as reference. 
+ Residual encoding is done at depth 0 and Intra prediction for the + best mode will be done again with recon as reference. + [d2a1b3409656] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncSearch.h, source/encoder/compress.cpp: + rd level: add functions to do residual encoding at depth 0 for best + modes. + + This function will be used for rd level 0. + [e05b990cfb93] + +2013-12-13 Gopu Govindaswamy + + * source/common/common.cpp, source/common/lowres.cpp, + source/encoder/ratecontrol.cpp: + rc: Initialized qp and aq offsets and QscaleFactor if aqmode is + disabled these variables need for cutree analyse + [c6ff3ede1f29] + + * source/encoder/slicetype.cpp, source/encoder/slicetype.h: + Slicetype: recalculate the frame cost without re-running lookahead + for cutree + [582e96661432] + +2013-12-14 Deepthi Nandakumar + + * source/output/yuv.cpp: + Merge with stable + [4b0163d06ba1] + + * source/output/yuv.cpp: + yuv: fix more 32-bit compile warnings + [bca251133e70] + +2013-12-13 Steve Borho + + * source/output/yuv.cpp: + Merge with stable + [5cbea1cd3d3a] + + * source/output/yuv.cpp: + yuv: prevent 32bit compiler warnings + [14e9dbedab9d] + +2013-12-12 Steve Borho + + * source/output/y4m.cpp: + y4m: perform file offset math in uint64_t + [292a91f3427e] + +2013-12-13 Aarthi Thirumalai + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, source/encoder/frameencoder.cpp, + source/encoder/ratecontrol.cpp: + rc: implement ratecontrol upto MAX_MAX_QP. + + For RateControl .virtual qps are used from 0 till 69. 
Howevr,Before + encoding the frame, the qps are clipped back to 51 + [926f7034b2f0] + + * source/Lib/TLibCommon/CommonDef.h, + source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h: + rc: Initialize virtual qps till 69 + [07d83bc1cfad] + +2013-12-12 Shazeb Nawaz Khan + + * source/Lib/TLibEncoder/WeightPredAnalysis.cpp: + Fix for deltaWeight assert failure in Decoder + [ba2f8f5d3b07] + +2013-12-13 Deepthi Nandakumar + + * source/encoder/compress.cpp: + Merge: changes outputs + [d0cf79d76057] + +2013-12-12 Sumalatha Polureddy + + * source/encoder/compress.cpp: + rd level: fix the incorrect cost calculation for sa8d + + In rd level 1, the distortion is sa8d, so cost calculation should be + calrdsadcost() instead of calcrdcost() + [ba3cc2f01138] + + * source/encoder/compress.cpp: + rd level: remove the redundant splitflag bit calculation with EARLY + EXIT ON + [fdfecd9f4c9e] + + * source/encoder/compress.cpp: + rd level: change the indexing for sa8d threshold array + + The index in threshold array won't be same as depth when the max cu + size is less than 64. 
+ [bf541ca76b3a] + +2013-12-13 Deepthi Nandakumar + + * source/encoder/compress.cpp: + compress: review/clean up top_skip + [20f9187af731] + + * source/encoder/compress.cpp: + compress: review/clean up merge + [1012911d1895] + +2013-12-12 Steve Borho + + * source/output/yuv.cpp: + Merge with stable + [033a65692b6a] + +2013-12-12 David Bachelart + + * source/output/yuv.cpp: + yuv: Support 4GB+ YUV files for output + [06e88ad6d922] + +2013-12-11 Steve Borho + + * source/common/CMakeLists.txt, source/common/ipfilter.cpp, + source/common/primitives.h, source/common/vec/ipfilter-ssse3.cpp, + source/common/vec/vec-primitives.cpp: + primitives: drop ipfilter[FILTER_H_P_S_[4|8], including ipfilter- + ssse3.cpp + [a87f12ebb55b] + + * source/common/x86/pixel-a.asm: + merge with stable + [25f412ecaba2] + + * source/common/x86/pixel-a.asm: + ssd: backport EMMS fix for SSD_4xN + [d2385a789c8a] + +2013-12-11 Nabajit Deka + + * source/Lib/TLibCommon/TComPrediction.cpp: + asm : Hook up chroma_hps with encoder. 
+ [93a92dafb667] + +2013-12-11 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel- + util.h, source/common/x86/pixel-util8.asm: + asm: 10bpp code for calcrecon_16x16 and 32x32 + [350b0757d1a0] + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel- + util8.asm: + asm: 10bpp code for calcrecon_4x4 and 8x8 + [59c09449211d] + +2013-12-11 Dnyaneshwar G + + * source/test/intrapredharness.cpp: + testbench: fix invalid memory compare for 16bpp primitives + [da972cf802fb] + +2013-12-11 Murugan Vairavel + + * source/test/pixelharness.cpp: + asm: 10bpp test bench code for calcrecon + [99f3527f9b7b] + +2013-12-11 Min Chen + + * source/common/x86/intrapred16.asm: + asm: remove reduce operators in intra_pred_planar32 + [5d21f5c91495] + +2013-12-11 Dnyaneshwar G + + * source/common/x86/intrapred16.asm: + asm: fix hash mismatch in intra_pred_planar_32x32 16bpp + [0d2771085aec] + +2013-12-11 Min Chen + + * source/common/x86/ssd-a.asm: + asm: fix miss EMMS in pixel_ssd_ss_4xN + [3e6b5f65f47c] + +2013-12-11 Yuvaraj Venkatesh + + * source/common/x86/intrapred16.asm: + asm: fix hash miss match due to intra_pred_ang4_17 + [8f634eac2ca0] + +2013-12-10 Dnyaneshwar G + + * source/common/x86/asm-primitives.cpp, + source/common/x86/intrapred16.asm: + 16bpp: assembly code for intra_pred_planar_32x32 + [771e657a11fa] + + * source/common/x86/intrapred16.asm: + assembly code for intra_pred_planar_16x16 for 10 and 12-bit + [2b618395a586] + +2013-12-11 Gopu Govindaswamy + + * source/Lib/TLibCommon/TComTrQuant.cpp, source/common/lowres.cpp: + tcomtrquant: when useRDOQ is disabled, it recomputes qpScaled, the + qpScaled for trQuant is already set + [c5f84164ef36] + +2013-12-10 Deepthi Devaki + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/encoder/compress.cpp: + rdlevel: implement rdlevel 1 + + At each depth, dct, quant, iquant, idct is done on best mode. No RD + done during TU selection. 
Recon is generated for intra prediction in + neighbouring quadtree is not broken + [298470fa2d19] + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + add functions for Intra residual transform and quantization without + rd + [14ddc3ed548a] + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + add function for Inter residual tranform and quantization without RD + [9b7f7303cdb9] + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + TEncSearch: remove unused functions + [b938ac5cef81] + +2013-12-11 Kavitha Sampath + + * source/Lib/TLibEncoder/TEncCu.cpp: + log: fix memory overread issue + [40895c94da21] + +2013-12-11 Deepthi Nandakumar + + * source/CMakeLists.txt, source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncCu.h, source/encoder/encoder.cpp, + source/encoder/frameencoder.cpp: + cmake: Add LOG_CU_STATISTICS macro to enable/disable CU mode + decision statistics. + [a1e7aac486a7] + +2013-12-10 Steve Borho + + * source/encoder/motion.cpp: + motion: nit + [3bf6be9d766b] + + * source/common/x86/intrapred16.asm: + intra: fix 64bit build of intrapred16.asm - Min please review + [dcef9f3bca1e] + +2013-12-10 Deepthi Nandakumar + + * Merge + [c4fdea3fd659] + +2013-12-10 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel- + util.h, source/common/x86/pixel-util8.asm: + asm: 10bpp code for calcresidual_16x16 and 32x32 + [1169201b50c4] + +2013-12-10 Nabajit Deka + + * source/Lib/TLibCommon/TComPrediction.cpp, source/encoder/motion.cpp: + asm : Hook up luma_hps with the encoder. + [af1f46818bed] + + * source/test/ipfilterharness.cpp: + Add comment for luma_hps and chroma_hps test bench code. 
+ [1cc5b2d87d8b] + +2013-12-10 Yuvaraj Venkatesh + + * source/common/x86/asm-primitives.cpp, source/common/x86/const-a.asm, + source/common/x86/intrapred16.asm: + asm: 16bpp asm code for intra_pred_ang4 - mode 17,18 + [384d99887688] + +2013-12-10 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel- + util8.asm: + asm: 10bpp code for calcresidual_4x4 and 8x8 + [e4c13676c4b5] + +2013-12-10 Yuvaraj Venkatesh + + * source/common/x86/asm-primitives.cpp, + source/common/x86/intrapred16.asm: + asm: 16bpp asm code for intra_pred_ang4 - mode 14,15,16 + [573a8cfac514] + +2013-12-10 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp: + asm: 10bpp support for blockcopy_ps and blockcopy_sp + [8f8d4811352a] + +2013-12-10 Yuvaraj Venkatesh + + * source/common/x86/asm-primitives.cpp, + source/common/x86/intrapred16.asm: + asm: 16bpp asm code for intra_pred_ang4 - mode 11,12,13 + [b29166445321] + +2013-12-10 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp: + asm: 10bpp blockcopy_ps integration for Luma and chroma partitions + [887206700a13] + +2013-12-10 Yuvaraj Venkatesh + + * source/common/x86/asm-primitives.cpp, + source/common/x86/intrapred16.asm: + asm: 16bpp code for intra_pred_ang4_26 + [1eb855251cc5] + +2013-12-10 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm: + asm: 10bpp code for blockcopy_ps_64xN + [72e7899bef55] + + * source/common/x86/blockcopy8.asm: + asm: 10bpp blockcopy_ps bug fix + [8e34f135fd9e] + +2013-12-10 Yuvaraj Venkatesh + + * source/common/x86/asm-primitives.cpp, source/common/x86/const-a.asm, + source/common/x86/intrapred16.asm: + asm: 16bpp asm code for intra_pred_ang4_10 + [d604f25e6eab] + +2013-12-10 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm: + asm: 10bpp code for blockcopy_ps_48x64 + [1679ad2da2a1] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm: + asm: 10bpp code 
for blockcopy_ps_32xN + [64c8f43aa7ce] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm: + asm: 10bpp code for blockcopy_ps_24x32 + [e7fff01a464b] + +2013-12-10 Dnyaneshwar G + + * source/common/x86/asm-primitives.cpp: + 16bpp: enabled blockfill_s primitive + [2cf9944afa92] + +2013-12-10 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm: + asm: 10bpp code for blockcopy_ps_16xN + [2e56e8e76f72] + +2013-12-10 Nabajit Deka + + * source/common/ipfilter.cpp: + Bug fix in luma_hps C primitive. + [1863cdede774] + +2013-12-10 Murugan Vairavel + + * source/common/x86/blockcopy8.asm: + asm: 10bpp code for bolckcopy_ps_12x16 + [54e8c012597c] + + * source/common/x86/asm-primitives.cpp: + asm: pixel_add_ps integration code for Luma and chroma partitions + [0c46964557c8] + +2013-12-10 Min Chen + + * source/test/intrapredharness.cpp, source/test/intrapredharness.h: + testbench: fix wrong width parameter in check_planar_primitive() + [5a7f116e3aae] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/intrapred16.asm: + asm: Intra Planar 16x16 + [5604254f779e] + + * rename IntraPred.cpp to intrapred.cpp to avoid name conflict + [7810ce2bdb53] + + * source/common/x86/intrapred16.asm: + asm: improvement IntraPredDC_32x32 by replace macro extend by loop + [9a8b0e81330f] + +2013-12-09 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm: + asm: 10bpp code of blockcopy_pp for 2xN, 4xN, 6x8 and 8xN blocks + [285a4d8c42a0] + +2013-12-09 Yuvaraj Venkatesh + + * source/common/x86/asm-primitives.cpp, + source/common/x86/intrapred16.asm: + asm: 16bpp asm code for intra_pred_ang4_8 and intra_pred_ang4_9 + [66d8405320d2] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/intrapred16.asm: + asm: 16bpp asm code for intra_pred_ang4_7 + [f33ca21fe0c2] + +2013-12-10 Min Chen + + * source/common/x86/intrapred16.asm: + asm: alignment branch to 16 bytes + 
[89fea75bbc1b] + +2013-12-10 Kavitha Sampath + + * source/Lib/TLibEncoder/TEncCu.cpp: + log: fix crash caused by logging after CU analysis + [ef26367cd10c] + +2013-12-09 Steve Borho + + * Merge with stable + [a88c5723d266] + +2013-12-03 Steve Borho + + * source/Lib/TLibCommon/ContextTables.h, + source/Lib/TLibEncoder/TEncSbac.cpp: + sbac: move global tables into x265 namespace + [7d4f5cbc68e7] + +2013-12-09 Steve Borho + + * source/encoder/ratecontrol.cpp: + Merge with stable + [c6c73ef24c97] + + * source/encoder/ratecontrol.cpp: + ratecontrol: avoid reads past the end of chroma buffers + [67e711fde921] + + * source/common/vec/dct-ssse3.cpp: + dct: drop intrinsic DCT 8x8 primitive, we have asm coverage + [eacdbae47e47] + + * source/encoder/ratecontrol.cpp: + ratecontrol: make weightp analysis aware of colorspaces + [f25e60a2b62c] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: nit + [644d27bb26e9] + +2013-12-09 Shazeb Nawaz Khan + + * source/encoder/ratecontrol.cpp: + Use correct width/height for chroma in ssd calculation + [7bd7937e762b] + +2013-12-09 Yuvaraj Venkatesh + + * source/common/x86/asm-primitives.cpp, + source/common/x86/intrapred16.asm: + asm: 16bpp asm code for intra_pred_ang4_6 + [40e204fcf5d4] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/intrapred16.asm: + asm: 16bpp asm code for intra_pred_ang4_5 + [0587de3aeb9b] + +2013-12-09 Dnyaneshwar G + + * source/common/x86/asm-primitives.cpp, + source/common/x86/intrapred16.asm: + 16bpp: assembly code for intra_planar8 + [4c5a86ff2c99] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/intrapred16.asm: + 16bpp: assembly code for intra_planar4 + [39b7cf1f3a89] + +2013-12-09 Yuvaraj Venkatesh + + * source/common/x86/asm-primitives.cpp, + source/common/x86/intrapred16.asm: + asm: 16bpp asm code for intra_pred_ang4_4 + [6f2660e5a857] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/intrapred16.asm: + asm: 16bpp support for intra_pred_ang4_3 + 
[9116410f11b4] + +2013-12-09 Murugan Vairavel + + * source/common/x86/pixeladd8.asm: + asm: 10bpp code for pixel_add_ps_6x8 + [cbd96bfe5732] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/pixeladd8.asm: + asm: 10bpp code for pixel_add_ps_64xN + [6385549fefa3] + +2013-12-09 Yuvaraj Venkatesh + + * source/common/x86/asm-primitives.cpp, source/common/x86/const-a.asm, + source/common/x86/intrapred16.asm: + asm: 16bpp asm code for intra_pred_ang4_2 + [36ffa3ba6039] + +2013-12-09 Min Chen + + * source/common/x86/intrapred8.asm: + asm: little improvement(pextrd -> movd, jmp to alignlemt address) + [e16a67613973] + +2013-12-09 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp, + source/common/x86/pixeladd8.asm: + asm: 10bpp code for pixel_add_ps_48x64 + [69766745fb6d] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/pixeladd8.asm: + asm: 10bpp code for pixel_add_ps_32xN + [ef3bfef112c1] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/pixeladd8.asm: + asm: 10bpp code for pixel_add_ps_24x32 + [46a23a64eb1e] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/pixeladd8.asm: + asm: 10bpp code for pixel_add_ps_16xN + [fdb1ed6efd5b] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/pixeladd8.asm: + asm: 10bpp code for pixel_add_ps_12x16 + [ced72100d067] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/pixeladd8.asm: + asm: 10bpp code for pixel_add_ps_8xN + [1c4d5a2538d5] + +2013-12-09 Deepthi Nandakumar + + * source/x265.h: + x265: fix bad merge + [55d99b6651f2] + + * source/x265.h: + Merge + [c7e7003df711] + +2013-12-09 Min Chen + + * source/test/pixelharness.cpp: + fix bug: avg_pp use weight 32 + [5bb46ef28bc5] + + * source/common/x86/asm-primitives.cpp, source/common/x86/const-a.asm, + source/common/x86/dct8.asm, source/common/x86/dct8.h: + asm: assembly code for DCT8x8 + [8ffd72844276] + +2013-12-07 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp, + 
source/common/x86/pixeladd8.asm: + asm: 10bpp code for pixel_add_ps_4xN + [4f9dc5dc44e0] + +2013-12-07 Min Chen + + * source/test/testpool.cpp: + Merge branch 'X' + [2353322df540] + +2013-12-06 Steve Borho + + * Merge + [f655add8bcc2] + +2013-12-06 Dnyaneshwar G + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/IntraPred.cpp, + source/common/primitives.h, source/common/x86/asm-primitives.cpp, + source/common/x86/intrapred.h, source/common/x86/intrapred8.asm, + source/encoder/compress.cpp, source/encoder/slicetype.cpp, + source/test/intrapredharness.cpp, source/test/intrapredharness.h: + cleanup: merge Intra Pred PLANAR mode into intra_pred[] + [8773f7f028c2] + +2013-12-06 Min Chen + + * rename IntraPred.cpp to intrapred.cpp + [a5984c686c55] + +2013-12-06 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp, + source/common/x86/pixeladd8.asm: + asm: 10bpp code for pixel_add_ps_2xN + [8eba4667a5e3] + + * source/test/pixelharness.cpp: + 10bpp: testbench code for pixel_add_ps + [41bfe2b249f8] + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel- + util.h, source/common/x86/pixel-util8.asm: + asm: 10bpp code of pixel_sub for 16xN, 24x32, 32xN,48x64 and 64xN + [4bb40809a372] + +2013-12-06 Dnyaneshwar G + + * source/common/x86/asm-primitives.cpp, + source/common/x86/intrapred16.asm: + 16bpp: assembly code for intra_pred_dc32 + [3933b4a1380d] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/intrapred16.asm: + 16bpp: assembly code for intra_pred_dc16 + [056a712852c9] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/intrapred16.asm: + 16bpp: assembly code for intra_pred_dc8 + [ef6f2bdbaf7d] + + * source/common/CMakeLists.txt, source/common/x86/asm-primitives.cpp, + source/common/x86/const-a.asm, source/common/x86/intrapred16.asm: + 16bpp: assembly code for intra_pred_dc4 + [059af4b49f8e] + +2013-12-06 Min Chen + + * source/common/x86/intrapred8.asm: + asm: cleanup garbage 
after fucntion declare + [fef16dcf106f] + +2013-12-06 Deepthi Nandakumar + + * source/x265.h: + x265: remove obsolete R-D enums + [dadb3ae865cb] + + * source/x265.h: + rc params: documentation on i/p/bfactor, qpstep, crf + [3c0a1652611e] + + * source/x265.h: + ratecontrol params: documentation for rateTolerance + [08cf1b4ba081] + + * source/x265.h: + ratecontrol parameters: add documentation for qcomp + [c889005baa5f] + +2013-12-05 Steve Borho + + * source/test/CMakeLists.txt, source/test/testpool.cpp: + cmake: drop pool test; not seriously used since May + [ebd32ea1f84e] + + * source/CMakeLists.txt: + cmake: add ENABLE_SHARED cmake option, default to ON + + Allow MSVC users to opt-out; to roughly halve the compile time. + Clean up a few nits in the process. + [707901470d11] + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/encoder/encoder.cpp: + TEncCu: coding style, lower case initial letter for totalCu + [458173c60685] + + * source/CMakeLists.txt: + cmake: bump X265_BUILD post 0.6, since x265_param has changed + [9442edac08b7] + + * source/CMakeLists.txt, source/common/CMakeLists.txt, + source/common/cpu.cpp, source/common/primitives.cpp, + source/common/primitives.h, source/common/vec/vec-primitives.cpp, + source/test/TestBench.cpp: + cmake: rename ENABLE_PRIMITIVES_ASM to ENABLE_ASSEMBLY + + And use the same name for the build define. Also, rename + Setup_Vector_Primitives() to Setup_Instrinsic_Primitives() + [a3a4689496e8] + + * source/CMakeLists.txt, source/common/CMakeLists.txt, + source/common/primitives.cpp, source/test/TestBench.cpp: + cmake: remove ENABLE_PRIMITIVES_VEC build option + + The use of the few remaining compiler intrinsic functions is now + unconditional. 
Compiler detection will remove them cleanly in case + they cannot be compiled, so there is no reason to make them a top + level build option + [46cbb9f96850] + + * source/Lib/TLibCommon/CommonDef.h, source/common/IntraPred.cpp, + source/common/primitives.h, source/common/vec/intra-ssse3.cpp, + source/test/intrapredharness.cpp, source/test/intrapredharness.h: + primitives: cleanup intra prediction table dimensions + [435c68acd2a1] + + * source/common/CMakeLists.txt: + cmake: move asm-primitives.cpp and asm headers into VS source group + + A cosmetic change only + [89b4d233b57f] + + * source/encoder/compress.cpp: + Merge + [6fd83a8d944c] + +2013-12-05 Praveen Tiwari + + * source/common/x86/asm-primitives.cpp, source/common/x86/intrapred.h, + source/common/x86/intrapred8.asm, source/test/intrapredharness.cpp: + all_angs_pred_4x4, asm code for all modes + [be8b52e8f903] + +2013-12-05 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel- + util8.asm: + asm: 10bpp code for pixel_sub_12x16 + [f2ce16c8d65a] + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel- + util8.asm: + asm: 10bpp code for pixel_sub_8xN + [a30515de4c2d] + +2013-12-05 Yuvaraj Venkatesh + + * source/common/x86/asm-primitives.cpp, source/common/x86/sad16-a.asm: + asm: 16bpp support for sad_x4 - all block sizes + [ca7d0e04c324] + +2013-12-05 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel- + util8.asm: + asm: 10bpp code for pixel_sub_6x8 + [9e0b2aa531a9] + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel- + util8.asm: + asm: 10bpp code for pixel_sub_4xN + [952d606c08ed] + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel- + util.h, source/common/x86/pixel-util8.asm: + asm: 10bpp code for pixel_sub_2xN + [376c68701fb7] + +2013-12-05 Yuvaraj Venkatesh + + * source/common/x86/asm-primitives.cpp, source/common/x86/sad16-a.asm: + asm: 16bpp support for sad_x3 - all block sizes + [057581245e2f] + 
+2013-12-05 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp: + asm: primitives of sse_ss for 12x16, 24x32, 48x64 and 64xN blocks + [6570bef3bc2b] + +2013-12-05 Min Chen + + * source/common/x86/intrapred8.asm: + improvement by remove reduce ADD instruction in intra_pred_dc16 + [229ec0d4164f] + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/IntraPred.cpp, + source/common/primitives.h, source/common/vec/intra-ssse3.cpp, + source/common/x86/asm-primitives.cpp, source/common/x86/intrapred.h, + source/common/x86/intrapred8.asm, source/encoder/compress.cpp, + source/encoder/slicetype.cpp, source/test/intrapredharness.cpp, + source/test/intrapredharness.h: + cleanup:merge Intra Pred DC mode into intra_pred[] + [e1cf58c22166] + + * source/common/x86/asm-primitives.cpp, source/common/x86/intrapred.h, + source/common/x86/intrapred8.asm: + asm: assembly code for IntraPredAng4x4 Mode 18 + [f4b8486659ad] + + * source/common/x86/asm-primitives.cpp, source/common/x86/intrapred.h, + source/common/x86/intrapred8.asm: + asm: assembly code for IntraPredAng4x4 Mode 17 & 19 + [06cb5289dfcf] + + * source/common/x86/asm-primitives.cpp, source/common/x86/intrapred.h, + source/common/x86/intrapred8.asm: + asm: assembly code for IntraPredAng4x4 Mode 16 & 20 + [56506c2913d8] + + * source/common/x86/asm-primitives.cpp, source/common/x86/intrapred.h, + source/common/x86/intrapred8.asm: + asm: assembly code for IntraPredAng4x4 Mode 15 & 21 + [112dabd56faf] + + * source/common/x86/asm-primitives.cpp, source/common/x86/intrapred.h, + source/common/x86/intrapred8.asm: + asm: assembly code for IntraPredAng4x4 Mode 14 & 22 + [2c2f9294e033] + + * source/common/x86/asm-primitives.cpp, source/common/x86/intrapred.h, + source/common/x86/intrapred8.asm: + asm: assembly code for IntraPredAng4x4 Mode 13 & 23 + [d77cf817b11e] + + * source/test/intrapredharness.cpp: + testbench: swap order to call asm code + + Our old intra_pred_ang algorithm 
will fill buffer before input pLeft + and pabove, in this time, the offset [-1] pixel equal to [4], it + affect detect asm code error, so I swap the order + [3aa3d68f1552] + + * source/common/x86/asm-primitives.cpp, source/common/x86/intrapred.h, + source/common/x86/intrapred8.asm: + asm: assembly code for IntraPredAng4x4 Mode 12 & 24 + [666ad0f9f68e] + + * source/common/x86/asm-primitives.cpp, source/common/x86/intrapred.h, + source/common/x86/intrapred8.asm: + asm: assembly code for IntraPredAng4x4 Mode 11 & 25 + [36da30382f5f] + + * source/common/x86/asm-primitives.cpp, source/common/x86/intrapred.h, + source/common/x86/intrapred8.asm, source/test/intrapredharness.cpp: + asm: improvement intra_pred_ang by SSE4(pextrd,pextrb) + [ca7d44856001] + +2013-12-05 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel- + util8.asm: + asm: 10bpp code for scale2D_64to32 routine + [4d6b8c30f15e] + +2013-12-05 Yuvaraj Venkatesh + + * source/common/primitives.cpp: + integrating asm code for sa8d in primitives.cpp + + there was no separate functions for sa8d in assembly, we are just + re-using sa8d_inter functions for sa8d. + [1f96b2234977] + +2013-12-05 Min Chen + + * source/common/vec/intra-ssse3.cpp: + cleanup unused array intra_ang4[] + [f16f9f0ea93c] + + * Merge branch 'X' + [d11dbd953d6d] + +2013-12-04 Nabajit Deka + + * source/common/x86/ipfilter8.h: + Function declarations for modified luma_hps and chroma_hps + functions. + [fc7883c471e4] + + * source/common/primitives.h, source/test/ipfilterharness.cpp, + source/test/ipfilterharness.h: + Test bench code for luma_hps and chroma_hps + [f29d063bf40d] + + * source/common/ipfilter.cpp: + C primitive changes for luma_hps and chroma_hps. 
+ [d19d6ac4bc4b] + + * source/common/x86/ipfilter8.asm: + asm : Modifications for luma_hps and chroma_hps(extra rows) + [3e8632ea6c95] + +2013-12-05 Min Chen + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/common/IntraPred.cpp, source/common/primitives.h, + source/common/vec/intra-ssse3.cpp, source/common/x86/asm- + primitives.cpp, source/common/x86/intrapred.h, + source/test/intrapredharness.cpp, source/test/intrapredharness.h: + asm: simplify code by use intra_pred_ang[][], and avoid build error + when disable yasm + [660a23104096] + + * rename IntraPred.cpp to intrapred.cpp to avoid team's hg merge + conflict + [d7bdf9a5b2c6] + +2013-12-07 Steve Borho + + * source/test/testbench.cpp: + testbench: remove long-since idiotic comment + [b29f2f31ec46] + + * source/CMakeLists.txt, source/common/CMakeLists.txt, + source/common/cpu.cpp, source/common/primitives.cpp, + source/test/testbench.cpp: + cmake: simplify use of ENABLE_ASSEMBLY build define + + The define is only needed in primitives.cpp. Move all of the CPU + detect fallback code together in primitives.cpp making cpu.cpp more + standalone. Make the testbench only build if the assembly code is + enabled. 
+ [e1fbbf947b60] + +2013-12-07 Deepthi Nandakumar + + * source/x265.h: + rc params: complete documentation for rc params in x265.h + [d8d844b36c6c] + +2013-12-06 Steve Borho + + * Merge + [a482cf5de173] + +2013-12-06 Dnyaneshwar G + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/intrapred.cpp, + source/common/primitives.h, source/common/x86/asm-primitives.cpp, + source/common/x86/intrapred.h, source/common/x86/intrapred8.asm, + source/encoder/compress.cpp, source/encoder/slicetype.cpp, + source/test/intrapredharness.cpp, source/test/intrapredharness.h: + cleanup: merge Intra Pred PLANAR mode into intra_pred[] + [c093e7847025] + +2013-12-06 Min Chen + + * rename IntraPred.cpp to intrapred.cpp + [f4166f824c2b] + +2013-12-06 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp, + source/common/x86/pixeladd8.asm: + asm: 10bpp code for pixel_add_ps_2xN + [967297338b27] + + * source/test/pixelharness.cpp: + 10bpp: testbench code for pixel_add_ps + [13314db77cf8] + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel- + util.h, source/common/x86/pixel-util8.asm: + asm: 10bpp code of pixel_sub for 16xN, 24x32, 32xN,48x64 and 64xN + [f27fb7c2676a] + +2013-12-06 Dnyaneshwar G + + * source/common/x86/asm-primitives.cpp, + source/common/x86/intrapred16.asm: + 16bpp: assembly code for intra_pred_dc32 + [d36fb6852698] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/intrapred16.asm: + 16bpp: assembly code for intra_pred_dc16 + [6d2d7c2a5d79] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/intrapred16.asm: + 16bpp: assembly code for intra_pred_dc8 + [110d716e67a7] + + * source/common/CMakeLists.txt, source/common/x86/asm-primitives.cpp, + source/common/x86/const-a.asm, source/common/x86/intrapred16.asm: + 16bpp: assembly code for intra_pred_dc4 + [9e24dcae2ebf] + +2013-12-06 Min Chen + + * source/common/x86/intrapred8.asm: + asm: cleanup garbage after fucntion declare + 
[ad08db06a7c6] + +2013-12-06 Deepthi Nandakumar + + * source/x265.h: + x265: remove obsolete R-D enums + [56a17500909e] + + * source/x265.h: + rc params: documentation on i/p/bfactor, qpstep, crf + [c5e91abfeb05] + + * source/x265.h: + ratecontrol params: documentation for rateTolerance + [49288da0ee3e] + + * source/x265.h: + ratecontrol parameters: add documentation for qcomp + [608874dc84ab] + +2013-12-05 Steve Borho + + * source/test/CMakeLists.txt, source/test/testpool.cpp: + cmake: drop pool test; not seriously used since May + [d5dc48e6cd16] + + * source/CMakeLists.txt: + cmake: add ENABLE_SHARED cmake option, default to ON + + Allow MSVC users to opt-out; to roughly halve the compile time. + Clean up a few nits in the process. + [3e6be1e11720] + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/encoder/encoder.cpp: + TEncCu: coding style, lower case initial letter for totalCu + [97e917c4a923] + + * source/CMakeLists.txt: + cmake: bump X265_BUILD post 0.6, since x265_param has changed + [6ed44381cb6c] + + * source/CMakeLists.txt, source/common/CMakeLists.txt, + source/common/cpu.cpp, source/common/primitives.cpp, + source/common/primitives.h, source/common/vec/vec-primitives.cpp, + source/test/testbench.cpp: + cmake: rename ENABLE_PRIMITIVES_ASM to ENABLE_ASSEMBLY + + And use the same name for the build define. Also, rename + Setup_Vector_Primitives() to Setup_Instrinsic_Primitives() + [f321d4a467fb] + + * source/CMakeLists.txt, source/common/CMakeLists.txt, + source/common/primitives.cpp, source/test/testbench.cpp: + cmake: remove ENABLE_PRIMITIVES_VEC build option + + The use of the few remaining compiler intrinsic functions is now + unconditional. 
Compiler detection will remove them cleanly in case + they cannot be compiled, so there is no reason to make them a top + level build option + [a458b8d6b476] + + * source/Lib/TLibCommon/CommonDef.h, source/common/intrapred.cpp, + source/common/primitives.h, source/common/vec/intra-ssse3.cpp, + source/test/intrapredharness.cpp, source/test/intrapredharness.h: + primitives: cleanup intra prediction table dimensions + [49c0408febfd] + + * source/common/CMakeLists.txt: + cmake: move asm-primitives.cpp and asm headers into VS source group + + A cosmetic change only + [41e80c591116] + + * source/encoder/compress.cpp: + Merge + [67d755e2a30c] + +2013-12-05 Praveen Tiwari + + * source/common/x86/asm-primitives.cpp, source/common/x86/intrapred.h, + source/common/x86/intrapred8.asm, source/test/intrapredharness.cpp: + all_angs_pred_4x4, asm code for all modes + [6d1b07d41cdd] + +2013-12-05 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel- + util8.asm: + asm: 10bpp code for pixel_sub_12x16 + [9d974915023f] + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel- + util8.asm: + asm: 10bpp code for pixel_sub_8xN + [832d1d134449] + +2013-12-05 Yuvaraj Venkatesh + + * source/common/x86/asm-primitives.cpp, source/common/x86/sad16-a.asm: + asm: 16bpp support for sad_x4 - all block sizes + [f864064737bc] + +2013-12-05 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel- + util8.asm: + asm: 10bpp code for pixel_sub_6x8 + [c83d6906f665] + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel- + util8.asm: + asm: 10bpp code for pixel_sub_4xN + [31b3bf1246c7] + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel- + util.h, source/common/x86/pixel-util8.asm: + asm: 10bpp code for pixel_sub_2xN + [c36134873a8d] + +2013-12-05 Yuvaraj Venkatesh + + * source/common/x86/asm-primitives.cpp, source/common/x86/sad16-a.asm: + asm: 16bpp support for sad_x3 - all block sizes + [8f3af42f7f44] + 
+2013-12-05 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp: + asm: primitives of sse_ss for 12x16, 24x32, 48x64 and 64xN blocks + [4c9b7eb235a9] + +2013-12-05 Min Chen + + * source/common/x86/intrapred8.asm: + improvement by remove reduce ADD instruction in intra_pred_dc16 + [c9a67d02ad1c] + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/intrapred.cpp, + source/common/primitives.h, source/common/vec/intra-ssse3.cpp, + source/common/x86/asm-primitives.cpp, source/common/x86/intrapred.h, + source/common/x86/intrapred8.asm, source/encoder/compress.cpp, + source/encoder/slicetype.cpp, source/test/intrapredharness.cpp, + source/test/intrapredharness.h: + cleanup:merge Intra Pred DC mode into intra_pred[] + [7febdbc37965] + + * source/common/x86/asm-primitives.cpp, source/common/x86/intrapred.h, + source/common/x86/intrapred8.asm: + asm: assembly code for IntraPredAng4x4 Mode 18 + [91fe66f971d2] + + * source/common/x86/asm-primitives.cpp, source/common/x86/intrapred.h, + source/common/x86/intrapred8.asm: + asm: assembly code for IntraPredAng4x4 Mode 17 & 19 + [59f0433ffca0] + + * source/common/x86/asm-primitives.cpp, source/common/x86/intrapred.h, + source/common/x86/intrapred8.asm: + asm: assembly code for IntraPredAng4x4 Mode 16 & 20 + [d551487023ba] + + * source/common/x86/asm-primitives.cpp, source/common/x86/intrapred.h, + source/common/x86/intrapred8.asm: + asm: assembly code for IntraPredAng4x4 Mode 15 & 21 + [2ae36352e08c] + + * source/common/x86/asm-primitives.cpp, source/common/x86/intrapred.h, + source/common/x86/intrapred8.asm: + asm: assembly code for IntraPredAng4x4 Mode 14 & 22 + [88e38d7f926b] + + * source/common/x86/asm-primitives.cpp, source/common/x86/intrapred.h, + source/common/x86/intrapred8.asm: + asm: assembly code for IntraPredAng4x4 Mode 13 & 23 + [7995a50e0fc2] + + * source/test/intrapredharness.cpp: + testbench: swap order to call asm code + + Our old intra_pred_ang algorithm 
will fill buffer before input pLeft + and pabove, in this time, the offset [-1] pixel equal to [4], it + affect detect asm code error, so I swap the order + [b9e0bfacfb8e] + + * source/common/x86/asm-primitives.cpp, source/common/x86/intrapred.h, + source/common/x86/intrapred8.asm: + asm: assembly code for IntraPredAng4x4 Mode 12 & 24 + [e39c11970ca0] + + * source/common/x86/asm-primitives.cpp, source/common/x86/intrapred.h, + source/common/x86/intrapred8.asm: + asm: assembly code for IntraPredAng4x4 Mode 11 & 25 + [c8641f015e5b] + + * source/common/x86/asm-primitives.cpp, source/common/x86/intrapred.h, + source/common/x86/intrapred8.asm, source/test/intrapredharness.cpp: + asm: improvement intra_pred_ang by SSE4(pextrd,pextrb) + [c3d07f251bd8] + +2013-12-05 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel- + util8.asm: + asm: 10bpp code for scale2D_64to32 routine + [1845917cb66d] + +2013-12-05 Yuvaraj Venkatesh + + * source/common/primitives.cpp: + integrating asm code for sa8d in primitives.cpp + + there was no separate functions for sa8d in assembly, we are just + re-using sa8d_inter functions for sa8d. + [b7656aa5f346] + +2013-12-05 Min Chen + + * source/common/vec/intra-ssse3.cpp: + cleanup unused array intra_ang4[] + [70a042f36c2c] + + * Merge branch 'X' + [78165334eed6] + +2013-12-04 Nabajit Deka + + * source/common/x86/ipfilter8.h: + Function declarations for modified luma_hps and chroma_hps + functions. + [79d649d551f0] + + * source/common/primitives.h, source/test/ipfilterharness.cpp, + source/test/ipfilterharness.h: + Test bench code for luma_hps and chroma_hps + [06f89ffdba43] + + * source/common/ipfilter.cpp: + C primitive changes for luma_hps and chroma_hps. 
+ [835ee97789af] + + * source/common/x86/ipfilter8.asm: + asm : Modifications for luma_hps and chroma_hps(extra rows) + [6a0f7924321e] + +2013-12-05 Min Chen + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/common/intrapred.cpp, source/common/primitives.h, + source/common/vec/intra-ssse3.cpp, source/common/x86/asm- + primitives.cpp, source/common/x86/intrapred.h, + source/test/intrapredharness.cpp, source/test/intrapredharness.h: + asm: simplify code by use intra_pred_ang[][], and avoid build error + when disable yasm + [dcc2e11e5643] + + * rename IntraPred.cpp to intrapred.cpp to avoid team's hg merge + conflict + [b04134971883] + +2013-12-04 Deepthi Devaki + + * source/encoder/compress.cpp: + rdlevel: skip Intra if inter/merge sa8d less than a threshold + + In higher rdlevels Intra is skipped if inter/merge cu cbf is 0. A + threshold of sa8d expects that cu cbf will be 0. Thresholds have to + be refined further. + [e7424e0cb60f] + + * source/encoder/compress.cpp: + rdlevel: compare Merge-skip(merge2Nx2N with no residue) to best + among inter/intra/merge in rdlevel 2 + [4668ede3a332] + + * source/encoder/compress.cpp: + rdlevel: Add code for rdlevel 2 + + Use signalling bits + sa8d cost to choose best among + inter/merge/intra. Encode only best mode at each depth. + [6694ef611b41] + + * source/encoder/compress.cpp: + Enable topskip and earlyexit for all rd levels <= 4 (output changes + for presets faster than "slow") + + Also use the encodeResandCalcRDInter instead of the refactored + estimate function. 
+ [e44315ab36b9] + +2013-12-05 Steve Borho + + * source/encoder/encoder.cpp: + encoder: fix warning of potentially unused locals + [c8ca8c93083b] + + * source/x265.cpp: + cli: fix final stat line handling of optional PSNR and SSIM stats + [ee8f2fa7d82a] + +2013-12-05 Kavitha Sampath + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/encoder/compress.cpp, source/encoder/encoder.cpp: + log: collect statistics after CU analysis + + Report percentage of CU blocks encoded at each depth of P/B slice as + EncCU + [ed7c32d83490] + +2013-12-05 Steve Borho + + * source/common/x86/pixel-a.asm: + pixel-a: fix x64 calling convention for several HEVC satd functions + + Bug fix from Min Chen + [64096a4dcdc1] + +2013-12-04 Steve Borho + + * source/CMakeLists.txt: + cmake: use -DHAVE_ALIGNED_STACK=1 when building asm for Xcode + + This matches the command line we use in the Makefile, and prevents + stack corruption when debugging within Xcode + [f9d3154732d9] + + * source/CMakeLists.txt: + cmake: workaround to allow Xcode 5 to link x265 CLI app + [d35b42382331] + + * source/encoder/encoder.cpp: + encoder: remove trailing white-space + [e4a7885f377e] + + * source/test/ipfilterharness.cpp: + oops + [834d6f7608c1] + + * source/test/ipfilterharness.cpp: + ipfilterharness: do not test chroma for i400 color space + + Because, umh, it doesn't have any chroma channels + [716ec3505f43] + + * source/CMakeLists.txt, source/VectorClass/README.txt, + source/VectorClass/instrset.h, source/VectorClass/vectorclass.h, + source/VectorClass/vectori128.h, source/VectorClass/vectori256.h, + source/VectorClass/vectori256e.h, source/common/CMakeLists.txt, + source/common/vec/intra-ssse3.cpp, source/common/vec/ipfilter- + sse41.cpp, source/common/vec/pixel16-sse41.cpp, source/common/vec + /vec-primitives.cpp: + remove Agner Fog's vector classes and their last few users + + We'll take a step back for HIGH_BIT_DEPTH perf for a few days, but + these functions are all expected 
to have assembly coverage soon. + This allows us to remove a few hacks from our cmake scripts as well. + [421a37d0924d] + + * source/common/vec/pixel16-sse41.cpp: + vec: drop 16bpp sad vector intrinsics, we have ASM coverage + [304354e736dc] + + * Merge + [2f41fae4a776] + +2013-12-04 Min Chen + + * source/test/pixelharness.cpp: + fix crash in ssim_end testbench, the x264 support 10bpp only, but + testbench use 12bpp + [0716a8ca46b4] + +2013-12-04 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel- + util8.asm: + asm: 10bpp code for scale1D_128to64 module + [58674cdab926] + +2013-12-04 Yuvaraj Venkatesh + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm: + asm: 16bpp support for sa8d_64xN + [6aaea628af95] + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm: + asm: 16bpp support for sa8d_32xN + [2b6d31ae96e1] + +2013-12-04 Murugan Vairavel + + * source/common/pixel.cpp, source/common/x86/asm-primitives.cpp: + asm: 10bpp code for enabling ssim_end_4 + [d73d32097efc] + +2013-12-04 Yuvaraj Venkatesh + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm: + asm: 16bpp support for sa8d - 24x32 and 48x64 + [61c46e787877] + +2013-12-04 Dnyaneshwar G + + * source/common/x86/const-a.asm, source/common/x86/intrapred8.asm: + asm: move constant to const-a.asm + [9b062eb8124e] + +2013-12-04 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel- + util8.asm: + asm: 10bpp code for transpose 64x64 + [f697c98cb63f] + +2013-12-04 Yuvaraj Venkatesh + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm: + asm: 16bpp asm code for pixel_sa8d_16xN + [4547d3d03d1e] + +2013-12-04 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel- + util8.asm: + asm: 10bpp code for transpose 32x32 + [146aae425df2] + +2013-12-04 Dnyaneshwar G + + * source/common/x86/asm-primitives.cpp: + 16bpp: enabled avt32to16_shr and 
cvt16to32_shl assembly code + [c38d2f74432f] + +2013-12-04 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel- + util8.asm: + asm: 10bpp code for transpose 16x16 + [32bac98d3605] + +2013-12-04 Dnyaneshwar G + + * source/common/CMakeLists.txt, source/common/x86/asm-primitives.cpp, + source/common/x86/sad16-a.asm: + 16bpp: assembly code for sad_NxN functions + [3602d193676d] + +2013-12-04 Min Chen + + * source/common/vec/intra-ssse3.cpp, source/common/x86/intrapred.h, + source/common/x86/intrapred8.asm: + asm: assembly code for IntraPredAng4x4 Mode 26 + [346830caf664] + + * source/common/vec/intra-ssse3.cpp, source/common/x86/intrapred.h, + source/common/x86/intrapred8.asm: + asm: assembly code for IntraPredAng4x4 Mode 10 + [d142d2ba7168] + + * source/common/vec/intra-ssse3.cpp, source/common/x86/intrapred.h, + source/common/x86/intrapred8.asm: + asm: assembly code for IntraPredAng4x4 Mode 9 & 27 + [f8d0c7b5b502] + + * source/common/vec/intra-ssse3.cpp, source/common/x86/intrapred.h, + source/common/x86/intrapred8.asm: + asm: assembly code for IntraPredAng4x4 Mode 8 & 28 + [e37b4badaaa4] + + * source/common/vec/intra-ssse3.cpp, source/common/x86/intrapred.h, + source/common/x86/intrapred8.asm: + asm: assembly code for IntraPredAng4x4 Mode 7 & 29 + [2f49dab61e52] + + * source/common/vec/intra-ssse3.cpp, source/common/x86/intrapred.h, + source/common/x86/intrapred8.asm: + asm: assembly code for IntraPredAng4x4 Mode 6 & 30 + [2275a3803a80] + + * source/common/vec/intra-ssse3.cpp, source/common/x86/intrapred.h, + source/common/x86/intrapred8.asm: + asm: assembly code for IntraPredAng4x4 Mode 5 & 31 + [f6dbd8dcec6c] + + * source/common/x86/intrapred8.asm: + asm: ALIGN branch target to improvement performance + [f35737d6abfe] + + * source/common/vec/intra-ssse3.cpp, source/common/x86/intrapred.h, + source/common/x86/intrapred8.asm: + asm: assembly code for IntraPredAng4x4 Mode 4 & 32 + [d4fe3e90aebf] + +2013-12-03 Steve Borho + + * 
source/Lib/TLibCommon/ContextTables.h, + source/Lib/TLibEncoder/TEncSbac.cpp: + sbac: move global tables into x265 namespace + [1d2d60f4eb81] + + * source/Lib/TLibCommon/TComPrediction.cpp: + TComPrediction: remove obsolete argument comment + [55c0bf9d9966] + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/common/x86/asm- + primitives.cpp, source/encoder/compress.cpp: + Merge with stable + [4b455bb3caf6] + + * .hgtags: + Added tag 0.6 for changeset b970ffbdd696 + [6b7550eed359] + + * source/common/x86/asm-primitives.cpp: + asm: fix for icpc build + [b970ffbdd696] [0.6] + + * source/common/vec/intra-sse41.cpp, source/common/vec/intra- + ssse3.cpp: + vec: use square block size enums + [70dba94f6012] + + * source/common/CMakeLists.txt, source/common/vec/pixel-sse41.cpp, + source/common/vec/vec-primitives.cpp: + vec: remove pixel-sse41.cpp, we have full 8bpp pixelcmp coverage in + asm + [8c32edd92522] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + tskip: temporary workaround for heap corruption + + When tskip is enabled and tu-intra-depth is > 1 and --ctu is 64, we + often see heap corruption when the encoder is destroyed. Disabling + transform-skip on chroma seems to be an effective workaround until + we can find a proper fix. + [8e107ce14b43] + +2013-12-03 Aarthi Thirumalai + + * source/encoder/compress.cpp: + aq: bug fix for hash mismatch between recon with decoded output + [c541804106c7] + +2013-12-03 Steve Borho + + * source/encoder/ratecontrol.cpp: + partial backout of 9cfe20b782da + + Aarthi was correct, getSliceType() returns an HM SliceType enum. + Five lines above this it was testing whether getSliceType() == + I_SLICE. + + It is far less than optimal that we have two sets of slice type + definitions, but we have to be careful not to mix them up. 
+ [432a87f546cd] + +2013-12-03 Yuvaraj Venkatesh + + * source/common/vec/intra-sse41.cpp: + fix 16bpp build fail due to intra-sse41.cpp + [6cc25bb700fc] + +2013-12-03 Steve Borho + + * source/common/vec/intra-sse41.cpp: + Backed out changeset: 8a3bb3ecf8f6 + [552d46bc20a9] + + * Merge + [80938e33bb04] + +2013-12-03 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel- + util8.asm: + asm: 10bpp code for transpose 4x4 and 8x8 + [c9fe21f4676e] + +2013-12-03 Min Chen + + * Merge branch 'X' + [04350bfb3147] + +2013-12-03 Murugan Vairavel + + * source/common/x86/ssd-a.asm: + asm: 10bpp code for pixel_sse_pp for 12x16, 24x32 and 64xN + [78aa21f66013] + +2013-12-03 Yuvaraj Venkatesh + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm: + asm: pixel_satd_32xN for 16bpp + [f8b0946edc8f] + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm: + asm: pixel_satd_64xN for 16bpp + [b4740dcd1621] + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm: + asm: pixel_satd - 12x16, 24x32, 48x64 for 16bpp + [69c110c3751b] + +2013-12-03 Dnyaneshwar G + + * source/common/x86/sad-a.asm: + asm: pixel_sad_64xN reduce large code size + [ca60c165ba4b] + +2013-12-03 Murugan Vairavel + + * source/common/x86/ssd-a.asm: + asm: 10bpp fix for alignment in sse_ss routine + [cd018d294e96] + +2013-12-03 Min Chen + + * source/common/vec/intra-ssse3.cpp, source/common/x86/const-a.asm, + source/common/x86/intrapred.h, source/common/x86/intrapred8.asm, + source/test/intrapredharness.cpp: + asm: assembly code for IntraPredAng4x4 Mode 3 & 33 + [9f7dad70e6a9] + + * source/common/x86/const-a.asm, source/common/x86/dct8.asm: + asm: support IDCT4-10bpp + [6f284a447088] + + * source/common/vec/intra-ssse3.cpp, source/common/x86/intrapred8.asm, + source/test/intrapredharness.cpp: + asm: intra_pred_ang Mode 34 + [21adddaee460] + + * source/common/x86/blockcopy8.asm: + asm: fix xmm register error in cvt16to32_shl + 
[7fa921f5845d] + +2013-12-02 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp: + asm: 10bpp support to sse_ss for 8xN, 16xN and 32xN blocks + [102b5042e596] + +2013-12-02 Min Chen + + * source/common/x86/mc-a.asm, source/common/x86/mc.h: + asm: fix miss avg_64x32 + [b2c7a7310e9f] + +2013-12-02 Dnyaneshwar G + + * source/common/x86/asm-primitives.cpp, source/common/x86/mc-a.asm: + 16bpp: assembly code for pixelavg_pp + [3fea1dd70211] + +2013-12-02 Murugan Vairavel + + * source/common/vec/intra-sse41.cpp: + fix: bug in inra-sse41.cpp + [8a3bb3ecf8f6] + +2013-12-02 Min Chen + + * source/common/vec/intra-ssse3.cpp, source/common/x86/intrapred.h, + source/common/x86/intrapred8.asm: + asm: assembly code for IntraPredAng4x4 Mode 2 + [f1ccd8eab389] + + * source/common/primitives.h: + fixup proto of intra_pred_ang_t + [5457fd3a26a7] + +2013-12-03 Min Chen + + * source/test/pixelharness.cpp: + testbench: increment buffer size to avoid crash in check_pixeladd_ss + [21fe86adaa9a] + +2013-12-03 Aarthi Thirumalai + + * source/encoder/compress.cpp: + aq: bug fix for hash mismatch between recon with decoded output + [660ec2c02798] + +2013-12-03 Deepthi Devaki + + * source/Lib/TLibEncoder/TEncCfg.h, source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/common.cpp, + source/encoder/cturow.cpp, source/encoder/encoder.cpp, + source/x265.cpp, source/x265.h: + define new rdlevels, default presets changed accordingly. + + levels 6,5 equivalent to current rd 2, 4 equivalent to current rd 1, + and rest equivalent to rd 0. More parameters will be added to + distinguish each levels. + [86d23688b017] + + * source/encoder/encoder.cpp: + Move rdoQTS initialization after rdoQ is set. 
+ [9f96817c8894] + +2013-12-02 Gopu Govindaswamy + + * source/common/common.cpp, source/encoder/ratecontrol.cpp, + source/encoder/ratecontrol.h, source/encoder/slicetype.cpp: + cuTree: integrated CuTree into RateControl and Added b-references + into RC + [9cfe20b782da] + +2013-12-03 Deepthi Nandakumar + + * source/Lib/TLibEncoder/TEncSearch.cpp: + heap corruption: temporarily disable tskip in chroma. + + Temporary fix for the crash caused by enabling tskip in chroma. + [f539fa01f20c] + +2013-12-02 Steve Borho + + * source/common/TShortYUV.cpp, source/common/TShortYUV.h, + source/common/vec/intra-sse41.cpp, source/input/y4m.cpp: + Merge with stable + [ca7bd538e052] + + * source/input/y4m.cpp: + y4m: initialize colorSpace to avoid crashes after failing to parse + y4m header + [40f9842972da] + + * source/common/vec/intra-sse41.cpp: + vec: disable allangs 32x32 intrinsic primitive for clang + + clang seems to compile it ok, and the testbench passes, but it + causes runtime exceptions when used within the encoder. + [353a34a5d35f] + + * source/common/TShortYUV.cpp, source/common/TShortYUV.h: + Backed out changeset: 5df643257054 + + This was causing memory corruption when --tskip and --rd=2 were used + together. It needs to be further investigated and fixed on the + default branch. 
+ [c2911115b79a] + + * source/common/x86/pixel-a.asm, source/common/x86/pixel.h: + asm: make it more clear that pixel-a.asm has only satd and sa8d now + [70e127d735a5] + + * source/common/CMakeLists.txt, source/common/x86/pixel-a.asm, + source/common/x86/ssd-a.asm: + asm: move ssd functions into their own ssd-a.asm file, similar to + sad-a.asm + [a9f629fac91e] + + * source/common/x86/pixel-a.asm, source/common/x86/pixel-util.h, + source/common/x86/pixel-util8.asm, source/common/x86/pixel.h: + asm: move variance functions to pixel-util8.asm + [eea094a84b9c] + + * source/common/x86/mc.h, source/common/x86/pixel-a.asm, + source/common/x86/pixel-util.h, source/common/x86/pixel-util8.asm, + source/common/x86/pixel.h: + asm: move pixel_sub to pixel-util8.asm, move pixel_avg funcdef to + mc.h + [2de04bb5da1d] + + * source/common/x86/pixel-a.asm: + pixel: remove an unused macro + [2ed3b664c370] + + * source/common/x86/pixel-a.asm, source/common/x86/pixel-util.h, + source/common/x86/pixel-util8.asm, source/common/x86/pixel.h: + asm: move scale functions to pixel-util + [a439c19ee304] + + * source/common/x86/pixel-a.asm, source/common/x86/pixel-util.h, + source/common/x86/pixel-util8.asm, source/common/x86/pixel.h: + asm: move SSIM functions to pixel-util + [b091438d1446] + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm, + source/common/x86/pixel-util.h, source/common/x86/pixel-util8.asm, + source/common/x86/pixel.h: + asm: move transpose from pixel-a.asm to pixel-util8.asm, add pixel- + util.h + [a182faf23ead] + + * source/common/x86/pixel.h: + asm: remove more unused funcdefs from pixel.h + [41c6dc5b35e8] + + * source/common/x86/blockcopy8.asm, source/common/x86/pixel-util8.asm: + asm: move cvt* functions to blockcopy8.asm + [b6766dc86e2a] + + * source/encoder/frameencoder.cpp, source/encoder/ratecontrol.cpp: + rc: fixups for cutree changes + [dab34fa63c0c] + +2013-12-02 Murugan Vairavel + + * source/common/x86/pixel.h: + asm: removed unused 
function definitions from pixel.h + [47ddbf9b5866] + +2013-12-02 Yuvaraj Venkatesh + + * source/common/x86/pixel.h, source/common/x86/sad-a.asm: + cleanup: removed unused code from sad-a.asm + [a615a46d4631] + +2013-12-02 Steve Borho + + * source/common/pixel.cpp: + pixel: fix compile error from older gcc + [4508b8c923e6] + + * source/common/CMakeLists.txt: + cmake: fix Win64 vector primitive compile flags + [ccf65888fc2c] + + * Merge with stable + [d8d716eb11b8] + +2013-12-02 Shazeb Nawaz Khan + + * source/Lib/TLibEncoder/WeightPredAnalysis.cpp: + fix for the number of weighted references exceeding 8 in HM weight + analysis + [bf778de26451] + +2013-12-02 Gopu Govindaswamy + + * source/encoder/slicetype.cpp, source/encoder/slicetype.h: + slicetype: fix for gcc warnings + [0a8023666206] + +2013-12-02 Murugan Vairavel + + * source/common/x86/pixel-a.asm: + asm: removed unused code from pixel-a.asm + [df0b4f81609e] + +2013-12-02 Steve Borho + + * source/common/pixel.cpp: + pixel: fix 16bpp warnings that were previously hidden by cmake rules + [0a85121531fc] + +2013-12-02 Gopu Govindaswamy + + * source/common/common.cpp, source/common/lowres.cpp, + source/common/lowres.h, source/encoder/frameencoder.cpp, + source/encoder/ratecontrol.h, source/encoder/slicetype.cpp, + source/encoder/slicetype.h, source/x265.h: + slicetype: CuTree Implementation for AQ RateControl + + Added Following methods into slicetype for CuTree Implementation + 1.cuTree - Entry Point for CuTree 2.estimateCUPropagate and + estimateCUPropagateCost - Calculate the CU Propagate cost for CU's + 3.cuTreeFinish - update the qpOffset using Precomputed + PropagateCost, weightedCostDelta and lookahead costs + + Added cuTree option into param->rc and make it as a Disable, still + the cuTree is an Under Construction + [c75c3431b108] + +2013-12-02 Steve Borho + + * source/common/CMakeLists.txt: + cmake: nits + [189ac76266a9] + + * source/common/CMakeLists.txt: + cmake: ignore gcc warnings in vector intrinsic 
files + [bb65f4686d68] + +2013-12-01 Steve Borho + + * source/common/CMakeLists.txt, source/common/vec/pixel-ssse3.cpp, + source/common/vec/vec-primitives.cpp: + vec: remove pixel-ssse3.cpp, its last function has asm coverage + [bd3fad7cffec] + + * source/common/CMakeLists.txt, source/common/vec/dct-sse3.cpp, + source/common/vec/dct-ssse3.cpp, source/common/vec/intra-sse41.cpp, + source/common/vec/ipfilter-ssse3.cpp, source/common/vec/pixel- + sse41.cpp, source/common/vec/pixel16-sse41.cpp: + cmake: ignore unreferenced formal paramter warnings in common/vec + + And remove all the hacks that were in place to avoid those warnings + [b4eef7d41af1] + + * source/common/x86/asm-primitives.cpp: + asm: nits + [e83550d5f10d] + + * source/common/x86/asm-primitives.cpp: + asm: 16bpp fixup + [5db9eba4b2df] + + * source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibCommon/TComYuv.cpp, + source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/TShortYUV.cpp, + source/common/TShortYUV.h, source/common/common.cpp, + source/common/cpu.cpp, source/common/ipfilter.cpp, + source/common/pixel.cpp, source/common/primitives.h, + source/common/vec/blockcopy-sse3.cpp, source/common/vec/dct- + sse3.cpp, source/common/vec/dct-ssse3.cpp, source/common/vec/intra- + sse41.cpp, source/common/vec/intra-ssse3.cpp, source/common/x86/asm- + primitives.cpp, source/common/x86/blockcopy8.h, + source/common/x86/ipfilter8.h, source/common/x86/pixel.h, + source/encoder/compress.cpp, source/encoder/encoder.cpp, + source/encoder/frameencoder.cpp, source/encoder/motion.cpp, + source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h, + source/encoder/slicetype.cpp, source/encoder/slicetype.h, + source/input/y4m.cpp, source/input/yuv.cpp, source/output/y4m.cpp, + source/output/yuv.cpp, source/test/intrapredharness.cpp, + source/test/ipfilterharness.cpp, 
source/test/pixelharness.cpp, + source/test/testbench.cpp, source/x265.h: + uncrustify all source + [5bf05c39e566] + + * source/common/common.cpp, source/encoder/encoder.cpp, + source/x265.cpp, source/x265.h: + Merge with stable + [7f2bc20e9ff1] + + * source/encoder/encoder.cpp, source/x265.def.in, source/x265.h: + api: add alloc/free methods for x265_picture for future safety + [32942ebd5793] + + * source/encoder/encoder.cpp, source/x265.cpp, source/x265.def.in, + source/x265.h: + api: add alloc/free methods for x265_param + + This allows apps to use x265_param as pure abstract structures, just + like x265_encoder, in order to make their app future-safe against + changes to x265_param (without requiring recompiles or code + changes). + [aabaf5382ac5] + + * source/common/common.cpp, source/encoder/encoder.cpp, + source/encoder/slicetype.cpp, source/x265.cpp, source/x265.h: + api: change x265_param.bpyramid to bBPyramid (bool) + + x264's strict bpyramid mode seems to be H.264/blue-ray specific and + thus we are left with only on or off. + [53b1eeb6333b] + + * source/encoder/encoder.cpp: + csv: improve CSV headers to use title caps + [418e55eefbb8] + + * source/test/intrapredharness.cpp: + test: intrapred harness fixup + [0547f1672f13] + + * source/encoder/encoder.cpp: + encoder: nits + [d17843e942b5] + +2013-11-08 Xun Xu, PPLive Corporation + + * source/common/common.cpp, source/encoder/encoder.cpp, + source/encoder/encoder.h, source/encoder/frameencoder.cpp, + source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h, + source/x265.cpp, source/x265.h: + rc: initial work towards VBV rate control + + 1. add parameter "vbv-maxrate" "vbv-bufsize" "vbv-init" into cmd + line 2. implement vbv methods, this patch doesn't use lookahead data + + future work, 1. vbv-lookahead 2. 
CU level ratecontrol + [8a90153de720] + +2013-12-01 Steve Borho + + * source/common/vec/intra-sse41.cpp: + vec: drop intra planar intrinsic primitives, we have asm coverage + [343d9ba487b2] + + * source/test/intrapredharness.cpp: + intra: testbench fixups after dropping 64x64 C refs + [81c09b55acf1] + + * source/common/vec/dct-sse3.cpp, source/common/vec/dct-ssse3.cpp: + vec: remove two DCT intrinsic primitives with asm coverage + [9facac4f81f7] + + * source/common/intrapred.cpp: + intrapred: fix func decl of intra-ang C ref + [50261fa292ad] + + * source/common/intrapred.cpp: + intrapred: use square block defines, do not instantiate intra 64x64 + [3409078021ac] + + * source/common/x86/asm-primitives.cpp: + asm: plumb out more 16bpp asm setup infrastructure + [776fc3575e2d] + +2013-11-29 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm: + 10bpp: asm code for pixel_var_32x32 and 64x64 + [803048f62317] + +2013-11-29 Nabajit Deka + + * source/common/vec/dct-sse41.cpp, source/common/x86/asm- + primitives.cpp: + Enable idst4 asm + [d8c523bd9f90] + + * source/common/x86/dct8.asm, source/common/x86/dct8.h: + asm : Adding asm routine for idst4 + [3e8c280b16a6] + +2013-11-29 Min Chen + + * source/Lib/TLibCommon/CommonDef.h, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h, source/common/intrapred.cpp, + source/common/primitives.h, source/common/vec/intra-ssse3.cpp, + source/test/intrapredharness.cpp, source/test/intrapredharness.h: + size based array for intra_pred_ang[] + [cc7bb2f18d01] + +2013-11-28 Yuvaraj Venkatesh + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm: + asm: enabled asm routines for HIGH_BIT_DEPTH, which has the support + for 16bpp + [bb776ea49cba] + +2013-11-28 Nabajit Deka + + * source/common/vec/dct-ssse3.cpp, source/common/x86/asm- + primitives.cpp, source/common/x86/dct8.asm, + source/common/x86/dct8.h: + asm : Adding asm routine for dst4. 
+ [2ab09fab2826] + +2013-11-28 Dnyaneshwar Gorade + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.h, source/common/x86/pixel-util8.asm: + asm: assembly code for cvt16to32_shl + [9bda4cecf6c0] + +2013-11-28 Min Chen + + * source/common/primitives.h: + cleanup: remove unused cvt16to16_shl_t + [8f1a72797abb] + +2013-11-28 Dnyaneshwar Gorade + + * source/test/pixelharness.cpp, source/test/pixelharness.h: + testbench: added cvt16to32_shl primitive function + [f9935384fa2a] + +2013-12-01 Steve Borho + + * source/common/vec/pixel-sse41.cpp: + pixel: remove sse_sp intrinsic primitives, we have asm coverage + [5857fdc3c3ff] + +2013-11-28 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel.h: + asm: cleanups for pixel_sse_sp + [8a9a0ef760e8] + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm, + source/common/x86/pixel.h: + asm: code for pixel_sse_sp_4xN + [052a1b094def] + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm, + source/common/x86/pixel.h: + asm: code for pixel_sse_sp_12x16 + [8683adc61bec] + +2013-11-28 Min Chen + + * source/common/CMakeLists.txt, source/common/x86/intrapred.asm, + source/common/x86/intrapred8.asm, source/common/x86/pixel-util.asm, + source/common/x86/pixel-util8.asm: + rename to avoid 10bpp conflict + [016709ae6264] + +2013-11-28 Dnyaneshwar G + + * source/common/x86/asm-primitives.cpp, + source/common/x86/intrapred.asm, source/common/x86/intrapred.h: + asm: assembly code for intra_pred_planar[32x32] + [e6a32d404e18] + +2013-11-30 Deepthi Nandakumar + + * Merge from stable + [2786f9e92560] + + * source/common/common.cpp: + presets: bpyramid default value reset to 1 + + No support for strict b-pyramid yet. 
+ [87dc694fc016] + + * source/encoder/encoder.cpp: + ssim: increase precision in ssim reporting + [b08f3853adb9] + +2013-11-29 Deepthi Nandakumar + + * Merge from stable + [833d78aaf71e] + + * source/common/common.cpp: + presets: correct bframes in "slow" to 4 + [fb93582b5f3f] + + * source/encoder/compress.cpp: + compress: cleanup + [ac01f12310ed] + + * source/encoder/compress.cpp: + compress: save distortion info in xComputeCostInter. + [2559b4c52148] + + * source/encoder/compress.cpp: + compress: save best bits, sad in xcomputeCostIntrainInter + [a7d2fb189311] + + * source/encoder/compress.cpp: + compress: disable EARLY_EXIT and TOP_SKIP (temporarily) + [e0036ec4a61b] + +2013-11-28 Steve Borho + + * Merge with stable + [e7a5780843de] + + * source/common/vec/vec-primitives.cpp: + vec: tab to spaces + [2456d360a4ce] + +2013-11-29 Deepthi Nandakumar + + * source/common/common.cpp: + presets: modifications to smoothen the performance-efficiency plot. + [bcc0941f67b3] + +2013-11-28 Deepthi Nandakumar + + * source/encoder/compress.cpp: + RD merge: refine merge costs with estimated merge mode bits. + [42892f4f4cc2] + +2013-11-28 Steve Borho + + * source/CMakeLists.txt, source/common/CMakeLists.txt, + source/common/vec/vec-primitives.cpp: + cmake: gcc flag introspection to fix Mac OS X 10.6 build + [38719294293f] + +2013-11-28 Deepthi Nandakumar + + * source/encoder/compress.cpp: + RD merge: add in early-skip param. + [2ba6c26c9feb] + + * source/Lib/TLibEncoder/TEncCu.h, source/encoder/compress.cpp: + RD merge: remove earlyDetectionSkip, output is unchanged. + [ca8c57f0c532] + + * source/encoder/compress.cpp: + RD merge: remove more redundant set-fields + [54379de63b85] + +2013-11-27 Deepthi Devaki + + * source/encoder/compress.cpp: + merge2nx2n: move initializations out of the loop + + Only mergeIndex and MVs will be changed in each loop. 
+ [f516ff5d6af3] + +2013-11-28 Deepthi Nandakumar + + * source/encoder/compress.cpp: + RD: use sa8d in inter/merge cost measurements instead of satd + [0c2a9f913c0b] + + * source/encoder/compress.cpp: + RD: change cost measurements for inter + + The cost calculation for inter modes now use cost = satd(orig, pred) + + lambda*mebits. This is an estimation to the actual RD-cost. + [0ab1c01bf952] + +2013-11-27 Steve Borho + + * source/encoder/encoder.cpp: + encoder: change default frameNumThreads for 4-core HT CPUs to 3 + [f92e0c49a9f0] + +2013-11-27 Shazeb Nawaz Khan + + * source/encoder/frameencoder.cpp: + Fix to eliminate unwanted application of weight in some (ref, enc) + pairs + [bfd5204ef226] + +2013-11-27 Aarthi Thirumalai + + * source/encoder/frameencoder.cpp, source/encoder/framefilter.cpp: + aq: Bug Fixes in aq , ssim when ctu size is not set as 64. + + 1.Calculate qp offset for CU based maxCuSize set in + FrameEncoder::calcQpForCu 2.Fix height for ssim computation based on + maxCuHeight in FrameFilter::processRowPost + [6eb45b5bf181] + +2013-11-27 Steve Borho + + * source/encoder/encoder.cpp: + encoder: use more portable %PRIu64, define __STDC_FORMAT_MACROS + + Older GCC versions and clang require __STDC_FORMAT_MACROS to be + defined prior to including stdint.h when compiling for C++, in order + for PRIu64 and friends to be defined + [13f60881bf01] + + * Merge with stable + [949f85337789] + + * source/encoder/CMakeLists.txt, source/encoder/encoder.cpp: + cmake: detect inttypes.h and use for uint64_t printfs + [e4baf53cefe8] + + * source/common/vec/intra-sse41.cpp: + vec: remove intra_pred_planar16_sse4, we have asm coverage + [892addcb1c94] + + * source/common/vec/pixel-ssse3.cpp: + vec: remove scale2D_64to32, we have asm coverage + [c5efe0603b61] + + * source/common/x86/intrapred.asm: + intra: fix yasm warning about redefined macro + [04811b42aa6b] + + * Merge + [ec904fab863a] + +2013-11-27 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp, 
source/common/x86/pixel-a.asm, + source/common/x86/pixel.h: + asm: code for pixel_sse_sp_24x32 + [cf5c2f982353] + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm, + source/common/x86/pixel.h: + asm: code for pixel_sse_sp_8xN + [54ba57708276] + +2013-11-27 Nabajit Deka + + * source/common/vec/dct-sse3.cpp, source/common/x86/asm- + primitives.cpp, source/common/x86/dct8.h: + Enable the idct4 asm routine. + [7dbe6495ebb8] + + * source/common/x86/dct8.asm: + asm: Adding asm routine for idct4 + [e463501f8a20] + + * source/common/x86/const-a.asm: + Adding constant tables used for idct4 asm routine + [a49c0228e06e] + +2013-11-27 Yuvaraj Venkatesh + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm: + asm: pixel_sse_ss_64xN assembly routine + [bf7cf2555571] + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm: + asm: pixel_sse_ss_48x64 assembly routine + [45ce09834506] + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm: + asm: pixel_sse_ss_24x32 assembly routine + [8edf6fa32a74] + +2013-11-27 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm, + source/common/x86/pixel.h: + asm: code for pixel_sse_sp_64xN + [248a56faff0a] + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm, + source/common/x86/pixel.h: + asm: code for pixel_sse_sp_48x64 + [6051967b60cd] + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm, + source/common/x86/pixel.h: + asm: code for pixel_sse_sp_32xN + [aeb1c93c69d2] + + * source/common/pixel.cpp, source/common/x86/asm-primitives.cpp, + source/common/x86/pixel-a.asm, source/common/x86/pixel.h: + asm: code for pixel_var_32x32 and 64x64 blocks + [8846d37b3d9c] + +2013-11-27 Dnyaneshwar Gorade + + * source/common/x86/asm-primitives.cpp, + source/common/x86/intrapred.asm, source/common/x86/intrapred.h: + asm : assembly code for intra_pred_planar[16x16] + [09b5e8f592ac] + +2013-11-27 
Yuvaraj Venkatesh + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm: + asm: fix the alignment issues occured in sse_ss + [9c60abb71cf6] + +2013-11-18 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm, + source/common/x86/pixel.h: + asm: code for scale2D_64to32 routine + [78c171e33865] + +2013-11-27 Steve Borho + + * source/Lib/TLibCommon/ContextModel.cpp, + source/Lib/TLibCommon/ContextModel.h, + source/Lib/TLibEncoder/TEncAnalyze.h, source/common/reference.cpp, + source/common/reference.h, source/common/vec/blockcopy-avx2.cpp, + source/common/vec/intra-sse3.cpp, source/common/vec/pixel-avx2.cpp, + source/common/vec/pixel-sse3.cpp, source/compat/msvc/LGPL.txt, + source/compat/msvc/getopt.c, source/compat/msvc/getopt.h: + Merge with default (feature freeze for 0.6) + + Bug fixes and tunings only on the stable branch, until the next tag + is issued. + [417f794274e5] + + * source/encoder/encoder.cpp: + encoder: do not print weightp stats if no P frames were generated + + (prevents NAN or worse errors) + [ece323e1b603] + + * source/encoder/encoder.cpp: + log: replace "Skip" with more accurate "Merge" + [eadcdeb661bc] + +2013-11-26 Steve Borho + + * source/common/vec/intra-sse41.cpp: + vec: drop intra_pred_planar 4x4 and 8x8 intrinsic primitives, we + have asm + [b09b6fa7e89a] + + * source/encoder/encoder.cpp: + encoder: return pic.sliceType = X265_TYPE_IDR if keyframe + [39286dff86ea] + + * source/common/vec/dct-sse41.cpp: + vec: drop dequant_normal intrinsic primitive, we have asm + [5b5d37ef369a] + + * source/common/vec/dct-sse3.cpp: + vec: remove dct4 intrinsic primitive + [543390a8644c] + + * Merge + [ba8e95798860] + +2013-11-26 Nabajit Deka + + * source/common/vec/dct-sse3.cpp, source/common/x86/asm- + primitives.cpp: + Enable the new dct4 asm routine. 
+ [5e3b1d59d8dd] + + * source/common/CMakeLists.txt: + Adding dct8.asm and dct8.h to CMakeLists + [51b6d0c6ecf5] + + * source/common/x86/const-a.asm: + Adding constant table used for dct4 + [78431cd16bb5] + + * source/common/x86/dct8.asm, source/common/x86/dct8.h: + asm: assembly code for dct4x4 + [e9ac715f16cf] + + * source/common/x86/pixel-util.asm: + asm: Correct number of xmm registers for weight_sp routine. + [8bc370263676] + +2013-11-26 Dnyaneshwar Gorade + + * source/common/x86/asm-primitives.cpp, + source/common/x86/intrapred.asm, source/common/x86/intrapred.h: + asm: assembly code for intra_pred_planar[8x8] + [13fe158374e7] + +2013-11-26 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm, + source/common/x86/pixel.h: + asm: code for pixel_sse_sp_16xN + [3791482e68f5] + + * source/common/pixel.cpp, source/common/primitives.h, + source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm, + source/common/x86/pixel.h, source/encoder/ratecontrol.cpp, + source/test/pixelharness.cpp: + asm: removed unused code in pixel_var module + [a903be46b40d] + +2013-11-26 Min Chen + + * source/common/dct.cpp: + fix: in 14bpp mode, maximum shift is 10 + [7b48cda38797] + + * source/common/pixel.cpp, source/common/primitives.h, + source/common/x86/asm-primitives.cpp, + source/encoder/framefilter.cpp, source/test/pixelharness.cpp, + source/test/pixelharness.h: + asm: active x264 ssim code + + Side effect: Remove ssim_t to avoid conflict with x264 asm code we + use int64_t when HIGH_BIT_DEPTH enable, but x264 always 'int' + [63a87d839bed] + +2013-11-25 Steve Borho + + * source/common/vec/pixel-sse41.cpp: + pixel: remove intrinsic pixel weight functions, we have asm coverage + [491fd3ee6fd1] + +2013-11-26 Steve Borho + + * source/x265.h: + api: document a few rate control settings + [5accd2ae5ceb] + +2013-11-26 Min Chen + + * source/common/x86/pixel-a.asm, source/common/x86/pixel-util.asm, + source/common/x86/pixel.h: + asm: fix 
build error on x64 + [116d91f08fcb] + +2013-11-26 chenm003 + + * source/common/dct.cpp, source/common/x86/asm-primitives.cpp: + 合并 multicoreware/x265 到 default + [06d509e2e687] + +2013-11-25 Murugan Vairavel + + * source/common/pixel.cpp, source/common/x86/asm-primitives.cpp, + source/common/x86/pixel-a.asm, source/common/x86/pixel.h: + asm: code for pixel_var_16xN + [672ae35d4e5f] + +2013-11-25 Yuvaraj Venkatesh + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm: + asm: assembly code for pixel_sse_ss_32xN + [8075b13cee00] + +2013-11-25 Dnyaneshwar G + + * source/common/x86/asm-primitives.cpp, + source/common/x86/intrapred.asm, source/common/x86/intrapred.h: + asm: assembly code for intra_pred_planar[4x4] + [6a8fbb091722] + +2013-11-25 Murugan Vairavel + + * source/common/pixel.cpp, source/common/x86/asm-primitives.cpp, + source/common/x86/pixel-a.asm, source/common/x86/pixel.h: + asm: code for pixel_var_8xN + [da18434af735] + +2013-11-25 Yuvaraj Venkatesh + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm: + asm: assembly code for pixel_sse_ss_12x16 + [71262c718dfa] + +2013-11-25 Murugan Vairavel + + * source/test/pixelharness.cpp, source/test/pixelharness.h: + Test bench: code for pixel_var + [529bd0084265] + +2013-11-25 Yuvaraj Venkatesh + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm, + source/common/x86/pixel.h: + asm: assembly code for sse_ss - 4xN, 8xN, 16xN + [7cab79758dd7] + +2013-11-25 Nabajit Deka + + * source/test/pixelharness.cpp: + Test bench modifications for weight_sp() asm routine + [3e688d424f05] + + * source/common/vec/pixel-sse41.cpp, source/common/x86/asm- + primitives.cpp, source/common/x86/pixel.h: + Adding asm function declaration and initialization for weight_sp asm + routine + [47ef19a1734c] + + * source/common/x86/pixel-util.asm: + asm : routine for weight_sp(). 
+ [d9d6b8b4e4f1] + +2013-11-25 Min Chen + + * source/common/x86/const-a.asm, source/common/x86/ipfilter8.asm: + asm: move constant 8192 to const-a.asm for share + [a5c7cd496583] + +2013-11-25 Nabajit Deka + + * source/common/vec/pixel-sse41.cpp, source/common/x86/asm- + primitives.cpp, source/common/x86/pixel.h: + Adding asm function declaration and initialization for weight_pp asm + routine. + [be74f1731279] + + * source/test/pixelharness.cpp: + Test bench modifications for weight_pp() asm routine. + [13126513fe61] + + * source/common/x86/pixel-util.asm: + asm : routine for weight_pp(), for input width in multiples of 16 + [3e4c257d88ab] + +2013-11-25 Praveen Tiwari + + * source/common/x86/asm-primitives.cpp: + cleanup the temporary function pointer initialization + [b54870f0cdd3] + +2013-11-25 Min Chen + + * source/common/dct.cpp, source/common/x86/asm-primitives.cpp, + source/common/x86/pixel-util.asm, source/common/x86/pixel.h: + asm: assembly code for dequant_normal + [67e8ecb2b0e5] + + * source/common/x86/mc-a.asm: + cleanup: remove unused code in mc-a.asm + [9c7142ced7c4] + +2013-11-24 Min Chen + + * source/common/x86/pixel-a.asm: + cleanup: remove unused constant in pixel-a.asm + [c0c862dc71fb] + + * source/common/x86/pixel-a.asm: + cleanup: remove unused code in pixel-a.asm + [513f564ba360] + + * source/common/x86/mc-a2.asm: + cleanup: remove unused code in mc-a2.asm + [464af047f7b1] + +2013-11-25 Steve Borho + + * source/encoder/encoder.cpp: + encoder: do not warn of b-pyramid and small bframe value + + Many default presets can do this, it is harmless + [fdaa754dfbd4] + + * source/encoder/encoder.cpp: + log: shorten Planar to P + [e607520863c0] + + * source/encoder/encoder.cpp: + log: do not output statistics for disabled features + [484ac0c345a2] + + * source/common/dct.cpp: + dct: remove unused static array + [81f4e226c5a3] + + * source/encoder/encoder.cpp: + encoder: compilers can't agree on how to print long longs + [319881b4ebf3] + +2013-11-25 
Kavitha Sampath + + * source/Lib/TLibEncoder/TEncCu.cpp, source/encoder/compress.cpp, + source/encoder/encoder.cpp: + log: improve statistics display + + represent 4x4 statistics for I frame, combine I frame log in single + line, preserve space in log + [c7b97ebc1c9b] + +2013-11-24 Steve Borho + + * source/common/x86/asm-primitives.cpp: + asm: repair ICL compile on Windows + [864f4664c775] + +2013-11-22 Steve Borho + + * source/common/vec/vec-primitives.cpp: + clang: remove compilation guards around + Setup_Vec_IPredPrimitives_sse41() + + Now that a number of primitives have been replace by assembly, the + functions are safe. + [10f605bd0530] + + * source/common/CMakeLists.txt, source/common/vec/pixel-sse3.cpp, + source/common/vec/pixel-ssse3.cpp, source/common/vec/vec- + primitives.cpp: + cmake: merge pixel-sse3.cpp into pixel-ssse3.cpp + + Both files only had one primitive each, and they will both probably + be replaced soon + [e5db4a4859ee] + + * source/common/vec/pixel-ssse3.cpp: + pixel-ssse3: remove scale1D_128to64, we have ASM + [7e2694f9a226] + + * source/common/vec/pixel-sse41.cpp: + pixel-sse41: cleanup intrinsic weight function names + [a6da40d0584f] + + * source/common/vec/intra-sse41.cpp: + vec: remove intrinsic predDCFiltering() and intra_pred_dc(), we have + ASM + [285fe7a59cf5] + + * source/common/x86/asm-primitives.cpp: + asm: remove assignements to square block sa8d[] methods + + These are handled specially in x265_setup_primitives() + [e28d9b6b5d65] + +2013-11-22 Aarthi Thirumalai + + * source/encoder/ratecontrol.cpp: + crf: bug fix. regulate qp of first frame based on ABR_INIT_QP. + [e6ec06cf5d3d] + +2013-11-22 Deepthi Nandakumar + + * Merge + [cc075ae1098f] + +2013-11-20 Sumalatha Polureddy + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/encoder/compress.cpp: + no-rdo: implemented topskip algorithm + + It is basically setting the starting depth from which the + partitioning should happen for particular CU. 
The starting depth for + particular CU in present frame is selected form the previous frame's + colocated CU minimum depth. + + the performance, bitrate increase and psnr comparison are given + below CLI: x265.exe input.y4m -o abc.hevc -r recon.y4m --rd 1 --ref + 1 a. Early exit OFF and Top Skip OFF b. Early exit OFF and Top Skip + ON + + BasketballDrive_1920x1080_50 Timetaken to encode: 681/639s bitrate: + 3650/3657 kb/s PSNR: 36.7/36.703 perf improvement: 6.16% (compared + to TopSkip OFF and ON) Bitrate increase: 0.19% (compared to TopSkip + OFF and ON) + + Cactus_1920x1080_50 Timetaken to encode: 530/492s bitrate: 2787/2795 + kb/s PSNR: 35.527/35.529 perf improvement: 7.16% (compared to + TopSkip OFF and ON) Bitrate increase: 0.28% (compared to TopSkip OFF + and ON) + + Kimono1_1920x1080_24 Timetaken to encode: 278/264s bitrate: + 1243/1246 kb/s PSNR: 38.16/38.16 perf improvement: 5.03% (compared + to TopSkip OFF and ON) Bitrate increase: 0.24% (compared to TopSkip + OFF and ON) + + FourPeople_1280x720_60 Timetaken to encode: 173/163s bitrate: + 486/492 kb/s PSNR: 39.097/39.094 perf improvement: 5.78% (compared + to TopSkip OFF and ON) Bitrate increase: 1.2% (compared to TopSkip + OFF and ON) + + PartyScene_832x480_50 Timetaken to encode: 134/119s bitrate: + 1652/1661 kb/s PSNR: 31.374/31.377 perf improvement: 11.16% + (compared to TopSkip OFF and ON) Bitrate increase: 0.544% (compared + to TopSkip OFF and ON) + + big_buck_bunny_360p24 Timetaken to encode: 1772/1477s bitrate: + 174/175 kb/s PSNR: 37.798/37.797 perf improvement: 16.6% (compared + to TopSkip OFF and ON) Bitrate increase: 0.5% (compared to TopSkip + OFF and ON) + [b6323f2be057] + +2013-11-22 Deepthi Nandakumar + + * source/encoder/slicetype.cpp: + lookahead primitives: fix bug that caused wrong cost estimates in + ducks_take_off. 
+ [28f42f1be951] + +2013-11-22 Steve Borho + + * source/common/vec/pixel-sse41.cpp: + pixel: drop intrinsic sse_pp functions, we have ASM coverage + [1c74d7bfd007] + + * source/test/pixelharness.cpp: + pixelharness: fix the other header buffer + [3c827bba6cd6] + +2013-11-22 Praveen Tiwari + + * source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TComYuv.h: + TComYuv::addClipChroma, integrated pixel_add_ps function + [cc123a1ec253] + +2013-11-22 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm, + source/common/x86/pixel.h: + asm: code of sse_pp routine for 64x32, 64x48 and 64x64 blocks + [f082c556f337] + +2013-11-22 Praveen Tiwari + + * source/common/x86/asm-primitives.cpp: + added blockcopy_sp function pointers + [4b437f76280d] + + * source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TComYuv.h: + TComYuv::addClip, integrated luma_add_ps + [fd90bd911169] + +2013-11-22 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm, + source/common/x86/pixel.h: + asm: code of sse_pp routine for 48x64 and 64x16 blocks + [2e0a0a5eb0c7] + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm, + source/common/x86/pixel.h: + asm: code for sse_pp_24x32 routine + [0b9bccb2ef7f] + +2013-11-22 Praveen Tiwari + + * source/common/x86/asm-primitives.cpp: + asm-primitives.cpp, removed temporary function pointer + initialization, generated through macro calls + [76e2c787aadb] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/pixeladd8.asm: + pixel_add_ps_64xN, asm code + [e7eeb6443303] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/pixeladd8.asm: + pixel_add_ps_48x64, asm code + [3847098e9553] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/pixeladd8.asm: + pixel_add_ps_12x16, asm code + [9f34d1d82296] + +2013-11-22 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm, + source/common/x86/pixel.h: + 
asm: code for sse_pp_12x16 routine + [f09ca4290a55] + +2013-11-22 Min Chen + + * source/Lib/TLibCommon/TComTrQuant.cpp, source/common/dct.cpp, + source/common/primitives.h, source/common/vec/dct-sse41.cpp, + source/test/mbdstharness.cpp, source/test/mbdstharness.h: + split dequant to normal and scaling path + [4ec80bd40603] + + * source/test/pixelharness.cpp: + fix bug for testbench string buffer overflow + [ab94f6effb71] + +2013-11-22 Steve Borho + + * source/common/cpu.cpp: + cpu: fix non-Windows build with ASM disabled + [5009254d3d3a] + + * source/CMakeLists.txt: + cmake: backout cmake_policy(); cmake is idiotic + + Why would you issue an error if your version of cmake doesn't know + about the given policy? Especially if the selected policy is OLD? + Hello?? + [830deb5fb3d3] + +2013-11-21 Steve Borho + + * source/CMakeLists.txt, source/Lib/TLibEncoder/TEncSearch.cpp, + source/common/CMakeLists.txt: + cmake: almost revive Xcode support + + # macbrew based instructions brew install cmake --HEAD cmake -G + Xcode ../source open x265.xcodeproj + + > cmake --version cmake version 2.8.12.20131121 + + The static library is still not linking properly, so the cli does + not link as well; but it does build the shared library + [f4e10e4d3f0d] + + * source/encoder/motion.cpp, source/encoder/motion.h: + motion: use fast weighted subpel refine + + Don't do the full-blown weighted motion compensation for ME. Just + interpolate the weighted full pel pixels. It is not 100% accurate to + the pixels that will be used to encode the final prediction; but + close enough for ME. 
+ + Testing with sintel_trailer_2k_720p24.y4m at medium preset and all + defaults x265 [info]: 651 of 1124 (57.92%) P frames weighted + + before: 1253 frames in 512.74s (2.44 fps), 223.51 kb/s, Global PSNR: + 50.552 after: 1253 frames in 410.25s (3.05 fps), 223.59 kb/s, Global + PSNR: 50.589 + [8f156b97360b] + + * source/common/vec/intra-sse41.cpp: + intra: fix 16bpp builds + [b172259c07f1] + +2013-11-21 Praveen Tiwari + + * source/common/x86/pixeladd8.asm: + pixel_add_ps, 32xN corrected xmm register count + [76b0d2a278fb] + +2013-11-21 Steve Borho + + * source/test/pixelharness.cpp, source/test/pixelharness.h: + pixelharness: cleanup check function names + [8dfe7282ce81] + + * source/test/intrapredharness.cpp, source/test/pixelharness.cpp, + source/test/testharness.h: + testbench: re-line up test results to improve readability + [1643c78be418] + + * source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/common/pixel.cpp, source/common/primitives.h, + source/common/vec/pixel-sse41.cpp, source/encoder/motion.cpp, + source/encoder/reference.cpp, source/encoder/slicetype.cpp, + source/test/pixelharness.cpp, source/test/pixelharness.h: + primitive: rename weight primitives to match our naming convention + + weight_pp -> weight pixels to pixels weight_sp -> weight shorts to + pixels + [49f8b71ae89b] + + * source/encoder/slicetype.cpp: + slicetype: prevent gcc 4.8.1 compiler error + [8dc9e5e4a0e6] + + * source/Lib/TLibCommon/TComPrediction.cpp: + TComPrediction: fixup planar size index + [e733415996a1] + + * source/encoder/slicetype.cpp: + slicetype: remove hungarian pointer prefixes and unnecessary pixel + casts + [d7cf0dd2133e] + + * source/common/intrapred.cpp: + intrapred: cleanup intra C references + [bcef0e896234] + + * source/common/x86/asm-primitives.cpp: + asm: remove cmp templated wrapper function + [46e4175eeb88] + + * source/Lib/TLibCommon/TComPrediction.cpp: + TComPrediction: nits + [562eea581dcf] + +2013-11-21 Dnyaneshwar G + + * 
source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/intrapred.cpp, + source/common/primitives.h, source/common/vec/intra-sse41.cpp, + source/encoder/compress.cpp, source/encoder/slicetype.cpp, + source/test/intrapredharness.cpp: + remove width arg from intra_pred_planar + [bdff2c785860] + +2013-11-21 Santhoshini Sekar + + * source/encoder/encoder.cpp: + bug fix: print Mean psnr Y,U,V properly + [012b3d1da19b] + +2013-11-21 Yuvaraj Venkatesh + + * source/common/x86/asm-primitives.cpp: + asm: added sa8d-4x4,4x8,8x4,4x16,16x4,12x16,16x12 to asm- + primitives.cpp + [fbb0dab55009] + +2013-11-21 Praveen Tiwari + + * source/common/x86/asm-primitives.cpp, + source/common/x86/pixeladd8.asm: + pixel_add_ps_32x64, asm code + [3b27df9d0a54] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/pixeladd8.asm: + pixel_add_ps_16x64, asm code + [4f53bb000c70] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/pixeladd8.asm: + pixel_add_ps, asm code + [2683153044c8] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/pixeladd8.asm: + asm: pixel_add_ps,32xN + [23626c3cac6b] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/pixeladd8.asm: + pixel_add_ps, 16x8, 16x12, 16x16, 16x32 asm code + [583152af6f66] + +2013-11-21 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm, + source/common/x86/pixel.h: + asm: code for transpose_64x64 routine + [b305aa607e0b] + + * source/common/x86/pixel-a.asm: + asm: Improvement by replace macro extend by function call + [0caa6518420e] + +2013-11-21 Praveen Tiwari + + * source/common/x86/asm-primitives.cpp, + source/common/x86/pixeladd8.asm: + pixel_add_ps_2x8 asm code + [98fac41f309c] + +2013-11-21 Min Chen + + * source/common/x86/asm-primitives.cpp, + source/common/x86/pixeladd8.asm: + [asm]: pixel_add_ps_2x4 + [f290ec5a0532] + +2013-11-21 Praveen Tiwari + + * source/common/x86/asm-primitives.cpp, + 
source/common/x86/pixeladd8.asm: + pixel_add_ps_6x8, asm code + [3f227acdb082] + + * source/common/x86/pixeladd8.asm: + pixeladd8.asm, removed unsed header 'x86util.asm' + [0e9ae87861ba] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/pixeladd8.asm: + pixel_add_ps_16x4, asm code + [5077332c5bcc] + +2013-11-21 Yuvaraj Venkatesh + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm: + asm: assembly code for pixel_sa8d_8x32 + [16b15235cd76] + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm: + asm: assembly code for pixel_sa8d_32x8 + [0b4b4143ceea] + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm: + asm: assembly code for pixel_sa8d_24x32 + [a71839b646b9] + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm: + asm: assembly code for pixel_sa8d_32x24 + [47c04fd98b80] + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm: + asm: assembly code for pixel_sa8d_8x16 and pixel_sa8d_16x8 + [977220137f22] + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm: + asm: assembly code for pixel_sa8d_16x64 + [2b8e89b5b836] + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm: + asm: assembly code for pixel_sa8d_64x16 + [d43255da9bd6] + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm: + asm: assembly code for pixel_sa8d_48x64 + [2d6721016c9a] + +2013-11-21 Min Chen + + * source/test/ipfilterharness.cpp: + testbench: initialize output buffer to solve lumz_sp[] mistake + problem + [7e4656d4fd33] + +2013-11-21 Yuvaraj Venkatesh + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm: + asm: assembly code for pixel_sa8d_64x48 + [6a82e5177eb3] + +2013-11-21 Dnyaneshwar G + + * source/common/x86/asm-primitives.cpp, + source/common/x86/intrapred.asm, source/common/x86/intrapred.h, + source/test/intrapredharness.cpp: + asm: assembly code for IntraPred_DC[32x32] + [286984ebb6c7] + + * 
source/common/x86/asm-primitives.cpp, + source/common/x86/intrapred.asm, source/common/x86/intrapred.h: + asm: assembly code for IntraPred_DC[16x16] + [4eb59e47be20] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/intrapred.asm, source/common/x86/intrapred.h: + asm: assembly code for IntraPred_DC[8x8] + [40a060130b62] + +2013-11-20 Steve Borho + + * source/common/primitives.h: + primitive: nits + [db1151bb4974] + + * source/common/pixel.cpp, source/common/primitives.h, + source/common/vec/blockcopy-sse3.cpp, source/test/pixelharness.cpp, + source/test/pixelharness.h: + primitive: remove dead blockcpy_sp + [b3749e1d64c0] + + * source/common/pixel.cpp, source/common/primitives.h, + source/common/vec/blockcopy-sse3.cpp, source/test/pixelharness.cpp, + source/test/pixelharness.h: + primitive: remove dead pixelsub_ps + [0d54ec898e79] + + * source/common/ipfilter.cpp, source/common/primitives.h, + source/common/vec/ipfilter-sse41.cpp, source/common/vec/ipfilter- + ssse3.cpp, source/test/ipfilterharness.cpp, + source/test/ipfilterharness.h: + primitive: remove dead ipfilter_s2p + [f3962b7a971c] + + * source/common/ipfilter.cpp, source/common/primitives.h, + source/common/vec/ipfilter-sse41.cpp, source/common/vec/ipfilter- + ssse3.cpp, source/test/ipfilterharness.cpp, + source/test/ipfilterharness.h: + primitive: remove dead ipfilter_p2s + [f3d4f3ac215e] + + * source/Lib/TLibCommon/TComSlice.h: + TComSlice: nits + [4b53b32ea6d8] + + * source/Lib/TLibCommon/TComPic.cpp, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/TEncCfg.h, source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/TShortYUV.cpp, + source/common/common.cpp, source/common/ipfilter.cpp, + source/common/pixel.cpp, source/common/primitives.cpp, + source/common/primitives.h, source/common/vec/ipfilter-sse41.cpp, + 
source/common/x86/asm-primitives.cpp, source/encoder/encoder.cpp, + source/input/y4m.cpp, source/test/ipfilterharness.cpp, + source/test/pixelharness.cpp, source/x265.cpp, source/x265.h: + TComSlice: nits + [cdfb587319f8] + + * source/cmake/CMakeASM_YASMInformation.cmake: + cmake: fix cygwin builds - yasm must output windows object files + [871f3367fd45] + + * source/test/testbench.cpp: + test: bump pixel depth to 12 for HIGH_BIT_DEPTH builds + [1561d1f1d1da] + + * source/common/vec/blockcopy-sse3.cpp, source/common/vec/dct- + sse3.cpp, source/common/vec/dct-sse41.cpp, source/common/vec/dct- + ssse3.cpp, source/common/vec/intra-sse41.cpp, source/common/vec + /intra-ssse3.cpp: + vec: fix compile warnings with clang at 16bpp - remove dead + functions + [f7590776c5b2] + +2013-11-20 Praveen Tiwari + + * source/common/x86/asm-primitives.cpp, + source/common/x86/pixeladd8.asm: + pixel_add_ps, 8x8, 8x16, 8x32 asm code + [c75b72fcd284] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/pixeladd8.asm: + pixel_add_ps_8x6, asm code + [e2f8b18cd253] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/pixeladd8.asm: + pixel_add_ps_8x4, asm code + [079864d97afc] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/pixeladd8.asm: + pixel_add_ps_8x2, asm code + [f65fbb5d2d44] + + * source/common/primitives.h, source/common/x86/pixel.h: + pixel.h and primitives.h, int replaced with intptr_t + [f07d8d18212b] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/pixeladd8.asm: + asm code for pixel_add_ps_4x2 + [28e57d696b36] + + * source/common/x86/pixeladd8.asm: + pixel_add_pp_4x4, merged movd and pmovzxbw + [bee92606b540] + + * source/common/x86/pixeladd8.asm: + pixel_add_pp: 4x8, 4x16, merged movd and pmovzxbw instructions + [5fb3eae8a29d] + + * source/common/x86/pixel.h: + added pixel_add_ps chroma function definitions + [406c500c0b0d] + + * source/common/x86/pixeladd8.asm: + pixeladd8.asm, int replaced with intptr_t for strides data 
type + [0532a37e6a7e] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/pixeladd8.asm: + asm code for pixel_add_ps, 4x8 and 4x16 + [2fe88d075204] + + * source/common/CMakeLists.txt, source/common/x86/asm-primitives.cpp, + source/common/x86/pixel.h, source/common/x86/pixeladd8.asm: + asm code for pixeladd_ps_4x4 and testbench integration + [a3e4cbbc63e1] + + * source/test/pixelharness.cpp, source/test/pixelharness.h: + unit test code for pixel_add_ps + [59964c99aa17] + + * source/common/pixel.cpp, source/common/primitives.h: + added pixel_add_ps_c as a primitve + [ec0758b3ecfa] + +2013-11-20 Steve Borho + + * source/Lib/TLibEncoder/TEncBinCoderCABAC.cpp, + source/Lib/TLibEncoder/TEncBinCoderCABAC.h: + TEncBinCABAC: fix naming prefix convention of bIsCounter + [0607aef383b7] + + * source/Lib/TLibCommon/TComDataCU.h, source/Lib/TLibCommon/TComPic.h, + source/Lib/TLibCommon/TComRdCost.h, + source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/TEncBinCoderCABAC.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/encoder/compress.cpp, + source/encoder/encoder.cpp, source/encoder/framefilter.cpp, + source/encoder/ratecontrol.cpp: + TypeDef: replace UInt64 with uint64_t + [58dfac0ab062] + + * source/Lib/TLibCommon/TComSampleAdaptiveOffset.h, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.h, + source/Lib/TLibEncoder/WeightPredAnalysis.cpp, + source/Lib/TLibEncoder/WeightPredAnalysis.h: + TypeDef: replace Int64 with int64_t + [93f03580b998] + + * source/common/x86/asm-primitives.cpp: + asm: white-space nits + [e3f82c9daafb] + +2013-11-20 Yuvaraj Venkatesh + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm: + asm: assembly code for pixel_sa8d_32x64 + 
[915bb51b6489] + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm: + asm: assembly code for pixel_sa8d_64x32 + [aa44b552bdd4] + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm: + asm: assembly code for pixel_sa8d_16x32 + [cbf8720197fb] + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm: + asm: assembly code for pixel_sa8d_32x16 + [bd44cdfcbba8] + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm: + asm: assembly code for pixel_sa8d_64x64 + [ef1a9a583c4a] + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm: + asm: assembly code for pixel_sa8d_32x32 + [357c693f8888] + +2013-11-20 Min Chen + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/CMakeLists.txt, + source/common/intrapred.cpp, source/common/primitives.h, + source/common/vec/intra-sse41.cpp, source/common/x86/asm- + primitives.cpp, source/common/x86/intrapred.asm, + source/common/x86/intrapred.h, source/encoder/compress.cpp, + source/encoder/slicetype.cpp, source/test/intrapredharness.cpp, + source/test/intrapredharness.h: + asm: assembly code for IntraPred_DC[4x4] + [172b66a79401] + +2013-11-20 Steve Borho + + * source/common/primitives.h, source/common/vec/pixel-sse3.cpp: + primitive: remove unused cvt16to16_shl + [174bd2b5e652] + + * source/common/vec/dct-sse41.cpp: + vec: drop intrinsic quant primitive, we have assembly code + [5e18a1158c5f] + + * source/common/vec/pixel-sse3.cpp: + vec: drop intrinsic transpose primitives, we have assembly code + [7a154e9fccbd] + +2013-11-20 Min Chen + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel- + util.asm, source/common/x86/pixel.h, source/test/mbdstharness.cpp: + asm: assembly code for quant + [63b9ba51bac0] + +2013-11-20 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm, + source/common/x86/pixel.h: + asm: code for transpose_32x32 routine + 
[6c9d7641bdca] + + * source/common/x86/pixel-a.asm: + asm: cleanup routines of transpose module for 4x4, 8x8 and 16x16 + [fe81ba733564] + +2013-11-20 Gopu Govindaswamy + + * source/encoder/slicetype.cpp: + bpyramid: Support for b-pyramid with b-adapt 0 + + Test results for reference when enable and disable the b-pyramid + with b-adapt=0 + + Cli option : --bframes=10 --b-adapt=0 --b-pyramid=1 -f 100 Enable + B-references : --b-pyramid=1 Disable B-references : --b-pyramid=0 + + Results: Enable / Disable + + clip - BasketballDrive_1920x1080_50 Total time taken - 57.84s (1.73 + fps) / 51.74s (1.93 fps) Bitrates - 4725.37 / 5660.68 PSNR - 37.178 + / 37.178 + + Clip - Cactus_1920x1080_50 Total time taken - 41.90s (2.39 fps) / + 47.08s (2.12 fps) Bitrates - 3800.62 / 4838.73 PSNR - 35.640 / + 35.615 + + Clip - Johnny_1280x720_60 Total time taken - 10.41s (9.61 fps) / + 10.34s (9.67 fps) Bitrates - 327.21 / 383.25 PSNR - 40.674 / 40.631 + + Clip - FourPeople_1280x720_60 Total time taken - 10.72s (9.33 fps) / + 10.18s (9.82 fps) Bitrates - 547.18 / 640.88 PSNR - 39.808 / 39.789 + [8e9c965648f6] + +2013-11-19 Steve Borho + + * source/Lib/TLibCommon/TComYuv.cpp, source/common/TShortYUV.cpp: + clang: fix build warnings + [108ddc9e5c6b] + +2013-11-19 Praveen Tiwari + + * source/common/TShortYUV.cpp, source/common/TShortYUV.h: + blockcpy_sp asm integration + [5df643257054] + + * source/common/pixel.cpp, source/common/primitives.h, + source/common/x86/asm-primitives.cpp, source/test/pixelharness.cpp: + csp support for blockcopy_sp + [fd382d7bfeea] + + * source/Lib/TLibCommon/TComYuv.cpp: + blockcopy_ps, asm integration + [cc88ae755af4] + + * source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TComYuv.h, + source/Lib/TLibEncoder/TEncSearch.cpp: + TComYuv::copyPartToPartChroma, blockcopy_ps asm integration + [036a65157263] + + * source/common/pixel.cpp, source/common/primitives.h, + source/common/x86/asm-primitives.cpp, source/test/pixelharness.cpp: + csp support for 
blockcopy_ps + [32bd40623496] + + * source/Lib/TLibCommon/TComYuv.cpp: + TComYuv::copyPartToPartChroma, blockcopy_pp asm integration + [10617a4ef7a5] + + * source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TComYuv.h: + TComYuv::copyToPicLuma, blockcopy_pp asm integration + [e77e7c4465f1] + +2013-11-19 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm, + source/common/x86/pixel.h: + asm: code for transpose_16x16 routine + [d8d9f36d0a44] + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm, + source/common/x86/pixel.h: + asm: code for transpose_8x8 routine + [1e1ecddee271] + +2013-11-18 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm, + source/common/x86/pixel.h: + asm: code for transpose4x4 routine + [d484c28c63f5] + +2013-11-19 Steve Borho + + * source/common/common.cpp, source/x265.h: + api: enable weightp by default, disable for ultrafast + [f68e0b9cae2e] + + * source/x265.h: + api: fix typos and improve chroma qp offset descriptions + [59dcbb0be7ca] + + * source/common/common.cpp, source/x265.h: + api: enable b-pyramid by default + + The only reason for it to be disabled is if the decoder/use case + prevented it + [de92149b3c9b] + + * source/x265.cpp: + cli: fix warning in CLI help + [e63916b50ac1] + + * source/common/common.cpp: + common: nit + [a901089e8218] + + * source/common/common.cpp: + common: validate --subme value + [371f83f5c9cd] + + * source/x265.h: + api: cleanup x265_param orderings, add full comments + [7dea1450b7fa] + +2013-11-18 Steve Borho + + * source/encoder/encoder.cpp: + encoder: simplify size variables + [8abf4200186a] + + * source/x265.h: + api: nit + [ca30b6c351c2] + + * source/encoder/encoder.cpp, source/encoder/encoder.h, + source/x265.cpp, source/x265.h: + api: make x265_encoder_get_stats() somewhat future proof + + By passing in the size of x265_stats as the user application knows + about the encoder can know not to try to 
set new fields that were + added to the end of x265_stats. This requires some discipline on our + part to only append to the structure and to always check the size + for any new fields we might add. + [5ae9bb8daaec] + + * source/x265.h: + api: remove old suffix and prefix from C symbols in comment, reflow + [afde9f11046a] + + * source/encoder/encoder.cpp, source/x265.cpp, source/x265.h: + api: remove hungarian prefixes from x265_nal members + + These particular prefixes came from x264 originally + [75328ddf6ef1] + + * source/x265.h: + api: remove reserved NAL enums and C++ style comments from public + header + [6bf2bdc1dd1c] + +2013-11-19 Steve Borho + + * source/x265.cpp, source/x265.h: + api: cleanup bpyramid + [f38139868da6] + +2013-11-19 Gopu Govindaswamy + + * source/common/common.cpp, source/encoder/dpb.cpp, + source/encoder/encoder.cpp, source/encoder/slicetype.cpp, + source/x265.cpp, source/x265.h: + b-pyramid implementation: Allow the use of B-frames as references + for non B and B frames + + when we enable the b-pyramid the bitrates efficienctly reduced and + there is not much diff in the performance and the PSNR 00. 
increased + some of the clips and decreased some of clips + + Test results for reference when enable and disable the b-pyramid: + cli option : -b 10 --hash=1 -f 100 --b-pyramid=1 --b-adapt=2 Enable + B-reference : --b-pyramid=1 Disable B-reference : --b-pyramid=0 + + Results: Enable / Disable + + Clips - Johnny_1280x720_60.y4m Total time taken - 11.19s (8.94 fps) + / 13.44s (7.44 fps) Bitrates - 303.52 kb/s / 326.79 kb/s PSNR - + 40.679 / 40.612 + + Clips - Cactus_1920x1080_50.y4m Total Time taken - 44.61s (2.24 fps) + / 48.23s (2.07 fps) Bitrates - 3420.80 kb/s / 3575.20 kb/s PSNR - + 35.709 / 35.726 + + Clips - BasketballDrive_1920x1080_50.y4m Total time taken - 54.15s + (1.85 fps) / 53.72s (1.86 fps) Bitrates - 4114.07 kb/s / 4310.45 + kb/s PSNR - 37.283 / 37.290 + + Clips - FourPeople_1280x720_60 Total time taken - 11.79s (8.48 fps) + / 12.16s (8.23 fps) Bitrates - 514.90 kb/s / 539.08 kb/s PSNR - + 39.782 / 39.757 + [1449a1a2041f] + +2013-11-18 Steve Borho + + * source/common/common.cpp: + log: do not show aq-strength if AQ is disabled + [2f5f538d2cbc] + + * source/common/common.cpp: + preset: return superfast/ultrafast lookahead depths to 10 + [628be479be44] + +2013-11-18 Aarthi Thirumalai + + * source/common/common.cpp, source/x265.cpp: + cli: add aq-strength to cli input options, add validations for aq + mode + [6a068b264c84] + +2013-11-18 Steve Borho + + * source/encoder/frameencoder.cpp: + frameencoder: fix initialization order to prevent warnings + [cfd834ba8e4f] + +2013-11-18 Praveen Tiwari + + * source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TComYuv.h: + TComYuv::copyPartToPartYuv, blockcopy_pp asm integration + [ad5dff9a2374] + +2013-11-18 Murugan Vairavel + + * source/test/pixelharness.cpp, source/test/pixelharness.h: + Test bench code for transpose routine + [301f9f86ae23] + + * source/common/TShortYUV.cpp, source/common/TShortYUV.h, + source/common/pixel.cpp, source/common/primitives.h, + source/common/x86/asm-primitives.cpp, 
source/test/pixelharness.cpp: + TShortYUV: asm code integration for pixelsub_ps + [58646537ce29] + +2013-11-18 Yuvaraj Venkatesh + + * source/common/x86/pixel-a.asm: + asm: fix the bug caused on 32-bit linux due to satd routines. + [690f6534d310] + +2013-11-18 Kavitha Sampath + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/common.h, + source/encoder/compress.cpp, source/encoder/encoder.cpp, + source/encoder/frameencoder.cpp, source/encoder/frameencoder.h, + source/x265.cpp: + Encoder statistics: Inter[% (2Nx2N %2NxN %Nx2N %AMP)] Intra[% (%DC + %PLANAR %ANG)] Split[%] Skip[%] + [2321ebe0bf64] + +2013-11-18 Nabajit Deka + + * source/common/x86/asm-primitives.cpp, source/common/x86/ipfilter8.h: + Adding asm function declarations and initializations for chroma hps + filter functions. + [ac9e64d8a80b] + +2013-11-18 Deepthi Nandakumar + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h: + TComPrediction: cleanup, remove unused buffers + [58f4e30dbe8f] + +2013-11-17 Steve Borho + + * source/test/ipfilterharness.cpp: + testbench: quit zeroing buffers before every test + [e2895ce7bbeb] + + * source/common/vec/ipfilter-sse41.cpp: + vec: disable 16bpp ipfilter_ps[] functions, not 10bit pixel safe + [b32dc442ca83] + + * source/common/vec/intra-ssse3.cpp: + clang: fix ambiguous * operator in intra-ssse3.cpp + [7cae5e2e17ed] + +2013-11-16 Steve Borho + + * source/common/common.cpp: + common: remove thread counts from param2string + [e276322c6288] + + * source/common/common.cpp, source/common/common.h, source/x265.cpp, + source/x265.h: + cli: allow string argument names for --me, generalize + [14cd6a901cc4] + + * source/common/vec/pixel-sse3.cpp: + vec: drop blockfill intrinsic primitives, we have ASM + [1544cf94837d] + + * source/common/pixel.cpp, source/common/vec/pixel-sse3.cpp: + vec: drop residual and recon intrinsic primitives, we have ASM + [3052fca7e7d1] + 
+ * source/common/x86/pixel-util.asm: + fix eoln of pixel-util.asm + [119c01293aa5] + +2013-11-16 Min Chen + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/common/primitives.h, + source/common/x86/asm-primitives.cpp, source/common/x86/pixel- + util.asm, source/common/x86/pixel.h: + asm: assembly code for calcresidual[] + [24bcae464492] + + * source/common/pixel.cpp, source/common/vec/pixel-sse3.cpp, + source/common/x86/asm-primitives.cpp, source/common/x86/pixel- + util.asm, source/common/x86/pixel.h: + cleanup: remove unused 64x64 functions + [64ece76a2152] + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/common/x86/pixel- + util.asm: + asm: residual buffer is alignment to size, so we can use alignment + load instruction + [e43b1e005ffa] + +2013-11-16 Steve Borho + + * source/encoder/encoder.cpp, source/encoder/slicetype.cpp: + repair -i0 behavior + [2552369e3537] + + * source/encoder/slicetype.cpp: + slicetype: hoist intra cost estimate out of weightsAnalyze + [da5b10cf5ee5] + +2013-11-15 Praveen Tiwari + + * source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TComYuv.h: + TComYuv::copyPartToYuv, asm code integration for blockcopy_pp + [7e7397e823c9] + + * source/Lib/TLibCommon/TComYuv.cpp: + TComYuv::copyToPartYuv, asm integration for blockcopy_pp + [08cb5a0bff32] + + * source/common/primitives.cpp, source/common/primitives.h, + source/test/pixelharness.cpp: + checking for all supported csp values + [a4b880b9114d] + + * source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TComYuv.h, + source/common/pixel.cpp, source/common/primitives.h, + source/common/x86/asm-primitives.cpp, source/test/pixelharness.cpp: + TComYuv::copyFromPicLuma, asm integration for chroma blockcopy_pp + [0aa7be2c038a] + +2013-11-15 Nabajit Deka + + * source/common/x86/asm-primitives.cpp, source/common/x86/ipfilter8.h: + Adding asm function declarations and initializations for luma vss + filter functions. 
+ [ec9295e39fe6] + + * source/common/x86/asm-primitives.cpp, source/common/x86/ipfilter8.h: + Adding asm function declarations and initializations for chroma vss + filter functions + [726f302ca952] + + * source/common/x86/ipfilter8.asm: + asm: routines for luma vss filter functions for all block sizes. + [4ee15557ea11] + + * source/common/x86/ipfilter8.asm: + asm: routines for chroma vss filter functions for all block sizes + [83c9a7a473c4] + + * source/test/ipfilterharness.cpp, source/test/ipfilterharness.h: + Adding test bench code for luma vss filter functions. + [7fd467ca4f3e] + + * source/common/ipfilter.cpp, source/common/primitives.h: + Adding function pointer array and C primitive initializations for + luma vss filter functions + [544c4712cd45] + + * source/test/ipfilterharness.cpp, source/test/ipfilterharness.h: + Adding test bench code for chroma vss filter functions + [47de8ca8952c] + + * source/common/ipfilter.cpp: + Adding C primitive for chroma vss filter functions + [fdaef69fbf19] + + * source/common/primitives.h: + Adding function pointer type & array declaration for chroma vss + filter functions + [f6efa96cfbd4] + +2013-11-16 Steve Borho + + * source/CMakeLists.txt, source/common/CMakeLists.txt: + cmake: disable some flags for clang, old versions balk at them + [60a66cbd8b47] + +2013-11-15 Praveen Tiwari + + * source/Lib/TLibCommon/TComPrediction.cpp: + reverted chroma_copy_pp asm integration to avoid csp break + [2cb5461ccd45] + +2013-11-15 Steve Borho + + * source/Lib/TLibCommon/TComPic.cpp, source/common/lowres.cpp, + source/common/lowres.h, source/encoder/slicetype.cpp, + source/encoder/slicetype.h: + slicetype: hoist weightp analysis above dosearch checks + [3e9bb2ca458b] + + * source/Lib/TLibCommon/TComPrediction.cpp: + TComPrediction: cleanup hugarian notations and redundant vars + [959cabcc9ece] + + * source/Lib/TLibCommon/TComSlice.h: + fix variable shadow warning + [95f292369c06] + +2013-11-15 Deepthi Nandakumar + + * Merge + 
[c45770e5a2ae] + + * source/common/common.cpp: + presets: fix bugs, better spread out the efficiency/speed data + points. + [a85c5f418794] + + * source/x265.h: + x265: whitespace nit + [a5689bb5808e] + +2013-11-14 Steve Borho + + * source/encoder/slicetype.cpp: + slicetype: correct non-denom round, improve some comments + [ee42f57411ae] + + * source/encoder/slicetype.cpp: + slicetype: remove unnecessary lines, simplify a few things + [31bbe5e1142e] + + * source/encoder/slicetype.cpp: + slicetype: use x265 style camelCase auto vars + [82b9f30398ae] + + * source/encoder/slicetype.cpp: + slicetype: since w is an auto-var there is no need to zero at early- + outs + [02fd5b099fa3] + + * source/encoder/slicetype.cpp, source/encoder/slicetype.h: + slicetype: optimize away mcWeight helper function + [ba00da135945] + +2013-11-14 Shazeb Nawaz Khan + + * source/common/lowres.cpp, source/common/lowres.h, + source/encoder/slicetype.cpp, source/encoder/slicetype.h: + Using weighted lowres ref frames in cost estimation in lookahead + [899731955c6d] + + * source/Lib/TLibCommon/TComSlice.h, source/encoder/slicetype.cpp, + source/encoder/slicetype.h: + Pulling x264 weight decision into x265 lookahead + [61f9fc2e91d2] + +2013-11-14 Min Chen + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel- + util.asm, source/common/x86/pixel.h, source/test/pixelharness.cpp: + asm: assembly code for calcrecon[] + [1b9545e23e36] + +2013-11-14 Sumalatha Polureddy + + * source/encoder/compress.cpp: + no-rdo early exit: giving weightage to the cost of that CU and + neighbour CU's for early exit + + Early exit is done when CU cost at depth "n" is lessthan sum of 60% + of avgcost of that CU at same depth and 40% of avgcost of neighbour + CU's at same depth. 
+ + the performance, bitrate increase and psnr comparison are given + below CLI: x265.exe input.y4m -o abc.hevc -r recon.y4m --rd 1 --ref + 1 + + BasketballDrive_1920x1080_50 Timetaken to encode: 704/585/564s + bitrate: 3650/3696/3696 PSNR: 36.7/36.67/36.67 perf improvement: + 16.9% (compared to early exit OFF and already existing early exit) + perf improvement: 19.8% (compared to early exit OFF and new early + exit) + + Cactus_1920x1080_50 Timetaken to encode: 526/443/436s bitrate: + 2787/2831/2833 PSNR: 35.527/35.48/35.48 perf improvement: 15.7% + (compared to early exit OFF and already existing early exit) perf + improvement: 17.1% (compared to early exit OFF and new early exit) + + Kimono1_1920x1080_24 Timetaken to encode: 279/235/238s bitrate: + 1243/1252/1252 PSNR: 38.16/38.158/38.159 perf improvement: 15.7% + (compared to early exit OFF and already existing early exit) perf + improvement: 14.6% (compared to early exit OFF and new early exit) + + FourPeople_1280x720_60 Timetaken to encode: 169/157/157s 16.9%/19.8% + bitrate: 486/489/489 PSNR: 39.09/39.052/39.042 perf improvement: + 7.1% (compared to early exit OFF and already existing early exit) + perf improvement: 7.1% (compared to early exit OFF and new early + exit) + + big_buck_bunny_360p24 Timetaken to encode: 1739/1511/1505s + 16.9%/19.8% bitrate: 174.9/175.38/175.5 PSNR: 37.798/37.746/37.752 + perf improvement: 13.1% (compared to early exit OFF and already + existing early exit) perf improvement: 13.4% (compared to early exit + OFF and new early exit) + + PartyScene_832x480_50 Timetaken to encode: 123/120/120s 16.9%/19.8% + bitrate: 208/208/208 PSNR: 40.344/40.33/40.332 perf improvement: + 2.4% (compared to early exit OFF and already existing early exit) + perf improvement: 2.4% (compared to early exit OFF and new early + exit) + [1a033fe23a3e] + +2013-11-14 Praveen Tiwari + + * source/Lib/TLibEncoder/TEncSearch.cpp: + reverted chroma_copy_pp asm integration code, avoiding csp break + [b0ce6bd99b15] + 
+2013-11-14 Murugan Vairavel + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm, + source/common/x86/pixel.h: + asm: code for scale1D_128to64 routine + [05484f075744] + + * source/test/pixelharness.cpp, source/test/pixelharness.h: + Unit test code for Pixel scaling + [38e124ec202c] + +2013-11-14 Yuvaraj Venkatesh + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm: + asm: assembly code for pixel_satd_32x64 and pixel_satd_48x64 + [84f9ced21747] + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm: + asm: assembly code for pixel_satd_64x64 + [99b64d267788] + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm: + asm: assembly code for pixel_satd_64x32 and pixel_satd_64x48 + [ed1dab579cb1] + +2013-11-13 Steve Borho + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: nit + [e871fe75d5ab] + +2013-11-13 Nabajit Deka + + * source/common/x86/ipfilter8.asm: + asm: routines for chroma vps filter functions for 2x4 and 2x8 block + sizes. + [c828dd4d9eae] + +2013-11-13 Derek Buitenhuis + + * source/Lib/TLibEncoder/TEncSearch.cpp: + Reindent after last commit + [5683ee5b793c] + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + TEncSearch: Fix parameter type of xEstimateResidualQT + + Fixes compilation with g++. + [c89e22d26bcd] + +2013-11-13 Nabajit Deka + + * source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter8.asm, source/common/x86/ipfilter8.h: + Change minimum architecture to sse4 as chroma vsp functions for + block sizes(2x4,2x8 and 6x8) need faster SSE4 instructions. + [a04ca925ad3f] + + * source/common/x86/asm-primitives.cpp, source/common/x86/ipfilter8.h: + Adding asm function declarations and initializations for chroma vps + filter functions. + [5fc6ca938864] + + * source/common/x86/ipfilter8.asm: + asm: routines for chroma vps filter functions for 32xN block sizes. 
+ [701b696d0670] + + * source/common/x86/ipfilter8.asm: + asm: routines for chroma vps filter function for 24x32 block size. + [21d27b188e71] + + * source/common/x86/ipfilter8.asm: + asm: routines for chroma vps filter functions for 16xN block sizes. + [52d18d911356] + + * source/common/x86/ipfilter8.asm: + asm: routines for chroma vps filter functions for 6x8 and 12x16 + block sizes. + [8e6dcabdccd5] + + * source/common/x86/ipfilter8.asm: + asm: routines for chroma vps filter functions for 8xN block sizes + [91cfcd159ff3] + + * source/common/x86/ipfilter8.asm: + asm: routines for chroma vps filter functions for 4xN block sizes. + [23aecd3f9180] + + * source/common/x86/ipfilter8.asm: + asm: routines for chroma hps filter functions for 16xN, 24xN and + 32xN + [d80ab2913b31] + + * source/common/x86/ipfilter8.asm: + asm: routines for chroma hps filter functions for 8xN block sizes. + [3448252924ad] + + * source/common/x86/ipfilter8.asm: + asm: Proper indentation and function prototype updation for chroma + hps filter functions for 2xN, 4xN, 6x8 and 12x16 block sizes. 
+ [51d3c0782e46] + +2013-11-13 Yuvaraj Venkatesh + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm: + asm: assembly code for pixel_satd_64x16 + [32e01ab333a6] + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm: + asm: assembly code for pixel_satd_32x32 + [4ee655b93b03] + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm: + asm: assembly code for pixel_satd_24x32 + [2ffe634ebd71] + +2013-11-12 Murugan Vairavel + + * source/common/pixel.cpp, source/common/primitives.h, + source/common/x86/asm-primitives.cpp, source/common/x86/pixel.h, + source/test/pixelharness.cpp, source/test/pixelharness.h: + asm: Unit test code for pixelsub_ps function + [c4ca80d19105] + +2013-11-13 Murugan Vairavel + + * source/common/x86/pixel-a.asm: + asm: pixelsub_ps routine for all block sizes + [2d6dd46dc286] + +2013-11-12 sagarkotecha + + * source/common/ipfilter.cpp: + Bug fix : In ipfilter for 10 bit yuv support + [90c2763ee027] + +2013-11-12 Shazeb Nawaz Khan + + * source/encoder/ratecontrol.cpp: + Adding initialisation for ssd/sum values for lowress frame + [a19ba09c1fd7] + +2013-11-12 Nabajit Deka + + * source/test/ipfilterharness.cpp: + Adding test bench code for chroma vps filter functions. + [2185b81ae35b] + + * source/common/ipfilter.cpp, source/common/primitives.h: + Adding function pointer array and C primitive initializations for + chroma vps filter functions. + [1ddacfd89112] + + * source/common/x86/ipfilter8.asm: + asm: routines for chroma hps filter functions for 2xN, 4xN, 6x8 and + 12x16 block sizes. + [533bca3ec7e9] + + * source/test/ipfilterharness.cpp, source/test/ipfilterharness.h: + Adding test bench code for chroma hps filter functions. + [e6d26209c45f] + + * source/common/ipfilter.cpp, source/common/primitives.h: + Adding function pointer array and initializations for chroma hps + filter functions. 
+ [8a8b967500e5] + + * source/common/x86/asm-primitives.cpp: + Adding function pointer initializations for asm chroma vsp + functions. + [028b911ae623] + + * source/common/x86/ipfilter8.h: + Adding asm function declarations for chroma vsp filter functions. + [8fe8d8f9f7cb] + + * source/common/x86/ipfilter8.asm: + asm: routines for chroma vsp filter functions for all block sizes. + [4844849073b7] + + * source/test/ipfilterharness.cpp, source/test/ipfilterharness.h: + Adding test bench code for chroma vsp filter functions. + [ed8a6cd4d8ec] + + * source/common/ipfilter.cpp, source/common/primitives.h: + Adding function pointer array and initializations for chroma vsp + filter functions. + [e676cbd86238] + +2013-11-12 Min Chen + + * source/common/x86/asm-primitives.cpp, source/common/x86/mc-a.asm, + source/common/x86/pixel.h: + asm: assembly code for x265_pixel_avg_12x16 + [d0f80f375c3b] + +2013-11-12 Praveen Tiwari + + * source/Lib/TLibCommon/TComYuv.cpp: + TComYuv.cpp, use new luma_copy_ps asm primitives where feasible + [31528c277c64] + + * source/Lib/TLibCommon/TComYuv.cpp: + TComYuv.cpp, use new blockcopy_pp luma primitives where feasible + [8708689dcca2] + + * source/Lib/TLibCommon/TComYuv.cpp: + TComYuv::copyFromPicLuma, blockcopy_pp luma asm code integration + [c56ea57ce3ab] + + * source/Lib/TLibCommon/TComYuv.cpp: + TComYuv::copyToPicLuma, blockcopy_pp asm code integration + [04c28af13c4d] + +2013-11-12 Yuvaraj Venkatesh + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm: + asm: assembly code for pixel_satd_12x16 + [c56ce77dc081] + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm: + asm: assembly code for satd_16x32, satd_16x64, satd_8x32 + [d636952ed093] + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm: + asm: assembly code for pixel_satd_16x4 + [7818f5b7cc25] + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm: + asm: assembly code for pixel_satd_16x12 + 
[2baf62a8e47d] + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm: + asm: assembly code for pixel_satd_32x24 and rearranged the functions + [085d5c625c53] + +2013-11-11 Steve Borho + + * source/encoder/compress.cpp: + compress: fix shadow warning from GCC + [58bdb05da194] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: use luma block copy (luma part size) if bChromaSame + [ea4f939478ed] + +2013-11-12 Deepthi Nandakumar + + * source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/encoder/compress.cpp: + Backout: Causing non-determinism in rd 0 and 1. Needs to be further + investigated. + [ab0968b4b65d] + +2013-11-12 Min Chen + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/common/TShortYUV.cpp: + cleanup: hardcoded m_qtTempTComYuv[qtLayer].m_width to MAX_CU_SIZE + [12053d6bf759] + +2013-11-12 Dnyaneshwar Gorade + + * source/common/x86/asm-primitives.cpp, source/common/x86/mc-a.asm, + source/common/x86/pixel.h: + asm: pixel_avg_48x64, pixel_avg_8x32 + [4a4fd61e98e6] + + * source/common/x86/asm-primitives.cpp, source/common/x86/mc-a.asm, + source/common/x86/pixel.h: + asm: asm: pixel_avg_24x32 + [56642525d09e] + + * source/common/x86/asm-primitives.cpp, source/common/x86/mc-a.asm, + source/common/x86/pixel.h: + asm: pixel_avg_64x(64,48,16) + [9c92947860e0] + + * source/common/x86/asm-primitives.cpp, source/common/x86/mc-a.asm, + source/common/x86/pixel.h: + asm: pixel_avg_32x(64,32,24,8) + [5b0e1731f776] + +2013-11-12 Sumalatha Polureddy + + * source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/encoder/compress.cpp: + no-rdo early exit: giving weightage to the cost of all CU's and + neighbour CU's for early exit + + Early exit is done when CU cost at depth "n" is lessthan sum of 60% + of avgcost of all CU's and 40% of avgcost of neighbour CU's at same + depth. + [dc5c51ff542f] + +2013-11-11 Deepthi Devaki + + * source/Lib/TLibEncoder/TEncSearch.cpp: + no-rdo: cleanups. 
Remove unnecessary memsets, rearrange + computations. + [1ca01c82609f] + +2013-11-11 Steve Borho + + * source/Lib/TLibCommon/TComYuv.h: + TComYuv: de-hungarian nits + [d1d716083aa7] + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/motion.cpp, + source/encoder/ratecontrol.cpp: + asm: use new block copy primitives where feasible + [1c95568c7143] + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/common/ipfilter.cpp, source/common/primitives.h, + source/common/vec/ipfilter-sse41.cpp, source/common/x86/asm- + primitives.cpp, source/common/x86/ipfilter8.asm, + source/common/x86/ipfilter8.h, source/encoder/motion.cpp: + asm: hookup luma_vsp primitive, drop asm and intrinsic non-block + versions + [904b788b09e2] + +2013-11-11 Nabajit Deka + + * source/common/x86/asm-primitives.cpp: + Adding function pointer initializations for luma vsp functions. + [d11de5be8e25] + + * source/common/x86/ipfilter8.h: + Adding asm function declarations for luma vsp filter functions. + [937ac0c1bac4] + + * source/common/x86/ipfilter8.asm: + asm: routines for luma vsp filter functions for all block sizes. 
+ [1eae34eb5995] + +2013-11-11 Yuvaraj Venkatesh + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm: + asm: assembly code for x265_pixel_satd_32x16 + [27b97bc50331] + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm: + asm: assembly code for x265_pixel_satd_32x8 + [da13148e7c6e] + +2013-11-11 Dnyaneshwar Gorade + + * source/common/x86/asm-primitives.cpp, source/common/x86/mc-a.asm, + source/common/x86/pixel.h: + asm: enabled pixel_avg_16x(64,32,12,4) assembly functions + [1990e66030d1] + +2013-11-11 Min Chen + + * source/Lib/TLibEncoder/TEncSearch.cpp: + use fixed stride/size on m_qtTempTComYuv, to reduce number of + calcRecon() parameters + [0f9c6391fa19] + + * source/common/x86/asm-primitives.cpp, source/common/x86/mc-a.asm, + source/common/x86/pixel.h: + asm: pixel_avg[32x16] + [79a452bec247] + +2013-11-11 Praveen Tiwari + + * source/common/x86/asm-primitives.cpp: + added macro call for luma partition blockcopy_ps function + [18dd57c38254] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm, source/common/x86/blockcopy8.h: + asm code for blockcopy_ps_64xN + [ed32ed5a0785] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm, source/common/x86/blockcopy8.h: + asm code for luma blockcopy_ps_48x64 + [c19168acd391] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm, source/common/x86/blockcopy8.h: + asm code for luma blockcopy_ps_32x64 + [15b705145e15] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm, source/common/x86/blockcopy8.h: + added asm function for luma blockcopy_ps_16x64 + [8e20f3c1dbb4] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm, source/common/x86/blockcopy8.h: + added asm code blockcopy_ps_4x16 and invoked function pointer + initialization with macro + [67fb80ee548a] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm, 
source/common/x86/blockcopy8.h: + asm code for blockcopy_ps_6x8 + [b208adfaaba6] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm, source/common/x86/blockcopy8.h: + asm code for blockcopy_ps_2x8 + [c047d5898b59] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm, source/common/x86/blockcopy8.h: + asm code for blockcopy_ps_2x4 + [cf089f73913d] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm, source/common/x86/blockcopy8.h: + asm code for blockcopy_ps_24x32 + [c8e0d150b111] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm, source/common/x86/blockcopy8.h: + asm code for blockcopy_ps_4x8 + [332793211a8d] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm, source/common/x86/blockcopy8.h: + asm code for blockcopy_ps_4x4 + [953fe27840b6] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm, source/common/x86/blockcopy8.h: + asm code for blockcopy_ps_4x2 + [4c45ee313c3c] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm, source/common/x86/blockcopy8.h: + asm code for blockcopy_ps_12x16 + [c09ba17002c0] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm, source/common/x86/blockcopy8.h: + asm code for blockcopy_ps_32xN + [badcc7920c91] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm, source/common/x86/blockcopy8.h: + blockcopy_ps_16x4, asm code is now sse4 + [1365b796a75e] + + * source/common/x86/blockcopy8.asm: + eliminated register copy from BLOCKCOPY_PS_W16_H4 macro + [7a0afcd7c4c9] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm, source/common/x86/blockcopy8.h: + asm code for asm code for blockcopy_ps,16x8, 16x12, 16x16, 16x32 + [e5567a4eeec5] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm, source/common/x86/blockcopy8.h: + asm code for 
blockcopy_ps_16x4 + [cb378330b31b] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm, source/common/x86/blockcopy8.h: + asm code for blockcopy_ps, 8x6, 8x16 and 8x32 + [7d74ee88f3fe] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm, source/common/x86/blockcopy8.h: + asm code for blockcopy_ps_8x6 + [1fbaef13feb7] + +2013-11-11 Steve Borho + + * source/encoder/compress.cpp: + compress: white-space nits + [c94d51359a5f] + +2013-11-11 Mahesh Doijade + + * source/encoder/compress.cpp: + TEncCu: cleanup xComputeCostIntraInInter to use 32x32 logic for + 64x64 + [2e90d81098af] + +2013-11-11 Min Chen + + * source/test/pixelharness.cpp: + bugfix: PixelHarness::check_pixelavg_pp() output buffer did not + initialize + [9642b5b6500b] + + * source/common/x86/asm-primitives.cpp, source/common/x86/mc-a.asm: + re-enable asm code for pixel_avg, the problem is miss EMMS + [a1577003ee96] + +2013-11-11 Praveen Tiwari + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm, source/common/x86/blockcopy8.h: + asm code for blockcopy_ps_8x4 + [25300bdf7bbe] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm, source/common/x86/blockcopy8.h: + asm code for blockcopy_ps_8x2 + [11b09a9fa32f] + + * source/test/pixelharness.cpp, source/test/pixelharness.h: + unit test code for block_copy_ps function + [eab2cd89e813] + + * source/common/pixel.cpp, source/common/primitives.h: + added blockcopy_ps c primitive and function pointes + [7f3164f16551] + +2013-11-11 Nabajit Deka + + * source/test/ipfilterharness.cpp, source/test/ipfilterharness.h: + Adding test bench code for luma vsp filter functions. + [51358e3422b7] + + * source/common/ipfilter.cpp: + Adding C primitive for luma vsp filter functions. + [d2b3aefb522e] + + * source/common/primitives.h: + Adding function pointer type & array definition for luma vsp filter + functions. 
+ [8d496292dd1d] + +2013-11-11 Deepthi Nandakumar + + * source/common/vec/dct-sse3.cpp, source/common/vec/dct-sse41.cpp: + 16bpp primitives: disabling dct/idct/dst/idst primitives + [8ca334701a92] + +2013-11-09 Steve Borho + + * source/test/testbench.cpp: + testbench: set g_bitDepth to 10 for HIGH_BIT_DEPTH builds + + This more accurately tests our optimized primitives vs the C ref, + and several of them now fail validation. The intrinsic primitives + need to be updated to the changes made in commit b24d05dd4990 and/or + disabled for 16bpp. + [9d74638c3640] + + * source/common/vec/dct-sse3.cpp: + dct: white-space cleanup + [975d0089a37d] + + * source/output/y4m.cpp: + y4m: fix 8bpp build + [efb26544f8aa] + + * source/Lib/TLibCommon/TComPicYuv.cpp: + TComPicYuv: add a comment for future optimizations + [2be9bd65f418] + + * source/input/yuv.cpp: + yuv: pic.stride should be in units of pixels, not bytes + [b2b455afd60e] + + * source/Lib/TLibCommon/TComPicYuv.cpp: + csp: fix allocation size calculation of chroma planes + [99d934beca75] + + * source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuv.h: + TComPicYuv: remove unused luma-oriented functions, cleanup destroy + method + [9dece99f7faa] + + * source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuv.h: + TComPicYuv: remove unused copy methods + [ad8b9d120f8c] + + * source/Lib/TLibCommon/TComPicYuv.cpp: + TComPicYuv: cleanup prefixes + [c05952a71525] + + * source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuv.h: + TComPicYuv: remove unused dump function + [8774e07b9d8a] + + * source/output/output.cpp, source/output/y4m.cpp, + source/output/y4m.h, source/output/yuv.cpp, source/x265.cpp: + cli: file writers should validate output bitdepth, if any is + specified + [5c3ecc48bf3b] + + * source/common/CMakeLists.txt, source/common/vec/pixel-avx2.cpp, + source/common/vec/vec-primitives.cpp: + pixel: remove AVX2 vector class SAD primitives + + These are superceded by 
assembly - the ASM doesn't use AVX2 yet but + it's already faster + [af9ec83d864a] + + * source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TComYuv.h: + TComYuv: remove unused removeHighFreq() + [cd9b013529ad] + + * source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TComYuv.h, + source/common/pixel.cpp, source/common/primitives.h, + source/common/vec/blockcopy-sse3.cpp, source/test/pixelharness.cpp, + source/test/pixelharness.h: + primitive: remove pixeladd_pp and TComYuv::addClip() + + These were used by the old HM bidir search + [57caf112acf9] + +2013-11-08 Steve Borho + + * source/input/input.h, source/input/y4m.cpp, source/input/y4m.h, + source/input/yuv.cpp, source/input/yuv.h: + input: drop unthreaded reader paths for simplicity + [29ad451b3026] + + * source/input/yuv.cpp: + yuv: simplify framesize usage + [27486e340eac] + + * source/input/yuv.cpp: + yuv: initialize pixelbytes as early as possible + [45c5ba1dc340] + + * source/input/y4m.cpp, source/input/yuv.cpp: + input: include nits + [620c2a377cf6] + + * source/x265.cpp: + cli: do not check recon bit depth if no recon file is requested + + For now, just enforce our current requirement that the output + bitdepth must match the input bit depth. If the recon file writers + ever support depth conversions, these checks can be revisited + [4a824497b3f4] + + * source/common/common.cpp: + common: set default params to match medium preset, keep star search + for medium + [5b688170c506] + +2013-11-06 Steve Borho + + * source/common/common.cpp: + presets: adjust presets to increase spread and align closer with + x264 presets + [8487f675effa] + +2013-11-08 Deepthi Devaki + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/encoder/compress.cpp: + no-rdo: refactor enodeResandCalcRDInterCU function + + Divide estimateBits and modeDecision inside the function. + EstimateBits uses a pseudo encode. Bitstream changes with this patch + for --rd 1. 
+ [66659d4a7b31] + +2013-11-08 Steve Borho + + * source/Lib/TLibCommon/TComPicYuv.cpp: + TComPicYuv: fixup 16x16 picture padding by using unpadded width as + pad base + [74bed0a288f5] + +2013-11-08 Mahesh Doijade + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: cleanup estIntraPredQT to use 32x32 logic for 64x64 + blocks + [abb7c130ca2f] + +2013-11-08 Yuvaraj Venkatesh + + * source/common/x86/sad-a.asm: + asm: optimised pixel_sad_xN_24x32 assembly code + [cd16d2ed3128] + +2013-11-08 Praveen Tiwari + + * source/common/x86/blockcopy8.asm: + blockcopy_sp_2x8, optimized asm code + [1e7c99e5b511] + + * source/common/x86/blockcopy8.asm: + blockcopy_sp_2x4, optimized asm code + [7bd27dfad3bf] + + * source/common/x86/blockcopy8.asm: + blockcopy_sp_6x8, optimized asm code + [073ca727d5de] + + * source/common/x86/blockcopy8.asm: + blockcopy_sp_32xN, optimized asm code + [b95f9e753039] + + * source/common/x86/blockcopy8.asm: + blockcopy_sp_48x64, optimized asm code + [fa5544054a1d] + + * source/common/x86/blockcopy8.asm: + blockcopy_sp_64xN, optimized asm code + [a1c0eb5f5d84] + + * source/common/x86/blockcopy8.asm: + blockcopy_sp_24x32, optimized asm code + [3cf4edc66844] + + * source/common/x86/blockcopy8.asm: + blockcopy_sp_16xN, optimized asm code + [a1a9b29cccf9] + + * source/common/x86/blockcopy8.asm: + blockcopy_sp_12x16, optimized asm code + [970517e2eb4c] + + * source/common/x86/blockcopy8.asm: + blockcopy_sp_8x16, optimized asm code + [a0b003aac23e] + + * source/common/x86/blockcopy8.asm: + blockcopy_sp_8x8, optimized asm code + [8cfa90a574f8] + + * source/common/x86/blockcopy8.asm: + blockcopy_sp_8x2, optimized asm code + [c8d25ce3b965] + + * source/common/x86/blockcopy8.asm: + blockcopy_sp_8x6, optimized asm code + [2fd3cf3b5edb] + + * source/common/x86/blockcopy8.asm: + blockcopy_sp_8x4, optimized asm code + [27c70b409c1b] + + * source/common/x86/blockcopy8.asm: + blockcopy_sp_4x8, optimized asm code + [ceed26f375d5] + + * 
source/common/x86/blockcopy8.asm: + blockcopy_sp_4x16, optimized asm code + [b20b89bf5348] + + * source/common/x86/blockcopy8.asm: + blockcopy_sp_4x4, optimized asm code according to modified C + primitive + [d5f67f9cba2c] + + * source/common/x86/blockcopy8.asm: + blockcopy_sp_4x2, optimized asm code according to modified C + primitive + [85dddb9aa165] + +2013-11-08 Steve Borho + + * source/common/ipfilter.cpp, source/common/primitives.h, + source/common/vec/ipfilter-sse41.cpp, + source/test/ipfilterharness.cpp, source/test/ipfilterharness.h: + primitives: remove unused ipfilter_pp_t funcdef and C and intrinsic + primitives + + These are now completely replaced by block based assembly code + [fef74c2e329d] + + * source/common/vec/pixel-sse41.cpp: + linux: re-enable sse_12x16 for clang and gcc + [94cba84de8dd] + +2013-11-08 Min Chen + + * source/common/vec/pixel-sse41.cpp: + fix bug in sse_sp12 + [f76b591b7aef] + + * source/test/pixelharness.cpp, source/test/pixelharness.h: + testbench: update for blockcopy, these function use dynamic range + [0,255] + [c5e633516217] + +2013-11-07 Steve Borho + + * source/x265.cpp: + cli: add missing --version long option for -V, and -? 
short option + for --help + [fd721a5ba063] + + * source/x265.cpp: + cli: report a warning on preset or tune string mismatches + [08872c3c4735] + + * source/common/vec/pixel-sse41.cpp: + linux: disable sse_12x16 for GCC and clang + [fd0ebb4b4709] + + * source/test/pixelharness.cpp: + linux: fix pixelharness on linux which has full-range rand() + [1ea82d16d334] + + * source/common/vec/blockcopy-sse3.cpp: + vec: remove use of deleted primitive for 16bpp + [014e3303ad3d] + + * source/common/lowres.h: + lowres: reorder members of Lowres struct for clarity + [5563bd58c1e3] + + * source/common/lowres.h, source/common/mv.h, + source/encoder/motion.cpp, source/encoder/reference.h: + lowres: move ReferencePlanes from mv.h to lowres.h + [dac2888cbf4c] + + * source/common/mv.h, source/encoder/motion.cpp, + source/encoder/slicetype.cpp: + lowres: pull lowres motion compensation and residual costs into + lowres struct + [9668c5b6373a] + +2013-11-07 Min Chen + + * source/common/TShortYUV.cpp, source/common/pixel.cpp, + source/common/primitives.h, source/common/vec/blockcopy-sse3.cpp, + source/test/pixelharness.cpp, source/test/pixelharness.h: + rename: pixelsub_sp to pixelsub_ps, because it sub two Pixel and + result is Short + [cb24ed71905d] + +2013-11-07 Praveen Tiwari + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm, source/common/x86/blockcopy8.h: + asm code for blockfil_s, 32x32 + [b4993b1fef7c] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm, source/common/x86/blockcopy8.h: + asm code for blockfill_s, 16x16 + [a8df8123e9ab] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm, source/common/x86/blockcopy8.h: + asm code for blockfill_s, 8x8 + [7d3e461312a5] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm, source/common/x86/blockcopy8.h: + asm code for blockfill_s, 4x4 + [29d208555299] + + * source/test/pixelharness.cpp, source/test/pixelharness.h: + unit test 
code for blockfill_s_c function + [12ec248f7390] + +2013-11-07 Steve Borho + + * source/Lib/TLibCommon/TComPrediction.cpp, source/encoder/motion.cpp: + asm: enable luma_vpp block MC functions + [4d9aac4f0985] + +2013-11-07 Nabajit Deka + + * source/common/x86/ipfilter8.asm: + Bug fix for luma vpp asm routines.Also incorporated review comment + changes. + [9ba49b482a1e] + +2013-11-07 Min Chen + + * source/common/pixel.cpp, source/common/primitives.h, + source/common/vec/blockcopy-sse3.cpp, source/test/pixelharness.cpp, + source/test/pixelharness.h: + cleanup: remove unused blockcpy_sc + [db7752a46693] + + * source/common/pixel.cpp: + asm: the pixel value in blockcopy_ps is saturation by calcRecon, so + asm can use packuswb + [b572831429ec] + +2013-11-07 Aarthi Thirumalai + + * source/Lib/TLibCommon/TComPicYuv.cpp: + tcompicyuv: add right boundary padding while applying bottom row + padding. + [397a201b0ea3] + + * source/Lib/TLibCommon/TComPicYuv.cpp, source/common/lowres.cpp, + source/encoder/frameencoder.cpp: + aq: bug fix, extend right and bot of TComPic::m_origPicYuv to a + multiple of 16 + [93a4f88844f1] + +2013-11-07 Steve Borho + + * source/common/vec/pixel-sse41.cpp: + pixel: remove last remaining intrinsic SAD primitives + [536db32fc253] + +2013-11-07 Yuvaraj Venkatesh + + * source/common/x86/asm-primitives.cpp, source/common/x86/sad-a.asm: + asm: assembly code for pixel_sad_x4_64xN + [dc31fc1daf42] + + * source/common/x86/asm-primitives.cpp, source/common/x86/sad-a.asm: + asm: assembly code for pixel_sad_x3_64xN + [d6644a32e6bc] + + * source/common/x86/asm-primitives.cpp, source/common/x86/sad-a.asm: + asm: assembly code for pixel_sad_x4_48x64 + [96f1bb63b747] + + * source/common/x86/asm-primitives.cpp, source/common/x86/sad-a.asm: + asm: assembly code for pixel_sad_x3_48x64 + [74682dfe5342] + +2013-11-07 Gopu Govindaswamy + + * source/Lib/TLibCommon/TComPicYuv.cpp: + tcompicyuv: improvement for Extend the right if width is not + multiple of min CU size + 
[85002898f5b4] + +2013-11-07 Min Chen + + * source/common/pixel.cpp: + asm: the pixel value in blockcopy_ps is saturation by calcRecon, so + asm can use packuswb + [0a1b379be359] + +2013-11-07 Gopu Govindaswamy + + * source/Lib/TLibCommon/TComPicYuv.cpp: + tcompicyuv: fix for copyFromPicture() when HIGH_BIT_DEPTH enable, + sizeof(Pel)=2 and pic.bitDepth=8 + [ed1b1a7b0b38] + +2013-11-07 Steve Borho + + * source/encoder/encoder.cpp: + api: output x265_picture.bitDepth should reflect actual internal + bitdepth + + And not sizeof(pixel) * 8. + [0ab509a661c7] + +2013-11-06 Steve Borho + + * source/common/common.cpp, source/encoder/encoder.cpp, + source/x265.cpp, source/x265.h: + api: simplistic auto-determination of frame thread count + [90d9c1067f50] + + * source/encoder/motion.cpp: + motion: fixup weightp - use unweighted reference pixels as + interpolation source + [93cccbe49a93] + + * source/x265.cpp: + cli: do not lookup a colorspace name if not supplied + [60f78cbfacc8] + + * source/common/common.cpp: + common: set a default color space of I420 + [d9ea97e248bc] + + * source/Lib/TLibCommon/TComPrediction.cpp, source/encoder/motion.cpp: + asm: use new block based single-pass H-filter motion compensation + primitives + [dbb86150c919] + + * source/Lib/TLibCommon/TComPrediction.cpp: + asm: use new block based chroma single-pass MC primitives + [8d1bd79d3618] + + * source/common/CMakeLists.txt, source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.h, source/common/x86/pixel.h: + asm: move block copy funcdefs into blockcopy8.h + [edf77f60b55c] + + * source/common/x86/asm-primitives.cpp: + asm: move _sse4 block copy function pointer assignments into SSE4 + section + [34d494a8051f] + +2013-11-06 Praveen Tiwari + + * source/common/x86/blockcopy8.asm: + blockcopy_sp, corrected number of xmm registers + [0c359d82ebc1] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm: + asm code for blockcopy_sp, 64xN + [f0214135645a] + + * 
source/common/x86/blockcopy8.asm: + blockcopy_sp, 48x64 changed the macro name according to width + [d87d627b2161] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm: + asm code for blockcopy_sp, 48x64 + [cde21084ca9d] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm: + blockcopy_sp, added 16x64 block size + [598a03afc62f] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm: + asm code for blockcopy_sp, 32x64 + [1a46771b9f87] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm, source/common/x86/pixel.h: + used sse4 for 2x4, 2x8 and 6x8 + [ddaa80b9b959] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm: + asm code for blockcopy_sp, 6x8 + [2ae2eb6c8e51] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm: + asm code for blockcopy_sp, 2x8 + [529bf6093782] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm: + asm code for blockcopy_sp, 2x4 + [ea33d0f85b8e] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm: + asm code for blockcopy_sp, 12x16 block + [99c3b2e4f1cc] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm: + asm code for blockcopy_sp, 32xN + [6b6d54cc234e] + +2013-11-06 Gopu Govindaswamy + + * source/Lib/TLibCommon/TComDataCU.cpp: + tcomdatacu: partStartIdx is always negative, no need to have else + block in InitCU() + [11a4c5a15d79] + + * source/Lib/TLibCommon/TComDataCU.cpp: + tcomdatacu: remove the for loop in InitCU(), which will never + execute + + partStartIdx is always zero or negative, and the numElements is also + always zero or negative the for will never executed if numElements + is zero or negative, removed the for loop block in initCU() + [8bdb65fef0f0] + +2013-11-06 Steve Borho + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h: + TComSlice: nits + [eab2d925a0e0] + + * 
source/common/x86/pixel.h: + pixel.h: nit + [267b3da1a734] + +2013-11-06 Min Chen + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/common/ipfilter.cpp, source/common/primitives.h, + source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter8.asm, source/common/x86/ipfilter8.h, + source/encoder/motion.cpp, source/test/ipfilterharness.cpp, + source/test/ipfilterharness.h: + asm: ipfilter_ss[FILTER_V_S_S_8] + [de7a50155cba] + +2013-11-06 Deepthi Nandakumar + + * source/output/y4m.cpp, source/output/yuv.cpp: + YUV, Y4M Output: bitdepth confusion resolved + [846e2c0d8478] + +2013-11-06 Praveen Tiwari + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm: + asm code for blockcopy_sp, 24x32 block + [8f71fba52d55] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm: + asm code for blockcopy_sp, 16xN blocks + [264b1458963a] + +2013-11-06 Nabajit Deka + + * source/common/x86/asm-primitives.cpp, source/common/x86/ipfilter8.h: + Adding asm function declaration and function pointer initializations + for luma hps functions. + [450947d76251] + + * source/common/x86/ipfilter8.asm: + asm: routines for luma hps filter functions for all block sizes. + [01d97a51d37d] + +2013-11-06 Deepthi Nandakumar + + * Merge + [21e08cf159c5] + + * source/output/yuv.cpp: + YUV Output: more rext merge bugs + [b2068453b55b] + + * source/output/yuv.cpp: + YUV output: correct a rext merge issue + [dd8510d84b5a] + +2013-11-06 Praveen Tiwari + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm: + asm code for blockcopy_sp, 4xN blocks + [bab35592e71c] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm: + asm code for blockcopy_sp, 8xN blocks + [73b4015984fd] + +2013-11-06 Nabajit Deka + + * source/test/ipfilterharness.cpp: + Adding test bench code for luma hps filter functions. 
+ [cb323bec7d06] + + * source/common/ipfilter.cpp, source/common/primitives.h: + Adding function pointer array and C primitive for luma hps filter + functions. + [e31319dfb866] + +2013-11-06 Gopu Govindaswamy + + * source/Lib/TLibCommon/TComDataCU.cpp: + tcomdatacu: remove memset in initSubCU() + + m_trCoeffY, m_trCoeffCb, m_trCoeffCr, m_iPCMSampleY, m_iPCMSampleCb + and m_iPCMSampleCr buffers are getting initialized in initCU(), and + its not required to set 0 in initSubCU() + [1b913b8f7f19] + + * source/Lib/TLibCommon/TComDataCU.cpp: + tcomdatacu: remove memset in initEstData() + + m_trCoeffY, m_trCoeffCb, m_trCoeffCr, m_iPCMSampleY, m_iPCMSampleCb + and m_iPCMSampleCr buffers are getting initialized in initCU(), and + its not required to set 0 in initEstData() + [d044314537ad] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h: + tcomdatacu: remove unused set functions + [9368bfd107b8] + +2013-11-06 Steve Borho + + * source/encoder/motion.cpp, source/encoder/motion.h: + motion: simplify subpel refine, drop height+1 interpolation + + This is in preparation of enabling assembly versions of + interpolation functions + [a1d576fbd0b0] + +2013-11-05 Steve Borho + + * source/encoder/motion.cpp: + motion: simplify lowres subpel refine + [72520485725e] + +2013-11-06 Steve Borho + + * source/encoder/slicetype.cpp: + slicetype: move lastNonB set outside if expression for both I and P + clauses + [412d2f3a2bd2] + +2013-11-05 Steve Borho + + * source/encoder/motion.cpp: + motion: move StarPatternSearch higher in the file, no behavior + change + [bc99537483f1] + +2013-11-06 Steve Borho + + * source/CMakeLists.txt: + cmake: link platform libraries into the shared library (fixes PPA + builds) + [0234bd136cb9] + +2013-11-05 Steve Borho + + * source/cmake/CMakeDetermineASM_YASMCompiler.cmake: + cmake: use detected yasm executable for CMAKE_ASM_YASM_COMPILER + variable + [ae906d212c5e] + + * source/Lib/TLibCommon/TComDataCU.cpp, + 
source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComPic.cpp, + source/Lib/TLibCommon/TComPicSym.cpp, + source/Lib/TLibCommon/TComPicSym.h, + source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuv.h, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TComYuv.h, + source/Lib/TLibCommon/TypeDef.h, source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/common/TShortYUV.cpp, + source/common/TShortYUV.h, source/common/common.cpp, + source/common/common.h, source/encoder/cturow.cpp, + source/encoder/encoder.cpp, source/input/input.h, + source/input/y4m.cpp, source/input/y4m.h, source/input/yuv.cpp, + source/input/yuv.h, source/output/output.cpp, + source/output/output.h, source/output/y4m.cpp, source/output/y4m.h, + source/output/yuv.cpp, source/output/yuv.h, source/x265.cpp, + source/x265.h: + rext: partial support for 4:2:2 and 4:4:4 color spaces + + reconFileBitDepth did not need to be a member of x265_param, and the + comment for inputBitDepth needed to mention that it also determined + the internal bit depth of the encoder. 
+ [7cdcf1a03d93] + + * source/common/common.cpp, source/common/version.cpp, + source/encoder/encoder.cpp, source/input/input.cpp, + source/input/input.h, source/input/y4m.cpp, source/input/y4m.h, + source/input/yuv.cpp, source/input/yuv.h, source/output/y4m.cpp, + source/x265.cpp, source/x265.h: + main10: allow pixel sizes of 10 and 12 for HIGH_BIT_DEPTH builds + + Removes param.internalBitDepth and uses inputBitDepth to mean both + the size of input pixels and internal pixels (x265 will do no color + space conversions) + [6a94dca867c8] + + * source/common/dct.cpp, source/common/ipfilter.cpp: + main10: fix dct and MC primitives to correctly respect larger pixel + sizes + [b24d05dd4990] + + * source/Lib/TLibCommon/ContextTables.h, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuv.h, + source/Lib/TLibCommon/TComPicYuvMD5.cpp, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibEncoder/NALwrite.h, + source/Lib/TLibEncoder/TEncBinCoderCABAC.cpp, + source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/common/common.cpp, + source/common/cpu.cpp, source/common/dct.cpp, + source/common/ipfilter.cpp, source/common/lowres.cpp, + source/common/lowres.h, source/common/pixel.cpp, source/common/vec + /dct-sse41.cpp, source/common/vec/intra-sse41.cpp, source/common/vec + /intra-ssse3.cpp, source/common/vec/ipfilter-sse41.cpp, + source/common/vec/ipfilter-ssse3.cpp, source/common/vec/pixel- + avx2.cpp, source/common/vec/pixel16-sse41.cpp, source/common/x86 + /asm-primitives.cpp, source/common/x86/ipfilter8.h, + source/common/x86/mc.h, source/common/x86/pixel.h, + source/encoder/bitcost.cpp, source/encoder/compress.cpp, + source/encoder/cturow.cpp, 
source/encoder/encoder.cpp, + source/encoder/encoder.h, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h, source/encoder/framefilter.cpp, + source/encoder/ratecontrol.cpp, source/input/y4m.cpp, + source/input/y4m.h, source/input/yuv.cpp, source/input/yuv.h, + source/output/y4m.cpp, source/output/yuv.cpp, + source/test/ipfilterharness.cpp, source/test/pixelharness.h, + source/test/testbench.cpp, source/x265.cpp: + uncrustify source tree + [0c9d8360e1b0] + + * source/encoder/slicetype.cpp: + slicetype: better fix for --b-adapt 0 + + Don't allow an IDR to be re-ordered + [f56a9fc5e999] + + * source/x265.cpp: + cli: re-introduce x265_setup_primitives() call prior to + printVersion() + + This way --cpuid N is properly respected. This orders the output of + the log messages, but the total number is the same. + [76d1d1aa3700] + + * source/encoder/encoder.cpp: + encoder: nits + [9b7ca5a14605] + + * source/encoder/encoder.cpp: + log: cleanup Encoder::printSummary() + [d8513c114f42] + + * source/encoder/slicetype.cpp: + slicetype: do not force P before I with --b-adapt 0 + + I don't remember what bug this was covering up, but it's no longer + necessary and it was breaking the later code which expected a single + non-B per mini-GOP + [f7e55b468373] + + * source/common/primitives.cpp: + log: only log primitve status if some primitives are missing + + Don't waste a line of log output to report everything is normal + [e566ef4d3e95] + +2013-11-05 Aarthi Thirumalai + + * source/common/common.cpp, source/x265.cpp, source/x265.h: + cli: add option to turn on AQ + [ddf4d33e8c15] + +2013-11-05 Steve Borho + + * source/encoder/encoder.cpp: + log: white-space nit + [f6cb006b3628] + +2013-10-30 Kurtnoise + + * source/x265.cpp: + Display also version info during encoding instead of primitives + setup. 
+ [5895ca0d36be] + +2013-11-05 Gopu Govindaswamy + + * source/Lib/TLibEncoder/TEncCu.cpp: + tenccu: remove calling outTempCU->initEstData() multiple time + without modifying outTempCU + [486f2cff2c3e] + + * source/Lib/TLibCommon/TComDataCU.cpp: + tcomdatacu: Remove memset from create(), these variables are + initialized in initCU() + [67ec1b965461] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h: + tcomdatacu: remove unused copySubCU() function + [1028756870f7] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h: + tcomdatacu: remove unused copyInterPredInfoFrom() function + [f3f510808287] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h: + tcomdatacu: remove unused setOutsideCUPart() function + [a38c5f57cf7e] + +2013-11-05 Praveen Tiwari + + * source/common/x86/pixel.h: + changed the naming convention for blockcopy_sp + [e3853b2cbaa8] + + * source/common/pixel.cpp, source/common/primitives.h, + source/test/pixelharness.cpp, source/test/pixelharness.h: + changed naming convention and added unit test code for blockcopy_sp + function + [9deec9d96ed4] + +2013-11-02 Neil Piken + + * build/README.txt: + cmake: document minimum cmake required version for VC12 + [be7af3216ebd] + +2013-11-05 Steve Borho + + * source/x265.cpp: + Merge with stable + [ba8c09070e54] + + * source/x265.cpp: + cli: use 64bit byte counter (closes #1) + [3c15a9f74091] + +2013-11-05 Gopu Govindaswamy + + * source/input/y4m.cpp, source/input/yuv.cpp: + input:reset the file pointer before return in guessFrameCount() + [ece7af249573] + + * source/Lib/TLibEncoder/TEncCu.cpp: + tenccu:remove calling outTempCU->initEstData() multiple time without + changing outTempCU from xCompressIntraCU + [925ed6f7ed34] + +2013-11-05 Nabajit Deka + + * source/common/x86/asm-primitives.cpp, source/common/x86/ipfilter8.h: + Adding asm function declaration and function pointer initializations + for luma vps functions. 
+ [df9d7d85a146] + + * source/common/x86/ipfilter8.asm: + asm: routines for luma vps filter functions for all block sizes. + [29d3861c5370] + + * source/test/ipfilterharness.cpp, source/test/ipfilterharness.h: + Adding test bench code for luma vps filter functions. + [f8053c69b732] + + * source/common/ipfilter.cpp: + Adding C primitive for luma vps filter functions. + [9d29fff93f3a] + + * source/common/primitives.h: + Adding function pointer type & array definition for luma vps filter + functions. + [ed0dd83bb7ca] + +2013-11-05 Steve Borho + + * source/encoder/encoder.cpp, source/encoder/encoder.h, source/x265.h: + api: use uint64_t to accumulate bits + [c57ed1fd7bd5] + +2013-11-05 Santhoshini Sekar + + * source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibEncoder/TEncAnalyze.h, source/encoder/CMakeLists.txt, + source/encoder/encoder.cpp, source/encoder/encoder.h, + source/encoder/framefilter.cpp: + collect global stats of psnr and ssim + [6f19e6ef9ab5] + +2013-11-04 Steve Borho + + * source/encoder/slicetype.cpp: + slicetype: make b-adapt 0 set lastNonB, leadingBframes, and + bLastMiniGopBFrame + [7aa33c9734f5] + + * source/Lib/TLibCommon/TComPic.h, source/Lib/TLibCommon/TComPicSym.h, + source/Lib/TLibEncoder/TEncCfg.h, source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncSearch.cpp: + white-space fixes following UInt->uint32_t search and replace + [686b5b502797] + + * source/input/y4m.cpp: + y4m: fix two more while loops + [22d51335d4ec] + + * source/input/y4m.cpp: + y4m: fix eof detection during header parsing + + Evaluating an istream as a bool returns the file status. 
When ifs + was converted to a stream pointer, these while loops needed to be + changed because a bool evaluation of a pointer is always true if the + pointer is not NULL + [9732c845db44] + + * source/test/pixelharness.cpp: + pixelharness: shorten copy primitive names for consistency + [f3106abb88b2] + +2013-11-04 Praveen Tiwari + + * source/test/pixelharness.cpp: + corrected buffer name for chroma_copy_pp + [35989e4e0b46] + + * source/common/x86/pixel.h: + pixel.h, added asm function decleration for blockcopy_ps_c + [4cd16b86488c] + + * source/common/pixel.cpp: + pixel.cpp, initialization of function pointer table for + blockcopy_ps_c partitions + [64f25611bcb2] + + * source/common/pixel.cpp, source/common/primitives.h: + added C primitive for blockcopy_p_s and function pointer creation + for new type + [e61a0b1c035b] + + * source/common/CMakeLists.txt, source/common/x86/asm-primitives.cpp, + source/common/x86/blockcopy8.asm, source/common/x86/pixel.h, + source/test/pixelharness.cpp: + asm code and test bench integration code for blockcopy_pp_c + partitions + [7898c58d9cbc] + +2013-11-04 Min Chen + + * source/common/x86/ipfilter8.asm, source/test/ipfilterharness.cpp: + fix bug in chroma_p2s and testbench + [c83157a8b616] + + * source/test/ipfilterharness.cpp: + update authors header + [6e211f980d98] + +2013-11-04 Deepthi Devaki + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + RDOQ : pass a parameter to temporarily disable RDOQ. + + default value of parameter is true, that means RDOQ is not disabled. + Outputs mustnot change in any rd levels. 
+ [91d96a6038e2] + +2013-11-04 Shazeb Nawaz Khan + + * source/Lib/TLibCommon/TComDataCU.cpp, source/common/lowres.cpp, + source/common/lowres.h, source/encoder/frameencoder.cpp, + source/encoder/ratecontrol.cpp, source/encoder/slicetype.cpp: + Dropping the 'm_' prefix from names used for members of struct type + Lowres + [37903c6fd1f9] + +2013-11-04 Steve Borho + + * source/common/x86/asm-primitives.cpp: + asm: re-enable sad_x3 following alignment workaround + [814b4639d6a6] + +2013-11-04 Shazeb Nawaz Khan + + * source/common/lowres.h, source/encoder/encoder.cpp, + source/encoder/ratecontrol.cpp: + Generating sum & ssd values for weightp decision in lookahead + [2ab39c2dd50f] + +2013-11-02 Wenju He + + * source/encoder/motion.cpp: + fix vec/asm crash in COST_MV_X3_DIR: costs+3 is not aligned + [8621008756ba] + +2013-10-30 idxa + + * source/common/common.cpp, source/encoder/encoder.cpp, + source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h, + source/x265.cpp, source/x265.h: + rc: add CRF ratecontrol + + 1. add a parameter of "--crf" to the command line + + 2. modify the running branches of rateControlStart, using + "if(isAbr)" instead of "switch (cfg->param.rc.rateControlMode)", for + the logic of classifying the combination of multiple ratecontrol + methods is very complex, it is not only based on rateControlMode, so + porting x264's way looks feasible. + + 3. 
add crf method into x265 + [c51c35880df5] + +2013-11-01 Steve Borho + + * source/common/CMakeLists.txt: + cmake: cleanup intrinc primitives + [f81af999ef6c] + + * source/common/CMakeLists.txt: + cmake: more asm simplifications + [ad8222ed1360] + + * source/encoder/ratecontrol.cpp: + rc: replace tabs with spaces + [0d79e31728a4] + + * source/CMakeLists.txt: + cmake: drop STLport support, it is no longer necessary + [a3d07bee1316] + +2013-10-27 Neil Piken + + * build/vc12-x86/build-all.bat, build/vc12-x86/make-solutions.bat, + build/vc12-x86_64/build-all.bat, build/vc12-x86_64/make- + solutions.bat: + cmake: add Visual Studio 2013 build folder + [d7986ac66bc1] + +2013-11-01 Steve Borho + + * source/common/CMakeLists.txt: + cmake: remove obsolete build flags + [a1502a1f1fa2] + +2013-10-31 Steve Borho + + * source/CMakeLists.txt, source/common/CMakeLists.txt: + cmake: remove assembly library to fix MSVC static lib + + This fixes the static library built by VisualStudio to include the + assembly objects. It has the side effect of causing the assembly + objects to be built once for the static lib and once for the shared + lib, which can be avoided by unloading the shared lib project while + you are working on assembly within VisualStudio + [3716c9f588df] + + * source/cmake/CMakeDetermineASM_YASMCompiler.cmake: + cmake: fix warning about ASM_YASM compiler being unknown + [1c5122851247] + + * source/x265.cpp: + cli: tweaks for command line help + [bd53cb226710] + + * source/common/common.cpp: + common: fix int typecast to operate on results of float expression + [ae576a38ca5b] + + * source/common/common.cpp, source/common/primitives.cpp, + source/common/vec/vec-primitives.cpp, source/x265.h: + Merge with stable + [a4e9f242fdf3] + + * source/x265.h: + api: give structs the same name as their typedef + + This allows them to be forward-decl'd. 
+ [30a0c2c5fcbd] + + * source/common/cpu.cpp, source/common/vec/vec-primitives.cpp: + cpu: move ASM fallback functions out of vec-primitives.cpp + + This fixes link errors when ASM and vector primitives are both + disabled + [e1dde58cf6e1] + + * source/common/common.cpp: + common: disable MSVC warning that is exposed when compiling without + primitives + [01e77fde7194] + + * source/common/primitives.cpp: + primitives: fix compile warning exposed when ASM and instrincs are + both disabled + [ad6d6ddd7037] + + * source/common/common.cpp, source/x265.h: + api: add zero-latency tune target + + This just disables lookahead and B frames at the moment. + [8afb161419df] + + * source/common/x86/asm-primitives.cpp: + asm: disable more sad_x3 functions which cause crashes on Haswell + [885e41fac726] + + * source/common/x86/asm-primitives.cpp: + asm: disable sad_x3[LUMA_32xN], they cause crashes on Haswell + + Seen crashes on Windows, Mac, and Linux. The only constant is + Haswell. + [51660f092aa4] + + * source/common/common.cpp: + common: lower search range for higher presets with max CTU size 32 + [21da3bba6e70] + +2013-10-30 Steve Borho + + * source/CMakeLists.txt, source/common/common.cpp, source/x265.cpp, + source/x265.def.in, source/x265.h: + api: introduce performance presets + [0607132e6b11] + +2013-10-31 Steve Borho + + * source/common/x86/asm-primitives.cpp: + disable two avx2 routines which fail unit tests + [2621639c96b5] + + * source/common/vec/pixel-sse41.cpp: + clang: re-disable 12x16, sse_pp_12x16 testbench fails + [8f4744bdf6fc] + +2013-10-31 Min Chen + + * source/common/x86/ipfilter8.asm: + asm: fix typo bug in chroma_p2s + [e842b2a4aeeb] + +2013-10-31 Nabajit Deka + + * source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter8.asm: + asm: routines for vertical luma filter for all block sizes + [faf29e19669f] + + * source/test/ipfilterharness.cpp: + Ensure that the destination buffer is not overwritten. 
64 is added + as it is the maximum width supported for luma filter. + [935d96d93b70] + +2013-10-31 Steve Borho + + * source/common/vec/pixel-sse41.cpp: + remove clang prevention for 12x16 pixel primitives + [30c655ec95f7] + + * source/common/vec/pixel-sse41.cpp: + pixel: remove sad_x3_12 and sad_x4_16 intrinsic functions + [2d08d77871b0] + + * source/encoder/ratecontrol.cpp: + aq: remove unnecessary double->float->double conversions + [abedbfdb1e12] + + * source/encoder/ratecontrol.cpp: + aq: fixes for loop over 16x16 blocks + + This loop was busted when maxCUSize was not 64. It still has a + problem with pictures that are not even multiples of 16. The + lookahead will extend out the frame during lowres init to an even + multiple of 16 pixels, so it's lowres CU width will be wider than + the AQ code will use, so the block_xy offsets will be wrong for + lookahead analysis. + + The pixel extension needs to be moved earlier so AQ and the + lookahead have a consistent 16x16 CU width + [974a6afaddca] + + * source/encoder/ratecontrol.cpp: + aq: simplify acEnergyCu + + EMMS was in the wrong place, there were a few white-space issues. + [180d95f09057] + + * source/encoder/ratecontrol.cpp: + aq: use more explicit chroma variance stride + [3e2d69028a3b] + + * source/encoder/slicetype.cpp: + aq: fix NULL pointer check + [9acea4fbacef] + +2013-10-31 Aarthi Thirumalai + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibEncoder/TEncCu.cpp, source/encoder/encoder.cpp, + source/encoder/frameencoder.cpp, source/encoder/frameencoder.h: + aq: set qp, lambda for every CU in the row before processing the CU + + enabled bUseDQP flag when AQ is mode is ON. 
+ [650e40a62322] + +2013-10-31 Deepthi Nandakumar + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/encoder/compress.cpp: + compress: cleanup, remove unused data structs + [eed2b51675cf] + +2013-10-31 Deepthi Devaki + + * source/Lib/TLibEncoder/TEncSearch.h, source/encoder/compress.cpp: + no-rdo: Use entropy encoder for bit estimation. + + Instead of me-bit estimation, use entropy encoder. + [775519fb9ba1] + +2013-10-31 Yuvaraj Venkatesh + + * source/common/x86/asm-primitives.cpp, source/common/x86/sad-a.asm: + assembly code for pixel_sad_x4_12x16 + [ed884e91d5d5] + + * source/common/x86/asm-primitives.cpp, source/common/x86/sad-a.asm: + assembly code for pixel_sad_x3_12x16 + [7ccdf622d081] + +2013-10-31 Min Chen + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/common/ipfilter.cpp, source/common/primitives.h, + source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter8.asm, source/common/x86/ipfilter8.h, + source/test/ipfilterharness.cpp, source/test/ipfilterharness.h: + asm: chroma_p2s to replace ipfilter_p2s + [4a40c4069ad1] + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/common/x86/ipfilter8.asm, source/test/ipfilterharness.cpp: + asm: fix bug in luma_p2s and active it in encoder + [21dbf988079b] + + * source/common/x86/ipfilter8.asm: + asm: less code size by reduce constant offset + [a64e813de628] + + * source/common/x86/sad-a.asm: + asm: fix stack broken bug + [08bc7ccc8aad] + +2013-10-31 Dnyaneshwar Gorade + + * source/common/x86/sad-a.asm: + asm: reduce large code size in sad_16xN, sad_32xN for better cache + performance + [9a0da4e6d9e3] + + * source/common/x86/sad-a.asm: + asm: reduce large code size in pixel_sad_8x32 for better cache + performance + [e4a75488c147] + +2013-10-31 Yuvaraj Venkatesh + + * source/common/x86/sad-a.asm: + asm: fix the bug which occured at win32 compile + [4a886c170a51] + +2013-10-31 Steve Borho + + * source/test/testpool.cpp: + testpool: add missing stdio.h for printf + 
[ec6b4d35f110] + +2013-10-30 Steve Borho + + * source/common/vec/ipfilter-sse41.cpp: + ipfilter: fix 16bpp build following f0eea23735a6 + [f06e4a24b388] + + * source/encoder/framefilter.cpp: + fix shadowed variable warning + [a406f7c1dd3b] + +2013-10-30 Praveen Tiwari + + * source/common/pixel.cpp: + added blockcopy_pp_c primitive according to modified argument list + [7f68debc632b] + + * source/common/primitives.h, source/test/pixelharness.cpp, + source/test/pixelharness.h: + added test code for blockcopy_pp function + [e8e84b67cf8f] + +2013-10-30 Steve Borho + + * source/common/vec/pixel-sse41.cpp: + pixel: remove sad_12, sad_48, and sad_64 + + All single sads have asm coverage + [645899ddda59] + + * source/common/vec/pixel-sse41.cpp: + pixel: remove 24 and 32 width sad intrinsic functions + + These are now covered by assembly. Only 12, 48, and 64 remain + because they still lack x3 and x4 versions. + [eccfe236169b] + +2013-10-30 Yuvaraj Venkatesh + + * source/common/x86/asm-primitives.cpp, source/common/x86/sad-a.asm: + assembly code for pixel_sad_x4_32xN + [c3cf2c42e854] + + * source/common/x86/asm-primitives.cpp, source/common/x86/sad-a.asm: + assembly code for pixel_sad_x3_32xN + [e371719c4c47] + + * source/common/x86/asm-primitives.cpp, source/common/x86/sad-a.asm: + assembly code for pixel_sad_x4_24x32 + [f021f06f3b80] + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel.h, + source/common/x86/sad-a.asm: + assembly code for pixel_sad_x3_24x32 + [de91fbc95b4a] + +2013-10-30 Dnyaneshwar Gorade + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel.h, + source/common/x86/sad-a.asm: + asm: assembly code for pixel_sad_12x16 + [8ee637b11d17] + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel.h, + source/common/x86/sad-a.asm: + asm: assembly code for pixel_sad_24x32 + [ed5d877b8452] + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel.h, + source/common/x86/sad-a.asm: + asm: assembly code for 
pixel_sad_48x64 + [78db76b7abec] + +2013-10-30 Min Chen + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/common/ipfilter.cpp, source/common/primitives.h, + source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter8.asm, source/common/x86/ipfilter8.h, + source/test/ipfilterharness.cpp, source/test/ipfilterharness.h: + asm: filterConvertPelToShort + [1a51e6cb0e0c] + +2013-10-30 Dnyaneshwar Gorade + + * source/common/x86/asm-primitives.cpp, source/common/x86/sad-a.asm: + asm: assembly code for pixel_sad_64x48 and pixel_sad_64x64 + [700b46a1a0cf] + + * source/common/x86/asm-primitives.cpp, source/common/x86/sad-a.asm: + asm: assembly code for pixel_sad_64x32 + [42ad273b1d4f] + + * source/common/x86/asm-primitives.cpp, source/common/x86/sad-a.asm: + asm: assembly code for pixel_sad_64x16 + [4414f3394a61] + + * source/common/x86/sad-a.asm: + asm: modified common macro for pixel_sad_64xN + [e9340727231d] + +2013-10-30 Deepthi Devaki + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/encoder/compress.cpp: + no-rdo: use bit estimates from ME to calculate RDcost. + + bits estimated in ME stored in CU and used for calculating rdcost + along with distortion. This results in better bitrate with no-rdo, + with small drop in PSNR. 
+ [77db80a67f4e] + +2013-10-30 Praveen Tiwari + + * source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter8.asm: + chroma interp_4tap_vert_pp all blocks asm code + [74bf8634037c] + +2013-10-30 Dnyaneshwar Gorade + + * source/common/x86/pixel.h: + asm: declare asm function pointers for sad_64xN partitions + [9f9b2f8d293a] + +2013-10-30 Steve Borho + + * source/common/vec/pixel-sse41.cpp: + pixel: remove sad_x3_4x16 and sad_x4_4x16, no longer HAVE_MMX + [65462024832b] + +2013-10-30 Yuvaraj Venkatesh + + * source/common/x86/asm-primitives.cpp, source/common/x86/sad-a.asm: + assembly code for pixel_sad_x3_4x16 and pixel_sad_x4_4x16 + [50c2c41ac0ea] + +2013-10-30 Santhoshini Sekar + + * source/Lib/TLibEncoder/NALwrite.cpp, + source/Lib/TLibEncoder/NALwrite.h, source/encoder/frameencoder.cpp: + rename variable name m_Bitstream to m_bitstream + [e2a1dcca4518] + + * source/encoder/encoder.cpp, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h: + writing hash SEI messages in frameencoder + [4c047e5ff69b] + +2013-10-30 Dnyaneshwar Gorade + + * source/common/x86/asm-primitives.cpp, source/common/x86/sad-a.asm: + asm: assembly code for pixel_sad_16x12 + [40e38dfa5cdd] + +2013-10-30 Steve Borho + + * source/common/vec/pixel-sse41.cpp: + pixel: remove sad_8, sad_x3_8, sad_x4_8 intrinsic functions + [abf8286f3fa9] + +2013-10-30 Min Chen + + * source/test/ipfilterharness.cpp: + testbench: upgrade for check_IPFilter_primitive, don't pass wrong + (width, height, stride) to asm + [20aa88626c52] + +2013-10-29 Dnyaneshwar Gorade + + * source/common/x86/asm-primitives.cpp, source/common/x86/sad-a.asm: + asm: assembly code for pixel_sad_8x32 + [c048ef93ea55] + + * source/common/x86/asm-primitives.cpp, source/common/x86/sad-a.asm: + asm: assembly code for pixel_sad_32x64 + [d3e510bb67cf] + + * source/common/x86/asm-primitives.cpp, source/common/x86/sad-a.asm: + asm: assembly code for pixel_sad_32x16 + [def3d61bc4b0] + + * source/common/x86/asm-primitives.cpp, 
source/common/x86/sad-a.asm: + asm: assembly code for pixel_sad_32x32 + [77aa24f08e76] + + * source/common/x86/asm-primitives.cpp, source/common/x86/sad-a.asm: + asm: assembly code for pixel_sad_32x24 + [840a638609b0] + + * source/common/x86/asm-primitives.cpp, source/common/x86/sad-a.asm: + asm: assembly code for pixel_sad_32x8 + [1aec8ddad7a3] + + * source/common/x86/sad-a.asm: + asm: created comman asm macro for pixel_sad_32xN functions + [f69c0f13c7b0] + +2013-10-29 Steve Borho + + * source/common/vec/pixel-sse41.cpp: + pixel: remove sad_16, sad_x3_16 and sad_x4_16 + + We have assembly coverage for everything but sad_16x12; which I've + put on the top of our TODO list. + [42ae4dc90005] + +2013-10-29 Murugan Vairavel + + * source/common/ipfilter.cpp, source/encoder/motion.cpp, + source/test/intrapredharness.cpp: + refactor: Check need for signed/unsigned int16_t + [c946d617fd9f] + +2013-10-29 Steve Borho + + * Merge + [b02df3ebdf39] + +2013-10-29 Gopu Govindaswamy + + * source/Lib/TLibCommon/TComDataCU.cpp: + TComDataCU: Remove initializations in initCU() already initialized + in create() + + currently m_partSizes, m_mvpIdx[0] and m_mvpIdx[1] all three + variables initialized using memset in TComDataCU::create() and same + initialization in done in TComDataCU::initCU(), removed the memset + in initCU() to avoid the duplicate initilization + [deac0d819c43] + +2013-10-29 Deepthi Devaki + + * source/Lib/TLibCommon/TComMotionInfo.cpp: + cleanups: Remove unnecessary reset. + + If refIdx is set to NOT_VALID, mv will not be used. Hence resetting + MVs to 0 can be safely removed. + [50cd62d85ead] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + Cleanups: Remove call to setALLMVField, and set only required + members. + [6552629b45c5] + + * source/Lib/TLibCommon/TComMotionInfo.h: + Cleanups: MVField - make members public. + + set and clear methods take many cycles, where most of the time only + one index need to be set. 
By giving public access, caller can set + required indexes. + [f1a1d4f19db4] + +2013-10-29 Sumalatha Polureddy + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, source/encoder/compress.cpp: + no_rdo: implementation of new early exit + + early exit is done when the RD cost of best mode at depth "n" is + less than the average of RD cost of the CU's at depth "n" of + previosuly coded CU's(CUAbove, CUAboveRight, CUAoveLeft, CULeft, + CUColocated). For HD videos performance improvement of 20 to 27% + bitrate increases by 0.75 to 0.02% + [0e0d0d2e1d2b] + +2013-10-29 Dnyaneshwar Gorade + + * source/common/x86/pixel.h: + asm: declare asm function pointers for sad_32xN partitions + [b3208fa4294b] + +2013-10-29 Min Chen + + * source/common/x86/ipfilter8.asm: + asm: improvement on chroma_hpp{2,4} by reduce memory operator + [48b75fc2e614] + +2013-10-29 Dnyaneshwar Gorade + + * source/common/x86/asm-primitives.cpp, source/common/x86/sad-a.asm: + asm: assembly code for pixel_sad_16x4 + [fb7b95d11c77] + +2013-10-29 Deepthi Devaki + + * source/encoder/compress.cpp: + no-rdo: add no-residue candidate in merge2Nx2N + [064f309d4862] + +2013-10-29 Steve Borho + + * source/common/common.cpp, source/common/primitives.h, + source/encoder/ratecontrol.cpp: + Merge with stable + [8b909d315964] + + * .hgtags: + Added tag 0.5 for changeset 69acb3cb777f + [9eef2952ea16] + +2013-10-28 idxa + + * source/common/common.cpp, source/encoder/ratecontrol.cpp: + rc: improvements for ABR + + 1. modify rateTolerance from 0.1 to 1.0, in order to allow the final + bitrate to fluctuate more(CBR to ABR) + + 2. ncu in initialization of ratecontrol is set according to 16x16 + block size which is same to x264, for the empirical formula in x264 + is for 16x16 size. + + 3. do more modification to make ratecontrol algorithm of x265 more + similar to x264 for x264 has been developed for about 10 years and + is very mature , it is reasonale to take its method firstly. 
+ [69acb3cb777f] [0.5] + +2013-10-25 Steve Borho + + * source/encoder/encoder.cpp: + encoder: prevent divide by zero in elapsedVideoTime calculation + [6528ab023062] + +2013-10-29 Steve Borho + + * source/common/primitives.h: + primitives: fix ordering of LUMA_4x16 LUMA_16x4 to match other AMP + partitions + + This also fixes the testbench to properly represent 16x4 and 4x16 + partition primitives + [9713ec98fa8a] + +2013-10-29 Deepthi Devaki + + * source/encoder/compress.cpp: + no-rdo: add no-residue candidate in merge2Nx2N + [358400cb0c67] + +2013-10-29 Murugan Vairavel + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibCommon/TypeDef.h: + Cleanups: Replacing Ushort with uint16_t + [4c618e33c25f] + +2013-10-29 Steve Borho + + * source/common/primitives.h: + primitives: fix ordering of LUMA_4x16 LUMA_16x4 to match other AMP + partitions + + This also fixes the testbench to properly represent 16x4 and 4x16 + partition primitives + [4db0aec57138] + +2013-10-29 Dnyaneshwar Gorade + + * source/common/x86/asm-primitives.cpp, source/common/x86/sad-a.asm: + asm: assembly code for pixel_sad_16x64 + [3c0b386fe799] + + * source/common/x86/asm-primitives.cpp, source/common/x86/sad-a.asm: + asm: assembly code for pixel_sad_16x32 + [f44cc9f976cc] + +2013-10-29 Steve Borho + + * source/common/vec/vec-primitives.cpp: + vec: fix VC9 build with ASM disabled but intrinsics enabled + [8846f5cf6d8d] + + * source/common/threadpool.cpp, source/input/y4m.cpp, + source/input/yuv.cpp: + use correct _WIN32 build guard + [560bd09eb4bb] + +2013-10-29 Min Chen + + * source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter8.asm, source/common/x86/ipfilter8.h, + source/test/ipfilterharness.cpp: + asm: interp_8tap_v_sp for ipfilter_sp[FILTER_V_S_P_8] + [44c38df44532] + +2013-10-28 Min Chen + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/common/ipfilter.cpp, source/common/primitives.h, + source/common/vec/ipfilter-sse41.cpp, 
source/encoder/motion.cpp, + source/test/ipfilterharness.cpp: + replace pointer to coeff by coeffIdx in ipfilter_sp + [f0eea23735a6] + + * source/encoder/motion.cpp: + disable interpolate horizontal merge + + we need width is multiple of 4 in asm code, the maskmovq is very + expensive + [a36a2e39f983] + + * source/common/ipfilter.cpp, source/common/primitives.h, + source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter8.asm, source/common/x86/ipfilter8.h, + source/test/ipfilterharness.cpp, source/test/ipfilterharness.h: + asm: interp_8tap_hv_pp_8x8() for Interpolate_HV_8x8 + [31dfc1580bf2] + +2013-10-28 Yuvaraj Venkatesh + + * source/common/x86/asm-primitives.cpp, source/common/x86/sad-a.asm: + asm code for pixel_sad_x3_16x64 and pixel_sad_x4_16x64 + [0666d56aaa42] + +2013-10-28 Aarthi Thirumalai + + * source/common/lowres.cpp, source/common/lowres.h, + source/encoder/frameencoder.cpp, source/encoder/frameencoder.h, + source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h, + source/encoder/slicetype.cpp: + rc: implement qp with Aq for each Cu before encoding them. 
+ [c3a28172894a] + +2013-10-28 Kavitha Sampath + + * source/encoder/encoder.cpp, source/input/input.h, + source/input/y4m.cpp, source/input/y4m.h, source/input/yuv.cpp, + source/input/yuv.h, source/output/output.cpp, + source/output/output.h, source/output/yuv.cpp, source/output/yuv.h, + source/x265.cpp, source/x265.h: + refactor: (Input/Output files) - Check need of signed/unsigned int + [5e1fb266b39f] + +2013-10-28 murugan + + * source/output/y4m.cpp, source/output/yuv.cpp: + refactor: use unsigned int16 types where applicable + [4de8551f1f52] + +2013-10-28 Kavitha Sampath + + * source/Lib/TLibCommon/NAL.h, source/Lib/TLibCommon/SEI.h, + source/Lib/TLibCommon/TComBitCounter.h, + source/Lib/TLibCommon/TComBitStream.cpp, + source/Lib/TLibCommon/TComBitStream.h, + source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComLoopFilter.h, + source/Lib/TLibCommon/TComMotionInfo.cpp, + source/Lib/TLibCommon/TComMotionInfo.h, + source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComPattern.h, + source/Lib/TLibCommon/TComPic.h, + source/Lib/TLibCommon/TComPicSym.cpp, + source/Lib/TLibCommon/TComPicSym.h, + source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuv.h, + source/Lib/TLibCommon/TComPicYuvMD5.cpp, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibCommon/TComRdCost.h, + source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.h, + source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/Lib/TLibCommon/TComWeightPrediction.h, + source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TComYuv.h, + source/Lib/TLibCommon/TypeDef.h, + 
source/Lib/TLibEncoder/NALwrite.cpp, + source/Lib/TLibEncoder/NALwrite.h, + source/Lib/TLibEncoder/SEIwrite.cpp, + source/Lib/TLibEncoder/SyntaxElementWriter.cpp, + source/Lib/TLibEncoder/SyntaxElementWriter.h, + source/Lib/TLibEncoder/TEncAnalyze.h, + source/Lib/TLibEncoder/TEncBinCoderCABAC.cpp, + source/Lib/TLibEncoder/TEncBinCoderCABAC.h, + source/Lib/TLibEncoder/TEncCfg.h, source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncEntropy.cpp, + source/Lib/TLibEncoder/TEncEntropy.h, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.h, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSbac.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, + source/Lib/TLibEncoder/WeightPredAnalysis.cpp, + source/Lib/TLibEncoder/WeightPredAnalysis.h, + source/common/common.cpp, source/common/dct.cpp, + source/common/lowres.cpp, source/common/lowres.h, + source/common/pixel.cpp, source/common/primitives.h, + source/common/threading.h, source/common/vec/dct-sse3.cpp, + source/common/vec/dct-sse41.cpp, source/common/vec/dct-ssse3.cpp, + source/common/vec/intra-sse41.cpp, source/common/vec/pixel-avx2.cpp, + source/common/vec/pixel-sse3.cpp, source/common/vec/pixel-sse41.cpp, + source/common/vec/pixel16-sse41.cpp, source/compat/getopt/getopt.c, + source/compat/getopt/getopt.h, source/encoder/compress.cpp, + source/encoder/cturow.cpp, source/encoder/cturow.h, + source/encoder/encoder.cpp, source/encoder/frameencoder.cpp, + source/encoder/framefilter.cpp, source/encoder/ratecontrol.cpp, + source/encoder/slicetype.cpp: + refactor: replace int pointers with int32_t and UInt with uint32_t + [a96b8f225c4a] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, 
+ source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/Lib/TLibCommon/TComWeightPrediction.h, + source/Lib/TLibCommon/TypeDef.h, source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncEntropy.cpp, + source/Lib/TLibEncoder/TEncEntropy.h, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSbac.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, + source/Lib/TLibEncoder/WeightPredAnalysis.cpp, + source/encoder/compress.cpp, source/encoder/dpb.cpp, + source/encoder/encoder.cpp, source/encoder/frameencoder.cpp: + refactor: replace ternary operator with loop variable and enum with + #define + [4176ba91f62d] + +2013-10-28 Santhoshini Sekar + + * source/Lib/TLibCommon/TComPic.h, + source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuv.h, + source/Lib/TLibCommon/TComPicYuvMD5.cpp, source/encoder/encoder.cpp, + source/encoder/framefilter.cpp, source/encoder/framefilter.h: + row wise generation of hash in framefilter + [edd33e942d7b] + +2013-10-28 Steve Borho + + * source/input/input.cpp, source/input/input.h, source/x265.cpp: + cli: add --y4m option to force Y4M stream parser + [fa30212a64f8] + +2013-10-28 Gopu Govindaswamy + + * source/input/y4m.cpp, source/input/y4m.h: + input: read y4m input from stdin if filename is passed as "-" + [f59393a695ee] + + * source/input/y4m.cpp: + y4m : bug fix for calculating framesize in skipFrames() + [7916afc6c9c4] + +2013-10-28 Steve Borho + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/common/primitives.h, + source/encoder/compress.cpp, source/encoder/motion.cpp, + source/encoder/slicetype.cpp, source/test/pixelharness.cpp: + primitives: lower case partitionFromSizes() function to follow + coding style + [ef2428fd32fe] + +2013-10-27 Steve Borho + + * source/common/primitives.cpp, source/common/primitives.h: + primitives: micro optimizations of PartitionFromSizes() + + * use int for w, h + * change lookup table to uint8_t + * make function inlined + 
[1a65c6df7e70] + +2013-10-27 Wenju He + + * source/Lib/TLibEncoder/TEncSearch.cpp: + need not compute chroma in xGetInterPredictionError + [7c2ce4d33f62] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + fix variable name, msg to mrg + [e1453bc3ae1e] + +2013-10-25 Steve Borho + + * source/input/y4m.cpp, source/input/yuv.cpp: + input: do not check for inactive read thread until read queue is + empty + [ccac3a7d3622] + + * source/input/yuv.cpp: + yuv: set binary mode on stdin on Windows + [34573a44f81d] + + * source/input/y4m.cpp, source/input/yuv.cpp: + input: do not use ifstream.good(), use ifstream.ignore() for + skipFrames + [bc044398a2c1] + + * source/x265.cpp: + cli: improve handling of encoder start failures; no pictures encoded + [8606211480ea] + + * source/encoder/encoder.cpp: + encoder: prevent divide by zero in elapsedVideoTime calculation + [824ede6074a6] + + * source/input/y4m.cpp, source/input/yuv.cpp: + input: prevent more deadlocks on file read errors + [12d6ba61b235] + + * source/input/yuv.cpp: + yuv: fix --skip behavior for stdin + [565abe18280a] + + * source/cmake/version.cmake: + cmake: improve .hg_archive parsing + + Patch submitted by HaaeeD via pull request + [7dff7b61898c] + + * Merge + [4d024982658d] + + * Merge with stable + [4cdd80a730ab] + + * source/x265.cpp: + cli: improve handling of unknown input frame count + [0c8e2580b410] + + * source/encoder/encoder.cpp: + encoder: calculate encoded video time in floating point + [9c26397ec80c] + + * source/common/ipfilter.cpp: + Merged in mcwmurugan/x265 (pull request #5) + + CleanUps: Replacing Short with int16_t + [4ced02a9a5b2] + +2013-10-25 Murugan + + * source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + 
source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/common/TShortYUV.cpp, + source/common/TShortYUV.h, source/common/dct.cpp, + source/common/intrapred.cpp, source/common/ipfilter.cpp, + source/common/pixel.cpp, source/common/primitives.h, + source/common/vec/blockcopy-sse3.cpp, source/common/vec/dct- + sse3.cpp, source/common/vec/dct-sse41.cpp, source/common/vec/dct- + ssse3.cpp, source/common/vec/intra-ssse3.cpp, source/common/vec + /ipfilter-sse41.cpp, source/common/vec/ipfilter-ssse3.cpp, + source/common/vec/pixel-sse3.cpp, source/common/vec/pixel-sse41.cpp, + source/common/x86/pixel.h, source/encoder/motion.cpp, + source/encoder/motion.h, source/output/y4m.cpp, + source/output/yuv.cpp, source/test/intrapredharness.cpp, + source/test/ipfilterharness.cpp, source/test/ipfilterharness.h, + source/test/mbdstharness.cpp, source/test/mbdstharness.h, + source/test/pixelharness.cpp, source/test/pixelharness.h: + CleanUps: Replacing Short with int16_t + [2126b735949e] + +2013-10-25 Steve Borho + + * source/Lib/TLibCommon/TComPicYuv.cpp, + source/encoder/ratecontrol.cpp: + Merge with stable + [f645df2eb5bd] + + * source/input/yuv.cpp: + yuv: do not attempt to measure size of stdin + [88e0d2dbb8fe] + + * source/Lib/TLibCommon/TComPicYuv.cpp: + TComPicYuv: increase vertical padding to account for + TComDataCU::clipMv() logic + + mvmin is clamped to -(g_maxCUHeight + offset + m_cuPelY) where + offset is 8 + [9a7c5831ebf8] + +2013-10-25 Aarthi Thirumalai + + * source/encoder/ratecontrol.cpp: + rc: Bug fix. use the slice type from the right context in + rateControlEnd. 
+ [782e2d041d3f] + +2013-10-25 Steve Borho + + * source/input/yuv.cpp: + Merge with stable + [333c7a8d1f49] + + * source/input/yuv.cpp: + yuv: zero ifs pointer if deleted + [3d733e4c52c7] + +2013-10-25 Nabajit Deka + + * source/common/ipfilter.cpp, source/common/x86/ipfilter8.h: + Function declarations and function pointers set up for the vertical + luma filter functions. + [2adbf0c4c4c0] + +2013-10-25 Gopu Govindaswamy + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h: + tenccu : remove unused functions + [74f7e6d96d2d] + + * source/Lib/TLibCommon/NAL.h: + nal: remove unused helper functions + [85c064318253] + + * source/input/y4m.cpp, source/input/yuv.cpp: + input: initialize the input buffer (buf) pointer + [ba7d3d79c5a3] + + * source/input/yuv.cpp, source/input/yuv.h: + input: read yuv input from stdin if filename is passed as "-" + [9223c4aff1e0] + +2013-10-25 Nabajit Deka + + * source/common/primitives.h, source/test/ipfilterharness.cpp: + Test bench code for vertical luma filter. 
+ [4ca4da7bdd36] + +2013-10-25 Steve Borho + + * source/test/pixelharness.cpp: + testbench: differentiate the two weight functions + [159876e25c7a] + +2013-10-25 Yuvaraj Venkatesh + + * source/common/vec/pixel-sse41.cpp: + pixel: modified weightUnidirPixel to match the changes done in c + code + [298b17e548d4] + +2013-10-25 Steve Borho + + * source/encoder/reference.cpp: + reference: initialize weight buffer pointer + [4125c74ff21d] + +2013-10-25 Min Chen + + * source/common/x86/ipfilter8.asm: + more general on ipfilter macro FILTER_H8_W8 + [6d6fe9d208d1] + +2013-10-25 Steve Borho + + * source/CMakeLists.txt, source/compat/getopt/LGPL.txt, + source/compat/getopt/getopt.c, source/compat/getopt/getopt.h, + source/compat/msvc/LGPL.txt, source/compat/msvc/getopt.c, + source/compat/msvc/getopt.h: + cmake: segregate the getopt files into their own compat/ folder + [84aea900ea0d] + + * Merge with stable + [7145f423fd03] + + * source/CMakeLists.txt: + cmake: add compat/msvc to include path if using our getopt.h + [7cc9e1566162] + + * source/input/y4m.cpp, source/input/yuv.cpp, source/x265.cpp: + input: improve handling of frame count estimation failures + [88f69939bc17] + +2013-10-24 Steve Borho + + * source/input/input.h, source/input/y4m.cpp, source/input/y4m.h, + source/input/yuv.cpp, source/input/yuv.h, source/x265.cpp: + input: add explicit startReader() method to prevent file handle use + collisions + [6e4ef45441b7] + + * source/encoder/encoder.cpp: + encoder: fix per-frame logging without CSV file + [260bff6100f2] + + * source/input/y4m.cpp: + y4m: directly use member variables while parsing header + [0174ac7bc2b1] + + * source/output/yuv.cpp: + yuv: fix VC9 compile warning - possible loss of precision + [34c9951b15fa] + + * source/common/common.cpp: + Merge with stable + [f886f2ed1fcd] + + * source/CMakeLists.txt: + cmake: use system native stdint.h and getopt if they are found + [1ab80557656a] + + * source/common/vec/pixel-sse41.cpp: + pixel: sse_sp[LUMA_12x16] 
fails tests on clang, disable it + [17195e65e91b] + + * source/input/y4m.cpp, source/input/yuv.cpp: + input: add missing carriage returns from error log messages + [63ca0173f8cb] + + * source/input/y4m.cpp: + y4m: disable file reader thread on frame header errors + + Prevents deadlocks from malformed Y4M files + [776c2ec26a5f] + + * source/common/common.cpp: + common: hoist x265_mdate above using namespace x265 + + I do not understand why, but this fixes cli linkage on Mac + [4ac43db2d640] + + * source/encoder/reference.cpp, source/encoder/reference.h: + reference: more robust initialization, remove m_startPad member + variable + + m_startPad was only used in the init() function so it could be an + auto-var + [015839081c84] + + * source/Lib/TLibCommon/TComPicYuv.cpp, source/encoder/reference.cpp: + Merge with stable + [201468d75f33] + + * source/Lib/TLibEncoder/WeightPredAnalysis.cpp: + weightp: use source reference frames for weight analysis + + This fixes weightp when used in combination with frame parallelism, + where the reference's reconstructed picture is most likely not yet + avaialable. 
+ + Some measurements using the sintel 480p clip no-weightp -F1: 408.47s + (3.07 fps), 144.63 kb/s, Global PSNR: 48.956 no-weightp -F3: 361.14s + (3.47 fps), 144.01 kb/s, Global PSNR: 48.746 + + Prior to this change (recon refs used for weightp analysis): weightp + -F1: 402.84s (3.11 fps), 131.09 kb/s, Global PSNR: 49.908 x265 + [info]: 278 of 687 (40.47%) P frames weighted weightp -F3: 355.88s + (3.52 fps), 132.09 kb/s, Global PSNR: 49.768 x265 [info]: 242 of 687 + (35.23%) P frames weighted + + After this change (source refs used for weightp analysis): weightp + -F1: 404.83s (3.10 fps), 131.82 kb/s, Global PSNR: 49.414 x265 + [info]: 325 of 687 (47.31%) P frames weighted weightp -F3: 348.32s + (3.60 fps), 131.01 kb/s, Global PSNR: 49.957 x265 [info]: 325 of 687 + (47.31%) P frames weighted + + Because of the lower bitrate, enabling weightp actually makes this + clip encode faster. No idea why -F1 has so much less PSNR than -F3; + needs investigation. + [a44b48b74d6f] + + * source/Lib/TLibCommon/TComPicYuv.cpp, source/common/reference.cpp: + TComPicYuv: fix padding of picture buffers + + Ensure row starts are at a multiple of 32, this causes strides to be + multiple of 64. Before this patch, strides we capable of being + multiples of 8, causing alignment exceptions for some videos. 
+ [0315cf14deda] + + * source/Lib/TLibCommon/TComPicYuv.cpp: + TComPicYuv: remove unused include + [a54a9fa53063] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, source/encoder/frameencoder.cpp, + source/encoder/slicetype.cpp: + Merge with stable + [a349dec61168] + +2013-10-24 Min Chen + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h: + improvement xGetRateLast by remove reduce double operator + [4bb4dbe427ec] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibEncoder/TEncSbac.cpp: + improvement getSigCoeffGroupCtxInc by merge pointer calculate + [c986a9fc7f8f] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibEncoder/TEncSbac.cpp: + improvement TComTrQuant::calcPatternSigCtx + + 1. replace width/height by size, since there are only NxN 2. use + mask operatior to avoid condition and branch + [84bf706ace32] + + * source/Lib/TLibCommon/TComTrQuant.cpp: + improvement TComTrQuant::getSigCtxInc by lookup table for 'cnt' + [25d372f13fb6] + +2013-10-24 Deepthi Devaki + + * source/encoder/slicetype.cpp: + lookahead: fix reference initialization for intra prediction + + buffer size of pAbove/pLeft is height+1+width = 2*cusize+1 + [1aaa596bb20b] + +2013-10-24 Gopu Govindaswamy + + * source/input/y4m.cpp: + input: If Any error in reading frame from file, deactivate the + thread and exit encoder gracefully + [b69e4433cc97] + + * source/output/y4m.cpp, source/output/yuv.cpp: + output: use 64bit file offsets to prevent overflow with 4k video + [f94f18950283] + +2013-10-24 Steve Borho + + * source/encoder/encoder.cpp: + encoder: fix bitrate statistic (accBits is a bit count, not byte + count) + [98bf7e4154f5] + +2013-10-24 Shazeb Nawaz Khan + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, source/encoder/encoder.cpp, + source/encoder/encoder.h, source/encoder/frameencoder.cpp, + 
source/x265.h: + Displaying the number of weighted P frames used in console log + [e8992549a970] + +2013-10-24 Min Chen + + * source/Lib/TLibEncoder/TEncSbac.h: + fix commit typo + [eb694f6150b1] + +2013-10-24 Steve Borho + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel.h, + source/common/x86/sad-a.asm: + asm: instantiate some sad_x3 and sad_x4 functions for HEVC + partitions + [3b8fa23f68ec] + +2013-10-23 Steve Borho + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h: + TComSlice: remove unused copySliceInfo + [e8f05b1c543a] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h: + TComSlice: remove unused m_bEqualRef and helper functions + [d31740e6905d] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibEncoder/TEncSbac.cpp, source/encoder/dpb.cpp: + TComSlice: remove unused TComRefPicListModification + [855151a30078] + + * source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuv.h, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/common/CMakeLists.txt, + source/common/lowres.h, source/common/mv.h, + source/common/reference.cpp, source/common/reference.h, + source/encoder/CMakeLists.txt, source/encoder/dpb.cpp, + source/encoder/frameencoder.cpp, source/encoder/frameencoder.h, + source/encoder/reference.cpp, source/encoder/reference.h: + reference: cache MotionReference instances in each FrameEncoder + + This prevents these structures from being allocated over and over + for each frame The source files were moved into the encoder folder + where they've belonged but couldn't live in the past because + TComPicYuv needed to know their contents. 
+ [95384f8f7c22] + +2013-10-24 Steve Borho + + * source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/common/common.cpp, source/common/common.h, + source/common/primitives.cpp: + Merge with stable + [6ea79d6f7e17] + +2013-10-23 Steve Borho + + * source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuv.h, + source/Lib/TLibEncoder/TEncCfg.h, source/common/common.cpp, + source/common/common.h, source/common/primitives.cpp, + source/dllmain.cpp, source/encoder/encoder.cpp, + source/encoder/encoder.h, source/input/input.h, + source/input/y4m.cpp, source/input/y4m.h, source/input/yuv.cpp, + source/input/yuv.h, source/output/output.h, source/output/y4m.cpp, + source/output/y4m.h, source/output/yuv.cpp, source/output/yuv.h, + source/x265.cpp, source/x265.h: + api: drop _t suffix from public data types, for POSIX compatibility + + x265_t was changed to x265_encoder, since x265 is too short and + would collide with our namespace. + [b07c29e930fe] + + * source/x265.h: + api: white-space cleanups in x265.h + [7beeab25a8fb] + + * source/CMakeLists.txt, source/Lib/TLibCommon/CommonDef.h, + source/common/common.cpp, source/dllmain.cpp, + source/encoder/encoder.cpp, source/encoder/encoder.h, + source/x265.cpp, source/x265.def.in, source/x265.h: + api: large reorg of logging and statistics + + * move all CSV logging into the encoder so API users can take + advantage of it + * remove hacky global PSNR return value from x265_encoder_close + * add time and bitrate values to x265_stats_t + * remove some dead HM code + * use x265_log in the CLI + [3e53b004a8f8] + +2013-10-23 Kavitha Sampath + + * source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/common/common.cpp, source/common/common.h, + source/encoder/encoder.cpp, source/encoder/encoder.h, + source/encoder/frameencoder.cpp, source/encoder/frameencoder.h, + source/x265.cpp, source/x265.h: + CSV log: Enable frame-by-frame CSV logging + [0c8dbda94696] + +2013-10-22 Steve 
Borho + + * source/CMakeLists.txt, source/x265.pc.in: + cmake: generate and install pkgconfig file + + This installs the x265.pc into /usr/local/lib/pkgconfig/ with these + contents: + + prefix=/usr/local exec_prefix=${prefix} libdir=${exec_prefix}/lib + includedir=${prefix}/include + + Name: x265 Description: H.265/HEVC video encoder Version: 0.4.1 + Libs: -L${libdir} -lx265 Libs.private: -lstdc++ -lm -lc -lpthread + -lrt Cflags: -I${includedir} + + The pkg-config file is only generated and installed if CMake finds + pkg-config + + Q1: it seems unwise to hard-code "/lib" in a few places, what if + lib64 is used or something else? Q2: Should x265.pc have a version + number like x265-0.4.1.pc? + [b2fcb1bf7b75] + +2013-10-23 Min Chen + + * source/Lib/TLibCommon/ContextModel.cpp, + source/Lib/TLibCommon/ContextModel.h, + source/Lib/TLibCommon/ContextTables.h, + source/Lib/TLibEncoder/TEncBinCoderCABAC.cpp, + source/Lib/TLibEncoder/TEncBinCoderCABAC.h, + source/Lib/TLibEncoder/TEncEntropy.h, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSbac.h, source/common/CMakeLists.txt, + source/common/primitives.cpp: + cabac: cleanup and convert class ContextModel to struct + [0cb0692d6c69] + + * source/Lib/TLibCommon/ContextModel.cpp, + source/Lib/TLibCommon/ContextModel.h: + cabac: move static table to global space + [11a4ca818c57] + + * source/Lib/TLibCommon/ContextModel.h, + source/Lib/TLibEncoder/TEncBinCoderCABAC.cpp: + cabac: imprvement by merge context status update path + [b53d40e1c7cf] + +2013-10-23 Steve Borho + + * source/Lib/TLibCommon/TComRom.h, source/common/CMakeLists.txt, + source/encoder/slicetype.cpp: + Merge with stable + [8ebddbb7acd3] + + * source/common/CMakeLists.txt, source/common/vec/blockcopy-avx2.cpp, + source/common/vec/vec-primitives.cpp: + remove blockcopy-avx2.cpp - not worth the trouble to convert and + maintain + [020eb714b9cc] + +2013-10-23 Aarthi Thirumalai + + * source/encoder/slicetype.cpp: + slicetype: bug fix for 
estimated frame costs + + By the time rate control queries the estimated frame cost, lastNonB + might have moved. Use the slice's actual L0 reference. + [6a7383b8cbf0] + +2013-10-23 Steve Borho + + * source/common/vec/intra-ssse3.cpp: + intra: move 8x8 and 16x16 into their final order + + None of the 8bpp intra primitives use vector classes any more. + blockcopy-avx2 is the last 8bpp vector primitive. + [465e48ca0d15] + + * source/common/vec/intra-ssse3.cpp: + intra: remove unused macros + [d167d149b59e] + +2013-10-23 Min Chen + + * source/common/vec/intra-ssse3.cpp: + Fix memory write beyond bound bug in intraPredAng8x8() + [30be14fa79d6] + +2013-10-23 Jan Ekström + + * source/Lib/TLibCommon/TComRom.h, source/common/CMakeLists.txt: + Fix compilation with Visual Studio 2013 + + The header is needed for std::min and friends, and the + CMakeLists.txt modification makes the version check match what is + used in the source code. + + Unfortunately cmake does not seem to contain a GREATER_OR_EQUAL + macro, so NOT(LESS) has to be used. 
+ [4922bf148182] + +2013-10-23 Yuvaraj Venkatesh + + * source/common/vec/intra-ssse3.cpp: + intra: converted intraPredAng16x16 vector class function to intrinsic + [521ecea592fa] + +2013-10-23 Dnyaneshwar Gorade + + * source/common/vec/intra-ssse3.cpp: + intra-ssse3: intra angular 8x8 vector to intrinsic + [aaee6da4f02c] + +2013-10-23 Steve Borho + + * source/common/vec/pixel-avx2.cpp: + pixel: remove vector class include from pixel-avx2.cpp + [51a48f878f39] + +2013-10-23 Yuvaraj Venkatesh + + * source/common/vec/pixel-avx2.cpp: + pixel: converted sad_avx2_x4_64 vector class to intrinsic + [435665d9233e] + + * source/common/vec/pixel-avx2.cpp: + pixel: converted sad_avx2_x4_32 vector class to intrinsic + [2d4ee19a40b4] + +2013-10-22 Aarthi Thirumalai + + * source/encoder/slicetype.cpp: + slicetype: calculate weighted frame costs for Aq + [6d96d64c4e9a] + +2013-10-22 Yuvaraj Venkatesh + + * source/common/vec/pixel-avx2.cpp: + pixel: converted some sad_avx2 vector class functions to intrinsic + [0bb1d7221938] + +2013-10-22 Steve Borho + + * source/common/CMakeLists.txt, source/common/vec/intra-sse3.cpp, + source/common/vec/intra-ssse3.cpp, source/common/vec/vec- + primitives.cpp: + intra: rename intra-sse3.cpp to intra-ssse3.cpp + [20bf892451db] + + * source/common/vec/intra-sse3.cpp: + intra: move 32x32 angular prediction function into non-vector + portion + [41aa6dd645ed] + +2013-10-22 Dnyaneshwar Gorade + + * source/common/vec/intra-sse3.cpp: + intra-sse3.cpp: replace intraPredAng32x32 vector class function with + intrinsic + [9827c0129014] + + * source/common/vec/intra-sse3.cpp: + intra-sse3.cpp: replace intraPredAng8x8 vector class function with + intrinsic + [de32b76c391d] + +2013-10-22 Steve Borho + + * source/Lib/TLibEncoder/TEncBinCoder.h, + source/Lib/TLibEncoder/TEncCavlc.cpp, + source/Lib/TLibEncoder/TEncCavlc.h, source/common/common.cpp, + source/encoder/ratecontrol.cpp: + Merge with stable + [88de242f7530] + +2013-10-22 Min Chen + + * 
source/Lib/TLibEncoder/TEncBinCoderCABAC.cpp: + cabac: improvement performance by use negative cabac counter + [88b3831ab799] + + * source/Lib/TLibEncoder/TEncBinCoderCABAC.cpp, + source/Lib/TLibEncoder/TEncBinCoderCABAC.h, + source/Lib/TLibEncoder/TEncCavlc.cpp, + source/Lib/TLibEncoder/TEncCavlc.h, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSbac.h, source/encoder/CMakeLists.txt, + source/encoder/frameencoder.cpp, source/encoder/frameencoder.h: + cleanup: merge header operator from TEncCavlc into class TEncSbac + [4ec21109440b] + + * source/Lib/TLibEncoder/TEncBinCoder.h, + source/Lib/TLibEncoder/TEncBinCoderCABAC.cpp, + source/Lib/TLibEncoder/TEncBinCoderCABAC.h, + source/Lib/TLibEncoder/TEncSbac.h, source/encoder/CMakeLists.txt, + source/encoder/frameencoder.cpp: + cleanup:remove unused base class TEncBinIf + [b6427fa01195] + + * source/Lib/TLibEncoder/TEncBinCoderCABAC.cpp: + cleanup: remove unused getNumberOfWrittenBits() from + TEncBinCABAC::getNumWrittenBits() + [af1695e1808c] + + * source/common/x86/ipfilter8.asm: + remove reduce register copy in FILTER_H4_w2_2 and FILTER_H4_w4_2 + (update for linux build error) + [f1045bead3b5] + +2013-10-22 Steve Borho + + * source/encoder/ratecontrol.cpp: + ratecontrol: consistent comment style + [27265ca6dd90] + +2013-10-22 Gopu Govindaswamy + + * source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuv.h, source/common/common.cpp, + source/encoder/encoder.cpp: + encoder: auto-padding to min CU size and set conformance window + [27a149b2062c] + +2013-10-22 Shazeb Nawaz Khan + + * source/common/pixel.cpp: + Eliminating decreament in pointer index in weightp primitives + + could have been a source of possible crash + [49849de33234] + +2013-10-22 Steve Borho + + * source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h: + ratecontrol: use cfg->param.frameNumThreads directly; nit cleanups + [cd65a3311df5] + +2013-10-22 Aarthi Thirumalai + + * source/encoder/ratecontrol.cpp, 
source/encoder/ratecontrol.h: + rc: Bug fixes for ABR. + + remove uninitialized local variables and use values from + cfg.param.rc as required. + [e2dc3ec294a8] + +2013-10-22 Deepthi Nandakumar + + * source/Lib/TLibEncoder/TEncCu.cpp: + TEncCu: nits + [f1bdacac6497] + +2013-10-22 Aarthi Thirumalai + + * source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h: + rc: Bug fixes for ABR. + + remove uninitialized local variables and use values from + cfg.param.rc as required. + [f8ab02273bdb] + +2013-10-21 Steve Borho + + * source/common/common.cpp: + Merge with stable + [9245a882ccee] + + * source/common/common.cpp: + common: add --ref to the tool list so it is visible in the log + [40eb6be35caa] + + * source/common/lowres.cpp: + lowres: initialize satdCost to -1 + [606cdb8d05ef] + +2013-10-21 Aarthi Thirumalai + + * source/common/lowres.cpp, source/common/lowres.h, + source/encoder/slicetype.cpp, source/encoder/slicetype.h: + lowres: Add states to store weighted Aq costs per frame. + [ee5e9caa0ff9] + + * source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/common/common.cpp, source/common/common.h: + add methods to convert qpAqoffsets to qscale + [afa3a6660764] + + * source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/common/lowres.cpp, source/common/lowres.h, + source/encoder/ratecontrol.cpp: + lowres: move m_qpAqOffset from TComPic to Lowres + [588c8fcc1df6] + +2013-10-21 Steve Borho + + * doc/README_data-structure.ppt, doc/astyle/AStyle.exe, doc/astyle + /apply-to-all-source.py, doc/astyle/astyle-config.txt, doc/astyle + /drag-astyle.bat, doc/intra/T16.TXT, doc/intra/T32.TXT, + doc/intra/T4.TXT, doc/intra/T8.TXT, doc/software-manual.pdf, + doc/uncrustify/uncrustify.bat, doc/uncrustify/uncrustify.exe, + source/Lib/TLibCommon/AccessUnit.h, + source/Lib/TLibCommon/ContextModel3DBuffer.cpp, + source/Lib/TLibCommon/ContextModel3DBuffer.h, + source/Lib/TLibCommon/TComList.h, + 
source/Lib/TLibEncoder/TEncBinCoderCABACCounter.cpp, + source/Lib/TLibEncoder/TEncBinCoderCABACCounter.h, + source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h, + source/VectorClass/instrset_detect.cpp, + source/cmake/mergestaticlibs.cmake, + source/common/vec/CMakeLists.txt, source/common/vec/avx.cpp, + source/common/vec/avx2.cpp, source/common/vec/blockcopy-avx.cpp, + source/common/vec/blockcopy-sse41.cpp, source/common/vec/blockcopy- + ssse3.cpp, source/common/vec/blockcopy-xop.cpp, + source/common/vec/blockcopy.inc, source/common/vec/dct-avx.cpp, + source/common/vec/dct-avx2.cpp, source/common/vec/dct-xop.cpp, + source/common/vec/dct.inc, source/common/vec/intra-avx.cpp, + source/common/vec/intra-avx2.cpp, source/common/vec/intra-ssse3.cpp, + source/common/vec/intra-xop.cpp, source/common/vec/intrapred.inc, + source/common/vec/ipfilter-avx.cpp, source/common/vec/ipfilter- + avx2.cpp, source/common/vec/ipfilter-sse3.cpp, source/common/vec + /ipfilter-xop.cpp, source/common/vec/ipfilter.inc, + source/common/vec/ipfilter16.inc, source/common/vec/ipfilter8.inc, + source/common/vec/pixel-avx.cpp, source/common/vec/pixel-ssse3.cpp, + source/common/vec/pixel-xop.cpp, source/common/vec/pixel.inc, + source/common/vec/pixel16.inc, source/common/vec/pixel8.inc, + source/common/vec/sse.inc, source/common/vec/sse3.cpp, + source/common/vec/sse41.cpp, source/common/vec/ssse3.cpp, + source/common/vec/utils.h, source/common/vec/vecprimitives.inc, + source/common/vec/xop.cpp, source/common/x86/CMakeLists.txt, + source/test/unittest.cpp, source/test/unittest.h, source/x265opts.h: + Merge with default (feature freeze for 0.5) + [f2fcda06d76a] + +2013-10-21 Min Chen + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSbac.h, + source/Lib/TLibEncoder/TEncSearch.cpp: + merge multiple encodeBinEP to encodeBinsEP + [6817f34b0572] + +2013-10-21 Steve Borho + + * 
source/encoder/compress.cpp: + compress: remove a pile of unnecessary intra estimation code + [dd26e0a9c3df] + + * source/Lib/TLibEncoder/TEncCu.h, source/encoder/compress.cpp: + compress: remove pointer reference from xComputeCostIntraInInter and + simplify + [c1ee22ece6f5] + + * source/encoder/compress.cpp: + compress: replace magic numbers with proper enums + [505b1c488b39] + +2013-10-21 Dnyaneshwar Gorade + + * source/common/vec/intra-sse3.cpp: + intra-sse3.cpp: replace predIntraAng8_32 vector class function with + intrinsic + [e24a4bb6de84] + +2013-10-21 Sumalatha Polureddy + + * source/encoder/compress.cpp: + no-rdo: compute Luma only in MC for cost estimation for mode + decision + + luma alone is computed in MC since luma cost alone is used for mode + decision. After best mode is selected, before encoding, chroma MC is + done + [16a5fc504005] + +2013-10-21 Shazeb Nawaz Khan + + * source/common/pixel.cpp, source/common/vec/pixel-sse41.cpp: + Adapting weightp primitive for pixel input + + By simulating shift & round as in convertPixelToShort primitive. 
The + SSE4.1 intrinsic primitives are disabled because they no longer + match + [e719f0de8d9c] + +2013-10-21 Steve Borho + + * source/common/ipfilter.cpp, source/common/vec/ipfilter-sse41.cpp, + source/common/vec/ipfilter-ssse3.cpp: + ipfilter: cleanup C and intrinsic functions + [90dde3b44ada] + +2013-10-21 Praveen Tiwari + + * source/common/ipfilter.cpp: + added C code for chroma filter_vpp function + [2e0076f3f694] + + * source/common/x86/ipfilter8.h: + created function declerations for chroma_vpp + [9477665dd935] + + * source/test/ipfilterharness.cpp: + ipfilterharness.cpp, modified chroma filter_hpp unit test code to + support filter_vpp + [60ade24dbf9b] + + * source/test/ipfilterharness.cpp: + ipfilterharness.cpp, added measure speed code for ipfilter_vpp + function + [091fb24735da] + + * source/test/ipfilterharness.cpp: + ipfilterharness.cpp, added test correctness code for ipfilter_vpp + function + [e42826e06e06] + + * source/common/primitives.h: + added array of function pointers for chroma ipfilter_vpp function + [e3b4b6b778a5] + +2013-10-21 Gopu Govindaswamy + + * source/Lib/TLibCommon/TComPic.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibEncoder/TEncCavlc.cpp, + source/Lib/TLibEncoder/TEncCfg.h, source/encoder/encoder.cpp: + tcomslice : removed set and get window*offset() methods + [ee5ef0f3fd3d] + + * source/Lib/TLibEncoder/TEncCfg.h, source/encoder/encoder.cpp, + source/encoder/framefilter.cpp: + tenccfg : removed unused setpad() and getpad() methods + [caa8db6b7986] + +2013-10-21 Steve Borho + + * source/x265.cpp: + cli: reintroduce -w short option for weightp + [c4cc469e5286] + +2013-10-21 Deepthi Devaki + + * source/encoder/slicetype.cpp: + Lookahead: wavefront bugfix. + + conditionally assign estimated cost to avoid overwriting intra cost. 
+ [92d13feba8fe] + +2013-10-21 =?utf-8?b?UmFmYcOrbCBDYXJyw6kgPGZ1bm1hbkB2aWRlb2xhbi5vcmc+?= <=?utf-8?b?UmFmYcOrbCBDYXJyw6kgPGZ1bm1hbkB2aWRlb2xhbi5vcmc+?=> + + * source/encoder/ratecontrol.cpp: + [x265] ratecontrol: initialize frameThreads Fix a floating point + exceptio + + --- source/encoder/ratecontrol.cpp | 1 + 1 file changed, 1 + insertion(+) + [3fe9a9d0a0b6] + +2013-10-21 Deepthi Nandakumar + + * source/common/lowres.cpp: + lowres: right and bottom margins are being extended twice? + [f987c24c7bf2] + +2013-10-21 Sumalatha Polureddy + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComWeightPrediction.cpp: + bug fix for passing bLuma and bChroma as separate arguments + + if there are default arguments other than bLuma and bChroma, those + default values (bRound, refIdx) are not passed during the function + call(addWeightBi(), xWeightedPredictionUni()) + [b2aa2aad2c66] + +2013-10-21 Deepthi Nandakumar + + * source/common/vec/intra-sse3.cpp: + intra-sse: Fix for HIGH_BIT_DEPTH build error + [8fc308449916] + +2013-10-21 Min Chen + + * source/Lib/TLibCommon/ContextModel3DBuffer.cpp, + source/Lib/TLibCommon/ContextModel3DBuffer.h, + source/Lib/TLibCommon/ContextTables.h, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSbac.h, source/common/CMakeLists.txt: + cabac: cleanup array of cabac context + [15588437fc24] + +2013-10-19 Min Chen + + * source/common/x86/ipfilter8.asm: + remove reduce register copy in FILTER_H4_w2_2 and FILTER_H4_w4_2 + [fabb25ae4db4] + +2013-10-20 Steve Borho + + * source/common/vec/intra-sse3.cpp, source/common/vec/intra-sse41.cpp: + intra: move intra_pred_dc to intra-sse41.cpp; it uses SSSE3 + instructions + + We don't have an intra-ssse3.cpp and it seems a waste to create one + just for this one function. 
+ [7ec69cb067fd] + +2013-10-18 Steve Borho + + * source/common/wavefront.h, source/encoder/encoder.cpp, + source/encoder/slicetype.cpp, source/encoder/slicetype.h: + Lookahead: implement wavefront parallel processing + [c96f97cf3914] + +2013-10-18 Deepthi Devaki + + * source/common/wavefront.cpp, source/common/wavefront.h: + WaveFront: add new function to enable all rows + [dd45e55248c8] + +2013-10-18 Steve Borho + + * source/test/ipfilterharness.cpp: + ipfilterharness: simplify filter names + [4066e6e725ee] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: add x265_emms() after use of pixelavg_pp and satd + primitives + [1fa93e1f4caa] + +2013-10-18 Praveen Tiwari + + * source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter8.asm: + asm code for luma filter functions + [0d146f05d561] + + * source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter8.asm: + added 24x32 partion size asm code to chroma function + [a301f749b0bc] + + * source/common/x86/asm-primitives.cpp, + source/test/ipfilterharness.cpp: + asm: corrected luma enum variable, testbench fix + [8b507771e6b0] + + * source/common/ipfilter.cpp: + ipfilter.cpp, added code to support luma coefficients too + [4bcfc0e23935] + +2013-10-18 Steve Borho + + * source/common/vec/intra-sse3.cpp: + intra: isolate last remaining vector class functions (angular intra + 8, 16, 32) + [8de380c7bd41] + + * source/common/vec/intra-sse3.cpp: + intra: sane function names and typedefs + [1959dbe1b643] + + * source/common/vec/intra-sse3.cpp: + intra: nits + [6a453beeea88] + + * source/common/vec/intra-sse3.cpp: + intra: remove SSE3 planar intrinsic functions; they are redundant + [edf6eb8da4ca] + +2013-10-18 Yuvaraj Venkatesh + + * source/common/vec/intra-sse3.cpp: + intra: replace predDCFiltering vector class function with intrinsic + [140f90417702] + + * source/common/vec/intra-sse3.cpp: + intra: replace intra_pred_dc vector class function with intrinsic + [d24283fe5e31] + +2013-10-18 Aarthi 
Thirumalai + + * source/encoder/ratecontrol.cpp: + rc : removed warning , moved strength to acEnergyCu + [089b29b4da2a] + +2013-10-18 Steve Borho + + * source/common/x86/asm-primitives.cpp: + asm: disable remaining pixelavg primitives, they fail against our C + ref + [904ff6d6e5d9] + + * source/test/pixelharness.cpp: + pixelharness: fix iteration through partition enums + [7e95be5f70bc] + +2013-10-18 Dnyaneshwar Gorade + + * source/test/pixelharness.cpp, source/test/pixelharness.h: + added pixelavg_pp function to testbench + [fdd1262059ad] + + * source/common/vec/blockcopy-sse3.cpp: + blockcopy-sse3.cpp: removed unnecessary variable. + [9ff06eb3bc4d] + +2013-10-18 Steve Borho + + * source/common/CMakeLists.txt: + cmake: msvc yasm dependency fix + [357a6d0c305d] + +2013-10-18 Dnyaneshwar Gorade + + * source/test/pixelharness.cpp, source/test/pixelharness.h: + added cvt32to16_shr_sse2 function to testbench. + + Speed up measured is almost 14x. + [f3523973eafb] + + * source/common/vec/blockcopy-sse3.cpp: + blockcopy-sse3.cpp: removed warning: overflow in implicit constant + conversion. + [48afd41e0753] + + * source/common/vec/intra-sse3.cpp: + intra-sse3.cpp: replace xPredIntraAng4x4 vector class function with + intrinsic. 
+ [c1e53b796ef4] + +2013-10-18 Steve Borho + + * source/common/vec/intra-sse3.cpp: + intra: remove unused variable + [d6d7187c5f4e] + + * source/common/x86/asm-primitives.cpp: + asm: fix 32bit build following partition enum carnage + [27dfc522397e] + +2013-10-17 Min Chen + + * source/common/CMakeLists.txt, source/common/vec/pixel-sse3.cpp, + source/common/x86/asm-primitives.cpp, source/common/x86/pixel- + util.asm, source/common/x86/pixel.h: + asm: add cvt32to16_shr_sse2, remove intrinsic primitive + [84857e7ba3e1] + +2013-10-17 Steve Borho + + * source/common/common.h: + common: force float and double arguments to logf() and log(), + respectively + + Fixes warnings on some MSVC versions + [d61e2ff59c29] + + * source/common/vec/intra-sse3.cpp: + intra: fix GCC warning about potentially uninitialized sum variable + [61abe115acfc] + + * source/common/vec/intra-sse3.cpp, source/test/intrapredharness.cpp: + intra: segregate 8bpp from 16bpp functions, drop 16bpp angular, drop + 64x64 + + The HIGH_BIT_DEPTH angular function was just a copy of the C + reference, we do not need 64x64 blocks any more + [5ab2da8320f5] + + * source/common/vec/intra-sse3.cpp: + intra: remove unused argument to predDCFiltering(), remove static + + There's no need to declare the function static, it is within an + anonymous namespace + [d05cf1a4d3a5] + + * source/common/vec/pixel-avx2.cpp, + source/common/vec/pixel16-sse41.cpp, source/test/testbench.cpp, + source/test/testharness.h: + pixel: fix avx2, 16bpp, and testbench following luma enum reorg + [39ceb9570c5d] + +2013-10-17 Praveen Tiwari + + * source/test/ipfilterharness.cpp: + removed unnecessary calculation from chroma REPORT_SPEEDUP function + [b42f1963229b] + + * source/common/x86/asm-primitives.cpp, source/common/x86/ipfilter8.h: + asm: fundef creation and function pointer table setup for luma asm + primitives + [165d27a37689] + + * source/common/ipfilter.cpp: + ipfilter: setup luma function pointers + [2ecc6883d465] + + * 
source/test/ipfilterharness.cpp, source/test/ipfilterharness.h: + unit test code for luma filter + [2b312edc7d7d] + +2013-10-17 Steve Borho + + * source/common/primitives.h: + primitives: define luma_hpp block interpolation function pointers + [8f0f4bb9825e] + + * source/common/CMakeLists.txt, source/common/ipfilter.cpp, + source/common/pixel.cpp, source/common/primitives.cpp, + source/common/primitives.h, source/common/vec/pixel-sse41.cpp, + source/common/x86/asm-primitives.cpp, source/common/x86/ipfilter8.h, + source/common/x86/mc.h, source/common/x86/pixel-a.asm, + source/common/x86/pixel.h, source/encoder/framefilter.cpp, + source/encoder/motion.cpp, source/encoder/ratecontrol.cpp, + source/encoder/slicetype.cpp, source/test/ipfilterharness.cpp, + source/test/pixelharness.cpp, source/test/testharness.h: + primitives: cleanup luma partition enums and primitive + initialization + + Don't define any enums for partitions which are not used by HEVC. + Line up chroma enums to match luma enums. Stop instantiating C + primitives that are never used. Shorten up the partition enum names. + Prune unused SSD assembly routines, move ASM funcdefs into headers + [2eb3f19bb34a] + +2013-10-17 Yuvaraj Venkatesh + + * source/common/vec/pixel16-sse41.cpp: + pixel16: converted sad_4 from vector class to intrinsic + [f5cdcb7cdaca] + +2013-10-17 Aarthi Thirumalai + + * source/encoder/encoder.cpp, source/encoder/ratecontrol.cpp, + source/encoder/ratecontrol.h: + rc: implement Adaptive Quantization. 
+ + added functions to compute AC Energy per CU for all planes, + calculate qpAqOffset for each CU + [c49db12611a2] + + * source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/encoder/encoder.cpp: + TComPic: add m_qpAqOffset to store qp offsets per CU when Aq is + enabled + [e2333fe80c56] + +2013-10-17 Steve Borho + + * source/common/CMakeLists.txt: + cmake: fix assembly dependency path + [a09583956501] + +2013-10-17 Gopu Govindaswamy + + * source/common/common.cpp, source/encoder/encoder.cpp, + source/x265.cpp, source/x265.h: + cli: rename bRDLevel to rdLevel + [ffb8df2f3778] + + * source/common/common.cpp, source/common/common.h: + common : Added new function x265_param2string + [09bb631e4253] + +2013-10-17 Min Chen + + * source/Lib/TLibCommon/TComTrQuant.cpp, source/common/pixel.cpp, + source/common/primitives.h, source/common/vec/pixel-sse3.cpp: + improvement cvt32to16_shr by merge width and height loop + [f6226cef13f1] + +2013-10-17 Yuvaraj Venkatesh + + * source/common/vec/pixel-sse3.cpp: + pixel: fix the hash mismatch due to convert32to16_shr + [a31b03ff2cff] + +2013-10-17 Min Chen + + * source/common/x86/pixel-a.asm: + x265_pixel_ssd_4x4_ssse3 miss EMMS + [2a3af4fe8e5c] + +2013-10-17 Praveen Tiwari + + * source/test/ipfilterharness.cpp: + removed unnecessary calculation form rand_srcStride + [dfae391107c3] + + * source/test/ipfilterharness.cpp: + added genration of random stride in chroma unit test code + [fc9dbd798ac3] + + * source/common/x86/ipfilter8.asm: + fixed output mismatch problem with chroma 2xN block + [b3852d6908a5] + +2013-10-17 Steve Borho + + * source/common/vec/intra-sse3.cpp: + intra: remove dead tables, fix comment typos, and other white-space + issues + [2fbeab18f182] + + * source/encoder/slicetype.cpp: + slicetype: fix pre-calculation of slice cost for ABR + [ce1116e9def7] + + * source/common/vec/intra-sse3.cpp: + intra: add parens to macros to fix auto-alignment + [d05787e07b21] + +2013-10-16 Steve Borho + + * 
doc/uncrustify/codingstyle.cfg, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComTrQuant.cpp: + uncrustify: another brace-style tweak + [ebec58e22380] + + * source/Lib/TLibCommon/ContextModel.h, + source/Lib/TLibCommon/ContextModel3DBuffer.h, + source/Lib/TLibCommon/ContextTables.h, + source/Lib/TLibCommon/TComBitCounter.h, + source/Lib/TLibCommon/TComBitStream.cpp, + source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComPic.h, source/Lib/TLibCommon/TComPicSym.h, + source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuv.h, + source/Lib/TLibCommon/TComPicYuvMD5.cpp, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibCommon/TComRom.cpp, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.h, + source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/Lib/TLibCommon/TComYuv.cpp, + source/Lib/TLibEncoder/NALwrite.cpp, + source/Lib/TLibEncoder/NALwrite.h, + source/Lib/TLibEncoder/SyntaxElementWriter.cpp, + source/Lib/TLibEncoder/TEncAnalyze.h, + source/Lib/TLibEncoder/TEncBinCoderCABAC.h, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.h, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSbac.h, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/TShortYUV.cpp, + source/common/common.cpp, source/common/common.h, + source/common/cpu.h, source/common/ipfilter.cpp, + source/common/lowres.cpp, source/common/lowres.h, + source/common/piclist.cpp, source/common/piclist.h, + source/common/pixel.cpp, source/common/primitives.cpp, + source/common/reference.cpp, 
source/common/threading.cpp, + source/common/threading.h, source/common/vec/blockcopy-avx2.cpp, + source/common/vec/blockcopy-sse3.cpp, source/common/vec/dct- + sse3.cpp, source/common/vec/dct-sse41.cpp, source/common/vec/intra- + sse3.cpp, source/common/vec/intra-sse41.cpp, source/common/vec + /ipfilter-sse41.cpp, source/common/vec/ipfilter-ssse3.cpp, + source/common/vec/pixel-avx2.cpp, source/common/vec/pixel-sse3.cpp, + source/common/vec/pixel-sse41.cpp, source/common/vec/pixel- + ssse3.cpp, source/common/vec/pixel16-sse41.cpp, source/common/vec + /vec-primitives.cpp, source/common/x86/asm-primitives.cpp, + source/common/x86/pixel.h, source/dllmain.cpp, + source/encoder/compress.cpp, source/encoder/cturow.h, + source/encoder/dpb.cpp, source/encoder/encoder.cpp, + source/encoder/frameencoder.cpp, source/encoder/framefilter.cpp, + source/encoder/motion.cpp, source/encoder/motion.h, + source/encoder/ratecontrol.cpp, source/encoder/slicetype.cpp, + source/input/y4m.cpp, source/input/y4m.h, source/input/yuv.cpp, + source/input/yuv.h, source/test/intrapredharness.cpp, + source/test/ipfilterharness.cpp, source/test/pixelharness.cpp, + source/test/pixelharness.h, source/test/testbench.cpp, + source/test/testharness.h, source/x265.cpp, source/x265.h: + uncrustify entire source tree + [1d6b3626f1b3] + + * doc/uncrustify/apply-to-all-source.py: + uncrustify: allow uncrustify to process .cpp and .h in common/x86 + [03be69eab3db] + + * doc/uncrustify/codingstyle.cfg: + uncrustify: do not remove whitespace in an empty brace section {} + + (just ignore it) + [6405dca03059] + +2013-10-14 Steve Borho + + * source/CMakeLists.txt, source/compat/msvc/LGPL.txt, + source/compat/msvc/getopt.c, source/compat/msvc/getopt.h: + getopt: grab an unambiguously LGPL version of getopt + + Found at: https://github.com/Tietew/mediawiki- + xml2sql/tree/master/getopt + + The only change made was to move getopt_long() into getopt.c from + getopt1.c + + # HG changeset patch # User Steve Borho # Date 
+ 1381776922 18000 # Mon Oct 14 13:55:22 2013 -0500 # Node ID + b6cca1b1a9b700a8ffc316f3186dbf10bc1149cc # Parent + abae6903e0af0d9940bb734ba34dff6928d72e61 getopt: grab an + unambiguously LGPL version of getopt + + Found at: https://github.com/Tietew/mediawiki- + xml2sql/tree/master/getopt + + The only change made was to move getopt_long() into getopt.c from + getopt1.c + [60a105fed8c8] + +2013-10-16 Steve Borho + + * source/common/vec/pixel-sse3.cpp: + pixel: disable intrinsic cvt32to16_shr; it is causing hash + mismatches + [258394d8ab91] + + * source/common/ipfilter.cpp: + ipfilter: gcc preprocessor does not allow ## use for non-symbols + + The ## in this case was totally unnecessary + [63a63e668fb7] + + * source/x265.cpp: + cli: add missing --cpuid handler + [af92ca11aa13] + + * source/test/ipfilterharness.cpp: + test: revert accidental change to 64 dims so testbench passes + [16ae0ba47935] + + * source/common/vec/dct-sse41.cpp: + dct: remove SSE41 dct 8x8, 16x16 and 32x32 intrinsic primitives + + These were adapted from vector class functions but were much slower + than the hand-tuned dct SSSE3 functions written by Min. + [6c3aa856dc65] + + * source/common/primitives.h, source/common/vec/vec-primitives.cpp, + source/common/x86/asm-primitives.cpp: + primitives: do not include the public API header in primitives.h + + This was necessary in the past for the CPU level enums but those are + now gone. Only vec-primitives.cpp and asm-primitives.cpp need the + CPU capability defines and they can include x265.h themselves. 
+ [5e269e353bc6] + +2013-10-16 Praveen Tiwari + + * source/test/ipfilterharness.cpp: + check_IPFilterChroma_primitive, stride made equal to min width 2 + + short-term workaround for 2XN blocks + [89a299c198d0] + +2013-10-16 Steve Borho + + * source/x265.cpp: + cli: tighten up command line help, remove redundancies and save + whitespace + [f7d1914d1f61] + + * source/x265.cpp: + cli: pull CLIOptions methods out of struct definition (cleanup) + + Remove i_ hungarian notation from variables + [5c258896b571] + + * build/vc10-x86/make-solutions.bat, build/vc10-x86_64/make- + solutions.bat, build/vc11-x86/make-solutions.bat, build/vc11-x86_64 + /make-solutions.bat, build/vc9-x86/make-solutions.bat, + build/vc9-x86_64/make-solutions.bat: + build: wtf is a Visual Studion? + [798f12a3ca4c] + +2013-10-16 Sumalatha Polureddy + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/Lib/TLibCommon/TComWeightPrediction.h, + source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TComYuv.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + pass bLuma and bChroma parameters for functions + + Based on the values of two parameters, luma and chroma operations + are done. 
Default value for both is set as true todo both operations + [872e98c8bc7b] + +2013-10-16 Gopu Govindaswamy + + * source/CMakeLists.txt, source/x265opts.h: + cli: remove unused x265opts.h + [9317198e26e2] + + * source/x265.cpp: + cli: long_options structure initialization without using x265opts.h + [1cf1fe777d14] + + * source/x265.cpp: + cli: implemented do_help() without using x265opts.h + [9347808f2f9b] + + * source/common/common.cpp, source/x265.cpp, source/x265.def.in, + source/x265.h: + api: add x265_param_parse() function based on x264_param_parse() + [69d8796e57e4] + +2013-10-16 Deepthi Devaki + + * source/Lib/TLibEncoder/TEncSearch.cpp: + bidir: fix for hash mismatch with B-frames + + Use seperate variables for mvp and mvpidx used for zero mv + candidates. Also copy the corresponding AMVPinfo for each + reflist/refIdx. + [c415f32219fe] + +2013-10-16 Praveen Tiwari + + * source/common/x86/asm-primitives.cpp, + source/common/x86/ipfilter8.asm: + asm: update chroma interpolation primitives + [bc1399dbc2ed] + + * source/common/ipfilter.cpp, source/test/ipfilterharness.cpp, + source/test/ipfilterharness.h: + primitive: added C primitive and unit test code for one chroma + filter + [762ca3c4b6f2] + + * source/common/primitives.h: + primitive: chroma partition enums by full dimension and function + pointers + [08ecee513efc] + +2013-10-16 Sumalatha Polureddy + + * source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp: + no-rdo: remove unused checks + [d45ce25752cf] + +2013-10-16 Dnyaneshwar Gorade + + * source/common/vec/intra-sse3.cpp: + intra-sse3.cpp: Remove unused vector class macros + [ec5a816c4ef3] + +2013-10-16 Min Chen + + * source/Lib/TLibEncoder/TEncCavlc.h, + source/Lib/TLibEncoder/TEncEntropy.h, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSbac.h: + cleanup: remove unused updateContextTables() + [ad081b5f340f] + +2013-10-16 Kavitha Sampath + + * source/input/yuv.cpp, source/input/yuv.h: + yuv: make file 
reading threaded + [18c935330e9e] + +2013-10-16 Steve Borho + + * source/common/x86/asm-primitives.cpp: + asm: disable the use of x264 pixel weighting functions + + These are breaking lowres qpel generation + [4b1716b232e5] + +2013-10-16 Aarthi Thirumalai + + * source/encoder/frameencoder.cpp, source/encoder/frameencoder.h, + source/encoder/framefilter.cpp, source/encoder/framefilter.h: + move ssim calculation to frameFilters + [09c0e0209d84] + +2013-10-15 Steve Borho + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: remove unused static MV arrays + [a998daed8459] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: nits + [9bff70c75d32] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/dpb.cpp: + TComSlice: remove unused m_list1IdxToList0Idx and methods + [676788df0ab2] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: remove unused variables + [397589048016] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: rename refList -> list, refIdxTmp -> idx (more readable) + [53323636b8d3] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: simplify unidirectional search logic + + There was a lot of extra code to check if an L0 pic was in the L1 + list and was already searched. We do not copy our L0 pics into the + L1 list so this isn't an issue. 
+ [d8665f64a662] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: remove unnecessary memsets + [f2f61a2626ef] + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/encoder/frameencoder.cpp: + TEncSearch: hoist setSourcePlane() to be called just once per frame + [1fda6e4da927] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: move variables closer to their use + [b6db83cab831] + + * source/Lib/TLibCommon/ContextModel3DBuffer.cpp, + source/Lib/TLibCommon/ContextModel3DBuffer.h, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibEncoder/TEncBinCoderCABAC.cpp, + source/Lib/TLibEncoder/TEncCavlc.cpp, + source/Lib/TLibEncoder/TEncEntropy.cpp, + source/Lib/TLibEncoder/TEncEntropy.h, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSbac.h, source/encoder/framefilter.cpp: + global search and replace for common hungarian prefixed variables + [5ada776190c0] + + * source/Lib/TLibEncoder/WeightPredAnalysis.cpp, + source/Lib/TLibEncoder/WeightPredAnalysis.h: + WeightPredAnalysis: remove hungarian prefixes + [e0893637f5ac] + + * source/Lib/TLibEncoder/TEncCavlc.cpp, + source/Lib/TLibEncoder/TEncCavlc.h: + TEncCavlc: remove hungarian prefixes + [c5ac154bfb4d] + + * source/Lib/TLibEncoder/TEncBinCoderCABAC.cpp, + source/Lib/TLibEncoder/TEncBinCoderCABAC.h: + TEncBinCABAC: remove hungarian prefixes + [b7d09f879c51] + + * source/common/CMakeLists.txt: + cmake: fix gcc 16bpp build + [b04d75ceb182] + + * source/Lib/TLibCommon/TComPic.h, source/encoder/encoder.cpp, + source/x265.h: + api: pass presentation timestamp through the encoder + [7c8c591908ac] + +2013-10-15 Min Chen + + * source/Lib/TLibCommon/TComBitStream.cpp: + faster grow buffer size to reduce number of memcpy + [f66122e4565e] + + * source/Lib/TLibCommon/TComBitCounter.h, + source/Lib/TLibCommon/TComBitStream.cpp, + source/Lib/TLibCommon/TComBitStream.h, + source/Lib/TLibEncoder/TEncBinCoderCABAC.cpp: + cabac: writeByte() for faster CABAC 
output + [7bd38dc97fa1] + + * source/Lib/TLibCommon/ContextModel.h, + source/Lib/TLibCommon/ContextModel3DBuffer.cpp, + source/Lib/TLibCommon/ContextModel3DBuffer.h, + source/Lib/TLibCommon/ContextTables.h, + source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/TEncSbac.cpp: + cleanup:reduce ContextModel3DBuffer to 1D + [e4f130853d90] + +2013-10-15 Praveen Tiwari + + * source/common/primitives.h: + primitives: add chroma partition widths and interpolation function + def + [d011ba82c852] + +2013-10-15 Dnyaneshwar Gorade + + * source/common/vec/pixel-sse41.cpp: + pixel-sse41.cpp: Modified PROCESS_SSE_SS4x1 macro with faster + intrinsics + [352781943ca8] + +2013-10-15 Kavitha Sampath + + * source/input/input.h, source/input/y4m.cpp, source/input/y4m.h: + y4m: make file reading threaded + [dd78cc895f2d] + +2013-10-15 Santhoshini Sekar + + * source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/encoder/encoder.cpp, source/encoder/frameencoder.cpp, + source/encoder/framefilter.cpp, source/encoder/framefilter.h: + PSNR: row-by-row PSNR measurement with SSD accumulators in TComPic + [404528f1ed26] + +2013-10-15 Deepthi Nandakumar + + * source/common/vec/ipfilter-sse41.cpp: + ipfilter: Fix for 16bpp build + [1a85d8814346] + +2013-10-15 Steve Borho + + * source/common/vec/pixel-sse41.cpp: + pixel: fix 16bpp build + [bbe95ece093f] + +2013-10-15 sairam + + * source/encoder/motion.cpp: + Fixed the --me 4 cli option error + [cb83e2f93592] + +2013-10-15 Yuvaraj Venkatesh + + * source/common/vec/pixel-sse41.cpp: + pixel: cleared the bug in sse_sp8, through sse_sp64 + [8c8d5700d22b] + + * source/common/vec/pixel-sse41.cpp: + pixel: modified weightUnidir to clear the bug. + [07f03a3fa2b8] + + * source/common/vec/pixel-sse41.cpp: + pixel: cleared the bug in sse_sp4. 
+ [cc35cb2f55e8] + +2013-10-15 Steve Borho + + * source/common/CMakeLists.txt, source/common/vec/pixel-sse41.cpp, + source/common/vec/pixel16-sse41.cpp, source/common/vec/pixel16.inc: + cmake: give 16bpp vector sad primitives their own C++ file + [764c0e9984f0] + + * source/CMakeLists.txt: + cmake: do not query clang version, it is not used + [fa90c915a323] + +2013-10-14 Steve Borho + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: fix comment for TEncSearch::predInterSearch + [062c51758069] + + * source/CMakeLists.txt: + cmake: move X265_BUILD definition to near top of main CMakeLists.txt + + This is just to make it easier to find + [abae6903e0af] + + * source/common/CMakeLists.txt, source/common/common.cpp, + source/common/version.cpp: + version: move export variables into version.cpp + [eeffa630e770] + + * source/CMakeLists.txt: + cmake: allow MinGW to rename x265-static to x265 + + MinGW uses libx265.dll.a for the shim loader library, which does not + collide with libx265.a, so it is ok. Only MSVC wants to use x265.lib + for both + [28e2a3926c95] + + * source/common/CMakeLists.txt: + cmake: fix 32bit GCC compile + [b2c148b71db8] + + * source/CMakeLists.txt: + cmake: drop x265 folder suffix to archive install path + [5af31960c41d] + +2013-10-13 Steve Borho + + * source/common/common.cpp, source/common/common.h, + source/common/lowres.h, source/encoder/slicetype.h: + common: sanity check some lookahead settings + [eb0aa9c42bba] + +2013-09-10 Steve Borho + + * source/common/lowres.cpp, source/common/lowres.h, + source/encoder/slicetype.cpp, source/encoder/slicetype.h: + slicetype: fill in missing detail from slicetypeDecide() + + This moves slicetypeDecide() to make the file more readable. 
It also + adds stubs for features that we do not support yet including + weightp, B-pyramid, intra refresh, user-supplied slice types, etc + [9bdff3310321] + +2013-10-13 Steve Borho + + * source/common/CMakeLists.txt: + cpu: disable -Wnarrowing for cpu.cpp + + This C99 code from x264 causes warnings when compiled for C++ + [8011064113f7] + +2013-10-13 Min Chen + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibEncoder/TEncBinCoder.h, + source/Lib/TLibEncoder/TEncBinCoderCABAC.cpp, + source/Lib/TLibEncoder/TEncBinCoderCABAC.h, + source/Lib/TLibEncoder/TEncCu.cpp, source/encoder/compress.cpp, + source/encoder/cturow.cpp: + cabac: cleanup unused code + [1eeac78dbddb] + + * source/Lib/TLibEncoder/TEncBinCoderCABAC.cpp, + source/Lib/TLibEncoder/TEncBinCoderCABAC.h, + source/Lib/TLibEncoder/TEncBinCoderCABACCounter.cpp, + source/Lib/TLibEncoder/TEncBinCoderCABACCounter.h, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.h, + source/Lib/TLibEncoder/TEncSbac.h, source/encoder/CMakeLists.txt, + source/encoder/cturow.cpp, source/encoder/cturow.h, + source/encoder/framefilter.cpp, source/encoder/framefilter.h: + cabac: cleanup TEncBinCoderCABACCounter to reduce C++ feature + [6d351ec699e5] + +2013-10-12 Steve Borho + + * source/common/vec/vec-primitives.cpp: + vec: it seems clang has its own set of intrinsic generation bugs + + If clang is allowed to run the instrinsic functions it builds in + these two files it causes SEGVs + [db12b4bf3ffd] + + * source/common/primitives.cpp: + primitives: show capabilities used even when cpuid is specified + [4423fc3fcd08] + + * source/test/testbench.cpp: + testbench: repair --cpuid command line argument + [c1acbd493213] + + * source/common/vec/ipfilter-sse41.cpp: + ipfilter: move 16bpp primitives into a separate area of the file + + This isolates the remaining vector primitives from the intrinsic + primitives + [7196914eff8f] + 
+2013-10-10 Steve Borho + + * source/VectorClass/instrset.h, + source/VectorClass/instrset_detect.cpp, + source/common/CMakeLists.txt, source/common/cpu.cpp, + source/common/cpu.h, source/common/primitives.cpp, + source/common/primitives.h, source/common/vec/vec-primitives.cpp, + source/common/x86/asm-primitives.cpp, source/test/testbench.cpp, + source/x265.h: + asm: adopt x264 CPU detection and flags + [76260e1b472d] + +2013-10-12 Steve Borho + + * source/CMakeLists.txt, source/common/CMakeLists.txt, + source/common/common.cpp, source/common/common.h, source/x265.cpp, + source/x265.def.in, source/x265.h: + api: add exported strings which describe version and build info + + It seemed more useful for this data to be in the x265 library rather + than the CLI app + [6d5df4858df6] + + * source/CMakeLists.txt, source/cmake/cmake_uninstall.cmake.in: + cmake: add uninstall rule for non-Windows platforms + + CMake on Windows doesn't appear to generate an install manifest file + [c18a09b9d2d3] + + * source/CMakeLists.txt, source/common/CMakeLists.txt, + source/encoder/CMakeLists.txt: + cmake: enable vim syntax hightlighting + [ec98f30c5185] + + * source/cmake/version.cmake: + cmake: nit cleanups in version.cmake + [c032a0fbc863] + + * source/CMakeLists.txt, source/cmake/version.cmake, source/x265.def, + source/x265.def.in, source/x265.h, source/x265_config.h.in: + cmake: add install targets, machine generate x265.def and + x265_config.h + + Now X265_BUILD is maintained in just a single place, + souce/CMakeLists.txt. 
+ + The shared library is only installed if a valid tag is found; + meaning the user must be building a Mercurial clone or a release + tarball with .hg_archive.txt file in it (otherwise they must install + the shared library themselves) + [1097e547c441] + + * source/common/common.h: + common: properly report clang compiled by version + [28690748ab1d] + +2013-10-12 Deepthi Nandakumar + + * source/test/pixelharness.cpp, source/test/testharness.h: + testbench fix: short buffers now have short values + [aef52403ed5a] + +2013-10-11 Steve Borho + + * source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h: + ratecontrol: use param.rc.rateControlMode, from duplicate RC var + [9a02765f182e] + + * source/common/vec/intra-sse41.cpp: + intra: remove vector class header include from intra-sse41.cpp + + intra-sse3.cpp is the last file with 8bpp (non-AVX2) vector class + primitives + [f77efd501767] + + * source/common/vec/blockcopy-sse3.cpp: + blockcopy-sse3: consistent naming convention + [8518e39a2b74] + + * source/common/vec/blockcopy-sse3.cpp: + blockcopy-sse3: remove vector class use from last 16bpp intrinsic + + blockcopy files are now vector class clean + [41b7ceea1e32] + + * source/common/vec/blockcopy-sse3.cpp: + blockcopy-sse3: consistent naming convention + [0be273b5f082] + + * source/common/vec/intra-sse3.cpp: + intra: prevent variable shadow warnings from GCC + [d97cf152f620] + +2013-10-11 Aarthi Thirumalai + + * source/common/pixel.cpp, source/common/primitives.h: + primitves: add c primitives for the following : + + compute AC energy for each block copy pixels of chroma plane + [725ac176cd13] + + * source/common/common.cpp, source/x265.h: + param: added rc states for setting Aq mode and Aq strength + [73d085da8533] + + * source/encoder/encoder.cpp, source/encoder/ratecontrol.cpp, + source/encoder/ratecontrol.h: + rc: added TEncCfg instance to RateControl to reuse all the rc params + directly. 
+ [ce889cef37be] + +2013-10-11 Shazeb Nawaz Khan + + * source/common/reference.cpp, source/encoder/motion.cpp: + Some fixes in applyWeight() function + + These wont fix the PSNR drop but are necessary + [b70432f7b275] + +2013-10-11 Steve Borho + + * source/common/vec/dct-ssse3.cpp: + dct-ssse3: remove vector class includes; dct files are now clean + [1cd3bc5e6881] + + * source/common/vec/dct-sse3.cpp: + dct-sse3: don't compile dct4 for 16bpp builds when it is not used + [2267068cc7e1] + + * source/common/vec/dct-sse41.cpp: + dct-sse41: reorder functions for clarity - no code change + [d6dc4ebb5cbe] + + * source/common/vec/dct-sse3.cpp: + dct-sse3: remove idst4; it uses SSE4.1 but dct-sse41.cpp already has + idst4 + [839a9ba551e4] + +2013-10-11 Yuvaraj Venkatesh + + * source/common/vec/dct-sse3.cpp: + dct: Replaced inversedst vector class function to intrinsic + [df024b91ffd6] + +2013-10-11 Steve Borho + + * source/common/vec/pixel-sse3.cpp: + pixel-sse3: move convert32to16_shr to top of file, remove vector + class includes + [9f37e3d7818c] + +2013-10-11 Dnyaneshwar Gorade + + * source/common/vec/pixel-sse3.cpp: + pixel-sse3.cpp: Replace convert32to16_shr vector class function with + intrinsic. + [efb230642757] + +2013-10-11 Steve Borho + + * source/common/vec/dct-sse3.cpp, source/common/vec/dct-sse41.cpp: + dct: move dct32 to dct-sse41.cpp, inline convert16to32 + [def1551c14f0] + +2013-10-11 Yuvaraj Venkatesh + + * source/common/vec/dct-sse3.cpp: + dct: Replaced partialButterfly32 vector class function to intrinsic + [ca00db64f5bb] + +2013-10-11 Dnyaneshwar Gorade + + * source/common/vec/intra-sse3.cpp: + intra-sse3.cpp: Replace PredIntraAng4_m_32 vector class function + with intrinsic. + [4824f15116e6] + + * source/common/vec/intra-sse3.cpp: + intra-sse3.cpp: Replace PredIntraAng4_m_26 vector class function + with intrinsic. + [267fa83cd7b9] + + * source/common/vec/intra-sse3.cpp: + intra-sse3.cpp: Replace PredIntraAng4_m_21 vector class function + with intrinsic. 
+ [90b34ae5e8de] + + * source/common/vec/intra-sse3.cpp: + intra-sse3.cpp: Replace PredIntraAng4_m_17 vector class function + with intrinsic. + [263acbde8ec1] + + * source/common/vec/intra-sse3.cpp: + intra-sse3.cpp: Replace PredIntraAng4_m_13 vector class function + with intrinsic. + [f1013117efab] + + * source/common/vec/intra-sse3.cpp: + intra-sse3.cpp: Replace PredIntraAng4_m_9 vector class function with + intrinsic. + [5c6f7106c918] + + * source/common/vec/intra-sse3.cpp: + intra-sse3.cpp: Replace PredIntraAng4_m_5 vector class function with + intrinsic. + [87a56e0ff6a9] + + * source/common/vec/intra-sse3.cpp: + intra-sse3.cpp: Replace PredIntraAng4_m_2 vector class function with + intrinsic. + [e4efd408f394] + + * source/common/vec/intra-sse3.cpp: + intra-sse3.cpp: Replace PredIntraAng4_2 vector class function with + intrinsic. + [bd335e21744d] + + * source/common/vec/intra-sse3.cpp: + intra-sse3.cpp: Replace PredIntraAng4_5 vector class function with + intrinsic. + [2b9f94e11cc5] + + * source/common/vec/intra-sse3.cpp: + intra-sse3.cpp: Replace PredIntraAng4_9 vector class function with + intrinsic. + [e65e3714bbb9] + + * source/common/vec/intra-sse3.cpp: + intra-sse3.cpp: Replace PredIntraAng4_13 vector class function with + intrinsic. + [f3d0ced4a4f1] + + * source/common/vec/intra-sse3.cpp: + intra-sse3.cpp: Replace PredIntraAng4_17 vector class function with + intrinsic. + [17c772394df3] + +2013-10-11 Steve Borho + + * source/common/vec/dct-sse3.cpp, source/common/vec/dct-sse41.cpp: + dct: move dct8 to dct-sse41.cpp, inline convert16to32 + [f0eebdf90a58] + +2013-10-11 Yuvaraj Venkatesh + + * source/common/vec/dct-sse3.cpp: + dct: Replaced partialButterfly16 vector class function to intrinsic + [f760de7f5596] + +2013-10-11 Dnyaneshwar Gorade + + * source/common/vec/intra-sse3.cpp: + intra-sse3.cpp: Replace PredIntraAng4_21 vector class function with + intrinsic. 
+ [e9b401f5c655] + +2013-10-11 Min Chen + + * source/common/x86/ipfilter8.asm: + asm: improvement filterHorizontal_p_p_4 by reorder intermedia data + + 1. repleace phaddw to paddw 2. use extra load operator to split data + dependency and reduce table size + [080a9fdada2c] + + * source/common/x86/ipfilter8.asm: + asm: fix bug in filterHorizontal_p_p_4 with width less than 8 (seed + 0x52578C72) + [953a4e9f3d57] + +2013-10-11 Dnyaneshwar Gorade + + * source/common/vec/intra-sse3.cpp: + intra-sse3.cpp: Replace PredIntraAng4_26 vector class function with + intrinsic using intrinsic macros PRED_INTRA_ANGLE_4_START and + PRED_INTRA_ANGLE_4_END. + [295973cbc020] + + * source/common/vec/intra-sse3.cpp: + intra-sse3.cpp: Created common macros PRED_INTRA_ANGLE_4_START, + PRED_INTRA_ANGLE_4_END for PredIntraAng4_[ANGLE] function. + [ee4f9ae07523] + +2013-10-11 Steve Borho + + * source/common/vec/dct-sse41.cpp: + dct: manually inline convert16to32, for 10% improvement + [ab9f6ad97d30] + +2013-10-11 Yuvaraj Venkatesh + + * source/common/vec/dct-sse41.cpp: + dct: modified block copy used in dct8 with convert16to32 inline + function + [855757691efc] + +2013-10-11 Steve Borho + + * source/common/vec/dct-sse3.cpp: + dct: fix 16bpp, dct primitives are not 16bpp safe + [c6d89dc62e19] + +2013-10-10 Steve Borho + + * source/common/piclist.cpp: + piclist: ensure a TComPic is not enqueued in two lists at once + [7134a091a71d] + +2013-10-11 Steve Borho + + * source/common/pixel.cpp, source/common/primitives.h, + source/common/vec/pixel-sse3.cpp: + pixel: remove unreferenced cvt16to32_t and cvt32to16_t primitives + [9bbaa60db38b] + + * source/common/vec/pixel-sse41.cpp: + pixel: remove unreferenced sse_pp4 function + [fa480d5c2166] + +2013-10-10 Steve Borho + + * source/common/vec/pixel-avx2.cpp: + pixel: allow clang to build AVX2 pixel primitives functions + [0fabe33e0448] + +2013-10-11 Min Chen + + * source/cmake/version.cmake: + cmake: default value for X265_VERSION + [b6756c2e6386] + 
+2013-10-10 Steve Borho + + * source/CMakeLists.txt, source/common/CMakeLists.txt: + cmake: repair ICL nmake builds - do not use yasm custom rule with + nmake + [57e6b2cf633d] + +2013-10-11 Steve Borho + + * source/CMakeLists.txt, source/common/CMakeLists.txt: + cmake: make intel C++ compiler detection independent of env vars + [e282601b92d6] + +2013-10-10 Steve Borho + + * source/CMakeLists.txt, source/common/CMakeLists.txt: + cmake: cleanup compiler determination + [95f8e0c146b8] + + * source/common/CMakeLists.txt: + cmake: add parens to fix icpc builds of intrinsic primitives + [7320ecd0901c] + + * source/Lib/TLibCommon/TComSlice.h: + TComSlice: add missing cstring include for memcpy + [71fca64942a6] + + * source/Lib/TLibCommon/TComSlice.h: + TComSlice: remove unreferenced member variable + [614a68ab4703] + + * source/Lib/TLibEncoder/NALwrite.cpp: + NALwrite: reintroduce include of cstring, required for memcpy on + Linux + [d6b9cc9c402f] + + * source/encoder/CMakeLists.txt: + cmake: merge TLibEncoderH source group into TLibEncoder + + They are both small enough now that they are manageable as a single + unit + [bfdfeb2fd817] + + * source/CMakeLists.txt: + cmake: link PPA and other libs into x265-shared and x265-static + [499ef0e4e254] + + * source/common/vec/dct-sse3.cpp, source/common/vec/dct-sse41.cpp: + dct: add comments for future opts/code reuse + [7b4a6a5f8efc] + + * source/common/vec/dct-sse41.cpp: + dct: remove vector class includes from dct-sse41.cpp, it is clean + [3be4451ea3aa] + + * source/common/vec/dct-sse3.cpp, source/common/vec/dct-sse41.cpp: + dct: move functions which require SSE4.1 from dct-sse3.cpp to dct- + sse41.cpp + [be7c6c42566a] + + * source/common/vec/dct-sse3.cpp: + dct: move last vector dct function into its own section + [b0b5c22f5a34] + +2013-10-10 Yuvaraj Venkatesh + + * source/common/vec/dct-sse3.cpp: + dct: replaced partialButterfly8 vector class function with intrinsic + [6fa763ba9da8] + + * source/common/vec/dct-sse3.cpp: + 
dct: replace dequant vector class function with intrinsic + [b77a66b6b93d] + +2013-10-10 Dnyaneshwar Gorade + + * source/common/vec/intra-sse3.cpp: + intra-sse3.cpp: Replace PredIntraAng4_32 vector class function with + intrinsic. + [8b49d3995f0c] + +2013-10-10 Steve Borho + + * source/common/vec/blockcopy-sse3.cpp: + blockcopy: move intrinsic function out of vector-class section + [7dbbbb2a42bc] + +2013-10-10 Dnyaneshwar Gorade + + * source/common/vec/blockcopy-sse3.cpp: + blockcopy-sse3.cpp: Replace pixeladd_pp vector class function with + intrinsic. + [12d098e5d907] + +2013-10-10 Gopu Govindaswamy + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h: + tcomtrquant:remove unused methods + [a79ecf3a7875] + + * source/Lib/TLibCommon/CommonDef.h, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibEncoder/NALwrite.cpp, + source/Lib/TLibEncoder/TEncSbac.cpp, source/common/dct.cpp, + source/common/pixel.cpp, source/common/vec/dct-sse3.cpp, + source/common/vec/dct-sse41.cpp, source/x265.cpp: + remove unused includes + [dce6ced4b4a3] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, source/encoder/frameencoder.cpp: + tcomslice:remove unused set methods + [c2fb3d12c812] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h: + tcomslice: removed unused set methods in TComReferencePictureSet + class + [bb43823efa92] + +2013-10-10 Steve Borho + + * source/common/vec/pixel-sse41.cpp, source/common/vec/sse.inc: + sse: move last SSE function into vector-class section of pixel- + sse41.cpp + [7b4685130793] + + * source/common/vec/ipfilter-ssse3.cpp: + ipfilter: remove vector class headers from ipfilter-ssse3.cpp + [29844e1eb697] + +2013-10-10 Dnyaneshwar Gorade + + * source/common/vec/ipfilter-ssse3.cpp: + ipfilter-ssse3.cpp: Replace filterConvertPelToShort vector class + function with intrinsic. 
+ [b7b00d3533b1] + + * source/common/vec/ipfilter-ssse3.cpp: + ipfilter-ssse3.cpp: Replace filterConvertShortToPel vector class + function with intrinsic. + [fc4bc74c095a] + +2013-10-10 Steve Borho + + * source/common/vec/pixel-sse41.cpp, source/common/vec/sse.inc: + sse: move intrinsic functions to pixel-sse41.cpp + [02fd071a875b] + +2013-10-10 Yuvaraj Venkatesh + + * source/common/vec/sse.inc: + pixel: replace sse_sp64 vector class with intrinsic + [250b1b037e94] + + * source/common/vec/sse.inc: + pixel: replace sse_sp48 vector class with intrinsic + [1d872b8c6480] + + * source/common/vec/sse.inc: + pixel: replace sse_sp32 vector class with intrinsic + [92b11584470c] + + * source/common/vec/sse.inc: + pixel: replace sse_sp24 vector class with intrinsic + [cae449cb7965] + + * source/common/vec/sse.inc: + pixel: replace sse_sp16 vector class with intrinsic + [087267802b1c] + + * source/common/vec/sse.inc: + pixel: modified sse_sp8 with a comman macro SSE_SP8x1 + [fb043c201cce] + +2013-10-10 Min Chen + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSeach: add alignment to bidir output buffer + + fixes crash in x265_pixel_avg_w16_sse2 from aligned writes to the + output buffer + [6a6f72ea32a3] + +2013-10-10 Steve Borho + + * source/x265.def, source/x265.h: + api: make notice of how the build number now must be echoed in + x265.def + [8ae52f2b159c] + + * source/encoder/encoder.cpp: + encoder: silence idiotic compiler warning from VC10 + [4b84c969a079] + +2013-10-09 Steve Borho + + * source/x265.h: + api: add API version to x265_encoder_open to prevent dynamic library + link skew + + Gratefully borrowed from x264 + [52284d8d4dc1] + + * source/common/CMakeLists.txt, source/common/vec/vec-primitives.cpp: + cmake: enable clang to compile intrinsic primitives without version + checks + [edcc92f2b2ab] + + * source/common/vec/blockcopy-sse3.cpp: + blockcopy: isolate vector class routines together (firing squad) + [82f20a7cb593] + + * source/VectorClass/vectori128.h, 
source/VectorClass/vectori256.h, + source/VectorClass/vectori256e.h: + vector: bypass a number of functions we do not use when compiled by + clang + + The sooner these classes go away the better + [5cc9abe88e62] + + * source/common/vec/pixel-ssse3.cpp: + pixel: remove vector class headers from pixel-ssse3.cpp + [a574f4347855] + + * source/common/vec/intra-sse3.cpp: + intra: remove an unreferenced function + [27a3de7a742c] + + * source/common/vec/ipfilter-sse41.cpp: + ipfilter: remove two unreferenced functions + [9518070da726] + + * source/common/ipfilter.cpp: + ipfilter: remove two unused functions + [6a08d0e9178c] + + * source/CMakeLists.txt: + cmake: detect clang compiler anf fix some link issues + [d7922b02ef3c] + + * source/CMakeLists.txt: + cmake: bump minimum required version to 2.8.8 + [5dceef85c58c] + + * source/CMakeLists.txt: + cmake: add -ffast-math to GCC compile flags + [4710e2b5e134] + + * source/Lib/TLibCommon/TComSlice.h: + TComSlice: linux build fix, include for memcpy + [80af6aaa16e2] + +2013-10-09 Sumalatha Polureddy + + * source/encoder/compress.cpp: + no-rdo(early exit): update the memory with info from m_interCU_NxN + + The encoding and prediction details which are updated in the + m_interCU_NxN in NxN calculation are updated in the m_tempCU + [bd3f43f06dd4] + +2013-10-09 Steve Borho + + * source/common/vec/dct-ssse3.cpp, source/encoder/bitcost.cpp: + remove more includes + [e4369bb24ad7] + + * source/Lib/TLibEncoder/TEncCu.cpp: + TEncCu: replace with X265_MIN, X265_MAX + [50d55737f6b0] + + * source/Lib/TLibCommon/ContextModel.cpp: + ContextModel: replace with X265_MIN, X265_MAX + [8cd4c7e800ed] + + * source/Lib/TLibEncoder/NALwrite.cpp: + NALwrite: remove "using namespace std" + [05b6f86ebda6] + + * source/test/unittest.cpp, source/test/unittest.h: + remove unused unittest class (is not even compiled today) + [882d6a2b329e] + + * source/x265.cpp: + cli: remove "using namespace std" + [ddb9d884df8d] + + * source/Lib/TLibCommon/NAL.h, 
source/Lib/TLibCommon/SEI.h, + source/Lib/TLibCommon/TComBitStream.cpp, + source/Lib/TLibCommon/TComBitStream.h, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibEncoder/NALwrite.cpp: + remove a pile of unused STL includes, reorder includes for clarity + [7c15a193cef9] + + * source/Lib/TLibEncoder/TEncSbac.cpp: + TEncSBac: remove unused #include + [64b58c78dd68] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h: + TComSlice: remove unused ParameterSetMap and #include + [5a50663968f5] + +2013-10-09 Gopu Govindaswamy + + * source/Lib/TLibCommon/TComList.h, source/Lib/TLibCommon/TComSlice.h, + source/common/CMakeLists.txt, source/encoder/encoder.cpp: + TLibCommon: Removed unused TComList + [fd1d967972d0] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, source/encoder/dpb.cpp, + source/encoder/dpb.h, source/encoder/encoder.h: + dpb and TComSlice: replaced TComList with PicList + [85c733f8a057] + +2013-10-09 Shazeb Nawaz Khan + + * source/common/reference.cpp, source/common/reference.h, + source/encoder/frameencoder.cpp, source/encoder/motion.cpp, + source/encoder/motion.h: + Enabling weight prediction for half and full pel + [b44fccc8e6ea] + +2013-10-09 Steve Borho + + * source/common/vec/pixel-sse3.cpp, source/common/vec/pixel-sse41.cpp, + source/common/vec/pixel.inc, source/common/vec/pixel8.inc: + pixel: fixup GCC builds, remove pixel8.inc and pixel.inc + [271b02645979] + +2013-10-09 Yuvaraj Venkatesh + + * source/common/vec/pixel8.inc: + pixel: Replace weightUnidir vector class function with intrinsic. 
+ [408ca7050bc7] + + * source/common/vec/sse.inc: + pixel: replace sse_sp12 vector class with intrinsic + [e828f98d388a] + + * source/common/vec/sse.inc: + pixel: replace sse_sp8 vector class with intrinsic + [d9665d0ad277] + + * source/common/vec/sse.inc: + pixel: replace sse_sp4 vector class with intrinsic + [dc0bd9d959db] + +2013-10-09 Dnyaneshwar Gorade + + * source/common/vec/sse.inc: + sse.inc: Modified sse_ss64 intrinsic function. Removed redundancy + using comman macro PROCESS_SSE_SS4x1. + [756de900bae5] + + * source/common/vec/sse.inc: + sse.inc: Modified sse_ss48 intrinsic function. Removed redundancy + using comman macro PROCESS_SSE_SS4x1. + [50a2725a989d] + + * source/common/vec/sse.inc: + sse.inc: Modified sse_ss32 intrinsic function. Removed redundancy + using comman macro PROCESS_SSE_SS4x1. + [363ff9b66afa] + + * source/common/vec/sse.inc: + sse.inc: Modified sse_ss24 intrinsic function. Removed redundancy + using comman macro PROCESS_SSE_SS4x1. + [9f1ec1c9cdb6] + + * source/common/vec/sse.inc: + sse.inc: Modified sse_ss16 intrinsic function. Removed redundancy + using comman macro PROCESS_SSE_SS4x1. + [aca490fa02d7] + + * source/common/vec/sse.inc: + sse.inc: Modified sse_ss12 intrinsic function. Removed redundancy + using comman macro PROCESS_SSE_SS4x1. + [3f5fe6d9a81d] + + * source/common/vec/sse.inc: + sse.inc: Modified sse_ss8 intrinsic function. Removed redundancy + using comman macro PROCESS_SSE_SS4x1. + [1428dcddfa4d] + + * source/common/vec/sse.inc: + pixel.inc: Modified sse_ss4 intrinsic function. Removed redundancy + using comman macro PROCESS_SSE_SS4x1. + [d1ca36034d9f] + + * source/common/vec/sse.inc: + sse.inc: Created comman macro PROCESS_SSE_SS4x1 for functions + sse_ss4, sse_ss8, sse_ss16, sse_ss24, sse_ss32, sse_ss48, sse_ss64. + [95da8fa18a2e] + + * source/common/vec/pixel8.inc: + pixel8.inc: Replace weightUnidirPixel vector class function with + intrinsic. 
+ [9a20693c6ff2] + + * source/common/vec/pixel-sse3.cpp: + pixel-sse3.cpp: Modified calcRecons function argument names. Removed + hungarian prefixes. + [9d534f295529] + + * source/common/vec/pixel-sse3.cpp: + pixel-sse3.cpp: Modified calcRecons8 function argument names. + Removed hungarian prefixes. + [a1a2233536d5] + + * source/common/vec/pixel-sse3.cpp: + pixel-sse3.cpp: Modified calcRecons4 function argument names. + Removed hungarian prefixes. + [87084e0f93f4] + +2013-10-09 Steve Borho + + * source/Lib/TLibEncoder/TEncCavlc.h, + source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSbac.h, + source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h, source/encoder/CMakeLists.txt, + source/encoder/cturow.cpp, source/encoder/cturow.h, + source/encoder/encoder.cpp, source/encoder/encoder.h, + source/encoder/frameencoder.cpp, source/encoder/frameencoder.h, + source/encoder/framefilter.cpp, source/encoder/framefilter.h: + pull TEncTop code into Encoder class in encoder.cpp, encoder.h + + The TEncTop in our repo bears little resemblance to the original HM + class, so I think it is applicable to move this code into one of our + files where it makes the most sense. Some methods were renamed to + our naming style in the process. + [47d92e8a8a41] + +2013-10-09 Gopu Govindaswamy + + * source/encoder/slicetype.cpp, source/encoder/slicetype.h: + slicetype: removed TComList(std::list) and used PicList to store the + TComPics + [497ef3556b12] + +2013-10-09 Steve Borho + + * source/CMakeLists.txt: + cmake: this time for reals + [fc7fbdd18bc0] + +2013-10-08 Steve Borho + + * source/CMakeLists.txt: + cmake: fixes for non-assembly builds and windows builds + [4737b5423ea4] + + * source/CMakeLists.txt: + cmake: tweak static library names to avoid conflict + + MSVC was trying to write both the static library and the DLL shim + loader to x265.lib. 
After this change, the static library is now + x265-static.lib and the shim loader is x265.lib (corresponding with + x265.dll) + [4b354b902b50] + +2013-10-07 Steve Borho + + * source/CMakeLists.txt, source/cmake/mergestaticlibs.cmake, + source/common/CMakeLists.txt, source/common/common.cpp, + source/common/common.h, source/common/primitives.cpp, + source/common/vec/CMakeLists.txt, source/common/x86/CMakeLists.txt, + source/encoder/CMakeLists.txt, source/encoder/encoder.cpp, + source/test/CMakeLists.txt, source/x265.cpp, source/x265.def, + source/x265.h: + cmake: use cmake 2.8 OBJECT target type to manage static and share + libs + + With the OBJECT target type, the common and encoder folders are + compiled to object files but not linked until main static or shared + library is built. This removes the need for mergestatic.cmake and + cleans up a lot of messy problems - at the cost of requiring a + somewhat recent cmake. + + For MSVC (and presumably Xcode) we must keep the assembly as a + static lib since it uses custom build commands which do not work + with OBJECT target types. This static lib is then linked with the + main x265.lib or x265.dll + + The X265_EXPORT macro is no longer necessary since we are generating + both the static library and shared library from one compile we are + forced to use an x265.def file to define DLL exports. x265.exe must + link with the static library because on Windows the static lib will + be empty if no EXE links with it. 
+ + x265_mdate() was moved into the CLI x265.cpp so the CLI could link + with the shared library if necessary (x265_mdate is not exported) + [713c2133c77c] + +2013-10-08 Steve Borho + + * source/common/vec/pixel-sse3.cpp, source/common/vec/pixel-sse41.cpp, + source/common/vec/pixel8.inc: + pixel: move intrinsic residual and recon functions to pixel-sse3.cpp + [3202ca7a44bb] + +2013-10-08 Shazeb Nawaz Khan + + * source/common/reference.cpp: + Check against numRows in applyweight + + applyWeight() processes rows a 'refLag' number of rows in advance as + compared to the current iteration in compressCTURows() + [7831bda44186] + + * source/common/reference.cpp: + Fix for possible memory access violation in applyWeight() + [85cc6aaac7ec] + + * source/Lib/TLibCommon/TComPrediction.cpp: + Use unweighted pixels before interpolation for Luma in Motion + Compensation + [65f56d5e2ee7] + +2013-10-08 Steve Borho + + * source/common/piclist.cpp, source/common/piclist.h: + piclist: pass non-optional pictures by reference to avoid NULL + checks + [71afca6c173b] + +2013-10-08 Gopu Govindaswamy + + * source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/common/CMakeLists.txt, source/common/piclist.cpp, + source/common/piclist.h: + piclist: add class PicList for picture list manipulations + + Created new file piclist.cpp and piclist.h for linked list + manipulation, there will be no intermediate storage in piclist, + PicList just links TComPic objects together. 
+ + PicList will be used to replace TComList and this will + remove std::list dependency in x265 + [df812d396499] + +2013-10-08 Yuvaraj Venkatesh + + * source/common/vec/pixel8.inc: + pixel: replace getResidual64 from vector class to intrinsic + [bb27ac985cb9] + + * source/common/vec/pixel8.inc: + pixel: replace getResidual32 from vector class to intrinsic + [1e53142731ab] + + * source/common/vec/pixel8.inc: + pixel: replace getResidual16 from vector class to intrinsic + [c27e9b8951d0] + +2013-10-08 Dnyaneshwar Gorade + + * source/common/vec/pixel8.inc: + pixel8.inc: replace calcRecons vector class function with intrinsic. + [add71d9845a7] + + * source/common/vec/pixel8.inc: + pixel8.inc: replace calcRecons8 vector class function with + intrinsic. + [1d2e192467a8] + + * source/common/vec/pixel8.inc: + pixel8.inc: replace calcRecons4 vector class function with + intrinsic. + [d2c8e7248f4a] + +2013-10-08 Gopu Govindaswamy + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h: + tcomslice: removed un-used sortPicList() function + [91aea72a1de9] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h: + tcomslice: removed un-used checkThatAllRefPicsAreAvailable() + function + [ed0d3fd544e6] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h: + TComSlice: Removed un-used + createExplicitReferencePictureSetFromReference() Method + [268a34d4389f] + +2013-10-08 Min Chen + + * source/test/testpool.cpp: + Update testbench for threadpool + [57c15726158a] + +2013-10-08 Aarthi Thirumalai + + * source/Lib/TLibEncoder/TEncTop.cpp, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h: + calculate SSIM for each Row after deblock, sao + [349206daad1d] + + * source/common/pixel.cpp, source/common/primitives.h: + primitives: added C primitives to compute SSIM + [279e050947cf] + +2013-10-08 Deepthi Devaki + + * source/encoder/slicetype.cpp: + slicetype: Bidir cost estimation added to lookahead + 
[0204da76bdb6] + +2013-10-08 Gopu Govindaswamy + + * source/Lib/TLibCommon/TComBitStream.cpp: + tcombitstream: bug fix for count StartCodeEmulations to calculate + the substream size + [9b3a427a1009] + +2013-10-07 Gopu Govindaswamy + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/encoder/frameencoder.cpp: + tcomdatacu: Removed Un-used NDBFBlockInfo - Non-deblocking filter + processing block information + + 1.setNDBFilterBlockBorderAvailability() 2.createNonDBFilterInfo() + 3.createNonDBFilterInfoLCU() 4.destroyNonDBFilterInfo() + + All the above methods are used to set and destroy the + NDBFBlockInformation, but NDBFBlockInfo m_vNDFBlock is not getting + used encoding process + [d71078917df0] + +2013-10-07 Steve Borho + + * source/common/common.cpp: + common: rename AILIGNBYTES to avoid conflict with apple i386/param.h + [41e5e72e2a46] + + * source/CMakeLists.txt: + cmake: white-space nit + [5ff1a56ba33b] + + * source/CMakeLists.txt, source/test/CMakeLists.txt: + cmake: simplify test build structure + [d79cbcdcb538] + + * source/common/vec/pixel-sse3.cpp: + pixel: wrap primitives in anononymous namespace (file static) + [e1be6debf9a7] + + * source/common/vec/pixel-sse3.cpp: + pixel: fix spelling of blockfill functions, add calcresidual + functions + [c8b2682f8ee3] + + * source/common/vec/pixel-sse3.cpp, source/common/vec/pixel-sse41.cpp, + source/common/vec/pixel8.inc: + pixel: move intrinsic getResidual4 and getResidual8 to pixel- + sse3.cpp + [baf9d1d30a9c] + +2013-10-07 Yuvaraj Venkatesh + + * source/common/vec/pixel8.inc: + pixel: replace getResidual8 vector class with intrinsic + [5be3fea0721a] + + * source/common/vec/pixel8.inc: + pixel: replace Residual4 vector class with intrinsic + [5a0c772fd165] + +2013-10-07 Dnyaneshwar Gorade + + * source/common/vec/pixel-sse41.cpp: + pixel: simplify sad_x4_32 to make it easier to maintain + [b6f29369879c] + + 
* source/common/vec/pixel-sse41.cpp: + pixel: simplify sad_x4_24 to make it easier to maintain + [9facd6b0a9f8] + + * source/common/vec/pixel-sse41.cpp: + pixel: simplify sad_x4_12 to make it easier to maintain + [d6dc18f9cead] + + * source/common/vec/pixel-sse41.cpp: + pixel: simplify sad_x3_32 to make it easier to maintain + [91df941f46bc] + + * source/common/vec/pixel-sse41.cpp: + pixel: simplify sad_x3_24 to make it easier to maintain + [908f617c8847] + + * source/common/vec/pixel-sse41.cpp: + pixel: simplify sad_12 to make easier to maintain + [b8756aa16d1a] + + * source/common/vec/pixel-sse41.cpp: + pixel: simplify sad_x3_12 to make it easier to maintain + [afcfd17e395f] + + * source/common/vec/pixel-sse41.cpp: + pixel: simplify sad_32 to make it easier to maintain + [b52b52cd0f77] + + * source/common/vec/pixel-sse41.cpp: + pixel: simplify sad_24 to make easier to maintain + [2556af7a5c0d] + +2013-10-06 Steve Borho + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/pixel.cpp, + source/common/primitives.h, source/common/vec/pixel-sse3.cpp: + primitives: rename blockfil_s to blockfill_s_c + [bed5c2765dc8] + + * source/common/primitives.h: + primitives: remove unused function decl + [c433691041a2] + + * source/common/primitives.h: + primitives: remove obsolete FilterConf enum + [69d2774132c4] + + * source/common/ipfilter.cpp, source/common/primitives.h: + ipfilter: remove unused filterRow primitives + [808210e75c21] + +2013-10-07 Steve Borho + + * source/test/ipfilterharness.cpp, source/test/ipfilterharness.h: + ipfilterharness: remove test harnesses for removed primitives + [6fdea23da2b9] + +2013-10-06 Steve Borho + + * source/common/ipfilter.cpp, source/common/primitives.h, + source/common/vec/ipfilter-sse41.cpp: + ipfilter: remove unused weighted interpolation primitives + [15c6e2451b34] + + * source/CMakeLists.txt: + cmake: use add_definitions() instead of modifying CMAKE_CXX_FLAGS + manually + [c010342f7605] 
+ +2013-10-02 Steve Borho + + * source/CMakeLists.txt, source/common/common.cpp, + source/common/primitives.cpp, source/dllmain.cpp, + source/encoder/encoder.cpp, source/x265.h: + cmake: add ENABLE_SHARED build option for creating a shared library + (dll/so) + [33ea0f317564] + +2013-10-06 Steve Borho + + * source/common/vec/pixel-avx2.cpp: + pixel: fix eoln damage to pixel-avx2.cpp + [2190f2f036a1] + + * source/common/x86/asm-primitives.cpp: + asm: simplify setup of HEVC partitions for SATD primitives + [484d1d98710b] + + * source/common/vec/pixel-sse41.cpp: + pixel: simplify sad_x3_16 and sad_x4_16 to make them easier to + maintain + [d27d01ffa4f0] + +2013-10-05 Steve Borho + + * source/common/vec/pixel-sse41.cpp: + pixel: simplify sad_16 to make it easier to maintain + [bf5852bbf75f] + + * source/common/vec/pixel-sse41.cpp, source/common/x86/asm- + primitives.cpp: + pixel: fix HIGH_BIT_DEPTH builds + [bc3d1a8ebc89] + + * source/common/vec/pixel-sse41.cpp: + pixel: add missing sse_pp_12x16, untemplatize others + [017aab1983dd] + + * source/common/x86/asm-primitives.cpp: + asm: don't build wrappers for functions with intrinsic + implementations + [da37cd44a77c] + + * source/common/vec/pixel-sse41.cpp: + pixel: drop SSE primitives that have assembly + [08b4bb1e5dbe] + + * source/common/x86/asm-primitives.cpp: + asm: cleanup the assignment of SSD primitives + [dc74d9932a3f] + + * source/common/x86/asm-primitives.cpp: + asm: simplify generation of sa8d_inter functions from 8x8 and 16x16 + blocks + [276f98fe1c59] + + * source/test/testbench.cpp: + testbench: fix off-by one initialization of primitives + [e352d1f1a7c6] + + * source/common/vec/pixel-sse41.cpp: + pixel: add back intrinsics for sad_x3_4x16 and sad_x4_4x16 + + These routines do not yet have assembly code + [2e8d7b261880] + + * source/common/primitives.cpp: + primitives: fix off-by one initialization of primitives + [6e46fabdef40] + + * source/common/primitives.cpp: + primitives: fixup 12x16 and 16x2 
sa8d_inter pointers + + 32x12 isn't used but 12x16 and 16x12 are (for AMP) + [884016c98502] + + * source/common/primitives.cpp, source/common/x86/asm-primitives.cpp: + primitives: setup square sa8d_inter function pointers from sa8d + block pointers + [58bacc9ae3d1] + + * source/common/x86/asm-primitives.cpp: + asm: use x265_pixel_satd_8x4_xop for p.satd[PARTITION_16x4] for 32 + bit builds + + On 64bit builds, we have native sse2 functions + [4f837e3ebd26] + + * source/common/primitives.cpp, source/common/x86/asm-primitives.cpp: + primitives: move small block sa8d_inter setup to primitives.cpp + + This hack didn't belong in the assembly setup function + [83ae910874e3] + + * source/test/pixelharness.cpp: + pixelharness: report sad, sad_x3, and sad_x4 scores together + [4089b17f33ed] + + * source/common/x86/asm-primitives.cpp: + asm: quit instantiating functions which are not necessary + + Re-order functions for more clarity + [5c27d330da43] + + * source/common/vec/pixel-sse41.cpp: + pixel: stop building 16x16, 16x8, and 8x16 intrinsic primitives + [73f14d5ca8a9] + + * source/common/vec/pixel-sse41.cpp: + pixel: limit sad_8 routines to just height 32 + + 8x4, 8x8, and 8x16 are handled by x264 assembly, only 8x32 remains + for the intrinsic function to cover + [19b319c9a6aa] + + * source/common/vec/pixel-sse41.cpp: + pixel: eliminate width 48 SAD code for height != 64 + [8f7091d09c11] + + * source/common/vec/pixel-sse41.cpp: + pixel: simplify sad_32 primitives to a single loop + + 32 width blocks will only be height 8, 16, 24, or 32. Having an + 8-row loop is just fine + [affee51d4f86] + + * source/common/vec/pixel-sse41.cpp: + pixel: simplify sad_64 primitives to a single loop + + 64 width blocks will only be height 16, 32, 48, or 64. 
Having an + 8-row loop is just fine + [87b5a379a1d8] + + * source/common/vec/pixel-sse41.cpp: + pixel: remove sad_*_4<> intrinsic functions, they are covered by + assembly + + x264 assembly code covers 4x4, 4x8, and 4x16 and those are the only + 4-width partitions used by x265. + [1ae7953bceb4] + + * source/common/vec/pixel-sse41.cpp: + pixel: eliminate width 24 SAD code for height != 32 + [fb475b36852c] + + * source/common/vec/pixel-sse41.cpp: + pixel: eliminate width 12 SAD code for height != 16 + [bbc040a8109c] + + * source/common/vec/pixel-avx2.cpp, source/common/vec/pixel-sse3.cpp, + source/common/vec/pixel-sse41.cpp, source/common/vec/pixel.inc: + pixel: only compile partition sizes that are used by the encoder + [e5369adbccba] + + * source/test/pixelharness.cpp, source/test/pixelharness.h: + pixelharness: only test partition sizes plausibly used by the + encoder + [699b843073de] + + * source/common/vec/pixel-sse41.cpp: + pixel: use unaligned loads for reference pixels in sad_12 + [49231db18e60] + + * source/common/vec/pixel-sse41.cpp: + pixel: fix typo which was copied and pasted 36 times + [fbfa3a5c5ae8] + + * source/common/vec/pixel-sse41.cpp: + pixel: use unaligned loads for reference pixels in sad_24 + [bee7275174f1] + +2013-10-05 Min Chen + + * source/encoder/framefilter.cpp: + fix bug on SAO initialize + [46a901ac1aff] + +2013-10-04 Steve Borho + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/common/pixel.cpp, + source/common/primitives.h, source/common/x86/CMakeLists.txt, + source/common/x86/asm-primitives.cpp, source/common/x86/mc-a.asm, + source/encoder/motion.cpp: + use x264 pixel average assembly routines for bidir and lowres QPEL + + This required adding a weight parameter and re-ordering arguments. + Bidir might eventually use the weighting feature so this didn't seem + like a bad trade-off. + + This commit naively pulls in all of mc-a.asm from x264 for just this + one set of assembly functions. 
+ [84d0f4f907f7] + + * source/encoder/motion.cpp: + motion: use new pixelavg_pp primitive for lowres QPEL pixel + generation + [7976c35f5b76] + + * source/encoder/motion.cpp: + motion: remove NULL checks prior to X265_FREE calls + [cb6b3038a1e6] + + * source/common/x86/CMakeLists.txt: + cmake: add ASM files to MSVC solution so they are easily edited + + this has no effect on the build + [4ee217cd64cc] + +2013-10-04 Shazeb Nawaz Khan + + * source/common/vec/pixel.inc, source/common/vec/pixel8.inc, + source/test/pixelharness.cpp: + Fix for Testbench fail in weightpUni for Pixel input + [e1404a7a05b0] + + * source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/common/pixel.cpp, source/common/primitives.h, + source/common/vec/pixel.inc, source/test/pixelharness.cpp: + Using int16_t rather than uint16_t in weightpUni primitive; inputs + can be signed + [491996e415b0] + +2013-10-04 Aarthi Thirumalai + + * source/encoder/ratecontrol.cpp: + rc : bug fix for quality drop with larger number of frame threads. + [b960d808d0be] + +2013-10-04 Dnyaneshwar + + * source/common/vec/blockcopy-sse3.cpp: + Replace "pixeladd_ss" vector class function with intrinsic. + Performance measured is same as that of vector function. + [cfc69c57d335] + + * source/common/vec/blockcopy-sse3.cpp: + replace "pixelsub_sp" vector class function with intrinsic. + Performance is same as that of vector function. + [1a884afb63bb] + + * source/common/vec/blockcopy-sse3.cpp: + replace blockcopy_s_p (pixel to short) vector class function with + intrinsic. Performance is same as that of vector class function. + [5b7226f332be] + + * source/common/vec/blockcopy-sse3.cpp: + replace block_copy_p_s (short to pixel) vector class function with + intrinsic. Performance measured is same as that of vector function. + [64325084bd3b] + + * source/common/vec/blockcopy-sse3.cpp: + replace block_copy_p_p vector class function with intrinsic code. + Performance is almost same as that of vector function. 
+ [7b93c1cae0c4] + +2013-10-04 Steve Borho + + * source/common/vec/pixel-sse41.cpp, source/common/vec/pixel8.inc: + pixel: move SSE4.1 functions from pixel8.inc to pixel-sse41.cpp + [8829b508822b] + +2013-10-04 yuvaraj + + * source/common/vec/pixel8.inc: + Replace sad_x4_64 vector class function with intrinsic. + [d59dcf48b9de] + + * source/common/vec/pixel8.inc: + Replace sad_x4_48 vector class function with intrinsic. + [d370697071ed] + + * source/common/vec/pixel8.inc: + Replace sad_x3_64 vector class function with intrinsic. + [6dcae4946fe3] + + * source/common/vec/pixel8.inc: + Replace sad_x3_48 vector class function with intrinsic. + [c29821f80cd3] + + * source/common/vec/pixel8.inc: + Replace sad_64 vector class function with intrinsic. + [4f990ec05dc5] + + * source/common/vec/pixel8.inc: + Replace sad_48 vector class function with intrinsic. + [88378feb4794] + +2013-10-04 Deepthi Devaki + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: remove unused code + [a201bc951e10] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + Bidir ME: store bits required for bidir which will be used for merge + estimation + [5b987ed0a557] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: fix for gcc warning + [ae9c68edd6b2] + +2013-10-04 Steve Borho + + * source/common/vec/pixel.inc, source/common/vec/pixel16.inc: + pixel: remove 16bpp vector class satd functions + + They were not competitive with assembly and were unlikely to be + competitive as intrinsics + [bf14f75b8cf9] + + * source/common/vec/pixel-avx2.cpp, source/common/vec/pixel-sse3.cpp, + source/common/vec/pixel-sse41.cpp: + pixel: comment nits + [1dd953bba5ed] + + * source/common/vec/pixel-sse3.cpp: + pixel: move vector class include to bottom of the file + + the clearly segregates vector class primitives from intrinsic + primitives + [cfe01e12b04f] + + * source/common/vec/pixel-sse41.cpp, source/common/vec/pixel8.inc, + source/common/vec/sse.inc: + pixel: move SSE4.1 intrinsic primitives into 
pixel-sse41.cpp + [9794528eef7b] + + * source/common/vec/pixel-sse3.cpp, source/common/vec/pixel.inc, + source/common/vec/pixel8.inc: + pixel: move SSE3 intrinsic primitives into pixel-sse3.cpp + + This improves compile time slightly + [4d00380f0da0] + + * source/common/vec/CMakeLists.txt, source/common/vec/pixel-ssse3.cpp, + source/common/vec/pixel.inc, source/common/vec/pixel8.inc, + source/common/vec/vec-primitives.cpp: + cmake: move SSSE3 routines into pixel-ssse3.cpp + [4018a2281c9c] + +2013-10-03 Steve Borho + + * source/common/vec/pixel.inc, source/common/vec/pixel8.inc: + pixel: cleanup pixel.inc + + Remove copy of C primitive transpose Move 8bpp-only functions to + pixel8.inc cleanup setup function + [64886382abe2] + + * source/common/vec/CMakeLists.txt, source/common/vec/pixel-xop.cpp, + source/common/vec/vec-primitives.cpp: + cmake: drop pixel-xop.cpp + + Now that there are very few vector class intrinsics left, there is + almost no point to compiling pixel.inc with the XOP flag enabled. + This saves compile time. It's setup function wasn't even being + called. + [9091bdd024b1] + + * source/common/vec/pixel-avx2.cpp, source/common/vec/pixel8.inc: + pixel: move avx2 functions into pixel-avx2.cpp + [b08fb0b5de8a] + +2013-10-03 Dnyaneshwar + + * source/common/vec/pixel8.inc: + replace sad_x4_32 vector class function with intrinsic. + [93c4e5d784d7] + + * source/common/vec/pixel8.inc: + replace sad_x4_24 vector class function with intrinsic. + [d92ccc92698e] + + * source/common/vec/pixel8.inc: + replace sad_x4_12 vector class function with intrinsic. + [86e702079865] + + * source/common/vec/pixel8.inc: + replace sad_x3_32 vector class function with intrinsic. + [489fac4bebfe] + + * source/common/vec/pixel8.inc: + replace sad_x3_24 vector class function with intrinsic. + [ea4f5cb0799f] + + * source/common/vec/pixel8.inc: + replace sad_x3_12 vector class function with intrinsic. 
+ [7b8c822b884e] + + * source/common/vec/pixel8.inc: + replace sad_32 vector class function with intrinsic. + [fbc82a9e97e7] + + * source/common/vec/pixel8.inc: + replace sad_24 vector class function with intrinsic. + [0b19614fe2bf] + + * source/common/vec/pixel8.inc: + replace sad_12 vector class function with intrinsic. + [c96520dc9795] + +2013-10-03 Steve Borho + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + TEncSearch: remove methods unreachable after bidir replacement + [86469b2bc1bd] + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/common/common.cpp, + source/x265.h, source/x265opts.h: + api: remove dead bipredSearchRange parameter and CLI option + [97444bf3e5bd] + +2013-10-03 Deepthi Devaki + + * source/Lib/TLibEncoder/TEncSearch.cpp: + Bidir ME: add (0,0) candidate + [99812ad37453] + + * source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/encoder/compress.cpp: + Bidir ME: new logic adapted from x264 + + L0 and L1 MVs from unidir ME used for bidir MV. bidir cost is + calculated from the average of references. 
+ [a6a0e6e90536] + + * source/common/pixel.cpp, source/common/primitives.h: + primitives: add pixelavg primitive + [3a2297992a24] + +2013-10-03 Gopu Govindaswamy + + * source/Lib/TLibCommon/TComBitStream.cpp, + source/Lib/TLibCommon/TComBitStream.h, + source/Lib/TLibEncoder/NALwrite.cpp, source/common/common.h: + Tcombitstream: Encoded Bit stream storage (fifo) moved from + std::vector to Non STL Class + + 1.Removed std::vector Class from Tcombitstream 2.Removed + std::search_n algorithm from Tcombitstream 3.Implemented fifo using + Pointers to an array - Encoded Bit stream storage + [2d04a0bb09c3] + +2013-10-03 Sumalatha Polureddy + + * source/encoder/compress.cpp: + no rdo: Turn OFF the early exit code + + Turning off the early exit code since it increases the bitrate by 2% + This will be enabled when the hash mismatch issue is solved for the + reuse of NxN calculataion in early exit code + [3e24e1081c16] + +2013-10-01 Shazeb Nawaz Khan + + * source/common/reference.cpp, source/common/reference.h: + reference: add applyWeight method to MotionReference + + to generate weighted full pel pixels for one or more CTU rows + [4f68ed1126b6] + +2013-10-02 Shazeb Nawaz Khan + + * source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/common/pixel.cpp, source/common/primitives.h, + source/common/vec/pixel.inc, source/common/vec/pixel8.inc, + source/test/pixelharness.cpp, source/test/pixelharness.h: + primitives: weightUnidir primitive to support pixel or uint16 inputs + [2c73823af522] + +2013-10-01 Steve Borho + + * source/common/primitives.cpp: + primitives: shorten log message to prevent 80char terminal wrap + [0be80d41c551] + + * source/common/common.cpp: + common: tighten up tool descriptions; save horizontal space + [41aa8e024102] + +2013-10-01 Aarthi Thirumalai + + * source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h, source/encoder/encoder.cpp, + source/x265.cpp, source/x265.h: + api: added x265_stats_t structure and x265_encoder_stats() 
function + [c5dc3e37d767] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h: + added states for storing ssim values per frame and globally at + encoder level + [572ddbf8428e] + +2013-10-01 Shazeb Nawaz Khan + + * source/Lib/TLibCommon/TComPicYuv.cpp, source/common/reference.cpp, + source/common/reference.h: + Adding few checks and an init() function in MotionRefrence class + + to properly signal malloc failure for 'fpelPlane'; handling to be + added later + [b1900856741c] + + * source/x265opts.h: + Enable cli parameters for unidirectional weighted prediction + + As part of re-enabling unidirectional weighted prediction after + introduction of frame parallelism + [c94c053eda36] + +2013-09-30 Min Chen + + * source/Lib/TLibEncoder/TEncTop.cpp, source/encoder/cturow.h, + source/encoder/frameencoder.cpp, source/encoder/framefilter.cpp: + get deterministic output when frame threading is enabled (for all + -Fn, n > 1) + + Output mistake reasons: + + 1. CABAC Table Initialize The HM have decide best table for CABAC, + but in frame parallelism, we can't get this information before + thread start, we have to disable it now. Side effect: maybe lost + some compress performance + + 2. SAO Global Disable The HM decide SAO global disable flag depend + previous same type of slice. but here, we can get right statistics + informat before start, so we have to disable it. Side effect: more + computer cost since we always try SAO on every LCU + + 3. CABAC status m_frac not reset. This HM bug still alive, we found + more here. 
+ [af559fe3e565] + +2013-09-30 Steve Borho + + * source/test/pixelharness.cpp: + pixelharness: always use STRIDE for source buffers + + Because they are allocated based on that dimension and the max + iterations + [a03659cfa957] + + * source/common/vec/blockcopy-sse3.cpp: + blockcopy: fix missing stride checks + [25f8402ad767] + + * source/common/x86/asm-primitives.cpp: + asm: use named defines rather than hard-coded values + [1206434af474] + +2013-09-27 Steve Borho + + * source/common/x86/ipfilter8.asm: + ipfilter: remove needless width==0 check + [3dae8ff2902a] + +2013-09-27 praveen Tiwari + + * source/common/x86/CMakeLists.txt, source/common/x86/asm- + primitives.cpp, source/common/x86/ipfilter8.asm: + asm code for ipfilterH_pp, 4 tap filter + [d65224005524] + +2013-09-28 Aarthi Thirumalai + + * source/common/common.cpp, source/x265.h, source/x265opts.h: + cli: add options for toggling PSNR and SSIM metrics (currently + inactive) + + By default (for now) PSNR is turned on, SSIM is off. + [21307df78bdb] + +2013-09-28 Steve Borho + + * source/common/common.h: + common: white-space nit + [55edc34e253c] + + * source/common/common.h: + common: directly call x265_malloc from CHECKED_MALLOC, fix malloc + size problem + [1ce1e18ee05c] + +2013-09-27 Steve Borho + + * source/common/vec/intra-sse3.cpp: + intra: fix eoln damage + [4014edcf2157] + +2013-09-26 Steve Borho + + * source/common/primitives.cpp, source/test/testbench.cpp: + primitives: build primitive list iteratively + + Rather than collect all intrinsic primitives and then collect all + assembly primitives, do the process iteratively for each CPU + capability. This plausibly allows an SSE41 intrinsic primitive to + have higher priority than an SSE assembly primitive. 
+ [93319ce8bb92] + + * source/common/primitives.cpp, source/common/primitives.h, + source/common/x86/asm-primitives.cpp, source/test/testbench.cpp: + primitives: convert Setup_Assembly_Primitives to a CPU mask instead + of ID + + This allows the testbench to isolate the functions added for a given + arch + [9cdf05d40249] + + * source/common/primitives.cpp, source/common/primitives.h, + source/common/vec/vec-primitives.cpp, source/test/testbench.cpp: + primitives: convert Setup_Vector_Primitives to a CPU mask instead of + ID + + This allows the testbench to isolate the functions added for a given + arch + [19e1d1c80268] + +2013-09-27 Steve Borho + + * source/common/vec/intra-sse3.cpp: + intra: re-enable __forceinline for MSVC to recover perf of angular + predictions + [6f85cb3f7813] + + * source/common/vec/intra-sse41.cpp: + intra: remove redundant intra angular prediction functions + [cdda742b1e83] + + * source/common/vec/intra-sse41.cpp: + intra: remove redundant 16bpp angular prediction code + [f8106e88f5e3] + + * source/common/vec/vec-primitives.cpp: + vec: prevent link errors on older compilers, do not try to call + missing funcs + [28febb6e52f0] + + * source/common/vec/intra-sse41.cpp: + intra: merge consecutive 8bpp regions and remove dead code + [e9fcfa1df742] + + * source/common/vec/intra-sse41.cpp: + intra: remove redundant planar primitives + [b8fa74815587] + +2013-09-26 Steve Borho + + * source/common/vec/intra-sse41.cpp: + intra: remove redundant copy of intra_pred_dc + [dfac2c1c4c64] + + * source/common/vec/vec-primitives.cpp: + vec: add missing call to Setup_Vec_IPredPrimitives_sse3() + [94c1079648a8] + + * source/common/vec/ipfilter-ssse3.cpp: + ipfilter: don't use 8bpp primitives for 16bpp builds + [99be942b3855] + + * source/common/vec/blockcopy-avx2.cpp, source/common/vec/dct- + ssse3.cpp: + vec: prevent unreferenced parameter warnings for 16bpp builds + [7f168ba0e444] + + * source/common/vec/intra-sse41.cpp: + intra: remove unused extern + 
[0607ecb8adef] + + * source/encoder/motion.cpp: + motion: fix shadow warnings + [008903846fb1] + +2013-09-25 Steve Borho + + * source/common/vec/CMakeLists.txt, source/common/vec/avx2.cpp, + source/common/vec/sse3.cpp, source/common/vec/sse41.cpp, + source/common/vec/ssse3.cpp, source/common/vec/vec-primitives.cpp, + source/common/vec/xop.cpp: + combine per-architecture files into vec-primitives.cpp + [1058aa043e5e] + + * source/common/vec/CMakeLists.txt, source/common/vec/pixel.inc, + source/common/vec/utils.h: + inline utils.h into the last place it is included + [2e2f773dff36] + + * source/common/vec/CMakeLists.txt, source/common/vec/avx.cpp, + source/common/vec/pixel-avx.cpp, source/common/vec/pixel-ssse3.cpp, + source/common/vec/ssse3.cpp, source/common/vec/vec-primitives.cpp: + remove ssse3 and avx pixel files, which makes avx.cpp redundant + [74efcc344e32] + + * source/common/vec/CMakeLists.txt, source/common/vec/avx.cpp, + source/common/vec/avx2.cpp, source/common/vec/ipfilter-avx.cpp, + source/common/vec/ipfilter-avx2.cpp, source/common/vec/ipfilter- + sse3.cpp, source/common/vec/ipfilter-sse41.cpp, source/common/vec + /ipfilter-ssse3.cpp, source/common/vec/ipfilter-xop.cpp, + source/common/vec/ipfilter.inc, source/common/vec/ipfilter16.inc, + source/common/vec/ipfilter8.inc, source/common/vec/sse3.cpp, + source/common/vec/xop.cpp: + split ipfilter.inc into ssse3 and sse41 CPP files + [953172b04f19] + + * source/common/vec/CMakeLists.txt, source/common/vec/avx.cpp, + source/common/vec/avx2.cpp, source/common/vec/intra-avx.cpp, + source/common/vec/intra-avx2.cpp, source/common/vec/intra-sse3.cpp, + source/common/vec/intra-sse41.cpp, source/common/vec/intra- + ssse3.cpp, source/common/vec/intra-xop.cpp, + source/common/vec/intrapred.inc, source/common/vec/ssse3.cpp, + source/common/vec/xop.cpp: + split intrapred.inc into sse3 and sse41 CPP files + [cad4970ef197] + + * source/common/vec/CMakeLists.txt, source/common/vec/avx.cpp, + source/common/vec/avx2.cpp, 
source/common/vec/dct-avx.cpp, + source/common/vec/dct-avx2.cpp, source/common/vec/dct-sse3.cpp, + source/common/vec/dct-sse41.cpp, source/common/vec/dct-ssse3.cpp, + source/common/vec/dct-xop.cpp, source/common/vec/dct.inc, + source/common/vec/xop.cpp: + split dct.inc into sse3, sse41, ssse3 CPP files + [6b0a78ffaaa9] + +2013-09-26 Steve Borho + + * source/common/vec/blockcopy-sse3.cpp: + blockcopy: fix INSTRSET define for SSE3 + [c47e5c9d89f4] + +2013-09-25 Steve Borho + + * source/common/vec/CMakeLists.txt, source/common/vec/avx.cpp, + source/common/vec/avx2.cpp, source/common/vec/blockcopy-avx.cpp, + source/common/vec/blockcopy-avx2.cpp, source/common/vec/blockcopy- + sse3.cpp, source/common/vec/blockcopy-sse41.cpp, source/common/vec + /blockcopy-ssse3.cpp, source/common/vec/blockcopy-xop.cpp, + source/common/vec/blockcopy.inc, source/common/vec/sse3.cpp, + source/common/vec/sse41.cpp, source/common/vec/ssse3.cpp, + source/common/vec/vecprimitives.inc, source/common/vec/xop.cpp: + merge blockcopy.inc into blockcopy-sse3.cpp and -avx2.cpp + + This is the first step towards four goals: + + 1 - reduce compile time, no more redundant primitive compiles 2 - + reduce redirections / templating of intrinsic primitives 3 - move + away from vector class library's INSTRSET CPU level define 4 - make + the test bench faster, only validate each primitive once + [baaefda928fa] + +2013-09-26 Deepthi Devaki + + * source/encoder/dpb.cpp: + Force numPics in RPS <= maxDecPicBuffering - 1 + + because maxDecPicBuffering is now set to numReferences+1 + [7bbdd9cb086f] + +2013-09-25 Shazeb Nawaz Khan + + * source/common/common.cpp, source/encoder/dpb.h, + source/encoder/encoder.cpp, source/x265.h, source/x265opts.h: + Added support for multiple references + + Added a commandline parameter '--ref '. + DPB size adjusted to use number of references. 
+ [0dbfb0bbca1a] + +2013-09-25 Steve Borho + + * source/common/common.cpp, source/encoder/encoder.cpp, + source/encoder/frameencoder.cpp, source/encoder/framefilter.cpp: + Merge with stable + [a66a677b65b4] + +2013-09-25 Min Chen + + * source/common/vec/ipfilter8.inc: + Avoid VS2008 X64 compiler bug + [fb4e4dfdb841] + + * source/common/vec/ipfilter8.inc, source/encoder/motion.cpp, + source/encoder/motion.h: + Merge HPEL interpolations with the same HPEL offsets + + In the square1[9], the candidate HPEL offsets are: + + 5 1 7 3 x 4 6 2 8 + + The main idea is two HPEL distances makes one FPEL so we can merge + 1-2, 3-4, 5-6, and 7-8 into one interpolation each by adding 1 extra + row/col to the interpolation + [5768db8ac355] + +2013-09-25 Steve Borho + + * source/encoder/encoder.cpp: + encoder: range check number of NAL units + [d514d8685797] + +2013-09-25 Min Chen + + * source/common/common.cpp, source/encoder/encoder.cpp, + source/encoder/frameencoder.cpp, source/encoder/framefilter.cpp: + fix hash mistake from --sao-lcu-opt=0 --frame-threads 2 + + Delay all deblock and SAO until the entire frame is available. Frame + SAO and frame parallelism will now generate correct results, but it + will have very poor performance, so the hard check is relaxed to a + warning. + [2d77d4a2c31b] + +2013-09-25 Steve Borho + + * source/encoder/encoder.cpp: + encoder: disable lookahead for all-intra encodes + + This works around a bug in the current Lookahead::slicetypeDecide(). + I have a patch in the works which fixes this properly, but I think + this is a reasonable workaround. Tested with ABR and it works ok. 
+ [b8658deb4a77] + +2013-09-25 Gopu Govindaswamy + + * source/Lib/TLibCommon/NAL.h, source/Lib/TLibEncoder/NALwrite.cpp, + source/Lib/TLibEncoder/NALwrite.h: + NAL: Remove unused NALUnitEBSP functions + + Removed the unused NALUnitEBSP(OutputNALUnit& nalu) and + copyNaluData() methods from NAL + [4d0ced5c64fe] + + * source/Lib/TLibCommon/AccessUnit.h, + source/Lib/TLibEncoder/TEncTop.h, source/common/CMakeLists.txt, + source/encoder/frameencoder.h: + Accessunit: Remove unused accessUnit class + + AccessUnit class derived from std template list, Accessunit class + replaced with pointers to an array + [bb88bbe34c95] + +2013-09-24 Steve Borho + + * source/encoder/encoder.cpp: + encoder: ensure returned NAL count matches output array size + + On CHECKED_MALLOC failure, m_packetData and/or m_nals will be NULL + and thus the returned count must be 0. Also, do not free the packet + data from within this utility function. It is cleaner to release + that memory in the function which declared the pointers on the + stack. 
+ [bdd26fd0325a] + + * source/encoder/frameencoder.cpp: + frameencoder: do not cause deadlock on malloc fail + [bb5d3e9aadc3] + + * source/Lib/TLibEncoder/TEncTop.cpp: + TEncTop: prevent divide by zero if no frames were output by stream + [ae6d6584b193] + +2013-09-24 Gopu Govindaswamy + + * source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h, source/encoder/encoder.cpp, + source/encoder/frameencoder.cpp, source/encoder/frameencoder.h: + encoder: do not use std::list for the class AccessUnit + [d62d31ea1520] + + * source/Lib/TLibCommon/TComBitStream.cpp, + source/Lib/TLibCommon/TComBitStream.h: + bitstream: remove unused std::vector class code + [235726d72281] + +2013-09-24 Steve Borho + + * source/encoder/slicetype.cpp: + Merge with stable + [dec0309ec94c] + + * .hgtags: + Added tag 0.4.1 for changeset 93707bc4fccd + [ddb7abf83f98] + + * source/common/vec/CMakeLists.txt: + cmake: ignore icpc warnings about EMMS from pixelcmp primitives + [93707bc4fccd] [0.4.1] + + * source/encoder/slicetype.cpp: + slicetype: fix variable shadow warning + [3a77059e1ebf] + +2013-09-24 Min Chen + + * source/encoder/frameencoder.cpp, source/encoder/framefilter.cpp: + Reset CABAC status to get same output between single and multi + threading + [fe00fb449af4] + +2013-09-24 Deepthi Nandakumar + + * source/common/CMakeLists.txt: + Merge with stable + [a8f6f62217d5] + + * source/common/CMakeLists.txt: + MinGW: build fix for common.cpp + [f56cd5c652ef] + +2013-09-24 Steve Borho + + * source/encoder/slicetype.cpp: + Merge with stable + [c56e392b2c68] + +2013-09-24 sumalatha polureddy + + * source/encoder/ratecontrol.cpp: + RateControl : Bug fix when previous estimated frame cost is zero + [2824b8e732e9] + +2013-09-23 Steve Borho + + * source/common/common.cpp: + common: prevent an API race hazard + + The public API should be thread-safe for a given encoder or param + object (they are distinct data structures). 
However x265 has a small + number of global vars that are configured on first use and must be + the same for all encoders in the same process (max CTU size and + pixel bit-depth). Using an atomic compare-and- swap here prevents + simultaneous encoder creations from violating those rules. + [3ee2a8a2d852] + + * source/common/common.cpp: + common: rename CONFIRM macro to CHECK + + The logic of the macro is: if (expr) fail So CONFIRM is highly + misleading + [15ab448f1607] + +2013-09-20 Gopu Govindaswamy + + * source/Lib/TLibCommon/NAL.h, source/Lib/TLibEncoder/NALwrite.h, + source/common/common.h, source/encoder/encoder.cpp, + source/encoder/frameencoder.cpp: + frameencoder: use CHECKED_MALLOC in getStreamHeaders() + [886a765eb760] + +2013-09-24 Steve Borho + + * source/encoder/slicetype.cpp: + lowres intra fixes + [46cc93c497c5] + +2013-09-23 Steve Borho + + * doc/intra/T16.TXT, doc/intra/T32.TXT, doc/intra/T4.TXT, + doc/intra/T8.TXT, doc/intra/intra-16x16.txt, doc/intra/intra- + 32x32.txt, doc/intra/intra-4x4.txt, doc/intra/intra-8x8.txt: + rename intra docs, switch to unix eoln + [ad7866811610] + + * build/README.txt, build/regression/commandlines-example.txt, + build/regression/config-example.txt, build/regression/email-csv.py, + doc/uncrustify/apply-to-all-source.py, + doc/uncrustify/codingstyle.cfg, source/Lib/README.txt, + source/VectorClass/README.txt, source/VectorClass/vectorclass.h, + source/VectorClass/vectori128.h, source/VectorClass/vectori256.h, + source/VectorClass/vectori256e.h: + use unix EOLN for all non-bat files + [a194cf5486e7] + + * .hgignore, doc/uncrustify/drag-uncrustify.bat, + doc/uncrustify/uncrustify.bat, doc/uncrustify/uncrustify.exe: + prune uncrustify.exe; ignore any executable copied into that folder + + uncrustify.bat was unused, so it was also removed + [0600e707b254] + + * doc/README_data-structure.ppt, doc/software-manual.pdf: + remove binary documentation files leftover from HM source tree + [46f25d98c8b2] + + * 
doc/astyle/AStyle.exe, doc/astyle/apply-to-all-source.py, doc/astyle + /astyle-config.txt, doc/astyle/drag-astyle.bat: + remove obsolete astyle scripts and tools + [fa07f9e00dcb] + + * source/common/lowres.cpp: + lowres: fixup bad merge/patch re-import + [a2d7412377dd] + + * source/encoder/encoder.cpp: + encoder: explicitly disable AMP if rectangular modes are disabled + + In the mode decision logic, asymmetrical motion prediction is + implicitly disabled when rectangular modes are disabled. This makes + the disabling more explicit, AMP will not appear as enabled in the + "encoding tools" log line. + [0656b16f0c4a] + +2013-09-23 Min Chen + + * source/common/vec/ipfilter8.inc: + Replace (maskmovq) by (blendvb + movq) + [98903821e0dc] + + * source/common/vec/ipfilter8.inc: + Replace combo padd(32)+psra(6) by pmulhrsw + [017743def32e] + +2013-09-23 Steve Borho + + * source/common/lowres.cpp: + lowres: remove NULL pointer checks, x265_free does this internally + [d6082fcfc3dd] + + * source/common/lowres.cpp: + lowres: white-space nits and minor cleanup + [b877bb01f231] + + * source/common/lowres.cpp: + Merge with stable + [500686c60c97] + +2013-09-23 Shazeb Nawaz Khan + + * source/common/lowres.cpp: + Adapting lowress frame dimensions to multiples of lowress CU size. + + The full res frame is required to be extended accordingly to achieve + a lowres frame with dimensions in multiples of lowres CU size. + [90be63021e7d] + +2013-09-23 Deepthi Devaki + + * source/common/lowres.cpp: + lowres: bug fix - move initialization from create() to init() + + lowres objects are reused, hence they should be re-initialized. + [bba314600ed0] + +2013-09-23 Shazeb Nawaz Khan + + * source/common/lowres.cpp: + Adapting lowress frame dimensions to multiples of lowress CU size. + + The full res frame is required to be extended accordingly to achieve + a lowres frame with dimensions in multiples of lowres CU size. 
+ [4b794e2ffcb4] + +2013-09-20 Steve Borho + + * source/encoder/encoder.cpp: + encoder: hoist declaration of offset var to fix GCC goto warnings, + avoid casts + [ff797c5087ae] + + * source/encoder/encoder.cpp: + encoder: white-space fixes + [7291eaa504d2] + + * source/encoder/encoder.cpp: + encoder: remove NULL checks prior to X265_FREE, which does its own + [9dce39147a94] + + * source/encoder/encoder.cpp: + encoder: fix indentation of extract_naldata + [1a1752129829] + + * source/CMakeLists.txt: + cmake: force creation of ENABLE_PRIMITIVES_ASM option for all yasm + versions + [2a00fe5b8a1c] + + * Merge with stable + [d6494dd903f6] + +2013-09-20 Aarthi Thirumalai + + * source/encoder/ratecontrol.cpp: + Bug Fix - copy last Recq value to thread local rce object + + Missed this out in previous patch. + [1311936ef9cf] + +2013-09-19 Steve Borho + + * source/CMakeLists.txt: + cmake: fix auto-link issues with MSVC + + If you tell MSVC to link with the static lib, the dependencies + somehow fail and x265.exe is not re-linked properly when the static + lib is changed. + [d47ef13cb735] + +2013-09-20 Steve Borho + + * source/encoder/encoder.cpp: + encoder: style nits, no logic change + [e672b5f15e9c] + +2013-09-20 Gopu Govindaswamy + + * source/encoder/encoder.cpp: + encoder: Do not use X265_MALLOC use CHECKED_MALLOC + + Currently X265_MALLOC did't validate the returned pointer, + CHECKED_MALLOC Always validate the returned pointer and print the + Log message and jumps to fail lable if returned pointer is not + valid, + [af8daa6c720b] + + * source/common/common.cpp, source/common/common.h: + common: add CHECKED_MALLOC macro that logs malloc failures and jumps + to fail label + [678ae6e4f3fd] + +2013-09-20 Aarthi Thirumalai + + * source/encoder/ratecontrol.cpp: + Bug Fix - copy last Recq value to thread local rce object + + Missed this out in previous patch. 
+ [de084699ba00] + +2013-09-19 Steve Borho + + * source/encoder/slicetype.cpp: + slicetype: fix shadow warning reported by GCC + [20d8fa0541d8] + + * source/CMakeLists.txt: + cmake: fix auto-link issues with MSVC + + If you tell MSVC to link with the static lib, the dependencies + somehow fail and x265.exe is not re-linked properly when the static + lib is changed. + [73a710b6d1c1] + + * source/common/CMakeLists.txt: + cmake: use explicit file list for HM TLibCommon files + [6aea8272dcd8] + + * source/encoder/CMakeLists.txt: + cmake: use explicit file list for HM TLibEncoder files + + This prevents issues when HM files are deleted; we simply remove + them from this list and then CMake automatically regenerates + projects or Makefiles. With globs, you have to know to poke CMake to + regenerate the files, but most do not know this. + [bd1d4bdc996e] + + * source/common/common.cpp: + common: white-space logging nit + [00429bb81ef7] + + * source/common/common.cpp: + common: improve logging string for lookahead configuration - conveys + more info + [233046035a5f] + + * Merge with stable + [5c6c9c095cb8] + + * .hgtags: + Added tag 0.4 for changeset 2ba6ec553f21 + [85219cda3127] + + * source/encoder/slicetype.cpp: + slicetype: respect --bframes count when --b-adapt is 0 + + --b-adapt 0 --bframes 3 => IPBBBPBBBPBBB + --b-adapt 0 --bframes 4 => IPBBBBPBBBBPBBBB + + This is a stop-gap feature until --b-adapt 1 and 2 are fully + functional + [2ba6ec553f21] [0.4] + + * source/encoder/slicetype.cpp: + slicetype: remove x265_ prefix from internal functions + + These were adapted from x264 functions, and the prefix was + incorrectly preserved + [4c97a2025103] + + * build/linux/batch.py, source/Lib/TLibCommon/SEI.cpp: + Merge with default + [9d97cddc0fac] + +2013-09-20 Aarthi Thirumalai + + * source/encoder/ratecontrol.cpp: + tuned up ABR logic to better adapt for frame parallelism + + Rate control needs to be more aggressive based on actual ecoded + bits cost rather than estimated 
costs from concurrent threads .Tuned + up some parameters to effect this idea. + [395bf0a490c2] + +2013-09-19 Aarthi Thirumalai + + * source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h: + RateControl bug fixes for frame parallelism + + 1. Added lastRceq in ratecotrol structure and copied the value into + thread local rate control entry obj so that Bframes can get latest + lastRceq value. + + 2. Added framesDone state - to maintain a serial order of frames in + RateCotrol to get a correct estimate of wantedBits. + [d62c413b3c06] + +2013-09-19 Steve Borho + + * source/x265.cpp: + cli: use strerror to report signal error + [82aceba785f0] + + * source/Lib/TLibEncoder/TEncTop.cpp: + top: fix leak of output NALs on CTRL+C aborts + [e50237973977] + +2013-09-18 Steve Borho + + * source/Lib/TLibCommon/SEI.cpp, source/Lib/TLibCommon/SEI.h: + sei: remove unused SEIMessages type define + + After this, and after AccessUnit is replaced with a simple pointer + array, TComList is the only type derived from std::list, so we would + no longer need a templated list class. TComList could be replaced + with a simple list class that suits our exact needs. 
+ + This also removes another HM file + [53c551867006] + + * source/common/common.cpp, source/x265.cpp, source/x265.h: + api: introduce an x265_picture_init() function + [7f069c6e6b89] + + * source/Lib/TLibEncoder/TEncTop.cpp, source/common/lowres.cpp, + source/common/lowres.h, source/encoder/slicetype.cpp, + source/encoder/slicetype.h: + lowres: pass poc and user slice type to lowres::init() + [14af4e13ab66] + + * source/common/lowres.h: + lowres: group fields by type, remove trailing white-space + [f646ebe0b520] + + * source/common/lowres.cpp, source/common/lowres.h, + source/encoder/dpb.cpp, source/encoder/slicetype.cpp: + lowres: change int keyframe to bool bKeyframe + [f5a6b908037c] + + * source/common/lowres.cpp, source/common/lowres.h, + source/encoder/slicetype.cpp: + lowres: change int scenecut to bool bScenecut + [8df58890ff0b] + +2013-09-19 Min Chen + + * source/encoder/framefilter.cpp, source/encoder/framefilter.h, + source/x265.cpp: + framefilter: allow SAO to be disabled independently of deblock + filter + [7b26e7c0bf88] + +2013-09-19 Steve Borho + + * source/x265.cpp: + cli: fix memory leaks on early exits + [bc1a28d6f187] + + * source/common/common.cpp: + sao: prevent picture based SAO to be used in combination with frame + threading + [12db8e96402b] + +2013-09-19 Deepthi Devaki + + * source/encoder/slicetype.cpp: + lookahead: fix crash for I frame cost estimation + [e51ecfcabcaa] + +2013-09-19 Deepthi Nandakumar + + * source/test/intrapredharness.cpp, source/test/ipfilterharness.cpp, + source/test/mbdstharness.cpp, source/test/pixelharness.cpp, + source/test/testbench.cpp, source/test/testharness.h: + testharness cleanup: Testharness uses the same malloc/free wrapper + as the rest of the encoder. + + Aligned mallocs in testharness use the wrapper (X265_MALLOC/FREE) + defined in CommonDef.h. Remove alignedMalloc/Free definitions in the + test class. 
+ [eea9154229a0] + +2013-09-18 Steve Borho + + * source/encoder/slicetype.cpp: + slicetype: fix bug in intra estimation in + Lookahead::estimateCUCost() + + # HG changeset patch # User Steve Borho # Date + 1379538813 18000 # Wed Sep 18 16:13:33 2013 -0500 # Node ID + 5bab261d0dd7ac68c4af3837853d48d56017d4d6 # Parent + 197dd6d2f54d7193d7dcf28d03a0b3dccb353fad slicetype: fix bug in intra + estimation in Lookahead::estimateCUCost() + [26d6f155f8df] + + * source/common/common.cpp, source/x265.h: + white-space and comment nits + [2531c630739e] + + * source/x265.cpp: + cli: report errors from registering signal handler + [197dd6d2f54d] + + * source/Lib/TLibEncoder/TEncSbac.h: + TEncSbac: remove exit() calls from unsupported functions + [ded83381f219] + + * source/Lib/TLibEncoder/TEncTop.cpp: + TEncTop: replace debugging exit() call with abort(), use x265_log + [4b3ff29d661d] + + * source/Lib/TLibCommon/SEI.h, source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComPicSym.cpp, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncTop.cpp, source/common/wavefront.cpp, + source/encoder/bitcost.cpp, source/input/y4m.cpp, + source/input/yuv.cpp, source/output/y4m.cpp, source/output/yuv.cpp, + source/test/testpool.cpp: + do not check for NULL prior to delete, do not reset pointers in + destructors + + A lot of our classes have destroy() methods. In those we try to zero + deleted pointers since there is a chance for multiple destroy() + calls. 
But in destructors the pointer storage itself will be + released when the function exits + [d8d209de87c2] + + * source/x265.cpp: + cli: use sig_atomic_t type for variable modified by SIGINT handler + + Other nits: + * removed unused b_exit_on_ctrl_c + * removed unnecessary static initializer + [6b1982cfc05d] + +2013-09-18 Deepthi Devaki + + * source/encoder/slicetype.cpp: + lookahead: fix cost estimation, use previous result when calculation + is skipped + [f467c4792a28] + +2013-09-17 Aarthi Thirumalai + + * source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h: + ABR - Removed unused states, cleaned up the comments. + + Removed states that are not used in ABR or CQP rate contnrol modes. + [d05f4310a976] + +2013-09-18 Min Chen + + * source/encoder/frameencoder.cpp: + frameencoder: disable filter/extend lag when SAO and deblocking are + off + + This improves frame parallelism when all loop filtering is disabled + (aka: benchmark mode) + [d694a96b181c] + +2013-09-18 Deepthi Nandakumar + + * source/encoder/slicetype.cpp: + lookahead: change const bool to const int, so as to use it safely as + a loop counter. + [9d5d4e4e6d6c] + +2013-09-17 Wenju He + + * source/input/y4m.cpp: + fix allocation crash if input y4m file is not found + [1dc07ca4c4ce] + +2013-09-17 Min Chen + + * source/common/wavefront.cpp, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h: + Resolve some patching issues for previous patch (deadlock). 
+ [13531b8f19cc] + +2013-09-17 funman + + * source/x265.h: + [x265] [PATCH] library header: use #ifdef __cplusplus Remove a + warning when using from C + [1c66f40da3e9] + +2013-09-16 Min Chen + + * source/encoder/frameencoder.cpp, source/encoder/frameencoder.h, + source/encoder/framefilter.cpp, source/encoder/framefilter.h: + Use mixed bitmap between FrameEncoder and FrameFilter to Fix crash + and hash mistake in WPP mode + + I change task schedult bitmap to mixed FrameEncoder and FrameFilter + because there catch two bugs, and I want to reduce latency of Frame + Parallelism. The new bitmap mapping 2N+0 to FrameEncoder and 2N+1 to + FrameFilter. + + Side effect: 1. We can remove the lock from FrameFilter. 2. Mixed + bitmap let us do Filter early, so reduce latency of Frame + Parallelism + + Solved bugs: 1. CRASH: the reason is sometime two of threads finish + in same time, so they will enter Filter in wrong order and sent + Finished Event early. when main thread dequeue JobProvider and + execute FrameFilter, we will catch a crash! + + 2. HASH MISTAKE: the reason is same as below, but last row is right + order, we will got worng reconst image. + [0d33ff236f68] + +2013-09-16 Deepthi Nandakumar + + * source/common/common.cpp: + CLI Options: reset x265_param_default to extern "C" + [9a727efab9fa] + + * Merge + [960dbe714209] + + * source/common/common.cpp, source/encoder/encoder.cpp, + source/x265opts.h: + CLI options: Eliminate rdoqts option; cleanup + + 1. Eliminate rdoqts CLI option: enabled when rdoq and ts are both + enabled. 2. 
Rearrange default initialisations in x265_param_ t + structure + [46b065f7d676] + + * source/encoder/framefilter.cpp: + framefilter: Fix memcpy for pel-pixel datatype change + [881444f5910b] + + * source/encoder/framefilter.cpp: + framefilter: Fix memcpy for pel-pixel datatype change + [810ceb9d2b7c] + +2013-09-16 Gopu Govindaswamy + + * source/Lib/TLibCommon/AccessUnit.h, + source/Lib/TLibCommon/CommonDef.h, + source/Lib/TLibCommon/ContextModel.h, + source/Lib/TLibCommon/ContextModel3DBuffer.h, + source/Lib/TLibCommon/ContextTables.h, source/Lib/TLibCommon/NAL.h, + source/Lib/TLibCommon/SEI.h, source/Lib/TLibCommon/TComBitCounter.h, + source/Lib/TLibCommon/TComBitStream.h, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComList.h, + source/Lib/TLibCommon/TComLoopFilter.h, + source/Lib/TLibCommon/TComMotionInfo.h, + source/Lib/TLibCommon/TComPattern.h, + source/Lib/TLibCommon/TComPic.h, source/Lib/TLibCommon/TComPicSym.h, + source/Lib/TLibCommon/TComPicYuv.h, + source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibCommon/TComRdCost.h, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.h, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibCommon/TComWeightPrediction.h, + source/Lib/TLibCommon/TComYuv.h, source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/NALwrite.h, + source/Lib/TLibEncoder/SEIwrite.h, + source/Lib/TLibEncoder/SyntaxElementWriter.h, + source/Lib/TLibEncoder/TEncAnalyze.h, + source/Lib/TLibEncoder/TEncBinCoder.h, + source/Lib/TLibEncoder/TEncBinCoderCABAC.h, + source/Lib/TLibEncoder/TEncBinCoderCABACCounter.h, + source/Lib/TLibEncoder/TEncCavlc.h, + source/Lib/TLibEncoder/TEncCfg.h, source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncEntropy.h, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.h, + source/Lib/TLibEncoder/TEncSbac.h, + source/Lib/TLibEncoder/TEncSearch.h, + source/Lib/TLibEncoder/TEncTop.h, + 
source/Lib/TLibEncoder/WeightPredAnalysis.h, + source/common/TShortYUV.h, source/common/common.h, + source/common/lowres.h, source/common/md5.h, source/common/mv.h, + source/common/reference.h, source/common/threading.h, + source/common/threadpool.h, source/common/wavefront.h, + source/compat/msvc/getopt.h, source/encoder/bitcost.h, + source/encoder/cturow.h, source/encoder/dpb.h, + source/encoder/frameencoder.h, source/encoder/framefilter.h, + source/encoder/motion.h, source/encoder/ratecontrol.h, + source/encoder/slicetype.h, source/input/input.h, + source/input/y4m.h, source/input/yuv.h, source/output/output.h, + source/output/y4m.h, source/output/yuv.h, source/x265.h: + X265: header guards format Changed to X265__H + + Globally all the x265 header files header guards format changed into + X265__H + [d09f36e4dc8b] + +2013-09-13 sumalatha polureddy + + * source/encoder/ratecontrol.cpp: + ratecontrol: Tweak to better handle short term compensation + + Increase the coefficient cplxrSum is adjusted by so that short term + compensation does not suffer as much. + + Also, clip the QP for the first frame. + + Overall improvement is about 5%. + [6bab41a554b3] + +2013-09-13 Steve Borho + + * source/encoder/frameencoder.cpp, source/encoder/frameencoder.h: + frameencoder: move threadMain() into cpp file + [d0883c21fc7f] + + * source/encoder/slicetype.cpp: + slicetype: increase baseline bframe bias by 10% + + Our estimateFrameCost() function is not yet checking bidir + candidates (because bidir search in the main encoder needs to be + replaced first) and since B frame estimates cannot use intra modes + either this tends to make B frame scores relatively higher than what + x264 finds. So our default bias needs to be a bit higher in order + for the lookahead to select B paths. 
+ [2555acbc9736] + + * source/cmake/mergestaticlibs.cmake: + cmake: whitespace cleanups in mergestaticlibs.cmake (should not + change behavior) + + this file was borrowed from another project, and wasn't scrubbed for + our cmake script style (no tabs, 4 spaces, unix eoln, lower case cmd + names) + [9c5a74cfacca] + +2013-09-13 Derek Buitenhuis + + * source/cmake/mergestaticlibs.cmake: + cmake: Don't rely on non-POSIX ar functionality + + Fixes build on stricter systems like FreeBSD. + [8e7d559df1ac] + + * source/Lib/TLibEncoder/NALwrite.cpp: + NALwrite: Include forgotten header + + Fixes build on Linux, where memcpy is not global. + [32922922a332] + +2013-09-13 Steve Borho + + * source/encoder/framefilter.cpp: + ppa: remove redundant PPA event + + The PPAScopeEvent macro causes a bar to appear in our profiler with + the given name, associated with the CPU core and start/stop time. + The problem with this particular instance is that the function was + copy-pasted from another in this file and this profile scope came + along for the ride and since they both have the same name and one + calls the other it is essentially a NOP + [f98bfe09f806] + + * source/encoder/frameencoder.cpp, source/encoder/framefilter.cpp: + framefilter: comment cleanups, use pixel data type + [a1e34874f642] + + * source/test/CMakeLists.txt: + cmake: fix test bench builds following static lib reorg + [4206091a6278] + +2013-09-13 Deepthi Nandakumar + + * source/encoder/encoder.cpp: + Encoder: Remove x265_ prefix from NAL data extract functions. 
+ [45786d093571] + + * source/encoder/encoder.cpp: + Encoder: Factor out duplicated code in NAL data extraction [Gopu] + [fc32e3804b8b] + +2013-09-12 Gopu Govindaswamy + + * source/Lib/TLibCommon/NAL.h, source/Lib/TLibEncoder/NALwrite.cpp, + source/Lib/TLibEncoder/NALwrite.h, + source/Lib/TLibEncoder/TEncTop.cpp, source/encoder/encoder.cpp: + NAL : std::ostringstream replaced + [92b1820c3e42] + +2013-09-12 Derek Buitenhuis + + * source/common/threading.h: + threading: Include fcntl.h on POSIX systems + + This is where O_CREAT and O_EXCL are defined. + [17d41a7563aa] + +2013-09-11 Steve Borho + + * source/CMakeLists.txt: + cmake: set MACOS build flag on mac builds (fixes CPU count + detection) + [8fdafe573ef7] + + * source/common/threading.cpp, source/common/threading.h: + threading: use named semaphores for POSIX events + + Mac OS X does not support unnamed semaphores. It's not clear what + the performance implications of this are on Linux (or Mac). Also, + this method is not very robust. If x265 crashes, it leaves named + semaphores in the system until the kernel is restarted. + [d95d94ee0e1d] + + * build/linux/batch.py: + build: remove obsolete test script + [7a3699202860] + + * source/common/ipfilter.cpp: + ipfilter: remove two unused file-static functions + + Xcode reported these as warnings; Xcode build is now 100% clean + [2d59beb5c0f4] + + * source/common/x86/CMakeLists.txt: + cmake: add Xcode hacks for YASM integration (is there a better way?) 
+ [dfda8ed7bbf4] + + * source/CMakeLists.txt: + cmake: detect Xcode generator, avoid static lib failure + [74d7078f9afe] + + * build/xcode/make-project.sh: + cmake: add xcode build folder + [881263c02cc2] + +2013-09-12 Steve Borho + + * source/encoder/framefilter.cpp, source/encoder/framefilter.h: + framefilter: prevent row filter overlap race hazards + [463fc13419b9] + +2013-09-11 Wenju He + + * source/Lib/TLibEncoder/TEncCu.cpp: + check before delete + [80c133ffdbef] + +2013-09-12 Steve Borho + + * source/common/CMakeLists.txt, source/common/vec/CMakeLists.txt, + source/common/x86/CMakeLists.txt, source/encoder/CMakeLists.txt: + cmake: mark project libs as static + [d4a8f6b92eb2] + +2013-09-11 Steve Borho + + * source/CMakeLists.txt, source/common/CMakeLists.txt, + source/encoder/CMakeLists.txt: + cmake: link primitive projects into libx264 directly, not through + common + [d5ffa406e4fb] + + * build/icl32/build-all.bat, build/icl32/make-makefile.bat, + build/icl64/build-all.bat, build/icl64/make-makefile.bat: + cmake: detect and use ICL version 14 + [11785c1be804] + +2013-09-12 Deepthi Nandakumar + + * Merge + [a9ad48de415e] + + * source/x265opts.h: + cli: moving rd option description to rate distortion section. + [25291305a3b9] + + * source/common/common.cpp, source/encoder/encoder.cpp, source/x265.h, + source/x265opts.h: + cli: replacing no-rdo and no-rdoq options with rd: 0 means + nordo,nordoq. 1 means nordo,rdoq and 2 means rdo, rdoq. + [52630a3de867] + +2013-09-11 Deepthi Nandakumar + + * source/x265.h: + cli: clarifying enum comment in No-RDO. + [989980166bed] + +2013-09-11 Steve Borho + + * source/CMakeLists.txt: + cmake: quiet ICL warnings from STLport's use of pragmas + [ea0aa1f8abda] + +2013-09-11 Min Chen + + * source/encoder/frameencoder.cpp, source/encoder/framefilter.cpp: + framefilter: Make code clearer + + We do not need the row delay when the loopfilter is disabled. 
+ [ffbe50c2f743] + +2013-09-11 Deepthi Nandakumar + + * source/encoder/compress.cpp: + compress: fast-no-rdo is stable enough to be a part of regular no- + rdo + [a6bf1f10e820] + + * source/x265.h: + cli: Add enum RDOLevel to list of rate distortion modes. + [babfab4f36a8] + +2013-09-11 Steve Borho + + * source/encoder/slicetype.cpp: + slicetype: remove extra parens (Mac GCC reports this as a warning) + [af8cddab103e] + +2013-09-09 Steve Borho + + * source/encoder/frameencoder.cpp, source/encoder/framefilter.cpp, + source/encoder/framefilter.h: + framefilter: run filters synchronously at end of each row + [4ec9253cc3c1] + +2013-09-10 Steve Borho + + * source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/Lib/TLibEncoder/TEncTop.cpp, source/common/lowres.cpp, + source/common/lowres.h, source/encoder/dpb.cpp, + source/encoder/slicetype.cpp: + lowres: don't use bframes member var to store allocation count + + x264 used a bframes variable here to hold the number of leading B + frames encoded before an I or a P (I assume for rate control use) + [6ca33594fbf3] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: clamp mvmin.y to reflag distance as well + [e5b5277956a8] + + * source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h: + ratecontrol: remove unused bframes (this was probably not the + bframes you need) + [90a4d668f833] + + * source/x265opts.h: + cli: move fast-no-rdo next to the other rdo options + [f241eedf1ae1] + + * source/CMakeLists.txt: + cmake: generate libx265 unconditionally + [21d8b3a825ad] + + * source/CMakeLists.txt: + cmake: merge fixup + [e27907038796] + + * source/encoder/compress.cpp: + compress: remove unnecessary line (GCC warning) + [e2123d174c7c] + + * source/CMakeLists.txt, source/input/CMakeLists.txt, + source/output/CMakeLists.txt: + Merge with stable + [b7646217f425] + + * source/CMakeLists.txt: + cmake: rename x265 project to cli, force output name to x265. 
use + libx265.a + [841aaa5fa2ec] + + * source/encoder/encoder.cpp: + encoder: replace long with UInt + [616f585fa646] + +2013-09-10 Deepthi Nandakumar + + * source/encoder/compress.cpp: + compress: different lambdas for P and B slices, no efficiency drop. + [4cd9216dd366] + + * source/encoder/compress.cpp: + compress: replace early exit macro with fast-no-rdo option + [5c8ce4a2e9fc] + + * source/common/common.cpp, source/x265.h, source/x265opts.h: + compress: Introducing option fast no-rdo, only when no-rdo mode is + enabled. + [cda8c3d1f83c] + + * source/encoder/compress.cpp: + compress: Intra mode checked only in P frames + [1c72e3fbd641] + + * source/encoder/compress.cpp: + compress: Enabling early exit macro. + [ab77d80491d8] + +2013-09-09 Steve Borho + + * source/common/vec/ipfilter8.inc: + ipfilter: remove unused maxVal variable + [98f0f7dde384] + + * source/Lib/TLibCommon/CommonDef.h, + source/Lib/TLibCommon/TComMotionInfo.cpp, + source/Lib/TLibCommon/TComRom.cpp, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComYuv.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/VectorClass/instrset.h, source/common/TShortYUV.cpp, + source/common/common.h, source/common/pixel.cpp, + source/common/threading.h: + convert stdlib.h includes to to avoid namespace conflicts + [cfd76423dbf4] + + * source/CMakeLists.txt, source/input/CMakeLists.txt, + source/output/CMakeLists.txt: + cmake: merge InputFiles and OutputFiles projects into cli project + (simplicity) + [60dec1deb894] + + * doc/LookaheadGuide.txt, doc/LookaheadPlan.txt, + source/Lib/TLibCommon/TComCABACTables.cpp, + source/Lib/TLibCommon/TComCABACTables.h, + source/Lib/TLibCommon/TComRdCost.cpp, + source/Lib/TLibCommon/TComRdCostWeightPrediction.cpp, + source/Lib/TLibCommon/TComRdCostWeightPrediction.h, + source/Lib/TLibEncoder/AnnexBwrite.h, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h, + 
source/Lib/TLibEncoder/TEncRateCtrl.cpp, + source/Lib/TLibEncoder/TEncRateCtrl.h, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.h, source/common/vec/blockcopy- + sse42.cpp, source/common/vec/dct-sse42.cpp, source/common/vec/intra- + sse42.cpp, source/common/vec/ipfilter-sse42.cpp, source/common/vec + /pixel-sse42.cpp, source/common/vec/sse42.cpp, + source/encoder/encoder.h: + merge default into stable; feature freeze + [1efd146deeeb] + + * source/x265.cpp: + cli: add version id into CSV output + [d11de33521cf] + + * source/x265.cpp: + cli: report global PSNR on summary line + [0811d6c95e44] + + * source/common/common.cpp: + common: fix lookahead check + [48ab740e5c7f] + + * source/CMakeLists.txt: + cmake: do not use -fPIC on MinGW64 + [84e49ab6fef1] + + * source/CMakeLists.txt: + cmake: fix eoln damage + [a24a04441e5e] + + * source/CMakeLists.txt: + cmake: add STLport build option + [943ffb8b220c] + + * source/input/input.h, source/input/y4m.h, source/input/yuv.h, + source/x265.cpp: + stl: fix warnings/errors reported by STLport headers + [96f8d46f9e13] + + * source/common/common.cpp: + common: fix lookahead depth dependency + [d8511336efdb] + + * source/encoder/slicetype.cpp: + slicetype: disable lookahead for all-P frame runs + [8859d7da393c] + + * source/common/common.cpp: + common: enable -b3 by default to match x264 + + Use -b0 to get the previous all-P frame default behavior + [7717f84c5cfa] + + * source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h: + ratecontrol: remove rce member variable pointer + + Pass RateControlEntry to functions that need it. It was pretty + misleading to pass in an argument named m_rce. + [42c2c838d492] + +2013-09-09 Deepthi Nandakumar + + * source/Lib/TLibEncoder/TEncTop.cpp, source/encoder/ratecontrol.cpp, + source/encoder/ratecontrol.h: + ratecontrol: move lastRceq and qpaRc to RateControlEntry(). 
+ [22ac6e8ca357] + + * doc/LookaheadGuide.txt, doc/LookaheadPlan.txt: + Merge + [76f1e04c897e] + + * source/Lib/TLibEncoder/TEncTop.cpp, source/encoder/frameencoder.h, + source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h: + ratecontrol: rateControlEntry is now a member of FrameEncoder. + [a5087ca01ab8] + +2013-09-09 Steve Borho + + * doc/LookaheadGuide.txt, doc/LookaheadPlan.txt: + doc: remove obsolete planning docs + [bc4a13c9017c] + + * source/encoder/slicetype.cpp: + slicetype: enable lookahead by default. --b-adapt 0 or --rc- + lookahead 0 disables + [d3c9a535c21d] + + * source/x265opts.h: + cli: add --b-adapt CLI option + [176055b7fadd] + +2013-09-08 Min Chen + + * source/encoder/framefilter.cpp: + support diable both thread and lft/sao + [f9150ab39bf9] + + * source/encoder/frameencoder.cpp: + support both diable thread and lft/sao + [4ace08e490a0] + + * source/encoder/framefilter.cpp, source/encoder/framefilter.h: + simplify by processRowPost() + [6882f7eb6c31] + +2013-09-07 Steve Borho + + * source/encoder/motion.cpp: + motion: subpel offsets table needs a 'no-motion' entry + [1f8760fdc2d8] + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + TEncSearch: remove unused m_searchRange + [08ed93aebcdc] + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + TEncSearch: remove unused m_searchMethod + [72f6cfac517d] + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/common/common.cpp, + source/encoder/motion.cpp, source/encoder/motion.h, + source/encoder/slicetype.cpp, source/x265.cpp, source/x265.h, + source/x265opts.h: + api: add -m/--subme to adjust level of subpel refine + + This is very similar to x264, but ours has no effect on RD analysis, + it only affects the amount of HPEL and QPEL refinement for motion + estimation. 
+ [0f8c8a921d1b] + + * source/Lib/TLibCommon/TComPic.h, source/Lib/TLibEncoder/TEncTop.cpp, + source/x265.cpp, source/x265.h: + api: pass through a void* user data pointer through the encoder + [dcc5e822fb05] + + * source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/encoder/cturow.h, source/encoder/frameencoder.cpp: + cturow: move completed column counter back into CTURow class (from + TComPic) + [8e945ca22fd5] + + * source/x265.cpp: + cli: initialize input picture slice type + [507ad2b9aa37] + + * source/Lib/TLibEncoder/TEncTop.h: + TEncTop: remove dead meathod declaration + [836a1b2beb3f] + + * source/Lib/TLibEncoder/TEncTop.cpp: + TEncTop: prevent deadlocks on CTRL+C aborts + [e46a1ae90f30] + + * source/Lib/TLibEncoder/TEncTop.cpp, source/encoder/slicetype.cpp, + source/encoder/slicetype.h: + lookahead: do not leak pictures left in lookahead queues on abort + [1fd36612d2a9] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, source/encoder/frameencoder.cpp: + TComSlice: remove unused lambda variables + [090e76ad93c9] + + * source/encoder/frameencoder.cpp: + frameencoder: use a more refined depth for SAO + [065ada6a83f7] + + * source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.h, + source/encoder/frameencoder.cpp: + sao: fix a typo + [b7afed5ff5c4] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncTop.cpp, source/encoder/frameencoder.cpp: + slice: finally remove slice 'depth' parameter, use slice type to + approximate + + Instead of relying on a fixed depth cadence, apply all the high- + depth hacks to only B slices. 
+ [ea196d10f05e] + + * source/encoder/encoder.cpp: + log: concise is almost always better + [d337394a2f86] + + * source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp: + TEncSampleAdaptiveOffset: remove hungarian prefixes + [bc6e986c96cc] + + * source/encoder/frameencoder.cpp: + frameencoder: update a TODO comment + [aa958a34d2f5] + + * source/encoder/compress.cpp: + compress: enable SSD based merge decisions for --no-rdo path + [302f9fd4fb0c] + + * source/Lib/TLibEncoder/TEncCu.cpp, source/encoder/compress.cpp: + compress: cleanups + [791da0534bb7] + +2013-09-07 Aarthi Thirumalai + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/encoder/compress.cpp: + TEncCu: optimized merge routine xcomputeCostmerge2x2N + [3198b82d6709] + +2013-09-07 Steve Borho + + * source/common/common.cpp, source/encoder/encoder.cpp: + log: combine all threading info into one log line + [fd0c4026514b] + +2013-09-07 Min Chen + + * source/encoder/encoder.cpp, source/encoder/frameencoder.cpp, + source/encoder/framefilter.cpp: + framepp: Support NO-WPP + FrameParallelism Mode + [a376aeeb9638] + + * source/Lib/TLibEncoder/TEncBinCoderCABAC.h, + source/Lib/TLibEncoder/TEncSbac.h, source/encoder/frameencoder.cpp, + source/encoder/framefilter.cpp, source/encoder/framefilter.h: + framepp: let sao use own context + [c9fd5009e72a] + +2013-09-07 Steve Borho + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/encoder/frameencoder.cpp, source/encoder/frameencoder.h: + frameencoder: remove broadcast methods, inline row initializations + [57959ef033e4] + + * source/encoder/frameencoder.cpp, source/encoder/frameencoder.h: + frameencoder: remove unused accessor methods, inline scaling list + sets + [19571e08f69e] + + * source/encoder/frameencoder.h: + frameencoder: remove unused accessor methods + [a7b6d9b86b4a] + + * source/encoder/frameencoder.cpp, source/encoder/frameencoder.h: + frameencoder: inline resetEncoder + [470884b53ec3] + + * source/encoder/frameencoder.h: + 
frameencoder: improve comments + [392e8ebaba0b] + + * source/encoder/frameencoder.cpp, source/encoder/frameencoder.h: + frameencoder: nits + [859f9b557938] + + * source/Lib/TLibEncoder/TEncTop.cpp, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h: + frameencoder: move lambda initialization into the frame worker + thread + [831882d2f600] + + * source/Lib/TLibEncoder/TEncTop.cpp, source/encoder/dpb.cpp, + source/encoder/dpb.h: + dpb: we don't need to pass the FrameEncoder to DPB::prepareEncode() + [5e00016ee73b] + + * source/Lib/TLibEncoder/TEncTop.cpp, source/encoder/ratecontrol.cpp, + source/encoder/ratecontrol.h: + ratecontrol: avoid lookahead overhead for CQP + [a027cdf5a6e6] + + * source/encoder/ratecontrol.cpp: + ratecontrol: avoid some operations in CQP mode + [282d9b666254] + + * source/encoder/ratecontrol.cpp: + ratecontrol: remove redundant clip and double conversions + [61f44f7895c7] + +2013-09-07 Aarthi Thirumalai + + * source/encoder/ratecontrol.cpp: + ratecontrol: bug fix - qp for ABR cant exceed MAX_QP = 51 + [fb9aa649c13e] + +2013-09-06 Steve Borho + + * source/Lib/TLibEncoder/TEncTop.cpp: + TEncTop: lookAheadCost is a somewhat confusing variable name + [385c0b29be4f] + + * source/x265.cpp: + cli: assign poc on input pictures (ignored by encoder, but good + example) + [ef0c1d6fbd26] + + * source/common/vec/vec-primitives.cpp: + vec: simplify externs, fix MSVC compiler version dependencies + [090407717725] + + * source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncTop.cpp: + TEncCfg: use param.decodedPictureHashSEI directly without accessor + [84f9983a5de7] + + * source/Lib/TLibEncoder/TEncCfg.h, source/x265.h, source/x265opts.h: + api: rename bEnableDecodedPictureHashSEI (not a bool) + [381e8ea239cb] + + * source/Lib/TLibEncoder/TEncTop.cpp, source/x265.h: + api: add sliceType to output pictures + [79ade4825d33] + + * source/common/lowres.h, source/x265.h: + api: move slice type macros to public API + [1256b9f2678e] + + * 
source/x265.cpp: + x265: nits + [850a0ddfb3ae] + + * source/x265.cpp: + x265: print unrecognized short options as char instead of hex + [883098a93d61] + + * source/x265.cpp: + x265: emit a warning message if a short option is invalid + [5a14cf8e8a3b] + + * source/x265.cpp: + x265: remove -w short option for disabled weightp command + [e41fd1a98838] + +2013-09-06 Deepthi Devaki + + * source/encoder/slicetype.cpp: + slicetype: Handle Bframes immediately preceding an IDR correctly. + [aaeeb869133d] + + * source/common/lowres.cpp, source/common/lowres.h, + source/encoder/dpb.cpp, source/encoder/dpb.h, + source/encoder/slicetype.cpp: + Set all I frames as keyframes, change nalunittype decision + accordingly. + [d934cc573e08] + + * source/encoder/slicetype.cpp: + slicetype: uncrustified + [66b3f3b38e84] + + * source/encoder/slicetype.cpp: + slicetype: B-frames analysis integrated + [86603fe88df5] + + * source/encoder/slicetype.cpp: + slicetype: Remove unnecessary initialization + [9c4c2b1c7fe0] + +2013-09-06 Min Chen + + * source/common/vec/ipfilter8.inc, source/test/ipfilterharness.cpp: + review: improvement filterVertical_p_p and filterHorizontal_p_p + [0f79f2f96170] + +2013-09-06 Steve Borho + + * source/CMakeLists.txt: + cmake: MSVC does not understand -fPIC + [11e0362a9835] + +2013-09-06 Rafaël Carré + + * source/CMakeLists.txt, source/cmake/CMakeASM_YASMInformation.cmake: + Always use -DPIC/-fPIC when targetting x64 + + Fix link failure when making a shared object on Linux + [ae8499191c12] + +2013-09-06 Steve Borho + + * source/encoder/slicetype.cpp: + slicetype: nits + [63364b91b72a] + +2013-09-06 Deepthi Devaki + + * source/common/lowres.cpp, source/encoder/slicetype.cpp: + Scenecut detection functional with no bframes + [6b4a050476d1] + +2013-09-05 Steve Borho + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, + source/encoder/frameencoder.cpp, 
source/encoder/frameencoder.h: + TEncSearch: clip mvmax by a fixed ammount for frame parallelism + + When frame parallelism is enabled, we enforce at least merange + + interpolation half-length of reconstructed reference pixels to be + available in all reference frames before allowing the encoder to + begin compressing a given row. So we cannot allow the motion search + to extend beyond merange past the colocated ref block (regardless of + the MVP search starting location). + + We can't use the actual number of rows available because this + results in non- determinism. + [f3a583f4d0b1] + + * source/x265.cpp: + ppa: use scoped PPA event for writeNALs + [1b49fdef200e] + + * source/x265.cpp: + x265: remove redundant help output line + [8baa04aa131e] + + * source/encoder/frameencoder.cpp: + frameencoder: poke idle thread after enabling a row + + This ensures at least one worker thread sees the state transition + [69c280c82c83] + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/bitcost.cpp, + source/encoder/bitcost.h, source/encoder/slicetype.cpp: + bitcost: always use non-I lambda table to seed bitcost table + [2c525d4b2010] + + * source/encoder/frameencoder.cpp: + nit + [0a24b5f9c372] + +2013-09-03 Steve Borho + + * source/encoder/frameencoder.cpp: + frameencoder: start encoding CU rows once minimum reference row lag + is met + [9c5b11c33960] + +2013-09-05 Steve Borho + + * source/encoder/encoder.cpp: + encoder: declare destructor virtual + [54612105847b] + + * source/Lib/TLibEncoder/TEncTop.h: + TEncTop: nits + [25bc610d324e] + + * source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h: + TEncTop: statistics lock was no longer necessary, always runs in API + thread ctx + [5e42cbe10f98] + + * source/Lib/TLibEncoder/TEncTop.h: + TEncTop: cleanup + [b0ac25bf249a] + +2013-09-06 Aarthi Thirumalai + + * source/encoder/ratecontrol.cpp: + tighten rate control logic for ABR + [9266f876f316] + +2013-09-05 Min Chen + + * 
source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComMotionInfo.cpp, + source/Lib/TLibCommon/TComMotionInfo.h, + source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/encoder/framefilter.cpp: + improvement: replace compressMV by index mapping + [d6df26c90ee5] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComMotionInfo.cpp, + source/Lib/TLibCommon/TComMotionInfo.h, + source/encoder/frameencoder.cpp, source/encoder/framefilter.cpp: + framepp: move compressMV into loop + [b121e96290e2] + +2013-09-05 Gopu Govindaswamy + + * source/Lib/TLibCommon/SEI.h, source/Lib/TLibEncoder/SEIwrite.cpp, + source/encoder/frameencoder.cpp: + SEI: Replaced STL Classes + [bce4dbc57e09] + + * source/encoder/encoder.cpp: + encoder: Added pad size in Malloc + [842e8155f9dc] + +2013-09-05 Deepthi Nandakumar + + * source/common/common.cpp: + common: keyframeMax should always be >=0 + [c30558100e0d] + + * source/x265opts.h: + bOpenGOP: remove option openGOP, we only support closed GOP now; + clarify comment + [b3fe7c74c8ca] + + * source/encoder/encoder.cpp: + bOpenGOP: Open GOP has nothing to do with keyframe interval + [0a5662f7e27f] + + * source/x265.h: + bOpenGOP: clarify in comment + [550ae0ed71a2] + +2013-09-05 Steve Borho + + * source/Lib/TLibEncoder/TEncTop.cpp, source/encoder/frameencoder.cpp: + TEncTop: hoist compressFrame trigger back into encode() function + [3d5c461db4a7] + + * source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h: + frameencoder: set QP and lambda on the correct FrameEncoder + [84539616c439] + + * source/Lib/TLibEncoder/TEncTop.cpp: + TEncTop: hold a reference for frames connected to FrameEncoders + [3221580e71d6] + + * source/encoder/encoder.cpp: + encoder: uncrustify + [135b917429ac] + +2013-09-05 Gopu Govindaswamy + + * source/encoder/encoder.cpp: 
+ Encoder: removed STL Container classes + [b1e048696138] + +2013-09-05 Min Chen + + * doc/intra/T16.TXT, doc/intra/T32.TXT, doc/intra/T4.TXT, + doc/intra/T8.TXT, source/common/vec/intrapred.inc: + doc: intra all angles algorithm + [c05c9410a379] + +2013-09-05 Steve Borho + + * source/encoder/slicetype.cpp: + slicetype: handle odd keyframe intervals better + [8d73d4ffd725] + +2013-09-05 Gopu Govindaswamy + + * source/x265.cpp: + x265Cli: Removed std::string in log + [ce7faa93a060] + +2013-09-05 Steve Borho + + * source/PPA/ppaCPUEvents.h, source/encoder/frameencoder.cpp: + ppa: measure a different FrameEncoder method + [9b9c4b842d10] + +2013-09-04 Steve Borho + + * source/common/common.cpp, source/encoder/encoder.cpp, + source/x265opts.h: + encoder: make rate control mode implicit from other configs + [6fb582a6d95e] + + * source/Lib/TLibEncoder/TEncCfg.h, source/encoder/dpb.cpp, + source/encoder/encoder.cpp: + TEncCfg: remove fixed GOP structures + [8b79137bdd37] + + * source/Lib/TLibEncoder/TEncTop.cpp: + TEncTop: require --log 4 to dump hash strings + + I don't see them being useful for display except for regression + scripts + [a871b170dd2d] + + * source/encoder/dpb.cpp: + dpb: do not use fixed GOP cadence to select colocated direction + [80ba827e2150] + + * source/Lib/TLibEncoder/TEncCfg.h, source/encoder/encoder.cpp: + TEncCfg: remove unused dQP table + [a8a002ebcde2] + + * source/Lib/TLibEncoder/TEncCfg.h, source/encoder/encoder.cpp: + TEncCfg: remove obsolete configurables + [60c1adcbc34a] + + * source/encoder/dpb.cpp: + dpb: nits + [0a211206d496] + + * source/Lib/TLibEncoder/TEncTop.cpp, source/encoder/frameencoder.cpp, + source/encoder/ratecontrol.cpp: + do not configure QP and lambda twice + [77881138aa1d] + + * source/encoder/ratecontrol.cpp: + ratecontrol: assign qp correctly + [ce4c14ba9b85] + + * source/Lib/TLibEncoder/TEncTop.cpp, source/encoder/ratecontrol.cpp, + source/encoder/ratecontrol.h: + ratecontrol: move CQP support into our code + 
[7bcd5530d55a] + + * source/encoder/frameencoder.cpp: + FrameEncoder: remove min gop size from depth calculation + [9ba4683de281] + + * source/Lib/TLibEncoder/TEncTop.cpp: + TEncTop: hard code setNumRefIdxL0DefaultActive() and L1 default + + This was failing in debug with -i1 + [6a47835b91e1] + + * source/Lib/TLibEncoder/TEncTop.cpp, source/common/lowres.h, + source/encoder/dpb.cpp, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h, source/encoder/slicetype.cpp: + slicetype: remove gopIdx from lookahead output, simplify fake + lookahead + + CQP uses a simple offset for non-I slices (temporary hack) + [23911d66a504] + + * source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncEntropy.cpp, + source/Lib/TLibEncoder/TEncEntropy.h, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/encoder/compress.cpp, source/encoder/frameencoder.cpp: + TEncEntropy: remove hungarian prefixes + [b48bafd35b68] + + * source/Lib/TLibEncoder/TEncTop.cpp: + TEncTop: remove hungarian prefixes from function arguments + [99a1fb3c4730] + + * source/encoder/frameencoder.cpp, source/encoder/frameencoder.h: + FrameEncoder: stop passing pic pointer to methods, use m_pic + [ead51e579c13] + + * source/encoder/frameencoder.cpp: + FrameEncoder: remove extra white-space + [493d0dfc608a] + + * source/Lib/TLibEncoder/TEncTop.cpp: + TEncTop: remove trailing white-space + [ce33f819359b] + + * source/Lib/TLibEncoder/SyntaxElementWriter.cpp: + SEI: remove hungarian prefixes + [a2c84fdf74d7] + + * source/Lib/TLibCommon/TComSlice.cpp: + TComSlice: missed deletion + [43a27c6d551d] + +2013-09-04 Deepthi Devaki + + * source/Lib/TLibEncoder/TEncTop.cpp, source/encoder/dpb.cpp, + source/encoder/frameencoder.cpp, source/encoder/frameencoder.h, + source/encoder/slicetype.cpp: + using slicetypes from lookahead + [7df7dab02f10] + +2013-09-04 Min Chen + + * source/encoder/framefilter.cpp: + Fix bug in PCM mode + [6b2797b588c7] + +2013-09-04 Gopu Govindaswamy + + * 
source/Lib/TLibCommon/TComSlice.h, source/encoder/frameencoder.cpp: + TComSlice : Removed Unused std::vector Variable + [6ef1bd02dd48] + +2013-09-04 Steve Borho + + * source/common/common.cpp: + common: replace std::string with const char * + [70966790b40e] + + * source/CMakeLists.txt: + cmake: make -Werror on GCC a build option defaulting to off + + It can be enabled on build-bots, etc, but not block typical + development + [e694a5fbf527] + + * source/Lib/TLibEncoder/TEncTop.cpp, source/encoder/frameencoder.cpp: + coding style + [209566e9acc5] + +2013-09-04 sumalatha + + * source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibEncoder/TEncTop.cpp, source/encoder/frameencoder.cpp: + QP- Lambda table -x265_lambda2_tab_I , x265_lambda2_tab_non_ I : for + inter and intra blocks + [5f0f2466345f] + +2013-09-04 Deepthi Nandakumar + + * source/encoder/dpb.cpp: + dpb: encode structure limitations + + Min GOP size for IP config is 4, for B frames it is 8. Keyframe + interval should be a multiple of min GOP size. 
+ [a4cec6558ccc] + + * source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncTop.cpp, source/encoder/dpb.cpp, + source/encoder/frameencoder.cpp, source/encoder/slicetype.cpp: + TEncCfg: rename getGopSize to getGopSizeMin() + [375d9f3157c1] + + * source/Lib/TLibEncoder/TEncCfg.h, source/encoder/dpb.cpp, + source/encoder/encoder.cpp: + dpb: rename m_gopsize in TEncCfg.h to gopsizeMin + [9c558cf50fde] + + * source/common/common.cpp: + framepp: number of frame threads always less than lookahead depth + [06862133aecc] + +2013-09-04 Steve Borho + + * source/Lib/TLibCommon/TypeDef.h: + TypeDef: give explicit namespace to X265_DEPTH for 16bpp builds + [e7cfb49dc587] + +2013-09-03 Steve Borho + + * source/common/common.cpp: + common: prune default search length to 60, for optimal frame + parallelism + [3ca96381d4f0] + + * source/encoder/frameencoder.cpp: + frameencoder: clear enabled row bitmask at the start of each frame + [1e8a103ef4d5] + + * source/common/wavefront.cpp, source/common/wavefront.h: + wavefront: add a method to clear the enabled bitmask + [3fd72a7b33b8] + + * source/encoder/frameencoder.cpp: + frameencoder: remove TODO comment, it was already done + [e976fe0b62bd] + + * source/Lib/TLibCommon/TComPrediction.cpp: + TComPrediction: use pre-allocated temp buffer for weighted and + chroma MC + [6dde4a22e763] + + * source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComMotionInfo.h, + source/Lib/TLibCommon/TComPic.h, + source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuv.h, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/Lib/TLibCommon/TComYuv.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, + source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h, source/common/lowres.cpp, + 
source/common/primitives.cpp, source/common/vec/blockcopy.inc, + source/common/x86/asm-primitives.cpp, source/encoder/compress.cpp, + source/encoder/dpb.cpp, source/encoder/dpb.h, + source/encoder/encoder.cpp, source/encoder/frameencoder.h, + source/encoder/slicetype.cpp, source/test/intrapredharness.cpp, + source/test/intrapredharness.h, source/test/ipfilterharness.cpp, + source/test/ipfilterharness.h, source/test/mbdstharness.h, + source/test/pixelharness.cpp, source/test/pixelharness.h, + source/test/testharness.h, source/x265.cpp: + remove redundant x265 namespace qualifiers + [7f8b72995aee] + + * source/Lib/TLibCommon/TComDataCU.cpp: + tpyo + [ade4ff7e2cb8] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibEncoder/TEncCavlc.cpp, + source/Lib/TLibEncoder/TEncEntropy.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/encoder/frameencoder.cpp: + use X265_MIN/X265_MAX to avoid GCC warnings + [a68aded38e69] + + * source/Lib/TLibEncoder/TEncAnalyze.h, + source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncTop.cpp: + TEncAnalyze: cleanup hungarian notation and other issues + [6044f5d13785] + + * COPYING: + COPYING: add a mention to our available commercial license + [2fa5d1060ded] + + * source/Lib/TLibCommon/NAL.h, source/Lib/TLibCommon/SEI.h, + source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComLoopFilter.h, + source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComPattern.h, + source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuv.h, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h, + 
source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.h, + source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/Lib/TLibCommon/TComWeightPrediction.h, + source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/SEIwrite.cpp, + source/Lib/TLibEncoder/TEncBinCoderCABAC.h, + source/Lib/TLibEncoder/TEncCavlc.cpp, + source/Lib/TLibEncoder/TEncCavlc.h, + source/Lib/TLibEncoder/TEncCfg.h, source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncEntropy.cpp, + source/Lib/TLibEncoder/TEncEntropy.h, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.h, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSbac.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, + source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h, + source/Lib/TLibEncoder/WeightPredAnalysis.cpp, + source/Lib/TLibEncoder/WeightPredAnalysis.h, + source/encoder/compress.cpp, source/encoder/dpb.cpp, + source/encoder/frameencoder.cpp, source/encoder/frameencoder.h, + source/encoder/framefilter.cpp, source/test/intrapredharness.cpp: + reorg: replace HM's Bool typedef globally with C++ bool + [4cbd0c51baa9] + + * source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TypeDef.h: + reorg: replace HM's Float typedef globally with C float + [2feaea24b9ff] + + * source/Lib/TLibCommon/ContextModel3DBuffer.cpp, + source/Lib/TLibCommon/TComRdCost.h, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.h, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibCommon/TypeDef.h, + 
source/Lib/TLibEncoder/TEncAnalyze.h, + source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, + source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h, + source/Lib/TLibEncoder/WeightPredAnalysis.cpp, + source/encoder/encoder.cpp, source/encoder/frameencoder.cpp: + reorg: replace HM's Double typedef globally with C double + [84e4e7bd4ba8] + + * source/Lib/TLibCommon/ContextModel.cpp, + source/Lib/TLibCommon/ContextModel.h, + source/Lib/TLibCommon/ContextModel3DBuffer.cpp, + source/Lib/TLibCommon/ContextModel3DBuffer.h, + source/Lib/TLibCommon/NAL.h, source/Lib/TLibCommon/SEI.h, + source/Lib/TLibCommon/TComBitStream.cpp, + source/Lib/TLibCommon/TComBitStream.h, + source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComLoopFilter.h, + source/Lib/TLibCommon/TComMotionInfo.cpp, + source/Lib/TLibCommon/TComMotionInfo.h, + source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComPattern.h, + source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/Lib/TLibCommon/TComPicSym.cpp, + source/Lib/TLibCommon/TComPicSym.h, + source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuv.h, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.h, + source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/Lib/TLibCommon/TComWeightPrediction.h, + source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TComYuv.h, + 
source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/SEIwrite.cpp, + source/Lib/TLibEncoder/SyntaxElementWriter.cpp, + source/Lib/TLibEncoder/SyntaxElementWriter.h, + source/Lib/TLibEncoder/TEncBinCoder.h, + source/Lib/TLibEncoder/TEncBinCoderCABAC.cpp, + source/Lib/TLibEncoder/TEncBinCoderCABAC.h, + source/Lib/TLibEncoder/TEncBinCoderCABACCounter.cpp, + source/Lib/TLibEncoder/TEncBinCoderCABACCounter.h, + source/Lib/TLibEncoder/TEncCavlc.cpp, + source/Lib/TLibEncoder/TEncCavlc.h, + source/Lib/TLibEncoder/TEncCfg.h, source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncEntropy.cpp, + source/Lib/TLibEncoder/TEncEntropy.h, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.h, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSbac.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, + source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h, + source/Lib/TLibEncoder/WeightPredAnalysis.cpp, + source/Lib/TLibEncoder/WeightPredAnalysis.h, + source/common/TShortYUV.h, source/common/dct.cpp, + source/common/ipfilter.cpp, source/common/lowres.cpp, + source/common/vec/dct.inc, source/encoder/compress.cpp, + source/encoder/cturow.cpp, source/encoder/dpb.cpp, + source/encoder/dpb.h, source/encoder/encoder.cpp, + source/encoder/frameencoder.cpp, source/encoder/frameencoder.h, + source/encoder/slicetype.cpp: + reorg: replace HM's Int typedef globally with C int + [236d8e715ef2] + + * source/Lib/TLibCommon/ContextModel.h, + source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibCommon/TComWeightPrediction.cpp, + 
source/Lib/TLibCommon/TComYuv.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/common/TShortYUV.cpp, + source/common/dct.cpp, source/common/vec/dct.inc, + source/encoder/slicetype.cpp: + reorg: replace HM's Short typedef globally with C short + [e44a6950d22a] + + * source/Lib/TLibCommon/TComBitStream.cpp, + source/Lib/TLibCommon/TComBitStream.h, + source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComMotionInfo.cpp, + source/Lib/TLibCommon/TComMotionInfo.h, + source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuv.h, + source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/NALwrite.cpp, + source/Lib/TLibEncoder/SyntaxElementWriter.cpp, + source/Lib/TLibEncoder/SyntaxElementWriter.h, + source/Lib/TLibEncoder/TEncAnalyze.h, + source/Lib/TLibEncoder/TEncCavlc.cpp, + source/Lib/TLibEncoder/TEncCavlc.h, + source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncEntropy.h, + source/Lib/TLibEncoder/TEncSbac.h, + source/Lib/TLibEncoder/TEncTop.cpp, source/encoder/encoder.cpp: + reorg: replace HM's Char typedef globally with C char + [37b99feefbfd] + + * source/Lib/TLibCommon/ContextModel.cpp, + source/Lib/TLibCommon/ContextModel.h, + source/Lib/TLibCommon/ContextModel3DBuffer.cpp, + source/Lib/TLibCommon/ContextModel3DBuffer.h, + source/Lib/TLibCommon/SEI.cpp, source/Lib/TLibCommon/SEI.h, + source/Lib/TLibCommon/TComBitCounter.h, + source/Lib/TLibCommon/TComBitStream.cpp, + source/Lib/TLibCommon/TComBitStream.h, + source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComList.h, + source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComLoopFilter.h, + source/Lib/TLibCommon/TComMotionInfo.cpp, + source/Lib/TLibCommon/TComMotionInfo.h, + 
source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComPattern.h, + source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/Lib/TLibCommon/TComPicSym.cpp, + source/Lib/TLibCommon/TComPicSym.h, + source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuv.h, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibCommon/TComRdCost.h, + source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.h, + source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/Lib/TLibCommon/TComWeightPrediction.h, + source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TComYuv.h, + source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/NALwrite.cpp, + source/Lib/TLibEncoder/SEIwrite.cpp, + source/Lib/TLibEncoder/SEIwrite.h, + source/Lib/TLibEncoder/SyntaxElementWriter.cpp, + source/Lib/TLibEncoder/SyntaxElementWriter.h, + source/Lib/TLibEncoder/TEncAnalyze.h, + source/Lib/TLibEncoder/TEncBinCoder.h, + source/Lib/TLibEncoder/TEncBinCoderCABAC.cpp, + source/Lib/TLibEncoder/TEncBinCoderCABAC.h, + source/Lib/TLibEncoder/TEncBinCoderCABACCounter.cpp, + source/Lib/TLibEncoder/TEncBinCoderCABACCounter.h, + source/Lib/TLibEncoder/TEncCavlc.cpp, + source/Lib/TLibEncoder/TEncCavlc.h, + source/Lib/TLibEncoder/TEncCfg.h, source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncEntropy.cpp, + source/Lib/TLibEncoder/TEncEntropy.h, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.h, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSbac.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, + 
source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h, + source/Lib/TLibEncoder/WeightPredAnalysis.cpp, + source/Lib/TLibEncoder/WeightPredAnalysis.h, + source/common/TShortYUV.cpp, source/encoder/compress.cpp, + source/encoder/dpb.cpp, source/encoder/dpb.h, + source/encoder/frameencoder.cpp, source/encoder/frameencoder.h: + reorg: replace HM's Void typedef globally with C void + [1a9780eb8bea] + + * source/Lib/TLibCommon/TComCABACTables.cpp, + source/Lib/TLibCommon/TComCABACTables.h, + source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibEncoder/TEncBinCoderCABAC.cpp, + source/Lib/TLibEncoder/TEncBinCoderCABAC.h: + merge statically defined CABAC constant tables into TComRom + [25c5139fe205] + + * source/Lib/TLibEncoder/AnnexBwrite.h: + remove unused AnnexBwrite.h + [a867152e7071] + + * source/Lib/TLibCommon/AccessUnit.h, + source/Lib/TLibCommon/ContextModel.cpp, + source/Lib/TLibCommon/ContextModel.h, + source/Lib/TLibCommon/ContextModel3DBuffer.cpp, + source/Lib/TLibCommon/ContextModel3DBuffer.h, + source/Lib/TLibCommon/ContextTables.h, source/Lib/TLibCommon/NAL.h, + source/Lib/TLibCommon/SEI.cpp, source/Lib/TLibCommon/SEI.h, + source/Lib/TLibCommon/TComBitCounter.h, + source/Lib/TLibCommon/TComBitStream.cpp, + source/Lib/TLibCommon/TComBitStream.h, + source/Lib/TLibCommon/TComCABACTables.cpp, + source/Lib/TLibCommon/TComCABACTables.h, + source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComList.h, + source/Lib/TLibCommon/TComLoopFilter.h, + source/Lib/TLibCommon/TComMotionInfo.h, + source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComPattern.h, + source/Lib/TLibCommon/TComPic.h, + source/Lib/TLibCommon/TComPicSym.cpp, + source/Lib/TLibCommon/TComPicSym.h, + source/Lib/TLibCommon/TComPicYuv.h, + source/Lib/TLibCommon/TComPicYuvMD5.cpp, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h, + 
source/Lib/TLibCommon/TComRdCost.h, + source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.h, + source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/Lib/TLibCommon/TComWeightPrediction.h, + source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TComYuv.h, + source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/NALwrite.cpp, + source/Lib/TLibEncoder/NALwrite.h, + source/Lib/TLibEncoder/SEIwrite.cpp, + source/Lib/TLibEncoder/SEIwrite.h, + source/Lib/TLibEncoder/SyntaxElementWriter.cpp, + source/Lib/TLibEncoder/SyntaxElementWriter.h, + source/Lib/TLibEncoder/TEncAnalyze.h, + source/Lib/TLibEncoder/TEncBinCoder.h, + source/Lib/TLibEncoder/TEncBinCoderCABAC.cpp, + source/Lib/TLibEncoder/TEncBinCoderCABAC.h, + source/Lib/TLibEncoder/TEncBinCoderCABACCounter.cpp, + source/Lib/TLibEncoder/TEncBinCoderCABACCounter.h, + source/Lib/TLibEncoder/TEncCavlc.cpp, + source/Lib/TLibEncoder/TEncCavlc.h, + source/Lib/TLibEncoder/TEncCfg.h, source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncEntropy.cpp, + source/Lib/TLibEncoder/TEncEntropy.h, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.h, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSbac.h, + source/Lib/TLibEncoder/TEncSearch.h, + source/Lib/TLibEncoder/TEncTop.h, + source/Lib/TLibEncoder/WeightPredAnalysis.cpp, + source/Lib/TLibEncoder/WeightPredAnalysis.h, + source/common/TShortYUV.cpp, source/common/TShortYUV.h, + source/common/common.cpp, source/common/dct.cpp, + source/common/intrapred.cpp, source/common/ipfilter.cpp, + source/common/lowres.h, source/common/pixel.cpp, + source/common/primitives.cpp, source/common/reference.h, + source/common/vec/dct.inc, 
source/common/vec/intrapred.inc, + source/common/vec/ipfilter8.inc, source/encoder/cturow.h, + source/encoder/dpb.h, source/encoder/frameencoder.h, + source/encoder/framefilter.h, source/encoder/ratecontrol.h, + source/encoder/slicetype.h: + reorg: move HM classes and functions under x265 namespace + + This makes it possible to link x265 and HM into a single application + [aa94afeb8368] + +2013-09-03 Gopu Govindaswamy + + * source/common/ipfilter.cpp: + ipfilter : extendCURowColBorder() performance tuning + [30daf6927e11] + +2013-09-03 praveen Tiwari + + * source/common/vec/pixel8.inc: + pixel8.inc: VC9 fix for SAD_4, 'pextrd' replaced with 'movd' + [15c21c4da368] + +2013-09-03 Steve Borho + + * source/common/common.cpp: + common: log nit + [b27576343f8c] + + * source/common/common.cpp, source/common/wavefront.cpp, + source/encoder/dpb.h, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h, source/encoder/framefilter.cpp, + source/encoder/framefilter.h, source/encoder/slicetype.h: + uncrustify + [71ee73499db4] + + * source/common/vec/pixel8.inc: + pixel8: sad_x4_16 improvements + [2a2c74c9eb02] + + * source/common/vec/pixel8.inc: + pixel8: sad_x4_4 improvements + [7caa93beb17f] + + * source/common/vec/pixel8.inc: + pixel8: sad_x3_16 improvements + [9e5372fe1f6c] + + * source/common/vec/pixel8.inc: + pixel8: sad_x3_8 improvements + [92256af4cf8b] + + * source/common/vec/pixel8.inc: + pixel8: sad_x3_4 improvements + [ae93ad833228] + + * source/encoder/frameencoder.cpp: + frameencoder: fix initialization order + [d62d57018d3d] + + * source/encoder/slicetype.cpp: + slicetype: fix unsigned/signed comparison + [8e4e2f0c10a9] + + * source/common/lowres.h, source/encoder/slicetype.h: + lowres: move slice type defines into common/ + + headers in common/ should not include encoder/ headers + [07599fd4a497] + +2013-09-03 Deepthi Nandakumar + + * source/common/common.cpp: + framepp: Changing output message + [9a2eaa27b174] + +2013-09-03 Min Chen + + * 
source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuv.h, + source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, + source/Lib/TLibEncoder/TEncTop.cpp, source/common/common.cpp, + source/common/ipfilter.cpp, source/common/primitives.h, + source/common/wavefront.cpp, source/common/wavefront.h, + source/encoder/dpb.cpp, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h, source/encoder/framefilter.cpp: + framepp: Active frame parallelism + [bc1887f2bbc4] + +2013-09-02 Shazeb Nawaz Khan + + * source/encoder/slicetype.cpp: + Used slicetypeAnalyse() in slicetypeDecide() + + # HG changeset patch # User Shazeb Nawaz Khan + # Date 1378118764 -19800 # Mon Sep 02 + 16:16:04 2013 +0530 # Node ID + e4a57ae74dd96ceb31815a44098b81872e494fff # Parent + e2d93166e034040d61c897264e1dfe7aeeb3d661 Used slicetypeAnalyse() in + slicetypeDecide() + [2f9fcf768918] + + * source/encoder/slicetype.cpp: + Fixed slicetypeAnalyse() for no-B config + + # HG changeset patch # User Shazeb Nawaz Khan + # Date 1378118617 -19800 # Mon Sep 02 + 16:13:37 2013 +0530 # Node ID + e2d93166e034040d61c897264e1dfe7aeeb3d661 # Parent + 3ea029900ab3ee58ed6b16c5c5a0a89975ba8c03 Fixed slicetypeAnalyse() + for no-B config + [e4142126e8ac] + +2013-09-02 Gopu Govindaswamy + + * source/common/ipfilter.cpp, source/common/primitives.h, + source/common/vec/ipfilter.inc, source/common/vec/ipfilter8.inc, + source/test/ipfilterharness.cpp, source/test/ipfilterharness.h: + ipfilter : Removed unused filterHorizontalExtendCol from testbench + and primitive + [e3e4e2c33331] + +2013-09-02 praveen Tiwari + + * source/common/vec/pixel8.inc: + pixel8.inc: Optimization with sad_x4 4xn + [3ea029900ab3] + +2013-09-02 Deepthi Nandakumar + + * source/common/common.cpp: + slicetype: keyframe min and max values inported from x264 + [10ec000a0902] + + * 
source/common/common.cpp: + slicetype: default value for scenecutThreshold pulled in from x264 + [e347a519e060] + + * source/common/lowres.cpp, source/common/lowres.h, + source/encoder/slicetype.cpp: + slicetype: Double defined macros replaced. + + Todo: investigate whether its safe to replace the table with just I, + P, B (2,1,0) according to Table 7-7 + [d3a13581ef98] + +2013-09-01 Steve Borho + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h: + TComPrediction: pre-allocate intermediates temp buffer + [5c27dd5f8542] + + * source/Lib/TLibCommon/TComPrediction.cpp: + TComPrediction: simplify xPredInterLumaBlk + [98ea1b8fcfea] + + * source/encoder/motion.cpp: + motion: split mvcost from subpelCompare + [6948b2b8d1fd] + + * source/encoder/motion.cpp: + replace 64s with MAX_CU_SIZE + [7ce07886514a] + + * source/CMakeLists.txt, source/cmake/CMakeASM_YASMInformation.cmake, + source/common/vec/CMakeLists.txt, source/common/vec/vec- + primitives.cpp: + cmake: detect and handle Mac OS X 10.8 default compiler + + It can't build any of the vector primitives, but at least now it + generates a working x265 exectuable + [6410d56ae417] + +2013-07-25 Vittorio Giovara + + * source/x265.cpp, source/x265opts.h: + replace --width and --height with --input-res + [d41fadfab6de] + + * source/x265opts.h: + change 'rate' option to 'fps' + [20a4642c6f5b] + +2013-09-01 Steve Borho + + * source/encoder/motion.cpp: + motion: lowres qpel + [61b2b54bf21b] + + * source/encoder/motion.cpp: + motion: simplify subpel logic + [dab5d5e5e69c] + + * source/encoder/motion.cpp, source/encoder/motion.h: + motion: use lowresPlane for lowres subpel, make pixelcmp_t an + argument + [33900a0ca821] + + * source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuv.h, source/common/lowres.cpp, + source/common/reference.cpp, source/common/reference.h, + source/encoder/frameencoder.cpp, source/encoder/motion.cpp, + source/encoder/slicetype.cpp: + reference: 
remove lumaPlane[][] and prior calculation + [90af58daa658] + + * source/common/reference.cpp, source/common/reference.h, + source/encoder/motion.cpp, source/encoder/motion.h: + motion: cleanup subpel on-demand generation + [546d00cfe1fd] + + * source/encoder/motion.cpp: + motion: fix include slash + [c937dc7f6d8e] + + * source/common/ipfilter.cpp, source/common/pixel.cpp, + source/common/primitives.h, source/common/vec/ipfilter16.inc, + source/common/vec/ipfilter8.inc, source/common/vec/pixel8.inc: + primitives: use intptr_t for stride arguments + [a2ff0d818bdf] + +2013-09-01 Deepthi Nandakumar + + * source/Lib/TLibCommon/TComRdCost.cpp, + source/Lib/TLibCommon/TComRdCostWeightPrediction.cpp, + source/Lib/TLibCommon/TComRdCostWeightPrediction.h: + Merge + [c31b254a4bfc] + + * source/Lib/TLibCommon/TComPicYuv.cpp: + reference: no pre-generating reference planes + [35a513f3263b] + + * source/encoder/motion.cpp: + motion: replace intermediate values with a tmp buffer. + [0106d1f8d2a2] + + * source/encoder/motion.cpp: + motion: COST_QMV redefined + [95d92294576d] + + * source/common/reference.cpp, source/common/reference.h, + source/encoder/motion.cpp: + motion: correct strides in subpel buffer and intermediate values + buffer + [2cc40cfee287] + +2013-08-31 Gopu Govindaswamy + + * source/common/reference.cpp, source/common/reference.h, + source/encoder/motion.cpp, source/encoder/motion.h: + Motion :subpel Generation + [1032a9893607] + +2013-08-31 Steve Borho + + * source/Lib/TLibCommon/TComSlice.cpp: + TComSlice: fix init order for GCC + [0e0a822fd344] + + * source/Lib/TLibCommon/TComRdCost.h: + TComRdCost: add missing include of math.h (sqrt) + [86e371cdb75e] + + * source/CMakeLists.txt, source/cmake/mergestaticlibs.cmake, + source/common/CMakeLists.txt, source/encoder/CMakeLists.txt: + cmake: add ENABLE_STATICLIB build option (requires some black magic) + [02846fb1a082] + + * source/Lib/TLibCommon/TComRdCost.h: + TComRdCost: remove unused includes + [33efbdbb527c] + 
+ * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: remove buffer copy for full-pel SATD measurement + [043b414d4317] + + * source/Lib/TLibCommon/TComRdCost.cpp, + source/Lib/TLibCommon/TComRdCost.h, + source/Lib/TLibCommon/TComRdCostWeightPrediction.cpp, + source/Lib/TLibCommon/TComRdCostWeightPrediction.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + TComRdCost: remove distortion functions and + TComRdCostWeightPrediction + + TComRdCost is now just a pure-inline class with no CPP + [48e8b5c0fd9b] + +2013-08-30 Steve Borho + + * source/x265opts.h: + x265: disable weighted unipred CLI option + + The recent changes to TComPrediction::xPredInterLumaBlk() to + generate subpel on demand almost certainly busted weighted + prediction. + [326e3757c130] + + * source/x265opts.h: + x265: disable weighted bipred CLI option + [dd2e8492b873] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: simplify refPic initialization + [9d8b579bf811] + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + TEncSearch: use optimized satd primitives for bidir fractional + search + [aa7343de0a87] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: fix a typo + [7866a8b3d925] + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + TEncSearch: remove m_bc and inline xPatternSearchFracDIF + [5444249106a9] + + * source/Lib/TLibCommon/TComRdCost.cpp, + source/Lib/TLibCommon/TComRdCost.h, + source/Lib/TLibCommon/TComRdCostWeightPrediction.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp: + TComRdCost: remove step parameter, always 1, merge setDistParam + methods + [814c5a4538cf] + + * source/Lib/TLibCommon/TComPicYuv.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + TComPicYuv: remove getLumaFilterBlock() methods, generate on demand + in subpel + + TEncSearch::xPatternRefinement() is only used for bidir refinement, + and is on the short-list to be 
removed once bidir is optimized. + [29d2a7ce4c68] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: replace getLumaFilterBlock() use with fpel getLumaAddr() + + This returned address is stored in m_distParam but it never actually + used. The fref is overwritten before any calls to SATD + [f36e35862749] + + * source/encoder/slicetype.cpp: + slicetype: hard-code 8x8 CU size for intra predictions output buffer + [94d8f58137bc] + +2013-08-30 Deepthi Devaki + + * source/Lib/TLibCommon/TComSlice.cpp: + Fix decoder crash, Initialize m_numberOfReferencePictureSets to 0 + [6798c3d229ac] + +2013-08-30 Steve Borho + + * source/Lib/TLibCommon/TComRom.cpp: + TComRom: tabs to spaces + [242c4c511c4f] + +2013-08-30 Deepthi Nandakumar + + * source/encoder/ratecontrol.cpp: + ratecontrol: Slightly lower I frame qp to prevent quality drop. + [090de727cc5b] + + * source/Lib/TLibEncoder/TEncTop.cpp: + ratecontrol: minor edits + [ac1f425f844f] + + * source/Lib/TLibEncoder/TEncTop.cpp: + ratecontrol: fix compile error. + [06c30405d308] + +2013-08-30 Aarthi Thirumalai + + * source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibEncoder/TEncTop.cpp, source/common/common.cpp, + source/encoder/ratecontrol.cpp: + modified x265_lambda2_tab[] , adjusted some rc factors, modifed + chroma weight for chromaLambda + [fbf6fedd9818] + +2013-08-30 Deepthi Nandakumar + + * source/Lib/TLibCommon/TComPrediction.cpp: + interpolate: fix hash error bug introduced by block-ip filter. 
+ [87bb45fa2ff3] + +2013-08-30 Min Chen + + * source/common/ipfilter.cpp, source/common/reference.cpp: + interpolate: fix bug that generate error interpolate pixel in border + area + [291cbb41ab47] + +2013-08-29 Steve Borho + + * source/Lib/TLibCommon/TComPrediction.cpp: + TComPrediction: cleanup xPredInterLumaBlk + [8b78d8cff9d8] + + * source/Lib/TLibCommon/TComPrediction.cpp: + TComPrediction: rename file static variable + [cbf84884c34a] + +2013-08-29 Gopu Govindaswamy + + * source/Lib/TLibCommon/TComPrediction.cpp, source/common/reference.h: + TcomPridiction : IP calculation for a Block and modified the src + buffer + [341049cdcc5c] + +2013-08-29 Deepthi Nandakumar + + * source/common/lowres.cpp: + lookahead: Initialise I frame MBs to zero. This parameter gets + incremented in CUcost. + [4a5fd8756d8c] + +2013-08-29 Steve Borho + + * source/encoder/slicetype.cpp: + slicetype: fix eoln + [77d92ca1d183] + + * source/encoder/slicetype.cpp: + slicetype: do not allow edge CUs to contribute to frame cost totals + + (x264 does this, it seems to result in better cost estimates + overall) + [0632a39dd630] + + * source/encoder/slicetype.cpp: + slicetype: use satd for lowres intra estimate + [700603cdeb55] + +2013-08-28 Steve Borho + + * source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/encoder/dpb.cpp, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h, source/encoder/framefilter.cpp: + add partial framework for frame parallelism + [1ce2c5c87b19] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: use optimized sad_x4 for bidir full search + [b7c3aa334fda] + + * source/encoder/motion.cpp: + motion: fix a typo in full-search + [12addd543c0c] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: use optimized sad for bidir full search + [e06f1b5c456d] + + * source/common/CMakeLists.txt: + cmake: disable signed/unsigned comparisons from Intel C++ in common/ + [a60d5991e6aa] + + * source/common/vec/CMakeLists.txt: + cmake: 
disable some Intel warnings and errors we know are safe + [0a88b5cee672] + + * source/common/threadpool.cpp: + threadpool: fix a warning from InteL C++ compiler + [eba7f716c515] + + * build/icl/build-all.bat, build/icl/make-makefile.bat, build/icl32 + /build-all.bat, build/icl32/make-makefile.bat, build/icl64/build- + all.bat, build/icl64/make-makefile.bat: + cmake: split out 32bit and 64bit ICL nmake configurations + [5fded61de5e8] + +2013-08-28 Deepthi Devaki + + * source/Lib/TLibEncoder/TEncTop.cpp, source/encoder/dpb.cpp, + source/encoder/dpb.h: + Remove unused code related to dpb and rps + [cb3521c28879] + + * source/encoder/dpb.cpp: + dbp.cpp: white-space nits + [eab25d12ece9] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, source/encoder/dpb.cpp, + source/encoder/dpb.h: + Move dpb related functions from TComSlice to DPB + [762bf799dca2] + +2013-08-28 Steve Borho + + * source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h: + ratecontrol: we have the param for frame parallelism + [476f363c87f9] + + * source/encoder/framefilter.cpp: + framefilter: use explicit 0 and 1 in lieu of Windows only FALSE and + TRUE + [7de6d0cde087] + +2013-08-28 Deepthi Nandakumar + + * source/encoder/framefilter.cpp: + Backout: breaks VC build + [bdb54195f558] + +2013-08-28 Rafaël Carré + + * source/encoder/framefilter.cpp: + Use C++ true/false rather than Windows TRUE/FALSE + + Fix build on Linux + [707c41aa5a63] + +2013-08-27 Shazeb Nawaz Khan + + * source/encoder/dpb.cpp: + A minor fix to computeRPS integration + + # HG changeset patch # User Shazeb Nawaz Khan # Date 1377595358 + -19800 # Tue Aug 27 14:52:38 2013 +0530 # Node ID + 8d26028f32cc9a2e5f09882368370689e1f317a0 # Parent + bf72e539f3d8e25ff7fe1bc176ea1516c60a73d3 A minor fix to computeRPS + integration + [0de7523f43fc] + + * source/encoder/dpb.cpp, source/encoder/frameencoder.cpp: + Integrating computeRPS to encoder + + # HG changeset patch # User Shazeb Nawaz Khan # Date 1377594214 
+ -19800 # Tue Aug 27 14:33:34 2013 +0530 # Node ID + bf72e539f3d8e25ff7fe1bc176ea1516c60a73d3 # Parent + 5245113fa0d66de8933b18ca03fffde4f3fbdef0 Integrating computeRPS to + encoder + [527e8a0a0f9e] + +2013-08-28 Deepthi Nandakumar + + * source/x265opts.h: + ratecontrol: adding bitrate to CLI options + [6ec33774f06a] + + * source/encoder/ratecontrol.cpp: + ratecontrol: adding inits + [62ce96dd8080] + + * source/encoder/ratecontrol.cpp: + ratecontrol: minor edits + [5c042746f711] + + * source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h: + ratecontrol: fps member var renamed as framerate + [159d27a7c7d3] + +2013-08-27 Steve Borho + + * source/Lib/TLibEncoder/TEncTop.cpp: + TEncTop: white-space nits + [ea85b67907ca] + + * source/encoder/dpb.cpp: + dpb: fix "statement has no effect" warning + [2cf36835f6e1] + + * source/Lib/TLibCommon/TComSlice.cpp: + TComSlice: fix initialization order + [27d87f071a92] + +2013-08-27 Deepthi Nandakumar + + * source/Lib/TLibEncoder/TEncTop.cpp: + ratecontrol: Merge + [d006294891d0] + + * source/Lib/TLibEncoder/TEncTop.cpp: + ratecontrol: RateControl methods only when ABR is enabled. 
+ [a165b52487ca] + +2013-08-27 Steve Borho + + * source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h, source/encoder/ratecontrol.cpp: + ratecontrol: fix header warning, replace tabs with spaces, + uncrustify + [f2041d164c74] + +2013-08-27 Deepthi + + * source/x265opts.h: + ratecontrol: correcting help message, Abr=0 and constQp=1 + [20c6a69a2fe1] + + * source/encoder/ratecontrol.cpp, source/x265.h, source/x265opts.h: + Adding rc-mode to CLI option list + [80baf6900512] + +2013-08-27 sumalatha + + * source/Lib/TLibEncoder/TEncTop.cpp: + computeLambdaForQP - using the formulae of HM directly to get the + lambda values + [ff4044130a96] + +2013-08-27 Shazeb Nawaz Khan + + * source/Lib/TLibCommon/TComSlice.h, source/encoder/dpb.cpp, + source/encoder/dpb.h: + Adding support for RPS generation (not integrated yet) + + # HG changeset patch # User Shazeb Nawaz Khan # Date 1377594122 + -19800 # Tue Aug 27 14:32:02 2013 +0530 # Node ID + 5245113fa0d66de8933b18ca03fffde4f3fbdef0 # Parent + 273b1face64ce9e0c391713165776c773ec54774 Adding support for RPS + generation (not integrated yet) + [71dedcc6922b] + +2013-08-27 sumalatha + + * source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h, source/encoder/ratecontrol.cpp, + source/encoder/ratecontrol.h: + Included the computeLambdaqp()-- reset the lambda based on new QP + [abd8a7189096] + +2013-08-27 Deepthi + + * source/encoder/frameencoder.cpp: + ratecontrol: Remove dqp, not relevant until AQ is implemented. Even + then, dqp is an LCU parameter. 
+ [76ee630587a9] + +2013-08-26 Steve Borho + + * source/encoder/slicetype.cpp, source/encoder/slicetype.h: + slicetype: rename member vars for clarity + [273b1face64c] + + * source/common/reference.cpp: + reference: correctly handle frames not an even multiple of max CTU + height + [ee3ddeb3414d] + + * source/common/lowres.cpp, source/common/lowres.h, + source/encoder/slicetype.cpp: + lowres: use 8x8 blocks for lookahead analysis + [56110b3e965d] + + * source/encoder/dpb.cpp: + dpb: add default case for switch, remove trailing white-space + [407d8c61698d] + + * source/common/reference.cpp: + reference: fix member variable shadowing reported by GCC + [5cff93d45be5] + + * source/common/vec/pixel8.inc: + pixel: more explicit HAVE_MMX logic, fixes link errors with VC11 x64 + [8a6859929055] + +2013-08-26 praveentiwari + + * source/common/vec/pixel8.inc: + pixel8.inc: Avoiding overlap of _MSC_VER macro + [00873cc0099b] + + * source/common/vec/pixel8.inc: + pixel8.inc: sad_x3_4 further optimization + [015f93435ac3] + + * source/common/vec/pixel8.inc: + pixel8.inc: sad_x4_8 further optimization + [ff2a7b2dbfbf] + + * source/common/vec/pixel8.inc: + pixel8.inc: Further optimization + [22ccca70061e] + + * source/common/vec/pixel8.inc: + pixel8.inc: sad_x3_8 further optimization + [64d346fc559e] + + * source/common/vec/pixel8.inc: + pixel8.inc: sad_x3_4 further optimization + [faeb85aedd4a] + + * source/common/vec/pixel8.inc: + pixel8.inc: sad_x4_16, vector replaced with intrinsic + [af46df46431c] + + * source/common/vec/pixel8.inc: + pixel8.inc: sad_x4_8, vector replaced with intrinsic + [1736618230bc] + + * source/common/vec/pixel8.inc: + pixel8.inc: sad_x4_4, enabled sad_x4_4 code for 32-build except VC + [80577235dba0] + + * source/common/vec/pixel8.inc: + pixel8.inc: sad_x3_8 enabled MMX code for 32-bit build except vc + [47cf292c6619] + + * source/common/vec/pixel8.inc: + pixel8.inc:sad_x3_4 enabled MMX code for 32-build except VC + [63af120cd6cc] + + * 
source/common/vec/pixel8.inc: + pixel8.inc: sad_8 enabled MMX code for 32-build except VC + [5c38e12afe9a] + + * source/common/vec/pixel8.inc: + pixel8.inc: sad_4 cleanup with macro + [52e0b9869886] + + * source/common/vec/pixel8.inc: + pixel8.inc: Enabled MMX code for 32-bit build except VC [added macro + and swap postions of sse and MMX funtions] + [6b7000546c20] + +2013-08-26 Steve Borho + + * source/encoder/dpb.cpp, source/encoder/dpb.h: + dpb: pass free list by reference (fixes memory leaks) + + Passing by copy was resulting in TEncTop::m_freeList never being + used. recycleUnreferenced() was adding the free pictures to a list + that was destroyed as soon as the function returned. + [da8281ec88ac] + +2013-08-26 Deepthi + + * source/test/ipfilterharness.cpp, source/test/ipfilterharness.h: + Testharness: Removing filterV/HMultiplane from test harness. + [a37e03a3f0b6] + +2013-08-26 Min Chen + + * source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuv.h, source/common/ipfilter.cpp, + source/common/primitives.h, source/common/reference.cpp, + source/common/reference.h, source/common/vec/ipfilter.inc: + framepp: row based interpolate + [63e0736c1f87] + +2013-08-26 Deepthi + + * source/encoder/dpb.cpp: + DPB::prepareEncode - replace code snippet with switch case + [055c97bd9b7d] + + * source/Lib/TLibEncoder/TEncTop.cpp, source/encoder/dpb.cpp: + TEncTop: Move FrameEncoder->initSlice() to TEncTop::encode + [3dd7b6da60bb] + + * source/Lib/TLibEncoder/TEncTop.cpp: + TEncTop: (Redundant) setPOC to max value + [2f2acd2fc7ad] + +2013-08-25 Steve Borho + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: refIdx typo corrected + [797c13ec5d2a] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: remove redundant bestIdx initialization + [1f532e886a49] + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + TEncSearch: remove bFilled argument from xEstimateMvPredAMVP (always + false) + [a8b4963087b5] + + * 
source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: nit + [9e020c0883c6] + + * source/common/vec/pixel8.inc: + pixel: use unaligned loads for fref pixels + [f05a6d740ede] + +2013-08-25 Min Chen + + * source/common/vec/ipfilter8.inc: + Improvement filterHorizontalMultiplaneExtend + [1ea25137e5cc] + +2013-08-25 praveentiwari + + * source/common/vec/pixel8.inc: + pixel8.inc: Uncrustified + [5d0c9e08265e] + + * source/common/vec/pixel8.inc: + pixel8.inc: sad_x4_4 optimized assingment operators + [8af27ce24619] + + * source/common/vec/pixel8.inc: + pixel8.inc: sad_x3 64-bit build fail fixed + [21981597572b] + + * source/common/vec/pixel.inc, source/common/vec/pixel8.inc: + pixel8.inc: sad_x4_4 vector replaced with intrinsic + [827cfb10feba] + + * source/common/vec/pixel.inc: + uncrustified pixel.inc + [1306bdb29227] + + * source/common/vec/pixel8.inc: + pixel8.inc: new line cleanup + [34db6fb0cfbc] + + * source/common/vec/pixel8.inc: + pixel8.inc: sad_x3_8 avoiding extra condition check + [86a5d7ceae78] + + * source/common/vec/pixel8.inc: + pixel8.inc: sad_x3_16 cleanu some newlines + [83af948acce4] + + * source/common/vec/pixel8.inc: + pixel8.inc: sad_x3_4 more optimization + [43d8ec7a9193] + + * source/common/vec/pixel8.inc: + pixel8.cpp: sad_x3_16 vector replaced with intrinsic + [aade693715b3] + + * source/common/vec/pixel8.inc: + pixel8.inc: sad_x3_8 integrated fast MMX 32-bit build code + [22b6b2ad15c1] + + * source/common/vec/pixel8.inc: + pixel8.inc: sad_x3_4 integrated faster MMX code for 32-bit build + [828ec0701097] + + * source/common/vec/pixel8.inc: + pixel8.inc: Uncrustified + [8f4e4d104dc0] + +2013-08-25 Steve Borho + + * source/encoder/CMakeLists.txt: + cmake: force -march=i686 for framefilter.cpp + [5281d5789986] + + * source/encoder/ratecontrol.cpp: + ratecontol: fix case sensitive includes + [55d7edaa3e8e] + +2013-08-23 Steve Borho + + * source/common/vec/pixel.inc, source/common/vec/pixel8.inc: + pixel: new 16xN and x3 intrinsic primitives require 
SSE4.1 + [c881d82f9d85] + +2013-08-23 praveentiwari + + * source/common/vec/pixel8.inc: + sad[16xN]:Intrinsic + [97d0506cb0db] + + * source/common/vec/pixel8.inc: + pixel8inc: sad_x3 8xN, replace vector with intrinsic + [9e7e1c772ec3] + + * source/common/vec/pixel8.inc: + sad_x3_4[4Xn]:Intrinsic + [5805f5a34497] + +2013-08-23 Steve Borho + + * source/common/threading.h, source/common/wavefront.cpp: + threading: rename CLZ64 to CTZ64 and reverse bit search for GCC + + __builtin_ctzll(x) - Returns the number of trailing 0-bits in x, + starting at the least significant bit position. If x is 0, the + result is undefined + [8cbc34f927b6] + + * source/common/common.h, source/encoder/ratecontrol.cpp: + common: make log2 functions safe for GCC, use multiply instead of + divide + [9da03fb899de] + + * source/common/vec/pixel.inc, source/common/vec/pixel8.inc: + pixel: sad primitives now require SSE4.1, properly wrap #pragma + [58dcf14afa79] + +2013-08-23 Deepthi + + * source/common/vec/pixel8.inc: + pixel8inc: sad 4xN, replace vector with intrinsic [Praveen] + [b489dfb2e90a] + + * source/common/vec/pixel8.inc: + pixel8inc: sad 8xN, replace vector with intrinsic [Praveen] + [eaf94c74774f] + + * source/common/vec/pixel8.inc: + pixel8inc: Uncrustify [Praveen] + [20bbd0ad6c1b] + + * source/common/ipfilter.cpp, source/common/primitives.h, + source/common/vec/ipfilter.inc, source/common/vec/ipfilter8.inc, + source/test/ipfilterharness.cpp, source/test/ipfilterharness.h: + FilterExtendCURow: add primitives for horizontal filtering of each + row: Gopu + [77b53186d568] + + * source/encoder/ratecontrol.cpp: + ratecontrol: replacing more log2 calculations with macro + [c99cc112fd71] + + * source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h: + ratecontrol: move RCEnd back to TEncTop::encode + [e187433abd5e] + + * source/encoder/ratecontrol.cpp: + Avoid conversion to float + [a7fad933a6c3] + + * source/common/common.h, source/encoder/ratecontrol.cpp: + log2 define from 
x264 + [e306203049cf] + +2013-08-23 Aarthi + + * source/encoder/ratecontrol.cpp: + fixed bugs in ABR mode - implemeted log2() instead of log() as + required. + [41f20253a3b8] + +2013-08-22 Steve Borho + + * source/encoder/ratecontrol.cpp: + ratecontrol: include all enums in switch statement, avoid GCC + warning + [77418bf4a67b] + + * source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.h: + sao: remove dead rdoSaoUnitAll function + [763165c09029] + + * source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h: + ratecontrol: stub in partial code for CQP + [4fd6bd86ff92] + + * source/encoder/ratecontrol.cpp: + ratecontrol: initialize rce to NULL, prevent invalid free + [12199cf44193] + + * source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp: + sao: fix GCC warnings about autos shadowing member variables + [633718701cc9] + +2013-08-21 sumalatha + + * source/encoder/ratecontrol.cpp: + ratecontrol: added comments, corrected some errors in code + [1d890ba79f64] + + * source/Lib/TLibEncoder/TEncTop.cpp: + made changes to function call - rateControlEnd + [1498b673e95a] + +2013-08-22 Min Chen + + * source/CMakeLists.txt: + fix bug in WinXP mode + [9f38435eee26] + + * source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.h, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.h, + source/encoder/frameencoder.cpp, source/encoder/frameencoder.h, + source/encoder/framefilter.cpp, source/encoder/framefilter.h: + framepp: Parallelism of SAO (saoLcuBasedOptimization mode only) + [2b321cbfb953] + + * source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.h: + framepp: replace static class array countPreDblk and + offsetOrgPreDblk + [d11de19b9d26] + + * source/encoder/framefilter.cpp, source/encoder/framefilter.h: + improvement by replace lock to atom operator + [0e2c2ab41bd9] + 
+2013-08-20 Steve Borho + + * source/encoder/slicetype.cpp: + slicetype: do not use backwards L1 for slicetype estimates + + The lookahead engine can't really handle it. + [6fe2b6e1fd6f] + + * source/encoder/dpb.cpp: + dpb: ensure bframe count same as m_gopSize when fixed GOP is in use + [4bc719a5e735] + + * source/encoder/slicetype.cpp: + slicetype: remove redundant break statements + [5b67acbe7871] + + * source/encoder/slicetype.h: + slicetype: re-order member variables by size + [9994e722c6bc] + + * source/Lib/TLibEncoder/TEncTop.cpp, source/common/common.cpp, + source/encoder/frameencoder.cpp, source/x265.h, source/x265opts.h: + x265: move qp parameter within rc struct, use QP logging line for + rate control + [779ac0d75231] + + * source/common/common.cpp, source/encoder/encoder.cpp: + common: combine WPP logging lines + [3bc38a85503a] + + * source/common/common.cpp: + common: add a summary log line for lookahead configuration + [566f30410df1] + + * source/common/common.cpp: + common: shorten "enabled coding tools" to "tools" + [8cb425d6dbc4] + +2013-08-20 Min Chen + + * source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.h, + source/encoder/framefilter.cpp: + cleanup: remove unused function and process row once + [af98ad50dd96] + + * source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.h, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp: + cleanup: move temporary pointer from class + [b26267a38367] + + * source/encoder/frameencoder.cpp, source/encoder/frameencoder.h: + fix bug correct row delay with '--sao-lcu-bounds 1' + [52ba7c428cef] + + * source/encoder/frameencoder.h, source/encoder/framefilter.cpp, + source/encoder/framefilter.h: + cleanup: remove reduce m_sad since we are single thread now + [9ba42b518d6b] + + * source/encoder/frameencoder.cpp, source/encoder/framefilter.cpp, + source/encoder/framefilter.h: + framepp: simplify FrameFilter control 
logic + [d3d6119d5662] + + * source/common/threading.h, source/common/threadpool.cpp, + source/common/wavefront.cpp: + move ATOM operators to threading.h + [b016dce3b990] + +2013-08-20 Steve Borho + + * source/Lib/TLibEncoder/TEncTop.cpp, source/encoder/slicetype.cpp, + source/encoder/slicetype.h: + slicetype: add a method to get estimated frame cost (calculated if + not cached) + [2a1e1e3fcba2] + + * source/encoder/dpb.cpp: + dpb: remove obsolete TODO + [24fc61094b55] + + * source/encoder/dpb.cpp: + dpb: remove redundant calls to setNumRefIdx() + [16f5434cf8e4] + + * source/encoder/ratecontrol.cpp: + ratecontrol: nit + [c177b592a915] + + * source/encoder/dpb.cpp, source/encoder/frameencoder.cpp: + dpb: move setNumRefIdx from frameEncoder to DPB for clarity + [6e45cb567792] + +2013-08-19 Steve Borho + + * source/encoder/slicetype.cpp: + slicetype: move special case logic for POC zero out of "fake" + category + [f5834b464a7c] + + * source/encoder/slicetype.cpp: + slicetype: initialize row satd sums at each row, delay + bIntraCalculated set + [fb9f092e8291] + + * source/encoder/slicetype.cpp, source/encoder/slicetype.h: + slicetype: malloc prediction buffer to avoid stack size issues + [c4cd8d9d6c8c] + + * source/encoder/slicetype.cpp: + slicetype: x and y loops were reversed + [8157dcac7865] + + * source/encoder/slicetype.cpp, source/encoder/slicetype.h: + slicetype: auto-variable cleanup + [da6f7e555f0d] + + * source/encoder/slicetype.cpp, source/encoder/slicetype.h: + slicetype: do_search is a bool as well + [43f633c7b553] + + * source/common/lowres.cpp, source/common/lowres.h, + source/encoder/slicetype.cpp: + lowres: remove redundant stride variable, use + ReferencePlanes::lumaStride + [e5eeb053c459] + + * source/encoder/slicetype.cpp: + slicetype: remove redundant assignment + [e3fe0e2caa77] + + * source/encoder/slicetype.cpp, source/encoder/slicetype.h: + slicetype: rename last_keyframe to lastKeyframe + [d06cdce272ee] + + * source/encoder/slicetype.cpp, 
source/encoder/slicetype.h: + slicetype: use cfg->param structure directly, do not copy data items + [d568082adb02] + + * source/encoder/slicetype.cpp, source/encoder/slicetype.h: + slicetype: copy bFrameBias from param structure + [c6a47d383128] + + * source/encoder/slicetype.cpp: + slicetype: fixup bIntraPenalty type + [75833aca1872] + + * source/encoder/slicetype.h: + slicetype: remove unused analyze_keyframe variable + [84fa8299c4e2] + + * source/encoder/slicetype.cpp: + slicetype: estimate SATD cost of first I frame + [60fbdb64ed0c] + + * source/encoder/slicetype.cpp, source/encoder/slicetype.h: + slicetype: realScenecut is a bool + [2cc3f1686119] + + * source/encoder/slicetype.cpp: + slicetype: add disabled "real path" + [b250cbbd95ed] + + * source/encoder/slicetype.cpp, source/encoder/slicetype.h: + slicetype: change slicetypeAnalyze argument type to bool, fix + cuCount + [7703d11126e8] + + * source/encoder/slicetype.cpp: + slicetype: set lowres.frameNum in addPicture() + [5d5247ccf3fb] + + * source/encoder/slicetype.cpp: + slicetype: fix order of operations, remove obsolete comment + [bafc37c130d7] + + * source/encoder/slicetype.cpp: + slicetype: improve history comment + [92e84207edc0] + +2013-08-19 Min Chen + + * source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/encoder/frameencoder.cpp, source/encoder/framefilter.cpp, + source/encoder/framefilter.h: + framepp: Refactor loopfilter thread + [800465af4795] + +2013-08-19 Steve Borho + + * source/common/wavefront.cpp: + wavefront: use _BitScanForward64 on Windows to get proper bit + priority order + + Spotted and fixed by Min Chen + [fc12faa1672e] + + * source/Lib/TLibEncoder/TEncTop.cpp, source/encoder/dpb.cpp: + dpb: move the pushBack of the frame into DPB::prepareEncode() + [6bd04d658885] + + * source/Lib/TLibEncoder/TEncAnalyze.h, + source/Lib/TLibEncoder/TEncTop.cpp: + TEncAnalyze: rename setFrmRate to setFrameRate + [ab8e517b3f5c] + + * source/Lib/TLibEncoder/TEncTop.cpp, + 
source/Lib/TLibEncoder/TEncTop.h, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h: + TEncTop: use multiple frame encoders (they do not overlap in + execution time) + [cf68474b7fd4] + + * source/Lib/TLibEncoder/TEncTop.cpp, source/common/common.cpp, + source/common/threadpool.h, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h, source/x265.cpp, source/x265.h, + source/x265opts.h: + x265: add -F/--frame-threads CLI option, param->frameNumThreads + + This required allocating an array of FrameEncoder instances, which + required adding a method for initializing the thread pool after + construction. + [84c939999179] + + * source/Lib/TLibCommon/TComPic.h, source/common/CMakeLists.txt, + source/common/lookahead.cpp, source/common/lookahead.h, + source/common/lowres.cpp, source/common/lowres.h, + source/encoder/ratecontrol.cpp, source/encoder/slicetype.cpp: + rename lookahead.cpp to lowres.cpp (and also header) + [9a5d62cf19f1] + + * source/Lib/TLibCommon/TComPic.h, source/common/lookahead.cpp, + source/common/lookahead.h, source/encoder/ratecontrol.cpp, + source/encoder/ratecontrol.h, source/encoder/slicetype.cpp, + source/encoder/slicetype.h: + lookahead: rename LookaheadFrame struct to Lowres, statically + allocate frames[] + [f242ef373949] + + * source/common/CMakeLists.txt: + cmake: add COPYING to common project to remove any ambiguity + [c4963929c680] + + * source/Lib/TLibCommon/TComSlice.cpp: + TComSlice: nit + [c6ffd3ab9493] + + * source/PPA/ppaCPUEvents.h, source/encoder/dpb.cpp, + source/encoder/frameencoder.cpp: + ppa: resolve event names which conflict with method names, add + DPB_encodeSlice + [1adb33d3536a] + +2013-08-18 Steve Borho + + * source/common/lookahead.cpp: + lookahead: re-initialize lowresMvs by dereferencing first MV (nit) + [5b97ea55042f] + + * source/encoder/cturow.cpp: + cturow: remove prefixes from loop autos + [e0b4698761fd] + + * source/encoder/cturow.cpp: + cturow: remove pc prefix from rdSbacCoder + 
[aaf4327d1be3] + + * source/common/lookahead.cpp: + lookahead: fix intraCost data type for malloc + [662d24a28312] + + * source/common/lookahead.cpp: + lookahead: re-enable downscale, ensure lowres stride is multiple of + 32 + [d907e9a55ad1] + + * source/encoder/slicetype.cpp: + slicetype: simplify intra processing + [6badaa4d5e5f] + + * source/encoder/slicetype.cpp: + slicetype: improve comments + [7146682f5133] + + * source/encoder/slicetype.cpp: + slicetype: remove warning disables now that functionality is all + present + [2048f9324890] + + * source/encoder/slicetype.cpp: + slicetype: update authors now that x264 routines are removed + [a1c1cbd257ff] + + * source/encoder/motion.cpp: + motion: remove hungarian prefixes from temp vars + [51d8d4962ab9] + + * source/common/mv.h: + mv: switch from class to struct since member vars do not have m_ + prefix + [5b7014113b29] + + * source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuv.h, + source/Lib/TLibCommon/TComPrediction.cpp, + source/common/lookahead.cpp, source/common/reference.cpp, + source/common/reference.h, source/encoder/motion.cpp, + source/encoder/slicetype.cpp: + reference: remove m_ prefix from member variables + [2a2569c73bdb] + + * source/common/reference.h: + reference: convert ReferencePlanes from class to struct + + I want to remove m_ prefix, and a policy of m_ for class but not for + struct seems reasonable + [bf0b1cb360f9] + + * source/Lib/TLibEncoder/TEncTop.cpp, source/common/primitives.cpp: + ContextModel: move static initialization out of TEncTop constructor + [e0e620037485] + + * source/Lib/TLibCommon/ContextModel.cpp, + source/Lib/TLibCommon/ContextModel.h: + ContextModel: variable name cleanup + [86fbb4094143] + + * source/common/lookahead.h, source/encoder/slicetype.cpp, + source/encoder/slicetype.h: + slicetype: move cuWidth and cuHeight from LookaheadFrame to + Lookahead + [e3b40ff627b7] + + * source/encoder/slicetype.cpp: + slicetype: simplify intra cost check + 
[e1fe931856ad] + + * source/encoder/motion.cpp: + motion: avoid QPEL refinement during lookahead + + Our QPEL plane pointers simply reference the nearest HPEL plane so + doing any QPEL refine would be a waste of time, and add rnadom bits. + We should add an H.264 (A+B+1)>>1 primitive and then use this to + create fake QPEL blocks for refinment during lookahead. + [c1e1e8e43299] + + * source/common/lookahead.cpp, source/common/reference.h: + reference: add lowres flag + [6702102c2691] + + * source/encoder/slicetype.cpp: + slicetype: establish lowres search bounds + [db1d199e9f8e] + +2013-08-16 Min Chen + + * source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.h, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.h, + source/Lib/TLibEncoder/TEncTop.cpp, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h, source/encoder/framefilter.cpp, + source/encoder/framefilter.h: + framepp: fix bug with '--sao-lcu-bounds 1' and move part of sao into + compress loop + [9cecd693489c] + +2013-08-16 praveentiwari + + * source/common/vec/sse.inc: + sse_pp8: Eliminated shift + [87f93fe917f6] + +2013-08-16 Steve Borho + + * source/encoder/slicetype.cpp: + slicetype: more varname cleanups + [a5a265b57844] + + * source/encoder/slicetype.cpp: + slicetype: fix GCC nits + [442dd01cacb1] + + * source/encoder/slicetype.cpp: + slicetype: nits + [41b35c287a2f] + + * source/encoder/slicetype.cpp, source/encoder/slicetype.h: + slicetype: fill missing detail, use x265 varname style, remove old + functions + [815a3cce3969] + + * source/encoder/slicetype.cpp: + slicetype: fix another cu_size + [4c4e30671ad7] + + * source/encoder/slicetype.cpp: + clicetype: uncrustify + [0a5a90a0899e] + +2013-08-16 ggopu + + * source/common/lookahead.h, source/encoder/slicetype.cpp, + source/encoder/slicetype.h, source/x265.h: + slicetype: Added slicetypeAnalyse + [6551f04e7eed] + +2013-08-16 Steve 
Borho + + * source/Lib/TLibCommon/TComTrQuant.cpp: + TrComQuant: fix 8bpp build + [c2f5275ecb49] + + * source/Lib/TLibCommon/TComTrQuant.cpp: + TrComQuant: lastpos formal parameter unreferenced + [16feaf4ec711] + + * source/common/vec/dct.inc: + dct: prevent compilation of quant for < SSE4.1 (fixes GCC build) + [8a0228a12a84] + +2013-08-16 Min Chen + + * source/Lib/TLibEncoder/TEncSearch.cpp: + idct: more dc mode detect code + [76ca2eae8f0b] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h: + cleanup: remove unused code invRecurTransformNxN + [983cc77e8526] + + * source/Lib/TLibCommon/TComTrQuant.cpp, source/common/pixel.cpp, + source/common/primitives.h, source/common/vec/pixel.inc: + idct: primitive blockfil_s for dc fill + [23b8aafb1fde] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibEncoder/TEncSearch.cpp: + idct: Improved performance by DC only block detect + [0b225ee24b5d] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/dct.cpp, + source/common/primitives.h, source/common/vec/dct.inc, + source/test/mbdstharness.cpp: + generate lastPos in quant + [4be95d676094] + + * source/common/vec/dct.inc, source/test/mbdstharness.cpp: + quant: Improved performance by SSE4 + [681ab201ea0c] + + * source/encoder/slicetype.cpp: + temp for Intra Buffer generate + [ec4a9d8a39b1] + +2013-08-16 Steve Borho + + * source/encoder/ratecontrol.cpp: + tpyo + [1cf5ed68ab58] + + * source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h: + ratecontrol: move defines within CPP file + [4a39b2fa427d] + + * source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h: + ratecontrol: move math methods to CPP file as file statics + [1f2577034e53] + +2013-08-16 sumalatha + + * source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h, source/encoder/ratecontrol.cpp, + source/encoder/ratecontrol.h: + 
integrate ratecontrol methods and changed some APIs + [fe0adfe9d10d] + +2013-08-16 Steve Borho + + * source/Lib/TLibEncoder/TEncTop.cpp: + TEncTop: GCC needs math.h for log10() + [a1a2eceec263] + + * source/encoder/encoder.cpp: + encoder: add a comment explaining the odd nature of encoder.cpp + [e8d616dbbd16] + + * source/Lib/TLibEncoder/TEncCfg.h, source/encoder/CMakeLists.txt, + source/encoder/dpb.cpp, source/encoder/encoder.cpp, + source/encoder/encoder.h: + dpb: move fixed GOP initialization function into dpb, drop encoder.h + [b77b7dc21de9] + + * source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h, source/encoder/CMakeLists.txt, + source/encoder/dpb.cpp, source/encoder/dpb.h, + source/encoder/frameencoder.cpp, source/encoder/frameencoder.h: + dpb: split DPB logic and data from TEncTop into a separate class + + The DPB class is still using the HM's fixed GOP mess, but at least + this is all mostly localized in one place so it can be cleaned up + together and has a clean interface. 
+ [6d14b538caa6] + + * source/encoder/frameencoder.cpp, source/encoder/frameencoder.h: + frame: inline wait_lft and re-order code for more clarity + [8f25a0cde46f] + + * source/encoder/frameencoder.cpp, source/encoder/frameencoder.h: + frame: simplify determineSliceBounds + [96d870433517] + +2013-08-15 Steve Borho + + * source/encoder/frameencoder.cpp: + frame: nit + [7efbd7bb9606] + + * source/encoder/frameencoder.cpp: + frame: remove unused oneBitstreamPerSliceLength + [646346ad4fe1] + + * source/encoder/frameencoder.cpp: + frame: replace access method calls with direct variable accesses + [59d41d6da41c] + + * source/encoder/frameencoder.cpp, source/encoder/frameencoder.h: + frame: rename compressSlice to compressCTURows() + [8a322b6c90ec] + + * source/encoder/frameencoder.cpp: + frame: hoist stat file logging into compressFrame() + [175ae0c43c10] + + * source/encoder/frameencoder.cpp: + frame: isolate CU processing functionality into compressSlice() + + Other unrelated functionality pulled up into compressFrame() + [8137fa713b09] + + * source/encoder/frameencoder.cpp: + frame: move xStoreWPparam() together with rest of weightp analysis + [4262ef8c2495] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h, + source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h, source/encoder/cturow.cpp, + source/encoder/encoder.cpp, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h, source/encoder/ratecontrol.h: + remove TEncGOP, merge remaining pieces into FrameEncoder + [f2bdd05b440d] + + * source/Lib/TLibEncoder/SEIwrite.cpp, + source/Lib/TLibEncoder/SyntaxElementWriter.cpp, + source/Lib/TLibEncoder/SyntaxElementWriter.h, + source/Lib/TLibEncoder/TEncCavlc.cpp, + source/Lib/TLibEncoder/TEncCavlc.h: + SEIWrite: varname cleanups + [268b3e920752] + + * source/Lib/TLibEncoder/SEIwrite.h: + SEIWrite: remove only unused data member + [4de5d74111ee] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h, 
+ source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h: + TEncGOP: move hash and PSNR calculations to TEncTop + + Much of the function needs to use singleton data elements in TEncTop + anyway, and very little of it needs the frame encoder. + [c98c3ad68604] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: move digestToString so it doesn't need forward decl + [bbcc53d4c501] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h: + TEncGOP: remove unused xGetFirstSeiLocation + [885718e12a06] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h: + TEncGOP: inline xAttachSliceDataToNalUnit, only used once + [b5027d04ff40] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h: + TEncGOP: inline SEI creation functions + + I see no need to alloc/free them + [ea53a8153f61] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h: + TEncGOP: drop HRD/DU logic and SEI message generation + + This HRD logic is going to be broken badly by frame parallelism, and + we do not allow it to be enabled, in any case, so I'm removing this + code before it can be broken. 
+ [029ba5a2f11e] + + * source/Lib/TLibEncoder/TEncTop.cpp, source/encoder/frameencoder.cpp: + TEncTop: move ASR initialization to compressSlice() + [12bf524e02d3] + +2013-08-16 Min Chen + + * source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.h, + source/Lib/TLibEncoder/TEncGOP.cpp: + framepp: move xPCMRestoration* outside from TComSampleAdaptiveOffset + [84c4b829261b] + + * source/x265.cpp: + framepp: loopfilter and sao need same control value + [15e1795cbc78] + + * source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.h: + cleanup: remove unused code TComSampleAdaptiveOffset::processSaoCu + [3a0630fff639] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + cleanup: remove unused code + [b1cb3809070d] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + cleanup: remove unused code + [ddc1969b819a] + + * source/encoder/frameencoder.cpp: + fix VC9 compile error + [b079ba16208f] + +2013-08-15 Steve Borho + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h: + TEncGOP: unify hash and PSNR reporting into one routine + [23d8d29c5242] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h: + TEncGOP: remove redundant argument to xCreateSEIActiveParameterSets + [d51950a93f5d] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h, + source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h: + TEncTop: move prepareEncode and all RPS/DPB logic out of TEncGOP + [6dc06a8b5ebc] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: reorder some code for more clarity + [bc8d3239f833] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h, + source/Lib/TLibEncoder/TEncTop.cpp: + TEncGOP: prune unused cruft + [1b745ef91669] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: remove access var only used twice + [225ba0172ae5] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: use m_cfg in lieu of 
m_top where applicable + [76b2e8aa1c18] + + * source/Lib/TLibEncoder/TEncCu.h, source/Lib/TLibEncoder/TEncGOP.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.h, source/encoder/encoder.cpp, + source/encoder/frameencoder.h: + Remove unused TEncSlice + [95afc59fb926] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.h, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h: + TEncSlice: move xDetermineStartAndBoundingCUAddr to FrameEncoder + + And give a less idiotic name + [7aa3ea09223d] + + * source/encoder/frameencoder.cpp: + frameencoder: include math.h for GCC + [ca1a412ecaad] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.h, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h: + TEncSlice: move encodeSlice into FrameEncoder + [31676f5d2e19] + + * source/encoder/frameencoder.cpp: + frameencoder: enqueue framefilter after frameencoder + + This gives higher priority to compressCU jobs over deblocking jobs + [9d68f6d61ecf] + + * source/encoder/frameencoder.cpp: + frameencoder: simplify motion reference generation + [d1e0f1613380] + + * source/encoder/frameencoder.cpp: + frameencoder: nits + [e03a200c073c] + + * source/encoder/CMakeLists.txt, source/encoder/cturow.cpp, + source/encoder/cturow.h, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h: + cturow: split CTURow class into its own cpp and h + [93a8be5a9247] + + * source/encoder/CMakeLists.txt, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h, source/encoder/framefilter.cpp, + source/encoder/framefilter.h: + framefilter: split FrameFilter class into its own cpp and h + + frameencoder is getting crowded with TEncSlice methods being + integrated, and it is only going to grow as TEncGOP is broken up. 
+ [f4bbb38f427d] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.h, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h: + TEncSlice: move compressSlice logic into FrameEncoder + [b057e7325398] + + * source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.h, source/encoder/frameencoder.cpp: + TEncSlice: prune unnecessary cruft + [e160aef8a3a1] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.h: + TEncSlice: inline SAR configuration into prepareEncode() + [76636a121054] + + * source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.h: + TEncSlice: remove unused resetQP method + [19e8c9de3746] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.h, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h: + move TEncSlice::initEncSlice to FrameEncoder::initSlice + [ec371fd8c2a9] + + * source/Lib/TLibEncoder/TEncSlice.cpp, + source/encoder/frameencoder.cpp, source/encoder/frameencoder.h: + frameencoder: retrieve slice from pic + [2f2413f173f1] + + * source/Lib/TLibEncoder/TEncSlice.h, source/encoder/frameencoder.h: + more include cleanups + [26c84a647f51] + + * source/Lib/TLibEncoder/WeightPredAnalysis.h: + WeightPredAnalysis: simplify includes + [d0332c1c4efa] + + * source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.h: + TEncSlice: remove unused xGetQPValueAccordingToLambda + [88762f5981b5] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: report slice depth in verbose logging + [e184ead1cb2e] + + * source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncSlice.cpp, source/encoder/encoder.cpp: + encoder: remove unexposed m_recalculateQPAccordingToLambda + [32b7db776b91] + + * source/Lib/TLibCommon/CommonDef.h, + source/Lib/TLibEncoder/TEncSlice.cpp: + CommonDef: remove unnecessary 
HB_LAMBDA_FOR_LDC + [61fe1722b31d] + + * source/Lib/TLibEncoder/TEncSlice.cpp: + TEncSlice: re-order logic for more clarity + [4b57d673354b] + + * source/Lib/TLibEncoder/TEncSlice.cpp: + TEncSlice: remove obsolete comments for initEncSlice() + [6a1ed114c77a] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: re-order some functions by use + [08245cb48490] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.h: + TEncSlice: do not pass pocCurr to initEncSlice + [8cc6ca8a86df] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h, + source/Lib/TLibEncoder/TEncTop.cpp, source/PPA/ppaCPUEvents.h: + TEncGOP: split DPB/RPS management into prepareEncode() function + [449a3b84fb3e] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h: + TEncGOP: remove unnecessary pointer argument reference + [1f8f44007b88] + + * source/encoder/encoder.cpp: + encoder: use true/false to assign bool values + [312b7e281b1a] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h, + source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h, source/encoder/encoder.cpp: + TEncTop: getStreamHeaders always returns one access unit + [5c22574b8dd9] + +2013-08-14 Steve Borho + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h, + source/Lib/TLibEncoder/TEncTop.cpp: + TEncGOP: rename m_frameEncoders to singular + [735d3ee0ef97] + + * source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h: + TEncTop: remove m_picsEncoded + [90fe33838e63] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h: + TEncGOP: move utility functions to the end of the file, cleanups + [b34a889f4388] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: increment m_totalCoded appropriately, improve comments + [80f6de28afb2] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h: + TEncGOP: remove 
unused ref pic fields + [823a9a1ae579] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: use configured bitrate instead of hard-coded value + [7b435f8feaad] + + * source/Lib/TLibEncoder/TEncGOP.h: + TEncGOP: prepare to bifurcate TEncGOP + [d7ecc2be68b7] + + * source/Lib/TLibEncoder/TEncCfg.h: + TEncCfg: drop unused m_framesToBeEncoded + [b6debda1b70d] + + * source/Lib/TLibEncoder/TEncTop.cpp: + TEncTop: fix recon file write with 16bpp builds + [ba0badc108ed] + + * source/encoder/ratecontrol.cpp: + ratecontrol: fix vc10 release build + [a506ed0b9704] + + * source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h: + ratecontrol: clean compile for GCC - not sure all these changes are + correct + [e8a95b63af2b] + + * source/encoder/slicetype.cpp: + slicetype: more fixes + [c0f68e202961] + + * source/encoder/slicetype.cpp: + slicetype: include list used in lowresCosts + + 0 - intra 1 - inter L0 2 - inter L1 3 - bidir (future) + [0182e9c3abe4] + + * source/encoder/slicetype.cpp: + slicetype: fix I frame cost accumulation, use min to clamp + lowresCosts + [c744de283468] + + * source/common/lookahead.cpp, source/common/lookahead.h: + lookahead: add intraCost array + [c2a650e052a6] + + * source/encoder/slicetype.cpp: + slicetype: fixup intra cost and combinations + [ab0751792c3c] + +2013-08-14 ggopu + + * source/encoder/slicetype.cpp, source/encoder/slicetype.h: + slicetype : estimateCUCost enhancement for selecting best cost + [29acacb4afe8] + +2013-08-14 Steve Borho + + * source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h, + source/x265.h: + ratecontrol: use doubles, fix warnings + [74b11e67f664] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: re-order some code for clarity + [29635c591048] + + * source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h: + ratecontrol: uncrustify + [52a709ce88a8] + + * source/common/common.cpp: + common: prevent warnings about double-to-float conversions + [f3ec4ebd72c9] + + * source/encoder/CMakeLists.txt: + 
cmake: add ratecontrol to the build + [5c7f8a82e267] + + * source/Lib/TLibEncoder/TEncGOP.cpp, source/common/common.cpp, + source/encoder/encoder.cpp, source/encoder/ratecontrol.h, + source/x265.h: + ratecontrol: nits + [4dfc091ce662] + +2013-08-14 sumalatha + + * source/common/common.cpp, source/common/common.h, + source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h, + source/encoder/slicetype.h, source/x265.h: + add rate control parameters to x265_param_t make corresponding + changes to ratecontrol.cpp and .h moved the macro definition of + QP_BD_OFFSET from slicetype.cpp to common.h + [d1d0d90ec2f1] + +2013-08-14 Steve Borho + + * source/Lib/TLibEncoder/TEncTop.cpp: + TEncTop: fix some obsolete comments + [2c89a497dc5b] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.h, + source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h, source/encoder/encoder.cpp, + source/encoder/slicetype.cpp, source/encoder/slicetype.h, + source/x265.cpp, source/x265.h: + x265: remove GOP compress cadence, encode frames one at a time + + it is still using the HM's fixed GOP schedule, but the encoder only + encodes one frame per encode call, and returns at most one encoded + frame. 
+ [11cce4de927a] + +2013-08-13 Steve Borho + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibEncoder/TEncTop.cpp: + TComSlice: delay clearing of motion references until TComPic is + recycled + [a2026f0e1556] + + * source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h: + TEncTop: allocate frames as needed, recycle unreferenced frames from + pic list + [965dbb6e0880] + + * source/Lib/TLibCommon/TComPic.h, source/common/lookahead.cpp, + source/common/lookahead.h: + lookahead: add slicetype decision outputs + [45743a306237] + + * source/common/lookahead.cpp: + lookahead: hide unused formal parameter, prevent compiler warning + [b4ff6dca63fa] + + * source/common/lookahead.cpp, source/common/lookahead.h: + lookahead: fix EOLN, move methods to CPP file, disable downscale + again for now + [b8ad0352ce45] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: don't pretend the gop encoder has multiple frame encoders + [c34463e78f61] + + * source/Lib/TLibEncoder/TEncTop.cpp, source/output/y4m.cpp, + source/output/y4m.h, source/output/yuv.cpp, source/x265.h: + recon: allow reconstructed images to be output in non-display order + + By returning the POC of the recon image, the output file writers can + seek to the appropriate location in the output file before writing + the frame data + [1d710404b8bb] + + * source/Lib/TLibCommon/TComPic.cpp, source/common/lookahead.cpp, + source/common/lookahead.h: + lookahead: move remaining create logic into create method + [90c11204554b] + + * source/common/lookahead.cpp: + lookahead: simplify create() method slightly + [506ee1b3ad69] + + * source/Lib/TLibCommon/TComPic.cpp: + TComPic: add missing call to lowres.create() + [8c468308ab8d] + + * source/Lib/TLibEncoder/TEncTop.cpp, source/common/lookahead.h: + lookahead: move downscale and extend logic into lowres.init() + [e3bd326a9f6f] + + * source/Lib/TLibCommon/TComPic.h: + TComPic: add sliceType member variable for lookahead 
output + indication + + The TComPic doesn't have a TEncSlice associated with it until encode + time + [74cc4a1bfd2c] + + * source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/Lib/TLibEncoder/TEncTop.cpp, source/common/CMakeLists.txt, + source/common/lookahead.cpp, source/common/lookahead.h: + lookahead: isolate buffer allocation and release logic into + LookaheadFrame class + [79c8806844e7] + + * source/encoder/slicetype.cpp, source/encoder/slicetype.h: + slicetype: remove incorrect comments + [aea1b324270c] + + * source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h, source/encoder/CMakeLists.txt, + source/encoder/slicetype.cpp, source/encoder/slicetype.h: + TEncTop: allocate a singleton lookahead instance + [9a5b30cb1224] + + * source/x265opts.h: + x265: no camelcase CLI options + [e866ff4da277] + + * source/x265opts.h: + x265: fix help for bFrameBias + [2f1c5bb1d471] + + * source/x265.cpp: + x265: show boolean flag defaults even if they have a short-option + [f94fa416ac32] + + * source/x265opts.h: + x265: re-order boolean flags for more logical groupings + [c7cfe440ace3] + + * source/x265opts.h: + x265: remove obsolete default documentation + [76775075d70d] + +2013-08-13 ggopu + + * source/x265.cpp: + x265 cli: Added default values to the CLI help text + [d374ab339bb1] + +2013-08-13 Steve Borho + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h: + TEncTop: remove hungarian gc prefix from frame statistics members + [7caea03eee1a] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: remove unnecessary scope level + [03782d7def51] + + * source/Lib/TLibEncoder/TEncTop.h: + TEncTOP: forward decl Lookahead and ThreadPool + [d65cc8602bec] + + * source/encoder/encoder.cpp: + encoder: prevent warnings for open-gop config + [f3e7ad1ce870] + + * source/encoder/slicetype.cpp: + slicetype: only calculate lowres intra costs once per input picture + [9eb065201f31] + + * 
source/encoder/slicetype.cpp: + slicetype: rename tmp to predictions + [c9b3cd336732] + + * source/Lib/TLibCommon/TComPic.cpp, + source/Lib/TLibEncoder/TEncTop.cpp, source/common/lookahead.h: + lookahead: re-initialize lowres state for each new picture + [2751158af327] + + * source/encoder/slicetype.cpp: + slicetype: fix prediction and sa8d arguments + [78096e525541] + + * source/encoder/motion.h: + motion: expose COST_MAX + [cce777ac5df4] + +2013-08-13 ggopu + + * source/encoder/motion.h, source/encoder/slicetype.cpp: + slicetype: satd cost analysis bug fixed + [682a9fba1363] + +2013-08-13 Steve Borho + + * source/common/common.cpp, source/encoder/encoder.cpp: + encoder: repair open-gop behavior + + it was broken when we replaced uint getKeyframeInterval() with + signed int param.keyframeInterval + [560f4c14f09a] + + * source/encoder/slicetype.cpp: + slicetype: add comment to make uncrustify happy + [69cd47d7f82d] + + * source/common/lookahead.h: + lookahead: remove unused macro + [b87942b6386e] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp, source/common/common.cpp, + source/encoder/encoder.cpp, source/x265.h, source/x265opts.h: + x265: add lookahead parameters to x265_param_t + [30c4a908cf65] + +2013-08-12 Steve Borho + + * source/common/lookahead.h: + lookahead: reorder and document LookaheadFrame + [b9c7aa48d921] + + * source/encoder/slicetype.cpp: + slicetype: rename X264_LOOKAHEAD_QP + [c8fff4b1bec8] + + * source/encoder/slicetype.cpp: + slicetype: use hex search in lookahead + [8d7ffa0d7433] + +2013-08-12 ggopu + + * source/encoder/slicetype.cpp: + slicetype : Uncrustify + [9f604820f7ef] + +2013-08-12 Steve Borho + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: cleanup + [415f3a0ae098] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: move adaptive search range determination a bit later + [2ac81e7d676c] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: use REF_PIC_LIST enums directly + [58b2989ec91e] + + * 
source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: move access unit allocation closer to bitstream generation + + This further isolates the slice initialization code + [7057a8e3603d] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: wrap long line + [4294560b0494] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: use m_cfg to reference scaling list type + [4d5f7e8888c1] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: rename iRef to ref + [14f9aa8dbe2e] + + * source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncGOP.cpp, source/encoder/encoder.cpp: + TEncGOP: TMVP has been hard-coded enabled for some time + [2181954a2d68] + + * source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncGOP.cpp, source/encoder/encoder.cpp: + TEncGOP: drop file parsing of scaling lists, simplify slice + initialization + [7afeef6f2a9a] + +2013-08-11 ggopu + + * source/encoder/slicetype.cpp: + slicetype: added x265_median_mv + [8438cad92049] + +2013-08-12 Steve Borho + + * source/CMakeLists.txt: + cmake: add Windows-only option to make binary compatible with + Windows XP + [e6ad89277005] + +2013-08-11 Steve Borho + + * source/input/yuv.cpp: + yuv: delay signed int cast until after division by picture size + + This should fix reported negative frame count problems + [368c080859bd] + + * source/Lib/TLibEncoder/TEncTop.cpp: + TEncTop: GOP encoders are no longer allocated as an array + + As reported by JMK + [14a47a6c1ec1] + +2013-08-11 Aarthi + + * source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h: + uncrustify, code cleanup , continue porting x264 rate control to + x265. 
+ [4085c992877a] + +2013-08-11 Steve Borho + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp: + TEncGOP: move slice reference flag from TEncSlice to TEncGOP + + Move it together with the rest of the code that determines slice and + NAL type + [7b25809b3fe4] + + * source/test/pixelharness.cpp: + pixelharness: revert memcmp counts to full block size + + I had changed a number of these to only compare the actual block + size processed by the primitive, but this was wrongly checking the + first n x m pixels instead of the n x m pixels in the upper left + corner of the output buffer (the stride of the buffers was always + 64). So it was comparing some of the wrong pixels, and not all of + the proper ones. Comparing all 64x64 pixels is harmless, and + actually verifies the primitive isn't writing beyond the correct + outputs. + [607eedcf3212] + +2013-08-09 Steve Borho + + * source/encoder/encoder.cpp: + encoder: nit + [88e0c10bf47b] + + * source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncSlice.cpp, source/encoder/encoder.cpp: + encoder: drop unused loop filter offsets from fixed GOP struct + [b496ddb751ac] + + * source/encoder/frameencoder.cpp: + frameencoder: fix member initialization order for GCC + [83def5041252] + + * source/encoder/slicetype.cpp: + slicetype: fix GCC warnings and EOLN damage and one bug + [69d4bdb68535] + +2013-08-09 sumalatha + + * source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h: + Uncrustify ratecontrol.cpp and ratecontrol.h + [5f10bcb999ce] + +2013-08-09 praveentiwari + + * source/common/dct.cpp, source/common/vec/ipfilter8.inc, + source/test/mbdstharness.cpp, source/test/mbdstharness.h: + Added name to worked files + [8a7256f288de] + + * source/common/vec/sse.inc: + sse.inc: Added name to contributed file + [93520e0c6df3] + + * source/common/vec/dct.inc: + dct.inc: Added name to contributed file + [d8777ff60029] + +2013-08-09 Steve Borho + + * source/common/lookahead.h, 
source/encoder/slicetype.cpp: + slicetype: fill in more lookahead detail + [63e629a504d1] + +2013-08-08 Steve Borho + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncTop.cpp: + mark places where m_gopList is used + + aka, mark where the demolition charges are set + [14e5d42ea283] + +2013-08-08 Min Chen + + * source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComLoopFilter.h, + source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/Lib/TLibEncoder/TEncGOP.cpp, source/PPA/ppaCPUEvents.h, + source/encoder/frameencoder.cpp, source/encoder/frameencoder.h: + framepp: thread of loopfilter + [facd861224d6] + +2013-08-08 Steve Borho + + * source/common/vec/pixel.inc: + pixel: allow intel compiler to generate AVX2 intrinsics + [95e6b13e7122] + + * source/common/vec/pixel.inc: + pixel: disable AVX2 instrinsic SAD primitives for VC11 + + They are failing unit tests, and causing odd encoder crashes that + appear to be related to aligned loads. We need ASM versions of these + functions. 
+ [235519423524] + + * source/test/ipfilterharness.cpp: + ipfilter: and some more tabs + [ee7f9c2593fd] + + * source/test/ipfilterharness.cpp: + ipfilter: nuke some tabs + [d2f925d4fdb3] + + * source/Lib/TLibCommon/TComCABACTables.h: + TComCABACTables: fix ICL warning about order of variable modifiers + [820b7497f118] + + * source/test/ipfilterharness.cpp: + ipfilter: malloc output buffers for testbench - prevents ICL stack + overflow + [9130a070955e] + + * source/common/vec/sse.inc: + sse: variable renames to match pixelcmp style + [eb4da8a89357] + +2013-08-08 praveentiwari + + * source/common/vec/sse.inc: + sse_pp64: +1x over last commit + [e272439497e7] + + * source/common/vec/sse.inc: + sse_pp48: +1x over last commit + [5c0700683875] + + * source/common/vec/sse.inc: + sse_pp32: Better performance result + [14c2bcce6de1] + + * source/common/vec/sse.inc: + sse_pp24: Better performance result + [f03aca71c486] + + * source/common/vec/sse.inc: + sse_pp16: Better performance result + [e91911135862] + + * source/common/vec/sse.inc: + sse_pp12: Replced costly psrli + pmovzx and psrli + add + [307ade9f9aab] + + * source/common/vec/sse.inc: + sse_pp8: Replaced 'vpbroadcastd' with cheaper 'pxor' instruction + [11aed2197707] + +2013-08-08 sumalatha + + * source/encoder/ratecontrol.cpp, source/encoder/ratecontrol.h: + add (incomplete) ratecontrol.cpp and ratecontrol.h + [9974f57f56a1] + +2013-08-08 praveentiwari + + * source/common/vec/ipfilter8.inc: + filterVertical_s_p: VC9 fix + [5881b6f1e81a] + +2013-08-08 Steve Borho + + * source/encoder/CMakeLists.txt: + cmake: add slicetype.cpp to the encoder build + [b4fcf168c5d0] + + * source/Lib/TLibCommon/TComPic.h: + TComPic: nit + [dce4535983f4] + +2013-08-08 ggopu + + * source/common/lookahead.h, source/encoder/slicetype.cpp: + slicetype: modified lookahead structure and added + slicetype_cu_cost() + [1d6f838b4f64] + + * source/encoder/slicetype.cpp: + Uncrustify slicetype.cpp + [2bfbb7608340] + +2013-08-08 Steve Borho + + * 
build/icl/build-all.bat, build/icl/make-makefile.bat, + source/common/vec/CMakeLists.txt: + cmake: add batch files for Intel C++ 2013 on Windows + [57cef8258abe] + + * source/CMakeLists.txt: + cmake: Intel's ICL mimics MSVC compiler commandline, treat as equiv + [d4a7ea88c34d] + +2013-08-07 Min Chen + + * source/Lib/TLibEncoder/TEncCavlc.cpp, + source/Lib/TLibEncoder/TEncTop.cpp: + fix bug when '--no-wpp --no-lft' + [33aa6210de6d] + + * source/common/common.cpp, source/x265opts.h: + rename option from 'lpf' to 'lft' + [084497e2ef56] + +2013-08-07 praveentiwari + + * source/common/vec/sse.inc: + sse_pp64: +1.5x for all versions + [710007b7f4ba] + + * source/common/vec/sse.inc: + sse_pp48: Improved performance result for all versions + [797416b5b84b] + + * source/common/vec/sse.inc: + sse_pp24: Improved performance result for all the versions + [d6403549def5] + + * source/common/vec/sse.inc: + sse_pp32: Improved performance for all versions above 1x + [480322faaa72] + + * source/common/vec/sse.inc: + sse.inc: Uncrustified + [4cd7311e31b5] + +2013-08-07 Steve Borho + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGop: simplify call of setMvdL1ZeroFlag() + [e6d0a359648c] + + * source/common/pixel.cpp: + pixel: white-space cleanups in frame_init_lowres_core + [38f11ae10731] + + * source/test/pixelharness.cpp: + pixelharness: fix more test dimensions + [80fde3dd585d] + +2013-08-06 Steve Borho + + * source/Lib/TLibEncoder/TEncTop.cpp: + TEncTop: simplify pic list destruction + [e8fed4725b02] + + * source/Lib/TLibEncoder/TEncTop.cpp: + TEncTop: correct an old comment + [3afa31de4310] + +2013-08-06 Deepthi Devaki + + * source/common/vec/ipfilter8.inc: + ipfilter8.inc: Uncrustified + [3f879cd9e058] + + * source/common/vec/ipfilter.inc, source/common/vec/ipfilter8.inc: + ipfilter8.inc: vectorized vertical weighted filter + [56d7ae74bd6e] + + * source/test/ipfilterharness.cpp, source/test/ipfilterharness.h: + ipfilterharness: unit test for vertical weighted filter + 
[053083121fe0] + +2013-08-06 Steve Borho + + * source/common/vec/pixel.inc, source/common/vec/sse.inc: + sse: sse_pp requires SSE4.1 + [9418d73daa48] + + * source/Lib/TLibCommon/TComPic.cpp: + TComPic: fix the malloc logic + [cd89e891c3b9] + + * source/Lib/TLibCommon/TComPic.cpp: + TComPic: fix white-space style + [80f154c0305f] + +2013-08-06 praveentiwari + + * source/common/vec/sse.inc: + sse_pp16: more than +1x performance improvement for all versions. + [c90132bb66fb] + + * source/common/vec/sse.inc: + sse_pp12 all versions, improved performance with intrinsic code + [d0a378326291] + + * source/common/vec/sse.inc: + +1x for all versions of sse_pp8 + [8854c6df7d10] + +2013-08-06 Min Chen + + * source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/encoder/frameencoder.cpp, source/encoder/frameencoder.h: + framepp: move frame col flag to Pic + [2d032b79a988] + +2013-08-06 Steve Borho + + * source/Lib/TLibCommon/TComPic.cpp: + TComPic: initialize m_bframes and lowres struct to all zeros + [f48617fc87d3] + + * source/test/pixelharness.cpp: + pixelharness: futher magic value cleanups, use STRIDE as appropriate + [c9149cee2317] + + * source/Lib/TLibCommon/TComLoopFilter.cpp: + TComLoopFilter: fix variable shadowing + + The HM was using Edge as a parameter and iEdge as a loop variable + [977bb357f2a0] + + * source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSbac.h: + TEncSBac: iOffset -> offset + [91743e8e2ecc] + + * source/Lib/TLibCommon/TComPattern.cpp: + TComPattern: piSrc -> sec + [0d5ec9084022] + + * source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComLoopFilter.h: + TComLoopFilter: strip hungarian prefixes from members and autos + [d42fdd481a32] + +2013-08-06 Min Chen + + * source/Lib/TLibCommon/TComLoopFilter.cpp: + framepp: merge V/H filter into one loop + [4c49a45a5ce9] + + * source/Lib/TLibCommon/TComLoopFilter.cpp: + framepp: loopfilter simplify and remove redundant + getDeblockingFilterDisable() + 
[aaf5ba14d7d4] + +2013-08-05 Steve Borho + + * source/test/pixelharness.cpp: + pixelharness: stride of 64, incr of 32, remove magical values + + This should fix AVX2 test failures and make the code more + maintainable + [922ef1b99c9a] + + * source/test/pixelharness.cpp: + pixelharness: remove redundant defines + [9c2792126e9d] + + * source/Lib/TLibEncoder/TEncCfg.h: + white-space cleanups in TEncCfg.h + [03880135c31b] + + * source/encoder/motion.cpp, source/encoder/motion.h: + motion: remove dead subsample member + [18eefbb58790] + + * source/Lib/TLibCommon/SEI.h, source/Lib/TLibEncoder/SEIwrite.cpp, + source/Lib/TLibEncoder/SEIwrite.h, source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncGOP.cpp, source/encoder/encoder.cpp: + TEncGOP: remove SOPDescriptionSEI, the HM's implementation required + fixed GOP + [697ed158f1e2] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: remove unused multi-slice logic + [4b91d6759a09] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: isolate colDir logic within B-frame expression + [43b753eef43d] + +2013-08-05 ggopu + + * source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/Lib/TLibEncoder/TEncTop.cpp: + lookahead: lookahead output cost allocations and initialization + [81c3ad052397] + +2013-08-05 Steve Borho + + * source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncTop.cpp, source/encoder/encoder.cpp: + remove redundant m_deblockingFilterControlPresent + [ee84b4b64587] + + * source/Lib/TLibEncoder/TEncCfg.h, source/encoder/encoder.cpp: + remove redundant m_bLoopFilterDisable + [8db8e2fe6ba3] + +2013-08-05 Min Chen + + * source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComLoopFilter.h, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp, source/common/common.cpp, + source/encoder/encoder.cpp, source/encoder/frameencoder.cpp, + source/x265.h, source/x265opts.h: + framepp: Loopfilter cleanup and control by --lpf + [d1f2b9cad11d] + 
+2013-08-05 Deepthi Devaki + + * source/common/vec/ipfilter8.inc: + Optimizations to horizontal weighted filter + [bdea613d4402] + + * source/common/vec/ipfilter.inc, source/common/vec/ipfilter8.inc: + vectorized horizontal weighted filter + [894e47d258a7] + + * source/test/ipfilterharness.cpp: + ipfilterharness: modifications to weighted filter testbench + [87dbfdda0769] + +2013-08-05 praveentiwari + + * source/common/vec/ipfilter8.inc: + filterHorizontal_p_s: saving instructions with control execution + [48c6641e19de] + + * source/common/vec/ipfilter8.inc: + filterHorizontal_p_p: sum decleared as local register + [77ee29b50554] + + * source/common/vec/ipfilter8.inc: + filterHorizontal_p_p: saving instruction with control execution + [0c2a6a18293e] + +2013-08-05 Steve Borho + + * source/common/vec/ipfilter.inc, source/common/vec/ipfilter8.inc: + ipfilter: ipfilter_pp uses SSE4.1 instructions + [a1133f538222] + +2013-08-04 Aarthi + + * source/Lib/TLibCommon/TComSampleAdaptiveOffset.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibEncoder/TEncCfg.h, source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncTop.cpp, source/encoder/encoder.cpp, + source/encoder/frameencoder.cpp: + remove duplicate vars, get() methods + [37cbf6432e63] + +2013-08-02 Steve Borho + + * source/common/vec/ipfilter.inc, source/common/vec/ipfilter8.inc: + ipfilter: remove vector versions of more optimized functions + [074c05f87c53] + + * source/common/vec/CMakeLists.txt, source/common/vec/blockcopy- + sse42.cpp, source/common/vec/dct-sse42.cpp, source/common/vec/intra- + sse42.cpp, source/common/vec/ipfilter-sse42.cpp, source/common/vec + /pixel-sse42.cpp, source/common/vec/sse42.cpp, source/common/vec + /vec-primitives.cpp: + remove SSE42 vector primitive files - they were nearly the same as + SSE41 + + There wasn't 
enough differentiation (only abs() changed with SSE42) + to warrant all the extra build time. + [b391765c9c96] + + * source/common/vec/pixel.inc, source/common/vec/pixel8.inc, + source/x265.h: + x265: switch cpu levels from enum to defines + [3db96ea97abb] + + * source/common/vec/pixel.inc: + pixel: move sse.inc include closer to other primitive includes + [8afe0e089317] + + * source/common/primitives.h, source/common/vec/blockcopy.inc, + source/common/vec/dct.inc, source/common/vec/intrapred.inc, + source/common/vec/ipfilter.inc, source/common/vec/ipfilter8.inc: + vec: use enums instead of magical integer values for SIMD build + macros + [6764c2750c35] + + * source/common/primitives.h: + primitives: white-space fix + [190063ccc281] + + * source/x265.h: + x265: add enums for CPU SIMD architecture levels + [adad7346e6f7] + + * source/x265.h: + x265: fix C++ism in public header and some white-space issues + [37e2c97d2478] + + * source/encoder/frameencoder.cpp: + frameencoder: do not check row priority if WPP is disabled + [d0b3ab46f903] + + * source/Lib/TLibEncoder/TEncCfg.h, source/Lib/TLibEncoder/TEncCu.cpp, + source/encoder/compress.cpp: + TEncCfg: nuke most set*() methods + [1b9935ff8dcf] + +2013-08-02 praveentiwari + + * source/common/vec/ipfilter.inc, source/common/vec/ipfilter8.inc: + filterVertical_p_p: intrinsic for vector replacement + [0a9afd5eed38] + + * source/common/vec/ipfilter8.inc: + filterVertical_s_p: constrution replaced with shuffle + [d2cbfda470bb] + + * source/common/vec/ipfilter8.inc: + filterVertical_s_p: fix for VC9 test bench fail + [62696c47e875] + +2013-08-02 Steve Borho + + * source/encoder/encoder.cpp: + encoder: fix member variable shadowing + [2416019326d9] + +2013-08-02 ggopu + + * source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/Lib/TLibCommon/TComPicYuv.h, + source/Lib/TLibEncoder/TEncTop.cpp, source/common/CMakeLists.txt, + source/common/lookahead.h: + Lookahead data structures introduced + 
[de3e6c30815c] + +2013-08-02 aarthi + + * source/encoder/encoder.cpp: + replaced all the set*() methods with the member variables in + encoder.cpp + [83865be72a9d] + +2013-08-02 Shazeb N Khan + + * source/test/ipfilterharness.cpp, source/test/ipfilterharness.h: + TestBench for filter-Weighted + [30c7ae31afd5] + +2013-08-01 Steve Borho + + * source/common/wavefront.cpp, source/common/wavefront.h, + source/encoder/frameencoder.cpp: + wavefront: add a check for higher priority rows ready to be + processed + [322bea3559e7] + + * source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp, source/encoder/encoder.cpp: + replace TEncCfg.getIntraPeriod with param.keyframeInterval + [86cc97cbcb3e] + + * source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncTop.cpp, source/encoder/encoder.cpp, + source/encoder/frameencoder.cpp: + replace TEncCfg.getSourceHeight with param.sourceHeight + [67cc495aaa55] + + * source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncTop.cpp, source/encoder/encoder.cpp, + source/encoder/frameencoder.cpp: + replace TEncCfg.getSourceWidth with param.sourceWidth + [3f971cea03e8] + + * source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncTop.cpp, source/encoder/encoder.cpp: + replace TEncCfg.getFrameRate with param.frameRate + [9e872c494da3] + + * source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncTop.cpp, source/encoder/encoder.cpp: + replace TEncCfg.getLogLevel with param.logLevel + [1f83472c5312] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h, source/encoder/encoder.cpp, + source/encoder/frameencoder.cpp, source/encoder/frameencoder.h: + replace TEncCfg.getEnableWaveFront() with param.bEnableWavefront + [584dac26a4f2] + + * 
source/Lib/TLibEncoder/TEncCfg.h, source/Lib/TLibEncoder/TEncTop.h, + source/encoder/encoder.cpp: + give TEncCfg a copy of the input params + [184702b704a5] + + * source/encoder/encoder.cpp: + encoder: note some missing logic in level enforcement + [0e6828678464] + + * source/common/common.cpp: + common: update warning message about searchMethod values + [29e00b2bff05] + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/encoder/motion.cpp: + add full search to optimized ME, always use our function for uni- + search + [187647eff86d] + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/x265.h, + source/x265opts.h: + remove HM's unidirectional motion search except full search + [712d50ffcfe4] + + * source/encoder/encoder.cpp: + biprediction uses HM's full search routine unconditionally + + The search method for uni-directional search does not depend on + bipred search configuration. + [bdef1663b85a] + +2013-08-01 Shazeb N Khan + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp: + TEncSearch, TEncSlice, TComPrediction: Uncrustified + [59d9b3f08727] + +2013-08-01 Steve Borho + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/Lib/TLibCommon/TComWeightPrediction.h, + source/Lib/TLibEncoder/TEncSearch.cpp: + Using the weighted frames in prediction + [c77176c0a699] + +2013-08-01 Shazeb N Khan + + * source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibEncoder/TEncSlice.cpp, source/encoder/encoder.cpp: + Integration of weighted frames in the encoder + [2113842f58ea] + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibEncoder/TEncSearch.cpp: + Used cached MotionReference pointers in Prediction + [62150133b753] + +2013-08-01 Steve Borho + + * source/Lib/TLibCommon/TComPicYuv.cpp, source/common/reference.cpp, + 
source/common/reference.h: + reference: add methods for setting and comparing weights + [dba06d54c67e] + + * source/common/reference.h, source/encoder/motion.cpp, + source/encoder/motion.h: + reference: split reference plane data from interpolation logic + + The motion search code only needs the ReferencePlanes data. Now that + it is separated, lookahead can use the reference plane structure + without all the MotionReference overhead. + [7b3fa7f5cfa5] + + * source/Lib/TLibCommon/CommonDef.h, source/encoder/compress.cpp: + Move LAMBDA_PARTITION_SELECT defnition from CommonDef to + compress.cpp + [f39175ab6093] + + * source/common/vec/ipfilter8.inc: + ipfilter: remove unused maxVal variable (for 8bpp it is unnecessary) + [689de127f659] + + * source/common/vec/ipfilter.inc, source/common/vec/ipfilter8.inc: + ipfilter_sp uses SSE4 instructions + [1ee78b4c3ba0] + + * source/common/common.cpp: + common: enable WPP by default, disable by --no-wpp or --threads 1 + [b1b84e8b7b70] + +2013-08-01 praveentiwari + + * source/common/vec/ipfilter.inc, source/common/vec/ipfilter8.inc: + Close to +2x performance improvement for filterVertical_s_p + [6a66dfc449bc] + +2013-08-01 sumalatha + + * source/Lib/TLibCommon/CommonDef.h: + adjusted LAMBDA_PARTITION_SELECT value to 0.9 - optimal for using in + Early-Exit NO-RDO + [053abe99ac91] + + * source/encoder/compress.cpp: + made logic changes to EARLY_EXIT_NO_RDO mode - optimized for perf + Gain + [f03c950448c0] + + * source/encoder/compress.cpp: + uncrustify : compress.cpp + [e108bfd4d6f5] + +2013-07-31 Steve Borho + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: remove unused variable + [8eebc88f86e0] + + * source/Lib/TLibCommon/TComPicYuv.cpp: + TComPicYuv: release m_refList as a singly linked list + [b59563cc34bb] + + * source/Lib/TLibCommon/TComPicYuv.h, + source/Lib/TLibEncoder/TEncTop.cpp: + TComPicYuv: rename setBorderExtension() to clearExtendedFlag() + [0be525cd8583] + + * source/Lib/TLibCommon/TComPicYuv.cpp, + 
source/Lib/TLibCommon/TComPicYuv.h, + source/Lib/TLibEncoder/TEncSlice.cpp: + TComPicYuv: rename extendPicBorder to generateMotionReference() + + The function now can be said to generate a motion reference for a + reconstructed frame + [0d8544362337] + + * source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuv.h: + TComPicYuv: inline the motion reference search + [f1d4fe6c85fa] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: use cached motion references from slice header + [7a80be8edcfa] + +2013-07-31 Shazeb N Khan + + * source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuv.h, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibEncoder/TEncSlice.cpp, source/common/reference.cpp, + source/common/reference.h: + Cached the motion reference list in the Slice + [cd150fe1d042] + +2013-07-31 Steve Borho + + * source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuv.h, source/common/reference.h: + MotionReference objects with distinct weights handled as linked list + [575c8ba71475] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: further simplifications + [66eab44e4c56] + + * source/Lib/TLibCommon/SEI.h, source/Lib/TLibEncoder/SEIwrite.cpp, + source/Lib/TLibEncoder/SEIwrite.h, source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncGOP.cpp, source/encoder/encoder.cpp: + remove unreachable TemporalLevel0Index SEI message + [5843dfb610e4] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: further simplifications and cleanups + [a20d80859567] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: remove redundant slice->setNextSlice() (very next line + reverses it) + [faf8586102b5] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: remove redundant if() checks and pointer dereferences + [0602b9fb4f62] + + * source/Lib/TLibCommon/SEI.h, source/Lib/TLibEncoder/SEIwrite.cpp, + source/Lib/TLibEncoder/SEIwrite.h, source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncGOP.cpp, 
source/encoder/encoder.cpp: + remove unreachable scalable nesting SEI message + [84e6d71e70e1] + +2013-07-30 Steve Borho + + * source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncTop.cpp, source/encoder/encoder.cpp: + remove unused temporal layer logic + [74060e8f4860] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: reorder some logic for more clarity + [17a48c3c393a] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h: + TEncGOP: remove unused m_batchSize, m_startPOC + + Remnants of GOP parallelism + [1396e8d60605] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: move init-type code into init() function + [1610291e59a0] + + * source/Lib/TLibEncoder/TEncCfg.h, source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncCu.h, source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h, + source/Lib/TLibEncoder/TEncRateCtrl.cpp, + source/Lib/TLibEncoder/TEncRateCtrl.h, + source/Lib/TLibEncoder/TEncSlice.h, + source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h, source/encoder/compress.cpp, + source/encoder/encoder.cpp: + remove HM rate control code that has been long broken by WPP + [deb6891ead01] + + * source/CMakeLists.txt, source/common/vec/CMakeLists.txt: + cmake: cleanup tabs + [058ee5276203] + + * source/CMakeLists.txt, source/common/vec/CMakeLists.txt: + cmake: gcc 4.7 support for avx2 vector headers is a little touchy + [7d0616453134] + + * source/common/reference.cpp, source/common/reference.h: + MotionReference class with weighted frames + [0b55e4728a3d] + +2013-07-30 Deepthi Devaki + + * source/common/ipfilter.cpp, source/common/primitives.h: + Modified weighted IP Filter primitive to include Full Pel + [575b46e6d413] + +2013-07-29 Steve Borho + + * 
source/Lib/TLibEncoder/TEncTop.cpp: + TEncTop: re-order destruction so thread pool outlives job providers + [057b9a97c920] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h, + source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h: + TEncTop: move frame list, recon list, and access units out of + TEncGOP + + This is in preparation for making a new slicetype / lookahead + structure and breaking TEncGOP into a half dozen pieces. + [ec1852ccacee] + + * source/common/common.cpp: + common: enable AMP by default + [2f3df1553a49] + + * source/common/common.cpp: + common: enable SAO by default + [9fd468696d9a] + + * source/common/common.cpp, source/encoder/encoder.cpp, source/x265.h, + source/x265opts.h: + make decoder refresh type a configurable param [CHANGES OUTPUTS] + + Now that internal GOP parallelism is gone, we can allow the + different modes to be selected at runtime. + [4d923d9b501a] + + * source/Lib/TLibEncoder/TEncSlice.cpp: + TEncSlice: recover HM's original lambda calculations [CHANGES + OUTPUTS (back)] + + Now that internal GOP parallelism is no longer supported, this logic + is no longer a determinism bug waiting to happen. But we still want + to replace all this "magical math" with simple QP->lambda tables, in + the long term. 
+ [4992e5165984] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: nit + [62db6594a54e] + + * source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h, + source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h, source/common/common.cpp, + source/encoder/encoder.cpp, source/x265.h, source/x265opts.h: + remove GOP parallelism, simplify TEncTop and TEncGOP + [fabc0722f393] + + * source/CMakeLists.txt: + cmake: add x265.h to CLI visual studio project for convenience + + most people will open the x265 project looking for x265.h + [3cadbb2d0150] + +2013-07-27 Steve Borho + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h, + source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h, source/encoder/encoder.cpp, + source/x265.cpp, source/x265.h: + x265: add x265_encoder_headers() public API + + This brings us more in sync with x264, and makes GOP parallelism + almost trivial to implement above the level of the encoder (so we + can remove the internal one) + [2bda33439bf9] + +2013-07-29 Steve Borho + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: remove unused variable + [3e055671010e] + +2013-07-29 praveentiwari + + * source/common/vec/ipfilter.inc, source/common/vec/ipfilter8.inc: + ipfilter: +3x for filterHorizontal_p_s functions + + ipfilter_ps now requires at least SSSE3 + [5feb9f813a64] + + * source/common/vec/ipfilter8.inc: + filterHorizontal_p_p vector portion replaced with intrinsic code + [74d8c41266d5] + + * source/common/vec/ipfilter8.inc: + Uncrustified ipfilter8.inc file + [9fb0dd3a7460] + +2013-07-29 Steve Borho + + * source/common/x86/asm-primitives.cpp: + asm: fix 16bpp test bench + [364f48a1e183] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: cleanups + [8f7dc5f4a05c] + +2013-07-27 Steve Borho + + * source/encoder/compress.cpp: + compress: inter2Nx2NCost was set but not used in default builds + [08061e08d3bb] + + * 
source/Lib/TLibEncoder/TEncCavlc.cpp, + source/Lib/TLibEncoder/TEncSbac.cpp: + fix compile with ENC_DEC_TRACE enabled in TComRom.h + [5ad0b644d15f] + + * source/common/vec/ipfilter8.inc: + ipfilter: fix variable shadowing reported by GCC + [2d13884687e8] + +2013-07-26 Steve Borho + + * Merge with stable + [30c257a1fa77] + + * build/regression/email-csv.py: + regression: improve email-csv.py + [36c6c198f885] + + * source/encoder/compress.cpp: + compress: code cleanups, no functional changes + [ca9d2057a56c] + +2013-07-26 Aarthi + + * source/encoder/compress.cpp: + changes to early Exit No RDO - roughly +14% with EARLY_EXIT_NO_RDO=1 + [5ef875c10c88] + +2013-07-26 Steve Borho + + * source/CMakeLists.txt: + cmake: remove unused NO_RDO_EARLY_EXIT build option + [2b4f9d12f141] + + * source/encoder/compress.cpp: + move EARLY_EXIT_NO_RDO to top of compress.cpp + [6e4a1748e98a] + + * source/common/vec/ipfilter8.inc: + ipfilter: use unaligned load for coefficients + + The chroma coeff table can't be loaded on 16byte alignment + [73d93cdcf978] + + * source/common/vec/ipfilter8.inc: + ipfilter: fix eoln damage from cut-paste + [70ff5b57e0e2] + + * source/common/vec/ipfilter.inc, source/common/vec/ipfilter8.inc: + ipfilter now requires at least SSSE3 + [06e9fb6913aa] + +2013-07-26 praveentiwari + + * source/common/vec/ipfilter8.inc: + Approx +5x for ipfilterH_pp<8> and ipfilterH_pp<4> + [dee0115561d9] + +2013-07-26 Steve Borho + + * source/common/x86/pixel.h: + asm: white-space nits + [76931d20b082] + + * Merge with stable + [e23b5e5f7347] + + * source/common/x86/README.txt, source/common/x86/const-a.asm, + source/common/x86/cpu-a.asm, source/common/x86/mc-a2.asm, + source/common/x86/pixel-a.asm, source/common/x86/sad-a.asm, + source/common/x86/x86inc.asm, source/common/x86/x86util.asm: + asm: refresh all asm to a recent x264 revision + [d0542565dc1d] + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel.h: + asm: use macro decl of downscale prims + [fc069ab1ecd6] 
+ + * source/common/x86/asm-primitives.cpp: + asm: remove pragma warning disable, connect some 16bpp routines + [b1bbb760f007] + + * source/test/pixelharness.cpp: + pixelharness: fix alignment of downscale input arguments + [4117d2f613ac] + +2013-07-26 ggopu + + * source/common/pixel.cpp, source/common/primitives.h, + source/common/x86/CMakeLists.txt, source/common/x86/asm- + primitives.cpp, source/common/x86/mc-a2.asm, + source/common/x86/pixel.h, source/test/pixelharness.cpp, + source/test/pixelharness.h: + Lookahead: asm primitve for downscale + [71234ae45d95] + +2013-07-26 Steve Borho + + * source/common/common.cpp, source/x265.h: + api: rename x265_bit_depth to x265_max_bit_depth for clarity + + For 16bpp builds, we will eventually want to allow 12bit pixels, but + it should also be capable of encoding 10 and 8 bit streams as well. + Use 8 even for 16bpp builds for now, since we know higher bit depths + are broken. + [d44588a406ce] + + * source/PPA/ppa.cpp, source/common/threading.h: + use lower case consistently - fixes cross-compiles + [aa392516ae56] + + * source/test/testbench.cpp: + testbench: use x265_malloc / x265_free to avoid duplicate logic + [6b18efa79241] + + * source/common/common.cpp: + common: fix mingw64 malloc + [29fd8ece16d7] + + * open a permanent named branch "stable" + + The stable branch is for bug-fixes and documentation improvements + The default branch is where all new development takes place + + Each time bugs are fixed on stable, the stable branch is merged into + default. Default is only merged back into stable just prior to a new + release tag. 
+ [1fd88edc7b11] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + intra: re-enable 64x64 downscale path + [f2f70fa9b4f3] + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/compress.cpp: + intrapred: fix bug on --cpuid less than 4 + [bd8ef1eb5a6c] + + * source/encoder/encoder.cpp: + encoder: ensure the setLFCrossTileBoundaryFlag() method is called + once + + Even though we do not use tiles, this could have affected the logic. + For basic encodes, we are now valgrind clean. + [904c5cac454d] + + * source/encoder/encoder.cpp: + encoder: fix bug reported by valgrind; config vars referenced before + assignment + [13729094ca7e] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: fix crash in computeSSD when frame stride is not aligned + enough + [e53c6f48ba9f] + + * source/test/pixelharness.cpp: + testbench: fix leak in pixelharness + [4968a6bfd760] + +2013-07-25 Steve Borho + + * source/common/vec/CMakeLists.txt, source/common/vec/vec- + primitives.cpp: + cmake: clarify GCC versions which supported AVX, XOP, and AVX2 + intrinsics + [1dfbbb7d2476] + + * source/common/vec/pixel8.inc: + pixel: remove unused formal parameter names + [ddaaf7b2bd70] + + * source/VectorClass/vectori256.h: + vector: fix variable shadow warnings reported by GCC 4.8.1 + + The second shadow was almost certainly a bug + [d6da69ef1e14] + + * source/CMakeLists.txt: + cmake: on Mac OS X, it is not necessary to link with rt + [2f5480619e98] + + * source/Lib/TLibCommon/CommonDef.h, + source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComYuv.cpp, source/common/TShortYUV.cpp, + source/common/common.cpp, source/common/ipfilter.cpp, + source/common/reference.cpp, source/test/mbdstharness.cpp, + source/test/pixelharness.cpp, source/test/testbench.cpp: + common: introduce X265_MALLOC/X265_FREE to replace xMalloc/xFree + + Uses _aligned_malloc() on Windows and 
posix_memalign() on everything + else. Old MingGW32 requires a name mapping hack + [0e9399d65002] + + * source/common/x86/asm-primitives.cpp: + asm: more white-space nits + [a32a25e928e6] + + * source/common/x86/asm-primitives.cpp: + asm: white-space only + [5f20aa2d81bf] + + * source/common/x86/asm-primitives.cpp: + asm: CPUID 3 is SSSE3, 4 is SSE4. CPUID 1 == C only + [5c2f9bb8f965] + + * source/Lib/COPYING.HM: + license: add a copy of HM's license to source/Lib + [6f06736eb325] + + * source/x265.cpp: + x265: exit cleanly when missing input YUV parameters + [1fd3c11d5219] + + * source/x265.cpp: + vld: report leaks to file as well, useful for regression suite + [d0153575b4b4] + + * source/x265.cpp: + x265: clear progress report line at exit only if progress enabled + [fc9a6776f9d0] + + * build/regression/01build-and-smoke-test.bat: + regression: leave build remnants in place, delete cmake cache + between runs + [44c880a8932b] + + * build/regression/01build-and-smoke-test.bat, + build/regression/02perftest.bat: + regression: use --no-progress when running encoder, to clean up logs + [b8b9ff6596ee] + + * build/regression/01build-and-smoke-test.bat: + regression: use single-threaded msys make to avoid console color + issues + [62e3828fe4d9] + + * doc/LookaheadPlan.txt: + doc: publish a plan for porting x264 slice decision and adding frame + parallelism + [67d705db7085] + +2013-07-25 Min Chen + + * source/encoder/compress.cpp: + intrapred: sync buffer overflow patch to mode --no-rdo + [67da2187eb8b] + +2013-07-24 Steve Borho + + * source/common/vec/dct.inc, source/common/vec/intrapred.inc: + dct,intra: protect include of smmintrin.h from SIMD arch < SSE4.1 + [25b5e4a4cc1e] + + * source/common/vec/CMakeLists.txt: + cmake: use appropriate -mARCH for each intrinsic file set for GCC + [8f746d62e6e1] + + * source/x265.cpp: + x265: check YUV parameters before trying to open YUV file + [580ded735dc2] + + * source/Lib/TLibCommon/TComPicYuv.cpp, + 
source/Lib/TLibCommon/TComPicYuv.h: + TComPicYuv: cache strides in a member variable + [d3ae586fca87] + + * source/Lib/TLibCommon/TComPicYuv.cpp: + TComPicYuv: remove obsolete malloc includes + [6da8f47eb6e1] + + * source/Lib/TLibCommon/TComPicYuv.cpp: + TComPicYuv: varname cleanup + [cffe02b2feda] + + * source/Lib/TLibCommon/TComYuv.cpp, source/common/TShortYUV.cpp: + TComYuv: variable name cleanups + [5487a1e80f4e] + +2013-07-24 Deepthi + + * source/Lib/TLibCommon/TComYuv.cpp: + Unreferenced variable removed. + [fd4954d7c2ac] + +2013-07-24 ggopu + + * source/Lib/TLibCommon/TComYuv.cpp: + TComYuv: Added primitive functions for Addclip Luma and chroma + [47edc0c679e0] + + * source/common/TShortYUV.cpp: + TshortYUV: Added Primitive Functions for BlockCopy and AddClip for + both luma and chroma + [113fcd8b89a6] + +2013-07-24 Deepthi + + * source/encoder/compress.cpp: + Wrapping up early exit code development in a macro. + [ad24a3cc9e6d] + + * Merge + [fa3b07fcf212] + + * source/encoder/compress.cpp: + Variable renaming in no-rdo mode: removing ambiguities + [e61e07e405f1] + +2013-07-24 Steve Borho + + * source/PPA/ppa.h: + ppa: fix "potential uninitialized variable" warning from GCC + + gcc is being somewhat obtuse here, but I'll humor it + [138898b6c3e8] + + * build/regression/02perftest.bat: + regression: improve log message in perftest + [a210f3b7704c] + + * build/regression/01build-and-smoke-test.bat, build/regression + /config-example.txt, build/regression/run.bat: + regression: add MSYS builds to regression suite + [9c25b085049c] + +2013-07-23 Steve Borho + + * build/regression/02perftest.bat: + regression: properly use configured perfframes for performance tests + [a0e145b76ee5] + +2013-07-23 maheshpittala + + * build/regression/01build-and-smoke-test.bat, + build/regression/02perftest.bat, build/regression/commandlines- + example.txt, build/regression/config-example.txt, build/regression + /email-csv.py, build/regression/run.bat, source/test/testbench.cpp: + 
Add regression scripts + [db664de68d3a] + +2013-07-23 Steve Borho + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: temporarily disable 64x64 intra fast path + + Running in debug it is reporting stack corruption around the above[] + array + [dabc5df15515] + +2013-07-23 Deepthi + + * Merge + [9f5a5ad402b6] + + * source/Lib/TLibEncoder/TEncCu.h, source/encoder/compress.cpp: + Function signature of xcomputeCostInter changed + [00115ae64f1a] + + * source/encoder/compress.cpp: + Computing NxN satd costs for the previous depth + [a952c558dfec] + + * source/encoder/compress.cpp: + Rearranging code + [68303445d7f6] + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h: + Renaming Recon to Pred: we'll be saving satd costs and predicted + frames for NxN vs 2nx2n decision + [28198b1f7e8b] + +2013-07-22 Min Chen + + * source/common/pixel.cpp: + intrapred: fix mistake between C and Instrinsic + [a5dd4011b8bb] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + intrapred: fix bug in HIGH_BIT_DEPTH=1 + [460189b98f61] + +2013-07-23 Deepthi + + * source/Lib/TLibEncoder/TEncCu.cpp: + Adding a missing memory initialisation + [e6622364d73d] + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h: + Adding new recon structures in no-rdo mode - interNxN + [643c56139902] + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h: + Adding new structures in no-rdo mode - interNxN + [386daa4ac9a9] + +2013-07-22 Steve Borho + + * doc/UnitTestUsage.txt: + remove obsolete unit test doc + [2bbe9227d44c] + +2013-07-22 Deepthi + + * doc/Doxyfile, doc/mainpage.h: + Removing Doxygen related documentation from the repo + [0c4eebd03a00] + + * doc/gop-structure-example.pdf: + Removing GOP structure example - the POC/decode order is a familiar + concept carried over from H.264 + [6d0c465b4ba8] + +2013-07-21 Min Chen + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/common/pixel.cpp, + source/common/primitives.h, source/common/vec/pixel.inc, + 
source/common/vec/pixel8.inc, source/encoder/compress.cpp: + intrapred: Improvement Inra 64x64 [CHANGES OUTPUTS] + [18447bd07244] + +2013-07-19 Steve Borho + + * .hgtags: + Added tag 0.3 for changeset 3767fbfa970f + [158239bec6c1] + +2013-07-19 ggopu + + * source/common/pixel.cpp, source/common/primitives.h, + source/common/vec/blockcopy.inc, source/test/pixelharness.cpp, + source/test/pixelharness.h: + Primitive: Performance Primitives for Pixel add Clip - TcomYuv and + TshortYuv + + # HG changeset patch # User ggopu # Date 1374210970 -19800 # Node + ID 6a864fc57a58e6988969faf3b5b919dd3defa0c1 # Parent + a4c00c3f1897d9af8239bacf5f56621297e9785b Primitive: Performance + Primitives for Pixel add Clip - TcomYuv and TshortYuv + [3767fbfa970f] [0.3] + +2013-07-18 Steve Borho + + * source/common/threadpool.cpp: + gcc: fix threadpool warnings + [db7ed939be08] + +2013-07-19 Deepthi + + * source/Lib/TLibCommon/TComDataCU.h: + Cleanup: Removing codedQp member variable. + [bcc2539665c8] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibEncoder/TEncCu.cpp, source/encoder/compress.cpp: + Cleanup: Removing TotalBins access methods + [b56d92419898] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/compress.cpp, + source/encoder/frameencoder.cpp: + Cleanup: Removing TotalBits access methods + [374594f69e59] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/compress.cpp, + source/encoder/frameencoder.cpp: + Cleanup: Removing Distortion access methods. 
+ [f60f409fe25f] + + * source/Lib/TLibCommon/TComDataCU.h: + Removed getTotalCost() method + [1c772b90b6ca] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/compress.cpp, + source/encoder/frameencoder.cpp: + Cleanup: Replace getTotalCost() with member access + [93164613a8dc] + + * source/Lib/TLibCommon/TComDataCU.h: + Making CU cost and misc variables public + [b476367a146d] + +2013-07-19 sumalatha + + * source/encoder/compress.cpp: + Changed the LOG, the pred mode was selected wrongly in few cases. + Fixed + [0c3e3a46fb7e] + +2013-07-18 Steve Borho + + * source/common/threadpool.cpp: + threadpool: repair EOLN + [a4c00c3f1897] + + * source/common/threadpool.cpp: + threadpool: nit + [7565dd4c7d54] + + * source/common/threadpool.cpp: + threadpool: reduce priority of worker threads slightly + + This gives higher priority to GOP (and later frame) threads which, + if they are not blocked should be given CPU cores over worker + threads. + [9ac15ef1fa44] + + * source/common/threadpool.cpp: + threadpool: cleanup + [1af121504330] + + * source/common/vec/CMakeLists.txt: + cmake: fix icpc AVX2 builds + [1ba93da79ac4] + + * source/common/vec/pixel8.inc: + pixel: remove the names of unreferenced parameters (GCC now + complains of these) + [a1810069717f] + + * source/common/vec/CMakeLists.txt, source/common/vec/vec- + primitives.cpp: + vec: GCC and icpc both have issues with XOP intrinsics + + Probably because both rely on GCC headers + [6e10601c8f2a] + + * source/common/primitives.cpp: + primitives: only print XOP FMA3 FMA4 if CPU level is at least AVX + + If the user asked for --cpuid 6, they should not see XOP, FMA3, or + FMA4 because those will not be used. 
+ [ca27274a19ab] + + * source/encoder/motion.cpp, source/encoder/motion.h: + motion: increase alignement of fenc buffer to 32bytes for AVX2 + [d371aff1b1fc] + + * source/common/vec/vec-primitives.cpp: + vec: only use XOP primitives when AVX CPU level is requested + [064808dc2286] + + * source/common/vec/CMakeLists.txt, source/common/vec/blockcopy- + xop.cpp, source/common/vec/dct-xop.cpp, source/common/vec/intra- + xop.cpp, source/common/vec/ipfilter-xop.cpp, source/common/vec + /pixel-xop.cpp, source/common/vec/vec-primitives.cpp, + source/common/vec/xop.cpp: + vec: add XOP build files, for late AMD CPUs + [8808d7d79279] + + * source/common/vec/CMakeLists.txt, source/common/vec/blockcopy- + avx.cpp, source/common/vec/blockcopy-avx2.cpp, source/common/vec + /blockcopy-sse3.cpp, source/common/vec/blockcopy-sse41.cpp, + source/common/vec/blockcopy-sse42.cpp, source/common/vec/blockcopy- + ssse3.cpp, source/common/vec/blockcopy.inc, + source/common/vec/vecprimitives.inc: + vec: break blockcopy intrinsics into their own files + [c27fbf50e931] + + * source/common/vec/CMakeLists.txt, source/common/vec/dct-sse2.cpp, + source/common/vec/intra-sse2.cpp, source/common/vec/ipfilter- + sse2.cpp, source/common/vec/pixel-sse2.cpp, + source/common/vec/sse2.cpp, source/common/vec/vec-primitives.cpp: + vec: drop SSE2 files, we don't care that much about CPUs that old + + This is mainly for improving compile times, since I will add XOP + files shortly + [be0fc8418935] + + * x265: fix EOLN damage + [e4b9418724ba] + + * source/x265.cpp: + x265: use puts for writing CLI args (makes GCC happy) + [ab4cb9c33e6f] + + * source/Lib/TLibEncoder/TEncSlice.cpp: + TEncSlice: cleanups + [2fd9dc2844af] + + * source/Lib/TLibEncoder/TEncSlice.cpp: + TEncSlice: tweak lambda magic value [CHANGES OUTPUTS] + + This recovers BasketBallDrive encode PSNR/Bitrate/FPS back to close + what it was when the encoder used the non-deterministic lambda + function + [4e1a8cf10b3b] + + * 
source/Lib/TLibCommon/TComRdCost.h, + source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: pass the correct lamda to bitcost calculator [CHANGES + OUTPUTS] + + This fixes a bug introduced in 15f993eceb4b, right after 0.1. The + different naming conventions had tripped me up. + [6847162fbcb1] + +2013-07-18 sumalatha + + * source/Lib/TLibEncoder/TEncCu.cpp, source/encoder/compress.cpp: + modifications to log file(LOG_CU_COST) to print the best mode, cost + chosen at each level. + [6849a1f910d0] + +2013-07-18 Steve Borho + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: remove unused variable + [8878236b696d] + +2013-07-18 Min Chen + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/intrapred.cpp, + source/common/primitives.h, source/common/vec/intrapred.inc, + source/encoder/compress.cpp, source/test/intrapredharness.cpp: + intrapred: improvement intra_pred_planar + [c65a482b2879] + +2013-07-18 Deepthi + + * Merge + [dacf6c061e81] + + * source/CMakeLists.txt: + NO_RDO_EARLY_EXIT: experimental macro to play with early exit of + analysis + [3eca40de1a35] + +2013-07-18 Steve Borho + + * source/Lib/TLibCommon/TypeDef.h: + TypeDef: declare g_bitDepth extern if used for X265_DEPTH + [f813f110d69a] + +2013-07-18 Deepthi + + * source/encoder/compress.cpp: + Merge + [fa872fec6372] + + * source/Lib/TLibEncoder/TEncCu.h, source/encoder/compress.cpp: + Backout changeset: c45fe1b2bd90 + + Reasons discussed. Performance improvement at the expense of + quality/bitrate by avoiding the non-zero residual mode. 
+ [7f52bb181cf8] + +2013-07-18 Steve Borho + + * source/Lib/TLibEncoder/TEncTop.cpp, source/x265.h: + csv: fix global PSNR calculation + [031c4c889edc] + + * source/x265.cpp: + csv: add missing fps and line feed + [de92dfd387a2] + + * source/encoder/compress.cpp: + compress: fix debug build + [ed69790ac6d4] + + * source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h, source/encoder/encoder.cpp, + source/x265.cpp, source/x265.h, source/x265opts.h: + x265: add --csv logfile command line + [5bb6d0e451c2] + + * source/encoder/encoder.cpp, source/encoder/encoder.h: + encoder: nit + [47f97d1adb8a] + +2013-07-17 Steve Borho + + * source/common/common.cpp, source/common/common.h, + source/encoder/encoder.cpp: + common: enforce global variables not changing within a single + process + [1d77c9eab71f] + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/Lib/TLibCommon/TComYuv.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/TShortYUV.cpp, + source/common/TShortYUV.h: + TShortYuv: cleanup + [b806d6e8f299] + + * source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TComYuv.h, + source/common/TShortYUV.cpp, source/common/TShortYUV.h: + TComYuv: cleanup + [34bbe4ec628f] + + * source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/Lib/TLibEncoder/TEncEntropy.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/WeightPredAnalysis.cpp: + remove other various internal users of bitdepth variables + [69b55b6be18c] + + * source/Lib/TLibCommon/TComRdCost.cpp, + source/Lib/TLibCommon/TComRdCost.h, + source/Lib/TLibCommon/TComRdCostWeightPrediction.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp: + TComRdCost: remove bitdepth from distParam + [f313ce8116b8] + + * source/Lib/TLibCommon/TComPicYuvMD5.cpp: + TComPicYuv: remove bitdepth arguments + [474327bdbefc] + + * source/Lib/TLibCommon/TComPattern.cpp, + 
source/Lib/TLibCommon/TComPattern.h: + TComPattern: remove bitdepth argument from fillReferenceSamples + [2848c686e55f] + + * source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibEncoder/TEncEntropy.cpp: + TComRom: remove hungarian prefix, fix comment typo + [c227aa627d52] + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/dct.cpp, + source/common/intrapred.cpp, source/common/ipfilter.cpp, + source/common/pixel.cpp, source/common/primitives.h, + source/common/reference.cpp, source/common/vec/dct.inc, + source/common/vec/intrapred.inc, source/common/vec/ipfilter16.inc, + source/common/vec/ipfilter8.inc, source/common/vec/pixel8.inc, + source/test/intrapredharness.cpp, source/test/ipfilterharness.cpp, + source/test/mbdstharness.cpp, source/test/pixelharness.cpp: + primitives: remove bitdepth arguments from all primitives and other + functions + [0f5a9a141244] + + * source/common/primitives.h: + primitives: enable EMMS for random compilers, remove unused define + [1a0fbfe3066d] + + * source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuvMD5.cpp, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncEntropy.cpp, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncTop.cpp, + 
source/Lib/TLibEncoder/WeightPredAnalysis.cpp, + source/common/common.cpp, source/common/reference.cpp: + Make 8bpp bit depth a compile define (X265_DEPTH=8), g_bitDepthY -> + g_bitDepth + [648cf3243a4d] + + * source/Lib/TLibCommon/TypeDef.h: + TypeDef: white-space nits + [e5bd6f54af40] + + * source/Lib/TLibCommon/TypeDef.h: + Remove hacks for VC6, we support nothing before VC9 + [1c3713fe241e] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibEncoder/TEncSearch.cpp: + TComTrQuant: eText argument is no longer needed for + invRecurTransformNxN + [ceec75c37e20] + + * source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuvMD5.cpp, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/Lib/TLibCommon/TComYuv.cpp, + source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncEntropy.cpp, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/WeightPredAnalysis.cpp, + source/common/common.cpp, source/test/mbdstharness.cpp: + Remove separate bitdepth configurable for chroma + + Our x265_param_t has never allowed chroma depth to be different from + luma depth, so this is just a simplification of the code. This will + allow further optimizations to shortly follow. 
+ [0a3d2667aab8] + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/TShortYUV.cpp, + source/common/reference.cpp, source/encoder/compress.cpp: + change 16bpp Pel to UShort, equivalent to x264's uint16_t + + This removes a metric ton of ugly casts and makes the code a lot + more readable, and it should keep the 16bpp build from being + accidentally broken when Pels are passed to functions as pixels. + [57404da2493d] + + * source/common/TShortYUV.cpp: + blockcopy: further 16bpp fixes + [7fb13585cfa3] + + * source/test/pixelharness.cpp: + pixelharness: one change was missed from last commit + [d3594896e12e] + + * source/common/TShortYUV.cpp, source/common/pixel.cpp, + source/common/primitives.h, source/common/vec/blockcopy.inc, + source/test/pixelharness.cpp: + blockcopy: fix 16bpp build, pixelsubsp -> pixelsub_sp + [dd8d73aeb1de] + + * source/encoder/compress.cpp: + compress: uncrustify + [f10a163531a4] + +2013-07-15 sumalatha + + * source/Lib/TLibEncoder/TEncCu.h, source/encoder/compress.cpp: + Changes in merge mode in no-rdo path for better psnr improvement and + time performance + [c45fe1b2bd90] + +2013-07-17 Steve Borho + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/common/intrapred.cpp, source/common/primitives.h, + source/common/vec/intrapred.inc, source/test/intrapredharness.cpp: + Backed out changeset: aee4a23a8f01 (does not compile) + [4ffaf245b427] + +2013-07-17 Min Chen + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/common/intrapred.cpp, source/common/primitives.h, + source/common/vec/intrapred.inc, source/test/intrapredharness.cpp: + intrapred: improvement intra_pred_planar + [aee4a23a8f01] + + * source/test/intrapredharness.cpp: + cleanup: remove unused debug code + [dd6ca15e90ac] + 
+2013-07-17 Deepthi Devaki + + * source/common/ipfilter.cpp, source/common/primitives.h: + C primitive for weighted ipfilter + [2e067e3443c8] + +2013-07-17 Deepthi + + * Merge + [0cb8c7c9c7b2] + + * source/Lib/TLibCommon/TComPicYuvMD5.cpp: + HIGH_BIT_ENABLE build error fixed. + + Any better idea to get rid of the c4333 warnings welcome. + [800ea1ebadef] + +2013-07-17 ggopu + + * source/common/TShortYUV.cpp, source/common/pixel.cpp, + source/common/primitives.h, source/common/vec/blockcopy.inc, + source/test/pixelharness.cpp, source/test/pixelharness.h: + TShortYUV : Implemented perfomance Primitives pixelsub_sp + [054d8c409569] + +2013-07-17 Steve Borho + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp: + TComSlice: relocate subpel generation to after weightp determination + [0becdecde6ee] + +2013-07-16 Steve Borho + + * source/Lib/TLibCommon/TComSlice.cpp: + TComSlice: fix variable shadowing problem that was causing B frame + bugs + + This HM routine was using refRPC and pcRefRPC in the same function + with entirely different meanings, and it bit me hard when I + refactored this function with search+replace. 
+ [a3ca6eceb27a] + + * source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncTop.cpp, source/encoder/encoder.cpp, + source/encoder/frameencoder.cpp: + TEncGOP: cleanup, make TEncCfg WPP var a Bool + [28ce2df5cb4d] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h: + TEncGOP: header cleanup + [50b6e306dcbe] + + * source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuv.h, source/common/reference.cpp: + TComPicYuv: cleanup + [c3fa187d5460] + + * source/Lib/TLibCommon/TComPicYuv.h, + source/Lib/TLibCommon/TComWeightPrediction.h: + Replace includes with class forward decls + [6ebabe789fc7] + + * source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h: + TComPic: cleanup + [84c275972fee] + + * source/Lib/TLibCommon/TComPicSym.cpp, + source/Lib/TLibCommon/TComPicSym.h: + TComPicSym: cleanup + [6afb8bad8568] + + * source/Lib/TLibCommon/TComSlice.cpp: + TComSlice:: fix end-of-list handling in xGetRefPic + [4a72479a9bc6] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: fix full search regression + [07b381e97f7d] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp: + repair debug build after removing formal arugments by removing + asserts + [49adf99e9138] + + * source/Lib/TLibEncoder/TEncCu.cpp: + TEncCu: fixup after refactoring + + I deleted the 0 argument instead of the NULL argument. MSVC didn't + care but GCC flagged this as a warning. NULL can't be passed as an + integer argument. 
+ [10abee4a15c5] + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h: + TEncCu: remove unused xTuCollectARLStats + [67c339ec83dd] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + remove buffers no longer used after removing AQ + [0fbbde73203f] + + * source/common/dct.cpp, source/common/primitives.h, + source/common/vec/dct.inc, source/test/mbdstharness.cpp, + source/test/mbdstharness.h: + remove unreachable quantaq primitive + [5b61ed2e33e4] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibEncoder/TEncCfg.h, source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncCu.h, source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncPic.cpp, + source/Lib/TLibEncoder/TEncPic.h, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncTop.cpp, source/encoder/encoder.cpp, + source/encoder/frameencoder.cpp: + Remove broken adaptive QP, fix unused parameter and variable + warnings + + All of TEncPic now goes away + [979b9953d696] + + * source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncTop.cpp, source/encoder/encoder.cpp: + TEncCfg: hungarian cleanup + [fc4718bd4565] + + * source/Lib/TLibEncoder/TEncCfg.h: + TEncCfg: remove unused istringstream operator + [ae96f205c7b6] + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/encoder/compress.cpp: + TEncSearch: remove unused formal parameters (quite a bit of + unwinding here) + [2389117c0085] + + * source/Lib/TLibEncoder/TEncBinCoderCABACCounter.cpp, + source/Lib/TLibEncoder/TEncCavlc.cpp, + source/Lib/TLibEncoder/TEncCavlc.h, + 
source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/WeightPredAnalysis.cpp, + source/Lib/TLibEncoder/WeightPredAnalysis.h, + source/encoder/CMakeLists.txt: + Stop ignoring many compiler warnings in TLibEncoder + + Enable warnings about unused parameters, clean up the warnings + reported. In a few cases I removed parameters that were obviously + obsolete. + + Enable warnings about unused variables, clean up the warnings. + [c65db4ee3972] + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibEncoder/TEncSearch.cpp: + TComPrediction: remove unused "bi" parameters to prediction + functions + [e047cdd02421] + + * source/Lib/TLibCommon/TComWeightPrediction.cpp: + TComWeightPrediction: fix bug found by enabling warnings + [cf568953fbe7] + + * source/Lib/TLibCommon/TComBitStream.cpp, + source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/Lib/TLibCommon/TComPicYuvMD5.cpp, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibCommon/TComRdCost.cpp, + source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/CMakeLists.txt, + source/encoder/compress.cpp: + Stop ignoring many compiler warnings in TLibCommon + + Enable warnings about unused parameters, clean up the warnings + reported. In a few cases I removed parameters that were obviously + obsolete. + + Enable warnings about unused variables, clean up the warnings. This + actually found a recently introduced bug. 
+ [4a86baaac709] + +2013-07-16 Min Chen + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/intrapred.cpp, + source/common/primitives.h, source/common/vec/intrapred.inc, + source/encoder/compress.cpp, source/test/intrapredharness.cpp: + intrapred: improvement intra_pred_dc + [cf614acd8099] + +2013-07-16 Deepthi + + * Merge + [8f632c25d1ee] + + * source/encoder/compress.cpp: + Merge + [0eb8a3db4b3f] + + * source/encoder/compress.cpp: + SSE cost calculation in inter-no-rdo. Raises quality and lowers + bitrate (slightly). Will be changed into a CLI configurable if perf + effect is significant. + [29c1f5b64795] + +2013-07-16 Steve Borho + + * source/Lib/TLibCommon/CommonDef.h, + source/Lib/TLibCommon/ContextModel.cpp, + source/Lib/TLibCommon/TComRom.h: + more cleanups for CommonDef.h + + * remove circular defininition of g_bitDepthY and g_bitDepthC + * move Clip templates to TComRom.h + * remove some dead defines + [6ee321714568] + + * source/Lib/TLibCommon/CommonDef.h, + source/Lib/TLibCommon/TComCABACTables.h, + source/Lib/TLibCommon/TComLoopFilter.h, + source/Lib/TLibCommon/TComPicSym.h, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.h, + source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/TEncCavlc.cpp, + source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.h, + source/Lib/TLibEncoder/TEncSlice.cpp, source/common/TShortYUV.cpp, + source/common/TShortYUV.h, source/common/dct.cpp, + source/common/pixel.cpp, source/common/reference.cpp: + TComSampleAdaptiveOffset: cleanup and global SR, major header + cleanups + + move all #defines from TypeDef.h to CommonDef.h and move all + feature-specific structs from TypeDef.h to their more specific + header + + TypeDef.h should just define types (duh!) 
and CommonDef.h should + just have defines. + [778f0786eff8] + + * source/Lib/TLibCommon/TComWeightPrediction.cpp: + TComWeightPrediction: hungarian cleanup + [60812792ee78] + + * source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/Lib/TLibCommon/TComWeightPrediction.h, + source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TComYuv.h, + source/common/TShortYUV.h: + TComWeightPrediction: cleanup and global SR iPartUnitIdx -> + partUnitIdx + [0ad503ad1110] + +2013-07-16 Deepthi Devaki + + * source/common/vec/pixel8.inc, source/test/pixelharness.cpp: + pixel8.inc, pixelharness : Uncrustified. + [4774cbad2bfb] + + * source/common/vec/pixel.inc, source/common/vec/pixel8.inc, + source/test/pixelharness.cpp: + Vectorized WeightUni + [f7d8f489f694] + +2013-07-16 Steve Borho + + * source/common/vec/pixel-avx.cpp, source/common/vec/pixel-avx2.cpp, + source/common/vec/pixel-sse2.cpp, source/common/vec/pixel-sse3.cpp, + source/common/vec/pixel-sse41.cpp, source/common/vec/pixel- + sse42.cpp, source/common/vec/pixel-ssse3.cpp, + source/common/vec/pixel.inc: + pixel: move includes of pixel8.inc or pixel16.inc inside pixel.inc + + This makes the pixel-ARCH.cpp files much simpler, and makes it + easier to add headers for the pixel functions. + [44b4fb45e005] + +2013-07-15 Steve Borho + + * source/x265opts.h: + x265: document --cpuid 1 behavior + [c9bb72e8cb8e] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h: + TComDataCU: cleanup classmethods + [48fb229ef210] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComPicSym.cpp, + source/Lib/TLibEncoder/TEncCu.cpp: + TComDataCU: remove static ARL buffers, these were not thread safe + + This is further proof that the ARL buffers are essentially unused + the way we are using the encoder (with AQ disabled). It is very + tempting to remove them globally. 
+ [a75833091e14] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComPicSym.cpp: + TComDataCU: cleanup, remove "is a decoder" variable + [75e5f85ae8e1] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComLoopFilter.h, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.h, + source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.h, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.h, source/encoder/compress.cpp: + TEncSlice: less eye-bleed, more global SR + [846887941425] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h: + TComSlice: cpp cleanup, remove unreferenced class static + [cd5750e16bd7] + + * source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/Lib/TLibEncoder/TEncBinCoderCABAC.h, + source/Lib/TLibEncoder/TEncCavlc.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.h, + source/Lib/TLibEncoder/WeightPredAnalysis.cpp, + source/Lib/TLibEncoder/WeightPredAnalysis.h: + TComSlice: cleanup + global SR + [26f767779d57] + +2013-07-15 ggopu + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.h: + TEncSlice: CleanUp and Removd Hungarian Notation + [ae234c4c22e1] + +2013-07-15 praveentiwari + + * source/Lib/TLibCommon/TComRdCost.cpp: + Cleaup the remaing functions in TComRdCost.cpp file + [b47059899650] + +2013-07-15 ggopu + + * source/Lib/TLibEncoder/TEncCu.cpp, 
source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncEntropy.cpp, + source/encoder/frameencoder.cpp: + TEncCU and TEncEntrophy: Clean Up + [cfb5b30ddadb] + + * source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibEncoder/TEncCavlc.cpp: + Cleanup and removed Hungarian notation + [add9f7dc8c83] + +2013-07-12 Steve Borho + + * source/x265.cpp, source/x265opts.h: + x265: add --no-progress CLI option to disable reports when capturing + output + [9e689682ffb1] + + * source/common/vec/blockcopy.inc: + blockcopy: AVX2 compress and extend ended up being a perf loss + [9afb0e925451] + + * source/common/vec/blockcopy.inc: + blockcopy: add AVX2 fast paths + [17a78803a971] + + * source/common/vec/blockcopy.inc: + blockcopy: ensure strides obey alignment requirements + + This should fix crashes reported by Shanthan and Brian from + Telestream + [64e98159b25e] + + * source/common/vec/pixel.inc, source/common/vec/pixel8.inc: + pixel: add AVX2 sad_x4 for 32 and 64 wide blocks + [c51d6f92205e] + + * source/common/vec/pixel.inc, source/common/vec/pixel8.inc: + pixel: add AVX2 sad_x3 for 32 and 64 wide blocks + [037d8072cf60] + + * source/common/vec/pixel8.inc: + pixel8: CurN -> frefN + [17bec876e8ed] + + * source/common/vec/CMakeLists.txt, source/common/vec/vec- + primitives.cpp: + cmake: allow icpc to build AVX2 intrinsics + [bee415cae643] + + * source/VectorClass/vectori256.h, source/common/vec/pixel.inc, + source/common/vec/pixel8.inc: + pixel: add AVX2 sad primitives for 32 and 64 wide blocks + [87eadb9849bc] + + * .hgtags: + Added tag 0.2 for changeset b3471d9009f5 + [391d4aeb3acc] + + * source/cmake/version.cmake: + cmake: properly handle tagged release archives + [b3471d9009f5] [0.2] + + * source/common/mv.h: + mv: remove unused toFPel() method + [cabc9474ccfc] + + * source/common/wavefront.cpp, source/common/wavefront.h, + source/encoder/frameencoder.cpp, source/test/testpool.cpp: + wavefront: coding-style fixes, 
improve comments + [09cc9680e85d] + + * source/common/wavefront.h: + wavefront: cleanup method comments + [1a2e10dbb8a3] + + * build/README.txt, source/cmake/version.cmake: + cmake: do not cache x265 version from ENV var, update status message + [60d1a6e70946] + + * source/PPA/ppa.cpp, source/PPA/ppa.h, source/PPA/ppaApi.h: + ppa: change license to x265 GPL+commercial license + [b99630f44717] + + * source/CMakeLists.txt: + cmake: add a cmake option to statically link CRT on Windows release + builds + [c9625ddb711d] + + * source/VectorClass/vectorclass.h, source/VectorClass/vectorf128.h, + source/VectorClass/vectorf256.h, source/VectorClass/vectorf256e.h: + remove vector class floating point, simplify includes + [4f9a3bda6185] + + * source/VectorClass/special/complexvec.h, + source/VectorClass/special/decimal.h, + source/VectorClass/special/quaternion.h, + source/VectorClass/special/vector3d.h, + source/VectorClass/special/vectormath.h: + remove vector class special floating point headers + [91271bbcda30] + + * source/CMakeLists.txt, source/tools/CMakeLists.txt, + source/tools/dr_psnr/CMakeLists.txt, + source/tools/dr_psnr/PsnrCalculator.cpp, + source/tools/dr_psnr/PsnrCalculator.h, + source/tools/dr_psnr/SSIMCalculator.cpp, + source/tools/dr_psnr/SSIMCalculator.h, + source/tools/dr_psnr/dr_psnr.cpp: + remove dr_psnr, it's a mess and no-one is working on it + [0839ec783eaf] + + * source/tools/TestForChecking_BadCommit.bat, + source/tools/performanceProfiler/Profiler.bat, + source/tools/performanceProfiler/Readme.txt, + source/tools/performanceProfiler/config.txt, + source/tools/performanceProfiler/performanceProfiler.bat: + remove obsolete perf batch files + [41ec50a27f4f] + + * build/README.txt: + build: update README.txt + [00400cf33211] + + * build/BuildEncoderApplications.bat, build/ConfigureBuild.bash, + build/ConfigureBuild.bat, build/CreateRegressionPackage.bat, + build/CreateRegressionPackage.sh, build/RunEncoderApplications.bat, + build/config.txt: + remove 
obsolete regression tests and batch build scripts + [4207769d7bd6] + +2013-07-12 Min Chen + + * source/common/vec/intrapred.inc: + intrapred: cleanup pDst + [18f282e09b36] + +2013-07-12 praveentiwari + + * source/Lib/TLibCommon/TComRdCost.cpp: + TComRdCost::xGetSSE32 cleanup + [dd2dfd0e1acf] + + * source/Lib/TLibCommon/TComRdCost.cpp: + xGetSSE32Help cleanup + [08a9378c2d71] + + * source/Lib/TLibCommon/TComRdCost.cpp: + TComRdCost::xGetSSE16N cleanup + [99196b0684e6] + + * source/Lib/TLibCommon/TComRdCost.cpp: + xGetSSE16NHelp cleanup + [dfc5c4114f63] + + * source/Lib/TLibCommon/TComRdCost.cpp: + TComRdCost::xGetSSE16 cleanup + [11d28b173c04] + + * source/Lib/TLibCommon/TComRdCost.cpp: + xGetSSE16Help cleanup + [75305fa33a5f] + + * source/Lib/TLibCommon/TComRdCost.cpp: + TComRdCost::xGetSSE8 cleanup + [03bbb4c9ae99] + + * source/Lib/TLibCommon/TComRdCost.cpp: + xGetSSE8Help cleanup + [f0038e6006e1] + + * source/Lib/TLibCommon/TComRdCost.cpp: + TComRdCost::xGetSSE4 cleanup + [3f8bffc3f0ba] + + * source/Lib/TLibCommon/TComRdCost.cpp: + xGetSSE4Help cleanup + [fd8c0d5c4983] + + * source/Lib/TLibCommon/TComRdCost.cpp: + TComRdCost::xGetSSE cleanup + [18f845fde53f] + + * source/Lib/TLibCommon/TComRdCost.cpp: + xGetSSEHelp cleanup + [ea7669697410] + + * source/Lib/TLibCommon/TComRdCost.cpp: + TComRdCost::xGetSAD48 cleanup + [169e5b28f63f] + + * source/Lib/TLibCommon/TComRdCost.cpp: + TComRdCost::xGetSAD64 cleanup + [5ed30c9ea3c5] + + * source/Lib/TLibCommon/TComRdCost.cpp: + TComRdCost::xGetSAD24 cleanup + [1dc0453c1670] + + * source/Lib/TLibCommon/TComRdCost.cpp: + TComRdCost::xGetSAD32 cleanup + [0bfaaf368618] + + * source/Lib/TLibCommon/TComRdCost.cpp: + TComRdCost::xGetSAD16N cleanup + [904837a3a1de] + + * source/Lib/TLibCommon/TComRdCost.cpp: + TComRdCost::xGetSAD12 cleanup + [f8933cd25eca] + + * source/Lib/TLibCommon/TComRdCost.cpp: + Clean up the white-space by lining up + [eb92dec76124] + + * source/Lib/TLibCommon/TComRdCost.cpp: + TComRdCost::xGetSAD16 cleanup + 
[ae6d26dee61f] + + * source/Lib/TLibCommon/TComRdCost.cpp: + TComRdCost::xGetSAD8 cleanup + [d9e9186e2e6f] + + * source/Lib/TLibCommon/TComRdCost.cpp: + TComRdCost::xGetSAD4 cleanup + [151ff0670a0a] + +2013-07-12 Steve Borho + + * source/common/x86/asm-primitives.cpp: + asm: disable sad_x3 for AVX2, does not pass tests + [077dc358e9fa] + +2013-07-12 praveentiwari + + * source/Lib/TLibCommon/TComRdCost.cpp: + TComRdCost::xGetSAD cleanup + [f3b49b4bc244] + + * source/Lib/TLibCommon/TComRdCostWeightPrediction.cpp: + Fix for Debug version build + [6fc940f59c01] + + * source/Lib/TLibCommon/TComRdCost.cpp: + TComRdCost::setDistParam overloaded version clenup + [7a1c0fc82cb0] + + * source/Lib/TLibCommon/TComRdCost.cpp: + TComRdCost::setLambda cleanup + [6c97e265b662] + + * source/Lib/TLibCommon/TComRdCost.cpp: + TComRdCost::setDistParam cleanup + [191eca565ac9] + + * source/Lib/TLibCommon/TComRdCost.cpp, + source/Lib/TLibCommon/TComRdCost.h, + source/Lib/TLibEncoder/TEncSearch.cpp: + bApplyWeight member variable replaced with applyWeight + [22a03cf91021] + +2013-07-12 Steve Borho + + * source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp: + Fix Build Errors for HIGH_BIT_DEPTH enable + [8758e968520c] + +2013-07-12 praveentiwari + + * source/Lib/TLibCommon/TComRdCost.cpp, + source/Lib/TLibCommon/TComRdCost.h, + source/Lib/TLibEncoder/TEncSearch.cpp: + iSubShift member variable replaced with subShift for all referances + [deffbdd77bb4] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + Uncrustified TEncSearch.cpp file + [8847796da616] + +2013-07-11 Steve Borho + + * source/x265.cpp: + x265: prune dead short_options and fix no-arg flags + [1858833d3ef1] + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + TEncSearch: remove unnecessary pointer argument refs + [a1b29fa01efb] + + * source/Lib/TLibEncoder/TEncCu.h, source/encoder/compress.cpp: + compress: hungarian cleanups, white-space fixes + [b15db4f286f6] + + * 
source/common/common.h: + common: fixup commit for previous (msvc didn't write the file) + [d36cc90f9848] + + * source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibEncoder/TEncTop.cpp, source/common/common.cpp, + source/encoder/encoder.cpp, source/x265.cpp: + x265: call x265_set_globals() when the encoder is created + [3d61f1b09eb6] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComMotionInfo.h, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/Lib/TLibEncoder/TEncCavlc.cpp, + source/Lib/TLibEncoder/TEncCavlc.h, + source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncEntropy.cpp, + source/Lib/TLibEncoder/TEncEntropy.h, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSbac.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.h, source/encoder/compress.cpp, + source/encoder/frameencoder.h: + TEncCu: more de-hungarian, global SR, and AMVPInfo cleanups + [d75d51592719] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/Lib/TLibCommon/TComPicSym.h, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComYuv.h, source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncEntropy.cpp, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.h, source/common/TShortYUV.h, + source/encoder/compress.cpp, 
source/encoder/frameencoder.cpp: + TEncCu: more de-hungarian and global SR and more cleanups + [d5ab0435a15b] + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/encoder/compress.cpp: + TEncCu: de-hungarian buffer member variables + [870fe66f4533] + + * source/Lib/TLibEncoder/TEncCu.h: + TEncCu: white-space and comment cleanups + [0a126dd487b2] + + * source/Lib/TLibCommon/TComPattern.cpp: + TComPattern: finishing touches + [f2ff5cb195fc] + + * source/Lib/TLibCommon/TComPattern.h: + TComPattern: white-space cleanup of header + [98e1d31e7029] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComPattern.h: + TComPattern: de-hungarian, and global SR + [c75bdba9683f] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComLoopFilter.h, + source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComPattern.h, + source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuv.h, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.h, + source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TComYuv.h, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncPic.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/WeightPredAnalysis.cpp, + source/Lib/TLibEncoder/WeightPredAnalysis.h, + source/common/pixel.cpp: + TComPattern: cleanup + global SR + + neighbouring => neighboring, iStride -> stride, iComp -> comp + [5d5c89968b66] + + * source/common/common.cpp: + Backed out changeset: 064e6bfa93f0 (too much PSNR loss to be + default) + [61d15d0b453d] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + 
source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComMotionInfo.cpp, + source/Lib/TLibCommon/TComMotionInfo.h, + source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComSlice.h: + TComMotionInfo: simplify TComCUMvField + [24d0170026dc] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComMotionInfo.cpp, + source/Lib/TLibCommon/TComMotionInfo.h: + TComMotionInfo: simplify TComMvField + [73fec016b9b9] + + * source/common/common.cpp, source/encoder/encoder.cpp, source/x265.h, + source/x265opts.h: + x265: hide busted adaptive QP functionality + + This might get ripped out entirely, will wait until after the GPL + release + [be8598d81e19] + + * source/common/common.cpp: + x265: roughly alphabetize tool option reporting + [771d4d232d7e] + + * source/common/common.cpp, source/encoder/encoder.cpp, source/x265.h, + source/x265opts.h: + x265: hide --sao-max-offsets argument + [93d5e906c6ef] + + * source/common/common.cpp: + x265: enable --fast-cbf by default [CHANGES OUTPUTS] + + This appears to be a really good trade-off when RDO is enabled + [064e6bfa93f0] + + * source/common/common.cpp: + x265: tweak option logging + [41d1a452c033] + + * source/common/common.cpp, source/encoder/encoder.cpp, source/x265.h, + source/x265opts.h: + x265: hide --tmvp parameter, it does not appear useful + + --tmvp 0 and 2 appear to both be worse than the default In the + interest of simplifying our API for the near-term, I'm removing the + param + [98fb5cbf6f57] + + * source/common/common.cpp, source/encoder/encoder.cpp, source/x265.h, + source/x265opts.h: + x265: hide --merge-level parameter, it does not appear useful + + --merge-level 3 decreases bitrate very slightly + --merge-level 4 increases bitrate + + In the interest of simplifying our API for the near-term, I'm + removing the param + [c1fd77c0a995] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + 
source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSbac.h: + TComTrQuant: nits and global SR + [3cfa626d1ec4] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h: + TComTrQuant: remove hungarian from QpParam + [959bd4f14b23] + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h: + TComPrediction: remove a couple more invalid pointer refs, fix a + comment + [a930ca2615c1] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + fix variable shadow warnings reported by GCC + [2f36960fb069] + + * source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComLoopFilter.h, + source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/WeightPredAnalysis.cpp: + refactor: iDir->dir, iList->list + [17f0a360918f] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComMotionInfo.cpp, + source/Lib/TLibCommon/TComMotionInfo.h, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/Lib/TLibCommon/TComWeightPrediction.h, + source/Lib/TLibEncoder/TEncCavlc.cpp, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/WeightPredAnalysis.cpp: + TComPrediction: remove ptr arg references, plus some global + search/replace + [b14b992b3f0c] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComPattern.h, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + 
source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/Lib/TLibCommon/TComWeightPrediction.h, + source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/compress.cpp: + TComPrediction: cleanup hungarian, plus some global search/replace + [37d226b549d7] + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/dct.cpp, + source/common/intrapred.cpp, source/common/ipfilter.cpp, + source/common/vec/dct.inc, source/common/vec/ipfilter.inc, + source/common/vec/ipfilter16.inc, source/common/vec/ipfilter8.inc, + source/test/ipfilterharness.cpp: + refactor: move subpel interpolation filters to TComRom, remove dups + [8b7079adef37] + + * source/Lib/TLibCommon/TComRdCost.cpp, + source/Lib/TLibCommon/TComRdCost.h, + source/Lib/TLibCommon/TComRdCostWeightPrediction.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + TComRdCost: adopt x264 style names, move last mv cost remnant to + TEncSearch + [9ffb3769afdb] + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.h, + source/encoder/compress.cpp: + TEncCU, TEncSAO: global search/replace for common hungarian names + [3edafbb958a7] + + * source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + TEncSearch: cleanup set method names + [251a3fb404f0] + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + TEncSearch: further cleanups + [0fe565be0395] + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/encoder/compress.cpp: + TEncSearch: remove more hungarian from member vars, other 
cleanups + [1992995f62f3] + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + TEncSearch: cleanup init, remove unused s_dFilter, m_puiDFilter + [1aa4e291c6dc] + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + TEncSearch: remove hunariang prefixes from member vars, remove + unused min/max + [42754524511a] + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + TEncSearch: remove hungarian prefixes from temp buffers + [5098bd2dfa09] + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + TEncSearch: remove unused singleton temp buffers + [879890f4aa94] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: qtTempAccessLayer -> qtlayer + [ab54c5d80683] + + * source/Lib/TLibEncoder/TEncSearch.h: + TEncSearch: remove redundant white-space + [bb1b55c79d3d] + +2013-07-11 Deepthi Devaki + + * source/common/pixel.cpp, source/common/primitives.h: + WeightedPredicton: Code cleanup, variable renaming - no logic + change. + [b09304da8423] + + * source/Lib/TLibCommon/TComWeightPrediction.cpp: + Weighted prediction primitives integrated to encoder. + [a9cf11753b9d] + + * source/common/pixel.cpp, source/common/primitives.h, + source/test/pixelharness.cpp, source/test/pixelharness.h: + C primitives and testbench support for weighted prediction + unidirectional. 
+ [a8fba5231ff3] + +2013-07-11 praveentiwari + + * source/Lib/TLibCommon/TComRdCost.cpp, + source/Lib/TLibCommon/TComRdCost.h, + source/Lib/TLibCommon/TComRdCostWeightPrediction.cpp: + iStrideCur member variable replaced with strideCur for all + referances + [cdc2757a09f0] + + * source/Lib/TLibCommon/TComRdCost.cpp, + source/Lib/TLibCommon/TComRdCost.h, + source/Lib/TLibCommon/TComRdCostWeightPrediction.cpp: + iStrideOrg member variable renamed to strideOrg for all referances + [24670d4aea5e] + +2013-07-11 Steve Borho + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: cleanup xEncodeResidualQT + [c1e2e7ee80f8] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: further cleanup of xEstimateResidualQT + [e8c50a292890] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: further cleanup of xEstimateResidualQT + [dfdd07b57714] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: more cleanups + [2d6cbbfeae8b] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: partial cleanup of xEstimateResidualQT + [54b7e44a945f] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: cleanup encodeResAndCalcRdInterCU + [740e4002bc8a] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: cleanup xPatternSearchFracDIF + [5cfdcda40cd9] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: cleanup xPatternSearchFast + [5170b6383156] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: cleanup xPatternSearch + [ad8f26c19b65] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: cleanup xMotionEstimation + [2158663a6e10] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: cleanup xGetTemplateCost and xEstimateMvPredAMVP + [17699ac30ca9] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: cleanup predInterSearch + [75d4bc043fa1] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: cleanup xMergeEstimation + [322076f45e2a] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: cleanup IPCMSearch + 
[a566e9fdb97a] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: cleanup xEncPCM + [1a15e5414b07] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: cleanup estIntraPredChromaQT + [6e8092345092] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: cleanup estIntraPredQT + [725885f26120] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: cleanup preestChromaPredMode + [a81b1600a1ae] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: cleanup xSetIntraResultChromaQT + [a3bf887469c8] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: cleanup xRecurIntraChromaCodingQT + [d8c562aab747] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: cleanup xLoadIntraResultChromaQT, use block copy + primitives + [87516ea1b92e] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: cleanup xStoreIntraResultChromaQT + [204d2839f830] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: cleanup xLoadIntraResultQT, correct stride and size for + chroma + [7367545ec76a] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: cleanup xStoreIntraResultQT + [65409511940b] + +2013-07-10 Steve Borho + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: cleanup xSetIntraResultQT + [b549964b4636] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: cleanup xRecurIntraCodingQT, use block copy primitives + [5542e517b305] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: cleanup xIntraCodingChromaBlk + [e383fa1d83b7] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: cleanup xIntraCodingLumaBlk + [155ee7e2e1a8] + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + TEncSearch: cleanup xEncCoeffQT, remove unused bRealCoeff + [f5e166b6a6f2] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: cleanup xGetIntraBitsQT + [c21dd51ed3ca] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: cleanup xEncIntraHeader + [3031fd60571a] + + * 
source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: cleanup xEncCoeffQT + [ff557176bba7] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: cleanup xEncSubdivCbfQT + [d4433b481d08] + + * source/Lib/TLibEncoder/TEncSearch.h: + TEncSearch: remove meaningless consts and comments + [2245a4a8b34d] + + * source/Lib/TLibEncoder/TEncSearch.h: + TEncSearch: followup renames + [c1a9e7bb9749] + + * source/Lib/TLibEncoder/TEncSearch.h: + TEncSearch: cleanup the header + [d09e321b4d06] + + * source/common/primitives.h, source/common/vec/ipfilter16.inc, + source/common/vec/ipfilter8.inc: + primitives: cleanup funcdefs + [d6ad445e4eeb] + + * source/encoder/compress.cpp: + compress: remove trailing white-space + [cd6cdea03bbe] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComLoopFilter.h, + source/Lib/TLibCommon/TComMotionInfo.cpp, + source/Lib/TLibCommon/TComMotionInfo.h, + source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComPattern.h, + source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuv.h, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibCommon/TComRdCost.cpp, + source/Lib/TLibCommon/TComRdCost.h, + source/Lib/TLibCommon/TComRdCostWeightPrediction.cpp, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.h, + source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/Lib/TLibCommon/TComWeightPrediction.h, + source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TComYuv.h, + source/Lib/TLibEncoder/TEncCavlc.cpp, + source/Lib/TLibEncoder/TEncCavlc.h, + source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + 
source/Lib/TLibEncoder/TEncEntropy.cpp, + source/Lib/TLibEncoder/TEncEntropy.h, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncPic.cpp, + source/Lib/TLibEncoder/TEncPic.h, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSbac.h, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.h, + source/Lib/TLibEncoder/WeightPredAnalysis.cpp, + source/Lib/TLibEncoder/WeightPredAnalysis.h, + source/encoder/compress.cpp: + more mass search-replaces and various other cleanups + [f04c1ab7b493] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComLoopFilter.h, + source/Lib/TLibCommon/TComMotionInfo.cpp, + source/Lib/TLibCommon/TComMotionInfo.h, + source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComPattern.h, + source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/Lib/TLibCommon/TComWeightPrediction.h, + source/Lib/TLibEncoder/TEncCavlc.cpp, + source/Lib/TLibEncoder/TEncCavlc.h, + source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncEntropy.cpp, + source/Lib/TLibEncoder/TEncEntropy.h, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.h, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSbac.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/WeightPredAnalysis.cpp, + source/encoder/compress.cpp, 
source/encoder/frameencoder.cpp: + global search-replaces for common variables, remove prefixes + + pcCu -> cu, uiTrMode -> trMode, pcYuvPred -> predYuv, eRefPicList -> + picList, iRefList -> refList, iRefIdxTemp -> refIdxTmp + [4349ad516f23] + + * source/Lib/TLibEncoder/TEncCu.cpp: + TEncCu: fix memory leak + [48e99b31ca1e] + +2013-07-10 praveentiwari + + * source/Lib/TLibCommon/TComTrQuant.cpp: + Cleanup xGetCodedLevel function + [ccb2f92de6a4] + + * source/Lib/TLibCommon/TComTrQuant.cpp: + Clenup xGetICRateCost function + [6239f27298a1] + + * source/Lib/TLibCommon/TComTrQuant.cpp: + Cleanup xGetICRate function + [26e5e760a27a] + + * source/Lib/TLibCommon/TComTrQuant.cpp: + Cleanup setErrScaleCoeff function + [3b98d991c3e2] + +2013-07-10 Steve Borho + + * source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, + source/Lib/TLibEncoder/TEncSlice.cpp, source/encoder/encoder.cpp: + TEncSearch: remove m_bUseHADME, clean more hungarian + [7cffa89fad19] + +2013-07-10 ggopu + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncodeSearch : Cleanup Removed un used codes, white space and + Removed Hungarian Notation + [435170a8901e] + +2013-07-10 Deepthi + + * source/Lib/TLibEncoder/TEncCu.h, source/encoder/compress.cpp: + Removed duplicate copy of Merge RD cost estimation. + [422cbbbb1493] + + * source/encoder/compress.cpp: + Code reuse of Merge: part 2 + [6c320ed872a4] + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h: + Code reuse of Merge cost estimation for rdo/no-rdo: part 1 + [e2ded94ab75d] + + * source/Lib/TLibEncoder/TEncCfg.h: + Removing fdm from TEncCfg + [d291ed7c19b5] + + * source/common/common.cpp, source/encoder/encoder.cpp, source/x265.h, + source/x265opts.h: + Removing bEnableFastMergeDecision from CLI Options. 
+ [2b4e59ecd75b] + + * source/Lib/TLibEncoder/TEncCu.cpp, source/encoder/compress.cpp: + Fast Decision for Merge always set - this CLI option does not give + any higher efficiency. + [8aff90a1c454] + + * source/encoder/compress.cpp: + Enabling merge modes with non-zero residual in no-rdo mode. + [ff2b06916f87] + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h: + Recon for merge modes - store in TEnCU structure + [6c654e521fa4] + +2013-07-10 Steve Borho + + * source/Lib/TLibCommon/TComBitStream.cpp, + source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/Lib/TLibCommon/TComPicYuv.h, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibCommon/TComRom.cpp, + source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibEncoder/TEncAnalyze.h, + source/Lib/TLibEncoder/TEncCavlc.cpp, + source/Lib/TLibEncoder/TEncCavlc.h, + source/Lib/TLibEncoder/TEncCfg.h, source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h, source/common/ipfilter.cpp, + source/common/pixel.cpp, source/common/reference.cpp, + source/common/threadpool.cpp, source/common/vec/dct.inc, + source/common/vec/intrapred.inc, source/common/vec/ipfilter16.inc, + source/common/vec/ipfilter8.inc, source/common/vec/pixel.inc, + source/common/vec/pixel16.inc, source/common/vec/pixel8.inc, + source/common/wavefront.cpp, source/common/wavefront.h, + source/encoder/compress.cpp, source/encoder/encoder.cpp, + source/encoder/encoder.h, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h, source/encoder/motion.cpp, + source/encoder/motion.h, 
source/test/intrapredharness.cpp, + source/test/ipfilterharness.cpp, source/test/pixelharness.h, + source/test/testbench.cpp, source/x265.cpp, source/x265.h: + uncrustify + [ae3cf6062c1a] + +2013-07-09 Steve Borho + + * source/test/mbdstharness.cpp: + mbdstharness: workaround for linux test failures + [2bad78a65d37] + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/encoder/compress.cpp: + TEncCU: use loops to create/destroy objects, rename m_IntraInInterCU + [e126e6386acc] + + * source/Lib/TLibCommon/TComTrQuant.cpp: + TComTrQuant: fix shadowed variables for GCC + [df76c016468d] + + * source/common/common.cpp: + common: fix a potential 8bpp usage bug + [f6552d7189dd] + + * source/Lib/TLibEncoder/TEncCu.cpp, source/common/common.cpp: + TComRom: only allow zscan tables to be initialized once + [917b56038be3] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComLoopFilter.h, + source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComPic.cpp, + source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuv.h, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComRdCost.cpp, + source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComYuv.h, + source/Lib/TLibEncoder/TEncCavlc.cpp, + source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncEntropy.cpp, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncTop.cpp, source/common/TShortYUV.h, + source/common/butterfly.h, source/common/common.cpp, + source/common/dct.cpp, source/common/intrapred.cpp, + source/common/vec/intrapred.inc, 
source/encoder/compress.cpp, + source/encoder/encoder.cpp, source/encoder/frameencoder.cpp: + TComRom: remove hungarian notation, remove unused butterfly.h + [2f68b284ba53] + + * source/Lib/TLibCommon/TComRom.h: + TComRom: simplify white-space + [daa1bf21a3ce] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/encoder/frameencoder.cpp, source/encoder/frameencoder.h: + frameencoder: de-hungarian the member vars and auto-vars + [95ae28081f11] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibEncoder/TEncSearch.cpp: + TComTrQuant: remove hungarian notations from member variables + [6af026c91d32] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/encoder/frameencoder.cpp: + TComTrQuant: remove m_bEnc member variable + [a9e3cf2ee919] + + * source/Lib/TLibCommon/TComTrQuant.cpp: + TComTrQuant: CU -> cu + [079bbdf4d687] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: remove hungarian prefixes from auto-vars + [d366eeefac0f] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibEncoder/TEncSbac.cpp: + TComTrQuant: hungarian removal, white-space cleanup, remove useless + comments + [a096d4fdefe5] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibEncoder/TEncSearch.cpp: + TComTrQuant: simplify function arguments + + * do not pass arlCCoef as a pointer reference + * return acSum rather than pass as integer reference + [d6c1bf1a4801] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, source/common/dct.cpp, + source/common/vec/dct.inc: + TComTrQuant: remove deprecated DCT functions + [733534550553] + + * source/Lib/TLibCommon/TComTrQuant.cpp: + TComTrQuant: lowercase cu, other cleanups + [10a797823849] + +2013-07-09 praveentiwari + + * source/Lib/TLibCommon/TComTrQuant.cpp: + Cleanup xRateDistOptQuant function + 
[6cef4fc5a850] + + * source/Lib/TLibCommon/TComTrQuant.cpp: + Cleanup xITransformSkip function + [0124ebb836bb] + + * source/Lib/TLibCommon/TComTrQuant.cpp: + Cleanup xTransformSkip function + [593ccf9b637b] + + * source/Lib/TLibCommon/TComTrQuant.cpp: + Cleanup xIT function + [23b7b876ad58] + + * source/Lib/TLibCommon/TComTrQuant.cpp: + Cleanup invRecurTransformNxN function + [3e2ea30421a1] + + * source/Lib/TLibCommon/TComTrQuant.cpp: + Cleanup invtransformNxN function + [ce4c31d4fcad] + + * source/Lib/TLibCommon/TComTrQuant.cpp: + More cleanup with transformNxN + [6151c3d14f14] + + * source/Lib/TLibCommon/TComTrQuant.cpp: + cleanup transformNxN function + [dc146c2b451a] + + * source/Lib/TLibCommon/TComTrQuant.cpp: + cleanup init function + [741604b8a4b5] + + * source/Lib/TLibCommon/TComTrQuant.cpp: + cleanup xDeQuant function + [0847be750c4e] + +2013-07-09 Steve Borho + + * source/Lib/TLibCommon/TComTrQuant.cpp: + TComTrQuant: lowercase cu + [bd0ece7e95a1] + +2013-07-09 praveentiwari + + * source/Lib/TLibCommon/TComTrQuant.cpp: + cleanup xQuant function + [c1bdb5c6f841] + +2013-07-10 Deepthi + + * Merge + [707b458d0e5e] + +2013-07-09 Deepthi + + * source/encoder/compress.cpp: + Exit early if merge modes detected + [df22ad807c9f] + + * source/Lib/TLibEncoder/TEncCu.h, source/encoder/compress.cpp: + Updating merge cost calculation to enable early skips. 
+ [0cd3c9c94bb0] + +2013-07-09 Steve Borho + + * source/common/vec/ipfilter.inc, source/common/vec/ipfilter8.inc, + source/test/testbench.cpp: + misc fixes + [aa8ddcc78c4a] + + * source/common/vec/intrapred.inc: + intrapred: further cleanups + + * rename methods to match primitive funcdef + * remove unoptimized all-angs methods, use C ref + [5a81d522e9e3] + + * source/common/vec/ipfilter8.inc: + ipfilter: white-space fixup + [2e6f01521c90] + + * source/common/vec/ipfilter8.inc: + ipfilter: remove buggy SSE2 version of + filterHorizontalMultiplaneExtend + [c1df7dba97be] + + * source/common/vec/ipfilter16.inc, source/common/vec/ipfilter8.inc, + source/test/ipfilterharness.cpp: + ipfilter: follow up cleaning, one 16bpp compile fix + [0602a41d8ced] + +2013-07-09 ggopu + + * source/common/vec/ipfilter.inc, source/common/vec/ipfilter16.inc, + source/common/vec/ipfilter8.inc: + ipfilter*.inc : Cleanup function names and variables + [1d24abd75465] + + * source/common/common.cpp, source/common/ipfilter.cpp: + ipfilter: Cleanups function names and Variables + [05b43fadedd8] + +2013-07-09 praveentiwari + + * source/Lib/TLibCommon/TComTrQuant.cpp: + cleanup signBitHidingHDQ function + [ea86a1524985] + + * source/Lib/TLibCommon/TComTrQuant.cpp: + Cleanup storeSliceQpNext function + [7b33153ce08c] + +2013-07-09 Deepthi + + * source/encoder/compress.cpp: + Cleanup: remove commented code + [fe3a1d78f9d9] + + * source/encoder/compress.cpp: + Merge modes now use RDO cost to compare against inter/intra. Quality + up, bitrate down. 
+ [4efd9d9d4b60] + + * source/encoder/compress.cpp: + Remove unused if-checks + [9605252d9dd8] + +2013-07-09 Steve Borho + + * source/test/ipfilterharness.cpp, source/test/mbdstharness.cpp: + testbench: more test result format tweaking + [a5c300785b8f] + +2013-07-08 Steve Borho + + * source/test/intrapredharness.cpp: + intrapred: tweak test result reporting + [a103a8a37fb8] + + * source/Lib/TLibCommon/TComInterpolationFilter.cpp, + source/Lib/TLibCommon/TComInterpolationFilter.h, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/Lib/TLibCommon/TComWeightPrediction.h, + source/Lib/TLibCommon/TComYuv.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/ipfilter.cpp, + source/common/primitives.h: + TComInterpolationFilter: remove dead code and dead primitive + [eee0360420e2] + + * source/Lib/TLibCommon/TComInterpolationFilter.cpp, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComYuv.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/ipfilter.cpp, + source/common/pixel.cpp, source/common/primitives.h, + source/common/vec/blockcopy.inc, source/common/vec/ipfilter.inc, + source/encoder/motion.cpp, source/test/ipfilterharness.cpp, + source/test/pixelharness.cpp: + primitives: normalize primitive function names + [3f91e1121a00] + + * source/common/primitives.h: + primitives: white-space alignment for primitive structure, re-order + for clarity + [6d15ceb19861] + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/common/pixel.cpp, + source/common/primitives.h, source/common/vec/pixel.inc, + source/common/x86/asm-primitives.cpp, source/encoder/compress.cpp, + source/encoder/motion.h, source/test/ipfilterharness.cpp, + source/test/ipfilterharness.h, source/test/pixelharness.cpp, + source/test/pixelharness.h: + primitives: unify primitive funcdef type names + [ed9b76835696] + + * source/x265.cpp: + x265: cleanups + [420bb0f5b279] + + * 
source/test/pixelharness.cpp, source/test/pixelharness.h: + pixel: cleanup test harness methods + [992721365f6f] + + * source/input/input.cpp, source/input/input.h, + source/output/output.cpp, source/output/output.h, source/x265.cpp: + i/o: lower case open methods + [b6d72f3e0096] + + * source/common/reference.cpp: + reference: add a comment + [b36838e5b7cf] + + * source/CMakeLists.txt: + cmake: move version.cmake include to just before the CLI link + + this should avoid rebuilds in cmake generated Makefiles when the + version number changes (which happens at every commit) + [87d2b6578384] + + * source/common/common.cpp, source/common/common.h: + common: cleanups, remove unused dumpBuffer routine + [11194a1a5917] + + * source/common/CMakeLists.txt: + cmake: force i686 arch for wavefront.cpp for GCC (to generate + intrinsics) + [c3b5d95f11c3] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h, source/common/reference.cpp, + source/common/threading.cpp, source/common/threading.h, + source/common/threadpool.cpp, source/encoder/frameencoder.cpp, + source/test/testpool.cpp: + threading: lower case method names + [7bdf867f5565] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibEncoder/TEncTop.cpp, source/common/reference.cpp, + source/common/threadpool.cpp, source/common/threadpool.h, + source/common/wavefront.cpp, source/encoder/encoder.cpp, + source/encoder/frameencoder.cpp, source/test/testpool.cpp: + threadpool: lower case method names + [2724414e4c97] + + * source/test/testpool.cpp: + testpool: update to new interfaces + [1e0de23fd8cf] + + * source/encoder/frameencoder.cpp, source/encoder/frameencoder.h: + frameencoder: rename irow to row + [ca4ef78851dc] + + * source/encoder/frameencoder.cpp, source/encoder/frameencoder.h: + frameencoder: rename m_nrows to m_numRows + [0cd64834fc44] + + * source/Lib/TLibEncoder/TEncSlice.cpp, source/common/reference.cpp, + source/common/reference.h, source/common/threadpool.cpp, + 
source/common/threadpool.h, source/common/wavefront.cpp, + source/common/wavefront.h, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h: + wavefront: lower case some method names + [460e36b39510] + + * source/Lib/TLibEncoder/TEncGOP.h, + source/Lib/TLibEncoder/TEncSlice.cpp, source/common/CMakeLists.txt, + source/common/threadpool.cpp, source/common/threadpool.h, + source/common/wavefront.cpp, source/common/wavefront.h, + source/encoder/CMakeLists.txt, source/encoder/frameencoder.cpp, + source/encoder/frameencoder.h, source/encoder/wavefront.cpp, + source/encoder/wavefront.h: + threadpool: split QueueFrame into own files, rename wavefront to + frameencoder + + This commit also removes the un-exposed x265_init_threading() + function which would have leaked the thread pool if anyone had used + it. + [ef1a5aeed45c] + + * source/common/reference.cpp, source/common/threadpool.cpp, + source/common/threadpool.h: + threadpool: cleanup + [1f126e6d3188] + + * source/common/reference.cpp: + reference: general cleanup + [8e9e65404251] + + * source/common/primitives.cpp: + primitives: general cleanup + [25dd71fec257] + + * source/common/primitives.h: + primitives: more argument cleanups + [ac679985a223] + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/intrapred.cpp, + source/common/primitives.h, source/common/vec/intrapred.inc, + source/encoder/compress.cpp, source/test/intrapredharness.cpp, + source/test/intrapredharness.h: + primitives: rename intra primitive function pointers + [f0ed0e012c69] + + * source/common/primitives.h, source/test/intrapredharness.cpp, + source/test/intrapredharness.h: + primitives: rename intra funcdefs, cleanup hungarian from + primitives.h + [676b0ce0b073] + + * source/common/primitives.h: + primitives: update comment for PartitionFromSizes + [45237ca3f80e] + + * source/test/mbdstharness.cpp, source/test/mbdstharness.h: + cleanup dct testbench names + [c9bf846287de] + + * 
source/test/mbdstharness.cpp: + one more fixup + [e05d5cb444ed] + + * source/test/mbdstharness.cpp: + testbench fixup + [7ff03b771df4] + + * source/Lib/TLibCommon/TComTrQuant.cpp, source/common/dct.cpp, + source/common/primitives.h, source/common/vec/dct.inc, + source/test/mbdstharness.cpp: + primitives: rename deQuant to dequant + [7f9581358d71] + + * source/common/dct.cpp: + dct: tweak C reference function names + [64a60d981754] + +2013-07-08 praveentiwari + + * source/common/vec/dct.inc: + xIDCT32 renamed + [c6c2f393542b] + + * source/common/vec/dct.inc: + xIDCT16 renamed + [6f3e60bfc015] + + * source/common/vec/dct.inc: + xIDCT8 renamed + [41b0fd84e529] + + * source/common/vec/dct.inc: + xIDCT4 renamed to idct4 + [9f1b4c87102e] + + * source/common/vec/dct.inc: + xIDST4 renamed to idst4 + [96f0f843ce32] + + * source/common/vec/dct.inc: + xDCT32 renamed and code cleanup + [6b41676cb38e] + + * source/common/vec/dct.inc: + xDCT8 renamed and cleanup + [9e014c33e51f] + + * source/common/vec/dct.inc: + xDCT16 renamed and code cleanup + [b339a499aed7] + + * source/common/vec/dct.inc: + xDCT4 vector function renamed and code cleanup + [60a0b40415c0] + + * source/common/vec/dct.inc: + xDST4 renamed and code cleanup + [b433892d93bc] + + * source/common/vec/dct.inc: + xDeQuant renamed and code cleanup + [977e4de578f8] + + * source/common/dct.cpp: + xDeQuant renamed to dequant and code cleanup + [f4a8ee3589fc] + + * source/common/dct.cpp: + xIDCT32_C renamed to idct32_c and code cleanup + [ad9fa655dd85] + + * source/common/dct.cpp: + xIDCT16_C renamed to idct16_c and code cleanup + [6c88b49c9df6] + + * source/common/dct.cpp: + xIDCT8_C renamed to idct8_c and code cleanup + [628340f39377] + + * source/common/dct.cpp: + xIDCT4_C renamed to idct4_c and code cleanup + [96ba4ac11c58] + + * source/common/dct.cpp: + xIDST4_C renamed to idst4_c and code cleanup + [baf12bcf3edb] + + * source/common/dct.cpp: + xDCT32_C renamed to dct32_c and code cleanup + [f7b83d5d0517] + + * 
source/common/dct.cpp: + xDCT16_C renamed to dct16_c and code cleanup + [7b0ef137a4f3] + +2013-07-08 Steve Borho + + * source/common/vec/intrapred.inc: + intrapred: fixup variable renaming + [fcca2928bf47] + + * source/common/intrapred.cpp: + intrapred: remove hungarian prefix for file static + [ec2b823011be] + + * source/common/pixel.cpp: + pixel: cleanup C references + [3fa99086d2f5] + + * source/common/vec/pixel16.inc: + pixel: use x264 style argument names + [a9d1cbda189f] + + * source/common/vec/pixel8.inc: + pixel: use x264 style argument names + [e0224143803e] + + * source/common/intrapred.cpp: + intrapred: finish variable cleanups + [1f4cd3821191] + + * source/common/vec/pixel.inc: + pixel: remove caps in function arguments, macro white-space cleanups + [066d75ab8433] + + * source/common/vec/pixel.inc, source/common/vec/utils.h: + pixel: cleanup includes + [dd217d262bbf] + + * source/common/vec/pixel.inc: + pixel: simplify SET_FUNC_PRIMITIVE_TABLE macro and relocate + [44ca2af6c6b9] + +2013-07-08 ggopu + + * source/common/intrapred.cpp, source/common/vec/intrapred.inc: + IntraPred: Removed Unused Codes and Cleanup the Function names and + Hungarian Notations + [56eeac8279f3] + + * source/common/vec/pixel.inc, source/common/vec/pixel16.inc, + source/common/vec/pixel8.inc, source/common/vec/sse.inc: + Vector Primitives : Removed hungarian notation + [0ba9d8a0ec82] + +2013-07-08 Steve Borho + + * source/common/vec/intrapred.inc: + intra: fix HIGH_BIT_DEPTH builds + [7242d6166f87] + +2013-07-08 praveentiwari + + * source/common/dct.cpp: + xDCT8_C renamed to dct8_c and code cleanup + [dc13d07919db] + + * source/common/dct.cpp: + xDCT4_C renamed to dct4_c and code cleanup + [6030b16d3025] + + * source/common/dct.cpp: + xDST4_C renamed to dst4_c and clenup + [0cf5a2be157b] + +2013-07-08 Steve Borho + + * source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncTop.cpp: + TEncTop: use mini-GOP sized frame batches with 
--gops 1 [CHANGES + OUTPUTS] + + This changes the cadence slightly, so outputs will change. This also + greatly reduces the memory requirements when GOP parallelism is not + in use. + [30346629dbc4] + +2013-07-05 praveentiwari + + * source/common/dct.cpp: + Cleanup some left hungarian notation left in first commit for + xDeQuant + [7ee04e912bdd] + + * source/common/dct.cpp: + xDeQuant code cleanup + [e46a76843569] + + * source/common/dct.cpp: + xIDCT32_C code cleanup + [92c0f50f7ec9] + + * source/common/dct.cpp: + xIDCT16_C code cleanup + [712eb60831c0] + + * source/common/dct.cpp: + xIDCT8_C code cleanup + [f431146891cc] + + * source/common/dct.cpp: + xIDCT4_C code cleanup + [65517cac4316] + +2013-07-05 Deepthi + + * source/common/common.cpp: + Rect option is enabled by default + [6b12d890e0e9] + +2013-07-05 Min Chen + + * source/common/vec/intrapred.inc: + intrapred: code cleanup + [f84fb8dcdf32] + +2013-07-05 Deepthi Devaki + + * source/common/vec/ipfilter16.inc: + Used memory align macro in ipfilter16.inc for compatibility with + other compilers. 
+ [8543860c3817] + +2013-07-05 praveentiwari + + * source/common/dct.cpp: + xIDST4_C code cleanup + [df4bd7c4a4a1] + + * source/common/dct.cpp: + xDCT32_C code cleanup + [30022ece780f] + + * source/common/dct.cpp: + xDCT16_C code cleanup + [eee88868ac98] + + * source/common/dct.cpp: + xDCT8_C code cleanup + [52e9946621cd] + + * source/common/dct.cpp: + xDCT4_C code cleanup + [8c0bdd83e6dc] + + * source/common/dct.cpp: + xDST4_C code cleanup + [1f0f735545bf] + + * source/common/vec/dct.inc: + partialButterfy32 code cleanup + [71ed2bcad90a] + + * source/common/vec/dct.inc: + partialButterfly16 code cleanup + [a3a0bb7eced1] + + * source/common/vec/dct.inc: + partialButterfly8 code clenup + [1955cac08ed8] + +2013-07-05 ggopu + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/Lib/TLibCommon/TComYuv.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/TShortYUV.cpp, + source/common/TShortYUV.h: + TShortYUV : Removed Get*() Method and Moved Member variables private + to Public + [5149904749c6] + +2013-07-05 praveentiwari + + * source/common/vec/dct.inc: + xIDCT32 code cleanup + [02973bed3d3c] + + * source/common/vec/dct.inc: + xIDCT16 code cleanup + [46d7c537f220] + +2013-07-05 Deepthi + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/Lib/TLibCommon/TComYuv.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/TShortYUV.cpp, + source/common/TShortYUV.h: + Backout changeset: TShortYUV + [e5eded426f38] + +2013-07-05 sumalatha + + * source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncSearch.h, source/encoder/compress.cpp: + intra and merge in no-rdo latest patch + [d492600f757a] + +2013-07-04 Deepthi Devaki + + * source/common/vec/ipfilter16.inc: + Fix for vc10 crash in ipfilter16.inc + [bf00c76a537d] + +2013-07-02 ggopu + + * source/Lib/TLibCommon/TComPrediction.cpp, + 
source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/Lib/TLibCommon/TComYuv.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/TShortYUV.cpp, + source/common/TShortYUV.h: + TShortYUV : Removed all the get() method and moved private data + member into Public + [5c307e0cd2f5] + +2013-07-03 praveentiwari + + * source/common/vec/dct.inc: + xIDCT4 intrinsic code cleanup + [133b907bb8ee] + + * source/common/vec/dct.inc: + xIDCT8 intrinsic code cleanup + [dfa4be3b6bab] + + * source/common/vec/dct.inc: + xIDST4 intrinsic code cleanup + [e55954ec3be8] + + * source/common/vec/dct.inc: + xIDST4 vector code cleanup + [bf7dcf1b6cb1] + + * source/common/vec/dct.inc: + xDCT32 intrinsic code cleanup + [48aa9d0d5d74] + +2013-07-03 Deepthi Devaki + + * source/common/vec/ipfilter16.inc: + Fix for crash in vc9 for IP filter + [15ad7154c62f] + +2013-07-03 praveentiwari + + * source/common/vec/dct.inc: + xDCT32 vector code cleanup + [0ab33d7ffaea] + + * source/common/vec/dct.inc: + xDCT16 intrinsic code cleanup + [1a70ee3a0be9] + +2013-07-02 praveentiwari + + * source/common/vec/dct.inc: + xDCT16 vector code cleanup + [31801697cf9b] + +2013-07-05 Deepthi + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/common/vec/intrapred.inc: + # From 2b989d9a7e9ecf86a592f33c39555cd63dbebd5a Mon Sep 17 00:00:00 + 2001 # From: Min Chen # Date: Tue, 2 Jul 2013 + 16:54:35 +0800 # Subject: [PATCH 3/3] intrapred: IntraAngle33 32x32 + intrapred: IntraAngle33 32x32 + + --- source/Lib/TLibEncoder/TEncSearch.cpp | 2 +- + source/common/vec/intrapred.inc | 2628 + ++++++++++++++++++++++++++++++++- 2 files changed, 2628 + insertions(+), 2 deletions(-) + [f1101476cf1c] + +2013-07-03 Deepthi + + * source/common/vec/pixel.inc: + Fix for HIGH_BIT_DEPTH enable + [209cce8f38be] + + * source/common/vec/pixel.inc, source/common/vec/utils.h: + Assert error in pixel.inc resolved + [e59400fe1240] + +2013-07-02 praveentiwari + + * source/common/vec/dct.inc: + xDCT8 intrinsic code cleaup + [57b142b20d91] + 
+2013-07-02 Deepthi + + * source/common/threadpool.cpp: + # From d515ac32f810fe6e6a986862771c85564193ee51 Mon Sep 17 00:00:00 + 2001 # From: Min Chen # Date: Mon, 1 Jul 2013 + 17:45:07 +0800 # Subject: [PATCH 2/3] Disable #pragma in INTEL + Compiler Disable #pragma in INTEL Compiler + --- source/common/threadpool.cpp | 2 ++ 1 files changed, 2 + insertions(+), 0 deletions(-) + [4ecacec3c8fa] + + * source/common/vec/pixel.inc: + # From 8fa0f843e7fcf96f934856e5f0541501f077909a Mon Sep 17 00:00:00 + 2001 # From: Min Chen # Date: Tue, 2 Jul 2013 + 15:57:32 +0800 # Subject: [PATCH 1/3] intrapred: transpose_32x32 + intrapred: transpose_32x32 + --- source/common/vec/pixel.inc | 12 +++++++++++- 1 files changed, 11 + insertions(+), 1 deletions(-) + [396712bd0111] + +2013-07-02 praveentiwari + + * source/common/vec/dct.inc: + xDCT8 vector code cleanup + [4b738b357911] + + * source/common/vec/dct.inc: + Removed hungarian notation from xDCT4 intrinsic code + [01b2f4e5cbdd] + + * source/common/vec/dct.inc: + Removed hungarian notation from xDST4 intrinsic code + [c7acbdbd0986] + + * source/common/vec/dct.inc: + xDST4 code cleanup + [97a2b34abe05] + +2013-07-01 praveentiwari + + * source/common/vec/dct.inc: + Removed hungarian notation and code cleanup for xDeQuant vector code + [31e8e201a7ab] + +2013-07-01 Deepthi Devaki + + * source/common/ipfilter.cpp: + code cleanup and variable renaming in ipfilter.cpp + [1857d23adace] + +2013-07-01 praveentiwari + + * source/test/mbdstharness.cpp: + Removed hungarian notation and code cleanup for + check_xdequant_primitive test code + [936c56e589ec] + + * source/test/mbdstharness.cpp: + Removed hungarian notation and code cleanup for + check_quantaq_primitive test code + [65f14e9260f6] + + * source/test/mbdstharness.cpp: + Removed hungarian notation and cleanup for check_quant_primitive + test code + [d47ea60f050f] + +2013-07-01 Deepthi Devaki + + * source/common/vec/ipfilter8.inc: + Removed cliping from vertical filter implementation; 
observed to be + redundant + [f72fac178387] + +2013-06-30 Steve Borho + + * source/cmake/version.cmake: + cmake: remove cache property for X265_VERSION + + This was causing cmake to only check the version when the cmake + cache was entirely cleared (usually just once). After this change + cmake will update X265_VERSION every time it generates the CLI + project. + [30c0e5591120] + +2013-06-29 Steve Borho + + * source/Lib/TLibEncoder/TEncSlice.cpp: + TEncSlice: ensure lambda per QP does not change [CHANGES OUTPUTS] + + This fixes non-determinism when GOP parallelism is enabled. I think + we should just use x264's lambda tables and get rid of all of this + logic, but it's too big of a change to make right before I leave for + vacation. + + My 1.55 hack attempts to get our compression per-QP close to where + it was before the change. + [c3953142041c] + + * source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, + source/Lib/TLibEncoder/TEncSlice.cpp, source/encoder/wavefront.cpp, + source/encoder/wavefront.h: + TEncSearch: configure bitcost estimators at same time as TComRdCost + + This ensures the lambdas used for RD cost are always in sync with + the bit costs + [15f993eceb4b] + +2013-06-29 Min Chen + + * source/common/vec/intrapred.inc: + intrapred: Enable primitive with Intel Compiler + --- source/common/vec/intrapred.inc | 2 +- 1 files changed, 1 + insertions(+), 1 deletions(-) + [b0d55cc8161e] + +2013-06-28 Steve Borho + + * .hgtags: + Added tag 0.1 for changeset 99fab2ef92be + [e13a79c5ea45] + + * source/common/vec/ipfilter.inc: + VC9 crashes running filterHorizontalMultiplaneExtend in release + + The test outright fails in debug, but doesn't crash + [99fab2ef92be] [0.1] + + * source/encoder/encoder.cpp: + encoder: ensure primitives (and ROM) are initialized before + x265_check_params + + This might be an issue for API users + [c34b079119ab] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + 
source/Lib/TLibEncoder/TEncGOP.h, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.h, source/encoder/wavefront.cpp, + source/encoder/wavefront.h: + rename EncodeFrame to FrameEncoder + [47f4f559739b] + + * source/common/common.cpp, source/common/common.h, + source/encoder/encoder.cpp: + common: use g_aucConvertToBit for getMaxCuDepth + [47e80c91afaf] + + * source/Lib/TLibEncoder/TEncTop.cpp, source/common/primitives.cpp, + source/encoder/encoder.cpp: + TEncTop: move ROM management out of TEncTop + + Eventually we will want to allow more than one TEncTop in a process + [1b4c93071203] + + * source/Lib/TLibCommon/TComRom.cpp: + TComRom: make initROM and destroyROM safe for cycled calls + [aa4402b22049] + + * source/Lib/TLibCommon/TComPrediction.cpp: + TComPrediction: use aligned allocs for temporary interpolation + buffers + + Hoping this resolves the problems found with 32bit heap corruption. + [3b923c5e34ee] + + * source/common/common.cpp: + vc9 requires unambigous casts for log() arguments + [c31d63feaf21] + + * source/common/common.cpp: + gcc requires math.h for log() + [82b80c7a3ebc] + + * source/common/vec/ipfilter.inc: + ipfilter: disable vector version of filterHmulti + [12036d016834] + + * source/common/reference.cpp: + reference: remove obsolete code + [972659be1ea9] + + * source/common/vec/ipfilter8.inc, source/test/ipfilterharness.cpp: + ipfilter: white-space nits, no code chage + [fc3e18cdca42] + +2013-06-28 Deepthi Devaki + + * source/common/vec/ipfilter8.inc: + Modification on Horizontal filter + [010b7590003a] + + * source/common/ipfilter.cpp, source/common/primitives.h, + source/common/reference.cpp, source/common/vec/ipfilter.inc, + source/common/vec/ipfilter8.inc, source/test/ipfilterharness.cpp: + Merged buffer extension with Horizontal filter; integrated with + encoder + [f9cf63bae785] + +2013-06-28 Steve Borho + + * source/encoder/encoder.cpp: + encoder: Always use IDR decoder refresh type + [e025b15b8856] + +2013-06-28 
praveentiwari + + * source/test/mbdstharness.cpp, source/test/mbdstharness.h: + Test code for quantaq and quant functions. + [c5b3ab055087] + + * source/common/vec/dct.inc: + Vector code for quantaq and quant functions. + [96e5fa2a60cd] + + * source/common/dct.cpp: + Replaced uint64_t with int in quantaq_C and quant_C functions, + tested with BasketballDrive. + [4257f9931e27] + +2013-06-28 Steve Borho + + * source/common/common.cpp, source/common/common.h, + source/encoder/encoder.cpp, source/x265.h, source/x265opts.h: + Removed tuQTMaxLog2Size and tuQTMinLog2Size parametr from CLI option + [6f0059abaf51] + + * source/common/common.cpp, source/x265.h, source/x265opts.h: + Removed maxCUDepth parameter from CLI option + [e052ebcb13d5] + + * source/Lib/TLibEncoder/TEncTop.cpp: + TEncTop: improve comment about determining where the second I frame + will be + [c79ed90edca5] + + * source/Lib/TLibEncoder/TEncTop.cpp, source/encoder/encoder.cpp, + source/x265.cpp, source/x265.h, source/x265opts.h: + x265: add --bframes/-b parameter to select B GOP structure + [d9cf2ba5e957] + + * source/encoder/encoder.cpp: + encoder: round keyframeInterval after establishing GOP structure + + This prevents 30Hz video from accidentally triggering our -i32 B + frame hack + [83d6f513ba93] + + * source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuv.h, + source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h: + TComSlice: free motion reference when picture is no longer + referenced + + This is a stop-gap fix to the enourmous memory requirements of GOP + level parallelism. It would be better to re-use these structures + instead of allocing them every time. 
+ [10a9bc997966] + +2013-06-27 Steve Borho + + * source/Lib/TLibCommon/TComSlice.cpp: + TComSlice: nits + [57a074870b71] + + * source/Lib/TLibCommon/TComPicYuv.h: + TComPicYuv: remove obsolete member function definition + [f1ffad55ce94] + + * source/common/reference.cpp, source/encoder/wavefront.cpp: + thread: add serial code-paths for job provider if thread pool is not + available + [fe6020943bd0] + +2013-06-27 Min Chen + + * source/Lib/TLibEncoder/TEncSearch.cpp: + intrapred: Active code IntraAngle33 16x16 + --- source/Lib/TLibEncoder/TEncSearch.cpp | 14 ++------------ 1 files + changed, 2 insertions(+), 12 deletions(-) + [aa95488eaa2d] + + * source/common/vec/pixel.inc: + pixel: fix bug in Transpose 16x16 + --- source/common/vec/pixel.inc | 76 + +++++++++++++++++++++--------------------- 1 files changed, 38 + insertions(+), 38 deletions(-) + [265e7e0333c4] + +2013-06-27 sumalatha + + * source/encoder/compress.cpp: + Fixed the Hash error with -no-rdo ON + [3b93256a844b] + +2013-06-26 Steve Borho + + * source/Lib/TLibEncoder/TEncTop.cpp: + TEncTop: don't flush two GOP coders in one call + [5b42d1c900b4] + + * source/common/threadpool.cpp: + threadpool: tweak for thread deletion loop + [9fab0eb1538a] + + * source/common/threadpool.cpp: + threadpool: more explicit shutdown code + [d12f113a1e42] + + * source/Lib/TLibEncoder/TEncCu.cpp: + TEncCu: fix a small memory leak + [82d95660132f] + + * source/Lib/TLibEncoder/TEncCu.cpp: + TEncCU: white-space cleanup + [7bacf11b7f2e] + + * source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h: + TEncTop: allow GOP coder execution to overlap execution + + Finally.. GOP parallelism. 
+ [80d6e85cdd03] + + * source/Lib/TLibEncoder/TEncTop.cpp: + TEncTop: do not flush if no pics are queued, prevents deadlock + [d064340c3227] + + * source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/TEncCavlc.cpp: + cleanup: move SCALING_LIST_OUTPUT_RESULT to TEncCavlc + [f89b453bc412] + + * source/Lib/TLibCommon/TypeDef.h: + cleanup: remove unused REMOVE_SAO_LCU_ENC_CONSTRAINTS_3 + [90a69b42da1f] + + * source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TypeDef.h: + cleanup: move DISABLING_CLIP_FOR_BIPREDME to TComYuv, add comment + [8685e99751bc] + + * source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/TEncCavlc.cpp: + cleanup: move PRINT_RPS_INFO to TEncCavlc + [2adc62db4d55] + + * source/Lib/TLibCommon/TypeDef.h, source/Lib/TLibEncoder/TEncGOP.cpp: + cleanup: move VERBOSE_RATE define to TEncGOP + [518c70801520] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h: + TEncGOP: add a worker thread to process keyframe intervals + [93ca3f5bafec] + + * source/common/TShortYUV.cpp, source/common/TShortYUV.h: + TShortYUV: remove or correct comment lines + [7f49c1a3ffcc] + + * source/common/CMakeLists.txt, source/encoder/CMakeLists.txt: + cmake: move HM headers into their own source group in VC + [cfa1897d63eb] + + * source/encoder/CMakeLists.txt: + cmake: remove deleted compress.h from encoder project + [c7c32a64931f] + + * source/encoder/compress.cpp: + compress: remove warning disable pragmas, fix warnings + [8055fd9930c2] + + * source/encoder/compress.h: + compress: remove unused header + [a1831863c966] + + * source/encoder/compress.cpp: + compress: uncrustify + [6492774bd3e5] + + * source/encoder/compress.cpp: + compress: remove dead variables + [b5bbcf9da827] + + * source/common/primitives.h, source/test/mbdstharness.cpp, + source/test/mbdstharness.h: + primitives: fix symbol collision + [b3539e001ea4] + +2013-06-26 Min Chen + + * source/common/vec/intrapred.inc: + inrapred: generate all of 33 IntraAngle 16x16 modes once 
(vc9-win32 + only) + + --- source/common/vec/intrapred.inc | 4151 + ++++++++++++++++++++++++++++++++++++++- 1 files changed, 4148 + insertions(+), 3 deletions(-) + [3977edfa3ee1] + + * source/common/vec/intrapred.inc: + cleanup: Remove unused debug code + --- source/common/vec/intrapred.inc | 3 --- 1 files changed, 0 + insertions(+), 3 deletions(-) + [60955ad48ef9] + +2013-06-27 Mandar Gurav + + * source/common/vec/intrapred.inc: + primitves: 8 bit : PredIntraAng8x8 function table implementation + [dc1663eaf7f7] + +2013-06-26 praveentiwari + + * source/Lib/TLibCommon/TComTrQuant.cpp: + TrComQuant: replaced original code with primitive calls + [3e60dfba74c9] + + * source/common/dct.cpp: + Separated the logic block of code from xQuant function as two + functions for optimization + [60a4710e5fc3] + + * source/common/primitives.h: + primitives: Added function pointer types for quant + [e22c26d712a1] + +2013-06-26 sumalatha + + * source/encoder/compress.cpp: + commit : added log for printing costs for each partSize + [c23b800f04e5] + +2013-06-26 praveentiwari + + * source/common/primitives.h: + White-space fixes in primitives.h + [e48f23e628df] + + * source/common/dct.cpp: + Uncrustyfied dct.cpp file + [cfebb7774024] + +2013-06-25 Deepthi Devaki + + * source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TypeDef.h: + Enabled clipping for Bframes in ME as a fix for 8bpp-16bpp mismatch. 
+ [9f6a0fb1947a] + +2013-06-26 praveentiwari + + * source/Lib/TLibCommon/TComTrQuant.cpp: + White-space fixes in TComTrQuant.cpp file + [12322d9f0b18] + +2013-06-26 Mandar Gurav + + * source/common/vec/intrapred.inc: + primitves: 8 bit : PredIntraAng4x4 function table implementation + [f7c7c5e792dc] + +2013-06-25 Steve Borho + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: fix signed/unsigned comparison + [ecc9ce8993c2] + + * source/Lib/TLibEncoder/TEncTop.cpp: + TEncTOP: small perf tweak for --gops 1 + [f5605a0579d4] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h, + source/Lib/TLibEncoder/TEncTop.cpp: + TEncGOP: move access unit buffering to TEncGOP + [0ff90feebcbe] + + * source/Lib/TLibEncoder/TEncTop.h: + TEncTop: prune unused headers, this stuff has all been moved + downstream + [83fc7f016b60] + + * source/common/CMakeLists.txt, source/common/IntraPred.cpp, + source/common/intrapred.cpp: + cmake: lower-case the intrapred.cpp filename + [f8f794c5e611] + + * source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComMotionInfo.h, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/mv.h: + mv: remove getHor() and getVer() compatibility methods + [9607556126e3] + + * source/Lib/TLibCommon/TComDataCU.cpp, source/common/mv.h: + mv: remove set() compatibility method + [a0d858f0462e] + + * source/Lib/TLibEncoder/TEncPic.cpp: + TEncPic: use MAX_DOUBLE from CommonDefs.h + [3310b6859a26] + + * source/Lib/TLibCommon/TComDataCU.cpp, source/common/mv.h: + mv: remove setHor(), SetVer() compatibility methods + [1807e4e93e87] + + * source/common/CMakeLists.txt, source/common/mv.h, + source/encoder/CMakeLists.txt, source/encoder/mv.h: + mv: move mv.h from encoder/ to common/ + [309e04c35ea4] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComMotionInfo.cpp, + 
source/Lib/TLibCommon/TComMotionInfo.h, + source/Lib/TLibCommon/TComMv.h, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibCommon/TComRdCost.h, + source/Lib/TLibCommon/TComRdCostWeightPrediction.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + common: replace TComMV with x265::MV everywhere + [8753803f57e1] + + * source/Lib/TLibEncoder/TEncTop.cpp: + TEncTOP: simplify logic + [d570ee7646f0] + + * source/Lib/TLibEncoder/TEncTop.cpp: + TEncTOP: remove incorrect comment + [589dbbf841d1] + + * source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h, + source/Lib/TLibEncoder/TEncPic.cpp, + source/Lib/TLibEncoder/TEncPic.h, + source/Lib/TLibEncoder/TEncPreanalyzer.cpp, + source/Lib/TLibEncoder/TEncPreanalyzer.h: + TEncPic: make TEncPreanalyzer::xPreanalyze a method of TEncPic + [06de0a8f273d] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h, + source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h: + TEncGOP: move all picture prep work from TEncTop to TEncGOP + [74cfd68d431f] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h: + TEncGOP: do not use accessors for member variables + [60c1cc2e014a] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h: + TEncGOP: remove m_cpbRemovalDelay, which was never read + [9601ff4148b8] + + * source/encoder/encoder.cpp: + encoder: set progressive flag, since we only support progressive + sources + [a041ffd6c0ad] + + * source/encoder/encoder.cpp: + encoder: allow CRA mode when gopNumThreads is 1, add some comments + [31f7ebda9f97] + + * source/Lib/TLibEncoder/TEncTop.cpp, source/encoder/encoder.cpp: + TEncTOP: allow -i32 --gops 2, but detect and avoid end-of-stream + crashes + [180e91b6bbf4] + + * source/Lib/TLibCommon/TypeDef.h: + Backed out changeset: fdb2ffe6b29b + [05b082d276f4] + + * 
source/Lib/TLibCommon/TypeDef.h: + TypeDef: remove unused DISABLING_CLIP_FOR_BIPREDME + [fdb2ffe6b29b] + + * source/encoder/encoder.cpp: + encoder: repair open-gop again + [0cd6fd8d8c39] + + * source/encoder/encoder.cpp: + encoder: fix keyframe interval before InitializeGOP() + [0f06e7b6d298] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: fix the first I frame (temporary fix) + [744e04edb379] + + * source/encoder/encoder.cpp: + encoder: remove redundant assignments + [3df972c1a6f8] + + * source/encoder/encoder.cpp: + encoder: enforce keyframe as multiple of GOPSize + [57919ce774e1] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h: + TEncTOP: remove unused m_vRVM_RP vector + [5da286a92f5a] + +2013-06-25 praveentiwari + + * source/test/mbdstharness.cpp: + Fixed size issue in xDeQuant test code + [fe4b745a0332] + + * source/test/mbdstharness.cpp: + uncrustified mbdstharness.cpp file + [542f3ef09ca1] + +2013-06-25 sumalatha + + * source/Lib/TLibEncoder/TEncCu.h, source/encoder/compress.cpp: + Included merge mode for FMD = ON. Changed the + xComputeCostMerge2Nx2N() to calculate the SATD cost and decide the + mode based on that. 
+ [89d3a8de1f47] + +2013-06-25 Min Chen + + * source/common/vec/intrapred.inc: + intrapred: Enable primitive since VC9 haven't this bug + --- source/common/vec/intrapred.inc | 2 +- 1 files changed, 1 + insertions(+), 1 deletions(-) + [e335b5a9ddca] + +2013-06-25 Steve Borho + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncTop.cpp, source/encoder/encoder.cpp: + TEncGOP: temporary workarounds for --keyint 32; will finish tomorrow + [f3c41ba3423b] + + * source/Lib/TLibEncoder/TEncTop.cpp: + TEncGOP: ugly hack for our ugly --keyint 32 hack + + This allows the encoder to survive up till the last mini-GOP, still + debugging + [1fb8e850e893] + + * source/encoder/encoder.cpp: + encoder: switch to IDR decoder refresh mode, more GOP parallelism + friendly + [70a215f4e0fa] + +2013-06-24 Steve Borho + + * source/Lib/TLibEncoder/TEncTop.cpp: + TEncTOP: remove special cases for first keyframe interval + [ba72bbb6ec31] + + * source/encoder/encoder.cpp: + encoder: disable GOP parallelism if open GOP configured + [7dadd3f6b861] + + * source/encoder/encoder.cpp: + encoder: add a comment for default keyframe interval + [b5e283a6703e] + + * source/common/common.cpp, source/encoder/encoder.cpp, + source/x265.cpp: + encoder: change default keyframe interval from fixed value (16) to 1 + second + [7a2555036e8d] + + * source/common/common.cpp, source/encoder/encoder.cpp: + encoder: consolidate logic which validates feature combinations + [cfc53bb18e32] + + * source/Lib/TLibCommon/TypeDef.h, source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h: + TEncTOP: nuke xCalculateRVM() and RVM_VCEGAM10_M + [7d8bb458e541] + + * source/common/x86/README.txt: + asm: add a couple more notes to the README + [76f7fc3b8804] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h: + TEncTop: round-robin cycle through GOP encoders + [4cd8f659cee2] + + * source/common/IntraPred.cpp, source/common/dct.cpp, + 
source/common/ipfilter.cpp, source/common/pixel.cpp, + source/common/primitives.h, source/common/vec/blockcopy.inc, + source/common/vec/dct.inc, source/common/vec/ipfilter16.inc, + source/common/vec/ipfilter8.inc, source/common/vec/pixel.inc, + source/common/vec/pixel16.inc, source/common/vec/pixel8.inc, + source/common/x86/asm-primitives.cpp: + primitve: remove CDECL globally + [5bc43fd3c39b] + + * source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h, source/common/common.cpp, + source/encoder/encoder.cpp, source/x265.h, source/x265opts.h: + x265: introduce gop thread count parameter, currently mostly ignored + [1ccb2a5664e2] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncTop.cpp: + TEncGOP: encode frames in keyframe interval batches + + This makes progress reports even more infrequent + [f559317ff736] + + * source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h: + TEncTOP: 50% less hungarian, remove or correct wrong comments + [1e7f5c80ee9f] + +2013-06-24 Deepthi Devaki + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/Lib/TLibCommon/TComWeightPrediction.h, + source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TComYuv.h, + source/Lib/TLibEncoder/TEncSearch.cpp: + Added support for bipred, but 8bpp, 16bpp mismatch + [4d95584be40d] + +2013-06-21 Deepthi Devaki + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h: + TComPrediction: fixes for biprediction + + Restructured xPredInterBi. 
Added xPredInterluma/chromablk/Uni + functions with TShortYuv argument for bipred + [254a52d53a2f] + +2013-06-24 Steve Borho + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: nits + [a240b7530a29] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h: + TEncGOP: stub in a processKeyframeInterval() method + + This is the next step to GOP parallelism. 1) Process an entire + keyframe interval of frames at one go; ensure the picture lists can + deal with this. 2) Add multiple TEncGOP encoders and round-robin + them to encode GOPs. 3) Finally add threading so they work in + unison. + [38efcc5dcbf5] + + * source/common/vec/intrapred.inc: + intra: work around VC10 and VC11 Win32 compiler bugs + [a1de9e7f8ee7] + + * source/common/ipfilter.cpp, source/common/vec/intrapred.inc, + source/common/vec/pixel.inc: + gcc: fix warnings reported by GCC + [0e83bb44b8e0] + +2013-06-23 Steve Borho + + * source/common/common.cpp, source/encoder/encoder.cpp, + source/x265.cpp, source/x265.h, source/x265opts.h: + x265: remove ui/i hungarian prefixes from x265_param_t, standardize + bool flags + [7f6b0ae32985] + +2013-06-24 Deepthi + + * source/encoder/compress.cpp: + Merge + [d45053ca8e2b] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + Merge + [a609e11dc6a5] + +2013-06-22 Deepthi + + * source/encoder/compress.cpp: + Adding in entropy measurements to nordo after encode residual. 
+ [628313e72a1d] + + * source/encoder/compress.cpp: + Removing merge modes for now; need more debugging + [9e95d8d524a9] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + Removing an erroneous cost calculation + [4239f6bf06d7] + +2013-06-23 Steve Borho + + * source/Lib/TLibEncoder/TEncGOP.cpp, source/encoder/encoder.cpp: + encoder: fix open-gop behavior + [30b142f02135] + + * source/common/common.cpp: + common: fixups for keyframe interval + [cdb337e59eb0] + +2013-06-22 Steve Borho + + * source/Lib/TLibEncoder/TEncCu.cpp: + TEncCU: remove more dead code + [e76879d77757] + + * source/Lib/TLibEncoder/TEncCu.cpp: + TEncCU: nits + [1aec16cc83ee] + + * source/Lib/TLibEncoder/TEncCu.cpp, source/encoder/compress.cpp: + TEncCU: remove dead code paths + [9df420362f0b] + +2013-06-23 Steve Borho + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: fix 16bpp build + [8e8edbaa351b] + + * source/common/vec/pixel.inc: + pixel: fix typo in build define + [f20bccea941b] + +2013-06-22 Steve Borho + + * source/common/vec/pixel.inc, source/common/vec/pixel8.inc, + source/test/pixelharness.cpp: + pixel: simplify and optimize residual and recon primitives + [0b1625fd6625] + + * source/VectorClass/vectori128.h: + vector: nit + [67b7bf7ec06c] + +2013-06-22 Min Chen + + * source/common/vec/ipfilter8.inc: + interp: [review] PACKUSWB included clip + --- source/common/vec/ipfilter8.inc | 6 ------ 1 files changed, 0 + insertions(+), 6 deletions(-) + [f1a23b5dfaec] + +2013-06-22 Steve Borho + + * source/common/vec/ipfilter-avx.cpp, source/common/vec/ipfilter- + avx2.cpp, source/common/vec/ipfilter-sse2.cpp, source/common/vec + /ipfilter-sse3.cpp, source/common/vec/ipfilter-sse41.cpp, + source/common/vec/ipfilter-sse42.cpp, source/common/vec/ipfilter- + ssse3.cpp, source/common/vec/ipfilter.inc: + ipfilter: move logic from cpp files to ipfilter.inc + [a58b34f95d5a] + +2013-06-22 Min Chen + + * source/common/vec/pixel.inc: + pixel: intrinsic Tranpose 16x16 + --- source/common/vec/pixel.inc | 
83 + ++++++++++++++++++++++++++++++++++++++++++- 1 files changed, 82 + insertions(+), 1 deletions(-) + [0d005f8ba719] + + * source/common/vec/pixel.inc: + pixel: intrinsic Tranpose 8x8 + --- source/common/vec/pixel.inc | 45 + +++++++++++++++++++++++++++++++++++++++++- 1 files changed, 43 + insertions(+), 2 deletions(-) + [1d7ab6944679] + +2013-06-22 Steve Borho + + * source/common/primitives.h, source/test/pixelharness.cpp: + primitives: rename NUM_BLOCKS to NUM_SQUARE_BLOCKS, use for intra + and transpose + [1ac63a9e6dea] + +2013-06-22 Min Chen + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/common/pixel.cpp, + source/common/primitives.h, source/common/vec/pixel.inc: + pixel: intrinsic Tranpose 4x4 + --- source/Lib/TLibEncoder/TEncSearch.cpp | 9 +------ + source/common/pixel.cpp | 19 +++++++++++++++ + source/common/primitives.h | 2 + source/common/vec/pixel.inc | 41 + +++++++++++++++++++++++++++++++++ 4 files changed, 63 insertions(+), + 8 deletions(-) + [9e07e835929a] + +2013-06-22 Steve Borho + + * source/common/vec/intrapred.inc: + intrapred: wrap pragma warning with #if _MSC_VER + [f262575efbe4] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: fix cost adjustments following HM ME + [ae98c873c26e] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: simplify full search + [23c6b755811e] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + ppa: move motion search event to a higher level + [0fede11b81e5] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: catch --keyint -1 sanely + [fd14b3dd6986] + + * source/Lib/TLibEncoder/TEncSearch.h: + TEncSearch: nit + [a4cbe5f4cda3] + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/common/common.cpp, + source/x265.h, source/x265opts.h: + x265: expose HM's full search as a CLI option --me 5 + [9be42d1878ee] + + * source/Lib/TLibCommon/CommonDef.h, + source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: remove more cruft + [1035b13af978] + + * source/Lib/TLibCommon/TComRdCost.h, + 
source/Lib/TLibCommon/TComRdCostWeightPrediction.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp: + TComRdCost: remove unused uiComp + [0e60557dfa5a] + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + TEncSearch: remove setDistParamComp + [e31e6df77aba] + +2013-06-21 Steve Borho + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: cleanups + [895b3f846a08] + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + TEncSearch: move predictor fetching earlier in inter pred + + This allows us to merge xTZSearch into xPatternSearchFast + [5b79dc7d982d] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: hoist our optimized ME up a function layer + + Avoid a lot of useless overhead, simplify how ME is called + [e29623267bbe] + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + TEncSearch: simplify xSetSearchRange + [cd45b4dd8e73] + + * source/encoder/motion.cpp, source/encoder/mv.h: + MV: do not allow implicit creation from int + [2fbda9cade34] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: nits + [609f5607d1a5] + + * source/common/common.cpp: + common: do not allow weighted prediction with optimized ME + + This prevents having to check for this deep within the encoder + [a66478706e61] + +2013-06-21 Min Chen + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/common/vec/intrapred.inc, source/test/intrapredharness.cpp: + [x265] [PATCH] inrapred: generate all of 33 IntraAngle-8x8 modes + once + --- source/Lib/TLibEncoder/TEncSearch.cpp | 2 +- + source/common/vec/intrapred.inc | 1017 + ++++++++++++++++++++++++++++++++- source/test/intrapredharness.cpp | + 3 +- 3 files changed, 1019 insertions(+), 3 deletions(-) + [851fe263dabf] + +2013-06-21 Steve Borho + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: remove last Double uses from TEncSearch + [de1044dfbaf1] + + * source/Lib/TLibCommon/TComRdCost.h: + TComRdCost: remove unused getLambda() method + 
[3fe92111bc85] + +2013-06-20 Min Chen + + * source/common/IntraPred.cpp, source/common/vec/intrapred.inc, + source/test/intrapredharness.cpp: + intrapred: 1.fix C model when size more than 8, 2.fix buffer + overflow in testbench + --- source/common/IntraPred.cpp | 40 + ++++++++++++++++++------------------- + source/common/vec/intrapred.inc | 9 ++----- + source/test/intrapredharness.cpp | 9 +++---- 3 files changed, 26 + insertions(+), 32 deletions(-) + [14cc0e972566] + +2013-06-21 Deepthi Devaki + + * source/common/ipfilter.cpp, source/common/primitives.h: + Filtervertical-short-short and pel-short to support bipred + [cfea4bf27fa8] + +2013-06-21 Deepthi + + * source/encoder/compress.cpp: + Cleanup: compress.cpp + [0abefa6bc340] + + * source/encoder/compress.cpp: + Removing incorrect null checks + [90965b5b4da8] + + * source/encoder/compress.cpp: + Branch decision taken only if CU does not contain boundary. Always + true otherwise. + [bec0647b8d3f] + + * Merge + [20caf72adaac] + + * source/encoder/compress.cpp: + Replacing SATD computations with blkcpy and then satd + [6c2368a2a98a] + +2013-06-21 Steve Borho + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: nits + [3a080c7ba1d0] + + * source/common/CMakeLists.txt, source/encoder/CMakeLists.txt: + cmake: separate HM headers from the cpp files for convenience + [fa1a70530abd] + + * source/common/pixel.cpp: + pixel: refix 16bpp builds + [ab01a775ab83] + + * source/x265.cpp: + x265: fix bitrate progress reports + [bced30963f15] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: use member SPS and PPS structs, stop refetching slice + [ae82def2da71] + +2013-06-20 Steve Borho + + * source/common/x86/asm-primitives.cpp: + asm: do not override new SSE2 ASM functions with slower templated + XOP calls + [79231017d0da] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: nits + [575f21816974] + + * source/CMakeLists.txt: + cmake: assign value to X86_64 when defined + [5de7df75a0fe] + + * source/Lib/TLibEncoder/TEncGOP.cpp: 
+ TEncGOP: move a couple of methods + [fc339086c88d] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: nits + [5de89ceadeeb] + + * source/encoder/bitcost.cpp, source/encoder/bitcost.h: + bitcost: use more accurate fractional bit cost accounting from x264 + [07015bbe306b] + + * source/encoder/motion.cpp, source/encoder/motion.h: + motion: drop SUBSAMPLE_SAD [CHANGES OUTPUTS] + + The complexity outweighed the benefits + [d5e5d3812eaa] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: fix comment + [cadd2ff53d3c] + + * source/encoder/bitcost.cpp: + bitcost: fix clamp of bitcost to 16bits + + The typecast to uint16_t was just wrong + [6557f5e10c86] + + * source/common/reference.cpp: + reference: cleanup worker function + [2c5d35632fea] + + * source/common/vec/ipfilter-avx.cpp, source/common/vec/ipfilter- + avx2.cpp, source/common/vec/ipfilter-sse2.cpp, source/common/vec + /ipfilter-sse3.cpp, source/common/vec/ipfilter-sse41.cpp, + source/common/vec/ipfilter-sse42.cpp, source/common/vec/ipfilter- + ssse3.cpp, source/common/vec/ipfilter8.inc: + ipfilter: use memcpy to extend top/bottom rows + [f27f99b01e66] + + * source/common/vec/ipfilter8.inc: + ipfilter: fix indention (all white-space changes) + [9ab99c57b42a] + + * source/common/vec/ipfilter8.inc: + ipfilter: fix spacing + [e6ef40975e4a] + + * source/common/pixel.cpp: + pixel: fix sa8d_16x16 C primitive + [1cec6b092506] + +2013-06-20 Deepthi Devaki + + * source/common/ipfilter.cpp, source/common/primitives.h, + source/common/reference.cpp, source/common/vec/ipfilter.inc, + source/common/vec/ipfilter8.inc, source/test/ipfilterharness.cpp: + Vertical filter with Border extend + [fea6feb1152d] + +2013-06-20 Min Chen + + * source/test/intrapredharness.cpp: + intrapred: adjust debug info format + --- source/test/intrapredharness.cpp | 2 +- 1 files changed, 1 + insertions(+), 1 deletions(-) + [367a3f300210] + + * source/test/testbench.cpp: + testbench: fix logic on option --cpuid + [4054533a85e9] + +2013-06-20 Steve 
Borho + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: ensure pic list contains at least two TComPic + [cadcbb46ffd9] + +2013-06-20 Min Chen + + * source/common/vec/dct.inc: + dct: disable optimize version dct in 16bpp, all of intermediate + value must be 32-bits + --- source/common/vec/dct.inc | 5 +++++ 1 files changed, 5 + insertions(+), 0 deletions(-) + [7135a13cf3a5] + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/common/IntraPred.cpp, + source/common/primitives.h, source/common/vec/intrapred.inc, + source/test/intrapredharness.cpp, source/test/intrapredharness.h: + intrapred: implement IntraAngle mode to all size + --- source/Lib/TLibEncoder/TEncSearch.cpp | 5 ++- + source/common/IntraPred.cpp | 22 ++++++++++------- + source/common/primitives.h | 2 +- source/common/vec/intrapred.inc | + 39 +++++++++++++++++------------- source/test/intrapredharness.cpp | + 41 +++++++++++++++++++------------- source/test/intrapredharness.h | + 2 +- 6 files changed, 64 insertions(+), 47 deletions(-) + [12ef1f072327] + +2013-06-20 Mandar Gurav + + * source/common/pixel.cpp, source/test/testbench.cpp: + primitives: some macro implementation to improve readabilty of code. 
+ [f97119251704] + +2013-06-20 Steve Borho + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel.h: + asm: use more generic 64bit build define + [6df10ad11f4a] + + * source/common/x86/asm-primitives.cpp: + asm: prune trailing white-space + [094b99a737b6] + +2013-06-20 Mandar Gurav + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm, + source/common/x86/pixel.h: + primitives: asm: satd: fix for 32 bit issue + [18b0bc26f3a0] + +2013-06-20 Deepthi + + * Merge + [17ddca539510] + + * source/common/common.cpp: + Print rdo if enabled, no-rdo if disabled + [8454a96806ff] + +2013-06-20 Steve Borho + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: simplify POC search + [e926b2d50cee] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: remove unused variable + [bbf02b319ed7] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h, + source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h: + TEncGOP: move all picture management to TEncGOP + [276ab8776f48] + + * source/x265.cpp: + x265: do not ask for recon pictures if no output file was requested + [48d22ee16225] + + * source/encoder/encoder.cpp: + encoder: consider output NALs even if iNumEncoded is zero + [636674cc5f50] + +2013-06-19 Steve Borho + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h, + source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h: + TEncGOP: move frame list from TEncTOP to TEncGOP + [abfa8a5a1ebc] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h, + source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h, source/encoder/wavefront.cpp, + source/encoder/wavefront.h: + TEncGOP: move SPS, PPS to TEncGOP + + These structures are modified by compressGOP and thus cannot be + encoder singletons. 
The init functions are kept in TEncTOP since + they are tightly coupled with TEncCfg and TEncTOP + [43a210488e02] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/Lib/TLibEncoder/TEncCavlc.cpp, + source/Lib/TLibEncoder/TEncTop.h: + make a pile of get*() methods const + [79d8bdd3612f] + + * source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h, + source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h: + TEncGOP: move getReferencePictureSetIdxForSOP and + selectReferencePictureSet + [998ee354be96] + +2013-06-19 Min Chen + + * source/common/vec/intrapred.inc: + intrapred: correct HIGH_BIT_DEPTH fault + --- source/common/vec/intrapred.inc | 2 +- 1 file changed, 1 + insertion(+), 1 deletion(-) + [bfde7f16c723] + +2013-06-19 Steve Borho + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h, + source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h, source/encoder/encoder.cpp, + source/encoder/encoder.h: + TEncGOP: directly return references to encoder's recon frames, do + not copy + + This is all prep-work for GOP threading, but it doesn't hurt to + remove a frame copy from the main loop. Now if the user wants no + recon frames there is no overhead associated in maintaining them. 
+ [519b62011b23] + + * source/encoder/encoder.cpp, source/encoder/encoder.h, + source/x265.cpp, source/x265.h: + x265: change public API slightly to allow batch reconstructed frame + delivery + [45b31cb09114] + + * source/Lib/TLibCommon/TComSlice.cpp: + TComSlice: nits + [63afd266e232] + + * source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h: + TEncTop: return TComPic from xGetNewPicBuffer, don't use reference + args + [770b306f1163] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h: + Backed out changeset: 8af5a9eb198c + + This backs out the lambda change from Double to UInt64, but leaves + the typo and other white-space fixes in place. This change ended up + being a net performance loser. The double->UInt64 conversions in the + middle of the tight loop were slower than the savings from using + integer math. + + We need an effective way to perform the cost scaling using integer + math in order to use integers throughout this function. + [eba9a84e9054] + + * source/Lib/TLibCommon/TComTrQuant.cpp: + TComTrQuant: reduce number of double<->int conversions + [092152f58712] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h: + TComTrQuant: switch lambda from Double to UInt64 [CHANGES OUTPUTS] + [8af5a9eb198c] + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/motion.h: + motion: add bufSA8D() method, so sa8d and satd are both available + [76449efed663] + +2013-06-19 sumalatha + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/encoder/compress.cpp: + Included the merge mode for selecting the mode for xCompressInterCU + [94d7d31fc312] + +2013-06-20 ShinYee Chung + + * source/common/vec/intrapred.inc: + intrapred: Fix unused variables. 
+ + error: statement has no effect [-Werror=unused-value] + [bb78ee5841f2] + +2013-06-19 Steve Borho + + * source/common/common.cpp: + common: move RDO settings together + [8039746d5cb0] + + * source/common/common.cpp: + common: output logs to stderr consistently + [d2a568b0c078] + +2013-06-20 ShinYee Chung + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: Fix compile error due to char array subscript. + + Sample compile errors in Linux64/GCC4.8.1: + + $HEVC/source/Lib/TLibEncoder/TEncSearch.cpp:2134:75: error: array + subscript has type ‘char’ [-Werror=char-subscripts] x265::pixelcmp + sa8d = x265::primitives.sa8d[g_aucConvertToBit[uiWidth]]; ^ + $HEVC/source/Lib/TLibEncoder/TEncSearch.cpp: In member function + ‘Void TEncSearch::estIntraPredQT(TComDataCU*, TComYuv*, TComYuv*, + TShortYUV*, TComYuv*, UInt&, Bool)’: + $HEVC/source/Lib/TLibEncoder/TEncSearch.cpp:2206:79: error: array + subscript has type ‘char’ [-Werror=char-subscripts] x265::pixelcmp + sa8d = x265::primitives.sa8d[g_aucConvertToBit[uiWidth]]; ^ + [dd5dd47e17f4] + + * source/common/IntraPred.cpp: + intrapred: Fix unused global variable. + + Compile error in Linux64/GCC4.8.1: + + $HEVC/source/common/IntraPred.cpp:252:15: error: + ‘{anonymous}::g_aucIntraFilterType’ defined but not used [-Werror + =unused-variable] unsigned char g_aucIntraFilterType[][35] = { ^ + [ac3e7ab6cdc0] + +2013-06-19 ShinYee Chung + + * source/Lib/TLibCommon/TComPrediction.cpp: + TComPrediction: Fix reordered initialization list. 
+ + Sample compile errors in Linux64/GCC4.8.1: + + $HEVC/source/Lib/TLibCommon/TComPrediction.h: In constructor + ‘TComPrediction::TComPrediction()’: + $HEVC/source/Lib/TLibCommon/TComPrediction.h:80:12: error: + ‘TComPrediction::m_iLumaRecStride’ will be initialized after + [-Werror=reorder] Int m_iLumaRecStride; ///< stride of + #m_pLumaRecBuffer array ^ + $HEVC/source/Lib/TLibCommon/TComPrediction.h:79:12: error: ‘Pel* + TComPrediction::m_pLumaRecBuffer’ [-Werror=reorder] Pel* + m_pLumaRecBuffer; ///< array for downsampled reconstructed luma + sample ^ $HEVC/source/Lib/TLibCommon/TComPrediction.cpp:60:1: error: + when initialized here [-Werror=reorder] + TComPrediction::TComPrediction() ^ In file included from + $HEVC/source/Lib/TLibCommon/TComPrediction.cpp:39:0: + $HEVC/source/Lib/TLibCommon/TComPrediction.h:79:12: error: + ‘TComPrediction::m_pLumaRecBuffer’ will be initialized after + [-Werror=reorder] Pel* m_pLumaRecBuffer; ///< array for downsampled + reconstructed luma sample ^ + $HEVC/source/Lib/TLibCommon/TComPrediction.h:62:15: error: ‘Pel* + TComPrediction::m_piPredBuf’ [-Werror=reorder] Pel* m_piPredBuf; ^ + $HEVC/source/Lib/TLibCommon/TComPrediction.cpp:60:1: error: when + initialized here [-Werror=reorder] TComPrediction::TComPrediction() + ^ + [f8d043a4f6b4] + + * source/common/IntraPred.cpp, source/common/vec/intrapred.inc: + intrapred: Fix unused variables. 
+ + Sample compile errors in Linux64/GCC 4.8.1: + + $HEVC/source/common/vec/intrapred.inc:4677:13: error: statement has + no effect [-Werror=unused-value] (pLeft1); ^ + $HEVC/source/common/vec/intrapred.inc:4678:14: error: statement has + no effect [-Werror=unused-value] (pAbove1); ^ + [e373dbe02018] + +2013-06-19 Min Chen + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/common/vec/intrapred.inc: + inrapred: generate all of 33 IntraAngle-4x4 modes once + [1353e6681c7d] + +2013-06-19 Mandar Gurav + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm, + source/common/x86/pixel.h: + primitives: asm: update: implementation of satd(sse2) + [762b53ecba5a] + +2013-06-18 Mandar Gurav + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm, + source/common/x86/pixel.h: + primitives: asm: implementation of satd_16x12(sse2) + [594020d9d19b] + +2013-06-19 Steve Borho + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/common/pixel.cpp, + source/common/vec/intrapred.inc, source/encoder/motion.cpp: + 16bpp fixes + [67342f25244e] + +2013-06-18 Steve Borho + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h: + TComTrQuant: remove wrong comments, cleanup funcdefs, move inlines + to header + [6ecc4a1ca26c] + +2013-06-19 Deepthi + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h: + Cleanuo: Removing swapCU + [ab93e542c0a4] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h: + Cleanup: Removing copyCU + [c46b9a8ead56] + + * source/CMakeLists.txt: + Removing FMD from cmake options. + [332b3145d9be] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/compress.cpp: + Replacing all FMD macros with cfg file option + [77b91dd88829] + + * source/encoder/encoder.cpp: + Set cfg flag to enableRDO CL option. 
+ [e0528e455855] + + * source/Lib/TLibEncoder/TEncCfg.h: + Adding rdo to cfg structure + [49737829b64f] + + * source/common/common.cpp, source/x265.h, source/x265opts.h: + Add rdo/no-rdo to command line interface + [00a331ad69a4] + +2013-06-18 Steve Borho + + * source/Lib/TLibCommon/TComRdCost.h: + TComRdCost: move getCost closer to calcRdSADCost, to show how they + are related + [c79263efd38d] + + * source/Lib/TLibCommon/TComRdCost.cpp, + source/Lib/TLibCommon/TComRdCost.h: + TComRdCost: remove unused calcHAD() method + [401167498626] + + * source/Lib/TLibCommon/TComRdCost.cpp: + TComRdCost: fix a typo in a comment + [ee06dc714841] + + * source/Lib/TLibCommon/TComRdCost.cpp, + source/Lib/TLibCommon/TComRdCost.h: + TComRdCost: remove unused overload of setDistParam + [08310bc6f97d] + + * source/Lib/TLibCommon/TComRdCost.cpp, + source/Lib/TLibCommon/TComRdCost.h: + TComRdCost: remove unused overload of setDistParam + [1ba3c30c42c2] + + * source/encoder/motion.cpp, source/encoder/motion.h: + motion: move bufSATD back into motion.h and declare both inline + [f1d34f7a9d9b] + + * source/common/pixel.cpp, source/common/primitives.h, + source/common/x86/asm-primitives.cpp, source/encoder/motion.cpp, + source/encoder/motion.h, source/test/pixelharness.cpp: + pixel: add sa8d_inter primitives to match TComRdCost::calcHAD() + + Now we can pick between sa8d and satd at runtime via a pointer + selection + [b4f99ece0aeb] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: use motion.bufSAD instead of m_pcRdCost DistParam + [d1604fa4e39e] + + * source/encoder/motion.cpp, source/encoder/motion.h: + motion: make bufSATD function exactly like TComRdCost::calcHAD() + [dec21e1c6ee9] + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/common/pixel.cpp, + source/common/primitives.h, source/common/x86/asm-primitives.cpp, + source/test/pixelharness.cpp: + primitives: collect sa8d primitives into blocksize array + [a21a11f9e1b6] + + * source/common/primitives.h: + primitives: fix 
two wrong comments + [4915f80d3cdb] + + * source/encoder/motion.h: + motion: add bufSATD method + [760aa86a54ba] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: remove extra indent level; nit + [b0d5c4f14d3b] + + * source/common/primitives.cpp: + primitives: add rationale for disabling EMMS when ASM is disabled + [21b34444026c] + + * source/common/vec/pixel.inc: + pixel: cleanup Setup_Vec_PixelPrimitives, remove comments that add + no meaning + [f1437f97e146] + + * source/common/pixel.cpp: + pixel: round up to nearest half before shift, fixes --cpuid 1 + encodes + [4118bc3a68aa] + + * source/encoder/bitcost.cpp: + bitcost: use x265_emms() before calculating a row of MV costs, to be + safe + [5c1de9ce010c] + + * source/common/x86/asm-primitives.cpp: + asm-primitives: fix eoln + [c56ef1521a07] + + * source/common/primitives.h, source/common/x86/asm-primitives.cpp: + Backed out changeset: d9f7525c4adf + [c6370b56b1d9] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: delay x265_emms() to RDO stage of intra analysis + [f0b745956762] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: use m_pcRdCost->calcRdSADCost() to avoid doubles in + intra analysis + [458cdba67699] + +2013-06-18 chenm003 + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/IntraPred.cpp, + source/common/primitives.h, source/common/vec/intrapred.inc, + source/test/intrapredharness.cpp, source/test/intrapredharness.h: + intrapred: framework for generate 33 Angle modes once + --- source/Lib/TLibCommon/TComPrediction.cpp | 5 +- + source/Lib/TLibCommon/TComPrediction.h | 1 + + source/Lib/TLibEncoder/TEncSearch.cpp | 65 +- + source/common/IntraPred.cpp | 569 +- source/common/primitives.h | 2 + + source/common/vec/intrapred.inc | 9409 + +++++++++++++++--------------- source/test/intrapredharness.cpp | 72 + + source/test/intrapredharness.h | 4 + 8 files changed, 5179 + insertions(+), 4948 
deletions(-) + [e582599ae931] + + * source/common/primitives.h, source/test/intrapredharness.cpp, + source/test/intrapredharness.h: + intrapred: fix build error after simplify bLeft and bAbove + --- source/common/primitives.h | 8 ++++---- + source/test/intrapredharness.cpp | 26 +++++++++----------------- + source/test/intrapredharness.h | 4 ++-- 3 files changed, 15 + insertions(+), 23 deletions(-) + [5fbfc07c8898] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + intrapred: Split DC and Planar from Loop + --- source/Lib/TLibEncoder/TEncSearch.cpp | 17 ++++++++++++++++- 1 + file changed, 16 insertions(+), 1 deletion(-) + [20e256246d19] + + * source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComPattern.h, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/IntraPred.cpp, + source/common/primitives.h, source/common/vec/intrapred.inc: + cleanup: remove bLeft and bAbove since HEVC have a reference sample + pad, they are always True + --- source/Lib/TLibCommon/TComPattern.cpp | 14 ++-- + source/Lib/TLibCommon/TComPattern.h | 10 +-- + source/Lib/TLibCommon/TComPrediction.cpp | 33 +++++---- + source/Lib/TLibCommon/TComPrediction.h | 4 +- + source/Lib/TLibEncoder/TEncSearch.cpp | 29 +++----- + source/common/IntraPred.cpp | 43 +++--------- + source/common/primitives.h | 4 +- source/common/vec/intrapred.inc | + 115 +++++++++++++++---------------- 8 files changed, 101 + insertions(+), 151 deletions(-) + [b639e5fff55e] + +2013-06-18 chenm003 + + * source/Lib/TLibEncoder/TEncSearch.cpp: + intrapred: Split loop into Loop_Prediction and Loop_Decide + --- source/Lib/TLibEncoder/TEncSearch.cpp | 12 ++++++++---- 1 file + changed, 8 insertions(+), 4 deletions(-) + [758ac3d8616c] + +2013-06-18 chenm003 + + * source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.h, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp: + cleanup: unused code 
m_bUseNIF + --- source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp | 36 + +------------------ source/Lib/TLibCommon/TComSampleAdaptiveOffset.h + | 1 - .../Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp | 41 + +--------------------- 3 files changed, 2 insertions(+), 76 + deletions(-) + [56d07fe3a257] + +2013-06-18 Steve Borho + + * source/common/reference.cpp: + Merge + [469e285c62c3] + +2013-06-18 Deepthi Devaki + + * source/common/reference.cpp, source/common/reference.h: + Merge + [0e1117a1e892] + + * source/common/vec/ipfilter8.inc: + Intrinsic implementation for SSE4 and higher for vertical filter + [ef85cd7f938d] + +2013-06-17 Steve Borho + + * source/common/reference.cpp, source/common/reference.h: + reference: use new multiplane primitives (changes outputs, likely + buggy) + [97dbc3346524] + +2013-06-18 Deepthi Devaki + + * Merge + [de3b92c772ad] + + * source/common/vec/ipfilter8.inc: + Partial intrinsic for vertical filter + [844308961a0b] + +2013-06-18 https://mandarmcw + + * source/common/reference.cpp: + Fixed Build Errors(Type cast) for 16 bit + [41855cf217d0] + +2013-06-18 mahesh pittala + + * Merged multicoreware/xhevc into default + [beab4f74a085] + + * Merged multicoreware/xhevc into default + [f4734dec9dc8] + +2013-06-17 mahesh pittala + + * Merged multicoreware/xhevc into default + [2cd09f020148] + + * Merged multicoreware/xhevc into default + [819fd14f888e] + + * source/Lib/TLibEncoder/TEncAnalyze.cpp: + Merged multicoreware/xhevc into default + [e712594eda8f] + +2013-06-15 mahesh pittala + + * Merged multicoreware/xhevc into default + [dc66b43ad68e] + + * Merged multicoreware/xhevc into default + [31558ea714e2] + +2013-06-14 mahesh pittala + + * Merged multicoreware/xhevc into default + [491779f0da6e] + +2013-06-18 Steve Borho + + * source/common/reference.cpp: + reference: extend Y=0 planes in worker function + [7e76d1574234] + + * source/common/reference.cpp, source/common/reference.h: + reference: use new APIs + [948587f9c9fa] + +2013-06-17 
Steve Borho + + * source/Lib/TLibCommon/TComSlice.cpp: + TComSlice: fix debug builds (asserts) + [b252b6a34869] + + * source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncTop.cpp, source/encoder/encoder.cpp: + TEncCfg: remove deblocking filter metric code + [97c117eee937] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h: + TEncGOP: remove uiNumSlices + [fcaae4248d5d] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp: + TComSlice: remove m_sliceIdx and access methods + [04505d780210] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h: + TEncGOP: remove unused SEI var, nit cleanups + [739c30209346] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h: + TEncGOP: SEIPresent variables can now be local variables + [5d9dc98c01a7] + + * source/Lib/TLibEncoder/TEncGOP.h: + TEncGOP: remove obsolete PROCESSING_STATE enum + [e817dece7a51] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h: + TEncGOP: inline xResetNonNestedSEIPresentFlags() and + xResetNestedSEIPresentFlags() + [0a034b2714ba] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h: + TEncGOP: inline xCreateLeadingSEIMessages + [42c156937eef] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h: + TEncGOP: m_iGopSize did not need to be a member variable + [ac6bf2ad37dd] + + * source/encoder/encoder.cpp: + encoder: declare large integer constant to be unsigned to prevent + warnings + [b42f5f907a8a] + + * source/Lib/TLibEncoder/TEncGOP.h: + TEncGOP: remove unnecessary includes of and + [bc3fe350f76a] + + * source/Lib/TLibEncoder/TEncGOP.h: + TEncGOP: remove uneecessary include of + [b34fc8a28026] + + * 
source/Lib/TLibEncoder/TEncPic.h: + TEncPic: nits + [5cec340da54e] + + * source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/Lib/TLibEncoder/TEncPic.cpp, + source/Lib/TLibEncoder/TEncPic.h, + source/Lib/TLibEncoder/TEncTop.cpp: + TComPic: remove unused m_numReorderPics + [dcb9836738d0] + + * source/Lib/TLibCommon/TComPic.h: + TComPic: move pointers to start of class definition + [d9e2d694bd64] + + * source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h: + TComPic: remove unused m_SEIs + + This may get added again later with a more C friendly interface + [d75b868d9d33] + + * source/Lib/TLibCommon/TComSlice.h: + TComSlice: white-space cleanups + [bc1ca09c3569] + + * source/Lib/TLibCommon/TComBitStream.cpp, + source/Lib/TLibCommon/TComBitStream.h: + TComBitstream: remove decoder class TComInputBitstream + [5ccdcfc7f906] + + * source/Lib/TLibCommon/TComPic.h: + TComPic: remove obsolete include + [272fd2b3626f] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h: + TComSlice: fix misspelled member variable + [70efa70c09c6] + +2013-06-16 ShinYee Chung + + * source/encoder/encoder.cpp: + encoder: Fix compile warnings due to unhandled enumeration values in + a switch statement. 
+ + $HEVC/source/encoder/encoder.cpp: In member function ‘void + x265::Encoder::determineLevelAndProfile(x265_param_t*)’: + $HEVC/source/encoder/encoder.cpp:119:12: error: enumeration value + ‘NONE’ not handled in switch [-Werror=switch] switch (m_level) ^ + $HEVC/source/encoder/encoder.cpp:119:12: error: enumeration value + ‘LEVEL1’ not handled in switch [-Werror=switch] + $HEVC/source/encoder/encoder.cpp:119:12: error: enumeration value + ‘LEVEL2’ not handled in switch [-Werror=switch] + $HEVC/source/encoder/encoder.cpp:119:12: error: enumeration value + ‘LEVEL2_1’ not handled in switch [-Werror=switch] + $HEVC/source/encoder/encoder.cpp:119:12: error: enumeration value + ‘LEVEL3’ not handled in switch [-Werror=switch] + $HEVC/source/encoder/encoder.cpp:119:12: error: enumeration value + ‘LEVEL3_1’ not handled in switch [-Werror=switch] cc1plus: all + warnings being treated as errors + [bf8657373cc0] + +2013-06-17 ShinYee Chung + + * source/Lib/TLibEncoder/TEncGOP.cpp: + gop: Fix compile warning due to shadow variables. 
+ + error: shadowed declaration is here [-Werror=shadow] + [1afdab036cdf] + +2013-06-17 Steve Borho + + * source/Lib/TLibCommon/TComSlice.h: + TComSlice: remove unreferenced checkColRefIdx + [462ae5b5c92b] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/Lib/TLibCommon/TComPicSym.cpp, + source/Lib/TLibCommon/TComPicSym.h, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/WeightPredAnalysis.cpp, + source/encoder/compress.cpp: + TComDataCU: remove TComSlice array of pointers; only one slice + [4a443108dea9] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/compress.cpp: + TComPic: remove m_uiCurrSliceIdx (there can be only one) + [f61d6a311ac7] + + * source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h: + TComPic: give the orig and recon pointers real names + [4da66382061e] + + * source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h: + TComPic: rename m_apcPicSym to m_pcPicSym, there is no array + [254cfb7b02c2] + + * source/Lib/TLibCommon/TComPic.cpp: + TComPic: further simplify boundary detection + [8bc870e559e5] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h: + TComPic, TComDataCU: remove m_pSliceSUMap + [5c7a900e9a7a] + + * source/Lib/TLibCommon/TComPic.cpp: + TComPic: pre-allocate vSliceCUDataLink + [82e9374afa55] + + * 
source/Lib/TLibCommon/TComPic.h: + TComPic: nits + [de4b9e89db28] + + * source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h: + TComPic: m_vSliceCUDataLink did not need to be a member variable + [ab9d5f1754f1] + + * source/test/mbdstharness.cpp: + mbdstharness: fix compile with DEBUG + [d54f561a75ee] + + * source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h: + TComPic: m_sliceGranularityForNDBFilter did not need to be a member + variable + [03754edfb91b] + + * source/Lib/TLibCommon/TComPic.cpp, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp: + TComPic: nits + [e07e08349d30] + + * source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp: + TComPic: remove temp YUV image for tile/slice boundary filtering + [5d800bd145a9] + + * source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h: + TComPic: remove more multi-slice cruft + [381a0aeddd3c] + + * source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.h: + TEncSlice: do not pass TComPic as pointer reference to encodeSlice + [896ca3355963] + + * source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.h: + TEncSlice: TComPic argument to compressSlice is not a reference + [4cc39023e365] + + * source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.h: + TEncSlice: nits + [15e62b2ff599] + + * source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.h: + TEncSlice: remove unused m_pcPicYuvPred and m_pcPicYuvResi + [d89a075a2779] + + * source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/Lib/TLibEncoder/TEncSlice.cpp: + TComPic: remove unused m_pcPicYuvPred + [c31d7d16e14c] + + * source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/Lib/TLibEncoder/TEncSlice.cpp: + TComPic: remove unused m_pcPicYuvResi + + I believe we use TShortYUV internally for this now + [e7aa7efbee13] + + * 
source/Lib/TLibCommon/TComPic.h, source/Lib/TLibCommon/TComPicYuv.h: + TComPic: nits + [b5cc706cb214] + + * source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncTop.cpp: + TComPic: remove unused m_bReconstructed member + [3b359b196585] + + * source/Lib/TLibEncoder/TEncTop.cpp: + TEncTop: remove extra assignment of m_iMaxRefPicNum, cleanup + [f67a6d5916f2] + + * source/common/ipfilter.cpp, source/common/primitives.h: + Merge + [8dba6d9462cb] + +2013-06-17 Deepthi Devaki + + * Merge + [068622ee452c] + + * source/common/ipfilter.cpp, source/common/primitives.h, + source/common/vec/ipfilter8.inc, source/test/ipfilterharness.cpp: + Restructured Horizontal/Vertical filters, added support for + width!=16n in horizontal filter + [f2f58de47a26] + + * source/Lib/TLibEncoder/TEncAnalyze.cpp: + Merge + [6de69685341f] + + * source/common/vec/intrapred.inc: + Used unsafe compress to improve performance + [b852bbd6605b] + + * source/common/vec/ipfilter8.inc: + Used saturated compress to eliminate min() + [9da8624cf3c5] + +2013-06-17 Steve Borho + + * source/Lib/TLibCommon/TComPrediction.cpp: + TComPrediction: white-space nits + [22edfeb729e5] + + * source/Lib/TLibCommon/TComPrediction.cpp: + TComPrediction: fix eoln + [7051e6809ffb] + + * source/Lib/TLibCommon/TComInterpolationFilter.h, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/ipfilter.cpp, + source/common/reference.cpp: + remove redundant copy of subpel filter coefficients + [975d36cb92df] + + * source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/Lib/TLibCommon/TComPicSym.cpp, + source/Lib/TLibCommon/TComPicSym.h, + source/Lib/TLibCommon/TComWeightPrediction.h: + HM fixes: remove various unused member variables + [89454200f223] + + * source/x265.cpp: + x265: improve x265 -V output some more + [1ba7a1176cef] + + * 
source/x265.cpp: + x265: fix and improve output of x265 -V + [1c9c6c9c4acc] + + * source/common/vec/dct.inc: + dct: uncrustify + [9d71425aa983] + +2013-06-17 Min Chen + + * source/Lib/TLibCommon/TComTrQuant.cpp: + inline xIT() into invtransformNxN() + --- source/Lib/TLibCommon/TComTrQuant.cpp | 6 +++++- 1 files changed, + 5 insertions(+), 1 deletions(-) + [87f57ab18e2a] + + * source/Lib/TLibCommon/TComTrQuant.cpp, source/common/dct.cpp, + source/common/primitives.h, source/common/vec/dct.inc, + source/test/mbdstharness.cpp, source/test/mbdstharness.h: + [review] reduce memory copy in xIT() + --- source/Lib/TLibCommon/TComTrQuant.cpp | 6 +- + source/common/dct.cpp | 73 ++++- source/common/primitives.h | 2 +- + source/common/vec/dct.inc | 639 +++++++++++++++++++++------------ + source/test/mbdstharness.cpp | 11 +- source/test/mbdstharness.h | 1 + + 6 files changed, 485 insertions(+), 247 deletions(-) + [7511cd738ad6] + + * source/common/vec/dct.inc: + cleanup: remove unused IDCT table + --- source/common/vec/dct.inc | 208 + --------------------------------------------- 1 files changed, 0 + insertions(+), 208 deletions(-) + [8d40f4be3699] + + * source/test/mbdstharness.cpp: + cleanup: new IDCT don't need backup buffer + --- source/test/mbdstharness.cpp | 15 ++------------- 1 files + changed, 2 insertions(+), 13 deletions(-) + [779041295105] + + * source/common/vec/dct.inc: + new IDCT32x32 from project Chinese University version of x265 + --- source/common/vec/dct.inc | 2069 + ++++++++++++++++----------------------------- 1 files changed, 732 + insertions(+), 1337 deletions(-) + [ac6b5681eb77] + + * source/common/vec/dct.inc: + new IDCT16x16 from project Chinese University version of x265 + --- source/common/vec/dct.inc | 9290 + ++++++++++++++++++++++----------------------- 1 files changed, 4571 + insertions(+), 4719 deletions(-) + [993296aeec04] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h: + inline xT() into transformNxN() + --- 
source/Lib/TLibCommon/TComTrQuant.cpp | 27 + +++++++-------------------- source/Lib/TLibCommon/TComTrQuant.h | 3 + --- 2 files changed, 7 insertions(+), 23 deletions(-) + [1252d3c3c90f] + + * source/Lib/TLibCommon/TComTrQuant.cpp, source/common/dct.cpp, + source/common/primitives.h, source/common/vec/dct.inc, + source/test/mbdstharness.cpp, source/test/mbdstharness.h: + [review] reduce memory copy in xT() + --- source/Lib/TLibCommon/TComTrQuant.cpp | 11 +- + source/common/dct.cpp | 175 +++++++++++++++++------- + source/common/primitives.h | 11 ++- source/common/vec/dct.inc | 240 + +++++++++++++-------------------- source/test/mbdstharness.cpp | 60 + ++++++++- source/test/mbdstharness.h | 2 + 6 files changed, 294 + insertions(+), 205 deletions(-) + [af7d0d29ef16] + +2013-06-17 Deepthi + + * source/encoder/compress.cpp: + Early Detection skip mode removed from FMD + [f167939ebd90] + + * source/Lib/TLibEncoder/TEncAnalyze.cpp, source/encoder/compress.cpp: + Merge + [81d2deb0c7c1] + +2013-06-15 Deepthi + + * source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp: + Removing irrelevant FMD optimizations + [78ae2eec3b01] + + * source/encoder/compress.cpp: + Adding comments + [abaabb22011f] + +2013-06-16 Steve Borho + + * .hgtags: + Move tag LASTKNOWNGOOD + [58f364792384] + + * source/test/intrapredharness.cpp, source/test/intrapredharness.h, + source/test/mbdstharness.cpp, source/test/mbdstharness.h: + testbench: only perform 0xCDCDCDCDCDCD memsets for debug, runs much + faster + + It should only be necessary to do this when you are not sure the C + primitive is correct. Once the C primitive is correct, it should + never be required to initialize the output buffers in order to find + mismatch bugs. 
+ [9a6800e84295] + + * source/test/mbdstharness.cpp: + mbdstharness: remove unused BufferflyConf_names + [1a8a3ddd9475] + + * source/encoder/compress.cpp: + compress: fix eoln + [94b5e1de4d90] + + * source/common/vec/dct.inc: + dct: remove VC9 x64 warning, the affected routines are all gone + [07b13562ea8d] + + * source/common/vec/dct.inc: + dct: remove unused primitives, reorder for clarity + [ce0f1e1ed310] + + * source/common/dct.cpp, source/common/primitives.h, + source/common/vec/dct.inc, source/test/mbdstharness.cpp, + source/test/mbdstharness.h: + remove partial butterfly and inversedst primitives + [63a832540bab] + +2013-06-16 Min Chen + + * source/Lib/TLibCommon/TComTrQuant.cpp: + [review] merge buffer convert into DCT* module + --- source/Lib/TLibCommon/TComTrQuant.cpp | 8 +------- 1 files + changed, 1 insertions(+), 7 deletions(-) + [7f6068ab7706] + + * source/common/vec/dct.inc: + support nSrcStride on DCT* + --- source/common/vec/dct.inc | 417 + ++++++++++++++++++++++++--------------------- 1 files changed, 221 + insertions(+), 196 deletions(-) + [f3a8a85bdb0d] + + * source/Lib/TLibCommon/TComTrQuant.cpp: + Merge xTrMxN into TComTrQuant::xT + --- source/Lib/TLibCommon/TComTrQuant.cpp | 21 +++++---------------- + 1 files changed, 5 insertions(+), 16 deletions(-) + [53838676f709] + + * source/Lib/TLibCommon/TComTrQuant.cpp: + optimize xTrMxN by function pointer array + --- source/Lib/TLibCommon/TComTrQuant.cpp | 30 + ++---------------------------- 1 files changed, 2 insertions(+), 28 + deletions(-) + [f6c012a6b57f] + + * source/Lib/TLibCommon/TComTrQuant.cpp, source/common/dct.cpp, + source/common/vec/dct.inc: + [review] code for DST4x4 + --- source/Lib/TLibCommon/TComTrQuant.cpp | 7 +-- + source/common/dct.cpp | 20 ++++++++ source/common/vec/dct.inc | 87 + +++++++++++++++++++++++++++++++++ 3 files changed, 110 + insertions(+), 4 deletions(-) + [bc1b9ace38e9] + + * source/Lib/TLibEncoder/TEncTop.cpp: + miss for sqrt() + --- 
source/Lib/TLibEncoder/TEncTop.cpp | 1 + 1 files changed, 1 + insertions(+), 0 deletions(-) + [f7ab8faa77f9] + + * source/Lib/TLibCommon/TComTrQuant.cpp, source/common/dct.cpp, + source/common/vec/dct.inc, source/test/mbdstharness.cpp: + Optimize DCT32x32 + --- source/Lib/TLibCommon/TComTrQuant.cpp | 3 +- + source/common/dct.cpp | 11 + source/common/vec/dct.inc | 669 + +++++++++++++++++++++++++++++++++ source/test/mbdstharness.cpp | 2 + +- 4 files changed, 682 insertions(+), 3 deletions(-) + [4e73d671ade5] + + * source/common/vec/dct.inc: + simplify table name + --- source/common/vec/dct.inc | 384 + ++++++++++++++++++++++---------------------- 1 files changed, 192 + insertions(+), 192 deletions(-) + [92d5ad1eb45f] + + * source/common/vec/dct.inc: + [review] share some table with DCT8x8 + --- source/common/vec/dct.inc | 230 + +++++++++++++++++++++----------------------- 1 files changed, 110 + insertions(+), 120 deletions(-) + [9551c827dd8c] + +2013-06-15 Steve Borho + + * source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h: + TEncGOP: remove m_storedStartCUAddrForEncodingSlice std::vector + [aa925c788d24] + + * source/Lib/TLibEncoder/TEncAnalyze.cpp, + source/Lib/TLibEncoder/TEncAnalyze.h, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h, + source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h: + TEncGOP: Move statistics structures to TEncTop, protect with a lock + [3545ade1fb3b] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: use array delete for array new pointers + [8508e8293b74] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: simplifications + [035a7dee0480] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: single tile simplifications + [065ee8264b98] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: more single-slice simplifications + [b05e694e6a22] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: remove undefined 
SAO_RDO + [1a2fb8fe36b8] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: remove dead code + [d024c9bddf19] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: clean-up some multi-slice constructs + [18ad786c53c0] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncTop.cpp: + TEncTOP: move RC-GOP operations within compressGOP() + [fa70c5f59650] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h: + TEncGOP: use iPOCLast==0 instead of m_bSeqFirst + [c08b29d11335] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: cleanup initialization + [8d9c2a12262f] + + * source/Lib/TLibEncoder/TEncGOP.h: + TEncGOP: cleanups, make internal methods protected + [1821fba47aa4] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h: + TEncGOP: remove member pointer to encoder singleton picture list + + The picture list is provided to compressGOP() method + [b5ce58a2af61] + + * source/Lib/TLibEncoder/TEncGOP.h: + TEncGOP: remove unused getListPic() method + [67b8bea4073d] + + * source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.h: + TEncSlice: remove member pointer to global picture list + [f90b3bfd87c4] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.h, + source/Lib/TLibEncoder/TEncTop.h: + TEncSlice: remove GOP encoder member variable + + Get rid of another assumption about singleton objects + [8e7b990788c9] + +2013-06-12 Min Chen + + * source/common/primitives.h, source/common/vec/pixel.inc: + primitives: add a 16bit copy with left shift primitive + [ca516890cd40] + +2013-06-15 Steve Borho + + * source/test/pixelharness.cpp: + pixelharness: fixup indexing of block primitives + [9b7a1362f367] + +2013-06-15 Deepthi + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/encoder/compress.cpp: + Merge; FAST_MODE_DECISION fully functional with inter. 
+ [9a99578e2815] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibEncoder/TEncCu.cpp, source/encoder/compress.cpp: + Merge + [f4f5b666aecd] + + * source/Lib/TLibEncoder/TEncCu.cpp, source/encoder/compress.cpp: + Null rpcBestCU. Memory leaks resolved. + [353206dee54b] + + * source/encoder/compress.cpp: + More prep to null BestCU + [966336c97827] + + * source/encoder/compress.cpp: + Prepare to Null split level BestCU + [e143b9a344e7] + + * source/encoder/compress.cpp: + Prepare to null rpcBestCU + [c61499ef1432] + + * source/encoder/compress.cpp: + More cleanup + [f68b8005dae1] + + * source/Lib/TLibCommon/TComDataCU.cpp, source/encoder/compress.cpp: + Cleanup; fix split bugs + [57174096d40b] + + * source/Lib/TLibEncoder/TEncCu.cpp, source/encoder/compress.cpp: + Rolling back; inter mode works, split has bugs + [caba150c54c0] + +2013-06-14 Steve Borho + + * source/encoder/encoder.cpp: + x265: better handling of level thresholds + [4d8007536eb4] + + * source/encoder/encoder.cpp, source/encoder/encoder.h: + x265: detect and set the profile and level + [07a1ffa03c3a] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + gcc: integer array subscripts + [c11ecb15eaa0] + +2013-06-14 Min Chen + + * source/Lib/TLibCommon/TComTrQuant.cpp, source/common/dct.cpp, + source/common/vec/dct.inc, source/test/mbdstharness.cpp: + Optimize DCT16x16 + --- source/Lib/TLibCommon/TComTrQuant.cpp | 3 +- + source/common/dct.cpp | 11 + source/common/vec/dct.inc | 482 + +++++++++++++++++++++++++++++++++ source/test/mbdstharness.cpp | 2 + +- 4 files changed, 495 insertions(+), 3 deletions(-) + [261b26b87c0b] + + * source/common/vec/dct.inc: + cleanup MAKE_ODD outside xDCT8 + --- source/common/vec/dct.inc | 1 + 1 files changed, 1 insertions(+), + 0 deletions(-) + [914852d611c8] + +2013-06-14 Steve Borho + + * source/x265opts.h: + x265: nit in CLI help + [43fd53ad27fe] + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/common/primitives.h: + pixel: use width bit size to index residual and 
recon primitives + [34e5e66bd386] + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/common/pixel.cpp, + source/common/primitives.h, source/common/vec/pixel.inc, + source/test/pixelharness.cpp, source/test/pixelharness.h: + pixel: rename residual/reconstruction primitves + [83756d7ec230] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + Merged in deepthidevaki/xhevc_deepthid (pull request #199) + + Added vector Filter Horizontal Multiplane, and support in testbench. + [d2a8a011a13e] + +2013-06-14 Deepthi Devaki + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/common/pixel.cpp, + source/common/primitives.h, source/common/vec/pixel.inc, + source/common/vec/pixel8.inc, source/test/pixelharness.cpp: + GetResidual + calcRecon changed to use function pointers + [575b19e5a035] + + * source/common/primitives.h, source/test/ipfilterharness.cpp: + Merge + [8fc2fe71a2d7] + + * source/common/ipfilter.cpp, source/common/primitives.h, + source/common/vec/ipfilter.inc, source/common/vec/ipfilter8.inc, + source/test/ipfilterharness.cpp, source/test/ipfilterharness.h: + Added vector Filter Horizontal Multiplane, and support in testbench. 
+ [15956ceaf16a] + +2013-06-14 Steve Borho + + * Merged in maheshpittala/xhevc_mahesh (pull request #198) + + Fixed Build errors(Type cast) in 16 bit + [73b00711e83d] + +2013-06-14 https://mandarmcw + + * source/Lib/TLibCommon/TComPicYuv.h, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/reference.cpp: + Fixed Build errors(Type cast) in 16 bit + [b80bdba3d499] + +2013-06-14 mahesh pittala + + * Merged multicoreware/xhevc into default + [2318037a8305] + + * Merged multicoreware/xhevc into default + [ed51338a5928] + + * Merged multicoreware/xhevc into default + [a4e3b5280eb4] + +2013-06-12 mahesh pittala + + * Merged multicoreware/xhevc into default + [392194eba6a1] + +2013-06-13 mahesh pittala + + * source/tools/HM decoder/TAppDecoder.exe: + Merged multicoreware/xhevc into default + [41fd32e1ee06] + +2013-06-12 mahesh pittala + + * Merged multicoreware/xhevc into default + [9eb8f9941713] + + * Merged multicoreware/xhevc into default + [0eb8933de745] + + * Merged multicoreware/xhevc into default + [803e22af8582] + +2013-06-11 mahesh pittala + + * Merged multicoreware/xhevc into default + [8c842a8cfe96] + + * source/common/macroblock.cpp, source/common/vec/macroblock.inc: + Merged multicoreware/xhevc into default + [266046e7522d] + +2013-06-10 mahesh pittala + + * Merged multicoreware/xhevc into default + [9cd8563afb0d] + + * Merged multicoreware/xhevc into default + [007a8b519e53] + + * Merged multicoreware/xhevc into default + [b5f5d466d800] + +2013-06-07 mahesh pittala + + * Merged multicoreware/xhevc into default + [91c5bb78d52b] + +2013-06-05 mahesh pittala + + * Merged multicoreware/xhevc into default + [9c8618adaf4c] + + * Merged multicoreware/xhevc into default + [fe2691e5afff] + + * Merged multicoreware/xhevc into default + [e4c6fb2823f1] + +2013-06-04 mahesh pittala + + * Merged multicoreware/xhevc into default + [cd1437d72a75] + +2013-06-03 mahesh pittala + + * Merged multicoreware/xhevc into default + 
[7f7bd035a835] + + * Merged multicoreware/xhevc into default + [c7b8e56f6079] + +2013-06-14 Deepthi + + * source/Lib/TLibEncoder/TEncCu.cpp, source/encoder/compress.cpp: + Merge + [852e5f6730a5] + + * source/Lib/TLibEncoder/TEncCu.cpp, source/encoder/compress.cpp: + Adding comments; cosmetic changes, removing unnecessary local vars + [93938fd46918] + + * source/encoder/compress.cpp: + Entropy related code removed + [5c3e24604043] + + * source/encoder/compress.cpp: + Removing extraneous checks + [07de3c5c0ad9] + + * source/encoder/compress.cpp: + Copy sub-best-recon to current best recon + [acf302317da7] + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/encoder/compress.cpp: + BestCU is a reference pointer, while TempCU is just that; a + temporary local pointer. + [9f22712fc800] + + * source/Lib/TLibEncoder/TEncCu.cpp, source/encoder/compress.cpp: + To avoid memory leaks, pass in an empty pointer. rpcBestCU returns + with the Best chosen CU. + [7f4f0e71895f] + +2013-06-13 Deepthi + + * source/encoder/compress.cpp: + Claridying assert checks + [8316225c0886] + + * source/encoder/compress.cpp: + Removing use of rpcTempCU for initializations + [c8f4bd10ea38] + +2013-06-14 Steve Borho + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibEncoder/TEncCavlc.cpp, + source/Lib/TLibEncoder/TEncGOP.cpp: + HM bug fix: Improve the temporary fix for issue #1071 (Nonconforming + RPS in CRA pictures) + - generate proper CRA RPS at encoding stage rather than slice segment + header writing + [ec4e63c7c8be] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + HM fixes: cleanups + [3a5d6e4754ce] + + * source/Lib/TLibEncoder/TEncCu.cpp: + HM fix: "Cleanup" + [a174e0b3f6b7] + + * source/Lib/TLibCommon/TComTrQuant.cpp: + HM fixes: "Clean up" + [1707a3030f7c] + + * source/Lib/TLibCommon/TComPicYuv.h: + TComPicYuv: fix 16bpp build + [30941b044925] + + * source/Lib/TLibCommon/TComSlice.cpp: + HM cleanups: "Add assert() 
statements / clean up" + [57ea893a55f9] + +2013-06-13 Steve Borho + + * source/Lib/TLibEncoder/TEncGOP.cpp: + HM bug fix: "Fix for #1072" + [381af1803935] + + * source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibEncoder/TEncCavlc.cpp: + HM bug fix: "Fix for #1078" + [15c51314f7ac] + + * source/Lib/TLibEncoder/TEncCavlc.cpp: + HM bug fix: "Fix for #1079" + [e8b1953c65b4] + + * source/test/ipfilterharness.cpp: + ipfilterharness: add offsets for src buffer pointer + [fee9cd2c3fc8] + + * source/test/ipfilterharness.cpp: + ipfilterharness: use alignedMalloc for short_buff + [857a4fa1a8e9] + + * source/common/common.cpp, source/encoder/motion.cpp: + motion: prevent over-large merange from causing inf loops + [64acb994bc4e] + + * source/encoder/motion.cpp, source/encoder/motion.h: + motion: move distance for-loop within StarPatternSearch (less call + overhead) + [323a6667d127] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.h, + source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h: + TEncTop: move EncoderFrame instances from TEncTop to TEncGOP + + Each TEncGOP will have at least one frame encoder, for parallelism + [f961dee0df08] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h: + TEncGOP: make m_iNumPicCoded a function local variable + [408239eb7e61] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp: + gcc: replace MAXUINT64 with MAX_INT64 from CommonDefs.h + [5960f5864c43] + + * source/Lib/TLibCommon/TComRdCost.cpp, + source/Lib/TLibCommon/TComRdCost.h: + TComRdCost: move setCbDistortionWeight() to cpp file, cleanup + [38e5bb0dbca5] + + * source/encoder/motion.cpp, source/encoder/motion.h: + motion: rename bufsad to fullsad + [33138ffa79a7] + + * source/encoder/motion.cpp, source/encoder/motion.h: + motion: move init_scales() to constructor, remove from setSourcePU + [5b2b4a7090bb] + + * 
source/common/primitives.h: + Merged in deepthidevaki/xhevc_deepthid (pull request #196) + + Modifications to vertical filter multiplane + [95b415adeffa] + +2013-06-13 Deepthi Devaki + + * source/tools/HM decoder/TAppDecoder.exe: + Merge + [65768d985016] + + * source/common/vec/ipfilter8.inc, source/test/ipfilterharness.cpp: + Modifications to vertical filter multiplane + [2c0ecc7b043d] + +2013-06-13 Deepthi Devaki Akkoorath + + * source/tools/HM decoder/TAppDecoder.exe: + Merged multicoreware/xhevc into default + [c8b90c296a0b] + +2013-06-12 Deepthi Devaki + + * source/common/vec/ipfilter.inc, source/common/vec/ipfilter8.inc: + Added Vertical filter for multiplane (Vectorized assuming width is + multiple of 4) + [bd33365c378c] + + * source/VectorClass/vectori128.h, source/common/ipfilter.cpp, + source/common/primitives.h, source/test/ipfilterharness.cpp, + source/test/ipfilterharness.h: + Testbench for Vertical IPFilter for multiplane + [a9af2d31ba00] + +2013-06-13 Steve Borho + + * Merged in mandarmcw/xhevc_mandar_mahesh (pull request #197) + + primitives: sse2, ssse3, avx: Assembly calls for SSE_PP(SSD) for + 8,16,24,32,48,64 + [9123ea04e339] + +2013-06-13 Mandar Gurav + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm, + source/common/x86/pixel.h: + primitives: sse2: Assembly calls for SSE_PP(SSD) for 8, 16, 24, 32, + 48, 64 + [3ab1f65ad43a] + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm, + source/common/x86/pixel.h: + primitives: ssse3: Assembly calls for SSE_PP(SSD) for 8, 16, 24, 32, + 48, 64 + [a268ac5113cd] + + * source/common/x86/asm-primitives.cpp: + primitives: Assembly calls for SSE_PP(SSD) for 24,48,64 + [59228bcdfe9c] + +2013-06-13 Mandar Gurav + + * Merged multicoreware/xhevc into default + [33ba85c5f2a1] + +2013-06-13 Mandar Gurav + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm, + source/common/x86/pixel.h: + primitives: Assembly calls for SSE_PP(SSD) for 32,16,8 + 
[68c6b9d2d8d1] + +2013-06-13 Steve Borho + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + TEncSearch: small improvements + [6a170f9465ab] + + * source/Lib/TLibCommon/TComRdCost.h, + source/Lib/TLibEncoder/TEncSearch.cpp: + TComRdCost: remove unused m_mvPredictor (HM ME is using our bit-cost + methods) + [e8d675ec84de] + + * source/Lib/TLibCommon/TComRdCost.h, + source/Lib/TLibEncoder/TEncSearch.cpp: + TComRdCost: use UInt to scale chroma costs instead of Double + [cf348485a9a5] + +2013-06-13 sumalatha + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComRdCost.cpp, + source/Lib/TLibCommon/TComRdCost.h, + source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/encoder/compress.cpp: + removed double from computing Cost + [b6f7ac1f105a] + +2013-06-13 Min Chen + + * source/test/testbench.cpp: + revert debug code commit by '[review] faster DCT4x4' + --- source/test/testbench.cpp | 2 +- 1 files changed, 1 + insertions(+), 1 deletions(-) + [c69480d6a22b] + + * source/common/vec/dct.inc: + [review] fatser DCT8x8 + --- source/common/vec/dct.inc | 461 + ++++++++++++++++++++------------------------- 1 files changed, 200 + insertions(+), 261 deletions(-) + [9c4501c71e10] + + * source/Lib/TLibCommon/TComTrQuant.cpp, source/common/pixel.cpp, + source/common/primitives.h, source/common/vec/pixel.inc: + [review] merge row process into convert16to32_shl + --- source/Lib/TLibCommon/TComTrQuant.cpp | 22 +--------------------- + source/common/pixel.cpp | 9 ++++++--- source/common/primitives.h | 2 + +- source/common/vec/pixel.inc | 27 +++++++++++++++------------ 4 + files changed, 23 insertions(+), 37 deletions(-) + [44402f63268d] + +2013-06-13 Steve Borho + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: nits + [a20041407e93] + +2013-06-13 Deepthi + + * source/encoder/compress.cpp: + Fixing Best CU bugs + [4ccd02f3c17a] + + 
* source/tools/HM decoder/TAppDecoder.exe: + Merge + [9f348f94c788] + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/encoder/compress.cpp: + First cut version of RDO-less mode decision: Only inter and rect + modes. + [7df0cd69bbf9] + + * source/Lib/TLibEncoder/TEncCu.h, source/encoder/compress.cpp: + Adding pred copy to ComputeCostInter + [ac115aa34201] + + * Merge + [84a113488c8e] + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/encoder/compress.cpp: + Preparation for moving to RDO-less analysis. + [ad0f4840b8c2] + +2013-06-12 Deepthi + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h: + Prediction structures for each mode: inter, intra, rect and merge + [12e076b777b3] + + * source/Lib/TLibCommon/TypeDef.h, source/Lib/TLibEncoder/TEncCu.cpp: + Cleanup macro RDO_WITHOUT_DQP_BITS: always set. + [064461252a7d] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + Merge + [d9f8493ad297] + + * source/encoder/compress.cpp: + Inter mode only works in compress.cpp + [2a4ada95c8da] + +2013-06-11 Deepthi + + * source/encoder/compress.cpp: + Removing RDO without DQP + [0457e3d429f3] + + * source/encoder/compress.cpp: + Merge and Intra modes removed from Inter slices for now. 
+ [806c020e696e] + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/common/macroblock.cpp, + source/common/vec/macroblock.inc: + Merge + [af84de459345] + +2013-06-10 sumalatha + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + Added a function to estimate the header bits + [6a8bff3ae903] + +2013-06-12 Steve Borho + + * source/encoder/motion.cpp: + motion: simplify setSourcePU + [9a053413d61c] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + nits + [a5c8f414b509] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.h: + TEncSlice: remove unused iNumPicRcvd argument to initEncSlice + [fde34b9f8b05] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.h: + TEncSlice: pass EncoderFrame to TEncSlice methods, do not get from + TEncCfg + [6e3589df7134] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.h: + TEncSlice: refactor initEncSlice to return TComSlice, not pass as + reference + + Make it more obvious pcSlice is the functions output + [eadec0d58647] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: remove redundant setCurrSliceIdx() call + [18418d680e43] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h: + TEncGOP: inline xGetBuffer for clarity + [4f2cab2656e5] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h: + TEncGOP: inline xInitGOP, which ignored most of its arguments + [5cd02a1e0833] + + * source/Lib/TLibCommon/TComSlice.h: + nits + [b3a522a85ca1] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h: + TEncGOP: remove two more unused variables + [e94675550fa9] + + * source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h: + nits + [7a22a718c027] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + nits + [1b36f456d018] + + * 
source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: cleanup refactor, no behavior change + [26f03b2606f8] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h: + TEncGOP: remove unused m_bFirst variable + [28bbf3b63723] + + * .hgtags: + tweak tags, make older LASTKNOWNGOOD tag visible + [6b8fabd722e7] + + * source/tools/HM decoder/TAppDecoder.exe: + Remove compiled TAppDecoder.exe + + It's not working on some platforms, and in general we don't want + compiled binaries in the tree. Users should build their own, or we + should have it available for download from Bitbucket or Egnyte. + [6facda17b5dc] + +2013-06-12 Min Chen + + * source/common/vec/dct.inc: + IDCT4x4 is SSE2 + --- source/common/vec/dct.inc | 22 ---------------------- 1 files + changed, 0 insertions(+), 22 deletions(-) + [37e60377bb38] + + * source/common/vec/dct.inc, source/test/testbench.cpp: + [review] faster DCT4x4 + --- source/common/vec/dct.inc | 137 + ++++++++++++++++++++++----------------------- + source/test/testbench.cpp | 2 +- 2 files changed, 68 insertions(+), + 71 deletions(-) + [fad6f1100104] + +2013-06-12 Steve Borho + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h: + TEncGOP: remove unused xFindDistortionFrame() method + [64989e7f7b96] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: fix chroma stride used in frame PSNR + [87599a2e5ea9] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: fix chroma stride used in frame PSNR + [94b3cd0b3d6d] + + * source/Lib/TLibCommon/TComPicYuvMD5.cpp: + Fix chroma stride used in hash functions (Min Chen) + [fda858eb8b20] + +2013-06-11 Steve Borho + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: add a note for future optimizations + [a5e26e8a86c8] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: minor cleanups + [c8e15bb22d3c] + + * source/CMakeLists.txt: + cmake: downgrade requirement to cmake-2.6; it works fine + [5cc718257bc0] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: nit + 
[1ffcfbfa6a67] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: use SSE primitives to calculate frame PSNR + [e93c215a81df] + + * source/Lib/TLibCommon/TComPicYuv.cpp: + TComPicYUV: enforce 16byte alignment of chroma planes + [580383b435fb] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h: + TEncGOP: remove a handful of of member variables, declare on stack + + This is a safety measure to prevent race hazards. Now we know the + scope where these variables must be consistent. + [e2948255ad27] + + * source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.h, source/encoder/wavefront.cpp: + TEncSlice: prune unused cost variables and methods + [295fbda07f03] + + * source/encoder/wavefront.cpp: + wavefront: slight refactor + [b3e9a7469e43] + + * source/common/CMakeLists.txt: + gcc: fix win32 mingw compile + [fb8920b0bba0] + + * source/encoder/wavefront.cpp: + wavefront: drop PPA events for compress and encodeCU + + Now I know that encodeCU takes very little time compared to + compressCU + [4e8a0221fb01] + + * source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h, source/encoder/wavefront.cpp, + source/encoder/wavefront.h: + refactor: move singleton objects from TEncTop to EncodeFrame + + This will allow multiple frames to be encoded at the same time, + barring other data dependencies + [6a77792e00e0] + + * source/common/vec/pixel8.inc: + pixel: pass sad as reference + [c14b5fb37187] + + * source/common/vec/pixel8.inc: + pixel: perform sad_8 in batches of 16 rows + [f74b07aa34f4] + + * source/common/pixel.cpp, source/common/vec/pixel.inc, + source/common/vec/pixel8.inc: + pixel: white-space cleanups, remove Intra from getResidual name + [b69c96b0e90c] + +2013-06-11 Min Chen + + * source/CMakeLists.txt: + Stack align for GCC + --- source/CMakeLists.txt | 2 +- 1 files changed, 1 insertions(+), 1 + deletions(-) + [a3bf4f55f789] + 
+2013-06-11 Steve Borho + + * source/common/vec/pixel8.inc: + Merged in deepthidevaki/xhevc_deepthid (pull request #191) + + Optimized residual and reconstruction in + xIntraCoding[Luma/Chroma]Blk + [318e10a6a50d] + +2013-06-11 Deepthi Devaki + + * source/common/primitives.h: + fix for bad commit + [8e690d36e169] + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/common/macroblock.cpp, + source/common/primitives.h, source/common/vec/macroblock.inc, + source/common/vec/pixel8.inc: + Merge + [16f559d6b0d9] + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/common/pixel.cpp, + source/common/primitives.h, source/common/vec/pixel.inc, + source/common/vec/pixel8.inc, source/test/pixelharness.cpp, + source/test/pixelharness.h: + Optimized residual and reconstruction in + xIntraCoding[Luma/Chroma]Blk + [df8efcb6d5ab] + +2013-06-11 Deepthi Devaki Akkoorath + + * source/common/macroblock.cpp, source/common/vec/macroblock.inc: + Merged multicoreware/xhevc into default + [f66842baa091] + +2013-06-10 Deepthi Devaki Akkoorath + + * Merged multicoreware/xhevc into default + [f449fe3822e5] + +2013-06-11 ggopu + + * source/common/vec/pixel8.inc: + primitives:- implementation of loop unrolling for sad - 8,32,48,64. + Others are not showing performance benefit. 
+ [f09d5d9cf0c3] + +2013-06-10 Steve Borho + + * source/encoder/motion.h: + motion: use intptr_t for blockOffset + [bc498b762ad8] + + * source/encoder/motion.cpp, source/encoder/motion.h: + motion: remove sadStride member variable + [8f13381a9a50] + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/motion.cpp, + source/encoder/motion.h: + motion: make motion reference and search limits function arguments + + This makes the motion search re-entrant, for a given source PU block + [7a630939cd66] + + * source/encoder/motion.cpp, source/encoder/motion.h: + motion: inline qpelSatd + [2ea29b826a89] + + * source/encoder/motion.cpp, source/encoder/motion.h: + motion: inline qpelSad function + [13019ff2dfd3] + + * source/encoder/motion.cpp, source/encoder/motion.h: + motion: inline the fpelSad() helper function + [e5b9711d3615] + + * source/common/vec/CMakeLists.txt, source/common/vec/pixel8.inc: + gcc: unused parameter workarounds + [b8896916d0cf] + + * source/common/vec/pixel8.inc: + pixel: override ALWAYSINLINE for the unroll template functions + [07408db373ed] + + * source/common/vec/pixel8.inc, source/test/mbdstharness.cpp: + fix GCC compile errors + [51d401251cd2] + + * source/common/CMakeLists.txt, source/common/dct.cpp, + source/common/macroblock.cpp, source/common/primitives.cpp, + source/common/vec/CMakeLists.txt, source/common/vec/dct-avx.cpp, + source/common/vec/dct-avx2.cpp, source/common/vec/dct-sse2.cpp, + source/common/vec/dct-sse3.cpp, source/common/vec/dct-sse41.cpp, + source/common/vec/dct-sse42.cpp, source/common/vec/dct-ssse3.cpp, + source/common/vec/dct.inc, source/common/vec/macroblock.inc, + source/common/vec/vecprimitives.inc: + rename macroblock to dct, split vector primitives into their own C++ + files + [ed6cb766af9e] + + * source/common/vec/pixel8.inc: + pixel: fix EOLN damage + [045c16364748] + + * source/common/vec/pixel8.inc: + pixel: simplify exit logic + [2fa825921a79] + + * source/common/vec/pixel8.inc: + pixel: remove pointer 
offset multiplication + [b7a5ceb6d6c9] + + * source/common/vec/pixel8.inc: + pixel: simplify template early-outs + [ddb5937d4ce2] + + * Merged in ggopu/gopu_xhevc (pull request #189) + + primitives:- loop unrolling using template metaprogramming. + [597cc1154c19] + +2013-06-10 Mandar Gurav + + * source/common/vec/pixel8.inc: + primitives:- loop unrolling using template metaprogramming. + [187b5e63d0c2] + +2013-06-10 Steve Borho + + * .hgtags: + Moved tag LASTKNOWNGOOD to changeset 3ec4837e6f6c (from changeset + d60578bec82e) + [72b1768e42cd] + + * source/common/macroblock.cpp, source/common/vec/macroblock.inc, + source/encoder/CMakeLists.txt: + fix GCC compile errors + [3ec4837e6f6c] + + * source/Lib/TLibEncoder/TEncCu.cpp: + TEncCu: add more missing x265_emms() calls; win32 is broken without + them + [016bad7ac3e4] + + * .hgtags: + Moved tag LASTKNOWNGOOD to changeset d60578bec82e (from changeset + 681eabf8a086) + [0667ab7fc8ed] + + * source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/motion.cpp, + source/encoder/motion.h: + Merge + [d60578bec82e] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: move QP set much earlier, avoids race hazards + [5fbac3c94918] + + * source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/motion.cpp, + source/encoder/motion.h: + motion: do not use subsampling for bufSAD(), call EMMS where + necessary + [5d8f24205a1c] + +2013-06-11 Deepthi + + * Merge + [d693fcba0feb] + + * source/Lib/TLibEncoder/TEncCu.cpp: + Verified that satd cost of orig and final prediction correspond to + those returned from ME. + [3ca77cf0a0da] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + Capturing Cost for Mode Decision from Motion Estimation. + + Here, TotalCost (which will be used in Mode Decision) is satd + + lambda*MVDbits. This cost needs to be improved by adding + lambda*header(or signalling) bits. 
+ [7a4e397d0703] + +2013-06-10 Steve Borho + + * source/Lib/TLibEncoder/TEncAnalyze.h, + source/Lib/TLibEncoder/TEncGOP.cpp: + x265: use stderr and log level consistently for logging output + [6ed146e7e6a2] + +2013-06-10 Min Chen + + * source/common/vec/macroblock.inc: + [review] more optimize for IDCT32x32 + --- source/common/vec/macroblock.inc | 114 + +++++++++++--------------------------- 1 files changed, 33 + insertions(+), 81 deletions(-) + [251c74042c22] + + * source/Lib/TLibCommon/TComTrQuant.cpp, source/common/macroblock.cpp, + source/common/primitives.h, source/common/vec/macroblock.inc, + source/test/mbdstharness.cpp: + [PATCH 10/11] [review] merge memcpy with stride into IDCT + + From c4ab38c711e073f8e244bac87171b1e762992eed Mon Sep 17 00:00:00 + 2001 + --- source/Lib/TLibCommon/TComTrQuant.cpp | 16 ++---- + source/common/macroblock.cpp | 66 ++++++++++++++------ + source/common/primitives.h | 3 +- source/common/vec/macroblock.inc | + 106 ++++++++++++++++++-------------- source/test/mbdstharness.cpp | + 14 ++-- 5 files changed, 120 insertions(+), 85 deletions(-) + [b0bb8b7d2ec1] + + * source/Lib/TLibCommon/TComTrQuant.cpp, source/common/primitives.h, + source/test/mbdstharness.cpp: + [review] replace function xITrMxN by function pointer + --- source/Lib/TLibCommon/TComTrQuant.cpp | 42 + ++++----------------------------- source/common/primitives.h | 10 + ++++---- source/test/mbdstharness.cpp | 8 +++--- 3 files changed, 14 + insertions(+), 46 deletions(-) + [a53a6888afea] + + * source/Lib/TLibCommon/TComTrQuant.cpp, source/common/macroblock.cpp, + source/common/primitives.h, source/common/vec/macroblock.inc, + source/test/mbdstharness.cpp: + [review] code for IDCT32x32 + --- source/Lib/TLibCommon/TComTrQuant.cpp | 8 +- + source/common/macroblock.cpp | 11 + source/common/primitives.h | 1 + + source/common/vec/macroblock.inc | 1533 + +++++++++++++++++++++++++++++++++ source/test/mbdstharness.cpp | 1 + + 5 files changed, 1547 insertions(+), 7 deletions(-) + 
[313549dc567d] + + * source/Lib/TLibCommon/TComTrQuant.cpp, source/common/macroblock.cpp, + source/common/primitives.h, source/common/vec/macroblock.inc, + source/test/mbdstharness.cpp: + [review] code for IDCT16x16 + --- source/Lib/TLibCommon/TComTrQuant.cpp | 3 +- + source/common/macroblock.cpp | 11 + source/common/primitives.h | 1 + + source/common/vec/macroblock.inc | 1449 + +++++++++++++++++++++++---------- source/test/mbdstharness.cpp | 19 + +- 5 files changed, 1042 insertions(+), 441 deletions(-) + [6ffb16e28ec2] + + * source/test/mbdstharness.cpp, source/test/mbdstharness.h: + fix bug in testbench + --- source/test/mbdstharness.cpp | 47 + +++++++++++++++++++++++++---------------- source/test/mbdstharness.h + | 4 +- 2 files changed, 31 insertions(+), 20 deletions(-) + [b5414396d60d] + + * source/Lib/TLibCommon/TComTrQuant.cpp, source/common/macroblock.cpp, + source/common/primitives.h, source/common/vec/macroblock.inc, + source/test/mbdstharness.cpp: + [review] code for IDCT8x8 + --- source/Lib/TLibCommon/TComTrQuant.cpp | 3 +- + source/common/macroblock.cpp | 11 ++ source/common/primitives.h | 1 + + source/common/vec/macroblock.inc | 252 + +++++++++++++++++++++++++++++++++ source/test/mbdstharness.cpp | 1 + + 5 files changed, 266 insertions(+), 2 deletions(-) + [fa33207d7309] + + * source/Lib/TLibCommon/TComTrQuant.cpp, source/common/macroblock.cpp, + source/common/primitives.h, source/common/vec/macroblock.inc, + source/test/mbdstharness.cpp: + [review] code for IDCT4x4 + --- source/Lib/TLibCommon/TComTrQuant.cpp | 6 +- + source/common/macroblock.cpp | 11 ++++ source/common/primitives.h | + 1 + source/common/vec/macroblock.inc | 103 + +++++++++++++++++++++++++++++++++ source/test/mbdstharness.cpp | 1 + + 5 files changed, 120 insertions(+), 2 deletions(-) + [6900a1238ea5] + + * source/test/mbdstharness.cpp, source/test/mbdstharness.h: + more general testbench + --- source/test/mbdstharness.cpp | 52 + ++++++----------------------------------- 
source/test/mbdstharness.h + | 3 +- 2 files changed, 9 insertions(+), 46 deletions(-) + [fbdad5a9a376] + + * source/Lib/TLibCommon/TComTrQuant.cpp, source/common/macroblock.cpp, + source/common/primitives.h, source/common/vec/macroblock.inc, + source/test/mbdstharness.cpp: + [review] code for IDST4x4 + --- source/Lib/TLibCommon/TComTrQuant.cpp | 3 +- + source/common/macroblock.cpp | 11 +++ source/common/primitives.h | 1 + + source/common/vec/macroblock.inc | 115 + +++++++++++++++++++++++++++++++++ source/test/mbdstharness.cpp | 10 + +++ 5 files changed, 138 insertions(+), 2 deletions(-) + [ee73706fab63] + + * source/test/mbdstharness.cpp: + ident + --- source/test/mbdstharness.cpp | 2 +- 1 files changed, 1 + insertions(+), 1 deletions(-) + [1616a4a66f8a] + +2013-06-10 Steve Borho + + * source/encoder/compress.cpp: + compress: fix eoln and VC9 compile + [89957326ec3e] + + * source/encoder/motion.cpp: + motion: improve cost comment + [c342a23c05a6] + + * Merged in deepthidevaki/xhevc_deepthid (pull request #188) + + 32x32 intraAng with intrinsics - Reduced build time in MSVC + [1d4db3d80b2d] + +2013-06-10 Deepthi Devaki + + * source/common/vec/intrapred.inc: + Merge + [7525ac0b99d9] + + * source/test/intrapredharness.cpp: + Include 32x32 IntraAng in testbench + [4c815940f610] + + * source/common/vec/intrapred.inc: + removing unwanted comments + [ec956b124b97] + + * source/common/vec/intrapred.inc: + 32x32 intraAng with intrinsics - Reduced build time in MSVC + [079d9456f248] + +2013-06-10 Deepthi + + * source/Lib/TLibEncoder/TEncCu.cpp, source/encoder/compress.cpp: + More cleanup + [ff59c5d68841] + + * source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/encoder/compress.cpp: + Removing extraneous arguments to predInterSearch + [9999fc1a7939] + + * source/Lib/TLibEncoder/TEncCu.cpp: + Replacing xcompressCU with xCompressInterCU + [cf88ab9b2ad2] + + * source/encoder/compress.cpp: + Finetuning 
xComputeCostInter + [cd37abe9c70b] + + * source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/TEncEntropy.cpp, + source/Lib/TLibEncoder/TEncEntropy.h, + source/Lib/TLibEncoder/TEncSbac.cpp, source/common/primitives.h, + source/common/vec/macroblock.inc, + source/common/vec/vecprimitives.inc: + backout e200a6011f23: causes crash. + [00190abfbdbf] + + * Merge + [621c6f8f64e0] + + * source/encoder/motion.cpp: + Comment for clarification of the term "cost" used in ME. + + In general, we can try to use the following terms in a consistent + manner. 1. Distortion: This is the actual distortion produced in the + image due to any of the lossy components in video compression (block + matching, quant, filtering etc etc). In prediction, we can define + this as the satd/sad between orig and predicted image. 2. Bits: no. + of bits used in representing that particular mode. 3. Cost or + TotalCost = Distortion + lambda*bits. + + Appending cost to either satd/bits is to be generally avoided. + [69ea081b98a6] + + * source/Lib/TLibEncoder/TEncCu.h, source/encoder/compress.cpp: + Define xComputeCostInter. + + Early skip detection always ON. If skip detected, exit early. + [e4f2d46c69fe] + + * source/Lib/TLibCommon/TComDataCU.cpp: + Implementation of copyCU member function. + [3ba1b81f83df] + + * source/Lib/TLibCommon/TComDataCU.h: + Adding more access functions; declaring a copyCU member function to + copy Search results. 
+ [3173379b20a0] + +2013-06-09 Deepthi + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h: + 4 temporary structs for 2nx2nInter, InterRect, IntrainInter, and + Merge + [32ea45399897] + + * source/encoder/compress.cpp: + Recursive calls to xcompressInterCU + [c307d3211445] + + * source/Lib/TLibEncoder/TEncCu.h, source/encoder/compress.cpp: + Adding xcompressInterCU; removing irrelevant if-checks + [96c054b8187b] + + * Merge + [9bc61fc21444] + + * source/encoder/CMakeLists.txt, source/encoder/compress.cpp, + source/encoder/compress.h: + Adding compress.cpp and compress.h for the new mode decision + algorithm. + [6b5cb6739108] + +2013-06-10 Steve Borho + + * source/encoder/slicetype.cpp: + slicetype: remove functions we will not need anytime soon + [01a345bdfccc] + +2013-06-09 Steve Borho + + * doc/LookaheadGuide.txt, source/encoder/slicetype.cpp: + Borrow slicetype.c from x264 as slicetype.cpp + + This file was taken from x264 source tree circa Dec 4, 2012 with + x264 bug fixes applied from Dec 11th and Jan 8th 2013. But without + taking any of the slice threading changes because we will eventually + use the x265 thread pool and wavefront scheduling. + [1fd9e6a154dc] + + * source/Lib/TLibEncoder/TEncTop.h: + TEncTop: reorder members together, prepare for reorg + [8725f199bac5] + + * source/common/vec/vecprimitives.inc: + vec: add comments on where external functions are found + [5df470615208] + + * source/common/vec/macroblock.inc, + source/common/vec/vecprimitives.inc: + vec: move includes into vecprimitives.inc + + macroblock.inc is included inside an anonymous namespace, and cannot + include headers itself. 
+ [b80c25641411] + +2013-06-09 Min Chen + + * source/Lib/TLibCommon/TComTrQuant.cpp, source/common/macroblock.cpp, + source/common/vec/macroblock.inc, source/test/mbdstharness.cpp, + source/test/mbdstharness.h: + [review] code for DCT4x4 + --- source/Lib/TLibCommon/TComTrQuant.cpp | 6 ++- + source/common/macroblock.cpp | 11 ++++ + source/common/vec/macroblock.inc | 82 + +++++++++++++++++++++++++++++++++ source/test/testbench.cpp | 2 +- + source/test/mbdstharness.cpp | 30 ++++++++++++ + source/test/mbdstharness.h | 1 + 6 files changed, 129 insertions(+), + 3 deletions(-) + [2622ba58e755] + + * source/Lib/TLibCommon/TComTrQuant.cpp, source/common/macroblock.cpp, + source/common/primitives.h, source/common/vec/macroblock.inc, + source/test/mbdstharness.cpp, source/test/mbdstharness.h: + [review] code for DCT8x8 + --- source/Lib/TLibCommon/TComTrQuant.cpp | 3 +- + source/common/macroblock.cpp | 11 ++ source/common/primitives.h | 12 + ++ source/common/vec/macroblock.inc | 275 + +++++++++++++++++++++++++++++++++ source/test/mbdstharness.cpp | 50 + ++++++ source/test/mbdstharness.h | 1 + 6 files changed, 350 + insertions(+), 2 deletions(-) + [d2175819e669] + + * source/Lib/TLibCommon/TComTrQuant.cpp: + replace by const + --- source/Lib/TLibCommon/TComTrQuant.cpp | 12 ++++++------ 1 files + changed, 6 insertions(+), 6 deletions(-) + [2ea982400b90] + + * source/Lib/TLibEncoder/TEncSbac.cpp: + little optimize in TEncSbac::codeLastSignificantXY + --- source/Lib/TLibEncoder/TEncSbac.cpp | 10 ++-------- 1 files + changed, 2 insertions(+), 8 deletions(-) + [145223d77222] + + * source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/TEncEntropy.cpp, + source/Lib/TLibEncoder/TEncEntropy.h, + source/Lib/TLibEncoder/TEncSbac.cpp, source/common/primitives.h, + source/common/vec/macroblock.inc, + source/common/vec/vecprimitives.inc: + optimize TEncSbac::codeCoeffNxN by new function scanNonZeroCoeffs + --- 
source/Lib/TLibCommon/TComRom.cpp | 23 +++++ + source/Lib/TLibCommon/TComRom.h | 2 + + source/Lib/TLibCommon/TypeDef.h | 1 + + source/Lib/TLibEncoder/TEncEntropy.cpp | 12 --- + source/Lib/TLibEncoder/TEncEntropy.h | 1 - + source/Lib/TLibEncoder/TEncSbac.cpp | 59 ++++--------- + source/common/primitives.h | 2 + source/common/vec/macroblock.inc | + 146 ++++++++++++++++++++++++++++++++ + source/common/vec/vecprimitives.inc | 2 + 9 files changed, 194 + insertions(+), 54 deletions(-) + [e200a6011f23] + +2013-06-08 Steve Borho + + * source/common/primitives.h, source/common/x86/asm-primitives.cpp: + primitives: disable EMMS usage on x64 compiles + [d9f7525c4adf] + +2013-06-08 Min Chen + + * source/Lib/TLibEncoder/TEncSearch.cpp: + reduce EMMS when there NO Double operator + --- source/Lib/TLibEncoder/TEncSearch.cpp | 2 +- 1 files changed, 1 + insertions(+), 1 deletions(-) + [f61a08dec7b7] + +2013-06-08 Steve Borho + + * source/x265.cpp: + x265: report elapsed time in final logging output + [264f4853796c] + +2013-06-08 Min Chen + + * source/common/x86/asm-primitives.cpp, source/common/x86/pixel-a.asm: + asm: use assembly SSD routines, where applicable + --- source/common/x86/asm-primitives.cpp | 5 +++++ + source/common/x86/pixel-a.asm | 6 ++++++ 2 files changed, 11 + insertions(+), 0 deletions(-) + [c67c9f5fd4b5] + +2013-06-07 Steve Borho + + * source/test/pixelharness.cpp: + pixelharness: repair the pixel correctness tests + [7710c496c0bb] + + * source/common/x86/asm-primitives.cpp: + Backed out changeset: e040de3904a6 (appears to break some PCs) + [cb5a4206160e] + + * source/encoder/wavefront.cpp, source/encoder/wavefront.h: + wavefront: move CU processing into a CTURow method + [2d7570e3eab0] + + * source/test/testbench.cpp: + testbench: add support for --test NAME argument + + ex: TestBench --test pix + + Only tests pixel primitives + [c5f0c45e6198] + + * source/test/intrapredharness.h, source/test/ipfilterharness.h, + source/test/mbdstharness.h, 
source/test/pixelharness.h, + source/test/testbench.cpp, source/test/testharness.h: + testbench: add test bench names + [c6fe42bb96f1] + + * source/test/testbench.cpp: + testbench: stop testing C prims against themselves. show SIMD names + [a88d84ce9391] + + * source/common/primitives.cpp: + primitives: remove unused variable + [078d513b8ce7] + + * source/common/x86/asm-primitives.cpp: + asm: use assembly SSD routines, where applicable + [e040de3904a6] + + * source/common/x86/CMakeLists.txt: + cmake: add pixel.h to solution, for easy access from MSVC + [88b81e0565f0] + + * source/common/primitives.h: + primitives: note that SSE primitives make no alignment assumptions + [de9847f38e29] + + * source/x265.cpp: + x265: move CPU detection and primitive initialization after file + init + [1047b745f04f] + + * source/common/common.cpp, source/x265.cpp: + x265: report input file data all on one line. Do not repeat input + filename + [42fac954412a] + + * source/input/input.h, source/input/y4m.h, source/input/yuv.h: + input: add getName() method + [27549dbae44b] + + * source/Lib/TLibEncoder/TEncAnalyze.h: + x265: report global bitrate + [8a1ebb2da93c] + + * source/Lib/TLibEncoder/TEncAnalyze.h: + x265: indicate bitrate in frame logging same as other places + [4bc3d0f8cf5f] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.h, + source/Lib/TLibEncoder/TEncSlice.cpp: + Remove SAO_CHROMA_LAMBDA define, assume enabled + [49160bd03b96] + + * source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp: + Remove RDOQ_CHROMA_LAMBDA define, assume enabled + [915c857b3074] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + nit + [5a4710c63060] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + 
TEncSearch: replace another copy loop with a primitive + [8e5ed1f84fe9] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: inline primitive arguments, remove unused var + [850ab8ac45fa] + + * source/Lib/TLibCommon/TComRdCost.cpp, + source/Lib/TLibCommon/TComRdCost.h: + TComRdCost: remove all three getDistPart() + [0f089c76a5c9] + + * source/Lib/TLibCommon/TComRdCost.h, + source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: restore xTZSearchHelp() to its former simplicity + [9bdce4f90c1a] + + * Merged in ggopu/gopu_xhevc (pull request #187) + + TEncodeSearch: Removed getDistPart() replaced with sse Primitives + [20d3411fb757] + +2013-06-07 ggopu + + * source/Lib/TLibCommon/TComRdCost.h, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/vec/sse.inc: + TEncodeSearch: Removed getDistPart() replaced with sse Primitives + [ca1959805ada] + +2013-06-07 Steve Borho + + * source/common/vec/intrapred.inc: + Merged in deepthidevaki/xhevc_deepthid (pull request #185) + + Added IntraAngular modes for chroma + [573411ef9172] + +2013-06-07 Deepthi Devaki + + * source/common/vec/intrapred.inc: + Added bFilter in call for 16x16 for chroma + [ac2aa44b996b] + + * source/common/vec/intrapred.inc: + Merge + [22c9294eb683] + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/common/vec/intrapred.inc, source/test/intrapredharness.cpp: + IntraAngular adapted for Chroma application + [18843443dc21] + +2013-06-07 Min Chen + + * source/common/vec/intrapred.inc: + intrapred: fix bug on VC9-x64, the pextrw instruction very easy make + bug in VS2008 + --- source/common/vec/intrapred.inc | 3 +-- 1 files changed, 1 + insertions(+), 2 deletions(-) + [105c01ee73d9] + +2013-06-07 nandaku2 + + * source/Lib/TLibEncoder/TEncCu.cpp: + Merged in sumalatha/xhevc_sumalatha (pull request #186) + + missed in previous checkin + [772be4365f2d] + +2013-06-07 sumalatha + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/x265.cpp: + missed in previous checkin for CU_STAT_LOGFILE + [0e9f956c7cd5] + 
+2013-06-07 Sumalatha Polureddy + + * source/x265.cpp: + Merged multicoreware/xhevc/default into default (e5a96e958371) + [1b1235269f98] + + * source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp, source/x265.cpp: + Merged multicoreware/xhevc/default into default (f4815a9f6747) + [e5a96e958371] + +2013-06-06 Sumalatha Polureddy + + * Merged multicoreware/xhevc/default into default (4f45fd8e9ea8) + [f4815a9f6747] + + * source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp, source/x265.cpp, + source/x265.h: + Merged multicoreware/xhevc/default into default (6dc2e5b6c1d4) + [4f45fd8e9ea8] + +2013-06-05 sumalatha + + * source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp, source/x265.cpp, + source/x265.h: + Backed out changeset: 683fbbf6818d + [6dc2e5b6c1d4] + + * source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp, source/x265.cpp, + source/x265.h: + added logging to collect statistics related to mode decision + [683fbbf6818d] + +2013-06-07 Deepthi + + * Merge + [f1fd893cab78] + + * source/Lib/TLibEncoder/TEncCu.cpp: + Introducing satd based Intra in Inter: quality slowly climbing up + (0.4 dB less), bitrate equal to RDO-based. + [e654eb6fdb2c] + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h: + CheckRDCostIntrainInter: so we can simplify Intra in Inter without + affecting I-slice decisions. 
+ [8eca9075cfe0] + +2013-06-07 Steve Borho + + * source/x265.cpp: + x265: remove obsolete printf argument + [4ff9a3f7639f] + +2013-06-06 Steve Borho + + * source/Lib/TLibEncoder/TEncAnalyze.h, + source/Lib/TLibEncoder/TEncGOP.cpp, source/x265.cpp: + x265: adapt statistic logging to look more like x264 + [2df6841df3ab] + + * source/x265.cpp: + x265: report final FPS and bitrate as x264 does + [465450a1ffa7] + + * source/PPA/ppaCPUEvents.h, source/common/reference.cpp, + source/common/reference.h: + reference: up to 16 threads used for subpel interpolation + [ef3d597154d9] + + * source/common/reference.cpp, source/common/reference.h: + reference: up to 9 threads used for subpel interpolation + [fa007c214c1f] + + * source/common/reference.cpp: + reference: fix eoln + [83a646b6f94e] + + * source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuv.h, + source/Lib/TLibCommon/TComSlice.cpp, source/common/reference.cpp, + source/common/reference.h: + reference: use thread pool to interpolate references, up to 4 + threads + [299bcbf0b763] + + * source/common/threadpool.cpp, source/common/threadpool.h: + threadpool: add accessor to static var that doesn't increase ref + count + [6cf0b85a4e38] + + * source/common/reference.h: + reference: use intptr_t for pointer offset m_offsetToLuma + [32bcc907c80f] + + * source/common/reference.cpp, source/common/reference.h: + reference: remove redundant variables + [75abe6a7275e] + + * source/common/reference.cpp: + reference: simplify and correct some math + [5a5edf293002] + + * source/common/reference.cpp, source/common/reference.h: + reference: split out compute per horizontal qpel offset + [bfb389d9b26a] + + * source/common/vec/intrapred.inc: + intrapred: allow non-MSVC compilers to build intra 32x32 + [95769be84f67] + + * source/common/reference.cpp, source/common/reference.h: + reference: move temp vars to be class members + + Normally this wouldn't be helpful but since this routine is about to + be threaded, this 
helps work efficiency + [4707e4389160] + + * source/Lib/TLibEncoder/TEncSlice.cpp, source/PPA/ppaCPUEvents.h: + PPA: add event for TEncSlice_encodeSlice + [f0fe1b23b93a] + + * source/Lib/TLibEncoder/TEncCu.cpp, source/PPA/ppaCPUEvents.h, + source/common/reference.cpp, source/encoder/wavefront.cpp: + PPA: tweak events to more easily analyse threading + + GenerateReferencePlanes has improved from 60ms to 56ms in the last 6 + commits (measuring on HD video). Each plane costs 3.7ms + [ecbbfa05934d] + + * source/common/reference.cpp: + reference: directly use full-pel plane from TComPicYuv - do not copy + [b2aa11f00641] + + * source/common/reference.cpp: + reference: reduce redundant calculations + [4311f9eda5c6] + + * source/common/reference.cpp: + reference: reduce redundant calculations + [418ef065cd76] + + * source/common/reference.cpp: + reference: reduce redundant calculations + [efcf8e88705c] + + * source/common/reference.cpp: + reference: reduce redundant calculations + [51ffeea76d4b] + + * source/common/reference.cpp: + reference: unroll plane extensions + [411cb75acef5] + + * source/common/reference.cpp, source/common/reference.h: + reference: combine functions + [831ab4d28f46] + +2013-06-07 Deepthi + + * Merge + [7ea20ea451eb] + + * source/Lib/TLibEncoder/TEncCu.cpp: + Replacing Merge distortion with satd costs. + + However, this badly thrashes quality - I suspect this is due to + lower distortion values from satd for the merge-zero residual mode. + We need to finetune the merge distortion model. + [0d0adf325e0d] + +2013-06-06 Steve Borho + + * source/common/vec/intra-avx.cpp, source/common/vec/intra-sse42.cpp: + re-enable forceinline for intra AVX and SSE42 + [9567c46f3511] + + * Merged in deepthidevaki/xhevc_deepthid (pull request #184) + + intrinsics in IntraAng + [1c7ce772c0b7] + +2013-06-06 Deepthi Devaki + + * source/common/vec/intrapred.inc: + Expanded some of the Macros in 32x32 IntraAng with intention to + improve compile time. 
+ [18f485d4e06f] + + * source/common/vec/intrapred.inc: + Intrinsics added in 32x32 Macros. + [105bb52b4414] + + * source/common/vec/intrapred.inc: + More intrinsics in 16x16 + [eaab40490386] + + * source/common/vec/intrapred.inc: + Intrinsics in MACRO for 16x16 + [dea89dcfa4c8] + +2013-06-06 Deepthi + + * source/Lib/TLibEncoder/TEncSearch.cpp: + Using bits estimate from RDO instead of ME estimate. + + Using the bits estimate from RDO gives 0.5dB loss and 10% bitrate + increase. Using bits estimate from ME gives 30% bitrate increase and + 0.7dB loss. ToDo: replace RDO bits estimate with a carefully + calculated estimate. + [6657cc220ef5] + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h: + Cleanup: remove CalcRDCostInter/Intra + [ae86f626666c] + +2013-06-06 sumalatha + + * source/Lib/TLibEncoder/TEncSearch.cpp: + incremental changes - commit to add logs for rdo, satd cost. + [d021c4d5e43f] + +2013-06-06 Deepthi + + * source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp: + CU_STAT patch from Aarthi/Sumalatha + [14aa8db979f8] + + * source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp: + Merge + [aa89281d6517] + + * source/Lib/TLibEncoder/TEncCu.cpp: + Merge + [b77d5500205d] + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncSearch.cpp: + Modeling SATD distortion + [5158e6a3466e] + +2013-06-05 Steve Borho + + * source/common/vec/intrapred.inc: + intrapred: move another file static init into setup func + [3ec88cd25ae6] + + * source/common/reference.cpp: + reference: nit + [9f64d9a6a9f8] + + * source/common/reference.cpp: + reference: use a single temp buffer + [287576ca5ad2] + + * source/common/reference.cpp: + reference: remove unused assignment + [650800e42016] + + * source/common/reference.cpp: + reference: reorder plane generations + [9c8caf38d14f] + + * source/Lib/TLibCommon/TComPicYuv.h, source/common/reference.cpp, + source/common/reference.h, 
source/encoder/motion.cpp, + source/encoder/motion.h: + reference: use m_ prefix for member variables + [15ea880c0ffd] + + * source/encoder/motion.cpp, source/encoder/motion.h: + motion: drop chroma fenc plane pointers, unused + [b13ccd8af6ea] + + * source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuv.h, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/CMakeLists.txt, + source/common/reference.cpp, source/common/reference.h, + source/encoder/motion.h: + reference: move reference plane generation into new common/ files + [2b62bd9493e0] + +2013-06-06 ShinYee Chung + + * source/test/pixelharness.cpp: + pixelharness: Fix missing header for SHRT_MAX. + [ec399d4f91f0] + + * source/common/pixel.cpp, source/common/vec/pixel.inc: + vec: Fix macro paste problem due to invalid tokens. + + error: pasting "." and "sad" does not give a valid preprocessing + token error: pasting "sse" and "<" does not give a valid + preprocessing token + [f0bee4bc5ab8] + +2013-06-05 Steve Borho + + * source/common/vec/macroblock.inc: + macroblock: prevent VC9 x64 from building SSE4 intrinsic butterfly + primitives + + Specifically the butterfly32 crashes and butterfly8 fails unit tests + [cc3701060a60] + + * source/common/common.h: + common: disable logging by default + [fb8d4ef3eed1] + + * source/x265.cpp: + remove redundant fclose() + [1192519fe4cc] + + * source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp, source/common/common.h, + source/x265.cpp: + rename LOGGING to CU_STAT_LOGFILE + [436ebbab3ba2] + + * source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp, source/common/common.h, + source/x265.h: + move LOGGING define to common.h + [f2ea9f497340] + + * source/Lib/TLibCommon/TComRdCost.cpp: + TComRdCost: use its own helper function consistently + [4b036d5114f8] + + * source/Lib/TLibCommon/TComRdCost.h: + TComRdCost: reorder methods, no change + [b076f3dd0351] + + * source/Lib/TLibCommon/TComRdCost.h: + TComRdCost: remove 
two unused member variables + + They were moved to DistParam and never removed from here + [c5008c610656] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h: + TComDataCU: remove unused slow bit counting methods + + These weren't being used at all; not sure if they were used in dead + code paths we've pruned or if all of its users were moved to + TComRdCost + [971fe0131bd5] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: remove old commented code, can no longer be used + [cda2e64d596c] + + * source/Lib/TLibCommon/TComRdCost.cpp, + source/Lib/TLibCommon/TComRdCost.h: + TComRdCost: remove getCost(x, y), getBits(x, y), and + xGetComponentBits() + [0e4510344de3] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: replace TComRdCost::getCost(x,y) with BitCost::mvcost + [CHANGE] + + Changes outputs of --me 4 (HM orig) slightly. Does not affect other + ME modes, which were already using mvcost() + [1058053226f4] + + * source/common/vec/pixel.inc, source/common/vec/sse.inc, + source/test/pixelharness.cpp: + Backed out changeset: ce41a92cec25 + [0720ed17000a] + + * source/common/vec/intrapred.inc: + intrapred: remove const from file static vars + + The compiler was ignoring my const_cast<> and discarding the + assignments. My guess is that it had inlined the constants (known to + be zero) into the routines + [6c44353bb63f] + + * source/common/vec/intrapred.inc: + intra: disable 32x32 temporarily, to save MSVC build times + + Once the key macros have been made pure-intrinsic, it should be safe + to build these again with MSVC. 
+ [1ae62e55635c] + + * source/encoder/wavefront.cpp, source/encoder/wavefront.h: + wavefront: uncrustify + [cec743da6f0e] + + * source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp: + TEncCu, TEncSlice: uncrustify + [dcad39643221] + + * source/common/vec/intrapred.inc: + intrapred: move static variable initialization to init time + + Allow CPU detection to avoid initializing static vars that might use + illegal instructions. + [b7fa5ee0ff52] + +2013-06-05 ggopu + + * source/common/vec/pixel.inc, source/common/vec/sse.inc, + source/test/pixelharness.cpp: + primitives: implementation of sse_ss, sse_sp, sse_pp : Comment out + crashing part + [ce41a92cec25] + + * source/test/pixelharness.cpp, source/test/pixelharness.h: + primitives: implementation of sse_ss, sse_sp, sse_pp : Quick fix + [fa56753ac713] + + * source/common/pixel.cpp, source/common/primitives.h, + source/common/vec/CMakeLists.txt, source/common/vec/pixel.inc, + source/common/vec/pixel16.inc, source/common/vec/pixel8.inc, + source/common/vec/sse.inc, source/test/pixelharness.cpp: + primitives: implementation of sse_ss, sse_sp, sse_pp + [7e9174316989] + +2013-06-05 Deepthi + + * Merge + [2cf7f7189f52] + + * source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp, source/x265.cpp, + source/x265.h: + Logging statistics showing CU mode types and partition sizes - patch + from Aarthi/Sumalatha. 
+ [5679059e8c2d] + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h: + Do not repeat encRes for merge mode + [ed6c51f1274e] + +2013-06-05 Steve Borho + + * .hgtags: + Added tag LASTKNOWNGOOD for changeset 681eabf8a086 + [ce241499c41d] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: nits, no change + [30fc780867a3] + +2013-06-04 Steve Borho + + * source/encoder/wavefront.cpp: + wavefront: re-order initializations to make GCC happy + [86059d10c1da] + + * source/Lib/TLibEncoder/TEncCavlc.h: + TEncCavlc: remove unused formal parameter names, avoid GCC warnings + [189c561de3cd] + + * source/encoder/wavefront.cpp: + wavefront: remove clumsy logic where codeRow could be different for + goOnSBac + [2b27bd6e3c0e] + + * source/encoder/encoder.cpp: + encoder: mark WPP disabled if thread pool has only one thread + + Either the user requested --threads 1, or auto-detect only found 1 + CPU core + [051e5dab1985] + + * source/Lib/TLibEncoder/TEncSlice.cpp: + nit + [fd22c2f571ce] + + * source/Lib/TLibEncoder/TEncSlice.cpp: + TEncSlice: remove include of omp.h + [f3a40c7d2ec5] + + * source/Lib/TLibEncoder/TEncSlice.cpp, source/encoder/wavefront.cpp, + source/encoder/wavefront.h: + wavefront: further cleanups + [70edba386b16] + + * source/encoder/wavefront.cpp: + wavefront: remove unnecessary re-alloc of buffer Sbac coders + [f559d0f9c927] + + * source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.h, + source/Lib/TLibEncoder/TEncTop.h: + TEncTop: completely remove singleton TComRdCost, force use of + EncodeFrame + [6134af9537f2] + + * source/x265.cpp: + x265: remove old comment + [9cbadcbf9af5] + + * source/encoder/wavefront.cpp: + wavefront: move flush and deletions to destroy method + [2c6debbdea1d] + + * source/encoder/wavefront.cpp: + wavefront: eliminate redundant assignments + [8a6c3172ad9e] + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/encoder/wavefront.cpp: + TEncCu: CU should use row 
local m_cRdCost so lambda is allowed to + diverge + [cb1c0f4fe4d2] + + * source/Lib/TLibEncoder/TEncSlice.cpp, source/encoder/wavefront.h: + distribute lambda and chroma weights to thread m_cRdCost (fixes non- + determinism) + + it seems --me 4 and --wpp is still non-deterministic, just not + nearly as much. + [8ad5d8bc925c] + +2013-06-03 Steve Borho + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.h, + source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h, source/encoder/encoder.cpp, + source/encoder/wavefront.cpp, source/encoder/wavefront.h: + wavefront: reorg per-thread/row data into a structure + [5103e97b516e] + +2013-06-04 Steve Borho + + * source/Lib/TLibCommon/TypeDef.h: + Remove unused SEQUENCE_LEVEL_LOSSLESS + [31b2998e87e2] + + * source/Lib/TLibCommon/TComRdCost.cpp, + source/Lib/TLibCommon/TComRdCost.h: + TComRdCost: remove unused calcRdCost64 + [734d93ef8fd8] + + * source/Lib/TLibCommon/TComRdCost.cpp, + source/Lib/TLibCommon/TComRdCost.h: + TComRdCost: remove unused calcRdCost + [d46bc99052f6] + + * source/common/vec/CMakeLists.txt: + cmake: cleanup vec file, no real changes + [2b0a5cfe46f2] + +2013-06-05 ShinYee Chung + + * source/Lib/TLibEncoder/TEncSearch.cpp: + search: Fix compile warnings. 
+ [31e1d53e0aad] + +2013-06-04 Steve Borho + + * source/common/vec/intra-avx.cpp, source/common/vec/intra-sse42.cpp: + disable forceinline, compiles are taking multiple hours + [9f27fcbe7660] + + * source/common/pixel.cpp: + pixel: add C primitives for int/short conversion functions + [5935a4c31de9] + + * source/common/vec/intrapred.inc, + source/common/vec/vecprimitives.inc: + vec: more #include pruning to avoid unnecessary rebuilds + [637f3b3a1f8a] + + * source/common/vec/intrapred.inc, + source/common/vec/vecprimitives.inc: + move g_aucConvertToBit[] to intrapred.inc + [9df6d9454529] + + * source/Lib/TLibCommon/TComRdCost.cpp, + source/Lib/TLibCommon/TComRdCost.h: + TComRdCost: remove eDFunc argument from getDistPart() methods + + Soon getDistPart() can be replaced entirely with SSE primitives. + [3f286f363159] + + * source/Lib/TLibCommon/TComRdCost.h: + TComRdCost: remove unused macro and method + [90f2d655fc4f] + + * source/Lib/TLibEncoder/TEncCu.cpp: + TEncCu: fix eoln + [cbcce0610e27] + +2013-06-04 Deepthi + + * source/Lib/TLibEncoder/TEncSearch.cpp: + Merge + [bdec57941cef] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + Bug fix in memcpy + [34cafdcafffe] + +2013-06-04 Steve Borho + + * source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/vec/pixel.inc, + source/common/vec/pixel16.inc, source/common/vec/pixel8.inc: + Merged in ggopu/gopu_xhevc (pull request #182) + + primitives: SSE 8 & 16 bit implementation - done for all and + Replaced getSADPart with Primitives.sad function + [537e783eab5b] + +2013-06-04 Mandar Gurav + + * source/Lib/TLibEncoder/TEncSearch.cpp: + Replaced getDistPart() with SadBuf() and removed CALCRDCOST_SAD call + from TEncodeSearch + [0d22b87a3bd4] + + * source/Lib/TLibEncoder/TEncCu.cpp: + Replaced getSADPart with Primitives.sad function + [23778eeabe2f] + + * source/common/vec/pixel16.inc, source/common/vec/pixel8.inc: + primitives: SSE 16 bit - done for all + [d8065a5600fb] + + * 
source/common/vec/pixel.inc, source/common/vec/pixel8.inc: + primitives: SSE 8 bit implementation - done for all + [4c3b34bfd0a3] + +2013-06-04 Deepthi + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/common/vec/intra- + avx.cpp, source/common/vec/intra-sse42.cpp, source/common/vec + /ipfilter-avx.cpp, source/common/vec/ipfilter-avx2.cpp, + source/common/vec/ipfilter-sse2.cpp, source/common/vec/ipfilter- + sse3.cpp, source/common/vec/ipfilter-sse41.cpp, source/common/vec + /ipfilter-sse42.cpp, source/common/vec/ipfilter-ssse3.cpp, + source/common/vec/pixel-avx.cpp, source/common/vec/pixel-avx2.cpp, + source/common/vec/pixel-sse2.cpp, source/common/vec/pixel-sse3.cpp, + source/common/vec/pixel-sse41.cpp, source/common/vec/pixel- + sse42.cpp, source/common/vec/pixel-ssse3.cpp, + source/common/vec/utils.h: + Merge + [da81281bbfb6] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + Replacing with memcpy. + [c6948a0ed131] + +2013-06-04 Deepthi Devaki + + * source/common/vec/intrapred.inc: + Completed & Uncrustified Intra Angular 32x32 + [f42c0dd3b2c3] + + * source/common/vec/intra-avx.cpp, source/common/vec/intra-sse42.cpp, + source/test/intrapredharness.cpp: + Fix for a previous wrong commit, that disabled forceinline + [be5a9b49fdcb] + + * source/Lib/TLibCommon/AccessUnit.h, + source/Lib/TLibCommon/ContextModel.cpp, + source/Lib/TLibCommon/ContextModel3DBuffer.h, + source/Lib/TLibCommon/NAL.h, source/Lib/TLibCommon/SEI.h, + source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComPattern.h, + source/Lib/TLibCommon/TComPic.cpp, + source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuv.h, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibCommon/TComRdCost.cpp, + source/Lib/TLibCommon/TComRdCost.h, + source/Lib/TLibCommon/TComRom.cpp, + source/Lib/TLibCommon/TComTrQuant.cpp, + 
source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TComYuv.h, + source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/AnnexBwrite.h, + source/Lib/TLibEncoder/NALwrite.h, + source/Lib/TLibEncoder/SEIwrite.h, + source/Lib/TLibEncoder/TEncCavlc.cpp, + source/Lib/TLibEncoder/TEncCfg.h, source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncCu.h, source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h, source/Lib/TLibEncoder/TEncSbac.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.h, + source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h, source/PPA/ppa.h, + source/VectorClass/vectori128.h, source/common/TShortYUV.cpp, + source/common/TShortYUV.h, source/common/common.cpp, + source/common/common.h, source/common/ipfilter.cpp, + source/common/md5.cpp, source/common/md5.h, source/common/pixel.cpp, + source/common/primitives.cpp, source/common/primitives.h, + source/common/threadpool.cpp, source/common/vec/CMakeLists.txt, + source/common/vec/blockcopy.inc, source/common/vec/intra-avx.cpp, + source/common/vec/intra-avx2.cpp, source/common/vec/intra-sse2.cpp, + source/common/vec/intra-sse3.cpp, source/common/vec/intra-sse41.cpp, + source/common/vec/intra-sse42.cpp, source/common/vec/intra- + ssse3.cpp, source/common/vec/intrapred.inc, source/common/vec + /ipfilter-avx.cpp, source/common/vec/ipfilter-avx2.cpp, + source/common/vec/ipfilter-sse2.cpp, source/common/vec/ipfilter- + sse3.cpp, source/common/vec/ipfilter-sse41.cpp, source/common/vec + /ipfilter-sse42.cpp, source/common/vec/ipfilter-ssse3.cpp, + source/common/vec/ipfilter.inc, source/common/vec/pixel-avx.cpp, + source/common/vec/pixel-avx2.cpp, source/common/vec/pixel-sse2.cpp, + source/common/vec/pixel-sse3.cpp, source/common/vec/pixel-sse41.cpp, + source/common/vec/pixel-sse42.cpp, source/common/vec/pixel- + 
ssse3.cpp, source/common/vec/pixel.inc, + source/common/vec/pixel16.inc, source/common/vec/pixel8.inc, + source/common/vec/utils.h, source/common/vec/vec-primitives.cpp, + source/common/vec/vecprimitives.inc, source/encoder/bitcost.cpp, + source/encoder/bitcost.h, source/encoder/encoder.cpp, + source/encoder/encoder.h, source/encoder/motion.cpp, + source/input/y4m.cpp, source/input/yuv.cpp, + source/output/output.cpp, source/output/output.h, + source/output/y4m.cpp, source/output/yuv.cpp, source/output/yuv.h, + source/test/intrapredharness.cpp, source/test/ipfilterharness.cpp, + source/test/pixelharness.cpp, source/test/testbench.cpp, + source/test/testharness.h, source/tools/dr_psnr/PsnrCalculator.h, + source/tools/dr_psnr/SSIMCalculator.h, source/x265.cpp, + source/x265.h, source/x265opts.h: + Backed out changeset: 471c6f016b40 + [d790920f4c32] + + * source/Lib/TLibCommon/AccessUnit.h, + source/Lib/TLibCommon/ContextModel.cpp, + source/Lib/TLibCommon/ContextModel3DBuffer.h, + source/Lib/TLibCommon/NAL.h, source/Lib/TLibCommon/SEI.h, + source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComPattern.h, + source/Lib/TLibCommon/TComPic.cpp, + source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuv.h, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibCommon/TComRdCost.cpp, + source/Lib/TLibCommon/TComRdCost.h, + source/Lib/TLibCommon/TComRom.cpp, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TComYuv.h, + source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/AnnexBwrite.h, + source/Lib/TLibEncoder/NALwrite.h, + source/Lib/TLibEncoder/SEIwrite.h, + source/Lib/TLibEncoder/TEncCavlc.cpp, + source/Lib/TLibEncoder/TEncCfg.h, source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncCu.h, 
source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h, source/Lib/TLibEncoder/TEncSbac.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.h, + source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h, source/PPA/ppa.h, + source/VectorClass/vectori128.h, source/common/TShortYUV.cpp, + source/common/TShortYUV.h, source/common/common.cpp, + source/common/common.h, source/common/ipfilter.cpp, + source/common/md5.cpp, source/common/md5.h, source/common/pixel.cpp, + source/common/primitives.cpp, source/common/primitives.h, + source/common/threadpool.cpp, source/common/vec/CMakeLists.txt, + source/common/vec/blockcopy.inc, source/common/vec/intra-avx.cpp, + source/common/vec/intra-avx2.cpp, source/common/vec/intra-sse2.cpp, + source/common/vec/intra-sse3.cpp, source/common/vec/intra-sse41.cpp, + source/common/vec/intra-sse42.cpp, source/common/vec/intra- + ssse3.cpp, source/common/vec/intrapred.inc, source/common/vec + /ipfilter-avx.cpp, source/common/vec/ipfilter-avx2.cpp, + source/common/vec/ipfilter-sse2.cpp, source/common/vec/ipfilter- + sse3.cpp, source/common/vec/ipfilter-sse41.cpp, source/common/vec + /ipfilter-sse42.cpp, source/common/vec/ipfilter-ssse3.cpp, + source/common/vec/ipfilter.inc, source/common/vec/pixel-avx.cpp, + source/common/vec/pixel-avx2.cpp, source/common/vec/pixel-sse2.cpp, + source/common/vec/pixel-sse3.cpp, source/common/vec/pixel-sse41.cpp, + source/common/vec/pixel-sse42.cpp, source/common/vec/pixel- + ssse3.cpp, source/common/vec/pixel.inc, + source/common/vec/pixel16.inc, source/common/vec/pixel8.inc, + source/common/vec/utils.h, source/common/vec/vec-primitives.cpp, + source/common/vec/vecprimitives.inc, source/encoder/bitcost.cpp, + source/encoder/bitcost.h, source/encoder/encoder.cpp, + source/encoder/encoder.h, source/encoder/motion.cpp, + source/input/y4m.cpp, source/input/yuv.cpp, + 
source/output/output.cpp, source/output/output.h, + source/output/y4m.cpp, source/output/yuv.cpp, source/output/yuv.h, + source/test/intrapredharness.cpp, source/test/ipfilterharness.cpp, + source/test/pixelharness.cpp, source/test/testbench.cpp, + source/test/testharness.h, source/tools/dr_psnr/PsnrCalculator.h, + source/tools/dr_psnr/SSIMCalculator.h, source/x265.cpp, + source/x265.h, source/x265opts.h: + Backed out merge changeset: 899d27ae3960 + + Backed out merge revision to its first parent (ffc3e2a59068) + [471c6f016b40] + + * source/common/vec/intra-avx.cpp, source/common/vec/intra-sse42.cpp, + source/common/vec/intrapred.inc, source/test/intrapredharness.cpp: + Merge + [899d27ae3960] + + * source/common/vec/intrapred.inc: + Partial vector implementation of IntrPredAng 32x32. + [ffc3e2a59068] + +2013-06-03 Steve Borho + + * source/Lib/TLibCommon/AccessUnit.h, + source/Lib/TLibCommon/ContextModel.cpp, + source/Lib/TLibCommon/ContextModel3DBuffer.h, + source/Lib/TLibCommon/NAL.h, source/Lib/TLibCommon/SEI.h, + source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComPattern.h, + source/Lib/TLibCommon/TComPic.cpp, + source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuv.h, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibCommon/TComRdCost.cpp, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TComYuv.h, + source/Lib/TLibEncoder/AnnexBwrite.h, + source/Lib/TLibEncoder/NALwrite.h, + source/Lib/TLibEncoder/SEIwrite.h, + source/Lib/TLibEncoder/TEncCavlc.cpp, + source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, + source/Lib/TLibEncoder/TEncTop.h, source/PPA/ppa.h, + source/common/TShortYUV.cpp, source/common/TShortYUV.h, + source/common/common.cpp, 
source/common/common.h, + source/common/ipfilter.cpp, source/common/md5.cpp, + source/common/md5.h, source/common/pixel.cpp, + source/common/primitives.cpp, source/common/primitives.h, + source/common/threadpool.cpp, source/common/vec/blockcopy.inc, + source/common/vec/intra-avx.cpp, source/common/vec/intra-sse42.cpp, + source/common/vec/intrapred.inc, source/common/vec/ipfilter-avx.cpp, + source/common/vec/ipfilter-avx2.cpp, source/common/vec/ipfilter- + sse2.cpp, source/common/vec/ipfilter-sse3.cpp, source/common/vec + /ipfilter-sse41.cpp, source/common/vec/ipfilter-sse42.cpp, + source/common/vec/ipfilter-ssse3.cpp, + source/common/vec/ipfilter.inc, source/common/vec/pixel-avx.cpp, + source/common/vec/pixel-avx2.cpp, source/common/vec/pixel-sse2.cpp, + source/common/vec/pixel-sse3.cpp, source/common/vec/pixel-sse41.cpp, + source/common/vec/pixel-sse42.cpp, source/common/vec/pixel- + ssse3.cpp, source/common/vec/pixel.inc, + source/common/vec/pixel16.inc, source/common/vec/pixel8.inc, + source/common/vec/utils.h, source/encoder/bitcost.cpp, + source/encoder/bitcost.h, source/encoder/encoder.cpp, + source/encoder/encoder.h, source/encoder/motion.cpp, + source/input/y4m.cpp, source/input/yuv.cpp, + source/output/output.cpp, source/output/output.h, + source/output/y4m.cpp, source/output/yuv.cpp, source/output/yuv.h, + source/test/intrapredharness.cpp, source/test/ipfilterharness.cpp, + source/test/pixelharness.cpp, source/test/testbench.cpp, + source/test/testharness.h, source/tools/dr_psnr/PsnrCalculator.h, + source/tools/dr_psnr/SSIMCalculator.h, source/x265.cpp, + source/x265.h, source/x265opts.h: + uncrustify: catch up with several weeks of drift + [b68cb1eaef6d] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + Merge + [a35eae69f1a8] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h, + source/Lib/TLibEncoder/TEncSlice.h: + white-space nits + [b2327bb6131d] + + * source/Lib/TLibEncoder/TEncCu.h: + TEncCu: nits + [f56bc8f415de] + + * 
source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, + source/Lib/TLibEncoder/TEncTop.cpp: + TEncSearch: reduce argument count for TEncSearch::Init() (it can + read) + [c163638bc616] + + * source/Lib/TLibEncoder/TEncCfg.h: + TEncCfg: add accessors for all ME data + [227956601381] + + * source/Lib/TLibEncoder/TEncTop.h: + TEncTop: white-space nits + [3a35f88c6dc4] + + * source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h: + TEncTop: remove xInitPPSforTiles + [ab72ab85c51d] + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + TEncSearch: remove xTZ8PointSquareSearch + [8ff79cf0335e] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSlice: more white-space nits + [ade13761b5d4] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: fixup commit + [d4e1b147c8aa] + + * source/Lib/TLibCommon/TComRdCost.h, source/Lib/TLibCommon/TypeDef.h: + Move DFunc enum from TypeDef.h to TComRdCost.h + [42efb06e0617] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: add a note for future optimization + [5bfad71d2e69] + + * source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + Remove HHI_RQT_INTRA_SPEEDUP, HHI_RQT_INTRA_SPEEDUP_MOD. 
Assume + current values + [9a12626cc6cc] + + * source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/TEncSearch.cpp: + Remove FAST_UDI_USE_MPM, always enabled + [7893e9388d61] + + * source/Lib/TLibCommon/TypeDef.h, source/Lib/TLibEncoder/TEncTop.cpp: + Remove WRITE_BACK define, always enabled + [e2674be0bcce] + + * source/Lib/TLibCommon/TypeDef.h, source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncTop.cpp: + Remove AUTO_INTER_RPS define, always enabled + [d74b6ff8128a] + + * source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h: + TEncTop: remove redundant m_uiNumSubstreams + [dce2e6e926e4] + + * source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h: + TEncTop: prep cleanup, remove unused m_cRDGoOnSbacCoder + [75d7c30ceb8b] + +2013-06-04 Deepthi + + * source/Lib/TLibEncoder/TEncSearch.cpp: + xAddSymbolBitsInter: part replacement of entropy estimate + [7e4e05cc36e1] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + Replacing zeroResidual mode bit estimate + [1a1185fbe376] + +2013-06-03 Deepthi + + * source/Lib/TLibEncoder/TEncSearch.cpp: + Replacing entropy estimate for Skip. 
+ [7ad86922e1a9] + +2013-06-03 Steve Borho + + * source/Lib/TLibEncoder/TEncSbac.h, + source/Lib/TLibEncoder/TEncSlice.cpp: + TEncSlice: more white-space nits + [4b42318dec1c] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: white-space nits + [29258a2b78e8] + +2013-06-03 Min Chen + + * source/common/vec/intrapred.inc: + [review] more performance implement on predIntraPlanar4_sse4 + --- source/common/vec/intrapred.inc | 48 + ++++++++++++++++++++------------------ 1 files changed, 25 + insertions(+), 23 deletions(-) + [65305cf315ae] + + * source/common/vec/intrapred.inc: + [review] instrinsic predIntraPlanar64_sse4 + --- source/common/vec/intrapred.inc | 126 + +++++++++++++++++++++++++++++++++++++-- 1 files changed, 121 + insertions(+), 5 deletions(-) + [f5aef2183b11] + + * source/common/vec/intrapred.inc, source/test/intrapredharness.cpp: + [review] instrinsic predIntraPlanar32_sse4 + --- source/common/vec/intrapred.inc | 82 + ++++++++++++++++++++++++++++++++++++++ + source/test/intrapredharness.cpp | 6 +- 2 files changed, 85 + insertions(+), 3 deletions(-) + [b1be1b00a939] + + * source/common/vec/intrapred.inc: + [review] instrinsic predIntraPlanar16_sse4 + --- source/common/vec/intrapred.inc | 97 + ++++++++++++--------------------------- 1 files changed, 30 + insertions(+), 67 deletions(-) + [58046661809b] + + * source/common/vec/intrapred.inc: + [review] instrinsic predIntraPlanar16_sse4, performance + --- source/common/vec/intrapred.inc | 112 + ++++++++++++++++++++++++++++++++++++++- 1 files changed, 110 + insertions(+), 2 deletions(-) + [3265a3e7a60f] + + * source/common/vec/intrapred.inc: + [review] instrinsic predIntraPlanar8_sse4 + --- source/common/vec/intrapred.inc | 79 + ++++++++++++++++++++++++++++++++++++++- 1 files changed, 77 + insertions(+), 2 deletions(-) + [5c6fca239875] + +2013-06-03 Steve Borho + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TencSearch: fix bit counting + [f96db0f0e705] + + * source/common/vec/pixel8.inc: + pixel: use 
real ints in vector code + [a22994d345a4] + + * source/Lib/TLibCommon/TComRdCost.cpp, + source/Lib/TLibCommon/TComRdCost.h, + source/Lib/TLibEncoder/TEncSearch.cpp: + backout changes to TComRdCost, this file cannot use primitives. + [fd2d0932ed34] + + * Merged in ggopu/gopu_xhevc (pull request #180) + + * m_pcRdCost->getBits() replaced with BitCost.mvcost + [609ae722f590] + +2013-06-03 ggopu + + * source/Lib/TLibCommon/TComRdCost.cpp, + source/Lib/TLibCommon/TComRdCost.h, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/pixel.cpp, + source/common/primitives.h, source/common/vec/pixel.inc, + source/common/vec/pixel8.inc, source/test/pixelharness.cpp: + * m_pcRdCost->getBits() replaced with BitCost.mvcost + * Fixed other code blocks calling non-primitive SAD functions (crashes + at sad8 - for load_a whereas works with just load) + * Primitives for SSE functions + [dedd1f39ae61] + +2013-06-03 Steve Borho + + * Merged in deepthidevaki/xhevc_deepthid (pull request #179) + + Enabled Intra Angular 16x16 in testbench + [ae67119947c0] + +2013-06-03 Deepthi Devaki Akkoorath + + * Merged multicoreware/xhevc into default + [b6298df4010a] + +2013-06-03 Deepthi Devaki + + * source/test/intrapredharness.cpp: + Enabled Intra Angular 16x16 in testbench + [8e85d566d172] + +2013-06-03 Deepthi + + * Merge + [e521c7751844] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + FMD - Bug fix + [06dc1c8c186f] + + * source/Lib/TLibEncoder/TEncCu.cpp: + Making RDO optional + [ad3f6e8fd032] + +2013-06-03 Steve Borho + + * source/common/vec/CMakeLists.txt: + cmake: add missing ipfilter-avx.cpp + [681eabf8a086] + + * source/common/vec/CMakeLists.txt: + cmake: add new foo-avx.cpp files to -mavx properties list + [ce989de34742] + + * source/common/vec/intrapred.inc: + intrapred: add include for smmintrin.h + [755d91c1e921] + + * source/common/vec/vec-primitives.cpp: + vec: allow VC9 and VC10 to use SSE4 primitives + [ffe0f4fb6980] + + * source/VectorClass/vectori128.h, 
source/common/vec/intra-avx.cpp, + source/common/vec/intra-sse42.cpp: + vec: enable __forceinline for higher arches of intra primitives + [b4ed502f10e5] + + * source/common/vec/CMakeLists.txt, source/common/vec/ipfilter- + avx.cpp, source/common/vec/ipfilter-avx2.cpp, source/common/vec + /ipfilter-sse2.cpp, source/common/vec/ipfilter-sse3.cpp, + source/common/vec/ipfilter-sse41.cpp, source/common/vec/ipfilter- + sse42.cpp, source/common/vec/ipfilter-ssse3.cpp, + source/common/vec/ipfilter.inc, source/common/vec/vecprimitives.inc: + vec: split ipfilter primitives into their own C++ files + [ecaeb658c41f] + + * source/common/vec/CMakeLists.txt, source/common/vec/intra-avx.cpp, + source/common/vec/intra-avx2.cpp, source/common/vec/intra-sse2.cpp, + source/common/vec/intra-sse3.cpp, source/common/vec/intra-sse41.cpp, + source/common/vec/intra-sse42.cpp, source/common/vec/intra- + ssse3.cpp, source/common/vec/intrapred.inc, + source/common/vec/vecprimitives.inc: + vec: split intra primitives into their own C++ files + [8a3b623143e7] + + * source/common/vec/CMakeLists.txt, source/common/vec/pixel-avx.cpp, + source/common/vec/pixel-avx2.cpp, source/common/vec/pixel-sse2.cpp, + source/common/vec/pixel-sse3.cpp, source/common/vec/pixel-sse41.cpp, + source/common/vec/pixel-sse42.cpp, source/common/vec/pixel- + ssse3.cpp, source/common/vec/pixel.inc, + source/common/vec/pixel16.inc, source/common/vec/utils.h, + source/common/vec/vecprimitives.inc: + vec: split pixel primitives into their own C++ files + [d7b8280f744a] + +2013-06-03 ShinYee Chung + + * source/x265.cpp: + cmd: Fix segfault when an unrecognized option is given on the + command line. 
+ [d4363da9df37] + +2013-06-02 Min Chen + + * source/common/vec/intrapred.inc: + [review] instrinsic predIntraPlanar4_sse4 + --- source/common/vec/intrapred.inc | 59 + +++++++++++++++++++++++++++++++++++++++ 1 files changed, 59 + insertions(+), 0 deletions(-) + [598fa5788632] + + * source/common/vec/intrapred.inc: + intrapred: optimize by func_ptr in predIntraPlanar + --- source/common/vec/intrapred.inc | 59 + ++++++++++++++++++-------------------- 1 files changed, 28 + insertions(+), 31 deletions(-) + [9737e9d6ba9e] + + * source/common/vec/CMakeLists.txt: + Enable SSE4 on VC9 + --- source/common/vec/CMakeLists.txt | 10 +++------- 1 files changed, + 3 insertions(+), 7 deletions(-) + [ddfcdb3e6815] + + * source/common/vec/intrapred.inc: + intrapred: simple predIntraPlanar4 by macro + --- source/common/vec/intrapred.inc | 49 + ++++++++++++--------------------------- 1 files changed, 15 + insertions(+), 34 deletions(-) + [43dc6a7f95cc] + + * source/common/vec/intrapred.inc: + miss some authors + --- source/common/vec/intrapred.inc | 3 +++ 1 files changed, 3 + insertions(+), 0 deletions(-) + [f17187830a01] + +2013-06-01 Steve Borho + + * source/x265.cpp: + x265: use output frame counts for progress reports + + This actually makes them less accurate in the short term because of + the GOP cadence of the encode function. but it is more correct from + a code logic point of view. + [0abd8ed72c32] + + * source/encoder/encoder.cpp: + encoder: pull recon images from queue even when not requested + + When the app wasn't requesting recon images, they were queueing + indefinitely in m_cListRecQueue (not quite as bad as it seems, it + only queues pointers and the buffers are recycled). And it's was + always returning 0 for iNumEncoded, which was bad. 
+ [e93dce2aaf48] + + * source/encoder/motion.cpp, source/encoder/motion.h: + motion: rename StarSearch to StarPatternSearch, make inline + [c7d3dbce14c8] + + * source/encoder/motion.cpp: + motion: fixup comments, remove tabs, simplify some macros + [016c61125c55] + + * source/Lib/TLibCommon/TComRom.h: + TComRom: remove unused extern + [ca6ac5ea20bd] + + * source/encoder/motion.h: + motion: use sadStride in helper functions + [c70e2aff8caa] + +2013-06-01 Mandar Gurav + + * source/encoder/motion.cpp: + motion: use sad_x4 in star Search for dist = 1. + [20a701e4a753] + + * source/common/common.h: + motion: ME cycle count- Print lld + [4413954b9a14] + + * source/encoder/motion.cpp, source/encoder/motion.h: + motion: TwoPoint search manually inlined. Now, exe_time_SUBSAMPLE_1 + < exe_time_SUBSAMPLE_0. + [8d0a2281093e] + + * source/common/common.h: + motion: Print Total Cycle count also. Avg. Cycle count doesn't give + exact idea of performance - since num_calls_SUBSAMPLE_0 != + num_calls_SUBSAMPLE_1. + [5e5988fad642] + + * source/encoder/motion.cpp, source/encoder/motion.h: + motion: Replace "stride << subsample" with "sadStride". + [2a19d8945ddd] + + * source/Lib/TLibCommon/TComRom.h: + Merge conflict: Copy files directly from xhevc main repo. part 4 + [e902484772b1] + + * source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibEncoder/TEncAnalyze.h: + Merge conflict: Copy files directly from xhevc main repo. 
part 3 + [65463235027f] + +2013-05-31 Mandar Gurav + + * cfg/encoder_I_15P.cfg, cfg/encoder_all_I.cfg, cfg/per- + sequence/BQMall.cfg, cfg/per-sequence/BQSquare.cfg, cfg/per- + sequence/BQTerrace.cfg, cfg/per-sequence/BasketballDrill.cfg, cfg + /per-sequence/BasketballDrillText.cfg, cfg/per- + sequence/BasketballDrive.cfg, cfg/per-sequence/BasketballPass.cfg, + cfg/per-sequence/BlowingBubbles.cfg, cfg/per-sequence/Cactus.cfg, + cfg/per-sequence/ChinaSpeed.cfg, cfg/per-sequence/FourPeople.cfg, + cfg/per-sequence/Johnny.cfg, cfg/per-sequence/Kimono.cfg, cfg/per- + sequence/KristenAndSara.cfg, cfg/per- + sequence/NebutaFestival_10bit.cfg, cfg/per-sequence/ParkScene.cfg, + cfg/per-sequence/PartyScene.cfg, cfg/per- + sequence/PeopleOnStreet.cfg, cfg/per-sequence/RaceHorses.cfg, cfg + /per-sequence/RaceHorsesC.cfg, cfg/per-sequence/SlideEditing.cfg, + cfg/per-sequence/SlideShow.cfg, cfg/per- + sequence/SteamLocomotiveTrain_10bit.cfg, cfg/per- + sequence/Traffic.cfg, cfg/per-sequence/Vidyo1.cfg, cfg/per- + sequence/Vidyo3.cfg, cfg/per-sequence/Vidyo4.cfg, + source/Lib/CMakeLists.txt, + source/Lib/TAppCommon/program_options_lite.cpp, + source/Lib/TAppCommon/program_options_lite.h, + source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibEncoder/TEncAnalyze.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/TestForChecking_BadCommit.bat, source/encoder/IntraPred.cpp, + source/encoder/TShortYUV.cpp, source/encoder/TShortYUV.h, + source/encoder/butterfly.h, source/encoder/common.cpp, + source/encoder/common.h, source/encoder/ipfilter.cpp, + source/encoder/macroblock.cpp, source/encoder/md5.cpp, + source/encoder/md5.h, source/encoder/motion.cpp, + source/encoder/motion.h, source/encoder/pixel.cpp, + source/encoder/primitives.cpp, source/encoder/primitives.h, + source/encoder/threading.cpp, source/encoder/threading.h, + source/encoder/threadpool.cpp, source/encoder/threadpool.h, + source/encoder/vec/CMakeLists.txt, source/encoder/vec/avx.cpp, + 
source/encoder/vec/avx2.cpp, source/encoder/vec/blockcopy.inc, + source/encoder/vec/intrapred.inc, source/encoder/vec/ipfilter.inc, + source/encoder/vec/ipfilter16.inc, source/encoder/vec/ipfilter8.inc, + source/encoder/vec/macroblock.inc, source/encoder/vec/pixel.inc, + source/encoder/vec/pixel16.inc, source/encoder/vec/pixel8.inc, + source/encoder/vec/sse2.cpp, source/encoder/vec/sse3.cpp, + source/encoder/vec/sse41.cpp, source/encoder/vec/sse42.cpp, + source/encoder/vec/ssse3.cpp, source/encoder/vec/vec-primitives.cpp, + source/encoder/vec/vecprimitives.inc, + source/encoder/x86/CMakeLists.txt, source/encoder/x86/asm- + primitives.cpp, source/encoder/x86/const-a.asm, + source/encoder/x86/cpu-a.asm, source/encoder/x86/pixel-32.asm, + source/encoder/x86/pixel-a.asm, source/encoder/x86/pixel.h, + source/encoder/x86/sad-a.asm, source/encoder/x86/x86inc.asm, + source/encoder/x86/x86util.asm, source/x265cfg.cpp, + source/x265cfg.h, source/x265enc.cpp, source/x265enc.h, + source/x265main.cpp: + Merged multicoreware/xhevc into default + [2871d75ffb73] + +2013-06-01 Mandar Gurav + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + Merge conflict: Copy files directly from xhevc main repo. part 2 + [c446638cb8c0] + +2013-06-01 ggopu + + * source/encoder/motion.cpp, source/encoder/motion.h: + Merge conflict: Copy files directly from xhevc main repo. 
+ [6272284d8ce0] + +2013-05-23 Mandar Gurav + + * source/encoder/motion.cpp: + Merged multicoreware/xhevc into default + [53497b61c0ac] + +2013-05-24 ggopu + + * build/nmake/make-solutions.bat, source/Lib/TLibCommon/TComRom.cpp, + source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibEncoder/TEncAnalyze.h, + source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/motion.cpp: + Merge + [b224e58a7f8b] + +2013-05-23 ggopu + + * source/Lib/TLibEncoder/TEncSearch.cpp: + Backed out changeset: 06fa58c2176d + [effd80b01d39] + +2013-05-31 Steve Borho + + * source/encoder/encoder.cpp: + encoder: do not allow GOP size to be less than 1 + [984eb5329bef] + + * source/Lib/TLibCommon/TComTrQuant.cpp: + TrComQuant: remove unused variable, fix GCC warnings + [2125b36b8ae6] + +2013-05-30 Steve Borho + + * cfg/encoder_I_15P.cfg, cfg/encoder_all_I.cfg, cfg/per- + sequence/BQMall.cfg, cfg/per-sequence/BQSquare.cfg, cfg/per- + sequence/BQTerrace.cfg, cfg/per-sequence/BasketballDrill.cfg, cfg + /per-sequence/BasketballDrillText.cfg, cfg/per- + sequence/BasketballDrive.cfg, cfg/per-sequence/BasketballPass.cfg, + cfg/per-sequence/BlowingBubbles.cfg, cfg/per-sequence/Cactus.cfg, + cfg/per-sequence/ChinaSpeed.cfg, cfg/per-sequence/FourPeople.cfg, + cfg/per-sequence/Johnny.cfg, cfg/per-sequence/Kimono.cfg, cfg/per- + sequence/KristenAndSara.cfg, cfg/per- + sequence/NebutaFestival_10bit.cfg, cfg/per-sequence/ParkScene.cfg, + cfg/per-sequence/PartyScene.cfg, cfg/per- + sequence/PeopleOnStreet.cfg, cfg/per-sequence/RaceHorses.cfg, cfg + /per-sequence/RaceHorsesC.cfg, cfg/per-sequence/SlideEditing.cfg, + cfg/per-sequence/SlideShow.cfg, cfg/per- + sequence/SteamLocomotiveTrain_10bit.cfg, cfg/per- + sequence/Traffic.cfg, cfg/per-sequence/Vidyo1.cfg, cfg/per- + sequence/Vidyo3.cfg, cfg/per-sequence/Vidyo4.cfg, + source/CMakeLists.txt, + source/Lib/TAppCommon/program_options_lite.cpp, + source/Lib/TAppCommon/program_options_lite.h, source/x265cfg.cpp, + source/x265cfg.h, source/x265enc.cpp, 
source/x265enc.h, + source/x265main.cpp: + x265-cli: remove old CLI program and config files + + old command line: x265-cli -i Vidyo4.yuv -c cfg/encoder_I_15P.cfg -c + cfg/per-sequence/Vidyo4.cfg -b out.hevc new command line: x265 + Vidyo4.y4m out.hevc + + old command line: x265-cli -i Vidyo4.yuv -c cfg/encoder_all_I.cfg -c + cfg/per-sequence/Vidyo4.cfg -b out.hevc new command line: x265 + Vidyo4.y4m out.hevc --sao -i 1 + [cda100c6417f] + +2013-05-31 Steve Borho + + * source/encoder/encoder.cpp: + encoder: add "-i 32" hack to select "encoder_randomaccess_main" + config from hm + + Also requires enabling --rect --amp --sao to match the output of + that config file. + [3136a4b62f03] + +2013-05-30 Steve Borho + + * source/encoder/encoder.cpp: + encoder: be explicit about slice type + [4414841e3211] + + * source/x265.cpp: + x265: fix warnings hidden by old x265-cli project + [192da7e89c14] + +2013-05-31 Steve Borho + + * source/common/vec/CMakeLists.txt: + cmake: fix another set_source_files problem + [8d6efac63424] + + * source/common/vec/CMakeLists.txt: + cmake: avoid warnings from x64 MSVC compilers + + x64 implies at least SSE4, and gets tetchy when you say /arch:SSE2 + Previously, the set_source_files_properties line was busted by a bad + cut-paste + [633b668d6c47] + +2013-05-31 Min Chen + + * source/Lib/TLibCommon/TComTrQuant.cpp, source/common/primitives.h, + source/common/vec/pixel.inc: + optimize by convert32to16_shr + --- source/Lib/TLibCommon/TComTrQuant.cpp | 5 +---- + source/common/primitives.h | 2 ++ source/common/vec/pixel.inc | 24 + ++++++++++++++++++++++-- 3 files changed, 25 insertions(+), 6 + deletions(-) + [f70b88137438] + + * source/Lib/TLibCommon/TComTrQuant.cpp, source/common/primitives.h, + source/common/vec/pixel.inc: + optimize by convert16to32_shl + --- source/Lib/TLibCommon/TComTrQuant.cpp | 17 ++++++++++++++--- + source/common/primitives.h | 2 ++ source/common/vec/pixel.inc | 18 + ++++++++++++++++++ 3 files changed, 34 insertions(+), 3 
deletions(-) + [525a8ad9e244] + + * source/Lib/TLibCommon/TComTrQuant.cpp, source/common/primitives.h, + source/common/vec/pixel.inc: + optimize by convert32to16 + --- source/Lib/TLibCommon/TComTrQuant.cpp | 5 +---- + source/common/primitives.h | 2 ++ source/common/vec/pixel.inc | 19 + +++++++++++++++++++ 3 files changed, 22 insertions(+), 4 + deletions(-) + [49856ba4303c] + + * source/Lib/TLibCommon/TComTrQuant.cpp: + TComTrQuant::xT: replace for_loop by memcpy() + --- source/Lib/TLibCommon/TComTrQuant.cpp | 15 ++++----------- 1 + files changed, 4 insertions(+), 11 deletions(-) + [060e49a8383d] + + * source/Lib/TLibCommon/TComTrQuant.cpp, source/common/primitives.h, + source/common/vec/pixel.inc: + optimize by convert16to32 + --- source/Lib/TLibCommon/TComTrQuant.cpp | 9 +++++---- + source/common/primitives.h | 3 +++ source/common/vec/pixel.inc | 20 + ++++++++++++++++++++ 3 files changed, 28 insertions(+), 4 + deletions(-) + [ec636f543969] + + * source/encoder/wavefront.cpp: + thread: bits matches to HM when single thread + --- source/encoder/wavefront.cpp | 11 ++++++++--- 1 files changed, 8 + insertions(+), 3 deletions(-) + [fa49682780e3] + + * source/common/vec/macroblock.inc: + fix bug on VC9-x86 + --- source/common/vec/macroblock.inc | 87 + +++++++++++++++++++++++++------------ 1 files changed, 59 + insertions(+), 28 deletions(-) + [61dc8cf0acdd] + +2013-05-31 Deepthi Devaki + + * source/Lib/TLibEncoder/TEncSearch.cpp: + Fix for 8bpp & 16bpp output mismatch + [ce4d663b31d7] + +2013-05-31 ShinYee Chung + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: Disable set but not used variables (-Werror=unused-but- + set-variable). + [4c7879f51a14] + + * source/common/common.cpp: + Common: Fix compile error -Werror=write-strings. 
+ [e73b996e121d] + +2013-05-30 Steve Borho + + * source/common/vec/ipfilter8.inc: + ipfilter: work around unaligned data copies generated by VC9 (from + Min Chen) + [05e0b5742adb] + +2013-05-31 ShinYee Chung + + * source/x265.cpp: + x265: Fix compile issue due to -Wwrite-strings and undeclared + va_start() and va_end(). + [4dfbf564c772] + +2013-05-30 Steve Borho + + * source/test/mbdstharness.cpp: + dequant: fix a test bench typo + [c7b53fb8146d] + + * source/x265.cpp, source/x265opts.h: + x265: make input/output bit depths function local to parse method, + fix eoln + [e7d3e278ba1c] + + * source/x265cfg.cpp: + x265cfg: set default log level for x265-cli to "debug" + [1dab28c7dbda] + + * source/x265.cpp: + x265: add a CTRL+C handler + [a8b7be978131] + + * source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncGOP.cpp, source/encoder/encoder.cpp, + source/x265enc.cpp: + TEncGOP: show per-frame statistics at log level X265_LOG_DEBUG + [be3ba8d41623] + + * source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h, source/common/common.cpp, + source/common/common.h, source/common/primitives.cpp, + source/common/primitives.h, source/encoder/encoder.cpp, + source/test/testbench.cpp, source/x265.cpp, source/x265.h, + source/x265cfg.cpp, source/x265cfg.h, source/x265enc.cpp, + source/x265opts.h: + x265: use x265_log when initializing primitives and thread pool + + * removes thread pool pointer from CLI apps, encoders initialize it + now + * thread pool size validation moved to x265_check_params() + [55c353b046a1] + + * source/common/common.cpp, source/x265.cpp, source/x265.h, + source/x265opts.h: + x265: introduce x265_param_t.logLevel, and x265_log() + [cb9129046937] + + * source/x265.h: + x265: fixup comment for iWaveFrontSynchro + [7bb9d9960526] + + * source/x265.cpp, source/x265.h, source/x265opts.h: + x265: introduce the concept of log levels + [e7e934f1aed9] + + * source/x265.h: + x265: prune unused bits of public header + [d6c0d17e7c87] + + * 
source/Lib/TLibEncoder/TEncGOP.cpp, source/x265.cpp: + x265: add x264 style progress reporting, kbps numbers differ from HM + [8d9f188a7219] + + * source/common/threadpool.cpp, source/common/threadpool.h, + source/x265.cpp: + threadpool: report thread pool size, fix bug with CPU count auto- + detect + [dddd03cf7b26] + + * source/CMakeLists.txt: + cmake: remove short-lived openmp compile flags + [0a300add8bc5] + +2013-05-30 ShinYee Chung + + * source/x265.cpp, source/x265cfg.cpp: + Config: Force the threadpool to be single threaded when wavefront is + disabled. + [6b1b567da72c] + + * source/Lib/TLibEncoder/TEncSlice.cpp, source/encoder/CMakeLists.txt: + WPP: Use the threadpool based wavefront parallel processing module. + [00c1977b8709] + + * source/encoder/wavefront.cpp, source/encoder/wavefront.h: + WPP: Module for wavefront parallelization using threadpool. + [fe2556ffffff] + + * source/Lib/TLibEncoder/TEncTop.h: + TLibEncoder: Add threadpool accessor to TEncTop. + [2730479ce267] + +2013-05-30 Steve Borho + + * source/encoder/motion.cpp: + motion: fix a bug introduced in last commit + [c853140e23c9] + + * source/encoder/motion.cpp: + motion: two more SAD calls that were missed + [b9e8e9d51890] + + * source/encoder/motion.cpp: + motion: fix a missed raw call to a sad() primitive + [08dd67222b97] + + * source/encoder/motion.cpp, source/encoder/motion.h: + motion: minor cleanups, no behavior changes + [6f32a8ba24b0] + + * source/encoder/motion.cpp: + motion: else clause of primitive setup must always be compiled in + + This fixes some of the broken-ness when SUBSAMPLE_SAD is enabled + [4d77283b0129] + + * source/encoder/motion.cpp, source/encoder/motion.h: + motion: remove unused blockWidth and blockHeight + [d7ec2b5e1519] + + * Merged in ggopu/gopu_xhevc (pull request #176) + + Motion: implemented subsampling SAD when the block height is greater + than 12 + [d67eed484a46] + +2013-05-30 ggopu + + * source/encoder/motion.cpp, source/encoder/motion.h: + Motion: 
implemented subsampling SAD when the block height is greater + than 12 + [7d87f713971f] + +2013-05-30 Gopu G + + * Merged multicoreware/xhevc into default + [29850cefbec9] + +2013-05-29 Gopu G + + * source/encoder/motion.cpp: + Merged multicoreware/xhevc into default + [824c2ddca305] + +2013-05-30 ggopu + + * source/encoder/motion.cpp: + Conflict + [edc303ed7859] + +2013-05-29 ggopu + + * source/encoder/motion.cpp: + Motion: raster search refinement - sad_x4 Implementation + [d8ac001a7730] + +2013-05-30 Steve Borho + + * source/common/primitives.cpp, source/common/primitives.h, + source/common/x86/README.txt, source/common/x86/asm-primitives.cpp, + source/common/x86/pixel-a.asm, source/common/x86/pixel.h, + source/common/x86/x86inc.asm: + asm: rebrand x264 assembly functions with x265_ to avoid static link + collisions + + x265 compiles the routines with a different FENC_STRIDE, so they + cannot have the same names as the x264 routines else x264 may break + when linked together with x265 in the same application. + [3b4597a6a66b] + +2013-05-30 Deepthi + + * Merge + [d0242a8f6c60] + +2013-05-30 praveen Tiwari + + * Merged multicoreware/xhevc into default + [2a5bfbaa64c1] + +2013-05-30 praveentiwari + + * source/test/mbdstharness.cpp: + Modified unit test + [369944997be0] + +2013-05-30 praveen Tiwari + + * Merged multicoreware/xhevc into default + [f4a49e8bc939] + +2013-05-30 praveentiwari + + * source/test/mbdstharness.cpp: + More comprehensive unit test code for xDeQuant. 
+ [e7f272bfe95a] + +2013-05-30 praveen Tiwari + + * Merged multicoreware/xhevc into default + [4fe41d6eced6] + +2013-05-29 praveentiwari + + * source/common/vec/macroblock.inc: + Vector code for xDeQuant function + [71119aff0eb0] + +2013-05-29 praveen Tiwari + + * Merged multicoreware/xhevc into default + [c94a23b37846] + +2013-05-29 praveentiwari + + * source/Lib/TLibCommon/TComTrQuant.cpp, source/common/macroblock.cpp: + Shorten the variable names + [32326bb746ae] + +2013-05-29 praveen Tiwari + + * Merged multicoreware/xhevc into default + [92fa5bc97c93] + + * Merged multicoreware/xhevc into default + [d0a255af3f3b] + +2013-05-29 praveentiwari + + * source/common/macroblock.cpp: + C primitive for xDeQuant function + [40b64d61b0ac] + + * source/test/mbdstharness.cpp, source/test/mbdstharness.h: + Test code for xDeQuant + [0f58120670fe] + + * source/common/primitives.h: + Added function pointer typedef for xDeQuant in primitives.h file + [f82cb9a78b0a] + + * source/Lib/TLibCommon/TComTrQuant.cpp: + Added a switch and modified argument list for xDeQuant to break + dependecy from other member variable and functions + [28de65cc3e50] + +2013-05-30 Deepthi + + * Merge + [8c8f8d931100] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + RDO: new version for fast mode decision + [e7ca54f292cb] + +2013-05-30 Steve Borho + + * source/common/common.cpp: + common: prevent a known-broken configuration (wpp+aqselect) + [4c56f1b1c190] + + * source/x265opts.h: + x265opts: disable --depth when compiled for 8bpp, make --wpp a + boolean flag + [f02a5ddfe1d4] + + * source/common/common.cpp: + common: give log indication when FAST_MODE_DECISION has been + compiled in + [51cea626fc5c] + + * source/common/common.cpp: + common: show rect and amp options when enabled + [2b1b890ae098] + + * source/common/common.cpp: + common: group transform skip options together + [93cd37a94aa6] + + * source/CMakeLists.txt: + cmake: give FAST_MODE_DECISION an accurate help message + [57ac25324d49] + 
+2013-05-30 Deepthi + + * cfg/encoder_I_15P.cfg, source/CMakeLists.txt, + source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/common.cpp: + Merge + [af2246ceeb2c] + + * source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp: + First cut version of using SAD costs. + [4d681d6d86fe] + + * cfg/encoder_I_15P.cfg, source/common/common.cpp: + Disabling rect and AMP search (for now) + [06f36a6d94fc] + +2013-05-29 Deepthi + + * source/CMakeLists.txt, source/Lib/TLibEncoder/TEncSearch.cpp: + ME satd costs and bits captured. FAST_MODE_DECISION introduced. + [27ac6d4a40fd] + +2013-05-30 Steve Borho + + * source/common/common.cpp, source/common/common.h: + common: add x265_mdate() and fix rdoq option names + [7e10b0e4554f] + +2013-05-29 Steve Borho + + * source/x265opts.h: + x265: add "no-" options for boolean parameters, cleanup sao and hash + options + [6769ff6d6154] + + * source/x265.cpp: + x265: add basic command line help + [a48721370b31] + + * source/x265.cpp: + x265: improve reporting of invalid or extra vars, allow "x265 input + output" + [fafeb6fb8d0c] + + * source/encoder/motion.cpp, source/encoder/motion.h: + motion: introduce infrastructure for sub-sampled SAD + + When performing a motion search for a block taller than 12 pixels + (16 or more) we want to use sub-sampled SAD, meaning we only measure + every other row of the block. Normally one would do this by halving + the number of rows and doubling the stride, and doubling the output + SAD result but we have two caveats: + + 1 - our sad_x3 and sad_x4 primitives have hard-coded fenc strides + (FENC_STRIDE) and thus we cannot subsample the fenc block by + doubling stride. Instead we have to actually subsample the block and + store it in a buffer with FENC_STRIDE, which I've done in this + commit. Now fencSad will always point to a buffer with FENC_STRIDE, + and if this->subsample == 1, the buffer is subsampled. 
+ + 2 - since the SAD costs never leave this function, we do not have to + shift up our sad results by one bit to account for the subsampling. + The subpel refine step will remeasure with non-subsampled SATD and + thus we don't care what the best SAD cost actually was (only that it + was the best). The only place where we need to account for + subsampling is in the SAD_THRESH macro. It must shift the sad cost + up by this->subsample bits. + + Steps remaining (all to be done within #if SUBSAMPLE_SAD + + 1 - Replace fenc with fencSad for all sad(), sad_x3(), sad_x4() + calls 2 - shift the luma stride by this->subsample for sad(), + sad_x3() and sad_x4() 3 - shift up SAD cost by this->subsample for + SAD_THREAD checks 4 - make sure all the changes from 1..3 have no + effect on SATD calls 5 - test, we should see a small quality loss + and an ME perf gain when enabled + [3ea322fcdd7a] + + * source/CMakeLists.txt: + gcc: add openmp flags for gcc and icl + [7f14e6ca1bff] + + * source/Lib/TLibEncoder/TEncSlice.cpp: + TEncSlice: disable allocation and usage of events if numThreads == 1 + [300663bf0015] + +2013-05-23 Min Chen + + * source/Lib/TLibEncoder/TEncSlice.cpp: + thread: improvement performance based on replace iFinish by Event + --- source/Lib/TLibEncoder/TEncSlice.cpp | 41 + ++++++++++++++++++++++++++------- 1 files changed, 32 insertions(+), + 9 deletions(-) + [1f68ebfe1ae7] + + * source/CMakeLists.txt, source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp: + thread: the first worked version, turn on in WPP mode only + [b67a18ce5c31] + + * cfg/encoder_I_15P.cfg, cfg/encoder_all_I.cfg, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.h, + source/Lib/TLibEncoder/TEncTop.cpp, + 
source/Lib/TLibEncoder/TEncTop.h: + thread: modify m_pcTrQuant to every thread, [BROKEN + UseAdaptQpSelect] + --- source/Lib/TLibCommon/TComSlice.h | 5 ----- + source/Lib/TLibEncoder/TEncCu.cpp | 3 ++- + source/Lib/TLibEncoder/TEncCu.h | 1 + + source/Lib/TLibEncoder/TEncGOP.cpp | 23 +++++++++++++++-------- + source/Lib/TLibEncoder/TEncSearch.cpp | 3 +-- + source/Lib/TLibEncoder/TEncSearch.h | 2 +- + source/Lib/TLibEncoder/TEncSlice.cpp | 22 +++++++++++++++++++--- + source/Lib/TLibEncoder/TEncSlice.h | 2 +- + source/Lib/TLibEncoder/TEncTop.cpp | 26 +++++++++++++------------- + source/Lib/TLibEncoder/TEncTop.h | 4 ++-- 10 files changed, 55 + insertions(+), 36 deletions(-) + [d4a0ce886fae] + +2013-05-29 Steve Borho + + * build/linux/batch.py: + update batch.py to use x265.exe + [45a9d983b45a] + + * source/x265.cpp: + x265: destroy cliopt before checking for leaks (x265.exe now leak + free) + [492957758353] + + * source/encoder/encoder.cpp: + encoder: destroy YUV buffers before deleting them + [9131daa60c98] + + * source/encoder/encoder.cpp: + encoder: properly release reconstructed image buffers at close + [61d3fb60cb9e] + + * source/x265.cpp: + x265: cleanup library statics before exit, prevent leak reports + [908544e772b4] + + * source/encoder/encoder.h: + encoder: preallocate room in NAL containers + [932360bdc969] + + * source/encoder/encoder.cpp, source/encoder/encoder.h: + encoder: write output packets directly into std::string buffer + [80db68a1ea35] + + * source/encoder/motion.cpp: + motion: check motion candidates even for star search + [60b59cac3bc2] + + * source/encoder/motion.cpp: + motion: use sad_x4 for raster search + [f0cbf34f0845] + +2013-05-29 Min Chen + + * source/encoder/encoder.cpp, source/x265.h, source/x265opts.h: + x265: new option hash + --- source/encoder/encoder.cpp | 2 +- source/x265.h | 3 +++ + source/x265opts.h | 1 + 3 files changed, 5 insertions(+), 1 + deletions(-) + [d5e04469c1fc] + + * source/encoder/encoder.cpp, 
source/encoder/encoder.h: + x265: fix bug in nal write + --- source/encoder/encoder.cpp | 6 +++++- source/encoder/encoder.h | + 1 + 2 files changed, 6 insertions(+), 1 deletions(-) + [ae9452e370e7] + +2013-05-29 Steve Borho + + * source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/PPA/ppaCPUEvents.h: + ppa: tweak PPA events to extract more useful detail about threading + [3f33cdd7ff98] + + * source/x265.cpp: + x265: fix integer argument parsing + [c40e7dbe7f07] + + * source/Lib/TLibEncoder/TEncCu.cpp, source/PPA/ppaCPUEvents.h: + TEncCu: add separate PPA event for xCalcRDCostIntra + [ee68c82fc7c7] + + * source/Lib/TLibEncoder/TEncCu.cpp, source/PPA/ppa.h, + source/PPA/ppaCPUEvents.h: + TEncCU: allow PPA to show recursive depth to xCompressCU + [f625550f4de7] + + * source/encoder/CMakeLists.txt: + cmake: fix display of TLibEncoder headers + [3e9e9412262b] + + * source/common/common.cpp, source/x265.cpp: + common: improve the logging of configuration + [cfde846ed6e4] + +2013-05-28 Steve Borho + + * source/x265.cpp: + x265: report total elapsed time + [5b755da5e73a] + +2013-05-29 Deepthi + + * Merge + [d16847d13c67] + + * source/CMakeLists.txt, source/Lib/TLibEncoder/TEncCu.cpp: + Early Partition Decision macro removed - the xCompressCU version has + been commented out with a warning. + [7bad155a07be] + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h: + Cleanup - early exit + [82f090c9a0f2] + +2013-05-28 Deepthi + + * Merge + [d876d9c5eecc] + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h: + Early exit reloaded. 
+ [fcdedf954fbf] + +2013-05-27 Deepthi + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h: + Adding CalcRDCostIntra + [6de8c04a070d] + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h: + Separating out analysis structure from mode decision + [3a4b62a318bb] + +2013-05-28 Steve Borho + + * source/encoder/motion.cpp: + motion: hoist two-point early out out of function + [f3c78bfe071d] + + * source/encoder/motion.cpp: + motion: convert two-point search into a table lookup + [3d4df6126d11] + + * source/common/common.h, source/encoder/encoder.cpp: + Merge + [60570eb6bff8] + + * source/Lib/TLibCommon/TComRom.cpp, + source/Lib/TLibEncoder/TEncAnalyze.h, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/common.h, + source/encoder/encoder.cpp: + common: cleanup cycle counter, disable by default + [70ee52ecfdb5] + + * source/common/common.h, source/encoder/motion.cpp: + motion: rename some macros, enable sad_x4 + [fb67b234b11e] + + * source/encoder/encoder.cpp: + encoder: use osstream.write() instead of << which appears to be zero + terminated + [ebb0c3ff732c] + +2013-05-28 ggopu + + * source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibEncoder/TEncAnalyze.h, + source/Lib/TLibEncoder/TEncSearch.cpp, source/common/common.h: + Implemented the Cycle Count Calculation for Motion Estimation + [2a0b3e881909] + + * source/common/common.h, source/encoder/motion.cpp: + sad_x4 Implementation for Star Search + [e0a1a4ed9ccc] + +2013-05-27 Steve Borho + + * source/common/CMakeLists.txt, source/encoder/CMakeLists.txt: + cmake: minor cleanups + [4fd217fbf622] + +2013-05-27 Min Chen + + * source/VectorClass/vectori128.h, source/common/CMakeLists.txt, + source/encoder/CMakeLists.txt: + Fix build problem + --- source/VectorClass/vectori128.h | 2 +- + source/common/CMakeLists.txt | 3 ++- source/encoder/CMakeLists.txt | + 3 ++- 3 files changed, 5 insertions(+), 3 deletions(-) + [3ba1fae68c60] + +2013-05-27 Steve 
Borho + + * source/x265.h: + x265: fix eoln mangling + [835abb6c24f9] + +2013-05-27 praveentiwari + + * source/common/vec/macroblock.inc: + .60x more performance gain for partailButterfly4 intrinsic version + [3cd8e6c169f6] + +2013-05-26 Deepthi + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h: + Adding CalcRDCostInter + [a61257252105] + +2013-05-25 Steve Borho + + * source/encoder/encoder.cpp, source/x265.cpp: + x265: encoder_all_I.cfg output now matches x265 -i1 --sao 1 + + SAO was never disabled in the I frame config, so must be enabled to + get the exact same bitstream + [a581b097ad6e] + + * source/encoder/encoder.cpp: + x265: I15P config now matches bit for bit with x265-cli + [7bab890121d4] + + * source/Lib/TLibEncoder/TEncGOP.cpp, source/encoder/encoder.cpp, + source/encoder/encoder.h: + x265: resolve some bitstream differences between CLI apps + [dee26a40d02a] + + * source/TestForChecking_BadCommit.bat, + source/tools/TestForChecking_BadCommit.bat: + move regression batch file into tools folder + [8b15863c0013] + + * source/x265.cpp: + nits + [e32ccdc056a1] + + * source/encoder/encoder.cpp, source/x265.cpp: + encoder: correct NAL counts and a few other issues + + Byte count difference between the two CLI programs is now 2 + [2fe823ae463f] + + * source/encoder/encoder.cpp, source/encoder/encoder.h, + source/x265.cpp: + encoder: plausible NAL write routines + [8fa3800a48e2] + + * source/x265.cpp: + x265: enable NAL write functions + [025d311ac08e] + + * source/Lib/TLibCommon/CommonDef.h, source/common/common.h, + source/x265.cpp, source/x265main.cpp: + x265: move NVM macros to common.h + [e22bfa6b8f85] + + * source/Lib/TLibCommon/CommonDef.h, source/Lib/TLibCommon/NAL.h, + source/Lib/TLibCommon/TComSlice.h, source/x265.cpp, source/x265.h: + x265: move NalUnitType from CommonDef.h to public x265.h + [a2f0181eda8b] + + * source/CMakeLists.txt, source/input/yuv.cpp: + x265: more warnings cleanups + [79cab5bc3d95] + + * 
source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: add missing semi-colons after PPA macros + [c5796290ba6e] + + * source/input/yuv.cpp: + x265: more compiler warnings fixes + [3478dcabae7b] + + * source/test/CMakeLists.txt: + cmake: test benches only need common.lib + [813159de8a6a] + + * source/input/y4m.cpp, source/input/y4m.h, source/x265.cpp: + x265: fix compiler warnings that were accidentally hidden + [068c2a7d55a1] + + * source/CMakeLists.txt, source/Lib/CMakeLists.txt, + source/common/CMakeLists.txt, source/encoder/CMakeLists.txt, + source/test/CMakeLists.txt: + cmake: merge TLibCommon into common.lib, TLibEncoder into + encoder.lib + + This fixes link issues with GCC, and clarifies the project + relationships + [46c9252f7dcb] + + * source/x265.cpp, source/x265opts.h: + x265opt: add help strings to CLI help output + [1dab9cd7ccf7] + + * source/x265opts.h: + x265opt: more tuning of command line options + [750c8127e6bc] + + * source/CMakeLists.txt: + cmake: provide version number to x265.cpp + [8693e88f6e7b] + + * source/common/primitives.cpp, source/encoder/encoder.cpp: + move x265_cleanup into the encoder/ folder, fixes link references + [678151ff4837] + + * source/Lib/CMakeLists.txt, source/encoder/CMakeLists.txt: + cmake: split HM into TLibEncoder and TLibCommon + [617b70bc5028] + + * source/CMakeLists.txt, source/common/CMakeLists.txt, + source/common/IntraPred.cpp, source/common/TShortYUV.cpp, + source/common/TShortYUV.h, source/common/butterfly.h, + source/common/common.cpp, source/common/common.h, + source/common/ipfilter.cpp, source/common/macroblock.cpp, + source/common/md5.cpp, source/common/md5.h, source/common/pixel.cpp, + source/common/primitives.cpp, source/common/primitives.h, + source/common/threading.cpp, source/common/threading.h, + source/common/threadpool.cpp, source/common/threadpool.h, + source/common/vec/CMakeLists.txt, source/common/vec/avx.cpp, + source/common/vec/avx2.cpp, source/common/vec/blockcopy.inc, + 
source/common/vec/intrapred.inc, source/common/vec/ipfilter.inc, + source/common/vec/ipfilter16.inc, source/common/vec/ipfilter8.inc, + source/common/vec/macroblock.inc, source/common/vec/pixel.inc, + source/common/vec/pixel16.inc, source/common/vec/pixel8.inc, + source/common/vec/sse2.cpp, source/common/vec/sse3.cpp, + source/common/vec/sse41.cpp, source/common/vec/sse42.cpp, + source/common/vec/ssse3.cpp, source/common/vec/vec-primitives.cpp, + source/common/vec/vecprimitives.inc, + source/common/x86/CMakeLists.txt, source/common/x86/asm- + primitives.cpp, source/common/x86/const-a.asm, + source/common/x86/cpu-a.asm, source/common/x86/pixel-32.asm, + source/common/x86/pixel-a.asm, source/common/x86/pixel.h, + source/common/x86/sad-a.asm, source/common/x86/x86inc.asm, + source/common/x86/x86util.asm, source/encoder/CMakeLists.txt, + source/encoder/IntraPred.cpp, source/encoder/TShortYUV.cpp, + source/encoder/TShortYUV.h, source/encoder/butterfly.h, + source/encoder/common.cpp, source/encoder/common.h, + source/encoder/ipfilter.cpp, source/encoder/macroblock.cpp, + source/encoder/md5.cpp, source/encoder/md5.h, + source/encoder/pixel.cpp, source/encoder/primitives.cpp, + source/encoder/primitives.h, source/encoder/threading.cpp, + source/encoder/threading.h, source/encoder/threadpool.cpp, + source/encoder/threadpool.h, source/encoder/vec/CMakeLists.txt, + source/encoder/vec/avx.cpp, source/encoder/vec/avx2.cpp, + source/encoder/vec/blockcopy.inc, source/encoder/vec/intrapred.inc, + source/encoder/vec/ipfilter.inc, source/encoder/vec/ipfilter16.inc, + source/encoder/vec/ipfilter8.inc, source/encoder/vec/macroblock.inc, + source/encoder/vec/pixel.inc, source/encoder/vec/pixel16.inc, + source/encoder/vec/pixel8.inc, source/encoder/vec/sse2.cpp, + source/encoder/vec/sse3.cpp, source/encoder/vec/sse41.cpp, + source/encoder/vec/sse42.cpp, source/encoder/vec/ssse3.cpp, + source/encoder/vec/vec-primitives.cpp, + source/encoder/vec/vecprimitives.inc, + 
source/encoder/x86/CMakeLists.txt, source/encoder/x86/asm- + primitives.cpp, source/encoder/x86/const-a.asm, + source/encoder/x86/cpu-a.asm, source/encoder/x86/pixel-32.asm, + source/encoder/x86/pixel-a.asm, source/encoder/x86/pixel.h, + source/encoder/x86/sad-a.asm, source/encoder/x86/x86inc.asm, + source/encoder/x86/x86util.asm, source/test/CMakeLists.txt: + directory re-org, move primitives and infrastructure into common/ + folder + + This encoder/common split is also how x264 has its code arranged. + + common - performance primitives, threading, low level features + encoder - analysis, mode decision, cabac, high level features + + General rules: 1. Things we're replacing from TLibCommon/ go in + common/ 2. Things we're replacing from TLibEncoder/ go in encoder/ + [96baad4e5160] + +2013-05-25 Deepthi + + * source/Lib/TLibEncoder/TEncCu.cpp: + More if - I_slice checks removed + [bcbed778915b] + + * source/Lib/TLibEncoder/TEncCu.cpp: + Removing redundant if-I_slice checks + [7dfef87f686f] + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h: + A separate path for Intra Slice Analysis + [9372c4dd1876] + + * source/Lib/TLibEncoder/TEncCu.cpp: + POinters for each NxN partition + [07982959ab7d] + + * source/Lib/TLibEncoder/TEncCu.cpp: + This code snippet is baffling - no NxN partitions allowed at any + level for inter-CUs. + [5077b41e4dcc] + + * source/Lib/TLibEncoder/TEncCu.cpp: + More PCM cleanup + [4c23d8da41c3] + + * source/Lib/TLibEncoder/TEncCu.cpp: + Removing PCM and AMP from EARLY_PARTITION. + [996d036256a5] + + * source/Lib/TLibEncoder/TEncCu.cpp: + Done experimenting - lets clean up and start afresh. + [b831e30a39f1] + + * source/Lib/TLibEncoder/TEncCu.cpp: + Merge + [1275b8710950] + + * source/Lib/TLibEncoder/TEncCu.cpp: + Bug fix for no EARLY_DECISION path. The difference between #ifndef + and #if !! 
+ [fe1e15dc6eb1] + +2013-05-24 Deepthi + + * source/Lib/TLibEncoder/TEncCu.cpp: + Eliminating repetitive cost checks for 4 NxN partitions + [472585bb237a] + +2013-05-24 Steve Borho + + * source/encoder/encoder.cpp: + encoder: correct the flush argument to TEncTop::encoder() + [ea418dc405d4] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: use less horizontal space to print per-frame PSNR + [f377a112c835] + + * source/encoder/encoder.cpp: + encoder: reorder some lines, should have no effect + [2729b04a27f3] + + * source/Lib/TLibEncoder/TEncGOP.cpp, source/PPA/ppaCPUEvents.h: + ppa: add wrapping event for loop filters + [3639ac6c84ed] + + * source/x265.cpp: + x265: combine two log lines, remove bitstream filename (not + relevant) + [73c2c3b6bdd2] + + * source/encoder/common.cpp: + common: change more defaults to match I15P + [3462f6dfee4d] + + * source/encoder/encoder.cpp, source/encoder/encoder.h: + encoder: pull in really ugly GOP intialization/validation code from + TAppEncoder + + This all goes away once we have a proper lookahead + [35f67ed1b8ce] + + * source/encoder/common.cpp: + common: fixup default params + [5e9aca57a444] + + * source/Lib/TLibEncoder/TEncTop.cpp: + nits, no code change + [a893a5ee88ab] + + * source/x265.cpp: + x265: fix input file arg indexing + [f01c09fd1370] + +2013-05-25 ShinYee Chung + + * source/encoder/vec/intrapred.inc: + intra: Fix compile warning for HIGH_BIT_DEPTH (-Werror=shadow). + [480bc3934409] + + * source/encoder/vec/intrapred.inc: + intra: Fix compile warning for HIGH_BIT_DEPTH (-Werror=unused-but- + set-variable). + [d5459b8da333] + + * source/encoder/vec/intrapred.inc: + intra: Fix vector conversion errors. + + source/encoder/vec/intrapred.inc:570:66: error: conversion from + ‘Vec128b’ to non-scalar type ‘Vec8s’ requested Vec8s + v_topRow = load_partial(const_int(8), &pSrc[-srcStride]); // topRow + + ... + [e63381924908] + + * source/x265.cpp: + x265: Fix compile error due to undeclared strncmp function. 
+ [8b6c9566d28d] + +2013-05-24 ShinYee Chung + + * source/encoder/motion.h: + motion: Fix compile warning due to -Wparentheses. + [ea85c7f907b9] + +2013-05-23 ShinYee Chung + + * source/encoder/vec/intrapred.inc: + intrapred: Fix compile warnings/errors due to -Wmaybe-uninitialized. + [7984ca4e57fd] + +2013-05-24 Steve Borho + + * source/x265.cpp: + x265: more debugging + [81f824e31a0d] + + * source/x265.cpp, source/x265opts.h: + x265: more work on CLI options + [3b6c4efdc154] + + * source/CMakeLists.txt, source/x265.cpp, source/x265opts.h: + x265: add getopt() to new CLI app, tune short flags to match x264 + [c140af14db48] + + * source/Lib/TLibEncoder/TEncCu.cpp: + re-apply bug fix, there must have been a bad merge somewhere + [d14951b6324a] + + * Merged in deepthidevaki/xhevc_deepthid (pull request #170) + + IntraPredAng 16x16 with all modes + [813e9c90cbf6] + +2013-05-24 Deepthi Devaki + + * source/encoder/vec/intrapred.inc: + Merge + [73316a3addc5] + + * source/encoder/vec/intrapred.inc: + IntraPredAng 16x16 with all modes + [9d91f3584db2] + + * source/VectorClass/vectori128.h: + ForceInline few more functions in vectorclass + [05870dbb3efb] + + * build/nmake/make-solutions.bat: + Merge + [2a6f287ceebe] + + * source/encoder/vec/intrapred.inc: + IntraPredAng16x16 few modes, not getting called yet, shall be + included on completion + [4b10526cb3a0] + +2013-05-24 nandaku2 + + * Merged in praveentiwari/xhevc_praveent (pull request #169) + + Added logic to avoid saturation in partialButterfly4 intrinsic + version + [e3ed785d727f] + +2013-05-24 praveentiwari + + * source/encoder/vec/macroblock.inc: + Added logic to avoid saturation in partialButterfly4 intrinsic + version + [81d852d7865b] + + * source/encoder/vec/macroblock.inc: + Removed unused vectors from partialButterflyInverse32 code + [afb8d92f66f4] + + * source/encoder/vec/macroblock.inc: + Removed unused vectors from partialButterflyInverse16 + [fabf29c2888a] + + * source/encoder/vec/macroblock.inc: + Removed 
unused vectore from partialButterflyInvers8 code + [1d2b3e54a4a4] + + * source/encoder/vec/macroblock.inc: + optimized saturation mechanism in partialButterflyInvers16 vector + version + [19b0f38f6380] + + * source/encoder/vec/macroblock.inc: + Removed unnecessary computation from partialButterflyInverse4 vector + version + [08b2688b6de9] + + * source/encoder/vec/macroblock.inc: + changed saturation mechanism for partialButterflyInverse8 vector + version + [4d0cf1002390] + +2013-05-24 Deepthi + + * Merge + [f739e927e018] + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h: + Storing NxN CU's at each depth + [051ac8dfaff9] + + * source/Lib/TLibEncoder/TEncCu.cpp: + Bug fix for m_NxNCU + [84a82ca7c5a6] + + * source/Lib/TLibEncoder/TEncCu.cpp: + More cleanup; Rmeoving CheckBestMode out of CheckInterCost + [477d76b93842] + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h: + Cleanup: Remove extra arguments from xCheckRDCostInter/Intra + [4555cb1f1a2b] + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h: + Adding data structures for NxN information + [ef9f8f85c277] + +2013-05-24 praveentiwari + + * source/encoder/vec/macroblock.inc: + Changed packing mechanism for partialButterfly8 intrinsic version + [a6f34ed51a4d] + +2013-05-24 praveen Tiwari + + * Merged multicoreware/xhevc into default + [f32272e3ca22] + +2013-05-24 praveentiwari + + * source/encoder/vec/macroblock.inc: + Removed unnecessary computation from partialButterflyInverse32 + [c7ee802ee960] + +2013-05-24 praveen Tiwari + + * Merged multicoreware/xhevc into default + [b1d686c45089] + + * build/nmake/make-solutions.bat: + Merged multicoreware/xhevc into default + [fd0bbd1775d6] + +2013-05-23 praveen Tiwari + + * Merged multicoreware/xhevc into default + [526ab84ce55d] + + * Merged multicoreware/xhevc into default + [f9fd6ba9f9bc] + +2013-05-24 Deepthi Devaki + + * source/encoder/vec/intrapred.inc: + Fixed build for HIGH_BIT_DEPTH enabled + 
[ea8e87bacbcf] + +2013-05-23 Steve Borho + + * source/encoder/motion.cpp: + motion: fix star search's raster step + + tmv's loop range covers the entire search area. The search origin + should not be added to it. + [a3439ab05d47] + + * source/encoder/encoder.cpp, source/encoder/encoder.h: + encoder: hard-code our simple P frame GOP structure + + It will be the default GOP structure, but "All I" can be selected + simply by adding --keyint 1 option. Will add another option for + all-B later. + [04ce483efd92] + + * source/encoder/vec/CMakeLists.txt, source/encoder/vec/vec- + primitives.cpp: + cmake: prevent VC9 and VC10 from using SSE4, fixes aligned move + crashes + [8394398ddad3] + + * source/Lib/TLibCommon/TComPrediction.cpp: + nit + [dc7eea8b9b88] + + * source/encoder/vec/macroblock.inc: + macroblock: do not use SSE4 intrinsics for lower instrset builds + [7c65d9710ec4] + + * source/encoder/vec/ipfilter8.inc: + ipfilter: use immediate load-partial, fixes VC9 x64 crashes + [f66f0728d7e1] + + * source/encoder/vec/macroblock.inc: + macroblock: do not use SSE4 intrinsics for lower instrset builds + [3ea4514efde6] + + * source/Lib/TLibCommon/TComPrediction.cpp: + TComPrediction: only allocate extY tmp buffer when necessary, use + xMalloc + [9894aed732d5] + + * build/nmake/make-solutions.bat: + build: drop nmake script - was only a temp hack for ASM support + [ce6d7bc7f18d] + + * source/Lib/TLibEncoder/TEncSearch.h: + nit + [cbf103758b5a] + + * source/x265.cpp: + x265: inline the NAL write functions into CLIOptions + [661d5db35d30] + + * source/x265.cpp: + x265: include assert.h for VLD validation checks + [9479fde37ee3] + + * source/encoder/motion.h: + motion: manually enforce 16 byte alignment for fenc buffer + [577747a625ff] + +2013-05-23 Deepthi + + * source/encoder/vec/macroblock.inc: + Merge + [d749a9fe2c1c] + + * Merge + [8686b9a4e74c] + +2013-05-23 praveen Tiwari + + * Merged multicoreware/xhevc into default + [d329a297cccd] + +2013-05-23 praveentiwari + + * 
source/encoder/vec/macroblock.inc: + Intrinsic code for partialButterflyInverse16 + [824a8e9a6a6f] + +2013-05-23 praveen Tiwari + + * Merged multicoreware/xhevc into default + [6f04eef8255c] + +2013-05-23 praveentiwari + + * source/encoder/vec/macroblock.inc: + Intrinsic code for partialButterflyInverse8 + [44529de64523] + +2013-04-17 praveen Tiwari + + * source/encoder/vec/macroblock.inc: + Merged multicoreware/xhevc into default + [235de855d779] + +2013-05-23 Deepthi Devaki + + * Merge + [31c59ffe9c96] + + * source/encoder/vec/intrapred.inc: + Improvement in Intra Angular 8x8 vector implementation + [d6a113bf815b] + +2013-05-23 Deepthi Devaki Akkoorath + + * Merged multicoreware/xhevc into default + [bf29b1a40bca] + +2013-05-23 Deepthi Devaki + + * source/VectorClass/vectori128.h: + ForceInlined a few more functions in vectori128 + [c7fe807725d5] + + * source/encoder/vec/intrapred.inc, source/test/intrapredharness.cpp: + IntraPredAng 8x8 HIGH_BIT disabled - with special case + implementation for 8 modes. 
+ [9ce6eb8b73ce] + +2013-05-23 Deepthi + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h: + Defining lambda values for Check Best Mode + [3f9b5f17bb56] + +2013-05-23 Steve Borho + + * source/CMakeLists.txt, source/encoder/encoder.cpp, + source/encoder/encoder.h, source/x265.cpp, source/x265.h: + encoder: introduce new cli front-end, to eventually replace x265-cli + + cfg/*, x265main, x265cfg, and x265enc will all go away as soon as + this works + [92e3da6cc72d] + +2013-05-23 Deepthi + + * Merge + [18d3c0a322d2] + + * source/Lib/TLibCommon/TypeDef.h: + Best value of lambda select set to 0.6 + [96bb8955c43e] + +2013-05-22 Steve Borho + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: remove my comments from HM source, they were also wrong + [06fa58c2176d] + + * source/encoder/motion.cpp: + motion: better document search shapes + [28fbc93455c6] + + * source/VectorClass/vectorclass.h, source/VectorClass/vectorf128.h, + source/VectorClass/vectori128.h: + vector: disable floating point vector classes, avoids GCC warnings, + fix EOLN + + vecprimitives.inc:73:1: error: '__m128d selectd(const __m128d&, + const __m128d&, const __m128d&)' conflicts with a previous + declaration + [eea2e5dbd441] + + * source/encoder/vec/intrapred.inc: + intrapred: fix GCC warnings + + intrapred.inc:1340:45: error: conversion from 'Vec16c' to non-scalar + type 'Vec16uc' intrapred.inc:1362:17: error: declaration of 'k' + shadows a previous local + [0d9eeff169b0] + + * source/encoder/vec/macroblock.inc: + macroblock: fix GCC warnings again, perhaps line argument should be + removed? 
+ + macroblock.inc:562:9: error: statement has no effect [-Werror + =unused-value] + [d377908c3b23] + + * source/encoder/encoder.cpp, source/encoder/motion.cpp, + source/encoder/motion.h, source/x265.h, source/x265cfg.cpp: + motion: rename our adapted HM search to "STAR" + [175b3e9e3fe8] + + * Merged in praveentiwari/xhevc_praveent (pull request #161) + + Intrinsic code for partialButterfly8 + [c9c87ffc9b47] + +2013-05-22 praveen Tiwari + + * Merged multicoreware/xhevc into default + [0bb355ace86f] + +2013-05-22 praveentiwari + + * source/encoder/vec/macroblock.inc: + Intrinsic code for partialButterfly8 + [bc4ff1668070] + +2013-05-22 Steve Borho + + * Merged in deepthidevaki/xhevc_deepthid (pull request #160) + + IntraPredAngular 4x4 HIGH_BIT depth enabled + [b322aca2f1da] + +2013-05-22 Deepthi Devaki Akkoorath + + * Merged multicoreware/xhevc into default + [ed34d6c8a225] + +2013-05-22 Deepthi Devaki + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/encoder/IntraPred.cpp, source/encoder/primitives.h, + source/encoder/vec/intrapred.inc, source/test/intrapredharness.cpp: + IntraPredAngular 4x4 HIGH_BIT depth enabled + [12eaf2159b3f] + + * source/encoder/vec/vecprimitives.inc: + Fix back slash in #include + [55094d80f04a] + +2013-05-22 Deepthi Devaki Akkoorath + + * Merged multicoreware/xhevc into default + [bd911514525a] + +2013-05-21 Deepthi Devaki + + * Merge + [8aa73d922542] + + * source/encoder/IntraPred.cpp, source/encoder/vec/intrapred.inc, + source/encoder/vec/vecprimitives.inc, + source/test/intrapredharness.cpp: + xPredIntraAng 4x4 HIGH_BIT disabled + [db8e242c4ad0] + + * source/VectorClass/vectori128.h, source/test/intrapredharness.cpp: + Force inline functions in vectori128.h + [5551a0eb5fbe] + +2013-05-22 Steve Borho + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/motion.cpp, + source/encoder/motion.h: + motion: use square pattern for subpel refinement + [0d8765c5e054] + + * source/encoder/motion.cpp: + motion: allow fpel star 
refinement if two point search finds new + point + [9b50900a78d4] + +2013-05-22 ggopu + + * source/Lib/TLibEncoder/TEncSearch.cpp: + Temporary fix to crash + [0dbbe48f078d] + +2013-05-22 Steve Borho + + * source/encoder/motion.cpp: + motion: use COPY2_IF_LT in subpel refine, simplifies logic + + No behavior change + [a1b974810f40] + + * source/encoder/motion.cpp: + motion: if first subpel iteration found no improvement, skip later + iters + + Should have no coding effect, but should help perf slightly. + [32ee09f3e9b8] + +2013-05-21 Steve Borho + + * source/encoder/motion.cpp: + motion: only adjust subpel search center once per iteration + + Basing the offsets from BMV was allowing the search to wander away + on less fruitful paths. Best to wait for all results to come in + before moving the search center point. + [c0134d7e756d] + +2013-05-22 ShinYee Chung + + * source/encoder/encoder.h: + Encoder: Fix enum accesses. + [2e7af97e815e] + + * source/encoder/motion.cpp: + motion: Fix compile warning/error due to shadow variable in COST_MV + macro. + [d2ff2006533f] + + * source/Lib/TLibEncoder/TEncTop.cpp: + TLibEncoder: Fix compile error due to undeclared INT_MAX. 
+ [53b7e60badcb] + +2013-05-21 Steve Borho + + * source/encoder/common.cpp, source/encoder/encoder.cpp, + source/x265.h, source/x265cfg.cpp, source/x265cfg.h, + source/x265enc.cpp: + api: further trim x265_param_t, remove PCM and rate control and + other bits + + We will add these back later as we optimize and/or adapt these + features + [26d354b832d2] + + * source/encoder/common.cpp, source/encoder/encoder.cpp, + source/x265cfg.cpp: + encoder: plumb in a parse function + [2ac0b485815b] + + * source/Lib/TLibEncoder/TEncCfg.h: + TEncCfg: white-space nit + [38a2194f7420] + + * source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h, source/encoder/encoder.cpp, + source/x265enc.cpp: + encoder: simply encode() function signature, auto-detect frames to + be encoded + [084aaa5e85cc] + + * source/Lib/TLibEncoder/TEncTop.cpp: + TEncTop: simplify destroy method slightly + [2af1b2f39aa0] + +2013-05-21 Min Chen + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h: + cleanup: remove unused pointer from TEncCu + + From 28a48831f9571ff030574c4e28c75e4afc8105f3 Mon Sep 17 00:00:00 + 2001 + --- source/Lib/TLibEncoder/TEncCu.cpp | 3 --- + source/Lib/TLibEncoder/TEncCu.h | 3 --- 2 files changed, 0 + insertions(+), 6 deletions(-) + [c0f39e04c429] + +2013-05-20 Min Chen + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSearch.h, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.h, + source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h: + thread: [BITS CHANGED] split m_pcRDGoOnSbacCoder into own thread + --- source/Lib/TLibEncoder/TEncCu.cpp | 3 ++- + source/Lib/TLibEncoder/TEncCu.h | 2 +- + source/Lib/TLibEncoder/TEncGOP.cpp | 4 ++-- + source/Lib/TLibEncoder/TEncSearch.h | 2 +- + source/Lib/TLibEncoder/TEncSlice.cpp | 7 ++++--- + source/Lib/TLibEncoder/TEncSlice.h | 2 +- + 
source/Lib/TLibEncoder/TEncTop.cpp | 6 +++--- + source/Lib/TLibEncoder/TEncTop.h | 4 ++-- 8 files changed, 16 + insertions(+), 14 deletions(-) + [695f62044e15] + +2013-05-21 Steve Borho + + * source/encoder/encoder.cpp, source/encoder/encoder.h: + encoder: remove some includes only used by new main + [e076db6b561e] + + * source/Lib/TLibEncoder/TEncTop.cpp: + TEncTop: call destroy() on CU encoders before deleting them + + HM class destructors do not necessarily free memory. This fixes some + large leaks. + [c645bcc7795c] + + * source/Lib/TLibEncoder/TEncCfg.h, source/encoder/encoder.cpp, + source/x265enc.cpp: + cruft: remove setFrameSkip/getFrameSkip() from TencCfg + + The encoder should be unaware of file I/O properties. + setFramesToBeEncoded() should also be removed, but the encoder needs + the last frame number in order to handle edge cases at the end of + the encode. + [c19a3ca7761d] + + * source/encoder/encoder.h: + encoder: remove obsolete cruft, assign default level and profile + [efccf29f74ef] + + * source/encoder/encoder.cpp, source/encoder/encoder.h: + encoder: improve defaults to match current behavior, simplify + [0d7c1810b154] + + * source/x265cfg.cpp: + more nits + [3ec9dcc432cf] + + * source/encoder/vec/macroblock.inc: + macroblock: tabs to spaces + [398786d7e65a] + + * source/encoder/vec/macroblock.inc: + Merged in praveentiwari/xhevc_praveent (pull request #159) + + Intrinsic code for partialButterflyInverse4 + [4a43a67aab95] + +2013-05-21 praveentiwari + + * source/encoder/vec/macroblock.inc: + Intrinsic code for partialButterflyInverse4 + [8a115d37a05a] + + * source/CMakeLists.txt, source/encoder/vec/macroblock.inc, + source/test/filterharness.cpp, source/test/mbdstharness.cpp, + source/test/testbench.cpp: + Backed out changeset: 6a5d35fd39fb + [ff8e92d7c095] + + * source/CMakeLists.txt, source/encoder/vec/macroblock.inc, + source/test/filterharness.cpp, source/test/mbdstharness.cpp, + source/test/testbench.cpp: + Intrinsic code for 
partialButterflyInverse4 + [6a5d35fd39fb] + +2013-05-21 Steve Borho + + * source/encoder/motion.cpp: + motion: do not check candidate MVs for HM style search + + No compelling reason for this, except to help debug differences + [21098423e1aa] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: disable SAD subsampling while we debug ME + + It's not clear we ever want to turn this back on, at least before + we've added subsampling to x265 ME + [b05a30649d2e] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: give the real MVP to x265 ME, not just zero + + This is the bug that has been preventing UMH from working as + designed + + DIA: 461.0320 31.4553 36.0712 36.1111 HEX: 457.8880 31.4728 36.0830 + 36.1282 UMH: 457.3520 31.4723 36.1079 36.1460 HM: 458.6560 31.4827 + 36.1382 36.1273 ORIG: 456.0960 31.5091 36.1377 36.1228 + + Our HM version still has a bug; need to find it + [d011d9380876] + + * source/encoder/motion.cpp: + motion: use MV 0 prediction as search start if cheaper than MVP + [5957dfe39429] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: use actual enum for selecting x265 ME routine + [94edee85036a] + + * source/encoder/motion.cpp: + motion: remove hard-coded mode selection + [d96750bcb143] + + * source/encoder/motion.cpp, source/encoder/motion.h: + motion: fix bugs in my changes, add relevant comments for future + work + [88caee6ef7c3] + + * source/encoder/motion.cpp, source/encoder/motion.h, + source/encoder/mv.h: + motion: simplify HM search method code + [b58f7f60ae7c] + + * source/encoder/common.cpp, source/x265.h: + api: add original HM search as valid configurable option + [4a4f55e067b9] + + * Merged in ggopu/ggopu_xhevc (pull request #158) + + Implemented the HM ME to Motion.cpp + [3301bc78e1ce] + +2013-05-21 ggopu + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/motion.cpp, + source/encoder/motion.h, source/encoder/mv.h: + Implemented the HM ME to Motion.cpp + [e2e81e4f914c] + + * 
source/Lib/TLibEncoder/TEncSearch.cpp: + Merge + [f13d5c93b30e] + + * source/encoder/motion.cpp, source/encoder/motion.h, + source/encoder/mv.h: + Implemented HM ME to Motion.cpp + [453365fdf903] + +2013-05-21 Steve Borho + + * source/encoder/encoder.cpp: + encoder: fix another shadowed variable warning + [5ad7c41ac970] + + * source/encoder/CMakeLists.txt, source/encoder/encoder.cpp: + encoder: fix GCC compiler warnings + [b515463b42cd] + + * cfg/encoder_I_15P.cfg, cfg/encoder_all_I.cfg, + source/Lib/TLibCommon/TComSlice.h, source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncTop.cpp, source/encoder/common.cpp, + source/encoder/encoder.cpp, source/x265.h, source/x265cfg.cpp, + source/x265enc.cpp: + api: rename enableAMPRefine to useRectInter + + This configurable was mis-named. It has nothing to do with AMP, it + is disabling the use of rectangular inter prediction modes (which + has the side-effect of also disabling AMP since AMP is not checked + if the rectangular mode of similar shape was not checked). + + This configurable should never have been in TComSlice since the flag + is not signaled in the slice header like AMP activation is. 
+ [ab23a9399bcb] + + * source/encoder/vec/macroblock.inc: + macroblock: fix GCC compiler warning + + macroblock.inc: In function 'void + {anonymous}::partialButterfly4(short int*, short int*, int, int)': + macroblock.inc:1326:12: error: parameter 'line' set but not used + [-Werror=unused-but-set-parameter] + [587dd16e4001] + +2013-05-21 praveen Tiwari + + * Merged multicoreware/xhevc into default + [5d2edf07bd81] + +2013-05-21 praveentiwari + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/encoder/vec/macroblock.inc: + Fixed alignment issue for partialButterfly32 + [8e42a9907241] + +2013-05-21 praveen Tiwari + + * Merged multicoreware/xhevc into default + [f89ac88ad652] + +2013-05-21 praveentiwari + + * source/encoder/vec/macroblock.inc: + Fixed encoder crash for partialButterfly32 + [c175df4a600f] + + * source/encoder/vec/macroblock.inc: + Code cleanup for partialButterfly4 + [f110e2eac973] + +2013-05-21 https://mandarmcw + + * source/TestForChecking_BadCommit.bat: + Test script to find bad commit + [55fd15776acf] + +2013-05-21 Steve Borho + + * source/encoder/CMakeLists.txt, source/encoder/encoder.cpp, + source/encoder/encoder.h: + api: new encoder class and main function + [b4b06a6e89cd] + + * source/encoder/common.cpp, source/x265cfg.cpp: + api: use new methods to validate and print public params (CHANGES + LOGGING) + + This changes the look of the output log, but should not change any + output bits + [6cb27c8e9011] + + * source/encoder/common.cpp, source/encoder/common.h: + common: add new methods for validating and printing the public + params + [3f0ece48b1f7] + + * source/x265.h, source/x265cfg.cpp: + api: make iMaxCuDQPDepth a uint32_t, like the other depth settings + [e8acc21708cb] + +2013-05-20 Steve Borho + + * source/x265cfg.cpp: + fix HIGH_BIT_DEPTH=1 build + [6d47f9979e5c] + + * source/Lib/TLibCommon/ContextModel3DBuffer.h, + source/Lib/TLibCommon/TComPrediction.cpp: + minor nit fixes, no behavior changes + [17d10d96b071] + + * 
source/encoder/common.cpp, source/x265.h, source/x265cfg.cpp, + source/x265enc.cpp: + api: remove m_ prefix from x265_param_t data members + [bb2c9bc68a9e] + +2013-05-21 ShinYee Chung + + * source/encoder/vec/macroblock.inc: + Vec: Fix compile warnings/errors due to unused typedef. + [d062def382de] + +2013-05-20 Steve Borho + + * source/x265cfg.cpp: + cfg: simplify log header a bit + [06f883384489] + + * cfg/encoder_I_15P.cfg, cfg/encoder_all_I.cfg, + source/encoder/common.cpp, source/x265.h, source/x265cfg.cpp, + source/x265enc.cpp: + cfg: replace MaxCUWidth, MaxCUHeight with single MaxCUSize + [fe13c968c8c1] + + * source/x265cfg.cpp: + x265cfg: remove redundant ME name array, white-space cleanups + [8f33ed829287] + + * source/x265cfg.cpp: + x265cfg: move the default output filename to a better location + [5bd50e2c6ad3] + + * source/encoder/bitcost.cpp, source/encoder/bitcost.h: + bitcost: match BitCost::bitcost() to TComRdCost:getCost() [CHANGES + OUTPUTS] + + The HM counts MVD bits in integer, while we were using log2n() and + keeping floats. Our MV costs were smoother across the range of MV's + but this is a false accuracy. Encoding cost is rounded to nearest + bit. This change improves compression efficiency. + [bcf310907041] + + * source/x265main.cpp: + main: minor nit cleanups + [596328140dfc] + + * source/encoder/vec/macroblock.inc, source/x265.h: + fix GCC compile warnings + [4d682b12d184] + +2013-05-20 praveentiwari + + * source/encoder/vec/macroblock.inc: + 32bit build issue fix for partialButterfly4 intrinsic function. + Further optimisation. 
+ [e90a68b8cdd8] + +2013-05-20 sumalatha + + * source/tools/performanceProfiler/Profiler.bat, + source/tools/performanceProfiler/Readme.txt, + source/tools/performanceProfiler/config.txt, + source/tools/performanceProfiler/performanceProfiler.bat: + modified the scripts files to take the application's(.exe) from the + relative path to the curent directory + [215c0e0beda0] + +2013-05-20 Steve Borho + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: use x265's optimized bitcost() method for xCheckBestMVP + + Minor performance improvement + [36642a6ac4c6] + +2013-05-20 Deepthi Devaki + + * source/Lib/TLibCommon/TComPrediction.cpp: + Fix for compilation error in HIGH_BIT_DEPTH enabled + [e1ce48613e7f] + +2013-05-19 Steve Borho + + * source/encoder/common.cpp, source/x265.h, source/x265cfg.cpp, + source/x265cfg.h: + api: further work triming x265_param_t and implementing public API + [42d2f5b65c7c] + + * source/x265cfg.cpp: + x265cfg: white-space nits + [0ba7cbf85049] + + * source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuv.h, + source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h, source/encoder/common.cpp, + source/input/input.h, source/input/y4m.cpp, source/input/y4m.h, + source/input/yuv.cpp, source/input/yuv.h, source/output/output.h, + source/output/y4m.cpp, source/output/y4m.h, source/output/yuv.cpp, + source/output/yuv.h, source/x265.h, source/x265cfg.h, + source/x265enc.cpp: + api: rename some public structures to follow x264 style, hide some + params + + If I have no idea how a param is configured or used, I removed it + from the public data structure. 
+ [c8f27268f3c1] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: fix comments in xTZ8PointDiamondSearch + + Removes one extraneaous if() level by chaining together if() else + if() else + [ff379e5854bf] + + * source/encoder/motion.cpp: + Backout 1545:eaaa4edde516 (tested worse in regression suite) + [2fbe276bd35f] + + * source/encoder/motion.cpp: + motion: fix GCC shadow variable warning (tmv is used inside COST_MV) + [27b17a18367d] + + * source/encoder/motion.cpp: + motion: start fpel search at best predictor + + HM and x264 both do this, but has uncertain results. Will test in + regression suite with more videos + [eaaa4edde516] + + * source/encoder/motion.cpp: + motion: use roundToFPel() when converting clipped MVP to fpel + + x264 does this, and it seems to help compression slightly. + [63af725da724] + + * source/encoder/motion.cpp, source/encoder/mv.h: + motion: only remeasure bmv at fpel if pmv had subpel fractions + [d465b6f04260] + + * source/encoder/motion.cpp: + motion: remove omv=bmv assignment that x264 UMH does not do + [e343fdb42a15] + + * source/encoder/motion.cpp: + motion: avoid rechecking bestpre + [2b88c1e2bc08] + + * source/encoder/motion.cpp: + motion: improve UMH comments, add a TODO for further analysis + [fd94f23e7828] + + * source/encoder/motion.cpp: + motion: do not check candidates equal to MVP, clarify prep code + [253d81c1a1e5] + + * source/encoder/motion.cpp: + motion: move warning disable to top of file + [21ae49204eb1] + + * source/encoder/motion.cpp: + motion: use x265.h enums for motion search methods + [ae6e0fbfabab] + + * source/encoder/motion.cpp: + motion: nits + [96a835f01afa] + + * source/encoder/motion.cpp, source/encoder/mv.h: + mv: explicit notZero() method + [dce5c8184afc] + +2013-05-19 Min Chen + + * source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.h, + source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h: + thread: modify m_pcCuEncoder to every thread + --- 
source/Lib/TLibEncoder/TEncSlice.cpp | 22 +++++++++++----------- + source/Lib/TLibEncoder/TEncSlice.h | 4 ++-- + source/Lib/TLibEncoder/TEncTop.cpp | 8 +++++--- + source/Lib/TLibEncoder/TEncTop.h | 4 ++-- 4 files changed, 20 + insertions(+), 18 deletions(-) + [8448f800200b] + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.h, + source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h: + thread: modify m_pcPredSearch to every thread + --- source/Lib/TLibEncoder/TEncCu.cpp | 2 +- + source/Lib/TLibEncoder/TEncCu.h | 2 ++ + source/Lib/TLibEncoder/TEncSearch.cpp | 6 ++---- + source/Lib/TLibEncoder/TEncSearch.h | 2 -- + source/Lib/TLibEncoder/TEncSlice.cpp | 10 ++++++++-- + source/Lib/TLibEncoder/TEncSlice.h | 2 +- + source/Lib/TLibEncoder/TEncTop.cpp | 8 +++++++- + source/Lib/TLibEncoder/TEncTop.h | 7 +++++-- 8 files changed, 26 + insertions(+), 13 deletions(-) + [74d3dbbf08fc] + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h, + source/Lib/TLibEncoder/TEncSearch.h, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.h, + source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h: + thread: modify m_pcEntropyCoder and fix bug in HM when WPP + --- source/Lib/TLibEncoder/TEncCu.cpp | 3 +- + source/Lib/TLibEncoder/TEncCu.h | 1 + + source/Lib/TLibEncoder/TEncGOP.cpp | 122 + +++++++++++++++++----------------- source/Lib/TLibEncoder/TEncGOP.h + | 2 +- source/Lib/TLibEncoder/TEncSearch.h | 2 + + source/Lib/TLibEncoder/TEncSlice.cpp | 39 +++++++----- + source/Lib/TLibEncoder/TEncSlice.h | 2 +- + source/Lib/TLibEncoder/TEncTop.cpp | 6 ++- + source/Lib/TLibEncoder/TEncTop.h | 5 +- 9 files changed, 98 + insertions(+), 84 deletions(-) + [079de58deffd] + 
+2013-05-19 Steve Borho + + * source/encoder/motion.cpp: + motion: do two subpel iterations for hpel and qpel + + This will find corner vectors, where previous simple DIA did not. + This catches us up quite a bit: + + DIA: 467.8400 31.3870 36.0379 36.0973 HEX: 466.5200 31.3955 36.0604 + 36.1248 UMH: 463.1120 31.4415 36.1199 36.1584 HM: 456.4880 31.4984 + 36.1100 36.1101 + [bbb7982882ba] + +2013-05-18 Steve Borho + + * source/encoder/motion.cpp: + motion: use ints for temp vars + + Using sub-word integers is usually a perf loss, and opens you up to + wierd overflow bugs. + [4db7ec5efb78] + + * source/encoder/motion.cpp: + motion: minor cleanups, no effects + [ebf5fd8f484d] + + * source/encoder/motion.cpp, source/encoder/mv.h: + motion: convert hex4 to MV, use tmv for checkRange within hex grid + loop + + x264 checks tmv there (mv being tested), not bmv (current best mv) + [1bb2e323b6fb] + +2013-05-18 Min Chen + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSearch.h, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.h, + source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h: + thread: [NEED VERIFY] modify logic and remove unused code for + m_pppcRDSbacCoder Conflicts: + + source/Lib/TLibEncoder/TEncTop.cpp + --- source/Lib/TLibEncoder/TEncCu.cpp | 5 +++- + source/Lib/TLibEncoder/TEncCu.h | 1 + + source/Lib/TLibEncoder/TEncGOP.cpp | 3 +- + source/Lib/TLibEncoder/TEncSearch.h | 1 + + source/Lib/TLibEncoder/TEncSlice.cpp | 9 ++---- + source/Lib/TLibEncoder/TEncSlice.h | 1 - + source/Lib/TLibEncoder/TEncTop.cpp | 46 + +++------------------------------ source/Lib/TLibEncoder/TEncTop.h | + 6 +--- 8 files changed, 17 insertions(+), 55 deletions(-) + [4817a9ff9832] + +2013-05-18 Steve Borho + + * source/encoder/motion.cpp: + motion: adapt "full MB size" check to HEVC CTU + + This had no measurable effect, but seemed more correct + 
[d5f187bb6d44] + + * source/encoder/motion.cpp: + motion: fix arguments to CROSS macro in UMH + [b5947f8c979e] + + * source/encoder/motion.cpp: + motion: remove SIMPLE_HEX, the outputs are now the same + [0304c492eb5d] + + * source/encoder/x86/x86util.asm: + asm: fix FENC_STRIDE to HEVC size + [b333105c3ae1] + + * source/encoder/motion.cpp: + motion: remove BITS_MVD again (was resurrected during a merge) + [cb15920370ca] + + * source/encoder/motion.cpp: + motion: enable SIMPLE_HEX by default, until the "fast" path is + debugged + [b243347757d4] + +2013-05-18 Min Chen + + * source/Lib/TLibEncoder/TEncSlice.cpp: + thread: avoid reference to m_pppcRDSbacCoder[0][CI_CURR_BEST] + --- source/Lib/TLibEncoder/TEncSlice.cpp | 4 ++-- 1 files changed, 2 + insertions(+), 2 deletions(-) + [afb15c17b9c0] + + * source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.h: + thread: remove unused code for m_pcBufferBinCoderCABACs + --- source/Lib/TLibEncoder/TEncSlice.cpp | 12 ++---------- + source/Lib/TLibEncoder/TEncSlice.h | 1 - 2 files changed, 2 + insertions(+), 11 deletions(-) + [9a0a53b12609] + + * source/Lib/TLibEncoder/TEncSlice.cpp: + thread: move pppcRDSbacCoder into loop + --- source/Lib/TLibEncoder/TEncSlice.cpp | 8 ++++---- 1 files + changed, 4 insertions(+), 4 deletions(-) + [9bf8a39b1801] + + * source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.h: + cleanup: remove unused code for pcBufferLowLatSbacCoders + --- source/Lib/TLibEncoder/TEncSlice.cpp | 15 --------------- + source/Lib/TLibEncoder/TEncSlice.h | 2 -- 2 files changed, 0 + insertions(+), 17 deletions(-) + [798cdd336099] + + * source/Lib/TLibEncoder/TEncCu.cpp: + cleanup: remove unused code + --- source/Lib/TLibEncoder/TEncCu.cpp | 17 ----------------- 1 files + changed, 0 insertions(+), 17 deletions(-) + [659df28d9850] + +2013-05-18 Steve Borho + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + TEncSearch: remove m_fencbuf and SAD 
primitive use from HM ME + + This fixes BIDIR and weighted P prediction + [76c5cc558878] + + * cfg/encoder_all_I.cfg: + encoder_all_I: nit cleanups + [103828d5f1b0] + + * cfg/encoder_I_15P.cfg, cfg/encoder_all_I.cfg, + source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, + source/Lib/TLibEncoder/TEncTop.cpp, source/encoder/motion.cpp, + source/encoder/motion.h, source/x265.h, source/x265cfg.cpp, + source/x265enc.cpp: + replace FastSearch configurable with motion search method + [b24d4f0d8a6e] + + * source/encoder/vec/macroblock.inc: + macroblock: disable partialButterfly4, it is failing in the + testbench + [c407651a52a4] + + * source/encoder/motion.cpp: + Merge + [29b3933dcf27] + + * source/Lib/TLibCommon/TComDataCU.cpp: + TComDataCU: fix GCC compile warnings + [0ee4ca22aa1f] + + * source/encoder/motion.cpp: + motion: fix GCC compile warnings + [7260a1fdc7a7] + + * source/encoder/IntraPred.cpp: + intrapred: fix include path again + [701bc030c157] + + * source/encoder/IntraPred.cpp, source/encoder/primitives.h: + IntraPred: do not use pointer reference + + 1 - this is a C++ only construct, and would make ASM opts very + difficult 2 - the pointer value being passed was a temp rvalue, not + modifiable + [c4459134a244] + +2013-05-18 Deepthi + + * source/encoder/vec/macroblock.inc: + Disabling 4 and 32 butterfly intrinsic implementations + [83b3c02ec36a] + + * source/Lib/TLibCommon/TComPrediction.cpp: + Remove xPredIntraAngRefBuf from HM - they are now part of + x265::primitives + [4bb8150df674] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + Merge + [393536c14d6f] + +2013-05-17 Deepthi Devaki Akkoorath + + * Merged multicoreware/xhevc into default + [33bb25b0bf5f] + +2013-05-16 Deepthi Devaki + + * Merge + [6f0ad2ce8510] + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/encoder/IntraPred.cpp, source/encoder/primitives.h, + source/test/intrapredharness.cpp, source/test/intrapredharness.h: + Unit test 
for Intra Angular primitives + [bd9bc0bc3759] + +2013-05-16 Deepthi Devaki Akkoorath + + * Merged multicoreware/xhevc into default + [8e9356e3cf6e] + +2013-05-16 Deepthi Devaki + + * Merge + [b48ae2baa364] + + * source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComPattern.h, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibEncoder/TEncSearch.cpp: + Redesigned intraprediction for angular modes to use buffered + references + [048211fdc5f0] + +2013-05-18 Steve Borho + + * source/encoder/motion.cpp: + motion: make UMH the default (this needs to be a param) + [d8c379d73e86] + + * source/encoder/motion.cpp: + motion: make "optimized" HEX the default now + [bc7cae4f28e8] + + * source/encoder/motion.cpp: + motion: remove unused BITS_MVD macro + [cf973ab501ae] + + * source/encoder/motion.cpp: + motion: measure MV cost in qpel in COST_MV + [fbdfdd9f710d] + +2013-05-17 Steve Borho + + * source/encoder/motion.cpp: + motion: simplify COST_MV_X4 + [5850f4fb7e1e] + + * source/encoder/motion.cpp: + motion: add range check for subpel refine + [e8c864e6301e] + + * source/encoder/motion.cpp: + motion: cleanup hex and square offset arrays + [1c8d80939de3] + + * source/encoder/motion.cpp: + motion: add default case to switch statement + [b46bc231b6a1] + + * source/encoder/motion.cpp: + motion: make HEX the default for the moment + [d15f3b52a98c] + + * source/encoder/mv.h: + mv: add a helpful comment + [346ce779ff9a] + + * source/encoder/motion.cpp: + motion: fix obvious bug in COST_MV(). HEX mode now looks ok + + This is a quality/perf progression you would expect for DIA and HEX + + Kbps Y PSNR U PSNR V PSNR elapsed time DIA: 499.9280 31.1502 + 35.8939 35.9815 27sec HEX: 474.3600 31.3310 36.0312 36.1283 28sec + HM: 456.4880 31.4984 36.1100 36.1101 30sec + + UMH is worse than HEX, better than DIA. Which means there are still + bugs. 
+ + UMH: 481.9600 31.2575 36.0621 36.0658 28sec + + The "optimized" HEX path is also broken (worse numbers than these) + [3179771dfccc] + + * source/encoder/motion.cpp: + motion: add descriptive #if 0 from x264 + + It is useful because it is a simpler version of hex that can be more + easily debugged, and the later version should match the outputs + [96d7eed717e0] + + * source/encoder/motion.cpp: + motion: white-space nit, no code change + [cf80adc4e35c] + + * source/encoder/motion.cpp: + motion: remove redundant code + [3ec960515133] + + * source/encoder/motion.cpp: + motion: just a single stride var is necessary + [6de0d4f2f5a9] + + * source/encoder/motion.cpp: + motion: use bmv.checkRange() + [04923c4dc22a] + + * source/encoder/motion.cpp: + motion: take advantage of MV union + [55427f2a6fbc] + + * source/encoder/motion.cpp: + motion: include assert.h + [7e83eb3c796e] + + * source/encoder/motion.cpp: + motion: use MV copy constructor some more + [08c0e5f843e7] + + * source/encoder/motion.cpp: + motion: HEVC does not have 4x4 inter partitions + [fd313e990e4b] + + * source/encoder/motion.cpp: + motion: pmv needs to be a full-pel motion vector + [d7cd89c7d662] + + * source/encoder/motion.cpp: + motion: use MV copy constructor + [494e1c059f10] + + * source/encoder/motion.cpp: + motion: remove dup variable i_me_range + [8e5588e6b2a7] + + * source/encoder/motion.cpp: + motion: cleanup + [e2727ec025ac] + + * source/encoder/motion.cpp: + motion: remove unnecessary dup variables + [01ee1bc52dea] + + * source/encoder/motion.cpp, source/encoder/motion.h: + motion: save the partition enum for the current PU + [ae2c9e4d80fe] + + * source/encoder/motion.cpp: + motion: turn size_scale the right way around + [962787f0f8a3] + + * source/encoder/motion.cpp: + motion: remove cruft + [fb19468bde57] + + * source/encoder/motion.cpp: + motion: fix obvious bug in x265_predictor_difference + [5290730cc3b2] + + * source/encoder/motion.cpp: + cleanups, no behavior change + [98b7107debdf] 
+ + * source/encoder/common.cpp, source/encoder/common.h, + source/encoder/motion.cpp: + motion: move common macros to common.h + [6e0a819ee02b] + + * source/encoder/motion.cpp, source/encoder/vec/macroblock.inc, + source/x265cfg.cpp: + fix GCC compilation warnings + [807d04c7511c] + + * Merged in sumalatha/xhevc_sumalatha (pull request #148) + + fix for decoder crash in xcompresscu() [early_abort optimization] + [28523f0f4149] + +2013-05-17 sumalatha + + * source/Lib/TLibEncoder/TEncCu.cpp: + fix for removing the decoder crash in xcompressCU() - early + partition optimization + [0122fe9ae44c] + +2013-05-17 Sumalatha Polureddy + + * build/dr_psnr_script/TAppDecoder.exe, + build/dr_psnr_script/psnr_script.bat, + source/Lib/TLibEncoder/TEncCu.cpp: + Merged multicoreware/xhevc into default + [ee89354b0898] + +2013-05-17 sumalatha + + * source/Lib/TLibEncoder/TEncCu.cpp: + Backed out changeset: 78976eb7f3fa + [0ad65cd9d7b4] + + * source/Lib/TLibEncoder/TEncCu.cpp: + fix for decoder crash in xcompresscu() {early_abort optimization] + [78976eb7f3fa] + +2013-05-17 Steve Borho + + * source/encoder/vec/macroblock.inc: + partialButterfly4 does not compile on 32bit x86 + + _mm_cvtsi64_si128 does not exist on 32bit systems + [608559903526] + +2013-05-17 praveen Tiwari + + * build/dr_psnr_script/TAppDecoder.exe, + build/dr_psnr_script/psnr_script.bat: + Merged multicoreware/xhevc into default + [a501a1c26fd9] + +2013-05-17 praveentiwari + + * source/encoder/vec/macroblock.inc: + intrinsic code for partialButterfly32 + [d14c341119ff] + +2013-05-17 Deepthi + + * .hgignore, build/dr_psnr_script/TAppDecoder.exe, + build/dr_psnr_script/psnr_script.bat, source/tools/HM + decoder/TAppDecoder.exe: + Relocating HM decoder to tools. 
Removing build/dr_psnr_script + [71005e0c4e43] + + * Merge + [c5c192a1e47f] + +2013-05-17 Sumalatha Polureddy + + * Merged multicoreware/xhevc into default + [8718c6433394] + +2013-05-17 sumalatha + + * source/tools/performanceProfiler/Profiler.bat, + source/tools/performanceProfiler/Readme.txt, + source/tools/performanceProfiler/config.txt, + source/tools/performanceProfiler/performanceProfiler.bat: + Modified the testing scripts for running both yuv files and y4m + files also + [d37656a0c5ca] + +2013-05-17 Deepthi + + * source/Lib/TLibCommon/TypeDef.h, source/Lib/TLibEncoder/TEncCu.cpp: + More refinement of early abort conditions + [67bf72b8a743] + + * Merge + [9e411b77d861] + + * source/Lib/TLibEncoder/TEncCu.cpp: + refining early abort + [404f0e8cd3e6] + + * source/Lib/TLibEncoder/TEncCu.cpp: + fixing the decoder crash + [539b0fbd82fe] + + * source/Lib/TLibEncoder/TEncCu.cpp: + remove PCM for now + [519dd09ead40] + + * source/Lib/TLibEncoder/TEncCu.cpp: + Remove AMP for now from EARLY_ABORT mode decision + [53ace900d5ba] + +2013-05-17 Steve Borho + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/motion.cpp: + TEncSearch: pass neighbor MVs to motionEstimate (UMH seems to + require them) + [a0e0b589aa92] + +2013-05-17 ggopu + + * source/encoder/motion.cpp: + Motion Estimation : UMH implementation stage 1 + [94746841ea03] + +2013-05-16 Gopu G + + * Merged multicoreware/xhevc into default + [526f1f214b82] + + * Merged multicoreware/xhevc into default + [197dcdbb270f] + +2013-05-14 Gopu G + + * source/test/timer.cpp: + Merged multicoreware/xhevc into default + [dde8e0582a9b] + +2013-05-13 Gopu G + + * Merged multicoreware/xhevc into default + [2b1a79979ce5] + +2013-05-13 ggopu + + * source/encoder/motion.cpp: + Motion Estimation : UMH implementation stage 1 + [a599997fbd20] + +2013-05-16 Steve Borho + + * source/x265main.cpp: + vld: add assert for zero leaks (DEBUG only) + [2c3c0dfb5f83] + + * source/x265main.cpp: + x265main: explicit allocation and 
deallocation of TAppEncTop + [97d5b00ab35a] + + * source/Lib/TLibCommon/TComPicYuv.cpp: + TComPicYuv: TComPicYuv are re-used, check before allocating subpel + buffers + [c98229f4a65d] + + * source/x265cfg.cpp, source/x265cfg.h: + x265cfg: release thread pool handle at exit + [b0919ee9a22e] + + * source/x265main.cpp: + x265main: ask VLD to output the leak log to the debugger output + console + + This way the leak report survives after the run. + [cadb1c11d2ef] + + * source/CMakeLists.txt, source/cmake/FindVLD.cmake, + source/x265main.cpp: + cmake: add detection and use of Visual Leak Detector + [70519a482e47] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: disable printing of per-frame encode time, prune unused + lists + + This just makes the frame logs non-deterministic. + [90aa43526097] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: fix EOLN + [297851df497a] + + * build/linux/batch.py: + write batch outputs into a folder + [603fef620960] + + * build/linux/batch.py: + add MD5 hashes to batch run output log + [095771f6dbaa] + + * cfg/encoder_I_15P.cfg, cfg/encoder_all_I.cfg, source/x265cfg.cpp: + remove filenames from configuration files + + Hard-code the default filename for an output stream. Default to no + recon output file if none is specified on the CLI. + [b6b24bd0e966] + +2013-05-16 Deepthi + + * source/Lib/TLibEncoder/TEncCu.cpp: + Preserving the original xcompressCU implementation until the + EARLY_PARTITION_DECISION mode has been fully debugged. + [d246be5c803a] + +2013-05-16 Steve Borho + + * source/tools/CMakeLists.txt: + backout change to source/tools/CMakeLists.txt + + The performanceProfiler/ folder has no CMakeLists.txt within it. 
+ [6215d24d1f59] + +2013-05-16 sumalatha + + * source/tools/CMakeLists.txt, + source/tools/performanceProfiler/Profiler.bat, + source/tools/performanceProfiler/Readme.txt, + source/tools/performanceProfiler/config.txt, + source/tools/performanceProfiler/performanceProfiler.bat: + Added the scripts for validating the encoded bitstream(running HM + decoder) and measuring the the performance like PSNR values, + Timetaken for encoing and the bytes written in encoded bitstream. + For running the scripts, use Readme.txt + [56ba4fb6b574] + +2013-05-16 Deepthi + + * Merge + [2f469b1dc95c] + + * source/Lib/TLibEncoder/TEncCu.cpp: + Cleanup + [d258fac8d99d] + + * source/Lib/TLibEncoder/TEncCu.cpp: + AMP after 2nxn, nx2n, if at all + [091ddf1c5724] + + * source/Lib/TLibEncoder/TEncCu.cpp: + 2nxn and nx2n comparisons only if nxn mode better than 2nx2n + [e24d7b31af7d] + + * source/Lib/TLibEncoder/TEncCu.cpp: + Cleanup; remove erroneous paths + [0b8a18bc5914] + + * source/Lib/TLibEncoder/TEncCu.cpp: + merge error + [30df17dc5923] + + * source/Lib/TLibEncoder/TEncCu.cpp: + Initial version of compare 2nx2n with sum of nxn costs + [7b58edb59bae] + + * source/Lib/TLibEncoder/TEncCu.cpp: + Getting rid of EARLY_PARITION_DECISION macro (temporarily) + [079ddf89b5d5] + + * source/Lib/TLibEncoder/TEncCu.cpp: + Init Sub CUs to prevent crash when early detection is OFF + [db9215642791] + + * source/Lib/TLibEncoder/TEncCu.cpp: + Scalar expansion for subCUs + [034adf2f6077] + +2013-05-15 Deepthi + + * source/Lib/TLibEncoder/TEncCu.cpp: + Tab fixes + [e1300ac7706a] + +2013-05-16 Steve Borho + + * build/linux/batch.py: + add a batch launch script for litespeed server + [b30e0de1a6ee] + +2013-05-15 Steve Borho + + * source/encoder/x86/asm-primitives.cpp: + asm: fix a bug found by x86-64 linux and GDB + [ecf99ffdc0c6] + + * source/encoder/vec/vecprimitives.inc: + vec: GCC needs smmintrin.h for some intrinsics we use in + partialButterfly4 + [5f71cc4e88e4] + + * source/VectorClass/vectori128.h: 
+ vector: use correct build define for fromUint64 + [34a4b0cbd857] + + * source/CMakeLists.txt: + cmake: reverse yasm version logic, so all versions less than 1.2.0 + rejected + + previous logic was allowing 1.1.0.X + [5796f6b06a15] + +2013-05-15 Min Chen + + * source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.h: + thread: [NEED VERIFY] modify logic about m_pcBufferSbacCoders, from + [sliceID] to [wpp_line] + --- source/Lib/TLibEncoder/TEncSlice.cpp | 21 +++++++++++---------- + source/Lib/TLibEncoder/TEncSlice.h | 2 +- 2 files changed, 12 + insertions(+), 11 deletions(-) + [8328dc3293fb] + + * source/Lib/TLibEncoder/TEncSlice.cpp: + thread: [NEED VERIFY] simplify logic about WPP SBAC context manage + --- source/Lib/TLibEncoder/TEncSlice.cpp | 24 + ++++++------------------ 1 files changed, 6 insertions(+), 18 + deletions(-) + [7e8190b7ce6b] + +2013-05-15 Steve Borho + + * source/CMakeLists.txt, source/cmake/FindYasm.cmake, + source/cmake/version.cmake: + cmake: detect and enforce minimal YASM version + + x264 assembly code requires YASM 1.2.0 (for AVX2, etc) + [38d4e63a1d87] + +2013-05-15 Min Chen + + * source/Lib/TLibEncoder/TEncSlice.cpp: + thread: remove reduce call to m_pcCfg->getWaveFrontsynchro() + --- source/Lib/TLibEncoder/TEncSlice.cpp | 3 ++- 1 files changed, 2 + insertions(+), 1 deletions(-) + [71af0137057f] + + * source/Lib/TLibEncoder/TEncSlice.cpp: + thread: move define of uiCUAddr into loop + --- source/Lib/TLibEncoder/TEncSlice.cpp | 4 +--- 1 files changed, 1 + insertions(+), 3 deletions(-) + [7b7e8aa6e785] + + * source/Lib/TLibEncoder/TEncSlice.cpp: + thread: simplify uiSubStrm + --- source/Lib/TLibEncoder/TEncSlice.cpp | 4 ++-- 1 files changed, 2 + insertions(+), 2 deletions(-) + [de144fc9cc62] + +2013-05-15 Steve Borho + + * source/test/intrapredharness.cpp, source/test/testbench.cpp: + testbench: GCC compilation fixes + [2801d32f7a03] + + * source/cmake/version.cmake: + cmake: avoid use of HG find_package + [b04ee03ff736] + 
+ * source/encoder/common.cpp: + GCC compilation fix + [1fb1106c2e4b] + + * source/encoder/vec/macroblock.inc: + macroblock: enable partialButterfly4 for X86_64 builds only + + The primitive does not compile on x86 + [21836779c4fd] + +2013-05-15 Deepthi + + * source/Lib/TLibEncoder/TEncCfg.h, source/Lib/TLibEncoder/TEncCu.cpp, + source/test/timer.cpp, source/x265.h, source/x265cfg.cpp, + source/x265enc.cpp: + Merge + [fe8ccdde67f6] + + * source/Lib/TLibEncoder/TEncCfg.h, source/Lib/TLibEncoder/TEncCu.cpp, + source/x265.h, source/x265cfg.cpp, source/x265enc.cpp: + Early CU option removed. + [e7ab5425ab23] + + * source/Lib/TLibEncoder/TEncCu.cpp: + More useless skip control variables removed. + [ba4bd338e345] + + * source/Lib/TLibCommon/CommonDef.h, + source/Lib/TLibEncoder/TEncCu.cpp: + Removing extraneous SKIP macros. A more relevant Skip threshold + macro can be added later. + [72079ab73e34] + +2013-05-15 praveentiwari + + * source/encoder/macroblock.cpp: + C primitive for partialButterfly4 + [ae0b9dbfa305] + +2013-05-14 Min Chen + + * source/VectorClass/vectori128.h: + vector128i: fix bug report by Shazeb, it don't broad to high element + --- source/VectorClass/vectori128.h | 13 +++++++++---- 1 files + changed, 9 insertions(+), 4 deletions(-) + [285509a3c2b1] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + cleanup: remove code for loop + --- source/Lib/TLibEncoder/TEncGOP.cpp | 27 + ++++----------------------- 1 files changed, 4 insertions(+), 23 + deletions(-) + [175911f4f3ab] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.h: + cleanup: remove code for boundingCUAddr + --- source/Lib/TLibEncoder/TEncGOP.cpp | 3 +-- + source/Lib/TLibEncoder/TEncSlice.cpp | 20 ++++++-------------- + source/Lib/TLibEncoder/TEncSlice.h | 2 +- 3 files changed, 8 + insertions(+), 17 deletions(-) + [7bd6922d4d68] + + * source/Lib/TLibEncoder/TEncSlice.cpp: + thread: simplify, uiTotalCUs always multiple of CUSize + --- 
source/Lib/TLibEncoder/TEncSlice.cpp | 3 ++- 1 files changed, 2 + insertions(+), 1 deletions(-) + [62a3a423b4a3] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibEncoder/TEncCavlc.cpp, + source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncTop.cpp, source/x265.h, + source/x265cfg.cpp, source/x265enc.cpp: + thread: simplify, iNumSubstreams always equal to maxCUHeight + --- source/Lib/TLibCommon/TComSlice.cpp | 1 - + source/Lib/TLibCommon/TComSlice.h | 6 ------ + source/Lib/TLibEncoder/TEncCavlc.cpp | 2 +- + source/Lib/TLibEncoder/TEncCfg.h | 5 ----- + source/Lib/TLibEncoder/TEncGOP.cpp | 12 +++++++----- + source/Lib/TLibEncoder/TEncSlice.cpp | 31 + +++++++++++++++++++------------ source/Lib/TLibEncoder/TEncTop.cpp | + 7 ------- source/x265.h | 1 - source/x265cfg.cpp | 5 +---- + source/x265enc.cpp | 1 - 10 files changed, 28 insertions(+), 43 + deletions(-) + [05b33c3d0ca5] + + * source/Lib/TLibEncoder/TEncSlice.cpp: + thread: don't calculate uiLin and uiCol from uiCUAddr, they are + for_loop now + --- source/Lib/TLibEncoder/TEncSlice.cpp | 3 --- 1 files changed, 0 + insertions(+), 3 deletions(-) + [620f08b2c1ae] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h: + cleanup: remove unused code for startCUAddrSliceSegment + --- source/Lib/TLibEncoder/TEncGOP.cpp | 29 + ++++------------------------- source/Lib/TLibEncoder/TEncGOP.h | 1 - + 2 files changed, 4 insertions(+), 26 deletions(-) + [f9011428cf05] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp: + cleanup: remove unused code for nextSliceSegment + --- source/Lib/TLibCommon/TComSlice.cpp | 2 -- + source/Lib/TLibCommon/TComSlice.h | 5 ----- + source/Lib/TLibEncoder/TEncGOP.cpp | 21 +-------------------- + source/Lib/TLibEncoder/TEncSlice.cpp | 1 - 
4 files changed, 1 + insertions(+), 28 deletions(-) + [1e6f773bc6d0] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibEncoder/TEncCavlc.cpp, + source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp: + cleanup: simplify since sliceSegmentCurEndCUAddr always equal to + sliceCurEndCUAddr + --- source/Lib/TLibCommon/TComSlice.cpp | 2 -- + source/Lib/TLibCommon/TComSlice.h | 5 ----- + source/Lib/TLibEncoder/TEncCavlc.cpp | 2 +- + source/Lib/TLibEncoder/TEncCu.cpp | 16 ++++++++-------- + source/Lib/TLibEncoder/TEncGOP.cpp | 12 ++++++------ + source/Lib/TLibEncoder/TEncSlice.cpp | 20 ++------------------ 6 + files changed, 17 insertions(+), 40 deletions(-) + [311b199d9062] + + * source/Lib/TLibEncoder/TEncSlice.cpp: + thread: ident, no code changes + --- source/Lib/TLibEncoder/TEncSlice.cpp | 217 + +++++++++++++++++----------------- 1 files changed, 108 + insertions(+), 109 deletions(-) + [e0e64a6d4690] + + * source/Lib/TLibEncoder/TEncSlice.cpp: + thread: replace while_loop to row/col for_loop + --- source/Lib/TLibEncoder/TEncSlice.cpp | 16 +++++++++++++--- 1 + files changed, 13 insertions(+), 3 deletions(-) + [ede2db9aabaf] + + * source/Lib/TLibEncoder/TEncSlice.cpp: + cleanup: remove unused TilesCol, etc + --- source/Lib/TLibEncoder/TEncSlice.cpp | 38 + +++++++++------------------------- 1 files changed, 10 + insertions(+), 28 deletions(-) + [95f330abd287] + + * source/Lib/TLibEncoder/TEncSlice.cpp: + cleanup: replace uiEncCUOrder to uiCUAddr since they always equal + --- source/Lib/TLibEncoder/TEncSlice.cpp | 8 +++----- 1 files + changed, 3 insertions(+), 5 deletions(-) + [19fc43ebdd8f] + + * source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.h: + cleanup: StartCUAddr always zero + --- source/Lib/TLibEncoder/TEncGOP.cpp | 3 +-- + source/Lib/TLibEncoder/TEncSlice.cpp | 10 ++++------ + 
source/Lib/TLibEncoder/TEncSlice.h | 2 +- 3 files changed, 6 + insertions(+), 9 deletions(-) + [aad01f6a7122] + +2013-05-14 Steve Borho + + * source/test/intrapredharness.cpp, source/test/ipfilterharness.cpp, + source/test/mbdstharness.cpp, source/test/pixelharness.cpp, + source/test/testharness.h: + testharness: simplify REPORT_SPEEDUP, use varargs to avoid bugs + [ae526ecaa5d4] + + * source/test/CMakeLists.txt, source/test/intrapredharness.cpp, + source/test/ipfilterharness.cpp, source/test/mbdstharness.cpp, + source/test/pixelharness.cpp, source/test/testharness.h, + source/test/timer.cpp: + testbench: remove unused timer class + [78a8595b98aa] + + * source/test/testbench.cpp: + nits + [21e04f3d8f9c] + + * source/encoder/vec/macroblock.inc: + macroblock: disable partialButterfly4 for 32bit builds + (_mm_cvtsi64_si128) + + The _mm_cvtsi64_si128 instrinsic is only supported in 64bit + executables + [91a94c5f2272] + + * source/encoder/vec/intrapred.inc: + intrapred: use X86_64 instead of _WIN64 + [1116d281a461] + + * source/CMakeLists.txt: + cmake: add X86_64 define for 64bit builds + [abee2d651a02] + + * Merged in shazebnawazkhan/xhevc_shazeb (pull request #141) + + Pred Intra Planar 16x16 vectorization + [2d32d38d62dc] + +2013-05-14 Deepthi Devaki + + * Merge + [1b9b6880f410] + + * source/encoder/vec/intrapred.inc: + uncrustified intrapred.inc + [c23f7530165e] + + * source/encoder/vec/intrapred.inc, source/test/intrapredharness.cpp: + Pred Intra Planar 16x16 vectorization + [6ce476cd2d2e] + +2013-05-14 Steve Borho + + * source/test/mbdstharness.cpp: + mbdstharness: remove redundant enum + [84e91a173197] + + * source/encoder/vec/macroblock.inc: + macroblock: disable BUTTERFLY_4 until it as a C reference + [b449fc77bbe6] + + * source/test/testbench.cpp: + testbench: zero C primitive struct to better isolate missing C + primitives + [70c55c6ce1d1] + +2013-05-14 praveentiwari + + * source/test/mbdstharness.h: + Added extra buffer for intrinsic code test + 
[c8edec8e1333] + + * source/encoder/vec/macroblock.inc: + Intrinsic Code for partialButterfly4 + [e5df64c80123] + +2013-05-14 praveen Tiwari + + * Merged multicoreware/xhevc into default + [f66250dd1d07] + + * Merged multicoreware/xhevc into default + [426cf8c7b3d5] + +2013-05-14 praveentiwari + + * source/test/mbdstharness.cpp: + Test code for PartialButterfly4 intrisic code + [7a499330e879] + +2013-05-14 Deepthi + + * Merge + [36cb4e19eebf] + + * source/Lib/TLibEncoder/TEncCu.cpp: + Modify mode decision to include NxN cost as well + [f7654615b959] + +2013-05-14 Deepthi Devaki + + * Merge + [2446e9744085] + + * source/encoder/vec/intrapred.inc: + Predict intraplanar 8x8 - vector implementation + [fab3b59f778a] + + * source/test/intrapredharness.cpp, source/test/intrapredharness.h: + Predict IntraPlanar - unit test integration + [018bdfa8fd47] + +2013-05-14 https://mandarmcw + + * build/BuildEncoderApplications.bat, + build/RunEncoderApplications.bat: + Removed ENABLE_PRIMITIVES macro from Regression test + [73c2de1b5707] + + * Merge + [f65050f97357] + + * build/BuildEncoderApplications.bat, + build/CreateRegressionPackage.bat, build/RunEncoderApplications.bat, + build/config.txt: + Modified RegressionTest script for 8bit and 16bit + [3ca347cc6859] + +2013-05-14 Deepthi + + * source/Lib/TLibEncoder/TEncCu.cpp: + Removing abortflag control - performance and bitrate impact mostly + positive + [93f0085d2164] + + * source/Lib/TLibEncoder/TEncCu.cpp: + Modifying the recursion abort condition + [17c1d43da9dd] + + * source/Lib/TLibEncoder/TEncCu.cpp: + Fix for compile error with early partition select ON. + [3001e773bc64] + +2013-05-13 Deepthi + + * source/CMakeLists.txt, source/Lib/TLibEncoder/TEncCu.cpp: + Early exit for partition selection - macro defined. Will be disabled + by default - bitcost too high. 
+ [0d8b88594b0b] + +2013-05-13 sumalatha + + * source/Lib/TLibCommon/TypeDef.h, source/Lib/TLibEncoder/TEncCu.cpp: + renamed the macro QUALITYFACTOR to LAMBDA_PARTITION_SELECT + [4bc94e0e7cc6] + + * source/Lib/TLibCommon/TypeDef.h, source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncCu.h: + Made optimizations on xcompressCU - to check the cost of each 2Nx2N + block with the corresponding cost of the same CU with NxN partition. + Aborts recursion if 2Nx2N mode is better. + [6da859408e68] + +2013-05-12 Steve Borho + + * source/Lib/TLibCommon/SEI.h, source/Lib/TLibEncoder/SEIwrite.cpp, + source/Lib/TLibEncoder/SEIwrite.h, source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h, source/x265.h, source/x265cfg.cpp, + source/x265enc.cpp: + remove frame packing SEI + + This packet only contained data passed in by the user. They can + generate it themselves + [4751ca1eb836] + + * source/Lib/TLibCommon/SEI.cpp, source/Lib/TLibCommon/SEI.h, + source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/SEIwrite.cpp, + source/Lib/TLibEncoder/SEIwrite.h, source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h, source/x265.h, source/x265cfg.cpp, + source/x265enc.cpp: + remove J0149_TONE_MAPPING_SEI + + If an app wants to generate this SEI, they can do so themselves. + There's no point cluttering the encoder with metadata like this. 
+ [424003ca7043] + + * source/x265cfg.cpp: + x265cfg: do not print chroma QP offsets if they are 0 + [3c01f9d9dd2b] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibEncoder/TEncCavlc.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp: + cleanup unused variable warnings found by GCC + [e8dff9af77dd] + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp, source/encoder/motion.cpp, + source/encoder/primitives.cpp, source/encoder/primitives.h, + source/test/testharness.h, source/test/timer.cpp: + primitives: fix EMMS calls from the HM libraries, make compiler safe + + Use inline when available, else use ASM version of EMMS + [6e85e3e865d6] + + * source/x265cfg.cpp: + x265cfg: white-space tweaks, no code changes + [51d8de167925] + + * build/BuildEncoderApplications.bat, + build/CreateRegressionPackage.sh, source/CMakeLists.txt, + source/Lib/TLibCommon/TComInterpolationFilter.cpp, + source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComYuv.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/encoder/CMakeLists.txt, + source/encoder/motion.cpp, source/encoder/primitives.cpp, + source/test/CMakeLists.txt, source/test/testbench.cpp: + cleanup: remove ENABLE_PRIMITIVES cmake build option, always enabled + [5eecf2d019c8] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + cleanup: remove redundant brace levels, no logic changes + [d0e3af235277] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + cleanup: remove unnecessary x264_cpu_emms() + [69df9f38425b] + + * source/Lib/TLibEncoder/TEncCu.cpp: + cleanup: remove extra levels of indent in TEncCU.cpp, no logic + changes + [77c5d4ff7ef6] + + * source/Lib/TLibEncoder/TEncCu.cpp: + cleanup: remove iMinQP + [0ded36ddaf39] + + * cfg/encoder_I_15P.cfg, cfg/encoder_all_I.cfg, + source/Lib/TLibEncoder/TEncCfg.h, source/Lib/TLibEncoder/TEncCu.cpp, + 
source/Lib/TLibEncoder/TEncSearch.cpp, source/x265.h, + source/x265cfg.cpp, source/x265enc.cpp: + remove FEN configurable, assume always true + [cbfcb9ccee64] + + * cfg/encoder_I_15P.cfg, cfg/encoder_all_I.cfg, + source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncTop.cpp, source/x265.h, + source/x265cfg.cpp, source/x265enc.cpp: + remove max delta QP configuration logic + [08005afcd2a1] + + * source/Lib/TLibEncoder/TEncCu.cpp: + TEncCu: prune per-CU multiple QP logic + + When we add/enable rate control, each CU will only be coded once + with its determined QP. We won't be encoding the CU multiple times + for RDO + [83146bab2aa7] + + * source/Lib/TLibEncoder/TEncCu.cpp: + TEncCU: rename rpcCU to pcCu since it is no longer a reference + [25567c1a1191] + +2013-05-11 Steve Borho + + * source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.h: + remove slice-level per-delta QP and lambda + [1cb2a420f6b2] + + * cfg/encoder_I_15P.cfg, cfg/encoder_all_I.cfg, + source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.h, source/x265.h, + source/x265cfg.cpp, source/x265enc.cpp: + remove slice-level multiple-QP rate distortion loop + [a30ec6c0bbbd] + + * source/Lib/TLibEncoder/TEncSlice.cpp: + tpyo fix + [6088dbb69829] + + * source/Lib/TLibEncoder/TEncCfg.h, source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncTop.cpp: + uncrustify: tidy white-space + [383ac352dde6] + + * cfg/encoder_I_15P.cfg, cfg/encoder_all_I.cfg: + remove no longer supported config options + [107e039f90b1] + + * source/Lib/TLibEncoder/TEncCfg.h, source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + 
source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncTop.cpp, source/x265.h, + source/x265cfg.cpp, source/x265enc.cpp: + merge with slice/tile removal + [d9ed2c77f27f] + + * source/x265cfg.cpp: + don't print configurations that cannot change + [55084a3b3d5e] + +2013-05-11 Min Chen + + * source/Lib/TLibEncoder/TEncCu.cpp: + cleanup: remove code for bSliceStart + --- source/Lib/TLibEncoder/TEncCu.cpp | 9 +++------ 1 files changed, + 3 insertions(+), 6 deletions(-) + [555278f25dfd] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibEncoder/TEncCavlc.cpp, + source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp: + cleanup: remove code for sliceStartCU + --- source/Lib/TLibCommon/TComDataCU.cpp | 169 + +++------------------------------- + source/Lib/TLibCommon/TComDataCU.h | 4 - + source/Lib/TLibCommon/TComSlice.cpp | 4 - + source/Lib/TLibCommon/TComSlice.h | 10 -- + source/Lib/TLibEncoder/TEncCavlc.cpp | 18 +--- + source/Lib/TLibEncoder/TEncCu.cpp | 35 ++------ + source/Lib/TLibEncoder/TEncGOP.cpp | 24 +----- + source/Lib/TLibEncoder/TEncSlice.cpp | 130 + ++++----------------------- 8 files changed, 44 insertions(+), 350 + deletions(-) + [b4d10409be04] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibEncoder/TEncCavlc.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncTop.cpp: + cleanup: remove unused code since dependentSliceSegmentsEnabledFlag + always false + --- source/Lib/TLibCommon/TComSlice.cpp | 1 - + source/Lib/TLibCommon/TComSlice.h | 5 - + source/Lib/TLibEncoder/TEncCavlc.cpp | 6 +- + source/Lib/TLibEncoder/TEncSlice.cpp | 156 + +-------------------------------- source/Lib/TLibEncoder/TEncTop.cpp + | 12 --- 5 files changed, 6 insertions(+), 174 deletions(-) + [cc4d3b223d18] + + * 
source/Lib/TLibCommon/TComPicSym.cpp, + source/Lib/TLibCommon/TComPicSym.h: + cleanup: remove unused xCalculateNxtCUAddr + --- source/Lib/TLibCommon/TComPicSym.cpp | 25 + ------------------------- source/Lib/TLibCommon/TComPicSym.h | 1 - 2 + files changed, 0 insertions(+), 26 deletions(-) + [d00c47bdfe0a] + + * source/Lib/TLibCommon/TComPicSym.cpp, + source/Lib/TLibCommon/TComPicSym.h, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp: + cleanup: remove class TComTile + --- source/Lib/TLibCommon/TComPicSym.cpp | 41 + +-------------------------------- source/Lib/TLibCommon/TComPicSym.h + | 31 ------------------------- source/Lib/TLibEncoder/TEncGOP.cpp | + 6 ----- source/Lib/TLibEncoder/TEncSlice.cpp | 17 ++++++------- 4 + files changed, 10 insertions(+), 85 deletions(-) + [1b10f4a0ab74] + + * source/Lib/TLibCommon/TComPicSym.cpp, + source/Lib/TLibCommon/TComPicSym.h, + source/Lib/TLibEncoder/TEncGOP.cpp: + cleanup: cleanup code for getTileWidth and getTileHeight + --- source/Lib/TLibCommon/TComPicSym.cpp | 12 +++++------- + source/Lib/TLibCommon/TComPicSym.h | 10 ---------- + source/Lib/TLibEncoder/TEncGOP.cpp | 7 ------- 3 files changed, 5 + insertions(+), 24 deletions(-) + [097f2b0c2d72] + + * source/Lib/TLibCommon/TComPicSym.cpp, + source/Lib/TLibCommon/TComPicSym.h, + source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp: + cleanup: cleanup code for getPicSCUEncOrder and getPicSCUAddr + --- source/Lib/TLibCommon/TComPicSym.cpp | 10 ---------- + source/Lib/TLibCommon/TComPicSym.h | 2 -- + source/Lib/TLibEncoder/TEncCu.cpp | 6 +++--- + source/Lib/TLibEncoder/TEncGOP.cpp | 6 +++--- + source/Lib/TLibEncoder/TEncSlice.cpp | 20 ++++++++++---------- 5 + files changed, 16 insertions(+), 28 deletions(-) + [96f911caef7c] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComPic.cpp, + source/Lib/TLibCommon/TComPicSym.cpp, + source/Lib/TLibCommon/TComPicSym.h, + 
source/Lib/TLibEncoder/TEncCavlc.cpp, + source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp: + cleanup: cleanup more code for CUOrderMap + --- source/Lib/TLibCommon/TComDataCU.cpp | 24 + ++++++++++++------------ source/Lib/TLibCommon/TComPic.cpp | 4 ++-- + source/Lib/TLibCommon/TComPicSym.cpp | 4 ++-- + source/Lib/TLibCommon/TComPicSym.h | 4 ---- + source/Lib/TLibEncoder/TEncCavlc.cpp | 2 +- + source/Lib/TLibEncoder/TEncCu.cpp | 18 +++++++++--------- + source/Lib/TLibEncoder/TEncGOP.cpp | 2 +- + source/Lib/TLibEncoder/TEncSlice.cpp | 20 ++++++++++---------- 8 + files changed, 37 insertions(+), 41 deletions(-) + [7f054fc6613a] + + * source/Lib/TLibCommon/TComPicSym.cpp, + source/Lib/TLibCommon/TComPicSym.h, + source/Lib/TLibEncoder/TEncGOP.cpp: + cleanup: cleanup code for CUOrderMap + --- source/Lib/TLibCommon/TComPicSym.cpp | 17 ----------------- + source/Lib/TLibCommon/TComPicSym.h | 10 ++-------- + source/Lib/TLibEncoder/TEncGOP.cpp | 13 +------------ 3 files + changed, 3 insertions(+), 37 deletions(-) + [68d8a7b12206] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/Lib/TLibCommon/TComPicSym.cpp, + source/Lib/TLibCommon/TComPicSym.h, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp: + cleanup: cleanup code for tilesIDX + --- source/Lib/TLibCommon/TComDataCU.cpp | 33 +++----- + source/Lib/TLibCommon/TComDataCU.h | 3 +- + source/Lib/TLibCommon/TComPic.cpp | 34 +------- + source/Lib/TLibCommon/TComPic.h | 2 +- + source/Lib/TLibCommon/TComPicSym.cpp | 83 +++----------------- + source/Lib/TLibCommon/TComPicSym.h | 7 +- + source/Lib/TLibEncoder/TEncGOP.cpp | 4 +- + .../Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp | 12 ++-- + source/Lib/TLibEncoder/TEncSlice.cpp | 66 +++------------- 9 files + changed, 50 
insertions(+), 194 deletions(-) + [b39cd3c37dc4] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibEncoder/TEncGOP.cpp: + cleanup: remove unused code for uniformSpacingFlag + --- source/Lib/TLibCommon/TComSlice.cpp | 1 - + source/Lib/TLibCommon/TComSlice.h | 5 ---- + source/Lib/TLibEncoder/TEncGOP.cpp | 37 + +--------------------------------- 3 files changed, 2 insertions(+), + 41 deletions(-) + [045677ed690d] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibEncoder/TEncCavlc.cpp, + source/Lib/TLibEncoder/TEncTop.cpp: + cleanup: remove unused code since tilesEnabledFlag always false + --- source/Lib/TLibCommon/TComSlice.cpp | 1 - + source/Lib/TLibCommon/TComSlice.h | 5 ----- + source/Lib/TLibEncoder/TEncCavlc.cpp | 33 + +++------------------------------ source/Lib/TLibEncoder/TEncTop.cpp + | 1 - 4 files changed, 3 insertions(+), 37 deletions(-) + [1766af826d7b] + + * source/Lib/TLibEncoder/TEncSlice.cpp: + cleanup: remove unused code since uiTilesAcross always 1 + --- source/Lib/TLibEncoder/TEncSlice.cpp | 30 + ++++++++---------------------- 1 files changed, 8 insertions(+), 22 + deletions(-) + [52069da06f5e] + + * source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibEncoder/TEncCavlc.cpp, + source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncTop.cpp, source/x265.h, + source/x265cfg.cpp, source/x265enc.cpp: + cleanup: remove code for option TilesFixedStructure + --- source/Lib/TLibCommon/TComSlice.h | 6 ------ + source/Lib/TLibEncoder/TEncCavlc.cpp | 2 +- + source/Lib/TLibEncoder/TEncCfg.h | 5 ----- + source/Lib/TLibEncoder/TEncTop.cpp | 1 - source/x265.h | 1 - + source/x265cfg.cpp | 1 - source/x265enc.cpp | 1 - 7 files changed, 1 + insertions(+), 16 deletions(-) + [38354c90843d] + + * source/x265.h, source/x265cfg.cpp, source/x265enc.cpp: + cleanup: remove code for option LFCrossTileBoundaryFlag + --- source/x265.h | 1 - source/x265cfg.cpp | 1 - 
source/x265enc.cpp | + 3 --- 3 files changed, 0 insertions(+), 5 deletions(-) + [5a642068a6b0] + + * source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncTop.cpp, source/x265.h, + source/x265cfg.cpp, source/x265enc.cpp: + cleanup: remove code for options UniformSpacingIdc + --- source/Lib/TLibEncoder/TEncCfg.h | 5 ----- + source/Lib/TLibEncoder/TEncTop.cpp | 1 - source/x265.h | 1 - + source/x265cfg.cpp | 1 - source/x265enc.cpp | 1 - 5 files changed, 0 + insertions(+), 9 deletions(-) + [868bfad3eaca] + + * source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/Lib/TLibEncoder/TEncGOP.cpp: + cleanup: remove unused code in TComPic::createNonDBFilterInfo + --- source/Lib/TLibCommon/TComPic.cpp | 10 +++------- + source/Lib/TLibCommon/TComPic.h | 1 - + source/Lib/TLibEncoder/TEncGOP.cpp | 2 +- 3 files changed, 4 + insertions(+), 9 deletions(-) + [96640251b7f3] + + * source/x265cfg.cpp: + cleanup: remove unused code since tileFlag always false + --- source/x265cfg.cpp | 30 ++---------------------------- 1 files + changed, 2 insertions(+), 28 deletions(-) + [7bbab452fbb1] + + * source/Lib/TLibCommon/TComPicSym.cpp, + source/Lib/TLibCommon/TComPicSym.h, + source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibEncoder/TEncCavlc.cpp, + source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncTop.cpp, source/x265.h, + source/x265cfg.cpp, source/x265cfg.h, source/x265enc.cpp: + cleanup: remove code for options NumTileColumnsMinus1 and + NumTileRowsMinus1 + --- source/Lib/TLibCommon/TComPicSym.cpp | 38 +++++------- + source/Lib/TLibCommon/TComPicSym.h | 12 ---- + source/Lib/TLibCommon/TComSlice.cpp | 14 ---- + source/Lib/TLibCommon/TComSlice.h | 42 ------------- + source/Lib/TLibEncoder/TEncCavlc.cpp | 20 +------ + source/Lib/TLibEncoder/TEncCfg.h | 51 ---------------- + source/Lib/TLibEncoder/TEncGOP.cpp | 44 ++++---------- + 
source/Lib/TLibEncoder/TEncSlice.cpp | 20 +++--- + source/Lib/TLibEncoder/TEncTop.cpp | 73 +---------------------- + source/x265.h | 2 - source/x265cfg.cpp | 109 + +--------------------------------- source/x265cfg.h | 5 -- + source/x265enc.cpp | 12 ---- 13 files changed, 43 insertions(+), 399 + deletions(-) + [3cf723a2a3e7] + + * source/Lib/TLibCommon/TComDataCU.cpp: + cleanup: onlyOneSliceInPic always true + --- source/Lib/TLibCommon/TComDataCU.cpp | 188 + ++-------------------------------- 1 files changed, 8 insertions(+), + 180 deletions(-) + [da29d2e40837] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibEncoder/TEncCavlc.cpp, + source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncTop.cpp, source/x265.h, + source/x265cfg.cpp, source/x265enc.cpp: + cleanup: remove code for LFCrossSliceBoundaryFlag + --- source/Lib/TLibCommon/TComDataCU.cpp | 19 +++++++++---------- + source/Lib/TLibCommon/TComDataCU.h | 1 - + source/Lib/TLibCommon/TComLoopFilter.cpp | 18 +++++++++--------- + source/Lib/TLibCommon/TComPic.cpp | 12 ------------ + source/Lib/TLibCommon/TComPic.h | 1 - + source/Lib/TLibCommon/TComSlice.cpp | 2 -- + source/Lib/TLibCommon/TComSlice.h | 10 ---------- + source/Lib/TLibEncoder/TEncCavlc.cpp | 6 +++--- + source/Lib/TLibEncoder/TEncCfg.h | 6 ------ + source/Lib/TLibEncoder/TEncGOP.cpp | 12 ++---------- + source/Lib/TLibEncoder/TEncTop.cpp | 1 - source/x265.h | 1 - + source/x265cfg.cpp | 1 - source/x265enc.cpp | 3 --- 14 files + changed, 23 insertions(+), 70 deletions(-) + [2faa4694539d] + + * source/Lib/TLibCommon/TypeDef.h: + cleanup: remove enum SliceConstraint + --- source/Lib/TLibCommon/TypeDef.h | 9 --------- 1 files changed, 0 + insertions(+), 9 deletions(-) + 
[d8836d4bff30] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncTop.cpp, source/x265.h, + source/x265cfg.cpp, source/x265enc.cpp: + cleanup: remove code for option sliceSegmentMode and + sliceSegmentModeArgument + --- source/Lib/TLibCommon/TComSlice.cpp | 4 - + source/Lib/TLibCommon/TComSlice.h | 9 --- + source/Lib/TLibEncoder/TEncCfg.h | 11 --- + source/Lib/TLibEncoder/TEncCu.cpp | 12 ---- + source/Lib/TLibEncoder/TEncGOP.cpp | 2 +- + source/Lib/TLibEncoder/TEncSlice.cpp | 118 + +--------------------------------- + source/Lib/TLibEncoder/TEncTop.cpp | 4 - source/x265.h | 4 - + source/x265cfg.cpp | 17 +----- source/x265enc.cpp | 9 --- 10 files + changed, 3 insertions(+), 187 deletions(-) + [bd168f9a50b3] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp, source/x265.h, + source/x265cfg.cpp, source/x265enc.cpp: + cleanup: remove code for option sliceMode and sliceModeArgument + --- source/Lib/TLibCommon/TComSlice.cpp | 4 - + source/Lib/TLibCommon/TComSlice.h | 10 --- + source/Lib/TLibEncoder/TEncCfg.h | 12 ---- + source/Lib/TLibEncoder/TEncCu.cpp | 11 +--- + source/Lib/TLibEncoder/TEncGOP.cpp | 7 +- + source/Lib/TLibEncoder/TEncSlice.cpp | 106 + +-------------------------------- source/x265.h | 3 - + source/x265cfg.cpp | 20 +------ source/x265enc.cpp | 17 ------ 9 + files changed, 10 insertions(+), 180 deletions(-) + [a4788d814f55] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComPic.h, source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.h: + threading: remove unused C++ 
pointer reference (variant name do not + modify for keep compatible) + --- source/Lib/TLibCommon/TComDataCU.cpp | 4 ++-- + source/Lib/TLibCommon/TComPic.h | 2 +- + source/Lib/TLibEncoder/TEncCu.cpp | 2 +- + source/Lib/TLibEncoder/TEncCu.h | 2 +- + source/Lib/TLibEncoder/TEncSlice.cpp | 8 ++++---- + source/Lib/TLibEncoder/TEncSlice.h | 4 ++-- 6 files changed, 11 + insertions(+), 11 deletions(-) + [90b611273d0a] + +2013-05-11 Steve Borho + + * source/Lib/TLibEncoder/TEncCfg.h, source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncTop.cpp, source/x265.h, + source/x265cfg.cpp, source/x265enc.cpp: + Remove SBACRD configuration option + [4fef14f41d6d] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: fix 16bpp compiler warnings + [3dfbf98b245e] + + * source/Lib/TLibCommon/TComRdCost.cpp, + source/Lib/TLibCommon/TComRdCost.h: + TComRdCost: remove dead 2x2 satd code paths + + HEVC does not support 2x2 partitions, even for chroma. 
See + http://forum.doom9.org/showthread.php?t=167081 + [293b1e04bea4] + + * source/Lib/TLibCommon/TComRdCost.cpp: + TComRdCost: fix old HM bug in dead 2x2 logic + [73ef64e5f6a4] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: use SATD primitives to measure chroma intra residual + [574ec4bfc6b7] + + * source/encoder/x86/asm-primitives.cpp: + asm: use XOP satd primitives when available; roughly 10x C versions + [a387fff04cb9] + + * source/encoder/x86/asm-primitives.cpp: + asm: use x264_pixel_satd_4x16_avx when available + [6910ed19b5d6] + + * source/encoder/x86/asm-primitives.cpp: + asm: use x264_pixel_satd_4x16_sse4 when available + [26003f6ce226] + + * source/encoder/x86/asm-primitives.cpp: + asm: use SSE2 satd functions for all partitions, where available + + Roughly 6x faster than C, 2x faster than MMX + [2178cc1f7651] + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/pixel.cpp, + source/encoder/primitives.h, source/encoder/x86/asm-primitives.cpp, + source/test/pixelharness.cpp: + primitive: add sa8d_32x32 and sa8d_64x64, use for intra cost + estimation + [e2f65de5920e] + + * source/Lib/TLibEncoder/TEncCu.cpp: + TEncCu: remove for-loops over QP, make them simple expressions + + No sane encoder would ever do that sort of redundant analysis. There + is a lot of cleanup that needs to trickle up from here. iMinQP and + isAddLowestQP can probably be removed, and all the slice and SPS + fields for QP RD can be removed, all the way up to the config + options that enabled it. 
+ [d799578d900d] + +2013-05-10 Steve Borho + + * source/Lib/TLibEncoder/TEncSearch.h, source/encoder/motion.h: + fix aligned pixel caches for x86 and x64 builds + + Tested on VC10, VC11, and MinGW x86 + [b8cb76df149d] + + * source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/encoder/CMakeLists.txt, + source/encoder/InterpolationFilter.cpp, + source/encoder/InterpolationFilter.h: + remove unused InterpolationFilter.cpp, move filter args back into + TComPrediction + [29c61b186ec8] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: replace filterCopy() call with cpyblock primitive + [928a426cc302] + + * source/encoder/motion.h: + motion: lessen alignment restriction for pixel cache + + Prevents compiler warnings about required padding + [bcc1dbbf732d] + + * source/encoder/primitives.cpp: + primitives: report XOP and FMA detection as well as CPU SIMD level + [031313a5bfcb] + + * source/Lib/TLibEncoder/TEncSearch.h: + TEncSearch: re-order member variables to avoid alignment causing + padding + + Some VC versions will issue warnings if alignment decls require + padding to be added (a performance warning) + [8d904a7306ac] + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/motion.cpp, + source/encoder/motion.h: + motion: rename MotionReference.plane to lumaPlane, remove unused + channels + [9b655ff38985] + + * Merged in sumalatha/xhevc_sumalatha (pull request #132) + + Changed the variable name according to naming convention of HM + [0e45feaf8759] + +2013-05-10 sumalatha + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h: + changed the variable name according to naming convention of HM + Included the comment such that early abort can be disabled + [9df494f9e780] + + * source/Lib/TLibEncoder/TEncCu.cpp: + removed goto statement. 
+ [a0bce7915be9] + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h: + Incorporated Steve's changes in optimized xcompressCU() to abort + recursion if cost of child CU is greater than the cost of parent CU. + [97195ad9e3e2] + +2013-05-10 Sumalatha Polureddy + + * Merged multicoreware/xhevc into default + [a3286df9a8cb] + +2013-05-09 sumalatha + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h: + optimized xcompressCU() to abort recursion if cost of child CU is + greater than the cost of parent CU. + [5432fa0411d9] + +2013-05-10 https://mandarmcw + + * build/BuildEncoderApplications.bat: + Implemented all possible configurations into Regression test(8bit_pre + m(enable_test),8bit_HM(disable_test),16bit_prem(enable_test),16bit_H + M(disable_test)) + [b686ba6045ad] + +2013-05-10 ShinYee Chung + + * source/encoder/vec/intrapred.inc: + Intra: Fix compile errors due to vector conversions. + [e4fa6f942fe7] + + * source/encoder/common.cpp: + Common: Fix the type mismatch between printf format and argument. + [915fc0395e29] + + * source/encoder/common.cpp: + Common: Fix broken eol, tabs, spaces. 
+ [710015f394ee] + +2013-05-10 Steve Borho + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/bitcost.cpp, + source/encoder/bitcost.h, source/encoder/motion.cpp, + source/encoder/motion.h: + motion: new bitcost method for BitCost and MotionEstimate classes + [a553b5197f40] + + * source/Lib/TLibCommon/TComRdCost.cpp, + source/Lib/TLibCommon/TComRdCost.h, source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp: + remove WEIGHTED_CHROMA_DISTORTION define + [39d47f7eaa87] + + * source/Lib/TLibCommon/CommonDef.h, + source/Lib/TLibEncoder/TEncSearch.cpp: + Remove GPB_SIMPLE_UNI define + [f8bc2b563e63] + +2013-05-10 Min Chen + + * source/encoder/vec/intrapred.inc: + intrapred: optimize predIntraPlanar4 with 16bpp + --- source/encoder/vec/intrapred.inc | 87 + +++++++++++++++++++++++++------------- 1 files changed, 58 + insertions(+), 29 deletions(-) + [0191c2e03e51] + + * source/Lib/TLibCommon/TComPrediction.cpp: + TComPrediction: fix build warning at 16bpp + --- source/Lib/TLibCommon/TComPrediction.cpp | 4 ++-- 1 files + changed, 2 insertions(+), 2 deletions(-) + [14bd77d46976] + +2013-05-10 Steve Borho + + * source/encoder/vec/intrapred.inc, + source/encoder/vec/vecprimitives.inc: + vec: inc files should not have includes of their own + [7505b485f286] + +2013-05-10 Min Chen + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/encoder/IntraPred.cpp, source/encoder/primitives.h, + source/encoder/vec/intrapred.inc, + source/encoder/vec/vecprimitives.inc: + intrapred: optimize predIntraPlanar4 with 8bpp + --- source/Lib/TLibCommon/TComPrediction.cpp | 15 +++- + source/encoder/IntraPred.cpp | 51 +++++++++ + source/encoder/primitives.h | 2 + source/encoder/vec/intrapred.inc | + 169 ++++++++++++++++++++++++++++++ + source/encoder/vec/vecprimitives.inc | 1 + 5 files changed, 236 + insertions(+), 2 deletions(-) + [77f050ae9201] + + * source/VectorClass/vectori128.h: + vector128: unsafe compress(map to packuswb) 
+ --- source/VectorClass/vectori128.h | 4 ++++ 1 files changed, 4 + insertions(+), 0 deletions(-) + [5b41635781a6] + + * source/VectorClass/vectori128.h: + vector128: constant shift right and broadcast + --- source/VectorClass/vectori128.h | 39 + +++++++++++++++++++++++++++++++++++++++ 1 files changed, 39 + insertions(+), 0 deletions(-) + [5b4d2cff5b76] + + * source/VectorClass/vectori128.h: + vector128: constant shift left + --- source/VectorClass/vectori128.h | 31 + +++++++++++++++++++++++++++++++ 1 files changed, 31 insertions(+), 0 + deletions(-) + [c1200d24153e] + +2013-05-09 Steve Borho + + * source/encoder/motion.cpp, source/encoder/motion.h: + motion: remove fref class member + [07c65d9466fc] + + * source/encoder/motion.cpp, source/encoder/motion.h: + motion: remove residual functions from class + + motion compensation can happen well outside of the analysis loop, so + there is no benefit in trying to re-use our pixel cache + [3399d80552f7] + + * source/test/ipfilterharness.cpp, source/test/testharness.h: + testharness: adopt checkasm approach to measuring performance + [46d2553e471e] + + * source/Lib/TLibCommon/TComRdCost.cpp, source/encoder/x86/asm- + primitives.cpp, source/encoder/x86/const-a.asm, + source/encoder/x86/cpu-a.asm, source/encoder/x86/pixel-a.asm, + source/encoder/x86/pixel.h, source/encoder/x86/sad-a.asm, + source/encoder/x86/x86inc.asm, source/encoder/x86/x86util.asm: + asm: enable x264 sad, sad_x3, sad_x4, sa8d, and satd assembly + routines + + This required removing sa8d and satd primitives from TComRdCost.cpp + - they are only called from the new motion search now. x264 asm has + been updated to tip + [09b29a12d3b9] + +2013-05-09 Mandar Gurav + + * cfg/per-sequence/FourPeople.cfg: + Implemented Steve's comments - 2 + [7b7444bd1693] + + * build/dr_psnr_script/psnr_script.bat, cfg/per- + sequence/BasketballDrive.cfg, cfg/per-sequence/FourPeople.cfg: + Implemented Steve's comments. 
+ [bade8b9e1b03] + +2013-05-09 Mandar Gurav + + * Merged multicoreware/xhevc into default + [b4aa1a7dbe9f] + +2013-05-09 Mandar Gurav + + * build/dr_psnr_script/dr_psnr.exe, + build/dr_psnr_script/psnr_script.bat: + Use tools/dr_psnr binary built from the source directory. And use + YUV file as input to dr_psnr instead of y4m. + [8c75b377005e] + +2013-05-09 Mandar Gurav + + * Merged multicoreware/xhevc into default + [4e2423672016] + +2013-05-08 Mandar Gurav + + * Merge + [be402320a9b8] + + * build/dr_psnr_script/psnr_script.bat: + Implemented suggestions from Steve. + [d6affb95df1d] + +2013-05-08 Mandar Gurav + + * Merged multicoreware/xhevc into default + [e76f1de33e6b] + +2013-05-08 Mandar Gurav + + * build/dr_psnr_script/BasketballDrive.cfg, + build/dr_psnr_script/FourPeople.cfg, + build/dr_psnr_script/encoder_I_15P.cfg, + build/dr_psnr_script/encoder_all_I.cfg, + build/dr_psnr_script/psnr_script.bat, cfg/per- + sequence/BasketballDrive.cfg, cfg/per-sequence/FourPeople.cfg: + Implemented suggestions from Deepthi N. + [b6ee43e7007e] + +2013-05-08 Mandar Gurav + + * Merged multicoreware/xhevc into default + [321ae9002b1e] + + * Merged multicoreware/xhevc into default + [fd85e05a4c0d] + +2013-05-07 Mandar Gurav + + * build/dr_psnr_script/BasketballDrive.cfg, + build/dr_psnr_script/FourPeople.cfg, + build/dr_psnr_script/TAppDecoder.exe, + build/dr_psnr_script/dr_psnr.exe, + build/dr_psnr_script/encoder_I_15P.cfg, + build/dr_psnr_script/encoder_all_I.cfg, + build/dr_psnr_script/psnr_script.bat: + Script for generating dr_psnr output. Output files are as follows + * encoder_output.txt + * decoder_output.txt + * dr_psnr_output.txt + [6e70935ca7db] + +2013-05-09 Deepthi Devaki + + * Merge + [acf7fe823eda] + + * source/Lib/TLibCommon/TComPicYuv.cpp: + Change comment. + [2a48b178a31c] + + * source/Lib/TLibCommon/TComPicYuv.cpp: + Remove copy of Full Pel from filter. 
Instead copy it in + extendPicBorder + [9d818fd017d9] + +2013-05-09 Steve Borho + + * source/Lib/TLibEncoder/TEncSearch.cpp: + motion: remove redundant plane pointer assignments + [6bd4cc4a96c1] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + disable new motion search until UMH mode is added + + The UMH search should have comparable quality to the HM search. The + DIA search loses too much compression efficiency to use as a default + [fb3a87cc0e10] + + * Merged in ggopu/ggopu_xhevc (pull request #129) + + motion: fix reference plane pointers and strides + [8a5c2df2915c] + +2013-05-09 ggopu + + * source/Lib/TLibEncoder/TEncSearch.cpp: + Integrated new motion vector changes + [2f781b3a5479] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + motion: fix reference plane pointers and strides + [ac4916565956] + +2013-05-09 Deepthi Devaki + + * source/Lib/TLibCommon/TComPicYuv.cpp: + Fix output mismatch: change input width/height for filter + [8c7533211b95] + +2013-05-08 Steve Borho + + * source/Lib/TLibCommon/SEI.h, source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/SEIwrite.cpp, + source/Lib/TLibEncoder/SEIwrite.h, source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h, source/x265cfg.cpp, + source/x265enc.cpp: + Remove REMOVE_SINGLE_SEI_EXTENSION_FLAGS, K0180_SCALABLE_NESTING_SEI + macros + [0a960f095a1e] + + * source/Lib/TLibCommon/SEI.h, source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/SEIwrite.cpp, + source/Lib/TLibEncoder/SEIwrite.h, source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h, source/x265cfg.cpp, + source/x265enc.cpp: + Remove L0208_SOP_DESCRIPTION_SEI define (remove from HM) + [7b3273e9b24a] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/TEncCavlc.cpp, + source/Lib/TLibEncoder/TEncCavlc.h, source/x265cfg.cpp, + 
source/x265enc.cpp: + Remove SIGNAL_BITRATE_PICRATE_IN_VPS define (removed from tip of HM) + [315a8cb35bc7] + + * source/compat/msvc/getopt.c, source/tools/dr_psnr/CMakeLists.txt, + source/tools/dr_psnr/SSIMCalculator.cpp: + dr_psnr: fix GCC build warnings + [1504ac21e981] + + * source/CMakeLists.txt, source/tools/CMakeLists.txt, + source/tools/dr_psnr/CMakeLists.txt, + source/tools/dr_psnr/PsnrCalculator.cpp, + source/tools/dr_psnr/PsnrCalculator.h, + source/tools/dr_psnr/SSIMCalculator.cpp, + source/tools/dr_psnr/SSIMCalculator.h, + source/tools/dr_psnr/dr_psnr.cpp: + tools: add a tools/ folder and start with dr_psnr + [c211b4c4d1ba] + + * source/Lib/TLibCommon/AccessUnit.h, + source/Lib/TLibCommon/ContextModel.h, source/Lib/TLibCommon/NAL.h, + source/Lib/TLibCommon/SEI.h, source/Lib/TLibCommon/TComBitCounter.h, + source/Lib/TLibCommon/TComBitStream.h, + source/Lib/TLibCommon/TComList.h, + source/Lib/TLibEncoder/AnnexBwrite.h, + source/Lib/TLibEncoder/NALwrite.h, + source/Lib/TLibEncoder/SEIwrite.h, + source/Lib/TLibEncoder/SyntaxElementWriter.h, + source/Lib/TLibEncoder/TEncAnalyze.h, + source/Lib/TLibEncoder/TEncCavlc.h, + source/Lib/TLibEncoder/TEncCfg.h, source/Lib/TLibEncoder/TEncSbac.h: + cleanup compile guards, no code effects + [eb687bb237ae] + + * source/Lib/TLibCommon/ContextTables.h: + remove FIX827, FIX712 #defines + [6cf68223c50f] + + * source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibEncoder/TEncSlice.h, + source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h: + random white-space cleanups, no code changes + [22363860b159] + +2013-05-08 nandaku2 + + * Merged in deepthidevaki/xhevc_deepthid (pull request #124) + + Modifications to subpel generation + [4dd4c610646f] + +2013-05-08 Deepthi Devaki + + * Merge + [f928fa3e7405] + + * source/Lib/TLibCommon/TComPicYuv.cpp: + Merge + [656789cd00e1] + + * source/Lib/TLibCommon/TComPicYuv.cpp: + Change filter temp variable to short* instead of TshortYuv + [6c4e3ee894bf] + + * 
source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPrediction.cpp: + Uncrustified two files. + [86c4e1e12613] + + * source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPrediction.cpp: + Use block copy primitives instead of filterCopy + [47aa034b5545] + + * Merge + [770006a69dfa] + +2013-05-07 Deepthi Devaki + + * source/encoder/vec/ipfilter.inc, source/encoder/vec/ipfilter16.inc, + source/encoder/vec/ipfilter8.inc, source/test/ipfilterharness.cpp: + Backed out changeset: 1baaa3822b79 + [6b4cabf50573] + + * source/encoder/vec/ipfilter.inc, source/encoder/vec/ipfilter16.inc, + source/encoder/vec/ipfilter8.inc, source/test/ipfilterharness.cpp: + Backed out changeset: da889c1d7641 + [1baaa3822b79] + +2013-05-08 Deepthi Devaki Akkoorath + + * Merged multicoreware/xhevc into default + [4470f5f36d76] + + * source/Lib/TLibCommon/TComPicYuv.cpp: + Merged multicoreware/xhevc into default + [cd5e5e2498d8] + +2013-05-07 Deepthi Devaki + + * source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuv.h, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp: + Remove unwanted comments and code. + [bb747fc0379d] + +2013-05-08 Deepthi + + * Merge + [bad457c087fd] + + * source/encoder/CMakeLists.txt, source/encoder/common.cpp, + source/encoder/common.h: + Debug Support for dumping buffers. Simply typecast all pointers to + void*, and it will dump the required number of bytes into a buffer. + [1aa6126cf203] + +2013-05-08 Steve Borho + + * source/test/testharness.h: + testbench: GCC build fixes + [524b43126be7] + + * Merged in mandarmcw/xhevc_mandar_mahesh (pull request #121) + + Prints the avg. number of cycles taken for vec_primitive and C + function calls. + [e9572d3f84b7] + +2013-05-07 Mandar Gurav + + * source/test/testharness.h: + Prints the avg. number of cycles taken for vec_primitive and C + function calls. 
+ [d339ef591ffd] + +2013-05-08 nandaku2 + + * Merged in sumalatha/xhevc_sumalatha (pull request #122) + + fix for the output mismatch issue. Now when AMPREfine flag is not + set, output matches + [1595173e7694] + +2013-05-08 sumalatha + + * source/Lib/TLibEncoder/TEncTop.cpp: + fix for the output mismatch issue. Now when AMPREfine flag is not + set, output matches + [34dd7b92a7f7] + +2013-05-07 Steve Borho + + * source/Lib/TLibCommon/TComPicYuv.cpp, source/PPA/ppaCPUEvents.h: + ppa: add an event for TComPicYuv::extendPicBorder() + [b4702c569e73] + + * cfg/encoder_I_15P.cfg, cfg/encoder_all_I.cfg: + white-space nits in config files + [32049d39e86b] + +2013-05-07 Sumalatha Polureddy + + * source/test/filterharness.cpp, source/test/filterharness.h: + Merged multicoreware/xhevc into default + [e29437861397] + +2013-05-07 sumalatha + + * cfg/encoder_I_15P.cfg, cfg/encoder_all_I.cfg, + source/Lib/TLibCommon/TComSlice.h, source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncTop.cpp, source/x265.h, + source/x265cfg.cpp, source/x265enc.cpp: + Included additional flag "AMPRefine" in cfg file and made changes in + the code such that if the flag is enabled, the partition size Nx2N + and 2NxN is used for search and other operations in INTER otherwise + these two partition sizes are excluded. 
+ [4589879e4226] + +2013-05-07 Deepthi Devaki + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + Removed unwanted comments + [498c2a8e99e9] + + * source/encoder/InterpolationFilter.cpp: + Modified filterverticalpel_pel + [5c9b304404b5] + + * source/encoder/vec/ipfilter.inc, source/encoder/vec/ipfilter16.inc, + source/encoder/vec/ipfilter8.inc, source/test/ipfilterharness.cpp: + Added vector primitives for filtervertical_pel_pel + [da889c1d7641] + + * source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuv.h: + Remove all references to chroma filterBlocks. + [171889c50962] + + * source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuv.h, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + ME and Prediction uses new Luma subpel blocks, instead of generating + subpels on demand. + [127995bc3034] + + * source/test/CMakeLists.txt, source/test/filterharness.cpp, + source/test/filterharness.h, source/test/testbench.cpp: + Removed old filter from testbench + [069d0a894c3b] + + * source/encoder/macroblock.cpp: + Removed old filter c primitives from macroblock.cpp + [f5f4cefafde6] + + * source/encoder/vec/macroblock.inc: + Removed old filters vec primitives + [98efb37469d6] + +2013-05-06 Steve Borho + + * source/encoder/vec/blockcopy.inc: + blockcopy: retask 8bpp s_p for s_c + [b80724df82fb] + + * source/encoder/pixel.cpp, source/encoder/primitives.h, + source/test/pixelharness.cpp, source/test/pixelharness.h: + pixel: introduce blockcopy_s_c primitive for expanding bytes to + shorts + + In some places, we always want to expand bytes to shorts + unconditionally, regardless of the size of pixel/Pel. This function + pointer may be used for that. Implementation wise, it should use the + existing vectorized function for 8bpp blockcopy_s_p. 
+ [f0ce6f417381] + + * source/Lib/TLibCommon/TComPrediction.cpp: + TComPrediction: fix build warning at 16bpp + [1c265556534f] + + * Merged in deepthidevaki/xhevc_deepthid (pull request #118) + + Added function to generate Luma subpels for entire frame. + [6e133daf825b] + +2013-05-06 Deepthi Devaki + + * source/Lib/TLibCommon/TComPicYuv.cpp: + Modified function to generate Luma subpels for entire frame. + [8910b39167b2] + + * source/Lib/TLibCommon/TComPicYuv.cpp: + Functions to generate QPels for entire frame + [a230bfeb4924] + + * source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuv.h, + source/encoder/InterpolationFilter.h: + Generate Hpel for entire frame. (tested!) + [5e8ea012ff63] + +2013-05-06 Deepthi Devaki Akkoorath + + * source/Lib/TLibCommon/TComPicYuv.h, + source/encoder/vec/interpolationfilter.inc: + Merged multicoreware/xhevc into default + [a9d4a7ce337c] + +2013-05-03 Deepthi Devaki + + * source/Lib/TLibCommon/TComPicYuv.h: + Added access functions for filteredBlocks + [260dd458563c] + + * Merge + [5fafbbd52b06] + + * source/Lib/TLibCommon/TComPicYuv.cpp: + Initialize m_filterBlocks to NULL + [dd7000afff97] + + * source/Lib/TLibCommon/TComPicYuv.cpp: + Fix deallocating filterBlocks memory. 
+ [e7e63540cbf8] + + * source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuv.h, source/encoder/TShortYUV.h: + Added Data structures for H/Q planes + [d2269b064b50] + +2013-05-06 Steve Borho + + * source/test/intrapredharness.cpp: + intrapredharness: prevent out-of-bounds reads during validation + tests + [25de1ee81ef5] + + * source/encoder/vec/ipfilter.inc: + ipfilter: remove cruft that was only for preventing unused parameter + warnings + [df6907669de5] + + * source/encoder/vec/ipfilter.inc, source/encoder/vec/ipfilter8.inc: + Merged in mandarmcw/xhevc_mandar_mahesh (pull request #117) + + Filter implementation for HIGH_BIT_DEPTH=0 + [d6a9d7d2fb0e] + +2013-05-06 Mandar Gurav + + * source/encoder/vec/ipfilter.inc, source/encoder/vec/ipfilter8.inc: + Filter implementation for HIGH_BIT_DEPH=0 + [0e7d769686df] + +2013-05-06 Min Chen + + * source/test/intrapredharness.cpp, source/test/intrapredharness.h: + intrapred: testbench for xPredIntraDC + --- source/test/intrapredharness.cpp | 59 + +++++++++++++++++++++++++++++--------- + source/test/intrapredharness.h | 4 ++- 2 files changed, 48 + insertions(+), 15 deletions(-) + [9d2804da4cff] + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/encoder/IntraPred.cpp, source/encoder/primitives.h, + source/encoder/vec/intrapred.inc: + intrapred: C reference model and code cleanup + --- source/Lib/TLibCommon/TComPrediction.cpp | 12 ++- + source/encoder/IntraPred.cpp | 44 ++++++++++- + source/encoder/primitives.h | 6 +- source/encoder/vec/intrapred.inc + | 132 +----------------------------- 4 files changed, 53 + insertions(+), 141 deletions(-) + [8f5f1349ecb3] + + * source/encoder/vec/intrapred.inc: + intrapred: simplify in xDCPredFiltering() + --- source/encoder/vec/intrapred.inc | 10 ++++++---- 1 files changed, + 6 insertions(+), 4 deletions(-) + [22b068217f37] + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/encoder/IntraPred.cpp, source/encoder/primitives.h, + source/encoder/vec/intrapred.inc: + 
intrapred: new vector optimize code for predIntraDC, + xDCPredFiltering + --- source/Lib/TLibCommon/TComPrediction.cpp | 4 + + source/encoder/IntraPred.cpp | 1 + source/encoder/primitives.h | 2 + + source/encoder/vec/intrapred.inc | 471 + ++++++++++++++++++++++++++++++ 4 files changed, 478 insertions(+), 0 + deletions(-) + [66de4e152e1f] + + * source/VectorClass/vectori128.h: + vector128: constant shift for Vec8us + --- source/VectorClass/vectori128.h | 12 ++++++++++++ 1 files + changed, 12 insertions(+), 0 deletions(-) + [6764e16594d5] + + * source/VectorClass/vectori128.h, source/encoder/vec/intrapred.inc: + vector128: more general load_partial and store_partial + --- source/VectorClass/vectori128.h | 56 + +++++++++++++++++++++++++++++++------ + source/encoder/vec/intrapred.inc | 3 +- 2 files changed, 48 + insertions(+), 11 deletions(-) + [2cc4fc0e415e] + + * source/VectorClass/vectori128.h, source/encoder/vec/intrapred.inc: + vectorclass: fix Vec4ui wrong action and name + --- source/VectorClass/vectori128.h | 4 ++-- + source/encoder/vec/intrapred.inc | 4 ++-- 2 files changed, 4 + insertions(+), 4 deletions(-) + [19c3e1abfd46] + +2013-05-06 Steve Borho + + * source/encoder/vec/ipfilter16.inc, source/encoder/vec/ipfilter8.inc: + ipfilter: more pruning + [2bb2e364d57c] + + * source/encoder/vec/ipfilter.inc, source/encoder/vec/ipfilter16.inc, + source/encoder/vec/ipfilter8.inc, + source/encoder/vec/vecprimitives.inc: + vec: inc files should not include other headers + + Only vecprimitives.inc may include files; the other files are being + included within a namespace and can cause strange link problems if + headers are included. 
+ [b6fc9bf11a3e] + + * source/Lib/TLibEncoder/TEncSlice.cpp: + TEncSlice: backout replacement of calcRdCost64 with CALCRDCOST_SAD + + Those were two entirely different function calls + [74bf6c970636] + +2013-05-06 nandaku2 + + * source/encoder/vec/interpolationfilter.inc: + Merged in mandarmcw/xhevc_mandar_mahesh (pull request #116) + + Interpolationfilter primitives restructured. + [81577757e572] + +2013-05-06 Mandar Gurav + + * source/encoder/vec/CMakeLists.txt, + source/encoder/vec/interpolationfilter.inc, + source/encoder/vec/ipfilter.inc, source/encoder/vec/ipfilter16.inc, + source/encoder/vec/ipfilter8.inc, + source/encoder/vec/vecprimitives.inc: + Interpolationfilter primitives restructured. + [07679e04c0e4] + +2013-05-06 sumalatha + + * source/Lib/TLibCommon/TComRdCost.h, + source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp: + Replaced the function call for calcRdCost() with MACRO CALCRDCOST + such that in macro, the switch case for selection of "eDFunc" is + removed. + [7f3fe334fb1e] + +2013-05-05 Steve Borho + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: fix warning with 15bpp build + [cd46a81725fb] + + * source/Lib/TLibCommon/TComYuv.cpp: + TComYuv: use primitives for remaining copy functions + [34471ada8ab3] + + * source/Lib/TLibCommon/TComYuv.cpp: + TComYuv: use pixel copy primitives + [bce043dae3ef] + + * source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComPicYuv.h: + white-space nits, no changes + [2bed949a69ab] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: get access to the original source pixel plane and real + strides + [d959edb866f9] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: get offset into original picture buffer + + pcOrgYuv seems to point to a small YUV buffer holding only the + pixels for the current CTU under analysis. 
+ [d85c6703f022] + + * source/Lib/TLibCommon/TComPrediction.cpp: + TComPrediction: fix MSVC reported stack corruption + + When run in debug mode, MSVC reported stack corruption near + leftColumn. The 1st for loop is iterating over blkSize+1 + + This looks like an HM bug exposed by reducing MAX_CU_SIZE to the + actual max 64 + [14d43f8589d0] + + * source/encoder/vec/pixel.inc, source/encoder/vec/pixel16.inc, + source/encoder/vec/pixel8.inc: + pixel: add vectorized sad_48_x3 + [765fc92c7d80] + + * source/encoder/pixel.cpp, source/encoder/vec/pixel.inc, + source/encoder/vec/pixel16.inc, source/encoder/vec/pixel8.inc: + pixel: add vectorized sad_48_x4 + [254937aa7425] + + * source/encoder/vec/pixel.inc, source/encoder/vec/pixel16.inc, + source/encoder/vec/pixel8.inc: + pixel: add vectorized sad 48 + [f48a79bd4693] + + * source/encoder/vec/pixel16.inc: + pixel16: fix bug in sad_12_x4 + [a1ec7d7c20ee] + + * source/encoder/vec/pixel16.inc: + pixel16: fix bug in sad_12_x3 + [da712d42ba62] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: fix compiler warning + [3be39838c8d4] + + * source/encoder/vec/pixel8.inc: + pixel8: fix bug in sad_4_x4 + [0b7655d1a1b3] + + * source/encoder/pixel.cpp, source/encoder/primitives.cpp, + source/encoder/primitives.h, source/test/pixelharness.cpp: + pixel: add 48 sized partitions + [98c591baafeb] + +2013-05-05 Min Chen + + * source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComRom.h: + intrapred: use alignment piAdibuf + --- source/Lib/TLibCommon/TComPattern.cpp | 18 +++++++++--------- + source/Lib/TLibCommon/TComPrediction.cpp | 4 ++-- + source/Lib/TLibCommon/TComRom.h | 1 + 3 files changed, 12 + insertions(+), 11 deletions(-) + [48895f57159b] + + * source/Lib/TLibCommon/TComPattern.cpp: + intrapred: perform by merge loop + --- source/Lib/TLibCommon/TComPattern.cpp | 10 ++-------- 1 files + changed, 2 insertions(+), 8 deletions(-) + [51b0fb4398be] + + * 
source/Lib/TLibCommon/TComPattern.cpp: + intrapred: perform pixel copy with memcpy() in + TComPattern::fillReferenceSamples + --- source/Lib/TLibCommon/TComPattern.cpp | 23 + ++++------------------- 1 files changed, 4 insertions(+), 19 + deletions(-) + [bab59102ce8e] + + * source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComPattern.h: + intrapred: remove unsupported mode code for Chroma_FromLuma + --- source/Lib/TLibCommon/TComPattern.cpp | 15 +++------------ + source/Lib/TLibCommon/TComPattern.h | 3 +-- 2 files changed, 4 + insertions(+), 14 deletions(-) + [da7059cdee2c] + + * source/Lib/TLibCommon/TComPattern.cpp: + intrapred: perform pixel copy with memcpy() in + TComPattern::initAdiPattern + --- source/Lib/TLibCommon/TComPattern.cpp | 11 +++-------- 1 files + changed, 3 insertions(+), 8 deletions(-) + [d249be0a4674] + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibEncoder/TEncSearch.cpp: + intrapred: correct buffer name from YuvExt to PredBuf + --- source/Lib/TLibCommon/TComPrediction.cpp | 14 +++++++------- + source/Lib/TLibCommon/TComPrediction.h | 12 ++++++------ + source/Lib/TLibEncoder/TEncSearch.cpp | 14 +++++++------- 3 files + changed, 20 insertions(+), 20 deletions(-) + [3faafbf81d93] + + * source/Lib/TLibCommon/TComRom.h: + reduce MAX_CU_SIZE to 64 + --- source/Lib/TLibCommon/TComRom.h | 2 +- 1 files changed, 1 + insertions(+), 1 deletions(-) + [800d16ed3620] + +2013-05-05 Steve Borho + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + TEncSearch: SAD primitives all expect 16byte aligned fenc buffer + [7f90a9927f4a] + + * source/encoder/vec/pixel.inc, source/encoder/vec/pixel16.inc, + source/encoder/vec/pixel8.inc: + pixel: finish optimizations - performance summary below + + Measured on AMD Trinity A10M (more tuning can be done) + + 16b orig 16b opt 8b orig 8b opt sad[ 4x4] 3.79x 3.51x 2.53x 3.34x + sad_x3[ 4x4] 2.02x 4.17x 4.17x 5.99x sad_x4[ 
4x4] 3.63x 4.35x 3.64x + 5.95x sad[ 4x8] 3.76x 3.90x 4.21x 3.47x sad_x3[ 4x8] 3.64x 4.33x + 5.67x 7.58x sad_x4[ 4x8] 2.56x 4.74x 6.39x 8.21x sad[ 4x12] 4.25x + 3.94x 3.11x 4.74x sad_x3[ 4x12] 4.67x 4.98x 4.93x 8.96x sad_x4[ + 4x12] 2.82x 5.31x 6.71x 9.39x sad[ 4x16] 4.42x 3.96x 3.41x 3.60x + sad_x3[ 4x16] 4.65x 5.09x 6.98x 8.60x sad_x4[ 4x16] 3.17x 5.35x + 7.81x 8.41x sad[ 4x24] 5.08x 4.07x 3.69x 4.19x sad_x3[ 4x24] 5.53x + 5.83x 7.70x 8.82x sad_x4[ 4x24] 3.52x 5.73x 8.32x 9.37x sad[ 4x32] + 5.59x 3.94x 3.99x 4.00x sad_x3[ 4x32] 5.83x 5.27x 7.39x 9.31x + sad_x4[ 4x32] 3.46x 5.41x 7.94x 9.42x sad[ 4x64] 5.40x 3.68x 3.60x + 4.36x sad_x3[ 4x64] 5.89x 5.95x 8.27x 8.82x sad_x4[ 4x64] 3.56x + 6.37x 9.17x 9.50x sad[ 8x4] 6.10x 5.74x 5.87x 5.84x sad_x3[ 8x4] + 6.39x 8.13x 9.75x 11.53x sad_x4[ 8x4] 3.26x 8.52x 9.19x 10.51x sad[ + 8x8] 6.05x 6.33x 8.87x 8.01x sad_x3[ 8x8] 6.32x 9.18x 11.84x 13.61x + sad_x4[ 8x8] 4.14x 10.67x 12.58x 14.97x sad[ 8x12] 8.03x 6.79x + 10.30x 6.91x sad_x3[ 8x12] 8.84x 10.46x 12.15x 17.12x sad_x4[ 8x12] + 5.28x 11.92x 12.83x 16.98x sad[ 8x16] 8.23x 6.79x 7.63x 9.17x + sad_x3[ 8x16] 9.26x 11.33x 13.88x 17.11x sad_x4[ 8x16] 5.49x 12.22x + 14.27x 17.19x sad[ 8x24] 10.02x 6.93x 8.16x 9.32x sad_x3[ 8x24] + 11.32x 11.86x 15.38x 17.91x sad_x4[ 8x24] 6.48x 13.29x 15.28x 17.95x + sad[ 8x32] 10.96x 6.94x 8.03x 7.51x sad_x3[ 8x32] 12.70x 10.95x + 14.03x 17.67x sad_x4[ 8x32] 6.87x 13.09x 15.16x 17.80x sad[ 8x64] + 10.61x 6.76x 8.18x 8.75x sad_x3[ 8x64] 12.35x 12.67x 16.05x 17.61x + sad_x4[ 8x64] 6.70x 14.02x 17.93x 17.90x sad[ 12x4] 12.01x 8.40x + 4.32x 11.48x sad_x3[ 12x4] 14.04x 7.17x 9.33x 12.21x sad_x4[ 12x4] + 4.42x 7.61x 9.60x 12.85x sad[ 12x8] 5.78x 10.04x 3.35x 14.10x + sad_x3[ 12x8] 5.31x 8.08x 10.47x 15.54x sad_x4[ 12x8] 4.34x 9.55x + 11.67x 16.22x sad[12x12] 6.83x 10.16x 2.92x 15.61x sad_x3[12x12] + 6.52x 9.10x 9.02x 18.85x sad_x4[12x12] 3.65x 9.97x 11.17x 18.88x + sad[12x16] 6.68x 10.00x 3.90x 14.68x sad_x3[12x16] 6.04x 9.09x + 12.11x 18.01x 
sad_x4[12x16] 4.12x 10.20x 10.40x 16.77x sad[12x24] + 7.30x 10.50x 3.72x 17.46x sad_x3[12x24] 6.50x 9.48x 11.51x 18.17x + sad_x4[12x24] 4.42x 10.80x 12.50x 19.86x sad[12x32] 7.26x 10.35x + 3.60x 15.86x sad_x3[12x32] 7.37x 9.28x 10.14x 19.57x sad_x4[12x32] + 4.50x 10.46x 11.64x 19.26x sad[12x64] 7.39x 10.43x 3.68x 16.47x + sad_x3[12x64] 6.97x 9.85x 11.97x 19.00x sad_x4[12x64] 4.74x 10.70x + 13.86x 19.14x sad[ 16x4] 6.70x 11.02x 16.55x 15.20x sad_x3[ 16x4] + 2.94x 10.38x 14.13x 20.10x sad_x4[ 16x4] 9.72x 10.35x 13.35x 18.95x + sad[ 16x8] 8.44x 13.62x 23.35x 22.69x sad_x3[ 16x8] 8.36x 11.16x + 17.99x 25.49x sad_x4[ 16x8] 12.18x 13.23x 18.34x 23.65x sad[16x12] + 10.72x 13.74x 25.86x 19.47x sad_x3[16x12] 11.61x 13.38x 19.21x + 31.36x sad_x4[16x12] 13.33x 13.83x 19.15x 31.15x sad[16x16] 9.02x + 13.12x 20.34x 24.25x sad_x3[16x16] 10.56x 13.96x 22.97x 31.96x + sad_x4[16x16] 14.20x 14.20x 23.01x 28.75x sad[16x24] 11.53x 13.88x + 25.07x 21.63x sad_x3[16x24] 12.69x 14.23x 25.64x 32.38x + sad_x4[16x24] 12.16x 15.11x 21.14x 34.09x sad[16x32] 13.78x 12.36x + 24.08x 20.00x sad_x3[16x32] 11.73x 13.73x 25.03x 33.07x + sad_x4[16x32] 12.30x 14.24x 17.85x 33.74x sad[16x64] 12.01x 14.42x + 29.62x 20.27x sad_x3[16x64] 9.17x 15.85x 28.93x 26.92x sad_x4[16x64] + 10.81x 13.19x 13.30x 27.21x sad[ 24x4] 11.59x 17.13x sad_x3[ 24x4] + 10.48x 13.78x sad_x4[ 24x4] 11.10x 11.48x sad[ 24x8] 12.13x 18.30x + sad_x3[ 24x8] 10.54x 21.45x sad_x4[ 24x8] 10.38x 20.92x sad[24x12] + 13.24x 19.48x sad_x3[24x12] 11.65x 24.07x sad_x4[24x12] 10.70x + 23.24x sad[24x16] 14.78x 16.64x sad_x3[24x16] 12.28x 24.14x + sad_x4[24x16] 11.00x 23.40x sad[24x24] 13.03x 18.30x sad_x3[24x24] + 11.58x 25.63x sad_x4[24x24] 11.58x 24.97x sad[24x32] 13.31x 20.60x + sad_x3[24x32] 11.98x 24.82x sad_x4[24x32] 11.75x 24.23x sad[24x64] + 14.44x 19.69x sad_x3[24x64] 11.99x 24.59x sad_x4[24x64] 10.38x + 24.77x sad[ 32x4] 14.00x 13.52x 16.56x 22.89x sad_x3[ 32x4] 11.05x + 10.79x 19.78x 27.12x sad_x4[ 32x4] 10.05x 9.93x 13.85x 24.46x 
sad[ + 32x8] 8.41x 14.32x 27.08x 25.34x sad_x3[ 32x8] 12.81x 11.18x 27.39x + 29.64x sad_x4[ 32x8] 10.39x 9.57x 13.85x 29.09x sad[32x12] 10.30x + 16.07x 26.75x 27.38x sad_x3[32x12] 14.42x 11.35x 23.73x 33.19x + sad_x4[32x12] 10.77x 10.22x 16.79x 33.34x sad[32x16] 9.98x 15.20x + 29.70x 27.72x sad_x3[32x16] 10.34x 11.84x 27.65x 32.54x + sad_x4[32x16] 10.91x 11.07x 12.62x 33.63x sad[32x24] 10.65x 15.88x + 32.63x 28.39x sad_x3[32x24] 11.09x 11.14x 29.62x 33.79x + sad_x4[32x24] 11.21x 11.18x 13.53x 34.14x sad[32x32] 11.62x 14.51x + 32.15x 24.65x sad_x3[32x32] 14.23x 11.83x 29.88x 30.06x + sad_x4[32x32] 11.63x 11.44x 15.78x 33.15x sad[32x64] 11.39x 15.72x + 35.13x 28.99x sad_x3[32x64] 10.43x 10.39x 28.87x 36.20x + sad_x4[32x64] 10.73x 9.71x 14.08x 36.61x sad[ 64x4] 9.92x 12.61x + 24.17x 25.65x sad_x3[ 64x4] 10.89x 9.16x 20.94x 27.04x sad_x4[ 64x4] + 7.64x 9.67x 11.56x 24.33x sad[ 64x8] 8.64x 13.43x 25.54x 26.88x + sad_x3[ 64x8] 11.55x 9.80x 22.40x 23.04x sad_x4[ 64x8] 8.34x 10.10x + 12.34x 18.87x sad[64x12] 9.41x 14.17x 27.20x 28.67x sad_x3[64x12] + 12.39x 9.30x 23.76x 26.13x sad_x4[64x12] 7.88x 9.21x 14.02x 22.10x + sad[64x16] 8.66x 14.51x 28.84x 30.87x sad_x3[64x16] 8.98x 9.60x + 24.42x 27.69x sad_x4[64x16] 8.27x 9.67x 29.10x 24.21x sad[64x24] + 9.16x 13.90x 24.11x 27.05x sad_x3[64x24] 8.56x 9.75x 16.25x 28.11x + sad_x4[64x24] 9.61x 9.99x 30.53x 24.77x sad[64x32] 10.06x 14.10x + 25.67x 28.96x sad_x3[64x32] 8.46x 9.97x 13.58x 29.50x sad_x4[64x32] + 10.15x 10.14x 31.67x 25.41x sad[64x64] 10.06x 14.75x 25.82x 32.25x + sad_x3[64x64] 8.04x 8.93x 31.06x 30.90x sad_x4[64x64] 9.36x 9.88x + 26.55x 26.53x + [82239d0023c8] + + * source/encoder/vec/pixel.inc, source/encoder/vec/pixel16.inc, + source/encoder/vec/pixel8.inc: + pixel: optimize sad_4 and sad_8 to not need helpers + [3ad29dc2d608] + + * source/encoder/vec/pixel8.inc: + pixel8: more cleanups, no behavior changes + [9ebffae04e6f] + + * source/encoder/vec/pixel8.inc: + pixel8: intermediate variable rename, to be consistent 
with pixel16 + [6f3ac7d4cfbd] + + * source/encoder/vec/pixel8.inc: + pixel8: 8bpp sad_64_x4 and sad_64_x3 cannot have residual rows + [6fe1fd06c373] + + * source/encoder/vec/pixel16.inc: + pixel16: optimize 16bpp sad_N_x3 (adapted copies of sad_N_x4) + [9b3489a15596] + + * source/encoder/vec/pixel16.inc: + pixel16: optimize sad_N_x4 16bpp + [fde81cd177e9] + + * source/encoder/vec/pixel.inc, source/encoder/vec/pixel8.inc: + pixel8: optimize sad_N_x4 8bpp, add sad_24_x4 + [2b22cc59fa15] + + * source/encoder/vec/pixel8.inc: + pixel8: optimize 8bpp sad_12_x4 + [379a62ef320b] + + * source/encoder/vec/pixel.inc, source/encoder/vec/pixel16.inc, + source/encoder/vec/pixel8.inc: + pixel: add sad_24 for 8bpp and 24bpp + [8851cd024b0f] + + * source/encoder/vec/pixel.inc, source/encoder/vec/pixel16.inc: + pixel16: add sad_24_x3 + [e2002f58b3a8] + + * source/encoder/vec/pixel16.inc: + pixel16: fix some unaligned access exceptions triggered by VC10 + 16bpp x64 + [3d667e4c590c] + + * source/encoder/vec/pixel16.inc: + pixel16: fixups for 16bpp primitives + [e88afd27d35a] + + * source/encoder/vec/pixel.inc, source/encoder/vec/pixel16.inc, + source/encoder/vec/pixel8.inc: + pixel: optimize sad_8_x4 - now same X over C as sad_8_x3 + [c17efc4e7f4a] + + * source/encoder/vec/pixel.inc, source/encoder/vec/pixel16.inc, + source/encoder/vec/pixel8.inc: + pixel: optimize sad_4_x4, remove unnecessary sad_X_4_x4 functions + [06554d1dedba] + + * source/encoder/vec/pixel.inc, source/encoder/vec/pixel16.inc, + source/encoder/vec/pixel8.inc: + pixel: remove more useless lx parameters and for-loops for x3 + functions + [72a35f3f6e2e] + + * source/encoder/vec/pixel8.inc: + pixel: optimize sad_64_x3 - about 20% improvement + [46c082e89598] + + * source/encoder/vec/pixel8.inc: + pixel: optimize sad_32_x3 - about 20% improvement, remove unused + sad_32_4 + [12351a667289] + + * source/encoder/vec/pixel.inc, source/encoder/vec/pixel16.inc, + source/encoder/vec/pixel8.inc: + pixel: optimize sad_16_x3 - 
about 30% improvement, remove sad_16_4 + [ca2192011209] + +2013-05-04 Steve Borho + + * source/encoder/vec/pixel.inc, source/encoder/vec/pixel16.inc, + source/encoder/vec/pixel8.inc: + pixel: optimize sad_12_x3 - about 2x improvement, remove sad_12_4 + [fed8c8152de7] + + * source/encoder/vec/pixel8.inc: + pixel8: slight tuneup for sad_8_3x + [056c029b7762] + + * source/encoder/vec/pixel8.inc: + pixel8: allow 16 rows in a row of 4-wide 8bpp blocks - up to 30% + faster + [d906f4351c02] + + * source/encoder/vec/pixel.inc, source/encoder/vec/pixel16.inc, + source/encoder/vec/pixel8.inc: + pixel: sad_8_x3<4> is faster than sad_8_4_x3<8,4> which seems to + have no use + [e834992087df] + +2013-05-05 ShinYee Chung + + * source/encoder/vec/CMakeLists.txt: + cmake: Specify the compilation of architecture SSE2 explicitly. + + The VS2010 crashes when executing one of the sad variants. The + instruction vmovdqa() was found generated, crashing cpuid == 4 CPU. + [9cf98cfecaa7] + +2013-05-04 Steve Borho + + * source/encoder/vec/pixel.inc, source/encoder/vec/pixel16.inc, + source/encoder/vec/pixel8.inc: + pixel: sad_4_x3 should not have lx template var; it was broken for + 8bpp + + Removes sad_4_4_x3 since it added no optimization + [a89f7e937705] + + * source/encoder/vec/pixel.inc: + pixel: enable SAD PARTITION_16x24 now that it is supported for 8bpp + [bc25ce99514e] + + * source/encoder/vec/pixel8.inc: + pixel8: use Vec8us to accumulate sums with addSumAbsDiff() + + From 30 to 100% speedups for 8bpp SAD functions + [a194cda3c978] + + * source/VectorClass/vectori128.h: + vector: add Vec8us::addSumAbsDiff() + + This avoids data casts and removes a couple of instructions from + some tight paths. 
+ [f167c85b6707] + + * source/encoder/vec/CMakeLists.txt: + cmake: add a comment about inc files listed in PrimitivesVec project + [bc8c389159c5] + + * source/CMakeLists.txt: + cmake: remove option for multithreaded build, enable unconditionally + [013198c3eb01] + +2013-05-05 ShinYee Chung + + * source/encoder/vec/interpolationfilter.inc: + Interpolation: Fix compile warning/error on unused parameter. + [e160901853b5] + + * source/encoder/vec/vecprimitives.inc: + primitives: Fix compile error on re-declaration of selectf() and + selectd(). + + It is probably due to cycle in the header includes. + [3736fe22253e] + +2013-05-04 Min Chen + + * source/VectorClass/vectori128.h, source/encoder/vec/intrapred.inc: + intrapred: more improvement 8bpp DC for 32 and 64 blocks + --- source/VectorClass/vectori128.h | 45 + ++++++++++++++++++++++++++++++++++++++ + source/encoder/vec/intrapred.inc | 38 + +++++++++++++------------------ 2 files changed, 61 insertions(+), + 22 deletions(-) + [c8fe324edc4d] + + * source/encoder/threadpool.cpp: + threadpool: VS2008 not defined _WIN32_WINNT_VISTA + --- source/encoder/threadpool.cpp | 2 +- 1 files changed, 1 + insertions(+), 1 deletions(-) + [d5d163ce5222] + +2013-05-04 Steve Borho + + * source/encoder/vec/intrapred.inc: + intrapred: remove includes and namespace from inc header + + includes must be in vecprimitives.inc, outside of the anonymous + namespace + [ed8b00af0055] + + * source/encoder/vec/CMakeLists.txt: + cmake: add intrapred.inc to the PrimitivesVec project + + So it's easy to find + [40ccd41545cb] + + * source/VectorClass/vectori128.h, source/encoder/vec/intrapred.inc: + vectori128: add fromUint64() method, only available for x64 builds + + Needs a better fallback and cross-platform x64 compilation detection + [2daae31d6ab5] + + * source/encoder/vec/intrapred.inc: + intrapred: simplify 8bpp DC for 32 and 64 blocks, now faster than C + [78aeea32fd92] + + * source/encoder/vec/intrapred.inc: + intrapred: add an 8bpp path for DC 
pred (currently slower than C at + 64x64) + + This primitive might have to use aligned loads to be faster than C.. + or have a function pointer per intra-block width to avoid the switch + statement. + [4f1b0d47e596] + +2013-05-03 Steve Borho + + * source/encoder/vec/intrapred.inc: + intrapred: use g_aucConvertToBit for bit size + [b2b21d6fe1fc] + + * source/test/intrapredharness.cpp: + intrapredharness: ensure top pixels always avail for perf test + [3bb5cca8fb52] + + * source/encoder/vec/intrapred.inc: + intrapred: perform horizontal sum at increased depth + + Test now passes with 10bit pixels + [5a3d377654c2] + + * source/test/intrapredharness.cpp: + intrapredharness: pre-calculate all random pixels, ensure top pixels + always avail + [55dc29e361fa] + + * source/test/intrapredharness.cpp: + intrapredharness: this primitive requires initROM() to be called + [793b2e36e7e8] + + * source/test/CMakeLists.txt: + testbench: 8bpp link now requires HM, just for a global array + [d883646084ac] + + * source/encoder/vec/intrapred.inc: + intrapred: disable 4100 warnings until 8bpp is supported + [3db61f610fc5] + + * source/encoder/threadpool.cpp: + threadpool: further compilation fixes for other platforms + [5c2c51892268] + + * source/test/intrapredharness.cpp: + intrapredharness: fix shadowed loop variable + [c9341cd75fca] + + * source/test/CMakeLists.txt: + cmake: add new intrapred harness to testbench build + [fee22e63bf0a] + +2013-05-04 Min Chen + + * source/test/intrapredharness.cpp, source/test/intrapredharness.h, + source/test/testbench.cpp: + intrapred: testbench + + From 990ec815902203bfb954921bac75e1913719e630 Mon Sep 17 00:00:00 + 2001 + --- source/test/testbench.cpp | 5 +- source/test/intrapredharness.cpp + | 122 ++++++++++++++++++++++++++++++++++++++ + source/test/intrapredharness.h | 53 ++++++++++++++++ 3 files + changed, 179 insertions(+), 1 deletions(-) create mode 100644 + source/test/intrapredharness.cpp create mode 100644 + source/test/intrapredharness.h + 
[50d74bce7e74] + +2013-05-03 Steve Borho + + * source/encoder/vec/intrapred.inc: + intrapred: fix one obvious bug, disable DC vec prim for 8bpp until + debugged + [1276967fdbcc] + + * source/encoder/IntraPred.cpp: + IntraPred: yes, pixel is an alias for Pel and vice-versa + [4ee48f5e5c8d] + + * source/encoder/CMakeLists.txt: + cmake: add IntraPred.cpp to CPRIMITIVES file list + [6da50ba09221] + +2013-05-03 Min Chen + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/VectorClass/vectori128.h, source/encoder/IntraPred.cpp, + source/encoder/primitives.cpp, source/encoder/primitives.h, + source/encoder/vec/intrapred.inc, + source/encoder/vec/vecprimitives.inc: + intrapred: vector code + --- source/Lib/TLibCommon/TComPrediction.cpp | 8 ++- + source/VectorClass/vectori128.h | 9 ++ source/encoder/IntraPred.cpp + | 83 ++++++++++++++++ source/encoder/primitives.cpp | 2 + + source/encoder/primitives.h | 3 + source/encoder/vec/intrapred.inc | + 156 ++++++++++++++++++++++++++++++ + source/encoder/vec/vecprimitives.inc | 3 + 7 files changed, 263 + insertions(+), 1 deletions(-) create mode 100644 + source/encoder/IntraPred.cpp create mode 100644 + source/encoder/vec/intrapred.inc + [8ba7ba4f1b69] + + * source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComPattern.h, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibEncoder/TEncSearch.cpp: + replace piAdiBuf type from Int to Pel + --- source/Lib/TLibCommon/TComPattern.cpp | 28 + ++++++++++++++-------------- source/Lib/TLibCommon/TComPattern.h | + 14 +++++++------- source/Lib/TLibCommon/TComPrediction.cpp | 22 + +++++++++++----------- source/Lib/TLibCommon/TComPrediction.h | 6 + +++--- source/Lib/TLibEncoder/TEncSearch.cpp | 6 +++--- 5 files + changed, 38 insertions(+), 38 deletions(-) + [a787dfc1a781] + + * source/Lib/TLibCommon/TComPrediction.cpp: + Intra dc prediction own function Conflicts: + + source/Lib/TLibCommon/TComPrediction.cpp + --- 
source/Lib/TLibCommon/TComPrediction.cpp | 61 + +++++++++++++++++------------- 1 files changed, 35 insertions(+), 26 + deletions(-) + [2a4abd157919] + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h: + extract Intra prediction functions from class to normal + --- source/Lib/TLibCommon/TComPrediction.cpp | 10 ++++++---- + source/Lib/TLibCommon/TComPrediction.h | 5 ----- 2 files changed, 6 + insertions(+), 9 deletions(-) + [b947edf8099b] + + * source/Lib/TLibCommon/TComPrediction.cpp: + OPT predIntraLumaAng: merge function call + --- source/Lib/TLibCommon/TComPrediction.cpp | 16 +++++----------- 1 + files changed, 5 insertions(+), 11 deletions(-) + [39e2bb372e4c] + +2013-05-03 Steve Borho + + * source/test/pixelharness.cpp: + pixelharness: cut down on iterations for measuring x3 and x4 perf + [ba8113125178] + + * source/encoder/threadpool.cpp: + threadpool: fixup commit for GCC and later Windows versions + [c8479b70bcdb] + +2013-05-03 Min Chen + + * source/encoder/threadpool.cpp: + Fix WinXP compatible + --- source/encoder/threadpool.cpp | 23 +++++++++++++++++++++-- 1 + files changed, 21 insertions(+), 2 deletions(-) + [069ddee0aceb] + +2013-05-03 Steve Borho + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/motion.cpp, + source/test/testbench.cpp: + motion: make new motion search disappear without ENABLE_PRIMITIVES + [96457db97f8e] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: don't use neighbor MVs as search candidates, merge will + check them + + I'm leaving the search candidate arguments in place, later we might + get some search candidates that are not also merge candidates + (lookahead, for one) + [078a4dfddbcd] + +2013-05-02 Steve Borho + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: found some predictor MVs + [3e656c4c7905] + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/encoder/motion.cpp, + source/encoder/motion.h, source/encoder/mv.h: + 
motion: partial integration into TEncSearch for debugging purposes + + Adds a small amount of overhead + [0842edc65dea] + + * source/encoder/vec/pixel.inc: + pixel: disable satd function completely for 8bpp + [b49e50e70bb2] + + * source/Lib/TLibCommon/TComPicYuv.cpp: + TComPicYuv: add TODOs where new performance primitives can be added + [ba309a167e38] + + * source/Lib/TLibCommon/TComPicYuv.cpp: + TComPicYuv: white-space nits, no effect + [387f505139c6] + + * source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h: + TComPic: remove unused m_bNeededForOutput + [0279f22166ba] + + * source/encoder/motion.cpp: + motion: use block copy primitive + [a116b67a5a01] + + * source/CMakeLists.txt: + cmake: the big switch. Make 8bpp the default build configuration + [6c9154cdab6f] + + * source/encoder/bitcost.cpp: + bitcost: and another cast + [af04f537b82e] + + * source/encoder/bitcost.cpp: + bitcost: add back explicit typecasts + [80b6aba5b4e4] + +2013-05-03 ShinYee Chung + + * source/encoder/bitcost.cpp: + bitcost: Use portable std::min() instead of implicit min(). + [49605906cd31] + +2013-05-02 Steve Borho + + * source/encoder/bitcost.cpp: + bitcost: document more differences between the HM and x264 + approaches + [dc6634bdf401] + + * source/encoder/bitcost.cpp: + bitcost: use integer min() for MSVC compatibility + [abe3030915d6] + +2013-05-03 ShinYee Chung + + * source/encoder/bitcost.cpp: + bitcost: Fix segfault due to out of bound accesses to logs table. + [7b18a1bab2fa] + + * source/input/y4m.cpp: + y4m: Fix -ve PSNR when encoding a y4m file. + + Sample output: + + POC 0 TId: 0 ( I-SLICE, nQP 32 QP 32 ) 23444072 bits [Y -40.5587 dB + U -40.8645 dB V -40.8654 dB] [ET 21 ] [L0 ] [L1 ] + [13cd19348761] + +2013-05-02 ShinYee Chung + + * source/encoder/motion.cpp: + motion: Fix a compile error due to undeclared memcpy(). + [85d91dfdccc6] + + * source/encoder/motion.cpp, source/encoder/motion.h: + motion: Fix compile warnings/errors due to shadowed variables. 
+ [9e6d97767358] + + * source/CMakeLists.txt, source/encoder/bitcost.cpp: + bitcost: Fix undeclared min(). + + The math.h header defines fmin(), and algorithm header defines + std::min(). We decided to use fmin(). + [ebfbc677b238] + +2013-05-02 nandaku2 + + * Merged in deepthidevaki/xhevc_deepthid (pull request #111) + + Fix compile issue in TComPrediction.cpp with disable primitives. + [4ec6cd5d2732] + +2013-05-02 Deepthi Devaki + + * source/Lib/TLibCommon/TComPrediction.cpp: + Fix compile issue in TComPrediction.cpp with disable primitives. + [a4f56554606d] + +2013-05-02 Mandar Gurav + + * source/encoder/vec/pixel.inc, source/encoder/vec/pixel16.inc, + source/encoder/vec/pixel8.inc: + Implementation of sad_x4 stage 2 and sad_x3 + [c8d6caa73c42] + +2013-05-02 Mandar Gurav + + * Merged multicoreware/xhevc into default + [700a3436aa9f] + + * source/encoder/InterpolationFilter.cpp: + Merged multicoreware/xhevc into default + [1ded90aad015] + +2013-05-02 Mandar Gurav + + * source/encoder/vec/pixel.inc, source/encoder/vec/pixel8.inc: + Implementation for sad_x4 - 8,16,32,64 with HIGH_BT_WIDTH disabled. 
+ [3dd9ebabfdcb] + +2013-05-02 Deepthi + + * source/encoder/primitives.cpp: + Compilation fix for primitives disabled + [4f863ec8e9d8] + +2013-05-02 Deepthi Devaki + + * source/encoder/CMakeLists.txt: + Merge + [0ad59133e0b6] + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/encoder/CMakeLists.txt, + source/encoder/InterpolationFilter.cpp, + source/encoder/InterpolationFilter.h: + cleaning up TComPrediction.cpp with new filters + [7827c5d4f308] + + * source/encoder/CMakeLists.txt, + source/encoder/InterpolationFilter.cpp, source/encoder/ipfilter.cpp: + Rename .cpp file with filter-primitives + [bd3f6bf45ab7] + + * source/encoder/InterpolationFilter.cpp, + source/encoder/interpolationfilter.cpp: + Backed out changeset: 2e29f87a6088 + [2efde965e0ee] + + * source/encoder/CMakeLists.txt: + Backed out changeset: dd6580ef216e + [6df4928a88a8] + + * source/encoder/CMakeLists.txt: + Renaming interpolationfilter.cpp in CMakeList + [dd6580ef216e] + + * source/encoder/InterpolationFilter.cpp, + source/encoder/interpolationfilter.cpp: + Rename .cpp file containing filter-primitives + [2e29f87a6088] + +2013-05-02 Steve Borho + + * source/encoder/CMakeLists.txt, source/encoder/motion.cpp, + source/encoder/motion.h: + motion: motion search now compiles, added to build but unused + [56dfb55dad17] + + * Merged in praveentiwari/xhevc_praveent (pull request #108) + + 2.20x more performance gain for partialButterflyInverse16 + [7b4811d9db39] + +2013-05-02 praveentiwari + + * source/encoder/vec/macroblock.inc: + 2.20x more performance gain for partialButterflyInverse16 + [eba40e7f48dc] + +2013-05-02 Mandar Gurav + + * source/encoder/pixel.cpp, source/encoder/primitives.h, + source/encoder/vec/interpolationfilter.inc, + source/encoder/vec/pixel16.inc, source/test/pixelharness.cpp: + Implementation of sad_x3 and sad_x4 with reference frame stride. 
+ [249bdb052e0a] + +2013-05-02 Steve Borho + + * source/encoder/motion.cpp, source/encoder/motion.h: + motion: compilation fixes - file still not part of build + [3dd5ac35ea76] + +2013-05-01 Steve Borho + + * source/encoder/vec/pixel8.inc: + pixel8: remove unused satd_4x4 and sa8d_8x8 + [d8bc48ebe95c] + +2013-05-02 Mandar Gurav + + * source/encoder/vec/CMakeLists.txt, + source/encoder/vec/interpolationfilter.inc, + source/encoder/vec/pixel.inc, source/encoder/vec/pixel16.inc, + source/encoder/vec/pixel8.inc, source/encoder/vec/vecprimitives.inc: + Restructure pixel.inc + [78d93b165924] + +2013-05-01 Steve Borho + + * source/test/pixelharness.cpp: + pixelharness: increase size of pixel buffers + [d30a346a00d0] + + * source/test/pixelharness.cpp: + pixelharness: test random PU block sizes + [a16589611ae6] + + * source/encoder/vec/blockcopy.inc, source/test/pixelharness.cpp: + blockcopy: couple of bug fixes, now all but p_s pass with 8bpp + [26bcd02b0ffc] + + * source/encoder/vec/blockcopy.inc, source/encoder/vec/pixel.inc: + pixel: fix HIGH_BIT_DEPTH=0 build, temp fix + [e3f72af44dff] + + * source/encoder/vec/CMakeLists.txt, source/encoder/vec/blockcopy.inc, + source/encoder/vec/vecprimitives.inc: + primitives: add blockcopy.inc - only HIGH_BIT_DEPTH is accelerated + [a2cb9e405fbe] + + * source/encoder/pixel.cpp: + pixel: fixes for block copy C ref + [122dcad10a6f] + + * source/test/pixelharness.cpp: + pixelharness: add block copy perf tests + [3842dc74d0e3] + + * source/encoder/pixel.cpp, source/encoder/primitives.h, + source/test/pixelharness.cpp, source/test/pixelharness.h: + primitive: add block copy primitives + [7f115c88c899] + + * source/test/pixelharness.cpp: + pixelharness: nit white-space + [de5953b7be6e] + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/bitcost.cpp: + bitcost: store logs as fractional values *CHANGES OUTPUTS* + + /me dumb. What was the point of storing the output of ceil() in a + float? 
+ + This was a small but unambiguous win in compression efficiency + [ca0cd4e5b78c] + + * source/encoder/bitcost.cpp: + bitcost: use just one copy of "acquire" logic + [94d5a08f85bd] + + * source/encoder/bitcost.h: + bitcost: nit - reorder methods in order they are typically used + [fd114658c2d0] + + * source/encoder/bitcost.cpp, source/encoder/bitcost.h: + bitcost: store cost estimates as 16bits, clamp to max16 + + It's somewhat pedantic to keep more bits than this, and wasteful of + precious CPU cache. If someone in the future really wants >16bit mv + costs, they can template this class and instantiate the size they + want. This had no measurable effect on the outputs. + [1522b5f696e4] + + * source/encoder/bitcost.h: + bitcost: reorder initializers to make GCC happy + [75f7df4a2567] + +2013-04-30 Steve Borho + + * source/encoder/primitives.h: + primitives: avoid redecl of CDECL with GCC + [e3e7d5fc0930] + +2013-05-01 Steve Borho + + * source/encoder/primitives.h: + Merged in mandarmcw/xhevc_mandar_mahesh (pull request #105) + + Implementation of sad_4_x3 and sad_12_x3. and sad_x4 + [6f85a46676d5] + +2013-05-01 Mandar Gurav + + * source/encoder/pixel.cpp, source/encoder/primitives.h, + source/encoder/vec/pixel.inc, source/test/pixelharness.cpp, + source/test/pixelharness.h: + Implementation of sad_x4 + [b8dac1cab20f] + + * source/encoder/vec/pixel.inc, source/test/pixelharness.cpp: + Implementation of sad_4_x3 and sad_12_x3. 
+ [e1c2cc61fd99] + +2013-05-01 Steve Borho + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/bitcost.cpp, + source/encoder/primitives.cpp, source/encoder/primitives.h: + bitcost: use new bitcost class for mv cost estimates *CHANGES + OUTPUTS* + + Sample encode statistics comparing HM vs x265 mv cost estimates: + (comparing HM vs our HM optimization, not the x264 approach) + + kbps Y PSNR U PSNR V PSNR HM getCost(): 1381.8880 32.8096 41.0167 + 43.3698 m_bc.mvcost(): 1383.2000 32.8171 41.0402 43.3580 + + (mostly noise, but it is different) + [8fbf3bf1bce5] + + * source/encoder/bitcost.cpp, source/encoder/bitcost.h, + source/encoder/primitives.cpp: + bitcost: rename cleanupCosts to destroy + [e0e5b913b3ea] + + * source/encoder/bitcost.cpp: + bitcost: bring mv costs close in line with HM's methods + + mvcost rounds earlier, so it results in even scores while the HM + rounds at the end so it usually gets odd scores, but they are + typically one digit apart. In a few places logs[] will vary from + Motion_Costs by 1, causing the result to differ by 1 lambda. + [af142516c4ff] + +2013-05-01 Deepthi + + * Merge + [55bb44dbbcab] + + * source/x265enc.cpp: + The very common sizeof bug. 
+ [961680a9f94e] + +2013-05-01 Steve Borho + + * source/encoder/primitives.cpp: + primitives: free Motion_Cost in x265_cleanup() + [c424425a66f2] + +2013-04-30 Steve Borho + + * source/encoder/bitcost.cpp, source/encoder/bitcost.h: + bitcost: precalculate 2 * log2(n) for each MVD offset + [42f145aaeea0] + + * source/encoder/mv.h: + mv: bug fixes; cannot use word shifts because x or y can be signed + [425867841725] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: HM uses different sqrt(lambda) to calculate SAD bitcosts + [28a3ad887c33] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: comment improvements and white-space nits + [fe0009c03145] + + * source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/bitcost.cpp, + source/encoder/bitcost.h: + bitcost: bug fixes and enable integration into TEncSearch.cpp + [c18cfa4d13a6] + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/encoder/bitcost.cpp, + source/encoder/bitcost.h, source/encoder/primitives.h: + bitcost: begin integration into TEncSearch + [81b815f0ac28] + + * source/encoder/bitcost.cpp, source/x265.h: + x265: comment nits + [1ca12b92b67f] + + * source/encoder/bitcost.cpp: + bitcost: fix bug caused by bad eye-sight and worse fonts + [2208db9f4e49] + + * source/encoder/primitives.cpp, source/x265.h, source/x265main.cpp: + api: add x265_cleanup() method, to make leak detectors happy + [3bbdb7451176] + + * source/encoder/CMakeLists.txt, source/encoder/bitcost.cpp, + source/encoder/bitcost.h, source/encoder/motion.h: + bitcost: add BitCost to x265 project build + [1fb55290fb14] + + * source/Lib/TLibEncoder/TEncCu.cpp, source/encoder/mv.h: + mv: remove getAbsHor|Ver access methods + [99ed9f571a1c] + + * source/Lib/TLibCommon/TComDataCU.cpp, source/encoder/mv.h: + mv: move scaleMV method to file static in TComDataCU.cpp + + This was the only file to use it, and it requires CommonDef.h so it + was cleaner to move it there than to #include CommonDef in our new + 
code. + [22f8092af4c0] + + * source/encoder/bitcost.h, source/encoder/motion.cpp, + source/encoder/motion.h, source/encoder/mv.h: + motion: simplify block offset calculations and MV API + [d45e0bf08279] + + * source/Lib/TLibCommon/TComTrQuant.cpp: + backout micro optimization in TComTrQuant + + uiQ is much larger than a pixel value, so doing calculation as Int64 + is correct This change was resulting in an infinite for-loop in + debug runs + [2803c3221494] + + * source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComMotionInfo.cpp, source/encoder/mv.h: + mv: replace setZero() calls with assignments to 0 (implicit cast) + [8bf5cfa3dc58] + + * source/encoder/mv.h: + mv: fix shadow compile warnings + [26f80f4c0e63] + + * source/encoder/vec/macroblock.inc: + Merged in mandarmcw/xhevc_mandar_mahesh (pull request #103) + + Implementation of sad_16_x3, sad_32_x3 stage 1 + [dcc9d54d3338] + +2013-04-30 Mandar Gurav + + * source/Lib/TLibCommon/TComRdCost.cpp, + source/encoder/vec/macroblock.inc: + Revert to xhevc code... 
+ [d78e74091ac4] + + * source/Lib/TLibCommon/TComRdCost.cpp, + source/encoder/vec/macroblock.inc: + Merge + [c3c08427ee92] + + * source/encoder/vec/pixel.inc: + Implementation of sad_16_x3, sad_32_x3 stage 2 + [4e9455062807] + +2013-04-30 Mandar Gurav + + * Merged multicoreware/xhevc into default + [181bdddec8a7] + +2013-04-30 Mandar Gurav + + * source/encoder/vec/pixel.inc: + Implementation of sad_16_x3, sad_32_x3 stage 1 + [d0e56fa89954] + +2013-04-30 Steve Borho + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: white-space nits + [c3dc302391b9] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComMv.h, source/encoder/CMakeLists.txt: + mv: substitute our x265::MV class for TComMv + [7f84c91c706f] + + * source/Lib/TLibCommon/TComTrQuant.cpp: + TComTrQuant: white-space nits + [2efe97558d45] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + TEncSearch: eoln fixes + [706caf90a11f] + + * source/encoder/mv.h: + mv: compilation fixes + [10022975148e] + + * source/encoder/motion.cpp: + motion: call x264_cpu_emms() at end of motion search + [ab58c2ee829a] + + * Merge + [da161f8dda40] + +2013-04-30 Deepthi + + * source/Lib/TLibCommon/TComSlice.h: + Signed/unsigned comparison mismatch + [19e430b976bd] + +2013-04-30 Deepthi Devaki + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/encoder/InterpolationFilter.cpp, + source/encoder/InterpolationFilter.h, source/encoder/primitives.h: + Filter in TComPrediction.cpp 0 - compiles with highBitDepth = 0/ 1 + [ee367956acdc] + +2013-04-30 praveentiwari + + * source/encoder/vec/macroblock.inc: + Robust logic for clip3 function and multiplication replaced with + shift + [b9a706e1c328] + + * source/encoder/vec/macroblock.inc: + .62X more performance gain for partialButterflyInverse8 + [fa15bd1d94d1] + +2013-04-30 praveen Tiwari + + * Merged multicoreware/xhevc into default + [d4c8a6822d48] + + * Merged multicoreware/xhevc into default + [0595c6492fc9] + +2013-04-29 praveen Tiwari + + * Merged 
multicoreware/xhevc into default + [e1633a0b4388] + +2013-04-26 praveen Tiwari + + * Merged multicoreware/xhevc into default + [2e307dbf8bc3] + +2013-04-25 praveen Tiwari + + * Merged multicoreware/xhevc into default + [8079c95648f2] + + * Merged multicoreware/xhevc into default + [df407ba015a9] + +2013-04-24 praveen Tiwari + + * source/encoder/vec/macroblock.inc: + Merged multicoreware/xhevc into default + [d084fa7c1507] + +2013-04-30 Deepthi + + * source/Lib/TLibCommon/TComRdCost.cpp, + source/Lib/TLibCommon/TComRdCost.h, + source/Lib/TLibEncoder/TEncSearch.cpp: + Adding support for Short-Short/Pel-Short SSE calculations. + [963b0a22e185] + + * Merge + [df3e251c1550] + + * source/Lib/TLibCommon/TComRdCost.h: + Merge + [43db01e2d0f1] + + * source/Lib/TLibCommon/TComRdCost.cpp, + source/Lib/TLibCommon/TComRdCost.h: + Restructuring TComRdCost to allow for Pel,Short combinations of SSE + calc. + [6929ad484805] + +2013-04-30 Steve Borho + + * source/encoder/vec/pixel.inc: + pixel: disable sad_x3 functions for HIGH_BIT_DEPTH=0 + [7d724b6d631d] + + * source/encoder/primitives.cpp, source/encoder/primitives.h: + primitives: nit cleanups + [4fabe59015fb] + + * source/encoder/primitives.h: + Merged in ggopu/ggopu_xhevc (pull request #101) + + Removed GetCost() Function + [5923e7ea9058] + +2013-04-29 ggopu + + * source/Lib/TLibCommon/TComRdCost.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/encoder/primitives.cpp, source/encoder/primitives.h: + Removed GetCost() Function + [d59ff5bf67d9] + +2013-04-30 Steve Borho + + * Merged in sumalatha/xhevc_sumalatha (pull request #99) + + C Level Optimization on xRateDistOptQuant() + [9920f881602d] + +2013-04-30 sumalatha + + * source/Lib/TLibCommon/TComTrQuant.cpp: + C Level Optimization on xRateDistOptQuant() - 1. removed all memsets + ( except for pdCostCoeffGroupSig[]) - initialized only those array + elements that will be read first before its updated from the inside + the main for loop itself. 2. 
optimized a few of local variables used + in certain calculations inside the loop. + + Also done changes based on steves suggestion + [8bb8a28015b6] + + * source/Lib/TLibCommon/CommonDef.h, + source/Lib/TLibCommon/TComTrQuant.cpp: + Backed out changeset: eb3d76f250f8 + [93f865e11524] + +2013-04-30 Sumalatha Polureddy + + * source/Lib/TLibCommon/CommonDef.h, + source/Lib/TLibCommon/TComTrQuant.cpp: + Merged multicoreware/xhevc into default + [5d85009c6e6b] + +2013-04-29 sumalatha + + * source/Lib/TLibCommon/CommonDef.h, + source/Lib/TLibCommon/TComTrQuant.cpp: + C Level Optimization on xRateDistOptQuant() - 1. removed all memsets + ( except for pdCostCoeffGroupSig[]) - initialized only those array + elements that will be read first before its updated from the inside + the main for loop itself. 2. optimized a few of local variables used + in certain calculations inside the loop. + [eb3d76f250f8] + +2013-04-30 Steve Borho + + * Merged in mandarmcw/xhevc_mandar_mahesh (pull request #100) + + Incoprporated Steve's comments - stage 2. + [a68d3284d72a] + +2013-04-30 Mandar Gurav + + * source/test/pixelharness.cpp, source/test/pixelharness.h: + Incoprporated Steve's comments - stage 2. + [45002b3f2b97] + + * source/encoder/pixel.cpp, source/encoder/primitives.h, + source/encoder/vec/pixel.inc, source/test/pixelharness.cpp, + source/test/pixelharness.h: + Incoprporated Steve's comments. + [c44e18dd256b] + +2013-04-30 Mandar Gurav + + * source/VectorClass/vectori128.h: + Merged multicoreware/xhevc into default + [d5e09736dcb5] + +2013-04-29 Mandar Gurav + + * source/VectorClass/vectori128.h, source/encoder/pixel.cpp, + source/encoder/vec/pixel.inc: + Revert old updates. + [3c1645af294a] + + * source/encoder/pixel.cpp, source/encoder/primitives.h, + source/encoder/vec/pixel.inc, source/test/pixelharness.cpp, + source/test/pixelharness.h: + Implemented sad_x3 for calculation of 3 SAD's in single call. 
+ [e79693d592d3] + +2013-04-28 Mandar Gurav + + * source/VectorClass/vectori128.h, source/encoder/pixel.cpp, + source/encoder/primitives.h, source/encoder/vec/pixel.inc, + source/test/pixelharness.cpp: + Merged multicoreware/xhevc into default + [5692f4fd8ad5] + +2013-04-29 Mandar Gurav + + * source/encoder/vec/pixel.inc, source/test/pixelharness.cpp: + Revert to new xhevc code. + [21650019be5e] + +2013-04-23 Mandar Gurav + + * source/VectorClass/vectori128.h: + Vector class library function + [f4465f31f99c] + + * source/encoder/vec/pixel.inc: + Implemented x265_sad16 function into vectorclass librasy. + [aed28167b644] + + * source/encoder/pixel.cpp, source/encoder/primitives.h, + source/encoder/vec/pixel.inc, source/test/pixelharness.cpp, + source/test/pixelharness.h: + Implementation of SAD using intrinsics for 8 bit values. + [6af56ea1b74c] + +2013-04-30 Steve Borho + + * source/encoder/bitcost.cpp, source/encoder/bitcost.h, + source/encoder/motion.cpp, source/encoder/motion.h, + source/encoder/mv.h: + add a new motion search core feature - is not part of the project + yet + + The MV, BitCost, MotionReference, and MotionEstimation classes will + form the core of the x265 motion estimation system; eventually + replacing TEncSearch, TComMV, and most of TComRdCost. 
+ [6937acae97d5] + +2013-04-29 Steve Borho + + * source/VectorClass/instrset_detect.cpp: + instrset_detect: workaround on Mac OS X + [3b6663bd7fb9] + + * source/VectorClass/vectori128.h, source/encoder/vec/CMakeLists.txt: + cmake: isolate warnings disables within vectori128.h with two + exceptions + [c9ef5adc6f2b] + + * source/test/testbench.cpp: + testbench: mingw_aligned_malloc requires malloc.h + [8e58b5fc3759] + + * source/test/CMakeLists.txt, source/test/filterharness.cpp, + source/test/ipfilterharness.cpp, source/test/testbench.cpp: + testbench: remove compiler warning training wheels + [377da7143671] + + * source/CMakeLists.txt: + cmake: add a comment about why we ignore warnings in the CLI files + [1ec1c161fb9e] + + * source/Lib/TLibEncoder/TEncGOP.cpp: + TEncGOP: provide bool values for Bool arguments + [0cfc86971117] + + * source/Lib/TLibEncoder/TEncCfg.h: + TEncCfg: fix compiler warning + [9e9458725136] + + * source/Lib/TLibCommon/TComSlice.h: + TComSlice: use Bools where appropriate + [20b2e53cd947] + + * source/Lib/TLibCommon/TComPicSym.h: + TComPicSym: fix signed/unsigned mangling in the header + [2dd01031906c] + + * source/Lib/TAppCommon/program_options_lite.h: + TAppCommon: give classes noimpl copy constructors, makes MSVC happy + [18e014cab1d4] + + * source/CMakeLists.txt: + cmake: move tests above CLI, avoid pulling in disabled warnings + [4fd9c4ed8a0a] + + * source/CMakeLists.txt: + cmake: add /wd4800 and /wd4018 to x265main.cpp and x265cfg.cpp + [8ec187f802f3] + + * source/Lib/CMakeLists.txt: + cmake: add back /wd4018 and /wd4800 to HM source files + + It would be a gargantuan effort to clean all these up at this point. 
+ Instead we will just make all the new code compile clean and these + old files will eventually be replaced + [7b1566c818a6] + + * source/Lib/TLibCommon/CommonDef.h: + Remove warnings disables from CommonDef.h, will add back with more + focus + + These were leaking into the encoder/ folder where we want our new + code to be warning free. + [a949c6f35a76] + +2013-04-29 ShinYee Chung + + * source/encoder/TShortYUV.cpp: + shortyuv: Fix compile warnings/errors due to comparing signed and + unsigned integers. + + GCC 4.8.0 Linux64: + + source/encoder/TShortYUV.cpp:424:32: error: comparison between + signed and unsigned integer expressions [-Werror=sign-compare] + for(int x = 0; x < iWidth; x++) + [03df6f73f616] + +2013-04-29 Deepthi + + * source/Lib/TLibEncoder/TEncSearch.cpp: + Quick Fix for compilation errors in TEncSearch::xEstimateResidualQT. + This has to be re-architected asap. + [5305e9dd0f98] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, source/encoder/TShortYUV.h: + Merge + [f298a0cc803f] + + * source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TComYuv.h: + One more function for TComYuv + [27e841216893] + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + Changing the datatype of m_pcQTTempTComYuv. Need to review this at + some point - why does it hold both recon and residual data? + [8e100ef76380] + + * source/Lib/TLibCommon/TComYuv.cpp: + Changes in TComYuv class + [9625d6b6bfc0] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h: + Pel to Short changes in TComTrQuant + [2161442f663e] + + * source/encoder/TShortYUV.cpp, source/encoder/TShortYUV.h: + Adding more functions to TShortYUV for TComyuv compatilibity. We can + use inheritance later for better maintenance and code reuse. 
+ [03ed7310e127] + +2013-04-28 Deepthi + + * source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TComYuv.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, source/encoder/TShortYUV.cpp, + source/encoder/TShortYUV.h: + Adding a bunch of function definition/declarations to TShortYUV and + TComYUV classes. Compilation successful. + [dc004e397c4d] + +2013-04-27 Deepthi + + * source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + Replacing all function declarations/definitions with TShortYUV. + [d17b57e25292] + +2013-04-29 ShinYee Chung + + * source/test/testharness.h: + test: Fix compile error due to undefined size_t. + [a3f27ae55e67] + + * source/Lib/TLibCommon/CommonDef.h: + HM/Common: Fix the undefined aligned memory allocation function. + [15c7da6ad105] + +2013-04-28 Steve Borho + + * source/test/mbdstharness.cpp, source/test/pixelharness.cpp, + source/test/testbench.cpp, source/test/testharness.h: + testbench: add aligned alloc and free helper functions + [a37bc125a568] + + * source/CMakeLists.txt, source/test/CMakeLists.txt: + cmake: define PLATFORM_LIBS in main cmake file, use for x265-cli + [bd817f383232] + + * source/Lib/TLibCommon/CommonDef.h: + aligned_malloc() needs stdlib.h + [589ba1b50405] + + * source/encoder/threadpool.cpp: + threadpool: use _BitScanReverse64() instead of __lzcnt64() + + All variants of __lzcnt() fail to work correctly on Intel CPUs, with + no warnings. 
+ [04fa32c7226a] + + * source/Lib/TLibCommon/TComRdCost.cpp, + source/Lib/TLibCommon/TComRdCost.h, + source/Lib/TLibEncoder/TEncSearch.cpp: + remove FIX203 #define, it will always be 1 + [6d3f0c68430c] + +2013-04-27 Steve Borho + + * source/Lib/TLibCommon/TComRdCost.h, + source/Lib/TLibEncoder/TEncSearch.cpp: + rename GetsadFunctions to getSadFunctions() + [7e2616997871] + + * source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h, source/encoder/primitives.cpp, + source/encoder/threadpool.cpp, source/x265.h, source/x265cfg.cpp, + source/x265cfg.h: + api: improvements to C API, make encoder auto-setup primitives and + threadpool + + Remove m_FrameSkip and m_framesToBeEncoded from the public params + [a75806cbe14b] + + * source/Lib/TLibEncoder/TEncCu.cpp: + nits, no logic changes + [0f02c76c63d1] + + * source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp: + white-space nits + [0b87935166f5] + + * source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp, source/PPA/ppa.h, + source/PPA/ppaCPUEvents.h: + ppa: add some high-level events + [f8031314c27d] + + * source/x265.h, source/x265cfg.cpp, source/x265cfg.h: + x265: for public API, only internal bitdepth is configurable + + And when HIGH_BIT_DEPTH is 0, the value is ignored. This change + allows the input and output bit depth fields to be configurable for + !HIGH_BIT_DEPTH builds, but are only allowed to be 8. + + When x265 is used as a library, the input and output bitdepths are + completely up to the user. libx265 does not have to know what they + are. 
+ [e9afe7587e44] + + * source/encoder/CMakeLists.txt: + cmake: add x265.h to libx265 project + [1a076f248de6] + + * source/Lib/TLibCommon/TComTrQuant.cpp: + white-space nit + [e866124b9b3f] + + * source/Lib/TLibCommon/TComRdCost.cpp: + TComRdCost: micro optimizations, more for clarity than anything else + [2df4c0434338] + + * source/CMakeLists.txt, source/Lib/TLibCommon/TypeDef.h: + cmake: define HIGH_BIT_DEPTH to 0 for 8bpp + [02382ad0344d] + + * source/encoder/TShortYUV.h: + tshortyuv: white-space nits + [06a3ab224b5a] + + * source/encoder/TShortYUV.h: + tshortyuv: forward slashes for includes + [a25e6ee7923e] + +2013-04-27 Deepthi + + * source/Lib/TLibEncoder/TEncSearch.cpp: + Chroma fixes for Pixel=UShort; Bitrate issues sorted out, bit + mismatch issues with Pixel=Short + [2ccf934a7a83] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp: + Merge + [d5bf22fdf56d] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp: + 3 cheers!! Pixel now works with UShort (at a much higher bitrate). 
+ [05ddb2c288c6] + +2013-04-26 Deepthi + + * source/encoder/TShortYUV.h: + Adding more member functions to TShortYUV + [78cd7e5ab0a1] + +2013-04-26 Steve Borho + + * source/Lib/TLibCommon/TComRdCost.cpp: + shift SAD result from TComRdCost::getSADPart() rather than each diff + [03433b6ee01b] + + * source/Lib/TLibCommon/TComRdCost.cpp: + remove partition size check from TComRdCost::getSADPart() + [194f459ded86] + + * source/Lib/TLibCommon/TypeDef.h: + remove unused Pxl type + [4915dcd016d8] + + * source/Lib/TLibCommon/TypeDef.h: + clean up white-space alignment in TypeDef.h + [52d5df36432e] + + * source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/TEncRateCtrl.cpp, + source/Lib/TLibEncoder/TEncSlice.cpp: + remove L0033_RC_BUGFIX #define, it will always be 1 + [8485a6a1c12d] + + * source/Lib/TLibCommon/TComRdCost.cpp, + source/Lib/TLibCommon/TComRdCost.h, source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/TEncCfg.h, source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncCu.h, source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncRateCtrl.cpp, + source/Lib/TLibEncoder/TEncRateCtrl.h, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.h, + source/Lib/TLibEncoder/TEncTop.cpp, source/x265cfg.cpp, + source/x265enc.cpp: + remove RATE_CONTROL_LAMBDA_DOMAIN #define, it will always be 1 + [88101df1faee] + + * source/test/mbdstharness.cpp: + white-space nit + [941581484868] + + * source/encoder/x86/CMakeLists.txt: + cmake: tell gcc not to sqawk about empty ASM setup function + [659cbd8026b8] + + * source/Lib/TLibCommon/CommonDef.h, + source/Lib/TLibCommon/TComTrQuant.cpp: + improve portability of alignment macros + [12708950babd] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TypeDef.h: + remove MATRIX_MULT #define, it will always be 0 + [b7e14c53edda] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComPicSym.cpp, + 
source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibCommon/TypeDef.h, source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncTop.cpp, source/x265cfg.cpp, + source/x265enc.cpp: + remove ADAPTIVE_QP_SELECTION #define, it will always be 1 + [c33334661f82] + + * source/Lib/TLibCommon/TypeDef.h, source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.h: + remove SAO_ENCODING_CHOICE #define, it will always be 1 + [1b2d55dd7b4f] + + * source/Lib/TLibCommon/TypeDef.h, source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncSearch.cpp, source/x265cfg.cpp, + source/x265enc.cpp: + remove L0232_RD_PENALTY #define, it will always be 1 + [9dcea9d2b459] + + * source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/TEncCavlc.cpp: + remove L0372 #define, it will always be 1 + [4fa6f3a09dc7] + + * source/Lib/TLibCommon/TypeDef.h, source/Lib/TLibEncoder/TEncGOP.cpp, + source/x265cfg.cpp: + remove L0444_FPA_TYPE #define, it will always be 1 + [c86464d72455] + + * source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/TEncCavlc.cpp, + source/Lib/TLibEncoder/TEncTop.cpp: + remove L0255_MOVE_PPS_FLAGS #define, it will always be 1 + [c198a462c1c8] + + * source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/TEncCavlc.cpp: + remove L0363_BYTE_ALIGN #define, it will always be 1 + [64d5e486b138] + + * source/Lib/TLibCommon/TComDataCU.cpp: + fix eoln damage in TComDataCU.cpp + [13b5ff7b9bb6] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TypeDef.h: + remove L0363_MVP_POC #define, it will always be 1 + [b354aa200feb] + + * 
source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/TEncCavlc.cpp, + source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncTop.cpp, source/x265cfg.cpp, + source/x265enc.cpp: + remove L0046_CONSTRAINT_FLAGS #define, it will always be 1 + [a06f0d0bec2b] + + * source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/TEncCavlc.cpp: + remove L0043_MSS_IDC, L0116_ENTRY_POINT #defines, they will always + be 1 + [5b78e69a6347] + + * source/Lib/TLibCommon/TComSlice.h, source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/SEIwrite.cpp: + remove L0045_CONDITION_SIGNALLING #define, it will always be 1 + [3adb926ba0a7] + + * source/Lib/TLibCommon/SEI.h, source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/SEIwrite.cpp: + remove L0046_RENAME_PROG_SRC_IDC #define, it will always be 1 + [0b26e3187fdb] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/TEncCavlc.cpp, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncTop.cpp: + remove L0043_TIMING_INFO #define, it will always be 1 + [5e92e75e0bb6] + + * source/Lib/TLibCommon/TypeDef.h: + typo nits + [bb19a6cf86dd] + + * source/Lib/TLibCommon/SEI.h, source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/SEIwrite.cpp, + source/Lib/TLibEncoder/TEncGOP.cpp: + remove L0047_APS_FLAGS #define, it will always be 1 + [f55166a0ca8b] + + * source/Lib/TLibCommon/SEI.h, source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/SEIwrite.cpp, + source/Lib/TLibEncoder/TEncGOP.cpp: + remove L0044_CPB_DPB_DELAY_OFFSET #define, it will always be 1 + [472b0e4bc12d] + + * source/Lib/TLibCommon/NAL.h, source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/SEIwrite.cpp, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h: + remove L0045_NON_NESTED_SEI_RESTRICTIONS #define, it will always be + 1 + [f716604c56ab] + + * 
source/Lib/TLibCommon/SEI.h, source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/SEIwrite.cpp, + source/Lib/TLibEncoder/TEncGOP.cpp: + remove L0045_PERSISTENCE_FLAGS #define, it will always be 1 + [397acf975a04] + + * source/Lib/TLibCommon/SEI.h, source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/SEIwrite.cpp, + source/Lib/TLibEncoder/TEncCavlc.cpp, + source/Lib/TLibEncoder/TEncGOP.cpp: + remove L0044_DU_DPB_OUTPUT_DELAY_HRD #define, it will always be 1 + [8d3ed21fb716] + + * source/Lib/TLibCommon/SEI.h, source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/SEIwrite.cpp, + source/Lib/TLibEncoder/TEncGOP.cpp: + remove L0328_SPLICING #define, it will always be 1 + [355807b023d6] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/TEncCavlc.cpp: + remove L0363_DU_BIT_RATE #define, it will always be 1 + [426f0abeaed4] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/TEncCavlc.cpp: + remove FIX1071 #define, this temporary fix has been there for 1.5 + years + [b43f11555c88] + + * source/Lib/TLibCommon/TypeDef.h, source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncTop.cpp, source/x265cfg.cpp, + source/x265enc.cpp: + remove L0386_DB_METRIC #define, it will always be 1 + [9eb517f53928] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/TEncCavlc.cpp, source/x265cfg.cpp: + remove L0323_DPB #define, it will always be 1 + [5e53a1acd0de] + + * source/Lib/TLibCommon/CommonDef.h, + source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/TEncCfg.h, + 
source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncTop.cpp, source/x265cfg.cpp, + source/x265enc.cpp: + remove L0034_COMBINED_LIST_CLEANUP #define, it will always be 1 + [c44835756f8c] + + * source/Lib/TLibCommon/ContextModel.cpp, + source/Lib/TLibCommon/ContextModel.h, + source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/TEncBinCoderCABAC.cpp, + source/Lib/TLibEncoder/TEncBinCoderCABAC.h, + source/Lib/TLibEncoder/TEncBinCoderCABACCounter.cpp, + source/Lib/TLibEncoder/TEncBinCoderCABACCounter.h, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.h, + source/Lib/TLibEncoder/TEncSbac.h, + source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h: + remove FAST_BIT_EST #define, it will always be 1, improve + readability + [7d1e89b0e90a] + + * source/Lib/TLibCommon/TComRdCost.cpp, + source/Lib/TLibCommon/TComRdCost.h, source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/TEncSearch.cpp: + remove NS_HAD #define, it will always be 0, improve readability + [b455c6b5b150] + + * source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + remove ZERO_MVD_EST #define, it will always be 0, improve + readability + [7c96e41f3155] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + uncrustify: cleanup TEncSearch.cpp after AMP_MRG define removal + [f93d8ade008f] + + * source/Lib/TLibCommon/TComDataCU.h, source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h: + remove AMP_MRG #define, it will always be 1, improve readability + [900e0173b6fe] + + * source/Lib/TLibCommon/TypeDef.h, source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncCu.h: + remove AMP_ENC_SPEEDUP #define, it will always be 1, improve + readability + [423fefc35e88] + + * 
source/Lib/TLibCommon/TComRdCost.cpp, + source/Lib/TLibCommon/TComRdCost.h, source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/TEncSearch.cpp: + remove AMP_SAD #define, it will always be 1, improve readability + [0efc8467e6cd] + + * source/encoder/x86/asm-primitives.cpp: + asm: disable assembly; was causing stack corruption in debug runs + + The SATD functions are getting called from random places that do not + enforce input buffer alignment. These need to be fixed before ASM + can be re-enabled. + [962848c1c6ff] + + * source/test/CMakeLists.txt: + cmake: add primitive defines to test folder + [0bc70419ada8] + +2013-04-27 ShinYee Chung + + * source/encoder/TShortYUV.cpp: + shortyuv: Fix the '\' in an #include. + + Use '/' instead for compatibility with Windows and Linux. + [c325ba0eb7b3] + +2013-04-26 Steve Borho + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/encoder/primitives.cpp, source/encoder/primitives.h: + primitive: remove FastHeight(), use same logic with/without + ENABLE_PRIMITIVES + + * Allow iSubShift if iRows is greater than 12, since half-height + blocks will be supported. + * Unconditionally use the partition returned from PartitionFromSizes() + [aa94262bd0b9] + +2013-04-26 nandaku2 + + * Merged in deepthidevaki/xhevc_deepthid (pull request #94) + + Modified FIlter vec primitives + [c531582449c4] + +2013-04-26 Deepthi Devaki + + * source/encoder/vec/interpolationfilter.inc: + Modified Filter vec primitives + [b0115edcc276] + + * source/test/ipfilterharness.cpp: + Fix errors in IPfilterharness + [c32c36a3ac29] + +2013-04-26 Deepthi + + * source/encoder/CMakeLists.txt: + Merge + [0e30f0ceb6c3] + + * source/encoder/TShortYUV.cpp, source/encoder/TShortYUV.h: + Formatting the TShortYUV class. 
+ [a178a6e93c00] + + * source/Lib/TLibCommon/TComPrediction.h, + source/encoder/CMakeLists.txt, source/encoder/TShortYUV.cpp, + source/encoder/TShortYUV.h: + Introducing TShortYUV formally + [80ff753040be] + +2013-04-26 Deepthi Devaki + + * source/encoder/vec/interpolationfilter.inc, + source/test/ipfilterharness.cpp: + FilterConvert - vec primitives + [dbf2e884f457] + + * source/encoder/InterpolationFilter.cpp, + source/encoder/vec/interpolationfilter.inc: + Vectorized FilterHorizontal-pel-pel and pel_short + [5ec9ce76685c] + + * Merge + [2af6e7c7b715] + + * source/test/ipfilterharness.cpp: + Changes to ipfilterharness.cpp + [6accfad1e082] + +2013-04-25 Deepthi Devaki + + * source/encoder/vec/CMakeLists.txt, + source/encoder/vec/interpolationfilter.inc, + source/encoder/vec/vecprimitives.inc, + source/test/ipfilterharness.cpp: + Vectorized version of filterVertical_short_Pel + [4faf8dcd1180] + +2013-04-26 Deepthi Devaki Akkoorath + + * Merged multicoreware/xhevc into default + [f34557bd0c1b] + +2013-04-25 Deepthi Devaki + + * source/encoder/InterpolationFilter.cpp: + Typecast result val to pel. + [8cc213785f0f] + +2013-04-25 Steve Borho + + * source/x265cfg.cpp: + x265: output string nit + [062b636765af] + + * source/x265.h, source/x265cfg.cpp, source/x265cfg.h, + source/x265enc.cpp: + x265: move encoder configurable fields into a C friendly header + [ad159c607977] + + * source/CMakeLists.txt, source/encoder/CMakeLists.txt: + cmake: make vector and asm build flags local to encoder/ folder + + Toggling the ASM or vector primitives should not require the HM + library to be rebuilt. 
+ [e942246d64aa] + + * source/CMakeLists.txt, source/encoder/CMakeLists.txt: + cmake: move VectorClass include path within encoder/ folder + [dc3899c4f7a7] + + * source/CMakeLists.txt: + cmake: remove getopt from build; leave files in place for potential + later use + [fdda9447ee37] + + * source/CMakeLists.txt, source/Lib/CMakeLists.txt: + cmake: move program_options_lite out of HM lib and into CLI project + [1d8a40959e29] + + * source/encoder/primitives.cpp: + primitives: turn partition checks into asserts; we expect success + [0f56cdd99c40] + + * source/x265main.cpp: + main: nits in output header + [f8a551edd2b7] + + * source/encoder/vec/pixel.inc: + pixel: add vector width/height 12 SAD functions + [27f5f9a00390] + + * source/encoder/pixel.cpp, source/encoder/primitives.cpp, + source/encoder/primitives.h, source/test/pixelharness.cpp: + pixel: add support for width/height of 12 + [e2cb9064721e] + + * source/encoder/vec/pixel.inc: + pixel: add vector width/height 24 SAD functions + [2c86ffb3f28b] + + * source/test/pixelharness.cpp: + pixelharness: fix up partition name strings + [a968ffee3ff1] + + * source/encoder/pixel.cpp, source/encoder/primitives.cpp, + source/encoder/primitives.h: + pixel: add support for width/height of 24 + [fb552f51143d] + + * source/encoder/pixel.cpp: + pixel: reorder C refs to match enums, add missing 64 sized SATD + functions + + Intra will not use 64 sized partitions, but sub-pel inter might. I'm + surprised this didn't break uterly before? + [167863d172b5] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + white-space nit + [4ccf1edbb887] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + search: do not use iSubShift if it would result in slow path + *CHANGES OUTPUT* + + This doesn't have a large impact on performance today because motion + search is still so bloody innefficient. But it is the right thing to + do. 
+ [f99e73f1e0f5] + + * source/encoder/primitives.cpp, source/encoder/primitives.h: + primitives: add a FastHeight() method to give early warnings of slow + paths + [7a97443c2155] + + * source/encoder/primitives.cpp: + primitives: simplify PartitionFromSizes() + [f4b4f25eb901] + + * source/encoder/primitives.h: + primitives: add macros for 32byte aligment + [b65c2c4fa585] + + * source/encoder/primitives.h: + primitives: re-order enums so partitions with the same width are + contiguous + [0d3a7ebba9e9] + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/encoder/primitives.cpp: + Merged in deepthidevaki/xhevc_deepthid (pull request #92) + + Call Filter Primitives from SamplingQ, SamplingH + [57aa9e90c935] + +2013-04-25 Deepthi Devaki + + * source/Lib/TLibEncoder/TEncSearch.cpp: + Add filter Primitives call in SamplingQ + [4e66d9fa333c] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + Add Filter Primitives call in SamplingQ + [9362b0761fa0] + + * source/encoder/InterpolationFilter.cpp: + Remove assert from filters + [286250886456] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + Call Filter Primitives from SamplingQ, SamplingH + [c005c9980a4b] + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/encoder/InterpolationFilter.h: + Merge + [94205c681a97] + + * source/encoder/InterpolationFilter.cpp, + source/test/ipfilterharness.cpp: + Fix unused variables. + [3a12d696bf54] + +2013-04-25 Deepthi Devaki Akkoorath + + * Merged multicoreware/xhevc into default + [995b6235dffc] + +2013-04-25 Deepthi Devaki + + * source/test/ipfilterharness.cpp: + Add limits.h to IPFilterHarness. 
+ [a2483d531f95] + + * source/encoder/CMakeLists.txt, source/encoder/primitives.cpp, + source/test/CMakeLists.txt: + Filter c primitives - add to Cmake and primitives.cpp + [410a769be98e] + +2013-04-25 Steve Borho + + * source/Lib/TLibEncoder/TEncSearch.cpp: + Merged in ggopu/gopu_xhevc (pull request #89) + + included 64 Bit in PartitionFromSizes and Eliminated the + setDisParam() call from xTZSearchHelp() and Removed the primitive + call for rdCost for all the sad functions + [0004001ac17b] + +2013-04-25 ggopu + + * source/Lib/TLibEncoder/TEncSearch.cpp: + included 64 Bit in PartitionFromSizes and Eliminated the + setDisParam() call from xTZSearchHelp() and Removed the primitive + call for rdCost for all the sad functions Resolved the issue based + on the comment + [3e30fb1adc96] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + included 64 Bit in PartitionFromSizes and Eliminated the + setDisParam() call from xTZSearchHelp() and Removed the primitive + call for rdCost for all the sad functions + --solved the issues based on the comment + [cbde0042b475] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + included 64 Bit in PartitionFromSizes and Eliminated the + setDisParam() call from xTZSearchHelp() and Removed the primitive + call for rdCost for all the sad functions + -- solved the issues based on the comment + [c6af59bec3cd] + + * source/Lib/TLibCommon/TComRdCost.cpp, + source/Lib/TLibCommon/TComRdCost.h, + source/Lib/TLibEncoder/TEncSearch.cpp: + included 64 Bit in PartitionFromSizes and Eliminated the + setDisParam() call from xTZSearchHelp() and Removed the primitive + call for rdCost for all the sad functions + [a3a21e07ef69] + + * source/Lib/TLibCommon/TComRdCost.cpp, + source/Lib/TLibCommon/TComRdCost.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/encoder/primitives.cpp: + Included 64 Bit in PartitionFromSizes and Eliminated the + setDisParam() call from xTZSearchHelp() Removed the primitive call + for rdCost for all the sad functions + [df5479194d8d] + 
+2013-04-24 Gopu G + + * Merged multicoreware/xhevc into default + [388938d47d3a] + + * Merged multicoreware/xhevc into default + [18941a42d529] + +2013-04-23 Gopu G + + * source/Lib/TLibVideoIO/TVideoIO.h, + source/Lib/TLibVideoIO/TVideoIOY4m.cpp, + source/Lib/TLibVideoIO/TVideoIOY4m.h, + source/Lib/TLibVideoIO/TVideoIOYuv.cpp, + source/Lib/TLibVideoIO/TVideoIOYuv.h: + Merged multicoreware/xhevc into default + [a1d0b89d4a09] + +2013-04-22 Gopu G + + * source/Lib/encoder.cpp, source/Lib/encoder.h, + source/Lib/libmd5/MD5.h, source/Lib/libmd5/libmd5.c, + source/Lib/libmd5/libmd5.h: + Merged multicoreware/xhevc into default + [51911ce2d52e] + +2013-04-25 nandaku2 + + * Merged in deepthidevaki/xhevc_deepthid (pull request #91) + + Added new IPFilter function pointers. Added TestSuite for new + IPFilter primitives. + [ceded0279117] + +2013-04-25 Deepthi Devaki + + * source/encoder/InterpolationFilter.cpp: + assert bitDepth==8 + [81b6dda66506] + + * source/encoder/primitives.h, source/test/ipfilterharness.cpp, + source/test/ipfilterharness.h, source/test/testbench.cpp: + Added new IPFilter function pointers. Added TestSuite for new + IPFilter primitives. 
+ [6e541a1f02e4] + +2013-04-25 Deepthi + + * source/Lib/TLibEncoder/TEncSearch.cpp: + Merge + [93dcae8999dd] + + * source/Lib/TLibCommon/TComPrediction.cpp: + Random data type changes + [dde5cb2850a0] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + Correct typecasts in IFSamplingQ and H + [3b74bb5a814c] + + * cfg/encoder_I_15P.cfg: + Turn off SAO + [fb2f0ff73dc3] + +2013-04-25 Deepthi Devaki + + * source/encoder/CMakeLists.txt: + Add InterpolationFilter.cpp to CMakeList + [4356b72d5c7b] + + * source/encoder/InterpolationFilter.cpp, + source/encoder/InterpolationFilter.h: + Changes to filter (fixing merge conflict) + [92c17b5fe436] + +2013-04-25 Deepthi Devaki Akkoorath + + * source/encoder/InterpolationFilter.h: + Merged multicoreware/xhevc into default + [a6ee0ff27646] + +2013-04-25 Deepthi Devaki + + * source/encoder/InterpolationFilter.h: + Backed out changeset: cc54b45a0eac + [d2aebf4a7562] + +2013-04-24 Deepthi Devaki + + * source/encoder/InterpolationFilter.cpp, + source/encoder/InterpolationFilter.h: + added filterVertical_pel_pel + [e2cacd2ec890] + + * source/encoder/InterpolationFilter.cpp, + source/encoder/InterpolationFilter.h: + Split interpolationfilter.h and .cpp + [cc54b45a0eac] + + * source/encoder/InterpolationFilter.h: + Removed copying coeff to temporary array + [9a69b9fd012f] + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/encoder/InterpolationFilter.h: + modified filter assuming bitDepth is always 8 + [1bb0a3358d50] + + * source/encoder/InterpolationFilter.h: + Removed filterVertical_pel_pel. 
+ [76a182472142] + + * source/encoder/InterpolationFilter.h: + Add memcpy in filterCopy + [6ea7aa1a93c3] + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/encoder/InterpolationFilter.h: + Eliminated redundant operations in filter + [6f17253fe853] + +2013-04-24 Steve Borho + + * source/encoder/vec/macroblock.inc: + Merged in praveentiwari/xhevc_praveent (pull request #87) + + Vector code for partialButterflyInverse32 + [09cf53b336c3] + +2013-04-24 praveentiwari + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/encoder/macroblock.cpp, source/encoder/vec/macroblock.inc: + vector code for partialButterflyInverse32 + [71b1a222d57a] + +2013-04-24 praveen Tiwari + + * Merged multicoreware/xhevc into default + [c43e9ed20d1f] + +2013-04-24 praveentiwari + + * source/test/mbdstharness.cpp, source/test/mbdstharness.h: + Test code for partialButterflyInverse32 + [e47972d48684] + +2013-04-24 ShinYee Chung + + * source/encoder/InterpolationFilter.h: + Interpolation: Fix unused variable in the horizontal filter. + [3d35a4bbd255] + +2013-04-24 Steve Borho + + * source/encoder/InterpolationFilter.h: + fix build on Linux, always use forward slashes in includes + [e4e556b60058] + +2013-04-24 Deepthi + + * Merge + [ebdfe69383cc] + + * source/Lib/TLibCommon/TComYuv.cpp: + Static_casts in TComYuv + [8541479eac97] + +2013-04-24 Deepthi Devaki + + * source/Lib/TLibEncoder/TEncSearch.cpp: + Fixed case mismatch in include + [e345fae910b3] + +2013-04-24 Deepthi Devaki Akkoorath + + * source/Lib/TLibEncoder/TEncSearch.cpp: + Merged multicoreware/xhevc into default + [73ed43fb0ef7] + +2013-04-24 Deepthi Devaki + + * source/encoder/InterpolationFilter.h: + Added InterpolationFilter.h + [74d34f6cd481] + + * source/encoder/interpolationFilter.h: + removed interpolationFilter.h + [98afe5c82a2e] + + * source/Lib/TLibEncoder/TEncSearch.cpp, + source/encoder/CMakeLists.txt, source/encoder/interpolationFilter.h: + Fixed data types of srcPtr,dstPtr and intPtr. 
+ [1cfc4aa1e5be] + +2013-04-23 Deepthi Devaki + + * source/Lib/TLibEncoder/TEncSearch.cpp: + Changed typecasting to Pel. + [2a77751c7ab8] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + Modified call to Filters based on input/ouput data type + [58cee8796bb4] + + * source/encoder/CMakeLists.txt, source/encoder/interpolationFilter.h: + Added new filter functions for different datatypes(Pel or short) for + input/output. + [77b56bf695f2] + +2013-04-23 Deepthi Devaki Akkoorath + + * source/Lib/TLibVideoIO/TVideoIO.h, + source/Lib/TLibVideoIO/TVideoIOY4m.cpp, + source/Lib/TLibVideoIO/TVideoIOY4m.h, + source/Lib/TLibVideoIO/TVideoIOYuv.cpp, + source/Lib/TLibVideoIO/TVideoIOYuv.h, + source/encoder/vec/macroblock.inc: + Merged multicoreware/xhevc into default + [6daee65aa014] + + * source/Lib/encoder.cpp, source/Lib/encoder.h, + source/Lib/libmd5/MD5.h, source/Lib/libmd5/libmd5.c, + source/Lib/libmd5/libmd5.h, source/encoder/vec/macroblock.inc: + Merged multicoreware/xhevc into default + [eb50336c8520] + +2013-04-23 Deepthi Devaki + + * source/encoder/vec/macroblock.inc: + fix the merge conflicts + [2996c71961a0] + +2013-04-19 Deepthi Devaki Akkoorath + + * Merged multicoreware/xhevc into default + [afcb19e80d04] + +2013-04-24 Deepthi + + * Merge + [3e006ed9fb6c] + +2013-04-23 Deepthi + + * source/Lib/TLibEncoder/TEncSearch.cpp: + More static_cast in IntraCoding + [979f31a68ee5] + + * Merge + [5c3124219338] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + static_casts in IntraCoding + [ba3596764ed0] + + * source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp: + Change to short to prevent infinite loop + [11fe08841d59] + +2013-04-23 Steve Borho + + * source/x265enc.cpp: + x265enc: check return status of input.readPicture() + [0484be696c65] + + * source/input/y4m.cpp: + input: further y4m reading fixes + [a399bd21bdf5] + + * source/input/y4m.cpp: + input: declare header as char[] so sizeof works correctly + [f9a651676c2f] + +2013-04-23 ShinYee Chung + + * source/encoder/md5.cpp: + 
md5: Fix incorrect memory reset on MD5 context structure. + [fd8e4a7004cd] + +2013-04-23 Steve Borho + + * source/test/testbench.cpp: + testbench: report bit depth in testbench output log + [dc2cd230f09d] + + * source/encoder/vec/pixel.inc: + pixel: refactor pixel templates, optimizations mostly for 8bpp, see + below + + SAD part 16bpp opt 8bpp opt [4x4] 4.88x 4.24x 2.75x 3.16x [8x4] + 4.57x 6.40x 4.79x 5.87x [4x8] 4.95x 4.34x 3.08x 3.24x [8x8] 6.63x + 6.14x 6.03x 6.98x [4x16] 3.70x 6.10x 3.76x 3.67x [16x4] 8.51x 10.55x + 8.77x 10.75x [8x16] 7.08x 6.99x 6.65x 7.83x [16x8] 12.57x 14.31x + 11.21x 18.17x [16x16] 15.30x 13.36x 12.80x 26.20x [4x32] 4.83x 4.95x + 3.55x 3.99x [32x4] 10.30x 12.03x 7.07x 13.03x [8x32] 6.77x 7.66x + 6.43x 8.01x [32x8] 11.01x 14.35x 9.25x 13.80x [16x32] 14.12x 14.24x + 13.38x 17.81x [32x16] 10.75x 12.63x 9.72x 19.84x [32x32] 12.11x + 12.01x 9.19x 14.41x [4x64] 4.23x 4.12x 4.00x 4.73x [64x4] 11.34x + 11.63x 8.53x 14.24x [8x64] 7.68x 7.25x 7.74x 8.32x [64x8] 10.69x + 13.61x 8.79x 14.71x [16x64] 12.93x 12.82x 11.45x 21.61x [64x16] + 10.37x 10.46x 9.12x 17.29x [32x64] 13.22x 13.03x 10.34x 19.46x + [64x32] 10.49x 10.81x 8.58x 13.40x [64x64] 11.19x 11.34x 9.45x + 16.19x + [7dbd423a8820] + + * source/VectorClass/vectori128.h: + vec: add two methods to Vec16uc that are handy for pixelcmp + [d45d724f45e4] + + * source/x265cfg.cpp: + x265cfg: more HIGH_BIT_DEPTH fixes + [3118ee99c520] + + * source/VectorClass/vectori128.h: + vec: fix eoln in vectori128.h + [f0069ad5eb70] + + * source/PPA/ppaCPUEvents.h, source/output/y4m.cpp, + source/output/yuv.cpp: + ppa: add profile events for file write + [667b19cb2ca6] + + * source/input/input.h, source/input/y4m.h, source/output/y4m.cpp: + nit: prune unnecessary includes + [31f5f01b6a80] + + * source/encoder/md5.cpp, source/encoder/md5.h: + nit: hide internal function + [f4b07506d49b] + + * source/encoder/threadpool.cpp, source/test/testpool.cpp: + threadpool: fix 32bit Windows __lzcnt_2x32 + [6dbcb3afc321] + + * 
doc/uncrustify/codingstyle.cfg: + uncrustify: comment two lines that triggered warnings + [a918e7b44e0d] + + * source/Lib/TLibCommon/CommonDef.h, + source/Lib/TLibEncoder/TEncGOP.cpp: + take HM updates up to 10.1rc + [fdace7753eed] + + * source/output/output.h, source/output/y4m.h, source/output/yuv.h, + source/x265cfg.cpp: + output: add error handling for output reconstructed video file + [3c4a4ccc8da8] + + * source/output/yuv.cpp, source/output/yuv.h: + output: use ofstream for yuv + [5e083ea9e0a7] + + * source/output/y4m.cpp, source/output/y4m.h: + output: use ofstream for y4m + [dc215c6aa2ff] + + * source/output/output.cpp, source/output/output.h, + source/output/y4m.cpp, source/output/y4m.h, source/x265cfg.cpp: + output: pass frame rate to output file + [8808b77229fe] + + * source/x265enc.cpp: + we must initialize the fields which have been deprecated from config + [29c9d0740692] + + * source/x265cfg.cpp: + use estimated frame count + [1e3a180ec0ad] + + * source/input/input.h, source/input/y4m.cpp, source/input/y4m.h, + source/input/yuv.cpp, source/input/yuv.h: + input: implement guessFrameCount(), use std::ifstream, cleanup Y4M + header read + + Prevent double-close of YUV file handle + [7805c8506eda] + + * source/x265cfg.h: + white-space nits + [6324d2cd5e04] + + * source/x265cfg.cpp, source/x265cfg.h, source/x265enc.cpp: + cli: make bit depth arguments go away when HIGH_BIT_DEPTH=0, + simplify + + Enforce the requirement that the output bit depth cannot be larger + than the internal bit depth. Remove "conformance mode" and aiPad[]. + Those operations, including color space adjustments, don't belong in + an encoder. They should be done by a video processing library prior + to the encoder. + [5c453394226c] + + * source/encoder/x86/asm-primitives.cpp: + asm-primitives: use #if 0 rather than comments + [2d660854c182] + + * source/encoder/vec/pixel.inc: + pixel: add separate implementations for 8bit pixels + + the current 8bit versions are sub-optimal. 
Needs psadbw + [c557f437293c] + + * source/encoder/pixel.cpp: + pixel: remove old TODO comment + [c8bef114ec96] + +2013-04-23 nandaku2 + + * Merged in praveentiwari/xhevc_praveent (pull request #84) + + Vector code for partialButterflyInverse16 + [0e66af1883ad] + +2013-04-23 praveentiwari + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/encoder/macroblock.cpp, source/encoder/vec/macroblock.inc: + Vector code for partialButterflyInverse16 + [9703357da52b] + +2013-04-23 praveen Tiwari + + * Merged multicoreware/xhevc into default + [788b291a1720] + + * Merged multicoreware/xhevc into default + [b6dcceb8ec9e] + + * Merged multicoreware/xhevc into default + [d3ed8a3f0be5] + +2013-04-23 praveentiwari + + * source/test/mbdstharness.cpp, source/test/mbdstharness.h: + Testcode for partialButterflyInverse16 + [15924dfd46bb] + +2013-04-23 praveen Tiwari + + * source/Lib/TLibVideoIO/TVideoIO.h, + source/Lib/TLibVideoIO/TVideoIOY4m.cpp, + source/Lib/TLibVideoIO/TVideoIOY4m.h, + source/Lib/TLibVideoIO/TVideoIOYuv.cpp, + source/Lib/TLibVideoIO/TVideoIOYuv.h: + Merged multicoreware/xhevc into default + [dfa318bc6c1f] + +2013-04-23 praveentiwari + + * source/encoder/vec/macroblock.inc: + Robust alternative logic for Clip3 + [6026e8c2efd5] + +2013-04-23 nandaku2 + + * Merged in ggopu/gopu_xhevc (pull request #83) + + Enhanced the Regression suite and added Regression suite for Mingw + and Linux + [e85d16d29d0c] + +2013-04-22 ggopu + + * build/BuildEncoderApplications.bat, + build/CreateRegressionPackage.bat, build/CreateRegressionPackage.sh, + build/RunEncoderApplications.bat, build/config.txt: + Enhanced the Regression suite and added Regression suite for Mingw + and Linux + [06e573f8842f] + +2013-04-23 Deepthi + + * source/Lib/TLibVideoIO/TVideoIO.h, + source/Lib/TLibVideoIO/TVideoIOY4m.cpp, + source/Lib/TLibVideoIO/TVideoIOY4m.h, + source/Lib/TLibVideoIO/TVideoIOYuv.cpp, + source/Lib/TLibVideoIO/TVideoIOYuv.h: + Merge + [e30ef3ca26c8] + + * 
source/Lib/TLibCommon/TComPrediction.cpp: + more precision related static casts in prediction. + [7d9f38dc50a8] + + * source/Lib/TLibCommon/TComLoopFilter.cpp: + Deblocking Filter made type safe + [b8763aa9e2cb] + +2013-04-23 Steve Borho + + * source/output/y4m.cpp: + output: simplify frame header generation + [a07c8d0cb501] + + * source/output/yuv.cpp: + output: handle 16bit YUV output files correctly + [b383da39acb2] + + * .hgignore: + hg: ignore y4m files + [5182602454f0] + +2013-04-23 Deepthi + + * source/Lib/TLibCommon/TComLoopFilter.cpp: + static_cast to remove precision loss in loop filter (during Pel type + conversions) + [569611fe5c7c] + +2013-04-23 Steve Borho + + * source/output/y4m.cpp, source/output/y4m.h, source/output/yuv.cpp, + source/output/yuv.h, source/x265cfg.cpp, source/x265enc.cpp: + output: handle 16bit Pel type from the encoder + [4f7100891763] + +2013-04-22 Steve Borho + + * source/CMakeLists.txt, source/Lib/CMakeLists.txt, + source/Lib/TLibEncoder/TEncTop.h, source/Lib/TLibVideoIO/TVideoIO.h, + source/Lib/TLibVideoIO/TVideoIOY4m.cpp, + source/Lib/TLibVideoIO/TVideoIOY4m.h, + source/Lib/TLibVideoIO/TVideoIOYuv.cpp, + source/Lib/TLibVideoIO/TVideoIOYuv.h, source/output/CMakeLists.txt, + source/output/output.cpp, source/output/output.h, + source/output/y4m.cpp, source/output/y4m.h, source/output/yuv.cpp, + source/output/yuv.h, source/x265cfg.cpp, source/x265cfg.h, + source/x265enc.cpp, source/x265enc.h: + replace TLibVideoIO with as-yet unimplemented Output classes + [1b5074c2cf17] + +2013-04-23 Steve Borho + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibEncoder/TEncCavlc.cpp: + remove unicode chars from HM source + [99deb04e11c5] + + * source/test/timer.cpp: + testbench: re-add EMMS calls to timer funcs, for x86 builds + + Apparently uint64_t somehow involves the same hardware registers as + MMX on x86 + [caa9d879d8bd] + +2013-04-22 Steve Borho + + * source/test/timer.cpp: + testbench: fix timer behavior on Linux + [900e90ef3ec6] 
+ + * source/test/CMakeLists.txt, source/test/timer.cpp: + testbench: fix Linux compile + [8a71a9d9dd3c] + + * source/test/CMakeLists.txt: + cmake: the test bench apps no long need to link with the HM libs + [c34f8aa32544] + + * source/test/filterharness.cpp, source/test/mbdstharness.cpp, + source/test/pixelharness.cpp, source/test/testbench.cpp, + source/test/testharness.h, source/test/timer.cpp: + testbench: report only speedups over C reference + [5bf58f12bed0] + + * source/x265main.cpp: + fix eoln damage to x265main.cpp + [18a1d0360507] + + * source/Lib/CMakeLists.txt, source/Lib/TLibCommon/TComPicYuvMD5.cpp, + source/Lib/TLibEncoder/TEncGOP.cpp, source/Lib/libmd5/MD5.h, + source/Lib/libmd5/libmd5.c, source/Lib/libmd5/libmd5.h, + source/encoder/CMakeLists.txt, source/encoder/md5.cpp, + source/encoder/md5.h, source/test/testpool.cpp: + replace BSD libmd5 library with tweaked GPL MD5 from Min's x265 + [4dc1489e326b] + + * source/input/y4m.cpp, source/input/yuv.cpp, source/x265enc.cpp: + move file read PPA events inside the file read classes + [ff2e1d6a420c] + + * source/x265cfg.cpp: + nit cleanups in x265cfg.cpp + [3a706e89af04] + + * source/x265main.cpp: + use correct data type for clock() output + [3a23868f4daf] + + * source/x265cfg.cpp, source/x265cfg.h, source/x265main.cpp: + move some bits out of x265main into their proper locations + [fd477ac10863] + + * source/CMakeLists.txt, source/Lib/CMakeLists.txt, + source/Lib/encoder.cpp, source/Lib/encoder.h, source/x265enc.cpp, + source/x265enc.h, source/x265main.cpp: + move CLI encoder toplevel class out of Lib/ folder + [9fbc3e142f42] + + * source/Lib/TLibCommon/TComPicYuv.cpp: + fix eoln of TComPicYuv::copyFromPicture + [00930012bc2b] + + * source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuv.h, + source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h, source/Lib/encoder.cpp, + source/x265cfg.cpp, source/x265cfg.h: + Use Input classes for file reads, bit-exact + [98ea3cd98a49] 
+ + * source/input/input.h, source/input/y4m.h, source/input/yuv.h: + prune unnecessary Input class methods + [edf1cf197b7f] + + * source/input/input.h, source/input/y4m.cpp, source/input/y4m.h, + source/input/yuv.cpp, source/input/yuv.h, source/x265.h: + introduce x265.h - public interface header, C compatible + [1b9dbdbe3b3c] + + * source/input/input.h, source/input/y4m.cpp: + input: remove include of internal header + [0fd879feff34] + + * cfg/encoder_I_15P.cfg, cfg/encoder_all_I.cfg: + remove ScalingListFile from config files + + It was never implemented, and we're unlikely to add it + [d7c1fa3d6c92] + + * source/Lib/TLibEncoder/TEncCu.cpp: + white-space tweak to TEncCu.cpp + [fcf2c9a0f811] + + * source/input/input.h, source/input/y4m.h, source/input/yuv.h: + input tweaks + [c5f827448a65] + + * .hgignore: + hg: ignore test run remnants + [2708d9bed556] + +2013-04-21 Steve Borho + + * source/CMakeLists.txt: + cmake: set -D_CRT_NONSTDC_NO_DEPRECATE on x265cfg.cpp + [6d42d360d056] + + * source/Lib/TLibEncoder/TEncCfg.h: + TEncCfg: minor white-space cleanups + [7d53979cdc0c] + +2013-04-22 nandaku2 + + * Merged in praveentiwari/xhevc_praveent (pull request #82) + + Vectorized code for partialButterflyInverse8 + [5d949f9e5b57] + +2013-04-22 praveentiwari + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/encoder/macroblock.cpp, source/encoder/vec/macroblock.inc: + Vectorized code for partialButterflyInverse8 + [2c0f4fb594da] + + * source/test/mbdstharness.cpp, source/test/mbdstharness.h: + Test code for partialButterflyInverse8 + [e1ae28e06e85] + +2013-04-22 praveen Tiwari + + * Merged multicoreware/xhevc into default + [e5de08686fce] + +2013-04-22 praveentiwari + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/encoder/macroblock.cpp, source/encoder/vec/macroblock.inc: + Vectorized code for partialButterflyInverse4 + [65245e2e352b] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/encoder/macroblock.cpp, source/encoder/vec/macroblock.inc: + Vectorized code 
for partialButterfly8 + [2331625b3ed3] + + * source/test/mbdstharness.cpp, source/test/mbdstharness.h: + Test code for partialButterflyInverse4 + [70688db2af31] + +2013-04-22 Deepthi + + * source/Lib/TLibVideoIO/TVideoIOY4m.cpp, + source/Lib/TLibVideoIO/TVideoIOYuv.cpp: + Bit Depth checks. + [303d89672240] + + * source/Lib/TLibCommon/TComInterpolationFilter.h, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp: + Final Short* parameter type casts. Now compiles with Pel as UShort. + Output incorrect. + [d2c2078f9a54] + + * source/Lib/TLibCommon/TComPrediction.h: + Merge + [83b1e733d352] + +2013-04-21 Deepthi + + * source/Lib/TLibCommon/TComPrediction.h: + Comments + [eadc32e88cc5] + +2013-04-21 Steve Borho + + * source/x265cfg.h: + eoln fixes for x265cfg.h + [80b3d7f2ea2b] + + * source/cmake/version.cmake: + cmake: print detected xhevc version + [7732fd1be70c] + + * source/x265cfg.h: + reintrodude old TVideIO classes, short term fix + [0b3a7f032ca7] + + * source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComInterpolationFilter.cpp, + source/Lib/TLibCommon/TComInterpolationFilter.h, + source/Lib/TLibEncoder/TEncCavlc.h: + re-apply some changes that were accidentally backed out + [301365973418] + + * source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/Lib/TLibCommon/TComWeightPrediction.h, + source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TComYuv.h, + source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/AnnexBwrite.h, + source/Lib/TLibEncoder/NALwrite.cpp, + source/Lib/TLibEncoder/SEIwrite.cpp, + source/Lib/TLibEncoder/SyntaxElementWriter.h, + source/Lib/TLibEncoder/TEncAnalyze.h, + source/Lib/TLibEncoder/TEncBinCoderCABAC.cpp, + source/Lib/TLibEncoder/TEncBinCoderCABAC.h, + source/Lib/TLibEncoder/TEncCavlc.cpp, + source/Lib/TLibEncoder/TEncCavlc.h, + 
source/Lib/TLibEncoder/TEncCfg.h, source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncEntropy.cpp, + source/Lib/TLibEncoder/TEncEntropy.h, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h, + source/Lib/TLibEncoder/TEncPic.cpp, + source/Lib/TLibEncoder/TEncPic.h, + source/Lib/TLibEncoder/TEncRateCtrl.cpp, + source/Lib/TLibEncoder/TEncRateCtrl.h, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.h, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSbac.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.h, + source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h, + source/Lib/TLibEncoder/WeightPredAnalysis.cpp, + source/Lib/TLibEncoder/WeightPredAnalysis.h: + remaining checkin of HM files + [d1697a1eb39d] + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibCommon/TComRdCost.cpp, + source/Lib/TLibCommon/TComRdCost.h, + source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.h: + partial checkin of more HM files + [843a9b287b93] + + * source/Lib/TLibCommon/TComPicSym.cpp, + source/Lib/TLibCommon/TComPicSym.h, + source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuv.h: + partial checkin of more HM files + [c34e62ee498a] + + * source/Lib/TLibCommon/TComBitStream.h, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComInterpolationFilter.cpp, + source/Lib/TLibCommon/TComInterpolationFilter.h, + source/Lib/TLibCommon/TComList.h, + source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComLoopFilter.h, + source/Lib/TLibCommon/TComMotionInfo.cpp, + source/Lib/TLibCommon/TComMotionInfo.h, + 
source/Lib/TLibCommon/TComMv.h, + source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComPattern.h, + source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h: + partial checkin of more HM files + [ede16ee58c25] + + * source/Lib/TLibCommon/ContextModel.h, source/Lib/TLibCommon/SEI.h, + source/Lib/TLibCommon/TComBitStream.h, source/x265cfg.h: + apply uncrustify tweak + [5151eb7614ca] + + * doc/uncrustify/codingstyle.cfg: + uncrustify: yet another tweak + [6f329f8be6cb] + + * source/Lib/TAppCommon/program_options_lite.cpp, + source/Lib/TAppCommon/program_options_lite.h, + source/Lib/TLibCommon/CommonDef.h, + source/Lib/TLibCommon/ContextModel.cpp, + source/Lib/TLibCommon/ContextModel.h, + source/Lib/TLibCommon/ContextModel3DBuffer.cpp, + source/Lib/TLibCommon/ContextModel3DBuffer.h, + source/Lib/TLibCommon/ContextTables.h, + source/Lib/TLibCommon/SEI.cpp, source/Lib/TLibCommon/SEI.h, + source/Lib/TLibCommon/TComBitCounter.h, + source/Lib/TLibCommon/TComBitStream.cpp, + source/Lib/TLibCommon/TComBitStream.h, + source/Lib/TLibCommon/TComDataCU.cpp: + partial checkin of more HM files + [a3e1adc924bc] + + * cfg/encoder_I_15P.cfg, source/Lib/encoder.cpp, source/x265cfg.cpp, + source/x265cfg.h: + partial check-in of HM bug fixes - hand tweaking of large table + formats + [b54b96b9d205] + + * source/Lib/TAppCommon/program_options_lite.cpp, + source/Lib/TAppCommon/program_options_lite.h, + source/Lib/TLibCommon/CommonDef.h, + source/Lib/TLibCommon/ContextModel.h, source/Lib/TLibCommon/SEI.h, + source/Lib/TLibCommon/TComBitCounter.h, + source/Lib/TLibCommon/TComBitStream.cpp, + source/Lib/TLibCommon/TComBitStream.h, + source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComInterpolationFilter.cpp, + source/Lib/TLibCommon/TComInterpolationFilter.h, + source/Lib/TLibCommon/TComMotionInfo.h, + source/Lib/TLibCommon/TComMv.h, source/Lib/TLibCommon/TComPattern.h, + source/Lib/TLibCommon/TComPic.h, 
source/Lib/TLibCommon/TComPicSym.h, + source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuv.h, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibCommon/TComRdCost.cpp, + source/Lib/TLibCommon/TComRdCost.h, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.h, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibCommon/TComYuv.h, + source/Lib/TLibEncoder/NALwrite.cpp, + source/Lib/TLibEncoder/SyntaxElementWriter.h, + source/Lib/TLibEncoder/TEncAnalyze.h, + source/Lib/TLibEncoder/TEncBinCoder.h, + source/Lib/TLibEncoder/TEncBinCoderCABAC.h, + source/Lib/TLibEncoder/TEncCavlc.h, + source/Lib/TLibEncoder/TEncCfg.h, source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncEntropy.h, + source/Lib/TLibEncoder/TEncGOP.h, + source/Lib/TLibEncoder/TEncPic.cpp, + source/Lib/TLibEncoder/TEncPic.h, + source/Lib/TLibEncoder/TEncRateCtrl.h, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.h, + source/Lib/TLibEncoder/TEncSbac.h, + source/Lib/TLibEncoder/TEncSearch.h, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.h, + source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h, + source/Lib/TLibVideoIO/TVideoIOY4m.h, source/Lib/encoder.cpp, + source/encoder/pixel.cpp, source/encoder/primitives.cpp, + source/encoder/primitives.h, source/encoder/threading.cpp, + source/encoder/threadpool.cpp, source/encoder/vec/pixel.inc, + source/encoder/vec/vec-primitives.cpp, + source/encoder/vec/vecprimitives.inc, source/input/input.cpp, + source/input/y4m.cpp, source/input/y4m.h, source/input/yuv.cpp, + source/input/yuv.h, source/test/filterharness.cpp, + source/test/filterharness.h, source/test/mbdstharness.cpp, + source/test/pixelharness.cpp, source/test/pixelharness.h, + source/test/testbench.cpp, source/test/testharness.h, + source/test/testpool.cpp, source/test/timer.cpp: + uncrustify: apply 
coding style tweaks + + The new style needed to be in place before I updated the HM code, in + order to reduce the number of diffs to sift through. + [6d038ab13735] + + * doc/uncrustify/codingstyle.cfg: + uncrustify: more tweaks to style setting + [c451dbda7d94] + + * doc/uncrustify/apply-to-all-source.py, + doc/uncrustify/codingstyle.cfg, doc/uncrustify/uncrustify.exe: + uncrustify: update uncrustify executable + [46c33fbe9333] + + * source/encoder/x86/asm-primitives.cpp: + asm: disable SAD assembly to get back to bit-exact outputs + + Perhaps a data alignment problem, or arguments in the wrong order + somewhere + [33d2ebf5f487] + + * Merge + [3fe21a59f6fd] + +2013-04-21 Deepthi + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h: + Cleanups; remove TComYUV structure. + [581d2044bf91] + + * source/Lib/TLibEncoder/TEncSearch.cpp: + Replace TComYUV class by TShortYUV + [c9777c7b75fe] + + * source/Lib/TLibCommon/TComPrediction.h: + More members functions added to TShortYUV + [64fc6448254c] + + * source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h: + Create and destroy filteredTmp class + [01deb523db8b] + + * source/Lib/TLibCommon/TComPrediction.h: + Defining a new class for holding Short YUV buffers. This will + replace the Pel YUV intermediate buffers in filter interpolation. 
+ [24500d602afd] + +2013-04-20 Steve Borho + + * source/encoder/macroblock.cpp: + cleanup some unfortunate uncrustify expression reflows + [a9e3b68b81d9] + +2013-04-20 Mandar Gurav + + * source/Lib/TLibCommon/TComInterpolationFilter.cpp, + source/encoder/macroblock.cpp, source/encoder/vec/macroblock.inc, + source/test/filterharness.cpp: + * Implemented vectorization of Filter_horizontal_4 + * Uncrustify all the commit files + * Print one more field (speedup = c_time/vec_time) in testbench + [d5eaa39bf3b6] + +2013-04-20 Deepthi + + * source/Lib/TLibCommon/TComPrediction.cpp: + Changing the intermediate filtering output to short-lived heap + memory (type short). + [e8b8232ddbb2] + + * source/Lib/TLibCommon/TComInterpolationFilter.cpp, + source/encoder/macroblock.cpp, source/encoder/primitives.h, + source/encoder/vec/macroblock.inc, source/test/filterharness.cpp: + Merge + [e77a39c2105c] + +2013-04-19 Mandar Gurav + + * source/encoder/macroblock.cpp, source/encoder/vec/macroblock.inc: + Filter Horizontal vectorization stage 2 + [6f25724b3f93] + +2013-04-19 Mandar Gurav + + * Merged multicoreware/xhevc into default + [aa8a54f38812] + + * source/Lib/TLibCommon/TComInterpolationFilter.cpp, + source/encoder/macroblock.cpp, source/encoder/primitives.h, + source/encoder/vec/macroblock.inc, source/test/filterharness.cpp: + Merged multicoreware/xhevc into default + [0e019be400f8] + + * source/encoder/macroblock.cpp: + Merged multicoreware/xhevc into default + [dc9f04885f43] + +2013-04-17 Mandar Gurav + + * source/Lib/TLibCommon/TComInterpolationFilter.cpp, + source/encoder/macroblock.cpp, source/encoder/vec/macroblock.inc, + source/test/filterharness.cpp: + Merged multicoreware/xhevc into default + [7ea883f9071d] + +2013-04-18 Mandar Gurav + + * source/Lib/TLibCommon/TComInterpolationFilter.cpp: + Copy file directly from xhevc. 
+ [2b8e435f91bb] + +2013-04-17 Mandar Gurav + + * build/RegressionTester.bat, source/Lib/config.cpp, + source/Lib/config.h, source/encoder/macroblock.cpp, + source/encoder/primitives.h, source/encoder/vec/macroblock.inc, + source/test/filterharness.cpp: + Merged multicoreware/xhevc into default + [7f14170ed329] + +2013-04-17 Mandar Gurav + + * source/encoder/primitives.h: + Move to Xhevc source stage 2 + [a45e2efa1fd6] + + * source/encoder/macroblock.cpp, source/encoder/vec/macroblock.inc, + source/test/filterharness.cpp: + Move to latest xhevc source + [b9152b69273b] + + * source/encoder/macroblock.cpp: + Revert changes + [d451573343a3] + + * source/encoder/macroblock.cpp: + Copy file directly from xHEVC + [aabf6e1c4ce3] + +2013-04-16 Mandar Gurav + + * source/Lib/TLibCommon/TComInterpolationFilter.cpp, + source/encoder/macroblock.cpp, source/encoder/primitives.h, + source/encoder/vec/macroblock.inc, source/test/filterharness.cpp: + Restrctured filter function calling/template as per the discussion + [675639180295] + +2013-04-20 Deepthi + + * source/encoder/macroblock.cpp, source/encoder/primitives.h, + source/encoder/vec/macroblock.inc: + Merge + [a8918bd5c534] + + * source/Lib/TLibCommon/TComInterpolationFilter.cpp, + source/Lib/TLibCommon/TComInterpolationFilter.h, + source/encoder/macroblock.cpp, source/encoder/primitives.h, + source/encoder/vec/macroblock.inc, source/test/filterharness.cpp, + source/test/filterharness.h: + Changing all filter interpolation primitives to accept short + pointers. + [53e2f960638a] + + * source/Lib/TLibCommon/TComInterpolationFilter.cpp, + source/Lib/TLibCommon/TComInterpolationFilter.h: + Changing all filter interpolation functions to accept Short + pointers. 
+ [55d414249baa] + +2013-04-19 Steve Borho + + * source/encoder/x86/CMakeLists.txt: + cmake: use .obj extension for ASM compilation outputs, makes VS + happier + [e81ef56a1f28] + + * source/CMakeLists.txt: + cmake: enable /Oi flag globally for Visual Studio + + This makes the test bench run faster compiled with VS than with GCC + [e4cbdf357e3d] + + * source/Lib/TLibCommon/TComRdCost.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp: + add x264_cpu_emms() in some likely locations + [454a38de0b89] + + * source/encoder/x86/asm-primitives.cpp: + asm: disable many of the sa8d_8x8 functions, they are causing + crashes + + (perhaps misalingment related, don't know. this will have to be + revisited later) + [9fa4ad5c65bf] + + * source/encoder/x86/asm-primitives.cpp: + asm: enable many more primitives, leave bunches disabled + + Many of the high-bit depth primitives seem to be busted, they cause + stack corruption even when called from GCC, at least the versions we + have here. + [b6c387c6256b] + + * source/test/pixelharness.cpp: + pixelharness: tweak print statements to show progress more clearly + [796e05062cdc] + + * source/encoder/x86/CMakeLists.txt: + cmake: pass ARCH_X86_64 flag to asm-primitives.cpp + [e81431e4bf6c] + + * source/encoder/vec/pixel.inc: + pixel: add a 16x4 satd vector primitive + [ea89ec5d7ace] + + * source/encoder/pixel.cpp: + pixel: fix larger dimension satd C primitives, using simple template + [b899a70a433b] + + * source/encoder/x86/CMakeLists.txt, source/encoder/x86/sad-a.asm: + cmake: add x264's sad-a.asm for more SAD functions + [55920369b223] + + * source/encoder/x86/CMakeLists.txt: + cmake: fix linking of assembly code with MSVC x86 + [2b777b1de100] + + * source/encoder/x86/asm-primitives.cpp: + asm-primitives: repair EOLN + [b758e0607830] + + * source/input/yuv.cpp: + yuv: fix MSVC x86 compiler warning + [789f01fd0555] + + * source/encoder/primitives.h, source/encoder/x86/CMakeLists.txt, + source/encoder/x86/asm-primitives.cpp: + cmake: fix 
linking of assembly code with MSVC builds (rename outputs + to .lib) + [06c9c88a35a8] + + * source/encoder/primitives.h, source/encoder/x86/CMakeLists.txt, + source/encoder/x86/asm-primitives.cpp, source/encoder/x86/cpu-a.asm, + source/test/CMakeLists.txt, source/test/timer.cpp: + asm: pull in cpu-a.asm from x264, includes EMMS function + + The intrinsic support for EMMS is broken with MS compilers, so we + have to use this short function provided by x264. We could improve + this somewhat with inline assembly in primitives.h, but it would + have to be cross-compiler. + + linking the assembly with MSVC seems clearly broken + [42546d6bb2da] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibVideoIO/TVideoIOY4m.h: + EOLN repair in HM files + [11cf80f51ee9] + + * source/cmake/CMakeASM_YASMInformation.cmake, + source/cmake/CMakeDetermineASM_YASMCompiler.cmake, + source/cmake/CMakeTestASM_YASMCompiler.cmake, + source/cmake/version.cmake: + cmake: dos to unix eoln + [b7668187381e] + + * source/test/timer.cpp: + timer: issue EMMS before attempting any floating point math + [ea5150ee49a4] + + * source/cmake/CMakeASM_YASMInformation.cmake, source/encoder/x86/asm- + primitives.cpp, source/test/CMakeLists.txt: + cmake: fix GCC Win32 linkage with x264 assembly + [74d0c855ad7d] + + * source/input/y4m.cpp: + input: fix GCC build + [234401a376c3] + + * source/input/input.h, source/input/y4m.cpp, source/input/y4m.h, + source/input/yuv.cpp, source/input/yuv.h: + input: remove unnecessary public methods, cleanup read functions + + fread returns the count of elements read. The file reader should + know nothing about margins. 
+ [40d080f01e1b] + + * source/input/y4m.cpp, source/input/yuv.cpp: + input: fix compile warnings + [875679b4cb08] + + * source/input/input.h, source/input/y4m.cpp, source/input/y4m.h, + source/input/yuv.cpp, source/input/yuv.h: + input: update headers and classes + [aa83c6072932] + +2013-04-18 Steve Borho + + * source/encoder/macroblock.cpp: + macroblock: move warning disable to top of file, remove dups + [358d7a322561] + +2013-04-19 Steve Borho + + * Merged in praveentiwari/xhevc_praveent (pull request #78) + + Vectorized code for partialButterfly32 + [fe41a1e01a93] + +2013-04-19 praveentiwari + + * source/test/mbdstharness.cpp, source/test/mbdstharness.h: + Test code of partialButterfly8 + [1f6cd6ba73f8] + +2013-04-19 praveen Tiwari + + * source/Lib/TLibCommon/TComTrQuant.cpp: + Merged multicoreware/xhevc into default + [cc32f7a1aacf] + + * source/encoder/vec/macroblock.inc: + Merged multicoreware/xhevc into default + [c2a7b5bce16e] + + * source/encoder/vec/macroblock.inc: + Merged multicoreware/xhevc into default + [37b3da3c6bf9] + +2013-04-19 praveentiwari + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/encoder/macroblock.cpp, source/encoder/vec/macroblock.inc: + Vectorized code for partialButterfly32 + [961ebc25f630] + +2013-04-19 Steve Borho + + * Merge + [d613c2fde0f8] + +2013-04-19 Deepthi + + * source/Lib/TLibCommon/TComTrQuant.cpp: + Changing 16-bit hard coded memory copy in TComTrQuant::xT + [8a517c789bbb] + + * source/Lib/TLibCommon/TComTrQuant.cpp: + Changing 16-bit hard coded memory copy in TComTrQuant::xIT + [3e327e235ef6] + +2013-04-19 Deepthi Devaki + + * source/encoder/vec/macroblock.inc: + Fix build issue in macroblock.inc with Mingw, Msys + [45b7819d08eb] + +2013-04-19 Deepthi + + * source/encoder/vec/macroblock.inc: + Comment + [4252824d518f] + +2013-04-19 Deepthi Devaki + + * source/encoder/vec/macroblock.inc, source/test/filterharness.cpp: + Vertical Filter - vector primitive with 32 bit operations. 
+ [5334266bbebb] + + * source/Lib/TLibCommon/TComInterpolationFilter.cpp, + source/encoder/macroblock.cpp, source/encoder/vec/macroblock.inc: + Filter C - primitive with isFirst, isLast and 32 bit sums. + [f517af8fa3e1] + +2013-04-19 Deepthi Devaki Akkoorath + + * source/encoder/macroblock.cpp: + Merged multicoreware/xhevc into default + [b95fa4555d92] + +2013-04-19 Deepthi Devaki + + * source/Lib/TLibCommon/TComInterpolationFilter.cpp, + source/encoder/macroblock.cpp, source/encoder/primitives.h, + source/encoder/vec/macroblock.inc, source/test/filterharness.cpp: + Backed out changeset: c51b962d2155 + [53fe4175ecf6] + +2013-04-18 Rajesh + + * source/Lib/TLibCommon/TComInterpolationFilter.cpp, + source/encoder/macroblock.cpp, source/encoder/primitives.h, + source/encoder/vec/macroblock.inc, source/test/filterharness.cpp: + Changed the array of Filter function pointers in primitives.h to + have only Filter[V_8] and FIlter[V_4]. + [c51b962d2155] + +2013-04-19 praveentiwari + + * source/test/mbdstharness.cpp, source/test/mbdstharness.h: + Test code for partialButterfly32 + [7d9db6232717] + +2013-04-18 Steve Borho + + * source/CMakeLists.txt, source/encoder/CMakeLists.txt: + cmake: move CPRIMITIVES variable creation into + source/encoder/CMakeLists.txt + [d69532344cd2] + + * source/encoder/CMakeLists.txt: + cmake: document the reason for the -march-i686 flag + [f86cfcd39cda] + + * source/encoder/threading.cpp, source/encoder/threading.h, + source/encoder/threadpool.cpp: + threading: cleanup of shutdown methods + + This fixes the deadlock we see once 1 in 1 million runs of the pool + test. 
+ [189956c5ac7e] + + * source/encoder/primitives.cpp: + primitives: remove PartitionFromSizes if primitives are disabled + [1b4bcbe0b604] + + * source/encoder/primitives.cpp, source/x265main.cpp: + primitives: report primitive compilation settings when setting up + primitives + [622eada81044] + + * source/encoder/primitives.cpp: + primitives: fix GCC compile without primitives + [f77e4badb7c3] + + * source/test/testbench.cpp: + testbench: allow to compile without primitives + [906891cfca09] + +2013-04-18 Deepthi + + * source/encoder/macroblock.cpp: + reverting vertical filter to isLast and isFirst as template + arguments. + [ba92d2b802b0] + +2013-04-17 Steve Borho + + * source/encoder/threadpool.cpp: + threadpool: ensure threads spin up before allowing any jobs to + enqueue + [1711ba4cbae9] + + * source/test/testpool.cpp: + testpool: output nothing to stdout unless hashes mismatch + [7a831fa74921] + + * source/encoder/threadpool.cpp, source/encoder/threadpool.h, + source/test/testpool.cpp: + threadpool: prevent threads seing partially destroyed job providers + [1c05ce459601] + + * source/encoder/x86/asm-primitives.cpp, source/test/mbdstharness.cpp: + further GCC fixes + [4817f4da5c71] + + * source/encoder/vec/macroblock.inc: + vec: fix GCC build of macroblock.inc + [75bd4e212561] + + * source/encoder/threadpool.cpp: + threadpool: add a mechanism to determine safety of job provider + deletion + [bada21fa0e75] + + * source/encoder/x86/CMakeLists.txt: + cmake: unify assembly file list, use a common object file name + [4d8da41d0a98] + +2013-04-18 Steve Borho + + * source/encoder/x86/CMakeLists.txt, source/encoder/x86/asm- + primitives.cpp, source/encoder/x86/const-a.asm: + asm: Fix integrations of pixel-a.asm. 
+ [2ac269a795fe] + +2013-04-18 Deepthi + + * source/encoder/vec/macroblock.inc: + Merge + [72ad141d57f3] + +2013-04-17 Deepthi + + * source/encoder/vec/macroblock.inc: + Merge + [69fe3d55e91b] + + * merged + [125be1e48dcd] + +2013-04-17 Deepthi Devaki + + * source/encoder/vec/macroblock.inc: + Modified vertical_filter to pre-load coeff + [e162b60189b8] + + * source/encoder/vec/macroblock.inc, source/test/filterharness.cpp: + Backed out changeset: b520a3ddbf36 + [082f8068d4cd] + + * source/encoder/vec/macroblock.inc, source/test/filterharness.cpp: + Modified vertical_filter to pre-load coeff + [b520a3ddbf36] + + * source/encoder/vec/macroblock.inc: + Backed out changeset: 0848ac28c541 + [31b64b009d89] + + * source/encoder/macroblock.cpp, source/encoder/vec/macroblock.inc: + Backed out changeset: fbbc5ec48951 + [df782495c742] + + * source/encoder/macroblock.cpp, source/encoder/vec/macroblock.inc: + Backed out changeset: bd537ec0cba7 + [fbbc5ec48951] + + * source/encoder/vec/macroblock.inc: + Backed out changeset: db0ac8504e1a + [0848ac28c541] + + * source/Lib/TLibCommon/TComInterpolationFilter.cpp: + Modifed TComInterpolationFilter.cpp to call vertical-filter + primitives + [2b049a973793] + + * source/test/filterharness.cpp: + Modifed filterharness to test vertical filter. + [25b506f908ff] + + * source/encoder/vec/macroblock.inc: + Modifed vertical-filter + [db0ac8504e1a] + + * source/encoder/macroblock.cpp, source/encoder/vec/macroblock.inc: + Added vectorized vertical Filter + [bd537ec0cba7] + + * source/encoder/macroblock.cpp: + Added filter_Vertical C-primitive with isFirst = isLAst = True + always. + [808c38c73c93] + + * source/encoder/macroblock.cpp: + Added filter_Vertical C-primitive with isFirst = isLAst = True + always. 
+ [1fe37c453312] + +2013-04-17 Deepthi Devaki Akkoorath + + * build/RegressionTester.bat, source/Lib/config.cpp, + source/Lib/config.h, source/encoder/macroblock.cpp: + Merged multicoreware/xhevc into default + [4c19cddce21b] + +2013-04-17 Deepthi Devaki + + * source/encoder/macroblock.cpp: + Backed out changeset: 0826f419aa6d + [e93b9f1e9f10] + +2013-04-16 Deepthi Devaki Akkoorath + + * Merged multicoreware/xhevc into default + [66c24df49d97] + +2013-04-15 Deepthi Devaki Akkoorath + + * source/Lib/TLibCommon/TComInterpolationFilter.cpp, + source/encoder/macroblock.cpp: + Merged multicoreware/xhevc into default + [f05a5d3879f0] + +2013-04-15 Deepthi Devaki + + * source/Lib/TLibCommon/TComInterpolationFilter.cpp: + Backed out changeset: 0826f419aa6d + [7f8daed75336] + +2013-04-12 Deepthi Devaki Akkoorath + + * source/encoder/macroblock.cpp: + Merged multicoreware/xhevc into default + [755bda7f920c] + +2013-04-12 Rajesh + + * source/Lib/TLibCommon/TComInterpolationFilter.cpp, + source/encoder/macroblock.cpp: + Added filter c primitive to macroblock.cpp. Changed + TComInterpolationfilter.cpp to call filter primitives + [0826f419aa6d] + +2013-03-26 deepthidevaki + + * source/encoder/TComRdCost_SSE.cpp: + Modified xCalcHADs4x4 for better optimization + [6b9e69b013e8] + +2013-04-17 Deepthi + + * source/Lib/TLibCommon/TComTrQuant.cpp: + MingW support for alignment. 
+ [91412bd3a9a8] + +2013-04-17 praveentiwari + + * source/encoder/vec/macroblock.inc: + partialButterfly16 vectorized code + [e590f654128d] + +2013-04-17 praveen Tiwari + + * source/encoder/vec/macroblock.inc: + Merged multicoreware/xhevc into default + [f98bb4be9285] + +2013-04-17 praveentiwari + + * source/encoder/vec/macroblock.inc: + solving merge conflict + [745408cf439a] + + * source/encoder/vec/macroblock.inc: + merge conflict + [0273e986d1ca] + + * source/encoder/vec/macroblock.inc: + solving merge conflict + [ee1fec0bfdc3] + + * source/encoder/vec/macroblock.inc: + solving merge conflict + [9ac0ab2c4469] + + * source/encoder/vec/macroblock.inc: + Replaced Vec16s and Vec8i usage with Vec8s and Vec4i for Vec- + partialButterfly16 + [70f2f5526c71] + +2013-04-17 Deepthi + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/encoder/vec/macroblock.inc: + Enforcing memory alignment in inversedst. + [e7784727f33b] + + * source/test/mbdstharness.cpp: + Aligned mem in mbdst test harness + [4580bf5d1062] + + * source/encoder/vec/macroblock.inc: + Removing extraneous min,max - performance improvements by 20% + [882c40e8df1a] + + * source/encoder/vec/macroblock.inc: + Merge + [7c3b537734e1] + +2013-04-17 praveentiwari + + * source/test/butterflyharness.cpp, source/test/butterflyharness.h: + removed butterflyharness files + [c0a349fc04e6] + +2013-04-17 praveen Tiwari + + * Merged multicoreware/xhevc into default + [12c9c9bceb8c] + +2013-04-17 praveentiwari + + * source/test/CMakeLists.txt, source/test/mbdstharness.cpp, + source/test/mbdstharness.h, source/test/testbench.cpp: + Moved the butterfly-harness into mbdst-harness + [d2e5c53f22d2] + +2013-04-17 praveen Tiwari + + * build/RegressionTester.bat, source/Lib/config.cpp, + source/Lib/config.h, source/encoder/macroblock.cpp, + source/test/CMakeLists.txt: + Merged multicoreware/xhevc into default + [170e9ffd084e] + +2013-04-16 praveen Tiwari + + * source/encoder/macroblock.cpp: + Merged multicoreware/xhevc into default 
+ [a266c75a843d] + +2013-04-16 praveentiwari + + * source/test/CMakeLists.txt: + Included butterflyharness.h and .cpp file in make file + [d24a9c1f7f3e] + + * source/test/butterflyharness.cpp: + Increased the buffer size and used more meaningful name in Test Code + of partialButterfly16 + [3b621b943630] + +2013-04-16 praveen Tiwari + + * Merged multicoreware/xhevc into default + [2df037f5f4a2] + +2013-04-16 praveentiwari + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/encoder/macroblock.cpp, source/encoder/vec/macroblock.inc: + Added vectorized code for partialButterfly16 and removed unused + function code + [8effa08eff14] + + * source/test/butterflyharness.cpp, source/test/butterflyharness.h, + source/test/testbench.cpp: + Test Suit for partialButterfly16 + [641d56d61ba2] + +2013-04-16 praveen Tiwari + + * source/encoder/vec/macroblock.inc, source/test/testbench.cpp: + Merged multicoreware/xhevc into default + [e75d8a96b2b8] + +2013-04-16 praveentiwari + + * source/test/testbench.cpp: + TestBench MergeConflict + [5be23aedc1d5] + + * source/test/testbench.cpp: + Merge conflict + [c5d7681b866f] + + * source/test/testbench.cpp: + Solving merge conflict with TestBench.cpp + [42821ff0e1d5] + + * source/test/testbench.cpp: + Solving merge conflict with Testbench.cpp + [fd6fae9fb6fa] + +2013-04-15 praveentiwari + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/encoder/macroblock.cpp, source/encoder/vec/macroblock.inc: + Added Primitives for partialButterfly32 + [9c97c040fc19] + +2013-04-15 praveen Tiwari + + * source/test/testbench.cpp: + Merged multicoreware/xhevc into default + [237ff80c3bd1] + +2013-04-15 praveentiwari + + * source/Lib/TLibCommon/TComTrQuant.cpp, source/encoder/butterfly.h, + source/encoder/macroblock.cpp, source/encoder/primitives.h, + source/encoder/vec/macroblock.inc, source/test/testbench.cpp: + Fixed alignment problem with partialButterfly16 + [1239bf84868c] + + * source/Lib/TLibCommon/TComTrQuant.cpp: + Fixed alignment problem with 
ENABLE_PRIMITIVES switch for + partialButterfly16 + [48d8838f4298] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/encoder/macroblock.cpp, source/encoder/primitives.h, + source/encoder/vec/macroblock.inc, source/test/testbench.cpp: + Added ENABLE_PRIMITIVES switch for partialButterfly16 + [9b87cb3d8c84] + +2013-04-15 praveen Tiwari + + * Merged multicoreware/xhevc into default + [35e483e872ff] + + * source/encoder/macroblock.cpp, source/encoder/primitives.h, + source/encoder/vec/macroblock.inc: + Merged multicoreware/xhevc into default + [af2db6bbb5a7] + +2013-04-12 praveen Tiwari + + * source/encoder/macroblock.cpp, source/encoder/primitives.h, + source/encoder/vec/macroblock.inc: + Merged multicoreware/xhevc into default + [7413fc103a3e] + + * source/encoder/macroblock.cpp: + Merged multicoreware/xhevc into default + [70abf8d31e02] + +2013-04-11 praveen Tiwari + + * Merged multicoreware/xhevc into default + [4d7790972b0e] + + * source/encoder/macroblock.cpp, source/encoder/primitives.h, + source/encoder/vec/macroblock.inc: + Merged multicoreware/xhevc into default + [71ca57e3681f] + +2013-04-10 praveen Tiwari + + * source/encoder/macroblock.cpp, source/encoder/primitives.h, + source/encoder/vec/macroblock.inc: + Merged multicoreware/xhevc into default + [9687e766009f] + +2013-04-10 praveentiwari + + * source/encoder/macroblock.cpp, source/encoder/primitives.h, + source/encoder/vec/macroblock.inc: + resolving merge conflict + [00c935e908ce] + + * source/encoder/macroblock.cpp, source/encoder/primitives.h, + source/encoder/vec/macroblock.inc: + Added C and Vector primitives for partialButterfly16 + [93846235af07] + + * source/encoder/butterfly.h: + Created butterfly.h to read the value from the g_aiT array + [6d234bd90345] + +2013-04-17 Deepthi + + * test_new.rtf: + Delete test commit + [1ad18182408d] + + * test_new.rtf: + Test commit + [59aec92c8f61] + +2013-04-16 Steve Borho + + * source/encoder/x86/CMakeLists.txt: + cmake: figure out a workaround for YASM 
builds of assembly files + with MSVC + [28408d0ef0f7] + + * source/CMakeLists.txt, source/encoder/CMakeLists.txt, + source/encoder/vec/CMakeLists.txt, + source/encoder/x86/CMakeLists.txt: + cmake: allow YASM to be used by MSVC, even though its currently + broken + + Rename primitive projects so they sort together in VC + [ffe6cf7be445] + + * source/CMakeLists.txt, source/Lib/CMakeLists.txt, + source/Lib/config.cpp, source/Lib/config.h, source/Lib/encoder.h, + source/input/CMakeLists.txt, source/input/input.cpp, + source/input/input.h, source/input/y4m.cpp, source/input/y4m.h, + source/input/yuv.cpp, source/input/yuv.h, source/x265cfg.cpp, + source/x265cfg.h: + establish a new input file library, move CLI and command parsing to + x265-cli + [8163e7e7b324] + + * source/test/CMakeLists.txt: + cmake: disable the pool test by default, even when tests are enabled + + Now that its functional, not many people will want to build/run it + [5d16e9d82d62] + + * source/test/timer.cpp: + timer: replace NULL with 0 + [a182530927da] + + * source/test/pixelharness.cpp: + pixelharness: fix MinGW build + [4c2412305e62] + + * source/test/timer.cpp: + timer: use native Windows high resolution timer + [4f65489aaaf1] + + * source/test/filterharness.cpp, source/test/filterharness.h, + source/test/mbdstharness.cpp, source/test/mbdstharness.h, + source/test/pixelharness.cpp, source/test/pixelharness.h, + source/test/testharness.h: + dos to unix eoln for test harnesses + [26baba09de44] + + * source/test/testpool.cpp: + trim trailing white-space + [4f1719887601] + + * source/test/testpool.cpp: + testpool: verify below row is restartable + + output is now deterministic regardless of thread count + [19ba1f4edaf2] + + * source/encoder/macroblock.cpp: + Partial Backout changeset: 81101974408f + [d03b4fa32d79] + + * build/BuildEncoderApplications.bat: + white-space cleanup in BuildEncoderApplications.bat + [71a75afbb8e8] + + * build/RegressionTester.bat: + Merge + [ce10cac4ce71] + + * 
source/test/pixelharness.cpp: + testbench: add a POSIX friendly aligned malloc, pad to 32 bytes + [9e1a1faad621] + +2013-04-16 Deepthi + + * source/encoder/macroblock.cpp: + Changing 32-bit multiplies to 16-bit multiplies for filter. (This + may need more testing). + [81101974408f] + + * source/test/pixelharness.cpp: + Aligned mallocs and frees in pixel buffers + [defda4cf08c2] + +2013-04-16 nandaku2 + + * build/RegressionTester.bat: + Merged in ggopu/gopu_xhevc (pull request #66) + + Redesigned the Regression test script - the script will be + compatible for vc 9, vc 10 and vc 11 + [3b066f46831d] + +2013-04-15 ggopu + + * build/BuildEncoderApplications.bat, + build/CreateRegressionPackage.bat, build/RegressionTester.bat, + build/config.txt: + Redesigned the Regression test script - the script will be + compatible for vc 9, vc 10 and vc 11 + [0d9f10323b82] + +2013-04-16 sumalatha + + * source/Lib/TLibVideoIO/TVideoIOY4m.cpp: + Fixed issue in handling y4m files Issue: The reconstructed file(yuv) + were all zeros Fix: In y4m handler, bitDepthShiftY, bitDepthShiftC + were not updated properly in the open(), so changed the code to + update it properly. 
+ [c422cc514b80] + +2013-04-15 Steve Borho + + * source/test/testbench.cpp: + testbench: add the ability to easily bypass some tests + [bd316d5a9edc] + + * source/test/filterharness.cpp, source/test/testbench.cpp: + testbench: GCC compilation fixes + [d5e4ce5c651d] + + * source/test/CMakeLists.txt, source/test/filterharness.cpp, + source/test/filterharness.h, source/test/mbdstharness.cpp, + source/test/mbdstharness.h, source/test/pixelharness.cpp, + source/test/pixelharness.h, source/test/testbench.cpp, + source/test/testharness.h, source/test/timer.cpp: + testbench: split into classes that know how to test each primitive + type + [d72521706de8] + + * source/test/testbench.cpp: + testbench: remove unused defines + [0eb7444ebf8a] + + * source/test/testbench.cpp: + testbench: don't forget to test the detected CPUID + [878233d4bcc9] + + * source/test/testbench.cpp: + less cargo-cult programming please + + checkasm used BENCH_ALIGNS to over-allocate the buffers then fix the + alignment We're just over-allocating buffers + [de3741f6a1af] + + * source/test/testbench.cpp: + testbench: tune iterations to primitive type so test finishes in 60 + seconds + [5206b476c65e] + + * source/test/testbench.cpp: + testbench: fix bugs exposed by GCC warnings + + t_size was being used as a global by two different primitives, and + then defined locally by one of them. + [06cbe8719bfa] + + * source/encoder/vec/pixel.inc: + pixel: use aligned loads for 8 and 16 wide SAD 14% improvement + + Using this optimization before the other loop optimizations hid the + benefits of reducing the pipeline stalls. 350->300ms for 16x16 + testbench (many) iterations + [7f9b889ce129] + + * source/test/testbench.cpp: + testbench: check primitive accuracy on all SIMD architectures + + Measure performance only once, with the most optimized primitives + available + [9b4729fe6825] + + * source/encoder/vec/pixel.inc: + pixel: optimize 4 pixel wide SAD + + Now 4x faster than the C version. 
We accumulate 8 values then + discard half at the end. + [4d0aa20b3fb8] + + * source/encoder/vec/pixel.inc: + pixel: further optimize multiples of 16 wide SAD + [b3b7dc98a6ba] + + * source/test/testbench.cpp: + testbench: use a pseudo-random seed + [f323471e8eaf] + + * source/encoder/vec/pixel.inc: + pixel: use lower case var named + [4161c1c2d852] + + * source/encoder/vec/pixel.inc: + pixel: collect SAD sums in vectors, do horizontal_add_x only at end + + The gives at least a 2X speedup. Many SAD functions are now 10x + faster than the C primitives. + [98af7878ab54] + + * source/VectorClass/vectori128.h, source/encoder/vec/macroblock.inc: + inversedst: use blend8s to generate outputs 30% faster + [b1c604dac752] + + * source/encoder/vec/sse2.cpp, source/encoder/vec/sse3.cpp, + source/encoder/vec/ssse3.cpp: + vectorized: correct the INSTRSET values for SSE2, SSE3, and SSSE3 + [a4d6b0f5c3c6] + +2013-04-15 Mandar Gurav + + * Merged multicoreware/xhevc into default + [9a0056333a34] + +2013-04-15 https://mandarmcw + + * source/test/testbench.cpp: + Used more meaningful variable names + [bc80f481a448] + + * source/test/testbench.cpp: + Implemented init/clean function for filter testbench. + [cce1fa50f3ff] + + * source/test/testbench.cpp: + Print a more meaningful message - replace %d value with a meaningful + string showing the filter configuration. + [96d57ac3598b] + +2013-04-15 Deepthi + + * source/encoder/vec/pixel.inc: + Reverting aligned loads in pixel.inc - caused crash. This needs more + research + [b5f1f4eef5ae] + +2013-04-14 Steve Borho + + * source/Lib/TLibCommon/TComInterpolationFilter.cpp: + cleanup TComInterpolationFilter.cpp + [a230019cf137] + + * source/Lib/TLibCommon/TComRdCost.cpp, source/encoder/vec/pixel.inc: + change order of arguments to SATD primitives to match x264 assembly + + x264 assembly code wants fenc, fencstride, fref, frefstride. fenc is + assumed to be aligned by the block size. 
+ [ce3635066069] + + * source/encoder/vec/pixel.inc, source/test/testbench.cpp: + pixel: use aligned loads for piOrg (encoded block), enable sad_4 + + vectorized sad_4 is now just a little bit faster than the C version. + [c8333b3c1536] + + * source/Lib/TLibCommon/TComRdCost.cpp: + change order of arguments to SAD primitives to match x264 assembly + + x264 assembly code wants fenc, fencstride, fref, frefstride. fenc is + assumed to be aligned by the block size. + [03dbce445e13] + + * source/Lib/config.cpp: + unfortunately the FrameN options are currently necessary + [db32d5905f73] + + * source/Lib/config.cpp: + nuke the --FrameN command line options + + These make the command line help really annoying + [6ac0c6263ad4] + + * source/encoder/vec/pixel.inc: + pixel: add sad_4, but do not use it for now + + When memory alignment is considered, it may be faster than the C + primitive + [c8a89155a025] + + * source/encoder/vec/pixel.inc: + pixel: add 16-wide vector SAD, instantiate all but 4x + implementations + [781034812b77] + + * source/encoder/pixel.cpp: + pixel: add C SAD partitions sized 64 + [18d7c9ec18c8] + +2013-04-13 Steve Borho + + * source/Lib/TLibCommon/TComInterpolationFilter.cpp, + source/encoder/macroblock.cpp, source/encoder/primitives.h, + source/encoder/vec/macroblock.inc, source/test/testbench.cpp: + backout bitdepth argument changes, the HM does not operate this way + yet + + The HM allows you to encode 8 bit pixels with HIGH_BIT_DEPTH + enabled, so we cannot assume the bit depth is 10 just because + HIGH_BIT_DEPTH is enabled. + + However, the test bench should always test 10 bit pixels in that + config. + [9d49396e32b0] + +2013-04-13 https://mandarmcw + + * source/Lib/TLibCommon/TComInterpolationFilter.cpp, + source/encoder/macroblock.cpp, source/encoder/macroblock.h, + source/encoder/primitives.h, source/encoder/vec/macroblock.inc, + source/test/testbench.cpp, source/test/unittest.h: + Updated the source as per the comments from Steve. 
+ * Unit test for filter improved. + * bitDepth taken as a MACRO + * Moved #define to respective cpp files + * Remove unnecessary lines + * Names added to copyright headers + [c83ff08514ca] + +2013-04-13 Mandar Gurav + + * Merged multicoreware/xhevc into default + [fe804dbed45e] + +2013-04-12 https://mandarmcw + + * source/Lib/TLibCommon/TComInterpolationFilter.cpp, + source/encoder/macroblock.h, source/encoder/vec/macroblock.inc, + source/test/testbench.cpp, source/test/unittest.h: + vectorization of filter<8,0,...> stage 1 Added testbench support + [cdb296338769] + +2013-04-13 Deepthi + + * source/encoder/macroblock.cpp: + Removing implicit-type conversion warnings. + [848bb2845b01] + +2013-04-12 Steve Borho + + * source/encoder/macroblock.cpp: + macroblock: remove pixel typecasts, since block is now a Short* + [9eb88e4e0d10] + +2013-04-13 ShinYee Chung + + * source/encoder/threadpool.cpp: + threadpool: Fix incorrect memory deallocation. + + The in-place buffer is an array. + [5aaee38ecab2] + + * source/encoder/threadpool.cpp: + threadpool: Fix incorrect use of &= operator. + [fe63fd1ba96a] + +2013-04-12 Steve Borho + + * source/Lib/TLibCommon/TComRdCost.cpp: + Merge + [98452862003d] + + * source/encoder/threadpool.cpp: + threadpool: fix VC9 compile + [9fce3c13f7f8] + +2013-04-12 Deepthi + + * source/Lib/TLibCommon/TComRdCost.cpp: + Replacing primitive call in xHADs8 and xHADs + [36d7b00b1262] + + * source/Lib/TLibCommon/TComRdCost.cpp: + Replacing primitive call in xHads4 + [cd58cb707516] + + * source/Lib/TLibCommon/TComRdCost.cpp: + Fixing final scaling in sad primitives; disabling satd primitives. 
+ [a14a67418620] + +2013-04-12 Steve Borho + + * source/Lib/TLibCommon/TComTrQuant.cpp: + tabs to spaces + [6b21dab6629d] + +2013-04-12 Deepthi + + * Merge + [9193471ef54e] + + * source/Lib/TLibCommon/TComTrQuant.cpp: + Replacing original inversedst with primitives + [67404b0825e3] + + * source/encoder/macroblock.cpp, source/encoder/primitives.h, + source/encoder/vec/macroblock.inc, source/test/testbench.cpp: + Changing inversedst primitives parameters from pixel to short + [beb412d2232f] + +2013-04-11 Steve Borho + + * source/test/testpool.cpp: + testpool: use fflush() on stdout + [f3ba46f293db] + + * source/encoder/threadpool.cpp: + threadpool: wait for threads to quiesce before shutting them down + [10746baf67fd] + + * source/test/testpool.cpp: + testpool: ensure MD5Frame is freed before pool + [4749a6ab37f9] + + * source/encoder/threading.cpp, source/encoder/threading.h: + threading: build fixes for linux, more error checking + [985af075a732] + + * source/test/testpool.cpp: + testpool: add indication of thread pool size + [c20e695593ed] + + * source/encoder/threading.h, source/encoder/threadpool.cpp, + source/test/testpool.cpp: + threadpool: use counting semaphore on POSIX, single wake event, lock + for eol + + To avoid race hazards at the end of each row, a row deciding to quit + because it is block needs to block out the row above it that might + be trying to exit. The is only ever between two threads, so + hopefully it is low-contention. + + This patch also goes back to a blocking wait when idle, to save + power. + + Note: the test does not currently pass. there is still a data hazard + to be addressed. + [87d026d4cced] + +2013-04-11 Deepthi + + * source/test/testbench.cpp: + Increase iteration count for better measurements + [27c82c220aa9] + +2013-04-11 ShinYee Chung + + * source/encoder/macroblock.cpp: + macroblock: Fix the path in an #include. + + Use / instead of \ to remain portable to Windows and Linux. 
+ [e47f5284eed3] + + * source/Lib/TLibVideoIO/TVideoIOY4m.cpp: + y4m: Fix missing math header include for function ceil(). + + We probably also need to add linking to the math library. + [9092312686da] + +2013-04-11 nandaku2 + + * Merged in ggopu/gopu_x265 (pull request #57) + + Fixed the issue for inversedst and malloc issue + [4e62c71ed307] + +2013-04-11 ggopu + + * source/test/testbench.cpp: + Fixed the issue for inversedst and malloc issue + [510d32f9a7ec] + +2013-04-11 sumalatha + + * source/Lib/encoder.cpp: + The code was crashing when the reconstructed filename is not given + in the cfg file. Fixed it in this version. The fix was like in the + xCreateLib(), the reconstructed file is opened based on the flag + "m_pchReconFile"(this flag is set when the recon filename is given + in cfg), but in xDestroyLib(), the recon file is closed always, so + have included the check like close the file only when the + m_pchReconFile is set. + [a21280198bad] + +2013-04-11 Deepthi + + * source/Lib/TLibCommon/TComInterpolationFilter.cpp, + source/encoder/macroblock.cpp, source/encoder/primitives.h, + source/encoder/vec/macroblock.inc: + Filter coefficients should always be const short* (irrespective of + bitdepth) + [4c4119d136fa] + + * source/test/testbench.cpp: + Comments for direction + [e1e522b2a01e] + +2013-04-10 nandaku2 + + * Merged in sumalatha/xhevc_sumalatha (pull request #54) + + support for y4m files(skipping the frame headers at each frame) + [a4ea6dec3a3a] + +2013-04-10 sumalatha + + * source/Lib/TLibVideoIO/TVideoIO.h, + source/Lib/TLibVideoIO/TVideoIOY4m.cpp, + source/Lib/TLibVideoIO/TVideoIOY4m.h, + source/Lib/TLibVideoIO/TVideoIOYuv.cpp, + source/Lib/TLibVideoIO/TVideoIOYuv.h, source/Lib/config.cpp, + source/Lib/config.h, source/Lib/encoder.cpp, source/Lib/encoder.h: + added support for y4m files (like skipping the plain-text frame + headers present in each frame) + [5acc3f79f938] + +2013-04-10 Sumalatha Polureddy + + * source/Lib/TLibCommon/TComRdCost.cpp, 
+ source/encoder/vec/macroblock.inc: + Merged multicoreware/xhevc into default + [72bd9b78bfef] + + * source/encoder/vec/macroblock.inc: + Merged multicoreware/xhevc into default + [1b339b28b04a] + +2013-04-09 Sumalatha Polureddy + + * Merged multicoreware/xhevc into default + [c682e0de3aa0] + +2013-04-08 Sumalatha Polureddy + + * source/encoder/vec/macroblock.inc, source/encoder/vec/pixel.inc, + source/encoder/vec/vecprimitives.inc: + Merged multicoreware/xhevc into default + [d0031331b9c7] + +2013-04-08 sumalatha + + * source/encoder/TComRdCost_SSE.cpp: + Deleted this file to remove merge conflict + [697eee51787c] + +2013-04-05 Sumalatha Polureddy + + * source/Lib/TLibCommon/TComRdCost.cpp, source/encoder/vec/pixel.inc: + Merged multicoreware/xhevc into default + [321cb189ce36] + +2013-04-04 Sumalatha Polureddy + + * source/encoder/vec/pixel.inc, source/test/TestBench.cpp, + source/test/UnitTest.cpp, source/test/UnitTest.h: + Merged multicoreware/xhevc into default + [6993599a77b4] + + * source/Lib/TLibCommon/TComRdCost.cpp, + source/VectorClass/dispatch_example.cpp, + source/encoder/TComRdCost_SSE.cpp, + source/encoder/vec/macroblock.inc, source/encoder/vec/pixel.inc, + source/encoder/vec/vecprimitives.inc: + Merged multicoreware/xhevc into default + [b9716716f21a] + +2013-04-04 sumalatha + + * source/Lib/TLibCommon/TComRdCost.cpp, + source/encoder/TComRdCost_SSE.cpp, + source/encoder/vec/vecprimitives.inc: + Backed out changeset: 914f5238a7df + [9646554f4779] + + * source/encoder/TComRdCost_SSE.cpp: + Backed out changeset: 850f67a0d4be + [933d82869b8c] + + * source/Lib/TLibCommon/TComRdCost.cpp: + Backed out changeset: 11f514fc3415 + [44be5efc13ac] + +2013-04-03 Steve Borho + + * source/Lib/TLibCommon/TComRdCost.cpp: + made changes to remove merge conflicts + [11f514fc3415] + + * source/encoder/TComRdCost_SSE.cpp: + made changes to remove the merge conflict + [850f67a0d4be] + + * source/Lib/TLibCommon/TComRdCost.cpp, + source/encoder/TComRdCost_SSE.cpp, + 
source/encoder/vec/vecprimitives.inc: + moved the "xCalcHADs8x8()" from TComRdCost_SSE.cpp to + vecprimitives.in + [914f5238a7df] + +2013-04-10 nandaku2 + + * Merged in ggopu/gopu_x265 (pull request #55) + + Created the New script for Regression testing + [f7c677c40e17] + +2013-04-10 ggopu + + * build/RegressionTester.bat, build/config.txt: + Created the New script for Regression testing + [b5f0f40fb40e] + +2013-04-10 Deepthi + + * source/test/testbench.cpp: + Cleanup + [f3273ba48335] + + * source/test/testbench.cpp: + Cleanup of cycle count measurement + [0018c423de94] + + * source/test/testbench.cpp: + Disable cycle count. Separate mbdst testing into init_buffer, + checkprimitive and clearbuffer. + [6d16557c9e11] + + * source/test/testbench.cpp: + Buffer handling separated. pixelcmp buffers stay global for now. + [454fe1e43fb6] + + * source/test/testbench.cpp: + More global vars removed + [b05c7182c446] + + * source/test/testbench.cpp: + Removing unnecessary global variables + [a76a6109a7ff] + + * source/test/testbench.cpp: + Replace delete with free. + [08757a371067] + +2013-04-10 Deepthi Devaki + + * source/Lib/TLibCommon/TComInterpolationFilter.cpp, + source/encoder/macroblock.cpp, source/encoder/primitives.h, + source/encoder/vec/macroblock.inc: + Used array of function pointers for filter primitives. Added C + primitives for filter_nonvertical. 
+ [66f39285c340] + +2013-04-10 praveen Tiwari + + * Merged multicoreware/xhevc into default + [9e59fba57f1b] + +2013-04-10 praveentiwari + + * source/encoder/macroblock.cpp: + Fixed fastInverseDst function in encoder primitive file by adding + the missing line + [2bb7cf3d86c1] + + * source/Lib/TLibCommon/TComTrQuant.cpp: + Added missing line in fastInverseDst function + [5008d1bf6cd2] + + * source/Lib/TLibCommon/TComTrQuant.cpp: + Removed last calculation in fastInverseDst function + [f959a6b66cba] + +2013-04-10 Steve Borho + + * source/test/testbench.cpp: + testbench: give an indication of pixel size in test bench + [306defc44548] + + * source/Lib/TLibCommon/TComRdCost.cpp: + TComRdCost: fix handling of iSubShift + + When iSubShift was 1, the strides were already being doubled. We + just needed to halve the rows and run the matching primitive. Also, + the subpel functions were checking for iStep == 0, but xCalcHADs8x8 + was asserting the value to be 1. So it was never calling our satd + performance primitives. 
+ [a95d907ee411] + +2013-04-09 Deepthi + + * source/test/testbench.cpp: + Fix memory leak in testbench + [1dcb29da96d6] + +2013-04-09 Steve Borho + + * source/test/testbench.cpp: + testbench: fix warnings and errors when compiled with GCC + [17d58d88ae09] + + * source/Lib/TLibCommon/CommonDef.h, + source/VectorClass/instrset_detect.cpp: + more Linux Intel compiler fixes + [25f7ecb0aaee] + + * source/Lib/TLibCommon/CommonDef.h: + Prevent multiple NVM_COMPILEDBY definitions + [a344a1fabb0c] + + * source/CMakeLists.txt: + cmake: use CXX env var, which is not a full path + [7c1b5e9245c2] + + * source/CMakeLists.txt: + cmake: treat Intel C++ compiler on Linux as gcc + [aceaa0fb013d] + + * source/encoder/vec/macroblock.inc, source/test/testbench.cpp: + nit cleanups + [bfcb3bfff0f0] + +2013-04-09 Deepthi + + * source/test/testbench.cpp: + Adding to-do list + [0cc3ada11beb] + + * source/test/testbench.cpp: + Removing redundant function + [261e9c67faa6] + + * source/test/testbench.cpp: + Clearer error message + [cc9e92334c63] + + * source/test/testbench.cpp: + Renaming check_*_primitive functions. 
+ [b3b4c099ab8c] + +2013-04-09 ggopu + + * source/test/testbench.cpp: + Added IntraDct check + [23a010c31203] + +2013-04-09 https://mandarmcw + + * source/Lib/TLibCommon/TComInterpolationFilter.cpp, + source/encoder/macroblock.cpp, source/encoder/primitives.h, + source/encoder/vec/macroblock.inc: + Modified Filter<8.0,..> functions as per the comments + [d0f96efda047] + +2013-04-08 Steve Borho + + * source/test/testbench.cpp: + testbench: call Setup_Assembly_Primitives() to setup assembly + primitives + [a1e16c2c4e39] + + * source/test/testbench.cpp: + testbench: fix a compiler warning, add notice of which primitives + are testing + [3b0549756509] + + * source/test/testbench.cpp: + testbench: use single pixelcmp test function, tighten up output + logging + [52836d9667ac] + + * source/cmake/FindHg.cmake: + cmake: backout FindHg.cmake, it only causes further problems + [725f3a534139] + + * source/cmake/FindHg.cmake, source/cmake/version.cmake: + cmake: add FindHg.cmake to our repo, only very recent cmake versions + have it + + and fix case-sensitivity issue + [9bbaca4f9d79] + +2013-04-08 ggopu + + * source/test/testbench.cpp: + Included comment to specify the source of code snippet + [3191e8a2d66e] + + * source/test/testbench.cpp: + Implemented the Cycle testing + [37ad4814764e] + +2013-04-07 Steve Borho + + * source/CMakeLists.txt: + cmake: disable ASM compilation for non-GCC compilers until a + workaround is found + [399977f3a6ec] + + * source/cmake/CMakeASM_YASMInformation.cmake: + cmake: use ASM_DIALECT consistently in + CMakeASM_YASMInformation.cmake + [584c490b9274] + + * build/nmake/make-solutions.bat: + cmake: add a VC11 nmake batch file, to test ASM builds + [df3e9810d7a2] + + * source/test/testbench.cpp: + testbench: fix a couple of bugs + [202fe2157dd3] + + * source/CMakeLists.txt, source/encoder/CMakeLists.txt: + cmake: rename primitive variables so they are listed together in + cmake-gui + [0c0b08a9373f] + + * source/cmake/version.cmake: + cmake: add 
version detection support for hg archives + [aa2c04b65976] + + * source/CMakeLists.txt, source/cmake/version.cmake: + cmake: move version detection into cmake/version.cmake + [00d01d23e853] + + * source/encoder/CMakeLists.txt, source/encoder/primitives.cpp, + source/encoder/primitives.h, source/encoder/vec/CMakeLists.txt, + source/encoder/vec/vec-primitives.cpp, + source/encoder/x86/CMakeLists.txt, source/encoder/x86/asm- + primitives.cpp, source/test/testbench.cpp: + Move primitive setup functions closer to their implementations + [c8563d2c4612] + + * source/cmake/CMakeASM_YASMInformation.cmake, + source/cmake/CMakeDetermineASM_YASMCompiler.cmake, + source/cmake/CMakeTestASM_YASMCompiler.cmake: + cmake: add yasm compilation ability + [e4b548bbe391] + + * source/CMakeLists.txt, source/cmake/FindYasm.cmake: + cmake: introduce a cmake/ folder, add a package to find a yasm + assembler + [6f2940ddc545] + + * source/encoder/x86/util.h: + remove x264's utils.h, it has inline assembly we cannot use + + And the functions implemented are not obviously applicable to HEVC + [cd85d3840279] + +2013-04-06 Steve Borho + + * source/CMakeLists.txt, source/encoder/CMakeLists.txt, + source/encoder/primitives.cpp, source/encoder/x86/CMakeLists.txt, + source/encoder/x86/pixel-32.asm, source/encoder/x86/pixel-a.asm, + source/encoder/x86/pixel.h, source/encoder/x86/util.h, + source/encoder/x86/x86inc.asm, source/encoder/x86/x86util.asm, + source/test/CMakeLists.txt, source/test/testbench.cpp, + source/test/unittest.cpp: + cmake: begin to add plumbing for assembly language primitives + + This changeset makes the vectorized primitives a build option, + seperate from the C primitives and ASM primitives. 
+ [0704a16139c0] + + * source/CMakeLists.txt: + cmake: improve version detection logic from revision control + [e59bb7a0d173] + + * source/CMakeLists.txt: + cmake: multithreaded make does not appear to work; gmake users + should use -j4 + [33b9731a501c] + +2013-04-05 Steve Borho + + * source/encoder/CMakeLists.txt, source/encoder/TComRdCost_SSE.cpp: + remove TComRdCost_SSE.cpp + [ca197e22abff] + + * source/encoder/CMakeLists.txt: + cmake: show the GPL license file in Visual Studio in the x265 + project + [365967021bb4] + + * source/test/unittest.cpp: + unittest: fix pixel variable shadowing type name from primitives.h + + Only broke the GCC build + [5191e30d92cc] + + * source/CMakeLists.txt, source/compat/msvc/getopt.c, + source/compat/msvc/getopt.h: + add GNU getopt library from glibc for argument parsing + + Will replace all of TAppCommon and allow us to use getopt() style + argument parsing on all platforms. + [06c29bb43ee0] + + * source/CMakeLists.txt, source/x265main.cpp: + cmake: compile a version number from Mercurial into x265-cli.exe + + The version number is only updated when cmake is run, so if you care + about the version number you should trigger a cmake update by + touching a cmake file (or building from a clean repository) + [c1924f5ca686] + + * source/encoder/macroblock.cpp: + comment cleanups in macroblock.cpp + [65329bd984d3] + + * source/encoder/vec/pixel.inc: + pixel: add 16x16 and 32x32 vectorized SAD functions + [f153e4a050c6] + +2013-04-05 Deepthi Devaki + + * Merge + [ffb382a731cb] + + * source/encoder/vec/pixel.inc: + Uncrustified pixel.inc after changing sa8d_8x8 + [961f8617fa0a] + + * source/encoder/vec/pixel.inc: + Backed out changeset: dc9cf177f0d9 + [7fdb843fbcc9] + + * source/encoder/vec/pixel.inc: + Uncrustified pixel.inc after changing sa8d_8x8 + [dc9cf177f0d9] + +2013-04-05 Deepthi Devaki Akkoorath + + * Merged multicoreware/xhevc into default + [6de9d7e911b6] + +2013-04-05 Deepthi Devaki + + * source/encoder/vec/pixel.inc: + Fixed 
overflow issue when 10 bit pixels are used in vec-sa8d_8x8 + [ad257e35e970] + + * source/encoder/vec/pixel.inc: + Backed out changeset: b39276d2ef19 + [e19c604a0505] + + * source/encoder/vec/pixel.inc: + Added cases for high bit depth = 1 and 0 in vec-sa8d_8x8 in + pixel.inc + [b39276d2ef19] + +2013-04-05 Deepthi + + * source/encoder/primitives.cpp: + Disabling 64-size primitives temporarily. + [9aaf8e56f457] + + * source/encoder/primitives.cpp: + Fixing the partition array for 64. + [05df015ff6f4] + + * source/encoder/primitives.cpp: + Rollback: to 8x8 matrix + [1b099c5ad876] + + * source/encoder/primitives.cpp: + Rollback: 64 expands the array heavily. + [fdad810f7948] + + * source/encoder/primitives.cpp: + Width and height permitted to be > 32 + [9d01406efb58] + + * Merge + [315488d4f4b7] + + * source/encoder/primitives.cpp: + Expanding the psize static array to 64. + [0108af4ff636] + + * source/encoder/primitives.h: + Expanding Partitions enum + [49f2265daf5a] + + * Merge + [7281801c55a0] + + * source/Lib/TLibCommon/TComPrediction.cpp: + Temporary array to match Pel + [7d1889dce383] + + * source/Lib/TLibCommon/TComInterpolationFilter.cpp: + Fix Error - matching function declaration and definition + [bf5df7838588] + +2013-04-05 Steve Borho + + * source/encoder/macroblock.cpp: + fix implicit integer type conversions in macroblock.cpp + [0d3b1caacadf] + + * source/x265main.cpp: + fix build error in x265main.cpp + [76e4a2d6a8ba] + +2013-04-04 Steve Borho + + * source/encoder/primitives.cpp, source/x265main.cpp: + rewrite main(), add code that describes detected CPU type and other + info + [5c55457268e2] + +2013-04-05 praveentiwari + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibCommon/TypeDef.h, source/encoder/macroblock.cpp, + source/encoder/primitives.cpp, source/encoder/primitives.h, + source/encoder/vec/macroblock.inc: + Cleaned up the files checked in for modified encoder performance + primitive for 
fastinversedst. + [ba79f2fbc467] + + * source/encoder/TComTrQuant_SSE.cpp: + Removed as it is no longer needed. + [7ca0466b0cd6] + +2013-04-04 praveen Tiwari + + * Merged multicoreware/xhevc into default + [6a6d1c7492dc] + + * Merged multicoreware/xhevc into default + [9ebdc2159a69] + + * source/encoder/primitives.h, source/encoder/vec/pixel.inc: + Merged multicoreware/xhevc into default + [f582f89a4cad] + +2013-04-04 praveentiwari + + * source/encoder/macroblock.cpp, source/encoder/primitives.h, + source/encoder/vec/macroblock.inc: + Modified encoder performance primitive for fastinversedst + [044cf66c07ba] + +2013-04-04 praveen Tiwari + + * Merged multicoreware/xhevc into default + [067fdd3b2787] + + * source/encoder/vec/macroblock.inc, source/encoder/vec/pixel.inc, + source/encoder/vec/vecprimitives.inc, source/test/TestBench.cpp, + source/test/UnitTest.cpp, source/test/UnitTest.h: + Merged multicoreware/xhevc into default + [b6dcf4d300cd] + +2013-04-04 praveentiwari + + * source/encoder/vec/pixel.inc: + resolving merge conflict with pixel.inc + [0b5baa981f79] + + * source/Lib/TLibCommon/TypeDef.h, source/encoder/vec/macroblock.inc, + source/encoder/vec/pixel.inc: + resolving merge conflicts with pixel.inc and macroblock.inc + [205ece1b25c4] + + * source/encoder/vec/vecprimitives.inc: + second attempt to resolve merge conflict + [6107601bfd02] + + * source/encoder/vec/vecprimitives.inc: + solving merge conflict with vecprimitives.inc + [3706ae74ae2f] + + * source/encoder/macroblock.cpp, source/encoder/primitives.cpp, + source/encoder/primitives.h, source/encoder/vec/vecprimitives.inc: + Encoder Performance Primitive fastInverseDst + [ad754aa34a5b] + + * source/encoder/primitives.h: + Added a cdecl funcdef to primitives.h and function pointer to + EncoderPrimitives structure + [79f25cc12b0f] + +2013-04-03 praveen Tiwari + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/VectorClass/dispatch_example.cpp, + 
source/encoder/CMakeLists.txt: + Merged multicoreware/xhevc into default + [19bbb70cf090] + +2013-04-04 praveentiwari + + * source/Lib/TLibCommon/TComTrQuant.cpp: + Solving merge conflict with TComTrQuant.cpp file + [30e4df5a4552] + +2013-04-03 praveen Tiwari + + * source/encoder/CMakeLists.txt: + Merged multicoreware/xhevc into default + [6e07d4ecd02e] + +2013-04-03 praveentiwari + + * source/encoder/CMakeLists.txt: + Second attempt to solve merge conflict + [305d19e49341] + + * source/encoder/CMakeLists.txt: + solving merge conflict with CMakeLists.txt + [d69a08ec93b4] + + * cfg/per-sequence/BasketballPass.cfg: + Backed out changeset: 1801be94c033 + [52da05be5212] + + * source/encoder/CMakeLists.txt: + Merge Conflict + [aa4d052d016c] + + * cfg/per-sequence/BasketballPass.cfg: + Merge Conflict + [1801be94c033] + +2013-04-02 praveentiwari + + * source/Lib/TLibCommon/TComTrQuant.h, source/encoder/CMakeLists.txt: + fastInverseDst with declarartion in TComTrQuant.h file + [686753ae63ac] + + * source/encoder/CMakeLists.txt: + Solving merge conflict with CMakeLists.txt + [d5d198a662ee] + +2013-04-01 praveen Tiwari + + * source/encoder/CMakeLists.txt: + Merged multicoreware/xhevc into default + [779f1786e667] + +2013-03-31 praveen Tiwari + + * source/encoder/CMakeLists.txt, source/x265cfg.cpp, source/x265cfg.h, + source/x265top.cpp, source/x265top.h: + Merged multicoreware/xhevc into default + [28876bb73087] + +2013-03-28 praveen Tiwari + + * source/encoder/CMakeLists.txt: + Merged multicoreware/xhevc into default + [7a2643c01212] + +2013-03-28 praveentiwari + + * source/encoder/CMakeLists.txt: + Trying to resolve conflict during merge + [55b8785b08bb] + +2013-03-27 praveentiwari + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/encoder/TComTrQuant_SSE.cpp: + Vectorized fastInverseDst with added fastInverse decleration and + ENABLE_VECTOR macro + [8d5b58c7cf54] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + 
source/encoder/TComTrQuant_SSE.cpp: + Backed out changeset: 587fa81d140b + [b84bc6b7ba79] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/encoder/TComTrQuant_SSE.cpp: + Replaced Clip3 and Inhanced techniqueVectorized + TComTrQuant::fastInverseDst with added ENABLE_VECTOR macro + [587fa81d140b] + +2013-03-27 praveen Tiwari + + * Merged multicoreware/xhevc into default + [531e2bde522d] + +2013-03-26 praveentiwari + + * source/encoder/TComTrQuant_SSE.cpp: + Replaced Clip3 and Inhanced technique to load data in vectorization + of TComTrQuant::fastInverseDst + [b3b9435fdb6c] + +2013-03-26 praveen Tiwari + + * source/encoder/CMakeLists.txt: + Merged multicoreware/xhevc into default + [225bfae48ee2] + +2013-03-25 praveentiwari + + * source/encoder/TComTrQuant_SSE.cpp: + Vectorized code for TComTrQuant::fastInverseDst + [14dab7ea86d5] + + * source/encoder/CMakeLists.txt: + Modified CMakeLists.txt for TComTrQuant_SSE.cpp + [3a4b0271c907] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/encoder/CMakeLists.txt: + Backed out changeset: aa84b30cd9ae + [297daf59f4e7] + + * source/Lib/TLibCommon/TComTrQuant.cpp, + source/encoder/CMakeLists.txt: + Modified CMakeLists.txt for TComTrQuant.cpp + [aa84b30cd9ae] + + * source/Lib/TLibCommon/TComTrQuant.cpp: + First attempt to vectorize + [efdab4e67252] + +2013-04-04 Steve Borho + + * source/test/testbench.cpp: + testbench: add remaining pixel functions + [a7b9fc5161dc] + + * source/test/testbench.cpp: + testbench: use a for() loop, rename numofprim to curpar + [084eea175be7] + + * source/test/testbench.cpp: + testbench: combine loops + [f139817f864a] + + * source/test/testbench.cpp, source/test/unittest.cpp, + source/test/unittest.h: + testbench: remove redundant partition enum, clean unittest.h + + Headers should #include only the minimal number of headers required + to ensure their own data types are defined. They should also not use + "using namespace", only CPP files should use those statements. 
+ [16de12763457] + + * source/test/testbench.cpp: + testbench: more flexible argument parsing + [11cd8cf25ae1] + + * source/test/testbench.cpp: + testbench: remove unused do_bench + [7b6028c5e509] + + * source/test/testbench.cpp: + testbench: allow cpuid to be user-provided + [2df55ead8fe4] + + * source/test/testbench.cpp: + pass primitive function tables to validate to the check* functions + [6473652ab8da] + + * source/test/testbench.cpp: + testbench: fix typos in comments, remove unused global var + [380aa78e70ae] + + * source/Lib/TLibCommon/TComRdCost.cpp: + xCalcHADs8x8 should call sa8d_8x8, remove satd calls from similar + functions + + xCalcHADs16x4() and xCalcHADs4x16() would both need sa8d primitives + [1a2ac116b26c] + + * source/encoder/pixel.cpp, source/encoder/primitives.h, + source/encoder/vec/pixel.inc: + primitives: add sa8d_8x8 and sa8d_16x16 primitives + + I copied C references for those functions from x264, and renamed + satd_8x8 in pixel.inc to sa8d_8x8 since that is what it implements. 
+ [6b67af80a1f4] + +2013-04-04 ggopu + + * source/test/testbench.cpp, source/test/unittest.cpp, + source/test/unittest.h: + Added header and removed unwanted variable and comments + [76e096fbd6bf] + + * doc/UnitTestUsage.txt, source/test/testbench.cpp: + Added Test bench usage Doc and Included the Single primitive check + and Fixed all the Warning in test bench + [cfa472c6afd5] + +2013-04-04 Gopu G + + * source/test/CMakeLists.txt: + Merged multicoreware/xhevc into default + [a373f73d422a] + +2013-04-04 ggopu + + * source/test/CMakeLists.txt: + solved conflict issue + [df3b82e6640f] + + * source/test/CMakeLists.txt: + Disabled warnings for testbench + [b7adfdf47dbe] + + * doc/UnitTestUsage.txt: + Added Doc for UnitTest Usage + [f9a1d53c7703] + + * source/test/testbench.cpp: + Added Single primitive check + [765992ddceb1] + +2013-04-04 Steve Borho + + * source/encoder/vec/pixel.inc: + uncrustify: fix eoln damage in pixel.inc + [a6a4ecc9d93c] + +2013-04-04 Deepthi + + * Merge + [e07bb748be27] + + * source/encoder/vec/pixel.inc: + Removing shadows/redeclarations + [ff75a4f2a134] + +2013-04-04 Steve Borho + + * source/test/CMakeLists.txt: + cmake: rename thread pool test to PoolTest + [b79778a2a241] + + * source/test/CMakeLists.txt: + cmake: fix case of test bench filenames + [8801f4346870] + +2013-04-04 nandaku2 + + * source/test/unittest.CPP: + Merged in ggopu/gopu_x265 (pull request #33) + + file name extension issue fixed + [4449194c1d02] + +2013-04-04 ggopu + + * source/test/unittest.CPP, source/test/unittest.cpp: + File name Extension issue fixed + [350b345eff63] + +2013-04-04 nandaku2 + + * Merged in deepthidevaki/xhevc_deepthid (pull request #32) + + Added vectorized satd_8x8 to pixel.inc + [7e51f92d13e8] + +2013-04-04 Deepthi Devaki + + * source/encoder/vec/pixel.inc: + Added vectorized satd_8x8 to pixel.inc + [349f2249ec7d] + +2013-04-04 nandaku2 + + * source/test/TestBench.cpp, source/test/UnitTest.cpp, + source/test/UnitTest.h: + Merged in ggopu/gopu_x265 
(pull request #31) + + changed the file name conversion and added comparision for c and + vector primitives + [4abb704ac3ba] + +2013-04-04 ggopu + + * source/test/TestBench.cpp, source/test/UnitTest.cpp, + source/test/UnitTest.h, source/test/testbench.cpp, + source/test/unittest.CPP, source/test/unittest.h: + Changed file name conversion and included the c and vector primitive + comparision check + [3101b15fba69] + +2013-04-03 Gopu G + + * source/VectorClass/dispatch_example.cpp, source/test/TestBench.cpp: + Merged multicoreware/xhevc into default + [809027331954] + +2013-04-03 ggopu + + * source/test/TestBench.cpp: + conflict error + [3adf277f27b9] + + * source/test/TestBench.cpp: + resolved conflict error + [8089151317d8] + + * source/test/TestBench.cpp: + solved conflict error + [8bd75c20c1be] + + * source/test/TestBench.cpp: + resolved conflict error + [3e19d1cb4d37] + +2013-04-03 Gopu G + + * Merged multicoreware/xhevc into default + [50f85e140aac] + +2013-04-02 Gopu G + + * Merged multicoreware/xhevc into default + [181970e99874] + +2013-04-02 ggopu + + * source/test/TestBench.cpp: + Enhanced test bench for satd*x* + [8659ad7e9457] + +2013-04-01 Gopu G + + * source/test/TestBench.cpp, source/x265cfg.cpp, source/x265cfg.h, + source/x265top.cpp, source/x265top.h: + Merged multicoreware/xhevc into default + [7900bd4aedb5] + +2013-04-01 ggopu + + * source/test/TestBench.cpp: + Resolved the Conflict Error + [7d97e00d4fdf] + + * source/test/TestBench.cpp: + Resolved conflict + [f085c786246b] + +2013-03-28 ggopu + + * source/test/TestBench.cpp: + included test functions like pixelcmp + [585c75f7107a] + +2013-04-04 Steve Borho + + * source/encoder/vec/CMakeLists.txt, + source/encoder/vec/macroblock.inc, source/encoder/vec/pixel.inc, + source/encoder/vec/vecprimitives.inc: + break vecprimitives.inc into multiple headers + [462b9403a0bf] + +2013-04-03 Steve Borho + + * source/encoder/macroblock.cpp, source/encoder/primitives.cpp, + source/encoder/primitives.h: + make 
an empty shell for macroblock.cpp + [732999829240] + + * source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibEncoder/TEncPreanalyzer.cpp: + uncrustify: fixup some code where uncrustify thought it saw + templates + + x> 1 was converted to x > 1 instead of x < (foo>>1). I + added parens to avoid this abiguity. + [7595da33f0f2] + + * source/encoder/macroblock.cpp: + change macroblock.cpp to unix eoln + [70861c21adc1] + + * source/encoder/primitives.cpp, source/encoder/primitives.h: + Merge + [e94d0b91c049] + +2013-04-04 Deepthi + + * source/encoder/CMakeLists.txt: + cmake: add macroblock.cpp + [39e35d878b64] + + * source/encoder/macroblock.cpp: + Adding new file macroblock.cpp + [6618a2d9f5fe] + + * source/encoder/primitives.cpp, source/encoder/primitives.h: + Move Setup_C_PixelPrimitives decl to header file + [689fa6d23f01] + +2013-04-03 Steve Borho + + * source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibEncoder/TEncRateCtrl.h: + uncrustify: final run, fixups my hand edits + [d77975bfe8ec] + + * source/Lib/TAppCommon/program_options_lite.h, + source/Lib/TLibCommon/ContextModel.cpp, + source/Lib/TLibCommon/ContextModel3DBuffer.cpp, + source/Lib/TLibCommon/SEI.cpp, source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComPicSym.h, + source/Lib/TLibCommon/TComPicYuv.h, + source/Lib/TLibCommon/TComRdCost.cpp, + source/Lib/TLibCommon/TComRdCost.h, + source/Lib/TLibCommon/TComSlice.h, source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncCu.cpp, source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncEntropy.h, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncRateCtrl.h, + source/Lib/TLibEncoder/TEncSbac.h, + source/Lib/TLibEncoder/TEncSearch.cpp, source/Lib/config.cpp, + source/Lib/encoder.cpp, source/Lib/libmd5/libmd5.c: + uncrustify: apply globally again + + After the first pass, some of the distance thresholds are now + exceeded, requiring #endif comments or better wrapping. 
I've hand + edited a few spots to make them easier to read + [186aa91fb1c3] + + * doc/uncrustify/apply-to-all-source.py: + uncrustify: do not apply style to VectorClass headers + + They are difficult functions to align mechanically, just leave them + be + [21185b179d11] + + * source/Lib/TAppCommon/program_options_lite.cpp, + source/Lib/TAppCommon/program_options_lite.h, + source/Lib/TLibCommon/AccessUnit.h, + source/Lib/TLibCommon/CommonDef.h, + source/Lib/TLibCommon/ContextModel.cpp, + source/Lib/TLibCommon/ContextModel.h, + source/Lib/TLibCommon/ContextModel3DBuffer.cpp, + source/Lib/TLibCommon/ContextModel3DBuffer.h, + source/Lib/TLibCommon/ContextTables.h, source/Lib/TLibCommon/NAL.h, + source/Lib/TLibCommon/SEI.cpp, source/Lib/TLibCommon/SEI.h, + source/Lib/TLibCommon/TComBitCounter.h, + source/Lib/TLibCommon/TComBitStream.cpp, + source/Lib/TLibCommon/TComBitStream.h, + source/Lib/TLibCommon/TComCABACTables.cpp, + source/Lib/TLibCommon/TComCABACTables.h, + source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComInterpolationFilter.cpp, + source/Lib/TLibCommon/TComInterpolationFilter.h, + source/Lib/TLibCommon/TComList.h, + source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComLoopFilter.h, + source/Lib/TLibCommon/TComMotionInfo.cpp, + source/Lib/TLibCommon/TComMotionInfo.h, + source/Lib/TLibCommon/TComMv.h, + source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComPattern.h, + source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/Lib/TLibCommon/TComPicSym.cpp, + source/Lib/TLibCommon/TComPicSym.h, + source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuv.h, + source/Lib/TLibCommon/TComPicYuvMD5.cpp, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibCommon/TComRdCost.cpp, + source/Lib/TLibCommon/TComRdCost.h, + source/Lib/TLibCommon/TComRdCostWeightPrediction.cpp, + 
source/Lib/TLibCommon/TComRdCostWeightPrediction.h, + source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.h, + source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/Lib/TLibCommon/TComWeightPrediction.h, + source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TComYuv.h, + source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibEncoder/AnnexBwrite.h, + source/Lib/TLibEncoder/NALwrite.cpp, + source/Lib/TLibEncoder/NALwrite.h, + source/Lib/TLibEncoder/SEIwrite.cpp, + source/Lib/TLibEncoder/SEIwrite.h, + source/Lib/TLibEncoder/SyntaxElementWriter.cpp, + source/Lib/TLibEncoder/SyntaxElementWriter.h, + source/Lib/TLibEncoder/TEncAnalyze.cpp, + source/Lib/TLibEncoder/TEncAnalyze.h, + source/Lib/TLibEncoder/TEncBinCoder.h, + source/Lib/TLibEncoder/TEncBinCoderCABAC.cpp, + source/Lib/TLibEncoder/TEncBinCoderCABAC.h, + source/Lib/TLibEncoder/TEncBinCoderCABACCounter.cpp, + source/Lib/TLibEncoder/TEncBinCoderCABACCounter.h, + source/Lib/TLibEncoder/TEncCavlc.cpp, + source/Lib/TLibEncoder/TEncCavlc.h, + source/Lib/TLibEncoder/TEncCfg.h, source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncEntropy.cpp, + source/Lib/TLibEncoder/TEncEntropy.h, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h, + source/Lib/TLibEncoder/TEncPic.cpp, + source/Lib/TLibEncoder/TEncPic.h, + source/Lib/TLibEncoder/TEncPreanalyzer.cpp, + source/Lib/TLibEncoder/TEncPreanalyzer.h, + source/Lib/TLibEncoder/TEncRateCtrl.cpp, + source/Lib/TLibEncoder/TEncRateCtrl.h, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.h, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSbac.h, + 
source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.h, + source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h, + source/Lib/TLibEncoder/WeightPredAnalysis.cpp, + source/Lib/TLibEncoder/WeightPredAnalysis.h, + source/Lib/TLibVideoIO/TVideoIOYuv.cpp, + source/Lib/TLibVideoIO/TVideoIOYuv.h, source/Lib/config.cpp, + source/Lib/config.h, source/Lib/encoder.cpp, source/Lib/encoder.h, + source/Lib/libmd5/MD5.h, source/Lib/libmd5/libmd5.c, + source/Lib/libmd5/libmd5.h, source/PPA/ppa.cpp, source/PPA/ppa.h, + source/PPA/ppaApi.h, source/VectorClass/instrset.h, + source/VectorClass/instrset_detect.cpp, + source/encoder/TComRdCost_SSE.cpp, source/encoder/pixel.cpp, + source/encoder/primitives.cpp, source/encoder/primitives.h, + source/encoder/threading.cpp, source/encoder/threading.h, + source/encoder/threadpool.cpp, source/encoder/threadpool.h, + source/encoder/vec/vecprimitives.inc, source/test/TestBench.cpp, + source/test/UnitTest.cpp, source/test/UnitTest.h, + source/test/testpool.cpp: + uncrustify: apply globally + [a2054f7e2644] + + * doc/uncrustify/apply-to-all-source.py, doc/uncrustify/drag- + uncrustify.bat: + uncrustify: add a drag/drop batch file for uncrustify, and apply-to- + all.py + [a737735e60a6] + + * source/VectorClass/dispatch_example.cpp: + remove VectorClass/dispatch_example.cpp + [98b7425b8b1d] + + * source/encoder/threading.cpp: + threading: do not call pthread_exit after joining a thread + [8333f6a4b74e] + + * source/encoder/pixel.cpp, source/encoder/primitives.cpp: + primitives: allow for the fact that multiple C++ files will define C + primitives + [98bb68d4f9aa] + + * source/VectorClass/instrset_detect.cpp: + instrset: prevent compiler warnings about nop function + [4f459b703920] + + * source/encoder/pixel.cpp: + pixel: 4x16 and 16x4 were backwards + [212467c42aba] + + * source/VectorClass/instrset_detect.cpp: + instrset: improve 
compiler fallbacks for xgetbv + [8d1a5839809a] + + * source/encoder/TComRdCost_SSE.cpp: + remove cruft from TComRdCost_SSE.cpp + [9ac68e0fc2ee] + + * source/encoder/vec/vecprimitives.inc: + astyle: cleanup vecprimitives.inc + [3ecb9180930e] + + * Merged in deepthidevaki/xhevc_deepthid (pull request #30) + + Added satd4x4 in vecprimitives.inv + [d17d19c2f9d6] + +2013-04-03 Deepthi Devaki + + * source/encoder/vec/vecprimitives.inc: + Added vectorized hads4x4(satd_4x4) to vecprimitive.inc + [ba8f928d91d6] + +2013-04-03 Deepthi Devaki Akkoorath + + * Merged multicoreware/xhevc into default + [e9cffb400da1] + + * source/encoder/TComRdCost_SSE.cpp: + Merged multicoreware/xhevc into default + [b553eaec9dd7] + +2013-04-03 Deepthi Devaki + + * source/encoder/TComRdCost_SSE.cpp: + Vectorized xCalcHADS8x8 with 16 bit operations + [552a75a69cb0] + + * source/encoder/TComRdCost_SSE.cpp: + Backed out changeset: e4616400a510 + [88a6837e15b5] + + * source/encoder/TComRdCost_SSE.cpp: + Backed out changeset: e500456e0146 + [e1ada400cd62] + +2013-04-03 Deepthi Devaki Akkoorath + + * Merged multicoreware/xhevc into default + [9054e642786b] + +2013-04-02 Deepthi Devaki + + * source/encoder/TComRdCost_SSE.cpp: + Modifed xCalcHADS8x8 with 16bit operations. + [e500456e0146] + + * source/encoder/TComRdCost_SSE.cpp: + Modified xCalcHADS8x8 with 16bit operations. + [e4616400a510] + +2013-04-03 Steve Borho + + * source/encoder/threadpool.cpp: + threadpool: use InterlockedCompareExchange64 + [74715f9955e9] + + * source/encoder/primitives.cpp, source/encoder/primitives.h: + rename cpuIDDetect to CpuIDDetect, to follow convention of + primitives.cpp + [3b9f17747601] + + * source/encoder/primitives.cpp, source/encoder/primitives.h, + source/encoder/vec/CMakeLists.txt: + cmake: gcc 4.6 supports AVX, but not yet AVX2 + [5d84ec0af006] + +2013-04-03 Deepthi + + * source/encoder/primitives.cpp, source/encoder/primitives.h: + Moving function declarations to the header file. 
+ [329deaeea8e7] + +2013-04-03 Steve Borho + + * source/encoder/pixel.cpp, source/encoder/primitives.cpp, + source/encoder/primitives.h, source/encoder/vec/vecprimitives.inc: + Merge + [3732add09cc2] + +2013-04-03 Deepthi + + * source/encoder/pixel.cpp, source/encoder/primitives.cpp, + source/encoder/primitives.h, source/encoder/vec/vecprimitives.inc: + Changing Primitives Setup for C and vector code. + [c5b472d23023] + +2013-04-02 Deepthi + + * source/encoder/primitives.cpp: + Bug Fix for crashes in PartitionFromSizes + [c9dd52c5c274] + +2013-03-30 Steve Borho + + * source/Lib/TLibCommon/TComRdCost.cpp, + source/Lib/TLibCommon/TypeDef.h, source/encoder/TComRdCost_SSE.cpp: + Replace #define ENABLE_VECTOR with ENABLE_PRIMITIVES cmake option + + Use x265 performance primitives. We need to profile which of the 12, + 24, 48, and 64 partition sizes are worth adding as primitives. + + Ditto for the SSE (sum of square differences) functions. + [30ece3e5796f] + +2013-04-02 Steve Borho + + * doc/uncrustify/codingstyle.cfg: + uncrustify: tweak enum and struct bracing and comment margins + [2ce1d641581a] + + * doc/uncrustify/codingstyle.cfg, source/PPA/ppaApi.h, + source/encoder/pixel.cpp, source/encoder/primitives.cpp, + source/encoder/primitives.h, source/encoder/threading.cpp, + source/encoder/threading.h, source/encoder/threadpool.cpp, + source/encoder/threadpool.h, source/encoder/vec/vecprimitives.inc: + Prepare to use uncrustify on our encoder sources + [ae8c19ec7690] + + * doc/uncrustify/codingstyle.cfg, doc/uncrustify/uncrustify.bat, + doc/uncrustify/uncrustify.exe: + add uncrustify and a base config + [0b9a400d0a1a] + + * source/encoder/threadpool.cpp: + threadpool: remove semicolon from macro definition + [a6a8378d4f32] + + * source/encoder/primitives.cpp: + primitives: replace asserts with if() checks + [f833a10d6b5c] + + * source/test/testpool.cpp: + testpool: add another assertion + [3b903329179e] + + * source/encoder/threadpool.cpp: + threadpool: correct the 
order of the arguments to + InterlockedCompareExchange() + + Amusingly, it mostly worked the wrong way, except it never cleared + any bits, so the worker threads kept working on rows until the frame + dequeued itself. + [c3dad08b999b] + + * source/test/testpool.cpp: + testpool: stop processing CUs on the last column + [e26d67716fd9] + + * source/encoder/threadpool.cpp: + threadpool: use the correct MSVC instrinsic for integer CAS + [821adc18540a] + + * source/encoder/threadpool.cpp: + threadpool: use atomic compare and swap when consuming rows + + The GCC __sync_and_and_fetch() instrinsic was returning the new + value, which is entirely useless. The thread could not tell if it + was the one which cleared the bit. So I've switched it to a compare + and swap operation. Now the thread pool test runs correctly when + compiled with GCC + [b2d5a5aa7a20] + + * source/encoder/threadpool.cpp: + threadpool: remove redundant semicolon + [10e63f2bdbc1] + + * source/encoder/primitives.cpp: + apply coding style to primitives.cpp, minor cleanups + [2b33261c0e61] + + * source/encoder/primitives.cpp: + primitives: remove cruft from primitives.cpp, add compiler checks + + Do not try to access vectorized architectures that we know the + compiler cannot generate. + [128a506f7cb9] + + * source/encoder/primitives.cpp, source/encoder/primitives.h: + primitives: do not include instrset.h from primitives.h + + Just declare the instrset_detect() function as an extern in + primitives.cpp + [0f32be5b4fa8] + + * source/encoder/CMakeLists.txt: + cmake: tabs to spaces + [1bd0a875d97d] + +2013-04-02 sumalatha + + * source/encoder/CMakeLists.txt, source/encoder/primitives.cpp, + source/encoder/primitives.h: + 1. Included the cpu detection logic in SetupPrimitives() function. + 2. 
Changed the SetupPrimitives() function such that, based on the + cpuid, the corresponing vector architure and fucntions are selected + [d5eb4f663e29] + + * source/encoder/CMakeLists.txt, source/encoder/primitives.cpp, + source/x265main.cpp: + Backed out changeset: 94aff9bad183 + [8f33a2e2af52] + + * source/encoder/cpu_detection.cpp, source/encoder/cpu_detection.h, + source/encoder/instrset.h, source/encoder/instrset_detect.cpp: + Backed out changeset: 36b4e54f54b9 + [274955b783ac] + +2013-04-02 Sumalatha Polureddy + + * source/encoder/CMakeLists.txt: + Merged multicoreware/xhevc into default + [a729ebc85dd7] + +2013-04-01 sumalatha + + * source/encoder/cpu_detection.cpp, source/encoder/cpu_detection.h, + source/encoder/instrset.h, source/encoder/instrset_detect.cpp: + these files has to go with the previous checkin version(199) + [36b4e54f54b9] + + * source/encoder/CMakeLists.txt, source/encoder/primitives.cpp, + source/x265main.cpp: + 1. Included the cpu detection logic in the main 2. Changed the + SetupPrimitives() function such that, based on the cpuid, the + corresponing vector architure and fucntions are selected + [94aff9bad183] + +2013-04-02 Steve Borho + + * source/CMakeLists.txt, source/encoder/CMakeLists.txt, + source/encoder/primitives.h: + cmake: x64 Linux build fixes + [c67a39f91e07] + + * source/CMakeLists.txt: + cmake: provide an appopriate -march for 64bit GCC targets + [0c8fda784ba1] + +2013-04-01 Steve Borho + + * source/Lib/CMakeLists.txt, source/encoder/CMakeLists.txt, + source/encoder/vec/CMakeLists.txt, source/test/CMakeLists.txt: + cmake: use GCC variable, in lieu of CMAKE_COMPILER_IS_GNUCXX + [60c583f05000] + + * source/CMakeLists.txt: + cmake: detect 64bit build, set GCC flag if GCC is detected + + Both of these can be used in internal CMakeLists.txt files, to + improve clarity + [0e8858d0fb9f] + +2013-04-01 deepthidevaki + + * source/encoder/TComRdCost_SSE.cpp: + Modified xCalcHADS4x4 to use 16bit operations. 
+ [87514f1663f3] + +2013-03-30 Steve Borho + + * source/Lib/TLibCommon/TypeDef.h: + Disable DISTORTION_PRECISION_ADJUSTMENT when HIGH_BIT_DEPTH is + disabled. + + This turns the operation into a >> 0, which any sane compiler will + discard. + [99eea17c57d6] + + * source/encoder/primitives.cpp, source/encoder/primitives.h: + add mapping function from Width x Height to Partition enum + [759fd1db4927] + + * source/CMakeLists.txt, source/Lib/CMakeLists.txt, + source/Lib/config.cpp, source/Lib/config.h, source/Lib/encoder.cpp, + source/Lib/encoder.h, source/x265cfg.cpp, source/x265cfg.h, + source/x265main.cpp, source/x265top.cpp, source/x265top.h: + move top level encoder classes into Lib/ folder + + The encoder should be usable as a shared library (x265.lib + + HM.lib). The top level class should not be part of the CLI-only + portion of the source. + [c509aabfc0a7] + + * source/encoder/CMakeLists.txt: + remove unused set_source_files_properties + [2c938c6441e3] + + * source/Lib/TLibEncoder/TEncCavlc.h: + fix VC compiler warnings (unreferenced formal parameter) in HM + source + [3df5de68dbe7] + + * source/Lib/TLibCommon/TComDataCU.h: + fix VC compiler warnings (integer size type conversions) in HM + source + [73f1aa34c9a0] + + * source/Lib/TLibCommon/ContextModel3DBuffer.h: + fix VC compiler warning in the HM source + [d6a9f964a0dc] + +2013-03-29 Steve Borho + + * source/Lib/TLibCommon/TComPicYuvMD5.cpp: + prevent one compile warning when HIGH_BIT_DEPTH is disabled + [e0b7e82b403f] + + * source/encoder/vec/vecprimitives.inc: + add a stub 8bit vectorized primitive to prevent compile warnings + [cbff9e95a7cb] + + * source/encoder/pixel.cpp, source/encoder/primitives.h: + move primitives_c extern to the file(s) which define its methods + [ace6ed34ee4c] + + * source/encoder/CMakeLists.txt, source/encoder/pixel.cpp: + add templated SATD functions, based on x264's macro routines + + These use SWAR to do two operations in each clock cycle + [018408f096c4] + + * 
source/encoder/pixel.cpp, source/encoder/primitives.h: + add more irregular partition sizes (we probably still need 24s) + [f676a5209ed7] + + * source/encoder/CMakeLists.txt, source/encoder/pixel.cpp: + add pixel.cpp with templated sad function + [0f94e98ddcfa] + + * source/encoder/vec/vecprimitives.inc: + bug fix in vecprimitives.inc + [aa80411df22e] + + * source/encoder/vec/vecprimitives.inc: + declare seperate implementations of vector primitives for 8bit and + 16bit pels + [7adb3cf4ebe2] + + * source/encoder/primitives.cpp, source/encoder/primitives.h: + stub in support for C reference versions of each encoder primitive + [ddc4616609f6] + + * source/compat/msvc/stdint.h: + ensure intptr_t is defined on MSVC compiles + [ab7e6788a2bc] + + * source/encoder/primitives.h, source/encoder/vec/vecprimitives.inc: + compiler portable CDECL macro + [130b6f1555d2] + + * source/PPA/ppa.cpp, source/PPA/ppa.h, source/PPA/ppaApi.h, + source/encoder/TComRdCost_SSE.cpp, source/encoder/primitives.cpp, + source/encoder/primitives.h, source/encoder/threading.cpp, + source/encoder/threading.h, source/encoder/threadpool.cpp, + source/encoder/threadpool.h, source/encoder/vec/avx.cpp, + source/encoder/vec/avx2.cpp, source/encoder/vec/sse2.cpp, + source/encoder/vec/sse3.cpp, source/encoder/vec/sse41.cpp, + source/encoder/vec/sse42.cpp, source/encoder/vec/ssse3.cpp, + source/encoder/vec/vecprimitives.inc, source/test/TestBench.cpp, + source/test/UnitTest.cpp, source/test/UnitTest.h, + source/test/testpool.cpp, source/x265cfg.cpp, source/x265cfg.h, + source/x265main.cpp, source/x265top.cpp, source/x265top.h: + apply coding style to all new code (leaving HM unmodified for the + moment) + [86b79c630261] + + * doc/astyle/apply-to-all-source.py: + add a Python script to apply coding style globally + [750cd70ec32b] + + * doc/astyle/drag-astyle.bat: + fix comment in drag-astyle.bat + [bae49dd4f46d] + + * doc/astyle/AStyle.exe, doc/astyle/astyle-config.txt, doc/astyle + /drag-astyle.bat: + Add 
astyle.exe for Windows, a config file, and a handy drag-drop + batch file + [5695e621070f] + + * source/x265top.cpp: + also take updates to x265top.cpp + [9096a01c31e7] + + * source/Lib/TLibCommon/CommonDef.h, source/Lib/TLibCommon/NAL.h, + source/Lib/TLibCommon/TComRom.cpp, + source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibEncoder/SEIwrite.cpp, + source/Lib/TLibEncoder/TEncCavlc.cpp, + source/Lib/TLibEncoder/TEncCavlc.h, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncSbac.h: + Take HM tip code (mostly cleanups) + [d5a4085b832a] + +2013-03-29 Deepthi + + * source/encoder/vec/vecprimitives.inc: + xCaldHads4x4 added as an example vector primitive. + [7794ab12b1b8] + + * source/encoder/primitives.cpp: + Remove unused parameter warning + [f69b495d2323] + +2013-03-29 Steve Borho + + * source/CMakeLists.txt, source/Lib/TLibCommon/TypeDef.h, + source/encoder/primitives.h, source/encoder/vec/vecprimitives.inc: + cmake: add HIGH_BIT_DEPTH build flag, defaults to ON (previous + behavior) + [a168ba02c525] + +2013-03-28 Steve Borho + + * source/encoder/threadpool.cpp: + threadpool: fix MinGW build again + [ec62c87b6a69] + + * source/test/testpool.cpp: + testpool: show MD5 hash as 4 hex ints + [e78340875f4e] + + * source/encoder/threadpool.cpp, source/encoder/threadpool.h: + threadpool: make worker threads spin so long as job providers are + enqueued + + The idle list mechanism was buggy; threads would go idle when no + work was available but none would be in the idle list when a work + request arrived. Not sure how to fix this without locking, so + removing the idle list for now. All threads are awakened when job + providers are enqueued, and they stay active until there are no job + providers registered with the thread pool. + + Awakening a thread multiple times should be safe; might just cause a + few extra loops for it to go idle later. 
+ [34d48bb54077] + + * source/encoder/threadpool.cpp: + threadpool: remove idle thread poke when job provider is enqueued + + The provider may not have work yet; and if it did it can poke the + pool itself + [ede480130c99] + + * source/CMakeLists.txt: + add -march=i686 globally to gcc compile flags + + This was required to fix MinGW link errors on Windows + (__sync_and_fetch*) + [64821a2d1dfe] + + * source/test/testpool.cpp: + add a proper hex print for the MD5 hash + [9235319777da] + + * source/test/UnitTest.cpp: + white-space cleanups in UnitTest.cpp + [e8052c1be059] + + * source/test/CMakeLists.txt, source/test/UnitTest.cpp: + more MSYS fixes + [e87400a37651] + + * source/test/CMakeLists.txt, source/test/TestBench.cpp: + better workaround for strdup + [14115a094db7] + + * source/test/CMakeLists.txt, source/test/TestBench.cpp: + msys compilation fixes for TestBench + [cdb75f7b2fa7] + + * source/test/CMakeLists.txt, source/test/TestBench.cpp, + source/test/UnitTest.cpp: + fix test bench compiler warnings + [fe55521ed24c] + + * source/test/CMakeLists.txt: + cmake: fix case sensitivity of filenames + [ad7dd76dc39d] + + * source/test/CMakeLists.txt: + cleanup source/test/CMakeLists.txt + [47f9ec819fe8] + +2013-03-25 ShinYee Chung + + * build/linux/make-Makefiles.bash: + Build: Fix missing executable permission on the BASH script. + [0abab5c52e97] + +2013-03-28 nandaku2 + + * Merged in ggopu/gopu_x265 (pull request #19) + + Created the Test bench For Vector Premitive Functions + [a35abc703177] + +2013-03-28 ggopu + + * source/test/CMakeLists.txt, source/test/TestBench.cpp, + source/test/UnitTest.cpp, source/test/UnitTest.h: + Created the Test bench For Vector Premitive Functions + [c530d6d7b14f] + +2013-03-29 ShinYee Chung + + * source/encoder/threadpool.cpp: + threadpool: Fix missing #include for memset(). 
+ [5ac9ea3e104f] + +2013-03-28 Steve Borho + + * source/encoder/CMakeLists.txt, source/encoder/TComRdCost_SSE.cpp, + source/encoder/primitives.cpp, source/encoder/primitives.h: + add ALIGN_VAR macros for compiler portability + [98b3d54f8bf6] + + * source/encoder/primitives.h: + primitives: use extern "C" instead of _cdecl; is more compiler + portable + [498c9bea248a] + + * source/encoder/primitives.cpp, source/encoder/primitives.h, + source/x265main.cpp: + primitives: call SetupPrimitives() before starting encoder + [da0d5b94edb6] + + * source/CMakeLists.txt, source/encoder/primitives.cpp: + primitives: add ENABLE_PRIMITIVES build option, default to ON + [f2fa629b460b] + + * source/encoder/primitives.cpp, source/encoder/primitives.h, + source/encoder/vec/CMakeLists.txt: + primitives: move vectorized function table externs into CPP file + [24ac23d75f87] + + * source/encoder/primitives.cpp, source/encoder/primitives.h: + move stdint.h include to primitives.h since it uses std int types + [a1b10c4251f9] + + * source/encoder/primitives.h, source/encoder/vec/vecprimitives.inc: + primitives: match x264's pixel compare funcdef so we can later use + their asm + [4dc9fabbf725] + + * source/encoder/CMakeLists.txt: + fix compile of TComRdCost_SSE.cpp on VC (not sure why this worked + before) + [a3a4f0ed61b4] + + * source/encoder/vec/CMakeLists.txt: + cmake: VC10 appears to support /arch:AVX + + We can back this out later if there are problems with it + [ce56b207242b] + +2013-03-27 Steve Borho + + * source/encoder/CMakeLists.txt, source/encoder/primitives.cpp: + add a stub primitives.cpp, showing what CPU runtime detection should + do + [d169138ef9f2] + + * source/encoder/vec/vecprimitives.inc: + vec: automatically generate a primitive table for each vector + architecture + [169f67a4972c] + + * source/encoder/CMakeLists.txt: + cmake: add primitives.h x265 project + [0bc67c2dc278] + + * source/encoder/primitives.h: + introduce EncoderPrimitives structure of function pointers + 
[64638e854efb] + + * source/VectorClass/vectori256.h: + workaround two compiler warnings for AVX2 and VC11 + [97e315fd1008] + + * source/encoder/CMakeLists.txt, source/encoder/vec/CMakeLists.txt, + source/encoder/vec/avx.cpp, source/encoder/vec/avx2.cpp, + source/encoder/vec/sse2.cpp, source/encoder/vec/sse3.cpp, + source/encoder/vec/sse41.cpp, source/encoder/vec/sse42.cpp, + source/encoder/vec/ssse3.cpp, source/encoder/vec/vecprimitives.inc: + introduce a vec/ folder for instancing vectorized encoder primitives + + This is incomplete. Next step is to declare a function table and set + one up. + [05481bf6d65a] + + * source/encoder/TComRdCost_SSE.cpp: + use one contributor per line in copyright header + [27148d109699] + + * source/CMakeLists.txt: + cmake: add option to use multiple processors for compiling + [d265d139dd54] + + * source/CMakeLists.txt, source/encoder/CMakeLists.txt: + cmake: move /Ob2 flag to be global + [695e210034ad] + + * source/test/testpool.cpp: + testpool: add semicolons after PPA macros + [ebb7b3d857cc] + + * source/encoder/threadpool.cpp: + cmake: use 64bit interlocked commands and C style typecasts + + For some reason Microsoft uses signed types for atomic intrinsics + and LONG64 is not the same as uint64_t + [bffed1db2e83] + + * source/test/CMakeLists.txt: + cmake: add PPA and pthread libraries for appropriate build + configurations + [207e0e47d1c4] + + * source/CMakeLists.txt: + cmake: fix eoln damage from copy-pasting from a web-page + [6c27e3178e4e] + +2013-03-27 deepthidevaki + + * source/encoder/TComRdCost_SSE.cpp: + Modified vector- xCalcHADs4x4 + [329defb33896] + + * source/encoder/TComRdCost_SSE.cpp: + Modifed vector- xCalcHADS4x4 + [ffc0604b89a9] + +2013-03-27 Deepthi Devaki Akkoorath + + * source/encoder/TComRdCost_SSE.cpp: + Merged multicoreware/xhevc into default + [c4df9c66f793] + +2013-03-26 deepthidevaki + + * source/encoder/TComRdCost_SSE.cpp: + Modified xCalcHADs4x4 for better optimization + [921103b76c69] + +2013-03-26 
Deepthi Devaki Akkoorath + + * Merged multicoreware/xhevc into default + [a1206301f004] + +2013-03-25 deepthidevaki + + * source/encoder/TComRdCost_SSE.cpp: + Modifed vectorized xCalcHADs* for aligned data access + [cdddf736f4a1] + +2013-03-26 Steve Borho + + * source/encoder/threadpool.cpp: + threadpool: reorder initializers to match member declarations to + make gcc happy + [a42d35cf40ea] + + * source/test/testpool.cpp: + testpool: memset requires string.h on Linux + [5576e606f381] + + * source/test/testpool.cpp: + testpool: initialize CUData on allocation, makes hashes persistent + [2f9e75feedfa] + + * source/encoder/TComRdCost_SSE.cpp: + disable formal parameter warnings in TComRdCost_SSE, for Release + mode builds + + This pragma should go away when we clean up the vectorized code + [600af352c248] + + * source/test/testpool.cpp: + testpool: add more resolution to elapsed time + [347c13c7c13a] + + * source/encoder/threadpool.cpp: + threadpool: safely allow the threadpool to be freed and reallocated + [8ccacddcb5c2] + + * source/test/testpool.cpp: + testpool: do not check top-right data dependency for last block in + row + + The fact that the penumltimate block was finished implies all of the + dependencies for the last block are also available. 
+ [a9b0cb1167f5] + + * source/encoder/threadpool.cpp, source/encoder/threadpool.h: + threadpool: fix a couple of bugs, introduce m_numWords variable for + clarity + [a2cc29d90646] + + * source/test/testpool.cpp: + testpool: enqueue row 0 to trigger processing + [550abf2e723d] + + * source/encoder/threadpool.cpp: + threadpool: initialize m_numThreads correctly + [72d310781d4f] + + * source/encoder/threadpool.cpp: + threadpool: clear m_queuedBitmap after allocation + [b70a1bd11c88] + +2013-03-25 Steve Borho + + * source/test/CMakeLists.txt, source/test/testpool.cpp: + testpool: add MD5 fake CTU encode logic + [ebef6ad81ce7] + + * source/test/testpool.cpp: + testpool: make progress on thread pool unit test + [719adfd4d95a] + + * build/README.txt: + simplify build/README.txt + [f0f36350faf2] + +2013-03-25 Deepthi + + * Merge + [928a9fe490c2] + + * source/encoder/CMakeLists.txt: + Force inlining for x265 project. + [c5d96e5290e1] + + * source/encoder/CMakeLists.txt: + Adding /Ob1 for inline functions in vectorclass + [58fdaadc8c9b] + +2013-03-25 Steve Borho + + * source/Lib/TLibCommon/TComRdCost.cpp, + source/encoder/TComRdCost_SSE.cpp: + Added vectorized xCalcHADs4x4. + [e4511f7ffd67] + + * source/Lib/TLibCommon/TComRdCost.cpp, + source/encoder/TComRdCost_SSE.cpp: + Added vectorized xCalcHADs8x8 to TComRdCost_sse.cpp. + [75902874989e] + +2013-03-23 Steve Borho + + * build/ConfigureBuild.bat, build/README.txt, build/vc10-x86/build- + all.bat, build/vc10-x86_64/build-all.bat, build/vc11-x86/build- + all.bat, build/vc11-x86_64/build-all.bat, build/vc9-x86/build- + all.bat, build/vc9-x86_64/build-all.bat, source/CMakeLists.txt, + source/encoder/TComRdCost_SSE.cpp: + cmake: change solution name to x265.sln + + The repository will eventually be renamed to x265 as well + [0b06a183e9d9] + +2013-03-23 ShinYee Chung + + * source/encoder/TComRdCost_SSE.cpp: + Encoder: Fix the backslash used in an #include. 
+ + The lookup fails in Linux platform, while VC works with both + backslash and forwardslash + [3f5f4347ca3c] + + * build/linux/make-Makefiles.bash: + Build: cmake bash script for Linux 64bit platform. + [1571b62368bf] + +2013-03-22 Steve Borho + + * source/CMakeLists.txt: + cmake: fix a copy-pasted comment + [b6d723716d3a] + + * source/CMakeLists.txt, source/VectorClass/CMakeLists.txt, + source/encoder/CMakeLists.txt: + cmake: merge VectorClass and all vectorization knowledge into + encoder/ + [42d9ce7dd5b6] + + * source/x265cfg.cpp: + fixup a harmless mistake made when renaming files + [2801e45e10fc] + +2013-03-23 ShinYee Chung + + * source/Lib/CMakeLists.txt: + Build: Fix compile errors in HM codes due to compiler warning on + unused variables. + [48b1fa53a585] + +2013-03-22 Steve Borho + + * source/CMakeLists.txt: + cmake: for gcc builds, select Release build type by default + [504de19e1c0c] + + * source/App/README.txt, source/App/TAppEncoder/CMakeLists.txt, + source/App/TAppEncoder/TAppEncCfg.cpp, + source/App/TAppEncoder/TAppEncCfg.h, + source/App/TAppEncoder/TAppEncTop.cpp, + source/App/TAppEncoder/TAppEncTop.h, + source/App/TAppEncoder/encmain.cpp, source/CMakeLists.txt, + source/x265cfg.cpp, source/x265cfg.h, source/x265main.cpp, + source/x265top.cpp, source/x265top.h: + cmake: remove the APP folder, move encoder main files to source/ + folder + + The CLI portion of the project is now a build option, defaulting to + ON. 
The build target is now named x265-cli, and it will be generated + in the build root Debug/ or Release/ folder + [b4939353bb2f] + + * source/App/TAppEncoder/CMakeLists.txt, source/CMakeLists.txt, + source/Lib/CMakeLists.txt, source/Lib/TLibCommon/CMakeLists.txt, + source/Lib/TLibEncoder/CMakeLists.txt: + cmake: group all HM code into one library + [dee196e7dd7f] + + * build/vc10-x86/build-all.bat, build/vc10-x86_64/build-all.bat, + build/vc11-x86/build-all.bat, build/vc11-x86_64/build-all.bat, + build/vc9-x86/build-all.bat, build/vc9-x86_64/build-all.bat: + cmake: add build-all batch files for each compiler target + + This auto-detects the VS install location and skips impossible + compiles + [ea00c6454cd6] + + * build/msys/make-Makefiles.sh, build/vc10-x86/make-solutions.bat, + build/vc10-x86_64/make-solutions.bat, build/vc11-x86/make- + solutions.bat, build/vc11-x86_64/make-solutions.bat, build/vc9-x86 + /make-solutions.bat, build/vc9-x86_64/make-solutions.bat: + cmake: add build folders for each compiler target and batch files to + open GUI + + This makes it easy to configure the build options for each target. + The number of build options is about to grow. 
+ [f8838d1b8aeb] + + * source/Lib/TLibCommon/TComSlice.cpp: + remove an include of a decoder header from the common lib + [7308255a8774] + + * source/App/TAppDecoder/CMakeLists.txt, + source/App/TAppDecoder/TAppDecCfg.cpp, + source/App/TAppDecoder/TAppDecCfg.h, + source/App/TAppDecoder/TAppDecTop.cpp, + source/App/TAppDecoder/TAppDecTop.h, + source/App/TAppDecoder/decmain.cpp, source/CMakeLists.txt, + source/Lib/CMakeLists.txt, source/Lib/TLibDecoder/AnnexBread.cpp, + source/Lib/TLibDecoder/AnnexBread.h, + source/Lib/TLibDecoder/CMakeLists.txt, + source/Lib/TLibDecoder/NALread.cpp, + source/Lib/TLibDecoder/NALread.h, + source/Lib/TLibDecoder/SEIread.cpp, + source/Lib/TLibDecoder/SEIread.h, + source/Lib/TLibDecoder/SyntaxElementParser.cpp, + source/Lib/TLibDecoder/SyntaxElementParser.h, + source/Lib/TLibDecoder/TDecBinCoder.h, + source/Lib/TLibDecoder/TDecBinCoderCABAC.cpp, + source/Lib/TLibDecoder/TDecBinCoderCABAC.h, + source/Lib/TLibDecoder/TDecCAVLC.cpp, + source/Lib/TLibDecoder/TDecCAVLC.h, + source/Lib/TLibDecoder/TDecCu.cpp, source/Lib/TLibDecoder/TDecCu.h, + source/Lib/TLibDecoder/TDecEntropy.cpp, + source/Lib/TLibDecoder/TDecEntropy.h, + source/Lib/TLibDecoder/TDecGop.cpp, + source/Lib/TLibDecoder/TDecGop.h, + source/Lib/TLibDecoder/TDecSbac.cpp, + source/Lib/TLibDecoder/TDecSbac.h, + source/Lib/TLibDecoder/TDecSlice.cpp, + source/Lib/TLibDecoder/TDecSlice.h, + source/Lib/TLibDecoder/TDecTop.cpp, + source/Lib/TLibDecoder/TDecTop.h: + Drop the HM decoder project; we should use HM project directly for + decodes + + We should be finding other decoders to test with as well + [4e8dec8e4636] + + * source/encoder/threadpool.h, source/test/testpool.cpp: + threadpool: make pool destructor protected to force the use of + Release() + [e43c28659b53] + + * source/CMakeLists.txt, source/test/CMakeLists.txt, + source/test/testpool.cpp: + add unit test for thread pool, incomplete + [97e90fcd77fe] + + * source/App/TAppEncoder/TAppEncCfg.cpp: + ignore warnings in Y4M header 
reader imported from VPL + [aa2829f682fb] + +2013-03-21 Steve Borho + + * source/App/TAppEncoder/TAppEncCfg.cpp: + fix white-space damage in TAppEncCfg.cpp + [cca6a7ba5f3a] + + * source/App/TAppEncoder/TAppEncCfg.cpp: + Fix eoln damage in TAppEncCfg.cpp + [50261b240150] + + * source/Lib/TLibCommon/TComMv.h: + fix another compiler warning in the HM + [8a2d14661327] + + * source/Lib/TLibCommon/TComSlice.h: + fix a warning in the HM header + [1942f88a267c] + + * source/App/TAppEncoder/CMakeLists.txt: + cmake: encoder app now links against x265 library (source/encoder) + [d5de005561cf] + + * source/App/TAppEncoder/TAppEncCfg.cpp: + encoder: gcc requires math.h for ceil() + [d6d1e2f2f535] + + * source/App/TAppDecoder/CMakeLists.txt, + source/App/TAppEncoder/CMakeLists.txt, source/Lib/CMakeLists.txt: + cmake: rename md5 lib so it is not compiled as liblibmd5 + [f45eb3a34f5a] + + * source/CMakeLists.txt, source/Lib/TLibCommon/CMakeLists.txt, + source/encoder/CMakeLists.txt: + cmake: move TComRdCost_SSE build to encoder/CMakeLists.txt + [437850ed2225] + +2013-03-21 ggopu + + * build/ConfigureSolution.bash: + Noo Need for this Script + [21f0f70a76a6] + + * build/ConfigureSolution.bat: + No Need for this Script + [f780b517b7b8] + + * build/ConfigureBuild.bat: + Modified the Script for Build Mode + [f2ce5bd828bf] + +2013-03-21 Deepthi + + * Merge + [dd3d01a52f4c] + + * build/ConfigureBuild.bat: + Updates to build script + [ee39c9715956] + + * source/encoder/TComRdCost_SSE.cpp: + Removing redundant path + [c23b0c5f44b8] + + * source/Lib/TLibCommon/CMakeLists.txt: + Adding include directories and vector build options. + [e788257c2b47] + + * source/VectorClass/CMakeLists.txt: + Changing all vector builds to SSE2 for now. 
+ [1ff8a33f3356] + + * source/Lib/TLibCommon/CMakeLists.txt: + Forward slash error in Cmake script + [6d19c8cf40d2] + + * build/buildSolution.bash: + Removing initial build script + [fa010f66f667] + +2013-03-21 sumalatha + + * source/App/TAppEncoder/TAppEncCfg.cpp: + Changed the code for supporting 1. YUV files with per sequence cfg + files 2. y4m files with no per seqence cfg files for y4m files, + width, height and frame rate are obtained by parsing the y4m file + [d825fcd6fcf0] + +2013-03-21 nandaku2 + + * Merged in ggopu/xhevc_ggopu (pull request #6) + + Build Scripts + [36be38d95a0e] + +2013-03-20 ggopu + + * build/ConfigureSolution.bat: + Script is for To Build only Visual Stdio Solutions for All Visual + Studio Compilers or any Specific Visual Studio Compiler + [1a26d0d759f3] + + * build/ConfigureSolution.bash: + Script is for To Build Only Solutions for All MinGW Compilers or Any + Specific MinGW Compilers + [cdb508562f50] + + * build/ConfigureBuild.bat: + Script is for To Build Bin and Solutions for All the Visual Studio + Compilers or any Specific Visual Studio Compiler + [924204a92ab3] + + * build/ConfigureBuild.bash: + Script is used to Configure the Solution and Build the Bin for all + MinGW Compilers - MingW, UNIX and MSYS + [cfc72567b7d1] + +2013-03-21 Steve Borho + + * source/encoder/threadpool.cpp: + add MACOS cpu number detection + + This doesn't imply we care about MacOS, only that I found code for + it online + [22b6e4c7dbef] + +2013-03-21 nandaku2 + + * Merged in mandarmcw/xhevc_mandar (pull request #5) + + Functions vectorized stage 1 + [cfccc2255421] + +2013-03-20 mandarmcw + + * source/Lib/TLibCommon/CMakeLists.txt, + source/Lib/TLibCommon/TComRdCost.cpp, + source/Lib/TLibCommon/TypeDef.h, source/encoder/TComRdCost_SSE.cpp: + Functions vectorized stage 1 : TcomRdCost::xGetSAD* (Replace * with + the numbers - 8, 16, 32, 64, 12, 24) Changes:- + * Defined a macro ENABLE_VECTOR for enabling the vectorized function + definations. 
+ * Addedd file source/encoder/TComRdCost_SSE.cpp to Cmakelists file + [e5ab4f0e46f0] + +2013-03-19 Mandar Gurav + + * cfg/encoder_intra_main.cfg, cfg/encoder_intra_main10.cfg, + cfg/encoder_lowdelay_P_main.cfg, cfg/encoder_lowdelay_P_main10.cfg, + cfg/encoder_lowdelay_main.cfg, cfg/encoder_lowdelay_main10.cfg, + cfg/encoder_randomaccess_main.cfg, + cfg/encoder_randomaccess_main10.cfg: + Merged multicoreware/xhevc into default + [552584fc09d7] + + * Merged multicoreware/xhevc into default + [93fa9a5208a3] + +2013-03-19 mandarmcw + + * source/encoder/TComRdCost_SSE.cpp: + Updated all file names with *_SSE + [fd7954b7ccb3] + + * source/encoder/TComRdCost_SSE.cpp: + New TComRdCost.cpp file with Vectorization. + [7547355fbf79] + + * source/Lib/TLibCommon/TComRdCost.cpp: + Merge + [831a802091f2] + + * source/Lib/TLibCommon/TComRdCost.cpp: + Discard changes + [2e2e69ec398c] + + * source/Lib/TLibCommon/TComRdCost.cpp: + Discard changes. Revert back to original source + [30854b3469d4] + +2013-03-19 Mandar Gurav + + * Merged multicoreware/xhevc/default (954436fbb3df) into default + [605590cb93f4] + +2013-03-19 mandarmcw + + * source/Lib/TLibCommon/TComRdCost.cpp: + Functions vectorized stage 1 : TcomRdCost::xGetSAD* (Replace * with + the numbers - 8, 16, 32, 64, 12, 24) + [95ba23e9aec0] + +2013-03-20 Steve Borho + + * source/encoder/threadpool.cpp: + threadpool: add CPU core count detection, remove need for friend + classes + [ba0dcc65b645] + + * build/buildSolution.bash: + switch build/buildSolution.bash to unix EOLN, no behavior change + [03a24f5c924e] + + * source/encoder/threadpool.cpp, source/encoder/threadpool.h: + threadpool: add worker thread implementation, non-blocking idle list + [e69134e545db] + + * source/App/README.txt, source/Lib/README.txt, + source/VectorClass/README.txt: + add README files describing where various code originated + [d627780e1c0d] + + * source/VectorClass/CMakeLists.txt: + cmake: update name of VectorClass lib + [64d3463a7437] + + * 
.hgignore: + ignore Mercurial merge/patch remnants + [74246322f5d6] + + * source/CMakeLists.txt, source/Lib/CMakeLists.txt, + source/Lib/TVectorClass/CMakeLists.txt, + source/Lib/TVectorClass/dispatch_example.cpp, + source/Lib/TVectorClass/instrset.h, + source/Lib/TVectorClass/instrset_detect.cpp, + source/Lib/TVectorClass/special/complexvec.h, + source/Lib/TVectorClass/special/decimal.h, + source/Lib/TVectorClass/special/quaternion.h, + source/Lib/TVectorClass/special/vector3d.h, + source/Lib/TVectorClass/special/vectormath.h, + source/Lib/TVectorClass/vectorclass.h, + source/Lib/TVectorClass/vectorf128.h, + source/Lib/TVectorClass/vectorf256.h, + source/Lib/TVectorClass/vectorf256e.h, + source/Lib/TVectorClass/vectori128.h, + source/Lib/TVectorClass/vectori256.h, + source/Lib/TVectorClass/vectori256e.h, + source/VectorClass/CMakeLists.txt, + source/VectorClass/dispatch_example.cpp, + source/VectorClass/instrset.h, + source/VectorClass/instrset_detect.cpp, + source/VectorClass/special/complexvec.h, + source/VectorClass/special/decimal.h, + source/VectorClass/special/quaternion.h, + source/VectorClass/special/vector3d.h, + source/VectorClass/special/vectormath.h, + source/VectorClass/vectorclass.h, source/VectorClass/vectorf128.h, + source/VectorClass/vectorf256.h, source/VectorClass/vectorf256e.h, + source/VectorClass/vectori128.h, source/VectorClass/vectori256.h, + source/VectorClass/vectori256e.h: + move TVectorClass folder from Lib/ folder to source/VectorClass + [d1f2dcfd8c66] + + * source/CMakeLists.txt, source/Lib/CMakeLists.txt, + source/Lib/PPA/CMakeLists.txt, source/Lib/PPA/ppa.cpp, + source/Lib/PPA/ppa.h, source/Lib/PPA/ppaApi.h, + source/Lib/PPA/ppaCPUEvents.h, source/PPA/CMakeLists.txt, + source/PPA/ppa.cpp, source/PPA/ppa.h, source/PPA/ppaApi.h, + source/PPA/ppaCPUEvents.h: + move PPA from Lib/ folder + [7aa3258f57d8] + +2013-03-20 sumalatha + + * Merge + [e9f96ed95f8a] + + * cfg/encoder_I_15P.cfg, cfg/encoder_all_I.cfg: + included few configurations 
like deblock filter, quantisation, + motion search etc with default value in both cfg file + [99cfa30c5630] + +2013-03-20 Sumalatha Polureddy + + * Merged multicoreware/xhevc into default + [ca7efc3bd26f] + + * Merged multicoreware/xhevc into default + [364d82f2fb06] + +2013-03-19 Sumalatha Polureddy + + * Merged multicoreware/xhevc into default + [1d6ff1eaba4e] + +2013-03-19 MCW + + * cfg/encoder_intra_main.cfg, cfg/encoder_intra_main10.cfg, + cfg/encoder_lowdelay_P_main10.cfg, cfg/encoder_lowdelay_main.cfg, + cfg/encoder_lowdelay_main10.cfg, cfg/encoder_randomaccess_main.cfg, + cfg/encoder_randomaccess_main10.cfg: + Deleting redundant cfg files + [24be06d25080] + + * Merge + [a1108949be35] + + * cfg/encoder_lowdelay_P_main.cfg: + Removing encoder cfg file + [28407cd95bdc] + + * cfg/encoder_I_15P.cfg, cfg/encoder_all_I.cfg: + included the two config files for testing. "encoder_all_I.cfg" + encodes all frames as I frames. "encoder_I_15P.cfg" encodes "I" + frame followed by 15 "P" frames. This pattern is followed for entire + sequence. 
+ [124fe50587b5] + +2013-03-19 Sumalatha Polureddy + + * Merged multicoreware/xhevc/default (954436fbb3df) into default + [6f5679aa1420] + +2013-03-20 nandaku2 + + * Merged in ggopu/xhevc_ggopu (pull request #2) + + New Script for To build the Windows Solution / Make files for all + the VS and MSYS Compilers + [c6c9c4f150e1] + +2013-03-19 ggopu + + * build/buildSolution.bash: + New Script for To build the Windows Solution / Make files for all + the VS and MSYS Compilers + [83904e69121d] + +2013-03-19 Steve Borho + + * source/encoder/CMakeLists.txt, source/encoder/threadpool.cpp, + source/encoder/threadpool.h: + partially completed thread pool + + needs PoolThread::ThreadMain(), ThreadPoolImpl::PokeIdleThread(), + and a shutdown/flush mechanism for the pool + [f9f791c32093] + +2013-03-19 Deepthi + + * Merge + [e70117d0f3d5] + +2013-03-18 Deepthi + + * source/Lib/TLibCommon/TComRdCost.h: + Backed out changeset: db7ddb189d7d + [954436fbb3df] + + * source/Lib/TLibCommon/TComRdCost.h: + Test commit + [db7ddb189d7d] + +2013-03-19 Steve Borho + + * source/encoder/threadpool.h: + threadpool: stub in interfaces for the thread pool and job providers + [c5d831ed2ce0] + +2013-03-18 Steve Borho + + * source/encoder/threading.h: + threading: remove copy assignment operator body + + I want a compile error generated if it is attempted + [80a0e976c24e] + + * source/encoder/CMakeLists.txt: + cmake: remove redundant lib prefix + + This was building liblibx265 + [bcabd73ba3f9] + + * source/encoder/threading.cpp, source/encoder/threading.h: + threading: fix compile problems with gcc on Linux + [23e6db8e39d2] + + * source/App/TAppEncoder/TAppEncCfg.cpp, + source/App/TAppEncoder/TAppEncCfg.h, + source/App/TAppEncoder/TAppEncTop.cpp, + source/Lib/TLibCommon/SEI.cpp, source/Lib/TLibCommon/SEI.h, + source/Lib/TLibCommon/TComBitStream.cpp, + source/Lib/TLibCommon/TComBitStream.h, + source/Lib/TLibCommon/TypeDef.h, source/Lib/TLibDecoder/SEIread.cpp, + source/Lib/TLibDecoder/SEIread.h, + 
source/Lib/TLibEncoder/SEIwrite.cpp, + source/Lib/TLibEncoder/SEIwrite.h, source/Lib/TLibEncoder/TEncCfg.h, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.h: + take tip HM 10.0-dev changes (subversion HM-10.0-dev@3381) + [b1d8625402d8] + + * source/CMakeLists.txt, source/encoder/CMakeLists.txt, + source/encoder/threading.cpp, source/encoder/threading.h: + introduce encoder folder and libx265 library project with threading + classes + [24d5964cfb3b] + + * source/Lib/TVectorClass/CMakeLists.txt: + cmake: fix comment about /arch:AVX flag, it is only necessary for + VC11 + [4a68b817dea5] + + * source/Lib/TVectorClass/CMakeLists.txt: + cmake: white-space nits + [097f1ac9a970] + + * source/Lib/CMakeLists.txt, source/Lib/TVectorClass/CMakeLists.txt: + cmake: isolate /arch:AVX compile flag to code which includes vector + libs + + When the HM is compiled with /arch:AVX globally, it causes a runtime + exception when run in release mode but not in debug mode. Gah! 
+ [25fc265da117] + + * source/Lib/TVectorClass/CMakeLists.txt: + cmake: no need to build dispatch_example.cpp + [0b93848cb7cc] + +2013-03-16 Steve Borho + + * source/App/TAppDecoder/CMakeLists.txt, + source/App/TAppEncoder/CMakeLists.txt, source/CMakeLists.txt, + source/Lib/CMakeLists.txt: + cmake: enable full GCC warnings globally, disable for HM source + where required + [23d5ac432957] + + * build/README.txt: + document MSYS build instructions + [e76417ca32c7] + + * source/Lib/CMakeLists.txt: + cmake: MSYS gcc requires -msse4 and does not support AVX2 intrinsics + (refs #1) + [94aef6522ed0] + + * source/Lib/CMakeLists.txt: + cmake: VC9 can handle only SSE4.2 intrinsics (closes #1) + + gcc was tested under CentOS 6.3 and it compiled fine as-is + [66263d046d28] + +2013-03-15 Steve Borho + + * source/Lib/CMakeLists.txt: + cmake: VC11 does support AVX2 intrinsics, but lib does not compile + (refs #1) + + The library does not compile cleanly when AVX2 is enabled, one + warning about an implicit int to bool conversion (easily ignored) + and then an error that a formal parameter cannot be aligned as + requested, which is much more serious. So I'm leaving AVX2 disabled + for VC11 + + /arch:AVX2 was not valid, but /arch:AVX seemed to enable AVX2 + intrinsics. + [b541bd207ee8] + + * source/Lib/CMakeLists.txt, source/Lib/TVectorClass/vectori128.h: + cmake: add INSTRSET definitions for various VC versions (refs #1) + + I'm guessing that VC9 supports AVX and VC11 supports AVX2. I'll have + to verify both of them. A similar section must be added for gcc. 
+ [70b96b111797] + +2013-03-15 Deepthi + + * source/Lib/CMakeLists.txt: + Merge + [d465f7915365] + + * source/Lib/CMakeLists.txt, source/Lib/TVectorClass/CMakeLists.txt: + Adding Cmake build scripts for VectorClass + [b948ddbdb645] + + * source/Lib/TVectorClass/dispatch_example.cpp, + source/Lib/TVectorClass/instrset.h, + source/Lib/TVectorClass/instrset_detect.cpp, + source/Lib/TVectorClass/special/complexvec.h, + source/Lib/TVectorClass/special/decimal.h, + source/Lib/TVectorClass/special/quaternion.h, + source/Lib/TVectorClass/special/vector3d.h, + source/Lib/TVectorClass/special/vectormath.h, + source/Lib/TVectorClass/vectorclass.h, + source/Lib/TVectorClass/vectorf128.h, + source/Lib/TVectorClass/vectorf256.h, + source/Lib/TVectorClass/vectorf256e.h, + source/Lib/TVectorClass/vectori128.h, + source/Lib/TVectorClass/vectori256.h, + source/Lib/TVectorClass/vectori256e.h: + Adding GPL vectorclass source files + [ccf65d86f57b] + +2013-03-15 ShinYee Chung + + * source/CMakeLists.txt: + Build: Update GCC compilations with more warning options. + [aa8994f58577] + +2013-03-14 Steve Borho + + * source/App/TAppDecoder/CMakeLists.txt, + source/App/TAppEncoder/CMakeLists.txt, source/CMakeLists.txt, + source/Lib/CMakeLists.txt, source/Lib/TLibCommon/CMakeLists.txt, + source/Lib/TLibDecoder/CMakeLists.txt, + source/Lib/TLibEncoder/CMakeLists.txt: + move warning disablings as close as possible to the source that + needs them + + I want newly added files and folders to have full warnings by + default + [ae7aa0c6b26a] + + * source/App/TAppDecoder/TAppDecTop.cpp, source/CMakeLists.txt, + source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibDecoder/TDecCAVLC.cpp, + source/Lib/TLibEncoder/TEncSearch.cpp: + enable maximal warnings (and warnings as errors) on Windows + + Manually exclude each warning emitted by the HM code that we chose + to ignore, fix one warning to avoid ignoring all warnings of that + type in the App folder. 
Explicitly disable some dangerous warnings + directly in the files which emit them, so they can remain enabled + globally to catch bugs as they are created. + [053177b5e32c] + +2013-03-11 Steve Borho + + * source/Lib/PPA/ppa.cpp: + ppa: use API bindings from public Linux PPA package + + On Linux the PPA shared library is quiet on success (yeah!) but this + makes it a challenge to debug shared library path problems. So I + added some logging. + [cf22c53c7108] + +2013-03-07 Steve Borho + + * source/App/TAppDecoder/CMakeLists.txt, + source/App/TAppEncoder/CMakeLists.txt: + ppa: -ldl is necessary for UNIX systems; not necessarily gcc + + For instance, if building on MinGW gcc on Windows one does not need + -ldl since PPA will still not use the dlsym library loader + [07ec03019bda] + + * source/App/TAppEncoder/TAppEncTop.cpp, + source/Lib/PPA/ppaCPUEvents.h: + ppa: add some very high level CPU events in the encoder + [c1b56eec0474] + + * source/App/TAppDecoder/CMakeLists.txt, + source/App/TAppDecoder/decmain.cpp, + source/App/TAppEncoder/CMakeLists.txt, + source/App/TAppEncoder/encmain.cpp, source/CMakeLists.txt, + source/Lib/CMakeLists.txt, source/Lib/PPA/CMakeLists.txt, + source/Lib/PPA/ppa.cpp, source/Lib/PPA/ppa.h, + source/Lib/PPA/ppaApi.h, source/Lib/PPA/ppaCPUEvents.h: + introduce PPA as cmake build option + [3117ce6b9688] + + * source/App/TAppDecoder/CMakeLists.txt, + source/App/TAppEncoder/CMakeLists.txt, source/CMakeLists.txt, + source/Lib/CMakeLists.txt, source/Lib/TLibCommon/CMakeLists.txt, + source/Lib/TLibDecoder/CMakeLists.txt, + source/Lib/TLibEncoder/CMakeLists.txt: + convert CMakeLists.txt to unix EOLN + [04901970ea1a] + + * source/Lib/TLibEncoder/CMakeLists.txt: + fix case sensitivity of file paths in TLibEncoder/CMakeLists.txt + [81a323ceae15] + +2013-03-06 Steve Borho + + * source/App/TAppDecoder/CMakeLists.txt, + source/App/TAppEncoder/CMakeLists.txt: + reorder library linking to help gcc ld to resolve dependencies + [f451d1272522] + + * 
source/CMakeLists.txt: + copy gcc warning flags from HM Makefiles + + Can now compile without warnings on MSYS on Windows, but cannot link + [99bb609ac6da] + + * .hgignore, COPYING, build/README.txt, cfg/encoder_intra_main.cfg, + cfg/encoder_intra_main10.cfg, cfg/encoder_lowdelay_P_main.cfg, + cfg/encoder_lowdelay_P_main10.cfg, cfg/encoder_lowdelay_main.cfg, + cfg/encoder_lowdelay_main10.cfg, cfg/encoder_randomaccess_main.cfg, + cfg/encoder_randomaccess_main10.cfg, cfg/per-sequence/BQMall.cfg, + cfg/per-sequence/BQSquare.cfg, cfg/per-sequence/BQTerrace.cfg, cfg + /per-sequence/BasketballDrill.cfg, cfg/per- + sequence/BasketballDrillText.cfg, cfg/per- + sequence/BasketballDrive.cfg, cfg/per-sequence/BasketballPass.cfg, + cfg/per-sequence/BlowingBubbles.cfg, cfg/per-sequence/Cactus.cfg, + cfg/per-sequence/ChinaSpeed.cfg, cfg/per-sequence/FourPeople.cfg, + cfg/per-sequence/Johnny.cfg, cfg/per-sequence/Kimono.cfg, cfg/per- + sequence/KristenAndSara.cfg, cfg/per- + sequence/NebutaFestival_10bit.cfg, cfg/per-sequence/ParkScene.cfg, + cfg/per-sequence/PartyScene.cfg, cfg/per- + sequence/PeopleOnStreet.cfg, cfg/per-sequence/RaceHorses.cfg, cfg + /per-sequence/RaceHorsesC.cfg, cfg/per-sequence/SlideEditing.cfg, + cfg/per-sequence/SlideShow.cfg, cfg/per- + sequence/SteamLocomotiveTrain_10bit.cfg, cfg/per- + sequence/Traffic.cfg, cfg/per-sequence/Vidyo1.cfg, cfg/per- + sequence/Vidyo3.cfg, cfg/per-sequence/Vidyo4.cfg, doc/Doxyfile, doc + /README_data-structure.ppt, doc/gop-structure-example.pdf, + doc/mainpage.h, doc/software-manual.pdf, + source/App/TAppDecoder/CMakeLists.txt, + source/App/TAppDecoder/TAppDecCfg.cpp, + source/App/TAppDecoder/TAppDecCfg.h, + source/App/TAppDecoder/TAppDecTop.cpp, + source/App/TAppDecoder/TAppDecTop.h, + source/App/TAppDecoder/decmain.cpp, + source/App/TAppEncoder/CMakeLists.txt, + source/App/TAppEncoder/TAppEncCfg.cpp, + source/App/TAppEncoder/TAppEncCfg.h, + source/App/TAppEncoder/TAppEncTop.cpp, + source/App/TAppEncoder/TAppEncTop.h, + 
source/App/TAppEncoder/encmain.cpp, source/CMakeLists.txt, + source/Lib/CMakeLists.txt, + source/Lib/TAppCommon/program_options_lite.cpp, + source/Lib/TAppCommon/program_options_lite.h, + source/Lib/TLibCommon/AccessUnit.h, + source/Lib/TLibCommon/CMakeLists.txt, + source/Lib/TLibCommon/CommonDef.h, + source/Lib/TLibCommon/ContextModel.cpp, + source/Lib/TLibCommon/ContextModel.h, + source/Lib/TLibCommon/ContextModel3DBuffer.cpp, + source/Lib/TLibCommon/ContextModel3DBuffer.h, + source/Lib/TLibCommon/ContextTables.h, source/Lib/TLibCommon/NAL.h, + source/Lib/TLibCommon/SEI.cpp, source/Lib/TLibCommon/SEI.h, + source/Lib/TLibCommon/TComBitCounter.h, + source/Lib/TLibCommon/TComBitStream.cpp, + source/Lib/TLibCommon/TComBitStream.h, + source/Lib/TLibCommon/TComCABACTables.cpp, + source/Lib/TLibCommon/TComCABACTables.h, + source/Lib/TLibCommon/TComDataCU.cpp, + source/Lib/TLibCommon/TComDataCU.h, + source/Lib/TLibCommon/TComInterpolationFilter.cpp, + source/Lib/TLibCommon/TComInterpolationFilter.h, + source/Lib/TLibCommon/TComList.h, + source/Lib/TLibCommon/TComLoopFilter.cpp, + source/Lib/TLibCommon/TComLoopFilter.h, + source/Lib/TLibCommon/TComMotionInfo.cpp, + source/Lib/TLibCommon/TComMotionInfo.h, + source/Lib/TLibCommon/TComMv.h, + source/Lib/TLibCommon/TComPattern.cpp, + source/Lib/TLibCommon/TComPattern.h, + source/Lib/TLibCommon/TComPic.cpp, source/Lib/TLibCommon/TComPic.h, + source/Lib/TLibCommon/TComPicSym.cpp, + source/Lib/TLibCommon/TComPicSym.h, + source/Lib/TLibCommon/TComPicYuv.cpp, + source/Lib/TLibCommon/TComPicYuv.h, + source/Lib/TLibCommon/TComPicYuvMD5.cpp, + source/Lib/TLibCommon/TComPrediction.cpp, + source/Lib/TLibCommon/TComPrediction.h, + source/Lib/TLibCommon/TComRdCost.cpp, + source/Lib/TLibCommon/TComRdCost.h, + source/Lib/TLibCommon/TComRdCostWeightPrediction.cpp, + source/Lib/TLibCommon/TComRdCostWeightPrediction.h, + source/Lib/TLibCommon/TComRom.cpp, source/Lib/TLibCommon/TComRom.h, + source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp, + 
source/Lib/TLibCommon/TComSampleAdaptiveOffset.h, + source/Lib/TLibCommon/TComSlice.cpp, + source/Lib/TLibCommon/TComSlice.h, + source/Lib/TLibCommon/TComTrQuant.cpp, + source/Lib/TLibCommon/TComTrQuant.h, + source/Lib/TLibCommon/TComWeightPrediction.cpp, + source/Lib/TLibCommon/TComWeightPrediction.h, + source/Lib/TLibCommon/TComYuv.cpp, source/Lib/TLibCommon/TComYuv.h, + source/Lib/TLibCommon/TypeDef.h, + source/Lib/TLibDecoder/AnnexBread.cpp, + source/Lib/TLibDecoder/AnnexBread.h, + source/Lib/TLibDecoder/CMakeLists.txt, + source/Lib/TLibDecoder/NALread.cpp, + source/Lib/TLibDecoder/NALread.h, + source/Lib/TLibDecoder/SEIread.cpp, + source/Lib/TLibDecoder/SEIread.h, + source/Lib/TLibDecoder/SyntaxElementParser.cpp, + source/Lib/TLibDecoder/SyntaxElementParser.h, + source/Lib/TLibDecoder/TDecBinCoder.h, + source/Lib/TLibDecoder/TDecBinCoderCABAC.cpp, + source/Lib/TLibDecoder/TDecBinCoderCABAC.h, + source/Lib/TLibDecoder/TDecCAVLC.cpp, + source/Lib/TLibDecoder/TDecCAVLC.h, + source/Lib/TLibDecoder/TDecCu.cpp, source/Lib/TLibDecoder/TDecCu.h, + source/Lib/TLibDecoder/TDecEntropy.cpp, + source/Lib/TLibDecoder/TDecEntropy.h, + source/Lib/TLibDecoder/TDecGop.cpp, + source/Lib/TLibDecoder/TDecGop.h, + source/Lib/TLibDecoder/TDecSbac.cpp, + source/Lib/TLibDecoder/TDecSbac.h, + source/Lib/TLibDecoder/TDecSlice.cpp, + source/Lib/TLibDecoder/TDecSlice.h, + source/Lib/TLibDecoder/TDecTop.cpp, + source/Lib/TLibDecoder/TDecTop.h, + source/Lib/TLibEncoder/AnnexBwrite.h, + source/Lib/TLibEncoder/CMakeLists.txt, + source/Lib/TLibEncoder/NALwrite.cpp, + source/Lib/TLibEncoder/NALwrite.h, + source/Lib/TLibEncoder/SEIwrite.cpp, + source/Lib/TLibEncoder/SEIwrite.h, + source/Lib/TLibEncoder/SyntaxElementWriter.cpp, + source/Lib/TLibEncoder/SyntaxElementWriter.h, + source/Lib/TLibEncoder/TEncAnalyze.cpp, + source/Lib/TLibEncoder/TEncAnalyze.h, + source/Lib/TLibEncoder/TEncBinCoder.h, + source/Lib/TLibEncoder/TEncBinCoderCABAC.cpp, + source/Lib/TLibEncoder/TEncBinCoderCABAC.h, + 
source/Lib/TLibEncoder/TEncBinCoderCABACCounter.cpp, + source/Lib/TLibEncoder/TEncBinCoderCABACCounter.h, + source/Lib/TLibEncoder/TEncCavlc.cpp, + source/Lib/TLibEncoder/TEncCavlc.h, + source/Lib/TLibEncoder/TEncCfg.h, source/Lib/TLibEncoder/TEncCu.cpp, + source/Lib/TLibEncoder/TEncCu.h, + source/Lib/TLibEncoder/TEncEntropy.cpp, + source/Lib/TLibEncoder/TEncEntropy.h, + source/Lib/TLibEncoder/TEncGOP.cpp, + source/Lib/TLibEncoder/TEncGOP.h, + source/Lib/TLibEncoder/TEncPic.cpp, + source/Lib/TLibEncoder/TEncPic.h, + source/Lib/TLibEncoder/TEncPreanalyzer.cpp, + source/Lib/TLibEncoder/TEncPreanalyzer.h, + source/Lib/TLibEncoder/TEncRateCtrl.cpp, + source/Lib/TLibEncoder/TEncRateCtrl.h, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.cpp, + source/Lib/TLibEncoder/TEncSampleAdaptiveOffset.h, + source/Lib/TLibEncoder/TEncSbac.cpp, + source/Lib/TLibEncoder/TEncSbac.h, + source/Lib/TLibEncoder/TEncSearch.cpp, + source/Lib/TLibEncoder/TEncSearch.h, + source/Lib/TLibEncoder/TEncSlice.cpp, + source/Lib/TLibEncoder/TEncSlice.h, + source/Lib/TLibEncoder/TEncTop.cpp, + source/Lib/TLibEncoder/TEncTop.h, + source/Lib/TLibEncoder/WeightPredAnalysis.cpp, + source/Lib/TLibEncoder/WeightPredAnalysis.h, + source/Lib/TLibVideoIO/TVideoIOYuv.cpp, + source/Lib/TLibVideoIO/TVideoIOYuv.h, source/Lib/libmd5/MD5.h, + source/Lib/libmd5/libmd5.c, source/Lib/libmd5/libmd5.h, + source/compat/msvc/stdint.h: + commit JCT-VC HM source with cmake based build scripts + + HM-10.0-dev@3375 with some parts trimmed and some trivial folder + reorgs + + https://hevc.hhi.fraunhofer.de/svn/svn_HEVCSoftware/ + [09fe40627f03] + diff --git a/build/README.txt b/build/README.txt new file mode 100644 index 0000000..c087349 --- /dev/null +++ b/build/README.txt @@ -0,0 +1,70 @@ += Mandatory Prerequisites = + +* GCC, MSVC (9, 10, 11, 12), Xcode or Intel C/C++ +* CMake 2.8.8 or later http://www.cmake.org +* On linux, ccmake is helpful, usually a package named cmake-curses-gui + +Note: MSVC12 requires cmake 2.8.11 
or later + + += Optional Prerequisites = + +1. Yasm 1.2.0 or later, to compile assembly primitives (performance) + + For Windows, download + http://www.tortall.net/projects/yasm/releases/yasm-1.2.0-win32.exe or + http://www.tortall.net/projects/yasm/releases/yasm-1.2.0-win64.exe + depending on your O/S and copy the EXE into C:\Windows or somewhere else + in your %PATH% that a 32-bit app (cmake) can find it. If it is not in the + path, you must manually tell cmake where to find it. + + For Linux, yasm-1.2.0 is likely too new to be packaged for your system so you + will need get http://www.tortall.net/projects/yasm/releases/yasm-1.2.0.tar.gz + compile, and install it. + + Once YASM is properly installed, run cmake to regenerate projects. If you + do not see the below line in the cmake output, YASM is not in the PATH. + + -- Found Yasm 1.2.0 to build assembly primitives + + Now build the encoder and run x265 -V. If you see "assembly" on this + line, you have YASM properly installed: + + x265 [info]: performance primitives: intrinsic assembly + +2. VisualLeakDetector (Windows Only) + + Download from https://vld.codeplex.com/releases and install. May need + to re-login in order for it to be in your %PATH%. Cmake will find it + and enable leak detection in debug builds without any additional work. + + If VisualLeakDetector is not installed, cmake will complain a bit, but + it is completely harmless. + + += Build Instructions Linux = + +1. Use cmake to generate Makefiles: cmake ../source +2. Build x265: make + + Or use our shell script which runs cmake then opens the curses GUI to + configure build options + +1. cd build/linux ; ./make-Makefiles.bash +2. make + + += Build Instructions Windows = + +We recommend you use one of the make-solutions.bat files in the appropriate +build/ sub-folder for your preferred compiler. They will open the cmake-gui +to configure build options, click configure until no more red options remain, +then click generate and exit. 
There should now be an x265.sln file in the +same folder, open this in Visual Studio and build it. + += Version number considerations = + +Note that cmake will update X265_VERSION each time cmake runs, if you are +building out of a Mercurial source repository. If you are building out of +a release source package, the version will not change. If Mercurial is not +found, the version will be "unknown". diff --git a/build/icl32/build-all.bat b/build/icl32/build-all.bat new file mode 100644 index 0000000..cbe9a59 --- /dev/null +++ b/build/icl32/build-all.bat @@ -0,0 +1,14 @@ +@echo off +if not "%ICPP_COMPILER13%" == "" ( set ICL="%ICPP_COMPILER13" ) +if not "%ICPP_COMPILER14%" == "" ( set ICL="%ICPP_COMPILER14" ) +if "%ICL%" == "" ( + msg "%username%" "Intel C++ 2013 not detected" + exit 1 +) +if not exist Makefile ( + call make-makefile.bat +) +if exist Makefile ( + call "%ICL%\bin\compilervars.bat" ia32 + nmake +) diff --git a/build/icl32/make-makefile.bat b/build/icl32/make-makefile.bat new file mode 100644 index 0000000..799344e --- /dev/null +++ b/build/icl32/make-makefile.bat @@ -0,0 +1,15 @@ +@echo off +:: +:: run this batch file to create an Intel C++ 2013 NMake makefile for this project. 
+:: See the cmake documentation for other generator targets +:: +if not "%ICPP_COMPILER13%" == "" ( set ICL="%ICPP_COMPILER13" ) +if not "%ICPP_COMPILER14%" == "" ( set ICL="%ICPP_COMPILER14" ) +if "%ICL%" == "" ( + msg "%username%" "Intel C++ 2013 not detected" + exit 1 +) +call "%ICL%\bin\compilervars.bat" ia32 +set CC=icl +set CXX=icl +cmake -G "NMake Makefiles" ..\..\source && cmake-gui ..\..\source diff --git a/build/icl64/build-all.bat b/build/icl64/build-all.bat new file mode 100644 index 0000000..d1d6b8d --- /dev/null +++ b/build/icl64/build-all.bat @@ -0,0 +1,14 @@ +@echo off +if not "%ICPP_COMPILER13%" == "" ( set ICL="%ICPP_COMPILER13" ) +if not "%ICPP_COMPILER14%" == "" ( set ICL="%ICPP_COMPILER14" ) +if "%ICL%" == "" ( + msg "%username%" "Intel C++ 2013 not detected" + exit 1 +) +if not exist Makefile ( + call make-makefile.bat +) +if exist Makefile ( + call "%ICL%\bin\compilervars.bat" intel64 + nmake +) diff --git a/build/icl64/make-makefile.bat b/build/icl64/make-makefile.bat new file mode 100644 index 0000000..2d3f629 --- /dev/null +++ b/build/icl64/make-makefile.bat @@ -0,0 +1,17 @@ +@echo off +:: +:: run this batch file to create an Intel C++ 2013 NMake makefile for this project. 
+:: See the cmake documentation for other generator targets +:: +if not "%ICPP_COMPILER13%" == "" ( set ICL="%ICPP_COMPILER13" ) +if not "%ICPP_COMPILER14%" == "" ( set ICL="%ICPP_COMPILER14" ) +if "%ICL%" == "" ( + msg "%username%" "Intel C++ 2013 not detected" + pause + exit 1 +) +call "%ICL%\bin\compilervars.bat" intel64 +set CC=icl +set CXX=icl +cmake -G "NMake Makefiles" ..\..\source && cmake-gui ..\..\source +pause diff --git a/build/linux/make-Makefiles.bash b/build/linux/make-Makefiles.bash new file mode 100755 index 0000000..4315d61 --- /dev/null +++ b/build/linux/make-Makefiles.bash @@ -0,0 +1,3 @@ +#!/bin/bash +# Run this from within a bash shell +cmake -G "Unix Makefiles" ../../source && ccmake ../../source diff --git a/build/msys/make-Makefiles.sh b/build/msys/make-Makefiles.sh new file mode 100644 index 0000000..32fa3af --- /dev/null +++ b/build/msys/make-Makefiles.sh @@ -0,0 +1,3 @@ +#!/bin/sh +# Run this from within an MSYS bash shell +cmake -G "MSYS Makefiles" ../../source && cmake-gui ../../source diff --git a/build/msys/make-x86_64-w64-mingw32-Makefiles.sh b/build/msys/make-x86_64-w64-mingw32-Makefiles.sh new file mode 100644 index 0000000..d98eced --- /dev/null +++ b/build/msys/make-x86_64-w64-mingw32-Makefiles.sh @@ -0,0 +1,8 @@ +#!/bin/sh + +# This will generate a cross-compile environment, compiling an x86_64 +# Win64 target from a 32bit MinGW32 host environment. 
If your MinGW +# install is 64bit, you can use the native compiler batch file: +# make-Makefiles.sh + +cmake -G "MSYS Makefiles" -DCMAKE_TOOLCHAIN_FILE=toolchain-x86_64-w64-mingw32.cmake ../../source && cmake-gui ../../source diff --git a/build/msys/toolchain-x86_64-w64-mingw32.cmake b/build/msys/toolchain-x86_64-w64-mingw32.cmake new file mode 100644 index 0000000..a3f768b --- /dev/null +++ b/build/msys/toolchain-x86_64-w64-mingw32.cmake @@ -0,0 +1,6 @@ +SET(CMAKE_SYSTEM_NAME Windows) +SET(CMAKE_C_COMPILER x86_64-w64-mingw32-gcc) +SET(CMAKE_CXX_COMPILER x86_64-w64-mingw32-g++) +SET(CMAKE_RC_COMPILER x86_64-w64-mingw32-windres) +SET(CMAKE_RANLIB x86_64-w64-mingw32-ranlib) +SET(CMAKE_ASM_YASM_COMPILER yasm) diff --git a/build/vc10-x86/build-all.bat b/build/vc10-x86/build-all.bat new file mode 100644 index 0000000..5a906e5 --- /dev/null +++ b/build/vc10-x86/build-all.bat @@ -0,0 +1,14 @@ +@echo off +if "%VS100COMNTOOLS%" == "" ( + msg "%username%" "Visual Studio 10 not detected" + exit 1 +) +if not exist x265.sln ( + call make-solutions.bat +) +if exist x265.sln ( + call "%VS100COMNTOOLS%\..\..\VC\vcvarsall.bat" + MSBuild /property:Configuration="Release" x265.sln + MSBuild /property:Configuration="Debug" x265.sln + MSBuild /property:Configuration="RelWithDebInfo" x265.sln +) diff --git a/build/vc10-x86/make-solutions.bat b/build/vc10-x86/make-solutions.bat new file mode 100644 index 0000000..9e1bc4a --- /dev/null +++ b/build/vc10-x86/make-solutions.bat @@ -0,0 +1,6 @@ +@echo off +:: +:: run this batch file to create a Visual Studio solution file for this project. 
+:: See the cmake documentation for other generator targets +:: +cmake -G "Visual Studio 10" ..\..\source && cmake-gui ..\..\source diff --git a/build/vc10-x86_64/build-all.bat b/build/vc10-x86_64/build-all.bat new file mode 100644 index 0000000..5a906e5 --- /dev/null +++ b/build/vc10-x86_64/build-all.bat @@ -0,0 +1,14 @@ +@echo off +if "%VS100COMNTOOLS%" == "" ( + msg "%username%" "Visual Studio 10 not detected" + exit 1 +) +if not exist x265.sln ( + call make-solutions.bat +) +if exist x265.sln ( + call "%VS100COMNTOOLS%\..\..\VC\vcvarsall.bat" + MSBuild /property:Configuration="Release" x265.sln + MSBuild /property:Configuration="Debug" x265.sln + MSBuild /property:Configuration="RelWithDebInfo" x265.sln +) diff --git a/build/vc10-x86_64/make-solutions.bat b/build/vc10-x86_64/make-solutions.bat new file mode 100644 index 0000000..33f431c --- /dev/null +++ b/build/vc10-x86_64/make-solutions.bat @@ -0,0 +1,6 @@ +@echo off +:: +:: run this batch file to create a Visual Studio solution file for this project. 
+:: See the cmake documentation for other generator targets +:: +cmake -G "Visual Studio 10 Win64" ..\..\source && cmake-gui ..\..\source diff --git a/build/vc11-x86/build-all.bat b/build/vc11-x86/build-all.bat new file mode 100644 index 0000000..4445c03 --- /dev/null +++ b/build/vc11-x86/build-all.bat @@ -0,0 +1,14 @@ +@echo off +if "%VS110COMNTOOLS%" == "" ( + msg "%username%" "Visual Studio 11 not detected" + exit 1 +) +if not exist x265.sln ( + call make-solutions.bat +) +if exist x265.sln ( + call "%VS110COMNTOOLS%\..\..\VC\vcvarsall.bat" + MSBuild /property:Configuration="Release" x265.sln + MSBuild /property:Configuration="Debug" x265.sln + MSBuild /property:Configuration="RelWithDebInfo" x265.sln +) diff --git a/build/vc11-x86/make-solutions.bat b/build/vc11-x86/make-solutions.bat new file mode 100644 index 0000000..11da4ce --- /dev/null +++ b/build/vc11-x86/make-solutions.bat @@ -0,0 +1,6 @@ +@echo off +:: +:: run this batch file to create a Visual Studio solution file for this project. 
+:: See the cmake documentation for other generator targets +:: +cmake -G "Visual Studio 11" ..\..\source && cmake-gui ..\..\source diff --git a/build/vc11-x86_64/build-all.bat b/build/vc11-x86_64/build-all.bat new file mode 100644 index 0000000..4445c03 --- /dev/null +++ b/build/vc11-x86_64/build-all.bat @@ -0,0 +1,14 @@ +@echo off +if "%VS110COMNTOOLS%" == "" ( + msg "%username%" "Visual Studio 11 not detected" + exit 1 +) +if not exist x265.sln ( + call make-solutions.bat +) +if exist x265.sln ( + call "%VS110COMNTOOLS%\..\..\VC\vcvarsall.bat" + MSBuild /property:Configuration="Release" x265.sln + MSBuild /property:Configuration="Debug" x265.sln + MSBuild /property:Configuration="RelWithDebInfo" x265.sln +) diff --git a/build/vc11-x86_64/make-solutions.bat b/build/vc11-x86_64/make-solutions.bat new file mode 100644 index 0000000..bb60a88 --- /dev/null +++ b/build/vc11-x86_64/make-solutions.bat @@ -0,0 +1,6 @@ +@echo off +:: +:: run this batch file to create a Visual Studio solution file for this project. 
+:: See the cmake documentation for other generator targets +:: +cmake -G "Visual Studio 11 Win64" ..\..\source && cmake-gui ..\..\source diff --git a/build/vc12-x86/build-all.bat b/build/vc12-x86/build-all.bat new file mode 100644 index 0000000..638e796 --- /dev/null +++ b/build/vc12-x86/build-all.bat @@ -0,0 +1,14 @@ +@echo off +if "%VS120COMNTOOLS%" == "" ( + msg "%username%" "Visual Studio 12 not detected" + exit 1 +) +if not exist x265.sln ( + call make-solutions.bat +) +if exist x265.sln ( + call "%VS120COMNTOOLS%\..\..\VC\vcvarsall.bat" + MSBuild /property:Configuration="Release" x265.sln + MSBuild /property:Configuration="Debug" x265.sln + MSBuild /property:Configuration="RelWithDebInfo" x265.sln +) diff --git a/build/vc12-x86/make-solutions.bat b/build/vc12-x86/make-solutions.bat new file mode 100644 index 0000000..7aa0a80 --- /dev/null +++ b/build/vc12-x86/make-solutions.bat @@ -0,0 +1,6 @@ +@echo off +:: +:: run this batch file to create a Visual Studio solution file for this project. 
+:: See the cmake documentation for other generator targets +:: +cmake -G "Visual Studio 12" ..\..\source && cmake-gui ..\..\source diff --git a/build/vc12-x86_64/build-all.bat b/build/vc12-x86_64/build-all.bat new file mode 100644 index 0000000..638e796 --- /dev/null +++ b/build/vc12-x86_64/build-all.bat @@ -0,0 +1,14 @@ +@echo off +if "%VS120COMNTOOLS%" == "" ( + msg "%username%" "Visual Studio 12 not detected" + exit 1 +) +if not exist x265.sln ( + call make-solutions.bat +) +if exist x265.sln ( + call "%VS120COMNTOOLS%\..\..\VC\vcvarsall.bat" + MSBuild /property:Configuration="Release" x265.sln + MSBuild /property:Configuration="Debug" x265.sln + MSBuild /property:Configuration="RelWithDebInfo" x265.sln +) diff --git a/build/vc12-x86_64/make-solutions.bat b/build/vc12-x86_64/make-solutions.bat new file mode 100644 index 0000000..da68e73 --- /dev/null +++ b/build/vc12-x86_64/make-solutions.bat @@ -0,0 +1,6 @@ +@echo off +:: +:: run this batch file to create a Visual Studio solution file for this project. 
+:: See the cmake documentation for other generator targets +:: +cmake -G "Visual Studio 12 Win64" ..\..\source && cmake-gui ..\..\source diff --git a/build/vc9-x86/build-all.bat b/build/vc9-x86/build-all.bat new file mode 100644 index 0000000..8af8a4e --- /dev/null +++ b/build/vc9-x86/build-all.bat @@ -0,0 +1,14 @@ +@echo off +if "%VS90COMNTOOLS%" == "" ( + msg "%username%" "Visual Studio 9 not detected" + exit 1 +) +if not exist x265.sln ( + call make-solutions.bat +) +if exist x265.sln ( + call "%VS90COMNTOOLS%\..\..\VC\vcvarsall.bat" + MSBuild /property:Configuration="Release" x265.sln + MSBuild /property:Configuration="Debug" x265.sln + MSBuild /property:Configuration="RelWithDebInfo" x265.sln +) diff --git a/build/vc9-x86/make-solutions.bat b/build/vc9-x86/make-solutions.bat new file mode 100644 index 0000000..2735c19 --- /dev/null +++ b/build/vc9-x86/make-solutions.bat @@ -0,0 +1,6 @@ +@echo off +:: +:: run this batch file to create a Visual Studio solution file for this project. 
+:: See the cmake documentation for other generator targets +:: +cmake -G "Visual Studio 9 2008" ..\..\source && cmake-gui ..\..\source diff --git a/build/vc9-x86_64/build-all.bat b/build/vc9-x86_64/build-all.bat new file mode 100644 index 0000000..8af8a4e --- /dev/null +++ b/build/vc9-x86_64/build-all.bat @@ -0,0 +1,14 @@ +@echo off +if "%VS90COMNTOOLS%" == "" ( + msg "%username%" "Visual Studio 9 not detected" + exit 1 +) +if not exist x265.sln ( + call make-solutions.bat +) +if exist x265.sln ( + call "%VS90COMNTOOLS%\..\..\VC\vcvarsall.bat" + MSBuild /property:Configuration="Release" x265.sln + MSBuild /property:Configuration="Debug" x265.sln + MSBuild /property:Configuration="RelWithDebInfo" x265.sln +) diff --git a/build/vc9-x86_64/make-solutions.bat b/build/vc9-x86_64/make-solutions.bat new file mode 100644 index 0000000..f6a7af0 --- /dev/null +++ b/build/vc9-x86_64/make-solutions.bat @@ -0,0 +1,6 @@ +@echo off +:: +:: run this batch file to create a Visual Studio solution file for this project. 
+:: See the cmake documentation for other generator targets +:: +cmake -G "Visual Studio 9 2008 Win64" ..\..\source && cmake-gui ..\..\source diff --git a/build/xcode/make-project.sh b/build/xcode/make-project.sh new file mode 100755 index 0000000..f4a4f7f --- /dev/null +++ b/build/xcode/make-project.sh @@ -0,0 +1,2 @@ +#!/bin/sh +cmake -G "Xcode" ../../source && ccmake ../../source diff --git a/doc/intra/intra-16x16.txt b/doc/intra/intra-16x16.txt new file mode 100644 index 0000000..ce4e466 --- /dev/null +++ b/doc/intra/intra-16x16.txt @@ -0,0 +1,561 @@ +--- 16x16, Mode= 2 [F]--- +[ 0]: Fact= 0: -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17, * +[ 1]: Fact= 0: -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18, * +[ 2]: Fact= 0: -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19, * +[ 3]: Fact= 0: -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20, * +[ 4]: Fact= 0: -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21, * +[ 5]: Fact= 0: -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22, * +[ 6]: Fact= 0: -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23, * +[ 7]: Fact= 0: -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24, * +[ 8]: Fact= 0: -10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25, * +[ 9]: Fact= 0: -11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26, * +[10]: Fact= 0: -12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27, * +[11]: Fact= 0: -13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28, * +[12]: Fact= 0: -14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29, * +[13]: Fact= 0: -15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30, * +[14]: Fact= 0: -16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31, * +[15]: Fact= 0: -17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +--- 16x16, Mode= 3 [F]--- +[ 0]: Fact=26: -1, -2, -3, -4, -5, -6, -7, -8, 
-9,-10,-11,-12,-13,-14,-15,-16,-17, * +[ 1]: Fact=20: -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18, * +[ 2]: Fact=14: -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19, * +[ 3]: Fact= 8: -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20, * +[ 4]: Fact= 2: -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21, * +[ 5]: Fact=28: -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21, * +[ 6]: Fact=22: -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22, * +[ 7]: Fact=16: -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23, * +[ 8]: Fact=10: -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24, * +[ 9]: Fact= 4: -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25, * +[10]: Fact=30: -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25, * +[11]: Fact=24: -10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26, * +[12]: Fact=18: -11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27, * +[13]: Fact=12: -12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28, * +[14]: Fact= 6: -13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29, * +[15]: Fact= 0: -14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29, * +--- 16x16, Mode= 4 [F]--- +[ 0]: Fact=21: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17, * +[ 1]: Fact=10: -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18, * +[ 2]: Fact=31: -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18, * +[ 3]: Fact=20: -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19, * +[ 4]: Fact= 9: -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20, * +[ 5]: Fact=30: -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20, * +[ 6]: Fact=19: -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21, * +[ 7]: Fact= 8: -6, -7, -8, 
-9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22, * +[ 8]: Fact=29: -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22, * +[ 9]: Fact=18: -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23, * +[10]: Fact= 7: -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24, * +[11]: Fact=28: -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24, * +[12]: Fact=17: -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25, * +[13]: Fact= 6: -10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26, * +[14]: Fact=27: -10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26, * +[15]: Fact=16: -11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27, * +--- 16x16, Mode= 5 [F]--- +[ 0]: Fact=17: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17, * +[ 1]: Fact= 2: -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18, * +[ 2]: Fact=19: -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18, * +[ 3]: Fact= 4: -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19, * +[ 4]: Fact=21: -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19, * +[ 5]: Fact= 6: -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20, * +[ 6]: Fact=23: -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20, * +[ 7]: Fact= 8: -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21, * +[ 8]: Fact=25: -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21, * +[ 9]: Fact=10: -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22, * +[10]: Fact=27: -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22, * +[11]: Fact=12: -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23, * +[12]: Fact=29: -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23, * +[13]: Fact=14: -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24, * +[14]: Fact=31: -8, 
-9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24, * +[15]: Fact=16: -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25, * +--- 16x16, Mode= 6 [F]--- +[ 0]: Fact=13: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17, * +[ 1]: Fact=26: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17, * +[ 2]: Fact= 7: -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18, * +[ 3]: Fact=20: -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18, * +[ 4]: Fact= 1: -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19, * +[ 5]: Fact=14: -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19, * +[ 6]: Fact=27: -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19, * +[ 7]: Fact= 8: -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20, * +[ 8]: Fact=21: -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20, * +[ 9]: Fact= 2: -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21, * +[10]: Fact=15: -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21, * +[11]: Fact=28: -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21, * +[12]: Fact= 9: -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22, * +[13]: Fact=22: -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22, * +[14]: Fact= 3: -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23, * +[15]: Fact=16: -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23, * +--- 16x16, Mode= 7 [F]--- +[ 0]: Fact= 9: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17, * +[ 1]: Fact=18: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17, * +[ 2]: Fact=27: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17, * +[ 3]: Fact= 4: -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18, * +[ 4]: Fact=13: -2, -3, -4, -5, -6, -7, -8, 
-9,-10,-11,-12,-13,-14,-15,-16,-17,-18, * +[ 5]: Fact=22: -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18, * +[ 6]: Fact=31: -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18, * +[ 7]: Fact= 8: -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19, * +[ 8]: Fact=17: -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19, * +[ 9]: Fact=26: -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19, * +[10]: Fact= 3: -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20, * +[11]: Fact=12: -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20, * +[12]: Fact=21: -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20, * +[13]: Fact=30: -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20, * +[14]: Fact= 7: -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21, * +[15]: Fact=16: -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21, * +--- 16x16, Mode= 8 [F]--- +[ 0]: Fact= 5: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17, * +[ 1]: Fact=10: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17, * +[ 2]: Fact=15: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17, * +[ 3]: Fact=20: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17, * +[ 4]: Fact=25: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17, * +[ 5]: Fact=30: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17, * +[ 6]: Fact= 3: -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18, * +[ 7]: Fact= 8: -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18, * +[ 8]: Fact=13: -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18, * +[ 9]: Fact=18: -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18, * +[10]: Fact=23: -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18, * +[11]: Fact=28: -2, -3, -4, -5, -6, 
-7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18, * +[12]: Fact= 1: -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19, * +[13]: Fact= 6: -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19, * +[14]: Fact=11: -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19, * +[15]: Fact=16: -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19, * +--- 16x16, Mode= 9 [ ]--- +[ 0]: Fact= 2: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17, * +[ 1]: Fact= 4: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17, * +[ 2]: Fact= 6: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17, * +[ 3]: Fact= 8: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17, * +[ 4]: Fact=10: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17, * +[ 5]: Fact=12: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17, * +[ 6]: Fact=14: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17, * +[ 7]: Fact=16: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17, * +[ 8]: Fact=18: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17, * +[ 9]: Fact=20: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17, * +[10]: Fact=22: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17, * +[11]: Fact=24: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17, * +[12]: Fact=26: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17, * +[13]: Fact=28: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17, * +[14]: Fact=30: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17, * +[15]: Fact= 0: -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17, * +--- 16x16, Mode=10 [ ]--- +[ 0]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16, * +[ 1]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16, * +[ 2]: Fact= 0: 
-1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16, * +[ 3]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16, * +[ 4]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16, * +[ 5]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16, * +[ 6]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16, * +[ 7]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16, * +[ 8]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16, * +[ 9]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16, * +[10]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16, * +[11]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16, * +[12]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16, * +[13]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16, * +[14]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16, * +[15]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16, * +--- 16x16, Mode=11 [ ]--- +[ 0]: Fact=30: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16, * +[ 1]: Fact=28: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16, * +[ 2]: Fact=26: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16, * +[ 3]: Fact=24: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16, * +[ 4]: Fact=22: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16, * +[ 5]: Fact=20: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16, * +[ 6]: Fact=18: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16, * +[ 7]: Fact=16: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16, * +[ 8]: Fact=14: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16, * +[ 9]: Fact=12: 0, -1, -2, -3, -4, -5, -6, -7, -8, 
-9,-10,-11,-12,-13,-14,-15,-16, * +[10]: Fact=10: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16, * +[11]: Fact= 8: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16, * +[12]: Fact= 6: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16, * +[13]: Fact= 4: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16, * +[14]: Fact= 2: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16, * +[15]: Fact= 0: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15, * +--- 16x16, Mode=12 [F]--- +[ 0]: Fact=27: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16, * +[ 1]: Fact=22: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16, * +[ 2]: Fact=17: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16, * +[ 3]: Fact=12: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16, * +[ 4]: Fact= 7: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16, * +[ 5]: Fact= 2: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16, * +[ 6]: Fact=29: 6, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15, * +[ 7]: Fact=24: 6, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15, * +[ 8]: Fact=19: 6, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15, * +[ 9]: Fact=14: 6, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15, * +[10]: Fact= 9: 6, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15, * +[11]: Fact= 4: 6, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15, * +[12]: Fact=31: 13, 6, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14, * +[13]: Fact=26: 13, 6, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14, * +[14]: Fact=21: 13, 6, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14, * +[15]: Fact=16: 13, 6, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14, * +--- 16x16, Mode=13 [F]--- +[ 0]: Fact=23: 0, -1, -2, -3, -4, -5, -6, -7, -8, 
-9,-10,-11,-12,-13,-14,-15,-16, * +[ 1]: Fact=14: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16, * +[ 2]: Fact= 5: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16, * +[ 3]: Fact=28: 4, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15, * +[ 4]: Fact=19: 4, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15, * +[ 5]: Fact=10: 4, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15, * +[ 6]: Fact= 1: 4, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15, * +[ 7]: Fact=24: 7, 4, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14, * +[ 8]: Fact=15: 7, 4, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14, * +[ 9]: Fact= 6: 7, 4, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14, * +[10]: Fact=29: 11, 7, 4, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13, * +[11]: Fact=20: 11, 7, 4, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13, * +[12]: Fact=11: 11, 7, 4, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13, * +[13]: Fact= 2: 11, 7, 4, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13, * +[14]: Fact=25: 14, 11, 7, 4, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12, * +[15]: Fact=16: 14, 11, 7, 4, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12, * +--- 16x16, Mode=14 [F]--- +[ 0]: Fact=19: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16, * +[ 1]: Fact= 6: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16, * +[ 2]: Fact=25: 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15, * +[ 3]: Fact=12: 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15, * +[ 4]: Fact=31: 5, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14, * +[ 5]: Fact=18: 5, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14, * +[ 6]: Fact= 5: 5, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14, * +[ 7]: Fact=24: 7, 5, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13, * +[ 8]: Fact=11: 
7, 5, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13, * +[ 9]: Fact=30: 10, 7, 5, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12, * +[10]: Fact=17: 10, 7, 5, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12, * +[11]: Fact= 4: 10, 7, 5, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12, * +[12]: Fact=23: 12, 10, 7, 5, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11, * +[13]: Fact=10: 12, 10, 7, 5, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11, * +[14]: Fact=29: 15, 12, 10, 7, 5, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10, * +[15]: Fact=16: 15, 12, 10, 7, 5, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10, * +--- 16x16, Mode=15 [F]--- +[ 0]: Fact=15: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16, * +[ 1]: Fact=30: 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15, * +[ 2]: Fact=13: 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15, * +[ 3]: Fact=28: 4, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14, * +[ 4]: Fact=11: 4, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14, * +[ 5]: Fact=26: 6, 4, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13, * +[ 6]: Fact= 9: 6, 4, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13, * +[ 7]: Fact=24: 8, 6, 4, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12, * +[ 8]: Fact= 7: 8, 6, 4, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12, * +[ 9]: Fact=22: 9, 8, 6, 4, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11, * +[10]: Fact= 5: 9, 8, 6, 4, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11, * +[11]: Fact=20: 11, 9, 8, 6, 4, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10, * +[12]: Fact= 3: 11, 9, 8, 6, 4, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10, * +[13]: Fact=18: 13, 11, 9, 8, 6, 4, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, * +[14]: Fact= 1: 13, 11, 9, 8, 6, 4, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, * +[15]: Fact=16: 15, 13, 11, 9, 8, 6, 4, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, * +--- 16x16, Mode=16 [F]--- 
+[ 0]: Fact=11: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16, * +[ 1]: Fact=22: 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15, * +[ 2]: Fact= 1: 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15, * +[ 3]: Fact=12: 3, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14, * +[ 4]: Fact=23: 5, 3, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13, * +[ 5]: Fact= 2: 5, 3, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13, * +[ 6]: Fact=13: 6, 5, 3, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12, * +[ 7]: Fact=24: 8, 6, 5, 3, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11, * +[ 8]: Fact= 3: 8, 6, 5, 3, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11, * +[ 9]: Fact=14: 9, 8, 6, 5, 3, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10, * +[10]: Fact=25: 11, 9, 8, 6, 5, 3, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, * +[11]: Fact= 4: 11, 9, 8, 6, 5, 3, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, * +[12]: Fact=15: 12, 11, 9, 8, 6, 5, 3, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, * +[13]: Fact=26: 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, -1, -2, -3, -4, -5, -6, -7, * +[14]: Fact= 5: 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, -1, -2, -3, -4, -5, -6, -7, * +[15]: Fact=16: 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, -1, -2, -3, -4, -5, -6, * +--- 16x16, Mode=17 [F]--- +[ 0]: Fact= 6: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16, * +[ 1]: Fact=12: 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15, * +[ 2]: Fact=18: 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14, * +[ 3]: Fact=24: 4, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13, * +[ 4]: Fact=30: 5, 4, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12, * +[ 5]: Fact= 4: 5, 4, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12, * +[ 6]: Fact=10: 6, 5, 4, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11, * +[ 7]: Fact=16: 7, 6, 5, 4, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10, * +[ 8]: Fact=22: 9, 7, 
6, 5, 4, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, * +[ 9]: Fact=28: 10, 9, 7, 6, 5, 4, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, * +[10]: Fact= 2: 10, 9, 7, 6, 5, 4, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, * +[11]: Fact= 8: 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, * +[12]: Fact=14: 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, -1, -2, -3, -4, -5, -6, * +[13]: Fact=20: 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, -1, -2, -3, -4, -5, * +[14]: Fact=26: 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, -1, -2, -3, -4, * +[15]: Fact= 0: 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, -1, -2, -3, * +--- 16x16, Mode=18 [F]--- +[ 0]: Fact= 0: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, * +[ 1]: Fact= 0: -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, * +[ 2]: Fact= 0: -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, * +[ 3]: Fact= 0: -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, * +[ 4]: Fact= 0: -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, * +[ 5]: Fact= 0: -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, * +[ 6]: Fact= 0: -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, * +[ 7]: Fact= 0: -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, * +[ 8]: Fact= 0: -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, * +[ 9]: Fact= 0: -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, * +[10]: Fact= 0: -10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, * +[11]: Fact= 0: -11,-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, * +[12]: Fact= 0: -12,-11,-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, * +[13]: Fact= 0: -13,-12,-11,-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, * +[14]: Fact= 0: -14,-13,-12,-11,-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, * +[15]: Fact= 0: -15,-14,-13,-12,-11,-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, * +--- 16x16, Mode=19 [F]--- +[ 0]: Fact= 6: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, * +[ 1]: Fact=12: -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 
14, 15, * +[ 2]: Fact=18: -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, * +[ 3]: Fact=24: -4, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, * +[ 4]: Fact=30: -5, -4, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, * +[ 5]: Fact= 4: -5, -4, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, * +[ 6]: Fact=10: -6, -5, -4, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, * +[ 7]: Fact=16: -7, -6, -5, -4, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, * +[ 8]: Fact=22: -9, -7, -6, -5, -4, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, * +[ 9]: Fact=28: -10, -9, -7, -6, -5, -4, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, * +[10]: Fact= 2: -10, -9, -7, -6, -5, -4, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, * +[11]: Fact= 8: -11,-10, -9, -7, -6, -5, -4, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, * +[12]: Fact=14: -12,-11,-10, -9, -7, -6, -5, -4, -2, -1, 0, 1, 2, 3, 4, 5, 6, * +[13]: Fact=20: -14,-12,-11,-10, -9, -7, -6, -5, -4, -2, -1, 0, 1, 2, 3, 4, 5, * +[14]: Fact=26: -15,-14,-12,-11,-10, -9, -7, -6, -5, -4, -2, -1, 0, 1, 2, 3, 4, * +[15]: Fact= 0: -15,-14,-12,-11,-10, -9, -7, -6, -5, -4, -2, -1, 0, 1, 2, 3, * +--- 16x16, Mode=20 [F]--- +[ 0]: Fact=11: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, * +[ 1]: Fact=22: -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, * +[ 2]: Fact= 1: -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, * +[ 3]: Fact=12: -3, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, * +[ 4]: Fact=23: -5, -3, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, * +[ 5]: Fact= 2: -5, -3, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, * +[ 6]: Fact=13: -6, -5, -3, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, * +[ 7]: Fact=24: -8, -6, -5, -3, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, * +[ 8]: Fact= 3: -8, -6, -5, -3, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, * +[ 9]: Fact=14: -9, -8, -6, -5, -3, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, * +[10]: Fact=25: -11, -9, -8, -6, -5, -3, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, * +[11]: Fact= 4: -11, -9, -8, -6, -5, 
-3, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, * +[12]: Fact=15: -12,-11, -9, -8, -6, -5, -3, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, * +[13]: Fact=26: -14,-12,-11, -9, -8, -6, -5, -3, -2, 0, 1, 2, 3, 4, 5, 6, 7, * +[14]: Fact= 5: -14,-12,-11, -9, -8, -6, -5, -3, -2, 0, 1, 2, 3, 4, 5, 6, 7, * +[15]: Fact=16: -15,-14,-12,-11, -9, -8, -6, -5, -3, -2, 0, 1, 2, 3, 4, 5, 6, * +--- 16x16, Mode=21 [F]--- +[ 0]: Fact=15: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, * +[ 1]: Fact=30: -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, * +[ 2]: Fact=13: -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, * +[ 3]: Fact=28: -4, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, * +[ 4]: Fact=11: -4, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, * +[ 5]: Fact=26: -6, -4, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, * +[ 6]: Fact= 9: -6, -4, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, * +[ 7]: Fact=24: -8, -6, -4, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, * +[ 8]: Fact= 7: -8, -6, -4, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, * +[ 9]: Fact=22: -9, -8, -6, -4, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, * +[10]: Fact= 5: -9, -8, -6, -4, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, * +[11]: Fact=20: -11, -9, -8, -6, -4, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, * +[12]: Fact= 3: -11, -9, -8, -6, -4, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, * +[13]: Fact=18: -13,-11, -9, -8, -6, -4, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, * +[14]: Fact= 1: -13,-11, -9, -8, -6, -4, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, * +[15]: Fact=16: -15,-13,-11, -9, -8, -6, -4, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, * +--- 16x16, Mode=22 [F]--- +[ 0]: Fact=19: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, * +[ 1]: Fact= 6: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, * +[ 2]: Fact=25: -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, * +[ 3]: Fact=12: -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, * +[ 4]: Fact=31: -5, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 
12, 13, 14, * +[ 5]: Fact=18: -5, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, * +[ 6]: Fact= 5: -5, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, * +[ 7]: Fact=24: -7, -5, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, * +[ 8]: Fact=11: -7, -5, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, * +[ 9]: Fact=30: -10, -7, -5, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, * +[10]: Fact=17: -10, -7, -5, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, * +[11]: Fact= 4: -10, -7, -5, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, * +[12]: Fact=23: -12,-10, -7, -5, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, * +[13]: Fact=10: -12,-10, -7, -5, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, * +[14]: Fact=29: -15,-12,-10, -7, -5, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, * +[15]: Fact=16: -15,-12,-10, -7, -5, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, * +--- 16x16, Mode=23 [F]--- +[ 0]: Fact=23: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, * +[ 1]: Fact=14: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, * +[ 2]: Fact= 5: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, * +[ 3]: Fact=28: -4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, * +[ 4]: Fact=19: -4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, * +[ 5]: Fact=10: -4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, * +[ 6]: Fact= 1: -4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, * +[ 7]: Fact=24: -7, -4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, * +[ 8]: Fact=15: -7, -4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, * +[ 9]: Fact= 6: -7, -4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, * +[10]: Fact=29: -11, -7, -4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, * +[11]: Fact=20: -11, -7, -4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, * +[12]: Fact=11: -11, -7, -4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, * +[13]: Fact= 2: -11, -7, -4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, * +[14]: Fact=25: -14,-11, -7, -4, 0, 1, 2, 3, 4, 
5, 6, 7, 8, 9, 10, 11, 12, * +[15]: Fact=16: -14,-11, -7, -4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, * +--- 16x16, Mode=24 [F]--- +[ 0]: Fact=27: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, * +[ 1]: Fact=22: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, * +[ 2]: Fact=17: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, * +[ 3]: Fact=12: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, * +[ 4]: Fact= 7: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, * +[ 5]: Fact= 2: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, * +[ 6]: Fact=29: -6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, * +[ 7]: Fact=24: -6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, * +[ 8]: Fact=19: -6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, * +[ 9]: Fact=14: -6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, * +[10]: Fact= 9: -6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, * +[11]: Fact= 4: -6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, * +[12]: Fact=31: -13, -6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, * +[13]: Fact=26: -13, -6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, * +[14]: Fact=21: -13, -6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, * +[15]: Fact=16: -13, -6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, * +--- 16x16, Mode=25 [ ]--- +[ 0]: Fact=30: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, * +[ 1]: Fact=28: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, * +[ 2]: Fact=26: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, * +[ 3]: Fact=24: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, * +[ 4]: Fact=22: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, * +[ 5]: Fact=20: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, * +[ 6]: Fact=18: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, * +[ 7]: Fact=16: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, * +[ 8]: 
Fact=14: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, * +[ 9]: Fact=12: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, * +[10]: Fact=10: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, * +[11]: Fact= 8: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, * +[12]: Fact= 6: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, * +[13]: Fact= 4: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, * +[14]: Fact= 2: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, * +[15]: Fact= 0: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, * +--- 16x16, Mode=26 [ ]--- +[ 0]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, * +[ 1]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, * +[ 2]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, * +[ 3]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, * +[ 4]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, * +[ 5]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, * +[ 6]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, * +[ 7]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, * +[ 8]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, * +[ 9]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, * +[10]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, * +[11]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, * +[12]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, * +[13]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, * +[14]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, * +[15]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, * +--- 16x16, Mode=27 [ ]--- +[ 0]: Fact= 2: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, * +[ 1]: Fact= 4: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, * +[ 2]: Fact= 6: 1, 2, 3, 
4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, * +[ 3]: Fact= 8: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, * +[ 4]: Fact=10: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, * +[ 5]: Fact=12: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, * +[ 6]: Fact=14: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, * +[ 7]: Fact=16: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, * +[ 8]: Fact=18: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, * +[ 9]: Fact=20: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, * +[10]: Fact=22: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, * +[11]: Fact=24: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, * +[12]: Fact=26: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, * +[13]: Fact=28: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, * +[14]: Fact=30: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, * +[15]: Fact= 0: 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, * +--- 16x16, Mode=28 [F]--- +[ 0]: Fact= 5: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, * +[ 1]: Fact=10: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, * +[ 2]: Fact=15: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, * +[ 3]: Fact=20: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, * +[ 4]: Fact=25: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, * +[ 5]: Fact=30: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, * +[ 6]: Fact= 3: 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, * +[ 7]: Fact= 8: 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, * +[ 8]: Fact=13: 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, * +[ 9]: Fact=18: 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, * +[10]: Fact=23: 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, * +[11]: Fact=28: 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 
17, 18, * +[12]: Fact= 1: 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, * +[13]: Fact= 6: 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, * +[14]: Fact=11: 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, * +[15]: Fact=16: 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, * +--- 16x16, Mode=29 [F]--- +[ 0]: Fact= 9: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, * +[ 1]: Fact=18: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, * +[ 2]: Fact=27: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, * +[ 3]: Fact= 4: 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, * +[ 4]: Fact=13: 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, * +[ 5]: Fact=22: 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, * +[ 6]: Fact=31: 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, * +[ 7]: Fact= 8: 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, * +[ 8]: Fact=17: 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, * +[ 9]: Fact=26: 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, * +[10]: Fact= 3: 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, * +[11]: Fact=12: 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, * +[12]: Fact=21: 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, * +[13]: Fact=30: 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, * +[14]: Fact= 7: 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, * +[15]: Fact=16: 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, * +--- 16x16, Mode=30 [F]--- +[ 0]: Fact=13: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, * +[ 1]: Fact=26: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, * +[ 2]: Fact= 7: 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, * +[ 3]: Fact=20: 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, * +[ 4]: Fact= 1: 3, 4, 5, 6, 7, 8, 9, 10, 11, 
12, 13, 14, 15, 16, 17, 18, 19, * +[ 5]: Fact=14: 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, * +[ 6]: Fact=27: 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, * +[ 7]: Fact= 8: 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, * +[ 8]: Fact=21: 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, * +[ 9]: Fact= 2: 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, * +[10]: Fact=15: 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, * +[11]: Fact=28: 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, * +[12]: Fact= 9: 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, * +[13]: Fact=22: 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, * +[14]: Fact= 3: 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, * +[15]: Fact=16: 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, * +--- 16x16, Mode=31 [F]--- +[ 0]: Fact=17: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, * +[ 1]: Fact= 2: 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, * +[ 2]: Fact=19: 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, * +[ 3]: Fact= 4: 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, * +[ 4]: Fact=21: 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, * +[ 5]: Fact= 6: 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, * +[ 6]: Fact=23: 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, * +[ 7]: Fact= 8: 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, * +[ 8]: Fact=25: 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, * +[ 9]: Fact=10: 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, * +[10]: Fact=27: 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, * +[11]: Fact=12: 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, * +[12]: Fact=29: 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, * 
+[13]: Fact=14: 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, * +[14]: Fact=31: 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, * +[15]: Fact=16: 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, * +--- 16x16, Mode=32 [F]--- +[ 0]: Fact=21: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, * +[ 1]: Fact=10: 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, * +[ 2]: Fact=31: 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, * +[ 3]: Fact=20: 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, * +[ 4]: Fact= 9: 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, * +[ 5]: Fact=30: 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, * +[ 6]: Fact=19: 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, * +[ 7]: Fact= 8: 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, * +[ 8]: Fact=29: 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, * +[ 9]: Fact=18: 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, * +[10]: Fact= 7: 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, * +[11]: Fact=28: 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, * +[12]: Fact=17: 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, * +[13]: Fact= 6: 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, * +[14]: Fact=27: 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, * +[15]: Fact=16: 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, * +--- 16x16, Mode=33 [F]--- +[ 0]: Fact=26: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, * +[ 1]: Fact=20: 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, * +[ 2]: Fact=14: 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, * +[ 3]: Fact= 8: 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, * +[ 4]: Fact= 2: 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 
19, 20, 21, * +[ 5]: Fact=28: 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, * +[ 6]: Fact=22: 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, * +[ 7]: Fact=16: 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, * +[ 8]: Fact=10: 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, * +[ 9]: Fact= 4: 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, * +[10]: Fact=30: 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, * +[11]: Fact=24: 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, * +[12]: Fact=18: 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, * +[13]: Fact=12: 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, * +[14]: Fact= 6: 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, * +[15]: Fact= 0: 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, * +--- 16x16, Mode=34 [F]--- +[ 0]: Fact= 0: 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, * +[ 1]: Fact= 0: 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, * +[ 2]: Fact= 0: 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, * +[ 3]: Fact= 0: 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, * +[ 4]: Fact= 0: 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, * +[ 5]: Fact= 0: 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, * +[ 6]: Fact= 0: 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, * +[ 7]: Fact= 0: 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, * +[ 8]: Fact= 0: 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, * +[ 9]: Fact= 0: 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, * +[10]: Fact= 0: 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, * +[11]: Fact= 0: 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, * +[12]: Fact= 0: 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, * 
+[13]: Fact= 0: 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, * +[14]: Fact= 0: 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, * +[15]: Fact= 0: 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, * diff --git a/doc/intra/intra-32x32.txt b/doc/intra/intra-32x32.txt new file mode 100644 index 0000000..83310f5 --- /dev/null +++ b/doc/intra/intra-32x32.txt @@ -0,0 +1,1089 @@ +--- 32x32, Mode= 2 [F]--- +[ 0]: Fact= 0: -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33, * +[ 1]: Fact= 0: -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34, * +[ 2]: Fact= 0: -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35, * +[ 3]: Fact= 0: -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36, * +[ 4]: Fact= 0: -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37, * +[ 5]: Fact= 0: -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38, * +[ 6]: Fact= 0: -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39, * +[ 7]: Fact= 0: -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40, * +[ 8]: Fact= 0: -10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41, * +[ 9]: Fact= 0: -11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42, * +[10]: Fact= 0: 
-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43, * +[11]: Fact= 0: -13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44, * +[12]: Fact= 0: -14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45, * +[13]: Fact= 0: -15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46, * +[14]: Fact= 0: -16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47, * +[15]: Fact= 0: -17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48, * +[16]: Fact= 0: -18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49, * +[17]: Fact= 0: -19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49,-50, * +[18]: Fact= 0: -20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49,-50,-51, * +[19]: Fact= 0: -21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49,-50,-51,-52, * +[20]: Fact= 0: -22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49,-50,-51,-52,-53, * +[21]: Fact= 0: -23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49,-50,-51,-52,-53,-54, * +[22]: Fact= 0: -24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49,-50,-51,-52,-53,-54,-55, * +[23]: Fact= 0: 
-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49,-50,-51,-52,-53,-54,-55,-56, * +[24]: Fact= 0: -26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49,-50,-51,-52,-53,-54,-55,-56,-57, * +[25]: Fact= 0: -27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49,-50,-51,-52,-53,-54,-55,-56,-57,-58, * +[26]: Fact= 0: -28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49,-50,-51,-52,-53,-54,-55,-56,-57,-58,-59, * +[27]: Fact= 0: -29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49,-50,-51,-52,-53,-54,-55,-56,-57,-58,-59,-60, * +[28]: Fact= 0: -30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49,-50,-51,-52,-53,-54,-55,-56,-57,-58,-59,-60,-61, * +[29]: Fact= 0: -31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49,-50,-51,-52,-53,-54,-55,-56,-57,-58,-59,-60,-61,-62, * +[30]: Fact= 0: -32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49,-50,-51,-52,-53,-54,-55,-56,-57,-58,-59,-60,-61,-62,-63, * +[31]: Fact= 0: -33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49,-50,-51,-52,-53,-54,-55,-56,-57,-58,-59,-60,-61,-62,-63,-64, * +--- 32x32, Mode= 3 [F]--- +[ 0]: Fact=26: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33, * +[ 1]: Fact=20: -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34, * +[ 2]: Fact=14: -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35, * +[ 3]: Fact= 8: -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36, * +[ 4]: Fact= 2: -5, -6, -7, -8, 
-9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37, * +[ 5]: Fact=28: -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37, * +[ 6]: Fact=22: -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38, * +[ 7]: Fact=16: -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39, * +[ 8]: Fact=10: -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40, * +[ 9]: Fact= 4: -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41, * +[10]: Fact=30: -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41, * +[11]: Fact=24: -10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42, * +[12]: Fact=18: -11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43, * +[13]: Fact=12: -12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44, * +[14]: Fact= 6: -13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45, * +[15]: Fact= 0: -14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45, * +[16]: Fact=26: -14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46, * +[17]: Fact=20: 
-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47, * +[18]: Fact=14: -16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48, * +[19]: Fact= 8: -17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49, * +[20]: Fact= 2: -18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49,-50, * +[21]: Fact=28: -18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49,-50, * +[22]: Fact=22: -19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49,-50,-51, * +[23]: Fact=16: -20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49,-50,-51,-52, * +[24]: Fact=10: -21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49,-50,-51,-52,-53, * +[25]: Fact= 4: -22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49,-50,-51,-52,-53,-54, * +[26]: Fact=30: -22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49,-50,-51,-52,-53,-54, * +[27]: Fact=24: -23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49,-50,-51,-52,-53,-54,-55, * +[28]: Fact=18: -24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49,-50,-51,-52,-53,-54,-55,-56, * +[29]: Fact=12: -25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49,-50,-51,-52,-53,-54,-55,-56,-57, * +[30]: Fact= 6: 
-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49,-50,-51,-52,-53,-54,-55,-56,-57,-58, * +[31]: Fact= 0: -27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49,-50,-51,-52,-53,-54,-55,-56,-57,-58, +--- 32x32, Mode= 4 [F]--- +[ 0]: Fact=21: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33, * +[ 1]: Fact=10: -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34, * +[ 2]: Fact=31: -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34, * +[ 3]: Fact=20: -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35, * +[ 4]: Fact= 9: -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36, * +[ 5]: Fact=30: -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36, * +[ 6]: Fact=19: -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37, * +[ 7]: Fact= 8: -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38, * +[ 8]: Fact=29: -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38, * +[ 9]: Fact=18: -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39, * +[10]: Fact= 7: -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40, * +[11]: Fact=28: -8, 
-9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40, * +[12]: Fact=17: -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41, * +[13]: Fact= 6: -10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42, * +[14]: Fact=27: -10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42, * +[15]: Fact=16: -11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43, * +[16]: Fact= 5: -12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44, * +[17]: Fact=26: -12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44, * +[18]: Fact=15: -13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45, * +[19]: Fact= 4: -14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46, * +[20]: Fact=25: -14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46, * +[21]: Fact=14: -15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47, * +[22]: Fact= 3: -16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48, * +[23]: Fact=24: -16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48, * +[24]: Fact=13: 
-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49, * +[25]: Fact= 2: -18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49,-50, * +[26]: Fact=23: -18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49,-50, * +[27]: Fact=12: -19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49,-50,-51, * +[28]: Fact= 1: -20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49,-50,-51,-52, * +[29]: Fact=22: -20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49,-50,-51,-52, * +[30]: Fact=11: -21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49,-50,-51,-52,-53, * +[31]: Fact= 0: -22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49,-50,-51,-52,-53, * +--- 32x32, Mode= 5 [F]--- +[ 0]: Fact=17: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33, * +[ 1]: Fact= 2: -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34, * +[ 2]: Fact=19: -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34, * +[ 3]: Fact= 4: -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35, * +[ 4]: Fact=21: -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35, * +[ 5]: Fact= 6: -4, -5, -6, -7, 
-8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36, * +[ 6]: Fact=23: -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36, * +[ 7]: Fact= 8: -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37, * +[ 8]: Fact=25: -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37, * +[ 9]: Fact=10: -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38, * +[10]: Fact=27: -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38, * +[11]: Fact=12: -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39, * +[12]: Fact=29: -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39, * +[13]: Fact=14: -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40, * +[14]: Fact=31: -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40, * +[15]: Fact=16: -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41, * +[16]: Fact= 1: -10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42, * +[17]: Fact=18: -10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42, * +[18]: Fact= 3: 
-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43, * +[19]: Fact=20: -11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43, * +[20]: Fact= 5: -12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44, * +[21]: Fact=22: -12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44, * +[22]: Fact= 7: -13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45, * +[23]: Fact=24: -13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45, * +[24]: Fact= 9: -14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46, * +[25]: Fact=26: -14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46, * +[26]: Fact=11: -15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47, * +[27]: Fact=28: -15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47, * +[28]: Fact=13: -16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48, * +[29]: Fact=30: -16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48, * +[30]: Fact=15: -17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49, * +[31]: Fact= 0: 
-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49, * +--- 32x32, Mode= 6 [F]--- +[ 0]: Fact=13: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33, * +[ 1]: Fact=26: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33, * +[ 2]: Fact= 7: -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34, * +[ 3]: Fact=20: -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34, * +[ 4]: Fact= 1: -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35, * +[ 5]: Fact=14: -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35, * +[ 6]: Fact=27: -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35, * +[ 7]: Fact= 8: -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36, * +[ 8]: Fact=21: -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36, * +[ 9]: Fact= 2: -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37, * +[10]: Fact=15: -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37, * +[11]: Fact=28: -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37, * +[12]: Fact= 9: -6, -7, -8, 
-9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38, * +[13]: Fact=22: -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38, * +[14]: Fact= 3: -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39, * +[15]: Fact=16: -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39, * +[16]: Fact=29: -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39, * +[17]: Fact=10: -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40, * +[18]: Fact=23: -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40, * +[19]: Fact= 4: -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41, * +[20]: Fact=17: -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41, * +[21]: Fact=30: -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41, * +[22]: Fact=11: -10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42, * +[23]: Fact=24: -10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42, * +[24]: Fact= 5: -11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43, * +[25]: Fact=18: 
-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43, * +[26]: Fact=31: -11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43, * +[27]: Fact=12: -12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44, * +[28]: Fact=25: -12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44, * +[29]: Fact= 6: -13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45, * +[30]: Fact=19: -13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45, * +[31]: Fact= 0: -14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45, * +--- 32x32, Mode= 7 [F]--- +[ 0]: Fact= 9: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33, * +[ 1]: Fact=18: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33, * +[ 2]: Fact=27: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33, * +[ 3]: Fact= 4: -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34, * +[ 4]: Fact=13: -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34, * +[ 5]: Fact=22: -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34, * +[ 6]: Fact=31: -2, -3, -4, -5, -6, 
-7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34, * +[ 7]: Fact= 8: -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35, * +[ 8]: Fact=17: -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35, * +[ 9]: Fact=26: -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35, * +[10]: Fact= 3: -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36, * +[11]: Fact=12: -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36, * +[12]: Fact=21: -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36, * +[13]: Fact=30: -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36, * +[14]: Fact= 7: -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37, * +[15]: Fact=16: -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37, * +[16]: Fact=25: -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37, * +[17]: Fact= 2: -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38, * +[18]: Fact=11: -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38, * +[19]: Fact=20: -6, -7, -8, 
-9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38, * +[20]: Fact=29: -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38, * +[21]: Fact= 6: -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39, * +[22]: Fact=15: -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39, * +[23]: Fact=24: -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39, * +[24]: Fact= 1: -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40, * +[25]: Fact=10: -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40, * +[26]: Fact=19: -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40, * +[27]: Fact=28: -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40, * +[28]: Fact= 5: -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41, * +[29]: Fact=14: -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41, * +[30]: Fact=23: -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41, * +[31]: Fact= 0: -10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41, * +--- 32x32, Mode= 8 [F]--- +[ 0]: Fact= 5: -1, -2, -3, -4, -5, -6, -7, -8, 
-9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33, * +[ 1]: Fact=10: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33, * +[ 2]: Fact=15: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33, * +[ 3]: Fact=20: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33, * +[ 4]: Fact=25: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33, * +[ 5]: Fact=30: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33, * +[ 6]: Fact= 3: -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34, * +[ 7]: Fact= 8: -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34, * +[ 8]: Fact=13: -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34, * +[ 9]: Fact=18: -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34, * +[10]: Fact=23: -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34, * +[11]: Fact=28: -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34, * +[12]: Fact= 1: -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35, * +[13]: Fact= 6: -3, -4, -5, -6, -7, -8, 
-9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35, * +[14]: Fact=11: -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35, * +[15]: Fact=16: -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35, * +[16]: Fact=21: -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35, * +[17]: Fact=26: -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35, * +[18]: Fact=31: -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35, * +[19]: Fact= 4: -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36, * +[20]: Fact= 9: -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36, * +[21]: Fact=14: -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36, * +[22]: Fact=19: -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36, * +[23]: Fact=24: -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36, * +[24]: Fact=29: -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36, * +[25]: Fact= 2: -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37, * +[26]: Fact= 7: -5, -6, -7, -8, 
-9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37, * +[27]: Fact=12: -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37, * +[28]: Fact=17: -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37, * +[29]: Fact=22: -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37, * +[30]: Fact=27: -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37, * +[31]: Fact= 0: -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37, * +--- 32x32, Mode= 9 [F]--- +[ 0]: Fact= 2: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33, * +[ 1]: Fact= 4: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33, * +[ 2]: Fact= 6: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33, * +[ 3]: Fact= 8: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33, * +[ 4]: Fact=10: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33, * +[ 5]: Fact=12: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33, * +[ 6]: Fact=14: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33, * +[ 7]: Fact=16: -1, -2, -3, -4, -5, -6, -7, -8, 
-9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33, * +[ 8]: Fact=18: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33, * +[ 9]: Fact=20: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33, * +[10]: Fact=22: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33, * +[11]: Fact=24: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33, * +[12]: Fact=26: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33, * +[13]: Fact=28: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33, * +[14]: Fact=30: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33, * +[15]: Fact= 0: -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33, * +[16]: Fact= 2: -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34, * +[17]: Fact= 4: -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34, * +[18]: Fact= 6: -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34, * +[19]: Fact= 8: -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34, * +[20]: Fact=10: -2, -3, -4, -5, -6, -7, -8, 
-9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34, * +[21]: Fact=12: -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34, * +[22]: Fact=14: -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34, * +[23]: Fact=16: -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34, * +[24]: Fact=18: -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34, * +[25]: Fact=20: -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34, * +[26]: Fact=22: -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34, * +[27]: Fact=24: -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34, * +[28]: Fact=26: -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34, * +[29]: Fact=28: -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34, * +[30]: Fact=30: -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34, * +[31]: Fact= 0: -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34, * +--- 32x32, Mode=10 [ ]--- +[ 0]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[ 1]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, 
-9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[ 2]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[ 3]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[ 4]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[ 5]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[ 6]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[ 7]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[ 8]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[ 9]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[10]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[11]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[12]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[13]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[14]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * 
+[15]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[16]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[17]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[18]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[19]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[20]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[21]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[22]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[23]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[24]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[25]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[26]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[27]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[28]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, 
-9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[29]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[30]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[31]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +--- 32x32, Mode=11 [F]--- +[ 0]: Fact=30: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[ 1]: Fact=28: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[ 2]: Fact=26: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[ 3]: Fact=24: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[ 4]: Fact=22: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[ 5]: Fact=20: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[ 6]: Fact=18: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[ 7]: Fact=16: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[ 8]: Fact=14: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[ 9]: Fact=12: 0, -1, -2, -3, -4, -5, -6, -7, -8, 
-9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[10]: Fact=10: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[11]: Fact= 8: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[12]: Fact= 6: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[13]: Fact= 4: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[14]: Fact= 2: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[15]: Fact= 0: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31, * +[16]: Fact=30: 16, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31, * +[17]: Fact=28: 16, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31, * +[18]: Fact=26: 16, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31, * +[19]: Fact=24: 16, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31, * +[20]: Fact=22: 16, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31, * +[21]: Fact=20: 16, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31, * +[22]: Fact=18: 16, 0, -1, -2, -3, -4, -5, -6, -7, -8, 
-9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31, * +[23]: Fact=16: 16, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31, * +[24]: Fact=14: 16, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31, * +[25]: Fact=12: 16, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31, * +[26]: Fact=10: 16, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31, * +[27]: Fact= 8: 16, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31, * +[28]: Fact= 6: 16, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31, * +[29]: Fact= 4: 16, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31, * +[30]: Fact= 2: 16, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31, * +[31]: Fact= 0: 16, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30, * +--- 32x32, Mode=12 [F]--- +[ 0]: Fact=27: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[ 1]: Fact=22: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[ 2]: Fact=17: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[ 3]: Fact=12: 0, -1, -2, -3, -4, -5, -6, -7, -8, 
-9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[ 4]: Fact= 7: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[ 5]: Fact= 2: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[ 6]: Fact=29: 6, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31, * +[ 7]: Fact=24: 6, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31, * +[ 8]: Fact=19: 6, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31, * +[ 9]: Fact=14: 6, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31, * +[10]: Fact= 9: 6, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31, * +[11]: Fact= 4: 6, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31, * +[12]: Fact=31: 13, 6, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30, * +[13]: Fact=26: 13, 6, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30, * +[14]: Fact=21: 13, 6, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30, * +[15]: Fact=16: 13, 6, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30, * +[16]: Fact=11: 13, 6, 0, -1, -2, -3, -4, -5, -6, -7, -8, 
-9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30, * +[17]: Fact= 6: 13, 6, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30, * +[18]: Fact= 1: 13, 6, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30, * +[19]: Fact=28: 19, 13, 6, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29, * +[20]: Fact=23: 19, 13, 6, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29, * +[21]: Fact=18: 19, 13, 6, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29, * +[22]: Fact=13: 19, 13, 6, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29, * +[23]: Fact= 8: 19, 13, 6, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29, * +[24]: Fact= 3: 19, 13, 6, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29, * +[25]: Fact=30: 26, 19, 13, 6, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28, * +[26]: Fact=25: 26, 19, 13, 6, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28, * +[27]: Fact=20: 26, 19, 13, 6, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28, * +[28]: Fact=15: 26, 19, 13, 6, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28, * +[29]: Fact=10: 26, 19, 13, 6, 0, -1, -2, -3, -4, -5, -6, -7, -8, 
-9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28, * +[30]: Fact= 5: 26, 19, 13, 6, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28, * +[31]: Fact= 0: 26, 19, 13, 6, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27, * +--- 32x32, Mode=13 [F]--- +[ 0]: Fact=23: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[ 1]: Fact=14: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[ 2]: Fact= 5: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[ 3]: Fact=28: 4, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31, * +[ 4]: Fact=19: 4, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31, * +[ 5]: Fact=10: 4, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31, * +[ 6]: Fact= 1: 4, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31, * +[ 7]: Fact=24: 7, 4, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30, * +[ 8]: Fact=15: 7, 4, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30, * +[ 9]: Fact= 6: 7, 4, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30, * +[10]: Fact=29: 11, 7, 4, 0, -1, -2, -3, -4, -5, -6, -7, -8, 
-9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29, * +[11]: Fact=20: 11, 7, 4, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29, * +[12]: Fact=11: 11, 7, 4, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29, * +[13]: Fact= 2: 11, 7, 4, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29, * +[14]: Fact=25: 14, 11, 7, 4, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28, * +[15]: Fact=16: 14, 11, 7, 4, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28, * +[16]: Fact= 7: 14, 11, 7, 4, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28, * +[17]: Fact=30: 18, 14, 11, 7, 4, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27, * +[18]: Fact=21: 18, 14, 11, 7, 4, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27, * +[19]: Fact=12: 18, 14, 11, 7, 4, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27, * +[20]: Fact= 3: 18, 14, 11, 7, 4, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27, * +[21]: Fact=26: 21, 18, 14, 11, 7, 4, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26, * +[22]: Fact=17: 21, 18, 14, 11, 7, 4, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26, * +[23]: Fact= 8: 21, 18, 14, 11, 7, 4, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26, * 
+[24]: Fact=31: 25, 21, 18, 14, 11, 7, 4, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25, * +[25]: Fact=22: 25, 21, 18, 14, 11, 7, 4, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25, * +[26]: Fact=13: 25, 21, 18, 14, 11, 7, 4, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25, * +[27]: Fact= 4: 25, 21, 18, 14, 11, 7, 4, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25, * +[28]: Fact=27: 28, 25, 21, 18, 14, 11, 7, 4, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24, * +[29]: Fact=18: 28, 25, 21, 18, 14, 11, 7, 4, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24, * +[30]: Fact= 9: 28, 25, 21, 18, 14, 11, 7, 4, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24, * +[31]: Fact= 0: 28, 25, 21, 18, 14, 11, 7, 4, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23, * +--- 32x32, Mode=14 [F]--- +[ 0]: Fact=19: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[ 1]: Fact= 6: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[ 2]: Fact=25: 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31, * +[ 3]: Fact=12: 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31, * +[ 4]: Fact=31: 5, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30, * +[ 5]: Fact=18: 5, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, 
-9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30, * +[ 6]: Fact= 5: 5, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30, * +[ 7]: Fact=24: 7, 5, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29, * +[ 8]: Fact=11: 7, 5, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29, * +[ 9]: Fact=30: 10, 7, 5, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28, * +[10]: Fact=17: 10, 7, 5, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28, * +[11]: Fact= 4: 10, 7, 5, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28, * +[12]: Fact=23: 12, 10, 7, 5, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27, * +[13]: Fact=10: 12, 10, 7, 5, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27, * +[14]: Fact=29: 15, 12, 10, 7, 5, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26, * +[15]: Fact=16: 15, 12, 10, 7, 5, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26, * +[16]: Fact= 3: 15, 12, 10, 7, 5, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26, * +[17]: Fact=22: 17, 15, 12, 10, 7, 5, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25, * +[18]: Fact= 9: 17, 15, 12, 10, 7, 5, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25, * +[19]: 
Fact=28: 20, 17, 15, 12, 10, 7, 5, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24, * +[20]: Fact=15: 20, 17, 15, 12, 10, 7, 5, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24, * +[21]: Fact= 2: 20, 17, 15, 12, 10, 7, 5, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24, * +[22]: Fact=21: 22, 20, 17, 15, 12, 10, 7, 5, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23, * +[23]: Fact= 8: 22, 20, 17, 15, 12, 10, 7, 5, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23, * +[24]: Fact=27: 25, 22, 20, 17, 15, 12, 10, 7, 5, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22, * +[25]: Fact=14: 25, 22, 20, 17, 15, 12, 10, 7, 5, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22, * +[26]: Fact= 1: 25, 22, 20, 17, 15, 12, 10, 7, 5, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22, * +[27]: Fact=20: 27, 25, 22, 20, 17, 15, 12, 10, 7, 5, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21, * +[28]: Fact= 7: 27, 25, 22, 20, 17, 15, 12, 10, 7, 5, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21, * +[29]: Fact=26: 30, 27, 25, 22, 20, 17, 15, 12, 10, 7, 5, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20, * +[30]: Fact=13: 30, 27, 25, 22, 20, 17, 15, 12, 10, 7, 5, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20, * +[31]: Fact= 0: 30, 27, 25, 22, 20, 17, 15, 12, 10, 7, 5, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19, * +--- 32x32, Mode=15 [F]--- +[ 0]: Fact=15: 0, -1, -2, -3, -4, -5, -6, -7, -8, 
-9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[ 1]: Fact=30: 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31, * +[ 2]: Fact=13: 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31, * +[ 3]: Fact=28: 4, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30, * +[ 4]: Fact=11: 4, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30, * +[ 5]: Fact=26: 6, 4, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29, * +[ 6]: Fact= 9: 6, 4, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29, * +[ 7]: Fact=24: 8, 6, 4, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28, * +[ 8]: Fact= 7: 8, 6, 4, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28, * +[ 9]: Fact=22: 9, 8, 6, 4, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27, * +[10]: Fact= 5: 9, 8, 6, 4, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27, * +[11]: Fact=20: 11, 9, 8, 6, 4, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26, * +[12]: Fact= 3: 11, 9, 8, 6, 4, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26, * +[13]: Fact=18: 13, 11, 9, 8, 6, 4, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25, * +[14]: 
Fact= 1: 13, 11, 9, 8, 6, 4, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25, * +[15]: Fact=16: 15, 13, 11, 9, 8, 6, 4, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24, * +[16]: Fact=31: 17, 15, 13, 11, 9, 8, 6, 4, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23, * +[17]: Fact=14: 17, 15, 13, 11, 9, 8, 6, 4, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23, * +[18]: Fact=29: 19, 17, 15, 13, 11, 9, 8, 6, 4, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22, * +[19]: Fact=12: 19, 17, 15, 13, 11, 9, 8, 6, 4, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22, * +[20]: Fact=27: 21, 19, 17, 15, 13, 11, 9, 8, 6, 4, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21, * +[21]: Fact=10: 21, 19, 17, 15, 13, 11, 9, 8, 6, 4, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21, * +[22]: Fact=25: 23, 21, 19, 17, 15, 13, 11, 9, 8, 6, 4, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20, * +[23]: Fact= 8: 23, 21, 19, 17, 15, 13, 11, 9, 8, 6, 4, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20, * +[24]: Fact=23: 24, 23, 21, 19, 17, 15, 13, 11, 9, 8, 6, 4, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19, * +[25]: Fact= 6: 24, 23, 21, 19, 17, 15, 13, 11, 9, 8, 6, 4, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19, * +[26]: Fact=21: 26, 24, 23, 21, 19, 17, 15, 13, 11, 9, 8, 6, 4, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18, * +[27]: Fact= 4: 26, 24, 23, 21, 19, 17, 15, 13, 11, 9, 8, 6, 4, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, 
-9,-10,-11,-12,-13,-14,-15,-16,-17,-18, * +[28]: Fact=19: 28, 26, 24, 23, 21, 19, 17, 15, 13, 11, 9, 8, 6, 4, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17, * +[29]: Fact= 2: 28, 26, 24, 23, 21, 19, 17, 15, 13, 11, 9, 8, 6, 4, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17, * +[30]: Fact=17: 30, 28, 26, 24, 23, 21, 19, 17, 15, 13, 11, 9, 8, 6, 4, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16, * +[31]: Fact= 0: 30, 28, 26, 24, 23, 21, 19, 17, 15, 13, 11, 9, 8, 6, 4, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15, * +--- 32x32, Mode=16 [F]--- +[ 0]: Fact=11: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[ 1]: Fact=22: 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31, * +[ 2]: Fact= 1: 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31, * +[ 3]: Fact=12: 3, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30, * +[ 4]: Fact=23: 5, 3, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29, * +[ 5]: Fact= 2: 5, 3, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29, * +[ 6]: Fact=13: 6, 5, 3, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28, * +[ 7]: Fact=24: 8, 6, 5, 3, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27, * +[ 8]: Fact= 3: 8, 6, 5, 3, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27, * +[ 9]: Fact=14: 9, 8, 6, 5, 3, 2, 0, -1, 
-2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26, * +[10]: Fact=25: 11, 9, 8, 6, 5, 3, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25, * +[11]: Fact= 4: 11, 9, 8, 6, 5, 3, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25, * +[12]: Fact=15: 12, 11, 9, 8, 6, 5, 3, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24, * +[13]: Fact=26: 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23, * +[14]: Fact= 5: 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23, * +[15]: Fact=16: 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22, * +[16]: Fact=27: 17, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21, * +[17]: Fact= 6: 17, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21, * +[18]: Fact=17: 18, 17, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20, * +[19]: Fact=28: 20, 18, 17, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19, * +[20]: Fact= 7: 20, 18, 17, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19, * +[21]: Fact=18: 21, 20, 18, 17, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18, * +[22]: Fact=29: 23, 21, 20, 18, 17, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17, * +[23]: Fact= 8: 23, 21, 20, 18, 17, 
15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17, * +[24]: Fact=19: 24, 23, 21, 20, 18, 17, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16, * +[25]: Fact=30: 26, 24, 23, 21, 20, 18, 17, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15, * +[26]: Fact= 9: 26, 24, 23, 21, 20, 18, 17, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15, * +[27]: Fact=20: 27, 26, 24, 23, 21, 20, 18, 17, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14, * +[28]: Fact=31: 29, 27, 26, 24, 23, 21, 20, 18, 17, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13, * +[29]: Fact=10: 29, 27, 26, 24, 23, 21, 20, 18, 17, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13, * +[30]: Fact=21: 30, 29, 27, 26, 24, 23, 21, 20, 18, 17, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12, * +[31]: Fact= 0: 30, 29, 27, 26, 24, 23, 21, 20, 18, 17, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11, * +--- 32x32, Mode=17 [F]--- +[ 0]: Fact= 6: 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32, * +[ 1]: Fact=12: 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31, * +[ 2]: Fact=18: 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30, * +[ 3]: Fact=24: 4, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29, * +[ 4]: Fact=30: 5, 4, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, 
-9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28, * +[ 5]: Fact= 4: 5, 4, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28, * +[ 6]: Fact=10: 6, 5, 4, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27, * +[ 7]: Fact=16: 7, 6, 5, 4, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26, * +[ 8]: Fact=22: 9, 7, 6, 5, 4, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25, * +[ 9]: Fact=28: 10, 9, 7, 6, 5, 4, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24, * +[10]: Fact= 2: 10, 9, 7, 6, 5, 4, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24, * +[11]: Fact= 8: 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23, * +[12]: Fact=14: 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22, * +[13]: Fact=20: 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21, * +[14]: Fact=26: 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20, * +[15]: Fact= 0: 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19, * +[16]: Fact= 6: 16, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19, * +[17]: Fact=12: 17, 16, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17,-18, * +[18]: Fact=18: 18, 17, 16, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 
0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16,-17, * +[19]: Fact=24: 20, 18, 17, 16, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15,-16, * +[20]: Fact=30: 21, 20, 18, 17, 16, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15, * +[21]: Fact= 4: 21, 20, 18, 17, 16, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15, * +[22]: Fact=10: 22, 21, 20, 18, 17, 16, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14, * +[23]: Fact=16: 23, 22, 21, 20, 18, 17, 16, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13, * +[24]: Fact=22: 25, 23, 22, 21, 20, 18, 17, 16, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12, * +[25]: Fact=28: 26, 25, 23, 22, 21, 20, 18, 17, 16, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11, * +[26]: Fact= 2: 26, 25, 23, 22, 21, 20, 18, 17, 16, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11, * +[27]: Fact= 8: 27, 26, 25, 23, 22, 21, 20, 18, 17, 16, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10, * +[28]: Fact=14: 28, 27, 26, 25, 23, 22, 21, 20, 18, 17, 16, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, * +[29]: Fact=20: 30, 28, 27, 26, 25, 23, 22, 21, 20, 18, 17, 16, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, * +[30]: Fact=26: 31, 30, 28, 27, 26, 25, 23, 22, 21, 20, 18, 17, 16, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, * +[31]: Fact= 0: 31, 30, 28, 27, 26, 25, 23, 22, 21, 20, 18, 17, 16, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, -1, -2, -3, -4, -5, -6, * +--- 32x32, Mode=18 [F]--- +[ 0]: Fact= 0: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 
13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, * +[ 1]: Fact= 0: -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, * +[ 2]: Fact= 0: -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, * +[ 3]: Fact= 0: -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, * +[ 4]: Fact= 0: -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, * +[ 5]: Fact= 0: -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, * +[ 6]: Fact= 0: -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, * +[ 7]: Fact= 0: -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, * +[ 8]: Fact= 0: -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, * +[ 9]: Fact= 0: -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, * +[10]: Fact= 0: -10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, * +[11]: Fact= 0: -11,-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, * +[12]: Fact= 0: -12,-11,-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, * +[13]: Fact= 0: -13,-12,-11,-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, * +[14]: Fact= 0: -14,-13,-12,-11,-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, * +[15]: Fact= 
0: -15,-14,-13,-12,-11,-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, * +[16]: Fact= 0: -16,-15,-14,-13,-12,-11,-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, * +[17]: Fact= 0: -17,-16,-15,-14,-13,-12,-11,-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, * +[18]: Fact= 0: -18,-17,-16,-15,-14,-13,-12,-11,-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, * +[19]: Fact= 0: -19,-18,-17,-16,-15,-14,-13,-12,-11,-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, * +[20]: Fact= 0: -20,-19,-18,-17,-16,-15,-14,-13,-12,-11,-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, * +[21]: Fact= 0: -21,-20,-19,-18,-17,-16,-15,-14,-13,-12,-11,-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, * +[22]: Fact= 0: -22,-21,-20,-19,-18,-17,-16,-15,-14,-13,-12,-11,-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, * +[23]: Fact= 0: -23,-22,-21,-20,-19,-18,-17,-16,-15,-14,-13,-12,-11,-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, * +[24]: Fact= 0: -24,-23,-22,-21,-20,-19,-18,-17,-16,-15,-14,-13,-12,-11,-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, * +[25]: Fact= 0: -25,-24,-23,-22,-21,-20,-19,-18,-17,-16,-15,-14,-13,-12,-11,-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, * +[26]: Fact= 0: -26,-25,-24,-23,-22,-21,-20,-19,-18,-17,-16,-15,-14,-13,-12,-11,-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, * +[27]: Fact= 0: -27,-26,-25,-24,-23,-22,-21,-20,-19,-18,-17,-16,-15,-14,-13,-12,-11,-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, * +[28]: Fact= 0: -28,-27,-26,-25,-24,-23,-22,-21,-20,-19,-18,-17,-16,-15,-14,-13,-12,-11,-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, * +[29]: Fact= 0: 
-29,-28,-27,-26,-25,-24,-23,-22,-21,-20,-19,-18,-17,-16,-15,-14,-13,-12,-11,-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, * +[30]: Fact= 0: -30,-29,-28,-27,-26,-25,-24,-23,-22,-21,-20,-19,-18,-17,-16,-15,-14,-13,-12,-11,-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, * +[31]: Fact= 0: -31,-30,-29,-28,-27,-26,-25,-24,-23,-22,-21,-20,-19,-18,-17,-16,-15,-14,-13,-12,-11,-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, * +--- 32x32, Mode=19 [F]--- +[ 0]: Fact= 6: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, +[ 1]: Fact=12: -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, +[ 2]: Fact=18: -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, +[ 3]: Fact=24: -4, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, +[ 4]: Fact=30: -5, -4, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, +[ 5]: Fact= 4: -5, -4, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, +[ 6]: Fact=10: -6, -5, -4, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, +[ 7]: Fact=16: -7, -6, -5, -4, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, +[ 8]: Fact=22: -9, -7, -6, -5, -4, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, +[ 9]: Fact=28: -10, -9, -7, -6, -5, -4, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, +[10]: Fact= 2: -10, -9, -7, -6, -5, -4, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, +[11]: Fact= 8: -11,-10, -9, -7, 
-6, -5, -4, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, +[12]: Fact=14: -12,-11,-10, -9, -7, -6, -5, -4, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, +[13]: Fact=20: -14,-12,-11,-10, -9, -7, -6, -5, -4, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, +[14]: Fact=26: -15,-14,-12,-11,-10, -9, -7, -6, -5, -4, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, +[15]: Fact= 0: -15,-14,-12,-11,-10, -9, -7, -6, -5, -4, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, +[16]: Fact= 6: -16,-15,-14,-12,-11,-10, -9, -7, -6, -5, -4, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, +[17]: Fact=12: -17,-16,-15,-14,-12,-11,-10, -9, -7, -6, -5, -4, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, +[18]: Fact=18: -18,-17,-16,-15,-14,-12,-11,-10, -9, -7, -6, -5, -4, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, +[19]: Fact=24: -20,-18,-17,-16,-15,-14,-12,-11,-10, -9, -7, -6, -5, -4, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, +[20]: Fact=30: -21,-20,-18,-17,-16,-15,-14,-12,-11,-10, -9, -7, -6, -5, -4, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, +[21]: Fact= 4: -21,-20,-18,-17,-16,-15,-14,-12,-11,-10, -9, -7, -6, -5, -4, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, +[22]: Fact=10: -22,-21,-20,-18,-17,-16,-15,-14,-12,-11,-10, -9, -7, -6, -5, -4, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, +[23]: Fact=16: -23,-22,-21,-20,-18,-17,-16,-15,-14,-12,-11,-10, -9, -7, -6, -5, -4, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, +[24]: Fact=22: -25,-23,-22,-21,-20,-18,-17,-16,-15,-14,-12,-11,-10, -9, -7, -6, -5, -4, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, +[25]: Fact=28: -26,-25,-23,-22,-21,-20,-18,-17,-16,-15,-14,-12,-11,-10, -9, -7, -6, -5, 
-4, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, +[26]: Fact= 2: -26,-25,-23,-22,-21,-20,-18,-17,-16,-15,-14,-12,-11,-10, -9, -7, -6, -5, -4, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, +[27]: Fact= 8: -27,-26,-25,-23,-22,-21,-20,-18,-17,-16,-15,-14,-12,-11,-10, -9, -7, -6, -5, -4, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, +[28]: Fact=14: -28,-27,-26,-25,-23,-22,-21,-20,-18,-17,-16,-15,-14,-12,-11,-10, -9, -7, -6, -5, -4, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, +[29]: Fact=20: -30,-28,-27,-26,-25,-23,-22,-21,-20,-18,-17,-16,-15,-14,-12,-11,-10, -9, -7, -6, -5, -4, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, +[30]: Fact=26: -31,-30,-28,-27,-26,-25,-23,-22,-21,-20,-18,-17,-16,-15,-14,-12,-11,-10, -9, -7, -6, -5, -4, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, +[31]: Fact= 0: -31,-30,-28,-27,-26,-25,-23,-22,-21,-20,-18,-17,-16,-15,-14,-12,-11,-10, -9, -7, -6, -5, -4, -2, -1, 0, 1, 2, 3, 4, 5, 6, +--- 32x32, Mode=20 [F]--- +[ 0]: Fact=11: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, +[ 1]: Fact=22: -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, +[ 2]: Fact= 1: -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, +[ 3]: Fact=12: -3, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, +[ 4]: Fact=23: -5, -3, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, +[ 5]: Fact= 2: -5, -3, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, +[ 6]: Fact=13: -6, -5, -3, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, +[ 7]: Fact=24: -8, -6, -5, -3, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 
23, 24, 25, 26, 27, +[ 8]: Fact= 3: -8, -6, -5, -3, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, +[ 9]: Fact=14: -9, -8, -6, -5, -3, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, +[10]: Fact=25: -11, -9, -8, -6, -5, -3, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, +[11]: Fact= 4: -11, -9, -8, -6, -5, -3, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, +[12]: Fact=15: -12,-11, -9, -8, -6, -5, -3, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, +[13]: Fact=26: -14,-12,-11, -9, -8, -6, -5, -3, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, +[14]: Fact= 5: -14,-12,-11, -9, -8, -6, -5, -3, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, +[15]: Fact=16: -15,-14,-12,-11, -9, -8, -6, -5, -3, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, +[16]: Fact=27: -17,-15,-14,-12,-11, -9, -8, -6, -5, -3, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, +[17]: Fact= 6: -17,-15,-14,-12,-11, -9, -8, -6, -5, -3, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, +[18]: Fact=17: -18,-17,-15,-14,-12,-11, -9, -8, -6, -5, -3, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, +[19]: Fact=28: -20,-18,-17,-15,-14,-12,-11, -9, -8, -6, -5, -3, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, +[20]: Fact= 7: -20,-18,-17,-15,-14,-12,-11, -9, -8, -6, -5, -3, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, +[21]: Fact=18: -21,-20,-18,-17,-15,-14,-12,-11, -9, -8, -6, -5, -3, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, +[22]: Fact=29: 
-23,-21,-20,-18,-17,-15,-14,-12,-11, -9, -8, -6, -5, -3, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, +[23]: Fact= 8: -23,-21,-20,-18,-17,-15,-14,-12,-11, -9, -8, -6, -5, -3, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, +[24]: Fact=19: -24,-23,-21,-20,-18,-17,-15,-14,-12,-11, -9, -8, -6, -5, -3, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, +[25]: Fact=30: -26,-24,-23,-21,-20,-18,-17,-15,-14,-12,-11, -9, -8, -6, -5, -3, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, +[26]: Fact= 9: -26,-24,-23,-21,-20,-18,-17,-15,-14,-12,-11, -9, -8, -6, -5, -3, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, +[27]: Fact=20: -27,-26,-24,-23,-21,-20,-18,-17,-15,-14,-12,-11, -9, -8, -6, -5, -3, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, +[28]: Fact=31: -29,-27,-26,-24,-23,-21,-20,-18,-17,-15,-14,-12,-11, -9, -8, -6, -5, -3, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, +[29]: Fact=10: -29,-27,-26,-24,-23,-21,-20,-18,-17,-15,-14,-12,-11, -9, -8, -6, -5, -3, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, +[30]: Fact=21: -30,-29,-27,-26,-24,-23,-21,-20,-18,-17,-15,-14,-12,-11, -9, -8, -6, -5, -3, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, +[31]: Fact= 0: -30,-29,-27,-26,-24,-23,-21,-20,-18,-17,-15,-14,-12,-11, -9, -8, -6, -5, -3, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, +--- 32x32, Mode=21 [F]--- +[ 0]: Fact=15: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, +[ 1]: Fact=30: -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, +[ 2]: Fact=13: -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, +[ 3]: Fact=28: -4, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, +[ 4]: Fact=11: -4, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 
9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, +[ 5]: Fact=26: -6, -4, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, +[ 6]: Fact= 9: -6, -4, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, +[ 7]: Fact=24: -8, -6, -4, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, +[ 8]: Fact= 7: -8, -6, -4, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, +[ 9]: Fact=22: -9, -8, -6, -4, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, +[10]: Fact= 5: -9, -8, -6, -4, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, +[11]: Fact=20: -11, -9, -8, -6, -4, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, +[12]: Fact= 3: -11, -9, -8, -6, -4, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, +[13]: Fact=18: -13,-11, -9, -8, -6, -4, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, +[14]: Fact= 1: -13,-11, -9, -8, -6, -4, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, +[15]: Fact=16: -15,-13,-11, -9, -8, -6, -4, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, +[16]: Fact=31: -17,-15,-13,-11, -9, -8, -6, -4, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, +[17]: Fact=14: -17,-15,-13,-11, -9, -8, -6, -4, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, +[18]: Fact=29: -19,-17,-15,-13,-11, -9, -8, -6, -4, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 
16, 17, 18, 19, 20, 21, 22, +[19]: Fact=12: -19,-17,-15,-13,-11, -9, -8, -6, -4, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, +[20]: Fact=27: -21,-19,-17,-15,-13,-11, -9, -8, -6, -4, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, +[21]: Fact=10: -21,-19,-17,-15,-13,-11, -9, -8, -6, -4, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, +[22]: Fact=25: -23,-21,-19,-17,-15,-13,-11, -9, -8, -6, -4, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, +[23]: Fact= 8: -23,-21,-19,-17,-15,-13,-11, -9, -8, -6, -4, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, +[24]: Fact=23: -24,-23,-21,-19,-17,-15,-13,-11, -9, -8, -6, -4, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, +[25]: Fact= 6: -24,-23,-21,-19,-17,-15,-13,-11, -9, -8, -6, -4, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, +[26]: Fact=21: -26,-24,-23,-21,-19,-17,-15,-13,-11, -9, -8, -6, -4, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, +[27]: Fact= 4: -26,-24,-23,-21,-19,-17,-15,-13,-11, -9, -8, -6, -4, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, +[28]: Fact=19: -28,-26,-24,-23,-21,-19,-17,-15,-13,-11, -9, -8, -6, -4, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, +[29]: Fact= 2: -28,-26,-24,-23,-21,-19,-17,-15,-13,-11, -9, -8, -6, -4, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, +[30]: Fact=17: -30,-28,-26,-24,-23,-21,-19,-17,-15,-13,-11, -9, -8, -6, -4, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, +[31]: Fact= 0: -30,-28,-26,-24,-23,-21,-19,-17,-15,-13,-11, -9, -8, -6, -4, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, +--- 32x32, Mode=22 [F]--- +[ 0]: Fact=19: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, +[ 
1]: Fact= 6: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, +[ 2]: Fact=25: -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, +[ 3]: Fact=12: -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, +[ 4]: Fact=31: -5, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, +[ 5]: Fact=18: -5, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, +[ 6]: Fact= 5: -5, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, +[ 7]: Fact=24: -7, -5, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, +[ 8]: Fact=11: -7, -5, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, +[ 9]: Fact=30: -10, -7, -5, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, +[10]: Fact=17: -10, -7, -5, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, +[11]: Fact= 4: -10, -7, -5, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, +[12]: Fact=23: -12,-10, -7, -5, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, +[13]: Fact=10: -12,-10, -7, -5, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, +[14]: Fact=29: -15,-12,-10, -7, -5, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, +[15]: Fact=16: -15,-12,-10, -7, -5, -2, 0, 1, 2, 3, 4, 5, 6, 7, 
8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, +[16]: Fact= 3: -15,-12,-10, -7, -5, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, +[17]: Fact=22: -17,-15,-12,-10, -7, -5, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, +[18]: Fact= 9: -17,-15,-12,-10, -7, -5, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, +[19]: Fact=28: -20,-17,-15,-12,-10, -7, -5, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, +[20]: Fact=15: -20,-17,-15,-12,-10, -7, -5, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, +[21]: Fact= 2: -20,-17,-15,-12,-10, -7, -5, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, +[22]: Fact=21: -22,-20,-17,-15,-12,-10, -7, -5, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, +[23]: Fact= 8: -22,-20,-17,-15,-12,-10, -7, -5, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, +[24]: Fact=27: -25,-22,-20,-17,-15,-12,-10, -7, -5, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, +[25]: Fact=14: -25,-22,-20,-17,-15,-12,-10, -7, -5, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, +[26]: Fact= 1: -25,-22,-20,-17,-15,-12,-10, -7, -5, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, +[27]: Fact=20: -27,-25,-22,-20,-17,-15,-12,-10, -7, -5, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, +[28]: Fact= 7: -27,-25,-22,-20,-17,-15,-12,-10, -7, -5, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, +[29]: Fact=26: -30,-27,-25,-22,-20,-17,-15,-12,-10, -7, -5, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 
16, 17, 18, 19, 20, +[30]: Fact=13: -30,-27,-25,-22,-20,-17,-15,-12,-10, -7, -5, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, +[31]: Fact= 0: -30,-27,-25,-22,-20,-17,-15,-12,-10, -7, -5, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, +--- 32x32, Mode=23 [F]--- +[ 0]: Fact=23: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, +[ 1]: Fact=14: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, +[ 2]: Fact= 5: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, +[ 3]: Fact=28: -4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, +[ 4]: Fact=19: -4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, +[ 5]: Fact=10: -4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, +[ 6]: Fact= 1: -4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, +[ 7]: Fact=24: -7, -4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, +[ 8]: Fact=15: -7, -4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, +[ 9]: Fact= 6: -7, -4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, +[10]: Fact=29: -11, -7, -4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, +[11]: Fact=20: -11, -7, -4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, +[12]: Fact=11: -11, 
-7, -4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, +[13]: Fact= 2: -11, -7, -4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, +[14]: Fact=25: -14,-11, -7, -4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, +[15]: Fact=16: -14,-11, -7, -4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, +[16]: Fact= 7: -14,-11, -7, -4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, +[17]: Fact=30: -18,-14,-11, -7, -4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, +[18]: Fact=21: -18,-14,-11, -7, -4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, +[19]: Fact=12: -18,-14,-11, -7, -4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, +[20]: Fact= 3: -18,-14,-11, -7, -4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, +[21]: Fact=26: -21,-18,-14,-11, -7, -4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, +[22]: Fact=17: -21,-18,-14,-11, -7, -4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, +[23]: Fact= 8: -21,-18,-14,-11, -7, -4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, +[24]: Fact=31: -25,-21,-18,-14,-11, -7, -4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, +[25]: Fact=22: -25,-21,-18,-14,-11, -7, -4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, +[26]: Fact=13: -25,-21,-18,-14,-11, -7, -4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 
10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, +[27]: Fact= 4: -25,-21,-18,-14,-11, -7, -4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, +[28]: Fact=27: -28,-25,-21,-18,-14,-11, -7, -4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, +[29]: Fact=18: -28,-25,-21,-18,-14,-11, -7, -4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, +[30]: Fact= 9: -28,-25,-21,-18,-14,-11, -7, -4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, +[31]: Fact= 0: -28,-25,-21,-18,-14,-11, -7, -4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, +--- 32x32, Mode=24 [F]--- +[ 0]: Fact=27: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, +[ 1]: Fact=22: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, +[ 2]: Fact=17: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, +[ 3]: Fact=12: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, +[ 4]: Fact= 7: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, +[ 5]: Fact= 2: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, +[ 6]: Fact=29: -6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, +[ 7]: Fact=24: -6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, +[ 8]: Fact=19: -6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 
26, 27, 28, 29, 30, 31, +[ 9]: Fact=14: -6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, +[10]: Fact= 9: -6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, +[11]: Fact= 4: -6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, +[12]: Fact=31: -13, -6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, +[13]: Fact=26: -13, -6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, +[14]: Fact=21: -13, -6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, +[15]: Fact=16: -13, -6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, +[16]: Fact=11: -13, -6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, +[17]: Fact= 6: -13, -6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, +[18]: Fact= 1: -13, -6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, +[19]: Fact=28: -19,-13, -6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, +[20]: Fact=23: -19,-13, -6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, +[21]: Fact=18: -19,-13, -6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, +[22]: Fact=13: -19,-13, -6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, +[23]: Fact= 8: -19,-13, -6, 0, 
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, +[24]: Fact= 3: -19,-13, -6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, +[25]: Fact=30: -26,-19,-13, -6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, +[26]: Fact=25: -26,-19,-13, -6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, +[27]: Fact=20: -26,-19,-13, -6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, +[28]: Fact=15: -26,-19,-13, -6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, +[29]: Fact=10: -26,-19,-13, -6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, +[30]: Fact= 5: -26,-19,-13, -6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, +[31]: Fact= 0: -26,-19,-13, -6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, +--- 32x32, Mode=25 [F]--- +[ 0]: Fact=30: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, +[ 1]: Fact=28: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, +[ 2]: Fact=26: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, +[ 3]: Fact=24: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, +[ 4]: Fact=22: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, +[ 5]: Fact=20: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, +[ 6]: Fact=18: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, +[ 7]: Fact=16: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, +[ 8]: Fact=14: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, +[ 9]: Fact=12: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, +[10]: Fact=10: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, +[11]: Fact= 8: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, +[12]: Fact= 6: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, +[13]: Fact= 4: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, +[14]: Fact= 2: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, +[15]: Fact= 0: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, +[16]: Fact=30: -16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, +[17]: Fact=28: -16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, +[18]: Fact=26: -16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, +[19]: Fact=24: -16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 
31, +[20]: Fact=22: -16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, +[21]: Fact=20: -16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, +[22]: Fact=18: -16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, +[23]: Fact=16: -16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, +[24]: Fact=14: -16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, +[25]: Fact=12: -16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, +[26]: Fact=10: -16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, +[27]: Fact= 8: -16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, +[28]: Fact= 6: -16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, +[29]: Fact= 4: -16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, +[30]: Fact= 2: -16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, +[31]: Fact= 0: -16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, +--- 32x32, Mode=26 [ ]--- +[ 0]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, * +[ 1]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, * +[ 2]: Fact= 0: 1, 2, 3, 4, 5, 
6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, * +[ 3]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, * +[ 4]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, * +[ 5]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, * +[ 6]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, * +[ 7]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, * +[ 8]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, * +[ 9]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, * +[10]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, * +[11]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, * +[12]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, * +[13]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, * +[14]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, * +[15]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, * +[16]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 
27, 28, 29, 30, 31, 32, * +[17]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, * +[18]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, * +[19]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, * +[20]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, * +[21]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, * +[22]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, * +[23]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, * +[24]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, * +[25]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, * +[26]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, * +[27]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, * +[28]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, * +[29]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, * +[30]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, * +[31]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 
13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, * +--- 32x32, Mode=27 [F]--- +[ 0]: Fact= 2: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, +[ 1]: Fact= 4: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, +[ 2]: Fact= 6: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, +[ 3]: Fact= 8: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, +[ 4]: Fact=10: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, +[ 5]: Fact=12: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, +[ 6]: Fact=14: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, +[ 7]: Fact=16: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, +[ 8]: Fact=18: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, +[ 9]: Fact=20: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, +[10]: Fact=22: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, +[11]: Fact=24: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, +[12]: Fact=26: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, +[13]: Fact=28: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 
20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, +[14]: Fact=30: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, +[15]: Fact= 0: 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, +[16]: Fact= 2: 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, +[17]: Fact= 4: 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, +[18]: Fact= 6: 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, +[19]: Fact= 8: 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, +[20]: Fact=10: 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, +[21]: Fact=12: 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, +[22]: Fact=14: 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, +[23]: Fact=16: 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, +[24]: Fact=18: 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, +[25]: Fact=20: 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, +[26]: Fact=22: 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, +[27]: Fact=24: 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 
32, 33, 34, +[28]: Fact=26: 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, +[29]: Fact=28: 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, +[30]: Fact=30: 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, +[31]: Fact= 0: 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, +--- 32x32, Mode=28 [F]--- +[ 0]: Fact= 5: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, +[ 1]: Fact=10: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, +[ 2]: Fact=15: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, +[ 3]: Fact=20: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, +[ 4]: Fact=25: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, +[ 5]: Fact=30: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, +[ 6]: Fact= 3: 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, +[ 7]: Fact= 8: 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, +[ 8]: Fact=13: 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, +[ 9]: Fact=18: 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, +[10]: 
Fact=23: 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, +[11]: Fact=28: 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, +[12]: Fact= 1: 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, +[13]: Fact= 6: 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, +[14]: Fact=11: 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, +[15]: Fact=16: 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, +[16]: Fact=21: 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, +[17]: Fact=26: 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, +[18]: Fact=31: 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, +[19]: Fact= 4: 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, +[20]: Fact= 9: 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, +[21]: Fact=14: 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, +[22]: Fact=19: 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, +[23]: Fact=24: 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, +[24]: Fact=29: 4, 5, 6, 7, 
8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, +[25]: Fact= 2: 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, +[26]: Fact= 7: 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, +[27]: Fact=12: 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, +[28]: Fact=17: 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, +[29]: Fact=22: 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, +[30]: Fact=27: 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, +[31]: Fact= 0: 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, +--- 32x32, Mode=29 [F]--- +[ 0]: Fact= 9: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, +[ 1]: Fact=18: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, +[ 2]: Fact=27: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, +[ 3]: Fact= 4: 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, +[ 4]: Fact=13: 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, +[ 5]: Fact=22: 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, +[ 6]: Fact=31: 2, 3, 
4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, +[ 7]: Fact= 8: 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, +[ 8]: Fact=17: 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, +[ 9]: Fact=26: 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, +[10]: Fact= 3: 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, +[11]: Fact=12: 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, +[12]: Fact=21: 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, +[13]: Fact=30: 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, +[14]: Fact= 7: 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, +[15]: Fact=16: 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, +[16]: Fact=25: 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, +[17]: Fact= 2: 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, +[18]: Fact=11: 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, +[19]: Fact=20: 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, +[20]: Fact=29: 6, 7, 8, 9, 
10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, +[21]: Fact= 6: 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, +[22]: Fact=15: 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, +[23]: Fact=24: 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, +[24]: Fact= 1: 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, +[25]: Fact=10: 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, +[26]: Fact=19: 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, +[27]: Fact=28: 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, +[28]: Fact= 5: 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, +[29]: Fact=14: 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, +[30]: Fact=23: 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, +[31]: Fact= 0: 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, +--- 32x32, Mode=30 [F]--- +[ 0]: Fact=13: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, +[ 1]: Fact=26: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 
27, 28, 29, 30, 31, 32, 33, +[ 2]: Fact= 7: 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, +[ 3]: Fact=20: 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, +[ 4]: Fact= 1: 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, +[ 5]: Fact=14: 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, +[ 6]: Fact=27: 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, +[ 7]: Fact= 8: 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, +[ 8]: Fact=21: 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, +[ 9]: Fact= 2: 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, +[10]: Fact=15: 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, +[11]: Fact=28: 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, +[12]: Fact= 9: 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, +[13]: Fact=22: 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, +[14]: Fact= 3: 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, +[15]: Fact=16: 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 
34, 35, 36, 37, 38, 39, +[16]: Fact=29: 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, +[17]: Fact=10: 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, +[18]: Fact=23: 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, +[19]: Fact= 4: 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, +[20]: Fact=17: 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, +[21]: Fact=30: 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, +[22]: Fact=11: 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, +[23]: Fact=24: 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, +[24]: Fact= 5: 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, +[25]: Fact=18: 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, +[26]: Fact=31: 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, +[27]: Fact=12: 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, +[28]: Fact=25: 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, +[29]: Fact= 6: 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 
24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, +[30]: Fact=19: 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, +[31]: Fact= 0: 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, +--- 32x32, Mode=31 [F]--- +[ 0]: Fact=17: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, +[ 1]: Fact= 2: 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, +[ 2]: Fact=19: 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, +[ 3]: Fact= 4: 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, +[ 4]: Fact=21: 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, +[ 5]: Fact= 6: 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, +[ 6]: Fact=23: 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, +[ 7]: Fact= 8: 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, +[ 8]: Fact=25: 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, +[ 9]: Fact=10: 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, +[10]: Fact=27: 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, +[11]: Fact=12: 7, 8, 9, 10, 11, 
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, +[12]: Fact=29: 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, +[13]: Fact=14: 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, +[14]: Fact=31: 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, +[15]: Fact=16: 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, +[16]: Fact= 1: 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, +[17]: Fact=18: 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, +[18]: Fact= 3: 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, +[19]: Fact=20: 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, +[20]: Fact= 5: 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, +[21]: Fact=22: 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, +[22]: Fact= 7: 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, +[23]: Fact=24: 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, +[24]: Fact= 9: 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 
40, 41, 42, 43, 44, 45, 46, +[25]: Fact=26: 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, +[26]: Fact=11: 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, +[27]: Fact=28: 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, +[28]: Fact=13: 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, +[29]: Fact=30: 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, +[30]: Fact=15: 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, +[31]: Fact= 0: 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, +--- 32x32, Mode=32 [F]--- +[ 0]: Fact=21: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, +[ 1]: Fact=10: 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, +[ 2]: Fact=31: 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, +[ 3]: Fact=20: 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, +[ 4]: Fact= 9: 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, +[ 5]: Fact=30: 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, +[ 6]: Fact=19: 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 
19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, +[ 7]: Fact= 8: 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, +[ 8]: Fact=29: 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, +[ 9]: Fact=18: 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, +[10]: Fact= 7: 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, +[11]: Fact=28: 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, +[12]: Fact=17: 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, +[13]: Fact= 6: 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, +[14]: Fact=27: 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, +[15]: Fact=16: 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, +[16]: Fact= 5: 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, +[17]: Fact=26: 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, +[18]: Fact=15: 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, +[19]: Fact= 4: 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, +[20]: Fact=25: 
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, +[21]: Fact=14: 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, +[22]: Fact= 3: 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, +[23]: Fact=24: 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, +[24]: Fact=13: 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, +[25]: Fact= 2: 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, +[26]: Fact=23: 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, +[27]: Fact=12: 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, +[28]: Fact= 1: 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, +[29]: Fact=22: 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, +[30]: Fact=11: 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, +[31]: Fact= 0: 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, +--- 32x32, Mode=33 [F]--- +[ 0]: Fact=26: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, +[ 1]: Fact=20: 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 
19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, +[ 2]: Fact=14: 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, +[ 3]: Fact= 8: 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, +[ 4]: Fact= 2: 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, +[ 5]: Fact=28: 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, +[ 6]: Fact=22: 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, +[ 7]: Fact=16: 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, +[ 8]: Fact=10: 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, +[ 9]: Fact= 4: 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, +[10]: Fact=30: 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, +[11]: Fact=24: 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, +[12]: Fact=18: 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, +[13]: Fact=12: 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, +[14]: Fact= 6: 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, +[15]: Fact= 0: 14, 15, 16, 17, 18, 19, 20, 
21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, +[16]: Fact=26: 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, +[17]: Fact=20: 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, +[18]: Fact=14: 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, +[19]: Fact= 8: 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, +[20]: Fact= 2: 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, +[21]: Fact=28: 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, +[22]: Fact=22: 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, +[23]: Fact=16: 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, +[24]: Fact=10: 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, +[25]: Fact= 4: 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, +[26]: Fact=30: 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, +[27]: Fact=24: 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, +[28]: Fact=18: 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 
51, 52, 53, 54, 55, 56, +[29]: Fact=12: 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, +[30]: Fact= 6: 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, +[31]: Fact= 0: 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, +--- 32x32, Mode=34 [F]--- +[ 0]: Fact= 0: 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, +[ 1]: Fact= 0: 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, +[ 2]: Fact= 0: 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, +[ 3]: Fact= 0: 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, +[ 4]: Fact= 0: 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, +[ 5]: Fact= 0: 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, +[ 6]: Fact= 0: 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, +[ 7]: Fact= 0: 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, +[ 8]: Fact= 0: 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, +[ 9]: Fact= 0: 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, +[10]: Fact= 0: 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 
34, 35, 36, 37, 38, 39, 40, 41, 42, 43, +[11]: Fact= 0: 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, +[12]: Fact= 0: 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, +[13]: Fact= 0: 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, +[14]: Fact= 0: 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, +[15]: Fact= 0: 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, +[16]: Fact= 0: 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, +[17]: Fact= 0: 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, +[18]: Fact= 0: 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, +[19]: Fact= 0: 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, +[20]: Fact= 0: 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, +[21]: Fact= 0: 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, +[22]: Fact= 0: 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, +[23]: Fact= 0: 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, +[24]: Fact= 0: 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 
44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, +[25]: Fact= 0: 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, +[26]: Fact= 0: 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, +[27]: Fact= 0: 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, +[28]: Fact= 0: 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, +[29]: Fact= 0: 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, +[30]: Fact= 0: 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, +[31]: Fact= 0: 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, diff --git a/doc/intra/intra-4x4.txt b/doc/intra/intra-4x4.txt new file mode 100644 index 0000000..1609d0d --- /dev/null +++ b/doc/intra/intra-4x4.txt @@ -0,0 +1,166 @@ +--- 4x4, Mode= 2 --- +[ 0]: Fact= 0: -2, -3, -4, -5, * +[ 1]: Fact= 0: -3, -4, -5, -6, * +[ 2]: Fact= 0: -4, -5, -6, -7, * +[ 3]: Fact= 0: -5, -6, -7, -8, * +--- 4x4, Mode= 3 --- +[ 0]: Fact=26: -1, -2, -3, -4, -5, * +[ 1]: Fact=20: -2, -3, -4, -5, -6, * +[ 2]: Fact=14: -3, -4, -5, -6, -7, * +[ 3]: Fact= 8: -4, -5, -6, -7, -8, * +--- 4x4, Mode= 4 --- +[ 0]: Fact=21: -1, -2, -3, -4, -5, * +[ 1]: Fact=10: -2, -3, -4, -5, -6, * +[ 2]: Fact=31: -2, -3, -4, -5, -6, * +[ 3]: Fact=20: -3, -4, -5, -6, -7, * +--- 4x4, Mode= 5 --- +[ 0]: Fact=17: -1, -2, -3, -4, -5, * +[ 1]: Fact= 2: -2, -3, -4, -5, -6, * +[ 2]: Fact=19: -2, -3, -4, -5, -6, * +[ 3]: Fact= 4: -3, -4, -5, -6, -7, * +--- 4x4, Mode= 6 --- +[ 0]: Fact=13: -1, -2, -3, -4, -5, * +[ 1]: Fact=26: -1, -2, 
-3, -4, -5, * +[ 2]: Fact= 7: -2, -3, -4, -5, -6, * +[ 3]: Fact=20: -2, -3, -4, -5, -6, * +--- 4x4, Mode= 7 --- +[ 0]: Fact= 9: -1, -2, -3, -4, -5, * +[ 1]: Fact=18: -1, -2, -3, -4, -5, * +[ 2]: Fact=27: -1, -2, -3, -4, -5, * +[ 3]: Fact= 4: -2, -3, -4, -5, -6, * +--- 4x4, Mode= 8 --- +[ 0]: Fact= 5: -1, -2, -3, -4, -5, * +[ 1]: Fact=10: -1, -2, -3, -4, -5, * +[ 2]: Fact=15: -1, -2, -3, -4, -5, * +[ 3]: Fact=20: -1, -2, -3, -4, -5, * +--- 4x4, Mode= 9 --- +[ 0]: Fact= 2: -1, -2, -3, -4, -5, * +[ 1]: Fact= 4: -1, -2, -3, -4, -5, * +[ 2]: Fact= 6: -1, -2, -3, -4, -5, * +[ 3]: Fact= 8: -1, -2, -3, -4, -5, * +--- 4x4, Mode=10 --- +[ 0]: Fact= 0: -1, -2, -3, -4, * +[ 1]: Fact= 0: -1, -2, -3, -4, * +[ 2]: Fact= 0: -1, -2, -3, -4, * +[ 3]: Fact= 0: -1, -2, -3, -4, * +--- 4x4, Mode=11 --- +[ 0]: Fact=30: 0, -1, -2, -3, -4, * +[ 1]: Fact=28: 0, -1, -2, -3, -4, * +[ 2]: Fact=26: 0, -1, -2, -3, -4, * +[ 3]: Fact=24: 0, -1, -2, -3, -4, * +--- 4x4, Mode=12 --- +[ 0]: Fact=27: 0, -1, -2, -3, -4, * +[ 1]: Fact=22: 0, -1, -2, -3, -4, * +[ 2]: Fact=17: 0, -1, -2, -3, -4, * +[ 3]: Fact=12: 0, -1, -2, -3, -4, * +--- 4x4, Mode=13 --- +[ 0]: Fact=23: 0, -1, -2, -3, -4, * +[ 1]: Fact=14: 0, -1, -2, -3, -4, * +[ 2]: Fact= 5: 0, -1, -2, -3, -4, * +[ 3]: Fact=28: 4, 0, -1, -2, -3, * +--- 4x4, Mode=14 --- +[ 0]: Fact=19: 0, -1, -2, -3, -4, * +[ 1]: Fact= 6: 0, -1, -2, -3, -4, * +[ 2]: Fact=25: 2, 0, -1, -2, -3, * +[ 3]: Fact=12: 2, 0, -1, -2, -3, * +--- 4x4, Mode=15 --- +[ 0]: Fact=15: 0, -1, -2, -3, -4, * +[ 1]: Fact=30: 2, 0, -1, -2, -3, * +[ 2]: Fact=13: 2, 0, -1, -2, -3, * +[ 3]: Fact=28: 4, 2, 0, -1, -2, * +--- 4x4, Mode=16 --- +[ 0]: Fact=11: 0, -1, -2, -3, -4, * +[ 1]: Fact=22: 2, 0, -1, -2, -3, * +[ 2]: Fact= 1: 2, 0, -1, -2, -3, * +[ 3]: Fact=12: 3, 2, 0, -1, -2, * +--- 4x4, Mode=17 --- +[ 0]: Fact= 6: 0, -1, -2, -3, -4, * +[ 1]: Fact=12: 1, 0, -1, -2, -3, * +[ 2]: Fact=18: 2, 1, 0, -1, -2, * +[ 3]: Fact=24: 4, 2, 1, 0, -1, * +--- 4x4, Mode=18 --- +[ 0]: Fact= 0: 0, 1, 2, 3, * +[ 
1]: Fact= 0: -1, 0, 1, 2, * +[ 2]: Fact= 0: -2, -1, 0, 1, * +[ 3]: Fact= 0: -3, -2, -1, 0, * +--- 4x4, Mode=19 --- +[ 0]: Fact= 6: 0, 1, 2, 3, 4, * +[ 1]: Fact=12: -1, 0, 1, 2, 3, * +[ 2]: Fact=18: -2, -1, 0, 1, 2, * +[ 3]: Fact=24: -4, -2, -1, 0, 1, * +--- 4x4, Mode=20 --- +[ 0]: Fact=11: 0, 1, 2, 3, 4, * +[ 1]: Fact=22: -2, 0, 1, 2, 3, * +[ 2]: Fact= 1: -2, 0, 1, 2, 3, * +[ 3]: Fact=12: -3, -2, 0, 1, 2, * +--- 4x4, Mode=21 --- +[ 0]: Fact=15: 0, 1, 2, 3, 4, * +[ 1]: Fact=30: -2, 0, 1, 2, 3, * +[ 2]: Fact=13: -2, 0, 1, 2, 3, * +[ 3]: Fact=28: -4, -2, 0, 1, 2, * +--- 4x4, Mode=22 --- +[ 0]: Fact=19: 0, 1, 2, 3, 4, * +[ 1]: Fact= 6: 0, 1, 2, 3, 4, * +[ 2]: Fact=25: -2, 0, 1, 2, 3, * +[ 3]: Fact=12: -2, 0, 1, 2, 3, * +--- 4x4, Mode=23 --- +[ 0]: Fact=23: 0, 1, 2, 3, 4, * +[ 1]: Fact=14: 0, 1, 2, 3, 4, * +[ 2]: Fact= 5: 0, 1, 2, 3, 4, * +[ 3]: Fact=28: -4, 0, 1, 2, 3, * +--- 4x4, Mode=24 --- +[ 0]: Fact=27: 0, 1, 2, 3, 4, * +[ 1]: Fact=22: 0, 1, 2, 3, 4, * +[ 2]: Fact=17: 0, 1, 2, 3, 4, * +[ 3]: Fact=12: 0, 1, 2, 3, 4, * +--- 4x4, Mode=25 --- +[ 0]: Fact=30: 0, 1, 2, 3, 4, * +[ 1]: Fact=28: 0, 1, 2, 3, 4, * +[ 2]: Fact=26: 0, 1, 2, 3, 4, * +[ 3]: Fact=24: 0, 1, 2, 3, 4, * +--- 4x4, Mode=26 --- +[ 0]: Fact= 0: 1, 2, 3, 4, * +[ 1]: Fact= 0: 1, 2, 3, 4, * +[ 2]: Fact= 0: 1, 2, 3, 4, * +[ 3]: Fact= 0: 1, 2, 3, 4, * +--- 4x4, Mode=27 --- +[ 0]: Fact= 2: 1, 2, 3, 4, 5, * +[ 1]: Fact= 4: 1, 2, 3, 4, 5, * +[ 2]: Fact= 6: 1, 2, 3, 4, 5, * +[ 3]: Fact= 8: 1, 2, 3, 4, 5, * +--- 4x4, Mode=28 --- +[ 0]: Fact= 5: 1, 2, 3, 4, 5, * +[ 1]: Fact=10: 1, 2, 3, 4, 5, * +[ 2]: Fact=15: 1, 2, 3, 4, 5, * +[ 3]: Fact=20: 1, 2, 3, 4, 5, * +--- 4x4, Mode=29 --- +[ 0]: Fact= 9: 1, 2, 3, 4, 5, * +[ 1]: Fact=18: 1, 2, 3, 4, 5, * +[ 2]: Fact=27: 1, 2, 3, 4, 5, * +[ 3]: Fact= 4: 2, 3, 4, 5, 6, * +--- 4x4, Mode=30 --- +[ 0]: Fact=13: 1, 2, 3, 4, 5, * +[ 1]: Fact=26: 1, 2, 3, 4, 5, * +[ 2]: Fact= 7: 2, 3, 4, 5, 6, * +[ 3]: Fact=20: 2, 3, 4, 5, 6, * +--- 4x4, Mode=31 --- +[ 0]: Fact=17: 1, 2, 3, 4, 5, 
* +[ 1]: Fact= 2: 2, 3, 4, 5, 6, * +[ 2]: Fact=19: 2, 3, 4, 5, 6, * +[ 3]: Fact= 4: 3, 4, 5, 6, 7, * +--- 4x4, Mode=32 --- +[ 0]: Fact=21: 1, 2, 3, 4, 5, * +[ 1]: Fact=10: 2, 3, 4, 5, 6, * +[ 2]: Fact=31: 2, 3, 4, 5, 6, * +[ 3]: Fact=20: 3, 4, 5, 6, 7, * +--- 4x4, Mode=33 --- +[ 0]: Fact=26: 1, 2, 3, 4, 5, * +[ 1]: Fact=20: 2, 3, 4, 5, 6, * +[ 2]: Fact=14: 3, 4, 5, 6, 7, * +[ 3]: Fact= 8: 4, 5, 6, 7, 8, * +--- 4x4, Mode=34 --- +[ 0]: Fact= 0: 2, 3, 4, 5, * +[ 1]: Fact= 0: 3, 4, 5, 6, * +[ 2]: Fact= 0: 4, 5, 6, 7, * +[ 3]: Fact= 0: 5, 6, 7, 8, * + diff --git a/doc/intra/intra-8x8.txt b/doc/intra/intra-8x8.txt new file mode 100644 index 0000000..7c6c147 --- /dev/null +++ b/doc/intra/intra-8x8.txt @@ -0,0 +1,298 @@ +--- 8x8, Mode= 2 --- +[ 0]: Fact= 0: -2, -3, -4, -5, -6, -7, -8, -9, * +[ 1]: Fact= 0: -3, -4, -5, -6, -7, -8, -9,-10, * +[ 2]: Fact= 0: -4, -5, -6, -7, -8, -9,-10,-11, * +[ 3]: Fact= 0: -5, -6, -7, -8, -9,-10,-11,-12, * +[ 4]: Fact= 0: -6, -7, -8, -9,-10,-11,-12,-13, * +[ 5]: Fact= 0: -7, -8, -9,-10,-11,-12,-13,-14, * +[ 6]: Fact= 0: -8, -9,-10,-11,-12,-13,-14,-15, * +[ 7]: Fact= 0: -9,-10,-11,-12,-13,-14,-15,-16, * +--- 8x8, Mode= 3 --- +[ 0]: Fact=26: -1, -2, -3, -4, -5, -6, -7, -8, -9, * +[ 1]: Fact=20: -2, -3, -4, -5, -6, -7, -8, -9,-10, * +[ 2]: Fact=14: -3, -4, -5, -6, -7, -8, -9,-10,-11, * +[ 3]: Fact= 8: -4, -5, -6, -7, -8, -9,-10,-11,-12, * +[ 4]: Fact= 2: -5, -6, -7, -8, -9,-10,-11,-12,-13, * +[ 5]: Fact=28: -5, -6, -7, -8, -9,-10,-11,-12,-13, * +[ 6]: Fact=22: -6, -7, -8, -9,-10,-11,-12,-13,-14, * +[ 7]: Fact=16: -7, -8, -9,-10,-11,-12,-13,-14,-15, * +--- 8x8, Mode= 4 --- +[ 0]: Fact=21: -1, -2, -3, -4, -5, -6, -7, -8, -9, * +[ 1]: Fact=10: -2, -3, -4, -5, -6, -7, -8, -9,-10, * +[ 2]: Fact=31: -2, -3, -4, -5, -6, -7, -8, -9,-10, * +[ 3]: Fact=20: -3, -4, -5, -6, -7, -8, -9,-10,-11, * +[ 4]: Fact= 9: -4, -5, -6, -7, -8, -9,-10,-11,-12, * +[ 5]: Fact=30: -4, -5, -6, -7, -8, -9,-10,-11,-12, * +[ 6]: Fact=19: -5, -6, -7, -8, -9,-10,-11,-12,-13, * 
+[ 7]: Fact= 8: -6, -7, -8, -9,-10,-11,-12,-13,-14, * +--- 8x8, Mode= 5 --- +[ 0]: Fact=17: -1, -2, -3, -4, -5, -6, -7, -8, -9, * +[ 1]: Fact= 2: -2, -3, -4, -5, -6, -7, -8, -9,-10, * +[ 2]: Fact=19: -2, -3, -4, -5, -6, -7, -8, -9,-10, * +[ 3]: Fact= 4: -3, -4, -5, -6, -7, -8, -9,-10,-11, * +[ 4]: Fact=21: -3, -4, -5, -6, -7, -8, -9,-10,-11, * +[ 5]: Fact= 6: -4, -5, -6, -7, -8, -9,-10,-11,-12, * +[ 6]: Fact=23: -4, -5, -6, -7, -8, -9,-10,-11,-12, * +[ 7]: Fact= 8: -5, -6, -7, -8, -9,-10,-11,-12,-13, * +--- 8x8, Mode= 6 --- +[ 0]: Fact=13: -1, -2, -3, -4, -5, -6, -7, -8, -9, * +[ 1]: Fact=26: -1, -2, -3, -4, -5, -6, -7, -8, -9, * +[ 2]: Fact= 7: -2, -3, -4, -5, -6, -7, -8, -9,-10, * +[ 3]: Fact=20: -2, -3, -4, -5, -6, -7, -8, -9,-10, * +[ 4]: Fact= 1: -3, -4, -5, -6, -7, -8, -9,-10,-11, * +[ 5]: Fact=14: -3, -4, -5, -6, -7, -8, -9,-10,-11, * +[ 6]: Fact=27: -3, -4, -5, -6, -7, -8, -9,-10,-11, * +[ 7]: Fact= 8: -4, -5, -6, -7, -8, -9,-10,-11,-12, * +--- 8x8, Mode= 7 --- +[ 0]: Fact= 9: -1, -2, -3, -4, -5, -6, -7, -8, -9, * +[ 1]: Fact=18: -1, -2, -3, -4, -5, -6, -7, -8, -9, * +[ 2]: Fact=27: -1, -2, -3, -4, -5, -6, -7, -8, -9, * +[ 3]: Fact= 4: -2, -3, -4, -5, -6, -7, -8, -9,-10, * +[ 4]: Fact=13: -2, -3, -4, -5, -6, -7, -8, -9,-10, * +[ 5]: Fact=22: -2, -3, -4, -5, -6, -7, -8, -9,-10, * +[ 6]: Fact=31: -2, -3, -4, -5, -6, -7, -8, -9,-10, * +[ 7]: Fact= 8: -3, -4, -5, -6, -7, -8, -9,-10,-11, * +--- 8x8, Mode= 8 --- +[ 0]: Fact= 5: -1, -2, -3, -4, -5, -6, -7, -8, -9, * +[ 1]: Fact=10: -1, -2, -3, -4, -5, -6, -7, -8, -9, * +[ 2]: Fact=15: -1, -2, -3, -4, -5, -6, -7, -8, -9, * +[ 3]: Fact=20: -1, -2, -3, -4, -5, -6, -7, -8, -9, * +[ 4]: Fact=25: -1, -2, -3, -4, -5, -6, -7, -8, -9, * +[ 5]: Fact=30: -1, -2, -3, -4, -5, -6, -7, -8, -9, * +[ 6]: Fact= 3: -2, -3, -4, -5, -6, -7, -8, -9,-10, * +[ 7]: Fact= 8: -2, -3, -4, -5, -6, -7, -8, -9,-10, * +--- 8x8, Mode= 9 --- +[ 0]: Fact= 2: -1, -2, -3, -4, -5, -6, -7, -8, -9, * +[ 1]: Fact= 4: -1, -2, -3, -4, -5, -6, -7, -8, -9, * 
+[ 2]: Fact= 6: -1, -2, -3, -4, -5, -6, -7, -8, -9, * +[ 3]: Fact= 8: -1, -2, -3, -4, -5, -6, -7, -8, -9, * +[ 4]: Fact=10: -1, -2, -3, -4, -5, -6, -7, -8, -9, * +[ 5]: Fact=12: -1, -2, -3, -4, -5, -6, -7, -8, -9, * +[ 6]: Fact=14: -1, -2, -3, -4, -5, -6, -7, -8, -9, * +[ 7]: Fact=16: -1, -2, -3, -4, -5, -6, -7, -8, -9, * +--- 8x8, Mode=10 --- +[ 0]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, * +[ 1]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, * +[ 2]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, * +[ 3]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, * +[ 4]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, * +[ 5]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, * +[ 6]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, * +[ 7]: Fact= 0: -1, -2, -3, -4, -5, -6, -7, -8, * +--- 8x8, Mode=11 --- +[ 0]: Fact=30: 0, -1, -2, -3, -4, -5, -6, -7, -8, * +[ 1]: Fact=28: 0, -1, -2, -3, -4, -5, -6, -7, -8, * +[ 2]: Fact=26: 0, -1, -2, -3, -4, -5, -6, -7, -8, * +[ 3]: Fact=24: 0, -1, -2, -3, -4, -5, -6, -7, -8, * +[ 4]: Fact=22: 0, -1, -2, -3, -4, -5, -6, -7, -8, * +[ 5]: Fact=20: 0, -1, -2, -3, -4, -5, -6, -7, -8, * +[ 6]: Fact=18: 0, -1, -2, -3, -4, -5, -6, -7, -8, * +[ 7]: Fact=16: 0, -1, -2, -3, -4, -5, -6, -7, -8, * +--- 8x8, Mode=12 --- +[ 0]: Fact=27: 0, -1, -2, -3, -4, -5, -6, -7, -8, * +[ 1]: Fact=22: 0, -1, -2, -3, -4, -5, -6, -7, -8, * +[ 2]: Fact=17: 0, -1, -2, -3, -4, -5, -6, -7, -8, * +[ 3]: Fact=12: 0, -1, -2, -3, -4, -5, -6, -7, -8, * +[ 4]: Fact= 7: 0, -1, -2, -3, -4, -5, -6, -7, -8, * +[ 5]: Fact= 2: 0, -1, -2, -3, -4, -5, -6, -7, -8, * +[ 6]: Fact=29: 6, 0, -1, -2, -3, -4, -5, -6, -7, * +[ 7]: Fact=24: 6, 0, -1, -2, -3, -4, -5, -6, -7, * +--- 8x8, Mode=13 --- +[ 0]: Fact=23: 0, -1, -2, -3, -4, -5, -6, -7, -8, * +[ 1]: Fact=14: 0, -1, -2, -3, -4, -5, -6, -7, -8, * +[ 2]: Fact= 5: 0, -1, -2, -3, -4, -5, -6, -7, -8, * +[ 3]: Fact=28: 4, 0, -1, -2, -3, -4, -5, -6, -7, * +[ 4]: Fact=19: 4, 0, -1, -2, -3, -4, -5, -6, -7, * +[ 5]: Fact=10: 4, 0, -1, -2, -3, -4, -5, -6, -7, * +[ 6]: Fact= 1: 4, 0, -1, 
-2, -3, -4, -5, -6, -7, * +[ 7]: Fact=24: 7, 4, 0, -1, -2, -3, -4, -5, -6, * +--- 8x8, Mode=14 --- +[ 0]: Fact=19: 0, -1, -2, -3, -4, -5, -6, -7, -8, * +[ 1]: Fact= 6: 0, -1, -2, -3, -4, -5, -6, -7, -8, * +[ 2]: Fact=25: 2, 0, -1, -2, -3, -4, -5, -6, -7, * +[ 3]: Fact=12: 2, 0, -1, -2, -3, -4, -5, -6, -7, * +[ 4]: Fact=31: 5, 2, 0, -1, -2, -3, -4, -5, -6, * +[ 5]: Fact=18: 5, 2, 0, -1, -2, -3, -4, -5, -6, * +[ 6]: Fact= 5: 5, 2, 0, -1, -2, -3, -4, -5, -6, * +[ 7]: Fact=24: 7, 5, 2, 0, -1, -2, -3, -4, -5, * +--- 8x8, Mode=15 --- +[ 0]: Fact=15: 0, -1, -2, -3, -4, -5, -6, -7, -8, * +[ 1]: Fact=30: 2, 0, -1, -2, -3, -4, -5, -6, -7, * +[ 2]: Fact=13: 2, 0, -1, -2, -3, -4, -5, -6, -7, * +[ 3]: Fact=28: 4, 2, 0, -1, -2, -3, -4, -5, -6, * +[ 4]: Fact=11: 4, 2, 0, -1, -2, -3, -4, -5, -6, * +[ 5]: Fact=26: 6, 4, 2, 0, -1, -2, -3, -4, -5, * +[ 6]: Fact= 9: 6, 4, 2, 0, -1, -2, -3, -4, -5, * +[ 7]: Fact=24: 8, 6, 4, 2, 0, -1, -2, -3, -4, * +--- 8x8, Mode=16 --- +[ 0]: Fact=11: 0, -1, -2, -3, -4, -5, -6, -7, -8, * +[ 1]: Fact=22: 2, 0, -1, -2, -3, -4, -5, -6, -7, * +[ 2]: Fact= 1: 2, 0, -1, -2, -3, -4, -5, -6, -7, * +[ 3]: Fact=12: 3, 2, 0, -1, -2, -3, -4, -5, -6, * +[ 4]: Fact=23: 5, 3, 2, 0, -1, -2, -3, -4, -5, * +[ 5]: Fact= 2: 5, 3, 2, 0, -1, -2, -3, -4, -5, * +[ 6]: Fact=13: 6, 5, 3, 2, 0, -1, -2, -3, -4, * +[ 7]: Fact=24: 8, 6, 5, 3, 2, 0, -1, -2, -3, * +--- 8x8, Mode=17 --- +[ 0]: Fact= 6: 0, -1, -2, -3, -4, -5, -6, -7, -8, x +[ 1]: Fact=12: 1, 0, -1, -2, -3, -4, -5, -6, -7, * +[ 2]: Fact=18: 2, 1, 0, -1, -2, -3, -4, -5, -6, * +[ 3]: Fact=24: 4, 2, 1, 0, -1, -2, -3, -4, -5, * +[ 4]: Fact=30: 5, 4, 2, 1, 0, -1, -2, -3, -4, * +[ 5]: Fact= 4: 5, 4, 2, 1, 0, -1, -2, -3, -4, * +[ 6]: Fact=10: 6, 5, 4, 2, 1, 0, -1, -2, -3, * +[ 7]: Fact=16: 7, 6, 5, 4, 2, 1, 0, -1, -2, * +--- 8x8, Mode=18 --- +[ 0]: Fact= 0: 0, 1, 2, 3, 4, 5, 6, 7, * +[ 1]: Fact= 0: -1, 0, 1, 2, 3, 4, 5, 6, * +[ 2]: Fact= 0: -2, -1, 0, 1, 2, 3, 4, 5, * +[ 3]: Fact= 0: -3, -2, -1, 0, 1, 2, 3, 4, * +[ 4]: Fact= 
0: -4, -3, -2, -1, 0, 1, 2, 3, * +[ 5]: Fact= 0: -5, -4, -3, -2, -1, 0, 1, 2, * +[ 6]: Fact= 0: -6, -5, -4, -3, -2, -1, 0, 1, * +[ 7]: Fact= 0: -7, -6, -5, -4, -3, -2, -1, 0, * +--- 8x8, Mode=19 --- +[ 0]: Fact= 6: 0, 1, 2, 3, 4, 5, 6, 7, 8, x +[ 1]: Fact=12: -1, 0, 1, 2, 3, 4, 5, 6, 7, * +[ 2]: Fact=18: -2, -1, 0, 1, 2, 3, 4, 5, 6, * +[ 3]: Fact=24: -4, -2, -1, 0, 1, 2, 3, 4, 5, * +[ 4]: Fact=30: -5, -4, -2, -1, 0, 1, 2, 3, 4, * +[ 5]: Fact= 4: -5, -4, -2, -1, 0, 1, 2, 3, 4, * +[ 6]: Fact=10: -6, -5, -4, -2, -1, 0, 1, 2, 3, * +[ 7]: Fact=16: -7, -6, -5, -4, -2, -1, 0, 1, 2, * +--- 8x8, Mode=20 --- +[ 0]: Fact=11: 0, 1, 2, 3, 4, 5, 6, 7, 8, * +[ 1]: Fact=22: -2, 0, 1, 2, 3, 4, 5, 6, 7, * +[ 2]: Fact= 1: -2, 0, 1, 2, 3, 4, 5, 6, 7, * +[ 3]: Fact=12: -3, -2, 0, 1, 2, 3, 4, 5, 6, * +[ 4]: Fact=23: -5, -3, -2, 0, 1, 2, 3, 4, 5, * +[ 5]: Fact= 2: -5, -3, -2, 0, 1, 2, 3, 4, 5, * +[ 6]: Fact=13: -6, -5, -3, -2, 0, 1, 2, 3, 4, * +[ 7]: Fact=24: -8, -6, -5, -3, -2, 0, 1, 2, 3, * +--- 8x8, Mode=21 --- +[ 0]: Fact=15: 0, 1, 2, 3, 4, 5, 6, 7, 8, * +[ 1]: Fact=30: -2, 0, 1, 2, 3, 4, 5, 6, 7, * +[ 2]: Fact=13: -2, 0, 1, 2, 3, 4, 5, 6, 7, * +[ 3]: Fact=28: -4, -2, 0, 1, 2, 3, 4, 5, 6, * +[ 4]: Fact=11: -4, -2, 0, 1, 2, 3, 4, 5, 6, * +[ 5]: Fact=26: -6, -4, -2, 0, 1, 2, 3, 4, 5, * +[ 6]: Fact= 9: -6, -4, -2, 0, 1, 2, 3, 4, 5, * +[ 7]: Fact=24: -8, -6, -4, -2, 0, 1, 2, 3, 4, * +--- 8x8, Mode=22 --- +[ 0]: Fact=19: 0, 1, 2, 3, 4, 5, 6, 7, 8, * +[ 1]: Fact= 6: 0, 1, 2, 3, 4, 5, 6, 7, 8, * +[ 2]: Fact=25: -2, 0, 1, 2, 3, 4, 5, 6, 7, * +[ 3]: Fact=12: -2, 0, 1, 2, 3, 4, 5, 6, 7, * +[ 4]: Fact=31: -5, -2, 0, 1, 2, 3, 4, 5, 6, * +[ 5]: Fact=18: -5, -2, 0, 1, 2, 3, 4, 5, 6, * +[ 6]: Fact= 5: -5, -2, 0, 1, 2, 3, 4, 5, 6, * +[ 7]: Fact=24: -7, -5, -2, 0, 1, 2, 3, 4, 5, * +--- 8x8, Mode=23 --- +[ 0]: Fact=23: 0, 1, 2, 3, 4, 5, 6, 7, 8, * +[ 1]: Fact=14: 0, 1, 2, 3, 4, 5, 6, 7, 8, * +[ 2]: Fact= 5: 0, 1, 2, 3, 4, 5, 6, 7, 8, * +[ 3]: Fact=28: -4, 0, 1, 2, 3, 4, 5, 6, 7, * +[ 4]: Fact=19: -4, 
0, 1, 2, 3, 4, 5, 6, 7, * +[ 5]: Fact=10: -4, 0, 1, 2, 3, 4, 5, 6, 7, * +[ 6]: Fact= 1: -4, 0, 1, 2, 3, 4, 5, 6, 7, * +[ 7]: Fact=24: -7, -4, 0, 1, 2, 3, 4, 5, 6, * +--- 8x8, Mode=24 --- +[ 0]: Fact=27: 0, 1, 2, 3, 4, 5, 6, 7, 8, * +[ 1]: Fact=22: 0, 1, 2, 3, 4, 5, 6, 7, 8, * +[ 2]: Fact=17: 0, 1, 2, 3, 4, 5, 6, 7, 8, * +[ 3]: Fact=12: 0, 1, 2, 3, 4, 5, 6, 7, 8, * +[ 4]: Fact= 7: 0, 1, 2, 3, 4, 5, 6, 7, 8, * +[ 5]: Fact= 2: 0, 1, 2, 3, 4, 5, 6, 7, 8, * +[ 6]: Fact=29: -6, 0, 1, 2, 3, 4, 5, 6, 7, * +[ 7]: Fact=24: -6, 0, 1, 2, 3, 4, 5, 6, 7, * +--- 8x8, Mode=25 --- +[ 0]: Fact=30: 0, 1, 2, 3, 4, 5, 6, 7, 8, * +[ 1]: Fact=28: 0, 1, 2, 3, 4, 5, 6, 7, 8, * +[ 2]: Fact=26: 0, 1, 2, 3, 4, 5, 6, 7, 8, * +[ 3]: Fact=24: 0, 1, 2, 3, 4, 5, 6, 7, 8, * +[ 4]: Fact=22: 0, 1, 2, 3, 4, 5, 6, 7, 8, * +[ 5]: Fact=20: 0, 1, 2, 3, 4, 5, 6, 7, 8, * +[ 6]: Fact=18: 0, 1, 2, 3, 4, 5, 6, 7, 8, * +[ 7]: Fact=16: 0, 1, 2, 3, 4, 5, 6, 7, 8, * +--- 8x8, Mode=26 --- +[ 0]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, * +[ 1]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, * +[ 2]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, * +[ 3]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, * +[ 4]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, * +[ 5]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, * +[ 6]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, * +[ 7]: Fact= 0: 1, 2, 3, 4, 5, 6, 7, 8, * +--- 8x8, Mode=27 --- +[ 0]: Fact= 2: 1, 2, 3, 4, 5, 6, 7, 8, 9, * +[ 1]: Fact= 4: 1, 2, 3, 4, 5, 6, 7, 8, 9, * +[ 2]: Fact= 6: 1, 2, 3, 4, 5, 6, 7, 8, 9, * +[ 3]: Fact= 8: 1, 2, 3, 4, 5, 6, 7, 8, 9, * +[ 4]: Fact=10: 1, 2, 3, 4, 5, 6, 7, 8, 9, * +[ 5]: Fact=12: 1, 2, 3, 4, 5, 6, 7, 8, 9, * +[ 6]: Fact=14: 1, 2, 3, 4, 5, 6, 7, 8, 9, * +[ 7]: Fact=16: 1, 2, 3, 4, 5, 6, 7, 8, 9, * +--- 8x8, Mode=28 --- +[ 0]: Fact= 5: 1, 2, 3, 4, 5, 6, 7, 8, 9, * +[ 1]: Fact=10: 1, 2, 3, 4, 5, 6, 7, 8, 9, * +[ 2]: Fact=15: 1, 2, 3, 4, 5, 6, 7, 8, 9, * +[ 3]: Fact=20: 1, 2, 3, 4, 5, 6, 7, 8, 9, * +[ 4]: Fact=25: 1, 2, 3, 4, 5, 6, 7, 8, 9, * +[ 5]: Fact=30: 1, 2, 3, 4, 5, 6, 7, 8, 9, * +[ 6]: Fact= 3: 2, 3, 4, 5, 6, 7, 8, 
9, 10, * +[ 7]: Fact= 8: 2, 3, 4, 5, 6, 7, 8, 9, 10, * +--- 8x8, Mode=29 --- +[ 0]: Fact= 9: 1, 2, 3, 4, 5, 6, 7, 8, 9, * +[ 1]: Fact=18: 1, 2, 3, 4, 5, 6, 7, 8, 9, * +[ 2]: Fact=27: 1, 2, 3, 4, 5, 6, 7, 8, 9, * +[ 3]: Fact= 4: 2, 3, 4, 5, 6, 7, 8, 9, 10, * +[ 4]: Fact=13: 2, 3, 4, 5, 6, 7, 8, 9, 10, * +[ 5]: Fact=22: 2, 3, 4, 5, 6, 7, 8, 9, 10, * +[ 6]: Fact=31: 2, 3, 4, 5, 6, 7, 8, 9, 10, * +[ 7]: Fact= 8: 3, 4, 5, 6, 7, 8, 9, 10, 11, * +--- 8x8, Mode=30 --- +[ 0]: Fact=13: 1, 2, 3, 4, 5, 6, 7, 8, 9, * +[ 1]: Fact=26: 1, 2, 3, 4, 5, 6, 7, 8, 9, * +[ 2]: Fact= 7: 2, 3, 4, 5, 6, 7, 8, 9, 10, * +[ 3]: Fact=20: 2, 3, 4, 5, 6, 7, 8, 9, 10, * +[ 4]: Fact= 1: 3, 4, 5, 6, 7, 8, 9, 10, 11, * +[ 5]: Fact=14: 3, 4, 5, 6, 7, 8, 9, 10, 11, * +[ 6]: Fact=27: 3, 4, 5, 6, 7, 8, 9, 10, 11, * +[ 7]: Fact= 8: 4, 5, 6, 7, 8, 9, 10, 11, 12, * +--- 8x8, Mode=31 --- +[ 0]: Fact=17: 1, 2, 3, 4, 5, 6, 7, 8, 9, * +[ 1]: Fact= 2: 2, 3, 4, 5, 6, 7, 8, 9, 10, * +[ 2]: Fact=19: 2, 3, 4, 5, 6, 7, 8, 9, 10, * +[ 3]: Fact= 4: 3, 4, 5, 6, 7, 8, 9, 10, 11, * +[ 4]: Fact=21: 3, 4, 5, 6, 7, 8, 9, 10, 11, * +[ 5]: Fact= 6: 4, 5, 6, 7, 8, 9, 10, 11, 12, * +[ 6]: Fact=23: 4, 5, 6, 7, 8, 9, 10, 11, 12, * +[ 7]: Fact= 8: 5, 6, 7, 8, 9, 10, 11, 12, 13, * +--- 8x8, Mode=32 --- +[ 0]: Fact=21: 1, 2, 3, 4, 5, 6, 7, 8, 9, * +[ 1]: Fact=10: 2, 3, 4, 5, 6, 7, 8, 9, 10, * +[ 2]: Fact=31: 2, 3, 4, 5, 6, 7, 8, 9, 10, * +[ 3]: Fact=20: 3, 4, 5, 6, 7, 8, 9, 10, 11, * +[ 4]: Fact= 9: 4, 5, 6, 7, 8, 9, 10, 11, 12, * +[ 5]: Fact=30: 4, 5, 6, 7, 8, 9, 10, 11, 12, * +[ 6]: Fact=19: 5, 6, 7, 8, 9, 10, 11, 12, 13, * +[ 7]: Fact= 8: 6, 7, 8, 9, 10, 11, 12, 13, 14, * +--- 8x8, Mode=33 --- +[ 0]: Fact=26: 1, 2, 3, 4, 5, 6, 7, 8, 9, * +[ 1]: Fact=20: 2, 3, 4, 5, 6, 7, 8, 9, 10, * +[ 2]: Fact=14: 3, 4, 5, 6, 7, 8, 9, 10, 11, * +[ 3]: Fact= 8: 4, 5, 6, 7, 8, 9, 10, 11, 12, * +[ 4]: Fact= 2: 5, 6, 7, 8, 9, 10, 11, 12, 13, * +[ 5]: Fact=28: 5, 6, 7, 8, 9, 10, 11, 12, 13, * +[ 6]: Fact=22: 6, 7, 8, 9, 10, 11, 12, 13, 14, * +[ 7]: 
Fact=16: 7, 8, 9, 10, 11, 12, 13, 14, 15, * +--- 8x8, Mode=34 --- +[ 0]: Fact= 0: 2, 3, 4, 5, 6, 7, 8, 9, * +[ 1]: Fact= 0: 3, 4, 5, 6, 7, 8, 9, 10, * +[ 2]: Fact= 0: 4, 5, 6, 7, 8, 9, 10, 11, * +[ 3]: Fact= 0: 5, 6, 7, 8, 9, 10, 11, 12, * +[ 4]: Fact= 0: 6, 7, 8, 9, 10, 11, 12, 13, * +[ 5]: Fact= 0: 7, 8, 9, 10, 11, 12, 13, 14, * +[ 6]: Fact= 0: 8, 9, 10, 11, 12, 13, 14, 15, * +[ 7]: Fact= 0: 9, 10, 11, 12, 13, 14, 15, 16, * + diff --git a/doc/reST/Makefile b/doc/reST/Makefile new file mode 100644 index 0000000..6b1d44c --- /dev/null +++ b/doc/reST/Makefile @@ -0,0 +1,97 @@ +# Makefile for (Sphinx based) restructured text documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +PAPER = + +QCOLLECTIONGENERATOR = qcollectiongenerator + +# Internal variables. +PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d build/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . + +.PHONY: help clean html web pickle htmlhelp qthelp qhc latex changes linkcheck + +help: + @echo "Please use \`make ' where is one of" + @echo " html to make standalone HTML files" + @echo " pickle to make pickle files" + @echo " json to make JSON files" + @echo " htmlhelp to make HTML files and a HTML help project" + @echo " qthelp to make HTML files and a qthelp project" + @echo " qhc to make QHC file" + @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" + @echo " changes to make an overview over all changed/added/deprecated items" + @echo " linkcheck to check all external links for integrity" + +clean: + -rm -rf build/* + +html: + mkdir -p build/html build/doctrees + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) build/html + @echo + @echo "Build finished. The HTML pages are in build/html." 
+ +zip: html + (cd build ; zip TortoiseHg.html.zip -r html) + +pickle: + mkdir -p build/pickle build/doctrees + $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) build/pickle + @echo + @echo "Build finished; now you can process the pickle files." + +web: pickle + +json: + mkdir -p build/json build/doctrees + $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) build/json + @echo + @echo "Build finished; now you can process the JSON files." + +htmlhelp: + mkdir -p build/htmlhelp build/doctrees + $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) build/htmlhelp + @echo + @echo "Build finished; now you can run HTML Help Workshop with the" \ + ".hhp project file in build/htmlhelp." + +qthelp: + mkdir -p build/qthelp build/doctrees + $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) build/qthelp + @echo + @echo "Build finished; now you can run "qcollectiongenerator" with the" \ + ".qhcp project file in build/qthelp, like this:" + @echo "# qcollectiongenerator build/qthelp/foo.qhcp" + @echo "To view the help file:" + @echo "# assistant -collectionFile build/qthelp/foo.qhc" + +qhc: qthelp + $(QCOLLECTIONGENERATOR) build/qthelp/TortoiseHg.qhcp + @echo "Build finished. To view the help file:" + @echo "# assistant -collectionFile build/qthelp/TortoiseHg.qhc" + +latex: + mkdir -p build/latex build/doctrees + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) build/latex + @echo + @echo "Build finished; the LaTeX files are in build/latex." + @echo "Run \`make all-pdf' or \`make all-ps' in that directory to" \ + "run these through (pdf)latex." + +changes: + mkdir -p build/changes build/doctrees + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) build/changes + @echo + @echo "The overview file is in build/changes." + +linkcheck: + mkdir -p build/linkcheck build/doctrees + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) build/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in build/linkcheck/output.txt." 
diff --git a/doc/reST/api.rst b/doc/reST/api.rst new file mode 100644 index 0000000..214881a --- /dev/null +++ b/doc/reST/api.rst @@ -0,0 +1,340 @@ +********************************* +Application Programming Interface +********************************* + +Introduction +============ + +x265 is written primarily in C++ and x86 assembly language but the +public facing programming interface is C for the widest possible +portability. This C interface is wholly defined within :file:`x265.h` +in the source/ folder of our source tree. All of the functions and +variables and enumerations meant to be used by the end-user are present +in this header. + +Where possible, x265 has tried to keep its public API as close as +possible to x264's public API. So those familiar with using x264 through +its C interface will find x265 quite familiar. + +This file is meant to be read in-order; the narrative follows linearly +through the various sections + +Build Considerations +==================== + +The choice of Main or Main10 profile encodes is made at compile time; +the internal pixel depth influences a great deal of variable sizes and +thus 8 and 10bit pixels are handled as different build options +(primarily to maintain the performance of the 8bit builds). libx265 +exports a variable **x265_max_bit_depth** which indicates how the +library was compiled (it will contain a value of 8 or 10). Further, +**x265_version_str** is a pointer to a string indicating the version of +x265 which was compiled, and **x265_build_info_str** is a pointer to a +string identifying the compiler and build options. + +x265 will accept input pixels of any depth between 8 and 16 bits +regardless of the depth of its internal pixels (8 or 10). It will shift +and mask input pixels as required to reach the internal depth. If +downshifting is being performed using our CLI application, the +:option:`--dither` option may be enabled to reduce banding. This feature +is not available through the C interface. 
+ +Encoder +======= + +The primary object in x265 is the encoder object, and this is +represented in the public API as an opaque typedef **x265_encoder**. +Pointers of this type are passed to most encoder functions. + +A single encoder generates a single output bitstream from a sequence of +raw input pictures. Thus if you need multiple output bitstreams you +must allocate multiple encoders. You may pass the same input pictures +to multiple encoders, the encode function does not modify the input +picture structures (the pictures are copied into the encoder as the +first step of encode). + +Encoder allocation is a reentrant function, so multiple encoders may be +safely allocated in a single process. The encoder access functions are +not reentrant for a single encoder, so the recommended use case is to +allocate one client thread per encoder instance (one thread for all +encoder instances is possible, but some encoder access functions are +blocking and thus this would be less efficient). + +.. Note:: + + There is one caveat to having multiple encoders within a single + process. All of the encoders must use the same maximum CTU size + because many global variables are configured based on this size. + Encoder allocation will fail if a mis-matched CTU size is attempted. + +An encoder is allocated by calling **x265_encoder_open()**:: + + /* x265_encoder_open: + * create a new encoder handler, all parameters from x265_param are copied */ + x265_encoder* x265_encoder_open(x265_param *); + +The returned pointer is then passed to all of the functions pertaining +to this encode. A large amount of memory is allocated during this +function call, but the encoder will continue to allocate memory as the +first pictures are passed to the encoder; until its pool of picture +structures is large enough to handle all of the pictures it must keep +internally. The pool size is determined by the lookahead depth, the +number of frame threads, and the maximum number of references. 
+ +As indicated in the comment, **x265_param** is copied internally so the user +may release their copy after allocating the encoder. Changes made to +their copy of the param structure have no affect on the encoder after it +has been allocated. + +Param +===== + +The **x265_param** structure describes everything the encoder needs to +know about the input pictures and the output bitstream and most +everything in between. + +The recommended way to handle these param structures is to allocate them +from libx265 via:: + + /* x265_param_alloc: + * Allocates an x265_param instance. The returned param structure is not + * special in any way, but using this method together with x265_param_free() + * and x265_param_parse() to set values by name allows the application to treat + * x265_param as an opaque data struct for version safety */ + x265_param *x265_param_alloc(); + +In this way, your application does not need to know the exact size of +the param structure (the build of x265 could potentially be a bit newer +than the copy of :file:`x265.h` that your application compiled against). + +Next you perform the initial *rough cut* configuration of the encoder by +chosing a performance preset and optional tune factor +**x265_preset_names** and **x265_tune_names** respectively hold the +string names of the presets and tune factors (see :ref:`presets +` for more detail on presets and tune factors):: + + /* returns 0 on success, negative on failure (e.g. invalid preset/tune name). */ + int x265_param_default_preset(x265_param *, const char *preset, const char *tune); + +Now you may optionally specify a profile. **x265_profile_names** +contains the string names this function accepts:: + + /* (can be NULL, in which case the function will do nothing) + * returns 0 on success, negative on failure (e.g. invalid profile name). 
*/ + int x265_param_apply_profile(x265_param *, const char *profile); + +Finally you configure any remaining options by name using repeated calls to:: + + /* x265_param_parse: + * set one parameter by name. + * returns 0 on success, or returns one of the following errors. + * note: BAD_VALUE occurs only if it can't even parse the value, + * numerical range is not checked until x265_encoder_open(). + * value=NULL means "true" for boolean options, but is a BAD_VALUE for non-booleans. */ + #define X265_PARAM_BAD_NAME (-1) + #define X265_PARAM_BAD_VALUE (-2) + int x265_param_parse(x265_param *p, const char *name, const char *value); + +See :ref:`string options ` for the list of options (and their +descriptions) which can be set by **x265_param_parse()**. + +After the encoder has been created, you may release the param structure:: + + /* x265_param_free: + * Use x265_param_free() to release storage for an x265_param instance + * allocated by x265_param_alloc() */ + void x265_param_free(x265_param *); + +.. Note:: + + Using these methods to allocate and release the param structures + helps future-proof your code in many ways, but the x265 API is + versioned in such a way that we prevent linkage against a build of + x265 that does not match the version of the header you are compiling + against. This is function of the X265_BUILD macro. + +**x265_encoder_parameters()** may be used to get a copy of the param +structure from the encoder after it has been opened, in order to see the +changes made to the parameters for auto-detection and other reasons:: + + /* x265_encoder_parameters: + * copies the current internal set of parameters to the pointer provided + * by the caller. useful when the calling application needs to know + * how x265_encoder_open has changed the parameters. + * note that the data accessible through pointers in the returned param struct + * (e.g. filenames) should not be modified by the calling application. 
*/ + void x265_encoder_parameters(x265_encoder *, x265_param *); + +Pictures +======== + +Raw pictures are passed to the encoder via the **x265_picture** structure. +Just like the param structure we recommend you allocate this structure +from the encoder to avoid potential size mismatches:: + + /* x265_picture_alloc: + * Allocates an x265_picture instance. The returned picture structure is not + * special in any way, but using this method together with x265_picture_free() + * and x265_picture_init() allows some version safety. New picture fields will + * always be added to the end of x265_picture */ + x265_picture *x265_picture_alloc(); + +Regardless of whether you allocate your picture structure this way or +whether you simply declare it on the stack, your next step is to +initialize the structure via:: + + /*** + * Initialize an x265_picture structure to default values. It sets the pixel + * depth and color space to the encoder's internal values and sets the slice + * type to auto - so the lookahead will determine slice type. + */ + void x265_picture_init(x265_param *param, x265_picture *pic); + +x265 does not perform any color space conversions, so the raw picture's +color space (chroma sampling) must match the color space specified in +the param structure used to allocate the encoder. **x265_picture_init** +initializes this field to the internal color space and it is best to +leave it unmodified. + +The picture bit depth is initialized to be the encoder's internal bit +depth but this value should be changed to the actual depth of the pixels +being passed into the encoder. If the picture bit depth is more than 8, +the encoder assumes two bytes are used to represent each sample +(little-endian shorts). + +The user is responsible for setting the plane pointers and plane strides +(in units of bytes, not pixels). The presentation time stamp (**pts**) +is optional, depending on whether you need accurate decode time stamps +(**dts**) on output. 
+ +If you wish to override the lookahead or rate control for a given +picture you may specify a slicetype other than X265_TYPE_AUTO, or a +forceQP value other than 0. + +x265 does not modify the picture structure provided as input, so you may +reuse a single **x265_picture** for all pictures passed to a single +encoder, or even all pictures passed to multiple encoders. + +Structures allocated from the library should eventually be released:: + + /* x265_picture_free: + * Use x265_picture_free() to release storage for an x265_picture instance + * allocated by x265_picture_alloc() */ + void x265_picture_free(x265_picture *); + + +Analysis Buffers +================ + +Analysis information can be saved and reused to between encodes of the +same video sequence (generally for multiple bitrate encodes). The best +results are attained by saving the analysis information of the highest +bitrate encode and reuse it in lower bitrate encodes. + +When saving or loading analysis data, buffers must be allocated for +every picture passed into the encoder using:: + + /* x265_alloc_analysis_data: + * Allocate memory to hold analysis meta data, returns 1 on success else 0 */ + int x265_alloc_analysis_data(x265_picture*); + +Note that this is very different from the typical semantics of +**x265_picture**, which can be reused many times. The analysis buffers must +be re-allocated for every input picture. + +Analysis buffers passed to the encoder are owned by the encoder until +they pass the buffers back via an output **x265_picture**. The user is +responsible for releasing the buffers when they are finished with them +via:: + + /* x265_free_analysis_data: + * Use x265_free_analysis_data to release storage of members allocated by + * x265_alloc_analysis_data */ + void x265_free_analysis_data(x265_picture*); + + +Encode Process +============== + +The output of the encoder is a series of NAL packets, which are always +returned concatenated in consecutive memory. 
HEVC streams have SPS and +PPS and VPS headers which describe how the following packets are to be +decoded. If you specified :option:`--repeat-headers` then those headers +will be output with every keyframe. Otherwise you must explicitly query +those headers using:: + + /* x265_encoder_headers: + * return the SPS and PPS that will be used for the whole stream. + * *pi_nal is the number of NAL units outputted in pp_nal. + * returns negative on error, total byte size of payload data on success + * the payloads of all output NALs are guaranteed to be sequential in memory. */ + int x265_encoder_headers(x265_encoder *, x265_nal **pp_nal, uint32_t *pi_nal); + +Now we get to the main encode loop. Raw input pictures are passed to the +encoder in display order via:: + + /* x265_encoder_encode: + * encode one picture. + * *pi_nal is the number of NAL units outputted in pp_nal. + * returns negative on error, zero if no NAL units returned. + * the payloads of all output NALs are guaranteed to be sequential in memory. */ + int x265_encoder_encode(x265_encoder *encoder, x265_nal **pp_nal, uint32_t *pi_nal, x265_picture *pic_in, x265_picture *pic_out); + +These pictures are queued up until the lookahead is full, and then the +frame encoders in turn are filled, and then finally you begin receiving +a output NALs (corresponding to a single output picture) with each input +picture you pass into the encoder. + +Once the pipeline is completely full, **x265_encoder_encode()** will +block until the next output picture is complete. + +.. note:: + + Optionally, if the pointer of a second **x265_picture** structure is + provided, the encoder will fill it with data pertaining to the + output picture corresponding to the output NALs, including the + recontructed image, POC and decode timestamp. These pictures will be + in encode (or decode) order. 
+ +When the last of the raw input pictures has been sent to the encoder, +**x265_encoder_encode()** must still be called repeatedly with a +*pic_in* argument of 0, indicating a pipeline flush, until the function +returns a value less than or equal to 0 (indicating the output bitstream +is complete). + +At any time during this process, the application may query running +statistics from the encoder:: + + /* x265_encoder_get_stats: + * returns encoder statistics */ + void x265_encoder_get_stats(x265_encoder *encoder, x265_stats *, uint32_t statsSizeBytes); + +Cleanup +======= + +At the end of the encode, the application will want to trigger logging +of the final encode statistics, if :option:`--csv` had been specified:: + + /* x265_encoder_log: + * write a line to the configured CSV file. If a CSV filename was not + * configured, or file open failed, or the log level indicated frame level + * logging, this function will perform no write. */ + void x265_encoder_log(x265_encoder *encoder, int argc, char **argv); + +Finally, the encoder must be closed in order to free all of its +resources. An encoder that has been flushed cannot be restarted and +reused. Once **x265_encoder_close()** has been called, the encoder +handle must be discarded:: + + /* x265_encoder_close: + * close an encoder handler */ + void x265_encoder_close(x265_encoder *); + +When the application has completed all encodes, it should call +**x265_cleanup()** to free process global resources like the thread pool; +particularly if a memory-leak detection tool is being used:: + + /*** + * Release library static allocations + */ + void x265_cleanup(void); diff --git a/doc/reST/cli.rst b/doc/reST/cli.rst new file mode 100644 index 0000000..324b83a --- /dev/null +++ b/doc/reST/cli.rst @@ -0,0 +1,1243 @@ +********************* +Command Line Options +********************* + +.. _string-options-ref: + +Note that unless an option is listed as **CLI ONLY** the option is also +supported by x265_param_parse(). 
The CLI uses getopt to parse the +command line options so the short or long versions may be used and the +long options may be truncated to the shortest unambiguous abbreviation. +Users of the API must pass x265_param_parse() the full option name. + +Preset and tune have special implications. The API user must call +x265_param_default_preset() with the preset and tune parameters they +wish to use, prior to calling x265_param_parse() to set any additional +fields. The CLI does this for the user implicitly, so all CLI options +are applied after the user's preset and tune choices, regardless of the +order of the arguments on the command line. + +If there is an extra command line argument (not an option or an option +value) the CLI will treat it as the input filename. This effectively +makes the :option:`--input` specifier optional for the input file. If +there are two extra arguments, the second is treated as the output +bitstream filename, making :option:`--output` also optional if the input +filename was implied. This makes :command:`x265 in.y4m out.hevc` a valid +command line. If there are more than two extra arguments, the CLI will +consider this an error and abort. + +Generally, when an option expects a string value from a list of strings +the user may specify the integer ordinal of the value they desire. ie: +:option:`--log-level` 3 is equivalent to :option:`--log-level` debug. + +Standalone Executable Options +============================= + +.. option:: --help, -h + + Display help text + + **CLI ONLY** + +.. option:: --version, -V + + Display version details + + **CLI ONLY** + +.. option:: --asm , --no-asm + + x265 will use all detected CPU SIMD architectures by default. 
You can + disable all assembly by using :option:`--no-asm` or you can specify + a comma separated list of SIMD architectures to use, matching these + strings: MMX2, SSE, SSE2, SSE3, SSSE3, SSE4, SSE4.1, SSE4.2, AVX, XOP, FMA4, AVX2, FMA3 + + Some higher architectures imply lower ones being present, this is + handled implicitly. + + One may also directly supply the CPU capability bitmap as an integer. + +.. option:: --threads + + Number of threads to allocate for the worker thread pool. This pool + is used for WPP and for distributed analysis and motion search: + :option:`--wpp` :option:`--pmode` and :option:`--pme` respectively. + + If :option:`--threads`=1 is specified, then no thread pool is + created. When no thread pool is created, all the thread pool + features are implicitly disabled. If all the pool features are + disabled by the user, then the pool is implicitly disabled. + + Default 0, one thread is allocated per detected hardware thread + (logical CPU cores) + +.. option:: --pmode, --no-pmode + + Parallel mode decision, or distributed mode analysis. When enabled + the encoder will distribute the analysis work of each CU (merge, + inter, intra) across multiple worker threads. Only recommended if + x265 is not already saturating the CPU cores. In RD levels 3 and 4 + it will be most effective if --rect was enabled. At RD levels 5 and + 6 there is generally always enough work to distribute to warrant the + overhead, assuming your CPUs are not already saturated. + + --pmode will increase utilization without reducing compression + efficiency. In fact, since the modes are all measured in parallel it + makes certain early-outs impractical and thus you usually get + slightly better compression when it is enabled (at the expense of + not skipping improbable modes). + + This feature is implicitly disabled when no thread pool is present. + + Default disabled + +.. option:: --pme, --no-pme + + Parallel motion estimation. 
When enabled the encoder will distribute + motion estimation across multiple worker threads when more than two + references require motion searches for a given CU. Only recommended + if x265 is not already saturating CPU cores. :option:`--pmode` is + much more effective than this option, since the amount of work it + distributes is substantially higher. With --pme it is not unusual + for the overhead of distributing the work to outweigh the + parallelism benefits. + + This feature is implicitly disabled when no thread pool is present. + + --pme will increase utilization on many core systems with no effect + on the output bitstream. + + Default disabled + +.. option:: --preset, -p + + Sets parameters to preselected values, trading off compression efficiency against + encoding speed. These parameters are applied before all other input parameters are + applied, and so you can override any parameters that these values control. + + 0. ultrafast + 1. superfast + 2. veryfast + 3. faster + 4. fast + 5. medium **(default)** + 6. slow + 7. slower + 8. veryslow + 9. placebo + +.. option:: --tune, -t + + Tune the settings for a particular type of source or situation. The changes will + be applied after :option:`--preset` but before all other parameters. Default none + + **Values:** psnr, ssim, zero-latency, fast-decode. + +.. option:: --frame-threads, -F + + Number of concurrently encoded frames. Using a single frame thread + gives a slight improvement in compression, since the entire reference + frames are always available for motion compensation, but it has + severe performance implications. Default is an autodetected count + based on the number of CPU cores and whether WPP is enabled or not. + + Over-allocation of frame threads will not improve performance, it + will generally just increase memory use. + +.. option:: --log-level + + Logging level. Debug level enables per-frame QP, metric, and bitrate + logging. 
If a CSV file is being generated, debug level makes the log + be per-frame rather than per-encode. Full level enables hash and + weight logging. -1 disables all logging, except certain fatal + errors, and can be specified by the string "none". + + 0. error + 1. warning + 2. info **(default)** + 3. debug + 4. full + +.. option:: --csv + + Writes encoding results to a comma separated value log file. Creates + the file if it doesn't already exist, else adds one line per run. If + :option:`--log-level` is debug or above, it writes one line per + frame. Default none + +.. option:: --cu-stats, --no-cu-stats + + Records statistics on how each CU was coded (split depths and other + mode decisions) and reports those statistics at the end of the + encode. Default disabled + +.. option:: --output, -o + + Bitstream output file name. If there are two extra CLI options, the + first is implicitly the input filename and the second is the output + filename, making the :option:`--output` option optional. + + The output file will always contain a raw HEVC bitstream, the CLI + does not support any container file formats. + + **CLI ONLY** + +.. option:: --no-progress + + Disable CLI periodic progress reports + + **CLI ONLY** + +Quality reporting metrics +========================= + +.. option:: --ssim, --no-ssim + + Calculate and report Structural Similarity values. It is + recommended to use :option:`--tune` ssim if you are measuring ssim, + else the results should not be used for comparison purposes. + Default disabled + +.. option:: --psnr, --no-psnr + + Calculate and report Peak Signal to Noise Ratio. It is recommended + to use :option:`--tune` psnr if you are measuring PSNR, else the + results should not be used for comparison purposes. Default + disabled + +Input Options +============= + +.. option:: --input + + Input filename, only raw YUV or Y4M supported. Use single dash for + stdin. This option name will be implied for the first "extra" + command line argument. 
+ + **CLI ONLY** + +.. option:: --y4m + + Parse input stream as YUV4MPEG2 regardless of file extension, + primarily intended for use with stdin (ie: :option:`--input` - + :option:`--y4m`). This option is implied if the input filename has + a ".y4m" extension + + **CLI ONLY** + +.. option:: --input-depth + + YUV only: Bit-depth of input file or stream + + **Values:** any value between 8 and 16. Default is internal depth. + + **CLI ONLY** + +.. option:: --dither + + Enable high quality downscaling. Dithering is based on the diffusion + of errors from one row of pixels to the next row of pixels in a + picture. Only applicable when the input bit depth is larger than + 8bits and internal bit depth is 8bits. Default disabled + + **CLI ONLY** + +.. option:: --nr + + Noise reduction - an adaptive deadzone applied after DCT + (subtracting from DCT coefficients), before quantization, on inter + blocks. It does no pixel-level filtering, doesn't cross DCT block + boundaries, has no overlap, doesn't affect intra blocks. The higher + the strength value parameter, the more aggressively it will reduce + noise. + + Enabling noise reduction will make outputs diverge between different + numbers of frame threads. Outputs will be deterministic but the + outputs of -F2 will no longer match the outputs of -F3, etc. + + **Values:** any value in range of 100 to 1000. Default disabled. + +.. option:: --input-res + + YUV only: Source picture size [w x h] + + **CLI ONLY** + +.. option:: --input-csp + + YUV only: Source color space. Only i420, i422, and i444 are + supported at this time. The internal color space is always the + same as the source color space (libx265 does not support any color + space conversions). + + 0. i400 + 1. i420 **(default)** + 2. i422 + 3. i444 + 4. nv12 + 5. nv16 + +.. option:: --fps + + YUV only: Source frame rate + + **Range of values:** positive int or float, or num/denom + +.. 
option:: --interlaceMode , --no-interlaceMode + + **EXPERIMENTAL** Specify interlace type of source pictures. + + 0. progressive pictures **(default)** + 1. top field first + 2. bottom field first + + HEVC encodes interlaced content as fields. Fields must be provided to + the encoder in the correct temporal order. The source dimensions + must be field dimensions and the FPS must be in units of fields per + second. The decoder must re-combine the fields in their correct + orientation for display. + +.. option:: --seek + + Number of frames to skip at start of input file. Default 0 + + **CLI ONLY** + +.. option:: --frames, -f + + Number of frames to be encoded. Default 0 (all) + + **CLI ONLY** + +.. option:: --qpfile + + Specify a text file which contains frametypes and QPs for some or + all frames. The format of each line is: + + framenumber frametype QP + + Frametype can be one of [I,i,P,B,b]. **B** is a referenced B frame, + **b** is an unreferenced B frame. **I** is a keyframe (random + access point) while **i** is an I frame that is not a keyframe + (references are not broken). + + Specifying QP (integer) is optional, and if specified they are + clamped within the encoder to qpmin/qpmax. + +.. option:: --scaling-list + + Quantization scaling lists. HEVC supports 6 quantization scaling + lists to be defined; one each for Y, Cb, Cr for intra prediction and + one each for inter prediction. + + x265 does not use scaling lists by default, but this can also be + made explicit by :option:`--scaling-list` *off*. + + HEVC specifies a default set of scaling lists which may be enabled + without requiring them to be signaled in the SPS. Those scaling + lists can be enabled via :option:`--scaling-list` *default*. + + All other strings indicate a filename containing custom scaling + lists in the HM format. The encode will abort if the file is not + parsed correctly. Custom lists must be signaled in the SPS + +.. 
option:: --lambda-file + + Specify a text file containing values for x265_lambda_tab and + x265_lambda2_tab. Each table requires MAX_MAX_QP+1 (70) float + values. + + The text file syntax is simple. Comma is considered to be + white-space. All white-space is ignored. Lines must be less than 2k + bytes in length. Content following hash (#) characters are ignored. + The values read from the file are logged at :option:`--log-level` + debug. + + Note that the lambda tables are process-global and so the new values + affect all encoders running in the same process. + + Lambda values affect encoder mode decisions, the lower the lambda + the more bits it will try to spend on signaling information (motion + vectors and splits) and less on residual. This feature is intended + for experimentation. + +Profile, Level, Tier +==================== + +.. option:: --profile + + Enforce the requirements of the specified profile, ensuring the + output stream will be decodable by a decoder which supports that + profile. May abort the encode if the specified profile is + impossible to be supported by the compile options chosen for the + encoder (a high bit depth encoder will be unable to output + bitstreams compliant with Main or Mainstillpicture). + + API users must use x265_param_apply_profile() after configuring + their param structure. Any changes made to the param structure after + this call might make the encode non-compliant. + + **Values:** main, main10, mainstillpicture, main422-8, main422-10, main444-8, main444-10 + + **CLI ONLY** + +.. option:: --level-idc + + Minimum decoder requirement level. Defaults to 0, which implies + auto-detection by the encoder. If specified, the encoder will + attempt to bring the encode specifications within that specified + level. If the encoder is unable to reach the level it issues a + warning and aborts the encode. If the requested requirement level is + higher than the actual level, the actual requirement level is + signaled. 
+ + Beware, specifying a decoder level will force the encoder to enable + VBV for constant rate factor encodes, which may introduce + non-determinism. + + The value is specified as a float or as an integer with the level + times 10, for example level **5.1** is specified as "5.1" or "51", + and level **5.0** is specified as "5.0" or "50". + + Annex A levels: 1, 2, 2.1, 3, 3.1, 4, 4.1, 5, 5.1, 5.2, 6, 6.1, 6.2 + +.. option:: --high-tier, --no-high-tier + + If :option:`--level-idc` has been specified, the option adds the + intention to support the High tier of that level. If your specified + level does not support a High tier, a warning is issued and this + modifier flag is ignored. + +.. note:: + :option:`--profile`, :option:`--level-idc`, and + :option:`--high-tier` are only intended for use when you are + targeting a particular decoder (or decoders) with fixed resource + limitations and must constrain the bitstream within those limits. + Specifying a profile or level may lower the encode quality + parameters to meet those requirements but it will never raise + them. + +Quad-Tree analysis +================== + +.. option:: --wpp, --no-wpp + + Enable Wavefront Parallel Processing. The encoder may begin encoding + a row as soon as the row above it is at least two CTUs ahead in the + encode process. This gives a 3-5x gain in parallelism for about 1% + overhead in compression efficiency. Default: Enabled + +.. option:: --ctu, -s <64|32|16> + + Maximum CU size (width and height). The larger the maximum CU size, + the more efficiently x265 can encode flat areas of the picture, + giving large reductions in bitrate. However this comes at a loss of + parallelism with fewer rows of CUs that can be encoded in parallel, + and less frame parallelism as well. Because of this the faster + presets use a CU size of 32. Default: 64 + +.. 
option:: --tu-intra-depth <1..4> + + The transform unit (residual) quad-tree begins with the same depth + as the coding unit quad-tree, but the encoder may decide to further + split the transform unit tree if it improves compression efficiency. + This setting limits the number of extra recursion depth which can be + attempted for intra coded units. Default: 1, which means the + residual quad-tree is always at the same depth as the coded unit + quad-tree + + Note that when the CU intra prediction is NxN (only possible with + 8x8 CUs), a TU split is implied, and thus the residual quad-tree + begins at 4x4 and cannot split any further. + +.. option:: --tu-inter-depth <1..4> + + The transform unit (residual) quad-tree begins with the same depth + as the coding unit quad-tree, but the encoder may decide to further + split the transform unit tree if it improves compression efficiency. + This setting limits the number of extra recursion depth which can be + attempted for inter coded units. Default: 1, which means the + residual quad-tree is always at the same depth as the coded unit + quad-tree unless the CU was coded with rectangular or AMP + partitions, in which case a TU split is implied and thus the + residual quad-tree begins one layer below the CU quad-tree. + +Temporal / motion search options +================================ + +.. option:: --me + + Motion search method. Generally, the higher the number the harder + the ME method will try to find an optimal match. Diamond search is + the simplest. Hexagon search is a little better. Uneven + Multi-Hexagon is an adaptation of the search method used by x264 for + slower presets. Star is a three step search adapted from the HM + encoder: a star-pattern search followed by an optional radix scan + followed by an optional star-search refinement. Full is an + exhaustive search; an order of magnitude slower than all other + searches but not much better than umh or star. + + 0. dia + 1. hex **(default)** + 2. umh + 3. 
star + 4. full + +.. option:: --subme, -m <0..7> + + Amount of subpel refinement to perform. The higher the number the + more subpel iterations and steps are performed. Default 2 + + +----+------------+-----------+------------+-----------+-----------+ + | -m | HPEL iters | HPEL dirs | QPEL iters | QPEL dirs | HPEL SATD | + +====+============+===========+============+===========+===========+ + | 0 | 1 | 4 | 0 | 4 | false | + +----+------------+-----------+------------+-----------+-----------+ + | 1 | 1 | 4 | 1 | 4 | false | + +----+------------+-----------+------------+-----------+-----------+ + | 2 | 1 | 4 | 1 | 4 | true | + +----+------------+-----------+------------+-----------+-----------+ + | 3 | 2 | 4 | 1 | 4 | true | + +----+------------+-----------+------------+-----------+-----------+ + | 4 | 2 | 4 | 2 | 4 | true | + +----+------------+-----------+------------+-----------+-----------+ + | 5 | 1 | 8 | 1 | 8 | true | + +----+------------+-----------+------------+-----------+-----------+ + | 6 | 2 | 8 | 1 | 8 | true | + +----+------------+-----------+------------+-----------+-----------+ + | 7 | 2 | 8 | 2 | 8 | true | + +----+------------+-----------+------------+-----------+-----------+ + +.. option:: --merange + + Motion search range. Default 57 + + The default is derived from the default CTU size (64) minus the luma + interpolation half-length (4) minus maximum subpel distance (2) + minus one extra pixel just in case the hex search method is used. If + the search range were any larger than this, another CTU row of + latency would be required for reference frames. + + **Range of values:** an integer from 0 to 32768 + +.. option:: --max-merge <1..5> + + Maximum number of neighbor (spatial and temporal) candidate blocks + that the encoder may consider for merging motion predictions. If a + merge candidate results in no residual, it is immediately selected + as a "skip". 
Otherwise the merge candidates are tested as part of + motion estimation when searching for the least cost inter option. + The max candidate number is encoded in the SPS and determines the + bit cost of signaling merge CUs. Default 2 + +.. option:: --temporal-mvp, --no-temporal-mvp + + Enable temporal motion vector predictors in P and B slices. + This enables the use of the motion vector from the collocated block + in the previous frame to be used as a predictor. Default is enabled + +Spatial/intra options +===================== + +.. option:: --rdpenalty <0..2> + + When set to 1, transform units of size 32x32 are given a 4x bit cost + penalty compared to smaller transform units, in intra coded CUs in P + or B slices. + + When set to 2, transform units of size 32x32 are not even attempted, + unless otherwise required by the maximum recursion depth. For this + option to be effective with 32x32 intra CUs, + :option:`--tu-intra-depth` must be at least 2. For it to be + effective with 64x64 intra CUs, :option:`--tu-intra-depth` must be + at least 3. + + Note that in HEVC an intra transform unit (a block of the residual + quad-tree) is also a prediction unit, meaning that the intra + prediction signal is generated for each TU block, the residual + subtracted and then coded. The coding unit simply provides the + prediction modes that will be used when predicting all of the + transform units within the CU. This means that when you prevent + 32x32 intra transform units, you are preventing 32x32 intra + predictions. + + Default 0, disabled. + + **Values:** 0:disabled 1:4x cost penalty 2:force splits + +.. option:: --b-intra, --no-b-intra + + Enables the evaluation of intra modes in B slices. Default disabled. + +.. option:: --tskip, --no-tskip + + Enable evaluation of transform skip (bypass DCT but still use + quantization) coding for 4x4 TU coded blocks. + + Only effective at RD levels 3 and above, which perform RDO mode + decisions. Default disabled + +.. 
option:: --tskip-fast, --no-tskip-fast + + Only evaluate transform skip for NxN intra predictions (4x4 blocks). + Only applicable if transform skip is enabled. For chroma, only + evaluate if luma used tskip. Inter block tskip analysis is + unmodified. Default disabled + +.. option:: --strong-intra-smoothing, --no-strong-intra-smoothing + + Enable strong intra smoothing for 32x32 intra blocks. Default enabled + +.. option:: --constrained-intra, --no-constrained-intra + + Constrained intra prediction. When generating intra predictions for + blocks in inter slices, only intra-coded reference pixels are used. + Inter-coded reference pixels are replaced with intra-coded neighbor + pixels or default values. The general idea is to block the + propagation of reference errors that may have resulted from lossy + signals. Default disabled + +Mode decision / Analysis +======================== + +.. option:: --rect, --no-rect + + Enable analysis of rectangular motion partitions Nx2N and 2NxN + (50/50 splits, two directions). Default disabled + +.. option:: --amp, --no-amp + + Enable analysis of asymmetric motion partitions (75/25 splits, four + directions). At RD levels 0 through 4, AMP partitions are only + considered at CU sizes 32x32 and below. At RD levels 5 and 6, it + will only consider AMP partitions as merge candidates (no motion + search) at 64x64, and as merge or inter candidates below 64x64. + + The AMP partitions which are searched are derived from the current + best inter partition. If Nx2N (vertical rectangular) is the best + current prediction, then left and right asymmetrical splits will be + evaluated. If 2NxN (horizontal rectangular) is the best current + prediction, then top and bottom asymmetrical splits will be + evaluated, If 2Nx2N is the best prediction, and the block is not a + merge/skip, then all four AMP partitions are evaluated. + + This setting has no effect if rectangular partitions are disabled. + Default disabled + +.. 
option:: --early-skip, --no-early-skip + + Measure full CU size (2Nx2N) merge candidates first; if no residual + is found the analysis is short circuited. Default disabled + +.. option:: --fast-cbf, --no-fast-cbf + + Short circuit analysis if a prediction is found that does not set + the coded block flag (aka: no residual was encoded). It prevents + the encoder from perhaps finding other predictions that also have no + residual but require less signaling bits or have less distortion. + Only applicable for RD levels 5 and 6. Default disabled + +.. option:: --fast-intra, --no-fast-intra + + Perform an initial scan of every fifth intra angular mode, then + check modes +/- 2 distance from the best mode, then +/- 1 distance + from the best mode, effectively performing a gradient descent. When + enabled 10 modes in total are checked. When disabled all 33 angular + modes are checked. Only applicable for :option:`--rd` levels 3 and + below (medium preset and faster). + +.. option:: --weightp, -w, --no-weightp + + Enable weighted prediction in P slices. This enables weighting + analysis in the lookahead, which influences slice decisions, and + enables weighting analysis in the main encoder which allows P + reference samples to have a weight function applied to them prior to + using them for motion compensation. In video which has lighting + changes, it can give a large improvement in compression efficiency. + Default is enabled + +.. option:: --weightb, --no-weightb + + Enable weighted prediction in B slices. Default disabled + +.. option:: --rd <0..6> + + Level of RDO in mode decision. The higher the value, the more + exhaustive the analysis and the more rate distortion optimization is + used. The lower the value the faster the encode, the higher the + value the smaller the bitstream (in general). Default 3 + + Note that this table aims for accuracy, but is not necessarily our + final target behavior for each mode. 
+ + +-------+---------------------------------------------------------------+ + | Level | Description | + +=======+===============================================================+ + | 0 | sa8d mode and split decisions, intra w/ source pixels | + +-------+---------------------------------------------------------------+ + | 1 | recon generated (better intra), RDO merge/skip selection | + +-------+---------------------------------------------------------------+ + | 2 | RDO splits and merge/skip selection | + +-------+---------------------------------------------------------------+ + | 3 | RDO mode and split decisions | + +-------+---------------------------------------------------------------+ + | 4 | Adds RDO Quant | + +-------+---------------------------------------------------------------+ + | 5 | Adds RDO prediction decisions | + +-------+---------------------------------------------------------------+ + | 6 | Currently same as 5 | + +-------+---------------------------------------------------------------+ + + **Range of values:** 0: least .. 6: full RDO analysis + +.. option:: --cu-lossless, --no-cu-lossless + + For each CU, evaluate lossless (transform and quant bypass) encode + of the best non-lossless mode option as a potential rate distortion + optimization. If the global option :option:`--lossless` has been + specified, all CUs will be encoded as lossless unconditionally + regardless of whether this option was enabled. Default disabled. + + Only effective at RD levels 3 and above, which perform RDO mode + decisions. + +.. option:: --signhide, --no-signhide + + Hide sign bit of one coeff per TU (rdo). The last sign is implied. + This requires analyzing all the coefficients to determine if a sign + must be toggled, and then to determine which one can be toggled with + the least amount of distortion. 
Default enabled + +Psycho-visual options +===================== + +Left to its own devices, the encoder will make mode decisions based on a +simple rate distortion formula, trading distortion for bitrate. This is +generally effective except for the manner in which this distortion is +measured. It tends to favor blurred reconstructed blocks over blocks +which have wrong motion. The human eye generally prefers the wrong +motion over the blur and thus x265 offers psycho-visual adjustments to +the rate distortion algorithm. + +:option:`--psy-rd` will add an extra cost to reconstructed blocks which +do not match the visual energy of the source block. The higher the +strength of :option:`--psy-rd` the more strongly it will favor similar +energy over blur and the more aggressively it will ignore rate +distortion. If it is too high, it will introduce visual artifacts and +increase bitrate enough for rate control to increase quantization +globally, reducing overall quality. psy-rd will tend to reduce the use +of blurred prediction modes, like DC and planar intra and bi-directional +inter prediction. + +:option:`--psy-rdoq` will adjust the distortion cost used in +rate-distortion optimized quantization (RDO quant), enabled in +:option:`--rd` 4 and above, favoring the preservation of energy in the +reconstructed image. :option:`--psy-rdoq` prevents RDOQ from blurring +all of the encoding options which psy-rd has to choose from. At low +strength levels, psy-rdoq will influence the quantization level +decisions, favoring higher AC energy in the reconstructed image. As +psy-rdoq strength is increased, more non-zero coefficient levels are +added and fewer coefficients are zeroed by RDOQ's rate distortion +analysis. High levels of psy-rdoq can double the bitrate which can have +a drastic effect on rate control, forcing higher overall QP, and can +cause ringing artifacts. 
psy-rdoq is less accurate than psy-rd, it is +biasing towards energy in general while psy-rd biases towards the energy +of the source image. But very large psy-rdoq values can sometimes be +beneficial, preserving film grain for instance. + +As a general rule, when both psycho-visual features are disabled, the +encoder will tend to blur blocks in areas of difficult motion. Turning +on small amounts of psy-rd and psy-rdoq will improve the perceived +visual quality. Increasing psycho-visual strength further will improve +quality and begin introducing artifacts and increase bitrate, which may +force rate control to increase global QP. Finding the optimal +psycho-visual parameters for a given video requires experimentation. Our +recommended defaults (1.0 for both) are generally on the low end of the +spectrum. And generally the lower the bitrate, the lower the optimal +psycho-visual settings. + +.. option:: --psy-rd + + Influence rate distortion optimized mode decision to preserve the + energy of the source image in the encoded image at the expense of + compression efficiency. It only has effect on presets which use + RDO-based mode decisions (:option:`--rd` 3 and above). 1.0 is a + typical value. Default disabled. Experimental + + **Range of values:** 0 .. 2.0 + +.. option:: --psy-rdoq + + Influence rate distortion optimized quantization by favoring higher + energy in the reconstructed image. This generally improves perceived + visual quality at the cost of lower quality metric scores. It only + has effect on slower presets which use RDO Quantization + (:option:`--rd` 4, 5 and 6). 1.0 is a typical value. Default + disabled. High values can be beneficial in preserving high-frequency + detail like film grain. Experimental + + **Range of values:** 0 .. 50.0 + + +Slice decision options +====================== + +.. option:: --open-gop, --no-open-gop + + Enable open GOP, allow I-slices to be non-IDR. Default enabled + +.. 
option:: --keyint, -I + + Max intra period in frames. A special case of infinite-gop (single + keyframe at the beginning of the stream) can be triggered with + argument -1. Use 1 to force all-intra. Default 250 + +.. option:: --min-keyint, -i + + Minimum GOP size. Scenecuts closer together than this are coded as I + or P, not IDR. Minimum keyint is clamped to be at least half of + :option:`--keyint`. If you wish to force regular keyframe intervals + and disable adaptive I frame placement, you must use + :option:`--no-scenecut`. + + **Range of values:** >=0 (0: auto) + +.. option:: --scenecut , --no-scenecut + + How aggressively I-frames need to be inserted. The higher the + threshold value, the more aggressive the I-frame placement. + :option:`--scenecut` 0 or :option:`--no-scenecut` disables adaptive + I frame placement. Default 40 + +.. option:: --rc-lookahead + + Number of frames for slice-type decision lookahead (a key + determining factor for encoder latency). The longer the lookahead + buffer the more accurate scenecut decisions will be, and the more + effective cuTree will be at improving adaptive quant. Having a + lookahead larger than the max keyframe interval is not helpful. + Default 20 + + **Range of values:** Between the maximum consecutive bframe count (:option:`--bframes`) and 250 + +.. option:: --b-adapt + + Adaptive B frame scheduling. Default 2 + + **Values:** 0:none; 1:fast; 2:full(trellis) + +.. option:: --bframes, -b <0..16> + + Maximum number of consecutive b-frames. Use :option:`--bframes` 0 to + force all P/I low-latency encodes. Default 4. This parameter has a + quadratic effect on the amount of memory allocated and the amount of + work performed by the full trellis version of :option:`--b-adapt` + lookahead. + +.. option:: --bframe-bias + + Bias towards B frames in slicetype decision. The higher the bias the + more likely x265 is to use B frames. Can be any value between -90 + and 100 and is clipped to that range. Default 0 + +.. 
option:: --b-pyramid, --no-b-pyramid + + Use B-frames as references, when possible. Default enabled + +.. option:: --ref <1..16> + + Max number of L0 references to be allowed. This number has a linear + multiplier effect on the amount of work performed in motion search, + but will generally have a beneficial affect on compression and + distortion. Default 3 + +Quality, rate control and rate distortion options +================================================= + +.. option:: --bitrate + + Enables single-pass ABR rate control. Specify the target bitrate in + kbps. Default is 0 (CRF) + + **Range of values:** An integer greater than 0 + +.. option:: --crf <0..51.0> + + Quality-controlled variable bitrate. CRF is the default rate control + method; it does not try to reach any particular bitrate target, + instead it tries to achieve a given uniform quality and the size of + the bitstream is determined by the complexity of the source video. + The higher the rate factor the higher the quantization and the lower + the quality. Default rate factor is 28.0. + +.. option:: --crf-max <0..51.0> + + Specify an upper limit to the rate factor which may be assigned to + any given frame (ensuring a max QP). This is dangerous when CRF is + used in combination with VBV as it may result in buffer underruns. + Default disabled + +.. option:: --crf-min <0..51.0> + + Specify an lower limit to the rate factor which may be assigned to + any given frame (ensuring a min QP). This is dangerous when CRF is + used in combination with VBV as it may result in buffer underruns. + Default disabled + +.. option:: --vbv-bufsize + + Specify the size of the VBV buffer (kbits). Enables VBV in ABR + mode. In CRF mode, :option:`--vbv-maxrate` must also be specified. + Default 0 (vbv disabled) + +.. option:: --vbv-maxrate + + Maximum local bitrate (kbits/sec). Will be used only if vbv-bufsize + is also non-zero. Both vbv-bufsize and vbv-maxrate are required to + enable VBV in CRF mode. 
Default 0 (disabled) + +.. option:: --vbv-init + + Initial buffer occupancy. The portion of the decode buffer which + must be full before the decoder will begin decoding. Determines + absolute maximum frame size. May be specified as a fractional value + between 0 and 1, or in kbits. In other words these two option pairs + are equivalent:: + + :option:`--vbv-bufsize` 1000 :option:`--vbv-init` 900 + :option:`--vbv-bufsize` 1000 :option:`--vbv-init` 0.9 + + Default 0.9 + + **Range of values:** fractional: 0 - 1.0, or kbits: 2 .. bufsize + +.. option:: --qp, -q + + Specify base quantization parameter for Constant QP rate control. + Using this option enables Constant QP rate control. The specified QP + is assigned to P slices. I and B slices are given QPs relative to P + slices using param->rc.ipFactor and param->rc.pbFactor unless QP 0 + is specified, in which case QP 0 is used for all slice types. Note + that QP 0 does not cause lossless encoding, it only disables + quantization. Default disabled (CRF) + + **Range of values:** an integer from 0 to 51 + +.. option:: --ipratio + + QP ratio factor between I and P slices. This ratio is used in all of + the rate control modes. Some :option:`--tune` options may change the + default value. It is not typically manually specified. Default 1.4 + +.. option:: --pbratio + + QP ratio factor between P and B slices. This ratio is used in all of + the rate control modes. Some :option:`--tune` options may change the + default value. It is not typically manually specified. Default 1.3 + +.. option:: --lossless, --no-lossless + + Enables true lossless coding by bypassing scaling, transform, + quantization and in-loop filter processes. This is used for + ultra-high bitrates with zero loss of quality. Reconstructed output + pictures are bit-exact to the input pictures. Lossless encodes + implicitly have no rate control, all rate control options are + ignored. 
Slower presets will generally achieve better compression + efficiency (and generate smaller bitstreams). Default disabled. + +.. option:: --aq-mode <0|1|2> + + Adaptive Quantization operating mode. Raise or lower per-block + quantization based on complexity analysis of the source image. The + more complex the block, the more quantization is used. This offsets + the tendency of the encoder to spend too many bits on complex areas + and not enough in flat areas. + + 0. disabled + 1. AQ enabled + 2. AQ enabled with auto-variance **(default)** + +.. option:: --aq-strength + + Adjust the strength of the adaptive quantization offsets. Setting + :option:`--aq-strength` to 0 disables AQ. Default 1.0. + + **Range of values:** 0.0 to 3.0 + +.. option:: --cutree, --no-cutree + + Enable the use of lookahead's lowres motion vector fields to + determine the amount of reuse of each block to tune adaptive + quantization factors. CU blocks which are heavily reused as motion + reference for later frames are given a lower QP (more bits) while CU + blocks which are quickly changed and are not referenced are given + less bits. This tends to improve detail in the backgrounds of video + with less detail in areas of high motion. Default enabled + +.. option:: --cbqpoffs + + Offset of Cb chroma QP from the luma QP selected by rate control. + This is a general way to spend more or less bits on the chroma + channel. Default 0 + + **Range of values:** -12 to 12 + +.. option:: --crqpoffs + + Offset of Cr chroma QP from the luma QP selected by rate control. + This is a general way to spend more or less bits on the chroma + channel. Default 0 + + **Range of values:** -12 to 12 + +.. option:: --pass + + Enable multipass rate control mode. Input is encoded multiple times, + storing the encoded information of each pass in a stats file from which + the consecutive pass tunes the qp of each frame to improve the quality + of the output. Default disabled + + 1. First pass, creates stats file + 2. 
Last pass, does not overwrite stats file + 3. Nth pass, overwrites stats file + + **Range of values:** 1 to 3 + +.. option:: --slow-firstpass, --no-slow-firstpass + + Enable a slow and more detailed first pass encode in Multipass rate + control mode. Speed of the first pass encode is slightly lower and + quality mildly improved when compared to the default settings in a + multipass encode. Default disabled (turbo mode enabled) + + When **turbo** first pass is not disabled, these options are + set on the first pass to improve performance: + + * :option:`--fast-intra` + * :option:`--no-rect` + * :option:`--no-amp` + * :option:`--early-skip` + * :option:`--ref` = 1 + * :option:`--max-merge` = 1 + * :option:`--me` = DIA + * :option:`--subme` = MIN(2, :option:`--subme`) + * :option:`--rd` = MIN(2, :option:`--rd`) + +.. option:: --analysis-mode + + Specify whether analysis information of each frame is output by the encoder + or input for reuse. By reading the analysis data written by an + earlier encode of the same sequence, substantial redundant work may + be avoided. + + The following data may be stored and reused: + I frames - split decisions and luma intra directions of all CUs. + P/B frames - motion vectors are dumped at each depth for all CUs. + + **Values:** off(0), save(1): dump analysis data, load(2): read analysis data + +.. option:: --analysis-file + + Specify a filename for analysis data (see :option:`--analysis-mode`) + If no filename is specified, x265_analysis.dat is used. + +Loop filters +============ + +.. option:: --lft, --no-lft + + Toggle deblocking loop filter, default enabled + +.. option:: --sao, --no-sao + + Toggle Sample Adaptive Offset loop filter, default enabled + +.. option:: --sao-non-deblock, --no-sao-non-deblock + + Specify how to handle dependency between SAO and deblocking filter. + When enabled, non-deblocked pixels are used for SAO analysis. When + disabled, SAO analysis skips the right/bottom boundary areas. 
+ Default disabled + +VUI (Video Usability Information) options +========================================= + +x265 emits a VUI with only the timing info by default. If the SAR is +specified (or read from a Y4M header) it is also included. All other +VUI fields must be manually specified. + +.. option:: --sar + + Sample Aspect Ratio, the ratio of width to height of an individual + sample (pixel). The user may supply the width and height explicitly + or specify an integer from the predefined list of aspect ratios + defined in the HEVC specification. Default undefined (not signaled) + + 1. 1:1 (square) + 2. 12:11 + 3. 10:11 + 4. 16:11 + 5. 40:33 + 6. 24:11 + 7. 20:11 + 8. 32:11 + 9. 80:33 + 10. 18:11 + 11. 15:11 + 12. 64:33 + 13. 160:99 + 14. 4:3 + 15. 3:2 + 16. 2:1 + +.. option:: --crop-rect + + Define the (overscan) region of the image that does not contain + information because it was added to achieve certain resolution or + aspect ratio. The decoder may be directed to crop away this region + before displaying the images via the :option:`--overscan` option. + Default undefined (not signaled) + +.. option:: --overscan + + Specify whether it is appropriate for the decoder to display or crop + the overscan area. Default unspecified (not signaled) + +.. option:: --videoformat + + Specify the source format of the original analog video prior to + digitizing and encoding. Default undefined (not signaled) + + 0. component + 1. pal + 2. ntsc + 3. secam + 4. mac + 5. undefined + +.. option:: --range + + Specify output range of black level and range of luma and chroma + signals. Default undefined (not signaled) + +.. option:: --colorprim + + Specify color primitive to use when converting to RGB. Default + undefined (not signaled) + + 1. bt709 + 2. undef + 3. **reserved** + 4. bt470m + 5. bt470bg + 6. smpte170m + 7. smpte240m + 8. film + 9. bt2020 + +.. option:: --transfer + + Specify transfer characteristics. Default undefined (not signaled) + + 1. bt709 + 2. undef + 3. 
**reserved** + 4. bt470m + 5. bt470bg + 6. smpte170m + 7. smpte240m + 8. linear + 9. log100 + 10. log316 + 11. iec61966-2-4 + 12. bt1361e + 13. iec61966-2-1 + 14. bt2020-10 + 15. bt2020-12 + +.. option:: --colormatrix + + Specify color matrix setting i.e. set the matrix coefficients used in + deriving the luma and chroma. Default undefined (not signaled) + + 0. GBR + 1. bt709 + 2. undef + 3. **reserved** + 4. fcc + 5. bt470bg + 6. smpte170m + 7. smpte240m + 8. YCgCo + 9. bt2020nc + 10. bt2020c + +.. option:: --chromalocs <0..5> + + Specify chroma sample location for 4:2:0 inputs. Consult the HEVC + specification for a description of these values. Default undefined + (not signaled) + +Bitstream options +================= + +.. option:: --repeat-headers, --no-repeat-headers + + If enabled, x265 will emit VPS, SPS, and PPS headers with every + keyframe. This is intended for use when you do not have a container + to keep the stream headers for you and you want keyframes to be + random access points. Default disabled + +.. option:: --info, --no-info + + Emit an informational SEI with the stream headers which describes + the encoder version, build info, and encode parameters. This is very + helpful for debugging purposes but encoding version numbers and + build info could make your bitstreams diverge and interfere with + regression testing. Default enabled + +.. option:: --hrd, --no-hrd + + Enable the signalling of HRD parameters to the decoder. The HRD + parameters are carried by the Buffering Period SEI messages and + Picture Timing SEI messages providing timing information to the + decoder. Default disabled + +.. option:: --aud, --no-aud + + Emit an access unit delimiter NAL at the start of each slice access + unit. If :option:`--repeat-headers` is not enabled (indicating the + user will be writing headers manually at the start of the stream) + the very first AUD will be skipped since it cannot be placed at the + start of the access unit, where it belongs. 
Default disabled + +.. option:: --hash + + Emit decoded picture hash SEI, so the decoder may validate the + reconstructed pictures and detect data loss. Also useful as a + debug feature to validate the encoder state. Default None + + 1. MD5 + 2. CRC + 3. Checksum + +Debugging options +================= + +.. option:: --recon, -r + + Output file containing reconstructed images in display order. If the + file extension is ".y4m" the file will contain a YUV4MPEG2 stream + header and frame headers. Otherwise it will be a raw YUV file in the + encoder's internal bit depth. + + **CLI ONLY** + +.. option:: --recon-depth + + Bit-depth of output file. This value defaults to the internal bit + depth and currently cannot be modified. + + **CLI ONLY** + +.. vim: noet diff --git a/doc/reST/conf.py b/doc/reST/conf.py new file mode 100644 index 0000000..561f7d0 --- /dev/null +++ b/doc/reST/conf.py @@ -0,0 +1,17 @@ +# -*- coding: utf-8 -*- +# +# -- General configuration ----------------------------------------------------- + +source_suffix = '.rst' + +# Name of the master file +master_doc = 'index' + +# General information about the project. +project = u'x265' + +# This is the Copyright Information that will appear on the bottom of the document +copyright = u'2014 MulticoreWare Inc' + +# -- Options for HTML output --------------------------------------------------- +html_theme = "default" diff --git a/doc/reST/index.rst b/doc/reST/index.rst new file mode 100644 index 0000000..610f435 --- /dev/null +++ b/doc/reST/index.rst @@ -0,0 +1,11 @@ +x265 Documentation +====================== +.. 
toctree:: + :maxdepth: 2 + + introduction + cli + api + threading + presets + lossless diff --git a/doc/reST/introduction.rst b/doc/reST/introduction.rst new file mode 100644 index 0000000..1d953f4 --- /dev/null +++ b/doc/reST/introduction.rst @@ -0,0 +1,82 @@ +************ +Introduction +************ + +Increasing demand for high definition and ultra-high definition video, +along with an increasing desire for video on demand has led to +exponential growth in demand for bandwidth and storage requirements. +These challenges can be met by the new High Efficiency Video Coding +(HEVC) standard, also known as H.265. The x265 HEVC encoder project was +launched by MulticoreWare in 2013, aiming to provide the most efficient, +highest performance HEVC video encoder. + +About HEVC +========== + +The High Efficiency Video Coding (HEVC) was developed by the ISO/IEC +Moving Picture Experts Group (MPEG) and ITU-T Video Coding Experts Group +(VCEG), through their Joint Collaborative Team on Video Coding (JCT-VC). +HEVC is also known as ISO/IEC 23008-2 MPEG-H Part 2 and ITU-T H.265. +HEVC provides superior video quality and up to twice the data +compression as the previous standard (H.264/MPEG-4 AVC). HEVC can +support 8K Ultra High Definition video, with a picture size up to +8192x4320 pixels. + +About x265 +========== + +The primary objective of x265 is to become the best H.265/HEVC encoder +available anywhere, offering the highest compression efficiency and the +highest performance on a wide variety of hardware platforms. The x265 +encoder is available as an open source library, published under the +GPLv2 license. It is also available under a commercial license, enabling +commercial companies to utilize and distribute x265 in their solutions +without being subject to the restrictions of the GPL license. 
+ +x265 is developed by `MulticoreWare `_, +leaders in high performance software solutions, with backing from +leading video technology providers including `Telestream +`_ and `Doremi Labs +`_ (and other companies who want to remain +anonymous at this time), and with contributions from open source +developers. x265 leverages many of the outstanding video encoding +features and optimizations from the x264 AVC encoder project. + +The x265 software is available for free under the GNU GPL 2 license, +from https://bitbucket.org/multicoreware/x265. For commercial companies +that wish to distribute x265 without being subject to the open source +requirements of the GPL 2 license, commercial licenses are available +with competitive terms. Contact license @ x265.com to inquire about +commercial license terms. + +While x265 is primarily designed as a video encoder software library, a +command-line executable is provided to facilitate testing and +development. We expect x265 to be utilized in many leading video +hardware and software products and services in the coming months. + +LEGAL NOTICES +============= + +The x265 software is owned and copyrighted by MulticoreWare, Inc. +MulticoreWare is committed to offering the x265 software under the GNU +GPL v2 license. Companies who do not wish to integrate the x265 +Software in their products under the terms of the GPL license can +contact MulticoreWare (license @ x265.com) to obtain a commercial +license agreement. Companies who use x265 under the GPL may also wish +to work with MulticoreWare to accelerate the development of specific +features or optimized support for specific hardware or software +platforms, or to contract for support. + +The GNU GPL v2 license or the x265 commercial license agreement govern +your rights to access the copyrighted x265 software source code, but do +not cover any patents that may be applicable to the function of binary +executable software created from the x265 source code. 
You are +responsible for understanding the laws in your country, and for +licensing all applicable patent rights needed for use or distribution of +software applications created from the x265 source code. A good place +to start is with the `Motion Picture Experts Group - Licensing Authority +- HEVC Licensing Program`_. + +x265 is a registered trademark of MulticoreWare, Inc. The x265 logo is +a trademark of MulticoreWare, and may only be used with explicit written +permission. All rights reserved. diff --git a/doc/reST/lossless.rst b/doc/reST/lossless.rst new file mode 100644 index 0000000..1e2a01b --- /dev/null +++ b/doc/reST/lossless.rst @@ -0,0 +1,162 @@ +Lossless +-------- + +Lossless Encoding +================= + +x265 can encode HEVC bitstreams that are entirely lossless (the +reconstructed images are bit-exact to the source images) by using the +:option:`--lossless` option. Lossless operation is theoretically +simple. Rate control, by definition, is disabled and the encoder +disables all quality metrics since they would only waste CPU cycles. +Instead, x265 reports only a compression factor at the end of the +encode. + +In HEVC, lossless coding means bypassing both the DCT transforms and +bypassing quantization (often referred to as transquant bypass). Normal +predictions are still allowed, so the encoder will find optimal inter or +intra predictions and then losslessly code the residual (with transquant +bypass). + +All :option:`--preset` options are capable of generating lossless video +streams, but in general the slower the preset the better the compression +ratio (and the slower the encode). Here are some examples:: + + ./x265 ../test-720p.y4m o.bin --preset ultrafast --lossless + ... ... + encoded 721 frames in 238.38s (3.02 fps), 57457.94 kb/s + + ./x265 ../test-720p.y4m o.bin --preset faster --lossless + ... ... 
+ x265 [info]: lossless compression ratio 3.11::1 + encoded 721 frames in 258.46s (2.79 fps), 56787.65 kb/s + + ./x265 ../test-720p.y4m o.bin --preset slow --lossless + ... ... + x265 [info]: lossless compression ratio 3.36::1 + encoded 721 frames in 576.73s (1.25 fps), 52668.25 kb/s + + ./x265 ../test-720p.y4m o.bin --preset veryslow --lossless + x265 [info]: lossless compression ratio 3.76::1 + encoded 721 frames in 6298.22s (0.11 fps), 47008.65 kb/s + +.. Note:: + In HEVC, only QP=4 is truly lossless quantization, and thus when + encoding losslessly x265 uses QP=4 internally in its RDO decisions. + +Near-lossless Encoding +====================== + +Near-lossless conditions are quite a bit more interesting. Normal ABR +rate control will allow one to scale the bitrate up to the point where +quantization is entirely bypassed (QP <= 4), but even at this point +there is a lot of SSIM left on the table because of the DCT transforms, +which are not lossless:: + + ./x265 ../test-720p.y4m o.bin --preset medium --bitrate 40000 --ssim + encoded 721 frames in 326.62s (2.21 fps), 39750.56 kb/s, SSIM Mean Y: 0.9990703 (30.317 dB) + + ./x265 ../test-720p.y4m o.bin --preset medium --bitrate 50000 --ssim + encoded 721 frames in 349.27s (2.06 fps), 44326.84 kb/s, SSIM Mean Y: 0.9994134 (32.316 dB) + + ./x265 ../test-720p.y4m o.bin --preset medium --bitrate 60000 --ssim + encoded 721 frames in 360.04s (2.00 fps), 45394.50 kb/s, SSIM Mean Y: 0.9994823 (32.859 dB) + +For the encoder to get over this quality plateau, one must enable +lossless coding at the CU level with :option:`--cu-lossless`. It tells +the encoder to evaluate trans-quant bypass as a coding option for each +CU, and to pick the option with the best rate-distortion +characteristics. + +The :option:`--cu-lossless` option is very expensive, computationally, +and it only has a positive effect when the QP is extremely low, allowing +RDO to spend a large amount of bits to make small improvements to +quality. 
So this option should only be enabled when you are encoding +near-lossless bitstreams:: + + ./x265 ../test-720p.y4m o.bin --preset medium --bitrate 40000 --ssim --cu-lossless + encoded 721 frames in 500.51s (1.44 fps), 40017.10 kb/s, SSIM Mean Y: 0.9997790 (36.557 dB) + + ./x265 ../test-720p.y4m o.bin --preset medium --bitrate 50000 --ssim --cu-lossless + encoded 721 frames in 524.60s (1.37 fps), 46083.37 kb/s, SSIM Mean Y: 0.9999432 (42.456 dB) + + ./x265 ../test-720p.y4m o.bin --preset medium --bitrate 60000 --ssim --cu-lossless + encoded 721 frames in 523.63s (1.38 fps), 46552.92 kb/s, SSIM Mean Y: 0.9999489 (42.917 dB) + +.. Note:: + It is not unusual for bitrate to drop as you increase lossless coding. + Having "perfectly coded" reference blocks reduces residual in later + frames. It is quite possible for a near-lossless encode to spend + more bits than a lossless encode. + +Enabling psycho-visual rate distortion will improve lossless coding. +:option:`--psy-rd` influences the RDO decisions in favor of energy +(detail) preservation over bit cost and results in more blocks being +losslessly coded. 
Our psy-rd feature is not yet assembly optimized, so +this makes the encodes run even slower:: + + ./x265 ../test-720p.y4m o.bin --preset medium --bitrate 40000 --ssim --cu-lossless --psy-rd 1.0 + encoded 721 frames in 581.83s (1.24 fps), 40112.15 kb/s, SSIM Mean Y: 0.9998632 (38.638 dB) + + ./x265 ../test-720p.y4m o.bin --preset medium --bitrate 50000 --ssim --cu-lossless --psy-rd 1.0 + encoded 721 frames in 587.54s (1.23 fps), 46284.55 kb/s, SSIM Mean Y: 0.9999663 (44.721 dB) + + ./x265 ../test-720p.y4m o.bin --preset medium --bitrate 60000 --ssim --cu-lossless --psy-rd 1.0 + encoded 721 frames in 592.93s (1.22 fps), 46839.51 kb/s, SSIM Mean Y: 0.9999707 (45.334 dB) + +:option:`--cu-lossless` will also be more effective at slower +presets which perform RDO at more levels and thus may find smaller +blocks that would benefit from lossless coding:: + + ./x265 ../test-720p.y4m o.bin --preset veryslow --bitrate 40000 --ssim --cu-lossless + encoded 721 frames in 12969.25s (0.06 fps), 37331.96 kb/s, SSIM Mean Y: 0.9998108 (37.231 dB) + + ./x265 ../test-720p.y4m o.bin --preset veryslow --bitrate 50000 --ssim --cu-lossless + encoded 721 frames in 46217.84s (0.05 fps), 42976.28 kb/s, SSIM Mean Y: 0.9999482 (42.856 dB) + + ./x265 ../test-720p.y4m o.bin --preset veryslow --bitrate 60000 --ssim --cu-lossless + encoded 721 frames in 13738.17s (0.05 fps), 43864.21 kb/s, SSIM Mean Y: 0.9999633 (44.348 dB) + +And with psy-rd and a slow preset together, very high SSIMs are +possible:: + + ./x265 ../test-720p.y4m o.bin --preset veryslow --bitrate 40000 --ssim --cu-lossless --psy-rd 1.0 + encoded 721 frames in 11675.81s (0.06 fps), 37819.45 kb/s, SSIM Mean Y: 0.9999181 (40.867 dB) + + ./x265 ../test-720p.y4m o.bin --preset veryslow --bitrate 50000 --ssim --cu-lossless --psy-rd 1.0 + encoded 721 frames in 12414.56s (0.06 fps), 42815.75 kb/s, SSIM Mean Y: 0.9999758 (46.168 dB) + + ./x265 ../test-720p.y4m o.bin --preset veryslow --bitrate 60000 --ssim --cu-lossless --psy-rd 1.0 + 
encoded 721 frames in 11684.89s (0.06 fps), 43324.48 kb/s, SSIM Mean Y: 0.9999793 (46.835 dB) + + +It's important to note in the end that it is easier (less work) for the +encoder to encode the video losslessly than it is to encode it +near-losslessly. If the encoder knows up front the encode must be +lossless, it does not need to evaluate any lossy coding methods. The +encoder only needs to find the most efficient prediction for each block +and then entropy code the residual. + +It is not feasible for :option:`--cu-lossless` to turn itself on when +the encoder determines it is encoding a near-lossless bitstream (ie: +when rate control nearly disables all quantization) because the feature +requires a flag to be enabled in the stream headers. At the time the +stream headers are being coded we do not know whether +:option:`--cu-lossless` would be a help or a hinder. If very few or no +blocks end up being coded as lossless, then having the feature enabled +is a net loss in compression efficiency because it adds a flag that must +be coded for every CU. So ignoring even the performance aspects of the +feature, it can be a compression loss if enabled without being used. So +it is up to the user to only enable this feature when they are coding at +near-lossless quality. + +Transform Skip +============== + +A somewhat related feature, :option:`--tskip` tells the encoder to +evaluate transform-skip (bypass DCT but with quantization still enabled) +when coding small 4x4 transform blocks. This feature is intended to +improve the coding efficiency of screen content (aka: text on a screen) +and is not really intended for lossless coding. This feature should +only be enabled if the content has a lot of very sharp edges in it, and +is mostly unrelated to lossless coding. 
diff --git a/doc/reST/presets.rst b/doc/reST/presets.rst new file mode 100644 index 0000000..99085a2 --- /dev/null +++ b/doc/reST/presets.rst @@ -0,0 +1,103 @@ +Preset Options +-------------- + +Presets +======= + +.. _preset-tune-ref: + +x265 has a number of predefined :option:`--preset` options that make +trade-offs between encode speed (encoded frames per second) and +compression efficiency (quality per bit in the bitstream). The default +preset is medium, it does a reasonably good job of finding the best +possible quality without spending enormous CPU cycles looking for the +absolute most efficient way to achieve that quality. As you go higher +than medium, the encoder takes shortcuts to improve performance at the +expense of quality and compression efficiency. As you go lower than +medium, the encoder tries harder and harder to achieve the best quailty +per bit compression ratio. + +The presets adjust encoder parameters to affect these trade-offs. + ++--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+ +| | ultrafast | superfast | veryfast | faster | fast | medium | slow | slower | veryslow | placebo | ++==============+===========+===========+==========+========+======+========+======+========+==========+=========+ +| ctu | 32 | 32 | 32 | 64 | 64 | 64 | 64 | 64 | 64 | 64 | ++--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+ +| bframes | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 8 | 8 | 8 | ++--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+ +| b-adapt | 0 | 0 | 0 | 0 | 2 | 2 | 2 | 2 | 2 | 2 | ++--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+ +| rc-lookahead | 10 | 10 | 15 | 15 | 15 | 20 | 25 | 30 | 40 | 60 | ++--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+ +| 
scenecut | 0 | 40 | 40 | 40 | 40 | 40 | 40 | 40 | 40 | 40 | ++--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+ +| refs | 1 | 1 | 1 | 1 | 3 | 3 | 3 | 3 | 5 | 5 | ++--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+ +| me | dia | hex | hex | hex | hex | hex | star | star | star | star | ++--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+ +| merange | 25 | 44 | 57 | 57 | 57 | 57 | 57 | 57 | 57 | 92 | ++--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+ +| subme | 0 | 1 | 1 | 2 | 2 | 2 | 3 | 3 | 4 | 5 | ++--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+ +| rect | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | ++--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+ +| amp | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | ++--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+ +| max-merge | 2 | 2 | 2 | 2 | 2 | 2 | 3 | 3 | 4 | 5 | ++--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+ +| early-skip | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ++--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+ +| fast-intra | 1 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | ++--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+ +| b-intra | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | ++--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+ +| sao | 0 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 
++--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+ +| signhide | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | ++--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+ +| weightp | 0 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | ++--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+ +| weightb | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | ++--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+ +| aq-mode | 0 | 0 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | ++--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+ +| cuTree | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | ++--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+ +| rdLevel | 2 | 2 | 2 | 2 | 2 | 3 | 4 | 6 | 6 | 6 | ++--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+ +| lft | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | ++--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+ +| tu-intra | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 2 | 3 | 4 | ++--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+ +| tu-inter | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 2 | 3 | 4 | ++--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+ + +Placebo mode enables transform-skip prediction evaluation. + +Tuning +====== + +There are a few :option:`--tune` options available, which are applied +after the preset. + +.. Note:: + + The *psnr* and *ssim* tune options disable all optimizations that + sacrafice metric scores for perceived visual quality (also known as + psycho-visual optimizations). 
By default x265 always tunes for + highest perceived visual quality but if one intends to measure an + encode using PSNR or SSIM for the purpose of benchmarking, we highly + recommend you configure x265 to tune for that particular metric. + ++--------------+-----------------------------------------------------+ +| --tune | effect | ++==============+=====================================================+ +| psnr | disables adaptive quant, psy-rd, and cutree | ++--------------+-----------------------------------------------------+ +| ssim | enables adaptive quant auto-mode, disables psy-rd | ++--------------+-----------------------------------------------------+ +| fastdecode | no loop filters, no weighted pred, no intra in B | ++--------------+-----------------------------------------------------+ +| zerolatency | no lookahead, no B frames, no cutree | ++--------------+-----------------------------------------------------+ diff --git a/doc/reST/threading.rst b/doc/reST/threading.rst new file mode 100644 index 0000000..cbb851d --- /dev/null +++ b/doc/reST/threading.rst @@ -0,0 +1,241 @@ +********* +Threading +********* + +Thread Pool +=========== + +x265 creates a pool of worker threads and shares this thread pool +with all encoders within the same process (it is process global, aka a +singleton). The number of threads within the thread pool is determined +by the encoder which first allocates the pool, which by definition is +the first encoder created within each process. + +:option:`--threads` specifies the number of threads the encoder will +try to allocate for its thread pool. If the thread pool was already +allocated this parameter is ignored. By default x265 allocates one +thread per (hyperthreaded) CPU core in your system. + +Work distribution is job based. Idle worker threads ask their parent +pool object for jobs to perform. When no jobs are available, idle +worker threads block and consume no CPU cycles. 
+ +Objects which desire to distribute work to worker threads are known as +job providers (and they derive from the JobProvider class). When job +providers have work they enqueue themselves into the pool's provider +list (and dequeue themselves when they no longer have work). The thread +pool has a method to **poke** awake a blocked idle thread, and job +providers are recommended to call this method when they make new jobs +available. + +Worker jobs are not allowed to block except when absolutely necessary +for data locking. If a job becomes blocked, the worker thread is +expected to drop that job and go back to the pool and find more work. + +.. note:: + + x265_cleanup() frees the process-global thread pool, allowing + it to be reallocated if necessary, but only if no encoders are + allocated at the time it is called. + +Wavefront Parallel Processing +============================= + +New with HEVC, Wavefront Parallel Processing allows each row of CTUs to +be encoded in parallel, so long as each row stays at least two CTUs +behind the row above it, to ensure the intra references and other data +of the blocks above and above-right are available. WPP has almost no +effect on the analysis and compression of each CTU and so it has a very +small impact on compression efficiency relative to slices or tiles. The +compression loss from WPP has been found to be less than 1% in most of +our tests. + +WPP has three effects which can impact efficiency. The first is the row +starts must be signaled in the slice header, the second is each row must +be padded to an even byte in length, and the third is the state of the +entropy coder is transferred from the second CTU of each row to the +first CTU of the row below it. In some conditions this transfer of +state actually improves compression since the above-right state may have +better locality than the end of the previous row. + +Parabola Research have published an excellent HEVC +`animation `_ +which visualizes WPP very well.
It even correctly visualizes some of +WPPs key drawbacks, such as: + +1. the low thread utilization at the start and end of each frame +2. a difficult block may stall the wave-front and it takes a while for + the wave-front to recover. +3. 64x64 CTUs are big! there are much fewer rows than with H.264 and + similar codecs + +Because of these stall issues you rarely get the full parallelisation +benefit one would expect from row threading. 30% to 50% of the +theoretical perfect threading is typical. + +In x265 WPP is enabled by default since it not only improves performance +at encode but it also makes it possible for the decoder to be threaded. + +If WPP is disabled by :option:`--no-wpp` the frame will be encoded in +scan order and the entropy overheads will be avoided. If frame +threading is not disabled, the encoder will change the default frame +thread count to be higher than if WPP was enabled. The exact formulas +are described in the next section. + +Parallel Mode Analysis +====================== + +When :option:`--pmode` is enabled, each CU (at all depths from 64x64 to +8x8) will distribute its analysis work to the thread pool. Each analysis +job will measure the cost of one prediction for the CU: merge, skip, +intra, inter (2Nx2N, Nx2N, 2NxN, and AMP). At slower presets, the amount +of increased parallelism is often enough to be able to reduce frame +parallelism while achieving the same overall CPU utilization. Reducing +frame threads is often beneficial to ABR and VBV rate control. + +Parallel Motion Estimation +========================== + +When :option:`--pme` is enabled all of the analysis functions which +perform motion searches to reference frames will distribute those motion +searches as jobs for worker threads (if more than two motion searches +are required). + +Frame Threading +=============== + +Frame threading is the act of encoding multiple frames at the same time. 
+It is a challenge because each frame will generally use one or more of +the previously encoded frames as motion references and those frames may +still be in the process of being encoded themselves. + +Previous encoders such as x264 worked around this problem by limiting +the motion search region within these reference frames to just one +macroblock row below the coincident row being encoded. Thus a frame +could be encoded at the same time as its reference frames so long as it +stayed one row behind the encode progress of its references (glossing +over a few details). + +x265 has the same frame threading mechanism, but we generally have much +less frame parallelism to exploit than x264 because of the size of our +CTU rows. For instance, with 1080p video x264 has 68 16x16 macroblock +rows available each frame while x265 only has 17 64x64 CTU rows. + +The second extenuating circumstance is the loop filters. The pixels used +for motion reference must be processed by the loop filters and the loop +filters cannot run until a full row has been encoded, and it must run a +full row behind the encode process so that the pixels below the row +being filtered are available. When you add up all the row lags each +frame ends up being 3 CTU rows behind its reference frames (the +equivalent of 12 macroblock rows for x264) + +The third extenuating circumstance is that when a frame being encoded +becomes blocked by a reference frame row being available, that frame's +wave-front becomes completely stalled and when the row becomes available +again it can take quite some time for the wave to be restarted, if it +ever does. This makes WPP many times less effective when frame +parallelism is in use. + +:option:`--merange` can have a negative impact on frame parallelism. If +the range is too large, more rows of CTU lag must be added to ensure +those pixels are available in the reference frames. + +.. 
note:: + + Even though the merange is used to determine the amount of reference + pixels that must be available in the reference frames, the actual + motion search is not necessarily centered around the coincident + block. The motion search is actually centered around the motion + predictor, but the available pixel area (mvmin, mvmax) is determined + by merange and the interpolation filter half-heights. + +When frame threading is disabled, the entirety of all reference frames +are always fully available (by definition) and thus the available pixel +area is not restricted at all, and this can sometimes improve +compression efficiency. Because of this, the output of encodes with +frame parallelism disabled will not match the output of encodes with +frame parallelism enabled; but when enabled the number of frame threads +should have no effect on the output bitstream except when using ABR or +VBV rate control or noise reduction. + +When :option:`--nr` is enabled, the outputs of each number of frame threads +will be deterministic but none of them will match because each frame +encoder maintains a cumulative noise reduction state. + +VBV introduces non-determinism in the encoder, at this point in time, +regardless of the amount of frame parallelism. + +By default frame parallelism and WPP are enabled together. The number of +frame threads used is auto-detected from the (hyperthreaded) CPU core +count, but may be manually specified via :option:`--frame-threads` + + +-------+--------+ + | Cores | Frames | + +=======+========+ + | > 32 | 6 | + +-------+--------+ + | >= 16 | 5 | + +-------+--------+ + | >= 8 | 3 | + +-------+--------+ + | >= 4 | 2 | + +-------+--------+ + +If WPP is disabled, then the frame thread count defaults to **min(cpuCount, ctuRows / 2)** + +Over-allocating frame threads can be very counter-productive.
They +each allocate a large amount of memory and because of the limited number +of CTU rows and the reference lag, you generally get limited benefit +from adding frame encoders beyond the auto-detected count, and often +the extra frame encoders reduce performance. + +Given these considerations, you can understand why the faster presets +lower the max CTU size to 32x32 (making twice as many CTU rows available +for WPP and for finer grained frame parallelism) and reduce +:option:`--merange` + +Each frame encoder runs in its own thread (allocated separately from the +worker pool). This frame thread has some pre-processing responsibilities +and some post-processing responsibilities for each frame, but it spends +the bulk of its time managing the wave-front processing by making CTU +rows available to the worker threads when their dependencies are +resolved. The frame encoder threads spend nearly all of their time +blocked in one of 4 possible locations: + +1. blocked, waiting for a frame to process +2. blocked on a reference frame, waiting for a CTU row of reconstructed + and loop-filtered reference pixels to become available +3. blocked waiting for wave-front completion +4. blocked waiting for the main thread to consume an encoded frame + +Lookahead +========= + +The lookahead module of x265 (the lowres pre-encode which determines +scene cuts and slice types) uses the thread pool to distribute the +lowres cost analysis to worker threads. It follows the same wave-front +pattern as the main encoder except it works in reverse-scan order. + +The function slicetypeDecide() itself may also be performed by a worker +thread if your system has enough CPU cores to make this a beneficial +trade-off, else it runs within the context of the thread which calls the +x265_encoder_encode(). + +SAO +=== + +The Sample Adaptive Offset loopfilter has a large effect on encode +performance because of the peculiar way it must be analyzed and coded. 
+ +SAO flags and data are encoded at the CTU level before the CTU itself is +coded, but SAO analysis (deciding whether to enable SAO and with what +parameters) cannot be performed until that CTU is completely analyzed +(reconstructed pixels are available) as well as the CTUs to the right +and below. So in effect the encoder must perform SAO analysis in a +wavefront at least a full row behind the CTU compression wavefront. + +This extra latency forces the encoder to save the encode data of every +CTU until the entire frame has been analyzed, at which point a function +can code the final slice bitstream with the decided SAO flags and data +interleaved between each CTU. This second pass over the CTUs can be +expensive, particularly at large resolutions and high bitrates. diff --git a/doc/uncrustify/codingstyle.cfg b/doc/uncrustify/codingstyle.cfg new file mode 100644 index 0000000..e517bcd --- /dev/null +++ b/doc/uncrustify/codingstyle.cfg @@ -0,0 +1,232 @@ +align_func_params=true +align_keep_tabs=false +align_left_shift=true +align_mix_var_proto=false +align_nl_cont=false +align_number_left=false +align_oc_decl_colon=false +align_on_operator=false +align_on_tabstop=false +align_right_cmt_mix=false +align_single_line_brace=false +align_single_line_func=false +align_var_def_attribute=false +align_var_def_colon=false +align_var_def_inline=false +align_with_tabs=false +cmt_c_group=true +cmt_c_nl_end=false +cmt_c_nl_start=true +cmt_cpp_group=true +cmt_cpp_nl_end=false +cmt_cpp_nl_start=false +cmt_cpp_to_c=false +cmt_indent_multi=false +cmt_insert_before_preproc=false +cmt_multi_check_last=true +cmt_reflow_mode=1 +cmt_sp_before_star_cont=0 +cmt_star_cont=true +cmt_width=130 +#code_width=130 +eat_blanks_after_open_brace=true +eat_blanks_before_close_brace=true +indent_access_spec_body=false +indent_align_assign=false +indent_align_string=false +indent_bool_paren=false +indent_brace_parent=false +indent_braces=false +indent_braces_no_class=false +indent_braces_no_func=false 
+indent_braces_no_struct=false +indent_class=true +indent_class_colon=false +indent_cmt_with_tabs=false +indent_col1_comment=false +indent_columns=4 +indent_comma_paren=false +indent_else_if=false +indent_extern=false +indent_first_bool_expr=false +indent_func_call_param=false +indent_func_class_param=false +indent_func_ctor_var_param=false +indent_func_def_param=false +indent_func_param_double=false +indent_func_proto_param=false +indent_namespace=false +indent_paren_nl=false +indent_preserve_sql=false +indent_relative_single_line_comments=false +indent_square_nl=false +indent_template_param=false +indent_var_def_cont=false +indent_with_tabs=0 +input_tab_size=2 +ls_for_split_full=true +ls_func_split_full=true +mod_add_long_ifdef_else_comment=10 +mod_add_long_ifdef_endif_comment=10 +mod_full_brace_do=add +mod_full_brace_for=add +mod_full_brace_if=ignore +mod_full_brace_if_chain=false +mod_full_brace_while=add +mod_full_paren_if_bool=false +mod_move_case_break=false +mod_paren_on_return=remove +mod_pawn_semicolon=false +mod_remove_empty_return=true +mod_remove_extra_semicolon=true +mod_sort_import=false +mod_sort_include=false +mod_sort_using=false +newlines=lf +nl_after_access_spec=2 +#nl_after_brace_close=ignore +#nl_after_brace_open=ignore +nl_after_brace_open_cmt=true +nl_after_case=false +nl_after_class=2 +nl_after_for=add +nl_after_func_body=2 +nl_after_func_body_one_liner=2 +nl_after_if=ignore +nl_after_multiline_comment=true +nl_after_return=false +nl_after_semicolon=true +nl_after_struct=2 +nl_after_switch=add +nl_after_vbrace_close=false +nl_after_vbrace_open=false +nl_after_vbrace_open_empty=false +nl_after_while=add +nl_assign_brace=add +nl_assign_leave_one_liners=true +nl_before_access_spec=2 +nl_before_block_comment=2 +nl_before_case=false +nl_brace_else=add +nl_brace_while=add +nl_case_colon_brace=add +nl_class_brace=add +nl_class_init_args=ignore +nl_class_leave_one_liners=true +nl_collapse_empty_body=false +nl_create_for_one_liner=false 
+nl_create_if_one_liner=false +nl_create_while_one_liner=false +nl_define_macro=false +nl_do_brace=add +nl_ds_struct_enum_close_brace=false +nl_ds_struct_enum_cmt=false +nl_else_brace=add +nl_else_if=remove +nl_elseif_brace=add +nl_end_of_file=add +nl_end_of_file_min=1 +nl_enum_brace=add +nl_enum_leave_one_liners=true +nl_fdef_brace=add +nl_for_brace=add +nl_func_decl_end=remove +nl_func_decl_start=remove +nl_func_def_paren=remove +nl_func_def_start=remove +nl_func_leave_one_liners=true +nl_func_paren=remove +nl_func_proto_type_name=remove +nl_func_type_name=remove +nl_func_type_name_class=remove +nl_func_var_def_blk=2 +nl_getset_leave_one_liners=true +nl_if_brace=add +nl_if_leave_one_liners=true +nl_max=2 +nl_multi_line_cond=false +nl_multi_line_define=false +nl_namespace_brace=remove +nl_return_expr=remove +nl_squeeze_ifdef=false +nl_start_of_file=remove +nl_struct_brace=add +nl_switch_brace=add +nl_template_class=add +nl_while_brace=add +pp_define_at_level=false +pp_if_indent_code=false +pp_indent=remove +pp_indent_at_level=false +pp_region_indent_code=false +sp_addr=remove +sp_after_angle=remove +sp_after_cast=remove +sp_after_class_colon=add +sp_after_comma=add +sp_after_dc=remove +sp_after_new=add +sp_after_operator=add +sp_after_operator_sym=remove +sp_after_type=ignore +sp_angle_paren=remove +sp_angle_word=add +sp_arith=add +sp_assign=add +sp_assign_default=add +sp_attribute_paren=remove +sp_balance_nested_parens=false +sp_before_angle=remove +sp_before_case_colon=remove +sp_before_class_colon=add +sp_before_comma=remove +sp_before_dc=remove +sp_before_nl_cont=add +sp_before_semi=remove +sp_before_semi_for=remove +sp_before_semi_for_empty=remove +sp_before_sparen=add +sp_before_square=remove +sp_before_squares=ignore +sp_before_tr_emb_cmt=add +sp_bool=add +sp_brace_else=add +sp_cmt_cpp_start=ignore +sp_compare=add +sp_cond_colon=add +sp_cond_question=add +sp_cpp_cast_paren=remove +sp_defined_paren=remove +sp_deref=remove +sp_else_brace=add +sp_endif_cmt=add 
+sp_enum_assign=add +sp_fparen_brace=add +sp_func_call_paren=remove +sp_func_class_paren=remove +sp_func_def_paren=remove +sp_func_proto_paren=remove +sp_incdec=remove +sp_inside_angle=remove +sp_inside_braces=add +#sp_inside_braces_empty=remove +sp_inside_fparen=remove +sp_inside_fparens=remove +sp_inside_paren=remove +sp_inside_paren_cast=remove +sp_inside_sparen=remove +sp_inside_square=remove +sp_inv=remove +sp_member=remove +sp_not=remove +sp_paren_brace=add +sp_paren_paren=remove +sp_pp_concat=add +sp_sign=remove +sp_sizeof_paren=remove +sp_special_semi=ignore +sp_template_angle=remove +tok_split_gte=false +utf8_bom=remove +utf8_byte=false +utf8_force=false diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt new file mode 100644 index 0000000..ba63f81 --- /dev/null +++ b/source/CMakeLists.txt @@ -0,0 +1,380 @@ +# vim: syntax=cmake +if(NOT CMAKE_BUILD_TYPE) + # default to Release build for GCC builds + set(CMAKE_BUILD_TYPE Release CACHE STRING + "Choose the type of build, options are: None(CMAKE_CXX_FLAGS or CMAKE_C_FLAGS used) Debug Release RelWithDebInfo MinSizeRel." 
+ FORCE) +endif() +message(STATUS "cmake version ${CMAKE_VERSION}") +if(POLICY CMP0025) + cmake_policy(SET CMP0025 OLD) # report Apple's Clang as just Clang +endif() +if(POLICY CMP0042) + cmake_policy(SET CMP0042 NEW) # MACOSX_RPATH +endif() + +project (x265) +cmake_minimum_required (VERSION 2.8.8) # OBJECT libraries require 2.8.8 +include(CheckIncludeFiles) +include(CheckFunctionExists) +include(CheckSymbolExists) +include(CheckCXXCompilerFlag) + +# X265_BUILD must be incremented each time the public API is changed +set(X265_BUILD 35) +configure_file("${PROJECT_SOURCE_DIR}/x265.def.in" + "${PROJECT_BINARY_DIR}/x265.def") +configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in" + "${PROJECT_BINARY_DIR}/x265_config.h") + +SET(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake" "${CMAKE_MODULE_PATH}") + +option(CHECKED_BUILD "Enable run-time sanity checks (debugging)" OFF) +if(CHECKED_BUILD) + add_definitions(-DCHECKED_BUILD=1) +endif() + +# System architecture detection +string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" SYSPROC) +set(X86_ALIASES x86 i386 i686 x86_64 amd64) +list(FIND X86_ALIASES "${SYSPROC}" X86MATCH) +if("${SYSPROC}" STREQUAL "" OR X86MATCH GREATER "-1") + message(STATUS "Detected x86 target processor") + set(X86 1) + add_definitions(-DX265_ARCH_X86=1) + if("${CMAKE_SIZEOF_VOID_P}" MATCHES 8) + set(X64 1) + add_definitions(-DX86_64=1) + endif() +elseif(${SYSPROC} STREQUAL "armv6l") + message(STATUS "Detected ARM target processor") + set(ARM 1) + add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1) +else() + message(STATUS "CMAKE_SYSTEM_PROCESSOR value `${CMAKE_SYSTEM_PROCESSOR}` is unknown") + message(STATUS "Please add this value near ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE}") +endif() + +if(UNIX) + SET(PLATFORM_LIBS pthread) + find_library(LIBRT rt) + if(LIBRT) + set(PLATFORM_LIBS ${PLATFORM_LIBS} rt) + endif() +endif(UNIX) + +# Compiler detection +if(CMAKE_GENERATOR STREQUAL "Xcode") + set(XCODE 1) +endif() +if (APPLE) + add_definitions(-DMACOS) 
+endif() + +if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") + set(CLANG 1) +endif() +if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel") + set(INTEL_CXX 1) +endif() +if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") + set(GCC 1) +endif() + +if(INTEL_CXX AND WIN32) + # treat icl roughly like MSVC + set(MSVC 1) +endif() +if(MSVC) + option(STATIC_LINK_CRT "Statically link C runtime for release builds" OFF) + if (STATIC_LINK_CRT) + set(CompilerFlags CMAKE_CXX_FLAGS_RELEASE CMAKE_C_FLAGS_RELEASE) + foreach(CompilerFlag ${CompilerFlags}) + string(REPLACE "/MD" "/MT" ${CompilerFlag} "${${CompilerFlag}}") + endforeach() + endif (STATIC_LINK_CRT) + add_definitions(/W4) # Full warnings + add_definitions(/Ob2) # always inline + add_definitions(/MP) # multithreaded build + + # disable Microsofts suggestions for proprietary secure APIs + add_definitions(/D_CRT_SECURE_NO_WARNINGS) + + check_include_files(stdint.h HAVE_STDINT_H) + if(NOT HAVE_STDINT_H) + include_directories(compat/msvc) + endif() +endif(MSVC) + +check_include_files(inttypes.h HAVE_INT_TYPES_H) +if(HAVE_INT_TYPES_H) + add_definitions(-DHAVE_INT_TYPES_H=1) +endif() + +if(INTEL_CXX AND UNIX) + set(GCC 1) # treat icpc roughly like gcc +elseif(CLANG) + set(GCC 1) # treat clang roughly like gcc +elseif(CMAKE_COMPILER_IS_GNUCXX) + set(GCC 1) +endif() +if(GCC) + add_definitions(-Wall -Wextra -Wshadow) + add_definitions(-D__STDC_LIMIT_MACROS=1) + if(X64 AND NOT WIN32) + add_definitions(-fPIC) + endif(X64 AND NOT WIN32) + if(X86 AND NOT X64) + add_definitions(-march=i686) + endif() + if(ARM) + add_definitions(-march=armv6 -mfloat-abi=hard -mfpu=vfp) + endif() + check_cxx_compiler_flag(-Wno-narrowing CC_HAS_NO_NARROWING) + check_cxx_compiler_flag(-Wno-array-bounds CC_HAS_NO_ARRAY_BOUNDS) + if (CC_HAS_NO_ARRAY_BOUNDS) + add_definitions(-Wno-array-bounds) # these are unhelpful + endif() + check_cxx_compiler_flag(-ffast-math CC_HAS_FAST_MATH) + if (CC_HAS_FAST_MATH) + add_definitions(-ffast-math) + endif() + 
check_cxx_compiler_flag(-mstackrealign CC_HAS_STACK_REALIGN) + if (CC_HAS_STACK_REALIGN) + add_definitions(-mstackrealign) + endif() + # Disable exceptions. Reduce executable size, increase compatibility. + check_cxx_compiler_flag(-fno-exceptions CC_HAS_FNO_EXCEPTIONS_FLAG) + if(CC_HAS_FNO_EXCEPTIONS_FLAG) + add_definitions(-fno-exceptions) + endif() + execute_process(COMMAND ${CMAKE_CXX_COMPILER} -dumpversion OUTPUT_VARIABLE CC_VERSION) +endif(GCC) + +find_package(Yasm) +if(YASM_FOUND AND X86) + if (YASM_VERSION_STRING VERSION_LESS "1.2.0") + message(STATUS "Yasm version ${YASM_VERSION_STRING} is too old. 1.2.0 or later required") + option(ENABLE_ASSEMBLY "Enable use of assembly coded primitives" OFF) + else() + message(STATUS "Found Yasm ${YASM_VERSION_STRING} to build assembly primitives") + option(ENABLE_ASSEMBLY "Enable use of assembly coded primitives" ON) + endif() +endif() + +# Build options +set(LIB_INSTALL_DIR lib CACHE STRING "Install location of libraries") +set(BIN_INSTALL_DIR bin CACHE STRING "Install location of executables") + +if(X64) + # NOTE: We only officially support 16bit-per-pixel compiles of x265 + # on 64bit architectures. 16bpp plus large resolution plus slow + # preset plus 32bit address space usually means malloc failure. You + # can disable this if(X64) check if you desperately need a 32bit + # build with 10bit/12bit support, but this violates the "shrink wrap + # license" so to speak. If it breaks you get to keep both halves.
+ option(HIGH_BIT_DEPTH "Store pixels as 16bit values" OFF) +endif(X64) +if(HIGH_BIT_DEPTH) + add_definitions(-DHIGH_BIT_DEPTH=1) +else(HIGH_BIT_DEPTH) + add_definitions(-DHIGH_BIT_DEPTH=0) +endif(HIGH_BIT_DEPTH) + +option(WARNINGS_AS_ERRORS "Stop compiles on first warning" OFF) +if(WARNINGS_AS_ERRORS) + if(GCC) + add_definitions(-Werror) + elseif(MSVC) + add_definitions(/WX) + endif() +endif(WARNINGS_AS_ERRORS) + + +option(ENABLE_PPA "Enable PPA profiling instrumentation" OFF) +if(ENABLE_PPA) + add_definitions(-DENABLE_PPA) + add_subdirectory(PPA) + SET(PLATFORM_LIBS ${PLATFORM_LIBS} PPA) + if(UNIX) + SET(PLATFORM_LIBS ${PLATFORM_LIBS} dl) + endif(UNIX) +endif(ENABLE_PPA) + +if (WIN32) + # Visual leak detector + find_package(VLD QUIET) + if(VLD_FOUND) + add_definitions(-DHAVE_VLD) + include_directories(${VLD_INCLUDE_DIRS}) + set(PLATFORM_LIBS ${PLATFORM_LIBS} ${VLD_LIBRARIES}) + link_directories(${VLD_LIBRARY_DIRS}) + endif() + option(WINXP_SUPPORT "Make binaries compatible with Windows XP" OFF) + if(WINXP_SUPPORT) + # force use of workarounds for CONDITION_VARIABLE and atomic + # intrinsics introduced after XP + add_definitions(-D_WIN32_WINNT=_WIN32_WINNT_WINXP) + endif() +endif() + +include(version) # determine X265_VERSION and X265_LATEST_TAG +include_directories(. 
common encoder "${PROJECT_BINARY_DIR}") +add_subdirectory(encoder) +add_subdirectory(common) + +if((MSVC_IDE OR XCODE) AND ENABLE_ASSEMBLY) + # this is required because of this cmake bug + # http://www.cmake.org/Bug/print_bug_page.php?bug_id=8170 + if(WIN32) + set(SUFFIX obj) + else() + set(SUFFIX o) + endif() + foreach(ASM ${MSVC_ASMS}) + set(YASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/x86/${ASM}) + list(APPEND YASM_SRCS ${YASM_SRC}) + list(APPEND YASM_OBJS ${ASM}.${SUFFIX}) + add_custom_command( + OUTPUT ${ASM}.${SUFFIX} + COMMAND ${YASM_EXECUTABLE} ARGS ${YASM_FLAGS} ${YASM_SRC} -o ${ASM}.${SUFFIX} + DEPENDS ${YASM_SRC}) + endforeach() +endif() + +source_group(ASM FILES ${YASM_SRCS}) +add_library(x265-static STATIC $ $ ${YASM_OBJS} ${YASM_SRCS}) +if(NOT MSVC) + set_target_properties(x265-static PROPERTIES OUTPUT_NAME x265) +endif() +install(TARGETS x265-static + LIBRARY DESTINATION ${LIB_INSTALL_DIR} + ARCHIVE DESTINATION ${LIB_INSTALL_DIR}) +install(FILES x265.h "${PROJECT_BINARY_DIR}/x265_config.h" DESTINATION include) + +if(CMAKE_RC_COMPILER) + # The resource compiler does not need CFLAGS or macro defines. It + # often breaks them + string(REPLACE "" "" CMAKE_RC_COMPILE_OBJECT "${CMAKE_RC_COMPILE_OBJECT}") + string(REPLACE "" "" CMAKE_RC_COMPILE_OBJECT "${CMAKE_RC_COMPILE_OBJECT}") + + # convert X265_LATEST_TAG (ex: 0.7) and X265_TAG_DISTANCE (ex: 103) to + # @X265_VERSION_MAJOR@,@X265_VERSION_MINOR@,@X265_BRANCH_ID@,@X265_TAG_DISTANCE@ + string(REPLACE "." 
";" VERSION_LIST "${X265_LATEST_TAG}") + list(GET VERSION_LIST 0 X265_VERSION_MAJOR) + list(GET VERSION_LIST 1 X265_VERSION_MINOR) + set(X265_BRANCH_ID 0) # TODO: 0 - stable, 1 - default or other + set(X265_RC_FILE "${CMAKE_CURRENT_BINARY_DIR}/x265.rc") + configure_file("${CMAKE_CURRENT_SOURCE_DIR}/x265.rc.in" "${X265_RC_FILE}" @ONLY) +endif() + +if(NOT (MSVC_IDE OR XCODE)) + add_custom_target(clean-generated COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/clean-generated.cmake) +endif() + +option(ENABLE_SHARED "Build shared library" ON) +if(ENABLE_SHARED) + add_library(x265-shared SHARED "${PROJECT_BINARY_DIR}/x265.def" ${YASM_OBJS} + ${X265_RC_FILE} $ $) + target_link_libraries(x265-shared ${PLATFORM_LIBS}) + if(MSVC) + set_target_properties(x265-shared PROPERTIES OUTPUT_NAME libx265) + else() + set_target_properties(x265-shared PROPERTIES OUTPUT_NAME x265) + endif() + if(UNIX) + set_target_properties(x265-shared PROPERTIES VERSION ${X265_BUILD}) + if(APPLE) + set_target_properties(x265-shared PROPERTIES MACOSX_RPATH 1) + else() + set_target_properties(x265-shared PROPERTIES LINK_FLAGS "-Wl,-Bsymbolic,-znoexecstack") + endif() + endif() + set_target_properties(x265-shared PROPERTIES SOVERSION ${X265_BUILD}) + if(X265_LATEST_TAG) + if(WINDOWS) + set_target_properties(x265-shared PROPERTIES VERSION ${X265_LATEST_TAG}) + endif() + # shared library is not installed if a tag is not found + install(TARGETS x265-shared + LIBRARY DESTINATION ${LIB_INSTALL_DIR} + ARCHIVE DESTINATION ${LIB_INSTALL_DIR} + RUNTIME DESTINATION ${BIN_INSTALL_DIR}) + endif() +endif() + +if(X265_LATEST_TAG) + # convert lists of link libraries into -lstdc++ -lm etc.. 
+ foreach(LIB ${CMAKE_CXX_IMPLICIT_LINK_LIBRARIES} ${PLATFORM_LIBS}) + if(IS_ABSOLUTE ${LIB} AND EXISTS ${LIB}) + list(APPEND PLIBLIST "${LIB}") + else() + list(APPEND PLIBLIST "-l${LIB}") + endif() + endforeach() + if(PLIBLIST) + # blacklist of libraries that should not be in Libs.private + list(REMOVE_ITEM PLIBLIST "-lc" "-lpthread") + string(REPLACE ";" " " PRIVATE_LIBS "${PLIBLIST}") + else() + set(PRIVATE_LIBS "") + endif(PLIBLIST) + + # Produce a pkg-config file + configure_file("x265.pc.in" "x265.pc" @ONLY) + install(FILES "${CMAKE_CURRENT_BINARY_DIR}/x265.pc" + DESTINATION "${CMAKE_INSTALL_PREFIX}/${LIB_INSTALL_DIR}/pkgconfig") +endif() + +if(NOT WIN32) + configure_file("${CMAKE_CURRENT_SOURCE_DIR}/cmake/cmake_uninstall.cmake.in" + "${CMAKE_CURRENT_BINARY_DIR}/cmake/cmake_uninstall.cmake" + IMMEDIATE @ONLY) + add_custom_target(uninstall + "${CMAKE_COMMAND}" -P "${CMAKE_CURRENT_BINARY_DIR}/cmake/cmake_uninstall.cmake") +endif() + +# Main CLI application +option(ENABLE_CLI "Build standalone CLI application" ON) +if(ENABLE_CLI) + file(GLOB InputFiles input/*.cpp input/*.h) + file(GLOB OutputFiles output/*.cpp output/*.h) + file(GLOB FilterFiles filters/*.cpp filters/*.h) + source_group(input FILES ${InputFiles}) + source_group(output FILES ${OutputFiles}) + source_group(filters FILES ${FilterFiles}) + + check_include_files(getopt.h HAVE_GETOPT_H) + if(NOT HAVE_GETOPT_H) + if(MSVC) + set_source_files_properties(compat/getopt/getopt.c PROPERTIES COMPILE_FLAGS "/wd4100 /wd4131 -DHAVE_STRING_H=1") + endif(MSVC) + include_directories(compat/getopt) + set(GETOPT compat/getopt/getopt.c compat/getopt/getopt.h) + endif(NOT HAVE_GETOPT_H) + + if(XCODE) + # Xcode seems unable to link the CLI with libs, so link as one target + add_executable(cli ../COPYING ${InputFiles} ${OutputFiles} ${FilterFiles} ${GETOPT} x265.cpp x265.h + $ $ ${YASM_OBJS} ${YASM_SRCS}) + else() + add_executable(cli ../COPYING ${InputFiles} ${OutputFiles} ${FilterFiles} ${GETOPT} ${X265_RC_FILE}
x265.cpp x265.h) + if(WIN32 OR NOT ENABLE_SHARED OR INTEL_CXX) + # The CLI cannot link to the shared library on Windows, it + # requires internal APIs not exported from the DLL + target_link_libraries(cli x265-static ${PLATFORM_LIBS}) + else() + target_link_libraries(cli x265-shared ${PLATFORM_LIBS}) + endif() + endif() + set_target_properties(cli PROPERTIES OUTPUT_NAME x265) + + install(TARGETS cli DESTINATION ${BIN_INSTALL_DIR}) +endif(ENABLE_CLI) + +if(ENABLE_ASSEMBLY AND NOT XCODE) + option(ENABLE_TESTS "Enable Unit Tests" OFF) + if(ENABLE_TESTS) + add_subdirectory(test) + endif() +endif() diff --git a/source/PPA/CMakeLists.txt b/source/PPA/CMakeLists.txt new file mode 100644 index 0000000..de13ddf --- /dev/null +++ b/source/PPA/CMakeLists.txt @@ -0,0 +1 @@ +add_library(PPA ppa.h ppaApi.h ppaCPUEvents.h ppa.cpp) diff --git a/source/PPA/ppa.cpp b/source/PPA/ppa.cpp new file mode 100644 index 0000000..607a946 --- /dev/null +++ b/source/PPA/ppa.cpp @@ -0,0 +1,145 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. 
+ *****************************************************************************/ + +#if defined(ENABLE_PPA) + +#include "ppa.h" +#include + +#define PPA_REGISTER_CPU_EVENT2GROUP(x, y) # x, # y, +#define PPA_REGISTER_CPU_EVENT(x) PPA_REGISTER_CPU_EVENT2GROUP(x, NoGroup) +const char *PPACpuAndGroup[] = +{ +#include "ppaCPUEvents.h" + "" +}; +#undef PPA_REGISTER_CPU_EVENT +#undef PPA_REGISTER_CPU_EVENT2GROUP + +extern "C" { +typedef ppa::Base *(FUNC_PPALibInit)(const char **, int); +typedef void (FUNC_PPALibRelease)(ppa::Base* &); +} + +static FUNC_PPALibRelease *_pfuncPpaRelease; +ppa::Base *ppabase; + +static void _ppaReleaseAtExit() +{ + _pfuncPpaRelease(ppabase); +} + +#ifdef _WIN32 +#include + +#if defined(_M_X64) || defined(__x86_64__) || defined(__amd64__) +# ifdef UNICODE +# define PPA_DLL_NAME L"ppa64.dll" +# else +# define PPA_DLL_NAME "ppa64.dll" +# endif +#else +# ifdef UNICODE +# define PPA_DLL_NAME L"ppa.dll" +# else +# define PPA_DLL_NAME "ppa.dll" +# endif +#endif // if defined(_M_X64) || defined(__x86_64__) || defined(__amd64__) + +void initializePPA(void) +{ + if (ppabase) + return; + + HMODULE _ppaLibHandle = LoadLibrary(PPA_DLL_NAME); + if (!_ppaLibHandle) + return; + + FUNC_PPALibInit *_pfuncPpaInit = (FUNC_PPALibInit*)GetProcAddress(_ppaLibHandle, "InitPpaUtil"); + _pfuncPpaRelease = (FUNC_PPALibRelease*)GetProcAddress(_ppaLibHandle, "DeletePpa"); + + if (!_pfuncPpaInit || !_pfuncPpaRelease) + { + FreeLibrary(_ppaLibHandle); + return; + } + + ppabase = _pfuncPpaInit(PPACpuAndGroup, PPACpuGroupNums); + if (!ppabase) + { + FreeLibrary(_ppaLibHandle); + return; + } + + atexit(_ppaReleaseAtExit); +} + +#else /* linux & unix & cygwin */ +#include +#include + +#if defined(_M_X64) || defined(__x86_64__) || defined(__amd64__) +# define PPA_LIB_NAME "libppa64.so" +#else +# define PPA_LIB_NAME "libppa.so" +#endif + +void initializePPA(void) +{ + if (ppabase) + { + printf("PPA: already initialized\n"); + return; + } + + void *_ppaDllHandle = 
dlopen(PPA_LIB_NAME, RTLD_LAZY); + if (!_ppaDllHandle) + { + printf("PPA: Unable to load %s\n", PPA_LIB_NAME); + return; + } + + FUNC_PPALibInit *_pfuncPpaInit = (FUNC_PPALibInit*)dlsym(_ppaDllHandle, "InitPpaUtil"); + _pfuncPpaRelease = (FUNC_PPALibRelease*)dlsym(_ppaDllHandle, "DeletePpa"); + + if (!_pfuncPpaInit || !_pfuncPpaRelease) + { + printf("PPA: Function bindings failed\n"); + dlclose(_ppaDllHandle); + return; + } + + ppabase = _pfuncPpaInit(PPACpuAndGroup, PPACpuGroupNums); + if (!ppabase) + { + printf("PPA: Init failed\n"); + dlclose(_ppaDllHandle); + return; + } + + atexit(_ppaReleaseAtExit); +} + +#endif /* !_WIN32 */ + +#endif /* defined(ENABLE_PPA) */ diff --git a/source/PPA/ppa.h b/source/PPA/ppa.h new file mode 100644 index 0000000..42f43b8 --- /dev/null +++ b/source/PPA/ppa.h @@ -0,0 +1,71 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. 
+ *****************************************************************************/ + +#ifndef _PPA_H_ +#define _PPA_H_ + +#if !defined(ENABLE_PPA) + +#define PPA_INIT() +#define PPAStartCpuEventFunc(e) +#define PPAStopCpuEventFunc(e) +#define PPAScopeEvent(e) + +#else + +/* declare enum list of users CPU events */ +#define PPA_REGISTER_CPU_EVENT(x) x, +enum PPACpuEventEnum +{ +#include "ppaCPUEvents.h" + PPACpuGroupNums +}; + +#undef PPA_REGISTER_CPU_EVENT + +#define PPA_INIT() initializePPA() +#define PPAStartCpuEventFunc(e) if (ppabase) ppabase->triggerStartEvent(ppabase->getEventId(e)) +#define PPAStopCpuEventFunc(e) if (ppabase) ppabase->triggerEndEvent(ppabase->getEventId(e)) +#define PPAScopeEvent(e) _PPAScope __scope_(e) + +#include "ppaApi.h" + +void initializePPA(); +extern ppa::Base *ppabase; + +class _PPAScope +{ +protected: + + ppa::EventID m_id; + +public: + + _PPAScope(int e) { if (ppabase) { m_id = ppabase->getEventId(e); ppabase->triggerStartEvent(m_id); } else m_id = 0; } + + ~_PPAScope() { if (ppabase) ppabase->triggerEndEvent(m_id); } +}; + +#endif // if !defined(ENABLE_PPA) + +#endif /* _PPA_H_ */ diff --git a/source/PPA/ppaApi.h b/source/PPA/ppaApi.h new file mode 100644 index 0000000..149de6d --- /dev/null +++ b/source/PPA/ppaApi.h @@ -0,0 +1,59 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#ifndef _PPA_API_H_ +#define _PPA_API_H_ + +namespace ppa { +// PPA private namespace + +typedef unsigned short EventID; +typedef unsigned char GroupID; + +class Base +{ +public: + + virtual ~Base() {} + + virtual bool isEventFiltered(EventID eventId) const = 0; + virtual bool configEventById(EventID eventId, bool filtered) const = 0; + virtual int configGroupById(GroupID groupId, bool filtered) const = 0; + virtual void configAllEvents(bool filtered) const = 0; + virtual EventID registerEventByName(const char *pEventName) = 0; + virtual GroupID registerGroupByName(const char *pGroupName) = 0; + virtual EventID registerEventInGroup(const char *pEventName, GroupID groupId) = 0; + virtual void triggerStartEvent(EventID eventId) = 0; + virtual void triggerEndEvent(EventID eventId) = 0; + virtual void triggerTidEvent(EventID eventId, unsigned int data) = 0; + virtual void triggerDebugEvent(EventID eventId, unsigned int data0, unsigned int data1) = 0; + + virtual EventID getEventId(int index) const = 0; + +protected: + + virtual void init(const char **pNames, int eventCount) = 0; +}; +} + +#endif //_PPA_API_H_ diff --git a/source/PPA/ppaCPUEvents.h b/source/PPA/ppaCPUEvents.h new file mode 100644 index 0000000..1a47b39 --- /dev/null +++ b/source/PPA/ppaCPUEvents.h @@ -0,0 +1,25 @@ +PPA_REGISTER_CPU_EVENT(encode_block) +PPA_REGISTER_CPU_EVENT(bitstream_write) +PPA_REGISTER_CPU_EVENT(DPB_prepareEncode) +PPA_REGISTER_CPU_EVENT(FrameEncoder_compressFrame) +PPA_REGISTER_CPU_EVENT(FrameEncoder_compressRows) +PPA_REGISTER_CPU_EVENT(CompressCU) 
+PPA_REGISTER_CPU_EVENT(CompressCU_Depth1) +PPA_REGISTER_CPU_EVENT(CompressCU_Depth2) +PPA_REGISTER_CPU_EVENT(CompressCU_Depth3) +PPA_REGISTER_CPU_EVENT(CompressCU_Depth4) +PPA_REGISTER_CPU_EVENT(CompressIntraCU) +PPA_REGISTER_CPU_EVENT(CompressIntraCU_Depth1) +PPA_REGISTER_CPU_EVENT(CompressIntraCU_Depth2) +PPA_REGISTER_CPU_EVENT(CompressIntraCU_Depth3) +PPA_REGISTER_CPU_EVENT(CompressIntraCU_Depth4) +PPA_REGISTER_CPU_EVENT(CheckRDCostIntra) +PPA_REGISTER_CPU_EVENT(CheckRDCostIntra_Depth1) +PPA_REGISTER_CPU_EVENT(CheckRDCostIntra_Depth2) +PPA_REGISTER_CPU_EVENT(CheckRDCostIntra_Depth3) +PPA_REGISTER_CPU_EVENT(CheckRDCostIntra_Depth4) +PPA_REGISTER_CPU_EVENT(CalcRDCostIntra) +PPA_REGISTER_CPU_EVENT(Thread_ProcessRow) +PPA_REGISTER_CPU_EVENT(Thread_compressCU) +PPA_REGISTER_CPU_EVENT(Thread_encodeCU) +PPA_REGISTER_CPU_EVENT(Thread_filterCU) diff --git a/source/cmake/CMakeASM_YASMInformation.cmake b/source/cmake/CMakeASM_YASMInformation.cmake new file mode 100644 index 0000000..0af7c24 --- /dev/null +++ b/source/cmake/CMakeASM_YASMInformation.cmake @@ -0,0 +1,48 @@ +set(ASM_DIALECT "_YASM") +set(CMAKE_ASM${ASM_DIALECT}_SOURCE_FILE_EXTENSIONS asm) + +if(X64) + list(APPEND ASM_FLAGS -DARCH_X86_64=1 -DPIC) + if(APPLE) + set(ARGS -f macho64 -m amd64 -DPREFIX) + elseif(UNIX AND NOT CYGWIN) + set(ARGS -f elf64 -m amd64) + else() + set(ARGS -f win64 -m amd64) + endif() +else() + list(APPEND ASM_FLAGS -DARCH_X86_64=0) + if(APPLE) + set(ARGS -f macho -DPREFIX) + elseif(UNIX AND NOT CYGWIN) + set(ARGS -f elf32) + else() + set(ARGS -f win32 -DPREFIX) + endif() +endif() + +if(GCC) + list(APPEND ASM_FLAGS -DHAVE_ALIGNED_STACK=1) +else() + list(APPEND ASM_FLAGS -DHAVE_ALIGNED_STACK=0) +endif() + +if(HIGH_BIT_DEPTH) + list(APPEND ASM_FLAGS -DHIGH_BIT_DEPTH=1 -DBIT_DEPTH=10) +else() + list(APPEND ASM_FLAGS -DHIGH_BIT_DEPTH=0 -DBIT_DEPTH=8) +endif() +set(YASM_FLAGS ${ARGS} ${ASM_FLAGS} PARENT_SCOPE) +string(REPLACE ";" " " CMAKE_ASM_YASM_COMPILER_ARG1 "${ARGS}") + +# This section 
exists to override the one in CMakeASMInformation.cmake +# (the default Information file). This removes the +# thing so that your C compiler flags that have been set via +# set_target_properties don't get passed to yasm and confuse it. +if(NOT CMAKE_ASM${ASM_DIALECT}_COMPILE_OBJECT) + string(REPLACE ";" " " STR_ASM_FLAGS "${ASM_FLAGS}") + set(CMAKE_ASM${ASM_DIALECT}_COMPILE_OBJECT " ${STR_ASM_FLAGS} -o ") +endif() + +include(CMakeASMInformation) +set(ASM_DIALECT) diff --git a/source/cmake/CMakeDetermineASM_YASMCompiler.cmake b/source/cmake/CMakeDetermineASM_YASMCompiler.cmake new file mode 100644 index 0000000..a902ef8 --- /dev/null +++ b/source/cmake/CMakeDetermineASM_YASMCompiler.cmake @@ -0,0 +1,5 @@ +set(ASM_DIALECT "_YASM") +set(CMAKE_ASM${ASM_DIALECT}_COMPILER ${YASM_EXECUTABLE}) +set(CMAKE_ASM${ASM_DIALECT}_COMPILER_INIT ${_CMAKE_TOOLCHAIN_PREFIX}yasm) +include(CMakeDetermineASMCompiler) +set(ASM_DIALECT) diff --git a/source/cmake/CMakeTestASM_YASMCompiler.cmake b/source/cmake/CMakeTestASM_YASMCompiler.cmake new file mode 100644 index 0000000..c668668 --- /dev/null +++ b/source/cmake/CMakeTestASM_YASMCompiler.cmake @@ -0,0 +1,3 @@ +set(ASM_DIALECT "_YASM") +include(CMakeTestASMCompiler) +set(ASM_DIALECT) diff --git a/source/cmake/FindVLD.cmake b/source/cmake/FindVLD.cmake new file mode 100644 index 0000000..716625c --- /dev/null +++ b/source/cmake/FindVLD.cmake @@ -0,0 +1,123 @@ +# Module for locating Visual Leak Detector. +# +# Customizable variables: +# VLD_ROOT_DIR +# This variable points to the Visual Leak Detector root directory. By +# default, the module looks for the installation directory by examining the +# Program Files/Program Files (x86) folders and the VLDROOT environment +# variable. +# +# Read-only variables: +# VLD_FOUND +# Indicates that the library has been found. +# +# VLD_INCLUDE_DIRS +# Points to the Visual Leak Detector include directory. +# +# VLD_LIBRARY_DIRS +# Points to the Visual Leak Detector directory that contains the libraries. 
+# The content of this variable can be passed to link_directories. +# +# VLD_LIBRARIES +# Points to the Visual Leak Detector libraries that can be passed to +# target_link_libararies. +# +# +# Copyright (c) 2012 Sergiu Dotenco +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +INCLUDE (FindPackageHandleStandardArgs) + +SET (_VLD_POSSIBLE_LIB_SUFFIXES lib) + +# Version 2.0 uses vld_x86 and vld_x64 instead of simply vld as library names +IF (CMAKE_SIZEOF_VOID_P EQUAL 4) + LIST (APPEND _VLD_POSSIBLE_LIB_SUFFIXES lib/Win32) +ELSEIF (CMAKE_SIZEOF_VOID_P EQUAL 8) + LIST (APPEND _VLD_POSSIBLE_LIB_SUFFIXES lib/Win64) +ENDIF (CMAKE_SIZEOF_VOID_P EQUAL 4) + +FIND_PATH (VLD_ROOT_DIR + NAMES include/vld.h + PATHS ENV VLDROOT + "$ENV{PROGRAMFILES}/Visual Leak Detector" + "$ENV{PROGRAMFILES(X86)}/Visual Leak Detector" + "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Uninstall\\Visual Leak Detector;InstallLocation]" + "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Wow6432Node\\Microsoft\\Windows\\CurrentVersion\\Uninstall\\Visual Leak Detector;InstallLocation]" + DOC "VLD root directory") + +FIND_PATH (VLD_INCLUDE_DIR + NAMES vld.h + HINTS ${VLD_ROOT_DIR} + PATH_SUFFIXES include + DOC "VLD include directory") + +FIND_LIBRARY (VLD_LIBRARY_DEBUG + NAMES vld + HINTS ${VLD_ROOT_DIR} + PATH_SUFFIXES ${_VLD_POSSIBLE_LIB_SUFFIXES} + DOC "VLD debug library") + +IF (VLD_ROOT_DIR) + SET (_VLD_VERSION_FILE ${VLD_ROOT_DIR}/CHANGES.txt) + + IF (EXISTS ${_VLD_VERSION_FILE}) + SET (_VLD_VERSION_REGEX + "Visual Leak Detector \\(VLD\\) Version (([0-9]+)\\.([0-9]+)([a-z]|(.([0-9]+)))?)") + FILE (STRINGS ${_VLD_VERSION_FILE} _VLD_VERSION_TMP REGEX + ${_VLD_VERSION_REGEX}) + + STRING (REGEX REPLACE ${_VLD_VERSION_REGEX} "\\1" _VLD_VERSION_TMP + "${_VLD_VERSION_TMP}") + + STRING (REGEX REPLACE "([0-9]+).([0-9]+).*" "\\1" VLD_VERSION_MAJOR + "${_VLD_VERSION_TMP}") + STRING (REGEX REPLACE "([0-9]+).([0-9]+).*" "\\2" VLD_VERSION_MINOR + "${_VLD_VERSION_TMP}") + + SET (VLD_VERSION ${VLD_VERSION_MAJOR}.${VLD_VERSION_MINOR}) + + IF ("${_VLD_VERSION_TMP}" MATCHES "^([0-9]+).([0-9]+).([0-9]+)$") + # major.minor.patch version numbering scheme + STRING (REGEX REPLACE "([0-9]+).([0-9]+).([0-9]+)" "\\3" + VLD_VERSION_PATCH "${_VLD_VERSION_TMP}") + SET (VLD_VERSION 
"${VLD_VERSION}.${VLD_VERSION_PATCH}") + SET (VLD_VERSION_COUNT 3) + ELSE ("${_VLD_VERSION_TMP}" MATCHES "^([0-9]+).([0-9]+).([0-9]+)$") + # major.minor version numbering scheme. The trailing letter is ignored. + SET (VLD_VERSION_COUNT 2) + ENDIF ("${_VLD_VERSION_TMP}" MATCHES "^([0-9]+).([0-9]+).([0-9]+)$") + ENDIF (EXISTS ${_VLD_VERSION_FILE}) +ENDIF (VLD_ROOT_DIR) + +IF (VLD_LIBRARY_DEBUG) + SET (VLD_LIBRARY debug ${VLD_LIBRARY_DEBUG} CACHE DOC "VLD library") + GET_FILENAME_COMPONENT (_VLD_LIBRARY_DIR ${VLD_LIBRARY_DEBUG} PATH) + SET (VLD_LIBRARY_DIR ${_VLD_LIBRARY_DIR} CACHE PATH "VLD library directory") +ENDIF (VLD_LIBRARY_DEBUG) + +SET (VLD_INCLUDE_DIRS ${VLD_INCLUDE_DIR}) +SET (VLD_LIBRARY_DIRS ${VLD_LIBRARY_DIR}) +SET (VLD_LIBRARIES ${VLD_LIBRARY}) + +MARK_AS_ADVANCED (VLD_INCLUDE_DIR VLD_LIBRARY_DIR VLD_LIBRARY_DEBUG VLD_LIBRARY) + +FIND_PACKAGE_HANDLE_STANDARD_ARGS (VLD REQUIRED_VARS VLD_ROOT_DIR + VLD_INCLUDE_DIR VLD_LIBRARY VERSION_VAR VLD_VERSION) diff --git a/source/cmake/FindYasm.cmake b/source/cmake/FindYasm.cmake new file mode 100644 index 0000000..df858c2 --- /dev/null +++ b/source/cmake/FindYasm.cmake @@ -0,0 +1,25 @@ +include(FindPackageHandleStandardArgs) + +# Simple path search with YASM_ROOT environment variable override +find_program(YASM_EXECUTABLE + NAMES yasm yasm-1.2.0-win32 yasm-1.2.0-win64 + HINTS $ENV{YASM_ROOT} ${YASM_ROOT} + PATH_SUFFIXES bin +) + +if(YASM_EXECUTABLE) + execute_process(COMMAND ${YASM_EXECUTABLE} --version + OUTPUT_VARIABLE yasm_version + ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + if(yasm_version MATCHES "^yasm ([0-9\\.]*)") + set(YASM_VERSION_STRING "${CMAKE_MATCH_1}") + endif() + unset(yasm_version) +endif() + +# Provide standardized success/failure messages +find_package_handle_standard_args(yasm + REQUIRED_VARS YASM_EXECUTABLE + VERSION_VAR YASM_VERSION_STRING) diff --git a/source/cmake/clean-generated.cmake b/source/cmake/clean-generated.cmake new file mode 100644 index 0000000..f75666c --- /dev/null 
+++ b/source/cmake/clean-generated.cmake @@ -0,0 +1,10 @@ +set(generated "${CMAKE_CURRENT_BINARY_DIR}/x265.rc" + "${CMAKE_CURRENT_BINARY_DIR}/x265.pc" + "${CMAKE_CURRENT_BINARY_DIR}/x265.def" + "${CMAKE_CURRENT_BINARY_DIR}/x265_config.h") + +foreach(file ${generated}) + if(EXISTS ${file}) + file(REMOVE ${file}) + endif() +endforeach(file) diff --git a/source/cmake/cmake_uninstall.cmake.in b/source/cmake/cmake_uninstall.cmake.in new file mode 100644 index 0000000..d2b7454 --- /dev/null +++ b/source/cmake/cmake_uninstall.cmake.in @@ -0,0 +1,19 @@ +if(NOT EXISTS "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt") + message(FATAL_ERROR "Cannot find install manifest: '@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt'") +endif() + +file(READ "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt" files) +string(REGEX REPLACE "\n" ";" files "${files}") +foreach(file ${files}) + message(STATUS "Uninstalling $ENV{DESTDIR}${file}") + if(EXISTS "$ENV{DESTDIR}${file}" OR IS_SYMLINK "$ENV{DESTDIR}${file}") + exec_program("@CMAKE_COMMAND@" ARGS "-E remove \"$ENV{DESTDIR}${file}\"" + OUTPUT_VARIABLE rm_out + RETURN_VALUE rm_retval) + if(NOT "${rm_retval}" STREQUAL 0) + message(FATAL_ERROR "Problem when removing '$ENV{DESTDIR}${file}'") + endif(NOT "${rm_retval}" STREQUAL 0) + else() + message(STATUS "File '$ENV{DESTDIR}${file}' does not exist.") + endif() +endforeach(file) diff --git a/source/cmake/version.cmake b/source/cmake/version.cmake new file mode 100644 index 0000000..b6adfb9 --- /dev/null +++ b/source/cmake/version.cmake @@ -0,0 +1,90 @@ +if(CMAKE_VERSION VERSION_LESS "2.8.10") + find_program(HG_EXECUTABLE hg) +else() + find_package(Hg QUIET) +endif() +find_package(Git QUIET) # present in 2.8.8 + +# defaults, in case everything below fails +set(X265_VERSION "unknown") +set(X265_LATEST_TAG "0.0") +set(X265_TAG_DISTANCE "0") + +if(EXISTS ${CMAKE_SOURCE_DIR}/../.hg_archival.txt) + # read the lines of the archive summary file to extract the version + file(READ 
${CMAKE_SOURCE_DIR}/../.hg_archival.txt archive) + STRING(REGEX REPLACE "\n" ";" archive "${archive}") + foreach(f ${archive}) + string(FIND "${f}" ": " pos) + string(SUBSTRING "${f}" 0 ${pos} key) + string(SUBSTRING "${f}" ${pos} -1 value) + string(SUBSTRING "${value}" 2 -1 value) + set(hg_${key} ${value}) + endforeach() + if(DEFINED hg_tag) + set(X265_VERSION ${hg_tag} CACHE STRING "x265 version string.") + set(X265_LATEST_TAG ${hg_tag}) + set(X265_TAG_DISTANCE "0") + elseif(DEFINED hg_node) + string(SUBSTRING "${hg_node}" 0 16 hg_id) + set(X265_VERSION "${hg_latesttag}+${hg_latesttagdistance}-${hg_id}") + endif() +elseif(HG_EXECUTABLE AND EXISTS ${CMAKE_SOURCE_DIR}/../.hg) + if(EXISTS "${HG_EXECUTABLE}.bat") + # mercurial source installs on Windows require .bat extension + set(HG_EXECUTABLE "${HG_EXECUTABLE}.bat") + endif() + message(STATUS "hg found at ${HG_EXECUTABLE}") + + execute_process(COMMAND + ${HG_EXECUTABLE} log -r. --template "{latesttag}" + WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} + OUTPUT_VARIABLE X265_LATEST_TAG + ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + execute_process(COMMAND + ${HG_EXECUTABLE} log -r. --template "{latesttagdistance}" + WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} + OUTPUT_VARIABLE X265_TAG_DISTANCE + ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + execute_process( + COMMAND + ${HG_EXECUTABLE} log -r. 
--template "{node|short}" + WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} + OUTPUT_VARIABLE HG_REVISION_ID + ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + + if(X265_LATEST_TAG MATCHES "^r") + string(SUBSTRING ${X265_LATEST_TAG} 1 -1 X265_LATEST_TAG) + endif() + if(X265_TAG_DISTANCE STREQUAL "0") + set(X265_VERSION "${X265_LATEST_TAG}") + else() + set(X265_VERSION "${X265_LATEST_TAG}+${X265_TAG_DISTANCE}-${HG_REVISION_ID}") + endif() +elseif(GIT_EXECUTABLE AND EXISTS ${CMAKE_SOURCE_DIR}/../.git) + execute_process( + COMMAND + ${GIT_EXECUTABLE} describe --tags --abbrev=0 + WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} + OUTPUT_VARIABLE X265_LATEST_TAG + ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + + execute_process( + COMMAND + ${GIT_EXECUTABLE} describe --tags + WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} + OUTPUT_VARIABLE X265_VERSION + ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE + ) +endif() + +message(STATUS "x265 version ${X265_VERSION}") diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt new file mode 100644 index 0000000..46929ca --- /dev/null +++ b/source/common/CMakeLists.txt @@ -0,0 +1,110 @@ +# vim: syntax=cmake +set(SSE3 vec/dct-sse3.cpp) +set(SSSE3 vec/dct-ssse3.cpp) +set(SSE41 vec/dct-sse41.cpp) + +if(MSVC AND X86) + set(PRIMITIVES ${SSE3} ${SSSE3} ${SSE41}) + set(WARNDISABLE "/wd4100") # unreferenced formal parameter + if(INTEL_CXX) + add_definitions(/Qwd111) # statement is unreachable + add_definitions(/Qwd128) # loop is unreachable + add_definitions(/Qwd177) # declared function is unused + add_definitions(/Qwd185) # dynamic initialization in unreachable code + add_definitions(/Qwd280) # conditional expression is constant + endif() + if(X64) + set_source_files_properties(${SSE3} ${SSSE3} ${SSE41} PROPERTIES COMPILE_FLAGS "${WARNDISABLE}") + else() + # x64 implies SSE4, so only add /arch:SSE2 if building for Win32 + set_source_files_properties(${SSE3} ${SSSE3} ${SSE41} PROPERTIES COMPILE_FLAGS "${WARNDISABLE} /arch:SSE2") + 
endif() +endif() +if(GCC AND X86) + if(CLANG) + # llvm intrinsic headers cause shadow warnings + set(WARNDISABLE "-Wno-shadow -Wno-unused-parameter") + else() + set(WARNDISABLE "-Wno-unused-parameter") + endif() + if(INTEL_CXX OR CLANG OR (NOT CC_VERSION VERSION_LESS 4.3)) + set(PRIMITIVES ${SSE3} ${SSSE3} ${SSE41}) + set_source_files_properties(${SSE3} PROPERTIES COMPILE_FLAGS "${WARNDISABLE} -msse3") + set_source_files_properties(${SSSE3} PROPERTIES COMPILE_FLAGS "${WARNDISABLE} -mssse3") + set_source_files_properties(${SSE41} PROPERTIES COMPILE_FLAGS "${WARNDISABLE} -msse4.1") + endif() +endif() +set(VEC_PRIMITIVES vec/vec-primitives.cpp ${PRIMITIVES}) +source_group(Intrinsics FILES ${VEC_PRIMITIVES}) + +if(ENABLE_ASSEMBLY) + set_source_files_properties(primitives.cpp PROPERTIES COMPILE_FLAGS -DENABLE_ASSEMBLY=1) + set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h) + set(A_SRCS pixel-a.asm const-a.asm cpu-a.asm ssd-a.asm mc-a.asm + mc-a2.asm pixel-util8.asm blockcopy8.asm + pixeladd8.asm dct8.asm) + if(HIGH_BIT_DEPTH) + set(A_SRCS ${A_SRCS} sad16-a.asm intrapred16.asm ipfilter16.asm) + else() + set(A_SRCS ${A_SRCS} sad-a.asm intrapred8.asm ipfilter8.asm loopfilter.asm) + endif() + + if(NOT X64) + set(A_SRCS ${A_SRCS} pixel-32.asm) + endif() + + if(MSVC_IDE OR XCODE) + # MSVC requires custom build rules in the main cmake script for yasm + set(MSVC_ASMS "${A_SRCS}" CACHE INTERNAL "yasm sources") + set(A_SRCS) + endif() + + enable_language(ASM_YASM) + + foreach(SRC ${A_SRCS} ${C_SRCS}) + set(ASM_PRIMITIVES ${ASM_PRIMITIVES} x86/${SRC}) + endforeach() + source_group(Assembly FILES ${ASM_PRIMITIVES}) +endif(ENABLE_ASSEMBLY) + +check_symbol_exists(strtok_r "string.h" HAVE_STRTOK_R) +if(HAVE_STRTOK_R) + set_source_files_properties(param.cpp PROPERTIES COMPILE_FLAGS -DHAVE_STRTOK_R=1) +endif() + +if(GCC AND CC_HAS_NO_NARROWING) + set_source_files_properties(cpu.cpp PROPERTIES COMPILE_FLAGS -Wno-narrowing) +endif() +if(WIN32) + 
set(WINXP winxp.h winxp.cpp) +endif(WIN32) + +set_source_files_properties(version.cpp PROPERTIES COMPILE_FLAGS -DX265_VERSION=${X265_VERSION}) + +add_library(common OBJECT + ${ASM_PRIMITIVES} ${VEC_PRIMITIVES} + ${LIBCOMMON_SRC} ${LIBCOMMON_HDR} ${WINXP} + primitives.cpp primitives.h + pixel.cpp dct.cpp ipfilter.cpp intrapred.cpp loopfilter.cpp + constants.cpp constants.h + cpu.cpp cpu.h version.cpp + threading.cpp threading.h + threadpool.cpp threadpool.h + wavefront.h wavefront.cpp + md5.cpp md5.h + bitstream.h bitstream.cpp + yuv.cpp yuv.h + shortyuv.cpp shortyuv.h + picyuv.cpp picyuv.h + common.cpp common.h + param.cpp param.h + frame.cpp frame.h + framedata.cpp framedata.h + cudata.cpp cudata.h + slice.cpp slice.h + lowres.cpp lowres.h mv.h + piclist.cpp piclist.h + predict.cpp predict.h + scalinglist.cpp scalinglist.h + quant.cpp quant.h contexts.h + deblock.cpp deblock.h) diff --git a/source/common/bitstream.cpp b/source/common/bitstream.cpp new file mode 100644 index 0000000..44f90f0 --- /dev/null +++ b/source/common/bitstream.cpp @@ -0,0 +1,125 @@ +#include "common.h" +#include "bitstream.h" + +using namespace x265; + +#if defined(_MSC_VER) +#pragma warning(disable: 4244) +#endif + +#define MIN_FIFO_SIZE 1000 + +Bitstream::Bitstream() +{ + m_fifo = X265_MALLOC(uint8_t, MIN_FIFO_SIZE); + m_byteAlloc = MIN_FIFO_SIZE; + resetBits(); +} + +void Bitstream::push_back(uint8_t val) +{ + if (!m_fifo) + return; + + if (m_byteOccupancy >= m_byteAlloc) + { + /** reallocate buffer with doubled size */ + uint8_t *temp = X265_MALLOC(uint8_t, m_byteAlloc * 2); + if (temp) + { + ::memcpy(temp, m_fifo, m_byteOccupancy); + X265_FREE(m_fifo); + m_fifo = temp; + m_byteAlloc *= 2; + } + else + { + x265_log(NULL, X265_LOG_ERROR, "Unable to realloc bitstream buffer"); + return; + } + } + m_fifo[m_byteOccupancy++] = val; +} + +void Bitstream::write(uint32_t val, uint32_t numBits) +{ + X265_CHECK(numBits <= 32, "numBits out of range\n"); + X265_CHECK(numBits == 32 || ((val & (~0 << 
numBits)) == 0), "numBits & val out of range\n"); + + uint32_t totalPartialBits = m_partialByteBits + numBits; + uint32_t nextPartialBits = totalPartialBits & 7; + uint8_t nextHeldByte = val << (8 - nextPartialBits); + uint32_t writeBytes = totalPartialBits >> 3; + + if (writeBytes) + { + /* topword aligns m_partialByte with the msb of val */ + uint32_t topword = (numBits - nextPartialBits) & ~7; + uint32_t write_bits = (m_partialByte << topword) | (val >> nextPartialBits); + + switch (writeBytes) + { + case 4: push_back(write_bits >> 24); + case 3: push_back(write_bits >> 16); + case 2: push_back(write_bits >> 8); + case 1: push_back(write_bits); + } + + m_partialByte = nextHeldByte; + m_partialByteBits = nextPartialBits; + } + else + { + m_partialByte |= nextHeldByte; + m_partialByteBits = nextPartialBits; + } +} + +void Bitstream::writeByte(uint32_t val) +{ + // Only CABAC will call writeByte, the fifo must be byte aligned + X265_CHECK(!m_partialByteBits, "expecting m_partialByteBits = 0\n"); + + push_back(val); +} + +void Bitstream::writeAlignOne() +{ + uint32_t numBits = (8 - m_partialByteBits) & 0x7; + + write((1 << numBits) - 1, numBits); +} + +void Bitstream::writeAlignZero() +{ + if (m_partialByteBits) + { + push_back(m_partialByte); + m_partialByte = 0; + m_partialByteBits = 0; + } +} + +void Bitstream::writeByteAlignment() +{ + write(1, 1); + writeAlignZero(); +} + +void SyntaxElementWriter::writeUvlc(uint32_t code) +{ + uint32_t length = 1; + uint32_t temp = ++code; + + X265_CHECK(temp, "writing -1 code, will cause infinite loop\n"); + + while (1 != temp) + { + temp >>= 1; + length += 2; + } + + // Take care of cases where length > 32 + m_bitIf->write(0, length >> 1); + m_bitIf->write(code, (length + 1) >> 1); +} diff --git a/source/common/bitstream.h b/source/common/bitstream.h new file mode 100644 index 0000000..9117c42 --- /dev/null +++ b/source/common/bitstream.h @@ -0,0 +1,158 @@ 
+/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Author: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. 
+ *****************************************************************************/ + +#ifndef X265_BITSTREAM_H +#define X265_BITSTREAM_H 1 + +namespace x265 { +// private namespace + +class BitInterface +{ +public: + + virtual void write(uint32_t val, uint32_t numBits) = 0; + virtual void writeByte(uint32_t val) = 0; + virtual void resetBits() = 0; + virtual uint32_t getNumberOfWrittenBits() const = 0; + virtual void writeAlignOne() = 0; + virtual void writeAlignZero() = 0; + virtual ~BitInterface() {} +}; + +class BitCounter : public BitInterface +{ +protected: + + uint32_t m_bitCounter; + +public: + + BitCounter() : m_bitCounter(0) {} + + void write(uint32_t, uint32_t num) { m_bitCounter += num; } + void writeByte(uint32_t) { m_bitCounter += 8; } + void resetBits() { m_bitCounter = 0; } + uint32_t getNumberOfWrittenBits() const { return m_bitCounter; } + void writeAlignOne() { } + void writeAlignZero() { } +}; + + +class Bitstream : public BitInterface +{ +public: + + Bitstream(); + ~Bitstream() { X265_FREE(m_fifo); } + + void resetBits() { m_partialByteBits = m_byteOccupancy = 0; m_partialByte = 0; } + uint32_t getNumberOfWrittenBytes() const { return m_byteOccupancy; } + uint32_t getNumberOfWrittenBits() const { return m_byteOccupancy * 8 + m_partialByteBits; } + const uint8_t* getFIFO() const { return m_fifo; } + + void write(uint32_t val, uint32_t numBits); + void writeByte(uint32_t val); + + void writeAlignOne(); // insert one bits until the bitstream is byte-aligned + void writeAlignZero(); // insert zero bits until the bitstream is byte-aligned + void writeByteAlignment(); // insert 1 bit, then pad to byte-align with zero + +private: + + uint8_t *m_fifo; + uint32_t m_byteAlloc; + uint32_t m_byteOccupancy; + uint32_t m_partialByteBits; + uint8_t m_partialByte; + + void push_back(uint8_t val); +}; + +static const uint8_t bitSize[256] = +{ + 1, 1, 3, 3, 5, 5, 5, 5, 7, 7, 7, 7, 7, 7, 7, 7, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 11, 11, 11, 11, 11, 
11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, +}; + +static inline int bs_size_ue(unsigned int val) +{ + return bitSize[val + 1]; +} + +static inline int bs_size_ue_big(unsigned int val) +{ + if (val < 255) + return bitSize[val + 1]; + else + return bitSize[(val + 1) >> 8] + 16; +} + +static inline int bs_size_se(int val) +{ + int tmp = 1 - val * 2; + + if (tmp < 0) tmp = val * 2; + if (tmp < 256) + return bitSize[tmp]; + else + return bitSize[tmp >> 8] + 16; +} + +class SyntaxElementWriter +{ +public: + + BitInterface* m_bitIf; + + SyntaxElementWriter() : m_bitIf(NULL) {} + + /* silently discard the name of the syntax element */ + inline void WRITE_CODE(uint32_t code, uint32_t length, const char *) { writeCode(code, length); } + inline void WRITE_UVLC(uint32_t code, const char *) { writeUvlc(code); } + inline void WRITE_SVLC(int32_t code, const char *) { writeSvlc(code); } + inline void WRITE_FLAG(bool flag, const char *) { writeFlag(flag); } + + void writeCode(uint32_t code, uint32_t length) { m_bitIf->write(code, length); } + void writeUvlc(uint32_t code); + void writeSvlc(int32_t code) { uint32_t ucode = (code <= 0) ? 
-code << 1 : (code << 1) - 1; writeUvlc(ucode); } + void writeFlag(bool code) { m_bitIf->write(code, 1); } +}; + +} + +#endif // ifndef X265_BITSTREAM_H diff --git a/source/common/common.cpp b/source/common/common.cpp new file mode 100644 index 0000000..2fb0a7a --- /dev/null +++ b/source/common/common.cpp @@ -0,0 +1,208 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Deepthi Nandakumar + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. 
+ *****************************************************************************/ + +#include "common.h" +#include "slice.h" +#include "threading.h" +#include "x265.h" + +#if _WIN32 +#include +#include +#else +#include +#endif + +int64_t x265_mdate(void) +{ +#if _WIN32 + struct timeb tb; + ftime(&tb); + return ((int64_t)tb.time * 1000 + (int64_t)tb.millitm) * 1000; +#else + struct timeval tv_date; + gettimeofday(&tv_date, NULL); + return (int64_t)tv_date.tv_sec * 1000000 + (int64_t)tv_date.tv_usec; +#endif +} + +using namespace x265; + +#define X265_ALIGNBYTES 32 + +#if _WIN32 +#if defined(__MINGW32__) && !defined(__MINGW64_VERSION_MAJOR) +#define _aligned_malloc __mingw_aligned_malloc +#define _aligned_free __mingw_aligned_free +#include "malloc.h" +#endif + +void *x265_malloc(size_t size) +{ + return _aligned_malloc(size, X265_ALIGNBYTES); +} + +void x265_free(void *ptr) +{ + if (ptr) _aligned_free(ptr); +} + +#else // if _WIN32 +void *x265_malloc(size_t size) +{ + void *ptr; + + if (posix_memalign((void**)&ptr, X265_ALIGNBYTES, size) == 0) + return ptr; + else + return NULL; +} + +void x265_free(void *ptr) +{ + if (ptr) free(ptr); +} + +#endif // if _WIN32 + +/* Not a general-purpose function; multiplies input by -1/6 to convert + * qp to qscale. */ +int x265_exp2fix8(double x) +{ + int i = (int)(x * (-64.f / 6.f) + 512.5f); + + if (i < 0) return 0; + if (i > 1023) return 0xffff; + return (x265_exp2_lut[i & 63] + 256) << (i >> 6) >> 8; +} + +void x265_log(const x265_param *param, int level, const char *fmt, ...) 
+{ + if (param && level > param->logLevel) + return; + const char *log_level; + switch (level) + { + case X265_LOG_ERROR: + log_level = "error"; + break; + case X265_LOG_WARNING: + log_level = "warning"; + break; + case X265_LOG_INFO: + log_level = "info"; + break; + case X265_LOG_DEBUG: + log_level = "debug"; + break; + case X265_LOG_FULL: + log_level = "full"; + break; + default: + log_level = "unknown"; + break; + } + + fprintf(stderr, "x265 [%s]: ", log_level); + va_list arg; + va_start(arg, fmt); + vfprintf(stderr, fmt, arg); + va_end(arg); +} + +double x265_ssim2dB(double ssim) +{ + double inv_ssim = 1 - ssim; + + if (inv_ssim <= 0.0000000001) /* Max 100dB */ + return 100; + + return -10.0 * log10(inv_ssim); +} + +/* The qscale - qp conversion is specified in the standards. + * Approx qscale increases by 12% with every qp increment */ +double x265_qScale2qp(double qScale) +{ + return 12.0 + 6.0 * (double)X265_LOG2(qScale / 0.85); +} + +double x265_qp2qScale(double qp) +{ + return 0.85 * pow(2.0, (qp - 12.0) / 6.0); +} + +uint32_t x265_picturePlaneSize(int csp, int width, int height, int plane) +{ + uint32_t size = (uint32_t)(width >> x265_cli_csps[csp].width[plane]) * (height >> x265_cli_csps[csp].height[plane]); + + return size; +} + +char* x265_slurp_file(const char *filename) +{ + if (!filename) + return NULL; + + int bError = 0; + size_t fSize; + char *buf = NULL; + + FILE *fh = fopen(filename, "rb"); + if (!fh) + { + x265_log(NULL, X265_LOG_ERROR, "unable to open file %s\n", filename); + return NULL; + } + + bError |= fseek(fh, 0, SEEK_END) < 0; + bError |= (fSize = ftell(fh)) <= 0; + bError |= fseek(fh, 0, SEEK_SET) < 0; + if (bError) + goto error; + + buf = X265_MALLOC(char, fSize + 2); + if (!buf) + { + x265_log(NULL, X265_LOG_ERROR, "unable to allocate memory\n"); + goto error; + } + + bError |= fread(buf, 1, fSize, fh) != fSize; + if (buf[fSize - 1] != '\n') + buf[fSize++] = '\n'; + buf[fSize] = 0; + fclose(fh); + + if (bError) + { + x265_log(NULL, 
X265_LOG_ERROR, "unable to read the file\n"); + X265_FREE(buf); + buf = NULL; + } + return buf; + +error: + fclose(fh); + return NULL; +} diff --git a/source/common/common.h b/source/common/common.h new file mode 100644 index 0000000..b447bb3 --- /dev/null +++ b/source/common/common.h @@ -0,0 +1,403 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Deepthi Nandakumar + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. 
+ *****************************************************************************/ + +#ifndef X265_COMMON_H +#define X265_COMMON_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "x265.h" + +#define FENC_STRIDE 64 +#define NUM_INTRA_MODE 35 + +#if defined(__GNUC__) +#define ALIGN_VAR_8(T, var) T var __attribute__((aligned(8))) +#define ALIGN_VAR_16(T, var) T var __attribute__((aligned(16))) +#define ALIGN_VAR_32(T, var) T var __attribute__((aligned(32))) + +#if X265_ARCH_X86 && !defined(X86_64) +extern "C" intptr_t x265_stack_align(void (*func)(), ...); +#define x265_stack_align(func, ...) x265_stack_align((void (*)())func, __VA_ARGS__) +#else +#define x265_stack_align(func, ...) func(__VA_ARGS__) +#endif + +#elif defined(_MSC_VER) + +#define ALIGN_VAR_8(T, var) __declspec(align(8)) T var +#define ALIGN_VAR_16(T, var) __declspec(align(16)) T var +#define ALIGN_VAR_32(T, var) __declspec(align(32)) T var +#define x265_stack_align(func, ...) func(__VA_ARGS__) +#define fseeko _fseeki64 + +#endif // if defined(__GNUC__) + +#if HAVE_INT_TYPES_H +#define __STDC_FORMAT_MACROS +#include +#define X265_LL "%" PRIu64 +#else +#define X265_LL "%lld" +#endif + +#if _DEBUG && defined(_MSC_VER) +#define DEBUG_BREAK() __debugbreak() +#elif __APPLE_CC__ +#define DEBUG_BREAK() __builtin_trap(); +#else +#define DEBUG_BREAK() +#endif + +/* If compiled with CHECKED_BUILD perform run-time checks and log any that + * fail, both to stderr and to a file */ +#if CHECKED_BUILD || _DEBUG +#define X265_CHECK(expr, ...) if (!(expr)) { \ + x265_log(NULL, X265_LOG_ERROR, __VA_ARGS__); \ + DEBUG_BREAK(); \ + FILE *fp = fopen("x265_check_failures.txt", "a"); \ + if (fp) { fprintf(fp, "%s:%d\n", __FILE__, __LINE__); fprintf(fp, __VA_ARGS__); fclose(fp); } \ +} +#if _MSC_VER +#pragma warning(disable: 4127) // some checks have constant conditions +#endif +#else +#define X265_CHECK(expr, ...) 
+#endif + +#if HIGH_BIT_DEPTH +typedef uint16_t pixel; +typedef uint32_t sum_t; +typedef uint64_t sum2_t; +typedef uint64_t pixel4; +typedef int64_t ssum2_t; +#define X265_DEPTH 10 // compile time configurable bit depth +#else +typedef uint8_t pixel; +typedef uint16_t sum_t; +typedef uint32_t sum2_t; +typedef uint32_t pixel4; +typedef int32_t ssum2_t; //Signed sum +#define X265_DEPTH 8 // compile time configurable bit depth +#endif // if HIGH_BIT_DEPTH + +#ifndef NULL +#define NULL 0 +#endif + +#define MAX_UINT 0xFFFFFFFFU // max. value of unsigned 32-bit integer +#define MAX_INT 2147483647 // max. value of signed 32-bit integer +#define MAX_INT64 0x7FFFFFFFFFFFFFFFLL // max. value of signed 64-bit integer +#define MAX_DOUBLE 1.7e+308 // max. value of double-type value + +#define QP_MIN 0 +#define QP_MAX_SPEC 51 /* max allowed signaled QP in HEVC */ +#define QP_MAX_MAX 69 /* max allowed QP to be output by rate control */ + +#define MIN_QPSCALE 0.21249999999999999 +#define MAX_MAX_QPSCALE 615.46574234477100 + +#define BITS_FOR_POC 8 + +template +inline pixel Clip(T x) +{ + return (pixel)std::min(T((1 << X265_DEPTH) - 1), std::max(T(0), x)); +} + +template +inline T Clip3(T minVal, T maxVal, T a) +{ + return std::min(std::max(minVal, a), maxVal); +} + +template +inline T x265_min(T a, T b) { return a < b ? a : b; } + +template +inline T x265_max(T a, T b) { return a > b ? a : b; } + +typedef int16_t coeff_t; // transform coefficient + +#define X265_MIN(a, b) ((a) < (b) ? (a) : (b)) +#define X265_MAX(a, b) ((a) > (b) ? 
(a) : (b)) +#define COPY1_IF_LT(x, y) if ((y) < (x)) (x) = (y); +#define COPY2_IF_LT(x, y, a, b) \ + if ((y) < (x)) \ + { \ + (x) = (y); \ + (a) = (b); \ + } +#define COPY3_IF_LT(x, y, a, b, c, d) \ + if ((y) < (x)) \ + { \ + (x) = (y); \ + (a) = (b); \ + (c) = (d); \ + } +#define COPY4_IF_LT(x, y, a, b, c, d, e, f) \ + if ((y) < (x)) \ + { \ + (x) = (y); \ + (a) = (b); \ + (c) = (d); \ + (e) = (f); \ + } +#define X265_MIN3(a, b, c) X265_MIN((a), X265_MIN((b), (c))) +#define X265_MAX3(a, b, c) X265_MAX((a), X265_MAX((b), (c))) +#define X265_MIN4(a, b, c, d) X265_MIN((a), X265_MIN3((b), (c), (d))) +#define X265_MAX4(a, b, c, d) X265_MAX((a), X265_MAX3((b), (c), (d))) +#define QP_BD_OFFSET (6 * (X265_DEPTH - 8)) +#define MAX_CHROMA_LAMBDA_OFFSET 36 + +// arbitrary, but low because SATD scores are 1/4 normal +#define X265_LOOKAHEAD_QP (12 + QP_BD_OFFSET) +#define X265_LOOKAHEAD_MAX 250 + +// Use the same size blocks as x264. Using larger blocks seems to give artificially +// high cost estimates (intra and inter both suffer) +#define X265_LOWRES_CU_SIZE 8 +#define X265_LOWRES_CU_BITS 3 + +#define X265_MALLOC(type, count) (type*)x265_malloc(sizeof(type) * (count)) +#define X265_FREE(ptr) x265_free(ptr) +#define CHECKED_MALLOC(var, type, count) \ + { \ + var = (type*)x265_malloc(sizeof(type) * (count)); \ + if (!var) \ + { \ + x265_log(NULL, X265_LOG_ERROR, "malloc of size %d failed\n", sizeof(type) * (count)); \ + goto fail; \ + } \ + } +#define CHECKED_MALLOC_ZERO(var, type, count) \ + { \ + var = (type*)x265_malloc(sizeof(type) * (count)); \ + if (var) \ + memset((void*)var, 0, sizeof(type) * (count)); \ + else \ + { \ + x265_log(NULL, X265_LOG_ERROR, "malloc of size %d failed\n", sizeof(type) * (count)); \ + goto fail; \ + } \ + } + +#if defined(_MSC_VER) +#define X265_LOG2F(x) (logf((float)(x)) * 1.44269504088896405f) +#define X265_LOG2(x) (log((double)(x)) * 1.4426950408889640513713538072172) +#else +#define X265_LOG2F(x) log2f(x) +#define X265_LOG2(x) log2(x) 
+#endif + +#define NUM_CU_DEPTH 4 // maximum number of CU depths +#define NUM_FULL_DEPTH 5 // maximum number of full depths +#define MIN_LOG2_CU_SIZE 3 // log2(minCUSize) +#define MAX_LOG2_CU_SIZE 6 // log2(maxCUSize) +#define MIN_CU_SIZE (1 << MIN_LOG2_CU_SIZE) // minimum allowable size of CU +#define MAX_CU_SIZE (1 << MAX_LOG2_CU_SIZE) // maximum allowable size of CU + +#define LOG2_UNIT_SIZE 2 // log2(unitSize) +#define UNIT_SIZE (1 << LOG2_UNIT_SIZE) // unit size of CU partition + +#define MAX_NUM_PARTITIONS 256 +#define NUM_CU_PARTITIONS (1U << (g_maxFullDepth << 1)) + +#define MIN_PU_SIZE 4 +#define MIN_TU_SIZE 4 +#define MAX_NUM_SPU_W (MAX_CU_SIZE / MIN_PU_SIZE) // maximum number of SPU in horizontal line + +#define MAX_LOG2_TR_SIZE 5 +#define MAX_LOG2_TS_SIZE 2 // TODO: RExt +#define MAX_TR_SIZE (1 << MAX_LOG2_TR_SIZE) +#define MAX_TS_SIZE (1 << MAX_LOG2_TS_SIZE) + +#define MAX_NUM_TR_COEFFS MAX_TR_SIZE * MAX_TR_SIZE /* Maximum number of transform coefficients, for a 32x32 transform */ +#define MAX_NUM_TR_CATEGORIES 8 /* 32, 16, 8, 4 transform categories each for luma and chroma */ + +#define COEF_REMAIN_BIN_REDUCTION 3 // indicates the level at which the VLC + // transitions from Golomb-Rice to TU+EG(k) + +#define SBH_THRESHOLD 4 // fixed sign bit hiding controlling threshold + +#define C1FLAG_NUMBER 8 // maximum number of largerThan1 flag coded in one chunk: 16 in HM5 +#define C2FLAG_NUMBER 1 // maximum number of largerThan2 flag coded in one chunk: 16 in HM5 + +#define SAO_ENCODING_RATE 0.75 +#define SAO_ENCODING_RATE_CHROMA 0.5 + +#define MLS_GRP_NUM 64 // Max number of coefficient groups, max(16, 64) +#define MLS_CG_SIZE 4 // Coefficient group size of 4x4 +#define MLS_CG_LOG2_SIZE 2 + +#define QUANT_IQUANT_SHIFT 20 // Q(QP%6) * IQ(QP%6) = 2^20 +#define QUANT_SHIFT 14 // Q(4) = 2^14 +#define SCALE_BITS 15 // Inherited from TMuC, presumably for fractional bit estimates in RDOQ +#define MAX_TR_DYNAMIC_RANGE 15 // Maximum transform dynamic range (excluding 
sign bit) + +#define SHIFT_INV_1ST 7 // Shift after first inverse transform stage +#define SHIFT_INV_2ND 12 // Shift after second inverse transform stage + +#define AMVP_DECIMATION_FACTOR 4 + +#define SCAN_SET_SIZE 16 +#define LOG2_SCAN_SET_SIZE 4 + +#define ALL_IDX -1 +#define PLANAR_IDX 0 +#define VER_IDX 26 // index for intra VERTICAL mode +#define HOR_IDX 10 // index for intra HORIZONTAL mode +#define DC_IDX 1 // index for intra DC mode +#define NUM_CHROMA_MODE 5 // total number of chroma modes +#define DM_CHROMA_IDX 36 // chroma mode index for derived from luma intra mode + +#define MDCS_ANGLE_LIMIT 4 // distance from true angle that horiz or vertical scan is allowed +#define MDCS_LOG2_MAX_SIZE 3 // TUs with log2 of size greater than this can only use diagonal scan + +#define MAX_NUM_REF_PICS 16 // max. number of pictures used for reference +#define MAX_NUM_REF 16 // max. number of entries in picture reference list + +#define REF_NOT_VALID -1 + +#define AMVP_NUM_CANDS 2 // number of AMVP candidates +#define MRG_MAX_NUM_CANDS 5 // max number of final merge candidates + +#define CHROMA_H_SHIFT(x) (x == X265_CSP_I420 || x == X265_CSP_I422) +#define CHROMA_V_SHIFT(x) (x == X265_CSP_I420) + +namespace x265 { + +enum { SAO_NUM_OFFSET = 4 }; + +// NOTE: MUST be alignment to 16 or 32 bytes for asm code +struct NoiseReduction +{ + /* 0 = luma 4x4, 1 = luma 8x8, 2 = luma 16x16, 3 = luma 32x32 + * 4 = chroma 4x4, 5 = chroma 8x8, 6 = chroma 16x16, 7 = chroma 32x32 */ + uint16_t offsetDenoise[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS]; + uint32_t residualSum[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS]; + uint32_t count[MAX_NUM_TR_CATEGORIES]; +}; + +enum SaoMergeMode +{ + SAO_MERGE_NONE, + SAO_MERGE_LEFT, + SAO_MERGE_UP +}; + +struct SaoCtuParam +{ + SaoMergeMode mergeMode; + int typeIdx; + uint32_t bandPos; // BO band position + int offset[SAO_NUM_OFFSET]; + + void reset() + { + mergeMode = SAO_MERGE_NONE; + typeIdx = -1; + bandPos = 0; + offset[0] = 0; + offset[1] = 0; + 
offset[2] = 0; + offset[3] = 0; + } +}; + +struct SAOParam +{ + SaoCtuParam* ctuParam[3]; + bool bSaoFlag[2]; + int numCuInWidth; + + SAOParam() + { + for (int i = 0; i < 3; i++) + ctuParam[i] = NULL; + } + + ~SAOParam() + { + delete[] ctuParam[0]; + delete[] ctuParam[1]; + delete[] ctuParam[2]; + } +}; + +enum TextType +{ + TEXT_LUMA = 0, // luma + TEXT_CHROMA_U = 1, // chroma U + TEXT_CHROMA_V = 2, // chroma V + MAX_NUM_COMPONENT = 3 +}; + +// coefficient scanning type used in ACS +enum ScanType +{ + SCAN_DIAG = 0, // up-right diagonal scan + SCAN_HOR = 1, // horizontal first scan + SCAN_VER = 2, // vertical first scan + NUM_SCAN_TYPE = 3 +}; + +enum SignificanceMapContextType +{ + CONTEXT_TYPE_4x4 = 0, + CONTEXT_TYPE_8x8 = 1, + CONTEXT_TYPE_NxN = 2, + CONTEXT_NUMBER_OF_TYPES = 3 +}; +} + +/* outside x265 namespace, but prefixed. defined in common.cpp */ +int64_t x265_mdate(void); +void x265_log(const x265_param *param, int level, const char *fmt, ...); +int x265_exp2fix8(double x); + +double x265_ssim2dB(double ssim); +double x265_qScale2qp(double qScale); +double x265_qp2qScale(double qp); +uint32_t x265_picturePlaneSize(int csp, int width, int height, int plane); + +void* x265_malloc(size_t size); +void x265_free(void *ptr); +char* x265_slurp_file(const char *filename); + +#include "constants.h" + +#endif // ifndef X265_COMMON_H diff --git a/source/common/constants.cpp b/source/common/constants.cpp new file mode 100644 index 0000000..4252cb4 --- /dev/null +++ b/source/common/constants.cpp @@ -0,0 +1,503 @@ +/***************************************************************************** +* Copyright (C) 2014 x265 project +* +* Authors: Steve Borho +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation; either version 2 of the License, or +* (at your option) any later version. 
+* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software +* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +* +* This program is also available under a commercial proprietary license. +* For more information, contact us at license @ x265.com. +*****************************************************************************/ + +#include "common.h" +#include "constants.h" +#include "threading.h" + +namespace x265 { + +static int initialized /* = 0 */; + +// initialize ROM variables +void initROM() +{ + if (ATOMIC_CAS32(&initialized, 0, 1) == 1) + return; +} + +void destroyROM() +{ + if (ATOMIC_CAS32(&initialized, 1, 0) == 0) + return; +} + + +// lambda = pow(2, (double)q / 6 - 2); +double x265_lambda_tab[QP_MAX_MAX + 1] = +{ + 0.2500, 0.2806, 0.3150, 0.3536, 0.3969, + 0.4454, 0.5000, 0.5612, 0.6300, 0.7071, + 0.7937, 0.8909, 1.0000, 1.1225, 1.2599, + 1.4142, 1.5874, 1.7818, 2.0000, 2.2449, + 2.5198, 2.8284, 3.1748, 3.5636, 4.0000, + 4.4898, 5.0397, 5.6569, 6.3496, 7.1272, + 8.0000, 8.9797, 10.0794, 11.3137, 12.6992, + 14.2544, 16.0000, 17.9594, 20.1587, 22.6274, + 25.3984, 28.5088, 32.0000, 35.9188, 40.3175, + 45.2548, 50.7968, 57.0175, 64.0000, 71.8376, + 80.6349, 90.5097, 101.5937, 114.0350, 128.0000, + 143.6751, 161.2699, 181.0193, 203.1873, 228.0701, + 256.0000, 287.3503, 322.5398, 362.0387, 406.3747, + 456.1401, 512.0000, 574.7006, 645.0796, 724.0773 +}; + +// lambda2 = pow(lambda, 2) * scale (0.85); +double x265_lambda2_tab[QP_MAX_MAX + 1] = +{ + 0.0531, 0.0669, 0.0843, 0.1063, 0.1339, + 0.1687, 0.2125, 0.2677, 0.3373, 0.4250, + 0.5355, 0.6746, 0.8500, 1.0709, 1.3493, + 1.7000, 2.1419, 2.6986, 3.4000, 4.2837, + 
5.3970, 6.8000, 8.5675, 10.7943, 13.6000, + 17.1345, 21.5887, 27.2004, 34.2699, 43.1773, + 54.4000, 68.5397, 86.3551, 108.7998, 137.0792, + 172.7097, 217.6000, 274.1590, 345.4172, 435.1993, + 548.3169, 690.8389, 870.4000, 1096.6362, 1381.6757, + 1740.7974, 2193.2676, 2763.3460, 3481.6000, 4386.5446, + 5526.6890, 6963.2049, 8773.0879, 11053.3840, 13926.4000, + 17546.1542, 22106.7835, 27852.7889, 35092.3170, 44213.5749, + 55705.6000, 70184.6657, 88427.1342, 111411.2172, 140369.3373, + 176854.2222, 222822.4000, 280738.6627, 353708.5368, 445644.7459 +}; + +const uint16_t x265_chroma_lambda2_offset_tab[MAX_CHROMA_LAMBDA_OFFSET+1] = +{ + 16, 20, 25, 32, 40, 50, + 64, 80, 101, 128, 161, 203, + 256, 322, 406, 512, 645, 812, + 1024, 1290, 1625, 2048, 2580, 3250, + 4096, 5160, 6501, 8192, 10321, 13003, + 16384, 20642, 26007, 32768, 41285, 52015, + 65535 +}; + +uint32_t g_maxLog2CUSize = MAX_LOG2_CU_SIZE; +uint32_t g_maxCUSize = MAX_CU_SIZE; +uint32_t g_maxFullDepth = NUM_FULL_DEPTH - 1; +uint32_t g_maxCUDepth = NUM_CU_DEPTH - 1; +uint32_t g_zscanToRaster[MAX_NUM_PARTITIONS] = { 0, }; +uint32_t g_rasterToZscan[MAX_NUM_PARTITIONS] = { 0, }; + +const uint8_t g_zscanToPelX[MAX_NUM_PARTITIONS] = +{ + 0, 4, 0, 4, 8, 12, 8, 12, 0, 4, 0, 4, 8, 12, 8, 12, + 16, 20, 16, 20, 24, 28, 24, 28, 16, 20, 16, 20, 24, 28, 24, 28, + 0, 4, 0, 4, 8, 12, 8, 12, 0, 4, 0, 4, 8, 12, 8, 12, + 16, 20, 16, 20, 24, 28, 24, 28, 16, 20, 16, 20, 24, 28, 24, 28, + 32, 36, 32, 36, 40, 44, 40, 44, 32, 36, 32, 36, 40, 44, 40, 44, + 48, 52, 48, 52, 56, 60, 56, 60, 48, 52, 48, 52, 56, 60, 56, 60, + 32, 36, 32, 36, 40, 44, 40, 44, 32, 36, 32, 36, 40, 44, 40, 44, + 48, 52, 48, 52, 56, 60, 56, 60, 48, 52, 48, 52, 56, 60, 56, 60, + 0, 4, 0, 4, 8, 12, 8, 12, 0, 4, 0, 4, 8, 12, 8, 12, + 16, 20, 16, 20, 24, 28, 24, 28, 16, 20, 16, 20, 24, 28, 24, 28, + 0, 4, 0, 4, 8, 12, 8, 12, 0, 4, 0, 4, 8, 12, 8, 12, + 16, 20, 16, 20, 24, 28, 24, 28, 16, 20, 16, 20, 24, 28, 24, 28, + 32, 36, 32, 36, 40, 44, 40, 44, 32, 36, 32, 36, 
40, 44, 40, 44, + 48, 52, 48, 52, 56, 60, 56, 60, 48, 52, 48, 52, 56, 60, 56, 60, + 32, 36, 32, 36, 40, 44, 40, 44, 32, 36, 32, 36, 40, 44, 40, 44, + 48, 52, 48, 52, 56, 60, 56, 60, 48, 52, 48, 52, 56, 60, 56, 60 +}; + +const uint8_t g_zscanToPelY[MAX_NUM_PARTITIONS] = +{ + 0, 0, 4, 4, 0, 0, 4, 4, 8, 8, 12, 12, 8, 8, 12, 12, + 0, 0, 4, 4, 0, 0, 4, 4, 8, 8, 12, 12, 8, 8, 12, 12, + 16, 16, 20, 20, 16, 16, 20, 20, 24, 24, 28, 28, 24, 24, 28, 28, + 16, 16, 20, 20, 16, 16, 20, 20, 24, 24, 28, 28, 24, 24, 28, 28, + 0, 0, 4, 4, 0, 0, 4, 4, 8, 8, 12, 12, 8, 8, 12, 12, + 0, 0, 4, 4, 0, 0, 4, 4, 8, 8, 12, 12, 8, 8, 12, 12, + 16, 16, 20, 20, 16, 16, 20, 20, 24, 24, 28, 28, 24, 24, 28, 28, + 16, 16, 20, 20, 16, 16, 20, 20, 24, 24, 28, 28, 24, 24, 28, 28, + 32, 32, 36, 36, 32, 32, 36, 36, 40, 40, 44, 44, 40, 40, 44, 44, + 32, 32, 36, 36, 32, 32, 36, 36, 40, 40, 44, 44, 40, 40, 44, 44, + 48, 48, 52, 52, 48, 48, 52, 52, 56, 56, 60, 60, 56, 56, 60, 60, + 48, 48, 52, 52, 48, 48, 52, 52, 56, 56, 60, 60, 56, 56, 60, 60, + 32, 32, 36, 36, 32, 32, 36, 36, 40, 40, 44, 44, 40, 40, 44, 44, + 32, 32, 36, 36, 32, 32, 36, 36, 40, 40, 44, 44, 40, 40, 44, 44, + 48, 48, 52, 52, 48, 48, 52, 52, 56, 56, 60, 60, 56, 56, 60, 60, + 48, 48, 52, 52, 48, 48, 52, 52, 56, 56, 60, 60, 56, 56, 60, 60 +}; + +void initZscanToRaster(uint32_t maxFullDepth, uint32_t depth, uint32_t startVal, uint32_t*& curIdx) +{ + uint32_t stride = 1 << maxFullDepth; + + if (depth > maxFullDepth) + { + curIdx[0] = startVal; + curIdx++; + } + else + { + int step = stride >> depth; + initZscanToRaster(maxFullDepth, depth + 1, startVal, curIdx); + initZscanToRaster(maxFullDepth, depth + 1, startVal + step, curIdx); + initZscanToRaster(maxFullDepth, depth + 1, startVal + step * stride, curIdx); + initZscanToRaster(maxFullDepth, depth + 1, startVal + step * stride + step, curIdx); + } +} + +void initRasterToZscan(uint32_t maxFullDepth) +{ + uint32_t numPartitions = 1 << (maxFullDepth * 2); + + for (uint32_t i = 0; i < 
numPartitions; i++) + g_rasterToZscan[g_zscanToRaster[i]] = i; +} + +const int16_t g_lumaFilter[4][NTAPS_LUMA] = +{ + { 0, 0, 0, 64, 0, 0, 0, 0 }, + { -1, 4, -10, 58, 17, -5, 1, 0 }, + { -1, 4, -11, 40, 40, -11, 4, -1 }, + { 0, 1, -5, 17, 58, -10, 4, -1 } +}; + +const int16_t g_chromaFilter[8][NTAPS_CHROMA] = +{ + { 0, 64, 0, 0 }, + { -2, 58, 10, -2 }, + { -4, 54, 16, -2 }, + { -6, 46, 28, -4 }, + { -4, 36, 36, -4 }, + { -4, 28, 46, -6 }, + { -2, 16, 54, -4 }, + { -2, 10, 58, -2 } +}; + +const int16_t g_t4[4][4] = +{ + { 64, 64, 64, 64 }, + { 83, 36, -36, -83 }, + { 64, -64, -64, 64 }, + { 36, -83, 83, -36 } +}; + +const int16_t g_t8[8][8] = +{ + { 64, 64, 64, 64, 64, 64, 64, 64 }, + { 89, 75, 50, 18, -18, -50, -75, -89 }, + { 83, 36, -36, -83, -83, -36, 36, 83 }, + { 75, -18, -89, -50, 50, 89, 18, -75 }, + { 64, -64, -64, 64, 64, -64, -64, 64 }, + { 50, -89, 18, 75, -75, -18, 89, -50 }, + { 36, -83, 83, -36, -36, 83, -83, 36 }, + { 18, -50, 75, -89, 89, -75, 50, -18 } +}; + +const int16_t g_t16[16][16] = +{ + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90 }, + { 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89 }, + { 87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87 }, + { 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83 }, + { 80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80 }, + { 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75 }, + { 70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70 }, + { 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64 }, + { 57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57 }, + { 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50 }, + { 43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43 }, + { 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, 
-36, 83, -83, 36 }, + { 25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25 }, + { 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18 }, + { 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9 } +}; + +const int16_t g_t32[32][32] = +{ + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4, -4, -13, -22, -31, -38, -46, -54, -61, -67, -73, -78, -82, -85, -88, -90, -90 }, + { 90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90, -90, -87, -80, -70, -57, -43, -25, -9, 9, 25, 43, 57, 70, 80, 87, 90 }, + { 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13, 13, 38, 61, 78, 88, 90, 85, 73, 54, 31, 4, -22, -46, -67, -82, -90 }, + { 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89, 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89 }, + { 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22, -22, -61, -85, -90, -73, -38, 4, 46, 78, 90, 82, 54, 13, -31, -67, -88 }, + { 87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87, -87, -57, -9, 43, 80, 90, 70, 25, -25, -70, -90, -80, -43, 9, 57, 87 }, + { 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31, 31, 78, 90, 61, 4, -54, -88, -82, -38, 22, 73, 90, 67, 13, -46, -85 }, + { 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83 }, + { 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38, -38, -88, -73, -4, 67, 90, 46, -31, -85, -78, -13, 61, 90, 54, -22, -82 }, + { 80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80, -80, -9, 70, 87, 25, -57, -90, -43, 43, 90, 57, -25, -87, -70, 9, 80 }, + { 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46, 46, 90, 38, -54, -90, 
-31, 61, 88, 22, -67, -85, -13, 73, 82, 4, -78 }, + { 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75, 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75 }, + { 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54, -54, -85, 4, 88, 46, -61, -82, 13, 90, 38, -67, -78, 22, 90, 31, -73 }, + { 70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70, -70, 43, 87, -9, -90, -25, 80, 57, -57, -80, 25, 90, 9, -87, -43, 70 }, + { 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61, 61, 73, -46, -82, 31, 88, -13, -90, -4, 90, 22, -85, -38, 78, 54, -67 }, + { 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64 }, + { 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67, -67, -54, 78, 38, -85, -22, 90, 4, -90, 13, 88, -31, -82, 46, 73, -61 }, + { 57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57, -57, 80, 25, -90, 9, 87, -43, -70, 70, 43, -87, -9, 90, -25, -80, 57 }, + { 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73, 73, 31, -90, 22, 78, -67, -38, 90, -13, -82, 61, 46, -88, 4, 85, -54 }, + { 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50, 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50 }, + { 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78, -78, -4, 82, -73, -13, 85, -67, -22, 88, -61, -31, 90, -54, -38, 90, -46 }, + { 43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43, -43, 90, -57, -25, 87, -70, -9, 80, -80, 9, 70, -87, 25, 57, -90, 43 }, + { 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82, 82, -22, -54, 90, -61, -13, 78, -85, 31, 46, -90, 67, 4, -73, 88, -38 }, + { 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36 }, + { 31, -78, 90, -61, 
4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85, -85, 46, 13, -67, 90, -73, 22, 38, -82, 88, -54, -4, 61, -90, 78, -31 }, + { 25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25, -25, 70, -90, 80, -43, -9, 57, -87, 87, -57, 9, 43, -80, 90, -70, 25 }, + { 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88, 88, -67, 31, 13, -54, 82, -90, 78, -46, 4, 38, -73, 90, -85, 61, -22 }, + { 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18, 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18 }, + { 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90, -90, 82, -67, 46, -22, -4, 31, -54, 73, -85, 90, -88, 78, -61, 38, -13 }, + { 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9, -9, 25, -43, 57, -70, 80, -87, 90, -90, 87, -80, 70, -57, 43, -25, 9 }, + { 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90, 90, -90, 88, -85, 82, -78, 73, -67, 61, -54, 46, -38, 31, -22, 13, -4 } +}; + +const uint8_t g_chromaScale[ChromaQPMappingTableSize] = +{ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 29, 30, 31, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, + 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51 +}; + +const uint8_t g_chroma422IntraAngleMappingTable[AngleMapping422TableSize] = +{ 0, 1, 2, 2, 2, 2, 3, 5, 7, 8, 10, 12, 13, 15, 17, 18, 19, 20, 21, 22, 23, 23, 24, 24, 25, 25, 26, 27, 27, 28, 28, 29, 29, 30, 31, DM_CHROMA_IDX }; + +const uint8_t g_log2Size[MAX_CU_SIZE + 1] = +{ + 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 6 +}; + +const uint16_t g_scan2x2[][2*2] = +{ + { 0, 2, 1, 3 }, + { 0, 1, 2, 3 }, +}; + +const uint16_t g_scan8x8[NUM_SCAN_TYPE][8 * 8] = +{ + { 0, 8, 1, 16, 9, 2, 
24, 17, 10, 3, 25, 18, 11, 26, 19, 27, 32, 40, 33, 48, 41, 34, 56, 49, 42, 35, 57, 50, 43, 58, 51, 59, + 4, 12, 5, 20, 13, 6, 28, 21, 14, 7, 29, 22, 15, 30, 23, 31, 36, 44, 37, 52, 45, 38, 60, 53, 46, 39, 61, 54, 47, 62, 55, 63 }, + { 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31, + 32, 33, 34, 35, 40, 41, 42, 43, 48, 49, 50, 51, 56, 57, 58, 59, 36, 37, 38, 39, 44, 45, 46, 47, 52, 53, 54, 55, 60, 61, 62, 63 }, + { 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27, 32, 40, 48, 56, 33, 41, 49, 57, 34, 42, 50, 58, 35, 43, 51, 59, + 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31, 36, 44, 52, 60, 37, 45, 53, 61, 38, 46, 54, 62, 39, 47, 55, 63 } +}; + +const uint16_t g_scan4x4[NUM_SCAN_TYPE][4 * 4] = +{ + { 0, 4, 1, 8, 5, 2, 12, 9, 6, 3, 13, 10, 7, 14, 11, 15 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 } +}; + +const uint16_t g_scan16x16[16 * 16] = +{ + 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 49, 34, 19, 50, 35, 51, + 64, 80, 65, 96, 81, 66, 112, 97, 82, 67, 113, 98, 83, 114, 99, 115, + 4, 20, 5, 36, 21, 6, 52, 37, 22, 7, 53, 38, 23, 54, 39, 55, + 128, 144, 129, 160, 145, 130, 176, 161, 146, 131, 177, 162, 147, 178, 163, 179, + 68, 84, 69, 100, 85, 70, 116, 101, 86, 71, 117, 102, 87, 118, 103, 119, + 8, 24, 9, 40, 25, 10, 56, 41, 26, 11, 57, 42, 27, 58, 43, 59, + 192,208, 193,224,209, 194,240,225,210, 195,241,226,211,242,227,243, + 132, 148, 133, 164, 149, 134, 180, 165, 150, 135, 181, 166, 151, 182, 167, 183, + 72, 88, 73, 104, 89, 74, 120, 105, 90, 75, 121, 106, 91, 122, 107, 123, + 12, 28, 13, 44, 29, 14, 60, 45, 30, 15, 61, 46, 31, 62, 47, 63, + 196,212, 197,228,213, 198,244,229,214, 199,245,230,215,246,231,247, + 136, 152, 137, 168, 153, 138, 184, 169, 154, 139, 185, 170, 155, 186, 171, 187, + 76, 92, 77, 108, 93, 78, 124, 109, 94, 79, 125, 110, 95, 126, 111, 127, + 
200,216,201,232,217,202,248,233,218,203,249,234,219,250,235,251, + 140, 156, 141, 172, 157, 142, 188, 173, 158, 143, 189, 174, 159, 190, 175, 191, + 204,220,205,236,221,206,252,237,222,207,253,238,223,254,239,255 +}; + +const uint16_t g_scan8x8diag[8 * 8] = +{ + 0, 8, 1, 16, 9, 2, 24, 17, + 10, 3, 32, 25, 18, 11, 4, 40, + 33, 26, 19, 12, 5, 48, 41, 34, + 27, 20, 13, 6, 56, 49, 42, 35, + 28, 21, 14, 7, 57, 50, 43, 36, + 29, 22, 15, 58, 51, 44, 37, 30, + 23, 59, 52, 45, 38, 31, 60, 53, + 46, 39, 61, 54, 47, 62, 55, 63 +}; + +const uint16_t g_scan32x32[32 * 32] = +{ + 0,32,1,64,33,2,96,65,34,3,97,66,35,98,67,99,128,160,129,192,161,130,224,193,162,131,225,194,163,226,195,227, + 4,36,5,68,37,6,100,69,38,7,101,70,39,102,71,103,256,288,257,320,289,258,352,321,290,259,353,322,291,354,323,355, + 132,164,133,196,165,134,228,197,166,135,229,198,167,230,199,231,8,40,9,72,41,10,104,73,42,11,105,74,43,106,75,107, + 384,416,385,448,417,386,480,449,418,387,481,450,419,482,451,483,260,292,261,324,293,262,356,325,294,263,357,326,295,358,327,359, + 136,168,137,200,169,138,232,201,170,139,233,202,171,234,203,235,12,44,13,76,45,14,108,77,46,15,109,78,47,110,79,111, + 512,544,513,576,545,514,608,577,546,515,609,578,547,610,579,611,388,420,389,452,421,390,484,453,422,391,485,454,423,486,455,487, + 264,296,265,328,297,266,360,329,298,267,361,330,299,362,331,363,140,172,141,204,173,142,236,205,174,143,237,206,175,238,207,239, + 16,48,17,80,49,18,112,81,50,19,113,82,51,114,83,115,640,672,641,704,673,642,736,705,674,643,737,706,675,738,707,739, + 516,548,517,580,549,518,612,581,550,519,613,582,551,614,583,615,392,424,393,456,425,394,488,457,426,395,489,458,427,490,459,491, + 268,300,269,332,301,270,364,333,302,271,365,334,303,366,335,367,144,176,145,208,177,146,240,209,178,147,241,210,179,242,211,243, + 20,52,21,84,53,22,116,85,54,23,117,86,55,118,87,119,768,800,769,832,801,770,864,833,802,771,865,834,803,866,835,867, + 
644,676,645,708,677,646,740,709,678,647,741,710,679,742,711,743,520,552,521,584,553,522,616,585,554,523,617,586,555,618,587,619, + 396,428,397,460,429,398,492,461,430,399,493,462,431,494,463,495,272,304,273,336,305,274,368,337,306,275,369,338,307,370,339,371, + 148,180,149,212,181,150,244,213,182,151,245,214,183,246,215,247,24,56,25,88,57,26,120,89,58,27,121,90,59,122,91,123, + 896,928,897,960,929,898,992,961,930,899,993,962,931,994,963,995,772,804,773,836,805,774,868,837,806,775,869,838,807,870,839,871, + 648,680,649,712,681,650,744,713,682,651,745,714,683,746,715,747,524,556,525,588,557,526,620,589,558,527,621,590,559,622,591,623, + 400,432,401,464,433,402,496,465,434,403,497,466,435,498,467,499,276,308,277,340,309,278,372,341,310,279,373,342,311,374,343,375, + 152,184,153,216,185,154,248,217,186,155,249,218,187,250,219,251,28,60,29,92,61,30,124,93,62,31,125,94,63,126,95,127, + 900,932,901,964,933,902,996,965,934,903,997,966,935,998,967,999,776,808,777,840,809,778,872,841,810,779,873,842,811,874,843,875, + 652,684,653,716,685,654,748,717,686,655,749,718,687,750,719,751,528,560,529,592,561,530,624,593,562,531,625,594,563,626,595,627, + 404,436,405,468,437,406,500,469,438,407,501,470,439,502,471,503,280,312,281,344,313,282,376,345,314,283,377,346,315,378,347,379, + 156,188,157,220,189,158,252,221,190,159,253,222,191,254,223,255,904,936,905,968,937,906,1000,969,938,907,1001,970,939,1002,971,1003, + 780,812,781,844,813,782,876,845,814,783,877,846,815,878,847,879,656,688,657,720,689,658,752,721,690,659,753,722,691,754,723,755, + 532,564,533,596,565,534,628,597,566,535,629,598,567,630,599,631,408,440,409,472,441,410,504,473,442,411,505,474,443,506,475,507, + 284,316,285,348,317,286,380,349,318,287,381,350,319,382,351,383,908,940,909,972,941,910,1004,973,942,911,1005,974,943,1006,975,1007, + 784,816,785,848,817,786,880,849,818,787,881,850,819,882,851,883,660,692,661,724,693,662,756,725,694,663,757,726,695,758,727,759, + 
536,568,537,600,569,538,632,601,570,539,633,602,571,634,603,635,412,444,413,476,445,414,508,477,446,415,509,478,447,510,479,511, + 912,944,913,976,945,914,1008,977,946,915,1009,978,947,1010,979,1011,788,820,789,852,821,790,884,853,822,791,885,854,823,886,855,887, + 664,696,665,728,697,666,760,729,698,667,761,730,699,762,731,763,540,572,541,604,573,542,636,605,574,543,637,606,575,638,607,639, + 916,948,917,980,949,918,1012,981,950,919,1013,982,951,1014,983,1015,792,824,793,856,825,794,888,857,826,795,889,858,827,890,859,891, + 668,700,669,732,701,670,764,733,702,671,765,734,703,766,735,767,920,952,921,984,953,922,1016,985,954,923,1017,986,955,1018,987,1019, + 796,828,797,860,829,798,892,861,830,799,893,862,831,894,863,895,924,956,925,988,957,926,1020,989,958,927,1021,990,959,1022,991,1023 +}; + +const uint16_t* const g_scanOrder[NUM_SCAN_TYPE][NUM_SCAN_SIZE] = +{ + { g_scan4x4[0], g_scan8x8[0], g_scan16x16, g_scan32x32 }, + { g_scan4x4[1], g_scan8x8[1], g_scan16x16, g_scan32x32 }, + { g_scan4x4[2], g_scan8x8[2], g_scan16x16, g_scan32x32 } +}; + +const uint16_t* const g_scanOrderCG[NUM_SCAN_TYPE][NUM_SCAN_SIZE] = +{ + { g_scan4x4[0], g_scan2x2[0], g_scan4x4[0], g_scan8x8diag }, + { g_scan4x4[1], g_scan2x2[1], g_scan4x4[0], g_scan8x8diag }, + { g_scan4x4[2], g_scan2x2[0], g_scan4x4[0], g_scan8x8diag } +}; + +const uint8_t g_minInGroup[10] = { 0, 1, 2, 3, 4, 6, 8, 12, 16, 24 }; + +// Rice parameters for absolute transform levels +const uint8_t g_goRiceRange[5] = { 7, 14, 26, 46, 78 }; + +const uint8_t g_lpsTable[64][4] = +{ + { 128, 176, 208, 240 }, + { 128, 167, 197, 227 }, + { 128, 158, 187, 216 }, + { 123, 150, 178, 205 }, + { 116, 142, 169, 195 }, + { 111, 135, 160, 185 }, + { 105, 128, 152, 175 }, + { 100, 122, 144, 166 }, + { 95, 116, 137, 158 }, + { 90, 110, 130, 150 }, + { 85, 104, 123, 142 }, + { 81, 99, 117, 135 }, + { 77, 94, 111, 128 }, + { 73, 89, 105, 122 }, + { 69, 85, 100, 116 }, + { 66, 80, 95, 110 }, + { 62, 76, 90, 104 }, + { 59, 72, 86, 99 }, + { 
56, 69, 81, 94 }, + { 53, 65, 77, 89 }, + { 51, 62, 73, 85 }, + { 48, 59, 69, 80 }, + { 46, 56, 66, 76 }, + { 43, 53, 63, 72 }, + { 41, 50, 59, 69 }, + { 39, 48, 56, 65 }, + { 37, 45, 54, 62 }, + { 35, 43, 51, 59 }, + { 33, 41, 48, 56 }, + { 32, 39, 46, 53 }, + { 30, 37, 43, 50 }, + { 29, 35, 41, 48 }, + { 27, 33, 39, 45 }, + { 26, 31, 37, 43 }, + { 24, 30, 35, 41 }, + { 23, 28, 33, 39 }, + { 22, 27, 32, 37 }, + { 21, 26, 30, 35 }, + { 20, 24, 29, 33 }, + { 19, 23, 27, 31 }, + { 18, 22, 26, 30 }, + { 17, 21, 25, 28 }, + { 16, 20, 23, 27 }, + { 15, 19, 22, 25 }, + { 14, 18, 21, 24 }, + { 14, 17, 20, 23 }, + { 13, 16, 19, 22 }, + { 12, 15, 18, 21 }, + { 12, 14, 17, 20 }, + { 11, 14, 16, 19 }, + { 11, 13, 15, 18 }, + { 10, 12, 15, 17 }, + { 10, 12, 14, 16 }, + { 9, 11, 13, 15 }, + { 9, 11, 12, 14 }, + { 8, 10, 12, 14 }, + { 8, 9, 11, 13 }, + { 7, 9, 11, 12 }, + { 7, 9, 10, 12 }, + { 7, 8, 10, 11 }, + { 6, 8, 9, 11 }, + { 6, 7, 9, 10 }, + { 6, 7, 8, 9 }, + { 2, 2, 2, 2 } +}; + +const uint8_t x265_exp2_lut[64] = +{ + 0, 3, 6, 8, 11, 14, 17, 20, 23, 26, 29, 32, 36, 39, 42, 45, + 48, 52, 55, 58, 62, 65, 69, 72, 76, 80, 83, 87, 91, 94, 98, 102, + 106, 110, 114, 118, 122, 126, 130, 135, 139, 143, 147, 152, 156, 161, 165, 170, + 175, 179, 184, 189, 194, 198, 203, 208, 214, 219, 224, 229, 234, 240, 245, 250 +}; + +/* bFilter = g_intraFilterFlags[dir] & trSize */ +const uint8_t g_intraFilterFlags[NUM_INTRA_MODE] = +{ + 0x38, 0x00, + 0x38, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x20, 0x00, 0x20, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, + 0x38, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x20, 0x00, 0x20, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, + 0x38, +}; + +/* Contains how much to increment shared depth buffer for different ctu sizes to get next best depth + * here, depth 0 = 64x64, depth 1 = 32x32, depth 2 = 16x16 and depth 3 = 8x8 + * if ctu = 64, depth buffer size is 256 combination of depth values 0, 1, 2, 3 + * if ctu = 32, depth buffer size is 64 combination of depth values 1, 2, 3 + * if 
ctu = 16, depth buffer size is 16 combination of depth values 2, 3 */ +const uint32_t g_depthInc[3][4] = +{ + { 16, 4, 0, 0}, + { 64, 16, 4, 1}, + {256, 64, 16, 4} +}; + +/* g_depthScanIdx [y][x] */ +const uint32_t g_depthScanIdx[8][8] = +{ + { 0, 1, 4, 5, 16, 17, 20, 21, }, + { 2, 3, 6, 7, 18, 19, 22, 23, }, + { 8, 9, 12, 13, 24, 25, 28, 29, }, + { 10, 11, 14, 15, 26, 27, 30, 31, }, + { 32, 33, 36, 37, 48, 49, 52, 53, }, + { 34, 35, 38, 39, 50, 51, 54, 55, }, + { 40, 41, 44, 45, 56, 57, 60, 61, }, + { 42, 43, 46, 47, 58, 59, 62, 63, } +}; + +} diff --git a/source/common/constants.h b/source/common/constants.h new file mode 100644 index 0000000..9db47db --- /dev/null +++ b/source/common/constants.h @@ -0,0 +1,104 @@ +/***************************************************************************** + * Copyright (C) 2014 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. 
+ *****************************************************************************/ + +#ifndef X265_CONSTANTS_H +#define X265_CONSTANTS_H + +#include "common.h" + +namespace x265 { +// private namespace + +void initROM(); +void destroyROM(); + +void initZscanToRaster(uint32_t maxFullDepth, uint32_t depth, uint32_t startVal, uint32_t*& curIdx); +void initRasterToZscan(uint32_t maxFullDepth); + +extern double x265_lambda_tab[QP_MAX_MAX + 1]; +extern double x265_lambda2_tab[QP_MAX_MAX + 1]; +extern const uint16_t x265_chroma_lambda2_offset_tab[MAX_CHROMA_LAMBDA_OFFSET + 1]; + +enum { ChromaQPMappingTableSize = 70 }; +enum { AngleMapping422TableSize = 36 }; + +extern const uint8_t g_chromaScale[ChromaQPMappingTableSize]; +extern const uint8_t g_chroma422IntraAngleMappingTable[AngleMapping422TableSize]; + +// flexible conversion from relative to absolute index +extern uint32_t g_zscanToRaster[MAX_NUM_PARTITIONS]; +extern uint32_t g_rasterToZscan[MAX_NUM_PARTITIONS]; + +// conversion of partition index to picture pel position +extern const uint8_t g_zscanToPelX[MAX_NUM_PARTITIONS]; +extern const uint8_t g_zscanToPelY[MAX_NUM_PARTITIONS]; +extern const uint8_t g_log2Size[MAX_CU_SIZE + 1]; // from size to log2(size) + +// global variable (CTU width/height, max. 
CU depth) +extern uint32_t g_maxLog2CUSize; +extern uint32_t g_maxCUSize; +extern uint32_t g_maxCUDepth; +extern uint32_t g_maxFullDepth; + +extern const int16_t g_t4[4][4]; +extern const int16_t g_t8[8][8]; +extern const int16_t g_t16[16][16]; +extern const int16_t g_t32[32][32]; + +// Subpel interpolation defines and constants + +#define NTAPS_LUMA 8 // Number of taps for luma +#define NTAPS_CHROMA 4 // Number of taps for chroma +#define IF_INTERNAL_PREC 14 // Number of bits for internal precision +#define IF_FILTER_PREC 6 // Log2 of sum of filter taps +#define IF_INTERNAL_OFFS (1 << (IF_INTERNAL_PREC - 1)) // Offset used internally +#define SLFASE_CONSTANT 0x5f4e4a53 + +extern const int16_t g_lumaFilter[4][NTAPS_LUMA]; // Luma filter taps +extern const int16_t g_chromaFilter[8][NTAPS_CHROMA]; // Chroma filter taps + +// Scanning order & context mapping table + +#define NUM_SCAN_SIZE 4 + +extern const uint16_t* const g_scanOrder[NUM_SCAN_TYPE][NUM_SCAN_SIZE]; +extern const uint16_t* const g_scanOrderCG[NUM_SCAN_TYPE][NUM_SCAN_SIZE]; +extern const uint16_t g_scan8x8diag[8 * 8]; +extern const uint16_t g_scan4x4[NUM_SCAN_TYPE][4 * 4]; + +extern const uint8_t g_minInGroup[10]; +extern const uint8_t g_goRiceRange[5]; // maximum value coded with Rice codes + +// CABAC tables +extern const uint8_t g_lpsTable[64][4]; +extern const uint8_t x265_exp2_lut[64]; + +// Intra tables +extern const uint8_t g_intraFilterFlags[NUM_INTRA_MODE]; + +extern const uint32_t g_depthInc[3][4]; +extern const uint32_t g_depthScanIdx[8][8]; + +} + +#endif diff --git a/source/common/contexts.h b/source/common/contexts.h new file mode 100644 index 0000000..b692806 --- /dev/null +++ b/source/common/contexts.h @@ -0,0 +1,309 @@ +/***************************************************************************** +* Copyright (C) 2014 x265 project +* +* Authors: Steve Borho +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of the GNU General Public License 
as published by +* the Free Software Foundation; either version 2 of the License, or +* (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software +* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +* +* This program is also available under a commercial proprietary license. +* For more information, contact us at license @ x265.com. +*****************************************************************************/ + +#ifndef X265_CONTEXTS_H +#define X265_CONTEXTS_H + +#include "common.h" + +#define NUM_SPLIT_FLAG_CTX 3 // number of context models for split flag +#define NUM_SKIP_FLAG_CTX 3 // number of context models for skip flag + +#define NUM_MERGE_FLAG_EXT_CTX 1 // number of context models for merge flag of merge extended +#define NUM_MERGE_IDX_EXT_CTX 1 // number of context models for merge index of merge extended + +#define NUM_PART_SIZE_CTX 4 // number of context models for partition size +#define NUM_PRED_MODE_CTX 1 // number of context models for prediction mode + +#define NUM_ADI_CTX 1 // number of context models for intra prediction + +#define NUM_CHROMA_PRED_CTX 2 // number of context models for intra prediction (chroma) +#define NUM_INTER_DIR_CTX 5 // number of context models for inter prediction direction +#define NUM_MV_RES_CTX 2 // number of context models for motion vector difference + +#define NUM_REF_NO_CTX 2 // number of context models for reference index +#define NUM_TRANS_SUBDIV_FLAG_CTX 3 // number of context models for transform subdivision flags +#define NUM_QT_CBF_CTX 7 // number of context models for QT CBF +#define NUM_QT_ROOT_CBF_CTX 1 // number of context 
models for QT ROOT CBF +#define NUM_DELTA_QP_CTX 3 // number of context models for dQP + +#define NUM_SIG_CG_FLAG_CTX 2 // number of context models for MULTI_LEVEL_SIGNIFICANCE + +#define NUM_SIG_FLAG_CTX 42 // number of context models for sig flag +#define NUM_SIG_FLAG_CTX_LUMA 27 // number of context models for luma sig flag +#define NUM_SIG_FLAG_CTX_CHROMA 15 // number of context models for chroma sig flag + +#define NUM_CTX_LAST_FLAG_XY 18 // number of context models for last coefficient position +#define NUM_CTX_LAST_FLAG_XY_LUMA 15 // number of context models for last coefficient position of luma +#define NUM_CTX_LAST_FLAG_XY_CHROMA 3 // number of context models for last coefficient position of chroma + +#define NUM_ONE_FLAG_CTX 24 // number of context models for greater than 1 flag +#define NUM_ONE_FLAG_CTX_LUMA 16 // number of context models for greater than 1 flag of luma +#define NUM_ONE_FLAG_CTX_CHROMA 8 // number of context models for greater than 1 flag of chroma +#define NUM_ABS_FLAG_CTX 6 // number of context models for greater than 2 flag +#define NUM_ABS_FLAG_CTX_LUMA 4 // number of context models for greater than 2 flag of luma +#define NUM_ABS_FLAG_CTX_CHROMA 2 // number of context models for greater than 2 flag of chroma + +#define NUM_MVP_IDX_CTX 1 // number of context models for MVP index + +#define NUM_SAO_MERGE_FLAG_CTX 1 // number of context models for SAO merge flags +#define NUM_SAO_TYPE_IDX_CTX 1 // number of context models for SAO type index + +#define NUM_TRANSFORMSKIP_FLAG_CTX 1 // number of context models for transform skipping +#define NUM_TQUANT_BYPASS_FLAG_CTX 1 +#define CNU 154 // dummy initialization value for unused context models 'Context model Not Used' + +// Offset for context +#define OFF_SPLIT_FLAG_CTX (0) +#define OFF_SKIP_FLAG_CTX (OFF_SPLIT_FLAG_CTX + NUM_SPLIT_FLAG_CTX) +#define OFF_MERGE_FLAG_EXT_CTX (OFF_SKIP_FLAG_CTX + NUM_SKIP_FLAG_CTX) +#define OFF_MERGE_IDX_EXT_CTX (OFF_MERGE_FLAG_EXT_CTX + 
NUM_MERGE_FLAG_EXT_CTX) +#define OFF_PART_SIZE_CTX (OFF_MERGE_IDX_EXT_CTX + NUM_MERGE_IDX_EXT_CTX) +#define OFF_PRED_MODE_CTX (OFF_PART_SIZE_CTX + NUM_PART_SIZE_CTX) +#define OFF_ADI_CTX (OFF_PRED_MODE_CTX + NUM_PRED_MODE_CTX) +#define OFF_CHROMA_PRED_CTX (OFF_ADI_CTX + NUM_ADI_CTX) +#define OFF_DELTA_QP_CTX (OFF_CHROMA_PRED_CTX + NUM_CHROMA_PRED_CTX) +#define OFF_INTER_DIR_CTX (OFF_DELTA_QP_CTX + NUM_DELTA_QP_CTX) +#define OFF_REF_NO_CTX (OFF_INTER_DIR_CTX + NUM_INTER_DIR_CTX) +#define OFF_MV_RES_CTX (OFF_REF_NO_CTX + NUM_REF_NO_CTX) +#define OFF_QT_CBF_CTX (OFF_MV_RES_CTX + NUM_MV_RES_CTX) +#define OFF_TRANS_SUBDIV_FLAG_CTX (OFF_QT_CBF_CTX + NUM_QT_CBF_CTX) +#define OFF_QT_ROOT_CBF_CTX (OFF_TRANS_SUBDIV_FLAG_CTX + NUM_TRANS_SUBDIV_FLAG_CTX) +#define OFF_SIG_CG_FLAG_CTX (OFF_QT_ROOT_CBF_CTX + NUM_QT_ROOT_CBF_CTX) +#define OFF_SIG_FLAG_CTX (OFF_SIG_CG_FLAG_CTX + 2 * NUM_SIG_CG_FLAG_CTX) +#define OFF_CTX_LAST_FLAG_X (OFF_SIG_FLAG_CTX + NUM_SIG_FLAG_CTX) +#define OFF_CTX_LAST_FLAG_Y (OFF_CTX_LAST_FLAG_X + NUM_CTX_LAST_FLAG_XY) +#define OFF_ONE_FLAG_CTX (OFF_CTX_LAST_FLAG_Y + NUM_CTX_LAST_FLAG_XY) +#define OFF_ABS_FLAG_CTX (OFF_ONE_FLAG_CTX + NUM_ONE_FLAG_CTX) +#define OFF_MVP_IDX_CTX (OFF_ABS_FLAG_CTX + NUM_ABS_FLAG_CTX) +#define OFF_SAO_MERGE_FLAG_CTX (OFF_MVP_IDX_CTX + NUM_MVP_IDX_CTX) +#define OFF_SAO_TYPE_IDX_CTX (OFF_SAO_MERGE_FLAG_CTX + NUM_SAO_MERGE_FLAG_CTX) +#define OFF_TRANSFORMSKIP_FLAG_CTX (OFF_SAO_TYPE_IDX_CTX + NUM_SAO_TYPE_IDX_CTX) +#define OFF_TQUANT_BYPASS_FLAG_CTX (OFF_TRANSFORMSKIP_FLAG_CTX + 2 * NUM_TRANSFORMSKIP_FLAG_CTX) +#define MAX_OFF_CTX_MOD (OFF_TQUANT_BYPASS_FLAG_CTX + NUM_TQUANT_BYPASS_FLAG_CTX) + +namespace x265 { +// private namespace + +extern const uint32_t g_entropyBits[128]; +extern const uint8_t g_nextState[128][2]; + +#define sbacGetMps(S) ((S) & 1) +#define sbacGetState(S) ((S) >> 1) +#define sbacNext(S, V) (g_nextState[(S)][(V)]) +#define sbacGetEntropyBits(S, V) (g_entropyBits[(S) ^ (V)]) +#define sbacGetEntropyBitsTrm(V) 
(g_entropyBits[126 ^ (V)]) + +#define MAX_NUM_CHANNEL_TYPE 2 + +static const uint32_t ctxCbf[3][5] = { { 1, 0, 0, 0, 0 }, { 2, 3, 4, 5, 6 }, { 2, 3, 4, 5, 6 } }; +static const uint32_t significanceMapContextSetStart[MAX_NUM_CHANNEL_TYPE][3] = { { 0, 9, 21 }, { 0, 9, 12 } }; +static const uint32_t significanceMapContextSetSize[MAX_NUM_CHANNEL_TYPE][3] = { { 9, 12, 6 }, { 9, 3, 3 } }; +static const uint32_t nonDiagonalScan8x8ContextOffset[MAX_NUM_CHANNEL_TYPE] = { 6, 0 }; +static const uint32_t notFirstGroupNeighbourhoodContextOffset[MAX_NUM_CHANNEL_TYPE] = { 3, 0 }; + +// initial probability for cu_transquant_bypass flag +static const uint8_t INIT_CU_TRANSQUANT_BYPASS_FLAG[3][NUM_TQUANT_BYPASS_FLAG_CTX] = +{ + { 154 }, + { 154 }, + { 154 }, +}; + +// initial probability for split flag +static const uint8_t INIT_SPLIT_FLAG[3][NUM_SPLIT_FLAG_CTX] = +{ + { 107, 139, 126, }, + { 107, 139, 126, }, + { 139, 141, 157, }, +}; + +static const uint8_t INIT_SKIP_FLAG[3][NUM_SKIP_FLAG_CTX] = +{ + { 197, 185, 201, }, + { 197, 185, 201, }, + { CNU, CNU, CNU, }, +}; + +static const uint8_t INIT_MERGE_FLAG_EXT[3][NUM_MERGE_FLAG_EXT_CTX] = +{ + { 154, }, + { 110, }, + { CNU, }, +}; + +static const uint8_t INIT_MERGE_IDX_EXT[3][NUM_MERGE_IDX_EXT_CTX] = +{ + { 137, }, + { 122, }, + { CNU, }, +}; + +static const uint8_t INIT_PART_SIZE[3][NUM_PART_SIZE_CTX] = +{ + { 154, 139, 154, 154 }, + { 154, 139, 154, 154 }, + { 184, CNU, CNU, CNU }, +}; + +static const uint8_t INIT_PRED_MODE[3][NUM_PRED_MODE_CTX] = +{ + { 134, }, + { 149, }, + { CNU, }, +}; + +static const uint8_t INIT_INTRA_PRED_MODE[3][NUM_ADI_CTX] = +{ + { 183, }, + { 154, }, + { 184, }, +}; + +static const uint8_t INIT_CHROMA_PRED_MODE[3][NUM_CHROMA_PRED_CTX] = +{ + { 152, 139, }, + { 152, 139, }, + { 63, 139, }, +}; + +static const uint8_t INIT_INTER_DIR[3][NUM_INTER_DIR_CTX] = +{ + { 95, 79, 63, 31, 31, }, + { 95, 79, 63, 31, 31, }, + { CNU, CNU, CNU, CNU, CNU, }, +}; + +static const uint8_t INIT_MVD[3][NUM_MV_RES_CTX] = +{ 
+ { 169, 198, }, + { 140, 198, }, + { CNU, CNU, }, +}; + +static const uint8_t INIT_REF_PIC[3][NUM_REF_NO_CTX] = +{ + { 153, 153 }, + { 153, 153 }, + { CNU, CNU }, +}; + +static const uint8_t INIT_DQP[3][NUM_DELTA_QP_CTX] = +{ + { 154, 154, 154, }, + { 154, 154, 154, }, + { 154, 154, 154, }, +}; + +static const uint8_t INIT_QT_CBF[3][NUM_QT_CBF_CTX] = +{ + { 153, 111, 149, 92, 167, 154, 154 }, + { 153, 111, 149, 107, 167, 154, 154 }, + { 111, 141, 94, 138, 182, 154, 154 }, +}; + +static const uint8_t INIT_QT_ROOT_CBF[3][NUM_QT_ROOT_CBF_CTX] = +{ + { 79, }, + { 79, }, + { CNU, }, +}; + +static const uint8_t INIT_LAST[3][NUM_CTX_LAST_FLAG_XY] = +{ + { 125, 110, 124, 110, 95, 94, 125, 111, 111, 79, 125, 126, 111, 111, 79, + 108, 123, 93 }, + { 125, 110, 94, 110, 95, 79, 125, 111, 110, 78, 110, 111, 111, 95, 94, + 108, 123, 108 }, + { 110, 110, 124, 125, 140, 153, 125, 127, 140, 109, 111, 143, 127, 111, 79, + 108, 123, 63 }, +}; + +static const uint8_t INIT_SIG_CG_FLAG[3][2 * NUM_SIG_CG_FLAG_CTX] = +{ + { 121, 140, + 61, 154, }, + { 121, 140, + 61, 154, }, + { 91, 171, + 134, 141, }, +}; + +static const uint8_t INIT_SIG_FLAG[3][NUM_SIG_FLAG_CTX] = +{ + { 170, 154, 139, 153, 139, 123, 123, 63, 124, 166, 183, 140, 136, 153, 154, 166, 183, 140, 136, 153, 154, 166, 183, 140, 136, 153, 154, 170, 153, 138, 138, 122, 121, 122, 121, 167, 151, 183, 140, 151, 183, 140, }, + { 155, 154, 139, 153, 139, 123, 123, 63, 153, 166, 183, 140, 136, 153, 154, 166, 183, 140, 136, 153, 154, 166, 183, 140, 136, 153, 154, 170, 153, 123, 123, 107, 121, 107, 121, 167, 151, 183, 140, 151, 183, 140, }, + { 111, 111, 125, 110, 110, 94, 124, 108, 124, 107, 125, 141, 179, 153, 125, 107, 125, 141, 179, 153, 125, 107, 125, 141, 179, 153, 125, 140, 139, 182, 182, 152, 136, 152, 136, 153, 136, 139, 111, 136, 139, 111, }, +}; + +static const uint8_t INIT_ONE_FLAG[3][NUM_ONE_FLAG_CTX] = +{ + { 154, 196, 167, 167, 154, 152, 167, 182, 182, 134, 149, 136, 153, 121, 136, 122, 169, 208, 166, 167, 154, 152, 167, 
182, }, + { 154, 196, 196, 167, 154, 152, 167, 182, 182, 134, 149, 136, 153, 121, 136, 137, 169, 194, 166, 167, 154, 167, 137, 182, }, + { 140, 92, 137, 138, 140, 152, 138, 139, 153, 74, 149, 92, 139, 107, 122, 152, 140, 179, 166, 182, 140, 227, 122, 197, }, +}; + +static const uint8_t INIT_ABS_FLAG[3][NUM_ABS_FLAG_CTX] = +{ + { 107, 167, 91, 107, 107, 167, }, + { 107, 167, 91, 122, 107, 167, }, + { 138, 153, 136, 167, 152, 152, }, +}; + +static const uint8_t INIT_MVP_IDX[3][NUM_MVP_IDX_CTX] = +{ + { 168 }, + { 168 }, + { CNU }, +}; + +static const uint8_t INIT_SAO_MERGE_FLAG[3][NUM_SAO_MERGE_FLAG_CTX] = +{ + { 153, }, + { 153, }, + { 153, }, +}; + +static const uint8_t INIT_SAO_TYPE_IDX[3][NUM_SAO_TYPE_IDX_CTX] = +{ + { 160, }, + { 185, }, + { 200, }, +}; + +static const uint8_t INIT_TRANS_SUBDIV_FLAG[3][NUM_TRANS_SUBDIV_FLAG_CTX] = +{ + { 224, 167, 122, }, + { 124, 138, 94, }, + { 153, 138, 138, }, +}; + +static const uint8_t INIT_TRANSFORMSKIP_FLAG[3][2 * NUM_TRANSFORMSKIP_FLAG_CTX] = +{ + { 139, 139 }, + { 139, 139 }, + { 139, 139 }, +}; +} + +#endif // ifndef X265_CONTEXTS_H diff --git a/source/common/cpu.cpp b/source/common/cpu.cpp new file mode 100644 index 0000000..8e5430e --- /dev/null +++ b/source/common/cpu.cpp @@ -0,0 +1,374 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Loren Merritt + * Laurent Aimar + * Fiona Glaser + * Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#include "cpu.h" +#include "common.h" + +#if MACOS || SYS_FREEBSD +#include +#include +#endif +#if SYS_OPENBSD +#include +#include +#include +#endif + +#if X265_ARCH_ARM && !defined(HAVE_NEON) +#include +#include +static sigjmp_buf jmpbuf; +static volatile sig_atomic_t canjump = 0; + +static void sigill_handler(int sig) +{ + if (!canjump) + { + signal(sig, SIG_DFL); + raise(sig); + } + + canjump = 0; + siglongjmp(jmpbuf, 1); +} + +#endif // if X265_ARCH_ARM + +namespace x265 { +const cpu_name_t cpu_names[] = +{ +#if X265_ARCH_X86 +#define MMX2 X265_CPU_MMX | X265_CPU_MMX2 | X265_CPU_CMOV + { "MMX2", MMX2 }, + { "MMXEXT", MMX2 }, + { "SSE", MMX2 | X265_CPU_SSE }, +#define SSE2 MMX2 | X265_CPU_SSE | X265_CPU_SSE2 + { "SSE2Slow", SSE2 | X265_CPU_SSE2_IS_SLOW }, + { "SSE2", SSE2 }, + { "SSE2Fast", SSE2 | X265_CPU_SSE2_IS_FAST }, + { "SSE3", SSE2 | X265_CPU_SSE3 }, + { "SSSE3", SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 }, + { "SSE4.1", SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 | X265_CPU_SSE4 }, + { "SSE4", SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 | X265_CPU_SSE4 }, + { "SSE4.2", SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 | X265_CPU_SSE4 | X265_CPU_SSE42 }, +#define AVX SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 | X265_CPU_SSE4 | X265_CPU_SSE42 | X265_CPU_AVX + { "AVX", AVX }, + { "XOP", AVX | X265_CPU_XOP }, + { "FMA4", AVX | X265_CPU_FMA4 }, + { "AVX2", AVX | X265_CPU_AVX2 }, + { "FMA3", AVX | X265_CPU_FMA3 }, +#undef AVX +#undef SSE2 +#undef MMX2 + { "Cache32", X265_CPU_CACHELINE_32 }, + { "Cache64", X265_CPU_CACHELINE_64 }, + { "LZCNT", X265_CPU_LZCNT }, + 
{ "BMI1", X265_CPU_BMI1 }, + { "BMI2", X265_CPU_BMI1 | X265_CPU_BMI2 }, + { "SlowCTZ", X265_CPU_SLOW_CTZ }, + { "SlowAtom", X265_CPU_SLOW_ATOM }, + { "SlowPshufb", X265_CPU_SLOW_PSHUFB }, + { "SlowPalignr", X265_CPU_SLOW_PALIGNR }, + { "SlowShuffle", X265_CPU_SLOW_SHUFFLE }, + { "UnalignedStack", X265_CPU_STACK_MOD4 }, + +#elif X265_ARCH_ARM + { "ARMv6", X265_CPU_ARMV6 }, + { "NEON", X265_CPU_NEON }, + { "FastNeonMRC", X265_CPU_FAST_NEON_MRC }, +#endif // if X265_ARCH_X86 + { "", 0 }, +}; + +#if X265_ARCH_X86 + +extern "C" { +/* cpu-a.asm */ +int x265_cpu_cpuid_test(void); +void x265_cpu_cpuid(uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx); +void x265_cpu_xgetbv(uint32_t op, uint32_t *eax, uint32_t *edx); +} + +#if defined(_MSC_VER) +#pragma warning(disable: 4309) // truncation of constant value +#endif + +uint32_t cpu_detect(void) +{ + uint32_t cpu = 0; + + uint32_t eax, ebx, ecx, edx; + uint32_t vendor[4] = { 0 }; + uint32_t max_extended_cap, max_basic_cap; + +#if !X86_64 + if (!x265_cpu_cpuid_test()) + return 0; +#endif + + x265_cpu_cpuid(0, &eax, vendor + 0, vendor + 2, vendor + 1); + max_basic_cap = eax; + if (max_basic_cap == 0) + return 0; + + x265_cpu_cpuid(1, &eax, &ebx, &ecx, &edx); + if (edx & 0x00800000) + cpu |= X265_CPU_MMX; + else + return cpu; + if (edx & 0x02000000) + cpu |= X265_CPU_MMX2 | X265_CPU_SSE; + if (edx & 0x00008000) + cpu |= X265_CPU_CMOV; + else + return cpu; + if (edx & 0x04000000) + cpu |= X265_CPU_SSE2; + if (ecx & 0x00000001) + cpu |= X265_CPU_SSE3; + if (ecx & 0x00000200) + cpu |= X265_CPU_SSSE3; + if (ecx & 0x00080000) + cpu |= X265_CPU_SSE4; + if (ecx & 0x00100000) + cpu |= X265_CPU_SSE42; + /* Check OXSAVE and AVX bits */ + if ((ecx & 0x18000000) == 0x18000000) + { + /* Check for OS support */ + x265_cpu_xgetbv(0, &eax, &edx); + if ((eax & 0x6) == 0x6) + { + cpu |= X265_CPU_AVX; + if (ecx & 0x00001000) + cpu |= X265_CPU_FMA3; + } + } + + if (max_basic_cap >= 7) + { + x265_cpu_cpuid(7, &eax, &ebx, &ecx, 
&edx); + /* AVX2 requires OS support, but BMI1/2 don't. */ + if ((cpu & X265_CPU_AVX) && (ebx & 0x00000020)) + cpu |= X265_CPU_AVX2; + if (ebx & 0x00000008) + { + cpu |= X265_CPU_BMI1; + if (ebx & 0x00000100) + cpu |= X265_CPU_BMI2; + } + } + + if (cpu & X265_CPU_SSSE3) + cpu |= X265_CPU_SSE2_IS_FAST; + + x265_cpu_cpuid(0x80000000, &eax, &ebx, &ecx, &edx); + max_extended_cap = eax; + + if (max_extended_cap >= 0x80000001) + { + x265_cpu_cpuid(0x80000001, &eax, &ebx, &ecx, &edx); + + if (ecx & 0x00000020) + cpu |= X265_CPU_LZCNT; /* Supported by Intel chips starting with Haswell */ + if (ecx & 0x00000040) /* SSE4a, AMD only */ + { + int family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); + cpu |= X265_CPU_SSE2_IS_FAST; /* Phenom and later CPUs have fast SSE units */ + if (family == 0x14) + { + cpu &= ~X265_CPU_SSE2_IS_FAST; /* SSSE3 doesn't imply fast SSE anymore... */ + cpu |= X265_CPU_SSE2_IS_SLOW; /* Bobcat has 64-bit SIMD units */ + cpu |= X265_CPU_SLOW_PALIGNR; /* palignr is insanely slow on Bobcat */ + } + if (family == 0x16) + { + cpu |= X265_CPU_SLOW_PSHUFB; /* Jaguar's pshufb isn't that slow, but it's slow enough + * compared to alternate instruction sequences that this + * is equal or faster on almost all such functions. 
*/ + } + } + + if (cpu & X265_CPU_AVX) + { + if (ecx & 0x00000800) /* XOP */ + cpu |= X265_CPU_XOP; + if (ecx & 0x00010000) /* FMA4 */ + cpu |= X265_CPU_FMA4; + } + + if (!strcmp((char*)vendor, "AuthenticAMD")) + { + if (edx & 0x00400000) + cpu |= X265_CPU_MMX2; + if (!(cpu & X265_CPU_LZCNT)) + cpu |= X265_CPU_SLOW_CTZ; + if ((cpu & X265_CPU_SSE2) && !(cpu & X265_CPU_SSE2_IS_FAST)) + cpu |= X265_CPU_SSE2_IS_SLOW; /* AMD CPUs come in two types: terrible at SSE and great at it */ + } + } + + if (!strcmp((char*)vendor, "GenuineIntel")) + { + x265_cpu_cpuid(1, &eax, &ebx, &ecx, &edx); + int family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); + int model = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0); + if (family == 6) + { + /* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and 6/14 (core1 "yonah") + * theoretically support sse2, but it's significantly slower than mmx for + * almost all of x264's functions, so let's just pretend they don't. */ + if (model == 9 || model == 13 || model == 14) + { + cpu &= ~(X265_CPU_SSE2 | X265_CPU_SSE3); + X265_CHECK(!(cpu & (X265_CPU_SSSE3 | X265_CPU_SSE4)), "unexpected CPU ID %d\n", cpu); + } + /* Detect Atom CPU */ + else if (model == 28) + { + cpu |= X265_CPU_SLOW_ATOM; + cpu |= X265_CPU_SLOW_CTZ; + cpu |= X265_CPU_SLOW_PSHUFB; + } + + /* Conroe has a slow shuffle unit. Check the model number to make sure not + * to include crippled low-end Penryns and Nehalems that don't have SSE4. 
*/ + else if ((cpu & X265_CPU_SSSE3) && !(cpu & X265_CPU_SSE4) && model < 23) + cpu |= X265_CPU_SLOW_SHUFFLE; + } + } + + if ((!strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead")) && !(cpu & X265_CPU_SSE42)) + { + /* cacheline size is specified in 3 places, any of which may be missing */ + x265_cpu_cpuid(1, &eax, &ebx, &ecx, &edx); + int cache = (ebx & 0xff00) >> 5; // cflush size + if (!cache && max_extended_cap >= 0x80000006) + { + x265_cpu_cpuid(0x80000006, &eax, &ebx, &ecx, &edx); + cache = ecx & 0xff; // cacheline size + } + if (!cache && max_basic_cap >= 2) + { + // Cache and TLB Information + static const char cache32_ids[] = { 0x0a, 0x0c, 0x41, 0x42, 0x43, 0x44, 0x45, 0x82, 0x83, 0x84, 0x85, 0 }; + static const char cache64_ids[] = { 0x22, 0x23, 0x25, 0x29, 0x2c, 0x46, 0x47, 0x49, 0x60, 0x66, 0x67, + 0x68, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7c, 0x7f, 0x86, 0x87, 0 }; + uint32_t buf[4]; + int max, i = 0; + do + { + x265_cpu_cpuid(2, buf + 0, buf + 1, buf + 2, buf + 3); + max = buf[0] & 0xff; + buf[0] &= ~0xff; + for (int j = 0; j < 4; j++) + { + if (!(buf[j] >> 31)) + while (buf[j]) + { + if (strchr(cache32_ids, buf[j] & 0xff)) + cache = 32; + if (strchr(cache64_ids, buf[j] & 0xff)) + cache = 64; + buf[j] >>= 8; + } + } + } + while (++i < max); + } + + if (cache == 32) + cpu |= X265_CPU_CACHELINE_32; + else if (cache == 64) + cpu |= X265_CPU_CACHELINE_64; + else + x265_log(NULL, X265_LOG_WARNING, "unable to determine cacheline size\n"); + } + +#if BROKEN_STACK_ALIGNMENT + cpu |= X265_CPU_STACK_MOD4; +#endif + + return cpu; +} + +#elif X265_ARCH_ARM + +extern "C" { +void x265_cpu_neon_test(void); +int x265_cpu_fast_neon_mrc_test(void); +} + +uint32_t cpu_detect(void) +{ + int flags = 0; + +#if HAVE_ARMV6 + flags |= X265_CPU_ARMV6; + + // don't do this hack if compiled with -mfpu=neon +#if !HAVE_NEON + static void (* oldsig)(int); + oldsig = signal(SIGILL, sigill_handler); + if (sigsetjmp(jmpbuf, 1)) + { + signal(SIGILL, oldsig); + 
return flags; + } + + canjump = 1; + x265_cpu_neon_test(); + canjump = 0; + signal(SIGILL, oldsig); +#endif // if !HAVE_NEON + + flags |= X265_CPU_NEON; + + // fast neon -> arm (Cortex-A9) detection relies on user access to the + // cycle counter; this assumes ARMv7 performance counters. + // NEON requires at least ARMv7, ARMv8 may require changes here, but + // hopefully this hacky detection method will have been replaced by then. + // Note that there is potential for a race condition if another program or + // x264 instance disables or reinits the counters while x264 is using them, + // which may result in incorrect detection and the counters stuck enabled. + // right now Apple does not seem to support performance counters for this test +#ifndef __MACH__ + flags |= x265_cpu_fast_neon_mrc_test() ? X265_CPU_FAST_NEON_MRC : 0; +#endif + // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc) +#endif // if HAVE_ARMV6 + return flags; +} + +#else // if X265_ARCH_X86 + +uint32_t cpu_detect(void) +{ + return 0; +} + +#endif // if X265_ARCH_X86 +} diff --git a/source/common/cpu.h b/source/common/cpu.h new file mode 100644 index 0000000..6e498db --- /dev/null +++ b/source/common/cpu.h @@ -0,0 +1,59 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Loren Merritt + * Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#ifndef X265_CPU_H +#define X265_CPU_H + +#include "common.h" + +// from cpu-a.asm, if ASM primitives are compiled, else primitives.cpp +extern "C" void x265_cpu_emms(void); +extern "C" void x265_safe_intel_cpu_indicator_init(void); + +#if _MSC_VER && _WIN64 +#define x265_emms() x265_cpu_emms() +#elif _MSC_VER +#include +#define x265_emms() _mm_empty() +#elif __GNUC__ +// Cannot use _mm_empty() directly without compiling all the source with +// a fixed CPU arch, which we would like to avoid at the moment +#define x265_emms() x265_cpu_emms() +#else +#define x265_emms() x265_cpu_emms() +#endif + +namespace x265 { +uint32_t cpu_detect(void); + +struct cpu_name_t +{ + char name[16]; + uint32_t flags; +}; + +extern const cpu_name_t cpu_names[]; +} + +#endif // ifndef X265_CPU_H diff --git a/source/common/cudata.cpp b/source/common/cudata.cpp new file mode 100644 index 0000000..d28e005 --- /dev/null +++ b/source/common/cudata.cpp @@ -0,0 +1,2139 @@ +/***************************************************************************** + * Copyright (C) 2014 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#include "common.h" +#include "frame.h" +#include "framedata.h" +#include "picyuv.h" +#include "mv.h" +#include "cudata.h" + +using namespace x265; + +namespace { +// file private namespace + +/* for all bcast* and copy* functions, dst and src are aligned to MIN(size, 32) */ + +void bcast1(uint8_t* dst, uint8_t val) { dst[0] = val; } + +void copy4(uint8_t* dst, uint8_t* src) { ((uint32_t*)dst)[0] = ((uint32_t*)src)[0]; } +void bcast4(uint8_t* dst, uint8_t val) { ((uint32_t*)dst)[0] = 0x01010101 * val; } + +void copy16(uint8_t* dst, uint8_t* src) { ((uint64_t*)dst)[0] = ((uint64_t*)src)[0]; ((uint64_t*)dst)[1] = ((uint64_t*)src)[1]; } +void bcast16(uint8_t* dst, uint8_t val) { uint64_t bval = 0x0101010101010101ULL * val; ((uint64_t*)dst)[0] = bval; ((uint64_t*)dst)[1] = bval; } + +void copy64(uint8_t* dst, uint8_t* src) { ((uint64_t*)dst)[0] = ((uint64_t*)src)[0]; ((uint64_t*)dst)[1] = ((uint64_t*)src)[1]; + ((uint64_t*)dst)[2] = ((uint64_t*)src)[2]; ((uint64_t*)dst)[3] = ((uint64_t*)src)[3]; + ((uint64_t*)dst)[4] = ((uint64_t*)src)[4]; ((uint64_t*)dst)[5] = ((uint64_t*)src)[5]; + ((uint64_t*)dst)[6] = ((uint64_t*)src)[6]; ((uint64_t*)dst)[7] = ((uint64_t*)src)[7]; } +void bcast64(uint8_t* dst, uint8_t val) { uint64_t bval = 0x0101010101010101ULL * val; + ((uint64_t*)dst)[0] = bval; ((uint64_t*)dst)[1] = bval; ((uint64_t*)dst)[2] = bval; ((uint64_t*)dst)[3] = bval; + ((uint64_t*)dst)[4] = bval; ((uint64_t*)dst)[5] = bval; ((uint64_t*)dst)[6] = bval; ((uint64_t*)dst)[7] = bval; } 
+ +/* at 256 bytes, memset/memcpy will probably use SIMD more effectively than our uint64_t hack, + * but hand-written assembly would beat it. */ +void copy256(uint8_t* dst, uint8_t* src) { memcpy(dst, src, 256); } +void bcast256(uint8_t* dst, uint8_t val) { memset(dst, val, 256); } + +/* Check whether 2 addresses point to the same column */ +inline bool isEqualCol(int addrA, int addrB, int numUnitsPerRow) +{ + // addrA % numUnitsPerRow == addrB % numUnitsPerRow + return ((addrA ^ addrB) & (numUnitsPerRow - 1)) == 0; +} + +/* Check whether 2 addresses point to the same row */ +inline bool isEqualRow(int addrA, int addrB, int numUnitsPerRow) +{ + // addrA / numUnitsPerRow == addrB / numUnitsPerRow + return ((addrA ^ addrB) & ~(numUnitsPerRow - 1)) == 0; +} + +/* Check whether 2 addresses point to the same row or column */ +inline bool isEqualRowOrCol(int addrA, int addrB, int numUnitsPerRow) +{ + return isEqualCol(addrA, addrB, numUnitsPerRow) | isEqualRow(addrA, addrB, numUnitsPerRow); +} + +/* Check whether one address points to the first column */ +inline bool isZeroCol(int addr, int numUnitsPerRow) +{ + // addr % numUnitsPerRow == 0 + return (addr & (numUnitsPerRow - 1)) == 0; +} + +/* Check whether one address points to the first row */ +inline bool isZeroRow(int addr, int numUnitsPerRow) +{ + // addr / numUnitsPerRow == 0 + return (addr & ~(numUnitsPerRow - 1)) == 0; +} + +/* Check whether one address points to a column whose index is smaller than a given value */ +inline bool lessThanCol(int addr, int val, int numUnitsPerRow) +{ + // addr % numUnitsPerRow < val + return (addr & (numUnitsPerRow - 1)) < val; +} + +/* Check whether one address points to a row whose index is smaller than a given value */ +inline bool lessThanRow(int addr, int val, int numUnitsPerRow) +{ + // addr / numUnitsPerRow < val + return addr < val * numUnitsPerRow; +} + +inline MV scaleMv(MV mv, int scale) +{ + int mvx = Clip3(-32768, 32767, (scale * mv.x + 127 + (scale * mv.x < 0)) >> 
8); + int mvy = Clip3(-32768, 32767, (scale * mv.y + 127 + (scale * mv.y < 0)) >> 8); + + return MV((int16_t)mvx, (int16_t)mvy); +} + +// Partition table. +// First index is partitioning mode. Second index is partition index. +// Third index is 0 for partition sizes, 1 for partition offsets. The +// sizes and offsets are encoded as two packed 4-bit values (X,Y). +// X and Y represent 1/4 fractions of the block size. +const uint32_t partTable[8][4][2] = +{ + // XY + { { 0x44, 0x00 }, { 0x00, 0x00 }, { 0x00, 0x00 }, { 0x00, 0x00 } }, // SIZE_2Nx2N. + { { 0x42, 0x00 }, { 0x42, 0x02 }, { 0x00, 0x00 }, { 0x00, 0x00 } }, // SIZE_2NxN. + { { 0x24, 0x00 }, { 0x24, 0x20 }, { 0x00, 0x00 }, { 0x00, 0x00 } }, // SIZE_Nx2N. + { { 0x22, 0x00 }, { 0x22, 0x20 }, { 0x22, 0x02 }, { 0x22, 0x22 } }, // SIZE_NxN. + { { 0x41, 0x00 }, { 0x43, 0x01 }, { 0x00, 0x00 }, { 0x00, 0x00 } }, // SIZE_2NxnU. + { { 0x43, 0x00 }, { 0x41, 0x03 }, { 0x00, 0x00 }, { 0x00, 0x00 } }, // SIZE_2NxnD. + { { 0x14, 0x00 }, { 0x34, 0x10 }, { 0x00, 0x00 }, { 0x00, 0x00 } }, // SIZE_nLx2N. + { { 0x34, 0x00 }, { 0x14, 0x30 }, { 0x00, 0x00 }, { 0x00, 0x00 } } // SIZE_nRx2N. +}; + +// Partition Address table. +// First index is partitioning mode. Second index is partition address. +const uint32_t partAddrTable[8][4] = +{ + { 0x00, 0x00, 0x00, 0x00 }, // SIZE_2Nx2N. + { 0x00, 0x08, 0x08, 0x08 }, // SIZE_2NxN. + { 0x00, 0x04, 0x04, 0x04 }, // SIZE_Nx2N. + { 0x00, 0x04, 0x08, 0x0C }, // SIZE_NxN. + { 0x00, 0x02, 0x02, 0x02 }, // SIZE_2NxnU. + { 0x00, 0x0A, 0x0A, 0x0A }, // SIZE_2NxnD. + { 0x00, 0x01, 0x01, 0x01 }, // SIZE_nLx2N. + { 0x00, 0x05, 0x05, 0x05 } // SIZE_nRx2N. 
+}; + +} + +cubcast_t CUData::s_partSet[NUM_FULL_DEPTH] = { NULL, NULL, NULL, NULL, NULL }; +uint32_t CUData::s_numPartInCUSize; + +CUData::CUData() +{ + memset(this, 0, sizeof(*this)); +} + +void CUData::initialize(const CUDataMemPool& dataPool, uint32_t depth, int csp, int instance) +{ + m_chromaFormat = csp; + m_hChromaShift = CHROMA_H_SHIFT(csp); + m_vChromaShift = CHROMA_V_SHIFT(csp); + m_numPartitions = NUM_CU_PARTITIONS >> (depth * 2); + + if (!s_partSet[0]) + { + s_numPartInCUSize = 1 << g_maxFullDepth; + switch (g_maxLog2CUSize) + { + case 6: + s_partSet[0] = bcast256; + s_partSet[1] = bcast64; + s_partSet[2] = bcast16; + s_partSet[3] = bcast4; + s_partSet[4] = bcast1; + break; + case 5: + s_partSet[0] = bcast64; + s_partSet[1] = bcast16; + s_partSet[2] = bcast4; + s_partSet[3] = bcast1; + s_partSet[4] = NULL; + break; + case 4: + s_partSet[0] = bcast16; + s_partSet[1] = bcast4; + s_partSet[2] = bcast1; + s_partSet[3] = NULL; + s_partSet[4] = NULL; + break; + default: + X265_CHECK(0, "unexpected CTU size\n"); + break; + } + } + + switch (m_numPartitions) + { + case 256: // 64x64 CU + m_partCopy = copy256; + m_partSet = bcast256; + m_subPartCopy = copy64; + m_subPartSet = bcast64; + break; + case 64: // 32x32 CU + m_partCopy = copy64; + m_partSet = bcast64; + m_subPartCopy = copy16; + m_subPartSet = bcast16; + break; + case 16: // 16x16 CU + m_partCopy = copy16; + m_partSet = bcast16; + m_subPartCopy = copy4; + m_subPartSet = bcast4; + break; + case 4: // 8x8 CU + m_partCopy = copy4; + m_partSet = bcast4; + m_subPartCopy = NULL; + m_subPartSet = NULL; + break; + default: + X265_CHECK(0, "unexpected CU partition count\n"); + break; + } + + /* Each CU's data is layed out sequentially within the charMemBlock */ + uint8_t *charBuf = dataPool.charMemBlock + (m_numPartitions * BytesPerPartition) * instance; + + m_qp = (char*)charBuf; charBuf += m_numPartitions; + m_log2CUSize = charBuf; charBuf += m_numPartitions; + m_partSize = charBuf; charBuf += 
m_numPartitions; + m_predMode = charBuf; charBuf += m_numPartitions; + m_lumaIntraDir = charBuf; charBuf += m_numPartitions; + m_tqBypass = charBuf; charBuf += m_numPartitions; + m_refIdx[0] = (char*)charBuf; charBuf += m_numPartitions; + m_refIdx[1] = (char*)charBuf; charBuf += m_numPartitions; + m_cuDepth = charBuf; charBuf += m_numPartitions; + m_skipFlag = charBuf; charBuf += m_numPartitions; /* the order up to here is important in initCTU() and initSubCU() */ + m_mergeFlag = charBuf; charBuf += m_numPartitions; + m_interDir = charBuf; charBuf += m_numPartitions; + m_mvpIdx[0] = charBuf; charBuf += m_numPartitions; + m_mvpIdx[1] = charBuf; charBuf += m_numPartitions; + m_tuDepth = charBuf; charBuf += m_numPartitions; + m_transformSkip[0] = charBuf; charBuf += m_numPartitions; + m_transformSkip[1] = charBuf; charBuf += m_numPartitions; + m_transformSkip[2] = charBuf; charBuf += m_numPartitions; + m_cbf[0] = charBuf; charBuf += m_numPartitions; + m_cbf[1] = charBuf; charBuf += m_numPartitions; + m_cbf[2] = charBuf; charBuf += m_numPartitions; + m_chromaIntraDir = charBuf; charBuf += m_numPartitions; + + X265_CHECK(charBuf == dataPool.charMemBlock + (m_numPartitions * BytesPerPartition) * (instance + 1), "CU data layout is broken\n"); + + m_mv[0] = dataPool.mvMemBlock + (instance * 4) * m_numPartitions; + m_mv[1] = m_mv[0] + m_numPartitions; + m_mvd[0] = m_mv[1] + m_numPartitions; + m_mvd[1] = m_mvd[0] + m_numPartitions; + + uint32_t cuSize = g_maxCUSize >> depth; + uint32_t sizeL = cuSize * cuSize; + uint32_t sizeC = sizeL >> (m_hChromaShift + m_vChromaShift); + m_trCoeff[0] = dataPool.trCoeffMemBlock + instance * (sizeL + sizeC * 2); + m_trCoeff[1] = m_trCoeff[0] + sizeL; + m_trCoeff[2] = m_trCoeff[0] + sizeL + sizeC; +} + +void CUData::initCTU(const Frame& frame, uint32_t cuAddr, int qp) +{ + m_encData = frame.m_encData; + m_slice = m_encData->m_slice; + m_cuAddr = cuAddr; + m_cuPelX = (cuAddr % m_slice->m_sps->numCuInWidth) << g_maxLog2CUSize; + m_cuPelY = 
(cuAddr / m_slice->m_sps->numCuInWidth) << g_maxLog2CUSize; + m_absIdxInCTU = 0; + m_numPartitions = NUM_CU_PARTITIONS; + + /* sequential memsets */ + m_partSet((uint8_t*)m_qp, (uint8_t)qp); + m_partSet(m_log2CUSize, (uint8_t)g_maxLog2CUSize); + m_partSet(m_partSize, (uint8_t)SIZE_NONE); + m_partSet(m_predMode, (uint8_t)MODE_NONE); + m_partSet(m_lumaIntraDir, (uint8_t)DC_IDX); + m_partSet(m_tqBypass, (uint8_t)frame.m_encData->m_param->bLossless); + if (m_slice->m_sliceType != I_SLICE) + { + m_partSet((uint8_t*)m_refIdx[0], (uint8_t)REF_NOT_VALID); + m_partSet((uint8_t*)m_refIdx[1], (uint8_t)REF_NOT_VALID); + } + + X265_CHECK(!(frame.m_encData->m_param->bLossless && !m_slice->m_pps->bTransquantBypassEnabled), "lossless enabled without TQbypass in PPS\n"); + + /* initialize the remaining CU data in one memset */ + memset(m_cuDepth, 0, (BytesPerPartition - 8) * m_numPartitions); + + uint32_t widthInCU = m_slice->m_sps->numCuInWidth; + m_cuLeft = (m_cuAddr % widthInCU) ? m_encData->getPicCTU(m_cuAddr - 1) : NULL; + m_cuAbove = (m_cuAddr / widthInCU) ? m_encData->getPicCTU(m_cuAddr - widthInCU) : NULL; + m_cuAboveLeft = (m_cuLeft && m_cuAbove) ? m_encData->getPicCTU(m_cuAddr - widthInCU - 1) : NULL; + m_cuAboveRight = (m_cuAbove && ((m_cuAddr % widthInCU) < (widthInCU - 1))) ? 
m_encData->getPicCTU(m_cuAddr - widthInCU + 1) : NULL; +} + +// initialize Sub partition +void CUData::initSubCU(const CUData& ctu, const CUGeom& cuGeom) +{ + m_absIdxInCTU = cuGeom.encodeIdx; + m_encData = ctu.m_encData; + m_slice = ctu.m_slice; + m_cuAddr = ctu.m_cuAddr; + m_cuPelX = ctu.m_cuPelX + g_zscanToPelX[cuGeom.encodeIdx]; + m_cuPelY = ctu.m_cuPelY + g_zscanToPelY[cuGeom.encodeIdx]; + m_cuLeft = ctu.m_cuLeft; + m_cuAbove = ctu.m_cuAbove; + m_cuAboveLeft = ctu.m_cuAboveLeft; + m_cuAboveRight = ctu.m_cuAboveRight; + X265_CHECK(m_numPartitions == cuGeom.numPartitions, "initSubCU() size mismatch\n"); + + /* sequential memsets */ + m_partSet((uint8_t*)m_qp, (uint8_t)ctu.m_qp[0]); + m_partSet(m_log2CUSize, (uint8_t)cuGeom.log2CUSize); + m_partSet(m_partSize, (uint8_t)SIZE_NONE); + m_partSet(m_predMode, (uint8_t)MODE_NONE); + m_partSet(m_lumaIntraDir, (uint8_t)DC_IDX); + m_partSet(m_tqBypass, (uint8_t)m_encData->m_param->bLossless); + m_partSet((uint8_t*)m_refIdx[0], (uint8_t)REF_NOT_VALID); + m_partSet((uint8_t*)m_refIdx[1], (uint8_t)REF_NOT_VALID); + m_partSet(m_cuDepth, (uint8_t)cuGeom.depth); + + /* initialize the remaining CU data in one memset */ + memset(m_skipFlag, 0, (BytesPerPartition - 9) * m_numPartitions); +} + +/* Copy the results of a sub-part (split) CU to the parent CU */ +void CUData::copyPartFrom(const CUData& subCU, const CUGeom& childGeom, uint32_t subPartIdx) +{ + X265_CHECK(subPartIdx < 4, "part unit should be less than 4\n"); + + uint32_t offset = childGeom.numPartitions * subPartIdx; + + m_subPartCopy((uint8_t*)m_qp + offset, (uint8_t*)subCU.m_qp); + m_subPartCopy(m_log2CUSize + offset, subCU.m_log2CUSize); + m_subPartCopy(m_partSize + offset, subCU.m_partSize); + m_subPartCopy(m_predMode + offset, subCU.m_predMode); + m_subPartCopy(m_lumaIntraDir + offset, subCU.m_lumaIntraDir); + m_subPartCopy(m_tqBypass + offset, subCU.m_tqBypass); + m_subPartCopy((uint8_t*)m_refIdx[0] + offset, (uint8_t*)subCU.m_refIdx[0]); + 
m_subPartCopy((uint8_t*)m_refIdx[1] + offset, (uint8_t*)subCU.m_refIdx[1]); + m_subPartCopy(m_cuDepth + offset, subCU.m_cuDepth); + m_subPartCopy(m_skipFlag + offset, subCU.m_skipFlag); + m_subPartCopy(m_mergeFlag + offset, subCU.m_mergeFlag); + m_subPartCopy(m_interDir + offset, subCU.m_interDir); + m_subPartCopy(m_mvpIdx[0] + offset, subCU.m_mvpIdx[0]); + m_subPartCopy(m_mvpIdx[1] + offset, subCU.m_mvpIdx[1]); + m_subPartCopy(m_tuDepth + offset, subCU.m_tuDepth); + m_subPartCopy(m_transformSkip[0] + offset, subCU.m_transformSkip[0]); + m_subPartCopy(m_transformSkip[1] + offset, subCU.m_transformSkip[1]); + m_subPartCopy(m_transformSkip[2] + offset, subCU.m_transformSkip[2]); + m_subPartCopy(m_cbf[0] + offset, subCU.m_cbf[0]); + m_subPartCopy(m_cbf[1] + offset, subCU.m_cbf[1]); + m_subPartCopy(m_cbf[2] + offset, subCU.m_cbf[2]); + m_subPartCopy(m_chromaIntraDir + offset, subCU.m_chromaIntraDir); + + memcpy(m_mv[0] + offset, subCU.m_mv[0], childGeom.numPartitions * sizeof(MV)); + memcpy(m_mv[1] + offset, subCU.m_mv[1], childGeom.numPartitions * sizeof(MV)); + memcpy(m_mvd[0] + offset, subCU.m_mvd[0], childGeom.numPartitions * sizeof(MV)); + memcpy(m_mvd[1] + offset, subCU.m_mvd[1], childGeom.numPartitions * sizeof(MV)); + + uint32_t tmp = 1 << ((g_maxLog2CUSize - childGeom.depth) * 2); + uint32_t tmp2 = subPartIdx * tmp; + memcpy(m_trCoeff[0] + tmp2, subCU.m_trCoeff[0], sizeof(coeff_t) * tmp); + + uint32_t tmpC = tmp >> (m_hChromaShift + m_vChromaShift); + uint32_t tmpC2 = tmp2 >> (m_hChromaShift + m_vChromaShift); + memcpy(m_trCoeff[1] + tmpC2, subCU.m_trCoeff[1], sizeof(coeff_t) * tmpC); + memcpy(m_trCoeff[2] + tmpC2, subCU.m_trCoeff[2], sizeof(coeff_t) * tmpC); +} + +/* If a sub-CU part is not present (off the edge of the picture) its depth and + * log2size should still be configured */ +void CUData::setEmptyPart(const CUGeom& childGeom, uint32_t subPartIdx) +{ + uint32_t offset = childGeom.numPartitions * subPartIdx; + m_subPartSet(m_cuDepth + offset, 
(uint8_t)childGeom.depth); + m_subPartSet(m_log2CUSize + offset, (uint8_t)childGeom.log2CUSize); +} + +/* Copy all CU data from one instance to the next, except set lossless flag + * This will only get used when --cu-lossless is enabled but --lossless is not. */ +void CUData::initLosslessCU(const CUData& cu, const CUGeom& cuGeom) +{ + /* Start by making an exact copy */ + m_encData = cu.m_encData; + m_slice = cu.m_slice; + m_cuAddr = cu.m_cuAddr; + m_cuPelX = cu.m_cuPelX; + m_cuPelY = cu.m_cuPelY; + m_cuLeft = cu.m_cuLeft; + m_cuAbove = cu.m_cuAbove; + m_cuAboveLeft = cu.m_cuAboveLeft; + m_cuAboveRight = cu.m_cuAboveRight; + m_absIdxInCTU = cuGeom.encodeIdx; + m_numPartitions = cuGeom.numPartitions; + memcpy(m_qp, cu.m_qp, BytesPerPartition * m_numPartitions); + memcpy(m_mv[0], cu.m_mv[0], m_numPartitions * sizeof(MV)); + memcpy(m_mv[1], cu.m_mv[1], m_numPartitions * sizeof(MV)); + memcpy(m_mvd[0], cu.m_mvd[0], m_numPartitions * sizeof(MV)); + memcpy(m_mvd[1], cu.m_mvd[1], m_numPartitions * sizeof(MV)); + + /* force TQBypass to true */ + m_partSet(m_tqBypass, true); + + /* clear residual coding flags */ + m_partSet(m_skipFlag, 0); + m_partSet(m_tuDepth, 0); + m_partSet(m_transformSkip[0], 0); + m_partSet(m_transformSkip[1], 0); + m_partSet(m_transformSkip[2], 0); + m_partSet(m_cbf[0], 0); + m_partSet(m_cbf[1], 0); + m_partSet(m_cbf[2], 0); +} + +/* Copy completed predicted CU to CTU in picture */ +void CUData::copyToPic(uint32_t depth) const +{ + CUData& ctu = *m_encData->getPicCTU(m_cuAddr); + + m_partCopy((uint8_t*)ctu.m_qp + m_absIdxInCTU, (uint8_t*)m_qp); + m_partCopy(ctu.m_log2CUSize + m_absIdxInCTU, m_log2CUSize); + m_partCopy(ctu.m_partSize + m_absIdxInCTU, m_partSize); + m_partCopy(ctu.m_predMode + m_absIdxInCTU, m_predMode); + m_partCopy(ctu.m_lumaIntraDir + m_absIdxInCTU, m_lumaIntraDir); + m_partCopy(ctu.m_tqBypass + m_absIdxInCTU, m_tqBypass); + m_partCopy((uint8_t*)ctu.m_refIdx[0] + m_absIdxInCTU, (uint8_t*)m_refIdx[0]); + 
m_partCopy((uint8_t*)ctu.m_refIdx[1] + m_absIdxInCTU, (uint8_t*)m_refIdx[1]); + m_partCopy(ctu.m_cuDepth + m_absIdxInCTU, m_cuDepth); + m_partCopy(ctu.m_skipFlag + m_absIdxInCTU, m_skipFlag); + m_partCopy(ctu.m_mergeFlag + m_absIdxInCTU, m_mergeFlag); + m_partCopy(ctu.m_interDir + m_absIdxInCTU, m_interDir); + m_partCopy(ctu.m_mvpIdx[0] + m_absIdxInCTU, m_mvpIdx[0]); + m_partCopy(ctu.m_mvpIdx[1] + m_absIdxInCTU, m_mvpIdx[1]); + m_partCopy(ctu.m_tuDepth + m_absIdxInCTU, m_tuDepth); + m_partCopy(ctu.m_transformSkip[0] + m_absIdxInCTU, m_transformSkip[0]); + m_partCopy(ctu.m_transformSkip[1] + m_absIdxInCTU, m_transformSkip[1]); + m_partCopy(ctu.m_transformSkip[2] + m_absIdxInCTU, m_transformSkip[2]); + m_partCopy(ctu.m_cbf[0] + m_absIdxInCTU, m_cbf[0]); + m_partCopy(ctu.m_cbf[1] + m_absIdxInCTU, m_cbf[1]); + m_partCopy(ctu.m_cbf[2] + m_absIdxInCTU, m_cbf[2]); + m_partCopy(ctu.m_chromaIntraDir + m_absIdxInCTU, m_chromaIntraDir); + + memcpy(ctu.m_mv[0] + m_absIdxInCTU, m_mv[0], m_numPartitions * sizeof(MV)); + memcpy(ctu.m_mv[1] + m_absIdxInCTU, m_mv[1], m_numPartitions * sizeof(MV)); + memcpy(ctu.m_mvd[0] + m_absIdxInCTU, m_mvd[0], m_numPartitions * sizeof(MV)); + memcpy(ctu.m_mvd[1] + m_absIdxInCTU, m_mvd[1], m_numPartitions * sizeof(MV)); + + uint32_t tmpY = 1 << ((g_maxLog2CUSize - depth) * 2); + uint32_t tmpY2 = m_absIdxInCTU << (LOG2_UNIT_SIZE * 2); + memcpy(ctu.m_trCoeff[0] + tmpY2, m_trCoeff[0], sizeof(coeff_t) * tmpY); + + uint32_t tmpC = tmpY >> (m_hChromaShift + m_vChromaShift); + uint32_t tmpC2 = tmpY2 >> (m_hChromaShift + m_vChromaShift); + memcpy(ctu.m_trCoeff[1] + tmpC2, m_trCoeff[1], sizeof(coeff_t) * tmpC); + memcpy(ctu.m_trCoeff[2] + tmpC2, m_trCoeff[2], sizeof(coeff_t) * tmpC); +} + +/* The reverse of copyToPic, called only by encodeResidue */ +void CUData::copyFromPic(const CUData& ctu, const CUGeom& cuGeom) +{ + m_encData = ctu.m_encData; + m_slice = ctu.m_slice; + m_cuAddr = ctu.m_cuAddr; + m_cuPelX = ctu.m_cuPelX + 
g_zscanToPelX[cuGeom.encodeIdx]; + m_cuPelY = ctu.m_cuPelY + g_zscanToPelY[cuGeom.encodeIdx]; + m_absIdxInCTU = cuGeom.encodeIdx; + m_numPartitions = cuGeom.numPartitions; + + /* copy out all prediction info for this part */ + m_partCopy((uint8_t*)m_qp, (uint8_t*)ctu.m_qp + m_absIdxInCTU); + m_partCopy(m_log2CUSize, ctu.m_log2CUSize + m_absIdxInCTU); + m_partCopy(m_partSize, ctu.m_partSize + m_absIdxInCTU); + m_partCopy(m_predMode, ctu.m_predMode + m_absIdxInCTU); + m_partCopy(m_lumaIntraDir, ctu.m_lumaIntraDir + m_absIdxInCTU); + m_partCopy(m_tqBypass, ctu.m_tqBypass + m_absIdxInCTU); + m_partCopy((uint8_t*)m_refIdx[0], (uint8_t*)ctu.m_refIdx[0] + m_absIdxInCTU); + m_partCopy((uint8_t*)m_refIdx[1], (uint8_t*)ctu.m_refIdx[1] + m_absIdxInCTU); + m_partCopy(m_cuDepth, ctu.m_cuDepth + m_absIdxInCTU); + m_partCopy(m_mergeFlag, ctu.m_mergeFlag + m_absIdxInCTU); + m_partCopy(m_interDir, ctu.m_interDir + m_absIdxInCTU); + m_partCopy(m_mvpIdx[0], ctu.m_mvpIdx[0] + m_absIdxInCTU); + m_partCopy(m_mvpIdx[1], ctu.m_mvpIdx[1] + m_absIdxInCTU); + m_partCopy(m_chromaIntraDir, ctu.m_chromaIntraDir + m_absIdxInCTU); + + memcpy(m_mv[0], ctu.m_mv[0] + m_absIdxInCTU, m_numPartitions * sizeof(MV)); + memcpy(m_mv[1], ctu.m_mv[1] + m_absIdxInCTU, m_numPartitions * sizeof(MV)); + memcpy(m_mvd[0], ctu.m_mvd[0] + m_absIdxInCTU, m_numPartitions * sizeof(MV)); + memcpy(m_mvd[1], ctu.m_mvd[1] + m_absIdxInCTU, m_numPartitions * sizeof(MV)); + + /* clear residual coding flags */ + m_partSet(m_skipFlag, 0); + m_partSet(m_tuDepth, 0); + m_partSet(m_transformSkip[0], 0); + m_partSet(m_transformSkip[1], 0); + m_partSet(m_transformSkip[2], 0); + m_partSet(m_cbf[0], 0); + m_partSet(m_cbf[1], 0); + m_partSet(m_cbf[2], 0); +} + +/* Only called by encodeResidue, these fields can be modified during inter/intra coding */ +void CUData::updatePic(uint32_t depth) const +{ + CUData& ctu = *m_encData->getPicCTU(m_cuAddr); + + m_partCopy((uint8_t*)ctu.m_qp + m_absIdxInCTU, (uint8_t*)m_qp); + 
m_partCopy(ctu.m_transformSkip[0] + m_absIdxInCTU, m_transformSkip[0]); + m_partCopy(ctu.m_transformSkip[1] + m_absIdxInCTU, m_transformSkip[1]); + m_partCopy(ctu.m_transformSkip[2] + m_absIdxInCTU, m_transformSkip[2]); + m_partCopy(ctu.m_skipFlag + m_absIdxInCTU, m_skipFlag); + m_partCopy(ctu.m_tuDepth + m_absIdxInCTU, m_tuDepth); + m_partCopy(ctu.m_cbf[0] + m_absIdxInCTU, m_cbf[0]); + m_partCopy(ctu.m_cbf[1] + m_absIdxInCTU, m_cbf[1]); + m_partCopy(ctu.m_cbf[2] + m_absIdxInCTU, m_cbf[2]); + m_partCopy(ctu.m_chromaIntraDir + m_absIdxInCTU, m_chromaIntraDir); + + uint32_t tmpY = 1 << ((g_maxLog2CUSize - depth) * 2); + uint32_t tmpY2 = m_absIdxInCTU << (LOG2_UNIT_SIZE * 2); + memcpy(ctu.m_trCoeff[0] + tmpY2, m_trCoeff[0], sizeof(coeff_t) * tmpY); + tmpY >>= m_hChromaShift + m_vChromaShift; + tmpY2 >>= m_hChromaShift + m_vChromaShift; + memcpy(ctu.m_trCoeff[1] + tmpY2, m_trCoeff[1], sizeof(coeff_t) * tmpY); + memcpy(ctu.m_trCoeff[2] + tmpY2, m_trCoeff[2], sizeof(coeff_t) * tmpY); +} + +const CUData* CUData::getPULeft(uint32_t& lPartUnitIdx, uint32_t curPartUnitIdx) const +{ + uint32_t absPartIdx = g_zscanToRaster[curPartUnitIdx]; + + if (!isZeroCol(absPartIdx, s_numPartInCUSize)) + { + uint32_t absZorderCUIdx = g_zscanToRaster[m_absIdxInCTU]; + lPartUnitIdx = g_rasterToZscan[absPartIdx - 1]; + if (isEqualCol(absPartIdx, absZorderCUIdx, s_numPartInCUSize)) + return m_encData->getPicCTU(m_cuAddr); + else + { + lPartUnitIdx -= m_absIdxInCTU; + return this; + } + } + + lPartUnitIdx = g_rasterToZscan[absPartIdx + s_numPartInCUSize - 1]; + return m_cuLeft; +} + +const CUData* CUData::getPUAbove(uint32_t& aPartUnitIdx, uint32_t curPartUnitIdx, bool planarAtCTUBoundary) const +{ + uint32_t absPartIdx = g_zscanToRaster[curPartUnitIdx]; + + if (!isZeroRow(absPartIdx, s_numPartInCUSize)) + { + uint32_t absZorderCUIdx = g_zscanToRaster[m_absIdxInCTU]; + aPartUnitIdx = g_rasterToZscan[absPartIdx - s_numPartInCUSize]; + if (isEqualRow(absPartIdx, absZorderCUIdx, 
s_numPartInCUSize)) + return m_encData->getPicCTU(m_cuAddr); + else + { + aPartUnitIdx -= m_absIdxInCTU; + return this; + } + } + + if (planarAtCTUBoundary) + return NULL; + + aPartUnitIdx = g_rasterToZscan[absPartIdx + NUM_CU_PARTITIONS - s_numPartInCUSize]; + return m_cuAbove; +} + +const CUData* CUData::getPUAboveLeft(uint32_t& alPartUnitIdx, uint32_t curPartUnitIdx) const +{ + uint32_t absPartIdx = g_zscanToRaster[curPartUnitIdx]; + + if (!isZeroCol(absPartIdx, s_numPartInCUSize)) + { + if (!isZeroRow(absPartIdx, s_numPartInCUSize)) + { + uint32_t absZorderCUIdx = g_zscanToRaster[m_absIdxInCTU]; + alPartUnitIdx = g_rasterToZscan[absPartIdx - s_numPartInCUSize - 1]; + if (isEqualRowOrCol(absPartIdx, absZorderCUIdx, s_numPartInCUSize)) + return m_encData->getPicCTU(m_cuAddr); + else + { + alPartUnitIdx -= m_absIdxInCTU; + return this; + } + } + alPartUnitIdx = g_rasterToZscan[absPartIdx + NUM_CU_PARTITIONS - s_numPartInCUSize - 1]; + return m_cuAbove; + } + + if (!isZeroRow(absPartIdx, s_numPartInCUSize)) + { + alPartUnitIdx = g_rasterToZscan[absPartIdx - 1]; + return m_cuLeft; + } + + alPartUnitIdx = g_rasterToZscan[NUM_CU_PARTITIONS - 1]; + return m_cuAboveLeft; +} + +const CUData* CUData::getPUAboveRight(uint32_t& arPartUnitIdx, uint32_t curPartUnitIdx) const +{ + if ((m_encData->getPicCTU(m_cuAddr)->m_cuPelX + g_zscanToPelX[curPartUnitIdx] + UNIT_SIZE) >= m_slice->m_sps->picWidthInLumaSamples) + return NULL; + + uint32_t absPartIdxRT = g_zscanToRaster[curPartUnitIdx]; + + if (lessThanCol(absPartIdxRT, s_numPartInCUSize - 1, s_numPartInCUSize)) + { + if (!isZeroRow(absPartIdxRT, s_numPartInCUSize)) + { + if (curPartUnitIdx > g_rasterToZscan[absPartIdxRT - s_numPartInCUSize + 1]) + { + uint32_t absZorderCUIdx = g_zscanToRaster[m_absIdxInCTU] + (1 << (m_log2CUSize[0] - LOG2_UNIT_SIZE)) - 1; + arPartUnitIdx = g_rasterToZscan[absPartIdxRT - s_numPartInCUSize + 1]; + if (isEqualRowOrCol(absPartIdxRT, absZorderCUIdx, s_numPartInCUSize)) + return 
m_encData->getPicCTU(m_cuAddr); + else + { + arPartUnitIdx -= m_absIdxInCTU; + return this; + } + } + return NULL; + } + arPartUnitIdx = g_rasterToZscan[absPartIdxRT + NUM_CU_PARTITIONS - s_numPartInCUSize + 1]; + return m_cuAbove; + } + + if (!isZeroRow(absPartIdxRT, s_numPartInCUSize)) + return NULL; + + arPartUnitIdx = g_rasterToZscan[NUM_CU_PARTITIONS - s_numPartInCUSize]; + return m_cuAboveRight; +} + +const CUData* CUData::getPUBelowLeft(uint32_t& blPartUnitIdx, uint32_t curPartUnitIdx) const +{ + if ((m_encData->getPicCTU(m_cuAddr)->m_cuPelY + g_zscanToPelY[curPartUnitIdx] + UNIT_SIZE) >= m_slice->m_sps->picHeightInLumaSamples) + return NULL; + + uint32_t absPartIdxLB = g_zscanToRaster[curPartUnitIdx]; + + if (lessThanRow(absPartIdxLB, s_numPartInCUSize - 1, s_numPartInCUSize)) + { + if (!isZeroCol(absPartIdxLB, s_numPartInCUSize)) + { + if (curPartUnitIdx > g_rasterToZscan[absPartIdxLB + s_numPartInCUSize - 1]) + { + uint32_t absZorderCUIdxLB = g_zscanToRaster[m_absIdxInCTU] + ((1 << (m_log2CUSize[0] - LOG2_UNIT_SIZE)) - 1) * s_numPartInCUSize; + blPartUnitIdx = g_rasterToZscan[absPartIdxLB + s_numPartInCUSize - 1]; + if (isEqualRowOrCol(absPartIdxLB, absZorderCUIdxLB, s_numPartInCUSize)) + return m_encData->getPicCTU(m_cuAddr); + else + { + blPartUnitIdx -= m_absIdxInCTU; + return this; + } + } + return NULL; + } + blPartUnitIdx = g_rasterToZscan[absPartIdxLB + s_numPartInCUSize * 2 - 1]; + return m_cuLeft; + } + + return NULL; +} + +const CUData* CUData::getPUBelowLeftAdi(uint32_t& blPartUnitIdx, uint32_t curPartUnitIdx, uint32_t partUnitOffset) const +{ + if ((m_encData->getPicCTU(m_cuAddr)->m_cuPelY + g_zscanToPelY[curPartUnitIdx] + (partUnitOffset << LOG2_UNIT_SIZE)) >= m_slice->m_sps->picHeightInLumaSamples) + return NULL; + + uint32_t absPartIdxLB = g_zscanToRaster[curPartUnitIdx]; + + if (lessThanRow(absPartIdxLB, s_numPartInCUSize - partUnitOffset, s_numPartInCUSize)) + { + if (!isZeroCol(absPartIdxLB, s_numPartInCUSize)) + { + if (curPartUnitIdx > 
g_rasterToZscan[absPartIdxLB + partUnitOffset * s_numPartInCUSize - 1]) + { + uint32_t absZorderCUIdxLB = g_zscanToRaster[m_absIdxInCTU] + ((1 << (m_log2CUSize[0] - LOG2_UNIT_SIZE)) - 1) * s_numPartInCUSize; + blPartUnitIdx = g_rasterToZscan[absPartIdxLB + partUnitOffset * s_numPartInCUSize - 1]; + if (isEqualRowOrCol(absPartIdxLB, absZorderCUIdxLB, s_numPartInCUSize)) + return m_encData->getPicCTU(m_cuAddr); + else + { + blPartUnitIdx -= m_absIdxInCTU; + return this; + } + } + return NULL; + } + blPartUnitIdx = g_rasterToZscan[absPartIdxLB + (1 + partUnitOffset) * s_numPartInCUSize - 1]; + if (!m_cuLeft || !m_cuLeft->m_slice) + return NULL; + return m_cuLeft; + } + + return NULL; +} + +const CUData* CUData::getPUAboveRightAdi(uint32_t& arPartUnitIdx, uint32_t curPartUnitIdx, uint32_t partUnitOffset) const +{ + if ((m_encData->getPicCTU(m_cuAddr)->m_cuPelX + g_zscanToPelX[curPartUnitIdx] + (partUnitOffset << LOG2_UNIT_SIZE)) >= m_slice->m_sps->picWidthInLumaSamples) + return NULL; + + uint32_t absPartIdxRT = g_zscanToRaster[curPartUnitIdx]; + + if (lessThanCol(absPartIdxRT, s_numPartInCUSize - partUnitOffset, s_numPartInCUSize)) + { + if (!isZeroRow(absPartIdxRT, s_numPartInCUSize)) + { + if (curPartUnitIdx > g_rasterToZscan[absPartIdxRT - s_numPartInCUSize + partUnitOffset]) + { + uint32_t absZorderCUIdx = g_zscanToRaster[m_absIdxInCTU] + (1 << (m_log2CUSize[0] - LOG2_UNIT_SIZE)) - 1; + arPartUnitIdx = g_rasterToZscan[absPartIdxRT - s_numPartInCUSize + partUnitOffset]; + if (isEqualRowOrCol(absPartIdxRT, absZorderCUIdx, s_numPartInCUSize)) + return m_encData->getPicCTU(m_cuAddr); + else + { + arPartUnitIdx -= m_absIdxInCTU; + return this; + } + } + return NULL; + } + arPartUnitIdx = g_rasterToZscan[absPartIdxRT + NUM_CU_PARTITIONS - s_numPartInCUSize + partUnitOffset]; + if (!m_cuAbove || !m_cuAbove->m_slice) + return NULL; + return m_cuAbove; + } + + if (!isZeroRow(absPartIdxRT, s_numPartInCUSize)) + return NULL; + + arPartUnitIdx = 
g_rasterToZscan[NUM_CU_PARTITIONS - s_numPartInCUSize + partUnitOffset - 1]; + if ((m_cuAboveRight == NULL || m_cuAboveRight->m_slice == NULL || (m_cuAboveRight->m_cuAddr) > m_cuAddr)) + return NULL; + return m_cuAboveRight; +} + +/* Get left QpMinCu */ +const CUData* CUData::getQpMinCuLeft(uint32_t& lPartUnitIdx, uint32_t curAbsIdxInCTU) const +{ + uint32_t absZorderQpMinCUIdx = curAbsIdxInCTU & (0xFF << (g_maxFullDepth - m_slice->m_pps->maxCuDQPDepth) * 2); + uint32_t absRorderQpMinCUIdx = g_zscanToRaster[absZorderQpMinCUIdx]; + + // check for left CTU boundary + if (isZeroCol(absRorderQpMinCUIdx, s_numPartInCUSize)) + return NULL; + + // get index of left-CU relative to top-left corner of current quantization group + lPartUnitIdx = g_rasterToZscan[absRorderQpMinCUIdx - 1]; + + // return pointer to current CTU + return m_encData->getPicCTU(m_cuAddr); +} + +/* Get above QpMinCu */ +const CUData* CUData::getQpMinCuAbove(uint32_t& aPartUnitIdx, uint32_t curAbsIdxInCTU) const +{ + uint32_t absZorderQpMinCUIdx = curAbsIdxInCTU & (0xFF << (g_maxFullDepth - m_slice->m_pps->maxCuDQPDepth) * 2); + uint32_t absRorderQpMinCUIdx = g_zscanToRaster[absZorderQpMinCUIdx]; + + // check for top CTU boundary + if (isZeroRow(absRorderQpMinCUIdx, s_numPartInCUSize)) + return NULL; + + // get index of top-CU relative to top-left corner of current quantization group + aPartUnitIdx = g_rasterToZscan[absRorderQpMinCUIdx - s_numPartInCUSize]; + + // return pointer to current CTU + return m_encData->getPicCTU(m_cuAddr); +} + +/* Get reference QP from left QpMinCu or latest coded QP */ +char CUData::getRefQP(uint32_t curAbsIdxInCTU) const +{ + uint32_t lPartIdx = 0, aPartIdx = 0; + const CUData* cULeft = getQpMinCuLeft(lPartIdx, m_absIdxInCTU + curAbsIdxInCTU); + const CUData* cUAbove = getQpMinCuAbove(aPartIdx, m_absIdxInCTU + curAbsIdxInCTU); + + return ((cULeft ? cULeft->m_qp[lPartIdx] : getLastCodedQP(curAbsIdxInCTU)) + (cUAbove ? 
cUAbove->m_qp[aPartIdx] : getLastCodedQP(curAbsIdxInCTU)) + 1) >> 1; +} + +int CUData::getLastValidPartIdx(int absPartIdx) const +{ + int lastValidPartIdx = absPartIdx - 1; + + while (lastValidPartIdx >= 0 && m_predMode[lastValidPartIdx] == MODE_NONE) + { + uint32_t depth = m_cuDepth[lastValidPartIdx]; + lastValidPartIdx -= m_numPartitions >> (depth << 1); + } + + return lastValidPartIdx; +} + +char CUData::getLastCodedQP(uint32_t absPartIdx) const +{ + uint32_t quPartIdxMask = 0xFF << (g_maxFullDepth - m_slice->m_pps->maxCuDQPDepth) * 2; + int lastValidPartIdx = getLastValidPartIdx(absPartIdx & quPartIdxMask); + + if (lastValidPartIdx >= 0) + return m_qp[lastValidPartIdx]; + else + { + if (m_absIdxInCTU) + return m_encData->getPicCTU(m_cuAddr)->getLastCodedQP(m_absIdxInCTU); + else if (m_cuAddr > 0 && !(m_slice->m_pps->bEntropyCodingSyncEnabled && !(m_cuAddr % m_slice->m_sps->numCuInWidth))) + return m_encData->getPicCTU(m_cuAddr - 1)->getLastCodedQP(NUM_CU_PARTITIONS); + else + return (char)m_slice->m_sliceQp; + } +} + +/* Get allowed chroma intra modes */ +void CUData::getAllowedChromaDir(uint32_t absPartIdx, uint32_t* modeList) const +{ + modeList[0] = PLANAR_IDX; + modeList[1] = VER_IDX; + modeList[2] = HOR_IDX; + modeList[3] = DC_IDX; + modeList[4] = DM_CHROMA_IDX; + + uint32_t lumaMode = m_lumaIntraDir[absPartIdx]; + + for (int i = 0; i < NUM_CHROMA_MODE - 1; i++) + { + if (lumaMode == modeList[i]) + { + modeList[i] = 34; // VER+8 mode + break; + } + } +} + +/* Get most probable intra modes */ +int CUData::getIntraDirLumaPredictor(uint32_t absPartIdx, uint32_t* intraDirPred) const +{ + const CUData* tempCU; + uint32_t tempPartIdx; + uint32_t leftIntraDir, aboveIntraDir; + + // Get intra direction of left PU + tempCU = getPULeft(tempPartIdx, m_absIdxInCTU + absPartIdx); + + leftIntraDir = (tempCU && tempCU->isIntra(tempPartIdx)) ? 
tempCU->m_lumaIntraDir[tempPartIdx] : DC_IDX; + + // Get intra direction of above PU + tempCU = getPUAbove(tempPartIdx, m_absIdxInCTU + absPartIdx, true); + + aboveIntraDir = (tempCU && tempCU->isIntra(tempPartIdx)) ? tempCU->m_lumaIntraDir[tempPartIdx] : DC_IDX; + + if (leftIntraDir == aboveIntraDir) + { + if (leftIntraDir >= 2) // angular modes + { + intraDirPred[0] = leftIntraDir; + intraDirPred[1] = ((leftIntraDir - 2 + 31) & 31) + 2; + intraDirPred[2] = ((leftIntraDir - 2 + 1) & 31) + 2; + } + else //non-angular + { + intraDirPred[0] = PLANAR_IDX; + intraDirPred[1] = DC_IDX; + intraDirPred[2] = VER_IDX; + } + return 1; + } + else + { + intraDirPred[0] = leftIntraDir; + intraDirPred[1] = aboveIntraDir; + + if (leftIntraDir && aboveIntraDir) //both modes are non-planar + intraDirPred[2] = PLANAR_IDX; + else + intraDirPred[2] = (leftIntraDir + aboveIntraDir) < 2 ? VER_IDX : DC_IDX; + return 2; + } +} + +uint32_t CUData::getCtxSplitFlag(uint32_t absPartIdx, uint32_t depth) const +{ + const CUData* tempCU; + uint32_t tempPartIdx; + uint32_t ctx; + + // Get left split flag + tempCU = getPULeft(tempPartIdx, m_absIdxInCTU + absPartIdx); + ctx = (tempCU) ? ((tempCU->m_cuDepth[tempPartIdx] > depth) ? 1 : 0) : 0; + + // Get above split flag + tempCU = getPUAbove(tempPartIdx, m_absIdxInCTU + absPartIdx); + ctx += (tempCU) ? ((tempCU->m_cuDepth[tempPartIdx] > depth) ? 
1 : 0) : 0; + + return ctx; +} + +void CUData::getIntraTUQtDepthRange(uint32_t tuDepthRange[2], uint32_t absPartIdx) const +{ + uint32_t log2CUSize = m_log2CUSize[absPartIdx]; + uint32_t splitFlag = m_partSize[absPartIdx] == SIZE_NxN; + + tuDepthRange[0] = m_slice->m_sps->quadtreeTULog2MinSize; + tuDepthRange[1] = m_slice->m_sps->quadtreeTULog2MaxSize; + + tuDepthRange[0] = X265_MAX(tuDepthRange[0], X265_MIN(log2CUSize - (m_slice->m_sps->quadtreeTUMaxDepthIntra - 1 + splitFlag), tuDepthRange[1])); +} + +void CUData::getInterTUQtDepthRange(uint32_t tuDepthRange[2], uint32_t absPartIdx) const +{ + uint32_t log2CUSize = m_log2CUSize[absPartIdx]; + uint32_t quadtreeTUMaxDepth = m_slice->m_sps->quadtreeTUMaxDepthInter; + uint32_t splitFlag = quadtreeTUMaxDepth == 1 && m_partSize[absPartIdx] != SIZE_2Nx2N; + + tuDepthRange[0] = m_slice->m_sps->quadtreeTULog2MinSize; + tuDepthRange[1] = m_slice->m_sps->quadtreeTULog2MaxSize; + + tuDepthRange[0] = X265_MAX(tuDepthRange[0], X265_MIN(log2CUSize - (quadtreeTUMaxDepth - 1 + splitFlag), tuDepthRange[1])); +} + +uint32_t CUData::getCtxSkipFlag(uint32_t absPartIdx) const +{ + const CUData* tempCU; + uint32_t tempPartIdx; + uint32_t ctx; + + // Get BCBP of left PU + tempCU = getPULeft(tempPartIdx, m_absIdxInCTU + absPartIdx); + ctx = tempCU ? tempCU->isSkipped(tempPartIdx) : 0; + + // Get BCBP of above PU + tempCU = getPUAbove(tempPartIdx, m_absIdxInCTU + absPartIdx); + ctx += tempCU ? 
tempCU->isSkipped(tempPartIdx) : 0; + + return ctx; +} + +bool CUData::setQPSubCUs(char qp, uint32_t absPartIdx, uint32_t depth) +{ + uint32_t curPartNumb = NUM_CU_PARTITIONS >> (depth << 1); + uint32_t curPartNumQ = curPartNumb >> 2; + + if (m_cuDepth[absPartIdx] > depth) + { + for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++) + if (setQPSubCUs(qp, absPartIdx + subPartIdx * curPartNumQ, depth + 1)) + return true; + } + else + { + if (getQtRootCbf(absPartIdx)) + return true; + else + setQPSubParts(qp, absPartIdx, depth); + } + + return false; +} + +void CUData::setPUInterDir(uint8_t dir, uint32_t absPartIdx, uint32_t puIdx) +{ + uint32_t curPartNumQ = m_numPartitions >> 2; + X265_CHECK(puIdx < 2, "unexpected part unit index\n"); + + switch (m_partSize[absPartIdx]) + { + case SIZE_2Nx2N: + memset(m_interDir + absPartIdx, dir, 4 * curPartNumQ); + break; + case SIZE_2NxN: + memset(m_interDir + absPartIdx, dir, 2 * curPartNumQ); + break; + case SIZE_Nx2N: + memset(m_interDir + absPartIdx, dir, curPartNumQ); + memset(m_interDir + absPartIdx + 2 * curPartNumQ, dir, curPartNumQ); + break; + case SIZE_NxN: + memset(m_interDir + absPartIdx, dir, curPartNumQ); + break; + case SIZE_2NxnU: + if (!puIdx) + { + memset(m_interDir + absPartIdx, dir, (curPartNumQ >> 1)); + memset(m_interDir + absPartIdx + curPartNumQ, dir, (curPartNumQ >> 1)); + } + else + { + memset(m_interDir + absPartIdx, dir, (curPartNumQ >> 1)); + memset(m_interDir + absPartIdx + curPartNumQ, dir, ((curPartNumQ >> 1) + (curPartNumQ << 1))); + } + break; + case SIZE_2NxnD: + if (!puIdx) + { + memset(m_interDir + absPartIdx, dir, ((curPartNumQ << 1) + (curPartNumQ >> 1))); + memset(m_interDir + absPartIdx + (curPartNumQ << 1) + curPartNumQ, dir, (curPartNumQ >> 1)); + } + else + { + memset(m_interDir + absPartIdx, dir, (curPartNumQ >> 1)); + memset(m_interDir + absPartIdx + curPartNumQ, dir, (curPartNumQ >> 1)); + } + break; + case SIZE_nLx2N: + if (!puIdx) + { + memset(m_interDir + absPartIdx, dir, 
(curPartNumQ >> 2)); + memset(m_interDir + absPartIdx + (curPartNumQ >> 1), dir, (curPartNumQ >> 2)); + memset(m_interDir + absPartIdx + (curPartNumQ << 1), dir, (curPartNumQ >> 2)); + memset(m_interDir + absPartIdx + (curPartNumQ << 1) + (curPartNumQ >> 1), dir, (curPartNumQ >> 2)); + } + else + { + memset(m_interDir + absPartIdx, dir, (curPartNumQ >> 2)); + memset(m_interDir + absPartIdx + (curPartNumQ >> 1), dir, (curPartNumQ + (curPartNumQ >> 2))); + memset(m_interDir + absPartIdx + (curPartNumQ << 1), dir, (curPartNumQ >> 2)); + memset(m_interDir + absPartIdx + (curPartNumQ << 1) + (curPartNumQ >> 1), dir, (curPartNumQ + (curPartNumQ >> 2))); + } + break; + case SIZE_nRx2N: + if (!puIdx) + { + memset(m_interDir + absPartIdx, dir, (curPartNumQ + (curPartNumQ >> 2))); + memset(m_interDir + absPartIdx + curPartNumQ + (curPartNumQ >> 1), dir, (curPartNumQ >> 2)); + memset(m_interDir + absPartIdx + (curPartNumQ << 1), dir, (curPartNumQ + (curPartNumQ >> 2))); + memset(m_interDir + absPartIdx + (curPartNumQ << 1) + curPartNumQ + (curPartNumQ >> 1), dir, (curPartNumQ >> 2)); + } + else + { + memset(m_interDir + absPartIdx, dir, (curPartNumQ >> 2)); + memset(m_interDir + absPartIdx + (curPartNumQ >> 1), dir, (curPartNumQ >> 2)); + memset(m_interDir + absPartIdx + (curPartNumQ << 1), dir, (curPartNumQ >> 2)); + memset(m_interDir + absPartIdx + (curPartNumQ << 1) + (curPartNumQ >> 1), dir, (curPartNumQ >> 2)); + } + break; + default: + X265_CHECK(0, "unexpected part type\n"); + break; + } +} + +template +void CUData::setAllPU(T* p, const T& val, int absPartIdx, int puIdx) +{ + int i; + + p += absPartIdx; + int numElements = m_numPartitions; + + switch (m_partSize[absPartIdx]) + { + case SIZE_2Nx2N: + for (i = 0; i < numElements; i++) + p[i] = val; + break; + + case SIZE_2NxN: + numElements >>= 1; + for (i = 0; i < numElements; i++) + p[i] = val; + break; + + case SIZE_Nx2N: + numElements >>= 2; + for (i = 0; i < numElements; i++) + { + p[i] = val; + p[i + 2 * 
numElements] = val; + } + break; + + case SIZE_2NxnU: + { + int curPartNumQ = numElements >> 2; + if (!puIdx) + { + T *pT = p; + T *pT2 = p + curPartNumQ; + for (i = 0; i < (curPartNumQ >> 1); i++) + { + pT[i] = val; + pT2[i] = val; + } + } + else + { + T *pT = p; + for (i = 0; i < (curPartNumQ >> 1); i++) + pT[i] = val; + + pT = p + curPartNumQ; + for (i = 0; i < ((curPartNumQ >> 1) + (curPartNumQ << 1)); i++) + pT[i] = val; + } + break; + } + + case SIZE_2NxnD: + { + int curPartNumQ = numElements >> 2; + if (!puIdx) + { + T *pT = p; + for (i = 0; i < ((curPartNumQ >> 1) + (curPartNumQ << 1)); i++) + pT[i] = val; + + pT = p + (numElements - curPartNumQ); + for (i = 0; i < (curPartNumQ >> 1); i++) + pT[i] = val; + } + else + { + T *pT = p; + T *pT2 = p + curPartNumQ; + for (i = 0; i < (curPartNumQ >> 1); i++) + { + pT[i] = val; + pT2[i] = val; + } + } + break; + } + + case SIZE_nLx2N: + { + int curPartNumQ = numElements >> 2; + if (!puIdx) + { + T *pT = p; + T *pT2 = p + (curPartNumQ << 1); + T *pT3 = p + (curPartNumQ >> 1); + T *pT4 = p + (curPartNumQ << 1) + (curPartNumQ >> 1); + + for (i = 0; i < (curPartNumQ >> 2); i++) + { + pT[i] = val; + pT2[i] = val; + pT3[i] = val; + pT4[i] = val; + } + } + else + { + T *pT = p; + T *pT2 = p + (curPartNumQ << 1); + for (i = 0; i < (curPartNumQ >> 2); i++) + { + pT[i] = val; + pT2[i] = val; + } + + pT = p + (curPartNumQ >> 1); + pT2 = p + (curPartNumQ << 1) + (curPartNumQ >> 1); + for (i = 0; i < ((curPartNumQ >> 2) + curPartNumQ); i++) + { + pT[i] = val; + pT2[i] = val; + } + } + break; + } + + case SIZE_nRx2N: + { + int curPartNumQ = numElements >> 2; + if (!puIdx) + { + T *pT = p; + T *pT2 = p + (curPartNumQ << 1); + for (i = 0; i < ((curPartNumQ >> 2) + curPartNumQ); i++) + { + pT[i] = val; + pT2[i] = val; + } + + pT = p + curPartNumQ + (curPartNumQ >> 1); + pT2 = p + numElements - curPartNumQ + (curPartNumQ >> 1); + for (i = 0; i < (curPartNumQ >> 2); i++) + { + pT[i] = val; + pT2[i] = val; + } + } + else + { + T *pT = 
p; + T *pT2 = p + (curPartNumQ >> 1); + T *pT3 = p + (curPartNumQ << 1); + T *pT4 = p + (curPartNumQ << 1) + (curPartNumQ >> 1); + for (i = 0; i < (curPartNumQ >> 2); i++) + { + pT[i] = val; + pT2[i] = val; + pT3[i] = val; + pT4[i] = val; + } + } + break; + } + + case SIZE_NxN: + default: + X265_CHECK(0, "unknown partition type\n"); + break; + } +} + +void CUData::setPUMv(int list, const MV& mv, int absPartIdx, int puIdx) +{ + setAllPU(m_mv[list], mv, absPartIdx, puIdx); +} + +void CUData::setPURefIdx(int list, char refIdx, int absPartIdx, int puIdx) +{ + setAllPU(m_refIdx[list], refIdx, absPartIdx, puIdx); +} + +void CUData::getPartIndexAndSize(uint32_t partIdx, uint32_t& outPartAddr, int& outWidth, int& outHeight) const +{ + int cuSize = 1 << m_log2CUSize[0]; + int partType = m_partSize[0]; + + int tmp = partTable[partType][partIdx][0]; + outWidth = ((tmp >> 4) * cuSize) >> 2; + outHeight = ((tmp & 0xF) * cuSize) >> 2; + outPartAddr = (partAddrTable[partType][partIdx] * m_numPartitions) >> 4; +} + +void CUData::getMvField(const CUData* cu, uint32_t absPartIdx, int picList, MVField& outMvField) const +{ + if (cu) + { + outMvField.mv = cu->m_mv[picList][absPartIdx]; + outMvField.refIdx = cu->m_refIdx[picList][absPartIdx]; + } + else + { + // OUT OF BOUNDARY + outMvField.mv.word = 0; + outMvField.refIdx = REF_NOT_VALID; + } +} + +void CUData::deriveLeftRightTopIdx(uint32_t partIdx, uint32_t& partIdxLT, uint32_t& partIdxRT) const +{ + partIdxLT = m_absIdxInCTU; + partIdxRT = g_rasterToZscan[g_zscanToRaster[partIdxLT] + (1 << (m_log2CUSize[0] - LOG2_UNIT_SIZE)) - 1]; + + switch (m_partSize[0]) + { + case SIZE_2Nx2N: break; + case SIZE_2NxN: + partIdxLT += (partIdx == 0) ? 0 : m_numPartitions >> 1; + partIdxRT += (partIdx == 0) ? 0 : m_numPartitions >> 1; + break; + case SIZE_Nx2N: + partIdxLT += (partIdx == 0) ? 0 : m_numPartitions >> 2; + partIdxRT -= (partIdx == 1) ? 
0 : m_numPartitions >> 2; + break; + case SIZE_NxN: + partIdxLT += (m_numPartitions >> 2) * partIdx; + partIdxRT += (m_numPartitions >> 2) * (partIdx - 1); + break; + case SIZE_2NxnU: + partIdxLT += (partIdx == 0) ? 0 : m_numPartitions >> 3; + partIdxRT += (partIdx == 0) ? 0 : m_numPartitions >> 3; + break; + case SIZE_2NxnD: + partIdxLT += (partIdx == 0) ? 0 : (m_numPartitions >> 1) + (m_numPartitions >> 3); + partIdxRT += (partIdx == 0) ? 0 : (m_numPartitions >> 1) + (m_numPartitions >> 3); + break; + case SIZE_nLx2N: + partIdxLT += (partIdx == 0) ? 0 : m_numPartitions >> 4; + partIdxRT -= (partIdx == 1) ? 0 : (m_numPartitions >> 2) + (m_numPartitions >> 4); + break; + case SIZE_nRx2N: + partIdxLT += (partIdx == 0) ? 0 : (m_numPartitions >> 2) + (m_numPartitions >> 4); + partIdxRT -= (partIdx == 1) ? 0 : m_numPartitions >> 4; + break; + default: + X265_CHECK(0, "unexpected part index\n"); + break; + } +} + +uint32_t CUData::deriveLeftBottomIdx(uint32_t puIdx) const +{ + uint32_t outPartIdxLB; + outPartIdxLB = g_rasterToZscan[g_zscanToRaster[m_absIdxInCTU] + ((1 << (m_log2CUSize[0] - LOG2_UNIT_SIZE - 1)) - 1) * s_numPartInCUSize]; + + switch (m_partSize[0]) + { + case SIZE_2Nx2N: + outPartIdxLB += m_numPartitions >> 1; + break; + case SIZE_2NxN: + outPartIdxLB += puIdx ? m_numPartitions >> 1 : 0; + break; + case SIZE_Nx2N: + outPartIdxLB += puIdx ? (m_numPartitions >> 2) * 3 : m_numPartitions >> 1; + break; + case SIZE_NxN: + outPartIdxLB += (m_numPartitions >> 2) * puIdx; + break; + case SIZE_2NxnU: + outPartIdxLB += puIdx ? m_numPartitions >> 1 : -((int)m_numPartitions >> 3); + break; + case SIZE_2NxnD: + outPartIdxLB += puIdx ? m_numPartitions >> 1 : (m_numPartitions >> 2) + (m_numPartitions >> 3); + break; + case SIZE_nLx2N: + outPartIdxLB += puIdx ? (m_numPartitions >> 1) + (m_numPartitions >> 4) : m_numPartitions >> 1; + break; + case SIZE_nRx2N: + outPartIdxLB += puIdx ? 
(m_numPartitions >> 1) + (m_numPartitions >> 2) + (m_numPartitions >> 4) : m_numPartitions >> 1; + break; + default: + X265_CHECK(0, "unexpected part index\n"); + break; + } + return outPartIdxLB; +} + +/* Derives the partition index of neighboring bottom right block */ +uint32_t CUData::deriveRightBottomIdx(uint32_t puIdx) const +{ + uint32_t outPartIdxRB; + outPartIdxRB = g_rasterToZscan[g_zscanToRaster[m_absIdxInCTU] + + ((1 << (m_log2CUSize[0] - LOG2_UNIT_SIZE - 1)) - 1) * s_numPartInCUSize + + (1 << (m_log2CUSize[0] - LOG2_UNIT_SIZE)) - 1]; + + switch (m_partSize[0]) + { + case SIZE_2Nx2N: + outPartIdxRB += m_numPartitions >> 1; + break; + case SIZE_2NxN: + outPartIdxRB += puIdx ? m_numPartitions >> 1 : 0; + break; + case SIZE_Nx2N: + outPartIdxRB += puIdx ? m_numPartitions >> 1 : m_numPartitions >> 2; + break; + case SIZE_NxN: + outPartIdxRB += (m_numPartitions >> 2) * (puIdx - 1); + break; + case SIZE_2NxnU: + outPartIdxRB += puIdx ? m_numPartitions >> 1 : -((int)m_numPartitions >> 3); + break; + case SIZE_2NxnD: + outPartIdxRB += puIdx ? m_numPartitions >> 1 : (m_numPartitions >> 2) + (m_numPartitions >> 3); + break; + case SIZE_nLx2N: + outPartIdxRB += puIdx ? m_numPartitions >> 1 : (m_numPartitions >> 3) + (m_numPartitions >> 4); + break; + case SIZE_nRx2N: + outPartIdxRB += puIdx ? 
m_numPartitions >> 1 : (m_numPartitions >> 2) + (m_numPartitions >> 3) + (m_numPartitions >> 4); + break; + default: + X265_CHECK(0, "unexpected part index\n"); + break; + } + return outPartIdxRB; +} + +void CUData::deriveLeftRightTopIdxAdi(uint32_t& outPartIdxLT, uint32_t& outPartIdxRT, uint32_t partOffset, uint32_t partDepth) const +{ + uint32_t numPartInWidth = 1 << (m_log2CUSize[0] - LOG2_UNIT_SIZE - partDepth); + + outPartIdxLT = m_absIdxInCTU + partOffset; + outPartIdxRT = g_rasterToZscan[g_zscanToRaster[outPartIdxLT] + numPartInWidth - 1]; +} + +bool CUData::hasEqualMotion(uint32_t absPartIdx, const CUData& candCU, uint32_t candAbsPartIdx) const +{ + if (m_interDir[absPartIdx] != candCU.m_interDir[candAbsPartIdx]) + return false; + + for (uint32_t refListIdx = 0; refListIdx < 2; refListIdx++) + { + if (m_interDir[absPartIdx] & (1 << refListIdx)) + { + if (m_mv[refListIdx][absPartIdx] != candCU.m_mv[refListIdx][candAbsPartIdx] || + m_refIdx[refListIdx][absPartIdx] != candCU.m_refIdx[refListIdx][candAbsPartIdx]) + return false; + } + } + + return true; +} + +/* Construct list of merging candidates */ +uint32_t CUData::getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MVField(*mvFieldNeighbours)[2], uint8_t* interDirNeighbours) const +{ + uint32_t absPartAddr = m_absIdxInCTU + absPartIdx; + const bool isInterB = m_slice->isInterB(); + + const uint32_t maxNumMergeCand = m_slice->m_maxNumMergeCand; + + for (uint32_t i = 0; i < maxNumMergeCand; ++i) + { + mvFieldNeighbours[i][0].refIdx = REF_NOT_VALID; + mvFieldNeighbours[i][1].refIdx = REF_NOT_VALID; + } + + /* calculate the location of upper-left corner pixel and size of the current PU */ + int xP, yP, nPSW, nPSH; + + int cuSize = 1 << m_log2CUSize[0]; + int partMode = m_partSize[0]; + + int tmp = partTable[partMode][puIdx][0]; + nPSW = ((tmp >> 4) * cuSize) >> 2; + nPSH = ((tmp & 0xF) * cuSize) >> 2; + + tmp = partTable[partMode][puIdx][1]; + xP = ((tmp >> 4) * cuSize) >> 2; + yP = ((tmp & 0xF) * 
cuSize) >> 2; + + uint32_t count = 0; + + uint32_t partIdxLT, partIdxRT, partIdxLB = deriveLeftBottomIdx(puIdx); + PartSize curPS = (PartSize)m_partSize[absPartIdx]; + + // left + uint32_t leftPartIdx = 0; + const CUData* cuLeft = getPULeft(leftPartIdx, partIdxLB); + bool isAvailableA1 = cuLeft && + cuLeft->isDiffMER(xP - 1, yP + nPSH - 1, xP, yP) && + !(puIdx == 1 && (curPS == SIZE_Nx2N || curPS == SIZE_nLx2N || curPS == SIZE_nRx2N)) && + !cuLeft->isIntra(leftPartIdx); + if (isAvailableA1) + { + // get Inter Dir + interDirNeighbours[count] = cuLeft->m_interDir[leftPartIdx]; + // get Mv from Left + cuLeft->getMvField(cuLeft, leftPartIdx, 0, mvFieldNeighbours[count][0]); + if (isInterB) + cuLeft->getMvField(cuLeft, leftPartIdx, 1, mvFieldNeighbours[count][1]); + + count++; + + if (count == maxNumMergeCand) + return maxNumMergeCand; + } + + deriveLeftRightTopIdx(puIdx, partIdxLT, partIdxRT); + + // above + uint32_t abovePartIdx = 0; + const CUData* cuAbove = getPUAbove(abovePartIdx, partIdxRT); + bool isAvailableB1 = cuAbove && + cuAbove->isDiffMER(xP + nPSW - 1, yP - 1, xP, yP) && + !(puIdx == 1 && (curPS == SIZE_2NxN || curPS == SIZE_2NxnU || curPS == SIZE_2NxnD)) && + !cuAbove->isIntra(abovePartIdx); + if (isAvailableB1 && (!isAvailableA1 || !cuLeft->hasEqualMotion(leftPartIdx, *cuAbove, abovePartIdx))) + { + // get Inter Dir + interDirNeighbours[count] = cuAbove->m_interDir[abovePartIdx]; + // get Mv from Left + cuAbove->getMvField(cuAbove, abovePartIdx, 0, mvFieldNeighbours[count][0]); + if (isInterB) + cuAbove->getMvField(cuAbove, abovePartIdx, 1, mvFieldNeighbours[count][1]); + + count++; + + if (count == maxNumMergeCand) + return maxNumMergeCand; + } + + // above right + uint32_t aboveRightPartIdx = 0; + const CUData* cuAboveRight = getPUAboveRight(aboveRightPartIdx, partIdxRT); + bool isAvailableB0 = cuAboveRight && + cuAboveRight->isDiffMER(xP + nPSW, yP - 1, xP, yP) && + !cuAboveRight->isIntra(aboveRightPartIdx); + if (isAvailableB0 && (!isAvailableB1 || 
!cuAbove->hasEqualMotion(abovePartIdx, *cuAboveRight, aboveRightPartIdx))) + { + // get Inter Dir + interDirNeighbours[count] = cuAboveRight->m_interDir[aboveRightPartIdx]; + // get Mv from Left + cuAboveRight->getMvField(cuAboveRight, aboveRightPartIdx, 0, mvFieldNeighbours[count][0]); + if (isInterB) + cuAboveRight->getMvField(cuAboveRight, aboveRightPartIdx, 1, mvFieldNeighbours[count][1]); + + count++; + + if (count == maxNumMergeCand) + return maxNumMergeCand; + } + + // left bottom + uint32_t leftBottomPartIdx = 0; + const CUData* cuLeftBottom = this->getPUBelowLeft(leftBottomPartIdx, partIdxLB); + bool isAvailableA0 = cuLeftBottom && + cuLeftBottom->isDiffMER(xP - 1, yP + nPSH, xP, yP) && + !cuLeftBottom->isIntra(leftBottomPartIdx); + if (isAvailableA0 && (!isAvailableA1 || !cuLeft->hasEqualMotion(leftPartIdx, *cuLeftBottom, leftBottomPartIdx))) + { + // get Inter Dir + interDirNeighbours[count] = cuLeftBottom->m_interDir[leftBottomPartIdx]; + // get Mv from Left + cuLeftBottom->getMvField(cuLeftBottom, leftBottomPartIdx, 0, mvFieldNeighbours[count][0]); + if (isInterB) + cuLeftBottom->getMvField(cuLeftBottom, leftBottomPartIdx, 1, mvFieldNeighbours[count][1]); + + count++; + + if (count == maxNumMergeCand) + return maxNumMergeCand; + } + + // above left + if (count < 4) + { + uint32_t aboveLeftPartIdx = 0; + const CUData* cuAboveLeft = getPUAboveLeft(aboveLeftPartIdx, absPartAddr); + bool isAvailableB2 = cuAboveLeft && + cuAboveLeft->isDiffMER(xP - 1, yP - 1, xP, yP) && + !cuAboveLeft->isIntra(aboveLeftPartIdx); + if (isAvailableB2 && (!isAvailableA1 || !cuLeft->hasEqualMotion(leftPartIdx, *cuAboveLeft, aboveLeftPartIdx)) + && (!isAvailableB1 || !cuAbove->hasEqualMotion(abovePartIdx, *cuAboveLeft, aboveLeftPartIdx))) + { + // get Inter Dir + interDirNeighbours[count] = cuAboveLeft->m_interDir[aboveLeftPartIdx]; + // get Mv from Left + cuAboveLeft->getMvField(cuAboveLeft, aboveLeftPartIdx, 0, mvFieldNeighbours[count][0]); + if (isInterB) + 
cuAboveLeft->getMvField(cuAboveLeft, aboveLeftPartIdx, 1, mvFieldNeighbours[count][1]); + + count++; + + if (count == maxNumMergeCand) + return maxNumMergeCand; + } + } + if (m_slice->m_sps->bTemporalMVPEnabled) + { + uint32_t partIdxRB = deriveRightBottomIdx(puIdx); + MV colmv; + int ctuIdx = -1; + + // image boundary check + if (m_encData->getPicCTU(m_cuAddr)->m_cuPelX + g_zscanToPelX[partIdxRB] + UNIT_SIZE < m_slice->m_sps->picWidthInLumaSamples && + m_encData->getPicCTU(m_cuAddr)->m_cuPelY + g_zscanToPelY[partIdxRB] + UNIT_SIZE < m_slice->m_sps->picHeightInLumaSamples) + { + uint32_t absPartIdxRB = g_zscanToRaster[partIdxRB]; + uint32_t numPartInCUSize = s_numPartInCUSize; + bool bNotLastCol = lessThanCol(absPartIdxRB, numPartInCUSize - 1, numPartInCUSize); // is not at the last column of CTU + bool bNotLastRow = lessThanRow(absPartIdxRB, numPartInCUSize - 1, numPartInCUSize); // is not at the last row of CTU + + if (bNotLastCol && bNotLastRow) + { + absPartAddr = g_rasterToZscan[absPartIdxRB + numPartInCUSize + 1]; + ctuIdx = m_cuAddr; + } + else if (bNotLastCol) + absPartAddr = g_rasterToZscan[(absPartIdxRB + numPartInCUSize + 1) & (numPartInCUSize - 1)]; + else if (bNotLastRow) + { + absPartAddr = g_rasterToZscan[absPartIdxRB + 1]; + ctuIdx = m_cuAddr + 1; + } + else // is the right bottom corner of CTU + absPartAddr = 0; + } + + int refIdx = 0; + uint32_t partIdxCenter = deriveCenterIdx(puIdx); + uint32_t curCTUIdx = m_cuAddr; + int dir = 0; + bool bExistMV = ctuIdx >= 0 && getColMVP(colmv, refIdx, 0, ctuIdx, absPartAddr); + if (!bExistMV) + bExistMV = getColMVP(colmv, refIdx, 0, curCTUIdx, partIdxCenter); + if (bExistMV) + { + dir |= 1; + mvFieldNeighbours[count][0].mv = colmv; + mvFieldNeighbours[count][0].refIdx = refIdx; + } + + if (isInterB) + { + bExistMV = ctuIdx >= 0 && getColMVP(colmv, refIdx, 1, ctuIdx, absPartAddr); + if (!bExistMV) + bExistMV = getColMVP(colmv, refIdx, 1, curCTUIdx, partIdxCenter); + + if (bExistMV) + { + dir |= 2; + 
mvFieldNeighbours[count][1].mv = colmv; + mvFieldNeighbours[count][1].refIdx = refIdx; + } + } + + if (dir != 0) + { + interDirNeighbours[count] = (uint8_t)dir; + + count++; + + if (count == maxNumMergeCand) + return maxNumMergeCand; + } + } + + if (isInterB) + { + const uint32_t cutoff = count * (count - 1); + uint32_t priorityList0 = 0xEDC984; // { 0, 1, 0, 2, 1, 2, 0, 3, 1, 3, 2, 3 } + uint32_t priorityList1 = 0xB73621; // { 1, 0, 2, 0, 2, 1, 3, 0, 3, 1, 3, 2 } + + for (uint32_t idx = 0; idx < cutoff; idx++) + { + int i = priorityList0 & 3; + int j = priorityList1 & 3; + priorityList0 >>= 2; + priorityList1 >>= 2; + + if ((interDirNeighbours[i] & 0x1) && (interDirNeighbours[j] & 0x2)) + { + // get Mv from cand[i] and cand[j] + int refIdxL0 = mvFieldNeighbours[i][0].refIdx; + int refIdxL1 = mvFieldNeighbours[j][1].refIdx; + int refPOCL0 = m_slice->m_refPOCList[0][refIdxL0]; + int refPOCL1 = m_slice->m_refPOCList[1][refIdxL1]; + if (!(refPOCL0 == refPOCL1 && mvFieldNeighbours[i][0].mv == mvFieldNeighbours[j][1].mv)) + { + mvFieldNeighbours[count][0].mv = mvFieldNeighbours[i][0].mv; + mvFieldNeighbours[count][0].refIdx = refIdxL0; + mvFieldNeighbours[count][1].mv = mvFieldNeighbours[j][1].mv; + mvFieldNeighbours[count][1].refIdx = refIdxL1; + interDirNeighbours[count] = 3; + + count++; + + if (count == maxNumMergeCand) + return maxNumMergeCand; + } + } + } + } + int numRefIdx = (isInterB) ? 
X265_MIN(m_slice->m_numRefIdx[0], m_slice->m_numRefIdx[1]) : m_slice->m_numRefIdx[0]; + int r = 0; + int refcnt = 0; + while (count < maxNumMergeCand) + { + interDirNeighbours[count] = 1; + mvFieldNeighbours[count][0].mv.word = 0; + mvFieldNeighbours[count][0].refIdx = r; + + if (isInterB) + { + interDirNeighbours[count] = 3; + mvFieldNeighbours[count][1].mv.word = 0; + mvFieldNeighbours[count][1].refIdx = r; + } + + count++; + + if (refcnt == numRefIdx - 1) + r = 0; + else + { + ++r; + ++refcnt; + } + } + + return count; +} + +/* Check whether the current PU and a spatial neighboring PU are in a same ME region */ +bool CUData::isDiffMER(int xN, int yN, int xP, int yP) const +{ + uint32_t plevel = 2; + + if ((xN >> plevel) != (xP >> plevel)) + return true; + if ((yN >> plevel) != (yP >> plevel)) + return true; + return false; +} + +/* Constructs a list of candidates for AMVP, and a larger list of motion candidates */ +int CUData::fillMvpCand(uint32_t puIdx, uint32_t absPartIdx, int picList, int refIdx, MV* amvpCand, MV* mvc) const +{ + int num = 0; + + // spatial MV + uint32_t partIdxLT, partIdxRT, partIdxLB = deriveLeftBottomIdx(puIdx); + + deriveLeftRightTopIdx(puIdx, partIdxLT, partIdxRT); + + MV mv[MD_ABOVE_LEFT + 1]; + MV mvOrder[MD_ABOVE_LEFT + 1]; + bool valid[MD_ABOVE_LEFT + 1]; + bool validOrder[MD_ABOVE_LEFT + 1]; + + valid[MD_BELOW_LEFT] = addMVPCand(mv[MD_BELOW_LEFT], picList, refIdx, partIdxLB, MD_BELOW_LEFT); + valid[MD_LEFT] = addMVPCand(mv[MD_LEFT], picList, refIdx, partIdxLB, MD_LEFT); + valid[MD_ABOVE_RIGHT] = addMVPCand(mv[MD_ABOVE_RIGHT], picList, refIdx, partIdxRT, MD_ABOVE_RIGHT); + valid[MD_ABOVE] = addMVPCand(mv[MD_ABOVE], picList, refIdx, partIdxRT, MD_ABOVE); + valid[MD_ABOVE_LEFT] = addMVPCand(mv[MD_ABOVE_LEFT], picList, refIdx, partIdxLT, MD_ABOVE_LEFT); + + validOrder[MD_BELOW_LEFT] = addMVPCandOrder(mvOrder[MD_BELOW_LEFT], picList, refIdx, partIdxLB, MD_BELOW_LEFT); + validOrder[MD_LEFT] = addMVPCandOrder(mvOrder[MD_LEFT], picList, 
refIdx, partIdxLB, MD_LEFT); + validOrder[MD_ABOVE_RIGHT] = addMVPCandOrder(mvOrder[MD_ABOVE_RIGHT], picList, refIdx, partIdxRT, MD_ABOVE_RIGHT); + validOrder[MD_ABOVE] = addMVPCandOrder(mvOrder[MD_ABOVE], picList, refIdx, partIdxRT, MD_ABOVE); + validOrder[MD_ABOVE_LEFT] = addMVPCandOrder(mvOrder[MD_ABOVE_LEFT], picList, refIdx, partIdxLT, MD_ABOVE_LEFT); + + // Left predictor search + if (valid[MD_BELOW_LEFT]) + amvpCand[num++] = mv[MD_BELOW_LEFT]; + else if (valid[MD_LEFT]) + amvpCand[num++] = mv[MD_LEFT]; + else if (validOrder[MD_BELOW_LEFT]) + amvpCand[num++] = mvOrder[MD_BELOW_LEFT]; + else if (validOrder[MD_LEFT]) + amvpCand[num++] = mvOrder[MD_LEFT]; + + bool bAddedSmvp = num > 0; + + // Above predictor search + if (valid[MD_ABOVE_RIGHT]) + amvpCand[num++] = mv[MD_ABOVE_RIGHT]; + else if (valid[MD_ABOVE]) + amvpCand[num++] = mv[MD_ABOVE]; + else if (valid[MD_ABOVE_LEFT]) + amvpCand[num++] = mv[MD_ABOVE_LEFT]; + + if (!bAddedSmvp) + { + if (validOrder[MD_ABOVE_RIGHT]) + amvpCand[num++] = mvOrder[MD_ABOVE_RIGHT]; + else if (validOrder[MD_ABOVE]) + amvpCand[num++] = mvOrder[MD_ABOVE]; + else if (validOrder[MD_ABOVE_LEFT]) + amvpCand[num++] = mvOrder[MD_ABOVE_LEFT]; + } + + int numMvc = 0; + for (int dir = MD_LEFT; dir <= MD_ABOVE_LEFT; dir++) + { + if (valid[dir] && mv[dir].notZero()) + mvc[numMvc++] = mv[dir]; + + if (validOrder[dir] && mvOrder[dir].notZero()) + mvc[numMvc++] = mvOrder[dir]; + } + + if (num == 2) + { + if (amvpCand[0] == amvpCand[1]) + num = 1; + else + /* AMVP_NUM_CANDS = 2 */ + return numMvc; + } + + if (m_slice->m_sps->bTemporalMVPEnabled) + { + uint32_t absPartAddr = m_absIdxInCTU + absPartIdx; + uint32_t partIdxRB = deriveRightBottomIdx(puIdx); + MV colmv; + + // co-located RightBottom temporal predictor (H) + int ctuIdx = -1; + + // image boundary check + if (m_encData->getPicCTU(m_cuAddr)->m_cuPelX + g_zscanToPelX[partIdxRB] + UNIT_SIZE < m_slice->m_sps->picWidthInLumaSamples && + m_encData->getPicCTU(m_cuAddr)->m_cuPelY + 
g_zscanToPelY[partIdxRB] + UNIT_SIZE < m_slice->m_sps->picHeightInLumaSamples) + { + uint32_t absPartIdxRB = g_zscanToRaster[partIdxRB]; + uint32_t numPartInCUSize = s_numPartInCUSize; + bool bNotLastCol = lessThanCol(absPartIdxRB, numPartInCUSize - 1, numPartInCUSize); // is not at the last column of CTU + bool bNotLastRow = lessThanRow(absPartIdxRB, numPartInCUSize - 1, numPartInCUSize); // is not at the last row of CTU + + if (bNotLastCol && bNotLastRow) + { + absPartAddr = g_rasterToZscan[absPartIdxRB + numPartInCUSize + 1]; + ctuIdx = m_cuAddr; + } + else if (bNotLastCol) + absPartAddr = g_rasterToZscan[(absPartIdxRB + numPartInCUSize + 1) & (numPartInCUSize - 1)]; + else if (bNotLastRow) + { + absPartAddr = g_rasterToZscan[absPartIdxRB + 1]; + ctuIdx = m_cuAddr + 1; + } + else // is the right bottom corner of CTU + absPartAddr = 0; + } + if (ctuIdx >= 0 && getColMVP(colmv, refIdx, picList, ctuIdx, absPartAddr)) + { + amvpCand[num++] = colmv; + mvc[numMvc++] = colmv; + } + else + { + uint32_t partIdxCenter = deriveCenterIdx(puIdx); + uint32_t curCTUIdx = m_cuAddr; + if (getColMVP(colmv, refIdx, picList, curCTUIdx, partIdxCenter)) + { + amvpCand[num++] = colmv; + mvc[numMvc++] = colmv; + } + } + } + + while (num < AMVP_NUM_CANDS) + amvpCand[num++] = 0; + + return numMvc; +} + +void CUData::clipMv(MV& outMV) const +{ + int mvshift = 2; + int offset = 8; + int xmax = (m_slice->m_sps->picWidthInLumaSamples + offset - m_cuPelX - 1) << mvshift; + int xmin = (-(int)g_maxCUSize - offset - (int)m_cuPelX + 1) << mvshift; + + int ymax = (m_slice->m_sps->picHeightInLumaSamples + offset - m_cuPelY - 1) << mvshift; + int ymin = (-(int)g_maxCUSize - offset - (int)m_cuPelY + 1) << mvshift; + + outMV.x = (int16_t)X265_MIN(xmax, X265_MAX(xmin, (int)outMV.x)); + outMV.y = (int16_t)X265_MIN(ymax, X265_MAX(ymin, (int)outMV.y)); +} + +bool CUData::addMVPCand(MV& mvp, int picList, int refIdx, uint32_t partUnitIdx, MVP_DIR dir) const +{ + const CUData* tmpCU = NULL; + uint32_t idx = 
0; + + switch (dir) + { + case MD_LEFT: + tmpCU = getPULeft(idx, partUnitIdx); + break; + case MD_ABOVE: + tmpCU = getPUAbove(idx, partUnitIdx); + break; + case MD_ABOVE_RIGHT: + tmpCU = getPUAboveRight(idx, partUnitIdx); + break; + case MD_BELOW_LEFT: + tmpCU = getPUBelowLeft(idx, partUnitIdx); + break; + case MD_ABOVE_LEFT: + tmpCU = getPUAboveLeft(idx, partUnitIdx); + break; + default: + return false; + } + + if (!tmpCU) + return false; + + int refPOC = m_slice->m_refPOCList[picList][refIdx]; + int partRefIdx = tmpCU->m_refIdx[picList][idx]; + if (partRefIdx >= 0 && refPOC == tmpCU->m_slice->m_refPOCList[picList][partRefIdx]) + { + mvp = tmpCU->m_mv[picList][idx]; + return true; + } + + int refPicList2nd = 0; + if (picList == 0) + refPicList2nd = 1; + else if (picList == 1) + refPicList2nd = 0; + + int curRefPOC = m_slice->m_refPOCList[picList][refIdx]; + int neibRefPOC; + + partRefIdx = tmpCU->m_refIdx[refPicList2nd][idx]; + if (partRefIdx >= 0) + { + neibRefPOC = tmpCU->m_slice->m_refPOCList[refPicList2nd][partRefIdx]; + if (neibRefPOC == curRefPOC) + { + // Same reference frame but different list + mvp = tmpCU->m_mv[refPicList2nd][idx]; + return true; + } + } + return false; +} + +bool CUData::addMVPCandOrder(MV& outMV, int picList, int refIdx, uint32_t partUnitIdx, MVP_DIR dir) const +{ + const CUData* tmpCU = NULL; + uint32_t idx = 0; + + switch (dir) + { + case MD_LEFT: + tmpCU = getPULeft(idx, partUnitIdx); + break; + case MD_ABOVE: + tmpCU = getPUAbove(idx, partUnitIdx); + break; + case MD_ABOVE_RIGHT: + tmpCU = getPUAboveRight(idx, partUnitIdx); + break; + case MD_BELOW_LEFT: + tmpCU = getPUBelowLeft(idx, partUnitIdx); + break; + case MD_ABOVE_LEFT: + tmpCU = getPUAboveLeft(idx, partUnitIdx); + break; + default: + return false; + } + + if (!tmpCU) + return false; + + int refPicList2nd = 0; + if (picList == 0) + refPicList2nd = 1; + else if (picList == 1) + refPicList2nd = 0; + + int curPOC = m_slice->m_poc; + int curRefPOC = 
m_slice->m_refPOCList[picList][refIdx]; + int neibPOC = curPOC; + int neibRefPOC; + + int partRefIdx = tmpCU->m_refIdx[picList][idx]; + if (partRefIdx >= 0) + { + neibRefPOC = tmpCU->m_slice->m_refPOCList[picList][partRefIdx]; + MV mvp = tmpCU->m_mv[picList][idx]; + + scaleMvByPOCDist(outMV, mvp, curPOC, curRefPOC, neibPOC, neibRefPOC); + return true; + } + + partRefIdx = tmpCU->m_refIdx[refPicList2nd][idx]; + if (partRefIdx >= 0) + { + neibRefPOC = tmpCU->m_slice->m_refPOCList[refPicList2nd][partRefIdx]; + MV mvp = tmpCU->m_mv[refPicList2nd][idx]; + + scaleMvByPOCDist(outMV, mvp, curPOC, curRefPOC, neibPOC, neibRefPOC); + return true; + } + + return false; +} + +bool CUData::getColMVP(MV& outMV, int& outRefIdx, int picList, int cuAddr, int partUnitIdx) const +{ + uint32_t absPartAddr = partUnitIdx & TMVP_UNIT_MASK; + + int colRefPicList; + int colPOC, colRefPOC, curPOC, curRefPOC; + MV colmv; + + // use coldir. + Frame *colPic = m_slice->m_refPicList[m_slice->isInterB() ? 1 - m_slice->m_colFromL0Flag : 0][m_slice->m_colRefIdx]; + CUData *colCU = colPic->m_encData->getPicCTU(cuAddr); + + if (colCU->m_partSize[partUnitIdx] == SIZE_NONE) + return false; + + curPOC = m_slice->m_poc; + colPOC = colCU->m_slice->m_poc; + + if (colCU->isIntra(absPartAddr)) + return false; + + colRefPicList = m_slice->m_bCheckLDC ? 
picList : m_slice->m_colFromL0Flag; + + int colRefIdx = colCU->m_refIdx[colRefPicList][absPartAddr]; + + if (colRefIdx < 0) + { + colRefPicList = 1 - colRefPicList; + colRefIdx = colCU->m_refIdx[colRefPicList][absPartAddr]; + + if (colRefIdx < 0) + return false; + } + + // Scale the vector + colRefPOC = colCU->m_slice->m_refPOCList[colRefPicList][colRefIdx]; + colmv = colCU->m_mv[colRefPicList][absPartAddr]; + curRefPOC = m_slice->m_refPOCList[picList][outRefIdx]; + + scaleMvByPOCDist(outMV, colmv, curPOC, curRefPOC, colPOC, colRefPOC); + return true; +} + +void CUData::scaleMvByPOCDist(MV& outMV, const MV& inMV, int curPOC, int curRefPOC, int colPOC, int colRefPOC) const +{ + int diffPocD = colPOC - colRefPOC; + int diffPocB = curPOC - curRefPOC; + + if (diffPocD == diffPocB) + outMV = inMV; + else + { + int tdb = Clip3(-128, 127, diffPocB); + int tdd = Clip3(-128, 127, diffPocD); + int x = (0x4000 + abs(tdd / 2)) / tdd; + int scale = Clip3(-4096, 4095, (tdb * x + 32) >> 6); + outMV = scaleMv(inMV, scale); + } +} + +uint32_t CUData::deriveCenterIdx(uint32_t puIdx) const +{ + uint32_t absPartIdx; + int puWidth, puHeight; + + getPartIndexAndSize(puIdx, absPartIdx, puWidth, puHeight); + + return g_rasterToZscan[g_zscanToRaster[m_absIdxInCTU + absPartIdx] + + (puHeight >> (LOG2_UNIT_SIZE + 1)) * s_numPartInCUSize + + (puWidth >> (LOG2_UNIT_SIZE + 1))]; +} + +ScanType CUData::getCoefScanIdx(uint32_t absPartIdx, uint32_t log2TrSize, bool bIsLuma, bool bIsIntra) const +{ + uint32_t dirMode; + + if (!bIsIntra) + return SCAN_DIAG; + + // check that MDCS can be used for this TU + if (bIsLuma) + { + if (log2TrSize > MDCS_LOG2_MAX_SIZE) + return SCAN_DIAG; + + dirMode = m_lumaIntraDir[absPartIdx]; + } + else + { + if (log2TrSize > (uint32_t)(MDCS_LOG2_MAX_SIZE - m_hChromaShift)) + return SCAN_DIAG; + + dirMode = m_chromaIntraDir[absPartIdx]; + if (dirMode == DM_CHROMA_IDX) + { + dirMode = m_lumaIntraDir[(m_chromaFormat == X265_CSP_I444) ? 
absPartIdx : absPartIdx & 0xFC]; + dirMode = (m_chromaFormat == X265_CSP_I422) ? g_chroma422IntraAngleMappingTable[dirMode] : dirMode; + } + } + + if (abs((int)dirMode - VER_IDX) <= MDCS_ANGLE_LIMIT) + return SCAN_HOR; + else if (abs((int)dirMode - HOR_IDX) <= MDCS_ANGLE_LIMIT) + return SCAN_VER; + else + return SCAN_DIAG; +} + +void CUData::getTUEntropyCodingParameters(TUEntropyCodingParameters &result, uint32_t absPartIdx, uint32_t log2TrSize, bool bIsLuma) const +{ + // set the group layout + result.log2TrSizeCG = log2TrSize - 2; + + // set the scan orders + result.scanType = getCoefScanIdx(absPartIdx, log2TrSize, bIsLuma, isIntra(absPartIdx)); + result.scan = g_scanOrder[result.scanType][log2TrSize - 2]; + result.scanCG = g_scanOrderCG[result.scanType][result.log2TrSizeCG]; + + if (log2TrSize == 2) + result.firstSignificanceMapContext = 0; + else if (log2TrSize == 3) + { + result.firstSignificanceMapContext = 9; + if (result.scanType != SCAN_DIAG && bIsLuma) + result.firstSignificanceMapContext += 6; + } + else + result.firstSignificanceMapContext = bIsLuma ? 
21 : 12; +} + +#define CU_SET_FLAG(bitfield, flag, value) (bitfield) = ((bitfield) & (~(flag))) | ((~((value) - 1)) & (flag)) + +void CUData::calcCTUGeoms(uint32_t picWidth, uint32_t picHeight, uint32_t maxCUSize, CUGeom cuDataArray[CUGeom::MAX_GEOMS]) const +{ + // Initialize the coding blocks inside the CTB + for (uint32_t log2CUSize = g_log2Size[maxCUSize], rangeCUIdx = 0; log2CUSize >= MIN_LOG2_CU_SIZE; log2CUSize--) + { + uint32_t blockSize = 1 << log2CUSize; + uint32_t sbWidth = 1 << (g_log2Size[maxCUSize] - log2CUSize); + int32_t lastLevelFlag = log2CUSize == MIN_LOG2_CU_SIZE; + for (uint32_t sbY = 0; sbY < sbWidth; sbY++) + { + for (uint32_t sbX = 0; sbX < sbWidth; sbX++) + { + uint32_t depthIdx = g_depthScanIdx[sbY][sbX]; + uint32_t cuIdx = rangeCUIdx + depthIdx; + uint32_t childIdx = rangeCUIdx + sbWidth * sbWidth + (depthIdx << 2); + uint32_t px = m_cuPelX + sbX * blockSize; + uint32_t py = m_cuPelY + sbY * blockSize; + int32_t presentFlag = px < picWidth && py < picHeight; + int32_t splitMandatoryFlag = presentFlag && !lastLevelFlag && (px + blockSize > picWidth || py + blockSize > picHeight); + + /* Offset of the luma CU in the X, Y direction in terms of pixels from the CTU origin */ + uint32_t xOffset = (sbX * blockSize) >> 3; + uint32_t yOffset = (sbY * blockSize) >> 3; + X265_CHECK(cuIdx < CUGeom::MAX_GEOMS, "CU geom index bug\n"); + + CUGeom *cu = cuDataArray + cuIdx; + cu->log2CUSize = log2CUSize; + cu->childOffset = childIdx - cuIdx; + cu->encodeIdx = g_depthScanIdx[yOffset][xOffset] * 4; + cu->numPartitions = (NUM_CU_PARTITIONS >> ((g_maxLog2CUSize - cu->log2CUSize) * 2)); + cu->depth = g_log2Size[maxCUSize] - log2CUSize; + + cu->flags = 0; + CU_SET_FLAG(cu->flags, CUGeom::PRESENT, presentFlag); + CU_SET_FLAG(cu->flags, CUGeom::SPLIT_MANDATORY | CUGeom::SPLIT, splitMandatoryFlag); + CU_SET_FLAG(cu->flags, CUGeom::LEAF, lastLevelFlag); + } + } + rangeCUIdx += sbWidth * sbWidth; + } +} diff --git a/source/common/cudata.h b/source/common/cudata.h 
new file mode 100644 index 0000000..7f735d6 --- /dev/null +++ b/source/common/cudata.h @@ -0,0 +1,304 @@ +/***************************************************************************** + * Copyright (C) 2014 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. 
+ *****************************************************************************/ + +#ifndef X265_CUDATA_H +#define X265_CUDATA_H + +#include "common.h" +#include "slice.h" +#include "mv.h" + +namespace x265 { +// private namespace + +class FrameData; +class Slice; +struct TUEntropyCodingParameters; +struct CUDataMemPool; + +enum PartSize +{ + SIZE_2Nx2N, // symmetric motion partition, 2Nx2N + SIZE_2NxN, // symmetric motion partition, 2Nx N + SIZE_Nx2N, // symmetric motion partition, Nx2N + SIZE_NxN, // symmetric motion partition, Nx N + SIZE_2NxnU, // asymmetric motion partition, 2Nx( N/2) + 2Nx(3N/2) + SIZE_2NxnD, // asymmetric motion partition, 2Nx(3N/2) + 2Nx( N/2) + SIZE_nLx2N, // asymmetric motion partition, ( N/2)x2N + (3N/2)x2N + SIZE_nRx2N, // asymmetric motion partition, (3N/2)x2N + ( N/2)x2N + SIZE_NONE = 15 +}; + +enum PredMode +{ + MODE_INTER, + MODE_INTRA, + MODE_NONE = 15 +}; + +// motion vector predictor direction used in AMVP +enum MVP_DIR +{ + MD_LEFT = 0, // MVP of left block + MD_ABOVE, // MVP of above block + MD_ABOVE_RIGHT, // MVP of above right block + MD_BELOW_LEFT, // MVP of below left block + MD_ABOVE_LEFT // MVP of above left block +}; + +struct CUGeom +{ + enum { + INTRA = 1<<0, // CU is intra predicted + PRESENT = 1<<1, // CU is not completely outside the frame + SPLIT_MANDATORY = 1<<2, // CU split is mandatory if CU is inside frame and can be split + LEAF = 1<<3, // CU is a leaf node of the CTU + SPLIT = 1<<4, // CU is currently split in four child CUs. + }; + + // (1 + 4 + 16 + 64) = 85. + enum { MAX_GEOMS = 85 }; + + uint32_t log2CUSize; // Log of the CU size. + uint32_t childOffset; // offset of the first child CU from current CU + uint32_t encodeIdx; // Encoding index of this CU in terms of 4x4 blocks. + uint32_t numPartitions; // Number of 4x4 blocks in the CU + uint32_t depth; // depth of this CU relative from CTU + uint32_t flags; // CU flags. 
+}; + +struct MVField +{ + MV mv; + int refIdx; +}; + +typedef void(*cucopy_t)(uint8_t* dst, uint8_t* src); // dst and src are aligned to MIN(size, 32) +typedef void(*cubcast_t)(uint8_t* dst, uint8_t val); // dst is aligned to MIN(size, 32) + +// Partition count table, index represents partitioning mode. +const uint32_t nbPartsTable[8] = { 1, 2, 2, 4, 2, 2, 2, 2 }; + +// Holds part data for a CU of a given size, from an 8x8 CU to a CTU +class CUData +{ +public: + + static cubcast_t s_partSet[NUM_FULL_DEPTH]; // pointer to broadcast set functions per absolute depth + static uint32_t s_numPartInCUSize; + + FrameData* m_encData; + const Slice* m_slice; + + cucopy_t m_partCopy; // pointer to function that copies m_numPartitions elements + cubcast_t m_partSet; // pointer to function that sets m_numPartitions elements + cucopy_t m_subPartCopy; // pointer to function that copies m_numPartitions/4 elements, may be NULL + cubcast_t m_subPartSet; // pointer to function that sets m_numPartitions/4 elements, may be NULL + + uint32_t m_cuAddr; // address of CTU within the picture in raster order + uint32_t m_absIdxInCTU; // address of CU within its CTU in Z scan order + uint32_t m_cuPelX; // CU position within the picture, in pixels (X) + uint32_t m_cuPelY; // CU position within the picture, in pixels (Y) + uint32_t m_numPartitions; // maximum number of 4x4 partitions within this CU + + int m_chromaFormat; + int m_hChromaShift; + int m_vChromaShift; + + /* Per-part data, stored contiguously */ + char* m_qp; // array of QP values + uint8_t* m_log2CUSize; // array of cu log2Size TODO: seems redundant to depth + uint8_t* m_partSize; // array of partition sizes + uint8_t* m_predMode; // array of prediction modes + uint8_t* m_lumaIntraDir; // array of intra directions (luma) + uint8_t* m_tqBypass; // array of CU lossless flags + char* m_refIdx[2]; // array of motion reference indices per list + uint8_t* m_cuDepth; // array of depths + uint8_t* m_skipFlag; // array of skip flags + 
uint8_t* m_mergeFlag; // array of merge flags + uint8_t* m_interDir; // array of inter directions + uint8_t* m_mvpIdx[2]; // array of motion vector predictor candidates or merge candidate indices [0] + uint8_t* m_tuDepth; // array of transform indices + uint8_t* m_transformSkip[3]; // array of transform skipping flags per plane + uint8_t* m_cbf[3]; // array of coded block flags (CBF) per plane + uint8_t* m_chromaIntraDir; // array of intra directions (chroma) + enum { BytesPerPartition = 22 }; // combined sizeof() of all per-part data + + coeff_t* m_trCoeff[3]; // transformed coefficient buffer per plane + + MV* m_mv[2]; // array of motion vectors per list + MV* m_mvd[2]; // array of coded motion vector deltas per list + enum { TMVP_UNIT_MASK = 0xF0 }; // mask for mapping index to into a compressed (reference) MV field + + const CUData* m_cuAboveLeft; // pointer to above-left neighbor CTU + const CUData* m_cuAboveRight; // pointer to above-right neighbor CTU + const CUData* m_cuAbove; // pointer to above neighbor CTU + const CUData* m_cuLeft; // pointer to left neighbor CTU + + CUData(); + + void initialize(const CUDataMemPool& dataPool, uint32_t depth, int csp, int instance); + void calcCTUGeoms(uint32_t picWidth, uint32_t picHeight, uint32_t maxCUSize, CUGeom cuDataArray[CUGeom::MAX_GEOMS]) const; + + void initCTU(const Frame& frame, uint32_t cuAddr, int qp); + void initSubCU(const CUData& ctu, const CUGeom& cuGeom); + void initLosslessCU(const CUData& cu, const CUGeom& cuGeom); + + void copyPartFrom(const CUData& cu, const CUGeom& childGeom, uint32_t subPartIdx); + void setEmptyPart(const CUGeom& childGeom, uint32_t subPartIdx); + void copyToPic(uint32_t depth) const; + + /* RD-0 methods called only from encodeResidue */ + void copyFromPic(const CUData& ctu, const CUGeom& cuGeom); + void updatePic(uint32_t depth) const; + + void setPartSizeSubParts(PartSize size) { m_partSet(m_partSize, (uint8_t)size); } + void setSkipFlagSubParts(uint8_t skipFlag) { 
m_partSet(m_skipFlag, skipFlag); } + void setPredModeSubParts(PredMode mode) { m_partSet(m_predMode, (uint8_t)mode); } + void clearCbf() { m_partSet(m_cbf[0], 0); m_partSet(m_cbf[1], 0); m_partSet(m_cbf[2], 0); } + + /* these functions all take depth as an absolute depth from CTU, it is used to calculate the number of parts to copy */ + void setQPSubParts(char qp, uint32_t absPartIdx, uint32_t depth) { s_partSet[depth]((uint8_t*)m_qp + absPartIdx, (uint8_t)qp); } + void setTUDepthSubParts(uint8_t tuDepth, uint32_t absPartIdx, uint32_t depth) { s_partSet[depth](m_tuDepth + absPartIdx, tuDepth); } + void setLumaIntraDirSubParts(uint8_t dir, uint32_t absPartIdx, uint32_t depth) { s_partSet[depth](m_lumaIntraDir + absPartIdx, dir); } + void setChromIntraDirSubParts(uint8_t dir, uint32_t absPartIdx, uint32_t depth) { s_partSet[depth](m_chromaIntraDir + absPartIdx, dir); } + void setCbfSubParts(uint8_t cbf, TextType ttype, uint32_t absPartIdx, uint32_t depth) { s_partSet[depth](m_cbf[ttype] + absPartIdx, cbf); } + void setCbfPartRange(uint8_t cbf, TextType ttype, uint32_t absPartIdx, uint32_t coveredPartIdxes) { memset(m_cbf[ttype] + absPartIdx, cbf, coveredPartIdxes); } + void setTransformSkipSubParts(uint8_t tskip, TextType ttype, uint32_t absPartIdx, uint32_t depth) { s_partSet[depth](m_transformSkip[ttype] + absPartIdx, tskip); } + void setTransformSkipPartRange(uint8_t tskip, TextType ttype, uint32_t absPartIdx, uint32_t coveredPartIdxes) { memset(m_transformSkip[ttype] + absPartIdx, tskip, coveredPartIdxes); } + + bool setQPSubCUs(char qp, uint32_t absPartIdx, uint32_t depth); + + void setPUInterDir(uint8_t dir, uint32_t absPartIdx, uint32_t puIdx); + void setPUMv(int list, const MV& mv, int absPartIdx, int puIdx); + void setPURefIdx(int list, char refIdx, int absPartIdx, int puIdx); + + uint8_t getCbf(uint32_t absPartIdx, TextType ttype, uint32_t trDepth) const { return (m_cbf[ttype][absPartIdx] >> trDepth) & 0x1; } + uint8_t getQtRootCbf(uint32_t absPartIdx) 
const { return m_cbf[0][absPartIdx] || m_cbf[1][absPartIdx] || m_cbf[2][absPartIdx]; } + char getRefQP(uint32_t currAbsIdxInCTU) const; + uint32_t getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MVField (*mvFieldNeighbours)[2], uint8_t* interDirNeighbours) const; + void clipMv(MV& outMV) const; + int fillMvpCand(uint32_t puIdx, uint32_t absPartIdx, int picList, int refIdx, MV* amvpCand, MV* mvc) const; + void getIntraTUQtDepthRange(uint32_t tuDepthRange[2], uint32_t absPartIdx) const; + void getInterTUQtDepthRange(uint32_t tuDepthRange[2], uint32_t absPartIdx) const; + + uint32_t getNumPartInter() const { return nbPartsTable[(int)m_partSize[0]]; } + bool isIntra(uint32_t absPartIdx) const { return m_predMode[absPartIdx] == MODE_INTRA; } + bool isSkipped(uint32_t absPartIdx) const { return !!m_skipFlag[absPartIdx]; } + bool isBipredRestriction() const { return m_log2CUSize[0] == 3 && m_partSize[0] != SIZE_2Nx2N; } + + void getPartIndexAndSize(uint32_t puIdx, uint32_t& absPartIdx, int& puWidth, int& puHeight) const; + void getMvField(const CUData* cu, uint32_t absPartIdx, int picList, MVField& mvField) const; + + void getAllowedChromaDir(uint32_t absPartIdx, uint32_t* modeList) const; + int getIntraDirLumaPredictor(uint32_t absPartIdx, uint32_t* intraDirPred) const; + void deriveLeftRightTopIdxAdi(uint32_t& partIdxLT, uint32_t& partIdxRT, uint32_t partOffset, uint32_t partDepth) const; + + uint32_t getSCUAddr() const { return (m_cuAddr << g_maxFullDepth * 2) + m_absIdxInCTU; } + uint32_t getCtxSplitFlag(uint32_t absPartIdx, uint32_t depth) const; + uint32_t getCtxSkipFlag(uint32_t absPartIdx) const; + ScanType getCoefScanIdx(uint32_t absPartIdx, uint32_t log2TrSize, bool bIsLuma, bool bIsIntra) const; + void getTUEntropyCodingParameters(TUEntropyCodingParameters &result, uint32_t absPartIdx, uint32_t log2TrSize, bool bIsLuma) const; + + const CUData* getPULeft(uint32_t& lPartUnitIdx, uint32_t curPartUnitIdx) const; + const CUData* getPUAbove(uint32_t& 
aPartUnitIdx, uint32_t curPartUnitIdx, bool planarAtCTUBoundary = false) const; + const CUData* getPUAboveLeft(uint32_t& alPartUnitIdx, uint32_t curPartUnitIdx) const; + const CUData* getPUAboveRight(uint32_t& arPartUnitIdx, uint32_t curPartUnitIdx) const; + const CUData* getPUBelowLeft(uint32_t& blPartUnitIdx, uint32_t curPartUnitIdx) const; + + const CUData* getQpMinCuLeft(uint32_t& lPartUnitIdx, uint32_t currAbsIdxInCTU) const; + const CUData* getQpMinCuAbove(uint32_t& aPartUnitIdx, uint32_t currAbsIdxInCTU) const; + + const CUData* getPUAboveRightAdi(uint32_t& arPartUnitIdx, uint32_t curPartUnitIdx, uint32_t partUnitOffset) const; + const CUData* getPUBelowLeftAdi(uint32_t& blPartUnitIdx, uint32_t curPartUnitIdx, uint32_t partUnitOffset) const; + +protected: + + template + void setAllPU(T *p, const T& val, int absPartIdx, int puIdx); + + char getLastCodedQP(uint32_t absPartIdx) const; + int getLastValidPartIdx(int absPartIdx) const; + + bool hasEqualMotion(uint32_t absPartIdx, const CUData& candCU, uint32_t candAbsPartIdx) const; + + bool isDiffMER(int xN, int yN, int xP, int yP) const; + + // add possible motion vector predictor candidates + bool addMVPCand(MV& mvp, int picList, int refIdx, uint32_t absPartIdx, MVP_DIR dir) const; + bool addMVPCandOrder(MV& mvp, int picList, int refIdx, uint32_t absPartIdx, MVP_DIR dir) const; + + bool getColMVP(MV& outMV, int& outRefIdx, int picList, int cuAddr, int absPartIdx) const; + + void scaleMvByPOCDist(MV& outMV, const MV& inMV, int curPOC, int curRefPOC, int colPOC, int colRefPOC) const; + + void deriveLeftRightTopIdx(uint32_t puIdx, uint32_t& partIdxLT, uint32_t& partIdxRT) const; + + uint32_t deriveCenterIdx(uint32_t puIdx) const; + uint32_t deriveRightBottomIdx(uint32_t puIdx) const; + uint32_t deriveLeftBottomIdx(uint32_t puIdx) const; +}; + +// TU settings for entropy encoding +struct TUEntropyCodingParameters +{ + const uint16_t *scan; + const uint16_t *scanCG; + ScanType scanType; + uint32_t log2TrSizeCG; + 
uint32_t firstSignificanceMapContext; +}; + +struct CUDataMemPool +{ + uint8_t* charMemBlock; + coeff_t* trCoeffMemBlock; + MV* mvMemBlock; + + CUDataMemPool() { charMemBlock = NULL; trCoeffMemBlock = NULL; mvMemBlock = NULL; } + + bool create(uint32_t depth, uint32_t csp, uint32_t numInstances) + { + uint32_t numPartition = NUM_CU_PARTITIONS >> (depth * 2); + uint32_t cuSize = g_maxCUSize >> depth; + uint32_t sizeL = cuSize * cuSize; + uint32_t sizeC = sizeL >> (CHROMA_H_SHIFT(csp) + CHROMA_V_SHIFT(csp)); + CHECKED_MALLOC(trCoeffMemBlock, coeff_t, (sizeL + sizeC * 2) * numInstances); + CHECKED_MALLOC(charMemBlock, uint8_t, numPartition * numInstances * CUData::BytesPerPartition); + CHECKED_MALLOC(mvMemBlock, MV, numPartition * 4 * numInstances); + return true; + + fail: + return false; + } + + void destroy() + { + X265_FREE(trCoeffMemBlock); + X265_FREE(mvMemBlock); + X265_FREE(charMemBlock); + } +}; +} + +#endif // ifndef X265_CUDATA_H diff --git a/source/common/dct.cpp b/source/common/dct.cpp new file mode 100644 index 0000000..714006e --- /dev/null +++ b/source/common/dct.cpp @@ -0,0 +1,893 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Mandar Gurav + * Deepthi Devaki Akkoorath + * Mahesh Pittala + * Rajesh Paulraj + * Min Chen + * Praveen Kumar Tiwari + * Nabajit Deka + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#include "common.h" +#include "primitives.h" + +using namespace x265; + +#if _MSC_VER +#pragma warning(disable: 4127) // conditional expression is constant, typical for templated functions +#endif + +namespace { +// anonymous file-static namespace + +// Fast DST Algorithm. Full matrix multiplication for DST and Fast DST algorithm +// give identical results +void fastForwardDst(int16_t *block, int16_t *coeff, int shift) // input block, output coeff +{ + int c[4]; + int rnd_factor = 1 << (shift - 1); + + for (int i = 0; i < 4; i++) + { + // Intermediate Variables + c[0] = block[4 * i + 0] + block[4 * i + 3]; + c[1] = block[4 * i + 1] + block[4 * i + 3]; + c[2] = block[4 * i + 0] - block[4 * i + 1]; + c[3] = 74 * block[4 * i + 2]; + + coeff[i] = (int16_t)((29 * c[0] + 55 * c[1] + c[3] + rnd_factor) >> shift); + coeff[4 + i] = (int16_t)((74 * (block[4 * i + 0] + block[4 * i + 1] - block[4 * i + 3]) + rnd_factor) >> shift); + coeff[8 + i] = (int16_t)((29 * c[2] + 55 * c[0] - c[3] + rnd_factor) >> shift); + coeff[12 + i] = (int16_t)((55 * c[2] - 29 * c[1] + c[3] + rnd_factor) >> shift); + } +} + +void inversedst(int16_t *tmp, int16_t *block, int shift) // input tmp, output block +{ + int i, c[4]; + int rnd_factor = 1 << (shift - 1); + + for (i = 0; i < 4; i++) + { + // Intermediate Variables + c[0] = tmp[i] + tmp[8 + i]; + c[1] = tmp[8 + i] + tmp[12 + i]; + c[2] = tmp[i] - tmp[12 + i]; + c[3] = 74 * tmp[4 + i]; + + block[4 * i + 0] = (int16_t)Clip3(-32768, 32767, (29 * c[0] + 55 * c[1] + c[3] + rnd_factor) >> shift); + block[4 * i + 1] = 
(int16_t)Clip3(-32768, 32767, (55 * c[2] - 29 * c[1] + c[3] + rnd_factor) >> shift); + block[4 * i + 2] = (int16_t)Clip3(-32768, 32767, (74 * (tmp[i] - tmp[8 + i] + tmp[12 + i]) + rnd_factor) >> shift); + block[4 * i + 3] = (int16_t)Clip3(-32768, 32767, (55 * c[0] + 29 * c[2] - c[3] + rnd_factor) >> shift); + } +} + +void partialButterfly16(int16_t *src, int16_t *dst, int shift, int line) +{ + int j, k; + int E[8], O[8]; + int EE[4], EO[4]; + int EEE[2], EEO[2]; + int add = 1 << (shift - 1); + + for (j = 0; j < line; j++) + { + /* E and O */ + for (k = 0; k < 8; k++) + { + E[k] = src[k] + src[15 - k]; + O[k] = src[k] - src[15 - k]; + } + + /* EE and EO */ + for (k = 0; k < 4; k++) + { + EE[k] = E[k] + E[7 - k]; + EO[k] = E[k] - E[7 - k]; + } + + /* EEE and EEO */ + EEE[0] = EE[0] + EE[3]; + EEO[0] = EE[0] - EE[3]; + EEE[1] = EE[1] + EE[2]; + EEO[1] = EE[1] - EE[2]; + + dst[0] = (int16_t)((g_t16[0][0] * EEE[0] + g_t16[0][1] * EEE[1] + add) >> shift); + dst[8 * line] = (int16_t)((g_t16[8][0] * EEE[0] + g_t16[8][1] * EEE[1] + add) >> shift); + dst[4 * line] = (int16_t)((g_t16[4][0] * EEO[0] + g_t16[4][1] * EEO[1] + add) >> shift); + dst[12 * line] = (int16_t)((g_t16[12][0] * EEO[0] + g_t16[12][1] * EEO[1] + add) >> shift); + + for (k = 2; k < 16; k += 4) + { + dst[k * line] = (int16_t)((g_t16[k][0] * EO[0] + g_t16[k][1] * EO[1] + g_t16[k][2] * EO[2] + + g_t16[k][3] * EO[3] + add) >> shift); + } + + for (k = 1; k < 16; k += 2) + { + dst[k * line] = (int16_t)((g_t16[k][0] * O[0] + g_t16[k][1] * O[1] + g_t16[k][2] * O[2] + g_t16[k][3] * O[3] + + g_t16[k][4] * O[4] + g_t16[k][5] * O[5] + g_t16[k][6] * O[6] + g_t16[k][7] * O[7] + + add) >> shift); + } + + src += 16; + dst++; + } +} + +void partialButterfly32(int16_t *src, int16_t *dst, int shift, int line) +{ + int j, k; + int E[16], O[16]; + int EE[8], EO[8]; + int EEE[4], EEO[4]; + int EEEE[2], EEEO[2]; + int add = 1 << (shift - 1); + + for (j = 0; j < line; j++) + { + /* E and O*/ + for (k = 0; k < 16; k++) + { + E[k] = 
src[k] + src[31 - k]; + O[k] = src[k] - src[31 - k]; + } + + /* EE and EO */ + for (k = 0; k < 8; k++) + { + EE[k] = E[k] + E[15 - k]; + EO[k] = E[k] - E[15 - k]; + } + + /* EEE and EEO */ + for (k = 0; k < 4; k++) + { + EEE[k] = EE[k] + EE[7 - k]; + EEO[k] = EE[k] - EE[7 - k]; + } + + /* EEEE and EEEO */ + EEEE[0] = EEE[0] + EEE[3]; + EEEO[0] = EEE[0] - EEE[3]; + EEEE[1] = EEE[1] + EEE[2]; + EEEO[1] = EEE[1] - EEE[2]; + + dst[0] = (int16_t)((g_t32[0][0] * EEEE[0] + g_t32[0][1] * EEEE[1] + add) >> shift); + dst[16 * line] = (int16_t)((g_t32[16][0] * EEEE[0] + g_t32[16][1] * EEEE[1] + add) >> shift); + dst[8 * line] = (int16_t)((g_t32[8][0] * EEEO[0] + g_t32[8][1] * EEEO[1] + add) >> shift); + dst[24 * line] = (int16_t)((g_t32[24][0] * EEEO[0] + g_t32[24][1] * EEEO[1] + add) >> shift); + for (k = 4; k < 32; k += 8) + { + dst[k * line] = (int16_t)((g_t32[k][0] * EEO[0] + g_t32[k][1] * EEO[1] + g_t32[k][2] * EEO[2] + + g_t32[k][3] * EEO[3] + add) >> shift); + } + + for (k = 2; k < 32; k += 4) + { + dst[k * line] = (int16_t)((g_t32[k][0] * EO[0] + g_t32[k][1] * EO[1] + g_t32[k][2] * EO[2] + + g_t32[k][3] * EO[3] + g_t32[k][4] * EO[4] + g_t32[k][5] * EO[5] + + g_t32[k][6] * EO[6] + g_t32[k][7] * EO[7] + add) >> shift); + } + + for (k = 1; k < 32; k += 2) + { + dst[k * line] = (int16_t)((g_t32[k][0] * O[0] + g_t32[k][1] * O[1] + g_t32[k][2] * O[2] + g_t32[k][3] * O[3] + + g_t32[k][4] * O[4] + g_t32[k][5] * O[5] + g_t32[k][6] * O[6] + g_t32[k][7] * O[7] + + g_t32[k][8] * O[8] + g_t32[k][9] * O[9] + g_t32[k][10] * O[10] + g_t32[k][11] * + O[11] + g_t32[k][12] * O[12] + g_t32[k][13] * O[13] + g_t32[k][14] * O[14] + + g_t32[k][15] * O[15] + add) >> shift); + } + + src += 32; + dst++; + } +} + +void partialButterfly8(int16_t *src, int16_t *dst, int shift, int line) +{ + int j, k; + int E[4], O[4]; + int EE[2], EO[2]; + int add = 1 << (shift - 1); + + for (j = 0; j < line; j++) + { + /* E and O*/ + for (k = 0; k < 4; k++) + { + E[k] = src[k] + src[7 - k]; + O[k] = src[k] - 
/* 4-point inverse partial-butterfly transform (HEVC core transform, g_t4 basis).
 * src is read column-wise with stride 'line'; each output row of 4 residuals is
 * clipped to int16 range. 'shift' carries the stage-dependent rounding. */
void partialButterflyInverse4(int16_t *src, int16_t *dst, int shift, int line)
{
    int j;
    int E[2], O[2];
    int add = 1 << (shift - 1); // rounding offset for '>> shift'

    for (j = 0; j < line; j++)
    {
        /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
        O[0] = g_t4[1][0] * src[line] + g_t4[3][0] * src[3 * line];
        O[1] = g_t4[1][1] * src[line] + g_t4[3][1] * src[3 * line];
        E[0] = g_t4[0][0] * src[0] + g_t4[2][0] * src[2 * line];
        E[1] = g_t4[0][1] * src[0] + g_t4[2][1] * src[2 * line];

        /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
        dst[0] = (int16_t)(Clip3(-32768, 32767, (E[0] + O[0] + add) >> shift));
        dst[1] = (int16_t)(Clip3(-32768, 32767, (E[1] + O[1] + add) >> shift));
        dst[2] = (int16_t)(Clip3(-32768, 32767, (E[1] - O[1] + add) >> shift));
        dst[3] = (int16_t)(Clip3(-32768, 32767, (E[0] - O[0] + add) >> shift));

        src++;
        dst += 4;
    }
}

/* 8-point inverse partial butterfly; same scheme as the 4-point version with one
 * extra even/odd decomposition level (EE/EO). */
void partialButterflyInverse8(int16_t *src, int16_t *dst, int shift, int line)
{
    int j, k;
    int E[4], O[4];
    int EE[2], EO[2];
    int add = 1 << (shift - 1);

    for (j = 0; j < line; j++)
    {
        /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
        for (k = 0; k < 4; k++)
        {
            O[k] = g_t8[1][k] * src[line] + g_t8[3][k] * src[3 * line] + g_t8[5][k] * src[5 * line] + g_t8[7][k] * src[7 * line];
        }

        EO[0] = g_t8[2][0] * src[2 * line] + g_t8[6][0] * src[6 * line];
        EO[1] = g_t8[2][1] * src[2 * line] + g_t8[6][1] * src[6 * line];
        EE[0] = g_t8[0][0] * src[0] + g_t8[4][0] * src[4 * line];
        EE[1] = g_t8[0][1] * src[0] + g_t8[4][1] * src[4 * line];

        /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
        E[0] = EE[0] + EO[0];
        E[3] = EE[0] - EO[0];
        E[1] = EE[1] + EO[1];
        E[2] = EE[1] - EO[1];
        for (k = 0; k < 4; k++)
        {
            dst[k] = (int16_t)Clip3(-32768, 32767, (E[k] + O[k] + add) >> shift);
            dst[k + 4] = (int16_t)Clip3(-32768, 32767, (E[3 - k] - O[3 - k] + add) >> shift);
        }

        src++;
        dst += 8;
    }
}

/* 16-point inverse partial butterfly: three even/odd decomposition levels
 * (EEE/EEO -> EE/EO -> E/O). */
void partialButterflyInverse16(int16_t *src, int16_t *dst, int shift, int line)
{
    int j, k;
    int E[8], O[8];
    int EE[4], EO[4];
    int EEE[2], EEO[2];
    int add = 1 << (shift - 1);

    for (j = 0; j < line; j++)
    {
        /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
        for (k = 0; k < 8; k++)
        {
            O[k] = g_t16[1][k] * src[line] + g_t16[3][k] * src[3 * line] + g_t16[5][k] * src[5 * line] + g_t16[7][k] * src[7 * line] +
                g_t16[9][k] * src[9 * line] + g_t16[11][k] * src[11 * line] + g_t16[13][k] * src[13 * line] + g_t16[15][k] * src[15 * line];
        }

        for (k = 0; k < 4; k++)
        {
            EO[k] = g_t16[2][k] * src[2 * line] + g_t16[6][k] * src[6 * line] + g_t16[10][k] * src[10 * line] + g_t16[14][k] * src[14 * line];
        }

        EEO[0] = g_t16[4][0] * src[4 * line] + g_t16[12][0] * src[12 * line];
        EEE[0] = g_t16[0][0] * src[0] + g_t16[8][0] * src[8 * line];
        EEO[1] = g_t16[4][1] * src[4 * line] + g_t16[12][1] * src[12 * line];
        EEE[1] = g_t16[0][1] * src[0] + g_t16[8][1] * src[8 * line];

        /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
        for (k = 0; k < 2; k++)
        {
            EE[k] = EEE[k] + EEO[k];
            EE[k + 2] = EEE[1 - k] - EEO[1 - k];
        }

        for (k = 0; k < 4; k++)
        {
            E[k] = EE[k] + EO[k];
            E[k + 4] = EE[3 - k] - EO[3 - k];
        }

        for (k = 0; k < 8; k++)
        {
            dst[k] = (int16_t)Clip3(-32768, 32767, (E[k] + O[k] + add) >> shift);
            dst[k + 8] = (int16_t)Clip3(-32768, 32767, (E[7 - k] - O[7 - k] + add) >> shift);
        }

        src++;
        dst += 16;
    }
}

/* 32-point inverse partial butterfly: four even/odd decomposition levels. */
void partialButterflyInverse32(int16_t *src, int16_t *dst, int shift, int line)
{
    int j, k;
    int E[16], O[16];
    int EE[8], EO[8];
    int EEE[4], EEO[4];
    int EEEE[2], EEEO[2];
    int add = 1 << (shift - 1);

    for (j = 0; j < line; j++)
    {
        /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
        for (k = 0; k < 16; k++)
        {
            O[k] = g_t32[1][k] * src[line] + g_t32[3][k] * src[3 * line] + g_t32[5][k] * src[5 * line] + g_t32[7][k] * src[7 * line] +
                g_t32[9][k] * src[9 * line] + g_t32[11][k] * src[11 * line] + g_t32[13][k] * src[13 * line] + g_t32[15][k] * src[15 * line] +
                g_t32[17][k] * src[17 * line] + g_t32[19][k] * src[19 * line] + g_t32[21][k] * src[21 * line] + g_t32[23][k] * src[23 * line] +
                g_t32[25][k] * src[25 * line] + g_t32[27][k] * src[27 * line] + g_t32[29][k] * src[29 * line] + g_t32[31][k] * src[31 * line];
        }

        for (k = 0; k < 8; k++)
        {
            EO[k] = g_t32[2][k] * src[2 * line] + g_t32[6][k] * src[6 * line] + g_t32[10][k] * src[10 * line] + g_t32[14][k] * src[14 * line] +
                g_t32[18][k] * src[18 * line] + g_t32[22][k] * src[22 * line] + g_t32[26][k] * src[26 * line] + g_t32[30][k] * src[30 * line];
        }

        for (k = 0; k < 4; k++)
        {
            EEO[k] = g_t32[4][k] * src[4 * line] + g_t32[12][k] * src[12 * line] + g_t32[20][k] * src[20 * line] + g_t32[28][k] * src[28 * line];
        }

        EEEO[0] = g_t32[8][0] * src[8 * line] + g_t32[24][0] * src[24 * line];
        EEEO[1] = g_t32[8][1] * src[8 * line] + g_t32[24][1] * src[24 * line];
        EEEE[0] = g_t32[0][0] * src[0] + g_t32[16][0] * src[16 * line];
        EEEE[1] = g_t32[0][1] * src[0] + g_t32[16][1] * src[16 * line];

        /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
        EEE[0] = EEEE[0] + EEEO[0];
        EEE[3] = EEEE[0] - EEEO[0];
        EEE[1] = EEEE[1] + EEEO[1];
        EEE[2] = EEEE[1] - EEEO[1];
        for (k = 0; k < 4; k++)
        {
            EE[k] = EEE[k] + EEO[k];
            EE[k + 4] = EEE[3 - k] - EEO[3 - k];
        }

        for (k = 0; k < 8; k++)
        {
            E[k] = EE[k] + EO[k];
            E[k + 8] = EE[7 - k] - EO[7 - k];
        }

        for (k = 0; k < 16; k++)
        {
            dst[k] = (int16_t)Clip3(-32768, 32767, (E[k] + O[k] + add) >> shift);
            dst[k + 16] = (int16_t)Clip3(-32768, 32767, (E[15 - k] - O[15 - k] + add) >> shift);
        }

        src++;
        dst += 32;
    }
}

/* 4-point forward partial butterfly: rows of 4 inputs produce transposed
 * (column-strided) coefficients, even terms to rows 0/2, odd to rows 1/3. */
void partialButterfly4(int16_t *src, int16_t *dst, int shift, int line)
{
    int j;
    int E[2], O[2];
    int add = 1 << (shift - 1);

    for (j = 0; j < line; j++)
    {
        /* E and O */
        E[0] = src[0] + src[3];
        O[0] = src[0] - src[3];
        E[1] = src[1] + src[2];
        O[1] = src[1] - src[2];

        dst[0] = (int16_t)((g_t4[0][0] * E[0] + g_t4[0][1] * E[1] + add) >> shift);
        dst[2 * line] = (int16_t)((g_t4[2][0] * E[0] + g_t4[2][1] * E[1] + add) >> shift);
        dst[line] = (int16_t)((g_t4[1][0] * O[0] + g_t4[1][1] * O[1] + add) >> shift);
        dst[3 * line] = (int16_t)((g_t4[3][0] * O[0] + g_t4[3][1] * O[1] + add) >> shift);

        src += 4;
        dst++;
    }
}
/* Forward 4x4 DST (intra 4x4 luma path): copy the strided residual block into a
 * contiguous aligned buffer, run two fastForwardDst passes (horizontal then
 * vertical, transposing between them), then widen the int16 result to int32. */
void dst4_c(int16_t *src, int32_t *dst, intptr_t stride)
{
    const int shift_1st = 1 + X265_DEPTH - 8; // first-pass shift grows with bit depth
    const int shift_2nd = 8;

    ALIGN_VAR_32(int16_t, coef[4 * 4]);
    ALIGN_VAR_32(int16_t, block[4 * 4]);

    for (int i = 0; i < 4; i++)
    {
        memcpy(&block[i * 4], &src[i * stride], 4 * sizeof(int16_t));
    }

    fastForwardDst(block, coef, shift_1st);
    fastForwardDst(coef, block, shift_2nd);

#define N (4)
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            dst[i * N + j] = block[i * N + j];
        }
    }

#undef N
}

/* Forward 4x4 DCT: two partialButterfly4 passes, then widen to int32. */
void dct4_c(int16_t *src, int32_t *dst, intptr_t stride)
{
    const int shift_1st = 1 + X265_DEPTH - 8;
    const int shift_2nd = 8;

    ALIGN_VAR_32(int16_t, coef[4 * 4]);
    ALIGN_VAR_32(int16_t, block[4 * 4]);

    for (int i = 0; i < 4; i++)
    {
        memcpy(&block[i * 4], &src[i * stride], 4 * sizeof(int16_t));
    }

    partialButterfly4(block, coef, shift_1st, 4);
    partialButterfly4(coef, block, shift_2nd, 4);
#define N (4)
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            dst[i * N + j] = block[i * N + j];
        }
    }

#undef N
}

/* Forward 8x8 DCT; shifts follow the HEVC pattern log2(N)-1+depth-8 / log2(N)+6. */
void dct8_c(int16_t *src, int32_t *dst, intptr_t stride)
{
    const int shift_1st = 2 + X265_DEPTH - 8;
    const int shift_2nd = 9;

    ALIGN_VAR_32(int16_t, coef[8 * 8]);
    ALIGN_VAR_32(int16_t, block[8 * 8]);

    for (int i = 0; i < 8; i++)
    {
        memcpy(&block[i * 8], &src[i * stride], 8 * sizeof(int16_t));
    }

    partialButterfly8(block, coef, shift_1st, 8);
    partialButterfly8(coef, block, shift_2nd, 8);

#define N (8)
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            dst[i * N + j] = block[i * N + j];
        }
    }

#undef N
}

/* Forward 16x16 DCT. */
void dct16_c(int16_t *src, int32_t *dst, intptr_t stride)
{
    const int shift_1st = 3 + X265_DEPTH - 8;
    const int shift_2nd = 10;

    ALIGN_VAR_32(int16_t, coef[16 * 16]);
    ALIGN_VAR_32(int16_t, block[16 * 16]);

    for (int i = 0; i < 16; i++)
    {
        memcpy(&block[i * 16], &src[i * stride], 16 * sizeof(int16_t));
    }

    partialButterfly16(block, coef, shift_1st, 16);
    partialButterfly16(coef, block, shift_2nd, 16);

#define N (16)
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            dst[i * N + j] = block[i * N + j];
        }
    }

#undef N
}

/* Forward 32x32 DCT. */
void dct32_c(int16_t *src, int32_t *dst, intptr_t stride)
{
    const int shift_1st = 4 + X265_DEPTH - 8;
    const int shift_2nd = 11;

    ALIGN_VAR_32(int16_t, coef[32 * 32]);
    ALIGN_VAR_32(int16_t, block[32 * 32]);

    for (int i = 0; i < 32; i++)
    {
        memcpy(&block[i * 32], &src[i * stride], 32 * sizeof(int16_t));
    }

    partialButterfly32(block, coef, shift_1st, 32);
    partialButterfly32(coef, block, shift_2nd, 32);

#define N (32)
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            dst[i * N + j] = block[i * N + j];
        }
    }

#undef N
}

/* Inverse 4x4 DST: narrow int32 coefficients to int16, two inversedst passes,
 * copy the residual rows out to the strided destination. */
void idst4_c(int32_t *src, int16_t *dst, intptr_t stride)
{
    const int shift_1st = 7;
    const int shift_2nd = 12 - (X265_DEPTH - 8); // second-pass shift shrinks with bit depth

    ALIGN_VAR_32(int16_t, coef[4 * 4]);
    ALIGN_VAR_32(int16_t, block[4 * 4]);

#define N (4)
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            block[i * N + j] = (int16_t)src[i * N + j];
        }
    }

#undef N

    inversedst(block, coef, shift_1st); // Forward DST BY FAST ALGORITHM, block input, coef output
    inversedst(coef, block, shift_2nd); // Forward DST BY FAST ALGORITHM, coef input, coeff output

    for (int i = 0; i < 4; i++)
    {
        memcpy(&dst[i * stride], &block[i * 4], 4 * sizeof(int16_t));
    }
}

/* Inverse 4x4 DCT. NOTE(review): the trailing comments below were copied from
 * the DST path upstream; these calls perform the inverse DCT butterflies. */
void idct4_c(int32_t *src, int16_t *dst, intptr_t stride)
{
    const int shift_1st = 7;
    const int shift_2nd = 12 - (X265_DEPTH - 8);

    ALIGN_VAR_32(int16_t, coef[4 * 4]);
    ALIGN_VAR_32(int16_t, block[4 * 4]);

#define N (4)
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            block[i * N + j] = (int16_t)src[i * N + j];
        }
    }

#undef N

    partialButterflyInverse4(block, coef, shift_1st, 4); // Forward DST BY FAST ALGORITHM, block input, coef output
    partialButterflyInverse4(coef, block, shift_2nd, 4); // Forward DST BY FAST ALGORITHM, coef input, coeff output

    for (int i = 0; i < 4; i++)
    {
        memcpy(&dst[i * stride], &block[i * 4], 4 * sizeof(int16_t));
    }
}

/* Inverse 8x8 DCT. */
void idct8_c(int32_t *src, int16_t *dst, intptr_t stride)
{
    const int shift_1st = 7;
    const int shift_2nd = 12 - (X265_DEPTH - 8);

    ALIGN_VAR_32(int16_t, coef[8 * 8]);
    ALIGN_VAR_32(int16_t, block[8 * 8]);

#define N (8)
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            block[i * N + j] = (int16_t)src[i * N + j];
        }
    }

#undef N

    partialButterflyInverse8(block, coef, shift_1st, 8);
    partialButterflyInverse8(coef, block, shift_2nd, 8);
    for (int i = 0; i < 8; i++)
    {
        memcpy(&dst[i * stride], &block[i * 8], 8 * sizeof(int16_t));
    }
}

/* Inverse 16x16 DCT. */
void idct16_c(int32_t *src, int16_t *dst, intptr_t stride)
{
    const int shift_1st = 7;
    const int shift_2nd = 12 - (X265_DEPTH - 8);

    ALIGN_VAR_32(int16_t, coef[16 * 16]);
    ALIGN_VAR_32(int16_t, block[16 * 16]);

#define N (16)
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            block[i * N + j] = (int16_t)src[i * N + j];
        }
    }

#undef N

    partialButterflyInverse16(block, coef, shift_1st, 16);
    partialButterflyInverse16(coef, block, shift_2nd, 16);
    for (int i = 0; i < 16; i++)
    {
        memcpy(&dst[i * stride], &block[i * 16], 16 * sizeof(int16_t));
    }
}

/* Inverse 32x32 DCT. */
void idct32_c(int32_t *src, int16_t *dst, intptr_t stride)
{
    const int shift_1st = 7;
    const int shift_2nd = 12 - (X265_DEPTH - 8);

    ALIGN_VAR_32(int16_t, coef[32 * 32]);
    ALIGN_VAR_32(int16_t, block[32 * 32]);

#define N (32)
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            block[i * N + j] = (int16_t)src[i * N + j];
        }
    }

#undef N

    partialButterflyInverse32(block, coef, shift_1st, 32);
    partialButterflyInverse32(coef, block, shift_2nd, 32);

    for (int i = 0; i < 32; i++)
    {
        memcpy(&dst[i * stride], &block[i * 32], 32 * sizeof(int16_t));
    }
}
/* Flat (non-scaling-list) dequant: coef[n] = clip16((quantCoef[n] * scale + round) >> shift). */
void dequant_normal_c(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift)
{
#if HIGH_BIT_DEPTH
    X265_CHECK(scale < 32768 || ((scale & 3) == 0 && shift > 2), "dequant invalid scale %d\n", scale);
#else
    // NOTE: maximum of scale is (72 * 256)
    X265_CHECK(scale < 32768, "dequant invalid scale %d\n", scale);
#endif
    X265_CHECK(num <= 32 * 32, "dequant num %d too large\n", num);
    X265_CHECK((num % 8) == 0, "dequant num %d not multiple of 8\n", num);
    X265_CHECK(shift <= 10, "shift too large %d\n", shift);
    X265_CHECK(((intptr_t)coef & 31) == 0, "dequant coef buffer not aligned\n");

    int add, coeffQ;

    add = 1 << (shift - 1);

    for (int n = 0; n < num; n++)
    {
        coeffQ = (quantCoef[n] * scale + add) >> shift;
        coef[n] = Clip3(-32768, 32767, coeffQ);
    }
}

/* Scaling-list dequant: per-coefficient deQuantCoef, with the QP period 'per'
 * either folded into the right shift or applied as a left shift after clipping. */
void dequant_scaling_c(const int16_t* quantCoef, const int32_t *deQuantCoef, int32_t* coef, int num, int per, int shift)
{
    X265_CHECK(num <= 32 * 32, "dequant num %d too large\n", num);

    int add, coeffQ;

    shift += 4;

    if (shift > per)
    {
        add = 1 << (shift - per - 1);

        for (int n = 0; n < num; n++)
        {
            coeffQ = ((quantCoef[n] * deQuantCoef[n]) + add) >> (shift - per);
            coef[n] = Clip3(-32768, 32767, coeffQ);
        }
    }
    else
    {
        for (int n = 0; n < num; n++)
        {
            coeffQ = Clip3(-32768, 32767, quantCoef[n] * deQuantCoef[n]);
            coef[n] = Clip3(-32768, 32767, coeffQ << (per - shift));
        }
    }
}

/* Forward quant with deltaU output (residue used later by rate-distortion
 * optimized quantization sign-hiding). Returns the count of non-zero levels. */
uint32_t quant_c(int32_t* coef, int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)
{
    X265_CHECK(qBits >= 8, "qBits less than 8\n");
    X265_CHECK((numCoeff % 16) == 0, "numCoeff must be multiple of 16\n");
    int qBits8 = qBits - 8;
    uint32_t numSig = 0;

    for (int blockpos = 0; blockpos < numCoeff; blockpos++)
    {
        int level = coef[blockpos];
        int sign = (level < 0 ? -1 : 1);

        int tmplevel = abs(level) * quantCoeff[blockpos];
        level = ((tmplevel + add) >> qBits);
        // quantization residue, scaled down by qBits-8
        deltaU[blockpos] = ((tmplevel - (level << qBits)) >> qBits8);
        if (level)
            ++numSig;
        level *= sign;
        qCoef[blockpos] = (int16_t)Clip3(-32768, 32767, level);
    }

    return numSig;
}

/* Forward quant without deltaU (non-RDOQ path). Returns non-zero count. */
uint32_t nquant_c(int32_t* coef, int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff)
{
    X265_CHECK((numCoeff % 16) == 0, "number of quant coeff is not multiple of 4x4\n");
    X265_CHECK((uint32_t)add < ((uint32_t)1 << qBits), "2 ^ qBits less than add\n");
    X265_CHECK(((intptr_t)quantCoeff & 31) == 0, "quantCoeff buffer not aligned\n");

    uint32_t numSig = 0;

    for (int blockpos = 0; blockpos < numCoeff; blockpos++)
    {
        int level = coef[blockpos];
        int sign = (level < 0 ? -1 : 1);

        int tmplevel = abs(level) * quantCoeff[blockpos];
        level = ((tmplevel + add) >> qBits);
        if (level)
            ++numSig;
        level *= sign;
        qCoef[blockpos] = (int16_t)Clip3(-32768, 32767, level);
    }

    return numSig;
}

/* Count non-zero coefficients in an aligned buffer. */
int count_nonzero_c(const int16_t *quantCoeff, int numCoeff)
{
    X265_CHECK(((intptr_t)quantCoeff & 15) == 0, "quant buffer not aligned\n");
    X265_CHECK(numCoeff > 0 && (numCoeff & 15) == 0, "numCoeff invalid %d\n", numCoeff);

    int count = 0;

    for (int i = 0; i < numCoeff; i++)
    {
        count += quantCoeff[i] != 0;
    }

    return count;
}

/* Copy a trSize x trSize residual block (stride 'stride') into contiguous
 * coeff storage, returning the number of non-zero samples.
 * NOTE(review): the template parameter list was lost in extraction; restored
 * from the copy_count<4>/<8>/<16>/<32> instantiations below. */
template<int trSize>
uint32_t copy_count(int16_t* coeff, int16_t* residual, intptr_t stride)
{
    uint32_t numSig = 0;
    for (int k = 0; k < trSize; k++)
    {
        for (int j = 0; j < trSize; j++)
        {
            coeff[k * trSize + j] = residual[k * stride + j];
            numSig += (residual[k * stride + j] != 0);
        }
    }

    return numSig;
}

/* Noise-reduction denoise: accumulate |level| into resSum, subtract offset,
 * clamp at zero, and restore the original sign. */
void denoiseDct_c(int32_t* dctCoef, uint32_t* resSum, uint16_t* offset, int numCoeff)
{
    for (int i = 0; i < numCoeff; i++)
    {
        int level = dctCoef[i];
        int sign = level >> 31;             // all-ones for negative, 0 otherwise
        level = (level + sign) ^ sign;      // branchless abs()
        resSum[i] += level;
        level -= offset[i];
        dctCoef[i] = level < 0 ? 0 : (level ^ sign) - sign; // re-apply sign
    }
}

} // closing - anonymous file-static namespace

namespace x265 {
// x265 private namespace

/* Install the C reference implementations into the primitives table; these may
 * later be overridden by optimized (assembly) versions. */
void Setup_C_DCTPrimitives(EncoderPrimitives& p)
{
    p.dequant_scaling = dequant_scaling_c;
    p.dequant_normal = dequant_normal_c;
    p.quant = quant_c;
    p.nquant = nquant_c;
    p.dct[DST_4x4] = dst4_c;
    p.dct[DCT_4x4] = dct4_c;
    p.dct[DCT_8x8] = dct8_c;
    p.dct[DCT_16x16] = dct16_c;
    p.dct[DCT_32x32] = dct32_c;
    p.idct[IDST_4x4] = idst4_c;
    p.idct[IDCT_4x4] = idct4_c;
    p.idct[IDCT_8x8] = idct8_c;
    p.idct[IDCT_16x16] = idct16_c;
    p.idct[IDCT_32x32] = idct32_c;
    p.count_nonzero = count_nonzero_c;
    p.denoiseDct = denoiseDct_c;

    p.copy_cnt[BLOCK_4x4] = copy_count<4>;
    p.copy_cnt[BLOCK_8x8] = copy_count<8>;
    p.copy_cnt[BLOCK_16x16] = copy_count<16>;
    p.copy_cnt[BLOCK_32x32] = copy_count<32>;
}
}
#include "common.h"
#include "deblock.h"
#include "framedata.h"
#include "picyuv.h"
#include "slice.h"
#include "mv.h"

using namespace x265;

#define DEBLOCK_SMALLEST_BLOCK 8
#define DEFAULT_INTRA_TC_OFFSET 2

/* Entry point: zero the per-partition boundary-strength map and recursively
 * deblock the CTU in the given direction (EDGE_VER or EDGE_HOR). */
void Deblock::deblockCTU(CUData* cu, int32_t dir)
{
    uint8_t blockingStrength[MAX_NUM_PARTITIONS];

    memset(blockingStrength, 0, sizeof(uint8_t) * m_numPartitions);

    deblockCU(cu, 0, 0, dir, blockingStrength);
}

/* Deblocking filter process in CU-based (the same function as conventional's)
 * param Edge the direction of the edge in block boundary (horizontal/vertical), which is added newly */
void Deblock::deblockCU(CUData* cu, uint32_t absPartIdx, uint32_t depth, const int32_t dir, uint8_t blockingStrength[])
{
    if (cu->m_partSize[absPartIdx] == SIZE_NONE)
        return;

    uint32_t curNumParts = NUM_CU_PARTITIONS >> (depth << 1);

    const SPS& sps = *cu->m_slice->m_sps;

    // Recurse into sub-CUs (skipping those entirely outside the picture).
    if (cu->m_cuDepth[absPartIdx] > depth)
    {
        uint32_t qNumParts = curNumParts >> 2;
        uint32_t xmax = sps.picWidthInLumaSamples - cu->m_cuPelX;
        uint32_t ymax = sps.picHeightInLumaSamples - cu->m_cuPelY;
        for (uint32_t partIdx = 0; partIdx < 4; partIdx++, absPartIdx += qNumParts)
            if (g_zscanToPelX[absPartIdx] < xmax && g_zscanToPelY[absPartIdx] < ymax)
                deblockCU(cu, absPartIdx, depth + 1, dir, blockingStrength);
        return;
    }

    // Mark filterable edges from PU/TU boundaries, then the CU boundary itself.
    const uint32_t widthInBaseUnits = sps.numPartInCUSize >> depth;
    Param params;
    setLoopfilterParam(cu, absPartIdx, &params); // NOTE(review): '&params' restored; extraction garbled it to '¶ms'
    setEdgefilterPU(cu, absPartIdx, dir, blockingStrength, widthInBaseUnits);
    setEdgefilterTU(cu, absPartIdx, depth, dir, blockingStrength);
    setEdgefilterMultiple(cu, absPartIdx, dir, 0, (dir == EDGE_VER ? params.leftEdge : params.topEdge), blockingStrength, widthInBaseUnits);

    // Resolve the actual boundary strength for each marked partition.
    for (uint32_t partIdx = absPartIdx; partIdx < absPartIdx + curNumParts; partIdx++)
    {
        uint32_t bsCheck = !(partIdx & (1 << dir));

        if (bsCheck && blockingStrength[partIdx])
            getBoundaryStrengthSingle(cu, dir, partIdx, blockingStrength);
    }

    // Filter luma every 8 pixels; chroma only where the chroma grid aligns.
    const uint32_t partIdxIncr = DEBLOCK_SMALLEST_BLOCK >> LOG2_UNIT_SIZE;
    uint32_t sizeInPU = sps.numPartInCUSize >> depth;
    uint32_t shiftFactor = (dir == EDGE_VER) ? cu->m_hChromaShift : cu->m_vChromaShift;
    uint32_t chromaMask = ((DEBLOCK_SMALLEST_BLOCK << shiftFactor) >> LOG2_UNIT_SIZE) - 1;
    uint32_t e0 = (dir == EDGE_VER ? g_zscanToPelX[absPartIdx] : g_zscanToPelY[absPartIdx]) >> LOG2_UNIT_SIZE;

    for (uint32_t e = 0; e < sizeInPU; e += partIdxIncr)
    {
        edgeFilterLuma(cu, absPartIdx, depth, dir, e, blockingStrength);
        if (!((e0 + e) & chromaMask))
            edgeFilterChroma(cu, absPartIdx, depth, dir, e, blockingStrength);
    }
}

/* Map (edge, base-unit) coordinates to the z-scan index of the partition whose
 * boundary-strength entry is being addressed; 'dir' swaps row/column roles. */
static inline uint32_t calcBsIdx(CUData* cu, uint32_t absPartIdx, int32_t dir, int32_t edgeIdx, int32_t baseUnitIdx)
{
    uint32_t ctuWidthInBaseUnits = cu->m_slice->m_sps->numPartInCUSize;

    if (dir)
        return g_rasterToZscan[g_zscanToRaster[absPartIdx] + edgeIdx * ctuWidthInBaseUnits + baseUnitIdx];
    else
        return g_rasterToZscan[g_zscanToRaster[absPartIdx] + baseUnitIdx * ctuWidthInBaseUnits + edgeIdx];
}

/* Write 'value' into the strength map along one edge (widthInBaseUnits entries). */
void Deblock::setEdgefilterMultiple(CUData* cu, uint32_t scanIdx, int32_t dir, int32_t edgeIdx, uint8_t value, uint8_t blockingStrength[], uint32_t widthInBaseUnits)
{
    const uint32_t numElem = widthInBaseUnits;
    X265_CHECK(numElem > 0, "numElem edge filter check\n");
    for (uint32_t i = 0; i < numElem; i++)
    {
        const uint32_t bsidx = calcBsIdx(cu, scanIdx, dir, edgeIdx, i);
        blockingStrength[bsidx] = value;
    }
}

/* Recursively mark TU boundaries (strength 2) down to the coded TU depth. */
void Deblock::setEdgefilterTU(CUData* cu, uint32_t absPartIdx, uint32_t depth, int32_t dir, uint8_t blockingStrength[])
{
    if ((uint32_t)cu->m_tuDepth[absPartIdx] + cu->m_cuDepth[absPartIdx] > depth)
    {
        const uint32_t curNumParts = NUM_CU_PARTITIONS >> (depth << 1);
        const uint32_t qNumParts = curNumParts >> 2;

        for (uint32_t partIdx = 0; partIdx < 4; partIdx++, absPartIdx += qNumParts)
            setEdgefilterTU(cu, absPartIdx, depth + 1, dir, blockingStrength);
        return;
    }

    uint32_t widthInBaseUnits = 1 << (cu->m_log2CUSize[absPartIdx] - cu->m_tuDepth[absPartIdx] - LOG2_UNIT_SIZE);
    setEdgefilterMultiple(cu, absPartIdx, dir, 0, 2, blockingStrength, widthInBaseUnits);
}

/* Mark interior PU boundaries (strength 1) according to the partition shape;
 * asymmetric shapes use quarter offsets, symmetric shapes half offsets. */
void Deblock::setEdgefilterPU(CUData* cu, uint32_t absPartIdx, int32_t dir, uint8_t blockingStrength[], uint32_t widthInBaseUnits)
{
    const uint32_t hWidthInBaseUnits = widthInBaseUnits >> 1;
    const uint32_t qWidthInBaseUnits = widthInBaseUnits >> 2;

    switch (cu->m_partSize[absPartIdx])
    {
    case SIZE_2NxN:
        if (EDGE_HOR == dir)
            setEdgefilterMultiple(cu, absPartIdx, dir, hWidthInBaseUnits, 1, blockingStrength, widthInBaseUnits);
        break;
    case SIZE_Nx2N:
        if (EDGE_VER == dir)
            setEdgefilterMultiple(cu, absPartIdx, dir, hWidthInBaseUnits, 1, blockingStrength, widthInBaseUnits);
        break;
    case SIZE_NxN:
        setEdgefilterMultiple(cu, absPartIdx, dir, hWidthInBaseUnits, 1, blockingStrength, widthInBaseUnits);
        break;
    case SIZE_2NxnU:
        if (EDGE_HOR == dir)
            setEdgefilterMultiple(cu, absPartIdx, dir, qWidthInBaseUnits, 1, blockingStrength, widthInBaseUnits);
        break;
    case SIZE_nLx2N:
        if (EDGE_VER == dir)
            setEdgefilterMultiple(cu, absPartIdx, dir, qWidthInBaseUnits, 1, blockingStrength, widthInBaseUnits);
        break;
    case SIZE_2NxnD:
        if (EDGE_HOR == dir)
            setEdgefilterMultiple(cu, absPartIdx, dir, widthInBaseUnits - qWidthInBaseUnits, 1, blockingStrength, widthInBaseUnits);
        break;
    case SIZE_nRx2N:
        if (EDGE_VER == dir)
            setEdgefilterMultiple(cu, absPartIdx, dir, widthInBaseUnits - qWidthInBaseUnits, 1, blockingStrength, widthInBaseUnits);
        break;

    case SIZE_2Nx2N:
    default:
        break;
    }
}
/* Decide whether the CU's left/top edges are filterable: an edge is filtered
 * (value 2) only when a neighboring PU exists, i.e. not at the picture border. */
void Deblock::setLoopfilterParam(CUData* cu, uint32_t absPartIdx, Param *params)
{
    uint32_t x = cu->m_cuPelX + g_zscanToPelX[absPartIdx];
    uint32_t y = cu->m_cuPelY + g_zscanToPelY[absPartIdx];

    const CUData* tempCU;
    uint32_t tempPartIdx;

    if (!x)
        params->leftEdge = 0;
    else
    {
        tempCU = cu->getPULeft(tempPartIdx, absPartIdx);
        if (tempCU)
            params->leftEdge = 2;
        else
            params->leftEdge = 0;
    }

    if (!y)
        params->topEdge = 0;
    else
    {
        tempCU = cu->getPUAbove(tempPartIdx, absPartIdx);
        if (tempCU)
            params->topEdge = 2;
        else
            params->topEdge = 0;
    }
}

/* Compute the final boundary strength for one partition edge:
 * intra on either side -> 2; otherwise 1 if coded coefficients are present or
 * the two sides' references/motion vectors differ enough (>= 1 integer pel). */
void Deblock::getBoundaryStrengthSingle(CUData* cu, int32_t dir, uint32_t absPartIdx, uint8_t blockingStrength[])
{
    const Slice* const slice = cu->m_slice;
    const uint32_t partQ = absPartIdx;
    CUData* const cuQ = cu;

    uint32_t partP;
    const CUData* cuP;
    uint8_t bs = 0;

    // Calculate block index
    if (dir == EDGE_VER)
        cuP = cuQ->getPULeft(partP, partQ);
    else // (dir == EDGE_HOR)
        cuP = cuQ->getPUAbove(partP, partQ);

    // Set BS for Intra MB : BS = 4 or 3 (x265 maps the intra case to bs == 2)
    if (cuP->isIntra(partP) || cuQ->isIntra(partQ))
        bs = 2;

    // Set BS for not Intra MB : BS = 2 or 1 or 0
    if (!cuP->isIntra(partP) && !cuQ->isIntra(partQ))
    {
        uint32_t nsPartQ = partQ;
        uint32_t nsPartP = partP;

        if (blockingStrength[absPartIdx] > 1 &&
            (cuQ->getCbf(nsPartQ, TEXT_LUMA, cuQ->m_tuDepth[nsPartQ]) ||
             cuP->getCbf(nsPartP, TEXT_LUMA, cuP->m_tuDepth[nsPartP])))
            bs = 1;
        else
        {
            if (dir == EDGE_HOR)
                cuP = cuQ->getPUAbove(partP, partQ);

            if (slice->isInterB() || cuP->m_slice->isInterB())
            {
                int32_t refIdx;
                Frame *refP0, *refP1, *refQ0, *refQ1;
                refIdx = cuP->m_refIdx[0][partP];
                refP0 = (refIdx < 0) ? NULL : cuP->m_slice->m_refPicList[0][refIdx];
                refIdx = cuP->m_refIdx[1][partP];
                refP1 = (refIdx < 0) ? NULL : cuP->m_slice->m_refPicList[1][refIdx];
                refIdx = cuQ->m_refIdx[0][partQ];
                refQ0 = (refIdx < 0) ? NULL : slice->m_refPicList[0][refIdx];
                refIdx = cuQ->m_refIdx[1][partQ];
                refQ1 = (refIdx < 0) ? NULL : slice->m_refPicList[1][refIdx];

                MV mvp0 = cuP->m_mv[0][partP];
                MV mvp1 = cuP->m_mv[1][partP];
                MV mvq0 = cuQ->m_mv[0][partQ];
                MV mvq1 = cuQ->m_mv[1][partQ];

                // Unused lists contribute a zero MV so the comparisons below are well-defined.
                if (!refP0) mvp0 = 0;
                if (!refP1) mvp1 = 0;
                if (!refQ0) mvq0 = 0;
                if (!refQ1) mvq1 = 0;

                if (((refP0 == refQ0) && (refP1 == refQ1)) || ((refP0 == refQ1) && (refP1 == refQ0)))
                {
                    if (refP0 != refP1) // Different L0 & L1
                    {
                        if (refP0 == refQ0)
                        {
                            // MV difference threshold: 4 in quarter-pel units == 1 integer pel
                            bs = ((abs(mvq0.x - mvp0.x) >= 4) ||
                                  (abs(mvq0.y - mvp0.y) >= 4) ||
                                  (abs(mvq1.x - mvp1.x) >= 4) ||
                                  (abs(mvq1.y - mvp1.y) >= 4)) ? 1 : 0;
                        }
                        else
                        {
                            bs = ((abs(mvq1.x - mvp0.x) >= 4) ||
                                  (abs(mvq1.y - mvp0.y) >= 4) ||
                                  (abs(mvq0.x - mvp1.x) >= 4) ||
                                  (abs(mvq0.y - mvp1.y) >= 4)) ? 1 : 0;
                        }
                    }
                    else // Same L0 & L1
                    {
                        bs = ((abs(mvq0.x - mvp0.x) >= 4) ||
                              (abs(mvq0.y - mvp0.y) >= 4) ||
                              (abs(mvq1.x - mvp1.x) >= 4) ||
                              (abs(mvq1.y - mvp1.y) >= 4)) &&
                            ((abs(mvq1.x - mvp0.x) >= 4) ||
                             (abs(mvq1.y - mvp0.y) >= 4) ||
                             (abs(mvq0.x - mvp1.x) >= 4) ||
                             (abs(mvq0.y - mvp1.y) >= 4)) ? 1 : 0;
                    }
                }
                else // for all different Ref_Idx
                    bs = 1;
            }
            else // slice->isInterP()
            {
                int32_t refIdx;
                Frame *refp0, *refq0;
                refIdx = cuP->m_refIdx[0][partP];
                refp0 = (refIdx < 0) ? NULL : cuP->m_slice->m_refPicList[0][refIdx];
                refIdx = cuQ->m_refIdx[0][partQ];
                refq0 = (refIdx < 0) ? NULL : slice->m_refPicList[0][refIdx];
                MV mvp0 = cuP->m_mv[0][partP];
                MV mvq0 = cuQ->m_mv[0][partQ];

                if (!refp0) mvp0 = 0;
                if (!refq0) mvq0 = 0;

                bs = ((refp0 != refq0) ||
                      (abs(mvq0.x - mvp0.x) >= 4) ||
                      (abs(mvq0.y - mvp0.y) >= 4)) ? 1 : 0;
            }
        }
    }

    blockingStrength[absPartIdx] = bs;
}

/* Second derivative of the three P-side samples nearest the edge.
 * NOTE(review): '<int32_t>' restored; extraction stripped the template args. */
static inline int32_t calcDP(pixel* src, intptr_t offset)
{
    return abs(static_cast<int32_t>(src[-offset * 3]) - 2 * src[-offset * 2] + src[-offset]);
}

/* Second derivative of the three Q-side samples nearest the edge. */
static inline int32_t calcDQ(pixel* src, intptr_t offset)
{
    return abs(static_cast<int32_t>(src[0]) - 2 * src[offset] + src[offset * 2]);
}

/* Strong/weak filter decision for one line, per the HEVC beta/tc thresholds. */
static inline bool useStrongFiltering(intptr_t offset, int32_t beta, int32_t tc, pixel* src)
{
    int16_t m4 = (int16_t)src[0];
    int16_t m3 = (int16_t)src[-offset];
    int16_t m7 = (int16_t)src[offset * 3];
    int16_t m0 = (int16_t)src[-offset * 4];
    int32_t strong = abs(m0 - m3) + abs(m7 - m4);

    return (strong < (beta >> 3)) && (abs(m3 - m4) < ((tc * 5 + 1) >> 1));
}
/* Deblocking for the luminance component with strong or weak filter
 * \param src            pointer to picture data
 * \param offset         offset value for picture data
 * \param tc             tc value
 * \param partPNoFilter  indicator to disable filtering on partP
 * \param partQNoFilter  indicator to disable filtering on partQ */
static inline void pelFilterLumaStrong(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, bool partPNoFilter, bool partQNoFilter)
{
    // Strong filter: modifies three samples on each side, changes clamped to +-2*tc.
    for (int32_t i = 0; i < UNIT_SIZE; i++, src += srcStep)
    {
        int16_t m4 = (int16_t)src[0];
        int16_t m3 = (int16_t)src[-offset];
        int16_t m5 = (int16_t)src[offset];
        int16_t m2 = (int16_t)src[-offset * 2];
        int32_t tc2 = 2 * tc;
        if (!partPNoFilter)
        {
            int16_t m1 = (int16_t)src[-offset * 3];
            int16_t m0 = (int16_t)src[-offset * 4];
            src[-offset * 3] = (pixel)(Clip3(-tc2, tc2, ((2 * m0 + 3 * m1 + m2 + m3 + m4 + 4) >> 3) - m1) + m1);
            src[-offset * 2] = (pixel)(Clip3(-tc2, tc2, ((m1 + m2 + m3 + m4 + 2) >> 2) - m2) + m2);
            src[-offset] = (pixel)(Clip3(-tc2, tc2, ((m1 + 2 * m2 + 2 * m3 + 2 * m4 + m5 + 4) >> 3) - m3) + m3);
        }
        if (!partQNoFilter)
        {
            int16_t m6 = (int16_t)src[offset * 2];
            int16_t m7 = (int16_t)src[offset * 3];
            src[0] = (pixel)(Clip3(-tc2, tc2, ((m2 + 2 * m3 + 2 * m4 + 2 * m5 + m6 + 4) >> 3) - m4) + m4);
            src[offset] = (pixel)(Clip3(-tc2, tc2, ((m3 + m4 + m5 + m6 + 2) >> 2) - m5) + m5);
            src[offset * 2] = (pixel)(Clip3(-tc2, tc2, ((m3 + m4 + m5 + 3 * m6 + 2 * m7 + 4) >> 3) - m6) + m6);
        }
    }
}

/* Weak filter: adjusts at most two samples per side; a per-line delta above
 * thrCut (10*tc) disables filtering for that line entirely. */
static inline void pelFilterLuma(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, bool partPNoFilter, bool partQNoFilter,
                                 bool filterSecondP, bool filterSecondQ)
{
    int32_t thrCut = tc * 10;

    for (int32_t i = 0; i < UNIT_SIZE; i++, src += srcStep)
    {
        int16_t m4 = (int16_t)src[0];
        int16_t m3 = (int16_t)src[-offset];
        int16_t m5 = (int16_t)src[offset];
        int16_t m2 = (int16_t)src[-offset * 2];

        int32_t delta = (9 * (m4 - m3) - 3 * (m5 - m2) + 8) >> 4;

        if (abs(delta) < thrCut)
        {
            delta = Clip3(-tc, tc, delta);

            int32_t tc2 = tc >> 1;
            if (!partPNoFilter)
            {
                src[-offset] = Clip(m3 + delta);
                if (filterSecondP)
                {
                    int16_t m1 = (int16_t)src[-offset * 3];
                    int32_t delta1 = Clip3(-tc2, tc2, ((((m1 + m3 + 1) >> 1) - m2 + delta) >> 1));
                    src[-offset * 2] = Clip(m2 + delta1);
                }
            }
            if (!partQNoFilter)
            {
                src[0] = Clip(m4 - delta);
                if (filterSecondQ)
                {
                    int16_t m6 = (int16_t)src[offset * 2];
                    int32_t delta2 = Clip3(-tc2, tc2, ((((m6 + m4 + 1) >> 1) - m5 - delta) >> 1));
                    src[offset] = Clip(m5 + delta2);
                }
            }
        }
    }
}

/* Deblocking of one line/column for the chrominance component
 * \param src            pointer to picture data
 * \param offset         offset value for picture data
 * \param tc             tc value
 * \param partPNoFilter  indicator to disable filtering on partP
 * \param partQNoFilter  indicator to disable filtering on partQ */
static inline void pelFilterChroma(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, bool partPNoFilter, bool partQNoFilter)
{
    // Chroma filter: one sample per side, single clipped delta.
    for (int32_t i = 0; i < UNIT_SIZE; i++, src += srcStep)
    {
        int16_t m4 = (int16_t)src[0];
        int16_t m3 = (int16_t)src[-offset];
        int16_t m5 = (int16_t)src[offset];
        int16_t m2 = (int16_t)src[-offset * 2];

        int32_t delta = Clip3(-tc, tc, ((((m4 - m3) << 2) + m2 - m5 + 4) >> 3));
        if (!partPNoFilter)
            src[-offset] = Clip(m3 + delta);
        if (!partQNoFilter)
            src[0] = Clip(m4 - delta);
    }
}
/* Filter one luma edge of a CU: derive QP-based beta/tc thresholds per 4-sample
 * unit, measure local activity (dp/dq), and apply the strong or weak filter. */
void Deblock::edgeFilterLuma(CUData* cu, uint32_t absPartIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockingStrength[])
{
    PicYuv* reconYuv = cu->m_encData->m_reconPicYuv;
    pixel* src = reconYuv->getLumaAddr(cu->m_cuAddr, absPartIdx);

    intptr_t stride = reconYuv->m_stride;
    uint32_t numParts = cu->m_slice->m_sps->numPartInCUSize >> depth;

    intptr_t offset, srcStep;

    bool partPNoFilter = false;
    bool partQNoFilter = false;
    uint32_t partP = 0;
    uint32_t partQ = 0;
    const CUData* cuP = cu;
    const CUData* cuQ = cu;
    int32_t betaOffset = cuQ->m_slice->m_pps->deblockingFilterBetaOffsetDiv2 << 1;
    int32_t tcOffset = cuQ->m_slice->m_pps->deblockingFilterTcOffsetDiv2 << 1;

    // 'offset' steps across the edge; 'srcStep' walks along it.
    if (dir == EDGE_VER)
    {
        offset = 1;
        srcStep = stride;
        src += (edge << LOG2_UNIT_SIZE);
    }
    else // (dir == EDGE_HOR)
    {
        offset = stride;
        srcStep = 1;
        src += (edge << LOG2_UNIT_SIZE) * stride;
    }

    for (uint32_t idx = 0; idx < numParts; idx++)
    {
        uint32_t unitOffset = idx << LOG2_UNIT_SIZE;
        uint32_t bsAbsIdx = calcBsIdx(cu, absPartIdx, dir, edge, idx);
        uint32_t bs = blockingStrength[bsAbsIdx];
        if (bs)
        {
            int32_t qpQ = cu->m_qp[bsAbsIdx];
            partQ = bsAbsIdx;

            // Derive neighboring PU index
            if (dir == EDGE_VER)
                cuP = cuQ->getPULeft(partP, partQ);
            else // (dir == EDGE_HOR)
                cuP = cuQ->getPUAbove(partP, partQ);

            int32_t qpP = cuP->m_qp[partP];
            int32_t qp = (qpP + qpQ + 1) >> 1; // average QP across the edge

            int32_t indexB = Clip3(0, QP_MAX_SPEC, qp + betaOffset);

            const int32_t bitdepthShift = X265_DEPTH - 8;
            int32_t beta = s_betaTable[indexB] << bitdepthShift;

            // Activity on lines 0 and 3 of the 4-line unit decides filtering.
            int32_t dp0 = calcDP(src + srcStep * (unitOffset + 0), offset);
            int32_t dq0 = calcDQ(src + srcStep * (unitOffset + 0), offset);
            int32_t dp3 = calcDP(src + srcStep * (unitOffset + 3), offset);
            int32_t dq3 = calcDQ(src + srcStep * (unitOffset + 3), offset);
            int32_t d0 = dp0 + dq0;
            int32_t d3 = dp3 + dq3;

            int32_t d = d0 + d3;

            if (d < beta)
            {
                if (cu->m_slice->m_pps->bTransquantBypassEnabled)
                {
                    // check if each of PUs is lossless coded
                    partPNoFilter = !!cuP->m_tqBypass[partP];
                    partQNoFilter = !!cuQ->m_tqBypass[partQ];
                }

                int32_t indexTC = Clip3(0, QP_MAX_SPEC + DEFAULT_INTRA_TC_OFFSET, int32_t(qp + DEFAULT_INTRA_TC_OFFSET * (bs - 1) + tcOffset));
                int32_t tc = s_tcTable[indexTC] << bitdepthShift;

                bool sw = (2 * d0 < (beta >> 2) &&
                           2 * d3 < (beta >> 2) &&
                           useStrongFiltering(offset, beta, tc, src + srcStep * (unitOffset + 0)) &&
                           useStrongFiltering(offset, beta, tc, src + srcStep * (unitOffset + 3)));

                if (sw)
                    pelFilterLumaStrong(src + srcStep * unitOffset, srcStep, offset, tc, partPNoFilter, partQNoFilter);
                else
                {
                    int32_t sideThreshold = (beta + (beta >> 1)) >> 3;
                    int32_t dp = dp0 + dp3;
                    int32_t dq = dq0 + dq3;
                    bool filterP = (dp < sideThreshold);
                    bool filterQ = (dq < sideThreshold);

                    pelFilterLuma(src + srcStep * unitOffset, srcStep, offset, tc, partPNoFilter, partQNoFilter, filterP, filterQ);
                }
            }
        }
    }
}

/* Filter one chroma edge (Cb and Cr): only bs > 1 (intra) edges are filtered,
 * with chroma-specific QP mapping and tc derivation. */
void Deblock::edgeFilterChroma(CUData* cu, uint32_t absPartIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockingStrength[])
{
    int32_t chFmt = cu->m_chromaFormat, chromaShift;
    intptr_t offset, srcStep;

    bool partPNoFilter = false;
    bool partQNoFilter = false;
    uint32_t partP;
    uint32_t partQ;
    const CUData* cuP;
    const CUData* cuQ = cu;
    int32_t tcOffset = cu->m_slice->m_pps->deblockingFilterTcOffsetDiv2 << 1;

    X265_CHECK(((dir == EDGE_VER)
                ? ((g_zscanToPelX[absPartIdx] + edge * UNIT_SIZE) >> cu->m_hChromaShift)
                : ((g_zscanToPelY[absPartIdx] + edge * UNIT_SIZE) >> cu->m_vChromaShift)) % DEBLOCK_SMALLEST_BLOCK == 0,
               "invalid edge\n");

    PicYuv* reconPic = cu->m_encData->m_reconPicYuv;
    intptr_t stride = reconPic->m_strideC;
    intptr_t srcOffset = reconPic->getChromaAddrOffset(cu->m_cuAddr, absPartIdx);

    if (dir == EDGE_VER)
    {
        chromaShift = cu->m_vChromaShift;
        srcOffset += (edge << (LOG2_UNIT_SIZE - cu->m_hChromaShift));
        offset = 1;
        srcStep = stride;
    }
    else // (dir == EDGE_HOR)
    {
        chromaShift = cu->m_hChromaShift;
        srcOffset += edge * stride << (LOG2_UNIT_SIZE - cu->m_vChromaShift);
        offset = stride;
        srcStep = 1;
    }

    pixel* srcChroma[2];
    srcChroma[0] = reconPic->m_picOrg[1] + srcOffset; // Cb
    srcChroma[1] = reconPic->m_picOrg[2] + srcOffset; // Cr

    uint32_t numUnits = cu->m_slice->m_sps->numPartInCUSize >> (depth + chromaShift);

    for (uint32_t idx = 0; idx < numUnits; idx++)
    {
        uint32_t unitOffset = idx << LOG2_UNIT_SIZE;
        uint32_t bsAbsIdx = calcBsIdx(cu, absPartIdx, dir, edge, idx << chromaShift);
        uint32_t bs = blockingStrength[bsAbsIdx];

        if (bs > 1) // chroma is only filtered across intra boundaries
        {
            int32_t qpQ = cu->m_qp[bsAbsIdx];
            partQ = bsAbsIdx;

            // Derive neighboring PU index
            if (dir == EDGE_VER)
                cuP = cuQ->getPULeft(partP, partQ);
            else // (dir == EDGE_HOR)
                cuP = cuQ->getPUAbove(partP, partQ);

            int32_t qpP = cuP->m_qp[partP];

            if (cu->m_slice->m_pps->bTransquantBypassEnabled)
            {
                // check if each of PUs is lossless coded
                partPNoFilter = !!cuP->m_tqBypass[partP];
                partQNoFilter = !!cuQ->m_tqBypass[partQ];
            }

            for (uint32_t chromaIdx = 0; chromaIdx < 2; chromaIdx++)
            {
                int32_t chromaQPOffset = !chromaIdx ? cu->m_slice->m_pps->chromaCbQpOffset : cu->m_slice->m_pps->chromaCrQpOffset;
                int32_t qp = ((qpP + qpQ + 1) >> 1) + chromaQPOffset;
                if (qp >= 30)
                {
                    // 4:2:0 uses the luma-to-chroma QP mapping table; other formats just cap at 51.
                    if (chFmt == X265_CSP_I420)
                        qp = g_chromaScale[qp];
                    else
                        qp = X265_MIN(qp, 51);
                }

                int32_t indexTC = Clip3(0, QP_MAX_SPEC + DEFAULT_INTRA_TC_OFFSET, int32_t(qp + DEFAULT_INTRA_TC_OFFSET + tcOffset));
                const int32_t bitdepthShift = X265_DEPTH - 8;
                int32_t tc = s_tcTable[indexTC] << bitdepthShift;
                pixel* srcC = srcChroma[chromaIdx];

                pelFilterChroma(srcC + srcStep * unitOffset, srcStep, offset, tc, partPNoFilter, partQNoFilter);
            }
        }
    }
}

/* HEVC tc threshold table, indexed by clipped QP (+ intra offset). */
const uint8_t Deblock::s_tcTable[54] =
{
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,
    2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 6, 6, 7, 8, 9, 10, 11, 13, 14, 16, 18, 20, 22, 24
};

/* HEVC beta threshold table, indexed by clipped QP. */
const uint8_t Deblock::s_betaTable[52] =
{
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
    18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64
};
+* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software +* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +* +* This program is also available under a commercial proprietary license. +* For more information, contact us at license @ x265.com. +*****************************************************************************/ + +#ifndef X265_DEBLOCK_H +#define X265_DEBLOCK_H + +#include "common.h" + +namespace x265 { +// private namespace + +class CUData; + +class Deblock +{ +public: + enum { EDGE_VER, EDGE_HOR }; + + uint32_t m_numPartitions; + + Deblock() : m_numPartitions(0) {} + + void init() { m_numPartitions = 1 << (g_maxFullDepth * 2); } + + void deblockCTU(CUData* cu, int32_t dir); + +protected: + + // CU-level deblocking function + void deblockCU(CUData* cu, uint32_t absZOrderIdx, uint32_t depth, const int32_t Edge, uint8_t blockingStrength[]); + + struct Param + { + uint8_t leftEdge; + uint8_t topEdge; + }; + + // set filtering functions + void setLoopfilterParam(CUData* cu, uint32_t absZOrderIdx, Param *params); + void setEdgefilterTU(CUData* cu, uint32_t absZOrderIdx, uint32_t depth, int32_t dir, uint8_t blockingStrength[]); + void setEdgefilterPU(CUData* cu, uint32_t absZOrderIdx, int32_t dir, uint8_t blockingStrength[], uint32_t widthInBaseUnits); + void setEdgefilterMultiple(CUData* cu, uint32_t absZOrderIdx, int32_t dir, int32_t edgeIdx, uint8_t value, uint8_t blockingStrength[], uint32_t widthInBaseUnits); + + // get filtering functions + void getBoundaryStrengthSingle(CUData* cu, int32_t dir, uint32_t partIdx, uint8_t blockingStrength[]); + + // filter luma/chroma functions + void edgeFilterLuma(CUData* cu, uint32_t absZOrderIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockingStrength[]); + void edgeFilterChroma(CUData* cu, uint32_t absZOrderIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockingStrength[]); + + 
static const uint8_t s_tcTable[54]; + static const uint8_t s_betaTable[52]; +}; +} +#endif // ifndef X265_DEBLOCK_H diff --git a/source/common/frame.cpp b/source/common/frame.cpp new file mode 100644 index 0000000..8ae912f --- /dev/null +++ b/source/common/frame.cpp @@ -0,0 +1,101 @@ +/***************************************************************************** +* Copyright (C) 2013 x265 project +* +* Author: Steve Borho +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation; either version 2 of the License, or +* (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software +* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +* +* This program is also available under a commercial proprietary license. +* For more information, contact us at license @ x265.com. 
+*****************************************************************************/ + +#include "common.h" +#include "frame.h" +#include "picyuv.h" +#include "framedata.h" + +using namespace x265; + +Frame::Frame() +{ + m_bChromaExtended = false; + m_reconRowCount.set(0); + m_countRefEncoders = 0; + m_encData = NULL; + m_reconPicYuv = NULL; + m_next = NULL; + m_prev = NULL; + memset(&m_lowres, 0, sizeof(m_lowres)); +} + +bool Frame::create(x265_param *param) +{ + m_origPicYuv = new PicYuv; + + return m_origPicYuv->create(param->sourceWidth, param->sourceHeight, param->internalCsp) && + m_lowres.create(m_origPicYuv, param->bframes, !!param->rc.aqMode); +} + +bool Frame::allocEncodeData(x265_param *param, const SPS& sps) +{ + m_encData = new FrameData; + m_reconPicYuv = new PicYuv; + m_encData->m_reconPicYuv = m_reconPicYuv; + bool ok = m_encData->create(param, sps) && m_reconPicYuv->create(param->sourceWidth, param->sourceHeight, param->internalCsp); + if (ok) + { + /* initialize right border of m_reconpicYuv as SAO may read beyond the + * end of the picture accessing uninitialized pixels */ + int maxHeight = sps.numCuInHeight * g_maxCUSize; + memset(m_reconPicYuv->m_picOrg[0], 0, m_reconPicYuv->m_stride * maxHeight); + memset(m_reconPicYuv->m_picOrg[1], 0, m_reconPicYuv->m_strideC * (maxHeight >> m_reconPicYuv->m_vChromaShift)); + memset(m_reconPicYuv->m_picOrg[2], 0, m_reconPicYuv->m_strideC * (maxHeight >> m_reconPicYuv->m_vChromaShift)); + } + return ok; +} + +/* prepare to re-use a FrameData instance to encode a new picture */ +void Frame::reinit(const SPS& sps) +{ + m_bChromaExtended = false; + m_reconPicYuv = m_encData->m_reconPicYuv; + m_encData->reinit(sps); +} + +void Frame::destroy() +{ + if (m_encData) + { + m_encData->destroy(); + delete m_encData; + m_encData = NULL; + } + + if (m_origPicYuv) + { + m_origPicYuv->destroy(); + delete m_origPicYuv; + m_origPicYuv = NULL; + } + + if (m_reconPicYuv) + { + m_reconPicYuv->destroy(); + delete m_reconPicYuv; + 
m_reconPicYuv = NULL; + } + + m_lowres.destroy(); +} diff --git a/source/common/frame.h b/source/common/frame.h new file mode 100644 index 0000000..0fae62a --- /dev/null +++ b/source/common/frame.h @@ -0,0 +1,79 @@ +/***************************************************************************** +* Copyright (C) 2013 x265 project +* +* Author: Steve Borho +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation; either version 2 of the License, or +* (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software +* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +* +* This program is also available under a commercial proprietary license. +* For more information, contact us at license @ x265.com. 
+*****************************************************************************/ + +#ifndef X265_FRAME_H +#define X265_FRAME_H + +#include "common.h" +#include "lowres.h" +#include "threading.h" + +namespace x265 { +// private namespace + +class FrameData; +class PicYuv; +struct SPS; + +#define IS_REFERENCED(frame) (frame->m_lowres.sliceType != X265_TYPE_B) + +class Frame +{ +public: + + /* These two items will be NULL until the Frame begins to be encoded, at which point + * it will be assigned a FrameData instance, which comes with a reconstructed image PicYuv */ + FrameData* m_encData; + PicYuv* m_reconPicYuv; + + /* Data associated with x265_picture */ + PicYuv* m_origPicYuv; + int m_poc; + int64_t m_pts; // user provided presentation time stamp + int64_t m_reorderedPts; + int64_t m_dts; + int32_t m_forceqp; // Force to use the qp specified in qp file + x265_intra_data* m_intraData; + x265_inter_data* m_interData; + void* m_userData; // user provided pointer passed in with this picture + + Lowres m_lowres; + bool m_bChromaExtended; // orig chroma planes motion extended for weight analysis + + /* Frame Parallelism - notification between FrameEncoders of available motion reference rows */ + ThreadSafeInteger m_reconRowCount; // count of CTU rows completely reconstructed and extended for motion reference + volatile uint32_t m_countRefEncoders; // count of FrameEncoder threads monitoring m_reconRowCount + + Frame* m_next; // PicList doubly linked list pointers + Frame* m_prev; + + Frame(); + + bool create(x265_param *param); + bool allocEncodeData(x265_param *param, const SPS& sps); + void reinit(const SPS& sps); + void destroy(); +}; +} + +#endif // ifndef X265_FRAME_H diff --git a/source/common/framedata.cpp b/source/common/framedata.cpp new file mode 100644 index 0000000..f07ae11 --- /dev/null +++ b/source/common/framedata.cpp @@ -0,0 +1,69 @@ +/***************************************************************************** +* Copyright (C) 2013 x265 project +* +* 
Author: Steve Borho +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation; either version 2 of the License, or +* (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software +* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +* +* This program is also available under a commercial proprietary license. +* For more information, contact us at license @ x265.com. +*****************************************************************************/ + +#include "framedata.h" +#include "picyuv.h" + +using namespace x265; + +FrameData::FrameData() +{ + memset(this, 0, sizeof(*this)); +} + +bool FrameData::create(x265_param *param, const SPS& sps) +{ + m_param = param; + m_slice = new Slice; + m_picCTU = new CUData[sps.numCUsInFrame]; + + m_cuMemPool.create(0, param->internalCsp, sps.numCUsInFrame); + for (uint32_t ctuAddr = 0; ctuAddr < sps.numCUsInFrame; ctuAddr++) + m_picCTU[ctuAddr].initialize(m_cuMemPool, 0, param->internalCsp, ctuAddr); + + CHECKED_MALLOC(m_cuStat, RCStatCU, sps.numCUsInFrame); + CHECKED_MALLOC(m_rowStat, RCStatRow, sps.numCuInHeight); + reinit(sps); + return true; + +fail: + return false; +} + +void FrameData::reinit(const SPS& sps) +{ + memset(m_cuStat, 0, sps.numCUsInFrame * sizeof(*m_cuStat)); + memset(m_rowStat, 0, sps.numCuInHeight * sizeof(*m_rowStat)); +} + +void FrameData::destroy() +{ + delete [] m_picCTU; + delete m_slice; + delete m_saoParam; + + m_cuMemPool.destroy(); + + X265_FREE(m_cuStat); + X265_FREE(m_rowStat); +} diff --git 
a/source/common/framedata.h b/source/common/framedata.h new file mode 100644 index 0000000..f6ea9d4 --- /dev/null +++ b/source/common/framedata.h @@ -0,0 +1,100 @@ +/***************************************************************************** +* Copyright (C) 2013 x265 project +* +* Author: Steve Borho +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation; either version 2 of the License, or +* (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software +* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +* +* This program is also available under a commercial proprietary license. +* For more information, contact us at license @ x265.com. +*****************************************************************************/ + +#ifndef X265_FRAMEDATA_H +#define X265_FRAMEDATA_H + +#include "common.h" +#include "slice.h" +#include "cudata.h" + +namespace x265 { +// private namespace + +class PicYuv; + +/* Per-frame data that is used during encodes and referenced while the picture + * is available for reference. A FrameData instance is attached to a Frame as it + * comes out of the lookahead. Frames which are not being encoded do not have a + * FrameData instance. These instances are re-used once the encoded frame has + * no active references. They hold the Slice instance and the 'official' CTU + * data structures. They are maintained in a free-list pool along together with + * a reconstructed image PicYuv in order to conserve memory. 
*/ +class FrameData +{ +public: + + Slice* m_slice; + SAOParam* m_saoParam; + x265_param* m_param; + + FrameData* m_freeListNext; + PicYuv* m_reconPicYuv; + bool m_bHasReferences; /* used during DPB/RPS updates */ + int m_frameEncoderID; /* the ID of the FrameEncoder encoding this frame */ + + CUDataMemPool m_cuMemPool; + CUData* m_picCTU; + + /* Rate control data used during encode and by references */ + struct RCStatCU + { + uint32_t totalBits; /* total bits to encode this CTU */ + uint32_t vbvCost; /* sum of lowres costs for 16x16 sub-blocks */ + uint32_t intraVbvCost; /* sum of lowres intra costs for 16x16 sub-blocks */ + uint64_t avgCost[4]; /* stores the avg cost of CU's in frame for each depth */ + uint32_t count[4]; /* count and avgCost only used by Analysis at RD0..4 */ + double baseQp; /* Qp of Cu set from RateControl/Vbv (only used by frame encoder) */ + }; + + struct RCStatRow + { + uint32_t numEncodedCUs; /* ctuAddr of last encoded CTU in row */ + uint32_t encodedBits; /* sum of 'totalBits' of encoded CTUs */ + uint32_t satdForVbv; /* sum of lowres (estimated) costs for entire row */ + uint32_t diagSatd; + uint32_t diagIntraSatd; + double diagQp; + double diagQpScale; + double sumQpRc; + double sumQpAq; + }; + + RCStatCU* m_cuStat; + RCStatRow* m_rowStat; + + double m_avgQpRc; /* avg QP as decided by rate-control */ + double m_avgQpAq; /* avg QP as decided by AQ in addition to rate-control */ + double m_rateFactor; /* calculated based on the Frame QP */ + + FrameData(); + + bool create(x265_param *param, const SPS& sps); + void reinit(const SPS& sps); + void destroy(); + + CUData* getPicCTU(uint32_t ctuAddr) { return &m_picCTU[ctuAddr]; } +}; +} + +#endif // ifndef X265_FRAMEDATA_H diff --git a/source/common/intrapred.cpp b/source/common/intrapred.cpp new file mode 100644 index 0000000..f43ec19 --- /dev/null +++ b/source/common/intrapred.cpp @@ -0,0 +1,307 @@ +/***************************************************************************** + * Copyright 
(C) 2013 x265 project + * + * Authors: Min Chen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#include "common.h" +#include "primitives.h" + +using namespace x265; + +namespace { +pixel dcPredValue(pixel* above, pixel* left, intptr_t width) +{ + int w, sum = 0; + pixel pDcVal; + + for (w = 0; w < width; w++) + { + sum += above[w]; + } + + for (w = 0; w < width; w++) + { + sum += left[w]; + } + + pDcVal = (pixel)((sum + width) / (width + width)); + + return pDcVal; +} + +void dcPredFilter(pixel* above, pixel* left, pixel* dst, intptr_t dststride, int size) +{ + // boundary pixels processing + dst[0] = (pixel)((above[0] + left[0] + 2 * dst[0] + 2) >> 2); + + for (int x = 1; x < size; x++) + { + dst[x] = (pixel)((above[x] + 3 * dst[x] + 2) >> 2); + } + + dst += dststride; + for (int y = 1; y < size; y++) + { + *dst = (pixel)((left[y] + 3 * *dst + 2) >> 2); + dst += dststride; + } +} + +template +void intra_pred_dc_c(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int /*dirMode*/, int bFilter) +{ + int k, l; + + pixel dcval = dcPredValue(above + 1, left + 1, width); + + 
for (k = 0; k < width; k++) + { + for (l = 0; l < width; l++) + { + dst[k * dstStride + l] = dcval; + } + } + + if (bFilter) + { + dcPredFilter(above + 1, left + 1, dst, dstStride, width); + } +} + +template +void planar_pred_c(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int /*dirMode*/, int /*bFilter*/) +{ + above += 1; + left += 1; + int k, l; + pixel bottomLeft, topRight; + int horPred; + int32_t leftColumn[MAX_CU_SIZE + 1], topRow[MAX_CU_SIZE + 1]; + // CHECK_ME: dynamic range is 9 bits or 15 bits(I assume max input bit_depth is 14 bits) + int16_t bottomRow[MAX_CU_SIZE], rightColumn[MAX_CU_SIZE]; + const int blkSize = 1 << log2Size; + const int offset2D = blkSize; + const int shift1D = log2Size; + const int shift2D = shift1D + 1; + + // Get left and above reference column and row + for (k = 0; k < blkSize + 1; k++) + { + topRow[k] = above[k]; + leftColumn[k] = left[k]; + } + + // Prepare intermediate variables used in interpolation + bottomLeft = (pixel)leftColumn[blkSize]; + topRight = (pixel)topRow[blkSize]; + for (k = 0; k < blkSize; k++) + { + bottomRow[k] = (int16_t)(bottomLeft - topRow[k]); + rightColumn[k] = (int16_t)(topRight - leftColumn[k]); + topRow[k] <<= shift1D; + leftColumn[k] <<= shift1D; + } + + // Generate prediction signal + for (k = 0; k < blkSize; k++) + { + horPred = leftColumn[k] + offset2D; + for (l = 0; l < blkSize; l++) + { + horPred += rightColumn[k]; + topRow[l] += bottomRow[l]; + dst[k * dstStride + l] = (pixel)((horPred + topRow[l]) >> shift2D); + } + } +} + +template +void intra_pred_ang_c(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +{ + // Map the mode index to main prediction direction and angle + int k, l; + bool modeHor = (dirMode < 18); + bool modeVer = !modeHor; + int intraPredAngle = modeVer ? (int)dirMode - VER_IDX : modeHor ? -((int)dirMode - HOR_IDX) : 0; + int absAng = abs(intraPredAngle); + int signAng = intraPredAngle < 0 ? 
-1 : 1; + + // Set bitshifts and scale the angle parameter to block size + static const int angTable[9] = { 0, 2, 5, 9, 13, 17, 21, 26, 32 }; + static const int invAngTable[9] = { 0, 4096, 1638, 910, 630, 482, 390, 315, 256 }; // (256 * 32) / Angle + int invAngle = invAngTable[absAng]; + + absAng = angTable[absAng]; + intraPredAngle = signAng * absAng; + + // Do angular predictions + { + pixel* refMain; + pixel* refSide; + + // Initialise the Main and Left reference array. + if (intraPredAngle < 0) + { + refMain = (modeVer ? refAbove : refLeft); // + (width - 1); + refSide = (modeVer ? refLeft : refAbove); // + (width - 1); + + // Extend the Main reference to the left. + int invAngleSum = 128; // rounding for (shift by 8) + for (k = -1; k > width * intraPredAngle >> 5; k--) + { + invAngleSum += invAngle; + refMain[k] = refSide[invAngleSum >> 8]; + } + } + else + { + refMain = modeVer ? refAbove : refLeft; + refSide = modeVer ? refLeft : refAbove; + } + + if (intraPredAngle == 0) + { + for (k = 0; k < width; k++) + { + for (l = 0; l < width; l++) + { + dst[k * dstStride + l] = refMain[l + 1]; + } + } + + if (bFilter) + { + for (k = 0; k < width; k++) + { + dst[k * dstStride] = (pixel)Clip3((int16_t)0, (int16_t)((1 << X265_DEPTH) - 1), static_cast((dst[k * dstStride]) + ((refSide[k + 1] - refSide[0]) >> 1))); + } + } + } + else + { + int deltaPos = 0; + int deltaInt; + int deltaFract; + int refMainIndex; + + for (k = 0; k < width; k++) + { + deltaPos += intraPredAngle; + deltaInt = deltaPos >> 5; + deltaFract = deltaPos & (32 - 1); + + if (deltaFract) + { + // Do linear filtering + for (l = 0; l < width; l++) + { + refMainIndex = l + deltaInt + 1; + dst[k * dstStride + l] = (pixel)(((32 - deltaFract) * refMain[refMainIndex] + deltaFract * refMain[refMainIndex + 1] + 16) >> 5); + } + } + else + { + // Just copy the integer samples + for (l = 0; l < width; l++) + { + dst[k * dstStride + l] = refMain[l + deltaInt + 1]; + } + } + } + } + + // Flip the block if this is 
the horizontal mode + if (modeHor) + { + for (k = 0; k < width - 1; k++) + { + for (l = k + 1; l < width; l++) + { + pixel tmp = dst[k * dstStride + l]; + dst[k * dstStride + l] = dst[l * dstStride + k]; + dst[l * dstStride + k] = tmp; + } + } + } + } +} + +template +void all_angs_pred_c(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, int bLuma) +{ + const int size = 1 << log2Size; + for (int mode = 2; mode <= 34; mode++) + { + pixel *left = (g_intraFilterFlags[mode] & size ? left1 : left0); + pixel *above = (g_intraFilterFlags[mode] & size ? above1 : above0); + pixel *out = dest + ((mode - 2) << (log2Size * 2)); + + intra_pred_ang_c(out, size, left, above, mode, bLuma); + + // Optimize code don't flip buffer + bool modeHor = (mode < 18); + + // transpose the block if this is a horizontal mode + if (modeHor) + { + for (int k = 0; k < size - 1; k++) + { + for (int l = k + 1; l < size; l++) + { + pixel tmp = out[k * size + l]; + out[k * size + l] = out[l * size + k]; + out[l * size + k] = tmp; + } + } + } + } +} +} + +namespace x265 { +// x265 private namespace + +void Setup_C_IPredPrimitives(EncoderPrimitives& p) +{ + p.intra_pred[0][BLOCK_4x4] = planar_pred_c<2>; + p.intra_pred[0][BLOCK_8x8] = planar_pred_c<3>; + p.intra_pred[0][BLOCK_16x16] = planar_pred_c<4>; + p.intra_pred[0][BLOCK_32x32] = planar_pred_c<5>; + + // Intra Prediction DC + p.intra_pred[1][BLOCK_4x4] = intra_pred_dc_c<4>; + p.intra_pred[1][BLOCK_8x8] = intra_pred_dc_c<8>; + p.intra_pred[1][BLOCK_16x16] = intra_pred_dc_c<16>; + p.intra_pred[1][BLOCK_32x32] = intra_pred_dc_c<32>; + for (int i = 2; i < NUM_INTRA_MODE; i++) + { + p.intra_pred[i][BLOCK_4x4] = intra_pred_ang_c<4>; + p.intra_pred[i][BLOCK_8x8] = intra_pred_ang_c<8>; + p.intra_pred[i][BLOCK_16x16] = intra_pred_ang_c<16>; + p.intra_pred[i][BLOCK_32x32] = intra_pred_ang_c<32>; + } + + p.intra_pred_allangs[BLOCK_4x4] = all_angs_pred_c<2>; + p.intra_pred_allangs[BLOCK_8x8] = all_angs_pred_c<3>; + 
p.intra_pred_allangs[BLOCK_16x16] = all_angs_pred_c<4>; + p.intra_pred_allangs[BLOCK_32x32] = all_angs_pred_c<5>; +} +} diff --git a/source/common/ipfilter.cpp b/source/common/ipfilter.cpp new file mode 100644 index 0000000..4982cba --- /dev/null +++ b/source/common/ipfilter.cpp @@ -0,0 +1,518 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Deepthi Devaki , + * Rajesh Paulraj + * Praveen Kumar Tiwari + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. 
+ *****************************************************************************/ + +#include "common.h" +#include "primitives.h" +#include "x265.h" + +using namespace x265; + +#if _MSC_VER +#pragma warning(disable: 4127) // conditional expression is constant, typical for templated functions +#endif + +namespace { +template +void filterConvertPelToShort_c(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height) +{ + int shift = IF_INTERNAL_PREC - X265_DEPTH; + int row, col; + + for (row = 0; row < height; row++) + { + for (col = 0; col < width; col++) + { + int16_t val = src[col] << shift; + dst[col] = val - (int16_t)IF_INTERNAL_OFFS; + } + + src += srcStride; + dst += dstStride; + } +} + +void extendCURowColBorder(pixel* txt, intptr_t stride, int width, int height, int marginX) +{ + for (int y = 0; y < height; y++) + { +#if HIGH_BIT_DEPTH + for (int x = 0; x < marginX; x++) + { + txt[-marginX + x] = txt[0]; + txt[width + x] = txt[width - 1]; + } + +#else + ::memset(txt - marginX, txt[0], marginX); + ::memset(txt + width, txt[width - 1], marginX); +#endif + + txt += stride; + } +} + +template +void interp_horiz_pp_c(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +{ + int16_t const * coeff = (N == 4) ? 
g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx]; + int headRoom = IF_FILTER_PREC; + int offset = (1 << (headRoom - 1)); + uint16_t maxVal = (1 << X265_DEPTH) - 1; + int cStride = 1; + + src -= (N / 2 - 1) * cStride; + + int row, col; + for (row = 0; row < height; row++) + { + for (col = 0; col < width; col++) + { + int sum; + + sum = src[col + 0 * cStride] * coeff[0]; + sum += src[col + 1 * cStride] * coeff[1]; + sum += src[col + 2 * cStride] * coeff[2]; + sum += src[col + 3 * cStride] * coeff[3]; + if (N == 8) + { + sum += src[col + 4 * cStride] * coeff[4]; + sum += src[col + 5 * cStride] * coeff[5]; + sum += src[col + 6 * cStride] * coeff[6]; + sum += src[col + 7 * cStride] * coeff[7]; + } + int16_t val = (int16_t)((sum + offset) >> headRoom); + + if (val < 0) val = 0; + if (val > maxVal) val = maxVal; + dst[col] = (pixel)val; + } + + src += srcStride; + dst += dstStride; + } +} + +template +void interp_horiz_ps_c(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +{ + int16_t const * coeff = (N == 4) ? 
g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx]; + int headRoom = IF_INTERNAL_PREC - X265_DEPTH; + int shift = IF_FILTER_PREC - headRoom; + int offset = -IF_INTERNAL_OFFS << shift; + int blkheight = height; + + src -= N / 2 - 1; + + if (isRowExt) + { + src -= (N / 2 - 1) * srcStride; + blkheight += N - 1; + } + + int row, col; + for (row = 0; row < blkheight; row++) + { + for (col = 0; col < width; col++) + { + int sum; + + sum = src[col + 0] * coeff[0]; + sum += src[col + 1] * coeff[1]; + sum += src[col + 2] * coeff[2]; + sum += src[col + 3] * coeff[3]; + if (N == 8) + { + sum += src[col + 4] * coeff[4]; + sum += src[col + 5] * coeff[5]; + sum += src[col + 6] * coeff[6]; + sum += src[col + 7] * coeff[7]; + } + + int16_t val = (int16_t)((sum + offset) >> shift); + dst[col] = val; + } + + src += srcStride; + dst += dstStride; + } +} + +template +void interp_vert_pp_c(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +{ + int16_t const * c = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx]; + int shift = IF_FILTER_PREC; + int offset = 1 << (shift - 1); + uint16_t maxVal = (1 << X265_DEPTH) - 1; + + src -= (N / 2 - 1) * srcStride; + + int row, col; + for (row = 0; row < height; row++) + { + for (col = 0; col < width; col++) + { + int sum; + + sum = src[col + 0 * srcStride] * c[0]; + sum += src[col + 1 * srcStride] * c[1]; + sum += src[col + 2 * srcStride] * c[2]; + sum += src[col + 3 * srcStride] * c[3]; + if (N == 8) + { + sum += src[col + 4 * srcStride] * c[4]; + sum += src[col + 5 * srcStride] * c[5]; + sum += src[col + 6 * srcStride] * c[6]; + sum += src[col + 7 * srcStride] * c[7]; + } + + int16_t val = (int16_t)((sum + offset) >> shift); + val = (val < 0) ? 0 : val; + val = (val > maxVal) ? 
maxVal : val; + + dst[col] = (pixel)val; + } + + src += srcStride; + dst += dstStride; + } +} + +template +void interp_vert_ps_c(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +{ + int16_t const * c = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx]; + int headRoom = IF_INTERNAL_PREC - X265_DEPTH; + int shift = IF_FILTER_PREC - headRoom; + int offset = -IF_INTERNAL_OFFS << shift; + + src -= (N / 2 - 1) * srcStride; + + int row, col; + for (row = 0; row < height; row++) + { + for (col = 0; col < width; col++) + { + int sum; + + sum = src[col + 0 * srcStride] * c[0]; + sum += src[col + 1 * srcStride] * c[1]; + sum += src[col + 2 * srcStride] * c[2]; + sum += src[col + 3 * srcStride] * c[3]; + if (N == 8) + { + sum += src[col + 4 * srcStride] * c[4]; + sum += src[col + 5 * srcStride] * c[5]; + sum += src[col + 6 * srcStride] * c[6]; + sum += src[col + 7 * srcStride] * c[7]; + } + + int16_t val = (int16_t)((sum + offset) >> shift); + dst[col] = val; + } + + src += srcStride; + dst += dstStride; + } +} + +template +void interp_vert_sp_c(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +{ + int headRoom = IF_INTERNAL_PREC - X265_DEPTH; + int shift = IF_FILTER_PREC + headRoom; + int offset = (1 << (shift - 1)) + (IF_INTERNAL_OFFS << IF_FILTER_PREC); + uint16_t maxVal = (1 << X265_DEPTH) - 1; + const int16_t *coeff = (N == 8 ? 
g_lumaFilter[coeffIdx] : g_chromaFilter[coeffIdx]); + + src -= (N / 2 - 1) * srcStride; + + int row, col; + for (row = 0; row < height; row++) + { + for (col = 0; col < width; col++) + { + int sum; + + sum = src[col + 0 * srcStride] * coeff[0]; + sum += src[col + 1 * srcStride] * coeff[1]; + sum += src[col + 2 * srcStride] * coeff[2]; + sum += src[col + 3 * srcStride] * coeff[3]; + if (N == 8) + { + sum += src[col + 4 * srcStride] * coeff[4]; + sum += src[col + 5 * srcStride] * coeff[5]; + sum += src[col + 6 * srcStride] * coeff[6]; + sum += src[col + 7 * srcStride] * coeff[7]; + } + + int16_t val = (int16_t)((sum + offset) >> shift); + + val = (val < 0) ? 0 : val; + val = (val > maxVal) ? maxVal : val; + + dst[col] = (pixel)val; + } + + src += srcStride; + dst += dstStride; + } +} + +template +void interp_vert_ss_c(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +{ + const int16_t *const c = (N == 8 ? g_lumaFilter[coeffIdx] : g_chromaFilter[coeffIdx]); + int shift = IF_FILTER_PREC; + int row, col; + + src -= (N / 2 - 1) * srcStride; + for (row = 0; row < height; row++) + { + for (col = 0; col < width; col++) + { + int sum; + + sum = src[col + 0 * srcStride] * c[0]; + sum += src[col + 1 * srcStride] * c[1]; + sum += src[col + 2 * srcStride] * c[2]; + sum += src[col + 3 * srcStride] * c[3]; + if (N == 8) + { + sum += src[col + 4 * srcStride] * c[4]; + sum += src[col + 5 * srcStride] * c[5]; + sum += src[col + 6 * srcStride] * c[6]; + sum += src[col + 7 * srcStride] * c[7]; + } + + int16_t val = (int16_t)((sum) >> shift); + dst[col] = val; + } + + src += srcStride; + dst += dstStride; + } +} + +template +void filterVertical_sp_c(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int coeffIdx) +{ + int headRoom = IF_INTERNAL_PREC - X265_DEPTH; + int shift = IF_FILTER_PREC + headRoom; + int offset = (1 << (shift - 1)) + (IF_INTERNAL_OFFS << IF_FILTER_PREC); + uint16_t maxVal = (1 << X265_DEPTH) - 
1; + const int16_t *coeff = (N == 8 ? g_lumaFilter[coeffIdx] : g_chromaFilter[coeffIdx]); + + src -= (N / 2 - 1) * srcStride; + + int row, col; + for (row = 0; row < height; row++) + { + for (col = 0; col < width; col++) + { + int sum; + + sum = src[col + 0 * srcStride] * coeff[0]; + sum += src[col + 1 * srcStride] * coeff[1]; + sum += src[col + 2 * srcStride] * coeff[2]; + sum += src[col + 3 * srcStride] * coeff[3]; + if (N == 8) + { + sum += src[col + 4 * srcStride] * coeff[4]; + sum += src[col + 5 * srcStride] * coeff[5]; + sum += src[col + 6 * srcStride] * coeff[6]; + sum += src[col + 7 * srcStride] * coeff[7]; + } + + int16_t val = (int16_t)((sum + offset) >> shift); + + val = (val < 0) ? 0 : val; + val = (val > maxVal) ? maxVal : val; + + dst[col] = (pixel)val; + } + + src += srcStride; + dst += dstStride; + } +} + +template +void interp_hv_pp_c(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY) +{ + short immedVals[(64 + 8) * (64 + 8)]; + + interp_horiz_ps_c(src, srcStride, immedVals, width, idxX, 1); + filterVertical_sp_c(immedVals + 3 * width, width, dst, dstStride, width, height, idxY); +} +} + +namespace x265 { +// x265 private namespace + +#define CHROMA_420(W, H) \ + p.chroma[X265_CSP_I420].filter_hpp[CHROMA_ ## W ## x ## H] = interp_horiz_pp_c<4, W, H>; \ + p.chroma[X265_CSP_I420].filter_hps[CHROMA_ ## W ## x ## H] = interp_horiz_ps_c<4, W, H>; \ + p.chroma[X265_CSP_I420].filter_vpp[CHROMA_ ## W ## x ## H] = interp_vert_pp_c<4, W, H>; \ + p.chroma[X265_CSP_I420].filter_vps[CHROMA_ ## W ## x ## H] = interp_vert_ps_c<4, W, H>; \ + p.chroma[X265_CSP_I420].filter_vsp[CHROMA_ ## W ## x ## H] = interp_vert_sp_c<4, W, H>; \ + p.chroma[X265_CSP_I420].filter_vss[CHROMA_ ## W ## x ## H] = interp_vert_ss_c<4, W, H>; + +#define CHROMA_422(W, H) \ + p.chroma[X265_CSP_I422].filter_hpp[CHROMA422_ ## W ## x ## H] = interp_horiz_pp_c<4, W, H>; \ + p.chroma[X265_CSP_I422].filter_hps[CHROMA422_ ## W ## x ## H] = interp_horiz_ps_c<4, W, 
H>; \ + p.chroma[X265_CSP_I422].filter_vpp[CHROMA422_ ## W ## x ## H] = interp_vert_pp_c<4, W, H>; \ + p.chroma[X265_CSP_I422].filter_vps[CHROMA422_ ## W ## x ## H] = interp_vert_ps_c<4, W, H>; \ + p.chroma[X265_CSP_I422].filter_vsp[CHROMA422_ ## W ## x ## H] = interp_vert_sp_c<4, W, H>; \ + p.chroma[X265_CSP_I422].filter_vss[CHROMA422_ ## W ## x ## H] = interp_vert_ss_c<4, W, H>; + +#define CHROMA_444(W, H) \ + p.chroma[X265_CSP_I444].filter_hpp[LUMA_ ## W ## x ## H] = interp_horiz_pp_c<4, W, H>; \ + p.chroma[X265_CSP_I444].filter_hps[LUMA_ ## W ## x ## H] = interp_horiz_ps_c<4, W, H>; \ + p.chroma[X265_CSP_I444].filter_vpp[LUMA_ ## W ## x ## H] = interp_vert_pp_c<4, W, H>; \ + p.chroma[X265_CSP_I444].filter_vps[LUMA_ ## W ## x ## H] = interp_vert_ps_c<4, W, H>; \ + p.chroma[X265_CSP_I444].filter_vsp[LUMA_ ## W ## x ## H] = interp_vert_sp_c<4, W, H>; \ + p.chroma[X265_CSP_I444].filter_vss[LUMA_ ## W ## x ## H] = interp_vert_ss_c<4, W, H>; + +#define LUMA(W, H) \ + p.luma_hpp[LUMA_ ## W ## x ## H] = interp_horiz_pp_c<8, W, H>; \ + p.luma_hps[LUMA_ ## W ## x ## H] = interp_horiz_ps_c<8, W, H>; \ + p.luma_vpp[LUMA_ ## W ## x ## H] = interp_vert_pp_c<8, W, H>; \ + p.luma_vps[LUMA_ ## W ## x ## H] = interp_vert_ps_c<8, W, H>; \ + p.luma_vsp[LUMA_ ## W ## x ## H] = interp_vert_sp_c<8, W, H>; \ + p.luma_vss[LUMA_ ## W ## x ## H] = interp_vert_ss_c<8, W, H>; \ + p.luma_hvpp[LUMA_ ## W ## x ## H] = interp_hv_pp_c<8, W, H>; + +void Setup_C_IPFilterPrimitives(EncoderPrimitives& p) +{ + LUMA(4, 4); + LUMA(8, 8); + CHROMA_420(4, 4); + LUMA(4, 8); + CHROMA_420(2, 4); + LUMA(8, 4); + CHROMA_420(4, 2); + LUMA(16, 16); + CHROMA_420(8, 8); + LUMA(16, 8); + CHROMA_420(8, 4); + LUMA(8, 16); + CHROMA_420(4, 8); + LUMA(16, 12); + CHROMA_420(8, 6); + LUMA(12, 16); + CHROMA_420(6, 8); + LUMA(16, 4); + CHROMA_420(8, 2); + LUMA(4, 16); + CHROMA_420(2, 8); + LUMA(32, 32); + CHROMA_420(16, 16); + LUMA(32, 16); + CHROMA_420(16, 8); + LUMA(16, 32); + CHROMA_420(8, 16); + LUMA(32, 24); + 
CHROMA_420(16, 12); + LUMA(24, 32); + CHROMA_420(12, 16); + LUMA(32, 8); + CHROMA_420(16, 4); + LUMA(8, 32); + CHROMA_420(4, 16); + LUMA(64, 64); + CHROMA_420(32, 32); + LUMA(64, 32); + CHROMA_420(32, 16); + LUMA(32, 64); + CHROMA_420(16, 32); + LUMA(64, 48); + CHROMA_420(32, 24); + LUMA(48, 64); + CHROMA_420(24, 32); + LUMA(64, 16); + CHROMA_420(32, 8); + LUMA(16, 64); + CHROMA_420(8, 32); + + CHROMA_422(4, 8); + CHROMA_422(4, 4); + CHROMA_422(2, 8); + CHROMA_422(8, 16); + CHROMA_422(8, 8); + CHROMA_422(4, 16); + CHROMA_422(8, 12); + CHROMA_422(6, 16); + CHROMA_422(8, 4); + CHROMA_422(2, 16); + CHROMA_422(16, 32); + CHROMA_422(16, 16); + CHROMA_422(8, 32); + CHROMA_422(16, 24); + CHROMA_422(12, 32); + CHROMA_422(16, 8); + CHROMA_422(4, 32); + CHROMA_422(32, 64); + CHROMA_422(32, 32); + CHROMA_422(16, 64); + CHROMA_422(32, 48); + CHROMA_422(24, 64); + CHROMA_422(32, 16); + CHROMA_422(8, 64); + + CHROMA_444(4, 4); + CHROMA_444(8, 8); + CHROMA_444(4, 8); + CHROMA_444(8, 4); + CHROMA_444(16, 16); + CHROMA_444(16, 8); + CHROMA_444(8, 16); + CHROMA_444(16, 12); + CHROMA_444(12, 16); + CHROMA_444(16, 4); + CHROMA_444(4, 16); + CHROMA_444(32, 32); + CHROMA_444(32, 16); + CHROMA_444(16, 32); + CHROMA_444(32, 24); + CHROMA_444(24, 32); + CHROMA_444(32, 8); + CHROMA_444(8, 32); + CHROMA_444(64, 64); + CHROMA_444(64, 32); + CHROMA_444(32, 64); + CHROMA_444(64, 48); + CHROMA_444(48, 64); + CHROMA_444(64, 16); + CHROMA_444(16, 64); + p.luma_p2s = filterConvertPelToShort_c; + + p.chroma_p2s[X265_CSP_I444] = filterConvertPelToShort_c; + p.chroma_p2s[X265_CSP_I420] = filterConvertPelToShort_c; + p.chroma_p2s[X265_CSP_I422] = filterConvertPelToShort_c; + + p.extendRowBorder = extendCURowColBorder; +} +} diff --git a/source/common/loopfilter.cpp b/source/common/loopfilter.cpp new file mode 100644 index 0000000..58a28c7 --- /dev/null +++ b/source/common/loopfilter.cpp @@ -0,0 +1,53 @@ +/***************************************************************************** +* Copyright (C) 
2013 x265 project +* +* Authors: Praveen Kumar Tiwari +* Dnyaneshwar Gorade +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation; either version 2 of the License, or +* (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software +* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +* +* This program is also available under a commercial proprietary license. +* For more information, contact us at license @ x265.com. +*****************************************************************************/ + +#include "common.h" +#include "primitives.h" + +#define PIXEL_MIN 0 +#define PIXEL_MAX ((1 << X265_DEPTH) - 1) + +void processSaoCUE0(pixel * rec, int8_t * offsetEo, int width, int8_t signLeft) +{ + int x; + int8_t signRight; + int8_t edgeType; + + for (x = 0; x < width; x++) + { + signRight = ((rec[x] - rec[x + 1]) < 0) ? -1 : ((rec[x] - rec[x + 1]) > 0) ? 1 : 0; + edgeType = signRight + signLeft + 2; + signLeft = -signRight; + + short v = rec[x] + offsetEo[edgeType]; + rec[x] = (pixel)(v < 0 ? 0 : (v > (PIXEL_MAX)) ? 
(PIXEL_MAX) : v); + } +} + +namespace x265 { +void Setup_C_LoopFilterPrimitives(EncoderPrimitives &p) +{ + p.saoCuOrgE0 = processSaoCUE0; +} +} diff --git a/source/common/lowres.cpp b/source/common/lowres.cpp new file mode 100644 index 0000000..fe4f7b9 --- /dev/null +++ b/source/common/lowres.cpp @@ -0,0 +1,168 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Gopu Govindaswamy + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. 
+ *****************************************************************************/ + +#include "picyuv.h" +#include "lowres.h" +#include "mv.h" + +using namespace x265; + +bool Lowres::create(PicYuv *origPic, int _bframes, bool bAQEnabled) +{ + isLowres = true; + bframes = _bframes; + width = origPic->m_picWidth / 2; + lines = origPic->m_picHeight / 2; + lumaStride = width + 2 * origPic->m_lumaMarginX; + if (lumaStride & 31) + lumaStride += 32 - (lumaStride & 31); + int cuWidth = (width + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS; + int cuHeight = (lines + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS; + int cuCount = cuWidth * cuHeight; + + /* rounding the width to multiple of lowres CU size */ + width = cuWidth * X265_LOWRES_CU_SIZE; + lines = cuHeight * X265_LOWRES_CU_SIZE; + + size_t planesize = lumaStride * (lines + 2 * origPic->m_lumaMarginY); + size_t padoffset = lumaStride * origPic->m_lumaMarginY + origPic->m_lumaMarginX; + + if (bAQEnabled) + { + CHECKED_MALLOC(qpAqOffset, double, cuCount); + CHECKED_MALLOC(invQscaleFactor, int, cuCount); + CHECKED_MALLOC(qpCuTreeOffset, double, cuCount); + } + CHECKED_MALLOC(propagateCost, uint16_t, cuCount); + + /* allocate lowres buffers */ + for (int i = 0; i < 4; i++) + { + CHECKED_MALLOC(buffer[i], pixel, planesize); + /* initialize the whole buffer to prevent valgrind warnings on right edge */ + memset(buffer[i], 0, sizeof(pixel) * planesize); + } + + lowresPlane[0] = buffer[0] + padoffset; + lowresPlane[1] = buffer[1] + padoffset; + lowresPlane[2] = buffer[2] + padoffset; + lowresPlane[3] = buffer[3] + padoffset; + + CHECKED_MALLOC(intraCost, int32_t, cuCount); + + for (int i = 0; i < bframes + 2; i++) + { + for (int j = 0; j < bframes + 2; j++) + { + CHECKED_MALLOC(rowSatds[i][j], int32_t, cuHeight); + CHECKED_MALLOC(lowresCosts[i][j], uint16_t, cuCount); + } + } + + for (int i = 0; i < bframes + 1; i++) + { + CHECKED_MALLOC(lowresMvs[0][i], MV, cuCount); + CHECKED_MALLOC(lowresMvs[1][i], MV, cuCount); + 
CHECKED_MALLOC(lowresMvCosts[0][i], int32_t, cuCount); + CHECKED_MALLOC(lowresMvCosts[1][i], int32_t, cuCount); + } + + return true; + +fail: + return false; +} + +void Lowres::destroy() +{ + for (int i = 0; i < 4; i++) + X265_FREE(buffer[i]); + + X265_FREE(intraCost); + + for (int i = 0; i < bframes + 2; i++) + { + for (int j = 0; j < bframes + 2; j++) + { + X265_FREE(rowSatds[i][j]); + X265_FREE(lowresCosts[i][j]); + } + } + + for (int i = 0; i < bframes + 1; i++) + { + X265_FREE(lowresMvs[0][i]); + X265_FREE(lowresMvs[1][i]); + X265_FREE(lowresMvCosts[0][i]); + X265_FREE(lowresMvCosts[1][i]); + } + + X265_FREE(qpAqOffset); + X265_FREE(invQscaleFactor); + X265_FREE(qpCuTreeOffset); + X265_FREE(propagateCost); +} + +// (re) initialize lowres state +void Lowres::init(PicYuv *origPic, int poc, int type) +{ + bIntraCalculated = false; + bLastMiniGopBFrame = false; + bScenecut = true; // could be a scene-cut, until ruled out by flash detection + bKeyframe = false; // Not a keyframe unless identified by lookahead + sliceType = type; + frameNum = poc; + leadingBframes = 0; + indB = 0; + satdCost = (int64_t)-1; + memset(costEst, -1, sizeof(costEst)); + memset(weightedCostDelta, 0, sizeof(weightedCostDelta)); + + if (qpAqOffset && invQscaleFactor) + memset(costEstAq, -1, sizeof(costEstAq)); + + for (int y = 0; y < bframes + 2; y++) + for (int x = 0; x < bframes + 2; x++) + rowSatds[y][x][0] = -1; + + for (int i = 0; i < bframes + 1; i++) + { + lowresMvs[0][i][0].x = 0x7FFF; + lowresMvs[1][i][0].x = 0x7FFF; + } + + for (int i = 0; i < bframes + 2; i++) + intraMbs[i] = 0; + + /* downscale and generate 4 hpel planes for lookahead */ + primitives.frame_init_lowres_core(origPic->m_picOrg[0], + lowresPlane[0], lowresPlane[1], lowresPlane[2], lowresPlane[3], + origPic->m_stride, lumaStride, width, lines); + + /* extend hpel planes for motion search */ + extendPicBorder(lowresPlane[0], lumaStride, width, lines, origPic->m_lumaMarginX, origPic->m_lumaMarginY); + 
extendPicBorder(lowresPlane[1], lumaStride, width, lines, origPic->m_lumaMarginX, origPic->m_lumaMarginY); + extendPicBorder(lowresPlane[2], lumaStride, width, lines, origPic->m_lumaMarginX, origPic->m_lumaMarginY); + extendPicBorder(lowresPlane[3], lumaStride, width, lines, origPic->m_lumaMarginX, origPic->m_lumaMarginY); + fpelPlane = lowresPlane[0]; +} diff --git a/source/common/lowres.h b/source/common/lowres.h new file mode 100644 index 0000000..b88ad3e --- /dev/null +++ b/source/common/lowres.h @@ -0,0 +1,148 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Gopu Govindaswamy + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. 
+ *****************************************************************************/ + +#ifndef X265_LOWRES_H +#define X265_LOWRES_H + +#include "primitives.h" +#include "common.h" +#include "mv.h" + +namespace x265 { +// private namespace + +class PicYuv; + +struct ReferencePlanes +{ + ReferencePlanes() { memset(this, 0, sizeof(ReferencePlanes)); } + + pixel* fpelPlane; + pixel* lowresPlane[4]; + + bool isWeighted; + bool isLowres; + intptr_t lumaStride; + int weight; + int offset; + int shift; + int round; + + /* lowres motion compensation, you must provide a buffer and stride for QPEL averaged pixels + * in case QPEL is required. Else it returns a pointer to the HPEL pixels */ + inline pixel *lowresMC(intptr_t blockOffset, const MV& qmv, pixel *buf, intptr_t& outstride) + { + if ((qmv.x | qmv.y) & 1) + { + int hpelA = (qmv.y & 2) | ((qmv.x & 2) >> 1); + pixel *frefA = lowresPlane[hpelA] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * lumaStride; + + MV qmvB = qmv + MV((qmv.x & 1) * 2, (qmv.y & 1) * 2); + int hpelB = (qmvB.y & 2) | ((qmvB.x & 2) >> 1); + + pixel *frefB = lowresPlane[hpelB] + blockOffset + (qmvB.x >> 2) + (qmvB.y >> 2) * lumaStride; + primitives.pixelavg_pp[LUMA_8x8](buf, outstride, frefA, lumaStride, frefB, lumaStride, 32); + return buf; + } + else + { + outstride = lumaStride; + int hpel = (qmv.y & 2) | ((qmv.x & 2) >> 1); + return lowresPlane[hpel] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * lumaStride; + } + } + + inline int lowresQPelCost(pixel *fenc, intptr_t blockOffset, const MV& qmv, pixelcmp_t comp) + { + if ((qmv.x | qmv.y) & 1) + { + ALIGN_VAR_16(pixel, subpelbuf[8 * 8]); + int hpelA = (qmv.y & 2) | ((qmv.x & 2) >> 1); + pixel *frefA = lowresPlane[hpelA] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * lumaStride; + MV qmvB = qmv + MV((qmv.x & 1) * 2, (qmv.y & 1) * 2); + int hpelB = (qmvB.y & 2) | ((qmvB.x & 2) >> 1); + pixel *frefB = lowresPlane[hpelB] + blockOffset + (qmvB.x >> 2) + (qmvB.y >> 2) * lumaStride; + 
primitives.pixelavg_pp[LUMA_8x8](subpelbuf, 8, frefA, lumaStride, frefB, lumaStride, 32); + return comp(fenc, FENC_STRIDE, subpelbuf, 8); + } + else + { + int hpel = (qmv.y & 2) | ((qmv.x & 2) >> 1); + pixel *fref = lowresPlane[hpel] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * lumaStride; + return comp(fenc, FENC_STRIDE, fref, lumaStride); + } + } +}; + +/* lowres buffers, sizes and strides */ +struct Lowres : public ReferencePlanes +{ + pixel *buffer[4]; + + int frameNum; // Presentation frame number + int sliceType; // Slice type decided by lookahead + int width; // width of lowres frame in pixels + int lines; // height of lowres frame in pixel lines + int leadingBframes; // number of leading B frames for P or I + + bool bIntraCalculated; + bool bScenecut; // Set to false if the frame cannot possibly be part of a real scenecut. + bool bKeyframe; + bool bLastMiniGopBFrame; + + /* lookahead output data */ + int64_t costEst[X265_BFRAME_MAX + 2][X265_BFRAME_MAX + 2]; + int64_t costEstAq[X265_BFRAME_MAX + 2][X265_BFRAME_MAX + 2]; + int32_t* rowSatds[X265_BFRAME_MAX + 2][X265_BFRAME_MAX + 2]; + int intraMbs[X265_BFRAME_MAX + 2]; + int32_t* intraCost; + int64_t satdCost; + uint16_t* lowresCostForRc; + uint16_t(*lowresCosts[X265_BFRAME_MAX + 2][X265_BFRAME_MAX + 2]); + int32_t* lowresMvCosts[2][X265_BFRAME_MAX + 1]; + MV* lowresMvs[2][X265_BFRAME_MAX + 1]; + + /* used for vbvLookahead */ + int plannedType[X265_LOOKAHEAD_MAX + 1]; + int64_t plannedSatd[X265_LOOKAHEAD_MAX + 1]; + int indB; + int bframes; + + /* rate control / adaptive quant data */ + double* qpAqOffset; // AQ QP offset values for each 16x16 CU + double* qpCuTreeOffset; // cuTree QP offset values for each 16x16 CU + int* invQscaleFactor; // qScale values for qp Aq Offsets + uint64_t wp_ssd[3]; // This is different than SSDY, this is sum(pixel^2) - sum(pixel)^2 for entire frame + uint64_t wp_sum[3]; + + /* cutree intermediate data */ + uint16_t* propagateCost; + double weightedCostDelta[X265_BFRAME_MAX + 
2]; + + bool create(PicYuv *origPic, int _bframes, bool bAqEnabled); + void destroy(); + void init(PicYuv *origPic, int poc, int sliceType); +}; +} + +#endif // ifndef X265_LOWRES_H diff --git a/source/common/md5.cpp b/source/common/md5.cpp new file mode 100644 index 0000000..ce8a6fe --- /dev/null +++ b/source/common/md5.cpp @@ -0,0 +1,268 @@ +/***************************************************************************** + * md5.cpp: Calculate MD5 for SEI + ***************************************************************************** + * Copyright (C) 2011-2012 x265 project + * + * Authors: Min Chen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at chenm003@163.com. 
+ *****************************************************************************/ + +#include "common.h" +#include "md5.h" + +namespace x265 { +// private x265 namespace + +#ifndef ARCH_BIG_ENDIAN +#define byteReverse(buf, len) /* Nothing */ +#else +static void byteReverse(uint8_t_t *buf, unsigned int nSize) +{ + int i; + uint32_t tmp; + + for (i = 0; i < nSize; i++) + { + tmp = ((unsigned int)buf[3] << 8 | buf[2]) << 16 | + ((unsigned int)buf[1] << 8 | buf[0]); + *(uint32_t*)buf = tmp; + buf += 4; + } +} + +#endif // ifndef ARCH_BIG_ENDIAN + +void MD5Transform(uint32_t *buf, uint32_t *in); + +/* + * Start MD5 accumulation. Set bit count to 0 and buffer to mysterious + * initialization constants. + */ +void MD5Init(MD5Context *ctx) +{ + ctx->buf[0] = 0x67452301; + ctx->buf[1] = 0xefcdab89; + ctx->buf[2] = 0x98badcfe; + ctx->buf[3] = 0x10325476; + + ctx->bits[0] = 0; + ctx->bits[1] = 0; +} + +/* + * Update context to reflect the concatenation of another buffer full + * of bytes. + */ +void MD5Update(MD5Context *ctx, uint8_t *buf, uint32_t len) +{ + uint32_t t; + + /* Update bitcount */ + + t = ctx->bits[0]; + if ((ctx->bits[0] = t + ((uint32_t)len << 3)) < t) + ctx->bits[1]++; /* Carry from low to high */ + ctx->bits[1] += len >> 29; + + t = (t >> 3) & 0x3F; /* Bytes already in shsInfo->data */ + + /* Handle any leading odd-sized chunks */ + + if (t) + { + uint8_t *p = (uint8_t*)ctx->in + t; + + t = 64 - t; + if (len < t) + { + memcpy(p, buf, len); + return; + } + memcpy(p, buf, t); + byteReverse(ctx->in, 16); + MD5Transform(ctx->buf, (uint32_t*)ctx->in); + buf += t; + len -= t; + } + /* Process data in 64-byte chunks */ + + while (len >= 64) + { + memcpy(ctx->in, buf, 64); + byteReverse(ctx->in, 16); + MD5Transform(ctx->buf, (uint32_t*)ctx->in); + buf += 64; + len -= 64; + } + + /* Handle any remaining bytes of data. 
*/ + + memcpy(ctx->in, buf, len); +} + +/* + * Final wrapup - pad to 64-byte boundary with the bit pattern + * 1 0* (64-bit count of bits processed, MSB-first) + */ +void MD5Final(MD5Context *ctx, uint8_t *digest) +{ + uint32_t count; + uint8_t *p; + + /* Compute number of bytes mod 64 */ + count = (ctx->bits[0] >> 3) & 0x3F; + + /* Set the first char of padding to 0x80. This is safe since there is + always at least one byte free */ + p = ctx->in + count; + *p++ = 0x80; + + /* Bytes of padding needed to make 64 bytes */ + count = 64 - 1 - count; + + /* Pad out to 56 mod 64 */ + if (count < 8) + { + /* Two lots of padding: Pad the first block to 64 bytes */ + memset(p, 0, count); + byteReverse(ctx->in, 16); + MD5Transform(ctx->buf, (uint32_t*)ctx->in); + + /* Now fill the next block with 56 bytes */ + memset(ctx->in, 0, 56); + } + else + { + /* Pad block to 56 bytes */ + memset(p, 0, count - 8); + } + byteReverse(ctx->in, 14); + + /* Append length in bits and transform */ + // CHECK_ME: Always use 32-bits operator + uint32_t *table = (uint32_t*)&ctx->in; + table[14] = ctx->bits[0]; + table[15] = ctx->bits[1]; + + MD5Transform(ctx->buf, (uint32_t*)ctx->in); + byteReverse((uint8_t*)ctx->buf, 4); + memcpy(digest, ctx->buf, 16); + + memset(ctx, 0, sizeof(*ctx)); /* In case it's sensitive */ +} + +/* The four core functions - F1 is optimized somewhat */ + +/* #define F1(x, y, z) (x & y | ~x & z) */ +#define F1(x, y, z) (z ^ (x & (y ^ z))) +#define F2(x, y, z) F1(z, x, y) +#define F3(x, y, z) (x ^ y ^ z) +#define F4(x, y, z) (y ^ (x | ~z)) + +/* This is the central step in the MD5 algorithm. */ +#define MD5STEP(f, w, x, y, z, data, s) \ + (w += f(x, y, z) + data, w = w << s | w >> (32 - s), w += x) + +/* + * The core of the MD5 algorithm, this alters an existing MD5 hash to + * reflect the addition of 16 longwords of new data. MD5Update blocks + * the data and converts bytes into longwords for this routine. 
+ */ +void MD5Transform(uint32_t *buf, uint32_t *in) +{ + register uint32_t a, b, c, d; + + a = buf[0]; + b = buf[1]; + c = buf[2]; + d = buf[3]; + + MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7); + MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12); + MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17); + MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22); + MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7); + MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12); + MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17); + MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22); + MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7); + MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12); + MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17); + MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22); + MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7); + MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12); + MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17); + MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22); + + MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5); + MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9); + MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14); + MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20); + MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5); + MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9); + MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14); + MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20); + MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5); + MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9); + MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14); + MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20); + MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5); + MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9); + MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14); + MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20); + + MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4); + MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11); + MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16); + MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 23); + 
MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4); + MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11); + MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16); + MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23); + MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4); + MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11); + MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16); + MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23); + MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4); + MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11); + MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16); + MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23); + + MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6); + MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10); + MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15); + MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21); + MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6); + MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10); + MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15); + MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21); + MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6); + MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10); + MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15); + MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21); + MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6); + MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10); + MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15); + MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21); + + buf[0] += a; + buf[1] += b; + buf[2] += c; + buf[3] += d; +} +} diff --git a/source/common/md5.h b/source/common/md5.h new file mode 100644 index 0000000..762ae0c --- /dev/null +++ b/source/common/md5.h @@ -0,0 +1,79 @@ +/***************************************************************************** + * md5.h: Calculate MD5 + ***************************************************************************** + * Copyright (C) 2011-2012 x265 project + * + * Authors: Min Chen + * + * This program is free software; you can redistribute it and/or modify + * it under 
the terms of the GNU General Public License as published by + * the Free Software Foundation; + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at chenm003@163.com. + *****************************************************************************/ + +#ifndef X265_MD5_H +#define X265_MD5_H + +#include "common.h" + +namespace x265 { +//private x265 namespace + +typedef struct MD5Context +{ + uint32_t buf[4]; + uint32_t bits[2]; + unsigned char in[64]; +} MD5Context; + +void MD5Init(MD5Context *context); +void MD5Update(MD5Context *context, unsigned char *buf, uint32_t len); +void MD5Final(MD5Context *ctx, uint8_t *digest); + +class MD5 +{ +public: + + /** + * initialize digest state + */ + MD5() + { + MD5Init(&m_state); + } + + /** + * compute digest over buf of length len. + * multiple calls may extend the digest over more data. + */ + void update(unsigned char *buf, unsigned len) + { + MD5Update(&m_state, buf, len); + } + + /** + * flush any outstanding MD5 data, write the digest into digest. 
+ */ + void finalize(unsigned char digest[16]) + { + MD5Final(&m_state, digest); + } + +private: + + MD5Context m_state; +}; +} + +#endif // ifndef X265_MD5_H diff --git a/source/common/mv.h b/source/common/mv.h new file mode 100644 index 0000000..22a7073 --- /dev/null +++ b/source/common/mv.h @@ -0,0 +1,104 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. 
+ *****************************************************************************/ + +#ifndef X265_MV_H +#define X265_MV_H + +#include "common.h" +#include "primitives.h" + +namespace x265 { +// private x265 namespace + +#if _MSC_VER +#pragma warning(disable: 4201) // non-standard extension used (nameless struct/union) +#endif + +struct MV +{ +public: + + union { + struct { int16_t x, y; }; + + int32_t word; + }; + + MV() : word(0) {} + + MV(int16_t _x, int16_t _y) : x(_x), y(_y) {} + + const MV& operator =(uint32_t w) { word = w; return *this; } + + const MV& operator +=(const MV& other) { x += other.x; y += other.y; return *this; } + + const MV& operator -=(const MV& other) { x -= other.x; y -= other.y; return *this; } + + const MV& operator >>=(int i) { x >>= i; y >>= i; return *this; } + + const MV& operator <<=(int i) { x <<= i; y <<= i; return *this; } + + MV operator >>(int i) const { return MV(x >> i, y >> i); } + + MV operator <<(int i) const { return MV(x << i, y << i); } + + MV operator *(int16_t i) const { return MV(x * i, y * i); } + + const MV operator -(const MV& other) const { return MV(x - other.x, y - other.y); } + + const MV operator +(const MV& other) const { return MV(x + other.x, y + other.y); } + + bool operator ==(const MV& other) const { return word == other.word; } + + bool operator !=(const MV& other) const { return word != other.word; } + + // Scale down a QPEL mv to FPEL mv, rounding up by one HPEL offset + MV roundToFPel() const { return MV(x + 2, y + 2) >> 2; } + + // Scale up an FPEL mv to QPEL by shifting up two bits + MV toQPel() const { return *this << 2; } + + bool inline notZero() const { return this->word != 0; } + + bool inline isSubpel() const { return (this->word & 0x00030003) != 0; } + + MV mvmin(const MV& m) const { return MV(x > m.x ? m.x : x, y > m.y ? m.y : y); } + + MV mvmax(const MV& m) const { return MV(x < m.x ? m.x : x, y < m.y ? 
m.y : y); } + + MV clipped(const MV& _min, const MV& _max) const + { + MV cl = mvmin(_max); + + return cl.mvmax(_min); + } + + // returns true if MV is within range (inclusive) + bool checkRange(const MV& _min, const MV& _max) const + { + return x >= _min.x && x <= _max.x && y >= _min.y && y <= _max.y; + } +}; +} + +#endif // ifndef X265_MV_H diff --git a/source/common/param.cpp b/source/common/param.cpp new file mode 100644 index 0000000..af70058 --- /dev/null +++ b/source/common/param.cpp @@ -0,0 +1,1341 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Deepthi Nandakumar + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. 
+ *****************************************************************************/ + +#include "common.h" +#include "slice.h" +#include "threading.h" +#include "param.h" +#include "cpu.h" +#include "x265.h" + +#if _MSC_VER +#pragma warning(disable: 4996) // POSIX functions are just fine, thanks +#pragma warning(disable: 4706) // assignment within conditional +#pragma warning(disable: 4127) // conditional expression is constant +#endif + +#if _WIN32 +#define strcasecmp _stricmp +#endif + +#if !defined(HAVE_STRTOK_R) + +/* + * adapted from public domain strtok_r() by Charlie Gordon + * + * from comp.lang.c 9/14/2007 + * + * http://groups.google.com/group/comp.lang.c/msg/2ab1ecbb86646684 + * + * (Declaration that it's public domain): + * http://groups.google.com/group/comp.lang.c/msg/7c7b39328fefab9c + */ + +#undef strtok_r +char* strtok_r(char * str, + const char *delim, + char ** nextp) +{ + if (!str) + str = *nextp; + + str += strspn(str, delim); + + if (!*str) + return NULL; + + char *ret = str; + + str += strcspn(str, delim); + + if (*str) + *str++ = '\0'; + + *nextp = str; + + return ret; +} + +#endif // if !defined(HAVE_STRTOK_R) + +using namespace x265; + +extern "C" +x265_param *x265_param_alloc() +{ + return (x265_param*)x265_malloc(sizeof(x265_param)); +} + +extern "C" +void x265_param_free(x265_param *p) +{ + return x265_free(p); +} + +extern "C" +void x265_param_default(x265_param *param) +{ + memset(param, 0, sizeof(x265_param)); + + /* Applying default values to all elements in the param structure */ + param->cpuid = x265::cpu_detect(); + param->bEnableWavefront = 1; + param->poolNumThreads = 0; + param->frameNumThreads = 0; + + param->logLevel = X265_LOG_INFO; + param->csvfn = NULL; + param->rc.lambdaFileName = NULL; + param->bLogCuStats = 0; + param->decodedPictureHashSEI = 0; + + /* Quality Measurement Metrics */ + param->bEnablePsnr = 0; + param->bEnableSsim = 0; + + /* Source specifications */ + param->internalBitDepth = x265_max_bit_depth; + 
param->internalCsp = X265_CSP_I420; + + param->levelIdc = 0; + param->bHighTier = 0; + param->interlaceMode = 0; + param->bRepeatHeaders = 0; + param->bEnableAccessUnitDelimiters = 0; + param->bEmitHRDSEI = 0; + param->bEmitInfoSEI = 1; + + /* CU definitions */ + param->maxCUSize = 64; + param->tuQTMaxInterDepth = 1; + param->tuQTMaxIntraDepth = 1; + + /* Coding Structure */ + param->keyframeMin = 0; + param->keyframeMax = 250; + param->bOpenGOP = 1; + param->bframes = 4; + param->lookaheadDepth = 20; + param->bFrameAdaptive = X265_B_ADAPT_TRELLIS; + param->bBPyramid = 1; + param->scenecutThreshold = 40; /* Magic number pulled in from x264 */ + + /* Intra Coding Tools */ + param->bEnableConstrainedIntra = 0; + param->bEnableStrongIntraSmoothing = 1; + param->bEnableFastIntra = 0; + + /* Inter Coding tools */ + param->searchMethod = X265_HEX_SEARCH; + param->subpelRefine = 2; + param->searchRange = 57; + param->maxNumMergeCand = 2; + param->bEnableWeightedPred = 1; + param->bEnableWeightedBiPred = 0; + param->bEnableEarlySkip = 0; + param->bEnableCbfFastMode = 0; + param->bEnableAMP = 0; + param->bEnableRectInter = 0; + param->rdLevel = 3; + param->bEnableSignHiding = 1; + param->bEnableTransformSkip = 0; + param->bEnableTSkipFast = 0; + param->maxNumReferences = 3; + param->bEnableTemporalMvp = 1; + + /* Loop Filter */ + param->bEnableLoopFilter = 1; + + /* SAO Loop Filter */ + param->bEnableSAO = 1; + param->bSaoNonDeblocked = 0; + + /* Coding Quality */ + param->cbQpOffset = 0; + param->crQpOffset = 0; + param->rdPenalty = 0; + param->psyRd = 0.0; + param->psyRdoq = 0.0; + param->bIntraInBFrames = 0; + param->bLossless = 0; + param->bCULossless = 0; + + /* Rate control options */ + param->rc.vbvMaxBitrate = 0; + param->rc.vbvBufferSize = 0; + param->rc.vbvBufferInit = 0.9; + param->rc.rfConstant = 28; + param->rc.bitrate = 0; + param->rc.rateTolerance = 1.0; + param->rc.qCompress = 0.6; + param->rc.ipFactor = 1.4f; + param->rc.pbFactor = 1.3f; + param->rc.qpStep 
= 4; + param->rc.rateControlMode = X265_RC_CRF; + param->rc.qp = 32; + param->rc.aqMode = X265_AQ_AUTO_VARIANCE; + param->rc.aqStrength = 1.0; + param->rc.cuTree = 1; + param->rc.rfConstantMax = 0; + param->rc.rfConstantMin = 0; + param->rc.bStatRead = 0; + param->rc.bStatWrite = 0; + param->rc.statFileName = NULL; + param->rc.complexityBlur = 20; + param->rc.qblur = 0.5; + param->rc.bEnableSlowFirstPass = 0; + + /* Video Usability Information (VUI) */ + param->vui.aspectRatioIdc = 0; + param->vui.sarWidth = 0; + param->vui.sarHeight = 0; + param->vui.bEnableOverscanAppropriateFlag = 0; + param->vui.bEnableVideoSignalTypePresentFlag = 0; + param->vui.videoFormat = 5; + param->vui.bEnableVideoFullRangeFlag = 0; + param->vui.bEnableColorDescriptionPresentFlag = 0; + param->vui.colorPrimaries = 2; + param->vui.transferCharacteristics = 2; + param->vui.matrixCoeffs = 2; + param->vui.bEnableChromaLocInfoPresentFlag = 0; + param->vui.chromaSampleLocTypeTopField = 0; + param->vui.chromaSampleLocTypeBottomField = 0; + param->vui.bEnableDefaultDisplayWindowFlag = 0; + param->vui.defDispWinLeftOffset = 0; + param->vui.defDispWinRightOffset = 0; + param->vui.defDispWinTopOffset = 0; + param->vui.defDispWinBottomOffset = 0; +} + +extern "C" +int x265_param_default_preset(x265_param *param, const char *preset, const char *tune) +{ + x265_param_default(param); + + if (preset) + { + char *end; + int i = strtol(preset, &end, 10); + if (*end == 0 && i >= 0 && i < (int)(sizeof(x265_preset_names) / sizeof(*x265_preset_names) - 1)) + preset = x265_preset_names[i]; + + if (!strcmp(preset, "ultrafast")) + { + param->lookaheadDepth = 10; + param->scenecutThreshold = 0; // disable lookahead + param->maxCUSize = 32; + param->searchRange = 25; + param->bFrameAdaptive = 0; + param->subpelRefine = 0; + param->searchMethod = X265_DIA_SEARCH; + param->bEnableEarlySkip = 1; + param->bEnableSAO = 0; + param->bEnableSignHiding = 0; + param->bEnableWeightedPred = 0; + param->rdLevel = 2; + 
param->maxNumReferences = 1; + param->bEnableLoopFilter = 0; + param->rc.aqStrength = 0.0; + param->rc.aqMode = X265_AQ_NONE; + param->rc.cuTree = 0; + param->bEnableFastIntra = 1; + } + else if (!strcmp(preset, "superfast")) + { + param->lookaheadDepth = 10; + param->maxCUSize = 32; + param->searchRange = 44; + param->bFrameAdaptive = 0; + param->subpelRefine = 1; + param->bEnableEarlySkip = 1; + param->bEnableWeightedPred = 0; + param->rdLevel = 2; + param->maxNumReferences = 1; + param->rc.aqStrength = 0.0; + param->rc.aqMode = X265_AQ_NONE; + param->rc.cuTree = 0; + param->bEnableSAO = 0; + param->bEnableFastIntra = 1; + } + else if (!strcmp(preset, "veryfast")) + { + param->lookaheadDepth = 15; + param->maxCUSize = 32; + param->bFrameAdaptive = 0; + param->subpelRefine = 1; + param->bEnableEarlySkip = 1; + param->rdLevel = 2; + param->maxNumReferences = 1; + param->rc.cuTree = 0; + param->bEnableFastIntra = 1; + } + else if (!strcmp(preset, "faster")) + { + param->lookaheadDepth = 15; + param->bFrameAdaptive = 0; + param->bEnableEarlySkip = 1; + param->rdLevel = 2; + param->maxNumReferences = 1; + param->rc.cuTree = 0; + param->bEnableFastIntra = 1; + } + else if (!strcmp(preset, "fast")) + { + param->lookaheadDepth = 15; + param->bFrameAdaptive = 0; + param->rdLevel = 2; + param->maxNumReferences = 2; + param->bEnableFastIntra = 1; + } + else if (!strcmp(preset, "medium")) + { + /* defaults */ + } + else if (!strcmp(preset, "slow")) + { + param->bEnableRectInter = 1; + param->lookaheadDepth = 25; + param->rdLevel = 4; + param->subpelRefine = 3; + param->maxNumMergeCand = 3; + param->searchMethod = X265_STAR_SEARCH; + } + else if (!strcmp(preset, "slower")) + { + param->bEnableWeightedBiPred = 1; + param->bEnableAMP = 1; + param->bEnableRectInter = 1; + param->lookaheadDepth = 30; + param->bframes = 8; + param->tuQTMaxInterDepth = 2; + param->tuQTMaxIntraDepth = 2; + param->rdLevel = 6; + param->subpelRefine = 3; + param->maxNumMergeCand = 3; + 
param->searchMethod = X265_STAR_SEARCH; + param->bIntraInBFrames = 1; + } + else if (!strcmp(preset, "veryslow")) + { + param->bEnableWeightedBiPred = 1; + param->bEnableAMP = 1; + param->bEnableRectInter = 1; + param->lookaheadDepth = 40; + param->bframes = 8; + param->tuQTMaxInterDepth = 3; + param->tuQTMaxIntraDepth = 3; + param->rdLevel = 6; + param->subpelRefine = 4; + param->maxNumMergeCand = 4; + param->searchMethod = X265_STAR_SEARCH; + param->maxNumReferences = 5; + param->bIntraInBFrames = 1; + } + else if (!strcmp(preset, "placebo")) + { + param->bEnableWeightedBiPred = 1; + param->bEnableAMP = 1; + param->bEnableRectInter = 1; + param->lookaheadDepth = 60; + param->searchRange = 92; + param->bframes = 8; + param->tuQTMaxInterDepth = 4; + param->tuQTMaxIntraDepth = 4; + param->rdLevel = 6; + param->subpelRefine = 5; + param->maxNumMergeCand = 5; + param->searchMethod = X265_STAR_SEARCH; + param->bEnableTransformSkip = 1; + param->maxNumReferences = 5; + param->rc.bEnableSlowFirstPass = 1; + param->bIntraInBFrames = 1; + // TODO: optimized esa + } + else + return -1; + } + if (tune) + { + if (!strcmp(tune, "psnr")) + { + param->rc.aqStrength = 0.0; + param->psyRd = 0.0; + param->psyRdoq = 0.0; + } + else if (!strcmp(tune, "ssim")) + { + param->rc.aqMode = X265_AQ_AUTO_VARIANCE; + param->psyRd = 0.0; + param->psyRdoq = 0.0; + } + else if (!strcmp(tune, "fastdecode") || + !strcmp(tune, "fast-decode")) + { + param->bEnableLoopFilter = 0; + param->bEnableSAO = 0; + param->bEnableWeightedPred = 0; + param->bEnableWeightedBiPred = 0; + param->bIntraInBFrames = 0; + } + else if (!strcmp(tune, "zerolatency") || + !strcmp(tune, "zero-latency")) + { + param->bFrameAdaptive = 0; + param->bframes = 0; + param->lookaheadDepth = 0; + param->scenecutThreshold = 0; + param->rc.cuTree = 0; + } + else + return -1; + } + + return 0; +} + +static int x265_atobool(const char *str, bool& bError) +{ + if (!strcmp(str, "1") || + !strcmp(str, "true") || + !strcmp(str, "yes")) + 
return 1; + if (!strcmp(str, "0") || + !strcmp(str, "false") || + !strcmp(str, "no")) + return 0; + bError = true; + return 0; +} + +static double x265_atof(const char *str, bool& bError) +{ + char *end; + double v = strtod(str, &end); + + if (end == str || *end != '\0') + bError = true; + return v; +} + +static int parseName(const char *arg, const char * const * names, bool& bError) +{ + for (int i = 0; names[i]; i++) + { + if (!strcmp(arg, names[i])) + { + return i; + } + } + + return x265_atoi(arg, bError); +} + +/* internal versions of string-to-int with additional error checking */ +#undef atoi +#undef atof +#define atoi(str) x265_atoi(str, bError) +#define atof(str) x265_atof(str, bError) +#define atobool(str) (bNameWasBool = true, x265_atobool(str, bError)) + +extern "C" +int x265_param_parse(x265_param *p, const char *name, const char *value) +{ + bool bError = false; + bool bNameWasBool = false; + bool bValueWasNull = !value; + char nameBuf[64]; + + if (!name) + return X265_PARAM_BAD_NAME; + + // skip -- prefix if provided + if (name[0] == '-' && name[1] == '-') + name += 2; + + // s/_/-/g + if (strlen(name) + 1 < sizeof(nameBuf) && strchr(name, '_')) + { + char *c; + strcpy(nameBuf, name); + while ((c = strchr(nameBuf, '_')) != 0) + { + *c = '-'; + } + + name = nameBuf; + } + + if (!strncmp(name, "no-", 3)) + { + name += 3; + value = !value || x265_atobool(value, bError) ? "false" : "true"; + } + else if (!strncmp(name, "no", 2)) + { + name += 2; + value = !value || x265_atobool(value, bError) ? 
"false" : "true"; + } + else if (!value) + value = "true"; + else if (value[0] == '=') + value++; + +#if defined(_MSC_VER) +#pragma warning(disable: 4127) // conditional expression is constant +#endif +#define OPT(STR) else if (!strcmp(name, STR)) +#define OPT2(STR1, STR2) else if (!strcmp(name, STR1) || !strcmp(name, STR2)) + if (0) ; + OPT("asm") + { + if (bValueWasNull) + p->cpuid = atobool(value); + else + p->cpuid = parseCpuName(value, bError); + } + OPT("fps") + { + if (sscanf(value, "%u/%u", &p->fpsNum, &p->fpsDenom) == 2) + ; + else + { + float fps = (float)atof(value); + if (fps > 0 && fps <= INT_MAX / 1000) + { + p->fpsNum = (int)(fps * 1000 + .5); + p->fpsDenom = 1000; + } + else + { + p->fpsNum = atoi(value); + p->fpsDenom = 1; + } + } + } + OPT("csv") p->csvfn = value; + OPT("scaling-list") p->scalingLists = value; + OPT("lambda-file") p->rc.lambdaFileName = value; + OPT("threads") p->poolNumThreads = atoi(value); + OPT("frame-threads") p->frameNumThreads = atoi(value); + OPT("pmode") p->bDistributeModeAnalysis = atobool(value); + OPT("pme") p->bDistributeMotionEstimation = atobool(value); + OPT2("level-idc", "level") + { + /* allow "5.1" or "51", both converted to integer 51 */ + if (atof(value) < 7) + p->levelIdc = (int)(10 * atof(value) + .5); + else + p->levelIdc = atoi(value); + } + OPT("high-tier") p->bHighTier = atobool(value); + OPT2("log-level", "log") + { + p->logLevel = atoi(value); + if (bError) + { + bError = false; + p->logLevel = parseName(value, logLevelNames, bError) - 1; + } + } + OPT("cu-stats") p->bLogCuStats = atobool(value); + OPT("repeat-headers") p->bRepeatHeaders = atobool(value); + OPT("wpp") p->bEnableWavefront = atobool(value); + OPT("ctu") p->maxCUSize = (uint32_t)atoi(value); + OPT("tu-intra-depth") p->tuQTMaxIntraDepth = (uint32_t)atoi(value); + OPT("tu-inter-depth") p->tuQTMaxInterDepth = (uint32_t)atoi(value); + OPT("subme") p->subpelRefine = atoi(value); + OPT("merange") p->searchRange = atoi(value); + OPT("rect") 
p->bEnableRectInter = atobool(value); + OPT("amp") p->bEnableAMP = atobool(value); + OPT("max-merge") p->maxNumMergeCand = (uint32_t)atoi(value); + OPT("temporal-mvp") p->bEnableTemporalMvp = atobool(value); + OPT("early-skip") p->bEnableEarlySkip = atobool(value); + OPT("fast-cbf") p->bEnableCbfFastMode = atobool(value); + OPT("rdpenalty") p->rdPenalty = atoi(value); + OPT("tskip") p->bEnableTransformSkip = atobool(value); + OPT("no-tskip-fast") p->bEnableTSkipFast = atobool(value); + OPT("tskip-fast") p->bEnableTSkipFast = atobool(value); + OPT("strong-intra-smoothing") p->bEnableStrongIntraSmoothing = atobool(value); + OPT("lossless") p->bLossless = atobool(value); + OPT("cu-lossless") p->bCULossless = atobool(value); + OPT("constrained-intra") p->bEnableConstrainedIntra = atobool(value); + OPT("fast-intra") p->bEnableFastIntra = atobool(value); + OPT("open-gop") p->bOpenGOP = atobool(value); + OPT("scenecut") + { + p->scenecutThreshold = atobool(value); + if (bError || p->scenecutThreshold) + { + bError = false; + p->scenecutThreshold = atoi(value); + } + } + OPT("keyint") p->keyframeMax = atoi(value); + OPT("min-keyint") p->keyframeMin = atoi(value); + OPT("rc-lookahead") p->lookaheadDepth = atoi(value); + OPT("bframes") p->bframes = atoi(value); + OPT("bframe-bias") p->bFrameBias = atoi(value); + OPT("b-adapt") + { + p->bFrameAdaptive = atobool(value); + if (bError || p->bFrameAdaptive) + { + bError = false; + p->bFrameAdaptive = atoi(value); + } + } + OPT("interlace") + { + p->interlaceMode = atobool(value); + if (bError || p->interlaceMode) + { + bError = false; + p->interlaceMode = parseName(value, x265_interlace_names, bError); + } + } + OPT("ref") p->maxNumReferences = atoi(value); + OPT("weightp") p->bEnableWeightedPred = atobool(value); + OPT("weightb") p->bEnableWeightedBiPred = atobool(value); + OPT("cbqpoffs") p->cbQpOffset = atoi(value); + OPT("crqpoffs") p->crQpOffset = atoi(value); + OPT("rd") p->rdLevel = atoi(value); + OPT("psy-rd") p->psyRd = 
atof(value); + OPT("psy-rdoq") p->psyRdoq = atof(value); + OPT("signhide") p->bEnableSignHiding = atobool(value); + OPT("b-intra") p->bIntraInBFrames = atobool(value); + OPT("lft") p->bEnableLoopFilter = atobool(value); + OPT("sao") p->bEnableSAO = atobool(value); + OPT("sao-non-deblock") p->bSaoNonDeblocked = atobool(value); + OPT("ssim") p->bEnableSsim = atobool(value); + OPT("psnr") p->bEnablePsnr = atobool(value); + OPT("hash") p->decodedPictureHashSEI = atoi(value); + OPT("aud") p->bEnableAccessUnitDelimiters = atobool(value); + OPT("info") p->bEmitInfoSEI = atobool(value); + OPT("b-pyramid") p->bBPyramid = atobool(value); + OPT("hrd") p->bEmitHRDSEI = atobool(value); + OPT2("ipratio", "ip-factor") p->rc.ipFactor = atof(value); + OPT2("pbratio", "pb-factor") p->rc.pbFactor = atof(value); + OPT("aq-mode") p->rc.aqMode = atoi(value); + OPT("aq-strength") p->rc.aqStrength = atof(value); + OPT("vbv-maxrate") p->rc.vbvMaxBitrate = atoi(value); + OPT("vbv-bufsize") p->rc.vbvBufferSize = atoi(value); + OPT("vbv-init") p->rc.vbvBufferInit = atof(value); + OPT("crf-max") p->rc.rfConstantMax = atof(value); + OPT("crf-min") p->rc.rfConstantMin = atof(value); + OPT("crf") + { + p->rc.rfConstant = atof(value); + p->rc.rateControlMode = X265_RC_CRF; + } + OPT("bitrate") + { + p->rc.bitrate = atoi(value); + p->rc.rateControlMode = X265_RC_ABR; + } + OPT("qp") + { + p->rc.qp = atoi(value); + p->rc.rateControlMode = X265_RC_CQP; + } + OPT("input-res") bError |= sscanf(value, "%dx%d", &p->sourceWidth, &p->sourceHeight) != 2; + OPT("input-csp") p->internalCsp = parseName(value, x265_source_csp_names, bError); + OPT("me") p->searchMethod = parseName(value, x265_motion_est_names, bError); + OPT("cutree") p->rc.cuTree = atobool(value); + OPT("slow-firstpass") p->rc.bEnableSlowFirstPass = atobool(value); + OPT("analysis-mode") p->analysisMode = parseName(value, x265_analysis_names, bError); + OPT("sar") + { + p->vui.aspectRatioIdc = parseName(value, x265_sar_names, bError); + if 
(bError) + { + p->vui.aspectRatioIdc = X265_EXTENDED_SAR; + bError = sscanf(value, "%d:%d", &p->vui.sarWidth, &p->vui.sarHeight) != 2; + } + } + OPT("overscan") + { + if (!strcmp(value, "show")) + p->vui.bEnableOverscanInfoPresentFlag = 1; + else if (!strcmp(value, "crop")) + { + p->vui.bEnableOverscanInfoPresentFlag = 1; + p->vui.bEnableOverscanAppropriateFlag = 1; + } + else if (!strcmp(value, "undef")) + p->vui.bEnableOverscanInfoPresentFlag = 0; + else + bError = true; + } + OPT("videoformat") + { + p->vui.bEnableVideoSignalTypePresentFlag = 1; + p->vui.videoFormat = parseName(value, x265_video_format_names, bError); + } + OPT("range") + { + p->vui.bEnableVideoSignalTypePresentFlag = 1; + p->vui.bEnableVideoFullRangeFlag = parseName(value, x265_fullrange_names, bError); + } + OPT("colorprim") + { + p->vui.bEnableVideoSignalTypePresentFlag = 1; + p->vui.bEnableColorDescriptionPresentFlag = 1; + p->vui.colorPrimaries = parseName(value, x265_colorprim_names, bError); + } + OPT("transfer") + { + p->vui.bEnableVideoSignalTypePresentFlag = 1; + p->vui.bEnableColorDescriptionPresentFlag = 1; + p->vui.transferCharacteristics = parseName(value, x265_transfer_names, bError); + } + OPT("colormatrix") + { + p->vui.bEnableVideoSignalTypePresentFlag = 1; + p->vui.bEnableColorDescriptionPresentFlag = 1; + p->vui.matrixCoeffs = parseName(value, x265_colmatrix_names, bError); + } + OPT("chromaloc") + { + p->vui.bEnableChromaLocInfoPresentFlag = 1; + p->vui.chromaSampleLocTypeTopField = atoi(value); + p->vui.chromaSampleLocTypeBottomField = p->vui.chromaSampleLocTypeTopField; + } + OPT("crop-rect") + { + p->vui.bEnableDefaultDisplayWindowFlag = 1; + bError |= sscanf(value, "%d,%d,%d,%d", + &p->vui.defDispWinLeftOffset, + &p->vui.defDispWinTopOffset, + &p->vui.defDispWinRightOffset, + &p->vui.defDispWinBottomOffset) != 4; + } + OPT("nr") p->noiseReduction = atoi(value); + OPT("pass") + { + int pass = Clip3(0, 3, atoi(value)); + p->rc.bStatWrite = pass & 1; + p->rc.bStatRead = 
pass & 2; + } + OPT("stats") p->rc.statFileName = strdup(value); + else + return X265_PARAM_BAD_NAME; +#undef OPT +#undef atobool +#undef atoi +#undef atof + + bError |= bValueWasNull && !bNameWasBool; + return bError ? X265_PARAM_BAD_VALUE : 0; +} + +namespace x265 { +// internal encoder functions + +int x265_atoi(const char *str, bool& bError) +{ + char *end; + int v = strtol(str, &end, 0); + + if (end == str || *end != '\0') + bError = true; + return v; +} + +/* cpu name can be: + * auto || true - x265::cpu_detect() + * false || no - disabled + * integer bitmap value + * comma separated list of SIMD names, eg: SSE4.1,XOP */ +int parseCpuName(const char *value, bool& bError) +{ + if (!value) + { + bError = 1; + return 0; + } + int cpu; + if (isdigit(value[0])) + cpu = x265_atoi(value, bError); + else + cpu = !strcmp(value, "auto") || x265_atobool(value, bError) ? x265::cpu_detect() : 0; + + if (bError) + { + char *buf = strdup(value); + char *tok, *saveptr = NULL, *init; + bError = 0; + cpu = 0; + for (init = buf; (tok = strtok_r(init, ",", &saveptr)); init = NULL) + { + int i; + for (i = 0; x265::cpu_names[i].flags && strcasecmp(tok, x265::cpu_names[i].name); i++) + { + } + + cpu |= x265::cpu_names[i].flags; + if (!x265::cpu_names[i].flags) + bError = 1; + } + + free(buf); + if ((cpu & X265_CPU_SSSE3) && !(cpu & X265_CPU_SSE2_IS_SLOW)) + cpu |= X265_CPU_SSE2_IS_FAST; + } + + return cpu; +} + +static const int fixedRatios[][2] = +{ + { 1, 1 }, + { 12, 11 }, + { 10, 11 }, + { 16, 11 }, + { 40, 33 }, + { 24, 11 }, + { 20, 11 }, + { 32, 11 }, + { 80, 33 }, + { 18, 11 }, + { 15, 11 }, + { 64, 33 }, + { 160, 99 }, + { 4, 3 }, + { 3, 2 }, + { 2, 1 }, +}; + +void setParamAspectRatio(x265_param *p, int width, int height) +{ + p->vui.aspectRatioIdc = X265_EXTENDED_SAR; + p->vui.sarWidth = width; + p->vui.sarHeight = height; + for (size_t i = 0; i < sizeof(fixedRatios) / sizeof(fixedRatios[0]); i++) + { + if (width == fixedRatios[i][0] && height == fixedRatios[i][1]) + { + 
p->vui.aspectRatioIdc = (int)i + 1; + return; + } + } +} + +void getParamAspectRatio(x265_param *p, int& width, int& height) +{ + if (!p->vui.aspectRatioIdc) + { + width = height = 0; + } + else if ((size_t)p->vui.aspectRatioIdc <= sizeof(fixedRatios) / sizeof(fixedRatios[0])) + { + width = fixedRatios[p->vui.aspectRatioIdc - 1][0]; + height = fixedRatios[p->vui.aspectRatioIdc - 1][1]; + } + else if (p->vui.aspectRatioIdc == X265_EXTENDED_SAR) + { + width = p->vui.sarWidth; + height = p->vui.sarHeight; + } + else + { + width = height = 0; + } +} + +static inline int _confirm(x265_param *param, bool bflag, const char* message) +{ + if (!bflag) + return 0; + + x265_log(param, X265_LOG_ERROR, "%s\n", message); + return 1; +} + +int x265_check_params(x265_param *param) +{ +#define CHECK(expr, msg) check_failed |= _confirm(param, expr, msg) + int check_failed = 0; /* abort if there is a fatal configuration problem */ + + CHECK(param->maxCUSize != 64 && param->maxCUSize != 32 && param->maxCUSize != 16, + "max ctu size must be 16, 32, or 64"); + if (check_failed == 1) + return check_failed; + + uint32_t maxLog2CUSize = (uint32_t)g_log2Size[param->maxCUSize]; + uint32_t tuQTMaxLog2Size = X265_MIN(maxLog2CUSize, 5); + uint32_t tuQTMinLog2Size = 2; //log2(4) + + /* These checks might be temporary */ +#if HIGH_BIT_DEPTH + CHECK(param->internalBitDepth != 10, + "x265 was compiled for 10bit encodes, only 10bit internal depth supported"); +#else + CHECK(param->internalBitDepth != 8, + "x265 was compiled for 8bit encodes, only 8bit internal depth supported"); +#endif + + CHECK(param->rc.qp < -6 * (param->internalBitDepth - 8) || param->rc.qp > 51, + "QP exceeds supported range (-QpBDOffsety to 51)"); + CHECK(param->fpsNum == 0 || param->fpsDenom == 0, + "Frame rate numerator and denominator must be specified"); + CHECK(param->interlaceMode < 0 || param->interlaceMode > 2, + "Interlace mode must be 0 (progressive) 1 (top-field first) or 2 (bottom field first)"); + 
CHECK(param->searchMethod<0 || param->searchMethod> X265_FULL_SEARCH, + "Search method is not supported value (0:DIA 1:HEX 2:UMH 3:HM 5:FULL)"); + CHECK(param->searchRange < 0, + "Search Range must be more than 0"); + CHECK(param->searchRange >= 32768, + "Search Range must be less than 32768"); + CHECK(param->subpelRefine > X265_MAX_SUBPEL_LEVEL, + "subme must be less than or equal to X265_MAX_SUBPEL_LEVEL (7)"); + CHECK(param->subpelRefine < 0, + "subme must be greater than or equal to 0"); + CHECK(param->frameNumThreads < 0, + "frameNumThreads (--frame-threads) must be 0 or higher"); + CHECK(param->cbQpOffset < -12, "Min. Chroma Cb QP Offset is -12"); + CHECK(param->cbQpOffset > 12, "Max. Chroma Cb QP Offset is 12"); + CHECK(param->crQpOffset < -12, "Min. Chroma Cr QP Offset is -12"); + CHECK(param->crQpOffset > 12, "Max. Chroma Cr QP Offset is 12"); + + CHECK(tuQTMaxLog2Size > maxLog2CUSize, + "QuadtreeTULog2MaxSize must be log2(maxCUSize) or smaller."); + + CHECK(param->tuQTMaxInterDepth < 1 || param->tuQTMaxInterDepth > 4, + "QuadtreeTUMaxDepthInter must be greater than 0 and less than 5"); + CHECK(maxLog2CUSize < tuQTMinLog2Size + param->tuQTMaxInterDepth - 1, + "QuadtreeTUMaxDepthInter must be less than or equal to the difference between log2(maxCUSize) and QuadtreeTULog2MinSize plus 1"); + CHECK(param->tuQTMaxIntraDepth < 1 || param->tuQTMaxIntraDepth > 4, + "QuadtreeTUMaxDepthIntra must be greater 0 and less than 5"); + CHECK(maxLog2CUSize < tuQTMinLog2Size + param->tuQTMaxIntraDepth - 1, + "QuadtreeTUMaxDepthInter must be less than or equal to the difference between log2(maxCUSize) and QuadtreeTULog2MinSize plus 1"); + + CHECK(param->maxNumMergeCand < 1, "MaxNumMergeCand must be 1 or greater."); + CHECK(param->maxNumMergeCand > 5, "MaxNumMergeCand must be 5 or smaller."); + + CHECK(param->maxNumReferences < 1, "maxNumReferences must be 1 or greater."); + CHECK(param->maxNumReferences > MAX_NUM_REF, "maxNumReferences must be 16 or smaller."); + + 
CHECK(param->sourceWidth < (int)param->maxCUSize || param->sourceHeight < (int)param->maxCUSize, + "Picture size must be at least one CTU"); + CHECK(param->internalCsp < X265_CSP_I420 || X265_CSP_I444 < param->internalCsp, + "Color space must be i420, i422, or i444"); + CHECK(param->sourceWidth & !!CHROMA_H_SHIFT(param->internalCsp), + "Picture width must be an integer multiple of the specified chroma subsampling"); + CHECK(param->sourceHeight & !!CHROMA_V_SHIFT(param->internalCsp), + "Picture height must be an integer multiple of the specified chroma subsampling"); + + CHECK(param->rc.rateControlMode > X265_RC_CRF || param->rc.rateControlMode < X265_RC_ABR, + "Rate control mode is out of range"); + CHECK(param->rdLevel < 0 || param->rdLevel > 6, + "RD Level is out of range"); + CHECK(param->bframes > param->lookaheadDepth && !param->rc.bStatRead, + "Lookahead depth must be greater than the max consecutive bframe count"); + CHECK(param->bframes < 0, + "bframe count should be greater than zero"); + CHECK(param->bframes > X265_BFRAME_MAX, + "max consecutive bframe count must be 16 or smaller"); + CHECK(param->lookaheadDepth > X265_LOOKAHEAD_MAX, + "Lookahead depth must be less than 256"); + CHECK(param->rc.aqMode < X265_AQ_NONE || X265_AQ_AUTO_VARIANCE < param->rc.aqMode, + "Aq-Mode is out of range"); + CHECK(param->rc.aqStrength < 0 || param->rc.aqStrength > 3, + "Aq-Strength is out of range"); + CHECK(param->psyRd < 0 || 2.0 < param->psyRd, "Psy-rd strength must be between 0 and 2.0"); + CHECK(param->psyRdoq < 0 || 50.0 < param->psyRdoq, "Psy-rdoq strength must be between 0 and 50.0"); + CHECK(param->bEnableWavefront < 0, "WaveFrontSynchro cannot be negative"); + CHECK(!param->bEnableWavefront && param->rc.vbvBufferSize, "VBV requires wave-front parallelism (--wpp)"); + CHECK((param->vui.aspectRatioIdc < 0 + || param->vui.aspectRatioIdc > 16) + && param->vui.aspectRatioIdc != X265_EXTENDED_SAR, + "Sample Aspect Ratio must be 0-16 or 255"); + 
CHECK(param->vui.aspectRatioIdc == X265_EXTENDED_SAR && param->vui.sarWidth <= 0, + "Sample Aspect Ratio width must be greater than 0"); + CHECK(param->vui.aspectRatioIdc == X265_EXTENDED_SAR && param->vui.sarHeight <= 0, + "Sample Aspect Ratio height must be greater than 0"); + CHECK(param->vui.videoFormat < 0 || param->vui.videoFormat > 5, + "Video Format must be component," + " pal, ntsc, secam, mac or undef"); + CHECK(param->vui.colorPrimaries < 0 + || param->vui.colorPrimaries > 9 + || param->vui.colorPrimaries == 3, + "Color Primaries must be undef, bt709, bt470m," + " bt470bg, smpte170m, smpte240m, film or bt2020"); + CHECK(param->vui.transferCharacteristics < 0 + || param->vui.transferCharacteristics > 15 + || param->vui.transferCharacteristics == 3, + "Transfer Characteristics must be undef, bt709, bt470m, bt470bg," + " smpte170m, smpte240m, linear, log100, log316, iec61966-2-4, bt1361e," + " iec61966-2-1, bt2020-10 or bt2020-12"); + CHECK(param->vui.matrixCoeffs < 0 + || param->vui.matrixCoeffs > 10 + || param->vui.matrixCoeffs == 3, + "Matrix Coefficients must be undef, bt709, fcc, bt470bg, smpte170m," + " smpte240m, GBR, YCgCo, bt2020nc or bt2020c"); + CHECK(param->vui.chromaSampleLocTypeTopField < 0 + || param->vui.chromaSampleLocTypeTopField > 5, + "Chroma Sample Location Type Top Field must be 0-5"); + CHECK(param->vui.chromaSampleLocTypeBottomField < 0 + || param->vui.chromaSampleLocTypeBottomField > 5, + "Chroma Sample Location Type Bottom Field must be 0-5"); + CHECK(param->vui.defDispWinLeftOffset < 0, + "Default Display Window Left Offset must be 0 or greater"); + CHECK(param->vui.defDispWinRightOffset < 0, + "Default Display Window Right Offset must be 0 or greater"); + CHECK(param->vui.defDispWinTopOffset < 0, + "Default Display Window Top Offset must be 0 or greater"); + CHECK(param->vui.defDispWinBottomOffset < 0, + "Default Display Window Bottom Offset must be 0 or greater"); + CHECK(param->rc.rfConstant < -6 * (param->internalBitDepth - 8) 
|| param->rc.rfConstant > 51, + "Valid quality based range: -qpBDOffsetY to 51"); + CHECK(param->rc.rfConstantMax < -6 * (param->internalBitDepth - 8) || param->rc.rfConstantMax > 51, + "Valid quality based range: -qpBDOffsetY to 51"); + CHECK(param->rc.rfConstantMin < -6 * (param->internalBitDepth - 8) || param->rc.rfConstantMin > 51, + "Valid quality based range: -qpBDOffsetY to 51"); + CHECK(param->bFrameAdaptive < 0 || param->bFrameAdaptive > 2, + "Valid adaptive b scheduling values 0 - none, 1 - fast, 2 - full"); + CHECK(param->logLevel<-1 || param->logLevel> X265_LOG_FULL, + "Valid Logging level -1:none 0:error 1:warning 2:info 3:debug 4:full"); + CHECK(param->scenecutThreshold < 0, + "scenecutThreshold must be greater than 0"); + CHECK(param->rdPenalty < 0 || param->rdPenalty > 2, + "Valid penalty for 32x32 intra TU in non-I slices. 0:disabled 1:RD-penalty 2:maximum"); + CHECK(param->keyframeMax < -1, + "Invalid max IDR period in frames. value should be greater than -1"); + CHECK(param->decodedPictureHashSEI < 0 || param->decodedPictureHashSEI > 3, + "Invalid hash option. 
Decoded Picture Hash SEI 0: disabled, 1: MD5, 2: CRC, 3: Checksum"); + CHECK(param->rc.vbvBufferSize < 0, + "Size of the vbv buffer can not be less than zero"); + CHECK(param->rc.vbvMaxBitrate < 0, + "Maximum local bit rate can not be less than zero"); + CHECK(param->rc.vbvBufferInit < 0, + "Valid initial VBV buffer occupancy must be a fraction 0 - 1, or size in kbits"); + CHECK(param->rc.bitrate < 0, + "Target bitrate can not be less than zero"); + if (param->noiseReduction) + CHECK(100 > param->noiseReduction || param->noiseReduction > 1000, "Valid noise reduction range 100 - 1000"); + CHECK(param->rc.rateControlMode == X265_RC_CRF && param->rc.bStatRead, + "Constant rate-factor is incompatible with 2pass"); + CHECK(param->rc.rateControlMode == X265_RC_CQP && param->rc.bStatRead, + "Constant QP is incompatible with 2pass"); + return check_failed; +} + +void x265_param_apply_fastfirstpass(x265_param* param) +{ + /* Set faster options in case of turbo firstpass */ + if (param->rc.bStatWrite && !param->rc.bStatRead) + { + param->maxNumReferences = 1; + param->maxNumMergeCand = 1; + param->bEnableRectInter = 0; + param->bEnableFastIntra = 1; + param->bEnableAMP = 0; + param->searchMethod = X265_DIA_SEARCH; + param->subpelRefine = X265_MIN(2, param->subpelRefine); + param->bEnableEarlySkip = 1; + param->rdLevel = X265_MIN(2, param->rdLevel); + } +} + +int x265_set_globals(x265_param *param) +{ + static int once /* = 0 */; + + if (ATOMIC_CAS32(&once, 0, 1) == 1) + { + if (param->maxCUSize != g_maxCUSize) + { + x265_log(param, X265_LOG_ERROR, "maxCUSize must be the same for all encoders in a single process"); + return -1; + } + } + else + { + uint32_t maxLog2CUSize = (uint32_t)g_log2Size[param->maxCUSize]; + + // set max CU width & height + g_maxCUSize = param->maxCUSize; + g_maxLog2CUSize = maxLog2CUSize; + + // compute actual CU depth with respect to config depth and max transform size + g_maxCUDepth = maxLog2CUSize - MIN_LOG2_CU_SIZE; + g_maxFullDepth = maxLog2CUSize 
- LOG2_UNIT_SIZE; + + // initialize partition order + uint32_t* tmp = &g_zscanToRaster[0]; + initZscanToRaster(g_maxFullDepth, 1, 0, tmp); + initRasterToZscan(g_maxFullDepth); + } + return 0; +} + +void x265_print_params(x265_param *param) +{ + if (param->logLevel < X265_LOG_INFO) + return; + +#if HIGH_BIT_DEPTH + x265_log(param, X265_LOG_INFO, "Internal bit depth : %d\n", param->internalBitDepth); +#endif + if (param->interlaceMode) + x265_log(param, X265_LOG_INFO, "Interlaced field inputs : %s\n", x265_interlace_names[param->interlaceMode]); + + x265_log(param, X265_LOG_INFO, "CTU size / RQT depth inter / intra : %d / %d / %d\n", + param->maxCUSize, param->tuQTMaxInterDepth, param->tuQTMaxIntraDepth); + + x265_log(param, X265_LOG_INFO, "ME / range / subpel / merge : %s / %d / %d / %d\n", + x265_motion_est_names[param->searchMethod], param->searchRange, param->subpelRefine, param->maxNumMergeCand); + + if (param->keyframeMax != INT_MAX || param->scenecutThreshold) + x265_log(param, X265_LOG_INFO, "Keyframe min / max / scenecut : %d / %d / %d\n", param->keyframeMin, param->keyframeMax, param->scenecutThreshold); + else + x265_log(param, X265_LOG_INFO, "Keyframe min / max / scenecut : disabled\n"); + + if (param->cbQpOffset || param->crQpOffset) + x265_log(param, X265_LOG_INFO, "Cb/Cr QP Offset : %d / %d\n", param->cbQpOffset, param->crQpOffset); + + if (param->rdPenalty) + x265_log(param, X265_LOG_INFO, "Intra 32x32 TU penalty type : %d\n", param->rdPenalty); + + x265_log(param, X265_LOG_INFO, "Lookahead / bframes / badapt : %d / %d / %d\n", param->lookaheadDepth, param->bframes, param->bFrameAdaptive); + x265_log(param, X265_LOG_INFO, "b-pyramid / weightp / weightb / refs: %d / %d / %d / %d\n", + param->bBPyramid, param->bEnableWeightedPred, param->bEnableWeightedBiPred, param->maxNumReferences); + + if (param->bLossless) + x265_log(param, X265_LOG_INFO, "Rate Control : Lossless\n"); + else switch (param->rc.rateControlMode) + { + case X265_RC_ABR: + 
x265_log(param, X265_LOG_INFO, "Rate Control / AQ-Strength / CUTree : ABR-%d kbps / %0.1f / %d\n", param->rc.bitrate, + param->rc.aqStrength, param->rc.cuTree); + break; + case X265_RC_CQP: + x265_log(param, X265_LOG_INFO, "Rate Control / AQ-Strength / CUTree : CQP-%d / %0.1f / %d\n", param->rc.qp, param->rc.aqStrength, + param->rc.cuTree); + break; + case X265_RC_CRF: + x265_log(param, X265_LOG_INFO, "Rate Control / AQ-Strength / CUTree : CRF-%0.1f / %0.1f / %d\n", param->rc.rfConstant, + param->rc.aqStrength, param->rc.cuTree); + break; + } + + if (param->rc.vbvBufferSize) + x265_log(param, X265_LOG_INFO, "VBV/HRD buffer / max-rate / init : %d / %d / %.3f\n", + param->rc.vbvBufferSize, param->rc.vbvMaxBitrate, param->rc.vbvBufferInit); + + x265_log(param, X265_LOG_INFO, "tools: "); +#define TOOLOPT(FLAG, STR) if (FLAG) fprintf(stderr, "%s ", STR) + TOOLOPT(param->bEnableRectInter, "rect"); + TOOLOPT(param->bEnableAMP, "amp"); + fprintf(stderr, "rd=%d ", param->rdLevel); + if (param->psyRd > 0.) + fprintf(stderr, "psy-rd=%.2lf ", param->psyRd); + if (param->psyRdoq > 0.) + fprintf(stderr, "psy-rdoq=%.2lf ", param->psyRdoq); + TOOLOPT(param->bEnableEarlySkip, "esd"); + TOOLOPT(param->bEnableCbfFastMode, "cfm"); + if (param->noiseReduction) + fprintf(stderr, "nr=%d ", param->noiseReduction); + TOOLOPT(param->bEnableLoopFilter, "lft"); + if (param->bEnableSAO) + fprintf(stderr, "sao%s ", param->bSaoNonDeblocked ? "-non-deblock" : ""); + TOOLOPT(param->bEnableSignHiding, "signhide"); + TOOLOPT(param->bEnableConstrainedIntra, "cip"); + TOOLOPT(param->bIntraInBFrames, "b-intra"); + TOOLOPT(param->bEnableFastIntra, "fast-intra"); + TOOLOPT(param->bEnableTemporalMvp, "tmvp"); + if (param->bEnableTransformSkip) + fprintf(stderr, "tskip%s ", param->bEnableTSkipFast ? 
"-fast" : ""); + TOOLOPT(param->bCULossless, "cu-lossless"); + TOOLOPT(param->rc.bStatWrite, "stats-write"); + TOOLOPT(param->rc.bStatRead, "stats-read"); + fprintf(stderr, "\n"); + fflush(stderr); +} + +char *x265_param2string(x265_param *p) +{ + char *buf, *s; + + buf = s = X265_MALLOC(char, MAXPARAMSIZE); + if (!buf) + return NULL; + +#define BOOL(param, cliopt) \ + s += sprintf(s, " %s", (param) ? cliopt : "no-"cliopt); + + s += sprintf(s, "%dx%d", p->sourceWidth,p->sourceHeight); + s += sprintf(s, " fps=%u/%u", p->fpsNum, p->fpsDenom); + s += sprintf(s, " bitdepth=%d", p->internalBitDepth); + BOOL(p->bEnableWavefront, "wpp"); + s += sprintf(s, " ctu=%d", p->maxCUSize); + s += sprintf(s, " tu-intra-depth=%d", p->tuQTMaxIntraDepth); + s += sprintf(s, " tu-inter-depth=%d", p->tuQTMaxInterDepth); + s += sprintf(s, " me=%d", p->searchMethod); + s += sprintf(s, " subme=%d", p->subpelRefine); + s += sprintf(s, " merange=%d", p->searchRange); + BOOL(p->bEnableRectInter, "rect"); + BOOL(p->bEnableAMP, "amp"); + s += sprintf(s, " max-merge=%d", p->maxNumMergeCand); + BOOL(p->bEnableTemporalMvp, "temporal-mvp"); + BOOL(p->bEnableEarlySkip, "early-skip"); + BOOL(p->bEnableCbfFastMode, "fast-cbf"); + s += sprintf(s, " rdpenalty=%d", p->rdPenalty); + BOOL(p->bEnableTransformSkip, "tskip"); + BOOL(p->bEnableTSkipFast, "tskip-fast"); + BOOL(p->bEnableStrongIntraSmoothing, "strong-intra-smoothing"); + BOOL(p->bLossless, "lossless"); + BOOL(p->bCULossless, "cu-lossless"); + BOOL(p->bEnableConstrainedIntra, "constrained-intra"); + BOOL(p->bEnableFastIntra, "fast-intra"); + BOOL(p->bOpenGOP, "open-gop"); + s += sprintf(s, " interlace=%d", p->interlaceMode); + s += sprintf(s, " keyint=%d", p->keyframeMax); + s += sprintf(s, " min-keyint=%d", p->keyframeMin); + s += sprintf(s, " scenecut=%d", p->scenecutThreshold); + s += sprintf(s, " rc-lookahead=%d", p->lookaheadDepth); + s += sprintf(s, " bframes=%d", p->bframes); + s += sprintf(s, " bframe-bias=%d", p->bFrameBias); + s += 
sprintf(s, " b-adapt=%d", p->bFrameAdaptive); + s += sprintf(s, " ref=%d", p->maxNumReferences); + BOOL(p->bEnableWeightedPred, "weightp"); + BOOL(p->bEnableWeightedBiPred, "weightb"); + s += sprintf(s, " aq-mode=%d", p->rc.aqMode); + s += sprintf(s, " aq-strength=%.2f", p->rc.aqStrength); + s += sprintf(s, " cbqpoffs=%d", p->cbQpOffset); + s += sprintf(s, " crqpoffs=%d", p->crQpOffset); + s += sprintf(s, " rd=%d", p->rdLevel); + s += sprintf(s, " psy-rd=%.2f", p->psyRd); + s += sprintf(s, " psy-rdoq=%.2f", p->psyRdoq); + BOOL(p->bEnableSignHiding, "signhide"); + BOOL(p->bEnableLoopFilter, "lft"); + BOOL(p->bEnableSAO, "sao"); + BOOL(p->bSaoNonDeblocked, "sao-non-deblock"); + BOOL(p->bBPyramid, "b-pyramid"); + BOOL(p->rc.cuTree, "cutree"); + s += sprintf(s, " rc=%s", p->rc.rateControlMode == X265_RC_ABR ? ( + p->rc.bStatRead ? "2 pass" : p->rc.bitrate == p->rc.vbvMaxBitrate ? "cbr" : "abr") + : p->rc.rateControlMode == X265_RC_CRF ? "crf" : "cqp"); + if (p->rc.rateControlMode == X265_RC_ABR || p->rc.rateControlMode == X265_RC_CRF) + { + if (p->rc.rateControlMode == X265_RC_CRF) + s += sprintf(s, " crf=%.1f", p->rc.rfConstant); + else + s += sprintf(s, " bitrate=%d ratetol=%.1f", + p->rc.bitrate, p->rc.rateTolerance); + s += sprintf(s, " qcomp=%.2f qpmin=%d qpmax=%d qpstep=%d", + p->rc.qCompress, QP_MIN, QP_MAX_SPEC, p->rc.qpStep); + if (p->rc.bStatRead) + s += sprintf( s, " cplxblur=%.1f qblur=%.1f", + p->rc.complexityBlur, p->rc.qblur); + if (p->rc.vbvBufferSize) + { + s += sprintf(s, " vbv-maxrate=%d vbv-bufsize=%d", + p->rc.vbvMaxBitrate, p->rc.vbvBufferSize); + if (p->rc.rateControlMode == X265_RC_CRF) + s += sprintf(s, " crf-max=%.1f", p->rc.rfConstantMax); + } + } + else if (p->rc.rateControlMode == X265_RC_CQP) + s += sprintf(s, " qp=%d", p->rc.qp); + if (!(p->rc.rateControlMode == X265_RC_CQP && p->rc.qp == 0)) + { + s += sprintf(s, " ipratio=%.2f", p->rc.ipFactor); + if (p->bframes) + s += sprintf(s, " pbratio=%.2f", p->rc.pbFactor); + } +#undef BOOL + 
return buf; +} + +bool parseLambdaFile(x265_param *param) +{ + if (!param->rc.lambdaFileName) + return false; + + FILE *lfn = fopen(param->rc.lambdaFileName, "r"); + if (!lfn) + { + x265_log(param, X265_LOG_ERROR, "unable to read lambda file <%s>\n", param->rc.lambdaFileName); + return true; + } + + char line[2048]; + char *toksave = NULL, *tok = NULL, *buf = NULL; + + for (int t = 0; t < 3; t++) + { + double *table = t ? x265_lambda2_tab : x265_lambda_tab; + + for (int i = 0; i < QP_MAX_MAX + 1; i++) + { + double value; + + do + { + if (!tok) + { + /* consume a line of text file */ + if (!fgets(line, sizeof(line), lfn)) + { + fclose(lfn); + + if (t < 2) + { + x265_log(param, X265_LOG_ERROR, "lambda file is incomplete\n"); + return true; + } + else + return false; + } + + /* truncate at first hash */ + char *hash = strchr(line, '#'); + if (hash) *hash = 0; + buf = line; + } + + tok = strtok_r(buf, " ,", &toksave); + buf = NULL; + if (tok && sscanf(tok, "%lf", &value) == 1) + break; + } + while (1); + + if (t == 2) + { + x265_log(param, X265_LOG_ERROR, "lambda file contains too many values\n"); + fclose(lfn); + return true; + } + else + x265_log(param, X265_LOG_DEBUG, "lambda%c[%d] = %lf\n", t ? '2' : ' ', i, value); + table[i] = value; + } + } + + fclose(lfn); + return false; +} + +} diff --git a/source/common/param.h b/source/common/param.h new file mode 100644 index 0000000..fa42006 --- /dev/null +++ b/source/common/param.h @@ -0,0 +1,45 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Deepthi Nandakumar + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#ifndef X265_PARAM_H +#define X265_PARAM_H + +namespace x265 { +int x265_check_params(x265_param *param); +int x265_set_globals(x265_param *param); +void x265_print_params(x265_param *param); +void x265_param_apply_fastfirstpass(x265_param *p); +char* x265_param2string(x265_param *param); +int x265_atoi(const char *str, bool& bError); +int parseCpuName(const char *value, bool& bError); +void setParamAspectRatio(x265_param *p, int width, int height); +void getParamAspectRatio(x265_param *p, int& width, int& height); +bool parseLambdaFile(x265_param *param); + +/* this table is kept internal to avoid confusion, since log level indices start at -1 */ +static const char * const logLevelNames[] = { "none", "error", "warning", "info", "debug", "full", 0 }; + +#define MAXPARAMSIZE 2000 +} + +#endif // ifndef X265_PARAM_H diff --git a/source/common/piclist.cpp b/source/common/piclist.cpp new file mode 100644 index 0000000..735f05b --- /dev/null +++ b/source/common/piclist.cpp @@ -0,0 +1,151 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Gopu Govindaswamy + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * 
the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#include "common.h" +#include "piclist.h" +#include "frame.h" + +using namespace x265; + +void PicList::pushFront(Frame& curFrame) +{ + X265_CHECK(!curFrame.m_next && !curFrame.m_prev, "piclist: picture already in list\n"); // ensure frame is not in a list + curFrame.m_next = m_start; + curFrame.m_prev = NULL; + + if (m_count) + { + m_start->m_prev = &curFrame; + m_start = &curFrame; + } + else + { + m_start = m_end = &curFrame; + } + m_count++; +} + +void PicList::pushBack(Frame& curFrame) +{ + X265_CHECK(!curFrame.m_next && !curFrame.m_prev, "piclist: picture already in list\n"); // ensure frame is not in a list + curFrame.m_next = NULL; + curFrame.m_prev = m_end; + + if (m_count) + { + m_end->m_next = &curFrame; + m_end = &curFrame; + } + else + { + m_start = m_end = &curFrame; + } + m_count++; +} + +Frame *PicList::popFront() +{ + if (m_start) + { + Frame *temp = m_start; + m_count--; + + if (m_count) + { + m_start = m_start->m_next; + m_start->m_prev = NULL; + } + else + { + m_start = m_end = NULL; + } + temp->m_next = temp->m_prev = NULL; + return temp; + } + else + return NULL; +} + +Frame* PicList::getPOC(int poc) +{ + Frame *curFrame = m_start; + while (curFrame && 
curFrame->m_poc != poc) + curFrame = curFrame->m_next; + return curFrame; +} + +Frame *PicList::popBack() +{ + if (m_end) + { + Frame* temp = m_end; + m_count--; + + if (m_count) + { + m_end = m_end->m_prev; + m_end->m_next = NULL; + } + else + { + m_start = m_end = NULL; + } + temp->m_next = temp->m_prev = NULL; + return temp; + } + else + return NULL; +} + +void PicList::remove(Frame& curFrame) +{ +#if _DEBUG + Frame *tmp = m_start; + while (tmp && tmp != &curFrame) + { + tmp = tmp->m_next; + } + + X265_CHECK(tmp == &curFrame, "piclist: pic being removed was not in list\n"); // verify pic is in this list +#endif + + m_count--; + if (m_count) + { + if (m_start == &curFrame) + m_start = curFrame.m_next; + if (m_end == &curFrame) + m_end = curFrame.m_prev; + + if (curFrame.m_next) + curFrame.m_next->m_prev = curFrame.m_prev; + if (curFrame.m_prev) + curFrame.m_prev->m_next = curFrame.m_next; + } + else + { + m_start = m_end = NULL; + } + + curFrame.m_next = curFrame.m_prev = NULL; +} diff --git a/source/common/piclist.h b/source/common/piclist.h new file mode 100644 index 0000000..d6936f7 --- /dev/null +++ b/source/common/piclist.h @@ -0,0 +1,79 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Gopu Govindaswamy + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#ifndef X265_PICLIST_H +#define X265_PICLIST_H + +#include + +namespace x265 { +class Frame; + +class PicList +{ +protected: + + Frame* m_start; + Frame* m_end; + int m_count; + +public: + + PicList() + { + m_start = NULL; + m_end = NULL; + m_count = 0; + } + + /** Push picture to end of the list */ + void pushBack(Frame& pic); + + /** Push picture to beginning of the list */ + void pushFront(Frame& pic); + + /** Pop picture from end of the list */ + Frame* popBack(); + + /** Pop picture from beginning of the list */ + Frame* popFront(); + + /** Find frame with specified POC */ + Frame* getPOC(int poc); + + /** Remove picture from list */ + void remove(Frame& pic); + + Frame* first() { return m_start; } + + Frame* last() { return m_end; } + + int size() { return m_count; } + + bool empty() const { return !m_count; } + + operator bool() const { return !!m_count; } +}; +} + +#endif // ifndef X265_PICLIST_H diff --git a/source/common/picyuv.cpp b/source/common/picyuv.cpp new file mode 100644 index 0000000..7f4fd06 --- /dev/null +++ b/source/common/picyuv.cpp @@ -0,0 +1,397 @@ +/***************************************************************************** + * Copyright (C) 2014 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#include "common.h" +#include "picyuv.h" +#include "slice.h" +#include "primitives.h" + +using namespace x265; + +PicYuv::PicYuv() +{ + m_picBuf[0] = NULL; + m_picBuf[1] = NULL; + m_picBuf[2] = NULL; + + m_picOrg[0] = NULL; + m_picOrg[1] = NULL; + m_picOrg[2] = NULL; + + m_cuOffsetY = NULL; + m_cuOffsetC = NULL; + m_buOffsetY = NULL; + m_buOffsetC = NULL; +} + +bool PicYuv::create(uint32_t picWidth, uint32_t picHeight, uint32_t picCsp) +{ + m_picWidth = picWidth; + m_picHeight = picHeight; + m_hChromaShift = CHROMA_H_SHIFT(picCsp); + m_vChromaShift = CHROMA_V_SHIFT(picCsp); + m_picCsp = picCsp; + + uint32_t numCuInWidth = (m_picWidth + g_maxCUSize - 1) / g_maxCUSize; + uint32_t numCuInHeight = (m_picHeight + g_maxCUSize - 1) / g_maxCUSize; + + m_lumaMarginX = g_maxCUSize + 32; // search margin and 8-tap filter half-length, padded for 32-byte alignment + m_lumaMarginY = g_maxCUSize + 16; // margin for 8-tap filter and infinite padding + m_stride = (numCuInWidth * g_maxCUSize) + (m_lumaMarginX << 1); + + m_chromaMarginX = m_lumaMarginX; // keep 16-byte alignment for chroma CTUs + m_chromaMarginY = m_lumaMarginY >> m_vChromaShift; + + m_strideC = ((numCuInWidth * g_maxCUSize) >> m_hChromaShift) + (m_chromaMarginX * 2); + int maxHeight = numCuInHeight * g_maxCUSize; + + 
CHECKED_MALLOC(m_picBuf[0], pixel, m_stride * (maxHeight + (m_lumaMarginY * 2))); + CHECKED_MALLOC(m_picBuf[1], pixel, m_strideC * ((maxHeight >> m_vChromaShift) + (m_chromaMarginY * 2))); + CHECKED_MALLOC(m_picBuf[2], pixel, m_strideC * ((maxHeight >> m_vChromaShift) + (m_chromaMarginY * 2))); + + m_picOrg[0] = m_picBuf[0] + m_lumaMarginY * m_stride + m_lumaMarginX; + m_picOrg[1] = m_picBuf[1] + m_chromaMarginY * m_strideC + m_chromaMarginX; + m_picOrg[2] = m_picBuf[2] + m_chromaMarginY * m_strideC + m_chromaMarginX; + + return true; + +fail: + return false; +} + +/* the first picture allocated by the encoder will be asked to generate these + * offset arrays. Once generated, they will be provided to all future PicYuv + * allocated by the same encoder. */ +bool PicYuv::createOffsets(const SPS& sps) +{ + uint32_t numPartitions = 1 << (g_maxFullDepth * 2); + CHECKED_MALLOC(m_cuOffsetY, intptr_t, sps.numCuInWidth * sps.numCuInHeight); + CHECKED_MALLOC(m_cuOffsetC, intptr_t, sps.numCuInWidth * sps.numCuInHeight); + for (uint32_t cuRow = 0; cuRow < sps.numCuInHeight; cuRow++) + { + for (uint32_t cuCol = 0; cuCol < sps.numCuInWidth; cuCol++) + { + m_cuOffsetY[cuRow * sps.numCuInWidth + cuCol] = m_stride * cuRow * g_maxCUSize + cuCol * g_maxCUSize; + m_cuOffsetC[cuRow * sps.numCuInWidth + cuCol] = m_strideC * cuRow * (g_maxCUSize >> m_vChromaShift) + cuCol * (g_maxCUSize >> m_hChromaShift); + } + } + + CHECKED_MALLOC(m_buOffsetY, intptr_t, (size_t)numPartitions); + CHECKED_MALLOC(m_buOffsetC, intptr_t, (size_t)numPartitions); + for (uint32_t idx = 0; idx < numPartitions; ++idx) + { + intptr_t x = g_zscanToPelX[idx]; + intptr_t y = g_zscanToPelY[idx]; + m_buOffsetY[idx] = m_stride * y + x; + m_buOffsetC[idx] = m_strideC * (y >> m_vChromaShift) + (x >> m_hChromaShift); + } + + return true; + +fail: + return false; +} + +void PicYuv::destroy() +{ + X265_FREE(m_picBuf[0]); + X265_FREE(m_picBuf[1]); + X265_FREE(m_picBuf[2]); +} + +/* Copy pixels from an x265_picture into 
internal PicYuv instance. + * Shift pixels as necessary, mask off bits above X265_DEPTH for safety. */ +void PicYuv::copyFromPicture(const x265_picture& pic, int padx, int pady) +{ + /* m_picWidth is the width that is being encoded, padx indicates how many + * of those pixels are padding to reach multiple of MinCU(4) size. + * + * Internally, we need to extend rows out to a multiple of 16 for lowres + * downscale and other operations. But those padding pixels are never + * encoded. + * + * The same applies to m_picHeight and pady */ + + /* width and height - without padsize (input picture raw width and height) */ + int width = m_picWidth - padx; + int height = m_picHeight - pady; + + /* internal pad to multiple of 16x16 blocks */ + uint8_t rem = width & 15; + + padx = rem ? 16 - rem : padx; + rem = height & 15; + pady = rem ? 16 - rem : pady; + + /* add one more row and col of pad for downscale interpolation, fixes + * warnings from valgrind about using uninitialized pixels */ + padx++; + pady++; + + if (pic.bitDepth < X265_DEPTH) + { + pixel *yPixel = m_picOrg[0]; + pixel *uPixel = m_picOrg[1]; + pixel *vPixel = m_picOrg[2]; + + uint8_t *yChar = (uint8_t*)pic.planes[0]; + uint8_t *uChar = (uint8_t*)pic.planes[1]; + uint8_t *vChar = (uint8_t*)pic.planes[2]; + int shift = X265_MAX(0, X265_DEPTH - pic.bitDepth); + + primitives.planecopy_cp(yChar, pic.stride[0] / sizeof(*yChar), yPixel, m_stride, width, height, shift); + primitives.planecopy_cp(uChar, pic.stride[1] / sizeof(*uChar), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift); + primitives.planecopy_cp(vChar, pic.stride[2] / sizeof(*vChar), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift); + } + else if (pic.bitDepth == 8) + { + pixel *yPixel = m_picOrg[0]; + pixel *uPixel = m_picOrg[1]; + pixel *vPixel = m_picOrg[2]; + + uint8_t *yChar = (uint8_t*)pic.planes[0]; + uint8_t *uChar = (uint8_t*)pic.planes[1]; + uint8_t *vChar = (uint8_t*)pic.planes[2]; + + for 
(int r = 0; r < height; r++) + { + for (int c = 0; c < width; c++) + { + yPixel[c] = (pixel)yChar[c]; + } + + yPixel += m_stride; + yChar += pic.stride[0] / sizeof(*yChar); + } + + for (int r = 0; r < height >> m_vChromaShift; r++) + { + for (int c = 0; c < width >> m_hChromaShift; c++) + { + uPixel[c] = (pixel)uChar[c]; + vPixel[c] = (pixel)vChar[c]; + } + + uPixel += m_strideC; + vPixel += m_strideC; + uChar += pic.stride[1] / sizeof(*uChar); + vChar += pic.stride[2] / sizeof(*vChar); + } + } + else /* pic.bitDepth > 8 */ + { + pixel *yPixel = m_picOrg[0]; + pixel *uPixel = m_picOrg[1]; + pixel *vPixel = m_picOrg[2]; + + uint16_t *yShort = (uint16_t*)pic.planes[0]; + uint16_t *uShort = (uint16_t*)pic.planes[1]; + uint16_t *vShort = (uint16_t*)pic.planes[2]; + + /* defensive programming, mask off bits that are supposed to be zero */ + uint16_t mask = (1 << X265_DEPTH) - 1; + int shift = X265_MAX(0, pic.bitDepth - X265_DEPTH); + + /* shift and mask pixels to final size */ + + primitives.planecopy_sp(yShort, pic.stride[0] / sizeof(*yShort), yPixel, m_stride, width, height, shift, mask); + primitives.planecopy_sp(uShort, pic.stride[1] / sizeof(*uShort), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask); + primitives.planecopy_sp(vShort, pic.stride[2] / sizeof(*vShort), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask); + } + + /* extend the right edge if width was not multiple of the minimum CU size */ + if (padx) + { + pixel *Y = m_picOrg[0]; + pixel *U = m_picOrg[1]; + pixel *V = m_picOrg[2]; + + for (int r = 0; r < height; r++) + { + for (int x = 0; x < padx; x++) + { + Y[width + x] = Y[width - 1]; + } + + Y += m_stride; + } + + for (int r = 0; r < height >> m_vChromaShift; r++) + { + for (int x = 0; x < padx >> m_hChromaShift; x++) + { + U[(width >> m_hChromaShift) + x] = U[(width >> m_hChromaShift) - 1]; + V[(width >> m_hChromaShift) + x] = V[(width >> m_hChromaShift) - 1]; + } + + U += m_strideC; 
+ V += m_strideC; + } + } + + /* extend the bottom if height was not multiple of the minimum CU size */ + if (pady) + { + pixel *Y = m_picOrg[0] + (height - 1) * m_stride; + pixel *U = m_picOrg[1] + ((height >> m_vChromaShift) - 1) * m_strideC; + pixel *V = m_picOrg[2] + ((height >> m_vChromaShift) - 1) * m_strideC; + + for (int i = 1; i <= pady; i++) + { + memcpy(Y + i * m_stride, Y, (width + padx) * sizeof(pixel)); + } + + for (int j = 1; j <= pady >> m_vChromaShift; j++) + { + memcpy(U + j * m_strideC, U, ((width + padx) >> m_hChromaShift) * sizeof(pixel)); + memcpy(V + j * m_strideC, V, ((width + padx) >> m_hChromaShift) * sizeof(pixel)); + } + } +} + +namespace x265 { + +template +static void md5_block(MD5Context& md5, const pixel* plane, uint32_t n) +{ + /* create a 64 byte buffer for packing pixel's into */ + uint8_t buf[64 / OUTPUT_BITDEPTH_DIV8][OUTPUT_BITDEPTH_DIV8]; + + for (uint32_t i = 0; i < n; i++) + { + pixel pel = plane[i]; + /* perform bitdepth and endian conversion */ + for (uint32_t d = 0; d < OUTPUT_BITDEPTH_DIV8; d++) + buf[i][d] = (uint8_t)(pel >> (d * 8)); + } + + MD5Update(&md5, (uint8_t*)buf, n * OUTPUT_BITDEPTH_DIV8); +} + +/* Update md5 with all samples in plane in raster order, each sample + * is adjusted to OUTBIT_BITDEPTH_DIV8 */ +template +static void md5_plane(MD5Context& md5, const pixel* plane, uint32_t width, uint32_t height, intptr_t stride) +{ + /* N is the number of samples to process per md5 update. + * All N samples must fit in buf */ + uint32_t N = 32; + uint32_t width_modN = width % N; + uint32_t width_less_modN = width - width_modN; + + for (uint32_t y = 0; y < height; y++) + { + /* convert pel's into uint32_t chars in little endian byte order. + * NB, for 8bit data, data is truncated to 8bits. 
*/ + for (uint32_t x = 0; x < width_less_modN; x += N) + md5_block(md5, &plane[y * stride + x], N); + + /* mop up any of the remaining line */ + md5_block(md5, &plane[y * stride + width_less_modN], width_modN); + } +} + +void updateCRC(const pixel* plane, uint32_t& crcVal, uint32_t height, uint32_t width, intptr_t stride) +{ + uint32_t crcMsb; + uint32_t bitVal; + uint32_t bitIdx; + + for (uint32_t y = 0; y < height; y++) + { + for (uint32_t x = 0; x < width; x++) + { + // take CRC of first pictureData byte + for (bitIdx = 0; bitIdx < 8; bitIdx++) + { + crcMsb = (crcVal >> 15) & 1; + bitVal = (plane[y * stride + x] >> (7 - bitIdx)) & 1; + crcVal = (((crcVal << 1) + bitVal) & 0xffff) ^ (crcMsb * 0x1021); + } + +#if _MSC_VER +#pragma warning(disable: 4127) // conditional expression is constant +#endif + // take CRC of second pictureData byte if bit depth is greater than 8-bits + if (X265_DEPTH > 8) + { + for (bitIdx = 0; bitIdx < 8; bitIdx++) + { + crcMsb = (crcVal >> 15) & 1; + bitVal = (plane[y * stride + x] >> (15 - bitIdx)) & 1; + crcVal = (((crcVal << 1) + bitVal) & 0xffff) ^ (crcMsb * 0x1021); + } + } + } + } +} + +void crcFinish(uint32_t& crcVal, uint8_t digest[16]) +{ + uint32_t crcMsb; + + for (int bitIdx = 0; bitIdx < 16; bitIdx++) + { + crcMsb = (crcVal >> 15) & 1; + crcVal = ((crcVal << 1) & 0xffff) ^ (crcMsb * 0x1021); + } + + digest[0] = (crcVal >> 8) & 0xff; + digest[1] = crcVal & 0xff; +} + +void updateChecksum(const pixel* plane, uint32_t& checksumVal, uint32_t height, uint32_t width, intptr_t stride, int row, uint32_t cuHeight) +{ + uint8_t xor_mask; + + for (uint32_t y = row * cuHeight; y < ((row * cuHeight) + height); y++) + { + for (uint32_t x = 0; x < width; x++) + { + xor_mask = (uint8_t)((x & 0xff) ^ (y & 0xff) ^ (x >> 8) ^ (y >> 8)); + checksumVal = (checksumVal + ((plane[y * stride + x] & 0xff) ^ xor_mask)) & 0xffffffff; + + if (X265_DEPTH > 8) + checksumVal = (checksumVal + ((plane[y * stride + x] >> 7 >> 1) ^ xor_mask)) & 0xffffffff; + } + 
} +} + +void checksumFinish(uint32_t checksum, uint8_t digest[16]) +{ + digest[0] = (checksum >> 24) & 0xff; + digest[1] = (checksum >> 16) & 0xff; + digest[2] = (checksum >> 8) & 0xff; + digest[3] = checksum & 0xff; +} + +void updateMD5Plane(MD5Context& md5, const pixel* plane, uint32_t width, uint32_t height, intptr_t stride) +{ + /* choose an md5_plane packing function based on the system bitdepth */ + typedef void(*MD5PlaneFunc)(MD5Context&, const pixel*, uint32_t, uint32_t, intptr_t); + MD5PlaneFunc md5_plane_func; + md5_plane_func = X265_DEPTH <= 8 ? (MD5PlaneFunc)md5_plane<1> : (MD5PlaneFunc)md5_plane<2>; + + md5_plane_func(md5, plane, width, height, stride); +} +} diff --git a/source/common/picyuv.h b/source/common/picyuv.h new file mode 100644 index 0000000..1e18d8c --- /dev/null +++ b/source/common/picyuv.h @@ -0,0 +1,94 @@ +/***************************************************************************** + * Copyright (C) 2014 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. 
+ *****************************************************************************/ + +#ifndef X265_PICYUV_H +#define X265_PICYUV_H + +#include "common.h" +#include "md5.h" +#include "x265.h" + +namespace x265 { +// private namespace + +class ShortYuv; +struct SPS; + +class PicYuv +{ +public: + + pixel* m_picBuf[3]; // full allocated buffers, including margins + pixel* m_picOrg[3]; // pointers to plane starts + + uint32_t m_picWidth; + uint32_t m_picHeight; + intptr_t m_stride; + intptr_t m_strideC; + + uint32_t m_picCsp; + uint32_t m_hChromaShift; + uint32_t m_vChromaShift; + + intptr_t* m_cuOffsetY; /* these four buffers are owned by the top-level encoder */ + intptr_t* m_cuOffsetC; + intptr_t* m_buOffsetY; + intptr_t* m_buOffsetC; + + uint32_t m_lumaMarginX; + uint32_t m_lumaMarginY; + uint32_t m_chromaMarginX; + uint32_t m_chromaMarginY; + + PicYuv(); + + bool create(uint32_t picWidth, uint32_t picHeight, uint32_t csp); + bool createOffsets(const SPS& sps); + void destroy(); + + void copyFromPicture(const x265_picture&, int padx, int pady); + + intptr_t getChromaAddrOffset(uint32_t ctuAddr, uint32_t absPartIdx) const { return m_cuOffsetC[ctuAddr] + m_buOffsetC[absPartIdx]; } + + /* get pointer to CTU start address */ + pixel* getLumaAddr(uint32_t ctuAddr) { return m_picOrg[0] + m_cuOffsetY[ctuAddr]; } + pixel* getCbAddr(uint32_t ctuAddr) { return m_picOrg[1] + m_cuOffsetC[ctuAddr]; } + pixel* getCrAddr(uint32_t ctuAddr) { return m_picOrg[2] + m_cuOffsetC[ctuAddr]; } + pixel* getChromaAddr(uint32_t chromaId, uint32_t ctuAddr) { return m_picOrg[chromaId] + m_cuOffsetC[ctuAddr]; } + pixel* getPlaneAddr(uint32_t plane, uint32_t ctuAddr) { return m_picOrg[plane] + (plane ? 
m_cuOffsetC[ctuAddr] : m_cuOffsetY[ctuAddr]); } + + /* get pointer to CU start address */ + pixel* getLumaAddr(uint32_t ctuAddr, uint32_t absPartIdx) { return m_picOrg[0] + m_cuOffsetY[ctuAddr] + m_buOffsetY[absPartIdx]; } + pixel* getCbAddr(uint32_t ctuAddr, uint32_t absPartIdx) { return m_picOrg[1] + m_cuOffsetC[ctuAddr] + m_buOffsetC[absPartIdx]; } + pixel* getCrAddr(uint32_t ctuAddr, uint32_t absPartIdx) { return m_picOrg[2] + m_cuOffsetC[ctuAddr] + m_buOffsetC[absPartIdx]; } + pixel* getChromaAddr(uint32_t chromaId, uint32_t ctuAddr, uint32_t absPartIdx) { return m_picOrg[chromaId] + m_cuOffsetC[ctuAddr] + m_buOffsetC[absPartIdx]; } +}; + +void updateChecksum(const pixel* plane, uint32_t& checksumVal, uint32_t height, uint32_t width, intptr_t stride, int row, uint32_t cuHeight); +void updateCRC(const pixel* plane, uint32_t& crcVal, uint32_t height, uint32_t width, intptr_t stride); +void crcFinish(uint32_t & crc, uint8_t digest[16]); +void checksumFinish(uint32_t checksum, uint8_t digest[16]); +void updateMD5Plane(MD5Context& md5, const pixel* plane, uint32_t width, uint32_t height, intptr_t stride); +} + +#endif // ifndef X265_PICYUV_H diff --git a/source/common/pixel.cpp b/source/common/pixel.cpp new file mode 100644 index 0000000..3e0530d --- /dev/null +++ b/source/common/pixel.cpp @@ -0,0 +1,1387 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * Mandar Gurav + * Mahesh Pittala + * Min Chen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#include "common.h" +#include "primitives.h" +#include "x265.h" + +#include // abs() + +using namespace x265; + +#define SET_FUNC_PRIMITIVE_TABLE_C(FUNC_PREFIX, FUNC_PREFIX_DEF, FUNC_TYPE_CAST, DATA_TYPE1, DATA_TYPE2) \ + p.FUNC_PREFIX[LUMA_4x4] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<4, 4, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_8x8] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8, 8, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_8x4] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8, 4, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_4x8] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<4, 8, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_16x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 16, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_16x8] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 8, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_8x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8, 16, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_16x12] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 12, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_12x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<12, 16, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_16x4] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 4, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_4x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<4, 16, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_32x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 32, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_32x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 16, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_16x32] = 
(FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 32, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_32x24] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 24, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_24x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<24, 32, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_32x8] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 8, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_8x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8, 32, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_64x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 64, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_64x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 32, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_32x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 64, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_64x48] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 48, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_48x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<48, 64, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_64x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 16, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_16x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 64, DATA_TYPE1, DATA_TYPE2>; + +#define SET_FUNC_PRIMITIVE_TABLE_C2(FUNC_PREFIX) \ + p.FUNC_PREFIX[LUMA_4x4] = FUNC_PREFIX<4, 4>; \ + p.FUNC_PREFIX[LUMA_8x8] = FUNC_PREFIX<8, 8>; \ + p.FUNC_PREFIX[LUMA_8x4] = FUNC_PREFIX<8, 4>; \ + p.FUNC_PREFIX[LUMA_4x8] = FUNC_PREFIX<4, 8>; \ + p.FUNC_PREFIX[LUMA_16x16] = FUNC_PREFIX<16, 16>; \ + p.FUNC_PREFIX[LUMA_16x8] = FUNC_PREFIX<16, 8>; \ + p.FUNC_PREFIX[LUMA_8x16] = FUNC_PREFIX<8, 16>; \ + p.FUNC_PREFIX[LUMA_16x12] = FUNC_PREFIX<16, 12>; \ + p.FUNC_PREFIX[LUMA_12x16] = FUNC_PREFIX<12, 16>; \ + p.FUNC_PREFIX[LUMA_16x4] = FUNC_PREFIX<16, 4>; \ + p.FUNC_PREFIX[LUMA_4x16] = FUNC_PREFIX<4, 16>; \ + p.FUNC_PREFIX[LUMA_32x32] = FUNC_PREFIX<32, 32>; \ + p.FUNC_PREFIX[LUMA_32x16] = FUNC_PREFIX<32, 16>; \ + p.FUNC_PREFIX[LUMA_16x32] = FUNC_PREFIX<16, 32>; \ + p.FUNC_PREFIX[LUMA_32x24] = FUNC_PREFIX<32, 24>; \ + p.FUNC_PREFIX[LUMA_24x32] = FUNC_PREFIX<24, 
32>; \ + p.FUNC_PREFIX[LUMA_32x8] = FUNC_PREFIX<32, 8>; \ + p.FUNC_PREFIX[LUMA_8x32] = FUNC_PREFIX<8, 32>; \ + p.FUNC_PREFIX[LUMA_64x64] = FUNC_PREFIX<64, 64>; \ + p.FUNC_PREFIX[LUMA_64x32] = FUNC_PREFIX<64, 32>; \ + p.FUNC_PREFIX[LUMA_32x64] = FUNC_PREFIX<32, 64>; \ + p.FUNC_PREFIX[LUMA_64x48] = FUNC_PREFIX<64, 48>; \ + p.FUNC_PREFIX[LUMA_48x64] = FUNC_PREFIX<48, 64>; \ + p.FUNC_PREFIX[LUMA_64x16] = FUNC_PREFIX<64, 16>; \ + p.FUNC_PREFIX[LUMA_16x64] = FUNC_PREFIX<16, 64>; + +namespace { +// place functions in anonymous namespace (file static) + +template +int sad(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2) +{ + int sum = 0; + + for (int y = 0; y < ly; y++) + { + for (int x = 0; x < lx; x++) + { + sum += abs(pix1[x] - pix2[x]); + } + + pix1 += stride_pix1; + pix2 += stride_pix2; + } + + return sum; +} + +template +int sad(int16_t *pix1, intptr_t stride_pix1, int16_t *pix2, intptr_t stride_pix2) +{ + int sum = 0; + + for (int y = 0; y < ly; y++) + { + for (int x = 0; x < lx; x++) + { + sum += abs(pix1[x] - pix2[x]); + } + + pix1 += stride_pix1; + pix2 += stride_pix2; + } + + return sum; +} + +template +void sad_x3(pixel *pix1, pixel *pix2, pixel *pix3, pixel *pix4, intptr_t frefstride, int32_t *res) +{ + res[0] = 0; + res[1] = 0; + res[2] = 0; + for (int y = 0; y < ly; y++) + { + for (int x = 0; x < lx; x++) + { + res[0] += abs(pix1[x] - pix2[x]); + res[1] += abs(pix1[x] - pix3[x]); + res[2] += abs(pix1[x] - pix4[x]); + } + + pix1 += FENC_STRIDE; + pix2 += frefstride; + pix3 += frefstride; + pix4 += frefstride; + } +} + +template +void sad_x4(pixel *pix1, pixel *pix2, pixel *pix3, pixel *pix4, pixel *pix5, intptr_t frefstride, int32_t *res) +{ + res[0] = 0; + res[1] = 0; + res[2] = 0; + res[3] = 0; + for (int y = 0; y < ly; y++) + { + for (int x = 0; x < lx; x++) + { + res[0] += abs(pix1[x] - pix2[x]); + res[1] += abs(pix1[x] - pix3[x]); + res[2] += abs(pix1[x] - pix4[x]); + res[3] += abs(pix1[x] - pix5[x]); + } + + pix1 += FENC_STRIDE; + 
pix2 += frefstride; + pix3 += frefstride; + pix4 += frefstride; + pix5 += frefstride; + } +} + +template +int sse(T1 *pix1, intptr_t stride_pix1, T2 *pix2, intptr_t stride_pix2) +{ + int sum = 0; + int iTemp; + + for (int y = 0; y < ly; y++) + { + for (int x = 0; x < lx; x++) + { + iTemp = pix1[x] - pix2[x]; + sum += (iTemp * iTemp); + } + + pix1 += stride_pix1; + pix2 += stride_pix2; + } + + return sum; +} + +#define BITS_PER_SUM (8 * sizeof(sum_t)) + +#define HADAMARD4(d0, d1, d2, d3, s0, s1, s2, s3) { \ + sum2_t t0 = s0 + s1; \ + sum2_t t1 = s0 - s1; \ + sum2_t t2 = s2 + s3; \ + sum2_t t3 = s2 - s3; \ + d0 = t0 + t2; \ + d2 = t0 - t2; \ + d1 = t1 + t3; \ + d3 = t1 - t3; \ +} + +// in: a pseudo-simd number of the form x+(y<<16) +// return: abs(x)+(abs(y)<<16) +inline sum2_t abs2(sum2_t a) +{ + sum2_t s = ((a >> (BITS_PER_SUM - 1)) & (((sum2_t)1 << BITS_PER_SUM) + 1)) * ((sum_t)-1); + + return (a + s) ^ s; +} + +int satd_4x4(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2) +{ + sum2_t tmp[4][2]; + sum2_t a0, a1, a2, a3, b0, b1; + sum2_t sum = 0; + + for (int i = 0; i < 4; i++, pix1 += stride_pix1, pix2 += stride_pix2) + { + a0 = pix1[0] - pix2[0]; + a1 = pix1[1] - pix2[1]; + b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM); + a2 = pix1[2] - pix2[2]; + a3 = pix1[3] - pix2[3]; + b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM); + tmp[i][0] = b0 + b1; + tmp[i][1] = b0 - b1; + } + + for (int i = 0; i < 2; i++) + { + HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]); + a0 = abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3); + sum += ((sum_t)a0) + (a0 >> BITS_PER_SUM); + } + + return (int)(sum >> 1); +} + +int satd_4x4(int16_t *pix1, intptr_t stride_pix1, int16_t *pix2, intptr_t stride_pix2) +{ + ssum2_t tmp[4][2]; + ssum2_t a0, a1, a2, a3, b0, b1; + ssum2_t sum = 0; + + for (int i = 0; i < 4; i++, pix1 += stride_pix1, pix2 += stride_pix2) + { + a0 = pix1[0] - pix2[0]; + a1 = pix1[1] - pix2[1]; + b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM); + 
a2 = pix1[2] - pix2[2]; + a3 = pix1[3] - pix2[3]; + b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM); + tmp[i][0] = b0 + b1; + tmp[i][1] = b0 - b1; + } + + for (int i = 0; i < 2; i++) + { + HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]); + a0 = abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3); + sum += ((sum_t)a0) + (a0 >> BITS_PER_SUM); + } + + return (int)(sum >> 1); +} + +// x264's SWAR version of satd 8x4, performs two 4x4 SATDs at once +int satd_8x4(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2) +{ + sum2_t tmp[4][4]; + sum2_t a0, a1, a2, a3; + sum2_t sum = 0; + + for (int i = 0; i < 4; i++, pix1 += stride_pix1, pix2 += stride_pix2) + { + a0 = (pix1[0] - pix2[0]) + ((sum2_t)(pix1[4] - pix2[4]) << BITS_PER_SUM); + a1 = (pix1[1] - pix2[1]) + ((sum2_t)(pix1[5] - pix2[5]) << BITS_PER_SUM); + a2 = (pix1[2] - pix2[2]) + ((sum2_t)(pix1[6] - pix2[6]) << BITS_PER_SUM); + a3 = (pix1[3] - pix2[3]) + ((sum2_t)(pix1[7] - pix2[7]) << BITS_PER_SUM); + HADAMARD4(tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0, a1, a2, a3); + } + + for (int i = 0; i < 4; i++) + { + HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]); + sum += abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3); + } + + return (((sum_t)sum) + (sum >> BITS_PER_SUM)) >> 1; +} + +template +// calculate satd in blocks of 4x4 +int satd4(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2) +{ + int satd = 0; + + for (int row = 0; row < h; row += 4) + { + for (int col = 0; col < w; col += 4) + { + satd += satd_4x4(pix1 + row * stride_pix1 + col, stride_pix1, + pix2 + row * stride_pix2 + col, stride_pix2); + } + } + + return satd; +} + +template +// calculate satd in blocks of 8x4 +int satd8(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2) +{ + int satd = 0; + + for (int row = 0; row < h; row += 4) + { + for (int col = 0; col < w; col += 8) + { + satd += satd_8x4(pix1 + row * stride_pix1 + col, stride_pix1, + pix2 + row * stride_pix2 + col, 
stride_pix2); + } + } + + return satd; +} + +inline int _sa8d_8x8(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2) +{ + sum2_t tmp[8][4]; + sum2_t a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3; + sum2_t sum = 0; + + for (int i = 0; i < 8; i++, pix1 += i_pix1, pix2 += i_pix2) + { + a0 = pix1[0] - pix2[0]; + a1 = pix1[1] - pix2[1]; + b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM); + a2 = pix1[2] - pix2[2]; + a3 = pix1[3] - pix2[3]; + b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM); + a4 = pix1[4] - pix2[4]; + a5 = pix1[5] - pix2[5]; + b2 = (a4 + a5) + ((a4 - a5) << BITS_PER_SUM); + a6 = pix1[6] - pix2[6]; + a7 = pix1[7] - pix2[7]; + b3 = (a6 + a7) + ((a6 - a7) << BITS_PER_SUM); + HADAMARD4(tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], b0, b1, b2, b3); + } + + for (int i = 0; i < 4; i++) + { + HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]); + HADAMARD4(a4, a5, a6, a7, tmp[4][i], tmp[5][i], tmp[6][i], tmp[7][i]); + b0 = abs2(a0 + a4) + abs2(a0 - a4); + b0 += abs2(a1 + a5) + abs2(a1 - a5); + b0 += abs2(a2 + a6) + abs2(a2 - a6); + b0 += abs2(a3 + a7) + abs2(a3 - a7); + sum += (sum_t)b0 + (b0 >> BITS_PER_SUM); + } + + return (int)sum; +} + +int sa8d_8x8(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2) +{ + return (int)((_sa8d_8x8(pix1, i_pix1, pix2, i_pix2) + 2) >> 2); +} + +inline int _sa8d_8x8(int16_t *pix1, intptr_t i_pix1, int16_t *pix2, intptr_t i_pix2) +{ + ssum2_t tmp[8][4]; + ssum2_t a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3; + ssum2_t sum = 0; + + for (int i = 0; i < 8; i++, pix1 += i_pix1, pix2 += i_pix2) + { + a0 = pix1[0] - pix2[0]; + a1 = pix1[1] - pix2[1]; + b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM); + a2 = pix1[2] - pix2[2]; + a3 = pix1[3] - pix2[3]; + b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM); + a4 = pix1[4] - pix2[4]; + a5 = pix1[5] - pix2[5]; + b2 = (a4 + a5) + ((a4 - a5) << BITS_PER_SUM); + a6 = pix1[6] - pix2[6]; + a7 = pix1[7] - pix2[7]; + b3 = (a6 + a7) + ((a6 - a7) << BITS_PER_SUM); + 
HADAMARD4(tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], b0, b1, b2, b3); + } + + for (int i = 0; i < 4; i++) + { + HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]); + HADAMARD4(a4, a5, a6, a7, tmp[4][i], tmp[5][i], tmp[6][i], tmp[7][i]); + b0 = abs2(a0 + a4) + abs2(a0 - a4); + b0 += abs2(a1 + a5) + abs2(a1 - a5); + b0 += abs2(a2 + a6) + abs2(a2 - a6); + b0 += abs2(a3 + a7) + abs2(a3 - a7); + sum += (sum_t)b0 + (b0 >> BITS_PER_SUM); + } + + return (int)sum; +} + +int sa8d_8x8(int16_t *pix1, intptr_t i_pix1, int16_t *pix2, intptr_t i_pix2) +{ + return (int)((_sa8d_8x8(pix1, i_pix1, pix2, i_pix2) + 2) >> 2); +} + +int sa8d_16x16(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2) +{ + int sum = _sa8d_8x8(pix1, i_pix1, pix2, i_pix2) + + _sa8d_8x8(pix1 + 8, i_pix1, pix2 + 8, i_pix2) + + _sa8d_8x8(pix1 + 8 * i_pix1, i_pix1, pix2 + 8 * i_pix2, i_pix2) + + _sa8d_8x8(pix1 + 8 + 8 * i_pix1, i_pix1, pix2 + 8 + 8 * i_pix2, i_pix2); + + // This matches x264 sa8d_16x16, but is slightly different from HM's behavior because + // this version only rounds once at the end + return (sum + 2) >> 2; +} + +template +// Calculate sa8d in blocks of 8x8 +int sa8d8(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2) +{ + int cost = 0; + + for (int y = 0; y < h; y += 8) + { + for (int x = 0; x < w; x += 8) + { + cost += sa8d_8x8(pix1 + i_pix1 * y + x, i_pix1, pix2 + i_pix2 * y + x, i_pix2); + } + } + + return cost; +} + +template +// Calculate sa8d in blocks of 16x16 +int sa8d16(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2) +{ + int cost = 0; + + for (int y = 0; y < h; y += 16) + { + for (int x = 0; x < w; x += 16) + { + cost += sa8d_16x16(pix1 + i_pix1 * y + x, i_pix1, pix2 + i_pix2 * y + x, i_pix2); + } + } + + return cost; +} + +template +int pixel_ssd_s_c(short *a, intptr_t dstride) +{ + int sum = 0; + for (int y = 0; y < size; y++) + { + for (int x = 0; x < size; x++) + { + sum += a[x] * a[x]; + } + a += dstride; + } + return sum; +} + 
+template +void blockfil_s_c(int16_t *dst, intptr_t dstride, int16_t val) +{ + for (int y = 0; y < size; y++) + { + for (int x = 0; x < size; x++) + { + dst[y * dstride + x] = val; + } + } +} + +void convert16to32_shl(int32_t *dst, int16_t *src, intptr_t stride, int shift, int size) +{ + for (int i = 0; i < size; i++) + { + for (int j = 0; j < size; j++) + { + dst[i * size + j] = ((int)src[i * stride + j]) << shift; + } + } +} + +template +void convert16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset) +{ + for (int i = 0; i < size; i++) + { + for (int j = 0; j < size; j++) + { + dst[i * size + j] = ((int)src[i * stride + j] + offset) >> shift; + } + } +} + +void convert32to16_shr(int16_t *dst, int32_t *src, intptr_t stride, int shift, int size) +{ + int round = 1 << (shift - 1); + + for (int i = 0; i < size; i++) + { + for (int j = 0; j < size; j++) + { + dst[j] = (int16_t)((src[j] + round) >> shift); + } + + src += size; + dst += stride; + } +} + +void copy_shr(int16_t *dst, int16_t *src, intptr_t stride, int shift, int size) +{ + int round = 1 << (shift - 1); + + for (int i = 0; i < size; i++) + { + for (int j = 0; j < size; j++) + { + dst[j] = (int16_t)((src[j] + round) >> shift); + } + + src += size; + dst += stride; + } +} + +template +void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift) +{ + for (int i = 0; i < size; i++) + { + for (int j = 0; j < size; j++) + { + dst[j] = ((int16_t)src[j] << shift); + } + + src += size; + dst += stride; + } +} + +template +void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift) +{ + for (int i = 0; i < size; i++) + { + for (int j = 0; j < size; j++) + { + dst[j] = (src[j] << shift); + } + + src += size; + dst += stride; + } +} + +template +void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride) +{ + for (int y = 0; y < blockSize; y++) + { + for (int x = 0; x < blockSize; x++) + { + residual[x] = static_cast(fenc[x]) - 
static_cast(pred[x]); + } + + fenc += stride; + residual += stride; + pred += stride; + } +} + +template +void transpose(pixel* dst, pixel* src, intptr_t stride) +{ + for (int k = 0; k < blockSize; k++) + { + for (int l = 0; l < blockSize; l++) + { + dst[k * blockSize + l] = src[l * stride + k]; + } + } +} + +void weight_sp_c(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset) +{ + int x, y; + + for (y = 0; y <= height - 1; y++) + { + for (x = 0; x <= width - 1; ) + { + // note: width can be odd + dst[x] = (pixel)Clip3(0, ((1 << X265_DEPTH) - 1), ((w0 * (src[x] + IF_INTERNAL_OFFS) + round) >> shift) + offset); + x++; + } + + src += srcStride; + dst += dstStride; + } +} + +void weight_pp_c(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset) +{ + int x, y; + + X265_CHECK(!(width & 15), "weightp alignment error\n"); + X265_CHECK(!((w0 << 6) > 32767), "w0 using more than 16 bits, asm output will mismatch\n"); + X265_CHECK(!(round > 32767), "round using more than 16 bits, asm output will mismatch\n"); + + for (y = 0; y <= height - 1; y++) + { + for (x = 0; x <= width - 1; ) + { + // simulating pixel to short conversion + int16_t val = src[x] << (IF_INTERNAL_PREC - X265_DEPTH); + dst[x] = (pixel)Clip3(0, ((1 << X265_DEPTH) - 1), ((w0 * (val) + round) >> shift) + offset); + x++; + } + + src += stride; + dst += stride; + } +} + +template +void pixelavg_pp(pixel* dst, intptr_t dstride, pixel* src0, intptr_t sstride0, pixel* src1, intptr_t sstride1, int) +{ + for (int y = 0; y < ly; y++) + { + for (int x = 0; x < lx; x++) + { + dst[x] = (src0[x] + src1[x] + 1) >> 1; + } + + src0 += sstride0; + src1 += sstride1; + dst += dstride; + } +} + +void scale1D_128to64(pixel *dst, pixel *src, intptr_t /*stride*/) +{ + int x; + + for (x = 0; x < 128; x += 2) + { + pixel pix0 = src[(x + 0)]; + pixel pix1 = src[(x + 1)]; + int sum = pix0 + pix1; + + dst[x 
>> 1] = (pixel)((sum + 1) >> 1); + } +} + +void scale2D_64to32(pixel *dst, pixel *src, intptr_t stride) +{ + int x, y; + + for (y = 0; y < 64; y += 2) + { + for (x = 0; x < 64; x += 2) + { + pixel pix0 = src[(y + 0) * stride + (x + 0)]; + pixel pix1 = src[(y + 0) * stride + (x + 1)]; + pixel pix2 = src[(y + 1) * stride + (x + 0)]; + pixel pix3 = src[(y + 1) * stride + (x + 1)]; + int sum = pix0 + pix1 + pix2 + pix3; + + dst[y / 2 * 32 + x / 2] = (pixel)((sum + 2) >> 2); + } + } +} + +void frame_init_lowres_core(pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc, + intptr_t src_stride, intptr_t dst_stride, int width, int height) +{ + for (int y = 0; y < height; y++) + { + pixel *src1 = src0 + src_stride; + pixel *src2 = src1 + src_stride; + for (int x = 0; x < width; x++) + { + // slower than naive bilinear, but matches asm +#define FILTER(a, b, c, d) ((((a + b + 1) >> 1) + ((c + d + 1) >> 1) + 1) >> 1) + dst0[x] = FILTER(src0[2 * x], src1[2 * x], src0[2 * x + 1], src1[2 * x + 1]); + dsth[x] = FILTER(src0[2 * x + 1], src1[2 * x + 1], src0[2 * x + 2], src1[2 * x + 2]); + dstv[x] = FILTER(src1[2 * x], src2[2 * x], src1[2 * x + 1], src2[2 * x + 1]); + dstc[x] = FILTER(src1[2 * x + 1], src2[2 * x + 1], src1[2 * x + 2], src2[2 * x + 2]); +#undef FILTER + } + src0 += src_stride * 2; + dst0 += dst_stride; + dsth += dst_stride; + dstv += dst_stride; + dstc += dst_stride; + } +} + +/* structural similarity metric */ +void ssim_4x4x2_core(const pixel *pix1, intptr_t stride1, const pixel *pix2, intptr_t stride2, int sums[2][4]) +{ + for (int z = 0; z < 2; z++) + { + uint32_t s1 = 0, s2 = 0, ss = 0, s12 = 0; + for (int y = 0; y < 4; y++) + { + for (int x = 0; x < 4; x++) + { + int a = pix1[x + y * stride1]; + int b = pix2[x + y * stride2]; + s1 += a; + s2 += b; + ss += a * a; + ss += b * b; + s12 += a * b; + } + } + + sums[z][0] = s1; + sums[z][1] = s2; + sums[z][2] = ss; + sums[z][3] = s12; + pix1 += 4; + pix2 += 4; + } +} + +float ssim_end_1(int s1, int s2, int 
ss, int s12) +{ +/* Maximum value for 10-bit is: ss*64 = (2^10-1)^2*16*4*64 = 4286582784, which will overflow in some cases. + * s1*s1, s2*s2, and s1*s2 also obtain this value for edge cases: ((2^10-1)*16*4)^2 = 4286582784. + * Maximum value for 9-bit is: ss*64 = (2^9-1)^2*16*4*64 = 1069551616, which will not overflow. */ + +#define PIXEL_MAX ((1 << X265_DEPTH) - 1) +#if HIGH_BIT_DEPTH + X265_CHECK(X265_DEPTH == 10, "ssim invalid depth\n"); +#define type float + static const float ssim_c1 = (float)(.01 * .01 * PIXEL_MAX * PIXEL_MAX * 64); + static const float ssim_c2 = (float)(.03 * .03 * PIXEL_MAX * PIXEL_MAX * 64 * 63); +#else + X265_CHECK(X265_DEPTH == 8, "ssim invalid depth\n"); +#define type int + static const int ssim_c1 = (int)(.01 * .01 * PIXEL_MAX * PIXEL_MAX * 64 + .5); + static const int ssim_c2 = (int)(.03 * .03 * PIXEL_MAX * PIXEL_MAX * 64 * 63 + .5); +#endif + type fs1 = (type)s1; + type fs2 = (type)s2; + type fss = (type)ss; + type fs12 = (type)s12; + type vars = (type)(fss * 64 - fs1 * fs1 - fs2 * fs2); + type covar = (type)(fs12 * 64 - fs1 * fs2); + return (float)(2 * fs1 * fs2 + ssim_c1) * (float)(2 * covar + ssim_c2) + / ((float)(fs1 * fs1 + fs2 * fs2 + ssim_c1) * (float)(vars + ssim_c2)); +#undef type +#undef PIXEL_MAX +} + +float ssim_end_4(int sum0[5][4], int sum1[5][4], int width) +{ + float ssim = 0.0; + + for (int i = 0; i < width; i++) + { + ssim += ssim_end_1(sum0[i][0] + sum0[i + 1][0] + sum1[i][0] + sum1[i + 1][0], + sum0[i][1] + sum0[i + 1][1] + sum1[i][1] + sum1[i + 1][1], + sum0[i][2] + sum0[i + 1][2] + sum1[i][2] + sum1[i + 1][2], + sum0[i][3] + sum0[i + 1][3] + sum1[i][3] + sum1[i + 1][3]); + } + + return ssim; +} + +template +uint64_t pixel_var(pixel *pix, intptr_t i_stride) +{ + uint32_t sum = 0, sqr = 0; + + for (int y = 0; y < size; y++) + { + for (int x = 0; x < size; x++) + { + sum += pix[x]; + sqr += pix[x] * pix[x]; + } + + pix += i_stride; + } + + return sum + ((uint64_t)sqr << 32); +} + +#if defined(_MSC_VER) +#pragma 
warning(disable: 4127) // conditional expression is constant +#endif + +template +int psyCost_pp(pixel *source, intptr_t sstride, pixel *recon, intptr_t rstride) +{ + static pixel zeroBuf[8] /* = { 0 } */; + + if (size) + { + int dim = 1 << (size + 2); + uint32_t totEnergy = 0; + for (int i = 0; i < dim; i += 8) + { + for (int j = 0; j < dim; j+= 8) + { + /* AC energy, measured by sa8d (AC + DC) minus SAD (DC) */ + int sourceEnergy = sa8d_8x8(source + i * sstride + j, sstride, zeroBuf, 0) - + (sad<8, 8>(source + i * sstride + j, sstride, zeroBuf, 0) >> 2); + int reconEnergy = sa8d_8x8(recon + i * rstride + j, rstride, zeroBuf, 0) - + (sad<8, 8>(recon + i * rstride + j, rstride, zeroBuf, 0) >> 2); + + totEnergy += abs(sourceEnergy - reconEnergy); + } + } + return totEnergy; + } + else + { + /* 4x4 is too small for sa8d */ + int sourceEnergy = satd_4x4(source, sstride, zeroBuf, 0) - (sad<4, 4>(source, sstride, zeroBuf, 0) >> 2); + int reconEnergy = satd_4x4(recon, rstride, zeroBuf, 0) - (sad<4, 4>(recon, rstride, zeroBuf, 0) >> 2); + return abs(sourceEnergy - reconEnergy); + } +} + +template +int psyCost_ss(int16_t *source, intptr_t sstride, int16_t *recon, intptr_t rstride) +{ + static int16_t zeroBuf[8] /* = { 0 } */; + + if (size) + { + int dim = 1 << (size + 2); + uint32_t totEnergy = 0; + for (int i = 0; i < dim; i += 8) + { + for (int j = 0; j < dim; j+= 8) + { + /* AC energy, measured by sa8d (AC + DC) minus SAD (DC) */ + int sourceEnergy = sa8d_8x8(source + i * sstride + j, sstride, zeroBuf, 0) - + (sad<8, 8>(source + i * sstride + j, sstride, zeroBuf, 0) >> 2); + int reconEnergy = sa8d_8x8(recon + i * rstride + j, rstride, zeroBuf, 0) - + (sad<8, 8>(recon + i * rstride + j, rstride, zeroBuf, 0) >> 2); + + totEnergy += abs(sourceEnergy - reconEnergy); + } + } + return totEnergy; + } + else + { + /* 4x4 is too small for sa8d */ + int sourceEnergy = satd_4x4(source, sstride, zeroBuf, 0) - (sad<4, 4>(source, sstride, zeroBuf, 0) >> 2); + int reconEnergy = 
satd_4x4(recon, rstride, zeroBuf, 0) - (sad<4, 4>(recon, rstride, zeroBuf, 0) >> 2); + return abs(sourceEnergy - reconEnergy); + } +} + +void plane_copy_deinterleave_chroma(pixel *dstu, intptr_t dstuStride, pixel *dstv, intptr_t dstvStride, + pixel *src, intptr_t srcStride, int w, int h) +{ + for (int y = 0; y < h; y++, dstu += dstuStride, dstv += dstvStride, src += srcStride) + { + for (int x = 0; x < w; x++) + { + dstu[x] = src[2 * x]; + dstv[x] = src[2 * x + 1]; + } + } +} + +template +void blockcopy_pp_c(pixel *a, intptr_t stridea, pixel *b, intptr_t strideb) +{ + for (int y = 0; y < by; y++) + { + for (int x = 0; x < bx; x++) + { + a[x] = b[x]; + } + + a += stridea; + b += strideb; + } +} + +template +void blockcopy_ss_c(int16_t *a, intptr_t stridea, int16_t *b, intptr_t strideb) +{ + for (int y = 0; y < by; y++) + { + for (int x = 0; x < bx; x++) + { + a[x] = b[x]; + } + + a += stridea; + b += strideb; + } +} + +template +void blockcopy_sp_c(pixel *a, intptr_t stridea, int16_t *b, intptr_t strideb) +{ + for (int y = 0; y < by; y++) + { + for (int x = 0; x < bx; x++) + { + X265_CHECK((b[x] >= 0) && (b[x] <= ((1 << X265_DEPTH) - 1)), "blockcopy pixel size fail\n"); + a[x] = (pixel)b[x]; + } + + a += stridea; + b += strideb; + } +} + +template +void blockcopy_ps_c(int16_t *a, intptr_t stridea, pixel *b, intptr_t strideb) +{ + for (int y = 0; y < by; y++) + { + for (int x = 0; x < bx; x++) + { + a[x] = (int16_t)b[x]; + } + + a += stridea; + b += strideb; + } +} + +template +void pixel_sub_ps_c(int16_t *a, intptr_t dstride, pixel *b0, pixel *b1, intptr_t sstride0, intptr_t sstride1) +{ + for (int y = 0; y < by; y++) + { + for (int x = 0; x < bx; x++) + { + a[x] = (int16_t)(b0[x] - b1[x]); + } + + b0 += sstride0; + b1 += sstride1; + a += dstride; + } +} + +template +void pixel_add_ps_c(pixel *a, intptr_t dstride, pixel *b0, int16_t *b1, intptr_t sstride0, intptr_t sstride1) +{ + for (int y = 0; y < by; y++) + { + for (int x = 0; x < bx; x++) + { + a[x] = Clip(b0[x] 
+ b1[x]); + } + + b0 += sstride0; + b1 += sstride1; + a += dstride; + } +} + +template +void addAvg(int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride) +{ + int shiftNum, offset; + + shiftNum = IF_INTERNAL_PREC + 1 - X265_DEPTH; + offset = (1 << (shiftNum - 1)) + 2 * IF_INTERNAL_OFFS; + + for (int y = 0; y < by; y++) + { + for (int x = 0; x < bx; x += 2) + { + dst[x + 0] = Clip((src0[x + 0] + src1[x + 0] + offset) >> shiftNum); + dst[x + 1] = Clip((src0[x + 1] + src1[x + 1] + offset) >> shiftNum); + } + + src0 += src0Stride; + src1 += src1Stride; + dst += dstStride; + } +} + +void planecopy_cp_c(uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift) +{ + for (int r = 0; r < height; r++) + { + for (int c = 0; c < width; c++) + { + dst[c] = ((pixel)src[c]) << shift; + } + + dst += dstStride; + src += srcStride; + } +} + +void planecopy_sp_c(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask) +{ + for (int r = 0; r < height; r++) + { + for (int c = 0; c < width; c++) + { + dst[c] = (pixel)((src[c] >> shift) & mask); + } + + dst += dstStride; + src += srcStride; + } +} + +/* Estimate the total amount of influence on future quality that could be had if we + * were to improve the reference samples used to inter predict any given CU. 
*/ +void estimateCUPropagateCost(int *dst, uint16_t *propagateIn, int32_t *intraCosts, uint16_t *interCosts, + int32_t *invQscales, double *fpsFactor, int len) +{ + double fps = *fpsFactor / 256; + + for (int i = 0; i < len; i++) + { + double intraCost = intraCosts[i] * invQscales[i]; + double propagateAmount = (double)propagateIn[i] + intraCost * fps; + double propagateNum = (double)intraCosts[i] - (interCosts[i] & ((1 << 14) - 1)); + double propagateDenom = (double)intraCosts[i]; + dst[i] = (int)(propagateAmount * propagateNum / propagateDenom + 0.5); + } +} +} // end anonymous namespace + +namespace x265 { +// x265 private namespace + +/* Extend the edges of a picture so that it may safely be used for motion + * compensation. This function assumes the picture is stored in a buffer with + * sufficient padding for the X and Y margins */ +void extendPicBorder(pixel* pic, intptr_t stride, int width, int height, int marginX, int marginY) +{ + /* extend left and right margins */ + primitives.extendRowBorder(pic, stride, width, height, marginX); + + /* copy top row to create above margin */ + pixel *top = pic - marginX; + for (int y = 0; y < marginY; y++) + memcpy(top - (y + 1) * stride, top, stride * sizeof(pixel)); + + /* copy bottom row to create below margin */ + pixel *bot = pic - marginX + (height - 1) * stride; + for (int y = 0; y < marginY; y++) + memcpy(bot + (y + 1) * stride, bot, stride * sizeof(pixel)); +} + +/* Initialize entries for pixel functions defined in this file */ +void Setup_C_PixelPrimitives(EncoderPrimitives &p) +{ + SET_FUNC_PRIMITIVE_TABLE_C2(sad) + SET_FUNC_PRIMITIVE_TABLE_C2(sad_x3) + SET_FUNC_PRIMITIVE_TABLE_C2(sad_x4) + SET_FUNC_PRIMITIVE_TABLE_C2(pixelavg_pp) + + // satd + p.satd[LUMA_4x4] = satd_4x4; + p.satd[LUMA_8x8] = satd8<8, 8>; + p.satd[LUMA_8x4] = satd_8x4; + p.satd[LUMA_4x8] = satd4<4, 8>; + p.satd[LUMA_16x16] = satd8<16, 16>; + p.satd[LUMA_16x8] = satd8<16, 8>; + p.satd[LUMA_8x16] = satd8<8, 16>; + p.satd[LUMA_16x12] = 
satd8<16, 12>; + p.satd[LUMA_12x16] = satd4<12, 16>; + p.satd[LUMA_16x4] = satd8<16, 4>; + p.satd[LUMA_4x16] = satd4<4, 16>; + p.satd[LUMA_32x32] = satd8<32, 32>; + p.satd[LUMA_32x16] = satd8<32, 16>; + p.satd[LUMA_16x32] = satd8<16, 32>; + p.satd[LUMA_32x24] = satd8<32, 24>; + p.satd[LUMA_24x32] = satd8<24, 32>; + p.satd[LUMA_32x8] = satd8<32, 8>; + p.satd[LUMA_8x32] = satd8<8, 32>; + p.satd[LUMA_64x64] = satd8<64, 64>; + p.satd[LUMA_64x32] = satd8<64, 32>; + p.satd[LUMA_32x64] = satd8<32, 64>; + p.satd[LUMA_64x48] = satd8<64, 48>; + p.satd[LUMA_48x64] = satd8<48, 64>; + p.satd[LUMA_64x16] = satd8<64, 16>; + p.satd[LUMA_16x64] = satd8<16, 64>; + +#define CHROMA_420(W, H) \ + p.chroma[X265_CSP_I420].addAvg[CHROMA_ ## W ## x ## H] = addAvg; \ + p.chroma[X265_CSP_I420].copy_pp[CHROMA_ ## W ## x ## H] = blockcopy_pp_c; \ + p.chroma[X265_CSP_I420].copy_sp[CHROMA_ ## W ## x ## H] = blockcopy_sp_c; \ + p.chroma[X265_CSP_I420].copy_ps[CHROMA_ ## W ## x ## H] = blockcopy_ps_c; \ + p.chroma[X265_CSP_I420].copy_ss[CHROMA_ ## W ## x ## H] = blockcopy_ss_c; + +#define CHROMA_422(W, H) \ + p.chroma[X265_CSP_I422].addAvg[CHROMA422_ ## W ## x ## H] = addAvg; \ + p.chroma[X265_CSP_I422].copy_pp[CHROMA422_ ## W ## x ## H] = blockcopy_pp_c; \ + p.chroma[X265_CSP_I422].copy_sp[CHROMA422_ ## W ## x ## H] = blockcopy_sp_c; \ + p.chroma[X265_CSP_I422].copy_ps[CHROMA422_ ## W ## x ## H] = blockcopy_ps_c; \ + p.chroma[X265_CSP_I422].copy_ss[CHROMA422_ ## W ## x ## H] = blockcopy_ss_c; + +#define CHROMA_444(W, H) \ + p.chroma[X265_CSP_I444].addAvg[LUMA_ ## W ## x ## H] = addAvg; \ + p.chroma[X265_CSP_I444].copy_pp[LUMA_ ## W ## x ## H] = blockcopy_pp_c; \ + p.chroma[X265_CSP_I444].copy_sp[LUMA_ ## W ## x ## H] = blockcopy_sp_c; \ + p.chroma[X265_CSP_I444].copy_ps[LUMA_ ## W ## x ## H] = blockcopy_ps_c; \ + p.chroma[X265_CSP_I444].copy_ss[LUMA_ ## W ## x ## H] = blockcopy_ss_c; + +#define LUMA(W, H) \ + p.luma_addAvg[LUMA_ ## W ## x ## H] = addAvg; \ + p.luma_copy_pp[LUMA_ ## W ## x ## H] = 
blockcopy_pp_c; \ + p.luma_copy_sp[LUMA_ ## W ## x ## H] = blockcopy_sp_c; \ + p.luma_copy_ps[LUMA_ ## W ## x ## H] = blockcopy_ps_c; \ + p.luma_copy_ss[LUMA_ ## W ## x ## H] = blockcopy_ss_c; + +#define LUMA_PIXELSUB(W, H) \ + p.luma_sub_ps[LUMA_ ## W ## x ## H] = pixel_sub_ps_c; \ + p.luma_add_ps[LUMA_ ## W ## x ## H] = pixel_add_ps_c; + +#define CHROMA_PIXELSUB_420(W, H) \ + p.chroma[X265_CSP_I420].sub_ps[CHROMA_ ## W ## x ## H] = pixel_sub_ps_c; \ + p.chroma[X265_CSP_I420].add_ps[CHROMA_ ## W ## x ## H] = pixel_add_ps_c; + +#define CHROMA_PIXELSUB_422(W, H) \ + p.chroma[X265_CSP_I422].sub_ps[CHROMA422_ ## W ## x ## H] = pixel_sub_ps_c; \ + p.chroma[X265_CSP_I422].add_ps[CHROMA422_ ## W ## x ## H] = pixel_add_ps_c; + +#define CHROMA_PIXELSUB_444(W, H) \ + p.chroma[X265_CSP_I444].sub_ps[LUMA_ ## W ## x ## H] = pixel_sub_ps_c; \ + p.chroma[X265_CSP_I444].add_ps[LUMA_ ## W ## x ## H] = pixel_add_ps_c; + + + + LUMA(4, 4); + LUMA(8, 8); + CHROMA_420(4, 4); + LUMA(4, 8); + CHROMA_420(2, 4); + LUMA(8, 4); + CHROMA_420(4, 2); + LUMA(16, 16); + CHROMA_420(8, 8); + LUMA(16, 8); + CHROMA_420(8, 4); + LUMA(8, 16); + CHROMA_420(4, 8); + LUMA(16, 12); + CHROMA_420(8, 6); + LUMA(12, 16); + CHROMA_420(6, 8); + LUMA(16, 4); + CHROMA_420(8, 2); + LUMA(4, 16); + CHROMA_420(2, 8); + LUMA(32, 32); + CHROMA_420(16, 16); + LUMA(32, 16); + CHROMA_420(16, 8); + LUMA(16, 32); + CHROMA_420(8, 16); + LUMA(32, 24); + CHROMA_420(16, 12); + LUMA(24, 32); + CHROMA_420(12, 16); + LUMA(32, 8); + CHROMA_420(16, 4); + LUMA(8, 32); + CHROMA_420(4, 16); + LUMA(64, 64); + CHROMA_420(32, 32); + LUMA(64, 32); + CHROMA_420(32, 16); + LUMA(32, 64); + CHROMA_420(16, 32); + LUMA(64, 48); + CHROMA_420(32, 24); + LUMA(48, 64); + CHROMA_420(24, 32); + LUMA(64, 16); + CHROMA_420(32, 8); + LUMA(16, 64); + CHROMA_420(8, 32); + + LUMA_PIXELSUB(4, 4); + LUMA_PIXELSUB(8, 8); + LUMA_PIXELSUB(16, 16); + LUMA_PIXELSUB(32, 32); + LUMA_PIXELSUB(64, 64); + CHROMA_PIXELSUB_420(4, 4) + CHROMA_PIXELSUB_420(8, 8) + 
CHROMA_PIXELSUB_420(16, 16) + CHROMA_PIXELSUB_420(32, 32) + CHROMA_PIXELSUB_422(4, 8) + CHROMA_PIXELSUB_422(8, 16) + CHROMA_PIXELSUB_422(16, 32) + CHROMA_PIXELSUB_422(32, 64) + CHROMA_PIXELSUB_444(8, 8) + CHROMA_PIXELSUB_444(16, 16) + CHROMA_PIXELSUB_444(32, 32) + CHROMA_PIXELSUB_444(64, 64) + + CHROMA_422(4, 8); + CHROMA_422(4, 4); + CHROMA_422(2, 8); + CHROMA_422(8, 16); + CHROMA_422(8, 8); + CHROMA_422(4, 16); + CHROMA_422(8, 12); + CHROMA_422(6, 16); + CHROMA_422(8, 4); + CHROMA_422(2, 16); + CHROMA_422(16, 32); + CHROMA_422(16, 16); + CHROMA_422(8, 32); + CHROMA_422(16, 24); + CHROMA_422(12, 32); + CHROMA_422(16, 8); + CHROMA_422(4, 32); + CHROMA_422(32, 64); + CHROMA_422(32, 32); + CHROMA_422(16, 64); + CHROMA_422(32, 48); + CHROMA_422(24, 64); + CHROMA_422(32, 16); + CHROMA_422(8, 64); + + CHROMA_444(4, 4); + CHROMA_444(8, 8); + CHROMA_444(4, 8); + CHROMA_444(8, 4); + CHROMA_444(16, 16); + CHROMA_444(16, 8); + CHROMA_444(8, 16); + CHROMA_444(16, 12); + CHROMA_444(12, 16); + CHROMA_444(16, 4); + CHROMA_444(4, 16); + CHROMA_444(32, 32); + CHROMA_444(32, 16); + CHROMA_444(16, 32); + CHROMA_444(32, 24); + CHROMA_444(24, 32); + CHROMA_444(32, 8); + CHROMA_444(8, 32); + CHROMA_444(64, 64); + CHROMA_444(64, 32); + CHROMA_444(32, 64); + CHROMA_444(64, 48); + CHROMA_444(48, 64); + CHROMA_444(64, 16); + CHROMA_444(16, 64); + + SET_FUNC_PRIMITIVE_TABLE_C(sse_pp, sse, pixelcmp_t, pixel, pixel) + SET_FUNC_PRIMITIVE_TABLE_C(sse_sp, sse, pixelcmp_sp_t, int16_t, pixel) + SET_FUNC_PRIMITIVE_TABLE_C(sse_ss, sse, pixelcmp_ss_t, int16_t, int16_t) + + p.blockfill_s[BLOCK_4x4] = blockfil_s_c<4>; + p.blockfill_s[BLOCK_8x8] = blockfil_s_c<8>; + p.blockfill_s[BLOCK_16x16] = blockfil_s_c<16>; + p.blockfill_s[BLOCK_32x32] = blockfil_s_c<32>; + p.blockfill_s[BLOCK_64x64] = blockfil_s_c<64>; + + p.cvt16to32_shl = convert16to32_shl; + p.cvt16to32_shr[BLOCK_4x4] = convert16to32_shr<4>; + p.cvt16to32_shr[BLOCK_8x8] = convert16to32_shr<8>; + p.cvt16to32_shr[BLOCK_16x16] = 
convert16to32_shr<16>; + p.cvt16to32_shr[BLOCK_32x32] = convert16to32_shr<32>; + p.cvt32to16_shr = convert32to16_shr; + p.cvt32to16_shl[BLOCK_4x4] = convert32to16_shl<4>; + p.cvt32to16_shl[BLOCK_8x8] = convert32to16_shl<8>; + p.cvt32to16_shl[BLOCK_16x16] = convert32to16_shl<16>; + p.cvt32to16_shl[BLOCK_32x32] = convert32to16_shl<32>; + + p.copy_shr = copy_shr; + p.copy_shl[BLOCK_4x4] = copy_shl<4>; + p.copy_shl[BLOCK_8x8] = copy_shl<8>; + p.copy_shl[BLOCK_16x16] = copy_shl<16>; + p.copy_shl[BLOCK_32x32] = copy_shl<32>; + + p.sa8d[BLOCK_4x4] = satd_4x4; + p.sa8d[BLOCK_8x8] = sa8d_8x8; + p.sa8d[BLOCK_16x16] = sa8d_16x16; + p.sa8d[BLOCK_32x32] = sa8d16<32, 32>; + p.sa8d[BLOCK_64x64] = sa8d16<64, 64>; + + p.psy_cost_pp[BLOCK_4x4] = psyCost_pp; + p.psy_cost_pp[BLOCK_8x8] = psyCost_pp; + p.psy_cost_pp[BLOCK_16x16] = psyCost_pp; + p.psy_cost_pp[BLOCK_32x32] = psyCost_pp; + p.psy_cost_pp[BLOCK_64x64] = psyCost_pp; + + p.psy_cost_ss[BLOCK_4x4] = psyCost_ss; + p.psy_cost_ss[BLOCK_8x8] = psyCost_ss; + p.psy_cost_ss[BLOCK_16x16] = psyCost_ss; + p.psy_cost_ss[BLOCK_32x32] = psyCost_ss; + p.psy_cost_ss[BLOCK_64x64] = psyCost_ss; + + p.sa8d_inter[LUMA_4x4] = satd_4x4; + p.sa8d_inter[LUMA_8x8] = sa8d_8x8; + p.sa8d_inter[LUMA_8x4] = satd_8x4; + p.sa8d_inter[LUMA_4x8] = satd4<4, 8>; + p.sa8d_inter[LUMA_16x16] = sa8d_16x16; + p.sa8d_inter[LUMA_16x8] = sa8d8<16, 8>; + p.sa8d_inter[LUMA_8x16] = sa8d8<8, 16>; + p.sa8d_inter[LUMA_16x12] = satd8<16, 12>; + p.sa8d_inter[LUMA_12x16] = satd4<12, 16>; + p.sa8d_inter[LUMA_4x16] = satd4<4, 16>; + p.sa8d_inter[LUMA_16x4] = satd8<16, 4>; + p.sa8d_inter[LUMA_32x32] = sa8d16<32, 32>; + p.sa8d_inter[LUMA_32x16] = sa8d16<32, 16>; + p.sa8d_inter[LUMA_16x32] = sa8d16<16, 32>; + p.sa8d_inter[LUMA_32x24] = sa8d8<32, 24>; + p.sa8d_inter[LUMA_24x32] = sa8d8<24, 32>; + p.sa8d_inter[LUMA_32x8] = sa8d8<32, 8>; + p.sa8d_inter[LUMA_8x32] = sa8d8<8, 32>; + p.sa8d_inter[LUMA_64x64] = sa8d16<64, 64>; + p.sa8d_inter[LUMA_64x32] = sa8d16<64, 32>; + 
p.sa8d_inter[LUMA_32x64] = sa8d16<32, 64>; + p.sa8d_inter[LUMA_64x48] = sa8d16<64, 48>; + p.sa8d_inter[LUMA_48x64] = sa8d16<48, 64>; + p.sa8d_inter[LUMA_64x16] = sa8d16<64, 16>; + p.sa8d_inter[LUMA_16x64] = sa8d16<16, 64>; + + p.calcresidual[BLOCK_4x4] = getResidual<4>; + p.calcresidual[BLOCK_8x8] = getResidual<8>; + p.calcresidual[BLOCK_16x16] = getResidual<16>; + p.calcresidual[BLOCK_32x32] = getResidual<32>; + p.calcresidual[BLOCK_64x64] = NULL; + + p.transpose[BLOCK_4x4] = transpose<4>; + p.transpose[BLOCK_8x8] = transpose<8>; + p.transpose[BLOCK_16x16] = transpose<16>; + p.transpose[BLOCK_32x32] = transpose<32>; + p.transpose[BLOCK_64x64] = transpose<64>; + + p.ssd_s[BLOCK_4x4] = pixel_ssd_s_c<4>; + p.ssd_s[BLOCK_8x8] = pixel_ssd_s_c<8>; + p.ssd_s[BLOCK_16x16] = pixel_ssd_s_c<16>; + p.ssd_s[BLOCK_32x32] = pixel_ssd_s_c<32>; + + p.weight_pp = weight_pp_c; + p.weight_sp = weight_sp_c; + + p.scale1D_128to64 = scale1D_128to64; + p.scale2D_64to32 = scale2D_64to32; + p.frame_init_lowres_core = frame_init_lowres_core; + p.ssim_4x4x2_core = ssim_4x4x2_core; + p.ssim_end_4 = ssim_end_4; + + p.var[BLOCK_8x8] = pixel_var<8>; + p.var[BLOCK_16x16] = pixel_var<16>; + p.var[BLOCK_32x32] = pixel_var<32>; + p.var[BLOCK_64x64] = pixel_var<64>; + p.plane_copy_deinterleave_c = plane_copy_deinterleave_chroma; + p.planecopy_cp = planecopy_cp_c; + p.planecopy_sp = planecopy_sp_c; + p.propagateCost = estimateCUPropagateCost; +} +} diff --git a/source/common/predict.cpp b/source/common/predict.cpp new file mode 100644 index 0000000..a142c5a --- /dev/null +++ b/source/common/predict.cpp @@ -0,0 +1,1060 @@ +/***************************************************************************** +* Copyright (C) 2013 x265 project +* +* Authors: Deepthi Nandakumar +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation; either version 2 of the License, or +* (at your option) 
any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software +* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +* +* This program is also available under a commercial proprietary license. +* For more information, contact us at license @ x265.com. +*****************************************************************************/ + +#include "common.h" +#include "slice.h" +#include "framedata.h" +#include "picyuv.h" +#include "predict.h" +#include "primitives.h" + +using namespace x265; + +namespace +{ +inline pixel weightBidir(int w0, int16_t P0, int w1, int16_t P1, int round, int shift, int offset) +{ + return Clip((w0 * (P0 + IF_INTERNAL_OFFS) + w1 * (P1 + IF_INTERNAL_OFFS) + round + (offset << (shift - 1))) >> shift); +} +} + +Predict::Predict() +{ + m_predBuf = NULL; + m_refAbove = NULL; + m_refAboveFlt = NULL; + m_refLeft = NULL; + m_refLeftFlt = NULL; + m_immedVals = NULL; +} + +Predict::~Predict() +{ + X265_FREE(m_predBuf); + X265_FREE(m_refAbove); + X265_FREE(m_immedVals); + m_predShortYuv[0].destroy(); + m_predShortYuv[1].destroy(); +} + +bool Predict::allocBuffers(int csp) +{ + m_csp = csp; + m_hChromaShift = CHROMA_H_SHIFT(csp); + m_vChromaShift = CHROMA_V_SHIFT(csp); + + int predBufHeight = ((MAX_CU_SIZE + 2) << 4); + int predBufStride = ((MAX_CU_SIZE + 8) << 4); + CHECKED_MALLOC(m_predBuf, pixel, predBufStride * predBufHeight); + CHECKED_MALLOC(m_immedVals, int16_t, 64 * (64 + NTAPS_LUMA - 1)); + CHECKED_MALLOC(m_refAbove, pixel, 12 * MAX_CU_SIZE); + + m_refAboveFlt = m_refAbove + 3 * MAX_CU_SIZE; + m_refLeft = m_refAboveFlt + 3 * MAX_CU_SIZE; + m_refLeftFlt = m_refLeft + 3 * MAX_CU_SIZE; + + 
return m_predShortYuv[0].create(MAX_CU_SIZE, csp) && m_predShortYuv[1].create(MAX_CU_SIZE, csp); + +fail: + return false; +} + +void Predict::predIntraLumaAng(uint32_t dirMode, pixel* dst, intptr_t stride, uint32_t log2TrSize) +{ + int tuSize = 1 << log2TrSize; + + pixel *refLft, *refAbv; + + if (!(g_intraFilterFlags[dirMode] & tuSize)) + { + refLft = m_refLeft + tuSize - 1; + refAbv = m_refAbove + tuSize - 1; + } + else + { + refLft = m_refLeftFlt + tuSize - 1; + refAbv = m_refAboveFlt + tuSize - 1; + } + + bool bFilter = log2TrSize <= 4; + int sizeIdx = log2TrSize - 2; + X265_CHECK(sizeIdx >= 0 && sizeIdx < 4, "intra block size is out of range\n"); + primitives.intra_pred[dirMode][sizeIdx](dst, stride, refLft, refAbv, dirMode, bFilter); +} + +void Predict::predIntraChromaAng(pixel* src, uint32_t dirMode, pixel* dst, intptr_t stride, uint32_t log2TrSizeC, int chFmt) +{ + int tuSize = 1 << log2TrSizeC; + int tuSize2 = tuSize << 1; + + // Create the prediction + const int bufOffset = tuSize - 1; + pixel buf0[3 * MAX_CU_SIZE]; + pixel buf1[3 * MAX_CU_SIZE]; + pixel* above; + pixel* left = buf0 + bufOffset; + + int limit = (dirMode <= 25 && dirMode >= 11) ? 
(tuSize + 1 + 1) : (tuSize2 + 1); + for (int k = 0; k < limit; k++) + left[k] = src[k * ADI_BUF_STRIDE]; + + if (chFmt == X265_CSP_I444 && (g_intraFilterFlags[dirMode] & tuSize)) + { + // generate filtered intra prediction samples + buf0[bufOffset - 1] = src[1]; + left = buf1 + bufOffset; + for (int i = 0; i < tuSize2; i++) + left[i] = (buf0[bufOffset + i - 1] + 2 * buf0[bufOffset + i] + buf0[bufOffset + i + 1] + 2) >> 2; + left[tuSize2] = buf0[bufOffset + tuSize2]; + + above = buf0 + bufOffset; + above[0] = left[0]; + for (int i = 1; i < tuSize2; i++) + above[i] = (src[i - 1] + 2 * src[i] + src[i + 1] + 2) >> 2; + above[tuSize2] = src[tuSize2]; + } + else + { + above = buf1 + bufOffset; + memcpy(above, src, (tuSize2 + 1) * sizeof(pixel)); + } + + int sizeIdx = log2TrSizeC - 2; + X265_CHECK(sizeIdx >= 0 && sizeIdx < 4, "intra block size is out of range\n"); + primitives.intra_pred[dirMode][sizeIdx](dst, stride, left, above, dirMode, 0); +} + +void Predict::initMotionCompensation(const CUData& cu, const CUGeom& cuGeom, int partIdx) +{ + m_predSlice = cu.m_slice; + cu.getPartIndexAndSize(partIdx, m_puAbsPartIdx, m_puWidth, m_puHeight); + m_ctuAddr = cu.m_cuAddr; + m_cuAbsPartIdx = cuGeom.encodeIdx; +} + +void Predict::prepMotionCompensation(const CUData& cu, const CUGeom& cuGeom, int partIdx) +{ + initMotionCompensation(cu, cuGeom, partIdx); + + m_refIdx0 = cu.m_refIdx[0][m_puAbsPartIdx]; + m_clippedMv[0] = cu.m_mv[0][m_puAbsPartIdx]; + m_refIdx1 = cu.m_refIdx[1][m_puAbsPartIdx]; + m_clippedMv[1] = cu.m_mv[1][m_puAbsPartIdx]; + cu.clipMv(m_clippedMv[0]); + cu.clipMv(m_clippedMv[1]); +} + +void Predict::motionCompensation(Yuv& predYuv, bool bLuma, bool bChroma) +{ + if (m_predSlice->isInterP()) + { + /* P Slice */ + WeightValues wv0[3]; + X265_CHECK(m_refIdx0 >= 0, "invalid P refidx\n"); + X265_CHECK(m_refIdx0 < m_predSlice->m_numRefIdx[0], "P refidx out of range\n"); + const WeightParam *wp0 = m_predSlice->m_weightPredTable[0][m_refIdx0]; + + if 
(m_predSlice->m_pps->bUseWeightPred && wp0->bPresentFlag) + { + for (int plane = 0; plane < 3; plane++) + { + wv0[plane].w = wp0[plane].inputWeight; + wv0[plane].offset = wp0[plane].inputOffset * (1 << (X265_DEPTH - 8)); + wv0[plane].shift = wp0[plane].log2WeightDenom; + wv0[plane].round = wp0[plane].log2WeightDenom >= 1 ? 1 << (wp0[plane].log2WeightDenom - 1) : 0; + } + + ShortYuv& shortYuv = m_predShortYuv[0]; + + if (bLuma) + predInterLumaShort(shortYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPicYuv, m_clippedMv[0]); + if (bChroma) + predInterChromaShort(shortYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPicYuv, m_clippedMv[0]); + + addWeightUni(predYuv, shortYuv, wv0, bLuma, bChroma); + } + else + { + if (bLuma) + predInterLumaPixel(predYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPicYuv, m_clippedMv[0]); + if (bChroma) + predInterChromaPixel(predYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPicYuv, m_clippedMv[0]); + } + } + else + { + /* B Slice */ + + WeightValues wv0[3], wv1[3]; + const WeightParam *pwp0, *pwp1; + + if (m_predSlice->m_pps->bUseWeightedBiPred) + { + pwp0 = m_refIdx0 >= 0 ? m_predSlice->m_weightPredTable[0][m_refIdx0] : NULL; + pwp1 = m_refIdx1 >= 0 ? m_predSlice->m_weightPredTable[1][m_refIdx1] : NULL; + + if (pwp0 && pwp1 && (pwp0->bPresentFlag || pwp1->bPresentFlag)) + { + /* biprediction weighting */ + for (int plane = 0; plane < 3; plane++) + { + wv0[plane].w = pwp0[plane].inputWeight; + wv0[plane].o = pwp0[plane].inputOffset * (1 << (X265_DEPTH - 8)); + wv0[plane].shift = pwp0[plane].log2WeightDenom; + wv0[plane].round = 1 << pwp0[plane].log2WeightDenom; + + wv1[plane].w = pwp1[plane].inputWeight; + wv1[plane].o = pwp1[plane].inputOffset * (1 << (X265_DEPTH - 8)); + wv1[plane].shift = wv0[plane].shift; + wv1[plane].round = wv0[plane].round; + } + } + else + { + /* uniprediction weighting, always outputs to wv0 */ + const WeightParam* pwp = (m_refIdx0 >= 0) ? 
pwp0 : pwp1; + for (int plane = 0; plane < 3; plane++) + { + wv0[plane].w = pwp[plane].inputWeight; + wv0[plane].offset = pwp[plane].inputOffset * (1 << (X265_DEPTH - 8)); + wv0[plane].shift = pwp[plane].log2WeightDenom; + wv0[plane].round = pwp[plane].log2WeightDenom >= 1 ? 1 << (pwp[plane].log2WeightDenom - 1) : 0; + } + } + } + else + pwp0 = pwp1 = NULL; + + if (m_refIdx0 >= 0 && m_refIdx1 >= 0) + { + /* Biprediction */ + X265_CHECK(m_refIdx0 < m_predSlice->m_numRefIdx[0], "bidir refidx0 out of range\n"); + X265_CHECK(m_refIdx1 < m_predSlice->m_numRefIdx[1], "bidir refidx1 out of range\n"); + + if (bLuma) + { + predInterLumaShort(m_predShortYuv[0], *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPicYuv, m_clippedMv[0]); + predInterLumaShort(m_predShortYuv[1], *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPicYuv, m_clippedMv[1]); + } + if (bChroma) + { + predInterChromaShort(m_predShortYuv[0], *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPicYuv, m_clippedMv[0]); + predInterChromaShort(m_predShortYuv[1], *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPicYuv, m_clippedMv[1]); + } + + if (pwp0 && pwp1 && (pwp0->bPresentFlag || pwp1->bPresentFlag)) + addWeightBi(predYuv, m_predShortYuv[0], m_predShortYuv[1], wv0, wv1, bLuma, bChroma); + else + predYuv.addAvg(m_predShortYuv[0], m_predShortYuv[1], m_puAbsPartIdx, m_puWidth, m_puHeight, bLuma, bChroma); + } + else if (m_refIdx0 >= 0) + { + /* uniprediction to L0 */ + X265_CHECK(m_refIdx0 < m_predSlice->m_numRefIdx[0], "unidir refidx0 out of range\n"); + + if (pwp0 && pwp0->bPresentFlag) + { + ShortYuv& shortYuv = m_predShortYuv[0]; + + if (bLuma) + predInterLumaShort(shortYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPicYuv, m_clippedMv[0]); + if (bChroma) + predInterChromaShort(shortYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPicYuv, m_clippedMv[0]); + + addWeightUni(predYuv, shortYuv, wv0, bLuma, bChroma); + } + else + { + if (bLuma) + predInterLumaPixel(predYuv, 
*m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPicYuv, m_clippedMv[0]); + if (bChroma) + predInterChromaPixel(predYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPicYuv, m_clippedMv[0]); + } + } + else + { + /* uniprediction to L1 */ + X265_CHECK(m_refIdx1 >= 0, "refidx1 was not positive\n"); + X265_CHECK(m_refIdx1 < m_predSlice->m_numRefIdx[1], "unidir refidx1 out of range\n"); + + if (pwp1 && pwp1->bPresentFlag) + { + ShortYuv& shortYuv = m_predShortYuv[0]; + + if (bLuma) + predInterLumaShort(shortYuv, *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPicYuv, m_clippedMv[1]); + if (bChroma) + predInterChromaShort(shortYuv, *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPicYuv, m_clippedMv[1]); + + addWeightUni(predYuv, shortYuv, wv0, bLuma, bChroma); + } + else + { + if (bLuma) + predInterLumaPixel(predYuv, *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPicYuv, m_clippedMv[1]); + if (bChroma) + predInterChromaPixel(predYuv, *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPicYuv, m_clippedMv[1]); + } + } + } +} + +void Predict::predInterLumaPixel(Yuv& dstYuv, const PicYuv& refPic, const MV& mv) const +{ + pixel *dst = dstYuv.getLumaAddr(m_puAbsPartIdx); + intptr_t dstStride = dstYuv.m_size; + + intptr_t srcStride = refPic.m_stride; + intptr_t srcOffset = (mv.x >> 2) + (mv.y >> 2) * srcStride; + int partEnum = partitionFromSizes(m_puWidth, m_puHeight); + pixel* src = const_cast(refPic).getLumaAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + srcOffset; + + int xFrac = mv.x & 0x3; + int yFrac = mv.y & 0x3; + + if (!(yFrac | xFrac)) + primitives.luma_copy_pp[partEnum](dst, dstStride, src, srcStride); + else if (!yFrac) + primitives.luma_hpp[partEnum](src, srcStride, dst, dstStride, xFrac); + else if (!xFrac) + primitives.luma_vpp[partEnum](src, srcStride, dst, dstStride, yFrac); + else + { + int tmpStride = m_puWidth; + int filterSize = NTAPS_LUMA; + int halfFilterSize = (filterSize >> 1); + primitives.luma_hps[partEnum](src, srcStride, m_immedVals, 
tmpStride, xFrac, 1); + primitives.luma_vsp[partEnum](m_immedVals + (halfFilterSize - 1) * tmpStride, tmpStride, dst, dstStride, yFrac); + } +} + +void Predict::predInterLumaShort(ShortYuv& dstSYuv, const PicYuv& refPic, const MV& mv) const +{ + int16_t *dst = dstSYuv.getLumaAddr(m_puAbsPartIdx); + int dstStride = dstSYuv.m_size; + + intptr_t srcStride = refPic.m_stride; + intptr_t srcOffset = (mv.x >> 2) + (mv.y >> 2) * srcStride; + pixel *src = const_cast(refPic).getLumaAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + srcOffset; + + int xFrac = mv.x & 0x3; + int yFrac = mv.y & 0x3; + + int partEnum = partitionFromSizes(m_puWidth, m_puHeight); + + X265_CHECK((m_puWidth % 4) + (m_puHeight % 4) == 0, "width or height not divisible by 4\n"); + X265_CHECK(dstStride == MAX_CU_SIZE, "stride expected to be max cu size\n"); + + if (!(yFrac | xFrac)) + primitives.luma_p2s(src, srcStride, dst, m_puWidth, m_puHeight); + else if (!yFrac) + primitives.luma_hps[partEnum](src, srcStride, dst, dstStride, xFrac, 0); + else if (!xFrac) + primitives.luma_vps[partEnum](src, srcStride, dst, dstStride, yFrac); + else + { + int tmpStride = m_puWidth; + int filterSize = NTAPS_LUMA; + int halfFilterSize = (filterSize >> 1); + primitives.luma_hps[partEnum](src, srcStride, m_immedVals, tmpStride, xFrac, 1); + primitives.luma_vss[partEnum](m_immedVals + (halfFilterSize - 1) * tmpStride, tmpStride, dst, dstStride, yFrac); + } +} + +void Predict::predInterChromaPixel(Yuv& dstYuv, const PicYuv& refPic, const MV& mv) const +{ + intptr_t dstStride = dstYuv.m_csize; + intptr_t refStride = refPic.m_strideC; + + int shiftHor = (2 + m_hChromaShift); + int shiftVer = (2 + m_vChromaShift); + + intptr_t refOffset = (mv.x >> shiftHor) + (mv.y >> shiftVer) * refStride; + + pixel* refCb = const_cast(refPic).getCbAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + refOffset; + pixel* refCr = const_cast(refPic).getCrAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + refOffset; + + pixel* dstCb = 
dstYuv.getCbAddr(m_puAbsPartIdx); + pixel* dstCr = dstYuv.getCrAddr(m_puAbsPartIdx); + + int xFrac = mv.x & ((1 << shiftHor) - 1); + int yFrac = mv.y & ((1 << shiftVer) - 1); + + int partEnum = partitionFromSizes(m_puWidth, m_puHeight); + + if (!(yFrac | xFrac)) + { + primitives.chroma[m_csp].copy_pp[partEnum](dstCb, dstStride, refCb, refStride); + primitives.chroma[m_csp].copy_pp[partEnum](dstCr, dstStride, refCr, refStride); + } + else if (!yFrac) + { + primitives.chroma[m_csp].filter_hpp[partEnum](refCb, refStride, dstCb, dstStride, xFrac << (1 - m_hChromaShift)); + primitives.chroma[m_csp].filter_hpp[partEnum](refCr, refStride, dstCr, dstStride, xFrac << (1 - m_hChromaShift)); + } + else if (!xFrac) + { + primitives.chroma[m_csp].filter_vpp[partEnum](refCb, refStride, dstCb, dstStride, yFrac << (1 - m_vChromaShift)); + primitives.chroma[m_csp].filter_vpp[partEnum](refCr, refStride, dstCr, dstStride, yFrac << (1 - m_vChromaShift)); + } + else + { + int extStride = m_puWidth >> m_hChromaShift; + int filterSize = NTAPS_CHROMA; + int halfFilterSize = (filterSize >> 1); + + primitives.chroma[m_csp].filter_hps[partEnum](refCb, refStride, m_immedVals, extStride, xFrac << (1 - m_hChromaShift), 1); + primitives.chroma[m_csp].filter_vsp[partEnum](m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCb, dstStride, yFrac << (1 - m_vChromaShift)); + + primitives.chroma[m_csp].filter_hps[partEnum](refCr, refStride, m_immedVals, extStride, xFrac << (1 - m_hChromaShift), 1); + primitives.chroma[m_csp].filter_vsp[partEnum](m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCr, dstStride, yFrac << (1 - m_vChromaShift)); + } +} + +void Predict::predInterChromaShort(ShortYuv& dstSYuv, const PicYuv& refPic, const MV& mv) const +{ + intptr_t refStride = refPic.m_strideC; + intptr_t dstStride = dstSYuv.m_csize; + + int shiftHor = (2 + m_hChromaShift); + int shiftVer = (2 + m_vChromaShift); + + intptr_t refOffset = (mv.x >> shiftHor) + (mv.y >> shiftVer) * 
refStride; + + pixel* refCb = const_cast(refPic).getCbAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + refOffset; + pixel* refCr = const_cast(refPic).getCrAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + refOffset; + + int16_t* dstCb = dstSYuv.getCbAddr(m_puAbsPartIdx); + int16_t* dstCr = dstSYuv.getCrAddr(m_puAbsPartIdx); + + int xFrac = mv.x & ((1 << shiftHor) - 1); + int yFrac = mv.y & ((1 << shiftVer) - 1); + + int partEnum = partitionFromSizes(m_puWidth, m_puHeight); + + uint32_t cxWidth = m_puWidth >> m_hChromaShift; + uint32_t cxHeight = m_puHeight >> m_vChromaShift; + + X265_CHECK(((cxWidth | cxHeight) % 2) == 0, "chroma block size expected to be multiple of 2\n"); + + if (!(yFrac | xFrac)) + { + primitives.chroma_p2s[m_csp](refCb, refStride, dstCb, cxWidth, cxHeight); + primitives.chroma_p2s[m_csp](refCr, refStride, dstCr, cxWidth, cxHeight); + } + else if (!yFrac) + { + primitives.chroma[m_csp].filter_hps[partEnum](refCb, refStride, dstCb, dstStride, xFrac << (1 - m_hChromaShift), 0); + primitives.chroma[m_csp].filter_hps[partEnum](refCr, refStride, dstCr, dstStride, xFrac << (1 - m_hChromaShift), 0); + } + else if (!xFrac) + { + primitives.chroma[m_csp].filter_vps[partEnum](refCb, refStride, dstCb, dstStride, yFrac << (1 - m_vChromaShift)); + primitives.chroma[m_csp].filter_vps[partEnum](refCr, refStride, dstCr, dstStride, yFrac << (1 - m_vChromaShift)); + } + else + { + int extStride = cxWidth; + int filterSize = NTAPS_CHROMA; + int halfFilterSize = (filterSize >> 1); + primitives.chroma[m_csp].filter_hps[partEnum](refCb, refStride, m_immedVals, extStride, xFrac << (1 - m_hChromaShift), 1); + primitives.chroma[m_csp].filter_vss[partEnum](m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCb, dstStride, yFrac << (1 - m_vChromaShift)); + primitives.chroma[m_csp].filter_hps[partEnum](refCr, refStride, m_immedVals, extStride, xFrac << (1 - m_hChromaShift), 1); + primitives.chroma[m_csp].filter_vss[partEnum](m_immedVals + (halfFilterSize - 1) 
* extStride, extStride, dstCr, dstStride, yFrac << (1 - m_vChromaShift)); + } +} + +/* weighted averaging for bi-pred */ +void Predict::addWeightBi(Yuv& predYuv, const ShortYuv& srcYuv0, const ShortYuv& srcYuv1, const WeightValues wp0[3], const WeightValues wp1[3], bool bLuma, bool bChroma) const +{ + int x, y; + + int w0, w1, offset, shiftNum, shift, round; + uint32_t src0Stride, src1Stride, dststride; + + pixel* dstY = predYuv.getLumaAddr(m_puAbsPartIdx); + pixel* dstU = predYuv.getCbAddr(m_puAbsPartIdx); + pixel* dstV = predYuv.getCrAddr(m_puAbsPartIdx); + + const int16_t* srcY0 = srcYuv0.getLumaAddr(m_puAbsPartIdx); + const int16_t* srcU0 = srcYuv0.getCbAddr(m_puAbsPartIdx); + const int16_t* srcV0 = srcYuv0.getCrAddr(m_puAbsPartIdx); + + const int16_t* srcY1 = srcYuv1.getLumaAddr(m_puAbsPartIdx); + const int16_t* srcU1 = srcYuv1.getCbAddr(m_puAbsPartIdx); + const int16_t* srcV1 = srcYuv1.getCrAddr(m_puAbsPartIdx); + + if (bLuma) + { + // Luma + w0 = wp0[0].w; + offset = wp0[0].o + wp1[0].o; + shiftNum = IF_INTERNAL_PREC - X265_DEPTH; + shift = wp0[0].shift + shiftNum + 1; + round = shift ? (1 << (shift - 1)) : 0; + w1 = wp1[0].w; + + src0Stride = srcYuv0.m_size; + src1Stride = srcYuv1.m_size; + dststride = predYuv.m_size; + + // TODO: can we use weight_sp here? + for (y = m_puHeight - 1; y >= 0; y--) + { + for (x = m_puWidth - 1; x >= 0; ) + { + // note: luma min width is 4 + dstY[x] = weightBidir(w0, srcY0[x], w1, srcY1[x], round, shift, offset); + x--; + dstY[x] = weightBidir(w0, srcY0[x], w1, srcY1[x], round, shift, offset); + x--; + dstY[x] = weightBidir(w0, srcY0[x], w1, srcY1[x], round, shift, offset); + x--; + dstY[x] = weightBidir(w0, srcY0[x], w1, srcY1[x], round, shift, offset); + x--; + } + + srcY0 += src0Stride; + srcY1 += src1Stride; + dstY += dststride; + } + } + + if (bChroma) + { + // Chroma U + w0 = wp0[1].w; + offset = wp0[1].o + wp1[1].o; + shiftNum = IF_INTERNAL_PREC - X265_DEPTH; + shift = wp0[1].shift + shiftNum + 1; + round = shift ? 
(1 << (shift - 1)) : 0; + w1 = wp1[1].w; + + src0Stride = srcYuv0.m_csize; + src1Stride = srcYuv1.m_csize; + dststride = predYuv.m_csize; + + uint32_t cwidth = m_puWidth >> srcYuv0.m_hChromaShift; + uint32_t cheight = m_puHeight >> srcYuv0.m_vChromaShift; + + // TODO: can we use weight_sp here? + for (y = cheight - 1; y >= 0; y--) + { + for (x = cwidth - 1; x >= 0;) + { + // note: chroma min width is 2 + dstU[x] = weightBidir(w0, srcU0[x], w1, srcU1[x], round, shift, offset); + x--; + dstU[x] = weightBidir(w0, srcU0[x], w1, srcU1[x], round, shift, offset); + x--; + } + + srcU0 += src0Stride; + srcU1 += src1Stride; + dstU += dststride; + } + + // Chroma V + w0 = wp0[2].w; + offset = wp0[2].o + wp1[2].o; + shift = wp0[2].shift + shiftNum + 1; + round = shift ? (1 << (shift - 1)) : 0; + w1 = wp1[2].w; + + for (y = cheight - 1; y >= 0; y--) + { + for (x = cwidth - 1; x >= 0;) + { + // note: chroma min width is 2 + dstV[x] = weightBidir(w0, srcV0[x], w1, srcV1[x], round, shift, offset); + x--; + dstV[x] = weightBidir(w0, srcV0[x], w1, srcV1[x], round, shift, offset); + x--; + } + + srcV0 += src0Stride; + srcV1 += src1Stride; + dstV += dststride; + } + } +} + +/* weighted averaging for uni-pred */ +void Predict::addWeightUni(Yuv& predYuv, const ShortYuv& srcYuv, const WeightValues wp[3], bool bLuma, bool bChroma) const +{ + pixel* dstY = predYuv.getLumaAddr(m_puAbsPartIdx); + pixel* dstU = predYuv.getCbAddr(m_puAbsPartIdx); + pixel* dstV = predYuv.getCrAddr(m_puAbsPartIdx); + + const int16_t* srcY0 = srcYuv.getLumaAddr(m_puAbsPartIdx); + const int16_t* srcU0 = srcYuv.getCbAddr(m_puAbsPartIdx); + const int16_t* srcV0 = srcYuv.getCrAddr(m_puAbsPartIdx); + + int w0, offset, shiftNum, shift, round; + uint32_t srcStride, dstStride; + + if (bLuma) + { + // Luma + w0 = wp[0].w; + offset = wp[0].offset; + shiftNum = IF_INTERNAL_PREC - X265_DEPTH; + shift = wp[0].shift + shiftNum; + round = shift ? 
(1 << (shift - 1)) : 0; + srcStride = srcYuv.m_size; + dstStride = predYuv.m_size; + + primitives.weight_sp(const_cast(srcY0), dstY, srcStride, dstStride, m_puWidth, m_puHeight, w0, round, shift, offset); + } + + if (bChroma) + { + // Chroma U + w0 = wp[1].w; + offset = wp[1].offset; + shiftNum = IF_INTERNAL_PREC - X265_DEPTH; + shift = wp[1].shift + shiftNum; + round = shift ? (1 << (shift - 1)) : 0; + + srcStride = srcYuv.m_csize; + dstStride = predYuv.m_csize; + + uint32_t cwidth = m_puWidth >> srcYuv.m_hChromaShift; + uint32_t cheight = m_puHeight >> srcYuv.m_vChromaShift; + + primitives.weight_sp(const_cast(srcU0), dstU, srcStride, dstStride, cwidth, cheight, w0, round, shift, offset); + + // Chroma V + w0 = wp[2].w; + offset = wp[2].offset; + shift = wp[2].shift + shiftNum; + round = shift ? (1 << (shift - 1)) : 0; + + primitives.weight_sp(const_cast(srcV0), dstV, srcStride, dstStride, cwidth, cheight, w0, round, shift, offset); + } +} + +void Predict::initAdiPattern(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t partDepth, int dirMode) +{ + IntraNeighbors intraNeighbors; + initIntraNeighbors(cu, absPartIdx, partDepth, true, &intraNeighbors); + + pixel* adiBuf = m_predBuf; + pixel* refAbove = m_refAbove; + pixel* refLeft = m_refLeft; + pixel* refAboveFlt = m_refAboveFlt; + pixel* refLeftFlt = m_refLeftFlt; + + int tuSize = intraNeighbors.tuSize; + int tuSize2 = tuSize << 1; + + pixel* adiOrigin = cu.m_encData->m_reconPicYuv->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx); + intptr_t picStride = cu.m_encData->m_reconPicYuv->m_stride; + + fillReferenceSamples(adiOrigin, picStride, adiBuf, intraNeighbors); + + // initialization of ADI buffers + const int bufOffset = tuSize - 1; + refAbove += bufOffset; + refLeft += bufOffset; + + // ADI_BUF_STRIDE * (2 * tuSize + 1); + memcpy(refAbove, adiBuf, (tuSize2 + 1) * sizeof(pixel)); + for (int k = 0; k < tuSize2 + 1; k++) + refLeft[k] = adiBuf[k * ADI_BUF_STRIDE]; + + if (dirMode == 
ALL_IDX ? (8 | 16 | 32) & tuSize : g_intraFilterFlags[dirMode] & tuSize) + { + // generate filtered intra prediction samples + refAboveFlt += bufOffset; + refLeftFlt += bufOffset; + + bool bStrongSmoothing = (tuSize == 32 && cu.m_slice->m_sps->bUseStrongIntraSmoothing); + + if (bStrongSmoothing) + { + const int trSize = 32; + const int trSize2 = 32 * 2; + const int threshold = 1 << (X265_DEPTH - 5); + int refBL = refLeft[trSize2]; + int refTL = refAbove[0]; + int refTR = refAbove[trSize2]; + bStrongSmoothing = (abs(refBL + refTL - 2 * refLeft[trSize]) < threshold && + abs(refTL + refTR - 2 * refAbove[trSize]) < threshold); + + if (bStrongSmoothing) + { + // bilinear interpolation + const int shift = 5 + 1; // intraNeighbors.log2TrSize + 1; + int init = (refTL << shift) + tuSize; + int delta; + + refLeftFlt[0] = refAboveFlt[0] = refAbove[0]; + + //TODO: Performance Primitive??? + delta = refBL - refTL; + for (int i = 1; i < trSize2; i++) + refLeftFlt[i] = (pixel)((init + delta * i) >> shift); + refLeftFlt[trSize2] = refLeft[trSize2]; + + delta = refTR - refTL; + for (int i = 1; i < trSize2; i++) + refAboveFlt[i] = (pixel)((init + delta * i) >> shift); + refAboveFlt[trSize2] = refAbove[trSize2]; + + return; + } + } + + refLeft[-1] = refAbove[1]; + for (int i = 0; i < tuSize2; i++) + refLeftFlt[i] = (refLeft[i - 1] + 2 * refLeft[i] + refLeft[i + 1] + 2) >> 2; + refLeftFlt[tuSize2] = refLeft[tuSize2]; + + refAboveFlt[0] = refLeftFlt[0]; + for (int i = 1; i < tuSize2; i++) + refAboveFlt[i] = (refAbove[i - 1] + 2 * refAbove[i] + refAbove[i + 1] + 2) >> 2; + refAboveFlt[tuSize2] = refAbove[tuSize2]; + } +} + +void Predict::initAdiPatternChroma(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t partDepth, uint32_t chromaId) +{ + IntraNeighbors intraNeighbors; + initIntraNeighbors(cu, absPartIdx, partDepth, false, &intraNeighbors); + uint32_t tuSize = intraNeighbors.tuSize; + + const pixel* adiOrigin = cu.m_encData->m_reconPicYuv->getChromaAddr(chromaId, 
cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx); + intptr_t picStride = cu.m_encData->m_reconPicYuv->m_strideC; + pixel* adiRef = getAdiChromaBuf(chromaId, tuSize); + + fillReferenceSamples(adiOrigin, picStride, adiRef, intraNeighbors); +} + +void Predict::initIntraNeighbors(const CUData& cu, uint32_t absPartIdx, uint32_t partDepth, bool isLuma, IntraNeighbors *intraNeighbors) +{ + uint32_t log2TrSize = cu.m_log2CUSize[0] - partDepth; + int log2UnitWidth = LOG2_UNIT_SIZE; + int log2UnitHeight = LOG2_UNIT_SIZE; + + if (!isLuma) + { + log2TrSize -= cu.m_hChromaShift; + log2UnitWidth -= cu.m_hChromaShift; + log2UnitHeight -= cu.m_vChromaShift; + } + + int numIntraNeighbor = 0; + bool *bNeighborFlags = intraNeighbors->bNeighborFlags; + + uint32_t partIdxLT, partIdxRT, partIdxLB; + + cu.deriveLeftRightTopIdxAdi(partIdxLT, partIdxRT, absPartIdx, partDepth); + + uint32_t tuSize = 1 << log2TrSize; + int tuWidthInUnits = tuSize >> log2UnitWidth; + int tuHeightInUnits = tuSize >> log2UnitHeight; + int aboveUnits = tuWidthInUnits << 1; + int leftUnits = tuHeightInUnits << 1; + int partIdxStride = cu.m_slice->m_sps->numPartInCUSize; + partIdxLB = g_rasterToZscan[g_zscanToRaster[partIdxLT] + ((tuHeightInUnits - 1) * partIdxStride)]; + + bNeighborFlags[leftUnits] = isAboveLeftAvailable(cu, partIdxLT); + numIntraNeighbor += (int)(bNeighborFlags[leftUnits]); + numIntraNeighbor += isAboveAvailable(cu, partIdxLT, partIdxRT, (bNeighborFlags + leftUnits + 1)); + numIntraNeighbor += isAboveRightAvailable(cu, partIdxLT, partIdxRT, (bNeighborFlags + leftUnits + 1 + tuWidthInUnits)); + numIntraNeighbor += isLeftAvailable(cu, partIdxLT, partIdxLB, (bNeighborFlags + leftUnits - 1)); + numIntraNeighbor += isBelowLeftAvailable(cu, partIdxLT, partIdxLB, (bNeighborFlags + leftUnits - 1 - tuHeightInUnits)); + + intraNeighbors->numIntraNeighbor = numIntraNeighbor; + intraNeighbors->totalUnits = aboveUnits + leftUnits + 1; + intraNeighbors->aboveUnits = aboveUnits; + intraNeighbors->leftUnits = 
leftUnits; + intraNeighbors->unitWidth = 1 << log2UnitWidth; + intraNeighbors->unitHeight = 1 << log2UnitHeight; + intraNeighbors->tuSize = tuSize; + intraNeighbors->log2TrSize = log2TrSize; +} + +void Predict::fillReferenceSamples(const pixel* adiOrigin, intptr_t picStride, pixel* adiRef, const IntraNeighbors& intraNeighbors) +{ + const pixel dcValue = (pixel)(1 << (X265_DEPTH - 1)); + int numIntraNeighbor = intraNeighbors.numIntraNeighbor; + int totalUnits = intraNeighbors.totalUnits; + uint32_t tuSize = intraNeighbors.tuSize; + uint32_t refSize = tuSize * 2 + 1; + + if (numIntraNeighbor == 0) + { + // Fill border with DC value + for (uint32_t i = 0; i < refSize; i++) + adiRef[i] = dcValue; + + for (uint32_t i = 1; i < refSize; i++) + adiRef[i * ADI_BUF_STRIDE] = dcValue; + } + else if (numIntraNeighbor == totalUnits) + { + // Fill top border with rec. samples + const pixel* adiTemp = adiOrigin - picStride - 1; + memcpy(adiRef, adiTemp, refSize * sizeof(*adiRef)); + + // Fill left border with rec. 
samples + adiTemp = adiOrigin - 1; + for (uint32_t i = 1; i < refSize; i++) + { + adiRef[i * ADI_BUF_STRIDE] = adiTemp[0]; + adiTemp += picStride; + } + } + else // reference samples are partially available + { + const bool *bNeighborFlags = intraNeighbors.bNeighborFlags; + const bool *pNeighborFlags; + int aboveUnits = intraNeighbors.aboveUnits; + int leftUnits = intraNeighbors.leftUnits; + int unitWidth = intraNeighbors.unitWidth; + int unitHeight = intraNeighbors.unitHeight; + int totalSamples = (leftUnits * unitHeight) + ((aboveUnits + 1) * unitWidth); + pixel adiLineBuffer[5 * MAX_CU_SIZE]; + pixel *adi; + + // Initialize + for (int i = 0; i < totalSamples; i++) + adiLineBuffer[i] = dcValue; + + // Fill top-left sample + const pixel* adiTemp = adiOrigin - picStride - 1; + adi = adiLineBuffer + (leftUnits * unitHeight); + pNeighborFlags = bNeighborFlags + leftUnits; + if (*pNeighborFlags) + { + pixel topLeftVal = adiTemp[0]; + for (int i = 0; i < unitWidth; i++) + adi[i] = topLeftVal; + } + + // Fill left & below-left samples + adiTemp += picStride; + adi--; + pNeighborFlags--; + for (int j = 0; j < leftUnits; j++) + { + if (*pNeighborFlags) + for (int i = 0; i < unitHeight; i++) + adi[-i] = adiTemp[i * picStride]; + + adiTemp += unitHeight * picStride; + adi -= unitHeight; + pNeighborFlags--; + } + + // Fill above & above-right samples + adiTemp = adiOrigin - picStride; + adi = adiLineBuffer + (leftUnits * unitHeight) + unitWidth; + pNeighborFlags = bNeighborFlags + leftUnits + 1; + for (int j = 0; j < aboveUnits; j++) + { + if (*pNeighborFlags) + memcpy(adi, adiTemp, unitWidth * sizeof(*adiTemp)); + adiTemp += unitWidth; + adi += unitWidth; + pNeighborFlags++; + } + + // Pad reference samples when necessary + int curr = 0; + int next = 1; + adi = adiLineBuffer; + int pAdiLineTopRowOffset = leftUnits * (unitHeight - unitWidth); + if (!bNeighborFlags[0]) + { + // very bottom unit of bottom-left; at least one unit will be valid. 
+ while (next < totalUnits && !bNeighborFlags[next]) + next++; + + pixel *pAdiLineNext = adiLineBuffer + ((next < leftUnits) ? (next * unitHeight) : (pAdiLineTopRowOffset + (next * unitWidth))); + const pixel refSample = *pAdiLineNext; + // Pad unavailable samples with new value + int nextOrTop = X265_MIN(next, leftUnits); + // fill left column + while (curr < nextOrTop) + { + for (int i = 0; i < unitHeight; i++) + adi[i] = refSample; + + adi += unitHeight; + curr++; + } + + // fill top row + while (curr < next) + { + for (int i = 0; i < unitWidth; i++) + adi[i] = refSample; + + adi += unitWidth; + curr++; + } + } + + // pad all other reference samples. + while (curr < totalUnits) + { + if (!bNeighborFlags[curr]) // samples not available + { + int numSamplesInCurrUnit = (curr >= leftUnits) ? unitWidth : unitHeight; + const pixel refSample = *(adi - 1); + for (int i = 0; i < numSamplesInCurrUnit; i++) + adi[i] = refSample; + + adi += numSamplesInCurrUnit; + curr++; + } + else + { + adi += (curr >= leftUnits) ? unitWidth : unitHeight; + curr++; + } + } + + // Copy processed samples + adi = adiLineBuffer + refSize + unitWidth - 2; + memcpy(adiRef, adi, refSize * sizeof(*adiRef)); + + adi = adiLineBuffer + refSize - 1; + for (int i = 1; i < (int)refSize; i++) + adiRef[i * ADI_BUF_STRIDE] = adi[-i]; + } +} + +bool Predict::isAboveLeftAvailable(const CUData& cu, uint32_t partIdxLT) +{ + uint32_t partAboveLeft; + const CUData* cuAboveLeft = cu.getPUAboveLeft(partAboveLeft, partIdxLT); + + if (!cu.m_slice->m_pps->bConstrainedIntraPred) + return cuAboveLeft ? 
true : false; + else + return cuAboveLeft && cuAboveLeft->isIntra(partAboveLeft); +} + +int Predict::isAboveAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxRT, bool *bValidFlags) +{ + const uint32_t rasterPartBegin = g_zscanToRaster[partIdxLT]; + const uint32_t rasterPartEnd = g_zscanToRaster[partIdxRT] + 1; + const uint32_t idxStep = 1; + bool *validFlagPtr = bValidFlags; + int numIntra = 0; + + for (uint32_t rasterPart = rasterPartBegin; rasterPart < rasterPartEnd; rasterPart += idxStep) + { + uint32_t partAbove; + const CUData* cuAbove = cu.getPUAbove(partAbove, g_rasterToZscan[rasterPart]); + if (cuAbove && (!cu.m_slice->m_pps->bConstrainedIntraPred || cuAbove->isIntra(partAbove))) + { + numIntra++; + *validFlagPtr = true; + } + else + *validFlagPtr = false; + + validFlagPtr++; + } + + return numIntra; +} + +int Predict::isLeftAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxLB, bool *bValidFlags) +{ + const uint32_t rasterPartBegin = g_zscanToRaster[partIdxLT]; + const uint32_t rasterPartEnd = g_zscanToRaster[partIdxLB] + 1; + const uint32_t idxStep = cu.m_slice->m_sps->numPartInCUSize; + bool *validFlagPtr = bValidFlags; + int numIntra = 0; + + for (uint32_t rasterPart = rasterPartBegin; rasterPart < rasterPartEnd; rasterPart += idxStep) + { + uint32_t partLeft; + const CUData* cuLeft = cu.getPULeft(partLeft, g_rasterToZscan[rasterPart]); + if (cuLeft && (!cu.m_slice->m_pps->bConstrainedIntraPred || cuLeft->isIntra(partLeft))) + { + numIntra++; + *validFlagPtr = true; + } + else + *validFlagPtr = false; + + validFlagPtr--; // opposite direction + } + + return numIntra; +} + +int Predict::isAboveRightAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxRT, bool *bValidFlags) +{ + const uint32_t numUnitsInPU = g_zscanToRaster[partIdxRT] - g_zscanToRaster[partIdxLT] + 1; + bool *validFlagPtr = bValidFlags; + int numIntra = 0; + + for (uint32_t offset = 1; offset <= numUnitsInPU; offset++) + { + uint32_t partAboveRight; + 
const CUData* cuAboveRight = cu.getPUAboveRightAdi(partAboveRight, partIdxRT, offset); + if (cuAboveRight && (!cu.m_slice->m_pps->bConstrainedIntraPred || cuAboveRight->isIntra(partAboveRight))) + { + numIntra++; + *validFlagPtr = true; + } + else + *validFlagPtr = false; + + validFlagPtr++; + } + + return numIntra; +} + +int Predict::isBelowLeftAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxLB, bool *bValidFlags) +{ + const uint32_t numUnitsInPU = (g_zscanToRaster[partIdxLB] - g_zscanToRaster[partIdxLT]) / cu.m_slice->m_sps->numPartInCUSize + 1; + bool *validFlagPtr = bValidFlags; + int numIntra = 0; + + for (uint32_t offset = 1; offset <= numUnitsInPU; offset++) + { + uint32_t partBelowLeft; + const CUData* cuBelowLeft = cu.getPUBelowLeftAdi(partBelowLeft, partIdxLB, offset); + if (cuBelowLeft && (!cu.m_slice->m_pps->bConstrainedIntraPred || cuBelowLeft->isIntra(partBelowLeft))) + { + numIntra++; + *validFlagPtr = true; + } + else + *validFlagPtr = false; + + validFlagPtr--; // opposite direction + } + + return numIntra; +} diff --git a/source/common/predict.h b/source/common/predict.h new file mode 100644 index 0000000..a76a32c --- /dev/null +++ b/source/common/predict.h @@ -0,0 +1,137 @@ +/***************************************************************************** +* Copyright (C) 2013 x265 project +* +* Authors: Deepthi Nandakumar +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation; either version 2 of the License, or +* (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. 
+* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software +* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +* +* This program is also available under a commercial proprietary license. +* For more information, contact us at license @ x265.com. +*****************************************************************************/ + +#ifndef X265_PREDICT_H +#define X265_PREDICT_H + +#include "common.h" +#include "frame.h" +#include "quant.h" +#include "shortyuv.h" +#include "yuv.h" + +namespace x265 { + +class CUData; +class Slice; +struct CUGeom; + +class Predict +{ +public: + + enum { ADI_BUF_STRIDE = (2 * MAX_CU_SIZE + 1 + 15) }; // alignment to 16 bytes + + /* Weighted prediction scaling values built from slice parameters (bitdepth scaled) */ + struct WeightValues + { + int w, o, offset, shift, round; + }; + + struct IntraNeighbors + { + int numIntraNeighbor; + int totalUnits; + int aboveUnits; + int leftUnits; + int unitWidth; + int unitHeight; + int tuSize; + uint32_t log2TrSize; + bool bNeighborFlags[4 * MAX_NUM_SPU_W + 1]; + }; + + ShortYuv m_predShortYuv[2]; /* temporary storage for weighted prediction */ + int16_t* m_immedVals; + + /* Intra prediction buffers */ + pixel* m_predBuf; + pixel* m_refAbove; + pixel* m_refAboveFlt; + pixel* m_refLeft; + pixel* m_refLeftFlt; + + /* Slice information */ + const Slice* m_predSlice; + int m_csp; + int m_hChromaShift; + int m_vChromaShift; + + /* cached CU information for prediction */ + uint32_t m_ctuAddr; // raster index of current CTU within its picture + uint32_t m_cuAbsPartIdx; // z-order index of current CU within its CTU + uint32_t m_puAbsPartIdx; // z-order index of current PU with its CU + int m_puWidth; + int m_puHeight; + int m_refIdx0; + int m_refIdx1; + + /* TODO: Need to investigate clipping while writing into the TComDataCU fields itself */ + MV m_clippedMv[2]; + + Predict(); + ~Predict(); + + bool 
allocBuffers(int csp); + + // motion compensation functions + void predInterLumaPixel(Yuv& dstYuv, const PicYuv& refPic, const MV& mv) const; + void predInterChromaPixel(Yuv& dstYuv, const PicYuv& refPic, const MV& mv) const; + + void predInterLumaShort(ShortYuv& dstSYuv, const PicYuv& refPic, const MV& mv) const; + void predInterChromaShort(ShortYuv& dstSYuv, const PicYuv& refPic, const MV& mv) const; + + void addWeightBi(Yuv& predYuv, const ShortYuv& srcYuv0, const ShortYuv& srcYuv1, const WeightValues wp0[3], const WeightValues wp1[3], bool bLuma, bool bChroma) const; + void addWeightUni(Yuv& predYuv, const ShortYuv& srcYuv, const WeightValues wp[3], bool bLuma, bool bChroma) const; + + /* Intra prediction helper functions */ + static void initIntraNeighbors(const CUData& cu, uint32_t zOrderIdxInPart, uint32_t partDepth, bool isLuma, IntraNeighbors *IntraNeighbors); + static void fillReferenceSamples(const pixel* adiOrigin, intptr_t picStride, pixel* adiRef, const IntraNeighbors& intraNeighbors); + + static bool isAboveLeftAvailable(const CUData& cu, uint32_t partIdxLT); + static int isAboveAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxRT, bool* bValidFlags); + static int isLeftAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxLB, bool* bValidFlags); + static int isAboveRightAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxRT, bool* bValidFlags); + static int isBelowLeftAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxLB, bool* bValidFlags); + +public: + + /* prepMotionCompensation needs to be called to prepare MC with CU-relevant data */ + void initMotionCompensation(const CUData& cu, const CUGeom& cuGeom, int partIdx); + void prepMotionCompensation(const CUData& cu, const CUGeom& cuGeom, int partIdx); + void motionCompensation(Yuv& predYuv, bool bLuma, bool bChroma); + + /* Angular Intra */ + void predIntraLumaAng(uint32_t dirMode, pixel* pred, intptr_t stride, uint32_t log2TrSize); + void 
predIntraChromaAng(pixel* src, uint32_t dirMode, pixel* pred, intptr_t stride, uint32_t log2TrSizeC, int chFmt); + + void initAdiPattern(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t partDepth, int dirMode); + void initAdiPatternChroma(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t partDepth, uint32_t chromaId); + pixel* getAdiChromaBuf(uint32_t chromaId, int tuSize) + { + return m_predBuf + (chromaId == 1 ? 0 : 2 * ADI_BUF_STRIDE * (tuSize * 2 + 1)); + } +}; +} + +#endif // ifndef X265_PREDICT_H diff --git a/source/common/primitives.cpp b/source/common/primitives.cpp new file mode 100644 index 0000000..7592d27 --- /dev/null +++ b/source/common/primitives.cpp @@ -0,0 +1,242 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. 
+ *****************************************************************************/ + +#include "common.h" +#include "primitives.h" + +namespace x265 { +// x265 private namespace + +extern const uint8_t lumaPartitionMapTable[] = +{ +// 4 8 12 16 20 24 28 32 36 40 44 48 52 56 60 64 + LUMA_4x4, LUMA_4x8, 255, LUMA_4x16, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 4 + LUMA_8x4, LUMA_8x8, 255, LUMA_8x16, 255, 255, 255, LUMA_8x32, 255, 255, 255, 255, 255, 255, 255, 255, // 8 + 255, 255, 255, LUMA_12x16, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 12 + LUMA_16x4, LUMA_16x8, LUMA_16x12, LUMA_16x16, 255, 255, 255, LUMA_16x32, 255, 255, 255, 255, 255, 255, 255, LUMA_16x64, // 16 + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 20 + 255, 255, 255, 255, 255, 255, 255, LUMA_24x32, 255, 255, 255, 255, 255, 255, 255, 255, // 24 + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 28 + 255, LUMA_32x8, 255, LUMA_32x16, 255, LUMA_32x24, 255, LUMA_32x32, 255, 255, 255, 255, 255, 255, 255, LUMA_32x64, // 32 + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 36 + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 40 + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 44 + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, LUMA_48x64, // 48 + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 52 + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 56 + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 60 + 255, 255, 255, LUMA_64x16, 255, 255, 255, LUMA_64x32, 255, 255, 255, LUMA_64x48, 255, 255, 255, LUMA_64x64 // 64 +}; + +/* the "authoritative" set of encoder primitives */ +EncoderPrimitives primitives; + +void Setup_C_PixelPrimitives(EncoderPrimitives &p); +void 
Setup_C_DCTPrimitives(EncoderPrimitives &p); +void Setup_C_IPFilterPrimitives(EncoderPrimitives &p); +void Setup_C_IPredPrimitives(EncoderPrimitives &p); +void Setup_C_LoopFilterPrimitives(EncoderPrimitives &p); + +void Setup_C_Primitives(EncoderPrimitives &p) +{ + Setup_C_PixelPrimitives(p); // pixel.cpp + Setup_C_DCTPrimitives(p); // dct.cpp + Setup_C_IPFilterPrimitives(p); // ipfilter.cpp + Setup_C_IPredPrimitives(p); // intrapred.cpp + Setup_C_LoopFilterPrimitives(p); // loopfilter.cpp +} + +void Setup_Alias_Primitives(EncoderPrimitives &p) +{ + /* copy reusable luma primitives to chroma 4:4:4 */ + for (int i = 0; i < NUM_LUMA_PARTITIONS; i++) + { + p.chroma[X265_CSP_I444].copy_pp[i] = p.luma_copy_pp[i]; + p.chroma[X265_CSP_I444].copy_ps[i] = p.luma_copy_ps[i]; + p.chroma[X265_CSP_I444].copy_sp[i] = p.luma_copy_sp[i]; + p.chroma[X265_CSP_I444].copy_ss[i] = p.luma_copy_ss[i]; + p.chroma[X265_CSP_I444].addAvg[i] = p.luma_addAvg[i]; + } + + for (int i = 0; i < NUM_SQUARE_BLOCKS; i++) + { + p.chroma[X265_CSP_I444].add_ps[i] = p.luma_add_ps[i]; + p.chroma[X265_CSP_I444].sub_ps[i] = p.luma_sub_ps[i]; + } + + for (int i = 0; i < NUM_SQUARE_BLOCKS; i++) + { + int partL = partitionFromLog2Size(i + 2); + p.square_copy_pp[i] = p.luma_copy_pp[partL]; + p.square_copy_ps[i] = p.luma_copy_ps[partL]; + p.square_copy_sp[i] = p.luma_copy_sp[partL]; + p.square_copy_ss[i] = p.luma_copy_ss[partL]; + } + + primitives.sa8d[BLOCK_4x4] = primitives.sa8d_inter[LUMA_4x4]; + primitives.sa8d[BLOCK_8x8] = primitives.sa8d_inter[LUMA_8x8]; + primitives.sa8d[BLOCK_16x16] = primitives.sa8d_inter[LUMA_16x16]; + primitives.sa8d[BLOCK_32x32] = primitives.sa8d_inter[LUMA_32x32]; + primitives.sa8d[BLOCK_64x64] = primitives.sa8d_inter[LUMA_64x64]; + + // SA8D devolves to SATD for blocks not even multiples of 8x8 + primitives.sa8d_inter[LUMA_4x4] = primitives.satd[LUMA_4x4]; + primitives.sa8d_inter[LUMA_4x8] = primitives.satd[LUMA_4x8]; + primitives.sa8d_inter[LUMA_4x16] = primitives.satd[LUMA_4x16]; 
+ primitives.sa8d_inter[LUMA_8x4] = primitives.satd[LUMA_8x4]; + primitives.sa8d_inter[LUMA_16x4] = primitives.satd[LUMA_16x4]; + primitives.sa8d_inter[LUMA_16x12] = primitives.satd[LUMA_16x12]; + primitives.sa8d_inter[LUMA_12x16] = primitives.satd[LUMA_12x16]; +} +} +using namespace x265; + +/* cpuid >= 0 - force CPU type + * cpuid < 0 - auto-detect if uninitialized */ +extern "C" +void x265_setup_primitives(x265_param *param, int cpuid) +{ + if (cpuid < 0) + cpuid = x265::cpu_detect(); + + // initialize global variables + if (!primitives.sad[0]) + { + Setup_C_Primitives(primitives); + Setup_Instrinsic_Primitives(primitives, cpuid); + +#if ENABLE_ASSEMBLY + Setup_Assembly_Primitives(primitives, cpuid); +#else + x265_log(param, X265_LOG_WARNING, "Assembly not supported in this binary\n"); +#endif + + Setup_Alias_Primitives(primitives); + + initROM(); + } + + if (param->logLevel >= X265_LOG_INFO) + { + char buf[1000]; + char *p = buf + sprintf(buf, "using cpu capabilities:"); + char *none = p; + for (int i = 0; x265::cpu_names[i].flags; i++) + { + if (!strcmp(x265::cpu_names[i].name, "SSE") + && (cpuid & X265_CPU_SSE2)) + continue; + if (!strcmp(x265::cpu_names[i].name, "SSE2") + && (cpuid & (X265_CPU_SSE2_IS_FAST | X265_CPU_SSE2_IS_SLOW))) + continue; + if (!strcmp(x265::cpu_names[i].name, "SSE3") + && (cpuid & X265_CPU_SSSE3 || !(cpuid & X265_CPU_CACHELINE_64))) + continue; + if (!strcmp(x265::cpu_names[i].name, "SSE4.1") + && (cpuid & X265_CPU_SSE42)) + continue; + if (!strcmp(x265::cpu_names[i].name, "BMI1") + && (cpuid & X265_CPU_BMI2)) + continue; + if ((cpuid & x265::cpu_names[i].flags) == x265::cpu_names[i].flags + && (!i || x265::cpu_names[i].flags != x265::cpu_names[i - 1].flags)) + p += sprintf(p, " %s", x265::cpu_names[i].name); + } + + if (p == none) + sprintf(p, " none!"); + x265_log(param, X265_LOG_INFO, "%s\n", buf); + } +} + +#if !defined(ENABLE_ASSEMBLY) +#if defined(_MSC_VER) +#include +#endif + +extern "C" { +// the intrinsic primitives will not 
use MMX instructions, so if assembly +// is disabled there should be no reason to use EMMS. +void x265_cpu_emms(void) {} + +#if defined(X265_ARCH_X86) + +#if defined(_MSC_VER) +# pragma warning(disable: 4100) +#elif defined(__GNUC__) || defined(__clang__) // use inline assembly, Gnu/AT&T syntax +# define __cpuidex(regsArray, level, index) \ + __asm__ __volatile__ ("cpuid" \ + : "=a" ((regsArray)[0]), "=b" ((regsArray)[1]), "=c" ((regsArray)[2]), "=d" ((regsArray)[3]) \ + : "0" (level), "2" (index)); +#else +# error "compiler not supported" +#endif + +int x265_cpu_cpuid_test(void) +{ + return 0; +} + +void x265_cpu_cpuid(uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) +{ + int output[4]; + + __cpuidex(output, op, 0); + *eax = output[0]; + *ebx = output[1]; + *ecx = output[2]; + *edx = output[3]; +} + +void x265_cpu_xgetbv(uint32_t op, uint32_t *eax, uint32_t *edx) +{ + uint64_t out = 0; + +#if X265_ARCH_X86 + +#if (defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040000) || (defined(__INTEL_COMPILER) && __INTEL_COMPILER >= 1200) + + // MSVC 2010 SP1 or later, or similar Intel release + out = _xgetbv(op); + +#elif defined(__GNUC__) || defined(__clang__) // use inline assembly, Gnu/AT&T syntax + + uint32_t a, d; + __asm("xgetbv" : "=a" (a), "=d" (d) : "c" (op) :); + *eax = a; + *edx = d; + return; + +#elif defined(_WIN64) // On x64 with older compilers, this is impossible + +#endif // if (defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040000) || (defined(__INTEL_COMPILER) && __INTEL_COMPILER >= 1200) + +#endif // if x86 + + *eax = (uint32_t)out; + *edx = (uint32_t)(out >> 32); +} + +#endif // X265_ARCH_X86 +} +#endif // if !ENABLE_ASSEMBLY diff --git a/source/common/primitives.h b/source/common/primitives.h new file mode 100644 index 0000000..8300c21 --- /dev/null +++ b/source/common/primitives.h @@ -0,0 +1,319 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: 
Steve Borho + * Mandar Gurav + * Deepthi Devaki Akkoorath + * Mahesh Pittala + * Rajesh Paulraj + * Praveen Kumar Tiwari + * Min Chen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#ifndef X265_PRIMITIVES_H +#define X265_PRIMITIVES_H + +#include "common.h" +#include "cpu.h" + +namespace x265 { +// x265 private namespace + +enum LumaPartitions +{ + // Square + LUMA_4x4, LUMA_8x8, LUMA_16x16, LUMA_32x32, LUMA_64x64, + // Rectangular + LUMA_8x4, LUMA_4x8, + LUMA_16x8, LUMA_8x16, + LUMA_32x16, LUMA_16x32, + LUMA_64x32, LUMA_32x64, + // Asymmetrical (0.75, 0.25) + LUMA_16x12, LUMA_12x16, LUMA_16x4, LUMA_4x16, + LUMA_32x24, LUMA_24x32, LUMA_32x8, LUMA_8x32, + LUMA_64x48, LUMA_48x64, LUMA_64x16, LUMA_16x64, + NUM_LUMA_PARTITIONS +}; + +// 4:2:0 chroma partition sizes. These enums are just a convenience for indexing into the +// chroma primitive arrays when instantiating templates. 
The function tables should always +// be indexed by the luma partition enum +enum Chroma420Partitions +{ + CHROMA_2x2, CHROMA_4x4, CHROMA_8x8, CHROMA_16x16, CHROMA_32x32, + CHROMA_4x2, CHROMA_2x4, + CHROMA_8x4, CHROMA_4x8, + CHROMA_16x8, CHROMA_8x16, + CHROMA_32x16, CHROMA_16x32, + CHROMA_8x6, CHROMA_6x8, CHROMA_8x2, CHROMA_2x8, + CHROMA_16x12, CHROMA_12x16, CHROMA_16x4, CHROMA_4x16, + CHROMA_32x24, CHROMA_24x32, CHROMA_32x8, CHROMA_8x32, + NUM_CHROMA_PARTITIONS +}; + +enum Chroma422Partitions +{ + CHROMA422_2x4, CHROMA422_4x8, CHROMA422_8x16, CHROMA422_16x32, CHROMA422_32x64, + CHROMA422_4x4, CHROMA422_2x8, + CHROMA422_8x8, CHROMA422_4x16, + CHROMA422_16x16, CHROMA422_8x32, + CHROMA422_32x32, CHROMA422_16x64, + CHROMA422_8x12, CHROMA422_6x16, CHROMA422_8x4, CHROMA422_2x16, + CHROMA422_16x24, CHROMA422_12x32, CHROMA422_16x8, CHROMA422_4x32, + CHROMA422_32x48, CHROMA422_24x64, CHROMA422_32x16, CHROMA422_8x64, + NUM_CHROMA_PARTITIONS422 +}; + +enum SquareBlocks // Routines can be indexed using log2n(width)-2 +{ + BLOCK_4x4, + BLOCK_8x8, + BLOCK_16x16, + BLOCK_32x32, + BLOCK_64x64, + NUM_SQUARE_BLOCKS +}; + +enum { NUM_TR_SIZE = 4 }; + +// NOTE: Not all DCT functions support dest stride +enum Dcts +{ + DST_4x4, + DCT_4x4, + DCT_8x8, + DCT_16x16, + DCT_32x32, + NUM_DCTS +}; + +enum IDcts +{ + IDST_4x4, + IDCT_4x4, + IDCT_8x8, + IDCT_16x16, + IDCT_32x32, + NUM_IDCTS +}; + +// Returns a LumaPartitions enum for the given size, always expected to return a valid enum +inline int partitionFromSizes(int width, int height) +{ + X265_CHECK(((width | height) & ~(4 | 8 | 16 | 32 | 64)) == 0, "Invalid block width/height\n"); + extern const uint8_t lumaPartitionMapTable[]; + int w = (width >> 2) - 1; + int h = (height >> 2) - 1; + int part = (int)lumaPartitionMapTable[(w << 4) + h]; + X265_CHECK(part != 255, "Invalid block width %d height %d\n", width, height); + return part; +} + +inline int partitionFromLog2Size(int log2Size) +{ + X265_CHECK(2 <= log2Size && log2Size <= 6, 
"Invalid block size\n"); + return log2Size - 2; +} + +typedef int (*pixelcmp_t)(pixel *fenc, intptr_t fencstride, pixel *fref, intptr_t frefstride); // fenc is aligned +typedef int (*pixelcmp_ss_t)(int16_t *fenc, intptr_t fencstride, int16_t *fref, intptr_t frefstride); +typedef int (*pixelcmp_sp_t)(int16_t *fenc, intptr_t fencstride, pixel *fref, intptr_t frefstride); +typedef int (*pixel_ssd_s_t)(int16_t *fenc, intptr_t fencstride); +typedef void (*pixelcmp_x4_t)(pixel *fenc, pixel *fref0, pixel *fref1, pixel *fref2, pixel *fref3, intptr_t frefstride, int32_t *res); +typedef void (*pixelcmp_x3_t)(pixel *fenc, pixel *fref0, pixel *fref1, pixel *fref2, intptr_t frefstride, int32_t *res); +typedef void (*blockcpy_sp_t)(int bx, int by, int16_t *dst, intptr_t dstride, pixel *src, intptr_t sstride); // dst is aligned +typedef void (*blockcpy_sc_t)(int bx, int by, int16_t *dst, intptr_t dstride, uint8_t *src, intptr_t sstride); // dst is aligned +typedef void (*pixelsub_ps_t)(int bx, int by, int16_t *dst, intptr_t dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1); +typedef void (*pixelavg_pp_t)(pixel *dst, intptr_t dstride, pixel *src0, intptr_t sstride0, pixel *src1, intptr_t sstride1, int weight); +typedef void (*blockfill_s_t)(int16_t *dst, intptr_t dstride, int16_t val); + +typedef void (*intra_pred_t)(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter); +typedef void (*intra_allangs_t)(pixel *dst, pixel *above0, pixel *left0, pixel *above1, pixel *left1, int bLuma); + +typedef void (*cvt16to32_shl_t)(int32_t *dst, int16_t *src, intptr_t, int, int); +typedef void (*cvt16to32_shr_t)(int32_t *dst, int16_t *src, intptr_t, int, int); +typedef void (*cvt32to16_shr_t)(int16_t *dst, int32_t *src, intptr_t, int, int); +typedef void (*cvt32to16_shl_t)(int16_t *dst, int32_t *src, intptr_t, int); +typedef uint32_t (*copy_cnt_t)(int16_t* coeff, int16_t* residual, intptr_t stride); +typedef void (*copy_shr_t)(int16_t 
*dst, int16_t *src, intptr_t stride, int shift, int size); +typedef void (*copy_shl_t)(int16_t *dst, int16_t *src, intptr_t stride, int shift); + +typedef void (*dct_t)(int16_t *src, int32_t *dst, intptr_t stride); +typedef void (*idct_t)(int32_t *src, int16_t *dst, intptr_t stride); +typedef void (*denoiseDct_t)(int32_t* dctCoef, uint32_t* resSum, uint16_t* offset, int numCoeff); + +typedef void (*calcresidual_t)(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride); +typedef void (*calcrecon_t)(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred); +typedef void (*transpose_t)(pixel* dst, pixel* src, intptr_t stride); +typedef uint32_t (*quant_t)(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff); +typedef uint32_t (*nquant_t)(int32_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff); +typedef void (*dequant_scaling_t)(const int16_t* src, const int32_t *dequantCoef, int32_t* dst, int num, int mcqp_miper, int shift); +typedef void (*dequant_normal_t)(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift); +typedef int (*count_nonzero_t)(const int16_t *quantCoeff, int numCoeff); + +typedef void (*weightp_pp_t)(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset); +typedef void (*weightp_sp_t)(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset); +typedef void (*scale_t)(pixel *dst, pixel *src, intptr_t stride); +typedef void (*downscale_t)(pixel *src0, pixel *dstf, pixel *dsth, pixel *dstv, pixel *dstc, + intptr_t src_stride, intptr_t dst_stride, int width, int height); +typedef void (*extendCURowBorder_t)(pixel* txt, intptr_t stride, int width, int height, int marginX); +typedef void (*ssim_4x4x2_core_t)(const pixel *pix1, intptr_t stride1, const pixel *pix2, intptr_t 
stride2, int sums[2][4]); +typedef float (*ssim_end4_t)(int sum0[5][4], int sum1[5][4], int width); +typedef uint64_t (*var_t)(pixel *pix, intptr_t stride); +typedef void (*plane_copy_deinterleave_t)(pixel *dstu, intptr_t dstuStride, pixel *dstv, intptr_t dstvStride, pixel *src, intptr_t srcStride, int w, int h); + +typedef void (*filter_pp_t) (pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx); +typedef void (*filter_hps_t) (pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt); +typedef void (*filter_ps_t) (pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx); +typedef void (*filter_sp_t) (int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx); +typedef void (*filter_ss_t) (int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx); +typedef void (*filter_hv_pp_t) (pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY); +typedef void (*filter_p2s_t)(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height); + +typedef void (*copy_pp_t)(pixel *dst, intptr_t dstride, pixel *src, intptr_t sstride); // dst is aligned +typedef void (*copy_sp_t)(pixel *dst, intptr_t dstStride, int16_t *src, intptr_t srcStride); +typedef void (*copy_ps_t)(int16_t *dst, intptr_t dstStride, pixel *src, intptr_t srcStride); +typedef void (*copy_ss_t)(int16_t *dst, intptr_t dstStride, int16_t *src, intptr_t srcStride); + +typedef void (*pixel_sub_ps_t)(int16_t *dst, intptr_t dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1); +typedef void (*pixel_add_ps_t)(pixel *a, intptr_t dstride, pixel *b0, int16_t *b1, intptr_t sstride0, intptr_t sstride1); +typedef void (*addAvg_t)(int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride); + +typedef void (*saoCuOrgE0_t)(pixel * rec, int8_t * offsetEo, int width, int8_t signLeft); +typedef void 
(*planecopy_cp_t) (uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift); +typedef void (*planecopy_sp_t) (uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask); + +typedef void (*cutree_propagate_cost) (int *dst, uint16_t *propagateIn, int32_t *intraCosts, uint16_t *interCosts, int32_t *invQscales, double *fpsFactor, int len); + +/* Define a structure containing function pointers to optimized encoder + * primitives. Each pointer can reference either an assembly routine, + * a vectorized primitive, or a C function. */ +struct EncoderPrimitives +{ + pixelcmp_t sad[NUM_LUMA_PARTITIONS]; // Sum of Differences for each size + pixelcmp_x3_t sad_x3[NUM_LUMA_PARTITIONS]; // Sum of Differences 3x for each size + pixelcmp_x4_t sad_x4[NUM_LUMA_PARTITIONS]; // Sum of Differences 4x for each size + pixelcmp_t sse_pp[NUM_LUMA_PARTITIONS]; // Sum of Square Error (pixel, pixel) fenc alignment not assumed + pixelcmp_ss_t sse_ss[NUM_LUMA_PARTITIONS]; // Sum of Square Error (short, short) fenc alignment not assumed + pixelcmp_sp_t sse_sp[NUM_LUMA_PARTITIONS]; // Sum of Square Error (short, pixel) fenc alignment not assumed + pixel_ssd_s_t ssd_s[NUM_SQUARE_BLOCKS - 1]; // Sum of Square Error (short) fenc alignment not assumed + pixelcmp_t satd[NUM_LUMA_PARTITIONS]; // Sum of Transformed differences (HADAMARD) + pixelcmp_t sa8d_inter[NUM_LUMA_PARTITIONS]; // sa8d primitives for motion search partitions + pixelcmp_t sa8d[NUM_SQUARE_BLOCKS]; // sa8d primitives for square intra blocks + pixelcmp_t psy_cost_pp[NUM_SQUARE_BLOCKS]; // difference in AC energy between two blocks + pixelcmp_ss_t psy_cost_ss[NUM_SQUARE_BLOCKS]; + + blockfill_s_t blockfill_s[NUM_SQUARE_BLOCKS]; // block fill with value + cvt16to32_shl_t cvt16to32_shl; + cvt16to32_shr_t cvt16to32_shr[NUM_SQUARE_BLOCKS - 1]; + cvt32to16_shr_t cvt32to16_shr; + cvt32to16_shl_t cvt32to16_shl[NUM_SQUARE_BLOCKS - 1]; + copy_cnt_t 
copy_cnt[NUM_SQUARE_BLOCKS - 1]; + copy_shr_t copy_shr; + copy_shl_t copy_shl[NUM_SQUARE_BLOCKS - 1]; + + copy_pp_t luma_copy_pp[NUM_LUMA_PARTITIONS]; + copy_sp_t luma_copy_sp[NUM_LUMA_PARTITIONS]; + copy_ps_t luma_copy_ps[NUM_LUMA_PARTITIONS]; + copy_ss_t luma_copy_ss[NUM_LUMA_PARTITIONS]; + pixel_sub_ps_t luma_sub_ps[NUM_SQUARE_BLOCKS]; + pixel_add_ps_t luma_add_ps[NUM_SQUARE_BLOCKS]; + copy_pp_t square_copy_pp[NUM_SQUARE_BLOCKS]; + copy_sp_t square_copy_sp[NUM_SQUARE_BLOCKS]; + copy_ps_t square_copy_ps[NUM_SQUARE_BLOCKS]; + copy_ss_t square_copy_ss[NUM_SQUARE_BLOCKS]; + + filter_pp_t luma_hpp[NUM_LUMA_PARTITIONS]; + filter_hps_t luma_hps[NUM_LUMA_PARTITIONS]; + filter_pp_t luma_vpp[NUM_LUMA_PARTITIONS]; + filter_ps_t luma_vps[NUM_LUMA_PARTITIONS]; + filter_sp_t luma_vsp[NUM_LUMA_PARTITIONS]; + filter_ss_t luma_vss[NUM_LUMA_PARTITIONS]; + filter_hv_pp_t luma_hvpp[NUM_LUMA_PARTITIONS]; + filter_p2s_t luma_p2s; + filter_p2s_t chroma_p2s[X265_CSP_COUNT]; + + weightp_sp_t weight_sp; + weightp_pp_t weight_pp; + pixelavg_pp_t pixelavg_pp[NUM_LUMA_PARTITIONS]; + addAvg_t luma_addAvg[NUM_LUMA_PARTITIONS]; + + intra_pred_t intra_pred[NUM_INTRA_MODE][NUM_TR_SIZE]; + intra_allangs_t intra_pred_allangs[NUM_TR_SIZE]; + scale_t scale1D_128to64; + scale_t scale2D_64to32; + + dct_t dct[NUM_DCTS]; + idct_t idct[NUM_IDCTS]; + quant_t quant; + nquant_t nquant; + dequant_scaling_t dequant_scaling; + dequant_normal_t dequant_normal; + count_nonzero_t count_nonzero; + denoiseDct_t denoiseDct; + + calcresidual_t calcresidual[NUM_SQUARE_BLOCKS]; + transpose_t transpose[NUM_SQUARE_BLOCKS]; + + var_t var[NUM_SQUARE_BLOCKS]; + ssim_4x4x2_core_t ssim_4x4x2_core; + ssim_end4_t ssim_end_4; + + downscale_t frame_init_lowres_core; + plane_copy_deinterleave_t plane_copy_deinterleave_c; + extendCURowBorder_t extendRowBorder; + // sao primitives + saoCuOrgE0_t saoCuOrgE0; + planecopy_cp_t planecopy_cp; + planecopy_sp_t planecopy_sp; + + cutree_propagate_cost propagateCost; + + struct + { + 
filter_pp_t filter_vpp[NUM_LUMA_PARTITIONS]; + filter_ps_t filter_vps[NUM_LUMA_PARTITIONS]; + filter_sp_t filter_vsp[NUM_LUMA_PARTITIONS]; + filter_ss_t filter_vss[NUM_LUMA_PARTITIONS]; + filter_pp_t filter_hpp[NUM_LUMA_PARTITIONS]; + filter_hps_t filter_hps[NUM_LUMA_PARTITIONS]; + addAvg_t addAvg[NUM_LUMA_PARTITIONS]; + copy_pp_t copy_pp[NUM_LUMA_PARTITIONS]; + copy_sp_t copy_sp[NUM_LUMA_PARTITIONS]; + copy_ps_t copy_ps[NUM_LUMA_PARTITIONS]; + copy_ss_t copy_ss[NUM_LUMA_PARTITIONS]; + pixel_sub_ps_t sub_ps[NUM_SQUARE_BLOCKS]; + pixel_add_ps_t add_ps[NUM_SQUARE_BLOCKS]; + } chroma[4]; // X265_CSP_COUNT - do not want to include x265.h here +}; + +void extendPicBorder(pixel* recon, intptr_t stride, int width, int height, int marginX, int marginY); + +/* This copy of the table is what gets used by the encoder. + * It must be initialized before the encoder begins. */ +extern EncoderPrimitives primitives; + +void Setup_C_Primitives(EncoderPrimitives &p); +void Setup_Instrinsic_Primitives(EncoderPrimitives &p, int cpuMask); +void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask); +void Setup_Alias_Primitives(EncoderPrimitives &p); +} + +#endif // ifndef X265_PRIMITIVES_H diff --git a/source/common/quant.cpp b/source/common/quant.cpp new file mode 100644 index 0000000..387962c --- /dev/null +++ b/source/common/quant.cpp @@ -0,0 +1,1124 @@ +/***************************************************************************** + * Copyright (C) 2014 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#include "common.h" +#include "primitives.h" +#include "quant.h" +#include "framedata.h" +#include "entropy.h" +#include "yuv.h" +#include "cudata.h" +#include "contexts.h" + +using namespace x265; + +#define SIGN(x,y) ((x^(y >> 31))-(y >> 31)) + +namespace { + +struct coeffGroupRDStats +{ + int nnzBeforePos0; /* indicates coeff other than pos 0 are coded */ + int64_t codedLevelAndDist; /* distortion and level cost of coded coefficients */ + int64_t uncodedDist; /* uncoded distortion cost of coded coefficients */ + int64_t sigCost; /* cost of signaling significant coeff bitmap */ + int64_t sigCost0; /* cost of signaling sig coeff bit of coeff 0 */ +}; + +inline int fastMin(int x, int y) +{ + return y + ((x - y) & ((x - y) >> (sizeof(int) * CHAR_BIT - 1))); // min(x, y) +} + +inline int getICRate(uint32_t absLevel, int32_t diffLevel, const int *greaterOneBits, const int *levelAbsBits, uint32_t absGoRice, uint32_t c1c2Idx) +{ + X265_CHECK(c1c2Idx <= 3, "c1c2Idx check failure\n"); + X265_CHECK(absGoRice <= 4, "absGoRice check failure\n"); + if (!absLevel) + { + X265_CHECK(diffLevel < 0, "diffLevel check failure\n"); + return 0; + } + int rate = 0; + + if (diffLevel < 0) + { + X265_CHECK(absLevel <= 2, "absLevel check failure\n"); + rate += greaterOneBits[(absLevel == 2)]; + + if (absLevel == 2) + rate += levelAbsBits[0]; + } + else + { + uint32_t symbol = diffLevel; + const uint32_t maxVlc = g_goRiceRange[absGoRice]; + bool expGolomb = (symbol > maxVlc); + + if (expGolomb) + { + 
absLevel = symbol - maxVlc; + + // NOTE: mapping to x86 hardware instruction BSR + unsigned long size; + CLZ32(size, absLevel); + int egs = size * 2 + 1; + + rate += egs << 15; + + // NOTE: in here, expGolomb=true means (symbol >= maxVlc + 1) + X265_CHECK(fastMin(symbol, (maxVlc + 1)) == (int)maxVlc + 1, "min check failure\n"); + symbol = maxVlc + 1; + } + + uint32_t prefLen = (symbol >> absGoRice) + 1; + uint32_t numBins = fastMin(prefLen + absGoRice, 8 /* g_goRicePrefixLen[absGoRice] + absGoRice */); + + rate += numBins << 15; + + if (c1c2Idx & 1) + rate += greaterOneBits[1]; + + if (c1c2Idx == 3) + rate += levelAbsBits[1]; + } + return rate; +} + +/* Calculates the cost for specific absolute transform level */ +inline uint32_t getICRateCost(uint32_t absLevel, int32_t diffLevel, const int *greaterOneBits, const int *levelAbsBits, uint32_t absGoRice, uint32_t c1c2Idx) +{ + X265_CHECK(absLevel, "absLevel should not be zero\n"); + + if (diffLevel < 0) + { + X265_CHECK((absLevel == 1) || (absLevel == 2), "absLevel range check failure\n"); + + uint32_t rate = greaterOneBits[(absLevel == 2)]; + if (absLevel == 2) + rate += levelAbsBits[0]; + return rate; + } + else + { + uint32_t rate; + uint32_t symbol = diffLevel; + if ((symbol >> absGoRice) < COEF_REMAIN_BIN_REDUCTION) + { + uint32_t length = symbol >> absGoRice; + rate = (length + 1 + absGoRice) << 15; + } + else + { + uint32_t length = 0; + symbol = (symbol >> absGoRice) - COEF_REMAIN_BIN_REDUCTION; + if (symbol) + { + unsigned long idx; + CLZ32(idx, symbol + 1); + length = idx; + } + + rate = (COEF_REMAIN_BIN_REDUCTION + length + absGoRice + 1 + length) << 15; + } + if (c1c2Idx & 1) + rate += greaterOneBits[1]; + if (c1c2Idx == 3) + rate += levelAbsBits[1]; + return rate; + } +} + +} + +Quant::Quant() +{ + m_resiDctCoeff = NULL; + m_fencDctCoeff = NULL; + m_fencShortBuf = NULL; + m_frameNr = NULL; + m_nr = NULL; +} + +bool Quant::init(bool useRDOQ, double psyScale, const ScalingList& scalingList, Entropy& 
entropy) +{ + m_entropyCoder = &entropy; + m_useRDOQ = useRDOQ; + m_psyRdoqScale = (int64_t)(psyScale * 256.0); + m_scalingList = &scalingList; + m_resiDctCoeff = X265_MALLOC(int32_t, MAX_TR_SIZE * MAX_TR_SIZE * 2); + m_fencDctCoeff = m_resiDctCoeff + (MAX_TR_SIZE * MAX_TR_SIZE); + m_fencShortBuf = X265_MALLOC(int16_t, MAX_TR_SIZE * MAX_TR_SIZE); + + return m_resiDctCoeff && m_fencShortBuf; +} + +bool Quant::allocNoiseReduction(const x265_param& param) +{ + m_frameNr = X265_MALLOC(NoiseReduction, param.frameNumThreads); + if (m_frameNr) + memset(m_frameNr, 0, sizeof(NoiseReduction) * param.frameNumThreads); + else + return false; + return true; +} + +Quant::~Quant() +{ + X265_FREE(m_frameNr); + X265_FREE(m_resiDctCoeff); + X265_FREE(m_fencShortBuf); +} + +void Quant::setQPforQuant(const CUData& ctu) +{ + m_nr = m_frameNr ? &m_frameNr[ctu.m_encData->m_frameEncoderID] : NULL; + int qpy = ctu.m_qp[0]; + m_qpParam[TEXT_LUMA].setQpParam(qpy + QP_BD_OFFSET); + setChromaQP(qpy + ctu.m_slice->m_pps->chromaCbQpOffset, TEXT_CHROMA_U, ctu.m_chromaFormat); + setChromaQP(qpy + ctu.m_slice->m_pps->chromaCrQpOffset, TEXT_CHROMA_V, ctu.m_chromaFormat); +} + +void Quant::setChromaQP(int qpin, TextType ttype, int chFmt) +{ + int qp = Clip3(-QP_BD_OFFSET, 57, qpin); + if (qp >= 30) + { + if (chFmt == X265_CSP_I420) + qp = g_chromaScale[qp]; + else + qp = X265_MIN(qp, 51); + } + m_qpParam[ttype].setQpParam(qp + QP_BD_OFFSET); +} + +/* To minimize the distortion only. 
No rate is considered */ +uint32_t Quant::signBitHidingHDQ(int16_t* coeff, int32_t* deltaU, uint32_t numSig, const TUEntropyCodingParameters &codeParams) +{ + const uint32_t log2TrSizeCG = codeParams.log2TrSizeCG; + const uint16_t *scan = codeParams.scan; + bool lastCG = true; + + for (int cg = (1 << (log2TrSizeCG * 2)) - 1; cg >= 0; cg--) + { + int cgStartPos = cg << LOG2_SCAN_SET_SIZE; + int n; + + for (n = SCAN_SET_SIZE - 1; n >= 0; --n) + if (coeff[scan[n + cgStartPos]]) + break; + if (n < 0) + continue; + + int lastNZPosInCG = n; + + for (n = 0;; n++) + if (coeff[scan[n + cgStartPos]]) + break; + + int firstNZPosInCG = n; + + if (lastNZPosInCG - firstNZPosInCG >= SBH_THRESHOLD) + { + uint32_t signbit = coeff[scan[cgStartPos + firstNZPosInCG]] > 0 ? 0 : 1; + uint32_t absSum = 0; + + for (n = firstNZPosInCG; n <= lastNZPosInCG; n++) + absSum += coeff[scan[n + cgStartPos]]; + + if (signbit != (absSum & 0x1)) // compare signbit with sum_parity + { + int minCostInc = MAX_INT, minPos = -1, curCost = MAX_INT; + int16_t finalChange = 0, curChange = 0; + + for (n = (lastCG ? lastNZPosInCG : SCAN_SET_SIZE - 1); n >= 0; --n) + { + uint32_t blkPos = scan[n + cgStartPos]; + if (coeff[blkPos]) + { + if (deltaU[blkPos] > 0) + { + curCost = -deltaU[blkPos]; + curChange = 1; + } + else + { + if (n == firstNZPosInCG && abs(coeff[blkPos]) == 1) + curCost = MAX_INT; + else + { + curCost = deltaU[blkPos]; + curChange = -1; + } + } + } + else + { + if (n < firstNZPosInCG) + { + uint32_t thisSignBit = m_resiDctCoeff[blkPos] >= 0 ? 
0 : 1; + if (thisSignBit != signbit) + curCost = MAX_INT; + else + { + curCost = -deltaU[blkPos]; + curChange = 1; + } + } + else + { + curCost = -deltaU[blkPos]; + curChange = 1; + } + } + + if (curCost < minCostInc) + { + minCostInc = curCost; + finalChange = curChange; + minPos = blkPos; + } + } + + /* do not allow change to violate coeff clamp */ + if (coeff[minPos] == 32767 || coeff[minPos] == -32768) + finalChange = -1; + + if (!coeff[minPos]) + numSig++; + else if (finalChange == -1 && abs(coeff[minPos]) == 1) + numSig--; + + if (m_resiDctCoeff[minPos] >= 0) + coeff[minPos] += finalChange; + else + coeff[minPos] -= finalChange; + } + } + + lastCG = false; + } + + return numSig; +} + +uint32_t Quant::transformNxN(CUData& cu, pixel* fenc, uint32_t fencStride, int16_t* residual, uint32_t stride, + coeff_t* coeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool useTransformSkip) +{ + if (cu.m_tqBypass[absPartIdx]) + { + X265_CHECK(log2TrSize >= 2 && log2TrSize <= 5, "Block size mistake!\n"); + return primitives.copy_cnt[log2TrSize - 2](coeff, residual, stride); + } + + bool isLuma = ttype == TEXT_LUMA; + bool usePsy = m_psyRdoqScale && isLuma && !useTransformSkip; + bool isIntra = cu.m_predMode[absPartIdx] == MODE_INTRA; + int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; // Represents scaling through forward transform + int trSize = 1 << log2TrSize; + + X265_CHECK((cu.m_slice->m_sps->quadtreeTULog2MaxSize >= log2TrSize), "transform size too large\n"); + if (useTransformSkip) + { +#if X265_DEPTH <= 10 + primitives.cvt16to32_shl(m_resiDctCoeff, residual, stride, transformShift, trSize); +#else + if (transformShift >= 0) + primitives.cvt16to32_shl(m_resiDctCoeff, residual, stride, transformShift, trSize); + else + { + int shift = -transformShift; + int offset = (1 << (shift - 1)); + primitives.cvt16to32_shr[log2TrSize - 2](m_resiDctCoeff, residual, stride, shift, offset); + } +#endif + } + else + { + const uint32_t sizeIdx = 
log2TrSize - 2; + int useDST = !sizeIdx && isLuma && isIntra; + int index = DCT_4x4 + sizeIdx - useDST; + + primitives.dct[index](residual, m_resiDctCoeff, stride); + + /* NOTE: if RDOQ is disabled globally, psy-rdoq is also disabled, so + * there is no risk of performing this DCT unnecessarily */ + if (usePsy) + { + /* perform DCT on source pixels for psy-rdoq */ + primitives.square_copy_ps[sizeIdx](m_fencShortBuf, trSize, fenc, fencStride); + primitives.dct[index](m_fencShortBuf, m_fencDctCoeff, trSize); + } + + if (m_nr && !isIntra) + { + /* denoise is not applied to intra residual, so DST can be ignored */ + int cat = sizeIdx + 4 * !isLuma; + int numCoeff = 1 << (log2TrSize * 2); + primitives.denoiseDct(m_resiDctCoeff, m_nr->residualSum[cat], m_nr->offsetDenoise[cat], numCoeff); + m_nr->count[cat]++; + } + } + + if (m_useRDOQ) + return rdoQuant(cu, coeff, log2TrSize, ttype, absPartIdx, usePsy); + else + { + int deltaU[32 * 32]; + + int scalingListType = ttype + (isLuma ? 3 : 0); + int rem = m_qpParam[ttype].rem; + int per = m_qpParam[ttype].per; + int32_t *quantCoeff = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem]; + + int qbits = QUANT_SHIFT + per + transformShift; + int add = (cu.m_slice->m_sliceType == I_SLICE ? 
171 : 85) << (qbits - 9); + int numCoeff = 1 << (log2TrSize * 2); + + uint32_t numSig = primitives.quant(m_resiDctCoeff, quantCoeff, deltaU, coeff, qbits, add, numCoeff); + + if (numSig >= 2 && cu.m_slice->m_pps->bSignHideEnabled) + { + TUEntropyCodingParameters codeParams; + cu.getTUEntropyCodingParameters(codeParams, absPartIdx, log2TrSize, isLuma); + return signBitHidingHDQ(coeff, deltaU, numSig, codeParams); + } + else + return numSig; + } +} + +void Quant::invtransformNxN(bool transQuantBypass, int16_t* residual, uint32_t stride, coeff_t* coeff, + uint32_t log2TrSize, TextType ttype, bool bIntra, bool useTransformSkip, uint32_t numSig) +{ + if (transQuantBypass) + { + primitives.copy_shl[log2TrSize - 2](residual, coeff, stride, 0); + return; + } + + // Values need to pass as input parameter in dequant + int rem = m_qpParam[ttype].rem; + int per = m_qpParam[ttype].per; + int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; + int shift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift; + int numCoeff = 1 << (log2TrSize * 2); + + if (m_scalingList->m_bEnabled) + { + int scalingListType = (bIntra ? 
0 : 3) + ttype; + int32_t *dequantCoef = m_scalingList->m_dequantCoef[log2TrSize - 2][scalingListType][rem]; + primitives.dequant_scaling(coeff, dequantCoef, m_resiDctCoeff, numCoeff, per, shift); + } + else + { + int scale = m_scalingList->s_invQuantScales[rem] << per; + primitives.dequant_normal(coeff, m_resiDctCoeff, numCoeff, scale, shift); + } + + if (useTransformSkip) + { + int trSize = 1 << log2TrSize; + +#if X265_DEPTH <= 10 + primitives.cvt32to16_shr(residual, m_resiDctCoeff, stride, transformShift, trSize); +#else + if (transformShift > 0) + primitives.cvt32to16_shr(residual, m_resiDctCoeff, stride, transformShift, trSize); + else + primitives.cvt32to16_shl[log2TrSize - 2](residual, m_resiDctCoeff, stride, -transformShift); +#endif + } + else + { + const uint32_t sizeIdx = log2TrSize - 2; + int useDST = !sizeIdx && ttype == TEXT_LUMA && bIntra; + + X265_CHECK((int)numSig == primitives.count_nonzero(coeff, 1 << (log2TrSize * 2)), "numSig differ\n"); + + // DC only + if (numSig == 1 && coeff[0] != 0 && !useDST) + { + const int shift_1st = 7; + const int add_1st = 1 << (shift_1st - 1); + const int shift_2nd = 12 - (X265_DEPTH - 8); + const int add_2nd = 1 << (shift_2nd - 1); + + int dc_val = (((m_resiDctCoeff[0] * 64 + add_1st) >> shift_1st) * 64 + add_2nd) >> shift_2nd; + primitives.blockfill_s[sizeIdx](residual, stride, (int16_t)dc_val); + return; + } + + primitives.idct[IDCT_4x4 + sizeIdx - useDST](m_resiDctCoeff, residual, stride); + } +} + +/* Rate distortion optimized quantization for entropy coding engines using + * probability models like CABAC */ +uint32_t Quant::rdoQuant(CUData& cu, int16_t* dstCoeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool usePsy) +{ + int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */ + int scalingListType = (cu.isIntra(absPartIdx) ? 
0 : 3) + ttype; + + X265_CHECK(scalingListType < 6, "scaling list type out of range\n"); + + int rem = m_qpParam[ttype].rem; + int per = m_qpParam[ttype].per; + int qbits = QUANT_SHIFT + per + transformShift; /* Right shift of non-RDOQ quantizer level = (coeff*Q + offset)>>q_bits */ + int add = (1 << (qbits - 1)); + int32_t *qCoef = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem]; + + int numCoeff = 1 << (log2TrSize * 2); + + uint32_t numSig = primitives.nquant(m_resiDctCoeff, qCoef, dstCoeff, qbits, add, numCoeff); + + X265_CHECK((int)numSig == primitives.count_nonzero(dstCoeff, 1 << (log2TrSize * 2)), "numSig differ\n"); + if (!numSig) + return 0; + + uint32_t trSize = 1 << log2TrSize; + int64_t lambda2 = m_qpParam[ttype].lambda2; + int64_t psyScale = (m_psyRdoqScale * m_qpParam[ttype].lambda); + + /* unquant constants for measuring distortion. Scaling list quant coefficients have a (1 << 4) + * scale applied that must be removed during unquant. Note that in real dequant there is clipping + * at several stages. We skip the clipping for simplicity when measuring RD cost */ + int32_t *unquantScale = m_scalingList->m_dequantCoef[log2TrSize - 2][scalingListType][rem]; + int unquantShift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift + (m_scalingList->m_bEnabled ? 4 : 0); + int unquantRound = (unquantShift > per) ? 
1 << (unquantShift - per - 1) : 0; + int scaleBits = SCALE_BITS - 2 * transformShift; + +#define UNQUANT(lvl) (((lvl) * (unquantScale[blkPos] << per) + unquantRound) >> unquantShift) +#define SIGCOST(bits) ((lambda2 * (bits)) >> 8) +#define RDCOST(d, bits) ((((int64_t)d * d) << scaleBits) + SIGCOST(bits)) +#define PSYVALUE(rec) ((psyScale * (rec)) >> (16 - scaleBits)) + + int64_t costCoeff[32 * 32]; /* d*d + lambda * bits */ + int64_t costUncoded[32 * 32]; /* d*d + lambda * 0 */ + int64_t costSig[32 * 32]; /* lambda * bits */ + + int rateIncUp[32 * 32]; /* signal overhead of increasing level */ + int rateIncDown[32 * 32]; /* signal overhead of decreasing level */ + int sigRateDelta[32 * 32]; /* signal difference between zero and non-zero */ + + int64_t costCoeffGroupSig[MLS_GRP_NUM]; /* lambda * bits of group coding cost */ + uint64_t sigCoeffGroupFlag64 = 0; + + uint32_t ctxSet = 0; + int c1 = 1; + int c2 = 0; + uint32_t goRiceParam = 0; + uint32_t c1Idx = 0; + uint32_t c2Idx = 0; + int cgLastScanPos = -1; + int lastScanPos = -1; + const uint32_t cgSize = (1 << MLS_CG_SIZE); /* 4x4 num coef = 16 */ + bool bIsLuma = ttype == TEXT_LUMA; + + /* total rate distortion cost of transform block, as CBF=0 */ + int64_t totalUncodedCost = 0; + + /* Total rate distortion cost of this transform block, counting te distortion of uncoded blocks, + * the distortion and signal cost of coded blocks, and the coding cost of significant + * coefficient and coefficient group bitmaps */ + int64_t totalRdCost = 0; + + TUEntropyCodingParameters codeParams; + cu.getTUEntropyCodingParameters(codeParams, absPartIdx, log2TrSize, bIsLuma); + const uint32_t cgNum = 1 << (codeParams.log2TrSizeCG * 2); + + /* TODO: update bit estimates if dirty */ + EstBitsSbac& estBitsSbac = m_entropyCoder->m_estBitsSbac; + + uint32_t scanPos; + coeffGroupRDStats cgRdStats; + + /* iterate over coding groups in reverse scan order */ + for (int cgScanPos = cgNum - 1; cgScanPos >= 0; cgScanPos--) + { + const 
uint32_t cgBlkPos = codeParams.scanCG[cgScanPos]; + const uint32_t cgPosY = cgBlkPos >> codeParams.log2TrSizeCG; + const uint32_t cgPosX = cgBlkPos - (cgPosY << codeParams.log2TrSizeCG); + const uint64_t cgBlkPosMask = ((uint64_t)1 << cgBlkPos); + memset(&cgRdStats, 0, sizeof(coeffGroupRDStats)); + + const int patternSigCtx = calcPatternSigCtx(sigCoeffGroupFlag64, cgPosX, cgPosY, codeParams.log2TrSizeCG); + + /* iterate over coefficients in each group in reverse scan order */ + for (int scanPosinCG = cgSize - 1; scanPosinCG >= 0; scanPosinCG--) + { + scanPos = (cgScanPos << MLS_CG_SIZE) + scanPosinCG; + uint32_t blkPos = codeParams.scan[scanPos]; + uint16_t maxAbsLevel = (int16_t)abs(dstCoeff[blkPos]); /* abs(quantized coeff) */ + int signCoef = m_resiDctCoeff[blkPos]; /* pre-quantization DCT coeff */ + int predictedCoef = m_fencDctCoeff[blkPos] - signCoef; /* predicted DCT = source DCT - residual DCT*/ + + /* RDOQ measures distortion as the squared difference between the unquantized coded level + * and the original DCT coefficient. The result is shifted scaleBits to account for the + * FIX15 nature of the CABAC cost tables minus the forward transform scale */ + + /* cost of not coding this coefficient (all distortion, no signal bits) */ + costUncoded[scanPos] = (int64_t)(signCoef * signCoef) << scaleBits; + if (usePsy && blkPos) + /* when no residual coefficient is coded, predicted coef == recon coef */ + costUncoded[scanPos] -= PSYVALUE(predictedCoef); + + totalUncodedCost += costUncoded[scanPos]; + + if (maxAbsLevel && lastScanPos < 0) + { + /* remember the first non-zero coef found in this reverse scan as the last pos */ + lastScanPos = scanPos; + ctxSet = (scanPos < SCAN_SET_SIZE || !bIsLuma) ? 
0 : 2; + cgLastScanPos = cgScanPos; + } + + if (lastScanPos < 0) + { + /* coefficients after lastNZ have no distortion signal cost */ + costCoeff[scanPos] = 0; + costSig[scanPos] = 0; + + /* No non-zero coefficient yet found, but this does not mean + * there is no uncoded-cost for this coefficient. Pre- + * quantization the coefficient may have been non-zero */ + totalRdCost += costUncoded[scanPos]; + } + else + { + const uint32_t c1c2Idx = ((c1Idx - 8) >> (sizeof(int) * CHAR_BIT - 1)) + (((-(int)c2Idx) >> (sizeof(int) * CHAR_BIT - 1)) + 1) * 2; + const uint32_t baseLevel = ((uint32_t)0xD9 >> (c1c2Idx * 2)) & 3; // {1, 2, 1, 3} + + X265_CHECK(!!((int)c1Idx < C1FLAG_NUMBER) == (int)((c1Idx - 8) >> (sizeof(int) * CHAR_BIT - 1)), "scan validation 1\n"); + X265_CHECK(!!(c2Idx == 0) == ((-(int)c2Idx) >> (sizeof(int) * CHAR_BIT - 1)) + 1, "scan validation 2\n"); + X265_CHECK((int)baseLevel == ((c1Idx < C1FLAG_NUMBER) ? (2 + (c2Idx == 0)) : 1), "scan validation 3\n"); + + // coefficient level estimation + const uint32_t oneCtx = 4 * ctxSet + c1; + const uint32_t absCtx = ctxSet + c2; + const int *greaterOneBits = estBitsSbac.greaterOneBits[oneCtx]; + const int *levelAbsBits = estBitsSbac.levelAbsBits[absCtx]; + + uint16_t level = 0; + uint32_t sigCoefBits = 0; + costCoeff[scanPos] = MAX_INT64; + + if ((int)scanPos == lastScanPos) + sigRateDelta[blkPos] = 0; + else + { + const uint32_t ctxSig = getSigCtxInc(patternSigCtx, log2TrSize, trSize, blkPos, bIsLuma, codeParams.firstSignificanceMapContext); + if (maxAbsLevel < 3) + { + /* set default costs to uncoded costs */ + costSig[scanPos] = SIGCOST(estBitsSbac.significantBits[ctxSig][0]); + costCoeff[scanPos] = costUncoded[scanPos] + costSig[scanPos]; + } + sigRateDelta[blkPos] = estBitsSbac.significantBits[ctxSig][1] - estBitsSbac.significantBits[ctxSig][0]; + sigCoefBits = estBitsSbac.significantBits[ctxSig][1]; + } + if (maxAbsLevel) + { + uint16_t minAbsLevel = X265_MAX(maxAbsLevel - 1, 1); + for (uint16_t lvl = 
maxAbsLevel; lvl >= minAbsLevel; lvl--) + { + uint32_t levelBits = getICRateCost(lvl, lvl - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Idx) + IEP_RATE; + + int unquantAbsLevel = UNQUANT(lvl); + int d = abs(signCoef) - unquantAbsLevel; + int64_t curCost = RDCOST(d, sigCoefBits + levelBits); + + /* Psy RDOQ: bias in favor of higher AC coefficients in the reconstructed frame */ + if (usePsy && blkPos) + { + int reconCoef = abs(unquantAbsLevel + SIGN(predictedCoef, signCoef)); + curCost -= PSYVALUE(reconCoef); + } + + if (curCost < costCoeff[scanPos]) + { + level = lvl; + costCoeff[scanPos] = curCost; + costSig[scanPos] = SIGCOST(sigCoefBits); + } + } + } + + dstCoeff[blkPos] = level; + totalRdCost += costCoeff[scanPos]; + + /* record costs for sign-hiding performed at the end */ + if (level) + { + int rateNow = getICRate(level, level - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Idx); + rateIncUp[blkPos] = getICRate(level + 1, level + 1 - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Idx) - rateNow; + rateIncDown[blkPos] = getICRate(level - 1, level - 1 - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Idx) - rateNow; + } + else + { + rateIncUp[blkPos] = greaterOneBits[0]; + rateIncDown[blkPos] = 0; + } + + /* Update CABAC estimation state */ + if (level >= baseLevel && goRiceParam < 4 && level > (3U << goRiceParam)) + goRiceParam++; + + c1Idx -= (-(int32_t)level) >> 31; + + /* update bin model */ + if (level > 1) + { + c1 = 0; + c2 += (uint32_t)(c2 - 2) >> 31; + c2Idx++; + } + else if ((c1 < 3) && (c1 > 0) && level) + c1++; + + /* context set update */ + if (!(scanPos % SCAN_SET_SIZE) && scanPos) + { + c2 = 0; + goRiceParam = 0; + + c1Idx = 0; + c2Idx = 0; + ctxSet = (scanPos == SCAN_SET_SIZE || !bIsLuma) ? 
0 : 2; + X265_CHECK(c1 >= 0, "c1 is negative\n"); + ctxSet -= ((int32_t)(c1 - 1) >> 31); + c1 = 1; + } + } + + cgRdStats.sigCost += costSig[scanPos]; + if (!scanPosinCG) + cgRdStats.sigCost0 = costSig[scanPos]; + + if (dstCoeff[blkPos]) + { + sigCoeffGroupFlag64 |= cgBlkPosMask; + cgRdStats.codedLevelAndDist += costCoeff[scanPos] - costSig[scanPos]; + cgRdStats.uncodedDist += costUncoded[scanPos]; + cgRdStats.nnzBeforePos0 += scanPosinCG; + } + } /* end for (scanPosinCG) */ + + costCoeffGroupSig[cgScanPos] = 0; + + if (cgLastScanPos < 0) + { + /* nothing to do at this point */ + } + else if (!cgScanPos || cgScanPos == cgLastScanPos) + { + /* coeff group 0 is implied to be present, no signal cost */ + /* coeff group with last NZ is implied to be present, handled below */ + } + else if (sigCoeffGroupFlag64 & cgBlkPosMask) + { + if (!cgRdStats.nnzBeforePos0) + { + /* if only coeff 0 in this CG is coded, its significant coeff bit is implied */ + totalRdCost -= cgRdStats.sigCost0; + cgRdStats.sigCost -= cgRdStats.sigCost0; + } + + /* there are coded coefficients in this group, but now we include the signaling cost + * of the significant coefficient group flag and evaluate whether the RD cost of the + * coded group is more than the RD cost of the uncoded group */ + + uint32_t sigCtx = getSigCoeffGroupCtxInc(sigCoeffGroupFlag64, cgPosX, cgPosY, codeParams.log2TrSizeCG); + + int64_t costZeroCG = totalRdCost + SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][0]); + costZeroCG += cgRdStats.uncodedDist; /* add distortion for resetting non-zero levels to zero levels */ + costZeroCG -= cgRdStats.codedLevelAndDist; /* remove distortion and level cost of coded coefficients */ + costZeroCG -= cgRdStats.sigCost; /* remove signaling cost of significant coeff bitmap */ + + costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][1]); + totalRdCost += costCoeffGroupSig[cgScanPos]; /* add the cost of 1 bit in significant CG bitmap */ + + if (costZeroCG 
< totalRdCost) + { + sigCoeffGroupFlag64 &= ~cgBlkPosMask; + totalRdCost = costZeroCG; + costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][0]); + + /* reset all coeffs to 0. UNCODE THIS COEFF GROUP! */ + for (int scanPosinCG = cgSize - 1; scanPosinCG >= 0; scanPosinCG--) + { + scanPos = cgScanPos * cgSize + scanPosinCG; + uint32_t blkPos = codeParams.scan[scanPos]; + if (dstCoeff[blkPos]) + { + costCoeff[scanPos] = costUncoded[scanPos]; + costSig[scanPos] = 0; + } + dstCoeff[blkPos] = 0; + } + } + } + else + { + /* there were no coded coefficients in this coefficient group */ + uint32_t ctxSig = getSigCoeffGroupCtxInc(sigCoeffGroupFlag64, cgPosX, cgPosY, codeParams.log2TrSizeCG); + costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[ctxSig][0]); + totalRdCost += costCoeffGroupSig[cgScanPos]; /* add cost of 0 bit in significant CG bitmap */ + totalRdCost -= cgRdStats.sigCost; /* remove cost of significant coefficient bitmap */ + } + } /* end for (cgScanPos) */ + + X265_CHECK(lastScanPos >= 0, "numSig non zero, but no coded CG\n"); + + /* calculate RD cost of uncoded block CBF=0, and add cost of CBF=1 to total */ + int64_t bestCost; + if (!cu.isIntra(absPartIdx) && bIsLuma && !cu.m_tuDepth[absPartIdx]) + { + bestCost = totalUncodedCost + SIGCOST(estBitsSbac.blockRootCbpBits[0]); + totalRdCost += SIGCOST(estBitsSbac.blockRootCbpBits[1]); + } + else + { + int ctx = ctxCbf[ttype][cu.m_tuDepth[absPartIdx]]; + bestCost = totalUncodedCost + SIGCOST(estBitsSbac.blockCbpBits[ctx][0]); + totalRdCost += SIGCOST(estBitsSbac.blockCbpBits[ctx][1]); + } + + /* This loop starts with the last non-zero found in the first loop and then refines this last + * non-zero by measuring the true RD cost of the last NZ at this position, and then the RD costs + * at all previous coefficients until a coefficient greater than 1 is encountered or we run out + * of coefficients to evaluate. 
This will factor in the cost of coding empty groups and empty + * coeff prior to the last NZ. The base best cost is the RD cost of CBF=0 */ + int bestLastIdx = 0; + bool foundLast = false; + for (int cgScanPos = cgLastScanPos; cgScanPos >= 0 && !foundLast; cgScanPos--) + { + if (!cgScanPos || cgScanPos == cgLastScanPos) + { + /* the presence of these coefficient groups are inferred, they have no bit in + * sigCoeffGroupFlag64 and no saved costCoeffGroupSig[] cost */ + } + else if (sigCoeffGroupFlag64 & (1ULL << codeParams.scanCG[cgScanPos])) + { + /* remove cost of significant coeff group flag, the group's presence would be inferred + * from lastNZ if it were present in this group */ + totalRdCost -= costCoeffGroupSig[cgScanPos]; + } + else + { + /* remove cost of signaling this empty group as not present */ + totalRdCost -= costCoeffGroupSig[cgScanPos]; + continue; + } + + for (int scanPosinCG = cgSize - 1; scanPosinCG >= 0; scanPosinCG--) + { + scanPos = cgScanPos * cgSize + scanPosinCG; + if ((int)scanPos > lastScanPos) + continue; + + /* if the coefficient was coded, measure the RD cost of it as the last non-zero and then + * continue as if it were uncoded. If the coefficient was already uncoded, remove the + * cost of signaling it as not-significant */ + uint32_t blkPos = codeParams.scan[scanPos]; + if (dstCoeff[blkPos]) + { + /* Swap the cost of signaling its significant coeff bit with the cost of + * signaling its lastNZ pos */ + uint32_t posY = blkPos >> log2TrSize; + uint32_t posX = blkPos - (posY << log2TrSize); + uint32_t bitsLastNZ = codeParams.scanType == SCAN_VER ? 
getRateLast(posY, posX) : getRateLast(posX, posY); + int64_t costAsLast = totalRdCost - costSig[scanPos] + SIGCOST(bitsLastNZ); + + if (costAsLast < bestCost) + { + bestLastIdx = scanPos + 1; + bestCost = costAsLast; + } + if (dstCoeff[blkPos] > 1) + { + foundLast = true; + break; + } + + totalRdCost -= costCoeff[scanPos]; + totalRdCost += costUncoded[scanPos]; + } + else + totalRdCost -= costSig[scanPos]; + } + } + + /* recount non-zero coefficients and re-apply sign of DCT coef */ + numSig = 0; + for (int pos = 0; pos < bestLastIdx; pos++) + { + int blkPos = codeParams.scan[pos]; + int level = dstCoeff[blkPos]; + numSig += (level != 0); + + uint32_t mask = (int32_t)m_resiDctCoeff[blkPos] >> 31; + dstCoeff[blkPos] = (int16_t)((level ^ mask) - mask); + } + + /* clean uncoded coefficients */ + for (int pos = bestLastIdx; pos <= lastScanPos; pos++) + dstCoeff[codeParams.scan[pos]] = 0; + + /* rate-distortion based sign-hiding */ + if (cu.m_slice->m_pps->bSignHideEnabled && numSig >= 2) + { + int lastCG = true; + for (int subSet = cgLastScanPos; subSet >= 0; subSet--) + { + int subPos = subSet << LOG2_SCAN_SET_SIZE; + int n; + + /* measure distance between first and last non-zero coef in this + * coding group */ + for (n = SCAN_SET_SIZE - 1; n >= 0; --n) + if (dstCoeff[codeParams.scan[n + subPos]]) + break; + if (n < 0) + continue; + + int lastNZPosInCG = n; + + for (n = 0;; n++) + if (dstCoeff[codeParams.scan[n + subPos]]) + break; + + int firstNZPosInCG = n; + + if (lastNZPosInCG - firstNZPosInCG >= SBH_THRESHOLD) + { + uint32_t signbit = (dstCoeff[codeParams.scan[subPos + firstNZPosInCG]] > 0 ? 0 : 1); + int absSum = 0; + + for (n = firstNZPosInCG; n <= lastNZPosInCG; n++) + absSum += dstCoeff[codeParams.scan[n + subPos]]; + + if (signbit != (absSum & 1U)) + { + /* We must find a coeff to toggle up or down so the sign bit of the first non-zero coeff + * is properly implied. 
Note dstCoeff[] are signed by this point but curChange and + * finalChange imply absolute levels (+1 is away from zero, -1 is towards zero) */ + + int64_t minCostInc = MAX_INT64, curCost = MAX_INT64; + int minPos = -1; + int16_t finalChange = 0, curChange = 0; + + for (n = (lastCG ? lastNZPosInCG : SCAN_SET_SIZE - 1); n >= 0; --n) + { + uint32_t blkPos = codeParams.scan[n + subPos]; + int signCoef = m_resiDctCoeff[blkPos]; /* pre-quantization DCT coeff */ + int absLevel = abs(dstCoeff[blkPos]); + + int d = abs(signCoef) - UNQUANT(absLevel); + int64_t origDist = (((int64_t)d * d)) << scaleBits; + +#define DELTARDCOST(d, deltabits) ((((int64_t)d * d) << scaleBits) - origDist + ((lambda2 * (int64_t)(deltabits)) >> 8)) + + if (dstCoeff[blkPos]) + { + d = abs(signCoef) - UNQUANT(absLevel + 1); + int64_t costUp = DELTARDCOST(d, rateIncUp[blkPos]); + + /* if decrementing would make the coeff 0, we can include the + * significant coeff flag cost savings */ + d = abs(signCoef) - UNQUANT(absLevel - 1); + bool isOne = abs(dstCoeff[blkPos]) == 1; + int downBits = rateIncDown[blkPos] - (isOne ? (IEP_RATE + sigRateDelta[blkPos]) : 0); + int64_t costDown = DELTARDCOST(d, downBits); + + if (lastCG && lastNZPosInCG == n && isOne) + costDown -= 4 * IEP_RATE; + + if (costUp < costDown) + { + curCost = costUp; + curChange = 1; + } + else + { + curChange = -1; + if (n == firstNZPosInCG && isOne) + curCost = MAX_INT64; + else + curCost = costDown; + } + } + else if (n < firstNZPosInCG && signbit != (signCoef >= 0 ? 0 : 1U)) + { + /* don't try to make a new coded coeff before the first coeff if its + * sign would be different than the first coeff, the inferred sign would + * still be wrong and we'd have to do this again. 
*/ + curCost = MAX_INT64; + } + else + { + /* evaluate changing an uncoded coeff 0 to a coded coeff +/-1 */ + d = abs(signCoef) - UNQUANT(1); + curCost = DELTARDCOST(d, rateIncUp[blkPos] + IEP_RATE + sigRateDelta[blkPos]); + curChange = 1; + } + + if (curCost < minCostInc) + { + minCostInc = curCost; + finalChange = curChange; + minPos = blkPos; + } + } + + if (dstCoeff[minPos] == 32767 || dstCoeff[minPos] == -32768) + /* don't allow sign hiding to violate the SPEC range */ + finalChange = -1; + + if (dstCoeff[minPos] == 0) + numSig++; + else if (finalChange == -1 && abs(dstCoeff[minPos]) == 1) + numSig--; + + if (m_resiDctCoeff[minPos] >= 0) + dstCoeff[minPos] += finalChange; + else + dstCoeff[minPos] -= finalChange; + } + } + + lastCG = false; + } + } + + return numSig; +} + +/* Pattern decision for context derivation process of significant_coeff_flag */ +uint32_t Quant::calcPatternSigCtx(uint64_t sigCoeffGroupFlag64, uint32_t cgPosX, uint32_t cgPosY, uint32_t log2TrSizeCG) +{ + if (!log2TrSizeCG) + return 0; + + const uint32_t trSizeCG = 1 << log2TrSizeCG; + X265_CHECK(trSizeCG <= 8, "transform CG is too large\n"); + const uint32_t sigPos = (uint32_t)(sigCoeffGroupFlag64 >> (1 + (cgPosY << log2TrSizeCG) + cgPosX)); + const uint32_t sigRight = ((int32_t)(cgPosX - (trSizeCG - 1)) >> 31) & (sigPos & 1); + const uint32_t sigLower = ((int32_t)(cgPosY - (trSizeCG - 1)) >> 31) & (sigPos >> (trSizeCG - 2)) & 2; + + return sigRight + sigLower; +} + +/* Context derivation process of coeff_abs_significant_flag */ +uint32_t Quant::getSigCtxInc(uint32_t patternSigCtx, uint32_t log2TrSize, uint32_t trSize, uint32_t blkPos, bool bIsLuma, + uint32_t firstSignificanceMapContext) +{ + static const uint8_t ctxIndMap[16] = + { + 0, 1, 4, 5, + 2, 3, 4, 5, + 6, 6, 8, 8, + 7, 7, 8, 8 + }; + + if (!blkPos) // special case for the DC context variable + return 0; + + if (log2TrSize == 2) // 4x4 + return ctxIndMap[blkPos]; + + const uint32_t posY = blkPos >> log2TrSize; + const uint32_t 
posX = blkPos & (trSize - 1); + X265_CHECK((blkPos - (posY << log2TrSize)) == posX, "block pos check failed\n"); + + int posXinSubset = blkPos & 3; + X265_CHECK((posX & 3) == (blkPos & 3), "pos alignment fail\n"); + int posYinSubset = posY & 3; + + // NOTE: [patternSigCtx][posXinSubset][posYinSubset] + static const uint8_t table_cnt[4][4][4] = + { + // patternSigCtx = 0 + { + { 2, 1, 1, 0 }, + { 1, 1, 0, 0 }, + { 1, 0, 0, 0 }, + { 0, 0, 0, 0 }, + }, + // patternSigCtx = 1 + { + { 2, 1, 0, 0 }, + { 2, 1, 0, 0 }, + { 2, 1, 0, 0 }, + { 2, 1, 0, 0 }, + }, + // patternSigCtx = 2 + { + { 2, 2, 2, 2 }, + { 1, 1, 1, 1 }, + { 0, 0, 0, 0 }, + { 0, 0, 0, 0 }, + }, + // patternSigCtx = 3 + { + { 2, 2, 2, 2 }, + { 2, 2, 2, 2 }, + { 2, 2, 2, 2 }, + { 2, 2, 2, 2 }, + } + }; + + int cnt = table_cnt[patternSigCtx][posXinSubset][posYinSubset]; + int offset = firstSignificanceMapContext; + + offset += cnt; + + return (bIsLuma && (posX | posY) >= 4) ? 3 + offset : offset; +} + +/* Calculates the cost of signaling the last significant coefficient in the block */ +inline uint32_t Quant::getRateLast(uint32_t posx, uint32_t posy) const +{ + uint32_t ctxX = getGroupIdx(posx); + uint32_t ctxY = getGroupIdx(posy); + uint32_t cost = m_entropyCoder->m_estBitsSbac.lastXBits[ctxX] + m_entropyCoder->m_estBitsSbac.lastYBits[ctxY]; + + int32_t maskX = (int32_t)(2 - posx) >> 31; + int32_t maskY = (int32_t)(2 - posy) >> 31; + + cost += maskX & (IEP_RATE * ((ctxX - 2) >> 1)); + cost += maskY & (IEP_RATE * ((ctxY - 2) >> 1)); + return cost; +} + +/* Context derivation process of coeff_abs_significant_flag */ +uint32_t Quant::getSigCoeffGroupCtxInc(uint64_t cgGroupMask, uint32_t cgPosX, uint32_t cgPosY, uint32_t log2TrSizeCG) +{ + const uint32_t trSizeCG = 1 << log2TrSizeCG; + + const uint32_t sigPos = (uint32_t)(cgGroupMask >> (1 + (cgPosY << log2TrSizeCG) + cgPosX)); + const uint32_t sigRight = ((int32_t)(cgPosX - (trSizeCG - 1)) >> 31) & sigPos; + const uint32_t sigLower = ((int32_t)(cgPosY - 
(trSizeCG - 1)) >> 31) & (sigPos >> (trSizeCG - 1)); + + return (sigRight | sigLower) & 1; +} diff --git a/source/common/quant.h b/source/common/quant.h new file mode 100644 index 0000000..ac575f7 --- /dev/null +++ b/source/common/quant.h @@ -0,0 +1,136 @@ +/***************************************************************************** + * Copyright (C) 2014 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#ifndef X265_QUANT_H +#define X265_QUANT_H + +#include "common.h" +#include "scalinglist.h" +#include "contexts.h" + +namespace x265 { +// private namespace + +class CUData; +class Entropy; +struct TUEntropyCodingParameters; + +struct QpParam +{ + int rem; + int per; + int qp; + int64_t lambda2; /* FIX8 */ + int64_t lambda; /* FIX8 */ + + QpParam() : qp(MAX_INT) {} + + void setQpParam(int qpScaled) + { + if (qp != qpScaled) + { + rem = qpScaled % 6; + per = qpScaled / 6; + qp = qpScaled; + lambda2 = (int64_t)(x265_lambda2_tab[qp - QP_BD_OFFSET] * 256. + 0.5); + lambda = (int64_t)(x265_lambda_tab[qp - QP_BD_OFFSET] * 256. 
+ 0.5); + } + } +}; + +class Quant +{ +protected: + + const ScalingList* m_scalingList; + Entropy* m_entropyCoder; + + QpParam m_qpParam[3]; + + bool m_useRDOQ; + int64_t m_psyRdoqScale; + int32_t* m_resiDctCoeff; + int32_t* m_fencDctCoeff; + int16_t* m_fencShortBuf; + + enum { IEP_RATE = 32768 }; /* FIX15 cost of an equal probable bit */ + +public: + + NoiseReduction* m_nr; + NoiseReduction* m_frameNr; // Array of NR structures, one for each frameEncoder + + Quant(); + ~Quant(); + + /* one-time setup */ + bool init(bool useRDOQ, double psyScale, const ScalingList& scalingList, Entropy& entropy); + bool allocNoiseReduction(const x265_param& param); + + /* CU setup */ + void setQPforQuant(const CUData& ctu); + + uint32_t transformNxN(CUData& cu, pixel *fenc, uint32_t fencstride, int16_t* residual, uint32_t stride, coeff_t* coeff, + uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool useTransformSkip); + + void invtransformNxN(bool transQuantBypass, int16_t* residual, uint32_t stride, coeff_t* coeff, + uint32_t log2TrSize, TextType ttype, bool bIntra, bool useTransformSkip, uint32_t numSig); + + /* static methods shared with entropy.cpp */ + static uint32_t calcPatternSigCtx(uint64_t sigCoeffGroupFlag64, uint32_t cgPosX, uint32_t cgPosY, uint32_t log2TrSizeCG); + static uint32_t getSigCtxInc(uint32_t patternSigCtx, uint32_t log2TrSize, uint32_t trSize, uint32_t blkPos, bool bIsLuma, uint32_t firstSignificanceMapContext); + static uint32_t getSigCoeffGroupCtxInc(uint64_t sigCoeffGroupFlag64, uint32_t cgPosX, uint32_t cgPosY, uint32_t log2TrSizeCG); + +protected: + + void setChromaQP(int qpin, TextType ttype, int chFmt); + + uint32_t signBitHidingHDQ(int16_t* qcoeff, int32_t* deltaU, uint32_t numSig, const TUEntropyCodingParameters &codingParameters); + + uint32_t rdoQuant(CUData& cu, int16_t* dstCoeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool usePsy); + inline uint32_t getRateLast(uint32_t posx, uint32_t posy) const; +}; + +static 
inline uint32_t getGroupIdx(const uint32_t idx) +{ + // TODO: Why is this not a table lookup? + + uint32_t group = (idx >> 3); + + if (idx >= 24) + group = 2; + uint32_t groupIdx = ((idx >> (group + 1)) - 2) + 4 + (group << 1); + if (idx <= 3) + groupIdx = idx; + +#ifdef _DEBUG + static const uint8_t g_groupIdx[32] = { 0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9 }; + assert(groupIdx == g_groupIdx[idx]); +#endif + + return groupIdx; +} + +} + +#endif // ifndef X265_QUANT_H diff --git a/source/common/scalinglist.cpp b/source/common/scalinglist.cpp new file mode 100644 index 0000000..d64bcee --- /dev/null +++ b/source/common/scalinglist.cpp @@ -0,0 +1,379 @@ +/***************************************************************************** + * Copyright (C) 2014 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. 
+ *****************************************************************************/ + +#include "common.h" +#include "primitives.h" +#include "scalinglist.h" + +namespace { +// file-anonymous namespace + +/* Strings for scaling list file parsing */ +const char MatrixType[4][6][20] = +{ + { + "INTRA4X4_LUMA", + "INTRA4X4_CHROMAU", + "INTRA4X4_CHROMAV", + "INTER4X4_LUMA", + "INTER4X4_CHROMAU", + "INTER4X4_CHROMAV" + }, + { + "INTRA8X8_LUMA", + "INTRA8X8_CHROMAU", + "INTRA8X8_CHROMAV", + "INTER8X8_LUMA", + "INTER8X8_CHROMAU", + "INTER8X8_CHROMAV" + }, + { + "INTRA16X16_LUMA", + "INTRA16X16_CHROMAU", + "INTRA16X16_CHROMAV", + "INTER16X16_LUMA", + "INTER16X16_CHROMAU", + "INTER16X16_CHROMAV" + }, + { + "INTRA32X32_LUMA", + "INTER32X32_LUMA", + }, +}; +const char MatrixType_DC[4][12][22] = +{ + { + }, + { + }, + { + "INTRA16X16_LUMA_DC", + "INTRA16X16_CHROMAU_DC", + "INTRA16X16_CHROMAV_DC", + "INTER16X16_LUMA_DC", + "INTER16X16_CHROMAU_DC", + "INTER16X16_CHROMAV_DC" + }, + { + "INTRA32X32_LUMA_DC", + "INTER32X32_LUMA_DC", + }, +}; + +int quantTSDefault4x4[16] = +{ + 16, 16, 16, 16, + 16, 16, 16, 16, + 16, 16, 16, 16, + 16, 16, 16, 16 +}; + +int quantIntraDefault8x8[64] = +{ + 16, 16, 16, 16, 17, 18, 21, 24, + 16, 16, 16, 16, 17, 19, 22, 25, + 16, 16, 17, 18, 20, 22, 25, 29, + 16, 16, 18, 21, 24, 27, 31, 36, + 17, 17, 20, 24, 30, 35, 41, 47, + 18, 19, 22, 27, 35, 44, 54, 65, + 21, 22, 25, 31, 41, 54, 70, 88, + 24, 25, 29, 36, 47, 65, 88, 115 +}; + +int quantInterDefault8x8[64] = +{ + 16, 16, 16, 16, 17, 18, 20, 24, + 16, 16, 16, 17, 18, 20, 24, 25, + 16, 16, 17, 18, 20, 24, 25, 28, + 16, 17, 18, 20, 24, 25, 28, 33, + 17, 18, 20, 24, 25, 28, 33, 41, + 18, 20, 24, 25, 28, 33, 41, 54, + 20, 24, 25, 28, 33, 41, 54, 71, + 24, 25, 28, 33, 41, 54, 71, 91 +}; + +} + +namespace x265 { +// private namespace + +const int ScalingList::s_numCoefPerSize[NUM_SIZES] = { 16, 64, 256, 1024 }; +const int32_t ScalingList::s_quantScales[NUM_REM] = { 26214, 23302, 20560, 18396, 16384, 14564 }; 
+const int32_t ScalingList::s_invQuantScales[NUM_REM] = { 40, 45, 51, 57, 64, 72 }; + +ScalingList::ScalingList() +{ + memset(m_quantCoef, 0, sizeof(m_quantCoef)); + memset(m_dequantCoef, 0, sizeof(m_dequantCoef)); + memset(m_scalingListCoef, 0, sizeof(m_scalingListCoef)); +} + +bool ScalingList::init() +{ + bool ok = true; + for (int sizeId = 0; sizeId < NUM_SIZES; sizeId++) + { + for (int listId = 0; listId < NUM_LISTS; listId++) + { + m_scalingListCoef[sizeId][listId] = X265_MALLOC(int32_t, X265_MIN(MAX_MATRIX_COEF_NUM, s_numCoefPerSize[sizeId])); + ok &= !!m_scalingListCoef[sizeId][listId]; + for (int rem = 0; rem < NUM_REM; rem++) + { + m_quantCoef[sizeId][listId][rem] = X265_MALLOC(int32_t, s_numCoefPerSize[sizeId]); + m_dequantCoef[sizeId][listId][rem] = X265_MALLOC(int32_t, s_numCoefPerSize[sizeId]); + ok &= m_quantCoef[sizeId][listId][rem] && m_dequantCoef[sizeId][listId][rem]; + } + } + } + return ok; +} + +ScalingList::~ScalingList() +{ + for (int sizeId = 0; sizeId < NUM_SIZES; sizeId++) + { + for (int listId = 0; listId < NUM_LISTS; listId++) + { + X265_FREE(m_scalingListCoef[sizeId][listId]); + for (int rem = 0; rem < NUM_REM; rem++) + { + X265_FREE(m_quantCoef[sizeId][listId][rem]); + X265_FREE(m_dequantCoef[sizeId][listId][rem]); + } + } + } +} + +/* returns predicted list index if a match is found, else -1 */ +int ScalingList::checkPredMode(int size, int list) const +{ + for (int predList = list; predList >= 0; predList--) + { + // check DC value + if (size < BLOCK_16x16 && m_scalingListDC[size][list] != m_scalingListDC[size][predList]) + continue; + + // check value of matrix + if (!memcmp(m_scalingListCoef[size][list], + list == predList ? 
getScalingListDefaultAddress(size, predList) : m_scalingListCoef[size][predList], + sizeof(int32_t) * X265_MIN(MAX_MATRIX_COEF_NUM, s_numCoefPerSize[size]))) + return predList; + } + + return -1; +} + +/* check if use default quantization matrix + * returns true if default quantization matrix is used in all sizes */ +bool ScalingList::checkDefaultScalingList() const +{ + int defaultCounter = 0; + + for (int s = 0; s < NUM_SIZES; s++) + for (int l = 0; l < NUM_LISTS; l++) + if (!memcmp(m_scalingListCoef[s][l], getScalingListDefaultAddress(s, l), + sizeof(int32_t) * X265_MIN(MAX_MATRIX_COEF_NUM, s_numCoefPerSize[s])) && + ((s < BLOCK_16x16) || (m_scalingListDC[s][l] == 16))) + defaultCounter++; + + return defaultCounter != (NUM_LISTS * NUM_SIZES - 4); // -4 for 32x32 +} + +/* get address of default quantization matrix */ +const int32_t* ScalingList::getScalingListDefaultAddress(int sizeId, int listId) const +{ + switch (sizeId) + { + case BLOCK_4x4: + return quantTSDefault4x4; + case BLOCK_8x8: + return (listId < 3) ? quantIntraDefault8x8 : quantInterDefault8x8; + case BLOCK_16x16: + return (listId < 3) ? quantIntraDefault8x8 : quantInterDefault8x8; + case BLOCK_32x32: + return (listId < 1) ? 
quantIntraDefault8x8 : quantInterDefault8x8; + default: + break; + } + + X265_CHECK(0, "invalid scaling list size\n"); + return NULL; +} + +void ScalingList::processDefaultMarix(int sizeId, int listId) +{ + ::memcpy(m_scalingListCoef[sizeId][listId], getScalingListDefaultAddress(sizeId, listId), sizeof(int) * X265_MIN(MAX_MATRIX_COEF_NUM, s_numCoefPerSize[sizeId])); + m_scalingListDC[sizeId][listId] = SCALING_LIST_DC; +} + +void ScalingList::setDefaultScalingList() +{ + for (int sizeId = 0; sizeId < NUM_SIZES; sizeId++) + for (int listId = 0; listId < NUM_LISTS; listId++) + processDefaultMarix(sizeId, listId); + m_bEnabled = true; + m_bDataPresent = false; +} + +bool ScalingList::parseScalingList(const char* filename) +{ + FILE *fp = fopen(filename, "r"); + if (!fp) + { + x265_log(NULL, X265_LOG_ERROR, "can't open scaling list file %s\n", filename); + return true; + } + + char line[1024]; + int32_t *src = NULL; + + for (int sizeIdc = 0; sizeIdc < NUM_SIZES; sizeIdc++) + { + int size = X265_MIN(MAX_MATRIX_COEF_NUM, s_numCoefPerSize[sizeIdc]); + for (int listIdc = 0; listIdc < NUM_LISTS; listIdc++) + { + src = m_scalingListCoef[sizeIdc][listIdc]; + + fseek(fp, 0, 0); + do + { + char *ret = fgets(line, 1024, fp); + if (!ret || (!strstr(line, MatrixType[sizeIdc][listIdc]) && feof(fp))) + { + x265_log(NULL, X265_LOG_ERROR, "can't read matrix from %s\n", filename); + return true; + } + } + while (!strstr(line, MatrixType[sizeIdc][listIdc])); + + for (int i = 0; i < size; i++) + { + int data; + if (fscanf(fp, "%d,", &data) != 1) + { + x265_log(NULL, X265_LOG_ERROR, "can't read matrix from %s\n", filename); + return true; + } + src[i] = data; + } + + // set DC value for default matrix check + m_scalingListDC[sizeIdc][listIdc] = src[0]; + + if (sizeIdc > BLOCK_8x8) + { + fseek(fp, 0, 0); + do + { + char *ret = fgets(line, 1024, fp); + if (!ret || (!strstr(line, MatrixType_DC[sizeIdc][listIdc]) && feof(fp))) + { + x265_log(NULL, X265_LOG_ERROR, "can't read DC from %s\n", 
filename); + return true; + } + } + while (!strstr(line, MatrixType_DC[sizeIdc][listIdc])); + + int data; + if (fscanf(fp, "%d,", &data) != 1) + { + x265_log(NULL, X265_LOG_ERROR, "can't read matrix from %s\n", filename); + return true; + } + + // overwrite DC value when size of matrix is larger than 16x16 + m_scalingListDC[sizeIdc][listIdc] = data; + } + } + } + + fclose(fp); + + m_bEnabled = true; + m_bDataPresent = !checkDefaultScalingList(); + + return false; +} + +/** set quantized matrix coefficient for encode */ +void ScalingList::setupQuantMatrices() +{ + for (int size = 0; size < NUM_SIZES; size++) + { + int width = 1 << (size + 2); + int ratio = width / X265_MIN(MAX_MATRIX_SIZE_NUM, width); + int stride = X265_MIN(MAX_MATRIX_SIZE_NUM, width); + int count = s_numCoefPerSize[size]; + + for (int list = 0; list < NUM_LISTS; list++) + { + int32_t *coeff = m_scalingListCoef[size][list]; + int32_t dc = m_scalingListDC[size][list]; + + for (int rem = 0; rem < NUM_REM; rem++) + { + int32_t *quantCoeff = m_quantCoef[size][list][rem]; + int32_t *dequantCoeff = m_dequantCoef[size][list][rem]; + + if (m_bEnabled) + { + processScalingListEnc(coeff, quantCoeff, s_quantScales[rem] << 4, width, width, ratio, stride, dc); + processScalingListDec(coeff, dequantCoeff, s_invQuantScales[rem], width, width, ratio, stride, dc); + } + else + { + /* flat quant and dequant coefficients */ + for (int i = 0; i < count; i++) + { + quantCoeff[i] = s_quantScales[rem]; + dequantCoeff[i] = s_invQuantScales[rem]; + } + } + } + } + } +} + +void ScalingList::processScalingListEnc(int32_t *coeff, int32_t *quantcoeff, int32_t quantScales, int height, int width, + int ratio, int stride, int32_t dc) +{ + for (int j = 0; j < height; j++) + for (int i = 0; i < width; i++) + quantcoeff[j * width + i] = quantScales / coeff[stride * (j / ratio) + i / ratio]; + + if (ratio > 1) + quantcoeff[0] = quantScales / dc; +} + +void ScalingList::processScalingListDec(int32_t *coeff, int32_t *dequantcoeff, 
int32_t invQuantScales, int height, int width, + int ratio, int stride, int32_t dc) +{ + for (int j = 0; j < height; j++) + for (int i = 0; i < width; i++) + dequantcoeff[j * width + i] = invQuantScales * coeff[stride * (j / ratio) + i / ratio]; + + if (ratio > 1) + dequantcoeff[0] = invQuantScales * dc; +} + +} diff --git a/source/common/scalinglist.h b/source/common/scalinglist.h new file mode 100644 index 0000000..e133498 --- /dev/null +++ b/source/common/scalinglist.h @@ -0,0 +1,80 @@ +/***************************************************************************** + * Copyright (C) 2014 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. 
+ *****************************************************************************/ + +#ifndef X265_SCALINGLIST_H +#define X265_SCALINGLIST_H + +#include "common.h" + +namespace x265 { +// private namespace + +class ScalingList +{ +public: + + enum { NUM_SIZES = 4 }; // 4x4, 8x8, 16x16, 32x32 + enum { NUM_LISTS = 6 }; // number of quantization matrix lists (YUV * inter/intra) + enum { NUM_REM = 6 }; // number of remainders of QP/6 + enum { MAX_MATRIX_COEF_NUM = 64 }; // max coefficient number per quantization matrix + enum { MAX_MATRIX_SIZE_NUM = 8 }; // max size number for quantization matrix + + static const int s_numCoefPerSize[NUM_SIZES]; + static const int32_t s_invQuantScales[NUM_REM]; + static const int32_t s_quantScales[NUM_REM]; + + int32_t m_scalingListDC[NUM_SIZES][NUM_LISTS]; // the DC value of the matrix coefficient for 16x16 + int32_t* m_scalingListCoef[NUM_SIZES][NUM_LISTS]; // quantization matrix + + int32_t* m_quantCoef[NUM_SIZES][NUM_LISTS][NUM_REM]; // array of quantization matrix coefficient 4x4 + int32_t* m_dequantCoef[NUM_SIZES][NUM_LISTS][NUM_REM]; // array of dequantization matrix coefficient 4x4 + + bool m_bEnabled; + bool m_bDataPresent; // non-default scaling lists must be signaled + + ScalingList(); + ~ScalingList(); + + bool init(); + void setDefaultScalingList(); + bool parseScalingList(const char* filename); + void setupQuantMatrices(); + + /* used during SPS coding */ + int checkPredMode(int sizeId, int listId) const; + +protected: + + static const int SCALING_LIST_DC = 16; // default DC value + + const int32_t* getScalingListDefaultAddress(int sizeId, int listId) const; + void processDefaultMarix(int sizeId, int listId); + bool checkDefaultScalingList() const; + + void processScalingListEnc(int32_t *coeff, int32_t *quantcoeff, int32_t quantScales, int height, int width, int ratio, int stride, int32_t dc); + void processScalingListDec(int32_t *coeff, int32_t *dequantcoeff, int32_t invQuantScales, int height, int width, int ratio, int 
stride, int32_t dc); +}; + +} + +#endif // ifndef X265_SCALINGLIST_H diff --git a/source/common/shortyuv.cpp b/source/common/shortyuv.cpp new file mode 100644 index 0000000..2a7e153 --- /dev/null +++ b/source/common/shortyuv.cpp @@ -0,0 +1,120 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Deepthi Nandakumar + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. 
+ * For more information, contact us at license @ x265.com + *****************************************************************************/ + +#include "common.h" +#include "yuv.h" +#include "shortyuv.h" +#include "primitives.h" + +#include "x265.h" + +using namespace x265; + +ShortYuv::ShortYuv() +{ + m_buf[0] = NULL; + m_buf[1] = NULL; + m_buf[2] = NULL; +} + +bool ShortYuv::create(uint32_t size, int csp) +{ + m_csp = csp; + m_hChromaShift = CHROMA_H_SHIFT(csp); + m_vChromaShift = CHROMA_V_SHIFT(csp); + + m_size = size; + m_csize = size >> m_hChromaShift; + + size_t sizeL = size * size; + size_t sizeC = sizeL >> (m_hChromaShift + m_vChromaShift); + X265_CHECK((sizeC & 15) == 0, "invalid size"); + + CHECKED_MALLOC(m_buf[0], int16_t, sizeL + sizeC * 2); + m_buf[1] = m_buf[0] + sizeL; + m_buf[2] = m_buf[0] + sizeL + sizeC; + return true; + +fail: + return false; +} + +void ShortYuv::destroy() +{ + X265_FREE(m_buf[0]); +} + +void ShortYuv::clear() +{ + ::memset(m_buf[0], 0, (m_size * m_size) * sizeof(int16_t)); + ::memset(m_buf[1], 0, (m_csize * m_csize) * sizeof(int16_t)); + ::memset(m_buf[2], 0, (m_csize * m_csize) * sizeof(int16_t)); +} + +void ShortYuv::subtract(const Yuv& srcYuv0, const Yuv& srcYuv1, uint32_t log2Size) +{ + const int sizeIdx = log2Size - 2; + primitives.luma_sub_ps[sizeIdx](m_buf[0], m_size, srcYuv0.m_buf[0], srcYuv1.m_buf[0], srcYuv0.m_size, srcYuv1.m_size); + primitives.chroma[m_csp].sub_ps[sizeIdx](m_buf[1], m_csize, srcYuv0.m_buf[1], srcYuv1.m_buf[1], srcYuv0.m_csize, srcYuv1.m_csize); + primitives.chroma[m_csp].sub_ps[sizeIdx](m_buf[2], m_csize, srcYuv0.m_buf[2], srcYuv1.m_buf[2], srcYuv0.m_csize, srcYuv1.m_csize); +} + +void ShortYuv::copyPartToPartLuma(ShortYuv& dstYuv, uint32_t absPartIdx, uint32_t log2Size) const +{ + const int16_t* src = getLumaAddr(absPartIdx); + int16_t* dst = dstYuv.getLumaAddr(absPartIdx); + + primitives.square_copy_ss[log2Size - 2](dst, dstYuv.m_size, const_cast(src), m_size); +} + +void 
ShortYuv::copyPartToPartLuma(Yuv& dstYuv, uint32_t absPartIdx, uint32_t log2Size) const +{ + const int16_t* src = getLumaAddr(absPartIdx); + pixel* dst = dstYuv.getLumaAddr(absPartIdx); + + primitives.square_copy_sp[log2Size - 2](dst, dstYuv.m_size, const_cast(src), m_size); +} + +void ShortYuv::copyPartToPartChroma(ShortYuv& dstYuv, uint32_t absPartIdx, uint32_t log2SizeL) const +{ + int part = partitionFromLog2Size(log2SizeL); + const int16_t* srcU = getCbAddr(absPartIdx); + const int16_t* srcV = getCrAddr(absPartIdx); + int16_t* dstU = dstYuv.getCbAddr(absPartIdx); + int16_t* dstV = dstYuv.getCrAddr(absPartIdx); + + primitives.chroma[m_csp].copy_ss[part](dstU, dstYuv.m_csize, const_cast(srcU), m_csize); + primitives.chroma[m_csp].copy_ss[part](dstV, dstYuv.m_csize, const_cast(srcV), m_csize); +} + +void ShortYuv::copyPartToPartChroma(Yuv& dstYuv, uint32_t absPartIdx, uint32_t log2SizeL) const +{ + int part = partitionFromLog2Size(log2SizeL); + const int16_t* srcU = getCbAddr(absPartIdx); + const int16_t* srcV = getCrAddr(absPartIdx); + pixel* dstU = dstYuv.getCbAddr(absPartIdx); + pixel* dstV = dstYuv.getCrAddr(absPartIdx); + + primitives.chroma[m_csp].copy_sp[part](dstU, dstYuv.m_csize, const_cast(srcU), m_csize); + primitives.chroma[m_csp].copy_sp[part](dstV, dstYuv.m_csize, const_cast(srcV), m_csize); +} diff --git a/source/common/shortyuv.h b/source/common/shortyuv.h new file mode 100644 index 0000000..c27093d --- /dev/null +++ b/source/common/shortyuv.h @@ -0,0 +1,93 @@ +/***************************************************************************** + * x265: ShortYUV class for short sized YUV-style frames + ***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Deepthi Nandakumar + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of 
the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com + *****************************************************************************/ + +#ifndef X265_SHORTYUV_H +#define X265_SHORTYUV_H + +#include "common.h" + +namespace x265 { +// private namespace + +class Yuv; + +/* A ShortYuv instance holds int16_ts for a square CU (64x64 down to 8x8) for all three planes, + * these are typically used to hold residual or coefficients */ +class ShortYuv +{ +public: + + int16_t* m_buf[3]; + + uint32_t m_size; + uint32_t m_csize; + + int m_csp; + int m_hChromaShift; + int m_vChromaShift; + + ShortYuv(); + + bool create(uint32_t size, int csp); + void destroy(); + void clear(); + + int16_t* getLumaAddr(uint32_t absPartIdx) { return m_buf[0] + getAddrOffset(absPartIdx, m_size); } + int16_t* getCbAddr(uint32_t absPartIdx) { return m_buf[1] + getChromaAddrOffset(absPartIdx); } + int16_t* getCrAddr(uint32_t absPartIdx) { return m_buf[2] + getChromaAddrOffset(absPartIdx); } + int16_t* getChromaAddr(uint32_t chromaId, uint32_t partUnitIdx) { return m_buf[chromaId] + getChromaAddrOffset(partUnitIdx); } + + const int16_t* getLumaAddr(uint32_t absPartIdx) const { return m_buf[0] + getAddrOffset(absPartIdx, m_size); } + const int16_t* getCbAddr(uint32_t absPartIdx) const { return m_buf[1] + getChromaAddrOffset(absPartIdx); } + const int16_t* getCrAddr(uint32_t absPartIdx) const { return 
m_buf[2] + getChromaAddrOffset(absPartIdx); } + const int16_t* getChromaAddr(uint32_t chromaId, uint32_t partUnitIdx) const { return m_buf[chromaId] + getChromaAddrOffset(partUnitIdx); } + + void subtract(const Yuv& srcYuv0, const Yuv& srcYuv1, uint32_t log2Size); + + void copyPartToPartLuma(ShortYuv& dstYuv, uint32_t absPartIdx, uint32_t log2Size) const; + void copyPartToPartChroma(ShortYuv& dstYuv, uint32_t absPartIdx, uint32_t log2SizeL) const; + + void copyPartToPartLuma(Yuv& dstYuv, uint32_t absPartIdx, uint32_t log2Size) const; + void copyPartToPartChroma(Yuv& dstYuv, uint32_t absPartIdx, uint32_t log2SizeL) const; + + int getChromaAddrOffset(uint32_t idx) const + { + int blkX = g_zscanToPelX[idx] >> m_hChromaShift; + int blkY = g_zscanToPelY[idx] >> m_vChromaShift; + + return blkX + blkY * m_csize; + } + + static int getAddrOffset(uint32_t idx, uint32_t width) + { + int blkX = g_zscanToPelX[idx]; + int blkY = g_zscanToPelY[idx]; + + return blkX + blkY * width; + } +}; +} + +#endif // ifndef X265_SHORTYUV_H diff --git a/source/common/slice.cpp b/source/common/slice.cpp new file mode 100644 index 0000000..2e850cd --- /dev/null +++ b/source/common/slice.cpp @@ -0,0 +1,204 @@ +/***************************************************************************** + * Copyright (C) 2014 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#include "common.h" +#include "frame.h" +#include "piclist.h" +#include "picyuv.h" +#include "slice.h" + +using namespace x265; + +void Slice::setRefPicList(PicList& picList) +{ + if (m_sliceType == I_SLICE) + { + ::memset(m_refPicList, 0, sizeof(m_refPicList)); + m_numRefIdx[1] = m_numRefIdx[0] = 0; + return; + } + + Frame* refPic = NULL; + Frame* refPicSetStCurr0[MAX_NUM_REF]; + Frame* refPicSetStCurr1[MAX_NUM_REF]; + Frame* refPicSetLtCurr[MAX_NUM_REF]; + int numPocStCurr0 = 0; + int numPocStCurr1 = 0; + int numPocLtCurr = 0; + int i; + + for (i = 0; i < m_rps.numberOfNegativePictures; i++) + { + if (m_rps.bUsed[i]) + { + refPic = picList.getPOC(m_poc + m_rps.deltaPOC[i]); + refPicSetStCurr0[numPocStCurr0] = refPic; + numPocStCurr0++; + } + } + + for (; i < m_rps.numberOfNegativePictures + m_rps.numberOfPositivePictures; i++) + { + if (m_rps.bUsed[i]) + { + refPic = picList.getPOC(m_poc + m_rps.deltaPOC[i]); + refPicSetStCurr1[numPocStCurr1] = refPic; + numPocStCurr1++; + } + } + + X265_CHECK(m_rps.numberOfPictures == m_rps.numberOfNegativePictures + m_rps.numberOfPositivePictures, + "unexpected picture in RPS\n"); + + // ref_pic_list_init + Frame* rpsCurrList0[MAX_NUM_REF + 1]; + Frame* rpsCurrList1[MAX_NUM_REF + 1]; + int numPocTotalCurr = numPocStCurr0 + numPocStCurr1 + numPocLtCurr; + + int cIdx = 0; + for (i = 0; i < numPocStCurr0; i++, cIdx++) + rpsCurrList0[cIdx] = refPicSetStCurr0[i]; + + for (i = 0; i < numPocStCurr1; i++, cIdx++) + rpsCurrList0[cIdx] = refPicSetStCurr1[i]; + + for (i = 0; i < numPocLtCurr; i++, cIdx++) + 
rpsCurrList0[cIdx] = refPicSetLtCurr[i]; + + X265_CHECK(cIdx == numPocTotalCurr, "RPS index check fail\n"); + + if (m_sliceType == B_SLICE) + { + cIdx = 0; + for (i = 0; i < numPocStCurr1; i++, cIdx++) + rpsCurrList1[cIdx] = refPicSetStCurr1[i]; + + for (i = 0; i < numPocStCurr0; i++, cIdx++) + rpsCurrList1[cIdx] = refPicSetStCurr0[i]; + + for (i = 0; i < numPocLtCurr; i++, cIdx++) + rpsCurrList1[cIdx] = refPicSetLtCurr[i]; + + X265_CHECK(cIdx == numPocTotalCurr, "RPS index check fail\n"); + } + + for (int rIdx = 0; rIdx < m_numRefIdx[0]; rIdx++) + { + cIdx = rIdx % numPocTotalCurr; + X265_CHECK(cIdx >= 0 && cIdx < numPocTotalCurr, "RPS index check fail\n"); + m_refPicList[0][rIdx] = rpsCurrList0[cIdx]; + } + + if (m_sliceType != B_SLICE) + { + m_numRefIdx[1] = 0; + ::memset(m_refPicList[1], 0, sizeof(m_refPicList[1])); + } + else + { + for (int rIdx = 0; rIdx < m_numRefIdx[1]; rIdx++) + { + cIdx = rIdx % numPocTotalCurr; + X265_CHECK(cIdx >= 0 && cIdx < numPocTotalCurr, "RPS index check fail\n"); + m_refPicList[1][rIdx] = rpsCurrList1[cIdx]; + } + } + + for (int dir = 0; dir < 2; dir++) + for (int numRefIdx = 0; numRefIdx < m_numRefIdx[dir]; numRefIdx++) + m_refPOCList[dir][numRefIdx] = m_refPicList[dir][numRefIdx]->m_poc; +} + +void Slice::disableWeights() +{ + for (int l = 0; l < 2; l++) + for (int i = 0; i < MAX_NUM_REF; i++) + for (int yuv = 0; yuv < 3; yuv++) + { + WeightParam& wp = m_weightPredTable[l][i][yuv]; + wp.bPresentFlag = false; + wp.log2WeightDenom = 0; + wp.inputWeight = 1; + wp.inputOffset = 0; + } +} + +/* Sorts the deltaPOC and Used by current values in the RPS based on the + * deltaPOC values. deltaPOC values are sorted with -ve values before the +ve + * values. -ve values are in decreasing order. 
+ve values are in increasing + * order */ +void RPS::sortDeltaPOC() +{ + // sort in increasing order (smallest first) + for (int j = 1; j < numberOfPictures; j++) + { + int dPOC = deltaPOC[j]; + bool used = bUsed[j]; + for (int k = j - 1; k >= 0; k--) + { + int temp = deltaPOC[k]; + if (dPOC < temp) + { + deltaPOC[k + 1] = temp; + bUsed[k + 1] = bUsed[k]; + deltaPOC[k] = dPOC; + bUsed[k] = used; + } + } + } + + // flip the negative values to largest first + int numNegPics = numberOfNegativePictures; + for (int j = 0, k = numNegPics - 1; j < numNegPics >> 1; j++, k--) + { + int dPOC = deltaPOC[j]; + bool used = bUsed[j]; + deltaPOC[j] = deltaPOC[k]; + bUsed[j] = bUsed[k]; + deltaPOC[k] = dPOC; + bUsed[k] = used; + } +} + +uint32_t Slice::realEndAddress(uint32_t endCUAddr) const +{ + // Calculate end address + uint32_t internalAddress = (endCUAddr - 1) % NUM_CU_PARTITIONS; + uint32_t externalAddress = (endCUAddr - 1) / NUM_CU_PARTITIONS; + uint32_t xmax = m_sps->picWidthInLumaSamples - (externalAddress % m_sps->numCuInWidth) * g_maxCUSize; + uint32_t ymax = m_sps->picHeightInLumaSamples - (externalAddress / m_sps->numCuInWidth) * g_maxCUSize; + + while (g_zscanToPelX[internalAddress] >= xmax || g_zscanToPelY[internalAddress] >= ymax) + internalAddress--; + + internalAddress++; + if (internalAddress == NUM_CU_PARTITIONS) + { + internalAddress = 0; + externalAddress++; + } + + return externalAddress * NUM_CU_PARTITIONS + internalAddress; +} + + diff --git a/source/common/slice.h b/source/common/slice.h new file mode 100644 index 0000000..bd0ba63 --- /dev/null +++ b/source/common/slice.h @@ -0,0 +1,361 @@ +/***************************************************************************** + * Copyright (C) 2014 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * 
(at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#ifndef X265_SLICE_H +#define X265_SLICE_H + +#include "common.h" + +namespace x265 { +// private namespace + +class Frame; +class PicList; +class MotionReference; + +enum SliceType +{ + B_SLICE, + P_SLICE, + I_SLICE +}; + +struct RPS +{ + int numberOfPictures; + int numberOfNegativePictures; + int numberOfPositivePictures; + + int poc[MAX_NUM_REF_PICS]; + int deltaPOC[MAX_NUM_REF_PICS]; + bool bUsed[MAX_NUM_REF_PICS]; + + RPS() + : numberOfPictures(0) + , numberOfNegativePictures(0) + , numberOfPositivePictures(0) + { + ::memset(deltaPOC, 0, sizeof(deltaPOC)); + ::memset(poc, 0, sizeof(poc)); + ::memset(bUsed, 0, sizeof(bUsed)); + } + + void sortDeltaPOC(); +}; + +namespace Profile { + enum Name + { + NONE = 0, + MAIN = 1, + MAIN10 = 2, + MAINSTILLPICTURE = 3, + MAINREXT = 4, + HIGHTHROUGHPUTREXT = 5 + }; +} + +namespace Level { + enum Tier + { + MAIN = 0, + HIGH = 1, + }; + + enum Name + { + NONE = 0, + LEVEL1 = 30, + LEVEL2 = 60, + LEVEL2_1 = 63, + LEVEL3 = 90, + LEVEL3_1 = 93, + LEVEL4 = 120, + LEVEL4_1 = 123, + LEVEL5 = 150, + LEVEL5_1 = 153, + LEVEL5_2 = 156, + LEVEL6 = 180, + LEVEL6_1 = 183, + LEVEL6_2 = 186, + }; +} + +struct ProfileTierLevel +{ + bool tierFlag; + bool progressiveSourceFlag; + bool interlacedSourceFlag; + bool 
nonPackedConstraintFlag; + bool frameOnlyConstraintFlag; + bool profileCompatibilityFlag[32]; + bool intraConstraintFlag; + bool lowerBitRateConstraintFlag; + int profileIdc; + int levelIdc; + uint32_t minCrForLevel; + uint32_t maxLumaSrForLevel; + uint32_t bitDepthConstraint; + int chromaFormatConstraint; +}; + +struct HRDInfo +{ + uint32_t bitRateScale; + uint32_t cpbSizeScale; + uint32_t initialCpbRemovalDelayLength; + uint32_t cpbRemovalDelayLength; + uint32_t dpbOutputDelayLength; + uint32_t bitRateValue; + uint32_t cpbSizeValue; + bool cbrFlag; + + HRDInfo() + : bitRateScale(0) + , cpbSizeScale(0) + , initialCpbRemovalDelayLength(1) + , cpbRemovalDelayLength(1) + , dpbOutputDelayLength(1) + , cbrFlag(false) + { + } +}; + +struct TimingInfo +{ + uint32_t numUnitsInTick; + uint32_t timeScale; +}; + +struct VPS +{ + uint32_t numReorderPics; + uint32_t maxDecPicBuffering; + HRDInfo hrdParameters; + ProfileTierLevel ptl; +}; + +struct Window +{ + bool bEnabled; + int leftOffset; + int rightOffset; + int topOffset; + int bottomOffset; + + Window() + { + bEnabled = false; + } +}; + +struct VUI +{ + bool aspectRatioInfoPresentFlag; + int aspectRatioIdc; + int sarWidth; + int sarHeight; + + bool overscanInfoPresentFlag; + bool overscanAppropriateFlag; + + bool videoSignalTypePresentFlag; + int videoFormat; + bool videoFullRangeFlag; + + bool colourDescriptionPresentFlag; + int colourPrimaries; + int transferCharacteristics; + int matrixCoefficients; + + bool chromaLocInfoPresentFlag; + int chromaSampleLocTypeTopField; + int chromaSampleLocTypeBottomField; + + Window defaultDisplayWindow; + + bool frameFieldInfoPresentFlag; + bool fieldSeqFlag; + + bool hrdParametersPresentFlag; + HRDInfo hrdParameters; + + TimingInfo timingInfo; +}; + +struct SPS +{ + int chromaFormatIdc; // use param + uint32_t picWidthInLumaSamples; // use param + uint32_t picHeightInLumaSamples; // use param + + uint32_t numCuInWidth; + uint32_t numCuInHeight; + uint32_t numCUsInFrame; + uint32_t 
numPartitions; + uint32_t numPartInCUSize; + + int log2MinCodingBlockSize; + int log2DiffMaxMinCodingBlockSize; + + uint32_t quadtreeTULog2MaxSize; + uint32_t quadtreeTULog2MinSize; + + uint32_t quadtreeTUMaxDepthInter; // use param + uint32_t quadtreeTUMaxDepthIntra; // use param + + bool bUseSAO; // use param + bool bUseAMP; // use param + uint32_t maxAMPDepth; + + uint32_t maxDecPicBuffering; // these are dups of VPS values + int numReorderPics; + + bool bUseStrongIntraSmoothing; // use param + bool bTemporalMVPEnabled; + + Window conformanceWindow; + VUI vuiParameters; +}; + +struct PPS +{ + uint32_t maxCuDQPDepth; + + int chromaCbQpOffset; // use param + int chromaCrQpOffset; // use param + + bool bUseWeightPred; // use param + bool bUseWeightedBiPred; // use param + bool bUseDQP; + bool bConstrainedIntraPred; // use param + + bool bTransquantBypassEnabled; // Indicates presence of cu_transquant_bypass_flag in CUs. + bool bTransformSkipEnabled; // use param + bool bEntropyCodingSyncEnabled; // use param + bool bSignHideEnabled; // use param + + bool bDeblockingFilterControlPresent; + bool bPicDisableDeblockingFilter; + int deblockingFilterBetaOffsetDiv2; + int deblockingFilterTcOffsetDiv2; +}; + +struct WeightParam +{ + // Explicit weighted prediction parameters parsed in slice header, + bool bPresentFlag; + uint32_t log2WeightDenom; + int inputWeight; + int inputOffset; + + /* makes a non-h265 weight (i.e. 
fix7), into an h265 weight */ + void setFromWeightAndOffset(int w, int o, int denom, bool bNormalize) + { + inputOffset = o; + log2WeightDenom = denom; + inputWeight = w; + while (bNormalize && log2WeightDenom > 0 && (inputWeight > 127)) + { + log2WeightDenom--; + inputWeight >>= 1; + } + + inputWeight = X265_MIN(inputWeight, 127); + } +}; + +class Slice +{ +public: + + const SPS* m_sps; + const PPS* m_pps; + WeightParam m_weightPredTable[2][MAX_NUM_REF][3]; // [list][refIdx][0:Y, 1:U, 2:V] + MotionReference (*m_mref)[MAX_NUM_REF + 1]; + RPS m_rps; + + NalUnitType m_nalUnitType; + SliceType m_sliceType; + int m_sliceQp; + int m_poc; + + int m_lastIDR; + + bool m_bCheckLDC; // TODO: is this necessary? + bool m_sLFaseFlag; // loop filter boundary flag + bool m_colFromL0Flag; // collocated picture from List0 or List1 flag + uint32_t m_colRefIdx; // never modified + + int m_numRefIdx[2]; + Frame* m_refPicList[2][MAX_NUM_REF + 1]; + int m_refPOCList[2][MAX_NUM_REF + 1]; + + uint32_t m_maxNumMergeCand; // use param + uint32_t m_endCUAddr; + + Slice() + { + m_lastIDR = 0; + m_sLFaseFlag = true; + m_numRefIdx[0] = m_numRefIdx[1] = 0; + for (int i = 0; i < MAX_NUM_REF; i++) + { + m_refPicList[0][i] = NULL; + m_refPicList[1][i] = NULL; + m_refPOCList[0][i] = 0; + m_refPOCList[1][i] = 0; + } + + disableWeights(); + } + + void disableWeights(); + + void setRefPicList(PicList& picList); + + bool getRapPicFlag() const + { + return m_nalUnitType == NAL_UNIT_CODED_SLICE_IDR_W_RADL + || m_nalUnitType == NAL_UNIT_CODED_SLICE_CRA; + } + + bool getIdrPicFlag() const + { + return m_nalUnitType == NAL_UNIT_CODED_SLICE_IDR_W_RADL; + } + + bool isIRAP() const { return m_nalUnitType >= 16 && m_nalUnitType <= 23; } + + bool isIntra() const { return m_sliceType == I_SLICE; } + + bool isInterB() const { return m_sliceType == B_SLICE; } + + bool isInterP() const { return m_sliceType == P_SLICE; } + + uint32_t realEndAddress(uint32_t endCUAddr) const; +}; + +} + +#endif // ifndef X265_SLICE_H 
diff --git a/source/common/threading.cpp b/source/common/threading.cpp new file mode 100644 index 0000000..cb50eb2 --- /dev/null +++ b/source/common/threading.cpp @@ -0,0 +1,106 @@ +/***************************************************************************** + * x265: threading class and intrinsics + ***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. 
+ * For more information, contact us at license @ x265.com + *****************************************************************************/ + +#include "threading.h" + +namespace x265 { +// x265 private namespace + +/* C shim for forced stack alignment */ +static void stackAlignMain(Thread *instance) +{ + instance->threadMain(); +} + +#if _WIN32 + +static DWORD WINAPI ThreadShim(Thread *instance) +{ + // defer processing to the virtual function implemented in the derived class + x265_stack_align(stackAlignMain, instance); + + return 0; +} + +bool Thread::start() +{ + DWORD threadId; + + this->thread = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)ThreadShim, this, 0, &threadId); + + return threadId > 0; +} + +void Thread::stop() +{ + if (this->thread) + WaitForSingleObject(this->thread, INFINITE); +} + +Thread::~Thread() +{ + if (this->thread) + CloseHandle(this->thread); +} + +#else /* POSIX / pthreads */ + +static void *ThreadShim(void *opaque) +{ + // defer processing to the virtual function implemented in the derived class + Thread *instance = reinterpret_cast(opaque); + + x265_stack_align(stackAlignMain, instance); + + return NULL; +} + +bool Thread::start() +{ + if (pthread_create(&this->thread, NULL, ThreadShim, this)) + { + this->thread = 0; + + return false; + } + + return true; +} + +void Thread::stop() +{ + if (this->thread) + pthread_join(this->thread, NULL); +} + +Thread::~Thread() {} + +#endif // if _WIN32 + +Thread::Thread() +{ + this->thread = 0; +} +} diff --git a/source/common/threading.h b/source/common/threading.h new file mode 100644 index 0000000..ef5642a --- /dev/null +++ b/source/common/threading.h @@ -0,0 +1,476 @@ +/***************************************************************************** + * x265: threading class and intrinsics + ***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it 
and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com + *****************************************************************************/ + +#ifndef X265_THREADING_H +#define X265_THREADING_H + +#include "common.h" +#include "x265.h" + +#ifdef _WIN32 +#include +#include "winxp.h" // XP workarounds for CONDITION_VARIABLE and ATOMIC_OR +#else +#include +#include +#include +#include +#endif + +#if MACOS +#include +#include +#endif + +#ifdef __GNUC__ /* GCCs builtin atomics */ + +#include +#include + +#define CLZ32(id, x) id = (unsigned long)__builtin_clz(x) ^ 31 +#define CTZ64(id, x) id = (unsigned long)__builtin_ctzll(x) +#define ATOMIC_OR(ptr, mask) __sync_or_and_fetch(ptr, mask) +#define ATOMIC_CAS(ptr, oldval, newval) __sync_val_compare_and_swap(ptr, oldval, newval) +#define ATOMIC_CAS32(ptr, oldval, newval) __sync_val_compare_and_swap(ptr, oldval, newval) +#define ATOMIC_INC(ptr) __sync_add_and_fetch((volatile int32_t*)ptr, 1) +#define ATOMIC_DEC(ptr) __sync_add_and_fetch((volatile int32_t*)ptr, -1) +#define GIVE_UP_TIME() usleep(0) + +#elif defined(_MSC_VER) /* Windows atomic intrinsics */ + +#include + +#if !_WIN64 +inline int _BitScanReverse64(DWORD *id, uint64_t x64) // fake 64bit CLZ +{ + uint32_t high32 = (uint32_t)(x64 >> 
32); + uint32_t low32 = (uint32_t)x64; + + if (high32) + { + _BitScanReverse(id, high32); + *id += 32; + return 1; + } + else if (low32) + return _BitScanReverse(id, low32); + else + return *id = 0; +} + +inline int _BitScanForward64(DWORD *id, uint64_t x64) // fake 64bit CLZ +{ + uint32_t high32 = (uint32_t)(x64 >> 32); + uint32_t low32 = (uint32_t)x64; + + if (high32) + { + _BitScanForward(id, high32); + *id += 32; + return 1; + } + else if (low32) + return _BitScanForward(id, low32); + else + return *id = 0; +} + +#endif // if !_WIN64 + +#ifndef ATOMIC_OR +#define ATOMIC_OR(ptr, mask) InterlockedOr64((volatile LONG64*)ptr, mask) +#endif + +#define CLZ32(id, x) _BitScanReverse(&id, x) +#define CTZ64(id, x) _BitScanForward64(&id, x) +#define ATOMIC_CAS(ptr, oldval, newval) (uint64_t)_InterlockedCompareExchange64((volatile LONG64*)ptr, newval, oldval) +#define ATOMIC_CAS32(ptr, oldval, newval) (uint64_t)_InterlockedCompareExchange((volatile LONG*)ptr, newval, oldval) +#define ATOMIC_INC(ptr) InterlockedIncrement((volatile LONG*)ptr) +#define ATOMIC_DEC(ptr) InterlockedDecrement((volatile LONG*)ptr) +#define GIVE_UP_TIME() Sleep(0) + +#endif // ifdef __GNUC__ + +namespace x265 { +// x265 private namespace + +#ifdef _WIN32 + +typedef HANDLE ThreadHandle; + +class Lock +{ +public: + + Lock() + { + InitializeCriticalSection(&this->handle); + } + + ~Lock() + { + DeleteCriticalSection(&this->handle); + } + + void acquire() + { + EnterCriticalSection(&this->handle); + } + + void release() + { + LeaveCriticalSection(&this->handle); + } + +protected: + + CRITICAL_SECTION handle; +}; + +class Event +{ +public: + + Event() + { + this->handle = CreateEvent(NULL, FALSE, FALSE, NULL); + } + + ~Event() + { + CloseHandle(this->handle); + } + + void wait() + { + WaitForSingleObject(this->handle, INFINITE); + } + + bool timedWait(uint32_t milliseconds) + { + /* returns true if event was signaled */ + return WaitForSingleObject(this->handle, milliseconds) == WAIT_OBJECT_0; + } + + 
void trigger() + { + SetEvent(this->handle); + } + +protected: + + HANDLE handle; +}; + +/* This class is intended for use in signaling state changes safely between CPU + * cores. One thread should be a writer and multiple threads may be readers. The + * mutex's main purpose is to serve as a memory fence to ensure writes made by + * the writer thread are visible prior to readers seeing the m_val change. Its + * secondary purpose is for use with the condition variable for blocking waits */ +class ThreadSafeInteger +{ +public: + + ThreadSafeInteger() + { + m_val = 0; + InitializeCriticalSection(&m_cs); + InitializeConditionVariable(&m_cv); + } + + ~ThreadSafeInteger() + { + DeleteCriticalSection(&m_cs); + XP_CONDITION_VAR_FREE(&m_cv); + } + + int waitForChange(int prev) + { + EnterCriticalSection(&m_cs); + if (m_val == prev) + SleepConditionVariableCS(&m_cv, &m_cs, INFINITE); + LeaveCriticalSection(&m_cs); + return m_val; + } + + int get() + { + EnterCriticalSection(&m_cs); + int ret = m_val; + LeaveCriticalSection(&m_cs); + return ret; + } + + void set(int newval) + { + EnterCriticalSection(&m_cs); + m_val = newval; + WakeAllConditionVariable(&m_cv); + LeaveCriticalSection(&m_cs); + } + + void incr() + { + EnterCriticalSection(&m_cs); + m_val++; + WakeAllConditionVariable(&m_cv); + LeaveCriticalSection(&m_cs); + } + +protected: + + CRITICAL_SECTION m_cs; + CONDITION_VARIABLE m_cv; + int m_val; +}; + +#else /* POSIX / pthreads */ + +typedef pthread_t ThreadHandle; + +class Lock +{ +public: + + Lock() + { + pthread_mutex_init(&this->handle, NULL); + } + + ~Lock() + { + pthread_mutex_destroy(&this->handle); + } + + void acquire() + { + pthread_mutex_lock(&this->handle); + } + + void release() + { + pthread_mutex_unlock(&this->handle); + } + +protected: + + pthread_mutex_t handle; +}; + +class Event +{ +public: + + Event() + { + m_counter = 0; + if (pthread_mutex_init(&m_mutex, NULL) || + pthread_cond_init(&m_cond, NULL)) + { + x265_log(NULL, X265_LOG_ERROR, "fatal: 
unable to initialize conditional variable\n"); + } + } + + ~Event() + { + pthread_cond_destroy(&m_cond); + pthread_mutex_destroy(&m_mutex); + } + + void wait() + { + pthread_mutex_lock(&m_mutex); + + /* blocking wait on conditional variable, mutex is atomically released + * while blocked. When condition is signaled, mutex is re-acquired */ + while (m_counter == 0) + { + pthread_cond_wait(&m_cond, &m_mutex); + } + + m_counter--; + pthread_mutex_unlock(&m_mutex); + } + + bool timedWait(uint32_t waitms) + { + bool bTimedOut = false; + + pthread_mutex_lock(&m_mutex); + if (m_counter == 0) + { + struct timeval tv; + struct timespec ts; + gettimeofday(&tv, NULL); + /* convert current time from (sec, usec) to (sec, nsec) */ + ts.tv_sec = tv.tv_sec; + ts.tv_nsec = tv.tv_usec * 1000; + + ts.tv_nsec += 1000 * 1000 * (waitms % 1000); /* add ms to tv_nsec */ + ts.tv_sec += ts.tv_nsec / (1000 * 1000 * 1000); /* overflow tv_nsec */ + ts.tv_nsec %= (1000 * 1000 * 1000); /* clamp tv_nsec */ + ts.tv_sec += waitms / 1000; /* add seconds */ + + /* blocking wait on conditional variable, mutex is atomically released + * while blocked. When condition is signaled, mutex is re-acquired. + * ts is absolute time to stop waiting */ + bTimedOut = pthread_cond_timedwait(&m_cond, &m_mutex, &ts) == ETIMEDOUT; + } + if (m_counter > 0) + m_counter--; + pthread_mutex_unlock(&m_mutex); + return bTimedOut; + } + + void trigger() + { + pthread_mutex_lock(&m_mutex); + if (m_counter < UINT_MAX) + m_counter++; + /* Signal a single blocking thread */ + pthread_cond_signal(&m_cond); + pthread_mutex_unlock(&m_mutex); + } + +protected: + + pthread_mutex_t m_mutex; + pthread_cond_t m_cond; + uint32_t m_counter; +}; + +/* This class is intended for use in signaling state changes safely between CPU + * cores. One thread should be a writer and multiple threads may be readers. 
The + * mutex's main purpose is to serve as a memory fence to ensure writes made by + * the writer thread are visible prior to readers seeing the m_val change. Its + * secondary purpose is for use with the condition variable for blocking waits */ +class ThreadSafeInteger +{ +public: + + ThreadSafeInteger() + { + m_val = 0; + if (pthread_mutex_init(&m_mutex, NULL) || + pthread_cond_init(&m_cond, NULL)) + { + x265_log(NULL, X265_LOG_ERROR, "fatal: unable to initialize conditional variable\n"); + } + } + + ~ThreadSafeInteger() + { + pthread_cond_destroy(&m_cond); + pthread_mutex_destroy(&m_mutex); + } + + int waitForChange(int prev) + { + pthread_mutex_lock(&m_mutex); + if (m_val == prev) + pthread_cond_wait(&m_cond, &m_mutex); + pthread_mutex_unlock(&m_mutex); + return m_val; + } + + int get() + { + pthread_mutex_lock(&m_mutex); + int ret = m_val; + pthread_mutex_unlock(&m_mutex); + return ret; + } + + void set(int newval) + { + pthread_mutex_lock(&m_mutex); + m_val = newval; + pthread_cond_broadcast(&m_cond); + pthread_mutex_unlock(&m_mutex); + } + + void incr() + { + pthread_mutex_lock(&m_mutex); + m_val++; + pthread_cond_broadcast(&m_cond); + pthread_mutex_unlock(&m_mutex); + } + +protected: + + pthread_mutex_t m_mutex; + pthread_cond_t m_cond; + int m_val; +}; + +#endif // ifdef _WIN32 + +class ScopedLock +{ +public: + + ScopedLock(Lock &instance) : inst(instance) + { + this->inst.acquire(); + } + + ~ScopedLock() + { + this->inst.release(); + } + +protected: + + // do not allow assignments + ScopedLock &operator =(const ScopedLock &); + + Lock &inst; +}; + +//< Simplistic portable thread class. Shutdown signalling left to derived class +class Thread +{ +private: + + ThreadHandle thread; + +public: + + Thread(); + + virtual ~Thread(); + + //< Derived class must implement ThreadMain. 
+ virtual void threadMain() = 0; + + //< Returns true if thread was successfully created + bool start(); + + void stop(); +}; +} // end namespace x265 + +#endif // ifndef X265_THREADING_H diff --git a/source/common/threadpool.cpp b/source/common/threadpool.cpp new file mode 100644 index 0000000..8a2ab9d --- /dev/null +++ b/source/common/threadpool.cpp @@ -0,0 +1,465 @@ +/***************************************************************************** + * x265: singleton thread pool and interface classes + ***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. 
+ * For more information, contact us at license @ x265.com + *****************************************************************************/ + +#include "common.h" +#include "threadpool.h" +#include "threading.h" + +#include + +#if MACOS +#include +#include +#endif + +namespace x265 { +// x265 private namespace + +class ThreadPoolImpl; + +class PoolThread : public Thread +{ +private: + + ThreadPoolImpl &m_pool; + + PoolThread& operator =(const PoolThread&); + + int m_id; + + bool m_dirty; + + bool m_exited; + + Event m_wakeEvent; + +public: + + PoolThread(ThreadPoolImpl& pool, int id) + : m_pool(pool) + , m_id(id) + , m_dirty(false) + , m_exited(false) + { + } + + bool isDirty() const { return m_dirty; } + + void markDirty() { m_dirty = true; } + + bool isExited() const { return m_exited; } + + void poke() { m_wakeEvent.trigger(); } + + virtual ~PoolThread() {} + + void threadMain(); +}; + +class ThreadPoolImpl : public ThreadPool +{ +private: + + bool m_ok; + int m_referenceCount; + int m_numThreads; + int m_numSleepMapWords; + PoolThread *m_threads; + volatile uint64_t *m_sleepMap; + + /* Lock for write access to the provider lists. Threads are + * always allowed to read m_firstProvider and follow the + * linked list. 
Providers must zero their m_nextProvider + * pointers before removing themselves from this list */ + Lock m_writeLock; + +public: + + static ThreadPoolImpl *s_instance; + static Lock s_createLock; + + JobProvider *m_firstProvider; + JobProvider *m_lastProvider; + +public: + + ThreadPoolImpl(int numthreads); + + virtual ~ThreadPoolImpl(); + + ThreadPoolImpl *AddReference() + { + m_referenceCount++; + + return this; + } + + void markThreadAsleep(int id); + + void waitForAllIdle(); + + int getThreadCount() const { return m_numThreads; } + + bool IsValid() const { return m_ok; } + + void release(); + + void Stop(); + + void enqueueJobProvider(JobProvider &); + + void dequeueJobProvider(JobProvider &); + + void FlushProviderList(); + + void pokeIdleThread(); +}; + +void PoolThread::threadMain() +{ +#if _WIN32 + SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_BELOW_NORMAL); +#else + __attribute__((unused)) int val = nice(10); +#endif + + while (m_pool.IsValid()) + { + /* Walk list of job providers, looking for work */ + JobProvider *cur = m_pool.m_firstProvider; + while (cur) + { + // FindJob() may perform actual work and return true. 
If + // it does we restart the job search + if (cur->findJob(m_id) == true) + break; + + cur = cur->m_nextProvider; + } + + // this thread has reached the end of the provider list + m_dirty = false; + + if (cur == NULL) + { + m_pool.markThreadAsleep(m_id); + m_wakeEvent.wait(); + } + } + + m_exited = true; +} + +void ThreadPoolImpl::markThreadAsleep(int id) +{ + int word = id >> 6; + uint64_t bit = 1LL << (id & 63); + + ATOMIC_OR(&m_sleepMap[word], bit); +} + +void ThreadPoolImpl::pokeIdleThread() +{ + /* Find a bit in the sleeping thread bitmap and poke it awake, do + * not give up until a thread is awakened or all of them are awake */ + for (int i = 0; i < m_numSleepMapWords; i++) + { + uint64_t oldval = m_sleepMap[i]; + while (oldval) + { + unsigned long id; + CTZ64(id, oldval); + + uint64_t newval = oldval & ~(1LL << id); + if (ATOMIC_CAS(&m_sleepMap[i], oldval, newval) == oldval) + { + m_threads[(i << 6) | id].poke(); + return; + } + + oldval = m_sleepMap[i]; + } + } +} + +ThreadPoolImpl *ThreadPoolImpl::s_instance; +Lock ThreadPoolImpl::s_createLock; + +/* static */ +ThreadPool *ThreadPool::allocThreadPool(int numthreads) +{ + if (ThreadPoolImpl::s_instance) + return ThreadPoolImpl::s_instance->AddReference(); + + /* acquire the lock to create the instance */ + ThreadPoolImpl::s_createLock.acquire(); + + if (ThreadPoolImpl::s_instance) + /* pool was allocated while we waited for the lock */ + ThreadPoolImpl::s_instance->AddReference(); + else + ThreadPoolImpl::s_instance = new ThreadPoolImpl(numthreads); + ThreadPoolImpl::s_createLock.release(); + + return ThreadPoolImpl::s_instance; +} + +ThreadPool *ThreadPool::getThreadPool() +{ + X265_CHECK(ThreadPoolImpl::s_instance, "getThreadPool() called prior to allocThreadPool()\n"); + return ThreadPoolImpl::s_instance; +} + +void ThreadPoolImpl::release() +{ + if (--m_referenceCount == 0) + { + X265_CHECK(this == ThreadPoolImpl::s_instance, "multiple thread pool instances detected\n"); + ThreadPoolImpl::s_instance 
= NULL; + this->Stop(); + delete this; + } +} + +ThreadPoolImpl::ThreadPoolImpl(int numThreads) + : m_ok(false) + , m_referenceCount(1) + , m_firstProvider(NULL) + , m_lastProvider(NULL) +{ + m_numSleepMapWords = (numThreads + 63) >> 6; + m_sleepMap = X265_MALLOC(uint64_t, m_numSleepMapWords); + + char *buffer = (char*)X265_MALLOC(PoolThread, numThreads); + m_threads = reinterpret_cast(buffer); + m_numThreads = numThreads; + + if (m_threads && m_sleepMap) + { + for (int i = 0; i < m_numSleepMapWords; i++) + { + m_sleepMap[i] = 0; + } + + m_ok = true; + int i; + for (i = 0; i < numThreads; i++) + { + new (buffer)PoolThread(*this, i); + buffer += sizeof(PoolThread); + if (!m_threads[i].start()) + { + m_ok = false; + break; + } + } + + if (m_ok) + { + waitForAllIdle(); + } + else + { + // stop threads that did start up + for (int j = 0; j < i; j++) + { + m_threads[j].poke(); + m_threads[j].stop(); + } + } + } +} + +void ThreadPoolImpl::waitForAllIdle() +{ + if (!m_ok) + return; + + int id = 0; + do + { + int word = id >> 6; + uint64_t bit = 1LL << (id & 63); + if (m_sleepMap[word] & bit) + { + id++; + } + else + { + GIVE_UP_TIME(); + } + } + while (id < m_numThreads); +} + +void ThreadPoolImpl::Stop() +{ + if (m_ok) + { + waitForAllIdle(); + + // set invalid flag, then wake them up so they exit their main func + m_ok = false; + for (int i = 0; i < m_numThreads; i++) + { + m_threads[i].poke(); + m_threads[i].stop(); + } + } +} + +ThreadPoolImpl::~ThreadPoolImpl() +{ + X265_FREE((void*)m_sleepMap); + + if (m_threads) + { + // cleanup thread handles + for (int i = 0; i < m_numThreads; i++) + { + m_threads[i].~PoolThread(); + } + + X265_FREE(reinterpret_cast(m_threads)); + } +} + +void ThreadPoolImpl::enqueueJobProvider(JobProvider &p) +{ + // only one list writer at a time + ScopedLock l(m_writeLock); + + p.m_nextProvider = NULL; + p.m_prevProvider = m_lastProvider; + m_lastProvider = &p; + + if (p.m_prevProvider) + p.m_prevProvider->m_nextProvider = &p; + else + 
m_firstProvider = &p; +} + +void ThreadPoolImpl::dequeueJobProvider(JobProvider &p) +{ + // only one list writer at a time + ScopedLock l(m_writeLock); + + // update pool entry pointers first + if (m_firstProvider == &p) + m_firstProvider = p.m_nextProvider; + + if (m_lastProvider == &p) + m_lastProvider = p.m_prevProvider; + + // extract self from doubly linked lists + if (p.m_nextProvider) + p.m_nextProvider->m_prevProvider = p.m_prevProvider; + + if (p.m_prevProvider) + p.m_prevProvider->m_nextProvider = p.m_nextProvider; + + p.m_nextProvider = NULL; + p.m_prevProvider = NULL; +} + +/* Ensure all threads have made a full pass through the provider list, ensuring + * dequeued providers are safe for deletion. */ +void ThreadPoolImpl::FlushProviderList() +{ + for (int i = 0; i < m_numThreads; i++) + { + m_threads[i].markDirty(); + m_threads[i].poke(); + } + + int i; + do + { + for (i = 0; i < m_numThreads; i++) + { + if (m_threads[i].isDirty()) + { + GIVE_UP_TIME(); + break; + } + } + } + while (i < m_numThreads); +} + +void JobProvider::flush() +{ + if (m_nextProvider || m_prevProvider) + dequeue(); + dynamic_cast(m_pool)->FlushProviderList(); +} + +void JobProvider::enqueue() +{ + // Add this provider to the end of the thread pool's job provider list + X265_CHECK(!m_nextProvider && !m_prevProvider && m_pool, "job provider was already queued\n"); + m_pool->enqueueJobProvider(*this); + m_pool->pokeIdleThread(); +} + +void JobProvider::dequeue() +{ + // Remove this provider from the thread pool's job provider list + m_pool->dequeueJobProvider(*this); + // Ensure no jobs were missed while the provider was being removed + m_pool->pokeIdleThread(); +} + +int getCpuCount() +{ +#if _WIN32 + SYSTEM_INFO sysinfo; + GetSystemInfo(&sysinfo); + return sysinfo.dwNumberOfProcessors; +#elif __unix__ + return sysconf(_SC_NPROCESSORS_ONLN); +#elif MACOS + int nm[2]; + size_t len = 4; + uint32_t count; + + nm[0] = CTL_HW; + nm[1] = HW_AVAILCPU; + sysctl(nm, 2, &count, &len, NULL, 
0); + + if (count < 1) + { + nm[1] = HW_NCPU; + sysctl(nm, 2, &count, &len, NULL, 0); + if (count < 1) + count = 1; + } + + return count; +#else // if _WIN32 + return 2; // default to 2 threads, everywhere else +#endif // if _WIN32 +} +} // end namespace x265 diff --git a/source/common/threadpool.h b/source/common/threadpool.h new file mode 100644 index 0000000..7616670 --- /dev/null +++ b/source/common/threadpool.h @@ -0,0 +1,111 @@ +/***************************************************************************** + * x265: singleton thread pool and interface classes + ***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com + *****************************************************************************/ + +#ifndef X265_THREADPOOL_H +#define X265_THREADPOOL_H + +#include "common.h" + +namespace x265 { +// x265 private namespace + +class ThreadPool; + +int getCpuCount(); + +// Any class that wants to distribute work to the thread pool must +// derive from JobProvider and implement FindJob(). 
+class JobProvider +{ +protected: + + ThreadPool *m_pool; + + JobProvider *m_nextProvider; + JobProvider *m_prevProvider; + +public: + + JobProvider(ThreadPool *p) : m_pool(p), m_nextProvider(0), m_prevProvider(0) {} + + virtual ~JobProvider() {} + + void setThreadPool(ThreadPool *p) { m_pool = p; } + + // Register this job provider with the thread pool, jobs are available + void enqueue(); + + // Remove this job provider from the thread pool, all jobs complete + void dequeue(); + + // Worker threads will call this method to find a job. Must return true if + // work was completed. False if no work was available. + virtual bool findJob(int threadId) = 0; + + // All derived objects that call Enqueue *MUST* call flush before allowing + // their object to be destroyed, otherwise you will see random crashes involving + // partially freed vtables and you will be unhappy + void flush(); + + friend class ThreadPoolImpl; + friend class PoolThread; +}; + +// Abstract interface to ThreadPool. Each encoder instance should call +// AllocThreadPool() to get a handle to the singleton object and then make +// it available to their job provider structures (wave-front frame encoders, +// etc). +class ThreadPool +{ +protected: + + // Destructor is inaccessable, force the use of reference counted Release() + ~ThreadPool() {} + + virtual void enqueueJobProvider(JobProvider &) = 0; + + virtual void dequeueJobProvider(JobProvider &) = 0; + +public: + + // When numthreads == 0, a default thread count is used. A request may grow + // an existing pool but it will never shrink. 
+ static ThreadPool *allocThreadPool(int numthreads = 0); + + static ThreadPool *getThreadPool(); + + virtual void pokeIdleThread() = 0; + + // The pool is reference counted so all calls to AllocThreadPool() should be + // followed by a call to Release() + virtual void release() = 0; + + virtual int getThreadCount() const = 0; + + friend class JobProvider; +}; +} // end namespace x265 + +#endif // ifndef X265_THREADPOOL_H diff --git a/source/common/vec/dct-sse3.cpp b/source/common/vec/dct-sse3.cpp new file mode 100644 index 0000000..c435b52 --- /dev/null +++ b/source/common/vec/dct-sse3.cpp @@ -0,0 +1,1572 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * Mandar Gurav + * Deepthi Devaki Akkoorath + * Mahesh Pittala + * Rajesh Paulraj + * Min Chen + * Praveen Kumar Tiwari + * Nabajit Deka + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. 
+ *****************************************************************************/ + +#include "common.h" +#include "primitives.h" +#include // SSE +#include // SSE3 + +using namespace x265; + +namespace { +#if !HIGH_BIT_DEPTH +ALIGN_VAR_32(static const int16_t, tab_idct_8x8[12][8]) = +{ + { 89, 75, 89, 75, 89, 75, 89, 75 }, + { 50, 18, 50, 18, 50, 18, 50, 18 }, + { 75, -18, 75, -18, 75, -18, 75, -18 }, + { -89, -50, -89, -50, -89, -50, -89, -50 }, + { 50, -89, 50, -89, 50, -89, 50, -89 }, + { 18, 75, 18, 75, 18, 75, 18, 75 }, + { 18, -50, 18, -50, 18, -50, 18, -50 }, + { 75, -89, 75, -89, 75, -89, 75, -89 }, + { 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, -64, 64, -64, 64, -64, 64, -64 }, + { 83, 36, 83, 36, 83, 36, 83, 36 }, + { 36, -83, 36, -83, 36, -83, 36, -83 } +}; +void idct8(int32_t *src, int16_t *dst, intptr_t stride) +{ + __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6, m128iS7, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, m128Tmp3, E0h, E1h, E2h, E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O0l, O1l, O2l, O3l, EE0l, EE1l, E00l, E01l, EE0h, EE1h, E00h, E01h; + __m128i T00, T01, T02, T03, T04, T05, T06, T07; + + m128iAdd = _mm_set1_epi32(64); + + T00 = _mm_load_si128((__m128i*)&src[8 + 0]); + T01 = _mm_load_si128((__m128i*)&src[8 + 4]); + m128iS1 = _mm_packs_epi32(T00, T01); + T00 = _mm_load_si128((__m128i*)&src[24 + 0]); + T01 = _mm_load_si128((__m128i*)&src[24 + 4]); + m128iS3 = _mm_packs_epi32(T00, T01); + m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3); + E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[0]))); + m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3); + E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[0]))); + + T00 = _mm_load_si128((__m128i*)&src[40 + 0]); + T01 = _mm_load_si128((__m128i*)&src[40 + 4]); + m128iS5 = _mm_packs_epi32(T00, T01); + T00 = _mm_load_si128((__m128i*)&src[56 + 0]); + T01 = _mm_load_si128((__m128i*)&src[56 + 4]); + m128iS7 = _mm_packs_epi32(T00, T01); + m128Tmp2 = 
_mm_unpacklo_epi16(m128iS5, m128iS7); + E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[1]))); + m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7); + E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[1]))); + O0l = _mm_add_epi32(E1l, E2l); + O0h = _mm_add_epi32(E1h, E2h); + + E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[2]))); + E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[2]))); + E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[3]))); + E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[3]))); + + O1l = _mm_add_epi32(E1l, E2l); + O1h = _mm_add_epi32(E1h, E2h); + + E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[4]))); + E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[4]))); + E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[5]))); + E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[5]))); + O2l = _mm_add_epi32(E1l, E2l); + O2h = _mm_add_epi32(E1h, E2h); + + E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[6]))); + E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[6]))); + E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[7]))); + E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[7]))); + O3h = _mm_add_epi32(E1h, E2h); + O3l = _mm_add_epi32(E1l, E2l); + + /* ------- */ + + T00 = _mm_load_si128((__m128i*)&src[0 + 0]); + T01 = _mm_load_si128((__m128i*)&src[0 + 4]); + m128iS0 = _mm_packs_epi32(T00, T01); + T00 = _mm_load_si128((__m128i*)&src[32 + 0]); + T01 = _mm_load_si128((__m128i*)&src[32 + 4]); + m128iS4 = _mm_packs_epi32(T00, T01); + m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4); + EE0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[8]))); + m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4); + EE0h = _mm_madd_epi16(m128Tmp1, 
_mm_load_si128((__m128i*)(tab_idct_8x8[8]))); + + EE1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[9]))); + EE1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[9]))); + + /* ------- */ + + T00 = _mm_load_si128((__m128i*)&src[16 + 0]); + T01 = _mm_load_si128((__m128i*)&src[16 + 4]); + m128iS2 = _mm_packs_epi32(T00, T01); + T00 = _mm_load_si128((__m128i*)&src[48 + 0]); + T01 = _mm_load_si128((__m128i*)&src[48 + 4]); + m128iS6 = _mm_packs_epi32(T00, T01); + m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6); + E00l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[10]))); + m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6); + E00h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[10]))); + E01l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[11]))); + E01h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[11]))); + E0l = _mm_add_epi32(EE0l, E00l); + E0l = _mm_add_epi32(E0l, m128iAdd); + E0h = _mm_add_epi32(EE0h, E00h); + E0h = _mm_add_epi32(E0h, m128iAdd); + E3l = _mm_sub_epi32(EE0l, E00l); + E3l = _mm_add_epi32(E3l, m128iAdd); + E3h = _mm_sub_epi32(EE0h, E00h); + E3h = _mm_add_epi32(E3h, m128iAdd); + + E1l = _mm_add_epi32(EE1l, E01l); + E1l = _mm_add_epi32(E1l, m128iAdd); + E1h = _mm_add_epi32(EE1h, E01h); + E1h = _mm_add_epi32(E1h, m128iAdd); + E2l = _mm_sub_epi32(EE1l, E01l); + E2l = _mm_add_epi32(E2l, m128iAdd); + E2h = _mm_sub_epi32(EE1h, E01h); + E2h = _mm_add_epi32(E2h, m128iAdd); + m128iS0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E0l, O0l), 7), _mm_srai_epi32(_mm_add_epi32(E0h, O0h), 7)); + m128iS1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E1l, O1l), 7), _mm_srai_epi32(_mm_add_epi32(E1h, O1h), 7)); + m128iS2 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E2l, O2l), 7), _mm_srai_epi32(_mm_add_epi32(E2h, O2h), 7)); + m128iS3 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E3l, O3l), 7), _mm_srai_epi32(_mm_add_epi32(E3h, O3h), 7)); + m128iS4 = 
_mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E3l, O3l), 7), _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), 7)); + m128iS5 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E2l, O2l), 7), _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), 7)); + m128iS6 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E1l, O1l), 7), _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), 7)); + m128iS7 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E0l, O0l), 7), _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), 7)); + /* Invers matrix */ + + E0l = _mm_unpacklo_epi16(m128iS0, m128iS4); + E1l = _mm_unpacklo_epi16(m128iS1, m128iS5); + E2l = _mm_unpacklo_epi16(m128iS2, m128iS6); + E3l = _mm_unpacklo_epi16(m128iS3, m128iS7); + O0l = _mm_unpackhi_epi16(m128iS0, m128iS4); + O1l = _mm_unpackhi_epi16(m128iS1, m128iS5); + O2l = _mm_unpackhi_epi16(m128iS2, m128iS6); + O3l = _mm_unpackhi_epi16(m128iS3, m128iS7); + m128Tmp0 = _mm_unpacklo_epi16(E0l, E2l); + m128Tmp1 = _mm_unpacklo_epi16(E1l, E3l); + m128iS0 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1); + m128iS1 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1); + m128Tmp2 = _mm_unpackhi_epi16(E0l, E2l); + m128Tmp3 = _mm_unpackhi_epi16(E1l, E3l); + m128iS2 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3); + m128iS3 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3); + m128Tmp0 = _mm_unpacklo_epi16(O0l, O2l); + m128Tmp1 = _mm_unpacklo_epi16(O1l, O3l); + m128iS4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1); + m128iS5 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1); + m128Tmp2 = _mm_unpackhi_epi16(O0l, O2l); + m128Tmp3 = _mm_unpackhi_epi16(O1l, O3l); + m128iS6 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3); + m128iS7 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3); + + m128iAdd = _mm_set1_epi32(2048); + + m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3); + E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[0]))); + m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3); + E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[0]))); + m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7); + E2l = _mm_madd_epi16(m128Tmp2, 
_mm_load_si128((__m128i*)(tab_idct_8x8[1]))); + m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7); + E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[1]))); + O0l = _mm_add_epi32(E1l, E2l); + O0h = _mm_add_epi32(E1h, E2h); + E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[2]))); + E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[2]))); + E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[3]))); + E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[3]))); + O1l = _mm_add_epi32(E1l, E2l); + O1h = _mm_add_epi32(E1h, E2h); + E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[4]))); + E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[4]))); + E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[5]))); + E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[5]))); + O2l = _mm_add_epi32(E1l, E2l); + O2h = _mm_add_epi32(E1h, E2h); + E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[6]))); + E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[6]))); + E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[7]))); + E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[7]))); + O3h = _mm_add_epi32(E1h, E2h); + O3l = _mm_add_epi32(E1l, E2l); + + m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4); + EE0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[8]))); + m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4); + EE0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[8]))); + EE1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[9]))); + EE1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[9]))); + + m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6); + E00l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[10]))); + m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6); + 
E00h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[10]))); + E01l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[11]))); + E01h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[11]))); + E0l = _mm_add_epi32(EE0l, E00l); + E0l = _mm_add_epi32(E0l, m128iAdd); + E0h = _mm_add_epi32(EE0h, E00h); + E0h = _mm_add_epi32(E0h, m128iAdd); + E3l = _mm_sub_epi32(EE0l, E00l); + E3l = _mm_add_epi32(E3l, m128iAdd); + E3h = _mm_sub_epi32(EE0h, E00h); + E3h = _mm_add_epi32(E3h, m128iAdd); + E1l = _mm_add_epi32(EE1l, E01l); + E1l = _mm_add_epi32(E1l, m128iAdd); + E1h = _mm_add_epi32(EE1h, E01h); + E1h = _mm_add_epi32(E1h, m128iAdd); + E2l = _mm_sub_epi32(EE1l, E01l); + E2l = _mm_add_epi32(E2l, m128iAdd); + E2h = _mm_sub_epi32(EE1h, E01h); + E2h = _mm_add_epi32(E2h, m128iAdd); + + m128iS0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E0l, O0l), 12), _mm_srai_epi32(_mm_add_epi32(E0h, O0h), 12)); + m128iS1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E1l, O1l), 12), _mm_srai_epi32(_mm_add_epi32(E1h, O1h), 12)); + m128iS2 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E2l, O2l), 12), _mm_srai_epi32(_mm_add_epi32(E2h, O2h), 12)); + m128iS3 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E3l, O3l), 12), _mm_srai_epi32(_mm_add_epi32(E3h, O3h), 12)); + m128iS4 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E3l, O3l), 12), _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), 12)); + m128iS5 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E2l, O2l), 12), _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), 12)); + m128iS6 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E1l, O1l), 12), _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), 12)); + m128iS7 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E0l, O0l), 12), _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), 12)); + + // [07 06 05 04 03 02 01 00] + // [17 16 15 14 13 12 11 10] + // [27 26 25 24 23 22 21 20] + // [37 36 35 34 33 32 31 30] + // [47 46 45 44 43 42 41 40] + // [57 56 55 54 53 52 51 50] + // [67 66 65 64 63 62 61 60] + 
// [77 76 75 74 73 72 71 70] + + T00 = _mm_unpacklo_epi16(m128iS0, m128iS1); // [13 03 12 02 11 01 10 00] + T01 = _mm_unpackhi_epi16(m128iS0, m128iS1); // [17 07 16 06 15 05 14 04] + T02 = _mm_unpacklo_epi16(m128iS2, m128iS3); // [33 23 32 22 31 21 30 20] + T03 = _mm_unpackhi_epi16(m128iS2, m128iS3); // [37 27 36 26 35 25 34 24] + T04 = _mm_unpacklo_epi16(m128iS4, m128iS5); // [53 43 52 42 51 41 50 40] + T05 = _mm_unpackhi_epi16(m128iS4, m128iS5); // [57 47 56 46 55 45 54 44] + T06 = _mm_unpacklo_epi16(m128iS6, m128iS7); // [73 63 72 62 71 61 70 60] + T07 = _mm_unpackhi_epi16(m128iS6, m128iS7); // [77 67 76 66 75 65 74 64] + + __m128i T10, T11; + T10 = _mm_unpacklo_epi32(T00, T02); // [31 21 11 01 30 20 10 00] + T11 = _mm_unpackhi_epi32(T00, T02); // [33 23 13 03 32 22 12 02] + _mm_storel_epi64((__m128i*)&dst[0 * stride + 0], T10); // [30 20 10 00] + _mm_storeh_pi((__m64*)&dst[1 * stride + 0], _mm_castsi128_ps(T10)); // [31 21 11 01] + _mm_storel_epi64((__m128i*)&dst[2 * stride + 0], T11); // [32 22 12 02] + _mm_storeh_pi((__m64*)&dst[3 * stride + 0], _mm_castsi128_ps(T11)); // [33 23 13 03] + + T10 = _mm_unpacklo_epi32(T04, T06); // [71 61 51 41 70 60 50 40] + T11 = _mm_unpackhi_epi32(T04, T06); // [73 63 53 43 72 62 52 42] + _mm_storel_epi64((__m128i*)&dst[0 * stride + 4], T10); + _mm_storeh_pi((__m64*)&dst[1 * stride + 4], _mm_castsi128_ps(T10)); + _mm_storel_epi64((__m128i*)&dst[2 * stride + 4], T11); + _mm_storeh_pi((__m64*)&dst[3 * stride + 4], _mm_castsi128_ps(T11)); + + T10 = _mm_unpacklo_epi32(T01, T03); // [35 25 15 05 34 24 14 04] + T11 = _mm_unpackhi_epi32(T01, T03); // [37 27 17 07 36 26 16 06] + _mm_storel_epi64((__m128i*)&dst[4 * stride + 0], T10); + _mm_storeh_pi((__m64*)&dst[5 * stride + 0], _mm_castsi128_ps(T10)); + _mm_storel_epi64((__m128i*)&dst[6 * stride + 0], T11); + _mm_storeh_pi((__m64*)&dst[7 * stride + 0], _mm_castsi128_ps(T11)); + + T10 = _mm_unpacklo_epi32(T05, T07); // [75 65 55 45 74 64 54 44] + T11 = _mm_unpackhi_epi32(T05, T07); // 
[77 67 57 47 76 56 46 36] + _mm_storel_epi64((__m128i*)&dst[4 * stride + 4], T10); + _mm_storeh_pi((__m64*)&dst[5 * stride + 4], _mm_castsi128_ps(T10)); + _mm_storel_epi64((__m128i*)&dst[6 * stride + 4], T11); + _mm_storeh_pi((__m64*)&dst[7 * stride + 4], _mm_castsi128_ps(T11)); +} + +void idct16(int32_t *src, int16_t *dst, intptr_t stride) +{ + const __m128i c16_p87_p90 = _mm_set1_epi32(0x0057005A); //row0 87high - 90low address + const __m128i c16_p70_p80 = _mm_set1_epi32(0x00460050); + const __m128i c16_p43_p57 = _mm_set1_epi32(0x002B0039); + const __m128i c16_p09_p25 = _mm_set1_epi32(0x00090019); + const __m128i c16_p57_p87 = _mm_set1_epi32(0x00390057); //row1 + const __m128i c16_n43_p09 = _mm_set1_epi32(0xFFD50009); + const __m128i c16_n90_n80 = _mm_set1_epi32(0xFFA6FFB0); + const __m128i c16_n25_n70 = _mm_set1_epi32(0xFFE7FFBA); + const __m128i c16_p09_p80 = _mm_set1_epi32(0x00090050); //row2 + const __m128i c16_n87_n70 = _mm_set1_epi32(0xFFA9FFBA); + const __m128i c16_p57_n25 = _mm_set1_epi32(0x0039FFE7); + const __m128i c16_p43_p90 = _mm_set1_epi32(0x002B005A); + const __m128i c16_n43_p70 = _mm_set1_epi32(0xFFD50046); //row3 + const __m128i c16_p09_n87 = _mm_set1_epi32(0x0009FFA9); + const __m128i c16_p25_p90 = _mm_set1_epi32(0x0019005A); + const __m128i c16_n57_n80 = _mm_set1_epi32(0xFFC7FFB0); + const __m128i c16_n80_p57 = _mm_set1_epi32(0xFFB00039); //row4 + const __m128i c16_p90_n25 = _mm_set1_epi32(0x005AFFE7); + const __m128i c16_n87_n09 = _mm_set1_epi32(0xFFA9FFF7); + const __m128i c16_p70_p43 = _mm_set1_epi32(0x0046002B); + const __m128i c16_n90_p43 = _mm_set1_epi32(0xFFA6002B); //row5 + const __m128i c16_p25_p57 = _mm_set1_epi32(0x00190039); + const __m128i c16_p70_n87 = _mm_set1_epi32(0x0046FFA9); + const __m128i c16_n80_p09 = _mm_set1_epi32(0xFFB00009); + const __m128i c16_n70_p25 = _mm_set1_epi32(0xFFBA0019); //row6 + const __m128i c16_n80_p90 = _mm_set1_epi32(0xFFB0005A); + const __m128i c16_p09_p43 = _mm_set1_epi32(0x0009002B); + const __m128i 
c16_p87_n57 = _mm_set1_epi32(0x0057FFC7); + const __m128i c16_n25_p09 = _mm_set1_epi32(0xFFE70009); //row7 + const __m128i c16_n57_p43 = _mm_set1_epi32(0xFFC7002B); + const __m128i c16_n80_p70 = _mm_set1_epi32(0xFFB00046); + const __m128i c16_n90_p87 = _mm_set1_epi32(0xFFA60057); + + const __m128i c16_p75_p89 = _mm_set1_epi32(0x004B0059); + const __m128i c16_p18_p50 = _mm_set1_epi32(0x00120032); + const __m128i c16_n18_p75 = _mm_set1_epi32(0xFFEE004B); + const __m128i c16_n50_n89 = _mm_set1_epi32(0xFFCEFFA7); + const __m128i c16_n89_p50 = _mm_set1_epi32(0xFFA70032); + const __m128i c16_p75_p18 = _mm_set1_epi32(0x004B0012); + const __m128i c16_n50_p18 = _mm_set1_epi32(0xFFCE0012); + const __m128i c16_n89_p75 = _mm_set1_epi32(0xFFA7004B); + + const __m128i c16_p36_p83 = _mm_set1_epi32(0x00240053); + const __m128i c16_n83_p36 = _mm_set1_epi32(0xFFAD0024); + + const __m128i c16_n64_p64 = _mm_set1_epi32(0xFFC00040); + const __m128i c16_p64_p64 = _mm_set1_epi32(0x00400040); + __m128i c32_rnd = _mm_set1_epi32(64); + + int nShift = 7; + + // DCT1 + __m128i in00[2], in01[2], in02[2], in03[2], in04[2], in05[2], in06[2], in07[2]; + __m128i in08[2], in09[2], in10[2], in11[2], in12[2], in13[2], in14[2], in15[2]; + __m128i res00[2], res01[2], res02[2], res03[2], res04[2], res05[2], res06[2], res07[2]; + __m128i res08[2], res09[2], res10[2], res11[2], res12[2], res13[2], res14[2], res15[2]; + + for (int i = 0; i < 2; i++) + { + const int offset = (i << 3); + __m128i T00, T01; + + T00 = _mm_loadu_si128((const __m128i*)&src[0 * 16 + offset]); + T01 = _mm_loadu_si128((const __m128i*)&src[0 * 16 + offset + 4]); + in00[i] = _mm_packs_epi32(T00, T01); // [07 06 05 04 03 02 01 00] + + T00 = _mm_loadu_si128((const __m128i*)&src[1 * 16 + offset]); + T01 = _mm_loadu_si128((const __m128i*)&src[1 * 16 + offset + 4]); + in01[i] = _mm_packs_epi32(T00, T01); // [17 16 15 14 13 12 11 10] + + T00 = _mm_loadu_si128((const __m128i*)&src[2 * 16 + offset]); + T01 = _mm_loadu_si128((const 
__m128i*)&src[2 * 16 + offset + 4]); + in02[i] = _mm_packs_epi32(T00, T01); // [27 26 25 24 23 22 21 20] + + T00 = _mm_loadu_si128((const __m128i*)&src[3 * 16 + offset]); + T01 = _mm_loadu_si128((const __m128i*)&src[3 * 16 + offset + 4]); + in03[i] = _mm_packs_epi32(T00, T01); // [37 36 35 34 33 32 31 30] + + T00 = _mm_loadu_si128((const __m128i*)&src[4 * 16 + offset]); + T01 = _mm_loadu_si128((const __m128i*)&src[4 * 16 + offset + 4]); + in04[i] = _mm_packs_epi32(T00, T01); // [47 46 45 44 43 42 41 40] + + T00 = _mm_loadu_si128((const __m128i*)&src[5 * 16 + offset]); + T01 = _mm_loadu_si128((const __m128i*)&src[5 * 16 + offset + 4]); + in05[i] = _mm_packs_epi32(T00, T01); // [57 56 55 54 53 52 51 50] + + T00 = _mm_loadu_si128((const __m128i*)&src[6 * 16 + offset]); + T01 = _mm_loadu_si128((const __m128i*)&src[6 * 16 + offset + 4]); + in06[i] = _mm_packs_epi32(T00, T01); // [67 66 65 64 63 62 61 60] + + T00 = _mm_loadu_si128((const __m128i*)&src[7 * 16 + offset]); + T01 = _mm_loadu_si128((const __m128i*)&src[7 * 16 + offset + 4]); + in07[i] = _mm_packs_epi32(T00, T01); // [77 76 75 74 73 72 71 70] + + T00 = _mm_loadu_si128((const __m128i*)&src[8 * 16 + offset]); + T01 = _mm_loadu_si128((const __m128i*)&src[8 * 16 + offset + 4]); + in08[i] = _mm_packs_epi32(T00, T01); + + T00 = _mm_loadu_si128((const __m128i*)&src[9 * 16 + offset]); + T01 = _mm_loadu_si128((const __m128i*)&src[9 * 16 + offset + 4]); + in09[i] = _mm_packs_epi32(T00, T01); + + T00 = _mm_loadu_si128((const __m128i*)&src[10 * 16 + offset]); + T01 = _mm_loadu_si128((const __m128i*)&src[10 * 16 + offset + 4]); + in10[i] = _mm_packs_epi32(T00, T01); + + T00 = _mm_loadu_si128((const __m128i*)&src[11 * 16 + offset]); + T01 = _mm_loadu_si128((const __m128i*)&src[11 * 16 + offset + 4]); + in11[i] = _mm_packs_epi32(T00, T01); + + T00 = _mm_loadu_si128((const __m128i*)&src[12 * 16 + offset]); + T01 = _mm_loadu_si128((const __m128i*)&src[12 * 16 + offset + 4]); + in12[i] = _mm_packs_epi32(T00, T01); + + T00 = 
_mm_loadu_si128((const __m128i*)&src[13 * 16 + offset]); + T01 = _mm_loadu_si128((const __m128i*)&src[13 * 16 + offset + 4]); + in13[i] = _mm_packs_epi32(T00, T01); + + T00 = _mm_loadu_si128((const __m128i*)&src[14 * 16 + offset]); + T01 = _mm_loadu_si128((const __m128i*)&src[14 * 16 + offset + 4]); + in14[i] = _mm_packs_epi32(T00, T01); + + T00 = _mm_loadu_si128((const __m128i*)&src[15 * 16 + offset]); + T01 = _mm_loadu_si128((const __m128i*)&src[15 * 16 + offset + 4]); + in15[i] = _mm_packs_epi32(T00, T01); + } + + for (int pass = 0; pass < 2; pass++) + { + if (pass == 1) + { + c32_rnd = _mm_set1_epi32(2048); + nShift = 12; + } + + for (int part = 0; part < 2; part++) + { + const __m128i T_00_00A = _mm_unpacklo_epi16(in01[part], in03[part]); // [33 13 32 12 31 11 30 10] + const __m128i T_00_00B = _mm_unpackhi_epi16(in01[part], in03[part]); // [37 17 36 16 35 15 34 14] + const __m128i T_00_01A = _mm_unpacklo_epi16(in05[part], in07[part]); // [ ] + const __m128i T_00_01B = _mm_unpackhi_epi16(in05[part], in07[part]); // [ ] + const __m128i T_00_02A = _mm_unpacklo_epi16(in09[part], in11[part]); // [ ] + const __m128i T_00_02B = _mm_unpackhi_epi16(in09[part], in11[part]); // [ ] + const __m128i T_00_03A = _mm_unpacklo_epi16(in13[part], in15[part]); // [ ] + const __m128i T_00_03B = _mm_unpackhi_epi16(in13[part], in15[part]); // [ ] + const __m128i T_00_04A = _mm_unpacklo_epi16(in02[part], in06[part]); // [ ] + const __m128i T_00_04B = _mm_unpackhi_epi16(in02[part], in06[part]); // [ ] + const __m128i T_00_05A = _mm_unpacklo_epi16(in10[part], in14[part]); // [ ] + const __m128i T_00_05B = _mm_unpackhi_epi16(in10[part], in14[part]); // [ ] + const __m128i T_00_06A = _mm_unpacklo_epi16(in04[part], in12[part]); // [ ]row + const __m128i T_00_06B = _mm_unpackhi_epi16(in04[part], in12[part]); // [ ] + const __m128i T_00_07A = _mm_unpacklo_epi16(in00[part], in08[part]); // [83 03 82 02 81 01 81 00] row08 row00 + const __m128i T_00_07B = _mm_unpackhi_epi16(in00[part], 
in08[part]); // [87 07 86 06 85 05 84 04] + + __m128i O0A, O1A, O2A, O3A, O4A, O5A, O6A, O7A; + __m128i O0B, O1B, O2B, O3B, O4B, O5B, O6B, O7B; + { + __m128i T00, T01; +#define COMPUTE_ROW(row0103, row0507, row0911, row1315, c0103, c0507, c0911, c1315, row) \ + T00 = _mm_add_epi32(_mm_madd_epi16(row0103, c0103), _mm_madd_epi16(row0507, c0507)); \ + T01 = _mm_add_epi32(_mm_madd_epi16(row0911, c0911), _mm_madd_epi16(row1315, c1315)); \ + row = _mm_add_epi32(T00, T01); + + COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p87_p90, c16_p70_p80, c16_p43_p57, c16_p09_p25, O0A) + COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p57_p87, c16_n43_p09, c16_n90_n80, c16_n25_n70, O1A) + COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p09_p80, c16_n87_n70, c16_p57_n25, c16_p43_p90, O2A) + COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n43_p70, c16_p09_n87, c16_p25_p90, c16_n57_n80, O3A) + COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n80_p57, c16_p90_n25, c16_n87_n09, c16_p70_p43, O4A) + COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n90_p43, c16_p25_p57, c16_p70_n87, c16_n80_p09, O5A) + COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n70_p25, c16_n80_p90, c16_p09_p43, c16_p87_n57, O6A) + COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n25_p09, c16_n57_p43, c16_n80_p70, c16_n90_p87, O7A) + + COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_p87_p90, c16_p70_p80, c16_p43_p57, c16_p09_p25, O0B) + COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_p57_p87, c16_n43_p09, c16_n90_n80, c16_n25_n70, O1B) + COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_p09_p80, c16_n87_n70, c16_p57_n25, c16_p43_p90, O2B) + COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n43_p70, c16_p09_n87, c16_p25_p90, c16_n57_n80, O3B) + COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n80_p57, c16_p90_n25, c16_n87_n09, c16_p70_p43, O4B) + COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n90_p43, c16_p25_p57, 
c16_p70_n87, c16_n80_p09, O5B) + COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n70_p25, c16_n80_p90, c16_p09_p43, c16_p87_n57, O6B) + COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n25_p09, c16_n57_p43, c16_n80_p70, c16_n90_p87, O7B) +#undef COMPUTE_ROW + } + + __m128i EO0A, EO1A, EO2A, EO3A; + __m128i EO0B, EO1B, EO2B, EO3B; + EO0A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_p75_p89), _mm_madd_epi16(T_00_05A, c16_p18_p50)); // EO0 + EO0B = _mm_add_epi32(_mm_madd_epi16(T_00_04B, c16_p75_p89), _mm_madd_epi16(T_00_05B, c16_p18_p50)); + EO1A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_n18_p75), _mm_madd_epi16(T_00_05A, c16_n50_n89)); // EO1 + EO1B = _mm_add_epi32(_mm_madd_epi16(T_00_04B, c16_n18_p75), _mm_madd_epi16(T_00_05B, c16_n50_n89)); + EO2A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_n89_p50), _mm_madd_epi16(T_00_05A, c16_p75_p18)); // EO2 + EO2B = _mm_add_epi32(_mm_madd_epi16(T_00_04B, c16_n89_p50), _mm_madd_epi16(T_00_05B, c16_p75_p18)); + EO3A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_n50_p18), _mm_madd_epi16(T_00_05A, c16_n89_p75)); // EO3 + EO3B = _mm_add_epi32(_mm_madd_epi16(T_00_04B, c16_n50_p18), _mm_madd_epi16(T_00_05B, c16_n89_p75)); + + __m128i EEO0A, EEO1A; + __m128i EEO0B, EEO1B; + EEO0A = _mm_madd_epi16(T_00_06A, c16_p36_p83); + EEO0B = _mm_madd_epi16(T_00_06B, c16_p36_p83); + EEO1A = _mm_madd_epi16(T_00_06A, c16_n83_p36); + EEO1B = _mm_madd_epi16(T_00_06B, c16_n83_p36); + + __m128i EEE0A, EEE1A; + __m128i EEE0B, EEE1B; + EEE0A = _mm_madd_epi16(T_00_07A, c16_p64_p64); + EEE0B = _mm_madd_epi16(T_00_07B, c16_p64_p64); + EEE1A = _mm_madd_epi16(T_00_07A, c16_n64_p64); + EEE1B = _mm_madd_epi16(T_00_07B, c16_n64_p64); + + const __m128i EE0A = _mm_add_epi32(EEE0A, EEO0A); // EE0 = EEE0 + EEO0 + const __m128i EE0B = _mm_add_epi32(EEE0B, EEO0B); + const __m128i EE1A = _mm_add_epi32(EEE1A, EEO1A); // EE1 = EEE1 + EEO1 + const __m128i EE1B = _mm_add_epi32(EEE1B, EEO1B); + const __m128i EE3A = _mm_sub_epi32(EEE0A, EEO0A); // EE2 = 
EEE0 - EEO0 + const __m128i EE3B = _mm_sub_epi32(EEE0B, EEO0B); + const __m128i EE2A = _mm_sub_epi32(EEE1A, EEO1A); // EE3 = EEE1 - EEO1 + const __m128i EE2B = _mm_sub_epi32(EEE1B, EEO1B); + + const __m128i E0A = _mm_add_epi32(EE0A, EO0A); // E0 = EE0 + EO0 + const __m128i E0B = _mm_add_epi32(EE0B, EO0B); + const __m128i E1A = _mm_add_epi32(EE1A, EO1A); // E1 = EE1 + EO1 + const __m128i E1B = _mm_add_epi32(EE1B, EO1B); + const __m128i E2A = _mm_add_epi32(EE2A, EO2A); // E2 = EE2 + EO2 + const __m128i E2B = _mm_add_epi32(EE2B, EO2B); + const __m128i E3A = _mm_add_epi32(EE3A, EO3A); // E3 = EE3 + EO3 + const __m128i E3B = _mm_add_epi32(EE3B, EO3B); + const __m128i E7A = _mm_sub_epi32(EE0A, EO0A); // E0 = EE0 - EO0 + const __m128i E7B = _mm_sub_epi32(EE0B, EO0B); + const __m128i E6A = _mm_sub_epi32(EE1A, EO1A); // E1 = EE1 - EO1 + const __m128i E6B = _mm_sub_epi32(EE1B, EO1B); + const __m128i E5A = _mm_sub_epi32(EE2A, EO2A); // E2 = EE2 - EO2 + const __m128i E5B = _mm_sub_epi32(EE2B, EO2B); + const __m128i E4A = _mm_sub_epi32(EE3A, EO3A); // E3 = EE3 - EO3 + const __m128i E4B = _mm_sub_epi32(EE3B, EO3B); + + const __m128i T10A = _mm_add_epi32(E0A, c32_rnd); // E0 + rnd + const __m128i T10B = _mm_add_epi32(E0B, c32_rnd); + const __m128i T11A = _mm_add_epi32(E1A, c32_rnd); // E1 + rnd + const __m128i T11B = _mm_add_epi32(E1B, c32_rnd); + const __m128i T12A = _mm_add_epi32(E2A, c32_rnd); // E2 + rnd + const __m128i T12B = _mm_add_epi32(E2B, c32_rnd); + const __m128i T13A = _mm_add_epi32(E3A, c32_rnd); // E3 + rnd + const __m128i T13B = _mm_add_epi32(E3B, c32_rnd); + const __m128i T14A = _mm_add_epi32(E4A, c32_rnd); // E4 + rnd + const __m128i T14B = _mm_add_epi32(E4B, c32_rnd); + const __m128i T15A = _mm_add_epi32(E5A, c32_rnd); // E5 + rnd + const __m128i T15B = _mm_add_epi32(E5B, c32_rnd); + const __m128i T16A = _mm_add_epi32(E6A, c32_rnd); // E6 + rnd + const __m128i T16B = _mm_add_epi32(E6B, c32_rnd); + const __m128i T17A = _mm_add_epi32(E7A, c32_rnd); // E7 + rnd + 
const __m128i T17B = _mm_add_epi32(E7B, c32_rnd); + + const __m128i T20A = _mm_add_epi32(T10A, O0A); // E0 + O0 + rnd + const __m128i T20B = _mm_add_epi32(T10B, O0B); + const __m128i T21A = _mm_add_epi32(T11A, O1A); // E1 + O1 + rnd + const __m128i T21B = _mm_add_epi32(T11B, O1B); + const __m128i T22A = _mm_add_epi32(T12A, O2A); // E2 + O2 + rnd + const __m128i T22B = _mm_add_epi32(T12B, O2B); + const __m128i T23A = _mm_add_epi32(T13A, O3A); // E3 + O3 + rnd + const __m128i T23B = _mm_add_epi32(T13B, O3B); + const __m128i T24A = _mm_add_epi32(T14A, O4A); // E4 + const __m128i T24B = _mm_add_epi32(T14B, O4B); + const __m128i T25A = _mm_add_epi32(T15A, O5A); // E5 + const __m128i T25B = _mm_add_epi32(T15B, O5B); + const __m128i T26A = _mm_add_epi32(T16A, O6A); // E6 + const __m128i T26B = _mm_add_epi32(T16B, O6B); + const __m128i T27A = _mm_add_epi32(T17A, O7A); // E7 + const __m128i T27B = _mm_add_epi32(T17B, O7B); + const __m128i T2FA = _mm_sub_epi32(T10A, O0A); // E0 - O0 + rnd + const __m128i T2FB = _mm_sub_epi32(T10B, O0B); + const __m128i T2EA = _mm_sub_epi32(T11A, O1A); // E1 - O1 + rnd + const __m128i T2EB = _mm_sub_epi32(T11B, O1B); + const __m128i T2DA = _mm_sub_epi32(T12A, O2A); // E2 - O2 + rnd + const __m128i T2DB = _mm_sub_epi32(T12B, O2B); + const __m128i T2CA = _mm_sub_epi32(T13A, O3A); // E3 - O3 + rnd + const __m128i T2CB = _mm_sub_epi32(T13B, O3B); + const __m128i T2BA = _mm_sub_epi32(T14A, O4A); // E4 + const __m128i T2BB = _mm_sub_epi32(T14B, O4B); + const __m128i T2AA = _mm_sub_epi32(T15A, O5A); // E5 + const __m128i T2AB = _mm_sub_epi32(T15B, O5B); + const __m128i T29A = _mm_sub_epi32(T16A, O6A); // E6 + const __m128i T29B = _mm_sub_epi32(T16B, O6B); + const __m128i T28A = _mm_sub_epi32(T17A, O7A); // E7 + const __m128i T28B = _mm_sub_epi32(T17B, O7B); + + const __m128i T30A = _mm_srai_epi32(T20A, nShift); // [30 20 10 00] + const __m128i T30B = _mm_srai_epi32(T20B, nShift); // [70 60 50 40] + const __m128i T31A = _mm_srai_epi32(T21A, nShift); 
// [31 21 11 01] + const __m128i T31B = _mm_srai_epi32(T21B, nShift); // [71 61 51 41] + const __m128i T32A = _mm_srai_epi32(T22A, nShift); // [32 22 12 02] + const __m128i T32B = _mm_srai_epi32(T22B, nShift); // [72 62 52 42] + const __m128i T33A = _mm_srai_epi32(T23A, nShift); // [33 23 13 03] + const __m128i T33B = _mm_srai_epi32(T23B, nShift); // [73 63 53 43] + const __m128i T34A = _mm_srai_epi32(T24A, nShift); // [33 24 14 04] + const __m128i T34B = _mm_srai_epi32(T24B, nShift); // [74 64 54 44] + const __m128i T35A = _mm_srai_epi32(T25A, nShift); // [35 25 15 05] + const __m128i T35B = _mm_srai_epi32(T25B, nShift); // [75 65 55 45] + const __m128i T36A = _mm_srai_epi32(T26A, nShift); // [36 26 16 06] + const __m128i T36B = _mm_srai_epi32(T26B, nShift); // [76 66 56 46] + const __m128i T37A = _mm_srai_epi32(T27A, nShift); // [37 27 17 07] + const __m128i T37B = _mm_srai_epi32(T27B, nShift); // [77 67 57 47] + + const __m128i T38A = _mm_srai_epi32(T28A, nShift); // [30 20 10 00] x8 + const __m128i T38B = _mm_srai_epi32(T28B, nShift); // [70 60 50 40] + const __m128i T39A = _mm_srai_epi32(T29A, nShift); // [31 21 11 01] x9 + const __m128i T39B = _mm_srai_epi32(T29B, nShift); // [71 61 51 41] + const __m128i T3AA = _mm_srai_epi32(T2AA, nShift); // [32 22 12 02] xA + const __m128i T3AB = _mm_srai_epi32(T2AB, nShift); // [72 62 52 42] + const __m128i T3BA = _mm_srai_epi32(T2BA, nShift); // [33 23 13 03] xB + const __m128i T3BB = _mm_srai_epi32(T2BB, nShift); // [73 63 53 43] + const __m128i T3CA = _mm_srai_epi32(T2CA, nShift); // [33 24 14 04] xC + const __m128i T3CB = _mm_srai_epi32(T2CB, nShift); // [74 64 54 44] + const __m128i T3DA = _mm_srai_epi32(T2DA, nShift); // [35 25 15 05] xD + const __m128i T3DB = _mm_srai_epi32(T2DB, nShift); // [75 65 55 45] + const __m128i T3EA = _mm_srai_epi32(T2EA, nShift); // [36 26 16 06] xE + const __m128i T3EB = _mm_srai_epi32(T2EB, nShift); // [76 66 56 46] + const __m128i T3FA = _mm_srai_epi32(T2FA, nShift); // [37 27 17 07] 
xF + const __m128i T3FB = _mm_srai_epi32(T2FB, nShift); // [77 67 57 47] + + res00[part] = _mm_packs_epi32(T30A, T30B); // [70 60 50 40 30 20 10 00] + res01[part] = _mm_packs_epi32(T31A, T31B); // [71 61 51 41 31 21 11 01] + res02[part] = _mm_packs_epi32(T32A, T32B); // [72 62 52 42 32 22 12 02] + res03[part] = _mm_packs_epi32(T33A, T33B); // [73 63 53 43 33 23 13 03] + res04[part] = _mm_packs_epi32(T34A, T34B); // [74 64 54 44 34 24 14 04] + res05[part] = _mm_packs_epi32(T35A, T35B); // [75 65 55 45 35 25 15 05] + res06[part] = _mm_packs_epi32(T36A, T36B); // [76 66 56 46 36 26 16 06] + res07[part] = _mm_packs_epi32(T37A, T37B); // [77 67 57 47 37 27 17 07] + + res08[part] = _mm_packs_epi32(T38A, T38B); // [A0 ... 80] + res09[part] = _mm_packs_epi32(T39A, T39B); // [A1 ... 81] + res10[part] = _mm_packs_epi32(T3AA, T3AB); // [A2 ... 82] + res11[part] = _mm_packs_epi32(T3BA, T3BB); // [A3 ... 83] + res12[part] = _mm_packs_epi32(T3CA, T3CB); // [A4 ... 84] + res13[part] = _mm_packs_epi32(T3DA, T3DB); // [A5 ... 85] + res14[part] = _mm_packs_epi32(T3EA, T3EB); // [A6 ... 86] + res15[part] = _mm_packs_epi32(T3FA, T3FB); // [A7 ... 87] + } + //transpose matrix 8x8 16bit. 
+ { + __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7; + __m128i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7; +#define TRANSPOSE_8x8_16BIT(I0, I1, I2, I3, I4, I5, I6, I7, O0, O1, O2, O3, O4, O5, O6, O7) \ + tr0_0 = _mm_unpacklo_epi16(I0, I1); \ + tr0_1 = _mm_unpacklo_epi16(I2, I3); \ + tr0_2 = _mm_unpackhi_epi16(I0, I1); \ + tr0_3 = _mm_unpackhi_epi16(I2, I3); \ + tr0_4 = _mm_unpacklo_epi16(I4, I5); \ + tr0_5 = _mm_unpacklo_epi16(I6, I7); \ + tr0_6 = _mm_unpackhi_epi16(I4, I5); \ + tr0_7 = _mm_unpackhi_epi16(I6, I7); \ + tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ + tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \ + tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ + tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \ + tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ + tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \ + tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ + tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \ + O0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ + O1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ + O2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ + O3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ + O4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \ + O5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \ + O6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \ + O7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \ + + TRANSPOSE_8x8_16BIT(res00[0], res01[0], res02[0], res03[0], res04[0], res05[0], res06[0], res07[0], in00[0], in01[0], in02[0], in03[0], in04[0], in05[0], in06[0], in07[0]) + TRANSPOSE_8x8_16BIT(res08[0], res09[0], res10[0], res11[0], res12[0], res13[0], res14[0], res15[0], in00[1], in01[1], in02[1], in03[1], in04[1], in05[1], in06[1], in07[1]) + TRANSPOSE_8x8_16BIT(res00[1], res01[1], res02[1], res03[1], res04[1], res05[1], res06[1], res07[1], in08[0], in09[0], in10[0], in11[0], in12[0], in13[0], in14[0], in15[0]) + TRANSPOSE_8x8_16BIT(res08[1], res09[1], res10[1], res11[1], res12[1], res13[1], res14[1], res15[1], in08[1], in09[1], in10[1], in11[1], in12[1], in13[1], in14[1], in15[1]) + +#undef TRANSPOSE_8x8_16BIT + } + } 
+ + _mm_store_si128((__m128i*)&dst[0 * stride + 0], in00[0]); + _mm_store_si128((__m128i*)&dst[0 * stride + 8], in00[1]); + _mm_store_si128((__m128i*)&dst[1 * stride + 0], in01[0]); + _mm_store_si128((__m128i*)&dst[1 * stride + 8], in01[1]); + _mm_store_si128((__m128i*)&dst[2 * stride + 0], in02[0]); + _mm_store_si128((__m128i*)&dst[2 * stride + 8], in02[1]); + _mm_store_si128((__m128i*)&dst[3 * stride + 0], in03[0]); + _mm_store_si128((__m128i*)&dst[3 * stride + 8], in03[1]); + _mm_store_si128((__m128i*)&dst[4 * stride + 0], in04[0]); + _mm_store_si128((__m128i*)&dst[4 * stride + 8], in04[1]); + _mm_store_si128((__m128i*)&dst[5 * stride + 0], in05[0]); + _mm_store_si128((__m128i*)&dst[5 * stride + 8], in05[1]); + _mm_store_si128((__m128i*)&dst[6 * stride + 0], in06[0]); + _mm_store_si128((__m128i*)&dst[6 * stride + 8], in06[1]); + _mm_store_si128((__m128i*)&dst[7 * stride + 0], in07[0]); + _mm_store_si128((__m128i*)&dst[7 * stride + 8], in07[1]); + _mm_store_si128((__m128i*)&dst[8 * stride + 0], in08[0]); + _mm_store_si128((__m128i*)&dst[8 * stride + 8], in08[1]); + _mm_store_si128((__m128i*)&dst[9 * stride + 0], in09[0]); + _mm_store_si128((__m128i*)&dst[9 * stride + 8], in09[1]); + _mm_store_si128((__m128i*)&dst[10 * stride + 0], in10[0]); + _mm_store_si128((__m128i*)&dst[10 * stride + 8], in10[1]); + _mm_store_si128((__m128i*)&dst[11 * stride + 0], in11[0]); + _mm_store_si128((__m128i*)&dst[11 * stride + 8], in11[1]); + _mm_store_si128((__m128i*)&dst[12 * stride + 0], in12[0]); + _mm_store_si128((__m128i*)&dst[12 * stride + 8], in12[1]); + _mm_store_si128((__m128i*)&dst[13 * stride + 0], in13[0]); + _mm_store_si128((__m128i*)&dst[13 * stride + 8], in13[1]); + _mm_store_si128((__m128i*)&dst[14 * stride + 0], in14[0]); + _mm_store_si128((__m128i*)&dst[14 * stride + 8], in14[1]); + _mm_store_si128((__m128i*)&dst[15 * stride + 0], in15[0]); + _mm_store_si128((__m128i*)&dst[15 * stride + 8], in15[1]); +} + +void idct32(int32_t *src, int16_t *dst, intptr_t stride) +{ 
+ //Odd + const __m128i c16_p90_p90 = _mm_set1_epi32(0x005A005A); //column 0 + const __m128i c16_p85_p88 = _mm_set1_epi32(0x00550058); + const __m128i c16_p78_p82 = _mm_set1_epi32(0x004E0052); + const __m128i c16_p67_p73 = _mm_set1_epi32(0x00430049); + const __m128i c16_p54_p61 = _mm_set1_epi32(0x0036003D); + const __m128i c16_p38_p46 = _mm_set1_epi32(0x0026002E); + const __m128i c16_p22_p31 = _mm_set1_epi32(0x0016001F); + const __m128i c16_p04_p13 = _mm_set1_epi32(0x0004000D); + const __m128i c16_p82_p90 = _mm_set1_epi32(0x0052005A); //column 1 + const __m128i c16_p46_p67 = _mm_set1_epi32(0x002E0043); + const __m128i c16_n04_p22 = _mm_set1_epi32(0xFFFC0016); + const __m128i c16_n54_n31 = _mm_set1_epi32(0xFFCAFFE1); + const __m128i c16_n85_n73 = _mm_set1_epi32(0xFFABFFB7); + const __m128i c16_n88_n90 = _mm_set1_epi32(0xFFA8FFA6); + const __m128i c16_n61_n78 = _mm_set1_epi32(0xFFC3FFB2); + const __m128i c16_n13_n38 = _mm_set1_epi32(0xFFF3FFDA); + const __m128i c16_p67_p88 = _mm_set1_epi32(0x00430058); //column 2 + const __m128i c16_n13_p31 = _mm_set1_epi32(0xFFF3001F); + const __m128i c16_n82_n54 = _mm_set1_epi32(0xFFAEFFCA); + const __m128i c16_n78_n90 = _mm_set1_epi32(0xFFB2FFA6); + const __m128i c16_n04_n46 = _mm_set1_epi32(0xFFFCFFD2); + const __m128i c16_p73_p38 = _mm_set1_epi32(0x00490026); + const __m128i c16_p85_p90 = _mm_set1_epi32(0x0055005A); + const __m128i c16_p22_p61 = _mm_set1_epi32(0x0016003D); + const __m128i c16_p46_p85 = _mm_set1_epi32(0x002E0055); //column 3 + const __m128i c16_n67_n13 = _mm_set1_epi32(0xFFBDFFF3); + const __m128i c16_n73_n90 = _mm_set1_epi32(0xFFB7FFA6); + const __m128i c16_p38_n22 = _mm_set1_epi32(0x0026FFEA); + const __m128i c16_p88_p82 = _mm_set1_epi32(0x00580052); + const __m128i c16_n04_p54 = _mm_set1_epi32(0xFFFC0036); + const __m128i c16_n90_n61 = _mm_set1_epi32(0xFFA6FFC3); + const __m128i c16_n31_n78 = _mm_set1_epi32(0xFFE1FFB2); + const __m128i c16_p22_p82 = _mm_set1_epi32(0x00160052); //column 4 + const __m128i 
c16_n90_n54 = _mm_set1_epi32(0xFFA6FFCA); + const __m128i c16_p13_n61 = _mm_set1_epi32(0x000DFFC3); + const __m128i c16_p85_p78 = _mm_set1_epi32(0x0055004E); + const __m128i c16_n46_p31 = _mm_set1_epi32(0xFFD2001F); + const __m128i c16_n67_n90 = _mm_set1_epi32(0xFFBDFFA6); + const __m128i c16_p73_p04 = _mm_set1_epi32(0x00490004); + const __m128i c16_p38_p88 = _mm_set1_epi32(0x00260058); + const __m128i c16_n04_p78 = _mm_set1_epi32(0xFFFC004E); //column 5 + const __m128i c16_n73_n82 = _mm_set1_epi32(0xFFB7FFAE); + const __m128i c16_p85_p13 = _mm_set1_epi32(0x0055000D); + const __m128i c16_n22_p67 = _mm_set1_epi32(0xFFEA0043); + const __m128i c16_n61_n88 = _mm_set1_epi32(0xFFC3FFA8); + const __m128i c16_p90_p31 = _mm_set1_epi32(0x005A001F); + const __m128i c16_n38_p54 = _mm_set1_epi32(0xFFDA0036); + const __m128i c16_n46_n90 = _mm_set1_epi32(0xFFD2FFA6); + const __m128i c16_n31_p73 = _mm_set1_epi32(0xFFE10049); //column 6 + const __m128i c16_n22_n90 = _mm_set1_epi32(0xFFEAFFA6); + const __m128i c16_p67_p78 = _mm_set1_epi32(0x0043004E); + const __m128i c16_n90_n38 = _mm_set1_epi32(0xFFA6FFDA); + const __m128i c16_p82_n13 = _mm_set1_epi32(0x0052FFF3); + const __m128i c16_n46_p61 = _mm_set1_epi32(0xFFD2003D); + const __m128i c16_n04_n88 = _mm_set1_epi32(0xFFFCFFA8); + const __m128i c16_p54_p85 = _mm_set1_epi32(0x00360055); + const __m128i c16_n54_p67 = _mm_set1_epi32(0xFFCA0043); //column 7 + const __m128i c16_p38_n78 = _mm_set1_epi32(0x0026FFB2); + const __m128i c16_n22_p85 = _mm_set1_epi32(0xFFEA0055); + const __m128i c16_p04_n90 = _mm_set1_epi32(0x0004FFA6); + const __m128i c16_p13_p90 = _mm_set1_epi32(0x000D005A); + const __m128i c16_n31_n88 = _mm_set1_epi32(0xFFE1FFA8); + const __m128i c16_p46_p82 = _mm_set1_epi32(0x002E0052); + const __m128i c16_n61_n73 = _mm_set1_epi32(0xFFC3FFB7); + const __m128i c16_n73_p61 = _mm_set1_epi32(0xFFB7003D); //column 8 + const __m128i c16_p82_n46 = _mm_set1_epi32(0x0052FFD2); + const __m128i c16_n88_p31 = _mm_set1_epi32(0xFFA8001F); 
+ const __m128i c16_p90_n13 = _mm_set1_epi32(0x005AFFF3); + const __m128i c16_n90_n04 = _mm_set1_epi32(0xFFA6FFFC); + const __m128i c16_p85_p22 = _mm_set1_epi32(0x00550016); + const __m128i c16_n78_n38 = _mm_set1_epi32(0xFFB2FFDA); + const __m128i c16_p67_p54 = _mm_set1_epi32(0x00430036); + const __m128i c16_n85_p54 = _mm_set1_epi32(0xFFAB0036); //column 9 + const __m128i c16_p88_n04 = _mm_set1_epi32(0x0058FFFC); + const __m128i c16_n61_n46 = _mm_set1_epi32(0xFFC3FFD2); + const __m128i c16_p13_p82 = _mm_set1_epi32(0x000D0052); + const __m128i c16_p38_n90 = _mm_set1_epi32(0x0026FFA6); + const __m128i c16_n78_p67 = _mm_set1_epi32(0xFFB20043); + const __m128i c16_p90_n22 = _mm_set1_epi32(0x005AFFEA); + const __m128i c16_n73_n31 = _mm_set1_epi32(0xFFB7FFE1); + const __m128i c16_n90_p46 = _mm_set1_epi32(0xFFA6002E); //column 10 + const __m128i c16_p54_p38 = _mm_set1_epi32(0x00360026); + const __m128i c16_p31_n90 = _mm_set1_epi32(0x001FFFA6); + const __m128i c16_n88_p61 = _mm_set1_epi32(0xFFA8003D); + const __m128i c16_p67_p22 = _mm_set1_epi32(0x00430016); + const __m128i c16_p13_n85 = _mm_set1_epi32(0x000DFFAB); + const __m128i c16_n82_p73 = _mm_set1_epi32(0xFFAE0049); + const __m128i c16_p78_p04 = _mm_set1_epi32(0x004E0004); + const __m128i c16_n88_p38 = _mm_set1_epi32(0xFFA80026); //column 11 + const __m128i c16_n04_p73 = _mm_set1_epi32(0xFFFC0049); + const __m128i c16_p90_n67 = _mm_set1_epi32(0x005AFFBD); + const __m128i c16_n31_n46 = _mm_set1_epi32(0xFFE1FFD2); + const __m128i c16_n78_p85 = _mm_set1_epi32(0xFFB20055); + const __m128i c16_p61_p13 = _mm_set1_epi32(0x003D000D); + const __m128i c16_p54_n90 = _mm_set1_epi32(0x0036FFA6); + const __m128i c16_n82_p22 = _mm_set1_epi32(0xFFAE0016); + const __m128i c16_n78_p31 = _mm_set1_epi32(0xFFB2001F); //column 12 + const __m128i c16_n61_p90 = _mm_set1_epi32(0xFFC3005A); + const __m128i c16_p54_p04 = _mm_set1_epi32(0x00360004); + const __m128i c16_p82_n88 = _mm_set1_epi32(0x0052FFA8); + const __m128i c16_n22_n38 = 
_mm_set1_epi32(0xFFEAFFDA); + const __m128i c16_n90_p73 = _mm_set1_epi32(0xFFA60049); + const __m128i c16_n13_p67 = _mm_set1_epi32(0xFFF30043); + const __m128i c16_p85_n46 = _mm_set1_epi32(0x0055FFD2); + const __m128i c16_n61_p22 = _mm_set1_epi32(0xFFC30016); //column 13 + const __m128i c16_n90_p85 = _mm_set1_epi32(0xFFA60055); + const __m128i c16_n38_p73 = _mm_set1_epi32(0xFFDA0049); + const __m128i c16_p46_n04 = _mm_set1_epi32(0x002EFFFC); + const __m128i c16_p90_n78 = _mm_set1_epi32(0x005AFFB2); + const __m128i c16_p54_n82 = _mm_set1_epi32(0x0036FFAE); + const __m128i c16_n31_n13 = _mm_set1_epi32(0xFFE1FFF3); + const __m128i c16_n88_p67 = _mm_set1_epi32(0xFFA80043); + const __m128i c16_n38_p13 = _mm_set1_epi32(0xFFDA000D); //column 14 + const __m128i c16_n78_p61 = _mm_set1_epi32(0xFFB2003D); + const __m128i c16_n90_p88 = _mm_set1_epi32(0xFFA60058); + const __m128i c16_n73_p85 = _mm_set1_epi32(0xFFB70055); + const __m128i c16_n31_p54 = _mm_set1_epi32(0xFFE10036); + const __m128i c16_p22_p04 = _mm_set1_epi32(0x00160004); + const __m128i c16_p67_n46 = _mm_set1_epi32(0x0043FFD2); + const __m128i c16_p90_n82 = _mm_set1_epi32(0x005AFFAE); + const __m128i c16_n13_p04 = _mm_set1_epi32(0xFFF30004); //column 15 + const __m128i c16_n31_p22 = _mm_set1_epi32(0xFFE10016); + const __m128i c16_n46_p38 = _mm_set1_epi32(0xFFD20026); + const __m128i c16_n61_p54 = _mm_set1_epi32(0xFFC30036); + const __m128i c16_n73_p67 = _mm_set1_epi32(0xFFB70043); + const __m128i c16_n82_p78 = _mm_set1_epi32(0xFFAE004E); + const __m128i c16_n88_p85 = _mm_set1_epi32(0xFFA80055); + const __m128i c16_n90_p90 = _mm_set1_epi32(0xFFA6005A); + + //EO + const __m128i c16_p87_p90 = _mm_set1_epi32(0x0057005A); //row0 87high - 90low address + const __m128i c16_p70_p80 = _mm_set1_epi32(0x00460050); + const __m128i c16_p43_p57 = _mm_set1_epi32(0x002B0039); + const __m128i c16_p09_p25 = _mm_set1_epi32(0x00090019); + const __m128i c16_p57_p87 = _mm_set1_epi32(0x00390057); //row1 + const __m128i c16_n43_p09 = 
_mm_set1_epi32(0xFFD50009); + const __m128i c16_n90_n80 = _mm_set1_epi32(0xFFA6FFB0); + const __m128i c16_n25_n70 = _mm_set1_epi32(0xFFE7FFBA); + const __m128i c16_p09_p80 = _mm_set1_epi32(0x00090050); //row2 + const __m128i c16_n87_n70 = _mm_set1_epi32(0xFFA9FFBA); + const __m128i c16_p57_n25 = _mm_set1_epi32(0x0039FFE7); + const __m128i c16_p43_p90 = _mm_set1_epi32(0x002B005A); + const __m128i c16_n43_p70 = _mm_set1_epi32(0xFFD50046); //row3 + const __m128i c16_p09_n87 = _mm_set1_epi32(0x0009FFA9); + const __m128i c16_p25_p90 = _mm_set1_epi32(0x0019005A); + const __m128i c16_n57_n80 = _mm_set1_epi32(0xFFC7FFB0); + const __m128i c16_n80_p57 = _mm_set1_epi32(0xFFB00039); //row4 + const __m128i c16_p90_n25 = _mm_set1_epi32(0x005AFFE7); + const __m128i c16_n87_n09 = _mm_set1_epi32(0xFFA9FFF7); + const __m128i c16_p70_p43 = _mm_set1_epi32(0x0046002B); + const __m128i c16_n90_p43 = _mm_set1_epi32(0xFFA6002B); //row5 + const __m128i c16_p25_p57 = _mm_set1_epi32(0x00190039); + const __m128i c16_p70_n87 = _mm_set1_epi32(0x0046FFA9); + const __m128i c16_n80_p09 = _mm_set1_epi32(0xFFB00009); + const __m128i c16_n70_p25 = _mm_set1_epi32(0xFFBA0019); //row6 + const __m128i c16_n80_p90 = _mm_set1_epi32(0xFFB0005A); + const __m128i c16_p09_p43 = _mm_set1_epi32(0x0009002B); + const __m128i c16_p87_n57 = _mm_set1_epi32(0x0057FFC7); + const __m128i c16_n25_p09 = _mm_set1_epi32(0xFFE70009); //row7 + const __m128i c16_n57_p43 = _mm_set1_epi32(0xFFC7002B); + const __m128i c16_n80_p70 = _mm_set1_epi32(0xFFB00046); + const __m128i c16_n90_p87 = _mm_set1_epi32(0xFFA60057); + //EEO + const __m128i c16_p75_p89 = _mm_set1_epi32(0x004B0059); + const __m128i c16_p18_p50 = _mm_set1_epi32(0x00120032); + const __m128i c16_n18_p75 = _mm_set1_epi32(0xFFEE004B); + const __m128i c16_n50_n89 = _mm_set1_epi32(0xFFCEFFA7); + const __m128i c16_n89_p50 = _mm_set1_epi32(0xFFA70032); + const __m128i c16_p75_p18 = _mm_set1_epi32(0x004B0012); + const __m128i c16_n50_p18 = _mm_set1_epi32(0xFFCE0012); + const 
__m128i c16_n89_p75 = _mm_set1_epi32(0xFFA7004B); + //EEEO + const __m128i c16_p36_p83 = _mm_set1_epi32(0x00240053); + const __m128i c16_n83_p36 = _mm_set1_epi32(0xFFAD0024); + //EEEE + const __m128i c16_n64_p64 = _mm_set1_epi32(0xFFC00040); + const __m128i c16_p64_p64 = _mm_set1_epi32(0x00400040); + __m128i c32_rnd = _mm_set1_epi32(64); + + int nShift = 7; + + // DCT1 + __m128i in00[4], in01[4], in02[4], in03[4], in04[4], in05[4], in06[4], in07[4], in08[4], in09[4], in10[4], in11[4], in12[4], in13[4], in14[4], in15[4]; + __m128i in16[4], in17[4], in18[4], in19[4], in20[4], in21[4], in22[4], in23[4], in24[4], in25[4], in26[4], in27[4], in28[4], in29[4], in30[4], in31[4]; + __m128i res00[4], res01[4], res02[4], res03[4], res04[4], res05[4], res06[4], res07[4], res08[4], res09[4], res10[4], res11[4], res12[4], res13[4], res14[4], res15[4]; + __m128i res16[4], res17[4], res18[4], res19[4], res20[4], res21[4], res22[4], res23[4], res24[4], res25[4], res26[4], res27[4], res28[4], res29[4], res30[4], res31[4]; + + for (int i = 0; i < 4; i++) + { + const int offset = (i << 3); + __m128i T00, T01; + + T00 = _mm_loadu_si128((const __m128i*)&src[0 * 32 + offset]); + T01 = _mm_loadu_si128((const __m128i*)&src[0 * 32 + offset + 4]); + in00[i] = _mm_packs_epi32(T00, T01); + + T00 = _mm_loadu_si128((const __m128i*)&src[1 * 32 + offset]); + T01 = _mm_loadu_si128((const __m128i*)&src[1 * 32 + offset + 4]); + in01[i] = _mm_packs_epi32(T00, T01); + + T00 = _mm_loadu_si128((const __m128i*)&src[2 * 32 + offset]); + T01 = _mm_loadu_si128((const __m128i*)&src[2 * 32 + offset + 4]); + in02[i] = _mm_packs_epi32(T00, T01); + + T00 = _mm_loadu_si128((const __m128i*)&src[3 * 32 + offset]); + T01 = _mm_loadu_si128((const __m128i*)&src[3 * 32 + offset + 4]); + in03[i] = _mm_packs_epi32(T00, T01); + + T00 = _mm_loadu_si128((const __m128i*)&src[4 * 32 + offset]); + T01 = _mm_loadu_si128((const __m128i*)&src[4 * 32 + offset + 4]); + in04[i] = _mm_packs_epi32(T00, T01); + + T00 = 
_mm_loadu_si128((const __m128i*)&src[5 * 32 + offset]); + T01 = _mm_loadu_si128((const __m128i*)&src[5 * 32 + offset + 4]); + in05[i] = _mm_packs_epi32(T00, T01); + + T00 = _mm_loadu_si128((const __m128i*)&src[6 * 32 + offset]); + T01 = _mm_loadu_si128((const __m128i*)&src[6 * 32 + offset + 4]); + in06[i] = _mm_packs_epi32(T00, T01); + + T00 = _mm_loadu_si128((const __m128i*)&src[7 * 32 + offset]); + T01 = _mm_loadu_si128((const __m128i*)&src[7 * 32 + offset + 4]); + in07[i] = _mm_packs_epi32(T00, T01); + + T00 = _mm_loadu_si128((const __m128i*)&src[8 * 32 + offset]); + T01 = _mm_loadu_si128((const __m128i*)&src[8 * 32 + offset + 4]); + in08[i] = _mm_packs_epi32(T00, T01); + + T00 = _mm_loadu_si128((const __m128i*)&src[9 * 32 + offset]); + T01 = _mm_loadu_si128((const __m128i*)&src[9 * 32 + offset + 4]); + in09[i] = _mm_packs_epi32(T00, T01); + + T00 = _mm_loadu_si128((const __m128i*)&src[10 * 32 + offset]); + T01 = _mm_loadu_si128((const __m128i*)&src[10 * 32 + offset + 4]); + in10[i] = _mm_packs_epi32(T00, T01); + + T00 = _mm_loadu_si128((const __m128i*)&src[11 * 32 + offset]); + T01 = _mm_loadu_si128((const __m128i*)&src[11 * 32 + offset + 4]); + in11[i] = _mm_packs_epi32(T00, T01); + + T00 = _mm_loadu_si128((const __m128i*)&src[12 * 32 + offset]); + T01 = _mm_loadu_si128((const __m128i*)&src[12 * 32 + offset + 4]); + in12[i] = _mm_packs_epi32(T00, T01); + + T00 = _mm_loadu_si128((const __m128i*)&src[13 * 32 + offset]); + T01 = _mm_loadu_si128((const __m128i*)&src[13 * 32 + offset + 4]); + in13[i] = _mm_packs_epi32(T00, T01); + + T00 = _mm_loadu_si128((const __m128i*)&src[14 * 32 + offset]); + T01 = _mm_loadu_si128((const __m128i*)&src[14 * 32 + offset + 4]); + in14[i] = _mm_packs_epi32(T00, T01); + + T00 = _mm_loadu_si128((const __m128i*)&src[15 * 32 + offset]); + T01 = _mm_loadu_si128((const __m128i*)&src[15 * 32 + offset + 4]); + in15[i] = _mm_packs_epi32(T00, T01); + + T00 = _mm_loadu_si128((const __m128i*)&src[16 * 32 + offset]); + T01 = 
_mm_loadu_si128((const __m128i*)&src[16 * 32 + offset + 4]); + in16[i] = _mm_packs_epi32(T00, T01); + + T00 = _mm_loadu_si128((const __m128i*)&src[17 * 32 + offset]); + T01 = _mm_loadu_si128((const __m128i*)&src[17 * 32 + offset + 4]); + in17[i] = _mm_packs_epi32(T00, T01); + + T00 = _mm_loadu_si128((const __m128i*)&src[18 * 32 + offset]); + T01 = _mm_loadu_si128((const __m128i*)&src[18 * 32 + offset + 4]); + in18[i] = _mm_packs_epi32(T00, T01); + + T00 = _mm_loadu_si128((const __m128i*)&src[19 * 32 + offset]); + T01 = _mm_loadu_si128((const __m128i*)&src[19 * 32 + offset + 4]); + in19[i] = _mm_packs_epi32(T00, T01); + + T00 = _mm_loadu_si128((const __m128i*)&src[20 * 32 + offset]); + T01 = _mm_loadu_si128((const __m128i*)&src[20 * 32 + offset + 4]); + in20[i] = _mm_packs_epi32(T00, T01); + + T00 = _mm_loadu_si128((const __m128i*)&src[21 * 32 + offset]); + T01 = _mm_loadu_si128((const __m128i*)&src[21 * 32 + offset + 4]); + in21[i] = _mm_packs_epi32(T00, T01); + + T00 = _mm_loadu_si128((const __m128i*)&src[22 * 32 + offset]); + T01 = _mm_loadu_si128((const __m128i*)&src[22 * 32 + offset + 4]); + in22[i] = _mm_packs_epi32(T00, T01); + + T00 = _mm_loadu_si128((const __m128i*)&src[23 * 32 + offset]); + T01 = _mm_loadu_si128((const __m128i*)&src[23 * 32 + offset + 4]); + in23[i] = _mm_packs_epi32(T00, T01); + + T00 = _mm_loadu_si128((const __m128i*)&src[24 * 32 + offset]); + T01 = _mm_loadu_si128((const __m128i*)&src[24 * 32 + offset + 4]); + in24[i] = _mm_packs_epi32(T00, T01); + + T00 = _mm_loadu_si128((const __m128i*)&src[25 * 32 + offset]); + T01 = _mm_loadu_si128((const __m128i*)&src[25 * 32 + offset + 4]); + in25[i] = _mm_packs_epi32(T00, T01); + + T00 = _mm_loadu_si128((const __m128i*)&src[26 * 32 + offset]); + T01 = _mm_loadu_si128((const __m128i*)&src[26 * 32 + offset + 4]); + in26[i] = _mm_packs_epi32(T00, T01); + + T00 = _mm_loadu_si128((const __m128i*)&src[27 * 32 + offset]); + T01 = _mm_loadu_si128((const __m128i*)&src[27 * 32 + offset + 4]); + in27[i] = 
_mm_packs_epi32(T00, T01); + + T00 = _mm_loadu_si128((const __m128i*)&src[28 * 32 + offset]); + T01 = _mm_loadu_si128((const __m128i*)&src[28 * 32 + offset + 4]); + in28[i] = _mm_packs_epi32(T00, T01); + + T00 = _mm_loadu_si128((const __m128i*)&src[29 * 32 + offset]); + T01 = _mm_loadu_si128((const __m128i*)&src[29 * 32 + offset + 4]); + in29[i] = _mm_packs_epi32(T00, T01); + + T00 = _mm_loadu_si128((const __m128i*)&src[30 * 32 + offset]); + T01 = _mm_loadu_si128((const __m128i*)&src[30 * 32 + offset + 4]); + in30[i] = _mm_packs_epi32(T00, T01); + + T00 = _mm_loadu_si128((const __m128i*)&src[31 * 32 + offset]); + T01 = _mm_loadu_si128((const __m128i*)&src[31 * 32 + offset + 4]); + in31[i] = _mm_packs_epi32(T00, T01); + } + + for (int pass = 0; pass < 2; pass++) + { + if (pass == 1) + { + c32_rnd = _mm_set1_epi32(2048); + nShift = 12; + } + + for (int part = 0; part < 4; part++) + { + const __m128i T_00_00A = _mm_unpacklo_epi16(in01[part], in03[part]); // [33 13 32 12 31 11 30 10] + const __m128i T_00_00B = _mm_unpackhi_epi16(in01[part], in03[part]); // [37 17 36 16 35 15 34 14] + const __m128i T_00_01A = _mm_unpacklo_epi16(in05[part], in07[part]); // [ ] + const __m128i T_00_01B = _mm_unpackhi_epi16(in05[part], in07[part]); // [ ] + const __m128i T_00_02A = _mm_unpacklo_epi16(in09[part], in11[part]); // [ ] + const __m128i T_00_02B = _mm_unpackhi_epi16(in09[part], in11[part]); // [ ] + const __m128i T_00_03A = _mm_unpacklo_epi16(in13[part], in15[part]); // [ ] + const __m128i T_00_03B = _mm_unpackhi_epi16(in13[part], in15[part]); // [ ] + const __m128i T_00_04A = _mm_unpacklo_epi16(in17[part], in19[part]); // [ ] + const __m128i T_00_04B = _mm_unpackhi_epi16(in17[part], in19[part]); // [ ] + const __m128i T_00_05A = _mm_unpacklo_epi16(in21[part], in23[part]); // [ ] + const __m128i T_00_05B = _mm_unpackhi_epi16(in21[part], in23[part]); // [ ] + const __m128i T_00_06A = _mm_unpacklo_epi16(in25[part], in27[part]); // [ ] + const __m128i T_00_06B = 
_mm_unpackhi_epi16(in25[part], in27[part]); // [ ] + const __m128i T_00_07A = _mm_unpacklo_epi16(in29[part], in31[part]); // + const __m128i T_00_07B = _mm_unpackhi_epi16(in29[part], in31[part]); // [ ] + + const __m128i T_00_08A = _mm_unpacklo_epi16(in02[part], in06[part]); // [ ] + const __m128i T_00_08B = _mm_unpackhi_epi16(in02[part], in06[part]); // [ ] + const __m128i T_00_09A = _mm_unpacklo_epi16(in10[part], in14[part]); // [ ] + const __m128i T_00_09B = _mm_unpackhi_epi16(in10[part], in14[part]); // [ ] + const __m128i T_00_10A = _mm_unpacklo_epi16(in18[part], in22[part]); // [ ] + const __m128i T_00_10B = _mm_unpackhi_epi16(in18[part], in22[part]); // [ ] + const __m128i T_00_11A = _mm_unpacklo_epi16(in26[part], in30[part]); // [ ] + const __m128i T_00_11B = _mm_unpackhi_epi16(in26[part], in30[part]); // [ ] + + const __m128i T_00_12A = _mm_unpacklo_epi16(in04[part], in12[part]); // [ ] + const __m128i T_00_12B = _mm_unpackhi_epi16(in04[part], in12[part]); // [ ] + const __m128i T_00_13A = _mm_unpacklo_epi16(in20[part], in28[part]); // [ ] + const __m128i T_00_13B = _mm_unpackhi_epi16(in20[part], in28[part]); // [ ] + + const __m128i T_00_14A = _mm_unpacklo_epi16(in08[part], in24[part]); // + const __m128i T_00_14B = _mm_unpackhi_epi16(in08[part], in24[part]); // [ ] + const __m128i T_00_15A = _mm_unpacklo_epi16(in00[part], in16[part]); // + const __m128i T_00_15B = _mm_unpackhi_epi16(in00[part], in16[part]); // [ ] + + __m128i O00A, O01A, O02A, O03A, O04A, O05A, O06A, O07A, O08A, O09A, O10A, O11A, O12A, O13A, O14A, O15A; + __m128i O00B, O01B, O02B, O03B, O04B, O05B, O06B, O07B, O08B, O09B, O10B, O11B, O12B, O13B, O14B, O15B; + { + __m128i T00, T01, T02, T03; +#define COMPUTE_ROW(r0103, r0507, r0911, r1315, r1719, r2123, r2527, r2931, c0103, c0507, c0911, c1315, c1719, c2123, c2527, c2931, row) \ + T00 = _mm_add_epi32(_mm_madd_epi16(r0103, c0103), _mm_madd_epi16(r0507, c0507)); \ + T01 = _mm_add_epi32(_mm_madd_epi16(r0911, c0911), _mm_madd_epi16(r1315, 
c1315)); \ + T02 = _mm_add_epi32(_mm_madd_epi16(r1719, c1719), _mm_madd_epi16(r2123, c2123)); \ + T03 = _mm_add_epi32(_mm_madd_epi16(r2527, c2527), _mm_madd_epi16(r2931, c2931)); \ + row = _mm_add_epi32(_mm_add_epi32(T00, T01), _mm_add_epi32(T02, T03)); + + COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ + c16_p90_p90, c16_p85_p88, c16_p78_p82, c16_p67_p73, c16_p54_p61, c16_p38_p46, c16_p22_p31, c16_p04_p13, O00A) + COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ + c16_p82_p90, c16_p46_p67, c16_n04_p22, c16_n54_n31, c16_n85_n73, c16_n88_n90, c16_n61_n78, c16_n13_n38, O01A) + COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ + c16_p67_p88, c16_n13_p31, c16_n82_n54, c16_n78_n90, c16_n04_n46, c16_p73_p38, c16_p85_p90, c16_p22_p61, O02A) + COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ + c16_p46_p85, c16_n67_n13, c16_n73_n90, c16_p38_n22, c16_p88_p82, c16_n04_p54, c16_n90_n61, c16_n31_n78, O03A) + COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ + c16_p22_p82, c16_n90_n54, c16_p13_n61, c16_p85_p78, c16_n46_p31, c16_n67_n90, c16_p73_p04, c16_p38_p88, O04A) + COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ + c16_n04_p78, c16_n73_n82, c16_p85_p13, c16_n22_p67, c16_n61_n88, c16_p90_p31, c16_n38_p54, c16_n46_n90, O05A) + COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ + c16_n31_p73, c16_n22_n90, c16_p67_p78, c16_n90_n38, c16_p82_n13, c16_n46_p61, c16_n04_n88, c16_p54_p85, O06A) + COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ + c16_n54_p67, c16_p38_n78, c16_n22_p85, c16_p04_n90, c16_p13_p90, c16_n31_n88, c16_p46_p82, c16_n61_n73, O07A) + COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, 
T_00_06A, T_00_07A, \ + c16_n73_p61, c16_p82_n46, c16_n88_p31, c16_p90_n13, c16_n90_n04, c16_p85_p22, c16_n78_n38, c16_p67_p54, O08A) + COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ + c16_n85_p54, c16_p88_n04, c16_n61_n46, c16_p13_p82, c16_p38_n90, c16_n78_p67, c16_p90_n22, c16_n73_n31, O09A) + COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ + c16_n90_p46, c16_p54_p38, c16_p31_n90, c16_n88_p61, c16_p67_p22, c16_p13_n85, c16_n82_p73, c16_p78_p04, O10A) + COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ + c16_n88_p38, c16_n04_p73, c16_p90_n67, c16_n31_n46, c16_n78_p85, c16_p61_p13, c16_p54_n90, c16_n82_p22, O11A) + COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ + c16_n78_p31, c16_n61_p90, c16_p54_p04, c16_p82_n88, c16_n22_n38, c16_n90_p73, c16_n13_p67, c16_p85_n46, O12A) + COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ + c16_n61_p22, c16_n90_p85, c16_n38_p73, c16_p46_n04, c16_p90_n78, c16_p54_n82, c16_n31_n13, c16_n88_p67, O13A) + COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ + c16_n38_p13, c16_n78_p61, c16_n90_p88, c16_n73_p85, c16_n31_p54, c16_p22_p04, c16_p67_n46, c16_p90_n82, O14A) + COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ + c16_n13_p04, c16_n31_p22, c16_n46_p38, c16_n61_p54, c16_n73_p67, c16_n82_p78, c16_n88_p85, c16_n90_p90, O15A) + + COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ + c16_p90_p90, c16_p85_p88, c16_p78_p82, c16_p67_p73, c16_p54_p61, c16_p38_p46, c16_p22_p31, c16_p04_p13, O00B) + COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ + c16_p82_p90, c16_p46_p67, c16_n04_p22, c16_n54_n31, c16_n85_n73, c16_n88_n90, c16_n61_n78, 
c16_n13_n38, O01B) + COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ + c16_p67_p88, c16_n13_p31, c16_n82_n54, c16_n78_n90, c16_n04_n46, c16_p73_p38, c16_p85_p90, c16_p22_p61, O02B) + COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ + c16_p46_p85, c16_n67_n13, c16_n73_n90, c16_p38_n22, c16_p88_p82, c16_n04_p54, c16_n90_n61, c16_n31_n78, O03B) + COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ + c16_p22_p82, c16_n90_n54, c16_p13_n61, c16_p85_p78, c16_n46_p31, c16_n67_n90, c16_p73_p04, c16_p38_p88, O04B) + COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ + c16_n04_p78, c16_n73_n82, c16_p85_p13, c16_n22_p67, c16_n61_n88, c16_p90_p31, c16_n38_p54, c16_n46_n90, O05B) + COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ + c16_n31_p73, c16_n22_n90, c16_p67_p78, c16_n90_n38, c16_p82_n13, c16_n46_p61, c16_n04_n88, c16_p54_p85, O06B) + COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ + c16_n54_p67, c16_p38_n78, c16_n22_p85, c16_p04_n90, c16_p13_p90, c16_n31_n88, c16_p46_p82, c16_n61_n73, O07B) + COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ + c16_n73_p61, c16_p82_n46, c16_n88_p31, c16_p90_n13, c16_n90_n04, c16_p85_p22, c16_n78_n38, c16_p67_p54, O08B) + COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ + c16_n85_p54, c16_p88_n04, c16_n61_n46, c16_p13_p82, c16_p38_n90, c16_n78_p67, c16_p90_n22, c16_n73_n31, O09B) + COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ + c16_n90_p46, c16_p54_p38, c16_p31_n90, c16_n88_p61, c16_p67_p22, c16_p13_n85, c16_n82_p73, c16_p78_p04, O10B) + COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ + 
c16_n88_p38, c16_n04_p73, c16_p90_n67, c16_n31_n46, c16_n78_p85, c16_p61_p13, c16_p54_n90, c16_n82_p22, O11B) + COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ + c16_n78_p31, c16_n61_p90, c16_p54_p04, c16_p82_n88, c16_n22_n38, c16_n90_p73, c16_n13_p67, c16_p85_n46, O12B) + COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ + c16_n61_p22, c16_n90_p85, c16_n38_p73, c16_p46_n04, c16_p90_n78, c16_p54_n82, c16_n31_n13, c16_n88_p67, O13B) + COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ + c16_n38_p13, c16_n78_p61, c16_n90_p88, c16_n73_p85, c16_n31_p54, c16_p22_p04, c16_p67_n46, c16_p90_n82, O14B) + COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ + c16_n13_p04, c16_n31_p22, c16_n46_p38, c16_n61_p54, c16_n73_p67, c16_n82_p78, c16_n88_p85, c16_n90_p90, O15B) + +#undef COMPUTE_ROW + } + + __m128i EO0A, EO1A, EO2A, EO3A, EO4A, EO5A, EO6A, EO7A; + __m128i EO0B, EO1B, EO2B, EO3B, EO4B, EO5B, EO6B, EO7B; + { + __m128i T00, T01; +#define COMPUTE_ROW(row0206, row1014, row1822, row2630, c0206, c1014, c1822, c2630, row) \ + T00 = _mm_add_epi32(_mm_madd_epi16(row0206, c0206), _mm_madd_epi16(row1014, c1014)); \ + T01 = _mm_add_epi32(_mm_madd_epi16(row1822, c1822), _mm_madd_epi16(row2630, c2630)); \ + row = _mm_add_epi32(T00, T01); + + COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_p87_p90, c16_p70_p80, c16_p43_p57, c16_p09_p25, EO0A) + COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_p57_p87, c16_n43_p09, c16_n90_n80, c16_n25_n70, EO1A) + COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_p09_p80, c16_n87_n70, c16_p57_n25, c16_p43_p90, EO2A) + COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_n43_p70, c16_p09_n87, c16_p25_p90, c16_n57_n80, EO3A) + COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_n80_p57, c16_p90_n25, c16_n87_n09, c16_p70_p43, EO4A) + COMPUTE_ROW(T_00_08A, 
T_00_09A, T_00_10A, T_00_11A, c16_n90_p43, c16_p25_p57, c16_p70_n87, c16_n80_p09, EO5A) + COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_n70_p25, c16_n80_p90, c16_p09_p43, c16_p87_n57, EO6A) + COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_n25_p09, c16_n57_p43, c16_n80_p70, c16_n90_p87, EO7A) + + COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_p87_p90, c16_p70_p80, c16_p43_p57, c16_p09_p25, EO0B) + COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_p57_p87, c16_n43_p09, c16_n90_n80, c16_n25_n70, EO1B) + COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_p09_p80, c16_n87_n70, c16_p57_n25, c16_p43_p90, EO2B) + COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n43_p70, c16_p09_n87, c16_p25_p90, c16_n57_n80, EO3B) + COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n80_p57, c16_p90_n25, c16_n87_n09, c16_p70_p43, EO4B) + COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n90_p43, c16_p25_p57, c16_p70_n87, c16_n80_p09, EO5B) + COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n70_p25, c16_n80_p90, c16_p09_p43, c16_p87_n57, EO6B) + COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n25_p09, c16_n57_p43, c16_n80_p70, c16_n90_p87, EO7B) +#undef COMPUTE_ROW + } + + const __m128i EEO0A = _mm_add_epi32(_mm_madd_epi16(T_00_12A, c16_p75_p89), _mm_madd_epi16(T_00_13A, c16_p18_p50)); // EEO0 + const __m128i EEO0B = _mm_add_epi32(_mm_madd_epi16(T_00_12B, c16_p75_p89), _mm_madd_epi16(T_00_13B, c16_p18_p50)); + const __m128i EEO1A = _mm_add_epi32(_mm_madd_epi16(T_00_12A, c16_n18_p75), _mm_madd_epi16(T_00_13A, c16_n50_n89)); // EEO1 + const __m128i EEO1B = _mm_add_epi32(_mm_madd_epi16(T_00_12B, c16_n18_p75), _mm_madd_epi16(T_00_13B, c16_n50_n89)); + const __m128i EEO2A = _mm_add_epi32(_mm_madd_epi16(T_00_12A, c16_n89_p50), _mm_madd_epi16(T_00_13A, c16_p75_p18)); // EEO2 + const __m128i EEO2B = _mm_add_epi32(_mm_madd_epi16(T_00_12B, c16_n89_p50), _mm_madd_epi16(T_00_13B, c16_p75_p18)); + const __m128i EEO3A = 
_mm_add_epi32(_mm_madd_epi16(T_00_12A, c16_n50_p18), _mm_madd_epi16(T_00_13A, c16_n89_p75)); // EEO3 + const __m128i EEO3B = _mm_add_epi32(_mm_madd_epi16(T_00_12B, c16_n50_p18), _mm_madd_epi16(T_00_13B, c16_n89_p75)); + + const __m128i EEEO0A = _mm_madd_epi16(T_00_14A, c16_p36_p83); + const __m128i EEEO0B = _mm_madd_epi16(T_00_14B, c16_p36_p83); + const __m128i EEEO1A = _mm_madd_epi16(T_00_14A, c16_n83_p36); + const __m128i EEEO1B = _mm_madd_epi16(T_00_14B, c16_n83_p36); + + const __m128i EEEE0A = _mm_madd_epi16(T_00_15A, c16_p64_p64); + const __m128i EEEE0B = _mm_madd_epi16(T_00_15B, c16_p64_p64); + const __m128i EEEE1A = _mm_madd_epi16(T_00_15A, c16_n64_p64); + const __m128i EEEE1B = _mm_madd_epi16(T_00_15B, c16_n64_p64); + + const __m128i EEE0A = _mm_add_epi32(EEEE0A, EEEO0A); // EEE0 = EEEE0 + EEEO0 + const __m128i EEE0B = _mm_add_epi32(EEEE0B, EEEO0B); + const __m128i EEE1A = _mm_add_epi32(EEEE1A, EEEO1A); // EEE1 = EEEE1 + EEEO1 + const __m128i EEE1B = _mm_add_epi32(EEEE1B, EEEO1B); + const __m128i EEE3A = _mm_sub_epi32(EEEE0A, EEEO0A); // EEE2 = EEEE0 - EEEO0 + const __m128i EEE3B = _mm_sub_epi32(EEEE0B, EEEO0B); + const __m128i EEE2A = _mm_sub_epi32(EEEE1A, EEEO1A); // EEE3 = EEEE1 - EEEO1 + const __m128i EEE2B = _mm_sub_epi32(EEEE1B, EEEO1B); + + const __m128i EE0A = _mm_add_epi32(EEE0A, EEO0A); // EE0 = EEE0 + EEO0 + const __m128i EE0B = _mm_add_epi32(EEE0B, EEO0B); + const __m128i EE1A = _mm_add_epi32(EEE1A, EEO1A); // EE1 = EEE1 + EEO1 + const __m128i EE1B = _mm_add_epi32(EEE1B, EEO1B); + const __m128i EE2A = _mm_add_epi32(EEE2A, EEO2A); // EE2 = EEE0 + EEO0 + const __m128i EE2B = _mm_add_epi32(EEE2B, EEO2B); + const __m128i EE3A = _mm_add_epi32(EEE3A, EEO3A); // EE3 = EEE1 + EEO1 + const __m128i EE3B = _mm_add_epi32(EEE3B, EEO3B); + const __m128i EE7A = _mm_sub_epi32(EEE0A, EEO0A); // EE7 = EEE0 - EEO0 + const __m128i EE7B = _mm_sub_epi32(EEE0B, EEO0B); + const __m128i EE6A = _mm_sub_epi32(EEE1A, EEO1A); // EE6 = EEE1 - EEO1 + const __m128i EE6B = 
_mm_sub_epi32(EEE1B, EEO1B); + const __m128i EE5A = _mm_sub_epi32(EEE2A, EEO2A); // EE5 = EEE0 - EEO0 + const __m128i EE5B = _mm_sub_epi32(EEE2B, EEO2B); + const __m128i EE4A = _mm_sub_epi32(EEE3A, EEO3A); // EE4 = EEE1 - EEO1 + const __m128i EE4B = _mm_sub_epi32(EEE3B, EEO3B); + + const __m128i E0A = _mm_add_epi32(EE0A, EO0A); // E0 = EE0 + EO0 + const __m128i E0B = _mm_add_epi32(EE0B, EO0B); + const __m128i E1A = _mm_add_epi32(EE1A, EO1A); // E1 = EE1 + EO1 + const __m128i E1B = _mm_add_epi32(EE1B, EO1B); + const __m128i E2A = _mm_add_epi32(EE2A, EO2A); // E2 = EE2 + EO2 + const __m128i E2B = _mm_add_epi32(EE2B, EO2B); + const __m128i E3A = _mm_add_epi32(EE3A, EO3A); // E3 = EE3 + EO3 + const __m128i E3B = _mm_add_epi32(EE3B, EO3B); + const __m128i E4A = _mm_add_epi32(EE4A, EO4A); // E4 = + const __m128i E4B = _mm_add_epi32(EE4B, EO4B); + const __m128i E5A = _mm_add_epi32(EE5A, EO5A); // E5 = + const __m128i E5B = _mm_add_epi32(EE5B, EO5B); + const __m128i E6A = _mm_add_epi32(EE6A, EO6A); // E6 = + const __m128i E6B = _mm_add_epi32(EE6B, EO6B); + const __m128i E7A = _mm_add_epi32(EE7A, EO7A); // E7 = + const __m128i E7B = _mm_add_epi32(EE7B, EO7B); + const __m128i EFA = _mm_sub_epi32(EE0A, EO0A); // EF = EE0 - EO0 + const __m128i EFB = _mm_sub_epi32(EE0B, EO0B); + const __m128i EEA = _mm_sub_epi32(EE1A, EO1A); // EE = EE1 - EO1 + const __m128i EEB = _mm_sub_epi32(EE1B, EO1B); + const __m128i EDA = _mm_sub_epi32(EE2A, EO2A); // ED = EE2 - EO2 + const __m128i EDB = _mm_sub_epi32(EE2B, EO2B); + const __m128i ECA = _mm_sub_epi32(EE3A, EO3A); // EC = EE3 - EO3 + const __m128i ECB = _mm_sub_epi32(EE3B, EO3B); + const __m128i EBA = _mm_sub_epi32(EE4A, EO4A); // EB = + const __m128i EBB = _mm_sub_epi32(EE4B, EO4B); + const __m128i EAA = _mm_sub_epi32(EE5A, EO5A); // EA = + const __m128i EAB = _mm_sub_epi32(EE5B, EO5B); + const __m128i E9A = _mm_sub_epi32(EE6A, EO6A); // E9 = + const __m128i E9B = _mm_sub_epi32(EE6B, EO6B); + const __m128i E8A = _mm_sub_epi32(EE7A, EO7A); 
// E8 = + const __m128i E8B = _mm_sub_epi32(EE7B, EO7B); + + const __m128i T10A = _mm_add_epi32(E0A, c32_rnd); // E0 + rnd + const __m128i T10B = _mm_add_epi32(E0B, c32_rnd); + const __m128i T11A = _mm_add_epi32(E1A, c32_rnd); // E1 + rnd + const __m128i T11B = _mm_add_epi32(E1B, c32_rnd); + const __m128i T12A = _mm_add_epi32(E2A, c32_rnd); // E2 + rnd + const __m128i T12B = _mm_add_epi32(E2B, c32_rnd); + const __m128i T13A = _mm_add_epi32(E3A, c32_rnd); // E3 + rnd + const __m128i T13B = _mm_add_epi32(E3B, c32_rnd); + const __m128i T14A = _mm_add_epi32(E4A, c32_rnd); // E4 + rnd + const __m128i T14B = _mm_add_epi32(E4B, c32_rnd); + const __m128i T15A = _mm_add_epi32(E5A, c32_rnd); // E5 + rnd + const __m128i T15B = _mm_add_epi32(E5B, c32_rnd); + const __m128i T16A = _mm_add_epi32(E6A, c32_rnd); // E6 + rnd + const __m128i T16B = _mm_add_epi32(E6B, c32_rnd); + const __m128i T17A = _mm_add_epi32(E7A, c32_rnd); // E7 + rnd + const __m128i T17B = _mm_add_epi32(E7B, c32_rnd); + const __m128i T18A = _mm_add_epi32(E8A, c32_rnd); // E8 + rnd + const __m128i T18B = _mm_add_epi32(E8B, c32_rnd); + const __m128i T19A = _mm_add_epi32(E9A, c32_rnd); // E9 + rnd + const __m128i T19B = _mm_add_epi32(E9B, c32_rnd); + const __m128i T1AA = _mm_add_epi32(EAA, c32_rnd); // E10 + rnd + const __m128i T1AB = _mm_add_epi32(EAB, c32_rnd); + const __m128i T1BA = _mm_add_epi32(EBA, c32_rnd); // E11 + rnd + const __m128i T1BB = _mm_add_epi32(EBB, c32_rnd); + const __m128i T1CA = _mm_add_epi32(ECA, c32_rnd); // E12 + rnd + const __m128i T1CB = _mm_add_epi32(ECB, c32_rnd); + const __m128i T1DA = _mm_add_epi32(EDA, c32_rnd); // E13 + rnd + const __m128i T1DB = _mm_add_epi32(EDB, c32_rnd); + const __m128i T1EA = _mm_add_epi32(EEA, c32_rnd); // E14 + rnd + const __m128i T1EB = _mm_add_epi32(EEB, c32_rnd); + const __m128i T1FA = _mm_add_epi32(EFA, c32_rnd); // E15 + rnd + const __m128i T1FB = _mm_add_epi32(EFB, c32_rnd); + + const __m128i T2_00A = _mm_add_epi32(T10A, O00A); // E0 + O0 + rnd + const 
__m128i T2_00B = _mm_add_epi32(T10B, O00B); + const __m128i T2_01A = _mm_add_epi32(T11A, O01A); // E1 + O1 + rnd + const __m128i T2_01B = _mm_add_epi32(T11B, O01B); + const __m128i T2_02A = _mm_add_epi32(T12A, O02A); // E2 + O2 + rnd + const __m128i T2_02B = _mm_add_epi32(T12B, O02B); + const __m128i T2_03A = _mm_add_epi32(T13A, O03A); // E3 + O3 + rnd + const __m128i T2_03B = _mm_add_epi32(T13B, O03B); + const __m128i T2_04A = _mm_add_epi32(T14A, O04A); // E4 + const __m128i T2_04B = _mm_add_epi32(T14B, O04B); + const __m128i T2_05A = _mm_add_epi32(T15A, O05A); // E5 + const __m128i T2_05B = _mm_add_epi32(T15B, O05B); + const __m128i T2_06A = _mm_add_epi32(T16A, O06A); // E6 + const __m128i T2_06B = _mm_add_epi32(T16B, O06B); + const __m128i T2_07A = _mm_add_epi32(T17A, O07A); // E7 + const __m128i T2_07B = _mm_add_epi32(T17B, O07B); + const __m128i T2_08A = _mm_add_epi32(T18A, O08A); // E8 + const __m128i T2_08B = _mm_add_epi32(T18B, O08B); + const __m128i T2_09A = _mm_add_epi32(T19A, O09A); // E9 + const __m128i T2_09B = _mm_add_epi32(T19B, O09B); + const __m128i T2_10A = _mm_add_epi32(T1AA, O10A); // E10 + const __m128i T2_10B = _mm_add_epi32(T1AB, O10B); + const __m128i T2_11A = _mm_add_epi32(T1BA, O11A); // E11 + const __m128i T2_11B = _mm_add_epi32(T1BB, O11B); + const __m128i T2_12A = _mm_add_epi32(T1CA, O12A); // E12 + const __m128i T2_12B = _mm_add_epi32(T1CB, O12B); + const __m128i T2_13A = _mm_add_epi32(T1DA, O13A); // E13 + const __m128i T2_13B = _mm_add_epi32(T1DB, O13B); + const __m128i T2_14A = _mm_add_epi32(T1EA, O14A); // E14 + const __m128i T2_14B = _mm_add_epi32(T1EB, O14B); + const __m128i T2_15A = _mm_add_epi32(T1FA, O15A); // E15 + const __m128i T2_15B = _mm_add_epi32(T1FB, O15B); + const __m128i T2_31A = _mm_sub_epi32(T10A, O00A); // E0 - O0 + rnd + const __m128i T2_31B = _mm_sub_epi32(T10B, O00B); + const __m128i T2_30A = _mm_sub_epi32(T11A, O01A); // E1 - O1 + rnd + const __m128i T2_30B = _mm_sub_epi32(T11B, O01B); + const __m128i T2_29A = 
_mm_sub_epi32(T12A, O02A); // E2 - O2 + rnd + const __m128i T2_29B = _mm_sub_epi32(T12B, O02B); + const __m128i T2_28A = _mm_sub_epi32(T13A, O03A); // E3 - O3 + rnd + const __m128i T2_28B = _mm_sub_epi32(T13B, O03B); + const __m128i T2_27A = _mm_sub_epi32(T14A, O04A); // E4 + const __m128i T2_27B = _mm_sub_epi32(T14B, O04B); + const __m128i T2_26A = _mm_sub_epi32(T15A, O05A); // E5 + const __m128i T2_26B = _mm_sub_epi32(T15B, O05B); + const __m128i T2_25A = _mm_sub_epi32(T16A, O06A); // E6 + const __m128i T2_25B = _mm_sub_epi32(T16B, O06B); + const __m128i T2_24A = _mm_sub_epi32(T17A, O07A); // E7 + const __m128i T2_24B = _mm_sub_epi32(T17B, O07B); + const __m128i T2_23A = _mm_sub_epi32(T18A, O08A); // + const __m128i T2_23B = _mm_sub_epi32(T18B, O08B); + const __m128i T2_22A = _mm_sub_epi32(T19A, O09A); // + const __m128i T2_22B = _mm_sub_epi32(T19B, O09B); + const __m128i T2_21A = _mm_sub_epi32(T1AA, O10A); // + const __m128i T2_21B = _mm_sub_epi32(T1AB, O10B); + const __m128i T2_20A = _mm_sub_epi32(T1BA, O11A); // + const __m128i T2_20B = _mm_sub_epi32(T1BB, O11B); + const __m128i T2_19A = _mm_sub_epi32(T1CA, O12A); // + const __m128i T2_19B = _mm_sub_epi32(T1CB, O12B); + const __m128i T2_18A = _mm_sub_epi32(T1DA, O13A); // + const __m128i T2_18B = _mm_sub_epi32(T1DB, O13B); + const __m128i T2_17A = _mm_sub_epi32(T1EA, O14A); // + const __m128i T2_17B = _mm_sub_epi32(T1EB, O14B); + const __m128i T2_16A = _mm_sub_epi32(T1FA, O15A); // + const __m128i T2_16B = _mm_sub_epi32(T1FB, O15B); + + const __m128i T3_00A = _mm_srai_epi32(T2_00A, nShift); // [30 20 10 00] + const __m128i T3_00B = _mm_srai_epi32(T2_00B, nShift); // [70 60 50 40] + const __m128i T3_01A = _mm_srai_epi32(T2_01A, nShift); // [31 21 11 01] + const __m128i T3_01B = _mm_srai_epi32(T2_01B, nShift); // [71 61 51 41] + const __m128i T3_02A = _mm_srai_epi32(T2_02A, nShift); // [32 22 12 02] + const __m128i T3_02B = _mm_srai_epi32(T2_02B, nShift); // [72 62 52 42] + const __m128i T3_03A = 
_mm_srai_epi32(T2_03A, nShift); // [33 23 13 03] + const __m128i T3_03B = _mm_srai_epi32(T2_03B, nShift); // [73 63 53 43] + const __m128i T3_04A = _mm_srai_epi32(T2_04A, nShift); // [33 24 14 04] + const __m128i T3_04B = _mm_srai_epi32(T2_04B, nShift); // [74 64 54 44] + const __m128i T3_05A = _mm_srai_epi32(T2_05A, nShift); // [35 25 15 05] + const __m128i T3_05B = _mm_srai_epi32(T2_05B, nShift); // [75 65 55 45] + const __m128i T3_06A = _mm_srai_epi32(T2_06A, nShift); // [36 26 16 06] + const __m128i T3_06B = _mm_srai_epi32(T2_06B, nShift); // [76 66 56 46] + const __m128i T3_07A = _mm_srai_epi32(T2_07A, nShift); // [37 27 17 07] + const __m128i T3_07B = _mm_srai_epi32(T2_07B, nShift); // [77 67 57 47] + const __m128i T3_08A = _mm_srai_epi32(T2_08A, nShift); // [30 20 10 00] x8 + const __m128i T3_08B = _mm_srai_epi32(T2_08B, nShift); // [70 60 50 40] + const __m128i T3_09A = _mm_srai_epi32(T2_09A, nShift); // [31 21 11 01] x9 + const __m128i T3_09B = _mm_srai_epi32(T2_09B, nShift); // [71 61 51 41] + const __m128i T3_10A = _mm_srai_epi32(T2_10A, nShift); // [32 22 12 02] xA + const __m128i T3_10B = _mm_srai_epi32(T2_10B, nShift); // [72 62 52 42] + const __m128i T3_11A = _mm_srai_epi32(T2_11A, nShift); // [33 23 13 03] xB + const __m128i T3_11B = _mm_srai_epi32(T2_11B, nShift); // [73 63 53 43] + const __m128i T3_12A = _mm_srai_epi32(T2_12A, nShift); // [33 24 14 04] xC + const __m128i T3_12B = _mm_srai_epi32(T2_12B, nShift); // [74 64 54 44] + const __m128i T3_13A = _mm_srai_epi32(T2_13A, nShift); // [35 25 15 05] xD + const __m128i T3_13B = _mm_srai_epi32(T2_13B, nShift); // [75 65 55 45] + const __m128i T3_14A = _mm_srai_epi32(T2_14A, nShift); // [36 26 16 06] xE + const __m128i T3_14B = _mm_srai_epi32(T2_14B, nShift); // [76 66 56 46] + const __m128i T3_15A = _mm_srai_epi32(T2_15A, nShift); // [37 27 17 07] xF + const __m128i T3_15B = _mm_srai_epi32(T2_15B, nShift); // [77 67 57 47] + + const __m128i T3_16A = _mm_srai_epi32(T2_16A, nShift); // [30 20 10 00] 
+ const __m128i T3_16B = _mm_srai_epi32(T2_16B, nShift); // [70 60 50 40] + const __m128i T3_17A = _mm_srai_epi32(T2_17A, nShift); // [31 21 11 01] + const __m128i T3_17B = _mm_srai_epi32(T2_17B, nShift); // [71 61 51 41] + const __m128i T3_18A = _mm_srai_epi32(T2_18A, nShift); // [32 22 12 02] + const __m128i T3_18B = _mm_srai_epi32(T2_18B, nShift); // [72 62 52 42] + const __m128i T3_19A = _mm_srai_epi32(T2_19A, nShift); // [33 23 13 03] + const __m128i T3_19B = _mm_srai_epi32(T2_19B, nShift); // [73 63 53 43] + const __m128i T3_20A = _mm_srai_epi32(T2_20A, nShift); // [33 24 14 04] + const __m128i T3_20B = _mm_srai_epi32(T2_20B, nShift); // [74 64 54 44] + const __m128i T3_21A = _mm_srai_epi32(T2_21A, nShift); // [35 25 15 05] + const __m128i T3_21B = _mm_srai_epi32(T2_21B, nShift); // [75 65 55 45] + const __m128i T3_22A = _mm_srai_epi32(T2_22A, nShift); // [36 26 16 06] + const __m128i T3_22B = _mm_srai_epi32(T2_22B, nShift); // [76 66 56 46] + const __m128i T3_23A = _mm_srai_epi32(T2_23A, nShift); // [37 27 17 07] + const __m128i T3_23B = _mm_srai_epi32(T2_23B, nShift); // [77 67 57 47] + const __m128i T3_24A = _mm_srai_epi32(T2_24A, nShift); // [30 20 10 00] x8 + const __m128i T3_24B = _mm_srai_epi32(T2_24B, nShift); // [70 60 50 40] + const __m128i T3_25A = _mm_srai_epi32(T2_25A, nShift); // [31 21 11 01] x9 + const __m128i T3_25B = _mm_srai_epi32(T2_25B, nShift); // [71 61 51 41] + const __m128i T3_26A = _mm_srai_epi32(T2_26A, nShift); // [32 22 12 02] xA + const __m128i T3_26B = _mm_srai_epi32(T2_26B, nShift); // [72 62 52 42] + const __m128i T3_27A = _mm_srai_epi32(T2_27A, nShift); // [33 23 13 03] xB + const __m128i T3_27B = _mm_srai_epi32(T2_27B, nShift); // [73 63 53 43] + const __m128i T3_28A = _mm_srai_epi32(T2_28A, nShift); // [33 24 14 04] xC + const __m128i T3_28B = _mm_srai_epi32(T2_28B, nShift); // [74 64 54 44] + const __m128i T3_29A = _mm_srai_epi32(T2_29A, nShift); // [35 25 15 05] xD + const __m128i T3_29B = _mm_srai_epi32(T2_29B, nShift); 
// [75 65 55 45] + const __m128i T3_30A = _mm_srai_epi32(T2_30A, nShift); // [36 26 16 06] xE + const __m128i T3_30B = _mm_srai_epi32(T2_30B, nShift); // [76 66 56 46] + const __m128i T3_31A = _mm_srai_epi32(T2_31A, nShift); // [37 27 17 07] xF + const __m128i T3_31B = _mm_srai_epi32(T2_31B, nShift); // [77 67 57 47] + + res00[part] = _mm_packs_epi32(T3_00A, T3_00B); // [70 60 50 40 30 20 10 00] + res01[part] = _mm_packs_epi32(T3_01A, T3_01B); // [71 61 51 41 31 21 11 01] + res02[part] = _mm_packs_epi32(T3_02A, T3_02B); // [72 62 52 42 32 22 12 02] + res03[part] = _mm_packs_epi32(T3_03A, T3_03B); // [73 63 53 43 33 23 13 03] + res04[part] = _mm_packs_epi32(T3_04A, T3_04B); // [74 64 54 44 34 24 14 04] + res05[part] = _mm_packs_epi32(T3_05A, T3_05B); // [75 65 55 45 35 25 15 05] + res06[part] = _mm_packs_epi32(T3_06A, T3_06B); // [76 66 56 46 36 26 16 06] + res07[part] = _mm_packs_epi32(T3_07A, T3_07B); // [77 67 57 47 37 27 17 07] + res08[part] = _mm_packs_epi32(T3_08A, T3_08B); // [A0 ... 80] + res09[part] = _mm_packs_epi32(T3_09A, T3_09B); // [A1 ... 81] + res10[part] = _mm_packs_epi32(T3_10A, T3_10B); // [A2 ... 82] + res11[part] = _mm_packs_epi32(T3_11A, T3_11B); // [A3 ... 83] + res12[part] = _mm_packs_epi32(T3_12A, T3_12B); // [A4 ... 84] + res13[part] = _mm_packs_epi32(T3_13A, T3_13B); // [A5 ... 85] + res14[part] = _mm_packs_epi32(T3_14A, T3_14B); // [A6 ... 86] + res15[part] = _mm_packs_epi32(T3_15A, T3_15B); // [A7 ... 
87] + res16[part] = _mm_packs_epi32(T3_16A, T3_16B); + res17[part] = _mm_packs_epi32(T3_17A, T3_17B); + res18[part] = _mm_packs_epi32(T3_18A, T3_18B); + res19[part] = _mm_packs_epi32(T3_19A, T3_19B); + res20[part] = _mm_packs_epi32(T3_20A, T3_20B); + res21[part] = _mm_packs_epi32(T3_21A, T3_21B); + res22[part] = _mm_packs_epi32(T3_22A, T3_22B); + res23[part] = _mm_packs_epi32(T3_23A, T3_23B); + res24[part] = _mm_packs_epi32(T3_24A, T3_24B); + res25[part] = _mm_packs_epi32(T3_25A, T3_25B); + res26[part] = _mm_packs_epi32(T3_26A, T3_26B); + res27[part] = _mm_packs_epi32(T3_27A, T3_27B); + res28[part] = _mm_packs_epi32(T3_28A, T3_28B); + res29[part] = _mm_packs_epi32(T3_29A, T3_29B); + res30[part] = _mm_packs_epi32(T3_30A, T3_30B); + res31[part] = _mm_packs_epi32(T3_31A, T3_31B); + } + //transpose matrix 8x8 16bit. + { + __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7; + __m128i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7; +#define TRANSPOSE_8x8_16BIT(I0, I1, I2, I3, I4, I5, I6, I7, O0, O1, O2, O3, O4, O5, O6, O7) \ + tr0_0 = _mm_unpacklo_epi16(I0, I1); \ + tr0_1 = _mm_unpacklo_epi16(I2, I3); \ + tr0_2 = _mm_unpackhi_epi16(I0, I1); \ + tr0_3 = _mm_unpackhi_epi16(I2, I3); \ + tr0_4 = _mm_unpacklo_epi16(I4, I5); \ + tr0_5 = _mm_unpacklo_epi16(I6, I7); \ + tr0_6 = _mm_unpackhi_epi16(I4, I5); \ + tr0_7 = _mm_unpackhi_epi16(I6, I7); \ + tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ + tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \ + tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ + tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \ + tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ + tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \ + tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ + tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \ + O0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ + O1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ + O2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ + O3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ + O4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \ + O5 = _mm_unpackhi_epi64(tr1_1, tr1_5); 
\ + O6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \ + O7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \ + + TRANSPOSE_8x8_16BIT(res00[0], res01[0], res02[0], res03[0], res04[0], res05[0], res06[0], res07[0], in00[0], in01[0], in02[0], in03[0], in04[0], in05[0], in06[0], in07[0]) + TRANSPOSE_8x8_16BIT(res00[1], res01[1], res02[1], res03[1], res04[1], res05[1], res06[1], res07[1], in08[0], in09[0], in10[0], in11[0], in12[0], in13[0], in14[0], in15[0]) + TRANSPOSE_8x8_16BIT(res00[2], res01[2], res02[2], res03[2], res04[2], res05[2], res06[2], res07[2], in16[0], in17[0], in18[0], in19[0], in20[0], in21[0], in22[0], in23[0]) + TRANSPOSE_8x8_16BIT(res00[3], res01[3], res02[3], res03[3], res04[3], res05[3], res06[3], res07[3], in24[0], in25[0], in26[0], in27[0], in28[0], in29[0], in30[0], in31[0]) + + TRANSPOSE_8x8_16BIT(res08[0], res09[0], res10[0], res11[0], res12[0], res13[0], res14[0], res15[0], in00[1], in01[1], in02[1], in03[1], in04[1], in05[1], in06[1], in07[1]) + TRANSPOSE_8x8_16BIT(res08[1], res09[1], res10[1], res11[1], res12[1], res13[1], res14[1], res15[1], in08[1], in09[1], in10[1], in11[1], in12[1], in13[1], in14[1], in15[1]) + TRANSPOSE_8x8_16BIT(res08[2], res09[2], res10[2], res11[2], res12[2], res13[2], res14[2], res15[2], in16[1], in17[1], in18[1], in19[1], in20[1], in21[1], in22[1], in23[1]) + TRANSPOSE_8x8_16BIT(res08[3], res09[3], res10[3], res11[3], res12[3], res13[3], res14[3], res15[3], in24[1], in25[1], in26[1], in27[1], in28[1], in29[1], in30[1], in31[1]) + + TRANSPOSE_8x8_16BIT(res16[0], res17[0], res18[0], res19[0], res20[0], res21[0], res22[0], res23[0], in00[2], in01[2], in02[2], in03[2], in04[2], in05[2], in06[2], in07[2]) + TRANSPOSE_8x8_16BIT(res16[1], res17[1], res18[1], res19[1], res20[1], res21[1], res22[1], res23[1], in08[2], in09[2], in10[2], in11[2], in12[2], in13[2], in14[2], in15[2]) + TRANSPOSE_8x8_16BIT(res16[2], res17[2], res18[2], res19[2], res20[2], res21[2], res22[2], res23[2], in16[2], in17[2], in18[2], in19[2], in20[2], in21[2], in22[2], 
in23[2]) + TRANSPOSE_8x8_16BIT(res16[3], res17[3], res18[3], res19[3], res20[3], res21[3], res22[3], res23[3], in24[2], in25[2], in26[2], in27[2], in28[2], in29[2], in30[2], in31[2]) + + TRANSPOSE_8x8_16BIT(res24[0], res25[0], res26[0], res27[0], res28[0], res29[0], res30[0], res31[0], in00[3], in01[3], in02[3], in03[3], in04[3], in05[3], in06[3], in07[3]) + TRANSPOSE_8x8_16BIT(res24[1], res25[1], res26[1], res27[1], res28[1], res29[1], res30[1], res31[1], in08[3], in09[3], in10[3], in11[3], in12[3], in13[3], in14[3], in15[3]) + TRANSPOSE_8x8_16BIT(res24[2], res25[2], res26[2], res27[2], res28[2], res29[2], res30[2], res31[2], in16[3], in17[3], in18[3], in19[3], in20[3], in21[3], in22[3], in23[3]) + TRANSPOSE_8x8_16BIT(res24[3], res25[3], res26[3], res27[3], res28[3], res29[3], res30[3], res31[3], in24[3], in25[3], in26[3], in27[3], in28[3], in29[3], in30[3], in31[3]) + +#undef TRANSPOSE_8x8_16BIT + } + } + + // Add + for (int i = 0; i < 2; i++) + { +#define STORE_LINE(L0, L1, L2, L3, L4, L5, L6, L7, H0, H1, H2, H3, H4, H5, H6, H7, offsetV, offsetH) \ + _mm_storeu_si128((__m128i*)&dst[(0 + (offsetV)) * stride + (offsetH) + 0], L0); \ + _mm_storeu_si128((__m128i*)&dst[(0 + (offsetV)) * stride + (offsetH) + 8], H0); \ + _mm_storeu_si128((__m128i*)&dst[(1 + (offsetV)) * stride + (offsetH) + 0], L1); \ + _mm_storeu_si128((__m128i*)&dst[(1 + (offsetV)) * stride + (offsetH) + 8], H1); \ + _mm_storeu_si128((__m128i*)&dst[(2 + (offsetV)) * stride + (offsetH) + 0], L2); \ + _mm_storeu_si128((__m128i*)&dst[(2 + (offsetV)) * stride + (offsetH) + 8], H2); \ + _mm_storeu_si128((__m128i*)&dst[(3 + (offsetV)) * stride + (offsetH) + 0], L3); \ + _mm_storeu_si128((__m128i*)&dst[(3 + (offsetV)) * stride + (offsetH) + 8], H3); \ + _mm_storeu_si128((__m128i*)&dst[(4 + (offsetV)) * stride + (offsetH) + 0], L4); \ + _mm_storeu_si128((__m128i*)&dst[(4 + (offsetV)) * stride + (offsetH) + 8], H4); \ + _mm_storeu_si128((__m128i*)&dst[(5 + (offsetV)) * stride + (offsetH) + 0], L5); \ + 
_mm_storeu_si128((__m128i*)&dst[(5 + (offsetV)) * stride + (offsetH) + 8], H5); \ + _mm_storeu_si128((__m128i*)&dst[(6 + (offsetV)) * stride + (offsetH) + 0], L6); \ + _mm_storeu_si128((__m128i*)&dst[(6 + (offsetV)) * stride + (offsetH) + 8], H6); \ + _mm_storeu_si128((__m128i*)&dst[(7 + (offsetV)) * stride + (offsetH) + 0], L7); \ + _mm_storeu_si128((__m128i*)&dst[(7 + (offsetV)) * stride + (offsetH) + 8], H7); + + const int k = i * 2; + STORE_LINE(in00[k], in01[k], in02[k], in03[k], in04[k], in05[k], in06[k], in07[k], in00[k + 1], in01[k + 1], in02[k + 1], in03[k + 1], in04[k + 1], in05[k + 1], in06[k + 1], in07[k + 1], 0, i * 16) + STORE_LINE(in08[k], in09[k], in10[k], in11[k], in12[k], in13[k], in14[k], in15[k], in08[k + 1], in09[k + 1], in10[k + 1], in11[k + 1], in12[k + 1], in13[k + 1], in14[k + 1], in15[k + 1], 8, i * 16) + STORE_LINE(in16[k], in17[k], in18[k], in19[k], in20[k], in21[k], in22[k], in23[k], in16[k + 1], in17[k + 1], in18[k + 1], in19[k + 1], in20[k + 1], in21[k + 1], in22[k + 1], in23[k + 1], 16, i * 16) + STORE_LINE(in24[k], in25[k], in26[k], in27[k], in28[k], in29[k], in30[k], in31[k], in24[k + 1], in25[k + 1], in26[k + 1], in27[k + 1], in28[k + 1], in29[k + 1], in30[k + 1], in31[k + 1], 24, i * 16) +#undef STORE_LINE + } +} + +#endif // if !HIGH_BIT_DEPTH +} + +namespace x265 { +void Setup_Vec_DCTPrimitives_sse3(EncoderPrimitives &p) +{ + /* Note: We have AVX2 assembly for these two functions, but since AVX2 is + * still somewhat rare on end-user PCs we still compile and link these SSE3 + * intrinsic SIMD functions */ +#if !HIGH_BIT_DEPTH + p.idct[IDCT_8x8] = idct8; + p.idct[IDCT_16x16] = idct16; + p.idct[IDCT_32x32] = idct32; +#endif +} +} diff --git a/source/common/vec/dct-sse41.cpp b/source/common/vec/dct-sse41.cpp new file mode 100644 index 0000000..aa52709 --- /dev/null +++ b/source/common/vec/dct-sse41.cpp @@ -0,0 +1,118 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 
project + * + * Authors: Steve Borho + * Mandar Gurav + * Deepthi Devaki Akkoorath + * Mahesh Pittala + * Rajesh Paulraj + * Min Chen + * Praveen Kumar Tiwari + * Nabajit Deka + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. 
+ *****************************************************************************/ + +#include "common.h" +#include "primitives.h" +#include // SSE +#include // SSE4.1 + +using namespace x265; + +namespace { +void dequant_scaling(const int16_t* quantCoef, const int32_t *deQuantCoef, int32_t* coef, int num, int per, int shift) +{ + X265_CHECK(num <= 32 * 32, "dequant num too large\n"); + + int valueToAdd; + + shift += 4; + + if (shift > per) + { + valueToAdd = 1 << (shift - per - 1); + __m128i IAdd = _mm_set1_epi32(valueToAdd); + + for (int n = 0; n < num; n = n + 8) + { + __m128i quantCoef1, quantCoef2, deQuantCoef1, deQuantCoef2, quantCoef12, sign; + + quantCoef12 = _mm_loadu_si128((__m128i*)(quantCoef + n)); + + deQuantCoef1 = _mm_loadu_si128((__m128i*)(deQuantCoef + n)); + deQuantCoef2 = _mm_loadu_si128((__m128i*)(deQuantCoef + n + 4)); + + sign = _mm_srai_epi16(quantCoef12, 15); + quantCoef1 = _mm_unpacklo_epi16(quantCoef12, sign); + quantCoef2 = _mm_unpackhi_epi16(quantCoef12, sign); + + quantCoef1 = _mm_sra_epi32(_mm_add_epi32(_mm_mullo_epi32(quantCoef1, deQuantCoef1), IAdd), _mm_cvtsi32_si128(shift - per)); + quantCoef2 = _mm_sra_epi32(_mm_add_epi32(_mm_mullo_epi32(quantCoef2, deQuantCoef2), IAdd), _mm_cvtsi32_si128(shift - per)); + + quantCoef12 = _mm_packs_epi32(quantCoef1, quantCoef2); + sign = _mm_srai_epi16(quantCoef12, 15); + quantCoef1 = _mm_unpacklo_epi16(quantCoef12, sign); + _mm_storeu_si128((__m128i*)(coef + n), quantCoef1); + quantCoef2 = _mm_unpackhi_epi16(quantCoef12, sign); + _mm_storeu_si128((__m128i*)(coef + n + 4), quantCoef2); + } + } + else + { + for (int n = 0; n < num; n = n + 8) + { + __m128i quantCoef1, quantCoef2, deQuantCoef1, deQuantCoef2, quantCoef12, sign; + + quantCoef12 = _mm_loadu_si128((__m128i*)(quantCoef + n)); + + deQuantCoef1 = _mm_loadu_si128((__m128i*)(deQuantCoef + n)); + deQuantCoef2 = _mm_loadu_si128((__m128i*)(deQuantCoef + n + 4)); + + sign = _mm_srai_epi16(quantCoef12, 15); + quantCoef1 = 
_mm_unpacklo_epi16(quantCoef12, sign); + quantCoef2 = _mm_unpackhi_epi16(quantCoef12, sign); + + quantCoef1 = _mm_mullo_epi32(quantCoef1, deQuantCoef1); + quantCoef2 = _mm_mullo_epi32(quantCoef2, deQuantCoef2); + + quantCoef12 = _mm_packs_epi32(quantCoef1, quantCoef2); + sign = _mm_srai_epi16(quantCoef12, 15); + quantCoef1 = _mm_unpacklo_epi16(quantCoef12, sign); + quantCoef2 = _mm_unpackhi_epi16(quantCoef12, sign); + + quantCoef1 = _mm_sll_epi32(quantCoef1, _mm_cvtsi32_si128(per - shift)); + quantCoef2 = _mm_sll_epi32(quantCoef2, _mm_cvtsi32_si128(per - shift)); + + quantCoef12 = _mm_packs_epi32(quantCoef1, quantCoef2); + sign = _mm_srai_epi16(quantCoef12, 15); + quantCoef1 = _mm_unpacklo_epi16(quantCoef12, sign); + _mm_storeu_si128((__m128i*)(coef + n), quantCoef1); + quantCoef2 = _mm_unpackhi_epi16(quantCoef12, sign); + _mm_storeu_si128((__m128i*)(coef + n + 4), quantCoef2); + } + } +} +} + +namespace x265 { +void Setup_Vec_DCTPrimitives_sse41(EncoderPrimitives &p) +{ + p.dequant_scaling = dequant_scaling; +} +} diff --git a/source/common/vec/dct-ssse3.cpp b/source/common/vec/dct-ssse3.cpp new file mode 100644 index 0000000..bbb7858 --- /dev/null +++ b/source/common/vec/dct-ssse3.cpp @@ -0,0 +1,1108 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * Mandar Gurav + * Deepthi Devaki Akkoorath + * Mahesh Pittala + * Rajesh Paulraj + * Min Chen + * Praveen Kumar Tiwari + * Nabajit Deka + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#include "common.h" +#include "primitives.h" +#include // SSE +#include // SSE3 +#include // SSSE3 + +using namespace x265; + +#if !HIGH_BIT_DEPTH +namespace { +ALIGN_VAR_32(static const int16_t, tab_dct_8[][8]) = +{ + { 0x0100, 0x0F0E, 0x0706, 0x0908, 0x0302, 0x0D0C, 0x0504, 0x0B0A }, + + { 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, -64, 64, -64, 64, -64, 64, -64 }, + { 83, 36, 83, 36, 83, 36, 83, 36 }, + { 36, -83, 36, -83, 36, -83, 36, -83 }, + { 89, 18, 75, 50, 89, 18, 75, 50 }, + { 75, -50, -18, -89, 75, -50, -18, -89 }, + { 50, 75, -89, 18, 50, 75, -89, 18 }, + { 18, -89, -50, 75, 18, -89, -50, 75 }, + + { 83, 83, -83, -83, 36, 36, -36, -36 }, + { 36, 36, -36, -36, -83, -83, 83, 83 }, + { 89, -89, 18, -18, 75, -75, 50, -50 }, + { 75, -75, -50, 50, -18, 18, -89, 89 }, + { 50, -50, 75, -75, -89, 89, 18, -18 }, + { 18, -18, -89, 89, -50, 50, 75, -75 }, +}; + +ALIGN_VAR_32(static const int16_t, tab_dct_16_0[][8]) = +{ + { 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100 }, // 0 + { 0x0100, 0x0F0E, 0x0706, 0x0908, 0x0302, 0x0D0C, 0x0504, 0x0B0A }, // 1 + { 0x0100, 0x0706, 0x0302, 0x0504, 0x0F0E, 0x0908, 0x0D0C, 0x0B0A }, // 2 + { 0x0F0E, 0x0908, 0x0D0C, 0x0B0A, 0x0100, 0x0706, 0x0302, 0x0504 }, // 3 +}; + +ALIGN_VAR_32(static const int16_t, tab_dct_16_1[][8]) = +{ + { 90, 87, 80, 70, 57, 43, 25, 9 }, // 0 + { 87, 57, 9, -43, -80, -90, -70, -25 }, // 1 + { 80, 9, -70, -87, -25, 57, 90, 43 }, // 2 + { 70, -43, -87, 9, 90, 25, -80, -57 }, // 3 + { 57, -80, -25, 90, 
-9, -87, 43, 70 }, // 4 + { 43, -90, 57, 25, -87, 70, 9, -80 }, // 5 + { 25, -70, 90, -80, 43, 9, -57, 87 }, // 6 + { 9, -25, 43, -57, 70, -80, 87, -90 }, // 7 + { 83, 83, -83, -83, 36, 36, -36, -36 }, // 8 + { 36, 36, -36, -36, -83, -83, 83, 83 }, // 9 + { 89, 89, 18, 18, 75, 75, 50, 50 }, // 10 + { 75, 75, -50, -50, -18, -18, -89, -89 }, // 11 + { 50, 50, 75, 75, -89, -89, 18, 18 }, // 12 + { 18, 18, -89, -89, -50, -50, 75, 75 }, // 13 + +#define MAKE_COEF(a0, a1, a2, a3, a4, a5, a6, a7) \ + { (a0), -(a0), (a3), -(a3), (a1), -(a1), (a2), -(a2) \ + }, \ + { (a7), -(a7), (a4), -(a4), (a6), -(a6), (a5), -(a5) }, + + MAKE_COEF(90, 87, 80, 70, 57, 43, 25, 9) + MAKE_COEF(87, 57, 9, -43, -80, -90, -70, -25) + MAKE_COEF(80, 9, -70, -87, -25, 57, 90, 43) + MAKE_COEF(70, -43, -87, 9, 90, 25, -80, -57) + MAKE_COEF(57, -80, -25, 90, -9, -87, 43, 70) + MAKE_COEF(43, -90, 57, 25, -87, 70, 9, -80) + MAKE_COEF(25, -70, 90, -80, 43, 9, -57, 87) + MAKE_COEF(9, -25, 43, -57, 70, -80, 87, -90) +#undef MAKE_COEF +}; + +void dct16(int16_t *src, int32_t *dst, intptr_t stride) +{ + // Const + __m128i c_4 = _mm_set1_epi32(4); + __m128i c_512 = _mm_set1_epi32(512); + + int i; + + ALIGN_VAR_32(int16_t, tmp[16 * 16]); + + __m128i T00A, T01A, T02A, T03A, T04A, T05A, T06A, T07A; + __m128i T00B, T01B, T02B, T03B, T04B, T05B, T06B, T07B; + __m128i T10, T11, T12, T13, T14, T15, T16, T17; + __m128i T20, T21, T22, T23, T24, T25, T26, T27; + __m128i T30, T31, T32, T33, T34, T35, T36, T37; + __m128i T40, T41, T42, T43, T44, T45, T46, T47; + __m128i T50, T51, T52, T53; + __m128i T60, T61, T62, T63, T64, T65, T66, T67; + __m128i T70; + + // DCT1 + for (i = 0; i < 16; i += 8) + { + T00A = _mm_load_si128((__m128i*)&src[(i + 0) * stride + 0]); // [07 06 05 04 03 02 01 00] + T00B = _mm_load_si128((__m128i*)&src[(i + 0) * stride + 8]); // [0F 0E 0D 0C 0B 0A 09 08] + T01A = _mm_load_si128((__m128i*)&src[(i + 1) * stride + 0]); // [17 16 15 14 13 12 11 10] + T01B = _mm_load_si128((__m128i*)&src[(i + 1) * 
stride + 8]); // [1F 1E 1D 1C 1B 1A 19 18] + T02A = _mm_load_si128((__m128i*)&src[(i + 2) * stride + 0]); // [27 26 25 24 23 22 21 20] + T02B = _mm_load_si128((__m128i*)&src[(i + 2) * stride + 8]); // [2F 2E 2D 2C 2B 2A 29 28] + T03A = _mm_load_si128((__m128i*)&src[(i + 3) * stride + 0]); // [37 36 35 34 33 32 31 30] + T03B = _mm_load_si128((__m128i*)&src[(i + 3) * stride + 8]); // [3F 3E 3D 3C 3B 3A 39 38] + T04A = _mm_load_si128((__m128i*)&src[(i + 4) * stride + 0]); // [47 46 45 44 43 42 41 40] + T04B = _mm_load_si128((__m128i*)&src[(i + 4) * stride + 8]); // [4F 4E 4D 4C 4B 4A 49 48] + T05A = _mm_load_si128((__m128i*)&src[(i + 5) * stride + 0]); // [57 56 55 54 53 52 51 50] + T05B = _mm_load_si128((__m128i*)&src[(i + 5) * stride + 8]); // [5F 5E 5D 5C 5B 5A 59 58] + T06A = _mm_load_si128((__m128i*)&src[(i + 6) * stride + 0]); // [67 66 65 64 63 62 61 60] + T06B = _mm_load_si128((__m128i*)&src[(i + 6) * stride + 8]); // [6F 6E 6D 6C 6B 6A 69 68] + T07A = _mm_load_si128((__m128i*)&src[(i + 7) * stride + 0]); // [77 76 75 74 73 72 71 70] + T07B = _mm_load_si128((__m128i*)&src[(i + 7) * stride + 8]); // [7F 7E 7D 7C 7B 7A 79 78] + + T00B = _mm_shuffle_epi8(T00B, _mm_load_si128((__m128i*)tab_dct_16_0[0])); + T01B = _mm_shuffle_epi8(T01B, _mm_load_si128((__m128i*)tab_dct_16_0[0])); + T02B = _mm_shuffle_epi8(T02B, _mm_load_si128((__m128i*)tab_dct_16_0[0])); + T03B = _mm_shuffle_epi8(T03B, _mm_load_si128((__m128i*)tab_dct_16_0[0])); + T04B = _mm_shuffle_epi8(T04B, _mm_load_si128((__m128i*)tab_dct_16_0[0])); + T05B = _mm_shuffle_epi8(T05B, _mm_load_si128((__m128i*)tab_dct_16_0[0])); + T06B = _mm_shuffle_epi8(T06B, _mm_load_si128((__m128i*)tab_dct_16_0[0])); + T07B = _mm_shuffle_epi8(T07B, _mm_load_si128((__m128i*)tab_dct_16_0[0])); + + T10 = _mm_add_epi16(T00A, T00B); + T11 = _mm_add_epi16(T01A, T01B); + T12 = _mm_add_epi16(T02A, T02B); + T13 = _mm_add_epi16(T03A, T03B); + T14 = _mm_add_epi16(T04A, T04B); + T15 = _mm_add_epi16(T05A, T05B); + T16 = _mm_add_epi16(T06A, 
T06B); + T17 = _mm_add_epi16(T07A, T07B); + + T20 = _mm_sub_epi16(T00A, T00B); + T21 = _mm_sub_epi16(T01A, T01B); + T22 = _mm_sub_epi16(T02A, T02B); + T23 = _mm_sub_epi16(T03A, T03B); + T24 = _mm_sub_epi16(T04A, T04B); + T25 = _mm_sub_epi16(T05A, T05B); + T26 = _mm_sub_epi16(T06A, T06B); + T27 = _mm_sub_epi16(T07A, T07B); + + T30 = _mm_shuffle_epi8(T10, _mm_load_si128((__m128i*)tab_dct_16_0[1])); + T31 = _mm_shuffle_epi8(T11, _mm_load_si128((__m128i*)tab_dct_16_0[1])); + T32 = _mm_shuffle_epi8(T12, _mm_load_si128((__m128i*)tab_dct_16_0[1])); + T33 = _mm_shuffle_epi8(T13, _mm_load_si128((__m128i*)tab_dct_16_0[1])); + T34 = _mm_shuffle_epi8(T14, _mm_load_si128((__m128i*)tab_dct_16_0[1])); + T35 = _mm_shuffle_epi8(T15, _mm_load_si128((__m128i*)tab_dct_16_0[1])); + T36 = _mm_shuffle_epi8(T16, _mm_load_si128((__m128i*)tab_dct_16_0[1])); + T37 = _mm_shuffle_epi8(T17, _mm_load_si128((__m128i*)tab_dct_16_0[1])); + + T40 = _mm_hadd_epi16(T30, T31); + T41 = _mm_hadd_epi16(T32, T33); + T42 = _mm_hadd_epi16(T34, T35); + T43 = _mm_hadd_epi16(T36, T37); + T44 = _mm_hsub_epi16(T30, T31); + T45 = _mm_hsub_epi16(T32, T33); + T46 = _mm_hsub_epi16(T34, T35); + T47 = _mm_hsub_epi16(T36, T37); + + T50 = _mm_hadd_epi16(T40, T41); + T51 = _mm_hadd_epi16(T42, T43); + T52 = _mm_hsub_epi16(T40, T41); + T53 = _mm_hsub_epi16(T42, T43); + + T60 = _mm_madd_epi16(T50, _mm_load_si128((__m128i*)tab_dct_8[1])); + T61 = _mm_madd_epi16(T51, _mm_load_si128((__m128i*)tab_dct_8[1])); + T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3); + T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3); + T70 = _mm_packs_epi32(T60, T61); + _mm_store_si128((__m128i*)&tmp[0 * 16 + i], T70); + + T60 = _mm_madd_epi16(T50, _mm_load_si128((__m128i*)tab_dct_8[2])); + T61 = _mm_madd_epi16(T51, _mm_load_si128((__m128i*)tab_dct_8[2])); + T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3); + T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3); + T70 = _mm_packs_epi32(T60, T61); + _mm_store_si128((__m128i*)&tmp[8 * 16 + i], T70); + + T60 
= _mm_madd_epi16(T52, _mm_load_si128((__m128i*)tab_dct_8[3])); + T61 = _mm_madd_epi16(T53, _mm_load_si128((__m128i*)tab_dct_8[3])); + T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3); + T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3); + T70 = _mm_packs_epi32(T60, T61); + _mm_store_si128((__m128i*)&tmp[4 * 16 + i], T70); + + T60 = _mm_madd_epi16(T52, _mm_load_si128((__m128i*)tab_dct_8[4])); + T61 = _mm_madd_epi16(T53, _mm_load_si128((__m128i*)tab_dct_8[4])); + T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3); + T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3); + T70 = _mm_packs_epi32(T60, T61); + _mm_store_si128((__m128i*)&tmp[12 * 16 + i], T70); + + T60 = _mm_madd_epi16(T44, _mm_load_si128((__m128i*)tab_dct_8[5])); + T61 = _mm_madd_epi16(T45, _mm_load_si128((__m128i*)tab_dct_8[5])); + T62 = _mm_madd_epi16(T46, _mm_load_si128((__m128i*)tab_dct_8[5])); + T63 = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[5])); + T60 = _mm_hadd_epi32(T60, T61); + T61 = _mm_hadd_epi32(T62, T63); + T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3); + T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3); + T70 = _mm_packs_epi32(T60, T61); + _mm_store_si128((__m128i*)&tmp[2 * 16 + i], T70); + + T60 = _mm_madd_epi16(T44, _mm_load_si128((__m128i*)tab_dct_8[6])); + T61 = _mm_madd_epi16(T45, _mm_load_si128((__m128i*)tab_dct_8[6])); + T62 = _mm_madd_epi16(T46, _mm_load_si128((__m128i*)tab_dct_8[6])); + T63 = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[6])); + T60 = _mm_hadd_epi32(T60, T61); + T61 = _mm_hadd_epi32(T62, T63); + T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3); + T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3); + T70 = _mm_packs_epi32(T60, T61); + _mm_store_si128((__m128i*)&tmp[6 * 16 + i], T70); + + T60 = _mm_madd_epi16(T44, _mm_load_si128((__m128i*)tab_dct_8[7])); + T61 = _mm_madd_epi16(T45, _mm_load_si128((__m128i*)tab_dct_8[7])); + T62 = _mm_madd_epi16(T46, _mm_load_si128((__m128i*)tab_dct_8[7])); + T63 = _mm_madd_epi16(T47, 
_mm_load_si128((__m128i*)tab_dct_8[7])); + T60 = _mm_hadd_epi32(T60, T61); + T61 = _mm_hadd_epi32(T62, T63); + T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3); + T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3); + T70 = _mm_packs_epi32(T60, T61); + _mm_store_si128((__m128i*)&tmp[10 * 16 + i], T70); + + T60 = _mm_madd_epi16(T44, _mm_load_si128((__m128i*)tab_dct_8[8])); + T61 = _mm_madd_epi16(T45, _mm_load_si128((__m128i*)tab_dct_8[8])); + T62 = _mm_madd_epi16(T46, _mm_load_si128((__m128i*)tab_dct_8[8])); + T63 = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[8])); + T60 = _mm_hadd_epi32(T60, T61); + T61 = _mm_hadd_epi32(T62, T63); + T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3); + T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3); + T70 = _mm_packs_epi32(T60, T61); + _mm_store_si128((__m128i*)&tmp[14 * 16 + i], T70); + +#define MAKE_ODD(tab, dstPos) \ + T60 = _mm_madd_epi16(T20, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \ + T61 = _mm_madd_epi16(T21, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \ + T62 = _mm_madd_epi16(T22, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \ + T63 = _mm_madd_epi16(T23, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \ + T64 = _mm_madd_epi16(T24, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \ + T65 = _mm_madd_epi16(T25, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \ + T66 = _mm_madd_epi16(T26, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \ + T67 = _mm_madd_epi16(T27, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \ + T60 = _mm_hadd_epi32(T60, T61); \ + T61 = _mm_hadd_epi32(T62, T63); \ + T62 = _mm_hadd_epi32(T64, T65); \ + T63 = _mm_hadd_epi32(T66, T67); \ + T60 = _mm_hadd_epi32(T60, T61); \ + T61 = _mm_hadd_epi32(T62, T63); \ + T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3); \ + T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3); \ + T70 = _mm_packs_epi32(T60, T61); \ + _mm_store_si128((__m128i*)&tmp[(dstPos) * 16 + i], T70); + + MAKE_ODD(0, 1); + MAKE_ODD(1, 3); + MAKE_ODD(2, 5); + MAKE_ODD(3, 7); + 
MAKE_ODD(4, 9); + MAKE_ODD(5, 11); + MAKE_ODD(6, 13); + MAKE_ODD(7, 15); +#undef MAKE_ODD + } + + // DCT2 + for (i = 0; i < 16; i += 4) + { + T00A = _mm_load_si128((__m128i*)&tmp[(i + 0) * 16 + 0]); // [07 06 05 04 03 02 01 00] + T00B = _mm_load_si128((__m128i*)&tmp[(i + 0) * 16 + 8]); // [0F 0E 0D 0C 0B 0A 09 08] + T01A = _mm_load_si128((__m128i*)&tmp[(i + 1) * 16 + 0]); // [17 16 15 14 13 12 11 10] + T01B = _mm_load_si128((__m128i*)&tmp[(i + 1) * 16 + 8]); // [1F 1E 1D 1C 1B 1A 19 18] + T02A = _mm_load_si128((__m128i*)&tmp[(i + 2) * 16 + 0]); // [27 26 25 24 23 22 21 20] + T02B = _mm_load_si128((__m128i*)&tmp[(i + 2) * 16 + 8]); // [2F 2E 2D 2C 2B 2A 29 28] + T03A = _mm_load_si128((__m128i*)&tmp[(i + 3) * 16 + 0]); // [37 36 35 34 33 32 31 30] + T03B = _mm_load_si128((__m128i*)&tmp[(i + 3) * 16 + 8]); // [3F 3E 3D 3C 3B 3A 39 38] + + T00A = _mm_shuffle_epi8(T00A, _mm_load_si128((__m128i*)tab_dct_16_0[2])); + T00B = _mm_shuffle_epi8(T00B, _mm_load_si128((__m128i*)tab_dct_16_0[3])); + T01A = _mm_shuffle_epi8(T01A, _mm_load_si128((__m128i*)tab_dct_16_0[2])); + T01B = _mm_shuffle_epi8(T01B, _mm_load_si128((__m128i*)tab_dct_16_0[3])); + T02A = _mm_shuffle_epi8(T02A, _mm_load_si128((__m128i*)tab_dct_16_0[2])); + T02B = _mm_shuffle_epi8(T02B, _mm_load_si128((__m128i*)tab_dct_16_0[3])); + T03A = _mm_shuffle_epi8(T03A, _mm_load_si128((__m128i*)tab_dct_16_0[2])); + T03B = _mm_shuffle_epi8(T03B, _mm_load_si128((__m128i*)tab_dct_16_0[3])); + + T10 = _mm_unpacklo_epi16(T00A, T00B); + T11 = _mm_unpackhi_epi16(T00A, T00B); + T12 = _mm_unpacklo_epi16(T01A, T01B); + T13 = _mm_unpackhi_epi16(T01A, T01B); + T14 = _mm_unpacklo_epi16(T02A, T02B); + T15 = _mm_unpackhi_epi16(T02A, T02B); + T16 = _mm_unpacklo_epi16(T03A, T03B); + T17 = _mm_unpackhi_epi16(T03A, T03B); + + T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_8[1])); + T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_8[1])); + T22 = _mm_madd_epi16(T12, _mm_load_si128((__m128i*)tab_dct_8[1])); + T23 = 
_mm_madd_epi16(T13, _mm_load_si128((__m128i*)tab_dct_8[1])); + T24 = _mm_madd_epi16(T14, _mm_load_si128((__m128i*)tab_dct_8[1])); + T25 = _mm_madd_epi16(T15, _mm_load_si128((__m128i*)tab_dct_8[1])); + T26 = _mm_madd_epi16(T16, _mm_load_si128((__m128i*)tab_dct_8[1])); + T27 = _mm_madd_epi16(T17, _mm_load_si128((__m128i*)tab_dct_8[1])); + + T30 = _mm_add_epi32(T20, T21); + T31 = _mm_add_epi32(T22, T23); + T32 = _mm_add_epi32(T24, T25); + T33 = _mm_add_epi32(T26, T27); + + T30 = _mm_hadd_epi32(T30, T31); + T31 = _mm_hadd_epi32(T32, T33); + + T40 = _mm_hadd_epi32(T30, T31); + T41 = _mm_hsub_epi32(T30, T31); + T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10); + T41 = _mm_srai_epi32(_mm_add_epi32(T41, c_512), 10); + _mm_storeu_si128((__m128i*)&dst[0 * 16 + i], T40); + _mm_storeu_si128((__m128i*)&dst[8 * 16 + i], T41); + + T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[8])); + T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[8])); + T22 = _mm_madd_epi16(T12, _mm_load_si128((__m128i*)tab_dct_16_1[8])); + T23 = _mm_madd_epi16(T13, _mm_load_si128((__m128i*)tab_dct_16_1[8])); + T24 = _mm_madd_epi16(T14, _mm_load_si128((__m128i*)tab_dct_16_1[8])); + T25 = _mm_madd_epi16(T15, _mm_load_si128((__m128i*)tab_dct_16_1[8])); + T26 = _mm_madd_epi16(T16, _mm_load_si128((__m128i*)tab_dct_16_1[8])); + T27 = _mm_madd_epi16(T17, _mm_load_si128((__m128i*)tab_dct_16_1[8])); + + T30 = _mm_add_epi32(T20, T21); + T31 = _mm_add_epi32(T22, T23); + T32 = _mm_add_epi32(T24, T25); + T33 = _mm_add_epi32(T26, T27); + + T30 = _mm_hadd_epi32(T30, T31); + T31 = _mm_hadd_epi32(T32, T33); + + T40 = _mm_hadd_epi32(T30, T31); + T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10); + _mm_storeu_si128((__m128i*)&dst[4 * 16 + i], T40); + + T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[9])); + T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[9])); + T22 = _mm_madd_epi16(T12, _mm_load_si128((__m128i*)tab_dct_16_1[9])); + T23 = _mm_madd_epi16(T13, 
_mm_load_si128((__m128i*)tab_dct_16_1[9])); + T24 = _mm_madd_epi16(T14, _mm_load_si128((__m128i*)tab_dct_16_1[9])); + T25 = _mm_madd_epi16(T15, _mm_load_si128((__m128i*)tab_dct_16_1[9])); + T26 = _mm_madd_epi16(T16, _mm_load_si128((__m128i*)tab_dct_16_1[9])); + T27 = _mm_madd_epi16(T17, _mm_load_si128((__m128i*)tab_dct_16_1[9])); + + T30 = _mm_add_epi32(T20, T21); + T31 = _mm_add_epi32(T22, T23); + T32 = _mm_add_epi32(T24, T25); + T33 = _mm_add_epi32(T26, T27); + + T30 = _mm_hadd_epi32(T30, T31); + T31 = _mm_hadd_epi32(T32, T33); + + T40 = _mm_hadd_epi32(T30, T31); + T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10); + _mm_storeu_si128((__m128i*)&dst[12 * 16 + i], T40); + + T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[10])); + T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[10])); + T22 = _mm_madd_epi16(T12, _mm_load_si128((__m128i*)tab_dct_16_1[10])); + T23 = _mm_madd_epi16(T13, _mm_load_si128((__m128i*)tab_dct_16_1[10])); + T24 = _mm_madd_epi16(T14, _mm_load_si128((__m128i*)tab_dct_16_1[10])); + T25 = _mm_madd_epi16(T15, _mm_load_si128((__m128i*)tab_dct_16_1[10])); + T26 = _mm_madd_epi16(T16, _mm_load_si128((__m128i*)tab_dct_16_1[10])); + T27 = _mm_madd_epi16(T17, _mm_load_si128((__m128i*)tab_dct_16_1[10])); + + T30 = _mm_sub_epi32(T20, T21); + T31 = _mm_sub_epi32(T22, T23); + T32 = _mm_sub_epi32(T24, T25); + T33 = _mm_sub_epi32(T26, T27); + + T30 = _mm_hadd_epi32(T30, T31); + T31 = _mm_hadd_epi32(T32, T33); + + T40 = _mm_hadd_epi32(T30, T31); + T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10); + _mm_storeu_si128((__m128i*)&dst[2 * 16 + i], T40); + + T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[11])); + T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[11])); + T22 = _mm_madd_epi16(T12, _mm_load_si128((__m128i*)tab_dct_16_1[11])); + T23 = _mm_madd_epi16(T13, _mm_load_si128((__m128i*)tab_dct_16_1[11])); + T24 = _mm_madd_epi16(T14, _mm_load_si128((__m128i*)tab_dct_16_1[11])); + T25 = 
_mm_madd_epi16(T15, _mm_load_si128((__m128i*)tab_dct_16_1[11])); + T26 = _mm_madd_epi16(T16, _mm_load_si128((__m128i*)tab_dct_16_1[11])); + T27 = _mm_madd_epi16(T17, _mm_load_si128((__m128i*)tab_dct_16_1[11])); + + T30 = _mm_sub_epi32(T20, T21); + T31 = _mm_sub_epi32(T22, T23); + T32 = _mm_sub_epi32(T24, T25); + T33 = _mm_sub_epi32(T26, T27); + + T30 = _mm_hadd_epi32(T30, T31); + T31 = _mm_hadd_epi32(T32, T33); + + T40 = _mm_hadd_epi32(T30, T31); + T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10); + _mm_storeu_si128((__m128i*)&dst[6 * 16 + i], T40); + + T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[12])); + T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[12])); + T22 = _mm_madd_epi16(T12, _mm_load_si128((__m128i*)tab_dct_16_1[12])); + T23 = _mm_madd_epi16(T13, _mm_load_si128((__m128i*)tab_dct_16_1[12])); + T24 = _mm_madd_epi16(T14, _mm_load_si128((__m128i*)tab_dct_16_1[12])); + T25 = _mm_madd_epi16(T15, _mm_load_si128((__m128i*)tab_dct_16_1[12])); + T26 = _mm_madd_epi16(T16, _mm_load_si128((__m128i*)tab_dct_16_1[12])); + T27 = _mm_madd_epi16(T17, _mm_load_si128((__m128i*)tab_dct_16_1[12])); + + T30 = _mm_sub_epi32(T20, T21); + T31 = _mm_sub_epi32(T22, T23); + T32 = _mm_sub_epi32(T24, T25); + T33 = _mm_sub_epi32(T26, T27); + + T30 = _mm_hadd_epi32(T30, T31); + T31 = _mm_hadd_epi32(T32, T33); + + T40 = _mm_hadd_epi32(T30, T31); + T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10); + _mm_storeu_si128((__m128i*)&dst[10 * 16 + i], T40); + + T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[13])); + T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[13])); + T22 = _mm_madd_epi16(T12, _mm_load_si128((__m128i*)tab_dct_16_1[13])); + T23 = _mm_madd_epi16(T13, _mm_load_si128((__m128i*)tab_dct_16_1[13])); + T24 = _mm_madd_epi16(T14, _mm_load_si128((__m128i*)tab_dct_16_1[13])); + T25 = _mm_madd_epi16(T15, _mm_load_si128((__m128i*)tab_dct_16_1[13])); + T26 = _mm_madd_epi16(T16, 
_mm_load_si128((__m128i*)tab_dct_16_1[13])); + T27 = _mm_madd_epi16(T17, _mm_load_si128((__m128i*)tab_dct_16_1[13])); + + T30 = _mm_sub_epi32(T20, T21); + T31 = _mm_sub_epi32(T22, T23); + T32 = _mm_sub_epi32(T24, T25); + T33 = _mm_sub_epi32(T26, T27); + + T30 = _mm_hadd_epi32(T30, T31); + T31 = _mm_hadd_epi32(T32, T33); + + T40 = _mm_hadd_epi32(T30, T31); + T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10); + _mm_storeu_si128((__m128i*)&dst[14 * 16 + i], T40); + +#define MAKE_ODD(tab, dstPos) \ + T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); /* [*O2_0 *O1_0 *O3_0 *O0_0] */ \ + T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[(tab) + 1])); /* [*O5_0 *O6_0 *O4_0 *O7_0] */ \ + T22 = _mm_madd_epi16(T12, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \ + T23 = _mm_madd_epi16(T13, _mm_load_si128((__m128i*)tab_dct_16_1[(tab) + 1])); \ + T24 = _mm_madd_epi16(T14, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \ + T25 = _mm_madd_epi16(T15, _mm_load_si128((__m128i*)tab_dct_16_1[(tab) + 1])); \ + T26 = _mm_madd_epi16(T16, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \ + T27 = _mm_madd_epi16(T17, _mm_load_si128((__m128i*)tab_dct_16_1[(tab) + 1])); \ + \ + T30 = _mm_add_epi32(T20, T21); \ + T31 = _mm_add_epi32(T22, T23); \ + T32 = _mm_add_epi32(T24, T25); \ + T33 = _mm_add_epi32(T26, T27); \ + \ + T30 = _mm_hadd_epi32(T30, T31); \ + T31 = _mm_hadd_epi32(T32, T33); \ + \ + T40 = _mm_hadd_epi32(T30, T31); \ + T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10); \ + _mm_storeu_si128((__m128i*)&dst[(dstPos) * 16 + i], T40); + + MAKE_ODD(14, 1); + MAKE_ODD(16, 3); + MAKE_ODD(18, 5); + MAKE_ODD(20, 7); + MAKE_ODD(22, 9); + MAKE_ODD(24, 11); + MAKE_ODD(26, 13); + MAKE_ODD(28, 15); +#undef MAKE_ODD + } +} + +ALIGN_VAR_32(static const int16_t, tab_dct_32_0[][8]) = +{ + { 0x0F0E, 0x0100, 0x0908, 0x0706, 0x0D0C, 0x0302, 0x0B0A, 0x0504 }, // 0 +}; + +ALIGN_VAR_32(static const int16_t, tab_dct_32_1[][8]) = +{ + { 89, -89, 18, -18, 75, -75, 
50, -50 }, // 0 + { 75, -75, -50, 50, -18, 18, -89, 89 }, // 1 + { 50, -50, 75, -75, -89, 89, 18, -18 }, // 2 + { 18, -18, -89, 89, -50, 50, 75, -75 }, // 3 + +#define MAKE_COEF8(a0, a1, a2, a3, a4, a5, a6, a7) \ + { (a0), (a7), (a3), (a4), (a1), (a6), (a2), (a5) \ + }, \ + + MAKE_COEF8(90, 87, 80, 70, 57, 43, 25, 9) // 4 + MAKE_COEF8(87, 57, 9, -43, -80, -90, -70, -25) // 5 + MAKE_COEF8(80, 9, -70, -87, -25, 57, 90, 43) // 6 + MAKE_COEF8(70, -43, -87, 9, 90, 25, -80, -57) // 7 + MAKE_COEF8(57, -80, -25, 90, -9, -87, 43, 70) // 8 + MAKE_COEF8(43, -90, 57, 25, -87, 70, 9, -80) // 9 + MAKE_COEF8(25, -70, 90, -80, 43, 9, -57, 87) // 10 + MAKE_COEF8(9, -25, 43, -57, 70, -80, 87, -90) // 11 +#undef MAKE_COEF8 + +#define MAKE_COEF16(a00, a01, a02, a03, a04, a05, a06, a07, a08, a09, a10, a11, a12, a13, a14, a15) \ + { (a00), (a07), (a03), (a04), (a01), (a06), (a02), (a05) }, \ + { (a15), (a08), (a12), (a11), (a14), (a09), (a13), (a10) }, + + MAKE_COEF16(90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4) // 12 + MAKE_COEF16(90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13) // 14 + MAKE_COEF16(88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22) // 16 + MAKE_COEF16(85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31) // 18 + MAKE_COEF16(82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38) // 20 + MAKE_COEF16(78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46) // 22 + MAKE_COEF16(73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54) // 24 + MAKE_COEF16(67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61) // 26 + MAKE_COEF16(61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67) // 28 + MAKE_COEF16(54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73) // 30 + MAKE_COEF16(46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78) // 32 + MAKE_COEF16(38, -88, 73, -4, -67, 90, -46, -31, 85, 
-78, 13, 61, -90, 54, 22, -82) // 34 + MAKE_COEF16(31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85) // 36 + MAKE_COEF16(22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88) // 38 + MAKE_COEF16(13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90) // 40 + MAKE_COEF16(4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90) // 42 +#undef MAKE_COEF16 + + { + 64, 64, 64, 64, 64, 64, 64, 64 + }, // 44 + + { 64, 64, -64, -64, -64, -64, 64, 64 }, // 45 + + { 83, 83, 36, 36, -36, -36, -83, -83 }, // 46 + { -83, -83, -36, -36, 36, 36, 83, 83 }, // 47 + + { 36, 36, -83, -83, 83, 83, -36, -36 }, // 48 + { -36, -36, 83, 83, -83, -83, 36, 36 }, // 49 + +#define MAKE_COEF16(a00, a01, a02, a03, a04, a05, a06, a07, a08, a09, a10, a11, a12, a13, a14, a15) \ + { (a00), (a00), (a01), (a01), (a02), (a02), (a03), (a03) }, \ + { (a04), (a04), (a05), (a05), (a06), (a06), (a07), (a07) }, \ + { (a08), (a08), (a09), (a09), (a10), (a10), (a11), (a11) }, \ + { (a12), (a12), (a13), (a13), (a14), (a14), (a15), (a15) }, + + MAKE_COEF16(89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89) // 50 + MAKE_COEF16(75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75) // 54 + + // TODO: convert below table here +#undef MAKE_COEF16 + + { + 50, 50, -89, -89, 18, 18, 75, 75 + }, // 58 + { -75, -75, -18, -18, 89, 89, -50, -50 }, // 59 + { -50, -50, 89, 89, -18, -18, -75, -75 }, // 60 + { 75, 75, 18, 18, -89, -89, 50, 50 }, // 61 + + { 18, 18, -50, -50, 75, 75, -89, -89 }, // 62 + { 89, 89, -75, -75, 50, 50, -18, -18 }, // 63 + { -18, -18, 50, 50, -75, -75, 89, 89 }, // 64 + { -89, -89, 75, 75, -50, -50, 18, 18 }, // 65 + + { 90, 90, 87, 87, 80, 80, 70, 70 }, // 66 + { 57, 57, 43, 43, 25, 25, 9, 9 }, // 67 + { -9, -9, -25, -25, -43, -43, -57, -57 }, // 68 + { -70, -70, -80, -80, -87, -87, -90, -90 }, // 69 + + { 87, 87, 57, 57, 9, 9, -43, -43 }, // 70 + { -80, -80, -90, -90, -70, -70, -25, -25 }, // 71 
+ { 25, 25, 70, 70, 90, 90, 80, 80 }, // 72 + { 43, 43, -9, -9, -57, -57, -87, -87 }, // 73 + + { 80, 80, 9, 9, -70, -70, -87, -87 }, // 74 + { -25, -25, 57, 57, 90, 90, 43, 43 }, // 75 + { -43, -43, -90, -90, -57, -57, 25, 25 }, // 76 + { 87, 87, 70, 70, -9, -9, -80, -80 }, // 77 + + { 70, 70, -43, -43, -87, -87, 9, 9 }, // 78 + { 90, 90, 25, 25, -80, -80, -57, -57 }, // 79 + { 57, 57, 80, 80, -25, -25, -90, -90 }, // 80 + { -9, -9, 87, 87, 43, 43, -70, -70 }, // 81 + + { 57, 57, -80, -80, -25, -25, 90, 90 }, // 82 + { -9, -9, -87, -87, 43, 43, 70, 70 }, // 83 + { -70, -70, -43, -43, 87, 87, 9, 9 }, // 84 + { -90, -90, 25, 25, 80, 80, -57, -57 }, // 85 + + { 43, 43, -90, -90, 57, 57, 25, 25 }, // 86 + { -87, -87, 70, 70, 9, 9, -80, -80 }, // 87 + { 80, 80, -9, -9, -70, -70, 87, 87 }, // 88 + { -25, -25, -57, -57, 90, 90, -43, -43 }, // 89 + + { 25, 25, -70, -70, 90, 90, -80, -80 }, // 90 + { 43, 43, 9, 9, -57, -57, 87, 87 }, // 91 + { -87, -87, 57, 57, -9, -9, -43, -43 }, // 92 + { 80, 80, -90, -90, 70, 70, -25, -25 }, // 93 + + { 9, 9, -25, -25, 43, 43, -57, -57 }, // 94 + { 70, 70, -80, -80, 87, 87, -90, -90 }, // 95 + { 90, 90, -87, -87, 80, 80, -70, -70 }, // 96 + { 57, 57, -43, -43, 25, 25, -9, -9 }, // 97 + +#define MAKE_COEF16(a00, a01, a02, a03, a04, a05, a06, a07, a08, a09, a10, a11, a12, a13, a14, a15) \ + { (a00), -(a00), (a01), -(a01), (a02), -(a02), (a03), -(a03) }, \ + { (a04), -(a04), (a05), -(a05), (a06), -(a06), (a07), -(a07) }, \ + { (a08), -(a08), (a09), -(a09), (a10), -(a10), (a11), -(a11) }, \ + { (a12), -(a12), (a13), -(a13), (a14), -(a14), (a15), -(a15) }, + + MAKE_COEF16(90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4) // 98 + MAKE_COEF16(90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13) //102 + MAKE_COEF16(88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22) //106 + MAKE_COEF16(85, 46, -13, -67, -90, -73, -22, 38, +82, 88, 54, -4, -61, -90, -78, -31) //110 + MAKE_COEF16(82, 22, -54, 
-90, -61, 13, 78, 85, +31, -46, -90, -67, 4, 73, 88, 38) //114 + MAKE_COEF16(78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46) //118 + MAKE_COEF16(73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54) //122 + MAKE_COEF16(67, -54, -78, 38, 85, -22, -90, 4, +90, 13, -88, -31, 82, 46, -73, -61) //126 + MAKE_COEF16(61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67) //130 + MAKE_COEF16(54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73) //134 + MAKE_COEF16(46, -90, 38, 54, -90, 31, 61, -88, +22, 67, -85, 13, 73, -82, 4, 78) //138 + MAKE_COEF16(38, -88, 73, -4, -67, 90, -46, -31, +85, -78, 13, 61, -90, 54, 22, -82) //142 + MAKE_COEF16(31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85) //146 + MAKE_COEF16(22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88) //150 + MAKE_COEF16(13, -38, 61, -78, 88, -90, 85, -73, +54, -31, 4, 22, -46, 67, -82, 90) //154 + MAKE_COEF16(4, -13, 22, -31, 38, -46, 54, -61, +67, -73, 78, -82, 85, -88, 90, -90) //158 + +#undef MAKE_COEF16 +}; + +void dct32(int16_t *src, int32_t *dst, intptr_t stride) +{ + // Const + __m128i c_8 = _mm_set1_epi32(8); + __m128i c_1024 = _mm_set1_epi32(1024); + + int i; + + __m128i T00A, T01A, T02A, T03A, T04A, T05A, T06A, T07A; + __m128i T00B, T01B, T02B, T03B, T04B, T05B, T06B, T07B; + __m128i T00C, T01C, T02C, T03C, T04C, T05C, T06C, T07C; + __m128i T00D, T01D, T02D, T03D, T04D, T05D, T06D, T07D; + __m128i T10A, T11A, T12A, T13A, T14A, T15A, T16A, T17A; + __m128i T10B, T11B, T12B, T13B, T14B, T15B, T16B, T17B; + __m128i T20, T21, T22, T23, T24, T25, T26, T27; + __m128i T30, T31, T32, T33, T34, T35, T36, T37; + __m128i T40, T41, T42, T43, T44, T45, T46, T47; + __m128i T50, T51, T52, T53; + __m128i T60, T61, T62, T63, T64, T65, T66, T67; + __m128i im[32][4]; + + // DCT1 + for (i = 0; i < 32 / 8; i++) + { + T00A = _mm_load_si128((__m128i*)&src[(i * 8 + 0) * stride + 0]); // [07 06 05 04 03 02 01 00] 
+ T00B = _mm_load_si128((__m128i*)&src[(i * 8 + 0) * stride + 8]); // [15 14 13 12 11 10 09 08] + T00C = _mm_load_si128((__m128i*)&src[(i * 8 + 0) * stride + 16]); // [23 22 21 20 19 18 17 16] + T00D = _mm_load_si128((__m128i*)&src[(i * 8 + 0) * stride + 24]); // [31 30 29 28 27 26 25 24] + T01A = _mm_load_si128((__m128i*)&src[(i * 8 + 1) * stride + 0]); + T01B = _mm_load_si128((__m128i*)&src[(i * 8 + 1) * stride + 8]); + T01C = _mm_load_si128((__m128i*)&src[(i * 8 + 1) * stride + 16]); + T01D = _mm_load_si128((__m128i*)&src[(i * 8 + 1) * stride + 24]); + T02A = _mm_load_si128((__m128i*)&src[(i * 8 + 2) * stride + 0]); + T02B = _mm_load_si128((__m128i*)&src[(i * 8 + 2) * stride + 8]); + T02C = _mm_load_si128((__m128i*)&src[(i * 8 + 2) * stride + 16]); + T02D = _mm_load_si128((__m128i*)&src[(i * 8 + 2) * stride + 24]); + T03A = _mm_load_si128((__m128i*)&src[(i * 8 + 3) * stride + 0]); + T03B = _mm_load_si128((__m128i*)&src[(i * 8 + 3) * stride + 8]); + T03C = _mm_load_si128((__m128i*)&src[(i * 8 + 3) * stride + 16]); + T03D = _mm_load_si128((__m128i*)&src[(i * 8 + 3) * stride + 24]); + T04A = _mm_load_si128((__m128i*)&src[(i * 8 + 4) * stride + 0]); + T04B = _mm_load_si128((__m128i*)&src[(i * 8 + 4) * stride + 8]); + T04C = _mm_load_si128((__m128i*)&src[(i * 8 + 4) * stride + 16]); + T04D = _mm_load_si128((__m128i*)&src[(i * 8 + 4) * stride + 24]); + T05A = _mm_load_si128((__m128i*)&src[(i * 8 + 5) * stride + 0]); + T05B = _mm_load_si128((__m128i*)&src[(i * 8 + 5) * stride + 8]); + T05C = _mm_load_si128((__m128i*)&src[(i * 8 + 5) * stride + 16]); + T05D = _mm_load_si128((__m128i*)&src[(i * 8 + 5) * stride + 24]); + T06A = _mm_load_si128((__m128i*)&src[(i * 8 + 6) * stride + 0]); + T06B = _mm_load_si128((__m128i*)&src[(i * 8 + 6) * stride + 8]); + T06C = _mm_load_si128((__m128i*)&src[(i * 8 + 6) * stride + 16]); + T06D = _mm_load_si128((__m128i*)&src[(i * 8 + 6) * stride + 24]); + T07A = _mm_load_si128((__m128i*)&src[(i * 8 + 7) * stride + 0]); + T07B = 
_mm_load_si128((__m128i*)&src[(i * 8 + 7) * stride + 8]); + T07C = _mm_load_si128((__m128i*)&src[(i * 8 + 7) * stride + 16]); + T07D = _mm_load_si128((__m128i*)&src[(i * 8 + 7) * stride + 24]); + + T00A = _mm_shuffle_epi8(T00A, _mm_load_si128((__m128i*)tab_dct_16_0[1])); // [05 02 06 01 04 03 07 00] + T00B = _mm_shuffle_epi8(T00B, _mm_load_si128((__m128i*)tab_dct_32_0[0])); // [10 13 09 14 11 12 08 15] + T00C = _mm_shuffle_epi8(T00C, _mm_load_si128((__m128i*)tab_dct_16_0[1])); // [21 18 22 17 20 19 23 16] + T00D = _mm_shuffle_epi8(T00D, _mm_load_si128((__m128i*)tab_dct_32_0[0])); // [26 29 25 30 27 28 24 31] + T01A = _mm_shuffle_epi8(T01A, _mm_load_si128((__m128i*)tab_dct_16_0[1])); + T01B = _mm_shuffle_epi8(T01B, _mm_load_si128((__m128i*)tab_dct_32_0[0])); + T01C = _mm_shuffle_epi8(T01C, _mm_load_si128((__m128i*)tab_dct_16_0[1])); + T01D = _mm_shuffle_epi8(T01D, _mm_load_si128((__m128i*)tab_dct_32_0[0])); + T02A = _mm_shuffle_epi8(T02A, _mm_load_si128((__m128i*)tab_dct_16_0[1])); + T02B = _mm_shuffle_epi8(T02B, _mm_load_si128((__m128i*)tab_dct_32_0[0])); + T02C = _mm_shuffle_epi8(T02C, _mm_load_si128((__m128i*)tab_dct_16_0[1])); + T02D = _mm_shuffle_epi8(T02D, _mm_load_si128((__m128i*)tab_dct_32_0[0])); + T03A = _mm_shuffle_epi8(T03A, _mm_load_si128((__m128i*)tab_dct_16_0[1])); + T03B = _mm_shuffle_epi8(T03B, _mm_load_si128((__m128i*)tab_dct_32_0[0])); + T03C = _mm_shuffle_epi8(T03C, _mm_load_si128((__m128i*)tab_dct_16_0[1])); + T03D = _mm_shuffle_epi8(T03D, _mm_load_si128((__m128i*)tab_dct_32_0[0])); + T04A = _mm_shuffle_epi8(T04A, _mm_load_si128((__m128i*)tab_dct_16_0[1])); + T04B = _mm_shuffle_epi8(T04B, _mm_load_si128((__m128i*)tab_dct_32_0[0])); + T04C = _mm_shuffle_epi8(T04C, _mm_load_si128((__m128i*)tab_dct_16_0[1])); + T04D = _mm_shuffle_epi8(T04D, _mm_load_si128((__m128i*)tab_dct_32_0[0])); + T05A = _mm_shuffle_epi8(T05A, _mm_load_si128((__m128i*)tab_dct_16_0[1])); + T05B = _mm_shuffle_epi8(T05B, _mm_load_si128((__m128i*)tab_dct_32_0[0])); + T05C = 
_mm_shuffle_epi8(T05C, _mm_load_si128((__m128i*)tab_dct_16_0[1])); + T05D = _mm_shuffle_epi8(T05D, _mm_load_si128((__m128i*)tab_dct_32_0[0])); + T06A = _mm_shuffle_epi8(T06A, _mm_load_si128((__m128i*)tab_dct_16_0[1])); + T06B = _mm_shuffle_epi8(T06B, _mm_load_si128((__m128i*)tab_dct_32_0[0])); + T06C = _mm_shuffle_epi8(T06C, _mm_load_si128((__m128i*)tab_dct_16_0[1])); + T06D = _mm_shuffle_epi8(T06D, _mm_load_si128((__m128i*)tab_dct_32_0[0])); + T07A = _mm_shuffle_epi8(T07A, _mm_load_si128((__m128i*)tab_dct_16_0[1])); + T07B = _mm_shuffle_epi8(T07B, _mm_load_si128((__m128i*)tab_dct_32_0[0])); + T07C = _mm_shuffle_epi8(T07C, _mm_load_si128((__m128i*)tab_dct_16_0[1])); + T07D = _mm_shuffle_epi8(T07D, _mm_load_si128((__m128i*)tab_dct_32_0[0])); + + T10A = _mm_add_epi16(T00A, T00D); // [E05 E02 E06 E01 E04 E03 E07 E00] + T10B = _mm_add_epi16(T00B, T00C); // [E10 E13 E09 E14 E11 E12 E08 E15] + T11A = _mm_add_epi16(T01A, T01D); + T11B = _mm_add_epi16(T01B, T01C); + T12A = _mm_add_epi16(T02A, T02D); + T12B = _mm_add_epi16(T02B, T02C); + T13A = _mm_add_epi16(T03A, T03D); + T13B = _mm_add_epi16(T03B, T03C); + T14A = _mm_add_epi16(T04A, T04D); + T14B = _mm_add_epi16(T04B, T04C); + T15A = _mm_add_epi16(T05A, T05D); + T15B = _mm_add_epi16(T05B, T05C); + T16A = _mm_add_epi16(T06A, T06D); + T16B = _mm_add_epi16(T06B, T06C); + T17A = _mm_add_epi16(T07A, T07D); + T17B = _mm_add_epi16(T07B, T07C); + + T00A = _mm_sub_epi16(T00A, T00D); // [O05 O02 O06 O01 O04 O03 O07 O00] + T00B = _mm_sub_epi16(T00B, T00C); // [O10 O13 O09 O14 O11 O12 O08 O15] + T01A = _mm_sub_epi16(T01A, T01D); + T01B = _mm_sub_epi16(T01B, T01C); + T02A = _mm_sub_epi16(T02A, T02D); + T02B = _mm_sub_epi16(T02B, T02C); + T03A = _mm_sub_epi16(T03A, T03D); + T03B = _mm_sub_epi16(T03B, T03C); + T04A = _mm_sub_epi16(T04A, T04D); + T04B = _mm_sub_epi16(T04B, T04C); + T05A = _mm_sub_epi16(T05A, T05D); + T05B = _mm_sub_epi16(T05B, T05C); + T06A = _mm_sub_epi16(T06A, T06D); + T06B = _mm_sub_epi16(T06B, T06C); + T07A = 
_mm_sub_epi16(T07A, T07D); + T07B = _mm_sub_epi16(T07B, T07C); + + T20 = _mm_add_epi16(T10A, T10B); // [EE5 EE2 EE6 EE1 EE4 EE3 EE7 EE0] + T21 = _mm_add_epi16(T11A, T11B); + T22 = _mm_add_epi16(T12A, T12B); + T23 = _mm_add_epi16(T13A, T13B); + T24 = _mm_add_epi16(T14A, T14B); + T25 = _mm_add_epi16(T15A, T15B); + T26 = _mm_add_epi16(T16A, T16B); + T27 = _mm_add_epi16(T17A, T17B); + + T30 = _mm_madd_epi16(T20, _mm_load_si128((__m128i*)tab_dct_8[1])); + T31 = _mm_madd_epi16(T21, _mm_load_si128((__m128i*)tab_dct_8[1])); + T32 = _mm_madd_epi16(T22, _mm_load_si128((__m128i*)tab_dct_8[1])); + T33 = _mm_madd_epi16(T23, _mm_load_si128((__m128i*)tab_dct_8[1])); + T34 = _mm_madd_epi16(T24, _mm_load_si128((__m128i*)tab_dct_8[1])); + T35 = _mm_madd_epi16(T25, _mm_load_si128((__m128i*)tab_dct_8[1])); + T36 = _mm_madd_epi16(T26, _mm_load_si128((__m128i*)tab_dct_8[1])); + T37 = _mm_madd_epi16(T27, _mm_load_si128((__m128i*)tab_dct_8[1])); + + T40 = _mm_hadd_epi32(T30, T31); + T41 = _mm_hadd_epi32(T32, T33); + T42 = _mm_hadd_epi32(T34, T35); + T43 = _mm_hadd_epi32(T36, T37); + + T50 = _mm_hadd_epi32(T40, T41); + T51 = _mm_hadd_epi32(T42, T43); + T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), 4); + T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), 4); + T60 = _mm_packs_epi32(T50, T51); + im[0][i] = T60; + + T50 = _mm_hsub_epi32(T40, T41); + T51 = _mm_hsub_epi32(T42, T43); + T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), 4); + T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), 4); + T60 = _mm_packs_epi32(T50, T51); + im[16][i] = T60; + + T30 = _mm_madd_epi16(T20, _mm_load_si128((__m128i*)tab_dct_16_1[8])); + T31 = _mm_madd_epi16(T21, _mm_load_si128((__m128i*)tab_dct_16_1[8])); + T32 = _mm_madd_epi16(T22, _mm_load_si128((__m128i*)tab_dct_16_1[8])); + T33 = _mm_madd_epi16(T23, _mm_load_si128((__m128i*)tab_dct_16_1[8])); + T34 = _mm_madd_epi16(T24, _mm_load_si128((__m128i*)tab_dct_16_1[8])); + T35 = _mm_madd_epi16(T25, _mm_load_si128((__m128i*)tab_dct_16_1[8])); + T36 = _mm_madd_epi16(T26, 
_mm_load_si128((__m128i*)tab_dct_16_1[8])); + T37 = _mm_madd_epi16(T27, _mm_load_si128((__m128i*)tab_dct_16_1[8])); + + T40 = _mm_hadd_epi32(T30, T31); + T41 = _mm_hadd_epi32(T32, T33); + T42 = _mm_hadd_epi32(T34, T35); + T43 = _mm_hadd_epi32(T36, T37); + + T50 = _mm_hadd_epi32(T40, T41); + T51 = _mm_hadd_epi32(T42, T43); + T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), 4); + T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), 4); + T60 = _mm_packs_epi32(T50, T51); + im[8][i] = T60; + + T30 = _mm_madd_epi16(T20, _mm_load_si128((__m128i*)tab_dct_16_1[9])); + T31 = _mm_madd_epi16(T21, _mm_load_si128((__m128i*)tab_dct_16_1[9])); + T32 = _mm_madd_epi16(T22, _mm_load_si128((__m128i*)tab_dct_16_1[9])); + T33 = _mm_madd_epi16(T23, _mm_load_si128((__m128i*)tab_dct_16_1[9])); + T34 = _mm_madd_epi16(T24, _mm_load_si128((__m128i*)tab_dct_16_1[9])); + T35 = _mm_madd_epi16(T25, _mm_load_si128((__m128i*)tab_dct_16_1[9])); + T36 = _mm_madd_epi16(T26, _mm_load_si128((__m128i*)tab_dct_16_1[9])); + T37 = _mm_madd_epi16(T27, _mm_load_si128((__m128i*)tab_dct_16_1[9])); + + T40 = _mm_hadd_epi32(T30, T31); + T41 = _mm_hadd_epi32(T32, T33); + T42 = _mm_hadd_epi32(T34, T35); + T43 = _mm_hadd_epi32(T36, T37); + + T50 = _mm_hadd_epi32(T40, T41); + T51 = _mm_hadd_epi32(T42, T43); + T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), 4); + T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), 4); + T60 = _mm_packs_epi32(T50, T51); + im[24][i] = T60; + +#define MAKE_ODD(tab, dstPos) \ + T30 = _mm_madd_epi16(T20, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \ + T31 = _mm_madd_epi16(T21, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \ + T32 = _mm_madd_epi16(T22, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \ + T33 = _mm_madd_epi16(T23, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \ + T34 = _mm_madd_epi16(T24, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \ + T35 = _mm_madd_epi16(T25, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \ + T36 = _mm_madd_epi16(T26, 
_mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \ + T37 = _mm_madd_epi16(T27, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \ + \ + T40 = _mm_hadd_epi32(T30, T31); \ + T41 = _mm_hadd_epi32(T32, T33); \ + T42 = _mm_hadd_epi32(T34, T35); \ + T43 = _mm_hadd_epi32(T36, T37); \ + \ + T50 = _mm_hadd_epi32(T40, T41); \ + T51 = _mm_hadd_epi32(T42, T43); \ + T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), 4); \ + T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), 4); \ + T60 = _mm_packs_epi32(T50, T51); \ + im[(dstPos)][i] = T60; + + MAKE_ODD(0, 4); + MAKE_ODD(1, 12); + MAKE_ODD(2, 20); + MAKE_ODD(3, 28); + + T20 = _mm_sub_epi16(T10A, T10B); // [EO5 EO2 EO6 EO1 EO4 EO3 EO7 EO0] + T21 = _mm_sub_epi16(T11A, T11B); + T22 = _mm_sub_epi16(T12A, T12B); + T23 = _mm_sub_epi16(T13A, T13B); + T24 = _mm_sub_epi16(T14A, T14B); + T25 = _mm_sub_epi16(T15A, T15B); + T26 = _mm_sub_epi16(T16A, T16B); + T27 = _mm_sub_epi16(T17A, T17B); + + MAKE_ODD(4, 2); + MAKE_ODD(5, 6); + MAKE_ODD(6, 10); + MAKE_ODD(7, 14); + MAKE_ODD(8, 18); + MAKE_ODD(9, 22); + MAKE_ODD(10, 26); + MAKE_ODD(11, 30); +#undef MAKE_ODD + +#define MAKE_ODD(tab, dstPos) \ + T20 = _mm_madd_epi16(T00A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \ + T21 = _mm_madd_epi16(T00B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1])); \ + T22 = _mm_madd_epi16(T01A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \ + T23 = _mm_madd_epi16(T01B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1])); \ + T24 = _mm_madd_epi16(T02A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \ + T25 = _mm_madd_epi16(T02B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1])); \ + T26 = _mm_madd_epi16(T03A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \ + T27 = _mm_madd_epi16(T03B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1])); \ + T30 = _mm_madd_epi16(T04A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \ + T31 = _mm_madd_epi16(T04B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1])); \ + T32 = _mm_madd_epi16(T05A, 
_mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \ + T33 = _mm_madd_epi16(T05B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1])); \ + T34 = _mm_madd_epi16(T06A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \ + T35 = _mm_madd_epi16(T06B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1])); \ + T36 = _mm_madd_epi16(T07A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \ + T37 = _mm_madd_epi16(T07B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1])); \ + \ + T40 = _mm_hadd_epi32(T20, T21); \ + T41 = _mm_hadd_epi32(T22, T23); \ + T42 = _mm_hadd_epi32(T24, T25); \ + T43 = _mm_hadd_epi32(T26, T27); \ + T44 = _mm_hadd_epi32(T30, T31); \ + T45 = _mm_hadd_epi32(T32, T33); \ + T46 = _mm_hadd_epi32(T34, T35); \ + T47 = _mm_hadd_epi32(T36, T37); \ + \ + T50 = _mm_hadd_epi32(T40, T41); \ + T51 = _mm_hadd_epi32(T42, T43); \ + T52 = _mm_hadd_epi32(T44, T45); \ + T53 = _mm_hadd_epi32(T46, T47); \ + \ + T50 = _mm_hadd_epi32(T50, T51); \ + T51 = _mm_hadd_epi32(T52, T53); \ + T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), 4); \ + T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), 4); \ + T60 = _mm_packs_epi32(T50, T51); \ + im[(dstPos)][i] = T60; + + MAKE_ODD(12, 1); + MAKE_ODD(14, 3); + MAKE_ODD(16, 5); + MAKE_ODD(18, 7); + MAKE_ODD(20, 9); + MAKE_ODD(22, 11); + MAKE_ODD(24, 13); + MAKE_ODD(26, 15); + MAKE_ODD(28, 17); + MAKE_ODD(30, 19); + MAKE_ODD(32, 21); + MAKE_ODD(34, 23); + MAKE_ODD(36, 25); + MAKE_ODD(38, 27); + MAKE_ODD(40, 29); + MAKE_ODD(42, 31); + +#undef MAKE_ODD + } + + // DCT2 + for (i = 0; i < 32 / 4; i++) + { + // OPT_ME: to avoid register spill, I use matrix multiply, have other way? 
+ T00A = im[i * 4 + 0][0]; // [07 06 05 04 03 02 01 00] + T00B = im[i * 4 + 0][1]; // [15 14 13 12 11 10 09 08] + T00C = im[i * 4 + 0][2]; // [23 22 21 20 19 18 17 16] + T00D = im[i * 4 + 0][3]; // [31 30 29 28 27 26 25 24] + T01A = im[i * 4 + 1][0]; + T01B = im[i * 4 + 1][1]; + T01C = im[i * 4 + 1][2]; + T01D = im[i * 4 + 1][3]; + T02A = im[i * 4 + 2][0]; + T02B = im[i * 4 + 2][1]; + T02C = im[i * 4 + 2][2]; + T02D = im[i * 4 + 2][3]; + T03A = im[i * 4 + 3][0]; + T03B = im[i * 4 + 3][1]; + T03C = im[i * 4 + 3][2]; + T03D = im[i * 4 + 3][3]; + + T00C = _mm_shuffle_epi8(T00C, _mm_load_si128((__m128i*)tab_dct_16_0[0])); // [16 17 18 19 20 21 22 23] + T00D = _mm_shuffle_epi8(T00D, _mm_load_si128((__m128i*)tab_dct_16_0[0])); // [24 25 26 27 28 29 30 31] + T01C = _mm_shuffle_epi8(T01C, _mm_load_si128((__m128i*)tab_dct_16_0[0])); + T01D = _mm_shuffle_epi8(T01D, _mm_load_si128((__m128i*)tab_dct_16_0[0])); + T02C = _mm_shuffle_epi8(T02C, _mm_load_si128((__m128i*)tab_dct_16_0[0])); + T02D = _mm_shuffle_epi8(T02D, _mm_load_si128((__m128i*)tab_dct_16_0[0])); + T03C = _mm_shuffle_epi8(T03C, _mm_load_si128((__m128i*)tab_dct_16_0[0])); + T03D = _mm_shuffle_epi8(T03D, _mm_load_si128((__m128i*)tab_dct_16_0[0])); + + T10A = _mm_unpacklo_epi16(T00A, T00D); // [28 03 29 02 30 01 31 00] + T10B = _mm_unpackhi_epi16(T00A, T00D); // [24 07 25 06 26 05 27 04] + T00A = _mm_unpacklo_epi16(T00B, T00C); // [20 11 21 10 22 09 23 08] + T00B = _mm_unpackhi_epi16(T00B, T00C); // [16 15 17 14 18 13 19 12] + T11A = _mm_unpacklo_epi16(T01A, T01D); + T11B = _mm_unpackhi_epi16(T01A, T01D); + T01A = _mm_unpacklo_epi16(T01B, T01C); + T01B = _mm_unpackhi_epi16(T01B, T01C); + T12A = _mm_unpacklo_epi16(T02A, T02D); + T12B = _mm_unpackhi_epi16(T02A, T02D); + T02A = _mm_unpacklo_epi16(T02B, T02C); + T02B = _mm_unpackhi_epi16(T02B, T02C); + T13A = _mm_unpacklo_epi16(T03A, T03D); + T13B = _mm_unpackhi_epi16(T03A, T03D); + T03A = _mm_unpacklo_epi16(T03B, T03C); + T03B = _mm_unpackhi_epi16(T03B, T03C); + 
+#define MAKE_ODD(tab0, tab1, tab2, tab3, dstPos) \ + T20 = _mm_madd_epi16(T10A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab0)])); \ + T21 = _mm_madd_epi16(T10B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab1)])); \ + T22 = _mm_madd_epi16(T00A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab2)])); \ + T23 = _mm_madd_epi16(T00B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab3)])); \ + T24 = _mm_madd_epi16(T11A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab0)])); \ + T25 = _mm_madd_epi16(T11B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab1)])); \ + T26 = _mm_madd_epi16(T01A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab2)])); \ + T27 = _mm_madd_epi16(T01B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab3)])); \ + T30 = _mm_madd_epi16(T12A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab0)])); \ + T31 = _mm_madd_epi16(T12B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab1)])); \ + T32 = _mm_madd_epi16(T02A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab2)])); \ + T33 = _mm_madd_epi16(T02B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab3)])); \ + T34 = _mm_madd_epi16(T13A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab0)])); \ + T35 = _mm_madd_epi16(T13B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab1)])); \ + T36 = _mm_madd_epi16(T03A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab2)])); \ + T37 = _mm_madd_epi16(T03B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab3)])); \ + \ + T60 = _mm_hadd_epi32(T20, T21); \ + T61 = _mm_hadd_epi32(T22, T23); \ + T62 = _mm_hadd_epi32(T24, T25); \ + T63 = _mm_hadd_epi32(T26, T27); \ + T64 = _mm_hadd_epi32(T30, T31); \ + T65 = _mm_hadd_epi32(T32, T33); \ + T66 = _mm_hadd_epi32(T34, T35); \ + T67 = _mm_hadd_epi32(T36, T37); \ + \ + T60 = _mm_hadd_epi32(T60, T61); \ + T61 = _mm_hadd_epi32(T62, T63); \ + T62 = _mm_hadd_epi32(T64, T65); \ + T63 = _mm_hadd_epi32(T66, T67); \ + \ + T60 = _mm_hadd_epi32(T60, T61); \ + T61 = _mm_hadd_epi32(T62, T63); \ + \ + T60 = _mm_hadd_epi32(T60, T61); \ + \ + T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_1024), 11); \ + _mm_storeu_si128((__m128i*)&dst[(dstPos) * 32 + 
(i * 4) + 0], T60); \ + + MAKE_ODD(44, 44, 44, 44, 0); + MAKE_ODD(45, 45, 45, 45, 16); + MAKE_ODD(46, 47, 46, 47, 8); + MAKE_ODD(48, 49, 48, 49, 24); + + MAKE_ODD(50, 51, 52, 53, 4); + MAKE_ODD(54, 55, 56, 57, 12); + MAKE_ODD(58, 59, 60, 61, 20); + MAKE_ODD(62, 63, 64, 65, 28); + + MAKE_ODD(66, 67, 68, 69, 2); + MAKE_ODD(70, 71, 72, 73, 6); + MAKE_ODD(74, 75, 76, 77, 10); + MAKE_ODD(78, 79, 80, 81, 14); + + MAKE_ODD(82, 83, 84, 85, 18); + MAKE_ODD(86, 87, 88, 89, 22); + MAKE_ODD(90, 91, 92, 93, 26); + MAKE_ODD(94, 95, 96, 97, 30); + + MAKE_ODD(98, 99, 100, 101, 1); + MAKE_ODD(102, 103, 104, 105, 3); + MAKE_ODD(106, 107, 108, 109, 5); + MAKE_ODD(110, 111, 112, 113, 7); + MAKE_ODD(114, 115, 116, 117, 9); + MAKE_ODD(118, 119, 120, 121, 11); + MAKE_ODD(122, 123, 124, 125, 13); + MAKE_ODD(126, 127, 128, 129, 15); + MAKE_ODD(130, 131, 132, 133, 17); + MAKE_ODD(134, 135, 136, 137, 19); + MAKE_ODD(138, 139, 140, 141, 21); + MAKE_ODD(142, 143, 144, 145, 23); + MAKE_ODD(146, 147, 148, 149, 25); + MAKE_ODD(150, 151, 152, 153, 27); + MAKE_ODD(154, 155, 156, 157, 29); + MAKE_ODD(158, 159, 160, 161, 31); +#undef MAKE_ODD + } +} +} +#endif // if !HIGH_BIT_DEPTH + +namespace x265 { +void Setup_Vec_DCTPrimitives_ssse3(EncoderPrimitives &p) +{ + /* Note: We have AVX2 assembly for these two functions, but since AVX2 is + * still somewhat rare on end-user PCs we still compile and link these SSSE3 + * intrinsic SIMD functions */ +#if !HIGH_BIT_DEPTH + p.dct[DCT_16x16] = dct16; + p.dct[DCT_32x32] = dct32; +#endif +} +} diff --git a/source/common/vec/vec-primitives.cpp b/source/common/vec/vec-primitives.cpp new file mode 100644 index 0000000..c5d5405 --- /dev/null +++ b/source/common/vec/vec-primitives.cpp @@ -0,0 +1,84 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General 
Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#include "primitives.h" +#include "x265.h" + +/* The #if logic here must match the file lists in CMakeLists.txt */ +#if X265_ARCH_X86 +#if defined(__INTEL_COMPILER) +#define HAVE_SSE3 +#define HAVE_SSSE3 +#define HAVE_SSE4 +#define HAVE_AVX2 +#elif defined(__GNUC__) +#if __clang__ || (__GNUC__ >= 4 && __GNUC_MINOR__ >= 3) +#define HAVE_SSE3 +#define HAVE_SSSE3 +#define HAVE_SSE4 +#endif +#if __clang__ || (__GNUC__ >= 4 && __GNUC_MINOR__ >= 7) +#define HAVE_AVX2 +#endif +#elif defined(_MSC_VER) +#define HAVE_SSE3 +#define HAVE_SSSE3 +#define HAVE_SSE4 +#if _MSC_VER >= 1700 // VC11 +#define HAVE_AVX2 +#endif +#endif // compiler checks +#endif // if X265_ARCH_X86 + +namespace x265 { +// private x265 namespace + +void Setup_Vec_DCTPrimitives_sse3(EncoderPrimitives&); +void Setup_Vec_DCTPrimitives_ssse3(EncoderPrimitives&); +void Setup_Vec_DCTPrimitives_sse41(EncoderPrimitives&); + +/* Use primitives for the best available vector architecture */ +void Setup_Instrinsic_Primitives(EncoderPrimitives &p, int cpuMask) +{ +#ifdef HAVE_SSE3 + if (cpuMask & X265_CPU_SSE3) + { + Setup_Vec_DCTPrimitives_sse3(p); + } +#endif +#ifdef HAVE_SSSE3 + if (cpuMask & 
X265_CPU_SSSE3) + { + Setup_Vec_DCTPrimitives_ssse3(p); + } +#endif +#ifdef HAVE_SSE4 + if (cpuMask & X265_CPU_SSE4) + { + Setup_Vec_DCTPrimitives_sse41(p); + } +#endif + (void)p; + (void)cpuMask; +} +} diff --git a/source/common/version.cpp b/source/common/version.cpp new file mode 100644 index 0000000..d377b85 --- /dev/null +++ b/source/common/version.cpp @@ -0,0 +1,93 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#include "x265.h" +#include "common.h" + +#define XSTR(x) STR(x) +#define STR(x) #x + +#if defined(__clang__) +#define NVM_COMPILEDBY "[clang " XSTR(__clang_major__) "." XSTR(__clang_minor__) "." XSTR(__clang_patchlevel__) "]" +#ifdef __IA64__ +#define NVM_ONARCH "[on 64-bit] " +#else +#define NVM_ONARCH "[on 32-bit] " +#endif +#endif + +#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__) +#define NVM_COMPILEDBY "[GCC " XSTR(__GNUC__) "." XSTR(__GNUC_MINOR__) "." 
XSTR(__GNUC_PATCHLEVEL__) "]" +#ifdef __IA64__ +#define NVM_ONARCH "[on 64-bit] " +#else +#define NVM_ONARCH "[on 32-bit] " +#endif +#endif + +#ifdef __INTEL_COMPILER +#define NVM_COMPILEDBY "[ICC " XSTR(__INTEL_COMPILER) "]" +#elif _MSC_VER +#define NVM_COMPILEDBY "[MSVC " XSTR(_MSC_VER) "]" +#endif + +#ifndef NVM_COMPILEDBY +#define NVM_COMPILEDBY "[Unk-CXX]" +#endif + +#ifdef _WIN32 +#define NVM_ONOS "[Windows]" +#elif __linux +#define NVM_ONOS "[Linux]" +#elif __OpenBSD__ +#define NVM_ONOS "[OpenBSD]" +#elif __CYGWIN__ +#define NVM_ONOS "[Cygwin]" +#elif __APPLE__ +#define NVM_ONOS "[Mac OS X]" +#else +#define NVM_ONOS "[Unk-OS]" +#endif + +#if X86_64 +#define NVM_BITS "[64 bit]" +#else +#define NVM_BITS "[32 bit]" +#endif + +#if CHECKED_BUILD +#define CHECKED "[CHECKED] " +#else +#define CHECKED " " +#endif + +#if HIGH_BIT_DEPTH +#define BITDEPTH "16bpp" +const int x265_max_bit_depth = 10; +#else +#define BITDEPTH "8bpp" +const int x265_max_bit_depth = 8; +#endif + +const char *x265_version_str = XSTR(X265_VERSION); +const char *x265_build_info_str = NVM_ONOS NVM_COMPILEDBY NVM_BITS CHECKED BITDEPTH; diff --git a/source/common/wavefront.cpp b/source/common/wavefront.cpp new file mode 100644 index 0000000..17c44aa --- /dev/null +++ b/source/common/wavefront.cpp @@ -0,0 +1,139 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com + *****************************************************************************/ + +#include "threadpool.h" +#include "threading.h" +#include "wavefront.h" +#include "common.h" + +namespace x265 { +// x265 private namespace + +bool WaveFront::init(int numRows) +{ + m_numRows = numRows; + + m_numWords = (numRows + 63) >> 6; + m_internalDependencyBitmap = X265_MALLOC(uint64_t, m_numWords); + if (m_internalDependencyBitmap) + memset((void*)m_internalDependencyBitmap, 0, sizeof(uint64_t) * m_numWords); + + m_externalDependencyBitmap = X265_MALLOC(uint64_t, m_numWords); + if (m_externalDependencyBitmap) + memset((void*)m_externalDependencyBitmap, 0, sizeof(uint64_t) * m_numWords); + + return m_internalDependencyBitmap && m_externalDependencyBitmap; +} + +WaveFront::~WaveFront() +{ + x265_free((void*)m_internalDependencyBitmap); + x265_free((void*)m_externalDependencyBitmap); +} + +void WaveFront::clearEnabledRowMask() +{ + memset((void*)m_externalDependencyBitmap, 0, sizeof(uint64_t) * m_numWords); +} + +void WaveFront::enqueueRow(int row) +{ + // thread safe + uint64_t bit = 1LL << (row & 63); + + X265_CHECK(row < m_numRows, "invalid row\n"); + ATOMIC_OR(&m_internalDependencyBitmap[row >> 6], bit); + if (m_pool) m_pool->pokeIdleThread(); +} + +void WaveFront::enableRow(int row) +{ + // thread safe + uint64_t bit = 1LL << (row & 63); + + X265_CHECK(row < m_numRows, "invalid row\n"); + ATOMIC_OR(&m_externalDependencyBitmap[row >> 6], bit); +} + +void WaveFront::enableAllRows() +{ + memset((void*)m_externalDependencyBitmap, ~0, sizeof(uint64_t) * m_numWords); +} + +bool WaveFront::checkHigherPriorityRow(int curRow) +{ + int 
fullwords = curRow >> 6; + uint64_t mask = (1LL << (curRow & 63)) - 1; + + // Check full bitmap words before curRow + for (int i = 0; i < fullwords; i++) + { + if (m_internalDependencyBitmap[i] & m_externalDependencyBitmap[i]) + return true; + } + + // check the partially masked bitmap word of curRow + if (m_internalDependencyBitmap[fullwords] & m_externalDependencyBitmap[fullwords] & mask) + return true; + return false; +} + +bool WaveFront::dequeueRow(int row) +{ + uint64_t oldval, newval; + + oldval = m_internalDependencyBitmap[row >> 6]; + newval = oldval & ~(1LL << (row & 63)); + return ATOMIC_CAS(&m_internalDependencyBitmap[row >> 6], oldval, newval) == oldval; +} + +bool WaveFront::findJob(int threadId) +{ + unsigned long id; + + // thread safe + for (int w = 0; w < m_numWords; w++) + { + uint64_t oldval = m_internalDependencyBitmap[w]; + while (oldval & m_externalDependencyBitmap[w]) + { + uint64_t mask = oldval & m_externalDependencyBitmap[w]; + + CTZ64(id, mask); + + uint64_t newval = oldval & ~(1LL << id); + if (ATOMIC_CAS(&m_internalDependencyBitmap[w], oldval, newval) == oldval) + { + // we cleared the bit, process row + processRow(w * 64 + id, threadId); + return true; + } + // some other thread cleared the bit, try another bit + oldval = m_internalDependencyBitmap[w]; + } + } + + // made it through the bitmap without finding any enqueued rows + return false; +} +} diff --git a/source/common/wavefront.h b/source/common/wavefront.h new file mode 100644 index 0000000..a34b9a4 --- /dev/null +++ b/source/common/wavefront.h @@ -0,0 +1,102 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com + *****************************************************************************/ + +#ifndef X265_WAVEFRONT_H +#define X265_WAVEFRONT_H + +#include "common.h" +#include "threadpool.h" + +namespace x265 { +// x265 private namespace + +// Generic wave-front scheduler, manages busy-state of CU rows as a priority +// queue (higher CU rows have priority over lower rows) +// +// Derived classes must implement ProcessRow(). +class WaveFront : public JobProvider +{ +private: + + // bitmaps of rows queued for processing, uses atomic intrinsics + + // Dependencies are categorized as internal and external. Internal dependencies + // are caused by neighbor block availability. External dependencies are generally + // reference frame reconstructed pixels being available. + uint64_t volatile *m_internalDependencyBitmap; + uint64_t volatile *m_externalDependencyBitmap; + + // number of words in the bitmap + int m_numWords; + + int m_numRows; + +public: + + WaveFront(ThreadPool *pool) + : JobProvider(pool) + , m_internalDependencyBitmap(0) + , m_externalDependencyBitmap(0) + {} + + virtual ~WaveFront(); + + // If returns false, the frame must be encoded in series. + bool init(int numRows); + + // Enqueue a row to be processed (mark its internal dependencies as resolved). + // A worker thread will later call processRow(row). 
+ // This provider must be enqueued in the pool before enqueuing a row + void enqueueRow(int row); + + // Mark a row as no longer having internal dependencies resolved. Returns + // true if bit clear was successful, false otherwise. + bool dequeueRow(int row); + + // Mark the row's external dependencies as being resolved + void enableRow(int row); + + // Mark all row external dependencies as being resolved. Some wavefront + // implementations (lookahead, for instance) have no recon pixel dependencies. + void enableAllRows(); + + // Mark all rows as having external dependencies which must be + // resolved before each row may proceed. + void clearEnabledRowMask(); + + // WaveFront's implementation of JobProvider::findJob. Consults + // m_queuedBitmap and calls ProcessRow(row) for lowest numbered queued row + // or returns false + bool findJob(int threadId); + + // Start or resume encode processing of this row, must be implemented by + // derived classes. + virtual void processRow(int row, int threadId) = 0; + + // Returns true if a row above curRow is available for processing. The processRow() + // method may call this function periodically and voluntarily exit + bool checkHigherPriorityRow(int curRow); +}; +} // end namespace x265 + +#endif // ifndef X265_WAVEFRONT_H diff --git a/source/common/winxp.cpp b/source/common/winxp.cpp new file mode 100644 index 0000000..50951c0 --- /dev/null +++ b/source/common/winxp.cpp @@ -0,0 +1,130 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com + *****************************************************************************/ + +#include "threading.h" + +#if defined(_WIN32) && (_WIN32_WINNT < 0x0600) // _WIN32_WINNT_VISTA + +namespace x265 { +/* Mimic CONDITION_VARIABLE functions only supported on Vista+ */ + +int WINAPI cond_init(ConditionVariable *cond) +{ // InitializeConditionVariable + cond->semaphore = CreateSemaphore(NULL, 0, 0x7fffffff, NULL); + if (!cond->semaphore) + return -1; + cond->waitersDone = CreateEvent(NULL, FALSE, FALSE, NULL); + if (!cond->waitersDone) + return -1; + + InitializeCriticalSection(&cond->waiterCountMutex); + InitializeCriticalSection(&cond->broadcastMutex); + cond->waiterCount = 0; + cond->bIsBroadcast = false; + + return 0; +} + +void WINAPI cond_broadcast(ConditionVariable *cond) +{ // WakeAllConditionVariable + EnterCriticalSection(&cond->broadcastMutex); + EnterCriticalSection(&cond->waiterCountMutex); + int haveWaiter = 0; + + if (cond->waiterCount) + { + cond->bIsBroadcast = 1; + haveWaiter = 1; + } + + if (haveWaiter) + { + ReleaseSemaphore(cond->semaphore, cond->waiterCount, NULL); + LeaveCriticalSection(&cond->waiterCountMutex); + WaitForSingleObject(cond->waitersDone, INFINITE); + cond->bIsBroadcast = 0; + } + else + LeaveCriticalSection(&cond->waiterCountMutex); + + LeaveCriticalSection(&cond->broadcastMutex); +} + +void WINAPI cond_signal(ConditionVariable *cond) +{ 
// WakeConditionVariable + EnterCriticalSection(&cond->broadcastMutex); + EnterCriticalSection(&cond->waiterCountMutex); + int haveWaiter = cond->waiterCount; + LeaveCriticalSection(&cond->waiterCountMutex); + + if (haveWaiter) + { + ReleaseSemaphore(cond->semaphore, 1, NULL); + WaitForSingleObject(cond->waitersDone, INFINITE); + } + + LeaveCriticalSection(&cond->broadcastMutex); +} + +BOOL WINAPI cond_wait(ConditionVariable *cond, CRITICAL_SECTION *mutex, DWORD wait) +{ // SleepConditionVariableCS + EnterCriticalSection(&cond->broadcastMutex); + EnterCriticalSection(&cond->waiterCountMutex); + cond->waiterCount++; + LeaveCriticalSection(&cond->waiterCountMutex); + LeaveCriticalSection(&cond->broadcastMutex); + + // unlock the external mutex + LeaveCriticalSection(mutex); + BOOL ret = WaitForSingleObject(cond->semaphore, wait); + + EnterCriticalSection(&cond->waiterCountMutex); + cond->waiterCount--; + int last_waiter = !cond->waiterCount || !cond->bIsBroadcast; + LeaveCriticalSection(&cond->waiterCountMutex); + + if (last_waiter) + SetEvent(cond->waitersDone); + + // lock the external mutex + EnterCriticalSection(mutex); + + // returns false on timeout or error + return ret; +} + +/* Native CONDITION_VARIABLE instances are not freed, so this is a special case */ +void cond_destroy(ConditionVariable *cond) +{ + CloseHandle(cond->semaphore); + CloseHandle(cond->waitersDone); + DeleteCriticalSection(&cond->broadcastMutex); + DeleteCriticalSection(&cond->waiterCountMutex); +} +} // namespace x265 + +#elif defined(_MSC_VER) + +namespace { int _avoid_linker_warnings = 0; } + +#endif // _WIN32_WINNT <= _WIN32_WINNT_WINXP diff --git a/source/common/winxp.h b/source/common/winxp.h new file mode 100644 index 0000000..b105804 --- /dev/null +++ b/source/common/winxp.h @@ -0,0 +1,91 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can 
redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com + *****************************************************************************/ + +#ifndef X265_WINXP_H +#define X265_WINXP_H + +#if defined(_WIN32) && (_WIN32_WINNT < 0x0600) // _WIN32_WINNT_VISTA + +#ifdef _MSC_VER +#include // _InterlockedCompareExchange64 +#endif + +namespace x265 { +/* non-native condition variable */ +typedef struct +{ + CRITICAL_SECTION broadcastMutex; + CRITICAL_SECTION waiterCountMutex; + HANDLE semaphore; + HANDLE waitersDone; + volatile int waiterCount; + volatile int bIsBroadcast; +} ConditionVariable; + +int WINAPI cond_init(ConditionVariable *cond); +void WINAPI cond_broadcast(ConditionVariable *cond); +void WINAPI cond_signal(ConditionVariable *cond); +BOOL WINAPI cond_wait(ConditionVariable *cond, CRITICAL_SECTION *mutex, DWORD wait); +void cond_destroy(ConditionVariable *cond); + +/* map missing API symbols to our structure and functions */ +#define CONDITION_VARIABLE x265::ConditionVariable +#define InitializeConditionVariable x265::cond_init +#define SleepConditionVariableCS x265::cond_wait +#define WakeConditionVariable x265::cond_signal +#define WakeAllConditionVariable x265::cond_broadcast +#define XP_CONDITION_VAR_FREE 
x265::cond_destroy + +#if defined(_MSC_VER) + +/* Windows XP did not define atomic OR 64, but gcc has a good version, so + * only use this workaround when targeting XP with MSVC */ +FORCEINLINE LONGLONG interlocked_OR64(__inout LONGLONG volatile *Destination, + __in LONGLONG Value) +{ + LONGLONG Old; + + do + { + Old = *Destination; + } + while (_InterlockedCompareExchange64(Destination, Old | Value, Old) != Old); + + return Old; +} + +#define ATOMIC_OR(ptr, mask) x265::interlocked_OR64((volatile LONG64*)ptr, mask) + +#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) +#pragma intrinsic(_InterlockedCompareExchange64) +#endif +#endif // defined(_MSC_VER) +} // namespace x265 + +#else // if defined(_WIN32) && (_WIN32_WINNT < 0x0600) + +#define XP_CONDITION_VAR_FREE(x) + +#endif // _WIN32_WINNT <= _WIN32_WINNT_WINXP + +#endif // ifndef X265_WINXP_H diff --git a/source/common/x86/README.txt b/source/common/x86/README.txt new file mode 100644 index 0000000..f8c83ea --- /dev/null +++ b/source/common/x86/README.txt @@ -0,0 +1,14 @@ +The ASM source here is directly pulled from the x264 project with two +changes: + +1 - FENC_STRIDE must be increased to 64 in x86util.asm because of HEVC's + larger CU sizes +2 - Because of #1, we must rebrand the functions with x265_ prefixes in + x86inc.asm (private_prefix) and pixel-a.asm (mangle(x265_pixel_ssd)) +3 - We have modified the MMX SSD primitives to use EMMS before returning +4 - We have added some new SATD block sizes for SSE3 + +Current assembly is based on x264 revision: + configure: Support cygwin64 + Diogo Franco (Kovensky) + 2013-07-23 22:17:44 -0300 diff --git a/source/common/x86/asm-primitives.cpp b/source/common/x86/asm-primitives.cpp new file mode 100644 index 0000000..ec1607d --- /dev/null +++ b/source/common/x86/asm-primitives.cpp @@ -0,0 +1,1853 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * Praveen Kumar Tiwari 
+ * Min Chen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#include "common.h" +#include "primitives.h" +#include "x265.h" +#include "cpu.h" + +extern "C" { +#include "pixel.h" +#include "pixel-util.h" +#include "mc.h" +#include "ipfilter8.h" +#include "loopfilter.h" +#include "blockcopy8.h" +#include "intrapred.h" +#include "dct8.h" +} + +#define INIT2_NAME(name1, name2, cpu) \ + p.name1[LUMA_16x16] = x265_pixel_ ## name2 ## _16x16 ## cpu; \ + p.name1[LUMA_16x8] = x265_pixel_ ## name2 ## _16x8 ## cpu; +#define INIT4_NAME(name1, name2, cpu) \ + INIT2_NAME(name1, name2, cpu) \ + p.name1[LUMA_8x16] = x265_pixel_ ## name2 ## _8x16 ## cpu; \ + p.name1[LUMA_8x8] = x265_pixel_ ## name2 ## _8x8 ## cpu; +#define INIT5_NAME(name1, name2, cpu) \ + INIT4_NAME(name1, name2, cpu) \ + p.name1[LUMA_8x4] = x265_pixel_ ## name2 ## _8x4 ## cpu; +#define INIT6_NAME(name1, name2, cpu) \ + INIT5_NAME(name1, name2, cpu) \ + p.name1[LUMA_4x8] = x265_pixel_ ## name2 ## _4x8 ## cpu; +#define INIT7_NAME(name1, name2, cpu) \ + INIT6_NAME(name1, name2, cpu) \ + p.name1[LUMA_4x4] = x265_pixel_ ## name2 ## 
_4x4 ## cpu; +#define INIT8_NAME(name1, name2, cpu) \ + INIT7_NAME(name1, name2, cpu) \ + p.name1[LUMA_4x16] = x265_pixel_ ## name2 ## _4x16 ## cpu; +#define INIT2(name, cpu) INIT2_NAME(name, name, cpu) +#define INIT4(name, cpu) INIT4_NAME(name, name, cpu) +#define INIT5(name, cpu) INIT5_NAME(name, name, cpu) +#define INIT6(name, cpu) INIT6_NAME(name, name, cpu) +#define INIT7(name, cpu) INIT7_NAME(name, name, cpu) +#define INIT8(name, cpu) INIT8_NAME(name, name, cpu) + +#define HEVC_SATD(cpu) \ + p.satd[LUMA_4x8] = x265_pixel_satd_4x8_ ## cpu; \ + p.satd[LUMA_4x16] = x265_pixel_satd_4x16_ ## cpu; \ + p.satd[LUMA_8x4] = x265_pixel_satd_8x4_ ## cpu; \ + p.satd[LUMA_8x8] = x265_pixel_satd_8x8_ ## cpu; \ + p.satd[LUMA_8x16] = x265_pixel_satd_8x16_ ## cpu; \ + p.satd[LUMA_8x32] = x265_pixel_satd_8x32_ ## cpu; \ + p.satd[LUMA_12x16] = x265_pixel_satd_12x16_ ## cpu; \ + p.satd[LUMA_16x4] = x265_pixel_satd_16x4_ ## cpu; \ + p.satd[LUMA_16x8] = x265_pixel_satd_16x8_ ## cpu; \ + p.satd[LUMA_16x12] = x265_pixel_satd_16x12_ ## cpu; \ + p.satd[LUMA_16x16] = x265_pixel_satd_16x16_ ## cpu; \ + p.satd[LUMA_16x32] = x265_pixel_satd_16x32_ ## cpu; \ + p.satd[LUMA_16x64] = x265_pixel_satd_16x64_ ## cpu; \ + p.satd[LUMA_24x32] = x265_pixel_satd_24x32_ ## cpu; \ + p.satd[LUMA_32x8] = x265_pixel_satd_32x8_ ## cpu; \ + p.satd[LUMA_32x16] = x265_pixel_satd_32x16_ ## cpu; \ + p.satd[LUMA_32x24] = x265_pixel_satd_32x24_ ## cpu; \ + p.satd[LUMA_32x32] = x265_pixel_satd_32x32_ ## cpu; \ + p.satd[LUMA_32x64] = x265_pixel_satd_32x64_ ## cpu; \ + p.satd[LUMA_48x64] = x265_pixel_satd_48x64_ ## cpu; \ + p.satd[LUMA_64x16] = x265_pixel_satd_64x16_ ## cpu; \ + p.satd[LUMA_64x32] = x265_pixel_satd_64x32_ ## cpu; \ + p.satd[LUMA_64x48] = x265_pixel_satd_64x48_ ## cpu; \ + p.satd[LUMA_64x64] = x265_pixel_satd_64x64_ ## cpu; + +#define SAD_X3(cpu) \ + p.sad_x3[LUMA_16x8] = x265_pixel_sad_x3_16x8_ ## cpu; \ + p.sad_x3[LUMA_16x12] = x265_pixel_sad_x3_16x12_ ## cpu; \ + p.sad_x3[LUMA_16x16] = 
x265_pixel_sad_x3_16x16_ ## cpu; \ + p.sad_x3[LUMA_16x32] = x265_pixel_sad_x3_16x32_ ## cpu; \ + p.sad_x3[LUMA_16x64] = x265_pixel_sad_x3_16x64_ ## cpu; \ + p.sad_x3[LUMA_32x8] = x265_pixel_sad_x3_32x8_ ## cpu; \ + p.sad_x3[LUMA_32x16] = x265_pixel_sad_x3_32x16_ ## cpu; \ + p.sad_x3[LUMA_32x24] = x265_pixel_sad_x3_32x24_ ## cpu; \ + p.sad_x3[LUMA_32x32] = x265_pixel_sad_x3_32x32_ ## cpu; \ + p.sad_x3[LUMA_32x64] = x265_pixel_sad_x3_32x64_ ## cpu; \ + p.sad_x3[LUMA_24x32] = x265_pixel_sad_x3_24x32_ ## cpu; \ + p.sad_x3[LUMA_48x64] = x265_pixel_sad_x3_48x64_ ## cpu; \ + p.sad_x3[LUMA_64x16] = x265_pixel_sad_x3_64x16_ ## cpu; \ + p.sad_x3[LUMA_64x32] = x265_pixel_sad_x3_64x32_ ## cpu; \ + p.sad_x3[LUMA_64x48] = x265_pixel_sad_x3_64x48_ ## cpu; \ + p.sad_x3[LUMA_64x64] = x265_pixel_sad_x3_64x64_ ## cpu + +#define SAD_X4(cpu) \ + p.sad_x4[LUMA_16x8] = x265_pixel_sad_x4_16x8_ ## cpu; \ + p.sad_x4[LUMA_16x12] = x265_pixel_sad_x4_16x12_ ## cpu; \ + p.sad_x4[LUMA_16x16] = x265_pixel_sad_x4_16x16_ ## cpu; \ + p.sad_x4[LUMA_16x32] = x265_pixel_sad_x4_16x32_ ## cpu; \ + p.sad_x4[LUMA_16x64] = x265_pixel_sad_x4_16x64_ ## cpu; \ + p.sad_x4[LUMA_32x8] = x265_pixel_sad_x4_32x8_ ## cpu; \ + p.sad_x4[LUMA_32x16] = x265_pixel_sad_x4_32x16_ ## cpu; \ + p.sad_x4[LUMA_32x24] = x265_pixel_sad_x4_32x24_ ## cpu; \ + p.sad_x4[LUMA_32x32] = x265_pixel_sad_x4_32x32_ ## cpu; \ + p.sad_x4[LUMA_32x64] = x265_pixel_sad_x4_32x64_ ## cpu; \ + p.sad_x4[LUMA_24x32] = x265_pixel_sad_x4_24x32_ ## cpu; \ + p.sad_x4[LUMA_48x64] = x265_pixel_sad_x4_48x64_ ## cpu; \ + p.sad_x4[LUMA_64x16] = x265_pixel_sad_x4_64x16_ ## cpu; \ + p.sad_x4[LUMA_64x32] = x265_pixel_sad_x4_64x32_ ## cpu; \ + p.sad_x4[LUMA_64x48] = x265_pixel_sad_x4_64x48_ ## cpu; \ + p.sad_x4[LUMA_64x64] = x265_pixel_sad_x4_64x64_ ## cpu + +#define SAD(cpu) \ + p.sad[LUMA_8x32] = x265_pixel_sad_8x32_ ## cpu; \ + p.sad[LUMA_16x4] = x265_pixel_sad_16x4_ ## cpu; \ + p.sad[LUMA_16x12] = x265_pixel_sad_16x12_ ## cpu; \ + p.sad[LUMA_16x32] = 
x265_pixel_sad_16x32_ ## cpu; \ + p.sad[LUMA_16x64] = x265_pixel_sad_16x64_ ## cpu; \ + p.sad[LUMA_32x8] = x265_pixel_sad_32x8_ ## cpu; \ + p.sad[LUMA_32x16] = x265_pixel_sad_32x16_ ## cpu; \ + p.sad[LUMA_32x24] = x265_pixel_sad_32x24_ ## cpu; \ + p.sad[LUMA_32x32] = x265_pixel_sad_32x32_ ## cpu; \ + p.sad[LUMA_32x64] = x265_pixel_sad_32x64_ ## cpu; \ + p.sad[LUMA_64x16] = x265_pixel_sad_64x16_ ## cpu; \ + p.sad[LUMA_64x32] = x265_pixel_sad_64x32_ ## cpu; \ + p.sad[LUMA_64x48] = x265_pixel_sad_64x48_ ## cpu; \ + p.sad[LUMA_64x64] = x265_pixel_sad_64x64_ ## cpu; \ + p.sad[LUMA_48x64] = x265_pixel_sad_48x64_ ## cpu; \ + p.sad[LUMA_24x32] = x265_pixel_sad_24x32_ ## cpu; \ + p.sad[LUMA_12x16] = x265_pixel_sad_12x16_ ## cpu + +#define ASSGN_SSE(cpu) \ + p.sse_pp[LUMA_8x8] = x265_pixel_ssd_8x8_ ## cpu; \ + p.sse_pp[LUMA_8x4] = x265_pixel_ssd_8x4_ ## cpu; \ + p.sse_pp[LUMA_16x16] = x265_pixel_ssd_16x16_ ## cpu; \ + p.sse_pp[LUMA_16x4] = x265_pixel_ssd_16x4_ ## cpu; \ + p.sse_pp[LUMA_16x8] = x265_pixel_ssd_16x8_ ## cpu; \ + p.sse_pp[LUMA_8x16] = x265_pixel_ssd_8x16_ ## cpu; \ + p.sse_pp[LUMA_16x12] = x265_pixel_ssd_16x12_ ## cpu; \ + p.sse_pp[LUMA_32x32] = x265_pixel_ssd_32x32_ ## cpu; \ + p.sse_pp[LUMA_32x16] = x265_pixel_ssd_32x16_ ## cpu; \ + p.sse_pp[LUMA_16x32] = x265_pixel_ssd_16x32_ ## cpu; \ + p.sse_pp[LUMA_8x32] = x265_pixel_ssd_8x32_ ## cpu; \ + p.sse_pp[LUMA_32x8] = x265_pixel_ssd_32x8_ ## cpu; \ + p.sse_pp[LUMA_32x24] = x265_pixel_ssd_32x24_ ## cpu; \ + p.sse_pp[LUMA_32x64] = x265_pixel_ssd_32x64_ ## cpu; \ + p.sse_pp[LUMA_16x64] = x265_pixel_ssd_16x64_ ## cpu + +#define ASSGN_SSE_SS(cpu) \ + p.sse_ss[LUMA_4x4] = x265_pixel_ssd_ss_4x4_ ## cpu; \ + p.sse_ss[LUMA_4x8] = x265_pixel_ssd_ss_4x8_ ## cpu; \ + p.sse_ss[LUMA_4x16] = x265_pixel_ssd_ss_4x16_ ## cpu; \ + p.sse_ss[LUMA_8x4] = x265_pixel_ssd_ss_8x4_ ## cpu; \ + p.sse_ss[LUMA_8x8] = x265_pixel_ssd_ss_8x8_ ## cpu; \ + p.sse_ss[LUMA_8x16] = x265_pixel_ssd_ss_8x16_ ## cpu; \ + p.sse_ss[LUMA_8x32] = 
x265_pixel_ssd_ss_8x32_ ## cpu; \ + p.sse_ss[LUMA_12x16] = x265_pixel_ssd_ss_12x16_ ## cpu; \ + p.sse_ss[LUMA_16x4] = x265_pixel_ssd_ss_16x4_ ## cpu; \ + p.sse_ss[LUMA_16x8] = x265_pixel_ssd_ss_16x8_ ## cpu; \ + p.sse_ss[LUMA_16x12] = x265_pixel_ssd_ss_16x12_ ## cpu; \ + p.sse_ss[LUMA_16x16] = x265_pixel_ssd_ss_16x16_ ## cpu; \ + p.sse_ss[LUMA_16x32] = x265_pixel_ssd_ss_16x32_ ## cpu; \ + p.sse_ss[LUMA_16x64] = x265_pixel_ssd_ss_16x64_ ## cpu; \ + p.sse_ss[LUMA_24x32] = x265_pixel_ssd_ss_24x32_ ## cpu; \ + p.sse_ss[LUMA_32x8] = x265_pixel_ssd_ss_32x8_ ## cpu; \ + p.sse_ss[LUMA_32x16] = x265_pixel_ssd_ss_32x16_ ## cpu; \ + p.sse_ss[LUMA_32x24] = x265_pixel_ssd_ss_32x24_ ## cpu; \ + p.sse_ss[LUMA_32x32] = x265_pixel_ssd_ss_32x32_ ## cpu; \ + p.sse_ss[LUMA_32x64] = x265_pixel_ssd_ss_32x64_ ## cpu; \ + p.sse_ss[LUMA_48x64] = x265_pixel_ssd_ss_48x64_ ## cpu; \ + p.sse_ss[LUMA_64x16] = x265_pixel_ssd_ss_64x16_ ## cpu; \ + p.sse_ss[LUMA_64x32] = x265_pixel_ssd_ss_64x32_ ## cpu; \ + p.sse_ss[LUMA_64x48] = x265_pixel_ssd_ss_64x48_ ## cpu; \ + p.sse_ss[LUMA_64x64] = x265_pixel_ssd_ss_64x64_ ## cpu; + +#define SA8D_INTER_FROM_BLOCK(cpu) \ + p.sa8d_inter[LUMA_4x8] = x265_pixel_satd_4x8_ ## cpu; \ + p.sa8d_inter[LUMA_8x4] = x265_pixel_satd_8x4_ ## cpu; \ + p.sa8d_inter[LUMA_4x16] = x265_pixel_satd_4x16_ ## cpu; \ + p.sa8d_inter[LUMA_16x4] = x265_pixel_satd_16x4_ ## cpu; \ + p.sa8d_inter[LUMA_12x16] = x265_pixel_satd_12x16_ ## cpu; \ + p.sa8d_inter[LUMA_8x8] = x265_pixel_sa8d_8x8_ ## cpu; \ + p.sa8d_inter[LUMA_16x16] = x265_pixel_sa8d_16x16_ ## cpu; \ + p.sa8d_inter[LUMA_16x12] = x265_pixel_satd_16x12_ ## cpu; \ + p.sa8d_inter[LUMA_16x8] = x265_pixel_sa8d_16x8_ ## cpu; \ + p.sa8d_inter[LUMA_8x16] = x265_pixel_sa8d_8x16_ ## cpu; \ + p.sa8d_inter[LUMA_32x24] = x265_pixel_sa8d_32x24_ ## cpu; \ + p.sa8d_inter[LUMA_24x32] = x265_pixel_sa8d_24x32_ ## cpu; \ + p.sa8d_inter[LUMA_32x8] = x265_pixel_sa8d_32x8_ ## cpu; \ + p.sa8d_inter[LUMA_8x32] = x265_pixel_sa8d_8x32_ ## cpu; \ + 
p.sa8d_inter[LUMA_32x32] = x265_pixel_sa8d_32x32_ ## cpu; \ + p.sa8d_inter[LUMA_32x16] = x265_pixel_sa8d_32x16_ ## cpu; \ + p.sa8d_inter[LUMA_16x32] = x265_pixel_sa8d_16x32_ ## cpu; \ + p.sa8d_inter[LUMA_64x64] = x265_pixel_sa8d_64x64_ ## cpu; \ + p.sa8d_inter[LUMA_64x32] = x265_pixel_sa8d_64x32_ ## cpu; \ + p.sa8d_inter[LUMA_32x64] = x265_pixel_sa8d_32x64_ ## cpu; \ + p.sa8d_inter[LUMA_64x48] = x265_pixel_sa8d_64x48_ ## cpu; \ + p.sa8d_inter[LUMA_48x64] = x265_pixel_sa8d_48x64_ ## cpu; \ + p.sa8d_inter[LUMA_64x16] = x265_pixel_sa8d_64x16_ ## cpu; \ + p.sa8d_inter[LUMA_16x64] = x265_pixel_sa8d_16x64_ ## cpu; + +#define PIXEL_AVG(cpu) \ + p.pixelavg_pp[LUMA_64x64] = x265_pixel_avg_64x64_ ## cpu; \ + p.pixelavg_pp[LUMA_64x48] = x265_pixel_avg_64x48_ ## cpu; \ + p.pixelavg_pp[LUMA_64x32] = x265_pixel_avg_64x32_ ## cpu; \ + p.pixelavg_pp[LUMA_64x16] = x265_pixel_avg_64x16_ ## cpu; \ + p.pixelavg_pp[LUMA_48x64] = x265_pixel_avg_48x64_ ## cpu; \ + p.pixelavg_pp[LUMA_32x64] = x265_pixel_avg_32x64_ ## cpu; \ + p.pixelavg_pp[LUMA_32x32] = x265_pixel_avg_32x32_ ## cpu; \ + p.pixelavg_pp[LUMA_32x24] = x265_pixel_avg_32x24_ ## cpu; \ + p.pixelavg_pp[LUMA_32x16] = x265_pixel_avg_32x16_ ## cpu; \ + p.pixelavg_pp[LUMA_32x8] = x265_pixel_avg_32x8_ ## cpu; \ + p.pixelavg_pp[LUMA_24x32] = x265_pixel_avg_24x32_ ## cpu; \ + p.pixelavg_pp[LUMA_16x64] = x265_pixel_avg_16x64_ ## cpu; \ + p.pixelavg_pp[LUMA_16x32] = x265_pixel_avg_16x32_ ## cpu; \ + p.pixelavg_pp[LUMA_16x16] = x265_pixel_avg_16x16_ ## cpu; \ + p.pixelavg_pp[LUMA_16x12] = x265_pixel_avg_16x12_ ## cpu; \ + p.pixelavg_pp[LUMA_16x8] = x265_pixel_avg_16x8_ ## cpu; \ + p.pixelavg_pp[LUMA_16x4] = x265_pixel_avg_16x4_ ## cpu; \ + p.pixelavg_pp[LUMA_12x16] = x265_pixel_avg_12x16_ ## cpu; \ + p.pixelavg_pp[LUMA_8x32] = x265_pixel_avg_8x32_ ## cpu; \ + p.pixelavg_pp[LUMA_8x16] = x265_pixel_avg_8x16_ ## cpu; \ + p.pixelavg_pp[LUMA_8x8] = x265_pixel_avg_8x8_ ## cpu; \ + p.pixelavg_pp[LUMA_8x4] = x265_pixel_avg_8x4_ ## cpu; + +#define 
PIXEL_AVG_W4(cpu) \ + p.pixelavg_pp[LUMA_4x4] = x265_pixel_avg_4x4_ ## cpu; \ + p.pixelavg_pp[LUMA_4x8] = x265_pixel_avg_4x8_ ## cpu; \ + p.pixelavg_pp[LUMA_4x16] = x265_pixel_avg_4x16_ ## cpu; + +#define SETUP_CHROMA_FUNC_DEF_420(W, H, cpu) \ + p.chroma[X265_CSP_I420].filter_hpp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \ + p.chroma[X265_CSP_I420].filter_hps[CHROMA_ ## W ## x ## H] = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu; \ + p.chroma[X265_CSP_I420].filter_vpp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \ + p.chroma[X265_CSP_I420].filter_vps[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu; + +#define SETUP_CHROMA_FUNC_DEF_422(W, H, cpu) \ + p.chroma[X265_CSP_I422].filter_hpp[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \ + p.chroma[X265_CSP_I422].filter_hps[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu; \ + p.chroma[X265_CSP_I422].filter_vpp[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \ + p.chroma[X265_CSP_I422].filter_vps[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu; + +#define SETUP_CHROMA_FUNC_DEF_444(W, H, cpu) \ + p.chroma[X265_CSP_I444].filter_hpp[LUMA_ ## W ## x ## H] = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \ + p.chroma[X265_CSP_I444].filter_hps[LUMA_ ## W ## x ## H] = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu; \ + p.chroma[X265_CSP_I444].filter_vpp[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \ + p.chroma[X265_CSP_I444].filter_vps[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu; + +#define SETUP_CHROMA_SP_FUNC_DEF_420(W, H, cpu) \ + p.chroma[X265_CSP_I420].filter_vsp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu; + +#define SETUP_CHROMA_SP_FUNC_DEF_422(W, H, cpu) \ + p.chroma[X265_CSP_I422].filter_vsp[CHROMA422_ ## W ## x ## H] = 
x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu; + +#define SETUP_CHROMA_SP_FUNC_DEF_444(W, H, cpu) \ + p.chroma[X265_CSP_I444].filter_vsp[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu; + +#define SETUP_CHROMA_SS_FUNC_DEF_420(W, H, cpu) \ + p.chroma[X265_CSP_I420].filter_vss[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu; + +#define SETUP_CHROMA_SS_FUNC_DEF_422(W, H, cpu) \ + p.chroma[X265_CSP_I422].filter_vss[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu; + +#define SETUP_CHROMA_SS_FUNC_DEF_444(W, H, cpu) \ + p.chroma[X265_CSP_I444].filter_vss[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu; + +#define CHROMA_FILTERS_420(cpu) \ + SETUP_CHROMA_FUNC_DEF_420(4, 4, cpu); \ + SETUP_CHROMA_FUNC_DEF_420(4, 2, cpu); \ + SETUP_CHROMA_FUNC_DEF_420(2, 4, cpu); \ + SETUP_CHROMA_FUNC_DEF_420(8, 8, cpu); \ + SETUP_CHROMA_FUNC_DEF_420(8, 4, cpu); \ + SETUP_CHROMA_FUNC_DEF_420(4, 8, cpu); \ + SETUP_CHROMA_FUNC_DEF_420(8, 6, cpu); \ + SETUP_CHROMA_FUNC_DEF_420(6, 8, cpu); \ + SETUP_CHROMA_FUNC_DEF_420(8, 2, cpu); \ + SETUP_CHROMA_FUNC_DEF_420(2, 8, cpu); \ + SETUP_CHROMA_FUNC_DEF_420(16, 16, cpu); \ + SETUP_CHROMA_FUNC_DEF_420(16, 8, cpu); \ + SETUP_CHROMA_FUNC_DEF_420(8, 16, cpu); \ + SETUP_CHROMA_FUNC_DEF_420(16, 12, cpu); \ + SETUP_CHROMA_FUNC_DEF_420(12, 16, cpu); \ + SETUP_CHROMA_FUNC_DEF_420(16, 4, cpu); \ + SETUP_CHROMA_FUNC_DEF_420(4, 16, cpu); \ + SETUP_CHROMA_FUNC_DEF_420(32, 32, cpu); \ + SETUP_CHROMA_FUNC_DEF_420(32, 16, cpu); \ + SETUP_CHROMA_FUNC_DEF_420(16, 32, cpu); \ + SETUP_CHROMA_FUNC_DEF_420(32, 24, cpu); \ + SETUP_CHROMA_FUNC_DEF_420(24, 32, cpu); \ + SETUP_CHROMA_FUNC_DEF_420(32, 8, cpu); \ + SETUP_CHROMA_FUNC_DEF_420(8, 32, cpu); + +#define CHROMA_FILTERS_422(cpu) \ + SETUP_CHROMA_FUNC_DEF_422(4, 8, cpu); \ + SETUP_CHROMA_FUNC_DEF_422(4, 4, cpu); \ + SETUP_CHROMA_FUNC_DEF_422(2, 8, cpu); \ + SETUP_CHROMA_FUNC_DEF_422(8, 16, cpu); \ + 
SETUP_CHROMA_FUNC_DEF_422(8, 8, cpu); \ + SETUP_CHROMA_FUNC_DEF_422(4, 16, cpu); \ + SETUP_CHROMA_FUNC_DEF_422(8, 12, cpu); \ + SETUP_CHROMA_FUNC_DEF_422(6, 16, cpu); \ + SETUP_CHROMA_FUNC_DEF_422(8, 4, cpu); \ + SETUP_CHROMA_FUNC_DEF_422(2, 16, cpu); \ + SETUP_CHROMA_FUNC_DEF_422(16, 32, cpu); \ + SETUP_CHROMA_FUNC_DEF_422(16, 16, cpu); \ + SETUP_CHROMA_FUNC_DEF_422(8, 32, cpu); \ + SETUP_CHROMA_FUNC_DEF_422(16, 24, cpu); \ + SETUP_CHROMA_FUNC_DEF_422(12, 32, cpu); \ + SETUP_CHROMA_FUNC_DEF_422(16, 8, cpu); \ + SETUP_CHROMA_FUNC_DEF_422(4, 32, cpu); \ + SETUP_CHROMA_FUNC_DEF_422(32, 64, cpu); \ + SETUP_CHROMA_FUNC_DEF_422(32, 32, cpu); \ + SETUP_CHROMA_FUNC_DEF_422(16, 64, cpu); \ + SETUP_CHROMA_FUNC_DEF_422(32, 48, cpu); \ + SETUP_CHROMA_FUNC_DEF_422(24, 64, cpu); \ + SETUP_CHROMA_FUNC_DEF_422(32, 16, cpu); \ + SETUP_CHROMA_FUNC_DEF_422(8, 64, cpu); + +#define CHROMA_FILTERS_444(cpu) \ + SETUP_CHROMA_FUNC_DEF_444(8, 8, cpu); \ + SETUP_CHROMA_FUNC_DEF_444(8, 4, cpu); \ + SETUP_CHROMA_FUNC_DEF_444(4, 8, cpu); \ + SETUP_CHROMA_FUNC_DEF_444(16, 16, cpu); \ + SETUP_CHROMA_FUNC_DEF_444(16, 8, cpu); \ + SETUP_CHROMA_FUNC_DEF_444(8, 16, cpu); \ + SETUP_CHROMA_FUNC_DEF_444(16, 12, cpu); \ + SETUP_CHROMA_FUNC_DEF_444(12, 16, cpu); \ + SETUP_CHROMA_FUNC_DEF_444(16, 4, cpu); \ + SETUP_CHROMA_FUNC_DEF_444(4, 16, cpu); \ + SETUP_CHROMA_FUNC_DEF_444(32, 32, cpu); \ + SETUP_CHROMA_FUNC_DEF_444(32, 16, cpu); \ + SETUP_CHROMA_FUNC_DEF_444(16, 32, cpu); \ + SETUP_CHROMA_FUNC_DEF_444(32, 24, cpu); \ + SETUP_CHROMA_FUNC_DEF_444(24, 32, cpu); \ + SETUP_CHROMA_FUNC_DEF_444(32, 8, cpu); \ + SETUP_CHROMA_FUNC_DEF_444(8, 32, cpu); \ + SETUP_CHROMA_FUNC_DEF_444(64, 64, cpu); \ + SETUP_CHROMA_FUNC_DEF_444(64, 32, cpu); \ + SETUP_CHROMA_FUNC_DEF_444(32, 64, cpu); \ + SETUP_CHROMA_FUNC_DEF_444(64, 48, cpu); \ + SETUP_CHROMA_FUNC_DEF_444(48, 64, cpu); \ + SETUP_CHROMA_FUNC_DEF_444(64, 16, cpu); \ + SETUP_CHROMA_FUNC_DEF_444(16, 64, cpu); + +#define CHROMA_SP_FILTERS_SSE4_420(cpu) \ + 
SETUP_CHROMA_SP_FUNC_DEF_420(4, 4, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_420(4, 2, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_420(2, 4, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_420(4, 8, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_420(6, 8, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_420(2, 8, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_420(16, 16, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_420(16, 8, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_420(16, 12, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_420(12, 16, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_420(16, 4, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_420(4, 16, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_420(32, 32, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_420(32, 16, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_420(16, 32, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_420(32, 24, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_420(24, 32, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_420(32, 8, cpu); + +#define CHROMA_SP_FILTERS_420(cpu) \ + SETUP_CHROMA_SP_FUNC_DEF_420(8, 2, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_420(8, 4, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_420(8, 6, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_420(8, 8, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_420(8, 16, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_420(8, 32, cpu); + +#define CHROMA_SP_FILTERS_SSE4_422(cpu) \ + SETUP_CHROMA_SP_FUNC_DEF_422(4, 8, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_422(4, 4, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_422(2, 8, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_422(4, 16, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_422(6, 16, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_422(2, 16, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_422(16, 32, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_422(16, 16, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_422(16, 24, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_422(12, 32, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_422(16, 8, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_422(4, 32, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_422(32, 64, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_422(32, 32, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_422(16, 64, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_422(32, 48, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_422(24, 64, cpu); \ + 
SETUP_CHROMA_SP_FUNC_DEF_422(32, 16, cpu); + +#define CHROMA_SP_FILTERS_422(cpu) \ + SETUP_CHROMA_SP_FUNC_DEF_422(8, 4, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_422(8, 8, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_422(8, 12, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_422(8, 16, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_422(8, 32, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_422(8, 64, cpu); + +#define CHROMA_SP_FILTERS_SSE4_444(cpu) \ + SETUP_CHROMA_SP_FUNC_DEF_444(4, 8, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_444(16, 16, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_444(16, 8, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_444(16, 12, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_444(12, 16, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_444(16, 4, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_444(4, 16, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_444(32, 32, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_444(32, 16, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_444(16, 32, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_444(32, 24, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_444(24, 32, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_444(32, 8, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_444(64, 64, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_444(64, 32, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_444(32, 64, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_444(64, 48, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_444(48, 64, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_444(64, 16, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_444(16, 64, cpu); + +#define CHROMA_SP_FILTERS_444(cpu) \ + SETUP_CHROMA_SP_FUNC_DEF_444(8, 8, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_444(8, 4, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_444(8, 16, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF_444(8, 32, cpu); + +#define CHROMA_SS_FILTERS_420(cpu) \ + SETUP_CHROMA_SS_FUNC_DEF_420(4, 4, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_420(4, 2, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_420(8, 8, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_420(8, 4, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_420(4, 8, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_420(8, 6, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_420(8, 2, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_420(16, 16, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_420(16, 8, cpu); \ 
+ SETUP_CHROMA_SS_FUNC_DEF_420(8, 16, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_420(16, 12, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_420(12, 16, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_420(16, 4, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_420(4, 16, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_420(32, 32, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_420(32, 16, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_420(16, 32, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_420(32, 24, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_420(24, 32, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_420(32, 8, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_420(8, 32, cpu); + +#define CHROMA_SS_FILTERS_SSE4_420(cpu) \ + SETUP_CHROMA_SS_FUNC_DEF_420(2, 4, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_420(2, 8, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_420(6, 8, cpu); + +#define CHROMA_SS_FILTERS_422(cpu) \ + SETUP_CHROMA_SS_FUNC_DEF_422(4, 8, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_422(4, 4, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_422(8, 16, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_422(8, 8, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_422(4, 16, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_422(8, 12, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_422(8, 4, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_422(16, 32, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_422(16, 16, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_422(8, 32, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_422(16, 24, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_422(12, 32, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_422(16, 8, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_422(4, 32, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_422(32, 64, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_422(32, 32, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_422(16, 64, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_422(32, 48, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_422(24, 64, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_422(32, 16, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_422(8, 64, cpu); + +#define CHROMA_SS_FILTERS_SSE4_422(cpu) \ + SETUP_CHROMA_SS_FUNC_DEF_422(2, 8, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_422(2, 16, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_422(6, 16, cpu); + +#define CHROMA_SS_FILTERS_444(cpu) \ + SETUP_CHROMA_SS_FUNC_DEF_444(8, 
8, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_444(8, 4, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_444(4, 8, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_444(16, 16, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_444(16, 8, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_444(8, 16, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_444(16, 12, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_444(12, 16, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_444(16, 4, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_444(4, 16, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_444(32, 32, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_444(32, 16, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_444(16, 32, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_444(32, 24, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_444(24, 32, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_444(32, 8, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_444(8, 32, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_444(64, 64, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_444(64, 32, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_444(32, 64, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_444(64, 48, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_444(48, 64, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_444(64, 16, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF_444(16, 64, cpu); + +#if HIGH_BIT_DEPTH // temporary, until all 10bit functions are completed +#define SETUP_LUMA_FUNC_DEF(W, H, cpu) \ + p.luma_hpp[LUMA_ ## W ## x ## H] = x265_interp_8tap_horiz_pp_ ## W ## x ## H ## cpu; \ + p.luma_hps[LUMA_ ## W ## x ## H] = x265_interp_8tap_horiz_ps_ ## W ## x ## H ## cpu; \ + p.luma_vpp[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_pp_ ## W ## x ## H ## cpu; \ + p.luma_vps[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_ps_ ## W ## x ## H ## cpu; \ + p.luma_vsp[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_sp_ ## W ## x ## H ## cpu; +#else +#define SETUP_LUMA_FUNC_DEF(W, H, cpu) \ + p.luma_hpp[LUMA_ ## W ## x ## H] = x265_interp_8tap_horiz_pp_ ## W ## x ## H ## cpu; \ + p.luma_hps[LUMA_ ## W ## x ## H] = x265_interp_8tap_horiz_ps_ ## W ## x ## H ## cpu; \ + p.luma_vpp[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_pp_ ## W ## x ## H ## cpu; \ + p.luma_vps[LUMA_ ## W ## x ## H] = 
x265_interp_8tap_vert_ps_ ## W ## x ## H ## cpu; +#endif // if HIGH_BIT_DEPTH + +#define SETUP_LUMA_SUB_FUNC_DEF(W, H, cpu) \ + p.luma_sub_ps[LUMA_ ## W ## x ## H] = x265_pixel_sub_ps_ ## W ## x ## H ## cpu; \ + p.luma_add_ps[LUMA_ ## W ## x ## H] = x265_pixel_add_ps_ ## W ## x ## H ## cpu; + +#define SETUP_LUMA_SP_FUNC_DEF(W, H, cpu) \ + p.luma_vsp[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_sp_ ## W ## x ## H ## cpu; + +#define SETUP_LUMA_SS_FUNC_DEF(W, H, cpu) \ + p.luma_vss[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_ss_ ## W ## x ## H ## cpu; + +#define SETUP_LUMA_BLOCKCOPY(type, W, H, cpu) \ + p.luma_copy_ ## type[LUMA_ ## W ## x ## H] = x265_blockcopy_ ## type ## _ ## W ## x ## H ## cpu; + +#define SETUP_CHROMA_BLOCKCOPY(type, W, H, cpu) \ + p.chroma[X265_CSP_I420].copy_ ## type[CHROMA_ ## W ## x ## H] = x265_blockcopy_ ## type ## _ ## W ## x ## H ## cpu; + +#define CHROMA_BLOCKCOPY(type, cpu) \ + SETUP_CHROMA_BLOCKCOPY(type, 2, 4, cpu); \ + SETUP_CHROMA_BLOCKCOPY(type, 2, 8, cpu); \ + SETUP_CHROMA_BLOCKCOPY(type, 4, 2, cpu); \ + SETUP_CHROMA_BLOCKCOPY(type, 4, 4, cpu); \ + SETUP_CHROMA_BLOCKCOPY(type, 4, 8, cpu); \ + SETUP_CHROMA_BLOCKCOPY(type, 4, 16, cpu); \ + SETUP_CHROMA_BLOCKCOPY(type, 6, 8, cpu); \ + SETUP_CHROMA_BLOCKCOPY(type, 8, 2, cpu); \ + SETUP_CHROMA_BLOCKCOPY(type, 8, 4, cpu); \ + SETUP_CHROMA_BLOCKCOPY(type, 8, 6, cpu); \ + SETUP_CHROMA_BLOCKCOPY(type, 8, 8, cpu); \ + SETUP_CHROMA_BLOCKCOPY(type, 8, 16, cpu); \ + SETUP_CHROMA_BLOCKCOPY(type, 8, 32, cpu); \ + SETUP_CHROMA_BLOCKCOPY(type, 12, 16, cpu); \ + SETUP_CHROMA_BLOCKCOPY(type, 16, 4, cpu); \ + SETUP_CHROMA_BLOCKCOPY(type, 16, 8, cpu); \ + SETUP_CHROMA_BLOCKCOPY(type, 16, 12, cpu); \ + SETUP_CHROMA_BLOCKCOPY(type, 16, 16, cpu); \ + SETUP_CHROMA_BLOCKCOPY(type, 16, 32, cpu); \ + SETUP_CHROMA_BLOCKCOPY(type, 24, 32, cpu); \ + SETUP_CHROMA_BLOCKCOPY(type, 32, 8, cpu); \ + SETUP_CHROMA_BLOCKCOPY(type, 32, 16, cpu); \ + SETUP_CHROMA_BLOCKCOPY(type, 32, 24, cpu); \ + 
SETUP_CHROMA_BLOCKCOPY(type, 32, 32, cpu); + +#define SETUP_CHROMA_BLOCKCOPY_422(type, W, H, cpu) \ + p.chroma[X265_CSP_I422].copy_ ## type[CHROMA422_ ## W ## x ## H] = x265_blockcopy_ ## type ## _ ## W ## x ## H ## cpu; + +#define CHROMA_BLOCKCOPY_422(type, cpu) \ + SETUP_CHROMA_BLOCKCOPY_422(type, 2, 8, cpu); \ + SETUP_CHROMA_BLOCKCOPY_422(type, 2, 16, cpu); \ + SETUP_CHROMA_BLOCKCOPY_422(type, 4, 4, cpu); \ + SETUP_CHROMA_BLOCKCOPY_422(type, 4, 8, cpu); \ + SETUP_CHROMA_BLOCKCOPY_422(type, 4, 16, cpu); \ + SETUP_CHROMA_BLOCKCOPY_422(type, 4, 32, cpu); \ + SETUP_CHROMA_BLOCKCOPY_422(type, 6, 16, cpu); \ + SETUP_CHROMA_BLOCKCOPY_422(type, 8, 4, cpu); \ + SETUP_CHROMA_BLOCKCOPY_422(type, 8, 8, cpu); \ + SETUP_CHROMA_BLOCKCOPY_422(type, 8, 12, cpu); \ + SETUP_CHROMA_BLOCKCOPY_422(type, 8, 16, cpu); \ + SETUP_CHROMA_BLOCKCOPY_422(type, 8, 32, cpu); \ + SETUP_CHROMA_BLOCKCOPY_422(type, 8, 64, cpu); \ + SETUP_CHROMA_BLOCKCOPY_422(type, 12, 32, cpu); \ + SETUP_CHROMA_BLOCKCOPY_422(type, 16, 8, cpu); \ + SETUP_CHROMA_BLOCKCOPY_422(type, 16, 16, cpu); \ + SETUP_CHROMA_BLOCKCOPY_422(type, 16, 24, cpu); \ + SETUP_CHROMA_BLOCKCOPY_422(type, 16, 32, cpu); \ + SETUP_CHROMA_BLOCKCOPY_422(type, 16, 64, cpu); \ + SETUP_CHROMA_BLOCKCOPY_422(type, 24, 64, cpu); \ + SETUP_CHROMA_BLOCKCOPY_422(type, 32, 16, cpu); \ + SETUP_CHROMA_BLOCKCOPY_422(type, 32, 32, cpu); \ + SETUP_CHROMA_BLOCKCOPY_422(type, 32, 48, cpu); \ + SETUP_CHROMA_BLOCKCOPY_422(type, 32, 64, cpu); + +#define LUMA_BLOCKCOPY(type, cpu) \ + SETUP_LUMA_BLOCKCOPY(type, 4, 4, cpu); \ + SETUP_LUMA_BLOCKCOPY(type, 8, 8, cpu); \ + SETUP_LUMA_BLOCKCOPY(type, 8, 4, cpu); \ + SETUP_LUMA_BLOCKCOPY(type, 4, 8, cpu); \ + SETUP_LUMA_BLOCKCOPY(type, 16, 16, cpu); \ + SETUP_LUMA_BLOCKCOPY(type, 16, 8, cpu); \ + SETUP_LUMA_BLOCKCOPY(type, 8, 16, cpu); \ + SETUP_LUMA_BLOCKCOPY(type, 16, 12, cpu); \ + SETUP_LUMA_BLOCKCOPY(type, 12, 16, cpu); \ + SETUP_LUMA_BLOCKCOPY(type, 16, 4, cpu); \ + SETUP_LUMA_BLOCKCOPY(type, 4, 16, cpu); \ + 
SETUP_LUMA_BLOCKCOPY(type, 32, 32, cpu); \ + SETUP_LUMA_BLOCKCOPY(type, 32, 16, cpu); \ + SETUP_LUMA_BLOCKCOPY(type, 16, 32, cpu); \ + SETUP_LUMA_BLOCKCOPY(type, 32, 24, cpu); \ + SETUP_LUMA_BLOCKCOPY(type, 24, 32, cpu); \ + SETUP_LUMA_BLOCKCOPY(type, 32, 8, cpu); \ + SETUP_LUMA_BLOCKCOPY(type, 8, 32, cpu); \ + SETUP_LUMA_BLOCKCOPY(type, 64, 64, cpu); \ + SETUP_LUMA_BLOCKCOPY(type, 64, 32, cpu); \ + SETUP_LUMA_BLOCKCOPY(type, 32, 64, cpu); \ + SETUP_LUMA_BLOCKCOPY(type, 64, 48, cpu); \ + SETUP_LUMA_BLOCKCOPY(type, 48, 64, cpu); \ + SETUP_LUMA_BLOCKCOPY(type, 64, 16, cpu); \ + SETUP_LUMA_BLOCKCOPY(type, 16, 64, cpu); + +#define SETUP_CHROMA_BLOCKCOPY_SP(W, H, cpu) \ + p.chroma[X265_CSP_I420].copy_sp[CHROMA_ ## W ## x ## H] = x265_blockcopy_sp_ ## W ## x ## H ## cpu; + +#define CHROMA_BLOCKCOPY_SP(cpu) \ + SETUP_CHROMA_BLOCKCOPY_SP(2, 4, cpu); \ + SETUP_CHROMA_BLOCKCOPY_SP(2, 8, cpu); \ + SETUP_CHROMA_BLOCKCOPY_SP(4, 2, cpu); \ + SETUP_CHROMA_BLOCKCOPY_SP(4, 4, cpu); \ + SETUP_CHROMA_BLOCKCOPY_SP(4, 8, cpu); \ + SETUP_CHROMA_BLOCKCOPY_SP(4, 16, cpu); \ + SETUP_CHROMA_BLOCKCOPY_SP(6, 8, cpu); \ + SETUP_CHROMA_BLOCKCOPY_SP(8, 2, cpu); \ + SETUP_CHROMA_BLOCKCOPY_SP(8, 4, cpu); \ + SETUP_CHROMA_BLOCKCOPY_SP(8, 6, cpu); \ + SETUP_CHROMA_BLOCKCOPY_SP(8, 8, cpu); \ + SETUP_CHROMA_BLOCKCOPY_SP(8, 16, cpu); \ + SETUP_CHROMA_BLOCKCOPY_SP(8, 32, cpu); \ + SETUP_CHROMA_BLOCKCOPY_SP(12, 16, cpu); \ + SETUP_CHROMA_BLOCKCOPY_SP(16, 4, cpu); \ + SETUP_CHROMA_BLOCKCOPY_SP(16, 8, cpu); \ + SETUP_CHROMA_BLOCKCOPY_SP(16, 12, cpu); \ + SETUP_CHROMA_BLOCKCOPY_SP(16, 16, cpu); \ + SETUP_CHROMA_BLOCKCOPY_SP(16, 32, cpu); \ + SETUP_CHROMA_BLOCKCOPY_SP(24, 32, cpu); \ + SETUP_CHROMA_BLOCKCOPY_SP(32, 8, cpu); \ + SETUP_CHROMA_BLOCKCOPY_SP(32, 16, cpu); \ + SETUP_CHROMA_BLOCKCOPY_SP(32, 24, cpu); \ + SETUP_CHROMA_BLOCKCOPY_SP(32, 32, cpu); + +#define SETUP_CHROMA_BLOCKCOPY_SP_422(W, H, cpu) \ + p.chroma[X265_CSP_I422].copy_sp[CHROMA422_ ## W ## x ## H] = x265_blockcopy_sp_ ## W ## x ## H ## 
cpu; + +#define CHROMA_BLOCKCOPY_SP_422(cpu) \ + SETUP_CHROMA_BLOCKCOPY_SP_422(2, 8, cpu); \ + SETUP_CHROMA_BLOCKCOPY_SP_422(2, 16, cpu); \ + SETUP_CHROMA_BLOCKCOPY_SP_422(4, 4, cpu); \ + SETUP_CHROMA_BLOCKCOPY_SP_422(4, 8, cpu); \ + SETUP_CHROMA_BLOCKCOPY_SP_422(4, 16, cpu); \ + SETUP_CHROMA_BLOCKCOPY_SP_422(4, 32, cpu); \ + SETUP_CHROMA_BLOCKCOPY_SP_422(6, 16, cpu); \ + SETUP_CHROMA_BLOCKCOPY_SP_422(8, 4, cpu); \ + SETUP_CHROMA_BLOCKCOPY_SP_422(8, 8, cpu); \ + SETUP_CHROMA_BLOCKCOPY_SP_422(8, 12, cpu); \ + SETUP_CHROMA_BLOCKCOPY_SP_422(8, 16, cpu); \ + SETUP_CHROMA_BLOCKCOPY_SP_422(8, 32, cpu); \ + SETUP_CHROMA_BLOCKCOPY_SP_422(8, 64, cpu); \ + SETUP_CHROMA_BLOCKCOPY_SP_422(12, 32, cpu); \ + SETUP_CHROMA_BLOCKCOPY_SP_422(16, 8, cpu); \ + SETUP_CHROMA_BLOCKCOPY_SP_422(16, 16, cpu); \ + SETUP_CHROMA_BLOCKCOPY_SP_422(16, 24, cpu); \ + SETUP_CHROMA_BLOCKCOPY_SP_422(16, 32, cpu); \ + SETUP_CHROMA_BLOCKCOPY_SP_422(16, 64, cpu); \ + SETUP_CHROMA_BLOCKCOPY_SP_422(24, 64, cpu); \ + SETUP_CHROMA_BLOCKCOPY_SP_422(32, 16, cpu); \ + SETUP_CHROMA_BLOCKCOPY_SP_422(32, 32, cpu); \ + SETUP_CHROMA_BLOCKCOPY_SP_422(32, 48, cpu); \ + SETUP_CHROMA_BLOCKCOPY_SP_422(32, 64, cpu); + +#define SETUP_CHROMA_PIXELSUB(W, H, cpu) \ + p.chroma[X265_CSP_I420].sub_ps[CHROMA_ ## W ## x ## H] = x265_pixel_sub_ps_ ## W ## x ## H ## cpu; \ + p.chroma[X265_CSP_I420].add_ps[CHROMA_ ## W ## x ## H] = x265_pixel_add_ps_ ## W ## x ## H ## cpu; + +#define CHROMA_PIXELSUB_PS(cpu) \ + SETUP_CHROMA_PIXELSUB(4, 4, cpu); \ + SETUP_CHROMA_PIXELSUB(8, 8, cpu); \ + SETUP_CHROMA_PIXELSUB(16, 16, cpu); \ + SETUP_CHROMA_PIXELSUB(32, 32, cpu); + +#define SETUP_CHROMA_PIXELSUB_422(W, H, cpu) \ + p.chroma[X265_CSP_I422].sub_ps[CHROMA422_ ## W ## x ## H] = x265_pixel_sub_ps_ ## W ## x ## H ## cpu; \ + p.chroma[X265_CSP_I422].add_ps[CHROMA422_ ## W ## x ## H] = x265_pixel_add_ps_ ## W ## x ## H ## cpu; + +#define CHROMA_PIXELSUB_PS_422(cpu) \ + SETUP_CHROMA_PIXELSUB_422(4, 8, cpu); \ + SETUP_CHROMA_PIXELSUB_422(8, 16, 
cpu); \ + SETUP_CHROMA_PIXELSUB_422(16, 32, cpu); \ + SETUP_CHROMA_PIXELSUB_422(32, 64, cpu); + +#define LUMA_FILTERS(cpu) \ + SETUP_LUMA_FUNC_DEF(4, 4, cpu); \ + SETUP_LUMA_FUNC_DEF(8, 8, cpu); \ + SETUP_LUMA_FUNC_DEF(8, 4, cpu); \ + SETUP_LUMA_FUNC_DEF(4, 8, cpu); \ + SETUP_LUMA_FUNC_DEF(16, 16, cpu); \ + SETUP_LUMA_FUNC_DEF(16, 8, cpu); \ + SETUP_LUMA_FUNC_DEF(8, 16, cpu); \ + SETUP_LUMA_FUNC_DEF(16, 12, cpu); \ + SETUP_LUMA_FUNC_DEF(12, 16, cpu); \ + SETUP_LUMA_FUNC_DEF(16, 4, cpu); \ + SETUP_LUMA_FUNC_DEF(4, 16, cpu); \ + SETUP_LUMA_FUNC_DEF(32, 32, cpu); \ + SETUP_LUMA_FUNC_DEF(32, 16, cpu); \ + SETUP_LUMA_FUNC_DEF(16, 32, cpu); \ + SETUP_LUMA_FUNC_DEF(32, 24, cpu); \ + SETUP_LUMA_FUNC_DEF(24, 32, cpu); \ + SETUP_LUMA_FUNC_DEF(32, 8, cpu); \ + SETUP_LUMA_FUNC_DEF(8, 32, cpu); \ + SETUP_LUMA_FUNC_DEF(64, 64, cpu); \ + SETUP_LUMA_FUNC_DEF(64, 32, cpu); \ + SETUP_LUMA_FUNC_DEF(32, 64, cpu); \ + SETUP_LUMA_FUNC_DEF(64, 48, cpu); \ + SETUP_LUMA_FUNC_DEF(48, 64, cpu); \ + SETUP_LUMA_FUNC_DEF(64, 16, cpu); \ + SETUP_LUMA_FUNC_DEF(16, 64, cpu); + +#define LUMA_PIXELSUB(cpu) \ + SETUP_LUMA_SUB_FUNC_DEF(4, 4, cpu); \ + SETUP_LUMA_SUB_FUNC_DEF(8, 8, cpu); \ + SETUP_LUMA_SUB_FUNC_DEF(16, 16, cpu); \ + SETUP_LUMA_SUB_FUNC_DEF(32, 32, cpu); \ + SETUP_LUMA_SUB_FUNC_DEF(64, 64, cpu); + +#define LUMA_SP_FILTERS(cpu) \ + SETUP_LUMA_SP_FUNC_DEF(4, 4, cpu); \ + SETUP_LUMA_SP_FUNC_DEF(8, 8, cpu); \ + SETUP_LUMA_SP_FUNC_DEF(8, 4, cpu); \ + SETUP_LUMA_SP_FUNC_DEF(4, 8, cpu); \ + SETUP_LUMA_SP_FUNC_DEF(16, 16, cpu); \ + SETUP_LUMA_SP_FUNC_DEF(16, 8, cpu); \ + SETUP_LUMA_SP_FUNC_DEF(8, 16, cpu); \ + SETUP_LUMA_SP_FUNC_DEF(16, 12, cpu); \ + SETUP_LUMA_SP_FUNC_DEF(12, 16, cpu); \ + SETUP_LUMA_SP_FUNC_DEF(16, 4, cpu); \ + SETUP_LUMA_SP_FUNC_DEF(4, 16, cpu); \ + SETUP_LUMA_SP_FUNC_DEF(32, 32, cpu); \ + SETUP_LUMA_SP_FUNC_DEF(32, 16, cpu); \ + SETUP_LUMA_SP_FUNC_DEF(16, 32, cpu); \ + SETUP_LUMA_SP_FUNC_DEF(32, 24, cpu); \ + SETUP_LUMA_SP_FUNC_DEF(24, 32, cpu); \ + 
SETUP_LUMA_SP_FUNC_DEF(32, 8, cpu); \ + SETUP_LUMA_SP_FUNC_DEF(8, 32, cpu); \ + SETUP_LUMA_SP_FUNC_DEF(64, 64, cpu); \ + SETUP_LUMA_SP_FUNC_DEF(64, 32, cpu); \ + SETUP_LUMA_SP_FUNC_DEF(32, 64, cpu); \ + SETUP_LUMA_SP_FUNC_DEF(64, 48, cpu); \ + SETUP_LUMA_SP_FUNC_DEF(48, 64, cpu); \ + SETUP_LUMA_SP_FUNC_DEF(64, 16, cpu); \ + SETUP_LUMA_SP_FUNC_DEF(16, 64, cpu); + +#define LUMA_SS_FILTERS(cpu) \ + SETUP_LUMA_SS_FUNC_DEF(4, 4, cpu); \ + SETUP_LUMA_SS_FUNC_DEF(8, 8, cpu); \ + SETUP_LUMA_SS_FUNC_DEF(8, 4, cpu); \ + SETUP_LUMA_SS_FUNC_DEF(4, 8, cpu); \ + SETUP_LUMA_SS_FUNC_DEF(16, 16, cpu); \ + SETUP_LUMA_SS_FUNC_DEF(16, 8, cpu); \ + SETUP_LUMA_SS_FUNC_DEF(8, 16, cpu); \ + SETUP_LUMA_SS_FUNC_DEF(16, 12, cpu); \ + SETUP_LUMA_SS_FUNC_DEF(12, 16, cpu); \ + SETUP_LUMA_SS_FUNC_DEF(16, 4, cpu); \ + SETUP_LUMA_SS_FUNC_DEF(4, 16, cpu); \ + SETUP_LUMA_SS_FUNC_DEF(32, 32, cpu); \ + SETUP_LUMA_SS_FUNC_DEF(32, 16, cpu); \ + SETUP_LUMA_SS_FUNC_DEF(16, 32, cpu); \ + SETUP_LUMA_SS_FUNC_DEF(32, 24, cpu); \ + SETUP_LUMA_SS_FUNC_DEF(24, 32, cpu); \ + SETUP_LUMA_SS_FUNC_DEF(32, 8, cpu); \ + SETUP_LUMA_SS_FUNC_DEF(8, 32, cpu); \ + SETUP_LUMA_SS_FUNC_DEF(64, 64, cpu); \ + SETUP_LUMA_SS_FUNC_DEF(64, 32, cpu); \ + SETUP_LUMA_SS_FUNC_DEF(32, 64, cpu); \ + SETUP_LUMA_SS_FUNC_DEF(64, 48, cpu); \ + SETUP_LUMA_SS_FUNC_DEF(48, 64, cpu); \ + SETUP_LUMA_SS_FUNC_DEF(64, 16, cpu); \ + SETUP_LUMA_SS_FUNC_DEF(16, 64, cpu); + +#define SETUP_PIXEL_VAR_DEF(W, H, cpu) \ + p.var[BLOCK_ ## W ## x ## H] = x265_pixel_var_ ## W ## x ## H ## cpu; + +#define LUMA_VAR(cpu) \ + SETUP_PIXEL_VAR_DEF(8, 8, cpu); \ + SETUP_PIXEL_VAR_DEF(16, 16, cpu); \ + SETUP_PIXEL_VAR_DEF(32, 32, cpu); \ + SETUP_PIXEL_VAR_DEF(64, 64, cpu); + +#define SETUP_PIXEL_SSE_SP_DEF(W, H, cpu) \ + p.sse_sp[LUMA_ ## W ## x ## H] = x265_pixel_ssd_sp_ ## W ## x ## H ## cpu; + +#define LUMA_SSE_SP(cpu) \ + SETUP_PIXEL_SSE_SP_DEF(4, 4, cpu); \ + SETUP_PIXEL_SSE_SP_DEF(8, 8, cpu); \ + SETUP_PIXEL_SSE_SP_DEF(8, 4, cpu); \ + SETUP_PIXEL_SSE_SP_DEF(4, 8, 
cpu); \ + SETUP_PIXEL_SSE_SP_DEF(16, 16, cpu); \ + SETUP_PIXEL_SSE_SP_DEF(16, 8, cpu); \ + SETUP_PIXEL_SSE_SP_DEF(8, 16, cpu); \ + SETUP_PIXEL_SSE_SP_DEF(16, 12, cpu); \ + SETUP_PIXEL_SSE_SP_DEF(12, 16, cpu); \ + SETUP_PIXEL_SSE_SP_DEF(16, 4, cpu); \ + SETUP_PIXEL_SSE_SP_DEF(4, 16, cpu); \ + SETUP_PIXEL_SSE_SP_DEF(32, 32, cpu); \ + SETUP_PIXEL_SSE_SP_DEF(32, 16, cpu); \ + SETUP_PIXEL_SSE_SP_DEF(16, 32, cpu); \ + SETUP_PIXEL_SSE_SP_DEF(32, 24, cpu); \ + SETUP_PIXEL_SSE_SP_DEF(24, 32, cpu); \ + SETUP_PIXEL_SSE_SP_DEF(32, 8, cpu); \ + SETUP_PIXEL_SSE_SP_DEF(8, 32, cpu); \ + SETUP_PIXEL_SSE_SP_DEF(64, 64, cpu); \ + SETUP_PIXEL_SSE_SP_DEF(64, 32, cpu); \ + SETUP_PIXEL_SSE_SP_DEF(32, 64, cpu); \ + SETUP_PIXEL_SSE_SP_DEF(64, 48, cpu); \ + SETUP_PIXEL_SSE_SP_DEF(48, 64, cpu); \ + SETUP_PIXEL_SSE_SP_DEF(64, 16, cpu); \ + SETUP_PIXEL_SSE_SP_DEF(16, 64, cpu); + +#define SETUP_LUMA_ADDAVG_FUNC_DEF(W, H, cpu) \ + p.luma_addAvg[LUMA_ ## W ## x ## H] = x265_addAvg_ ## W ## x ## H ## cpu; + +#define LUMA_ADDAVG(cpu) \ + SETUP_LUMA_ADDAVG_FUNC_DEF(4, 4, cpu); \ + SETUP_LUMA_ADDAVG_FUNC_DEF(4, 8, cpu); \ + SETUP_LUMA_ADDAVG_FUNC_DEF(4, 16, cpu); \ + SETUP_LUMA_ADDAVG_FUNC_DEF(8, 4, cpu); \ + SETUP_LUMA_ADDAVG_FUNC_DEF(8, 8, cpu); \ + SETUP_LUMA_ADDAVG_FUNC_DEF(8, 16, cpu); \ + SETUP_LUMA_ADDAVG_FUNC_DEF(8, 32, cpu); \ + SETUP_LUMA_ADDAVG_FUNC_DEF(12, 16, cpu); \ + SETUP_LUMA_ADDAVG_FUNC_DEF(16, 4, cpu); \ + SETUP_LUMA_ADDAVG_FUNC_DEF(16, 8, cpu); \ + SETUP_LUMA_ADDAVG_FUNC_DEF(16, 12, cpu); \ + SETUP_LUMA_ADDAVG_FUNC_DEF(16, 16, cpu); \ + SETUP_LUMA_ADDAVG_FUNC_DEF(16, 32, cpu); \ + SETUP_LUMA_ADDAVG_FUNC_DEF(24, 32, cpu); \ + SETUP_LUMA_ADDAVG_FUNC_DEF(16, 64, cpu); \ + SETUP_LUMA_ADDAVG_FUNC_DEF(32, 8, cpu); \ + SETUP_LUMA_ADDAVG_FUNC_DEF(32, 16, cpu); \ + SETUP_LUMA_ADDAVG_FUNC_DEF(32, 24, cpu); \ + SETUP_LUMA_ADDAVG_FUNC_DEF(32, 32, cpu); \ + SETUP_LUMA_ADDAVG_FUNC_DEF(32, 64, cpu); \ + SETUP_LUMA_ADDAVG_FUNC_DEF(48, 64, cpu); \ + SETUP_LUMA_ADDAVG_FUNC_DEF(64, 16, cpu); \ + 
SETUP_LUMA_ADDAVG_FUNC_DEF(64, 32, cpu); \ + SETUP_LUMA_ADDAVG_FUNC_DEF(64, 48, cpu); \ + SETUP_LUMA_ADDAVG_FUNC_DEF(64, 64, cpu); \ + +#define SETUP_CHROMA_ADDAVG_FUNC_DEF(W, H, cpu) \ + p.chroma[X265_CSP_I420].addAvg[CHROMA_ ## W ## x ## H] = x265_addAvg_ ## W ## x ## H ## cpu; + +#define CHROMA_ADDAVG(cpu) \ + SETUP_CHROMA_ADDAVG_FUNC_DEF(2, 4, cpu); \ + SETUP_CHROMA_ADDAVG_FUNC_DEF(2, 8, cpu); \ + SETUP_CHROMA_ADDAVG_FUNC_DEF(4, 2, cpu); \ + SETUP_CHROMA_ADDAVG_FUNC_DEF(4, 4, cpu); \ + SETUP_CHROMA_ADDAVG_FUNC_DEF(4, 8, cpu); \ + SETUP_CHROMA_ADDAVG_FUNC_DEF(4, 16, cpu); \ + SETUP_CHROMA_ADDAVG_FUNC_DEF(6, 8, cpu); \ + SETUP_CHROMA_ADDAVG_FUNC_DEF(8, 2, cpu); \ + SETUP_CHROMA_ADDAVG_FUNC_DEF(8, 4, cpu); \ + SETUP_CHROMA_ADDAVG_FUNC_DEF(8, 6, cpu); \ + SETUP_CHROMA_ADDAVG_FUNC_DEF(8, 8, cpu); \ + SETUP_CHROMA_ADDAVG_FUNC_DEF(8, 16, cpu); \ + SETUP_CHROMA_ADDAVG_FUNC_DEF(8, 32, cpu); \ + SETUP_CHROMA_ADDAVG_FUNC_DEF(12, 16, cpu); \ + SETUP_CHROMA_ADDAVG_FUNC_DEF(16, 4, cpu); \ + SETUP_CHROMA_ADDAVG_FUNC_DEF(16, 8, cpu); \ + SETUP_CHROMA_ADDAVG_FUNC_DEF(16, 12, cpu); \ + SETUP_CHROMA_ADDAVG_FUNC_DEF(16, 16, cpu); \ + SETUP_CHROMA_ADDAVG_FUNC_DEF(16, 32, cpu); \ + SETUP_CHROMA_ADDAVG_FUNC_DEF(24, 32, cpu); \ + SETUP_CHROMA_ADDAVG_FUNC_DEF(32, 8, cpu); \ + SETUP_CHROMA_ADDAVG_FUNC_DEF(32, 16, cpu); \ + SETUP_CHROMA_ADDAVG_FUNC_DEF(32, 24, cpu); \ + SETUP_CHROMA_ADDAVG_FUNC_DEF(32, 32, cpu); + +#define SETUP_CHROMA_ADDAVG_FUNC_DEF_422(W, H, cpu) \ + p.chroma[X265_CSP_I422].addAvg[CHROMA422_ ## W ## x ## H] = x265_addAvg_ ## W ## x ## H ## cpu; + +#define CHROMA_ADDAVG_422(cpu) \ + SETUP_CHROMA_ADDAVG_FUNC_DEF_422(2, 8, cpu); \ + SETUP_CHROMA_ADDAVG_FUNC_DEF_422(2, 16, cpu); \ + SETUP_CHROMA_ADDAVG_FUNC_DEF_422(4, 4, cpu); \ + SETUP_CHROMA_ADDAVG_FUNC_DEF_422(4, 8, cpu); \ + SETUP_CHROMA_ADDAVG_FUNC_DEF_422(4, 16, cpu); \ + SETUP_CHROMA_ADDAVG_FUNC_DEF_422(4, 32, cpu); \ + SETUP_CHROMA_ADDAVG_FUNC_DEF_422(6, 16, cpu); \ + SETUP_CHROMA_ADDAVG_FUNC_DEF_422(8, 4, cpu); \ 
+ SETUP_CHROMA_ADDAVG_FUNC_DEF_422(8, 8, cpu); \ + SETUP_CHROMA_ADDAVG_FUNC_DEF_422(8, 12, cpu); \ + SETUP_CHROMA_ADDAVG_FUNC_DEF_422(8, 16, cpu); \ + SETUP_CHROMA_ADDAVG_FUNC_DEF_422(8, 32, cpu); \ + SETUP_CHROMA_ADDAVG_FUNC_DEF_422(8, 64, cpu); \ + SETUP_CHROMA_ADDAVG_FUNC_DEF_422(12, 32, cpu); \ + SETUP_CHROMA_ADDAVG_FUNC_DEF_422(16, 8, cpu); \ + SETUP_CHROMA_ADDAVG_FUNC_DEF_422(16, 16, cpu); \ + SETUP_CHROMA_ADDAVG_FUNC_DEF_422(16, 24, cpu); \ + SETUP_CHROMA_ADDAVG_FUNC_DEF_422(16, 32, cpu); \ + SETUP_CHROMA_ADDAVG_FUNC_DEF_422(16, 64, cpu); \ + SETUP_CHROMA_ADDAVG_FUNC_DEF_422(24, 64, cpu); \ + SETUP_CHROMA_ADDAVG_FUNC_DEF_422(32, 16, cpu); \ + SETUP_CHROMA_ADDAVG_FUNC_DEF_422(32, 32, cpu); \ + SETUP_CHROMA_ADDAVG_FUNC_DEF_422(32, 48, cpu); \ + SETUP_CHROMA_ADDAVG_FUNC_DEF_422(32, 64, cpu); + +#define SETUP_INTRA_ANG_COMMON(mode, fno, cpu) \ + p.intra_pred[mode][BLOCK_4x4] = x265_intra_pred_ang4_ ## fno ## _ ## cpu; \ + p.intra_pred[mode][BLOCK_8x8] = x265_intra_pred_ang8_ ## fno ## _ ## cpu; \ + p.intra_pred[mode][BLOCK_16x16] = x265_intra_pred_ang16_ ## fno ## _ ## cpu; \ + p.intra_pred[mode][BLOCK_32x32] = x265_intra_pred_ang32_ ## fno ## _ ## cpu; + +#define SETUP_INTRA_ANG(mode, fno, cpu) \ + p.intra_pred[mode][BLOCK_8x8] = x265_intra_pred_ang8_ ## fno ## _ ## cpu; \ + p.intra_pred[mode][BLOCK_16x16] = x265_intra_pred_ang16_ ## fno ## _ ## cpu; \ + p.intra_pred[mode][BLOCK_32x32] = x265_intra_pred_ang32_ ## fno ## _ ## cpu; + +#define SETUP_INTRA_ANG4(mode, fno, cpu) \ + p.intra_pred[mode][BLOCK_4x4] = x265_intra_pred_ang4_ ## fno ## _ ## cpu; + +#define SETUP_INTRA_ANG16_32(mode, fno, cpu) \ + p.intra_pred[mode][BLOCK_16x16] = x265_intra_pred_ang16_ ## fno ## _ ## cpu; \ + p.intra_pred[mode][BLOCK_32x32] = x265_intra_pred_ang32_ ## fno ## _ ## cpu; + +#define SETUP_INTRA_ANG4_8(mode, fno, cpu) \ + p.intra_pred[mode][BLOCK_4x4] = x265_intra_pred_ang4_ ## fno ## _ ## cpu; \ + p.intra_pred[mode][BLOCK_8x8] = x265_intra_pred_ang8_ ## fno ## _ ## cpu; + 
+#define INTRA_ANG_SSSE3(cpu) \ + SETUP_INTRA_ANG_COMMON(2, 2, cpu); \ + SETUP_INTRA_ANG_COMMON(34, 2, cpu); + +#define INTRA_ANG_SSE4_COMMON(cpu) \ + SETUP_INTRA_ANG_COMMON(3, 3, cpu); \ + SETUP_INTRA_ANG_COMMON(4, 4, cpu); \ + SETUP_INTRA_ANG_COMMON(5, 5, cpu); \ + SETUP_INTRA_ANG_COMMON(6, 6, cpu); \ + SETUP_INTRA_ANG_COMMON(7, 7, cpu); \ + SETUP_INTRA_ANG_COMMON(8, 8, cpu); \ + SETUP_INTRA_ANG_COMMON(9, 9, cpu); \ + SETUP_INTRA_ANG_COMMON(10, 10, cpu); \ + SETUP_INTRA_ANG_COMMON(11, 11, cpu); \ + SETUP_INTRA_ANG_COMMON(12, 12, cpu); \ + SETUP_INTRA_ANG_COMMON(13, 13, cpu); \ + SETUP_INTRA_ANG_COMMON(14, 14, cpu); \ + SETUP_INTRA_ANG_COMMON(15, 15, cpu); \ + SETUP_INTRA_ANG_COMMON(16, 16, cpu); \ + SETUP_INTRA_ANG_COMMON(17, 17, cpu); \ + SETUP_INTRA_ANG_COMMON(18, 18, cpu); + +#define INTRA_ANG_SSE4_HIGH(cpu) \ + SETUP_INTRA_ANG(19, 19, cpu); \ + SETUP_INTRA_ANG(20, 20, cpu); \ + SETUP_INTRA_ANG(21, 21, cpu); \ + SETUP_INTRA_ANG(22, 22, cpu); \ + SETUP_INTRA_ANG(23, 23, cpu); \ + SETUP_INTRA_ANG(24, 24, cpu); \ + SETUP_INTRA_ANG(25, 25, cpu); \ + SETUP_INTRA_ANG(26, 26, cpu); \ + SETUP_INTRA_ANG(27, 27, cpu); \ + SETUP_INTRA_ANG(28, 28, cpu); \ + SETUP_INTRA_ANG(29, 29, cpu); \ + SETUP_INTRA_ANG(30, 30, cpu); \ + SETUP_INTRA_ANG(31, 31, cpu); \ + SETUP_INTRA_ANG(32, 32, cpu); \ + SETUP_INTRA_ANG(33, 33, cpu); \ + SETUP_INTRA_ANG4(19, 17, cpu); \ + SETUP_INTRA_ANG4(20, 16, cpu); \ + SETUP_INTRA_ANG4(21, 15, cpu); \ + SETUP_INTRA_ANG4(22, 14, cpu); \ + SETUP_INTRA_ANG4(23, 13, cpu); \ + SETUP_INTRA_ANG4(24, 12, cpu); \ + SETUP_INTRA_ANG4(25, 11, cpu); \ + SETUP_INTRA_ANG4(26, 26, cpu); \ + SETUP_INTRA_ANG4(27, 9, cpu); \ + SETUP_INTRA_ANG4(28, 8, cpu); \ + SETUP_INTRA_ANG4(29, 7, cpu); \ + SETUP_INTRA_ANG4(30, 6, cpu); \ + SETUP_INTRA_ANG4(31, 5, cpu); \ + SETUP_INTRA_ANG4(32, 4, cpu); \ + SETUP_INTRA_ANG4(33, 3, cpu); + +#define INTRA_ANG_SSE4(cpu) \ + SETUP_INTRA_ANG4_8(19, 17, cpu); \ + SETUP_INTRA_ANG4_8(20, 16, cpu); \ + SETUP_INTRA_ANG4_8(21, 15, cpu); \ + 
SETUP_INTRA_ANG4_8(22, 14, cpu); \ + SETUP_INTRA_ANG4_8(23, 13, cpu); \ + SETUP_INTRA_ANG4_8(24, 12, cpu); \ + SETUP_INTRA_ANG4_8(25, 11, cpu); \ + SETUP_INTRA_ANG4_8(26, 26, cpu); \ + SETUP_INTRA_ANG4_8(27, 9, cpu); \ + SETUP_INTRA_ANG4_8(28, 8, cpu); \ + SETUP_INTRA_ANG4_8(29, 7, cpu); \ + SETUP_INTRA_ANG4_8(30, 6, cpu); \ + SETUP_INTRA_ANG4_8(31, 5, cpu); \ + SETUP_INTRA_ANG4_8(32, 4, cpu); \ + SETUP_INTRA_ANG4_8(33, 3, cpu); \ + SETUP_INTRA_ANG16_32(19, 19, cpu); \ + SETUP_INTRA_ANG16_32(20, 20, cpu); \ + SETUP_INTRA_ANG16_32(21, 21, cpu); \ + SETUP_INTRA_ANG16_32(22, 22, cpu); \ + SETUP_INTRA_ANG16_32(23, 23, cpu); \ + SETUP_INTRA_ANG16_32(24, 24, cpu); \ + SETUP_INTRA_ANG16_32(25, 25, cpu); \ + SETUP_INTRA_ANG16_32(26, 26, cpu); \ + SETUP_INTRA_ANG16_32(27, 27, cpu); \ + SETUP_INTRA_ANG16_32(28, 28, cpu); \ + SETUP_INTRA_ANG16_32(29, 29, cpu); \ + SETUP_INTRA_ANG16_32(30, 30, cpu); \ + SETUP_INTRA_ANG16_32(31, 31, cpu); \ + SETUP_INTRA_ANG16_32(32, 32, cpu); \ + SETUP_INTRA_ANG16_32(33, 33, cpu); + +#define SETUP_CHROMA_VERT_FUNC_DEF(W, H, cpu) \ + p.chroma[X265_CSP_I420].filter_vss[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu; \ + p.chroma[X265_CSP_I420].filter_vpp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \ + p.chroma[X265_CSP_I420].filter_vps[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu; \ + p.chroma[X265_CSP_I420].filter_vsp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu; + +#define CHROMA_VERT_FILTERS(cpu) \ + SETUP_CHROMA_VERT_FUNC_DEF(4, 4, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(8, 8, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(8, 4, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(4, 8, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(8, 6, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(8, 2, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(16, 16, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(16, 8, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(8, 16, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(16, 12, cpu); \ + 
SETUP_CHROMA_VERT_FUNC_DEF(12, 16, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(16, 4, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(4, 16, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(32, 32, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(32, 16, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(16, 32, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(32, 24, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(24, 32, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(32, 8, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(8, 32, cpu); + +#define CHROMA_VERT_FILTERS_SSE4(cpu) \ + SETUP_CHROMA_VERT_FUNC_DEF(2, 4, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(2, 8, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(4, 2, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(6, 8, cpu); + +#define SETUP_CHROMA_VERT_FUNC_DEF_422(W, H, cpu) \ + p.chroma[X265_CSP_I422].filter_vss[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu; \ + p.chroma[X265_CSP_I422].filter_vpp[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \ + p.chroma[X265_CSP_I422].filter_vps[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu; \ + p.chroma[X265_CSP_I422].filter_vsp[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu; + +#define CHROMA_VERT_FILTERS_422(cpu) \ + SETUP_CHROMA_VERT_FUNC_DEF_422(4, 8, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF_422(8, 16, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF_422(8, 8, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF_422(4, 16, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF_422(8, 12, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF_422(8, 4, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF_422(16, 32, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF_422(16, 16, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF_422(8, 32, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF_422(16, 24, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF_422(12, 32, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF_422(16, 8, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF_422(4, 32, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF_422(32, 64, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF_422(32, 32, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF_422(16, 64, cpu); \ + 
SETUP_CHROMA_VERT_FUNC_DEF_422(32, 48, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF_422(24, 64, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF_422(32, 16, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF_422(8, 64, cpu); + +#define CHROMA_VERT_FILTERS_SSE4_422(cpu) \ + SETUP_CHROMA_VERT_FUNC_DEF_422(2, 8, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF_422(2, 16, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF_422(4, 4, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF_422(6, 16, cpu); + +#define SETUP_CHROMA_VERT_FUNC_DEF_444(W, H, cpu) \ + p.chroma[X265_CSP_I444].filter_vss[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu; \ + p.chroma[X265_CSP_I444].filter_vpp[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \ + p.chroma[X265_CSP_I444].filter_vps[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu; \ + p.chroma[X265_CSP_I444].filter_vsp[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu; + +#define CHROMA_VERT_FILTERS_444(cpu) \ + SETUP_CHROMA_VERT_FUNC_DEF_444(8, 8, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF_444(8, 4, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF_444(4, 8, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF_444(16, 16, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF_444(16, 8, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF_444(8, 16, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF_444(16, 12, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF_444(12, 16, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF_444(16, 4, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF_444(4, 16, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF_444(32, 32, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF_444(32, 16, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF_444(16, 32, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF_444(32, 24, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF_444(24, 32, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF_444(32, 8, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF_444(8, 32, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF_444(64, 64, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF_444(64, 32, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF_444(32, 64, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF_444(64, 48, cpu); \ + 
SETUP_CHROMA_VERT_FUNC_DEF_444(48, 64, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF_444(64, 16, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF_444(16, 64, cpu); + +#define SETUP_CHROMA_HORIZ_FUNC_DEF(W, H, cpu) \ + p.chroma[X265_CSP_I420].filter_hpp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \ + p.chroma[X265_CSP_I420].filter_hps[CHROMA_ ## W ## x ## H] = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu; + +#define CHROMA_HORIZ_FILTERS(cpu) \ + SETUP_CHROMA_HORIZ_FUNC_DEF(4, 4, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(4, 2, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(2, 4, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(8, 8, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(8, 4, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(4, 8, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(8, 6, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(6, 8, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(8, 2, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(2, 8, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(16, 16, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(16, 8, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(8, 16, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(16, 12, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(12, 16, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(16, 4, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(4, 16, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(32, 32, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(32, 16, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(16, 32, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(32, 24, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(24, 32, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(32, 8, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(8, 32, cpu); + +#define SETUP_CHROMA_HORIZ_FUNC_DEF_422(W, H, cpu) \ + p.chroma[X265_CSP_I422].filter_hpp[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \ + p.chroma[X265_CSP_I422].filter_hps[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu; + +#define CHROMA_HORIZ_FILTERS_422(cpu) \ + SETUP_CHROMA_HORIZ_FUNC_DEF_422(4, 8, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF_422(4, 4, cpu); \ + 
SETUP_CHROMA_HORIZ_FUNC_DEF_422(2, 8, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF_422(8, 16, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF_422(8, 8, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF_422(4, 16, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF_422(8, 12, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF_422(6, 16, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF_422(8, 4, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF_422(2, 16, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF_422(16, 32, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF_422(16, 16, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF_422(8, 32, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF_422(16, 24, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF_422(12, 32, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF_422(16, 8, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF_422(4, 32, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF_422(32, 64, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF_422(32, 32, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF_422(16, 64, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF_422(32, 48, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF_422(24, 64, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF_422(32, 16, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF_422(8, 64, cpu); + +#define SETUP_CHROMA_HORIZ_FUNC_DEF_444(W, H, cpu) \ + p.chroma[X265_CSP_I444].filter_hpp[LUMA_ ## W ## x ## H] = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \ + p.chroma[X265_CSP_I444].filter_hps[LUMA_ ## W ## x ## H] = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu; + +#define CHROMA_HORIZ_FILTERS_444(cpu) \ + SETUP_CHROMA_HORIZ_FUNC_DEF_444(8, 8, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF_444(8, 4, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF_444(4, 8, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF_444(16, 16, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF_444(16, 8, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF_444(8, 16, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF_444(16, 12, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF_444(12, 16, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF_444(16, 4, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF_444(4, 16, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF_444(32, 32, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF_444(32, 16, cpu); \ + 
SETUP_CHROMA_HORIZ_FUNC_DEF_444(16, 32, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF_444(32, 24, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF_444(24, 32, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF_444(32, 8, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF_444(8, 32, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF_444(64, 64, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF_444(64, 32, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF_444(32, 64, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF_444(64, 48, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF_444(48, 64, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF_444(64, 16, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF_444(16, 64, cpu); + +namespace x265 { +// private x265 namespace + +#if HIGH_BIT_DEPTH +/* Very similar to CRef in intrapred.cpp, except it uses optimized primitives */ +template +void intra_allangs(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, int bLuma) +{ + const int size = 1 << log2Size; + const int sizeIdx = log2Size - 2; + ALIGN_VAR_32(pixel, buffer[32 * 32]); + + for (int mode = 2; mode <= 34; mode++) + { + pixel *left = (g_intraFilterFlags[mode] & size ? left1 : left0); + pixel *above = (g_intraFilterFlags[mode] & size ? 
above1 : above0); + pixel *out = dest + ((mode - 2) << (log2Size * 2)); + + if (mode < 18) + { + primitives.intra_pred[mode][sizeIdx](buffer, size, left, above, mode, bLuma); + primitives.transpose[sizeIdx](out, buffer, size); + } + else + primitives.intra_pred[mode][sizeIdx](out, size, left, above, mode, bLuma); + } +} +#endif + +void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask) +{ +#if HIGH_BIT_DEPTH + if (cpuMask & X265_CPU_SSE2) + { + INIT8(sad, _mmx2); + INIT2(sad, _sse2); + SAD(sse2); + + INIT6(satd, _sse2); + HEVC_SATD(sse2); + p.satd[LUMA_4x4] = x265_pixel_satd_4x4_mmx2; + + p.sa8d_inter[LUMA_4x4] = x265_pixel_satd_4x4_mmx2; + SA8D_INTER_FROM_BLOCK(sse2); + p.sa8d_inter[LUMA_8x8] = x265_pixel_sa8d_8x8_sse2; + p.sa8d_inter[LUMA_16x16] = x265_pixel_sa8d_16x16_sse2; + + p.sse_ss[LUMA_4x4] = x265_pixel_ssd_ss_4x4_mmx2; + p.sse_ss[LUMA_4x8] = x265_pixel_ssd_ss_4x8_mmx2; + p.sse_ss[LUMA_4x16] = x265_pixel_ssd_ss_4x16_mmx2; + p.sse_ss[LUMA_8x4] = x265_pixel_ssd_ss_8x4_sse2; + p.sse_ss[LUMA_8x8] = x265_pixel_ssd_ss_8x8_sse2; + p.sse_ss[LUMA_8x16] = x265_pixel_ssd_ss_8x16_sse2; + p.sse_ss[LUMA_8x32] = x265_pixel_ssd_ss_8x32_sse2; + p.sse_ss[LUMA_12x16] = x265_pixel_ssd_ss_12x16_sse2; + p.sse_ss[LUMA_16x4] = x265_pixel_ssd_ss_16x4_sse2; + p.sse_ss[LUMA_16x8] = x265_pixel_ssd_ss_16x8_sse2; + p.sse_ss[LUMA_16x12] = x265_pixel_ssd_ss_16x12_sse2; + p.sse_ss[LUMA_16x16] = x265_pixel_ssd_ss_16x16_sse2; + p.sse_ss[LUMA_16x32] = x265_pixel_ssd_ss_16x32_sse2; + p.sse_ss[LUMA_16x64] = x265_pixel_ssd_ss_16x64_sse2; + p.sse_ss[LUMA_24x32] = x265_pixel_ssd_ss_24x32_sse2; + p.sse_ss[LUMA_32x8] = x265_pixel_ssd_ss_32x8_sse2; + p.sse_ss[LUMA_32x16] = x265_pixel_ssd_ss_32x16_sse2; + p.sse_ss[LUMA_32x24] = x265_pixel_ssd_ss_32x24_sse2; + p.sse_ss[LUMA_32x32] = x265_pixel_ssd_ss_32x32_sse2; + p.sse_ss[LUMA_32x64] = x265_pixel_ssd_ss_32x64_sse2; + p.sse_ss[LUMA_48x64] = x265_pixel_ssd_ss_48x64_sse2; + p.sse_ss[LUMA_64x16] = x265_pixel_ssd_ss_64x16_sse2; + 
p.sse_ss[LUMA_64x32] = x265_pixel_ssd_ss_64x32_sse2; + p.sse_ss[LUMA_64x48] = x265_pixel_ssd_ss_64x48_sse2; + p.sse_ss[LUMA_64x64] = x265_pixel_ssd_ss_64x64_sse2; + + p.transpose[BLOCK_4x4] = x265_transpose4_sse2; + p.transpose[BLOCK_8x8] = x265_transpose8_sse2; + p.transpose[BLOCK_16x16] = x265_transpose16_sse2; + p.transpose[BLOCK_32x32] = x265_transpose32_sse2; + p.transpose[BLOCK_64x64] = x265_transpose64_sse2; + + p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_sse2; + p.ssim_end_4 = x265_pixel_ssim_end4_sse2; + PIXEL_AVG(sse2); + PIXEL_AVG_W4(mmx2); + LUMA_VAR(_sse2); + + SAD_X3(sse2); + p.sad_x3[LUMA_4x4] = x265_pixel_sad_x3_4x4_mmx2; + p.sad_x3[LUMA_4x8] = x265_pixel_sad_x3_4x8_mmx2; + p.sad_x3[LUMA_4x16] = x265_pixel_sad_x3_4x16_mmx2; + p.sad_x3[LUMA_8x4] = x265_pixel_sad_x3_8x4_sse2; + p.sad_x3[LUMA_8x8] = x265_pixel_sad_x3_8x8_sse2; + p.sad_x3[LUMA_8x16] = x265_pixel_sad_x3_8x16_sse2; + p.sad_x3[LUMA_8x32] = x265_pixel_sad_x3_8x32_sse2; + p.sad_x3[LUMA_16x4] = x265_pixel_sad_x3_16x4_sse2; + p.sad_x3[LUMA_12x16] = x265_pixel_sad_x3_12x16_mmx2; + + SAD_X4(sse2); + p.sad_x4[LUMA_4x4] = x265_pixel_sad_x4_4x4_mmx2; + p.sad_x4[LUMA_4x8] = x265_pixel_sad_x4_4x8_mmx2; + p.sad_x4[LUMA_4x16] = x265_pixel_sad_x4_4x16_mmx2; + p.sad_x4[LUMA_8x4] = x265_pixel_sad_x4_8x4_sse2; + p.sad_x4[LUMA_8x8] = x265_pixel_sad_x4_8x8_sse2; + p.sad_x4[LUMA_8x16] = x265_pixel_sad_x4_8x16_sse2; + p.sad_x4[LUMA_8x32] = x265_pixel_sad_x4_8x32_sse2; + p.sad_x4[LUMA_16x4] = x265_pixel_sad_x4_16x4_sse2; + p.sad_x4[LUMA_12x16] = x265_pixel_sad_x4_12x16_mmx2; + + p.cvt32to16_shr = x265_cvt32to16_shr_sse2; + p.cvt32to16_shl[BLOCK_4x4] = x265_cvt32to16_shl_4_sse2; + p.cvt32to16_shl[BLOCK_8x8] = x265_cvt32to16_shl_8_sse2; + p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_sse2; + p.cvt32to16_shl[BLOCK_32x32] = x265_cvt32to16_shl_32_sse2; + + CHROMA_PIXELSUB_PS(_sse2); + CHROMA_PIXELSUB_PS_422(_sse2); + LUMA_PIXELSUB(_sse2); + + CHROMA_BLOCKCOPY(ss, _sse2); + CHROMA_BLOCKCOPY_422(ss, _sse2); 
+ LUMA_BLOCKCOPY(ss, _sse2); + + CHROMA_VERT_FILTERS(_sse2); + CHROMA_VERT_FILTERS_422(_sse2); + CHROMA_VERT_FILTERS_444(_sse2); + p.luma_p2s = x265_luma_p2s_sse2; + p.chroma_p2s[X265_CSP_I420] = x265_chroma_p2s_sse2; + p.chroma_p2s[X265_CSP_I422] = x265_chroma_p2s_sse2; + p.chroma_p2s[X265_CSP_I444] = x265_luma_p2s_sse2; // for i444 , chroma_p2s can be replaced by luma_p2s + + p.blockfill_s[BLOCK_4x4] = x265_blockfill_s_4x4_sse2; + p.blockfill_s[BLOCK_8x8] = x265_blockfill_s_8x8_sse2; + p.blockfill_s[BLOCK_16x16] = x265_blockfill_s_16x16_sse2; + p.blockfill_s[BLOCK_32x32] = x265_blockfill_s_32x32_sse2; + + // TODO: overflow on 12-bits mode! + p.ssd_s[BLOCK_4x4] = x265_pixel_ssd_s_4_sse2; + p.ssd_s[BLOCK_8x8] = x265_pixel_ssd_s_8_sse2; + p.ssd_s[BLOCK_16x16] = x265_pixel_ssd_s_16_sse2; + p.ssd_s[BLOCK_32x32] = x265_pixel_ssd_s_32_sse2; + + p.calcresidual[BLOCK_4x4] = x265_getResidual4_sse2; + p.calcresidual[BLOCK_8x8] = x265_getResidual8_sse2; + p.calcresidual[BLOCK_16x16] = x265_getResidual16_sse2; + p.calcresidual[BLOCK_32x32] = x265_getResidual32_sse2; + + p.dct[DCT_4x4] = x265_dct4_sse2; + p.idct[IDCT_4x4] = x265_idct4_sse2; + p.idct[IDST_4x4] = x265_idst4_sse2; + + LUMA_SS_FILTERS(_sse2); + } + if (cpuMask & X265_CPU_SSSE3) + { + p.scale1D_128to64 = x265_scale1D_128to64_ssse3; + p.scale2D_64to32 = x265_scale2D_64to32_ssse3; + + INTRA_ANG_SSSE3(ssse3); + + p.dct[DST_4x4] = x265_dst4_ssse3; + p.idct[IDCT_8x8] = x265_idct8_ssse3; + p.count_nonzero = x265_count_nonzero_ssse3; + } + if (cpuMask & X265_CPU_SSE4) + { + LUMA_ADDAVG(_sse4); + CHROMA_ADDAVG(_sse4); + CHROMA_ADDAVG_422(_sse4); + LUMA_FILTERS(_sse4); + CHROMA_HORIZ_FILTERS(_sse4); + CHROMA_VERT_FILTERS_SSE4(_sse4); + CHROMA_HORIZ_FILTERS_422(_sse4); + CHROMA_VERT_FILTERS_SSE4_422(_sse4); + CHROMA_HORIZ_FILTERS_444(_sse4); + + p.dct[DCT_8x8] = x265_dct8_sse4; + p.quant = x265_quant_sse4; + p.nquant = x265_nquant_sse4; + p.dequant_normal = x265_dequant_normal_sse4; + p.cvt16to32_shl = 
x265_cvt16to32_shl_sse4; + p.cvt16to32_shr[BLOCK_4x4] = x265_cvt16to32_shr_4_sse4; + p.cvt16to32_shr[BLOCK_8x8] = x265_cvt16to32_shr_8_sse4; + p.cvt16to32_shr[BLOCK_16x16] = x265_cvt16to32_shr_16_sse4; + p.cvt16to32_shr[BLOCK_32x32] = x265_cvt16to32_shr_32_sse4; + p.intra_pred[0][BLOCK_4x4] = x265_intra_pred_planar4_sse4; + p.intra_pred[0][BLOCK_8x8] = x265_intra_pred_planar8_sse4; + p.intra_pred[0][BLOCK_16x16] = x265_intra_pred_planar16_sse4; + p.intra_pred[0][BLOCK_32x32] = x265_intra_pred_planar32_sse4; + + p.intra_pred[1][BLOCK_4x4] = x265_intra_pred_dc4_sse4; + p.intra_pred[1][BLOCK_8x8] = x265_intra_pred_dc8_sse4; + p.intra_pred[1][BLOCK_16x16] = x265_intra_pred_dc16_sse4; + p.intra_pred[1][BLOCK_32x32] = x265_intra_pred_dc32_sse4; + p.planecopy_cp = x265_upShift_8_sse4; + + INTRA_ANG_SSE4_COMMON(sse4); + INTRA_ANG_SSE4_HIGH(sse4); + } + if (cpuMask & X265_CPU_XOP) + { + p.frame_init_lowres_core = x265_frame_init_lowres_core_xop; + SA8D_INTER_FROM_BLOCK(xop); + INIT7(satd, _xop); + HEVC_SATD(xop); + } + if (cpuMask & X265_CPU_AVX2) + { + p.dct[DCT_4x4] = x265_dct4_avx2; + p.quant = x265_quant_avx2; + p.nquant = x265_nquant_avx2; + p.dequant_normal = x265_dequant_normal_avx2; + p.scale1D_128to64 = x265_scale1D_128to64_avx2; +#if X86_64 + p.dct[DCT_8x8] = x265_dct8_avx2; + p.dct[DCT_16x16] = x265_dct16_avx2; + p.dct[DCT_32x32] = x265_dct32_avx2; + p.idct[IDCT_4x4] = x265_idct4_avx2; + p.idct[IDCT_8x8] = x265_idct8_avx2; + p.idct[IDCT_16x16] = x265_idct16_avx2; + p.idct[IDCT_32x32] = x265_idct32_avx2; + + p.transpose[BLOCK_8x8] = x265_transpose8_avx2; + p.transpose[BLOCK_16x16] = x265_transpose16_avx2; + p.transpose[BLOCK_32x32] = x265_transpose32_avx2; + p.transpose[BLOCK_64x64] = x265_transpose64_avx2; +#endif + } + /* at HIGH_BIT_DEPTH, pixel == short so we can reuse a number of primitives */ + for (int i = 0; i < NUM_LUMA_PARTITIONS; i++) + { + p.sse_pp[i] = (pixelcmp_t)p.sse_ss[i]; + p.sse_sp[i] = (pixelcmp_sp_t)p.sse_ss[i]; + } + + for (int i = 0; i < 
NUM_LUMA_PARTITIONS; i++) + { + p.luma_copy_ps[i] = (copy_ps_t)p.luma_copy_ss[i]; + p.luma_copy_sp[i] = (copy_sp_t)p.luma_copy_ss[i]; + p.luma_copy_pp[i] = (copy_pp_t)p.luma_copy_ss[i]; + } + + for (int i = 0; i < NUM_CHROMA_PARTITIONS; i++) + { + p.chroma[X265_CSP_I420].copy_ps[i] = (copy_ps_t)p.chroma[X265_CSP_I420].copy_ss[i]; + p.chroma[X265_CSP_I420].copy_sp[i] = (copy_sp_t)p.chroma[X265_CSP_I420].copy_ss[i]; + p.chroma[X265_CSP_I420].copy_pp[i] = (copy_pp_t)p.chroma[X265_CSP_I420].copy_ss[i]; + } + + for (int i = 0; i < NUM_CHROMA_PARTITIONS; i++) + { + p.chroma[X265_CSP_I422].copy_ps[i] = (copy_ps_t)p.chroma[X265_CSP_I422].copy_ss[i]; + p.chroma[X265_CSP_I422].copy_sp[i] = (copy_sp_t)p.chroma[X265_CSP_I422].copy_ss[i]; + p.chroma[X265_CSP_I422].copy_pp[i] = (copy_pp_t)p.chroma[X265_CSP_I422].copy_ss[i]; + } + + if (p.intra_pred[0][0] && p.transpose[0]) + { + p.intra_pred_allangs[BLOCK_4x4] = intra_allangs<2>; + p.intra_pred_allangs[BLOCK_8x8] = intra_allangs<3>; + p.intra_pred_allangs[BLOCK_16x16] = intra_allangs<4>; + p.intra_pred_allangs[BLOCK_32x32] = intra_allangs<5>; + } + +#else // if HIGH_BIT_DEPTH + if (cpuMask & X265_CPU_SSE2) + { + INIT8_NAME(sse_pp, ssd, _mmx); + INIT8(sad, _mmx2); + INIT8(sad_x3, _mmx2); + INIT8(sad_x4, _mmx2); + p.satd[LUMA_4x4] = x265_pixel_satd_4x4_mmx2; + p.sa8d_inter[LUMA_4x4] = x265_pixel_satd_4x4_mmx2; + p.frame_init_lowres_core = x265_frame_init_lowres_core_mmx2; + + PIXEL_AVG(sse2); + PIXEL_AVG_W4(mmx2); + + LUMA_VAR(_sse2); + + ASSGN_SSE(sse2); + ASSGN_SSE_SS(sse2); + INIT2(sad, _sse2); + SAD(sse2); + INIT2(sad_x3, _sse2); + INIT2(sad_x4, _sse2); + HEVC_SATD(sse2); + + CHROMA_BLOCKCOPY(ss, _sse2); + CHROMA_BLOCKCOPY(pp, _sse2); + CHROMA_BLOCKCOPY_422(ss, _sse2); + CHROMA_BLOCKCOPY_422(pp, _sse2); + LUMA_BLOCKCOPY(ss, _sse2); + LUMA_BLOCKCOPY(pp, _sse2); + LUMA_BLOCKCOPY(sp, _sse2); + CHROMA_BLOCKCOPY_SP(_sse2); + CHROMA_BLOCKCOPY_SP_422(_sse2); + + CHROMA_SS_FILTERS_420(_sse2); + CHROMA_SS_FILTERS_422(_sse2); + 
CHROMA_SS_FILTERS_444(_sse2); + CHROMA_SP_FILTERS_420(_sse2); + CHROMA_SP_FILTERS_422(_sse2); + CHROMA_SP_FILTERS_444(_sse2); + LUMA_SS_FILTERS(_sse2); + + // This function pointer initialization is temporary will be removed + // later with macro definitions. It is used to avoid linker errors + // until all partitions are coded and commit smaller patches, easier to + // review. + + p.blockfill_s[BLOCK_4x4] = x265_blockfill_s_4x4_sse2; + p.blockfill_s[BLOCK_8x8] = x265_blockfill_s_8x8_sse2; + p.blockfill_s[BLOCK_16x16] = x265_blockfill_s_16x16_sse2; + p.blockfill_s[BLOCK_32x32] = x265_blockfill_s_32x32_sse2; + + p.ssd_s[BLOCK_4x4] = x265_pixel_ssd_s_4_sse2; + p.ssd_s[BLOCK_8x8] = x265_pixel_ssd_s_8_sse2; + p.ssd_s[BLOCK_16x16] = x265_pixel_ssd_s_16_sse2; + p.ssd_s[BLOCK_32x32] = x265_pixel_ssd_s_32_sse2; + + p.frame_init_lowres_core = x265_frame_init_lowres_core_sse2; + SA8D_INTER_FROM_BLOCK(sse2); + + p.cvt32to16_shr = x265_cvt32to16_shr_sse2; + p.cvt32to16_shl[BLOCK_4x4] = x265_cvt32to16_shl_4_sse2; + p.cvt32to16_shl[BLOCK_8x8] = x265_cvt32to16_shl_8_sse2; + p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_sse2; + p.cvt32to16_shl[BLOCK_32x32] = x265_cvt32to16_shl_32_sse2; + p.calcresidual[BLOCK_4x4] = x265_getResidual4_sse2; + p.calcresidual[BLOCK_8x8] = x265_getResidual8_sse2; + p.transpose[BLOCK_4x4] = x265_transpose4_sse2; + p.transpose[BLOCK_8x8] = x265_transpose8_sse2; + p.transpose[BLOCK_16x16] = x265_transpose16_sse2; + p.transpose[BLOCK_32x32] = x265_transpose32_sse2; + p.transpose[BLOCK_64x64] = x265_transpose64_sse2; + p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_sse2; + p.ssim_end_4 = x265_pixel_ssim_end4_sse2; + p.dct[DCT_4x4] = x265_dct4_sse2; + p.idct[IDCT_4x4] = x265_idct4_sse2; + p.idct[IDST_4x4] = x265_idst4_sse2; + p.planecopy_sp = x265_downShift_16_sse2; + p.copy_shl[BLOCK_4x4] = x265_copy_shl_4_sse2; + p.copy_shl[BLOCK_8x8] = x265_copy_shl_8_sse2; + p.copy_shl[BLOCK_16x16] = x265_copy_shl_16_sse2; + p.copy_shl[BLOCK_32x32] = 
x265_copy_shl_32_sse2; + } + if (cpuMask & X265_CPU_SSSE3) + { + p.frame_init_lowres_core = x265_frame_init_lowres_core_ssse3; + SA8D_INTER_FROM_BLOCK(ssse3); + p.sse_pp[LUMA_4x4] = x265_pixel_ssd_4x4_ssse3; + ASSGN_SSE(ssse3); + PIXEL_AVG(ssse3); + PIXEL_AVG_W4(ssse3); + + INTRA_ANG_SSSE3(ssse3); + + p.scale1D_128to64 = x265_scale1D_128to64_ssse3; + p.scale2D_64to32 = x265_scale2D_64to32_ssse3; + SAD_X3(ssse3); + SAD_X4(ssse3); + p.sad_x4[LUMA_8x4] = x265_pixel_sad_x4_8x4_ssse3; + p.sad_x4[LUMA_8x8] = x265_pixel_sad_x4_8x8_ssse3; + p.sad_x3[LUMA_8x16] = x265_pixel_sad_x3_8x16_ssse3; + p.sad_x4[LUMA_8x16] = x265_pixel_sad_x4_8x16_ssse3; + p.sad_x3[LUMA_8x32] = x265_pixel_sad_x3_8x32_ssse3; + p.sad_x4[LUMA_8x32] = x265_pixel_sad_x4_8x32_ssse3; + + p.sad_x3[LUMA_12x16] = x265_pixel_sad_x3_12x16_ssse3; + p.sad_x4[LUMA_12x16] = x265_pixel_sad_x4_12x16_ssse3; + + p.luma_hvpp[LUMA_8x8] = x265_interp_8tap_hv_pp_8x8_ssse3; + p.luma_p2s = x265_luma_p2s_ssse3; + p.chroma_p2s[X265_CSP_I420] = x265_chroma_p2s_ssse3; + p.chroma_p2s[X265_CSP_I422] = x265_chroma_p2s_ssse3; + p.chroma_p2s[X265_CSP_I444] = x265_luma_p2s_ssse3; // for i444 , chroma_p2s can be replaced by luma_p2s + + p.dct[DST_4x4] = x265_dst4_ssse3; + p.idct[IDCT_8x8] = x265_idct8_ssse3; + p.count_nonzero = x265_count_nonzero_ssse3; + } + if (cpuMask & X265_CPU_SSE4) + { + p.saoCuOrgE0 = x265_saoCuOrgE0_sse4; + + LUMA_ADDAVG(_sse4); + CHROMA_ADDAVG(_sse4); + CHROMA_ADDAVG_422(_sse4); + p.cvt16to32_shl = x265_cvt16to32_shl_sse4; + p.cvt16to32_shr[BLOCK_4x4] = x265_cvt16to32_shr_4_sse4; + p.cvt16to32_shr[BLOCK_8x8] = x265_cvt16to32_shr_8_sse4; + p.cvt16to32_shr[BLOCK_16x16] = x265_cvt16to32_shr_16_sse4; + p.cvt16to32_shr[BLOCK_32x32] = x265_cvt16to32_shr_32_sse4; + + // TODO: check POPCNT flag! 
+ p.copy_cnt[BLOCK_4x4] = x265_copy_cnt_4_sse4; + p.copy_cnt[BLOCK_8x8] = x265_copy_cnt_8_sse4; + p.copy_cnt[BLOCK_16x16] = x265_copy_cnt_16_sse4; + p.copy_cnt[BLOCK_32x32] = x265_copy_cnt_32_sse4; + + HEVC_SATD(sse4); + SA8D_INTER_FROM_BLOCK(sse4); + + p.sse_pp[LUMA_12x16] = x265_pixel_ssd_12x16_sse4; + p.sse_pp[LUMA_24x32] = x265_pixel_ssd_24x32_sse4; + p.sse_pp[LUMA_48x64] = x265_pixel_ssd_48x64_sse4; + p.sse_pp[LUMA_64x16] = x265_pixel_ssd_64x16_sse4; + p.sse_pp[LUMA_64x32] = x265_pixel_ssd_64x32_sse4; + p.sse_pp[LUMA_64x48] = x265_pixel_ssd_64x48_sse4; + p.sse_pp[LUMA_64x64] = x265_pixel_ssd_64x64_sse4; + + LUMA_SSE_SP(_sse4); + + CHROMA_PIXELSUB_PS(_sse4); + CHROMA_PIXELSUB_PS_422(_sse4); + LUMA_PIXELSUB(_sse4); + + CHROMA_FILTERS_420(_sse4); + CHROMA_FILTERS_422(_sse4); + CHROMA_FILTERS_444(_sse4); + CHROMA_SS_FILTERS_SSE4_420(_sse4); + CHROMA_SS_FILTERS_SSE4_422(_sse4); + CHROMA_SP_FILTERS_SSE4_420(_sse4); + CHROMA_SP_FILTERS_SSE4_422(_sse4); + CHROMA_SP_FILTERS_SSE4_444(_sse4); + LUMA_SP_FILTERS(_sse4); + LUMA_FILTERS(_sse4); + ASSGN_SSE_SS(sse4); + + p.chroma[X265_CSP_I420].copy_sp[CHROMA_2x4] = x265_blockcopy_sp_2x4_sse4; + p.chroma[X265_CSP_I420].copy_sp[CHROMA_2x8] = x265_blockcopy_sp_2x8_sse4; + p.chroma[X265_CSP_I420].copy_sp[CHROMA_6x8] = x265_blockcopy_sp_6x8_sse4; + CHROMA_BLOCKCOPY(ps, _sse4); + CHROMA_BLOCKCOPY_422(ps, _sse4); + LUMA_BLOCKCOPY(ps, _sse4); + + p.calcresidual[BLOCK_16x16] = x265_getResidual16_sse4; + p.calcresidual[BLOCK_32x32] = x265_getResidual32_sse4; + p.quant = x265_quant_sse4; + p.nquant = x265_nquant_sse4; + p.dequant_normal = x265_dequant_normal_sse4; + p.weight_pp = x265_weight_pp_sse4; + p.weight_sp = x265_weight_sp_sse4; + p.intra_pred[0][BLOCK_4x4] = x265_intra_pred_planar4_sse4; + p.intra_pred[0][BLOCK_8x8] = x265_intra_pred_planar8_sse4; + p.intra_pred[0][BLOCK_16x16] = x265_intra_pred_planar16_sse4; + p.intra_pred[0][BLOCK_32x32] = x265_intra_pred_planar32_sse4; + + p.intra_pred_allangs[BLOCK_4x4] = 
x265_all_angs_pred_4x4_sse4; + p.intra_pred_allangs[BLOCK_8x8] = x265_all_angs_pred_8x8_sse4; + p.intra_pred_allangs[BLOCK_16x16] = x265_all_angs_pred_16x16_sse4; + p.intra_pred_allangs[BLOCK_32x32] = x265_all_angs_pred_32x32_sse4; + + p.intra_pred[1][BLOCK_4x4] = x265_intra_pred_dc4_sse4; + p.intra_pred[1][BLOCK_8x8] = x265_intra_pred_dc8_sse4; + p.intra_pred[1][BLOCK_16x16] = x265_intra_pred_dc16_sse4; + p.intra_pred[1][BLOCK_32x32] = x265_intra_pred_dc32_sse4; + + INTRA_ANG_SSE4_COMMON(sse4); + INTRA_ANG_SSE4(sse4); + + p.dct[DCT_8x8] = x265_dct8_sse4; + p.copy_shr = x265_copy_shr_sse4; + p.denoiseDct = x265_denoise_dct_sse4; + } + if (cpuMask & X265_CPU_AVX) + { + p.frame_init_lowres_core = x265_frame_init_lowres_core_avx; + HEVC_SATD(avx); + SA8D_INTER_FROM_BLOCK(avx); + ASSGN_SSE(avx); + + ASSGN_SSE_SS(avx); + SAD_X3(avx); + SAD_X4(avx); + p.sad_x3[LUMA_12x16] = x265_pixel_sad_x3_12x16_avx; + p.sad_x4[LUMA_12x16] = x265_pixel_sad_x4_12x16_avx; + p.sad_x3[LUMA_16x4] = x265_pixel_sad_x3_16x4_avx; + p.sad_x4[LUMA_16x4] = x265_pixel_sad_x4_16x4_avx; + + p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_avx; + p.ssim_end_4 = x265_pixel_ssim_end4_avx; + p.luma_copy_ss[LUMA_64x16] = x265_blockcopy_ss_64x16_avx; + p.luma_copy_ss[LUMA_64x32] = x265_blockcopy_ss_64x32_avx; + p.luma_copy_ss[LUMA_64x48] = x265_blockcopy_ss_64x48_avx; + p.luma_copy_ss[LUMA_64x64] = x265_blockcopy_ss_64x64_avx; + + p.chroma[X265_CSP_I420].copy_pp[CHROMA_32x8] = x265_blockcopy_pp_32x8_avx; + p.luma_copy_pp[LUMA_32x8] = x265_blockcopy_pp_32x8_avx; + + p.chroma[X265_CSP_I420].copy_pp[CHROMA_32x16] = x265_blockcopy_pp_32x16_avx; + p.chroma[X265_CSP_I422].copy_pp[CHROMA422_32x16] = x265_blockcopy_pp_32x16_avx; + p.luma_copy_pp[LUMA_32x16] = x265_blockcopy_pp_32x16_avx; + + p.chroma[X265_CSP_I420].copy_pp[CHROMA_32x24] = x265_blockcopy_pp_32x24_avx; + p.luma_copy_pp[LUMA_32x24] = x265_blockcopy_pp_32x24_avx; + + p.chroma[X265_CSP_I420].copy_pp[CHROMA_32x32] = x265_blockcopy_pp_32x32_avx; + 
p.chroma[X265_CSP_I422].copy_pp[CHROMA422_32x32] = x265_blockcopy_pp_32x32_avx; + p.luma_copy_pp[LUMA_32x32] = x265_blockcopy_pp_32x32_avx; + + p.chroma[X265_CSP_I422].copy_pp[CHROMA422_32x48] = x265_blockcopy_pp_32x48_avx; + + p.chroma[X265_CSP_I422].copy_pp[CHROMA422_32x64] = x265_blockcopy_pp_32x64_avx; + p.luma_copy_pp[LUMA_32x64] = x265_blockcopy_pp_32x64_avx; + } + if (cpuMask & X265_CPU_XOP) + { + p.frame_init_lowres_core = x265_frame_init_lowres_core_xop; + SA8D_INTER_FROM_BLOCK(xop); + INIT7(satd, _xop); + INIT5_NAME(sse_pp, ssd, _xop); + HEVC_SATD(xop); + } + if (cpuMask & X265_CPU_AVX2) + { + INIT2(sad_x4, _avx2); + INIT4(satd, _avx2); + INIT2_NAME(sse_pp, ssd, _avx2); + p.sad_x4[LUMA_16x12] = x265_pixel_sad_x4_16x12_avx2; + p.sad_x4[LUMA_16x32] = x265_pixel_sad_x4_16x32_avx2; + p.ssd_s[BLOCK_32x32] = x265_pixel_ssd_s_32_avx2; + + /* Need to update assembly code as per changed interface of the copy_cnt primitive, once + * code is updated, avx2 version will be enabled */ + + p.copy_cnt[BLOCK_8x8] = x265_copy_cnt_8_avx2; + p.copy_cnt[BLOCK_16x16] = x265_copy_cnt_16_avx2; + p.copy_cnt[BLOCK_32x32] = x265_copy_cnt_32_avx2; + + p.blockfill_s[BLOCK_16x16] = x265_blockfill_s_16x16_avx2; + p.blockfill_s[BLOCK_32x32] = x265_blockfill_s_32x32_avx2; + + p.cvt32to16_shl[BLOCK_4x4] = x265_cvt32to16_shl_4_avx2; + p.cvt32to16_shl[BLOCK_8x8] = x265_cvt32to16_shl_8_avx2; + p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_avx2; + p.cvt32to16_shl[BLOCK_32x32] = x265_cvt32to16_shl_32_avx2; + p.denoiseDct = x265_denoise_dct_avx2; + p.dct[DCT_4x4] = x265_dct4_avx2; + p.quant = x265_quant_avx2; + p.nquant = x265_nquant_avx2; + p.dequant_normal = x265_dequant_normal_avx2; + p.chroma[X265_CSP_I420].copy_ss[CHROMA_16x4] = x265_blockcopy_ss_16x4_avx; + p.chroma[X265_CSP_I420].copy_ss[CHROMA_16x12] = x265_blockcopy_ss_16x12_avx; + p.chroma[X265_CSP_I420].copy_ss[CHROMA_16x8] = x265_blockcopy_ss_16x8_avx; + p.chroma[X265_CSP_I420].copy_ss[CHROMA_16x16] = 
x265_blockcopy_ss_16x16_avx; + p.chroma[X265_CSP_I420].copy_ss[CHROMA_16x32] = x265_blockcopy_ss_16x32_avx; + p.chroma[X265_CSP_I422].copy_ss[CHROMA422_16x8] = x265_blockcopy_ss_16x8_avx; + p.chroma[X265_CSP_I422].copy_ss[CHROMA422_16x16] = x265_blockcopy_ss_16x16_avx; + p.chroma[X265_CSP_I422].copy_ss[CHROMA422_16x24] = x265_blockcopy_ss_16x24_avx; + p.chroma[X265_CSP_I422].copy_ss[CHROMA422_16x32] = x265_blockcopy_ss_16x32_avx; + p.chroma[X265_CSP_I422].copy_ss[CHROMA422_16x64] = x265_blockcopy_ss_16x64_avx; + p.scale1D_128to64 = x265_scale1D_128to64_avx2; + + p.weight_pp = x265_weight_pp_avx2; + +#if X86_64 + p.dct[DCT_8x8] = x265_dct8_avx2; + p.dct[DCT_16x16] = x265_dct16_avx2; + p.dct[DCT_32x32] = x265_dct32_avx2; + p.idct[IDCT_4x4] = x265_idct4_avx2; + p.idct[IDCT_8x8] = x265_idct8_avx2; + p.idct[IDCT_16x16] = x265_idct16_avx2; + p.idct[IDCT_32x32] = x265_idct32_avx2; + + p.transpose[BLOCK_8x8] = x265_transpose8_avx2; + p.transpose[BLOCK_16x16] = x265_transpose16_avx2; + p.transpose[BLOCK_32x32] = x265_transpose32_avx2; + p.transpose[BLOCK_64x64] = x265_transpose64_avx2; +#endif + p.luma_hpp[LUMA_4x4] = x265_interp_8tap_horiz_pp_4x4_avx2; + } +#endif // if HIGH_BIT_DEPTH +} +} + +extern "C" { +#ifdef __INTEL_COMPILER + +/* Agner's patch to Intel's CPU dispatcher from pages 131-132 of + * http://agner.org/optimize/optimizing_cpp.pdf (2011-01-30) + * adapted to x265's cpu schema. 
*/ + +// Global variable indicating cpu +int __intel_cpu_indicator = 0; +// CPU dispatcher function +void x265_intel_cpu_indicator_init(void) +{ + uint32_t cpu = x265::cpu_detect(); + + if (cpu & X265_CPU_AVX) + __intel_cpu_indicator = 0x20000; + else if (cpu & X265_CPU_SSE42) + __intel_cpu_indicator = 0x8000; + else if (cpu & X265_CPU_SSE4) + __intel_cpu_indicator = 0x2000; + else if (cpu & X265_CPU_SSSE3) + __intel_cpu_indicator = 0x1000; + else if (cpu & X265_CPU_SSE3) + __intel_cpu_indicator = 0x800; + else if (cpu & X265_CPU_SSE2 && !(cpu & X265_CPU_SSE2_IS_SLOW)) + __intel_cpu_indicator = 0x200; + else if (cpu & X265_CPU_SSE) + __intel_cpu_indicator = 0x80; + else if (cpu & X265_CPU_MMX2) + __intel_cpu_indicator = 8; + else + __intel_cpu_indicator = 1; +} + +/* __intel_cpu_indicator_init appears to have a non-standard calling convention that + * assumes certain registers aren't preserved, so we'll route it through a function + * that backs up all the registers. */ +void __intel_cpu_indicator_init(void) +{ + x265_safe_intel_cpu_indicator_init(); +} + +#else // ifdef __INTEL_COMPILER +void x265_intel_cpu_indicator_init(void) {} + +#endif // ifdef __INTEL_COMPILER +} diff --git a/source/common/x86/blockcopy8.asm b/source/common/x86/blockcopy8.asm new file mode 100644 index 0000000..e892157 --- /dev/null +++ b/source/common/x86/blockcopy8.asm @@ -0,0 +1,4925 @@ +;***************************************************************************** +;* Copyright (C) 2013 x265 project +;* +;* Authors: Praveen Kumar Tiwari +;* Murugan Vairavel +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. 
+;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +;* +;* This program is also available under a commercial proprietary license. +;* For more information, contact us at license @ x265.com. +;*****************************************************************************/ + +%include "x86inc.asm" +%include "x86util.asm" + +SECTION_RODATA 32 + +tab_Vm: db 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0 + +cextern pb_4 +cextern pb_1 +cextern pb_16 +cextern pb_64 +cextern pw_4 +cextern pb_8 +cextern pb_32 +cextern pb_128 + +SECTION .text + +;----------------------------------------------------------------------------- +; void blockcopy_pp_2x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal blockcopy_pp_2x4, 4, 7, 0 + mov r4w, [r2] + mov r5w, [r2 + r3] + lea r2, [r2 + r3 * 2] + mov r6w, [r2] + mov r3w, [r2 + r3] + + mov [r0], r4w + mov [r0 + r1], r5w + lea r0, [r0 + 2 * r1] + mov [r0], r6w + mov [r0 + r1], r3w +RET + +;----------------------------------------------------------------------------- +; void blockcopy_pp_2x8(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal blockcopy_pp_2x8, 4, 7, 0 + mov r4w, [r2] + mov r5w, [r2 + r3] + mov r6w, [r2 + 2 * r3] + + mov [r0], r4w + mov [r0 + r1], r5w + mov [r0 + 2 * r1], r6w + + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + + mov r4w, [r2 + r3] + mov r5w, [r2 + 2 * r3] + + mov 
[r0 + r1], r4w + mov [r0 + 2 * r1], r5w + + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + + mov r4w, [r2 + r3] + mov r5w, [r2 + 2 * r3] + + mov [r0 + r1], r4w + mov [r0 + 2 * r1], r5w + + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + + mov r4w, [r2 + r3] + mov [r0 + r1], r4w + RET + +;----------------------------------------------------------------------------- +; void blockcopy_pp_2x16(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal blockcopy_pp_2x16, 4, 7, 0 + mov r6d, 16/2 +.loop: + mov r4w, [r2] + mov r5w, [r2 + r3] + dec r6d + lea r2, [r2 + r3 * 2] + mov [r0], r4w + mov [r0 + r1], r5w + lea r0, [r0 + r1 * 2] + jnz .loop + RET + + +;----------------------------------------------------------------------------- +; void blockcopy_pp_4x2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal blockcopy_pp_4x2, 4, 6, 0 + mov r4d, [r2] + mov r5d, [r2 + r3] + + mov [r0], r4d + mov [r0 + r1], r5d + RET + +;----------------------------------------------------------------------------- +; void blockcopy_pp_4x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal blockcopy_pp_4x4, 4, 4, 4 + movd m0, [r2] + movd m1, [r2 + r3] + movd m2, [r2 + 2 * r3] + lea r3, [r3 + r3 * 2] + movd m3, [r2 + r3] + + movd [r0], m0 + movd [r0 + r1], m1 + movd [r0 + 2 * r1], m2 + lea r1, [r1 + 2 * r1] + movd [r0 + r1], m3 + RET + +;----------------------------------------------------------------------------- +; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride) +;----------------------------------------------------------------------------- +%macro BLOCKCOPY_PP_W4_H8 2 +INIT_XMM sse2 +cglobal 
blockcopy_pp_%1x%2, 4, 5, 4 + mov r4d, %2/8 +.loop: + movd m0, [r2] + movd m1, [r2 + r3] + lea r2, [r2 + 2 * r3] + movd m2, [r2] + movd m3, [r2 + r3] + + movd [r0], m0 + movd [r0 + r1], m1 + lea r0, [r0 + 2 * r1] + movd [r0], m2 + movd [r0 + r1], m3 + + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + movd m0, [r2] + movd m1, [r2 + r3] + lea r2, [r2 + 2 * r3] + movd m2, [r2] + movd m3, [r2 + r3] + + movd [r0], m0 + movd [r0 + r1], m1 + lea r0, [r0 + 2 * r1] + movd [r0], m2 + movd [r0 + r1], m3 + + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loop + RET +%endmacro + +BLOCKCOPY_PP_W4_H8 4, 8 +BLOCKCOPY_PP_W4_H8 4, 16 + +BLOCKCOPY_PP_W4_H8 4, 32 + +;----------------------------------------------------------------------------- +; void blockcopy_pp_6x8(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal blockcopy_pp_6x8, 4, 7, 8 + + movd m0, [r2] + movd m1, [r2 + r3] + movd m2, [r2 + 2 * r3] + lea r5, [r2 + 2 * r3] + movd m3, [r5 + r3] + + movd m4, [r5 + 2 * r3] + lea r5, [r5 + 2 * r3] + movd m5, [r5 + r3] + movd m6, [r5 + 2 * r3] + lea r5, [r5 + 2 * r3] + movd m7, [r5 + r3] + + movd [r0], m0 + movd [r0 + r1], m1 + movd [r0 + 2 * r1], m2 + lea r6, [r0 + 2 * r1] + movd [r6 + r1], m3 + + movd [r6 + 2 * r1], m4 + lea r6, [r6 + 2 * r1] + movd [r6 + r1], m5 + movd [r6 + 2 * r1], m6 + lea r6, [r6 + 2 * r1] + movd [r6 + r1], m7 + + mov r4w, [r2 + 4] + mov r5w, [r2 + r3 + 4] + mov r6w, [r2 + 2 * r3 + 4] + + mov [r0 + 4], r4w + mov [r0 + r1 + 4], r5w + mov [r0 + 2 * r1 + 4], r6w + + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + + mov r4w, [r2 + r3 + 4] + mov r5w, [r2 + 2 * r3 + 4] + + mov [r0 + r1 + 4], r4w + mov [r0 + 2 * r1 + 4], r5w + + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + + mov r4w, [r2 + r3 + 4] + mov r5w, [r2 + 2 * r3 + 4] + + mov [r0 + r1 + 4], r4w + mov [r0 + 2 * r1 + 4], r5w + + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * 
r3] + + mov r4w, [r2 + r3 + 4] + mov [r0 + r1 + 4], r4w + RET + +;----------------------------------------------------------------------------- +; void blockcopy_pp_6x16(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal blockcopy_pp_6x16, 4, 7, 2 + mov r6d, 16/2 +.loop: + movd m0, [r2] + mov r4w, [r2 + 4] + movd m1, [r2 + r3] + mov r5w, [r2 + r3 + 4] + lea r2, [r2 + r3 * 2] + movd [r0], m0 + mov [r0 + 4], r4w + movd [r0 + r1], m1 + mov [r0 + r1 + 4], r5w + lea r0, [r0 + r1 * 2] + dec r6d + jnz .loop + RET + + +;----------------------------------------------------------------------------- +; void blockcopy_pp_8x2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal blockcopy_pp_8x2, 4, 4, 2 + movh m0, [r2] + movh m1, [r2 + r3] + + movh [r0], m0 + movh [r0 + r1], m1 +RET + +;----------------------------------------------------------------------------- +; void blockcopy_pp_8x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal blockcopy_pp_8x4, 4, 4, 4 + movh m0, [r2] + movh m1, [r2 + r3] + movh m2, [r2 + 2 * r3] + lea r3, [r3 + r3 * 2] + movh m3, [r2 + r3] + + movh [r0], m0 + movh [r0 + r1], m1 + movh [r0 + 2 * r1], m2 + lea r1, [r1 + 2 * r1] + movh [r0 + r1], m3 + RET + +;----------------------------------------------------------------------------- +; void blockcopy_pp_8x6(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal blockcopy_pp_8x6, 4, 7, 6 + movh m0, [r2] + movh m1, [r2 + r3] + movh m2, [r2 + 2 * r3] + lea r5, [r2 + 2 * r3] + movh m3, [r5 + r3] + movh m4, [r5 + 2 * r3] + lea r5, [r5 + 2 * 
r3] + movh m5, [r5 + r3] + + movh [r0], m0 + movh [r0 + r1], m1 + movh [r0 + 2 * r1], m2 + lea r6, [r0 + 2 * r1] + movh [r6 + r1], m3 + movh [r6 + 2 * r1], m4 + lea r6, [r6 + 2 * r1] + movh [r6 + r1], m5 + RET + +;----------------------------------------------------------------------------- +; void blockcopy_pp_8x12(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal blockcopy_pp_8x12, 4, 5, 2 + mov r4d, 12/2 +.loop: + movh m0, [r2] + movh m1, [r2 + r3] + movh [r0], m0 + movh [r0 + r1], m1 + dec r4d + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + jnz .loop + RET + +;----------------------------------------------------------------------------- +; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride) +;----------------------------------------------------------------------------- +%macro BLOCKCOPY_PP_W8_H8 2 +INIT_XMM sse2 +cglobal blockcopy_pp_%1x%2, 4, 5, 6 + mov r4d, %2/8 + +.loop: + movh m0, [r2] + movh m1, [r2 + r3] + lea r2, [r2 + 2 * r3] + movh m2, [r2] + movh m3, [r2 + r3] + lea r2, [r2 + 2 * r3] + movh m4, [r2] + movh m5, [r2 + r3] + + movh [r0], m0 + movh [r0 + r1], m1 + lea r0, [r0 + 2 * r1] + movh [r0], m2 + movh [r0 + r1], m3 + lea r0, [r0 + 2 * r1] + movh [r0], m4 + movh [r0 + r1], m5 + + lea r2, [r2 + 2 * r3] + movh m4, [r2] + movh m5, [r2 + r3] + lea r0, [r0 + 2 * r1] + movh [r0], m4 + movh [r0 + r1], m5 + + dec r4d + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + jnz .loop +RET +%endmacro + +BLOCKCOPY_PP_W8_H8 8, 8 +BLOCKCOPY_PP_W8_H8 8, 16 +BLOCKCOPY_PP_W8_H8 8, 32 + +BLOCKCOPY_PP_W8_H8 8, 64 + +;----------------------------------------------------------------------------- +; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride) +;----------------------------------------------------------------------------- +%macro BLOCKCOPY_PP_W12_H4 2 +INIT_XMM sse2 +cglobal 
blockcopy_pp_%1x%2, 4, 5, 4 + mov r4d, %2/4 + +.loop: + movh m0, [r2] + movd m1, [r2 + 8] + movh m2, [r2 + r3] + movd m3, [r2 + r3 + 8] + lea r2, [r2 + 2 * r3] + + movh [r0], m0 + movd [r0 + 8], m1 + movh [r0 + r1], m2 + movd [r0 + r1 + 8], m3 + lea r0, [r0 + 2 * r1] + + movh m0, [r2] + movd m1, [r2 + 8] + movh m2, [r2 + r3] + movd m3, [r2 + r3 + 8] + + movh [r0], m0 + movd [r0 + 8], m1 + movh [r0 + r1], m2 + movd [r0 + r1 + 8], m3 + + dec r4d + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + jnz .loop + RET +%endmacro + +BLOCKCOPY_PP_W12_H4 12, 16 + +BLOCKCOPY_PP_W12_H4 12, 32 + +;----------------------------------------------------------------------------- +; void blockcopy_pp_16x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride) +;----------------------------------------------------------------------------- +%macro BLOCKCOPY_PP_W16_H4 2 +INIT_XMM sse2 +cglobal blockcopy_pp_%1x%2, 4, 5, 4 + mov r4d, %2/4 + +.loop: + movu m0, [r2] + movu m1, [r2 + r3] + lea r2, [r2 + 2 * r3] + movu m2, [r2] + movu m3, [r2 + r3] + + movu [r0], m0 + movu [r0 + r1], m1 + lea r0, [r0 + 2 * r1] + movu [r0], m2 + movu [r0 + r1], m3 + + dec r4d + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + jnz .loop + + RET +%endmacro + +BLOCKCOPY_PP_W16_H4 16, 4 +BLOCKCOPY_PP_W16_H4 16, 12 + +;----------------------------------------------------------------------------- +; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride) +;----------------------------------------------------------------------------- +%macro BLOCKCOPY_PP_W16_H8 2 +INIT_XMM sse2 +cglobal blockcopy_pp_%1x%2, 4, 5, 6 + mov r4d, %2/8 + +.loop: + movu m0, [r2] + movu m1, [r2 + r3] + lea r2, [r2 + 2 * r3] + movu m2, [r2] + movu m3, [r2 + r3] + lea r2, [r2 + 2 * r3] + movu m4, [r2] + movu m5, [r2 + r3] + lea r2, [r2 + 2 * r3] + + movu [r0], m0 + movu [r0 + r1], m1 + lea r0, [r0 + 2 * r1] + movu [r0], m2 + movu [r0 + r1], m3 + lea r0, [r0 + 2 * r1] + movu [r0], m4 + movu [r0 + r1], m5 
+ lea r0, [r0 + 2 * r1] + + movu m0, [r2] + movu m1, [r2 + r3] + movu [r0], m0 + movu [r0 + r1], m1 + + dec r4d + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + jnz .loop + RET +%endmacro + +BLOCKCOPY_PP_W16_H8 16, 8 +BLOCKCOPY_PP_W16_H8 16, 16 +BLOCKCOPY_PP_W16_H8 16, 32 +BLOCKCOPY_PP_W16_H8 16, 64 + +BLOCKCOPY_PP_W16_H8 16, 24 + +;----------------------------------------------------------------------------- +; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride) +;----------------------------------------------------------------------------- +%macro BLOCKCOPY_PP_W24_H4 2 +INIT_XMM sse2 +cglobal blockcopy_pp_%1x%2, 4, 5, 6 + mov r4d, %2/4 + +.loop: + movu m0, [r2] + movh m1, [r2 + 16] + movu m2, [r2 + r3] + movh m3, [r2 + r3 + 16] + lea r2, [r2 + 2 * r3] + movu m4, [r2] + movh m5, [r2 + 16] + + movu [r0], m0 + movh [r0 + 16], m1 + movu [r0 + r1], m2 + movh [r0 + r1 + 16], m3 + lea r0, [r0 + 2 * r1] + movu [r0], m4 + movh [r0 + 16], m5 + + movu m0, [r2 + r3] + movh m1, [r2 + r3 + 16] + movu [r0 + r1], m0 + movh [r0 + r1 + 16], m1 + + dec r4d + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + jnz .loop + RET +%endmacro + +BLOCKCOPY_PP_W24_H4 24, 32 + +BLOCKCOPY_PP_W24_H4 24, 64 + +;----------------------------------------------------------------------------- +; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride) +;----------------------------------------------------------------------------- +%macro BLOCKCOPY_PP_W32_H4 2 +INIT_XMM sse2 +cglobal blockcopy_pp_%1x%2, 4, 5, 4 + mov r4d, %2/4 + +.loop: + movu m0, [r2] + movu m1, [r2 + 16] + movu m2, [r2 + r3] + movu m3, [r2 + r3 + 16] + lea r2, [r2 + 2 * r3] + + movu [r0], m0 + movu [r0 + 16], m1 + movu [r0 + r1], m2 + movu [r0 + r1 + 16], m3 + lea r0, [r0 + 2 * r1] + + movu m0, [r2] + movu m1, [r2 + 16] + movu m2, [r2 + r3] + movu m3, [r2 + r3 + 16] + + movu [r0], m0 + movu [r0 + 16], m1 + movu [r0 + r1], m2 + movu [r0 + r1 + 16], m3 + + dec r4d + 
lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + jnz .loop + RET +%endmacro + +BLOCKCOPY_PP_W32_H4 32, 8 +BLOCKCOPY_PP_W32_H4 32, 16 +BLOCKCOPY_PP_W32_H4 32, 24 +BLOCKCOPY_PP_W32_H4 32, 32 +BLOCKCOPY_PP_W32_H4 32, 64 + +BLOCKCOPY_PP_W32_H4 32, 48 + +INIT_YMM avx +cglobal blockcopy_pp_32x8, 4, 6, 6 + lea r4, [3 * r1] + lea r5, [3 * r3] + + movu m0, [r2] + movu m1, [r2 + r3] + movu m2, [r2 + 2 * r3] + movu m3, [r2 + r5] + lea r2, [r2 + 4 * r3] + movu m4, [r2] + movu m5, [r2 + r3] + + movu [r0], m0 + movu [r0 + r1], m1 + movu [r0 + 2 * r1], m2 + movu [r0 + r4], m3 + lea r0, [r0 + 4 * r1] + movu [r0], m4 + movu [r0 + r1], m5 + + movu m0, [r2 + 2 * r3] + movu m1, [r2 + r5] + + movu [r0 + 2 * r1], m0 + movu [r0 + r4], m1 + RET + +INIT_YMM avx +cglobal blockcopy_pp_32x16, 4, 6, 6 + lea r4, [3 * r1] + lea r5, [3 * r3] + + movu m0, [r2] + movu m1, [r2 + r3] + movu m2, [r2 + 2 * r3] + movu m3, [r2 + r5] + lea r2, [r2 + 4 * r3] + movu m4, [r2] + movu m5, [r2 + r3] + + movu [r0], m0 + movu [r0 + r1], m1 + movu [r0 + 2 * r1], m2 + movu [r0 + r4], m3 + lea r0, [r0 + 4 * r1] + movu [r0], m4 + movu [r0 + r1], m5 + + movu m0, [r2 + 2 * r3] + movu m1, [r2 + r5] + lea r2, [r2 + 4 * r3] + movu m2, [r2] + movu m3, [r2 + r3] + movu m4, [r2 + 2 * r3] + movu m5, [r2 + r5] + + movu [r0 + 2 * r1], m0 + movu [r0 + r4], m1 + lea r0, [r0 + 4 * r1] + movu [r0], m2 + movu [r0 + r1], m3 + movu [r0 + 2 * r1], m4 + movu [r0 + r4], m5 + + lea r2, [r2 + 4 * r3] + movu m0, [r2] + movu m1, [r2 + r3] + movu m2, [r2 + 2 * r3] + movu m3, [r2 + r5] + + lea r0, [r0 + 4 * r1] + movu [r0], m0 + movu [r0 + r1], m1 + movu [r0 + 2 * r1], m2 + movu [r0 + r4], m3 + RET + +;----------------------------------------------------------------------------- +; void blockcopy_pp_32x24(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride) +;----------------------------------------------------------------------------- +INIT_YMM avx +cglobal blockcopy_pp_32x24, 4, 7, 6 +lea r4, [3 * r1] +lea r5, [3 * r3] +mov r6d, 
24/8 + +.loop: + movu m0, [r2] + movu m1, [r2 + r3] + movu m2, [r2 + 2 * r3] + movu m3, [r2 + r5] + lea r2, [r2 + 4 * r3] + movu m4, [r2] + movu m5, [r2 + r3] + + movu [r0], m0 + movu [r0 + r1], m1 + movu [r0 + 2 * r1], m2 + movu [r0 + r4], m3 + lea r0, [r0 + 4 * r1] + movu [r0], m4 + movu [r0 + r1], m5 + + movu m0, [r2 + 2 * r3] + movu m1, [r2 + r5] + + movu [r0 + 2 * r1], m0 + movu [r0 + r4], m1 + + lea r2, [r2 + 4 * r3] + lea r0, [r0 + 4 * r1] + dec r6d + jnz .loop + RET + +;----------------------------------------------------------------------------- +; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride) +;----------------------------------------------------------------------------- +%macro BLOCKCOPY_PP_W32_H16_avx 2 +INIT_YMM avx +cglobal blockcopy_pp_%1x%2, 4, 7, 6 + lea r4, [3 * r1] + lea r5, [3 * r3] + mov r6d, %2/16 + +.loop: + movu m0, [r2] + movu m1, [r2 + r3] + movu m2, [r2 + 2 * r3] + movu m3, [r2 + r5] + lea r2, [r2 + 4 * r3] + movu m4, [r2] + movu m5, [r2 + r3] + + movu [r0], m0 + movu [r0 + r1], m1 + movu [r0 + 2 * r1], m2 + movu [r0 + r4], m3 + lea r0, [r0 + 4 * r1] + movu [r0], m4 + movu [r0 + r1], m5 + + movu m0, [r2 + 2 * r3] + movu m1, [r2 + r5] + lea r2, [r2 + 4 * r3] + movu m2, [r2] + movu m3, [r2 + r3] + movu m4, [r2 + 2 * r3] + movu m5, [r2 + r5] + + movu [r0 + 2 * r1], m0 + movu [r0 + r4], m1 + lea r0, [r0 + 4 * r1] + movu [r0], m2 + movu [r0 + r1], m3 + movu [r0 + 2 * r1], m4 + movu [r0 + r4], m5 + + lea r2, [r2 + 4 * r3] + movu m0, [r2] + movu m1, [r2 + r3] + movu m2, [r2 + 2 * r3] + movu m3, [r2 + r5] + + lea r0, [r0 + 4 * r1] + movu [r0], m0 + movu [r0 + r1], m1 + movu [r0 + 2 * r1], m2 + movu [r0 + r4], m3 + + lea r2, [r2 + 4 * r3] + lea r0, [r0 + 4 * r1] + dec r6d + jnz .loop + RET +%endmacro + +BLOCKCOPY_PP_W32_H16_avx 32, 32 +BLOCKCOPY_PP_W32_H16_avx 32, 48 +BLOCKCOPY_PP_W32_H16_avx 32, 64 + +;----------------------------------------------------------------------------- +; void 
blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride) +;----------------------------------------------------------------------------- +%macro BLOCKCOPY_PP_W48_H2 2 +INIT_XMM sse2 +cglobal blockcopy_pp_%1x%2, 4, 5, 6 + mov r4d, %2/4 + +.loop: + movu m0, [r2] + movu m1, [r2 + 16] + movu m2, [r2 + 32] + movu m3, [r2 + r3] + movu m4, [r2 + r3 + 16] + movu m5, [r2 + r3 + 32] + lea r2, [r2 + 2 * r3] + + movu [r0], m0 + movu [r0 + 16], m1 + movu [r0 + 32], m2 + movu [r0 + r1], m3 + movu [r0 + r1 + 16], m4 + movu [r0 + r1 + 32], m5 + lea r0, [r0 + 2 * r1] + + movu m0, [r2] + movu m1, [r2 + 16] + movu m2, [r2 + 32] + movu m3, [r2 + r3] + movu m4, [r2 + r3 + 16] + movu m5, [r2 + r3 + 32] + + movu [r0], m0 + movu [r0 + 16], m1 + movu [r0 + 32], m2 + movu [r0 + r1], m3 + movu [r0 + r1 + 16], m4 + movu [r0 + r1 + 32], m5 + + dec r4d + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + jnz .loop + RET +%endmacro + +BLOCKCOPY_PP_W48_H2 48, 64 + +;----------------------------------------------------------------------------- +; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride) +;----------------------------------------------------------------------------- +%macro BLOCKCOPY_PP_W64_H4 2 +INIT_XMM sse2 +cglobal blockcopy_pp_%1x%2, 4, 5, 6 + mov r4d, %2/4 + +.loop: + movu m0, [r2] + movu m1, [r2 + 16] + movu m2, [r2 + 32] + movu m3, [r2 + 48] + movu m4, [r2 + r3] + movu m5, [r2 + r3 + 16] + + movu [r0], m0 + movu [r0 + 16], m1 + movu [r0 + 32], m2 + movu [r0 + 48], m3 + movu [r0 + r1], m4 + movu [r0 + r1 + 16], m5 + + movu m0, [r2 + r3 + 32] + movu m1, [r2 + r3 + 48] + lea r2, [r2 + 2 * r3] + movu m2, [r2] + movu m3, [r2 + 16] + movu m4, [r2 + 32] + movu m5, [r2 + 48] + + movu [r0 + r1 + 32], m0 + movu [r0 + r1 + 48], m1 + lea r0, [r0 + 2 * r1] + movu [r0], m2 + movu [r0 + 16], m3 + movu [r0 + 32], m4 + movu [r0 + 48], m5 + + movu m0, [r2 + r3] + movu m1, [r2 + r3 + 16] + movu m2, [r2 + r3 + 32] + movu m3, [r2 + r3 + 
48] + + movu [r0 + r1], m0 + movu [r0 + r1 + 16], m1 + movu [r0 + r1 + 32], m2 + movu [r0 + r1 + 48], m3 + + dec r4d + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + jnz .loop + RET +%endmacro + +BLOCKCOPY_PP_W64_H4 64, 16 +BLOCKCOPY_PP_W64_H4 64, 32 +BLOCKCOPY_PP_W64_H4 64, 48 +BLOCKCOPY_PP_W64_H4 64, 64 + +;----------------------------------------------------------------------------- +; void blockcopy_sp_2x4(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal blockcopy_sp_2x4, 4, 5, 2 + +add r3, r3 + +;Row 0-1 +movd m0, [r2] +movd m1, [r2 + r3] +packuswb m0, m1 +movd r4d, m0 +mov [r0], r4w +pextrw [r0 + r1], m0, 4 + +;Row 2-3 +movd m0, [r2 + 2 * r3] +lea r2, [r2 + 2 * r3] +movd m1, [r2 + r3] +packuswb m0, m1 +movd r4d, m0 +mov [r0 + 2 * r1], r4w +lea r0, [r0 + 2 * r1] +pextrw [r0 + r1], m0, 4 + +RET + + +;----------------------------------------------------------------------------- +; void blockcopy_sp_2x8(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal blockcopy_sp_2x8, 4, 5, 2 + +add r3, r3 + +;Row 0-1 +movd m0, [r2] +movd m1, [r2 + r3] +packuswb m0, m1 +movd r4d, m0 +mov [r0], r4w +pextrw [r0 + r1], m0, 4 + +;Row 2-3 +movd m0, [r2 + 2 * r3] +lea r2, [r2 + 2 * r3] +movd m1, [r2 + r3] +packuswb m0, m1 +movd r4d, m0 +mov [r0 + 2 * r1], r4w +lea r0, [r0 + 2 * r1] +pextrw [r0 + r1], m0, 4 + +;Row 4-5 +movd m0, [r2 + 2 * r3] +lea r2, [r2 + 2 * r3] +movd m1, [r2 + r3] +packuswb m0, m1 +movd r4d, m0 +mov [r0 + 2 * r1], r4w +lea r0, [r0 + 2 * r1] +pextrw [r0 + r1], m0, 4 + +;Row 6-7 +movd m0, [r2 + 2 * r3] +lea r2, [r2 + 2 * r3] +movd m1, [r2 + r3] +packuswb m0, m1 +movd r4d, m0 +mov [r0 + 2 * r1], r4w +lea r0, [r0 + 2 * r1] +pextrw [r0 + r1], m0, 4 + +RET + 
+;----------------------------------------------------------------------------- +; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride) +;----------------------------------------------------------------------------- +%macro BLOCKCOPY_SP_W2_H2 2 +INIT_XMM sse2 +cglobal blockcopy_sp_%1x%2, 4, 7, 2, dest, destStride, src, srcStride + add r3, r3 + mov r6d, %2/2 +.loop: + movd m0, [r2] + movd m1, [r2 + r3] + dec r6d + lea r2, [r2 + r3 * 2] + packuswb m0, m0 + packuswb m1, m1 + movd r4d, m0 + movd r5d, m1 + mov [r0], r4w + mov [r0 + r1], r5w + lea r0, [r0 + r1 * 2] + jnz .loop + RET +%endmacro + +BLOCKCOPY_SP_W2_H2 2, 4 +BLOCKCOPY_SP_W2_H2 2, 8 + +BLOCKCOPY_SP_W2_H2 2, 16 + +;----------------------------------------------------------------------------- +; void blockcopy_sp_4x2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal blockcopy_sp_4x2, 4, 4, 2, dest, destStride, src, srcStride + +add r3, r3 + +movh m0, [r2] +movh m1, [r2 + r3] + +packuswb m0, m1 + +movd [r0], m0 +pshufd m0, m0, 2 +movd [r0 + r1], m0 + +RET + +;----------------------------------------------------------------------------- +; void blockcopy_sp_4x4(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal blockcopy_sp_4x4, 4, 4, 4, dest, destStride, src, srcStride + +add r3, r3 + +movh m0, [r2] +movh m1, [r2 + r3] +movh m2, [r2 + 2 * r3] +lea r2, [r2 + 2 * r3] +movh m3, [r2 + r3] + +packuswb m0, m1 +packuswb m2, m3 + +movd [r0], m0 +pshufd m0, m0, 2 +movd [r0 + r1], m0 +movd [r0 + 2 * r1], m2 +lea r0, [r0 + 2 * r1] +pshufd m2, m2, 2 +movd [r0 + r1], m2 + +RET + +;----------------------------------------------------------------------------- +; void blockcopy_sp_4x8(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride) 
+;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal blockcopy_sp_4x8, 4, 4, 8, dest, destStride, src, srcStride + +add r3, r3 + +movh m0, [r2] +movh m1, [r2 + r3] +movh m2, [r2 + 2 * r3] +lea r2, [r2 + 2 * r3] +movh m3, [r2 + r3] +movh m4, [r2 + 2 * r3] +lea r2, [r2 + 2 * r3] +movh m5, [r2 + r3] +movh m6, [r2 + 2 * r3] +lea r2, [r2 + 2 * r3] +movh m7, [r2 + r3] + +packuswb m0, m1 +packuswb m2, m3 +packuswb m4, m5 +packuswb m6, m7 + +movd [r0], m0 +pshufd m0, m0, 2 +movd [r0 + r1], m0 +movd [r0 + 2 * r1], m2 +lea r0, [r0 + 2 * r1] +pshufd m2, m2, 2 +movd [r0 + r1], m2 +movd [r0 + 2 * r1], m4 +lea r0, [r0 + 2 * r1] +pshufd m4, m4, 2 +movd [r0 + r1], m4 +movd [r0 + 2 * r1], m6 +lea r0, [r0 + 2 * r1] +pshufd m6, m6, 2 +movd [r0 + r1], m6 + +RET + +;----------------------------------------------------------------------------- +; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride) +;----------------------------------------------------------------------------- +%macro BLOCKCOPY_SP_W4_H8 2 +INIT_XMM sse2 +cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride + +mov r4d, %2/8 + +add r3, r3 + +.loop: + movh m0, [r2] + movh m1, [r2 + r3] + movh m2, [r2 + 2 * r3] + lea r2, [r2 + 2 * r3] + movh m3, [r2 + r3] + movh m4, [r2 + 2 * r3] + lea r2, [r2 + 2 * r3] + movh m5, [r2 + r3] + movh m6, [r2 + 2 * r3] + lea r2, [r2 + 2 * r3] + movh m7, [r2 + r3] + + packuswb m0, m1 + packuswb m2, m3 + packuswb m4, m5 + packuswb m6, m7 + + movd [r0], m0 + pshufd m0, m0, 2 + movd [r0 + r1], m0 + movd [r0 + 2 * r1], m2 + lea r0, [r0 + 2 * r1] + pshufd m2, m2, 2 + movd [r0 + r1], m2 + movd [r0 + 2 * r1], m4 + lea r0, [r0 + 2 * r1] + pshufd m4, m4, 2 + movd [r0 + r1], m4 + movd [r0 + 2 * r1], m6 + lea r0, [r0 + 2 * r1] + pshufd m6, m6, 2 + movd [r0 + r1], m6 + + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loop + +RET +%endmacro + +BLOCKCOPY_SP_W4_H8 4, 16 + 
+BLOCKCOPY_SP_W4_H8 4, 32 + +;----------------------------------------------------------------------------- +; void blockcopy_sp_6x8(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal blockcopy_sp_6x8, 4, 4, 2 + + add r3, r3 + + movu m0, [r2] + movu m1, [r2 + r3] + packuswb m0, m1 + + movd [r0], m0 + pextrw [r0 + 4], m0, 2 + + movhlps m0, m0 + movd [r0 + r1], m0 + pextrw [r0 + r1 + 4], m0, 2 + + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + + movu m0, [r2] + movu m1, [r2 + r3] + packuswb m0, m1 + + movd [r0], m0 + pextrw [r0 + 4], m0, 2 + + movhlps m0, m0 + movd [r0 + r1], m0 + pextrw [r0 + r1 + 4], m0, 2 + + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + + movu m0, [r2] + movu m1, [r2 + r3] + packuswb m0, m1 + + movd [r0], m0 + pextrw [r0 + 4], m0, 2 + + movhlps m0, m0 + movd [r0 + r1], m0 + pextrw [r0 + r1 + 4], m0, 2 + + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + + movu m0, [r2] + movu m1, [r2 + r3] + packuswb m0, m1 + + movd [r0], m0 + pextrw [r0 + 4], m0, 2 + + movhlps m0, m0 + movd [r0 + r1], m0 + pextrw [r0 + r1 + 4], m0, 2 + + RET + +;----------------------------------------------------------------------------- +; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride) +;----------------------------------------------------------------------------- +%macro BLOCKCOPY_SP_W6_H2 2 +INIT_XMM sse2 +cglobal blockcopy_sp_%1x%2, 4, 7, 4, dest, destStride, src, srcStride + add r3, r3 + mov r6d, %2/2 +.loop: + movh m0, [r2] + movd m2, [r2 + 8] + movh m1, [r2 + r3] + movd m3, [r2 + r3 + 8] + dec r6d + lea r2, [r2 + r3 * 2] + packuswb m0, m0 + packuswb m2, m2 + packuswb m1, m1 + packuswb m3, m3 + movd r4d, m2 + movd r5d, m3 + movd [r0], m0 + mov [r0 + 4], r4w + movd [r0 + r1], m1 + mov [r0 + r1 + 4], r5w + lea r0, [r0 + r1 * 2] + jnz .loop + RET +%endmacro + +BLOCKCOPY_SP_W6_H2 6, 8 + +BLOCKCOPY_SP_W6_H2 6, 
16 + +;----------------------------------------------------------------------------- +; void blockcopy_sp_8x2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal blockcopy_sp_8x2, 4, 4, 2, dest, destStride, src, srcStride + +add r3, r3 + +movu m0, [r2] +movu m1, [r2 + r3] + +packuswb m0, m1 + +movlps [r0], m0 +movhps [r0 + r1], m0 + +RET + +;----------------------------------------------------------------------------- +; void blockcopy_sp_8x4(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal blockcopy_sp_8x4, 4, 4, 4, dest, destStride, src, srcStride + +add r3, r3 + +movu m0, [r2] +movu m1, [r2 + r3] +movu m2, [r2 + 2 * r3] +lea r2, [r2 + 2 * r3] +movu m3, [r2 + r3] + +packuswb m0, m1 +packuswb m2, m3 + +movlps [r0], m0 +movhps [r0 + r1], m0 +movlps [r0 + 2 * r1], m2 +lea r0, [r0 + 2 * r1] +movhps [r0 + r1], m2 + +RET + +;----------------------------------------------------------------------------- +; void blockcopy_sp_8x6(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal blockcopy_sp_8x6, 4, 4, 6, dest, destStride, src, srcStride + +add r3, r3 + +movu m0, [r2] +movu m1, [r2 + r3] +movu m2, [r2 + 2 * r3] +lea r2, [r2 + 2 * r3] +movu m3, [r2 + r3] +movu m4, [r2 + 2 * r3] +lea r2, [r2 + 2 * r3] +movu m5, [r2 + r3] + +packuswb m0, m1 +packuswb m2, m3 +packuswb m4, m5 + +movlps [r0], m0 +movhps [r0 + r1], m0 +movlps [r0 + 2 * r1], m2 +lea r0, [r0 + 2 * r1] +movhps [r0 + r1], m2 +movlps [r0 + 2 * r1], m4 +lea r0, [r0 + 2 * r1] +movhps [r0 + r1], m4 + +RET + +;----------------------------------------------------------------------------- +; void blockcopy_sp_8x8(pixel *dest, intptr_t destStride, int16_t *src, 
intptr_t srcStride) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal blockcopy_sp_8x8, 4, 4, 8, dest, destStride, src, srcStride + +add r3, r3 + +movu m0, [r2] +movu m1, [r2 + r3] +movu m2, [r2 + 2 * r3] +lea r2, [r2 + 2 * r3] +movu m3, [r2 + r3] +movu m4, [r2 + 2 * r3] +lea r2, [r2 + 2 * r3] +movu m5, [r2 + r3] +movu m6, [r2 + 2 * r3] +lea r2, [r2 + 2 * r3] +movu m7, [r2 + r3] + +packuswb m0, m1 +packuswb m2, m3 +packuswb m4, m5 +packuswb m6, m7 + +movlps [r0], m0 +movhps [r0 + r1], m0 +movlps [r0 + 2 * r1], m2 +lea r0, [r0 + 2 * r1] +movhps [r0 + r1], m2 +movlps [r0 + 2 * r1], m4 +lea r0, [r0 + 2 * r1] +movhps [r0 + r1], m4 +movlps [r0 + 2 * r1], m6 +lea r0, [r0 + 2 * r1] +movhps [r0 + r1], m6 + +RET + +;----------------------------------------------------------------------------- +; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride) +;----------------------------------------------------------------------------- +%macro BLOCKCOPY_SP_W8_H4 2 +INIT_XMM sse2 +cglobal blockcopy_sp_%1x%2, 4, 5, 4, dest, destStride, src, srcStride + add r3, r3 + mov r4d, %2/4 +.loop: + movu m0, [r2] + movu m1, [r2 + r3] + lea r2, [r2 + r3 * 2] + movu m2, [r2] + movu m3, [r2 + r3] + dec r4d + lea r2, [r2 + r3 * 2] + packuswb m0, m1 + packuswb m2, m3 + movlps [r0], m0 + movhps [r0 + r1], m0 + lea r0, [r0 + r1 * 2] + movlps [r0], m2 + movhps [r0 + r1], m2 + lea r0, [r0 + r1 * 2] + jnz .loop + RET +%endmacro + +BLOCKCOPY_SP_W8_H4 8, 12 + +;----------------------------------------------------------------------------- +; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride) +;----------------------------------------------------------------------------- +%macro BLOCKCOPY_SP_W8_H8 2 +INIT_XMM sse2 +cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride + +mov r4d, %2/8 + +add r3, r3 + +.loop: + movu m0, [r2] + movu m1, [r2 + r3] + movu m2, [r2 + 2 
* r3] + lea r2, [r2 + 2 * r3] + movu m3, [r2 + r3] + movu m4, [r2 + 2 * r3] + lea r2, [r2 + 2 * r3] + movu m5, [r2 + r3] + movu m6, [r2 + 2 * r3] + lea r2, [r2 + 2 * r3] + movu m7, [r2 + r3] + + packuswb m0, m1 + packuswb m2, m3 + packuswb m4, m5 + packuswb m6, m7 + + movlps [r0], m0 + movhps [r0 + r1], m0 + movlps [r0 + 2 * r1], m2 + lea r0, [r0 + 2 * r1] + movhps [r0 + r1], m2 + movlps [r0 + 2 * r1], m4 + lea r0, [r0 + 2 * r1] + movhps [r0 + r1], m4 + movlps [r0 + 2 * r1], m6 + lea r0, [r0 + 2 * r1] + movhps [r0 + r1], m6 + + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loop + +RET +%endmacro + +BLOCKCOPY_SP_W8_H8 8, 16 +BLOCKCOPY_SP_W8_H8 8, 32 + +BLOCKCOPY_SP_W8_H8 8, 64 + +;----------------------------------------------------------------------------- +; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride) +;----------------------------------------------------------------------------- +%macro BLOCKCOPY_SP_W12_H4 2 +INIT_XMM sse2 +cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride + +mov r4d, %2/4 + +add r3, r3 + +.loop: + movu m0, [r2] + movu m1, [r2 + 16] + movu m2, [r2 + r3] + movu m3, [r2 + r3 + 16] + movu m4, [r2 + 2 * r3] + movu m5, [r2 + 2 * r3 + 16] + lea r2, [r2 + 2 * r3] + movu m6, [r2 + r3] + movu m7, [r2 + r3 + 16] + + packuswb m0, m1 + packuswb m2, m3 + packuswb m4, m5 + packuswb m6, m7 + + movh [r0], m0 + pshufd m0, m0, 2 + movd [r0 + 8], m0 + + movh [r0 + r1], m2 + pshufd m2, m2, 2 + movd [r0 + r1 + 8], m2 + + movh [r0 + 2 * r1], m4 + pshufd m4, m4, 2 + movd [r0 + 2 * r1 + 8], m4 + + lea r0, [r0 + 2 * r1] + movh [r0 + r1], m6 + pshufd m6, m6, 2 + movd [r0 + r1 + 8], m6 + + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loop + +RET +%endmacro + +BLOCKCOPY_SP_W12_H4 12, 16 + +BLOCKCOPY_SP_W12_H4 12, 32 + +;----------------------------------------------------------------------------- +; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t 
*src, intptr_t srcStride) +;----------------------------------------------------------------------------- +%macro BLOCKCOPY_SP_W16_H4 2 +INIT_XMM sse2 +cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride + +mov r4d, %2/4 + +add r3, r3 + +.loop: + movu m0, [r2] + movu m1, [r2 + 16] + movu m2, [r2 + r3] + movu m3, [r2 + r3 + 16] + movu m4, [r2 + 2 * r3] + movu m5, [r2 + 2 * r3 + 16] + lea r2, [r2 + 2 * r3] + movu m6, [r2 + r3] + movu m7, [r2 + r3 + 16] + + packuswb m0, m1 + packuswb m2, m3 + packuswb m4, m5 + packuswb m6, m7 + + movu [r0], m0 + movu [r0 + r1], m2 + movu [r0 + 2 * r1], m4 + lea r0, [r0 + 2 * r1] + movu [r0 + r1], m6 + + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loop + +RET +%endmacro + +BLOCKCOPY_SP_W16_H4 16, 4 +BLOCKCOPY_SP_W16_H4 16, 8 +BLOCKCOPY_SP_W16_H4 16, 12 +BLOCKCOPY_SP_W16_H4 16, 16 +BLOCKCOPY_SP_W16_H4 16, 32 +BLOCKCOPY_SP_W16_H4 16, 64 + +BLOCKCOPY_SP_W16_H4 16, 24 + +;----------------------------------------------------------------------------- +; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride) +;----------------------------------------------------------------------------- +%macro BLOCKCOPY_SP_W24_H2 2 +INIT_XMM sse2 +cglobal blockcopy_sp_%1x%2, 4, 5, 6, dest, destStride, src, srcStride + +mov r4d, %2/2 + +add r3, r3 + +.loop: + movu m0, [r2] + movu m1, [r2 + 16] + movu m2, [r2 + 32] + movu m3, [r2 + r3] + movu m4, [r2 + r3 + 16] + movu m5, [r2 + r3 + 32] + + packuswb m0, m1 + packuswb m2, m3 + packuswb m4, m5 + + movu [r0], m0 + movlps [r0 + 16], m2 + movhps [r0 + r1], m2 + movu [r0 + r1 + 8], m4 + + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loop + +RET +%endmacro + +BLOCKCOPY_SP_W24_H2 24, 32 + +BLOCKCOPY_SP_W24_H2 24, 64 + +;----------------------------------------------------------------------------- +; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride) 
+;----------------------------------------------------------------------------- +%macro BLOCKCOPY_SP_W32_H2 2 +INIT_XMM sse2 +cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride + +mov r4d, %2/2 + +add r3, r3 + +.loop: + movu m0, [r2] + movu m1, [r2 + 16] + movu m2, [r2 + 32] + movu m3, [r2 + 48] + movu m4, [r2 + r3] + movu m5, [r2 + r3 + 16] + movu m6, [r2 + r3 + 32] + movu m7, [r2 + r3 + 48] + + packuswb m0, m1 + packuswb m2, m3 + packuswb m4, m5 + packuswb m6, m7 + + movu [r0], m0 + movu [r0 + 16], m2 + movu [r0 + r1], m4 + movu [r0 + r1 + 16], m6 + + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loop + +RET +%endmacro + +BLOCKCOPY_SP_W32_H2 32, 8 +BLOCKCOPY_SP_W32_H2 32, 16 +BLOCKCOPY_SP_W32_H2 32, 24 +BLOCKCOPY_SP_W32_H2 32, 32 +BLOCKCOPY_SP_W32_H2 32, 64 + +BLOCKCOPY_SP_W32_H2 32, 48 + +;----------------------------------------------------------------------------- +; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride) +;----------------------------------------------------------------------------- +%macro BLOCKCOPY_SP_W48_H2 2 +INIT_XMM sse2 +cglobal blockcopy_sp_%1x%2, 4, 5, 6, dest, destStride, src, srcStride + +mov r4d, %2 + +add r3, r3 + +.loop: + movu m0, [r2] + movu m1, [r2 + 16] + movu m2, [r2 + 32] + movu m3, [r2 + 48] + movu m4, [r2 + 64] + movu m5, [r2 + 80] + + packuswb m0, m1 + packuswb m2, m3 + packuswb m4, m5 + + movu [r0], m0 + movu [r0 + 16], m2 + movu [r0 + 32], m4 + + lea r0, [r0 + r1] + lea r2, [r2 + r3] + + dec r4d + jnz .loop + +RET +%endmacro + +BLOCKCOPY_SP_W48_H2 48, 64 + +;----------------------------------------------------------------------------- +; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride) +;----------------------------------------------------------------------------- +%macro BLOCKCOPY_SP_W64_H1 2 +INIT_XMM sse2 +cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride + +mov r4d, %2 + +add r3, 
r3 + +.loop: + movu m0, [r2] + movu m1, [r2 + 16] + movu m2, [r2 + 32] + movu m3, [r2 + 48] + movu m4, [r2 + 64] + movu m5, [r2 + 80] + movu m6, [r2 + 96] + movu m7, [r2 + 112] + + packuswb m0, m1 + packuswb m2, m3 + packuswb m4, m5 + packuswb m6, m7 + + movu [r0], m0 + movu [r0 + 16], m2 + movu [r0 + 32], m4 + movu [r0 + 48], m6 + + lea r0, [r0 + r1] + lea r2, [r2 + r3] + + dec r4d + jnz .loop + +RET +%endmacro + +BLOCKCOPY_SP_W64_H1 64, 16 +BLOCKCOPY_SP_W64_H1 64, 32 +BLOCKCOPY_SP_W64_H1 64, 48 +BLOCKCOPY_SP_W64_H1 64, 64 + +;----------------------------------------------------------------------------- +; void blockfill_s_4x4(int16_t *dest, intptr_t destride, int16_t val) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal blockfill_s_4x4, 3, 3, 1, dest, destStride, val + +add r1, r1 + +movd m0, r2d +pshuflw m0, m0, 0 + +movh [r0], m0 +movh [r0 + r1], m0 +movh [r0 + 2 * r1], m0 +lea r0, [r0 + 2 * r1] +movh [r0 + r1], m0 + +RET + +;----------------------------------------------------------------------------- +; void blockfill_s_8x8(int16_t *dest, intptr_t destride, int16_t val) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal blockfill_s_8x8, 3, 3, 1, dest, destStride, val + +add r1, r1 + +movd m0, r2d +pshuflw m0, m0, 0 +pshufd m0, m0, 0 + +movu [r0], m0 +movu [r0 + r1], m0 +movu [r0 + 2 * r1], m0 + +lea r0, [r0 + 2 * r1] +movu [r0 + r1], m0 +movu [r0 + 2 * r1], m0 + +lea r0, [r0 + 2 * r1] +movu [r0 + r1], m0 +movu [r0 + 2 * r1], m0 + +lea r0, [r0 + 2 * r1] +movu [r0 + r1], m0 + +RET + +;----------------------------------------------------------------------------- +; void blockfill_s_%1x%2(int16_t *dest, intptr_t destride, int16_t val) +;----------------------------------------------------------------------------- +%macro BLOCKFILL_S_W16_H8 2 +INIT_XMM sse2 +cglobal blockfill_s_%1x%2, 3, 5, 1, dest, destStride, val + +mov r3d, %2/8 + +add r1, r1 + +movd m0, 
r2d +pshuflw m0, m0, 0 +pshufd m0, m0, 0 + +.loop: + movu [r0], m0 + movu [r0 + 16], m0 + + movu [r0 + r1], m0 + movu [r0 + r1 + 16], m0 + + movu [r0 + 2 * r1], m0 + movu [r0 + 2 * r1 + 16], m0 + + lea r4, [r0 + 2 * r1] + movu [r4 + r1], m0 + movu [r4 + r1 + 16], m0 + + movu [r0 + 4 * r1], m0 + movu [r0 + 4 * r1 + 16], m0 + + lea r4, [r0 + 4 * r1] + movu [r4 + r1], m0 + movu [r4 + r1 + 16], m0 + + movu [r4 + 2 * r1], m0 + movu [r4 + 2 * r1 + 16], m0 + + lea r4, [r4 + 2 * r1] + movu [r4 + r1], m0 + movu [r4 + r1 + 16], m0 + + lea r0, [r0 + 8 * r1] + + dec r3d + jnz .loop + +RET +%endmacro + +BLOCKFILL_S_W16_H8 16, 16 + +INIT_YMM avx2 +cglobal blockfill_s_16x16, 3, 4, 1 +add r1, r1 +lea r3, [3 * r1] +movd xm0, r2d +vpbroadcastw m0, xm0 + +movu [r0], m0 +movu [r0 + r1], m0 +movu [r0 + 2 * r1], m0 +movu [r0 + r3], m0 +lea r0, [r0 + 4 * r1] +movu [r0], m0 +movu [r0 + r1], m0 +movu [r0 + 2 * r1], m0 +movu [r0 + r3], m0 +lea r0, [r0 + 4 * r1] +movu [r0], m0 +movu [r0 + r1], m0 +movu [r0 + 2 * r1], m0 +movu [r0 + r3], m0 +lea r0, [r0 + 4 * r1] +movu [r0], m0 +movu [r0 + r1], m0 +movu [r0 + 2 * r1], m0 +movu [r0 + r3], m0 +RET + +;----------------------------------------------------------------------------- +; void blockfill_s_%1x%2(int16_t *dest, intptr_t destride, int16_t val) +;----------------------------------------------------------------------------- +%macro BLOCKFILL_S_W32_H4 2 +INIT_XMM sse2 +cglobal blockfill_s_%1x%2, 3, 5, 1, dest, destStride, val + +mov r3d, %2/4 + +add r1, r1 + +movd m0, r2d +pshuflw m0, m0, 0 +pshufd m0, m0, 0 + +.loop: + movu [r0], m0 + movu [r0 + 16], m0 + movu [r0 + 32], m0 + movu [r0 + 48], m0 + + movu [r0 + r1], m0 + movu [r0 + r1 + 16], m0 + movu [r0 + r1 + 32], m0 + movu [r0 + r1 + 48], m0 + + movu [r0 + 2 * r1], m0 + movu [r0 + 2 * r1 + 16], m0 + movu [r0 + 2 * r1 + 32], m0 + movu [r0 + 2 * r1 + 48], m0 + + lea r4, [r0 + 2 * r1] + + movu [r4 + r1], m0 + movu [r4 + r1 + 16], m0 + movu [r4 + r1 + 32], m0 + movu [r4 + r1 + 48], m0 + + lea 
r0, [r0 + 4 * r1] + + dec r3d + jnz .loop + +RET +%endmacro + +BLOCKFILL_S_W32_H4 32, 32 + +INIT_YMM avx2 +cglobal blockfill_s_32x32, 3, 4, 1 +add r1, r1 +lea r3, [3 * r1] +movd xm0, r2d +vpbroadcastw m0, xm0 + +movu [r0], m0 +movu [r0 + 32], m0 +movu [r0 + r1], m0 +movu [r0 + r1 + 32], m0 +movu [r0 + 2 * r1], m0 +movu [r0 + 2 * r1 + 32], m0 +movu [r0 + r3], m0 +movu [r0 + r3 + 32], m0 +lea r0, [r0 + 4 * r1] +movu [r0], m0 +movu [r0 + 32], m0 +movu [r0 + r1], m0 +movu [r0 + r1 + 32], m0 +movu [r0 + 2 * r1], m0 +movu [r0 + 2 * r1 + 32], m0 +movu [r0 + r3], m0 +movu [r0 + r3 + 32], m0 +lea r0, [r0 + 4 * r1] +movu [r0], m0 +movu [r0 + 32], m0 +movu [r0 + r1], m0 +movu [r0 + r1 + 32], m0 +movu [r0 + 2 * r1], m0 +movu [r0 + 2 * r1 + 32], m0 +movu [r0 + r3], m0 +movu [r0 + r3 + 32], m0 +lea r0, [r0 + 4 * r1] +movu [r0], m0 +movu [r0 + 32], m0 +movu [r0 + r1], m0 +movu [r0 + r1 + 32], m0 +movu [r0 + 2 * r1], m0 +movu [r0 + 2 * r1 + 32], m0 +movu [r0 + r3], m0 +movu [r0 + r3 + 32], m0 +lea r0, [r0 + 4 * r1] +movu [r0], m0 +movu [r0 + 32], m0 +movu [r0 + r1], m0 +movu [r0 + r1 + 32], m0 +movu [r0 + 2 * r1], m0 +movu [r0 + 2 * r1 + 32], m0 +movu [r0 + r3], m0 +movu [r0 + r3 + 32], m0 +lea r0, [r0 + 4 * r1] +movu [r0], m0 +movu [r0 + 32], m0 +movu [r0 + r1], m0 +movu [r0 + r1 + 32], m0 +movu [r0 + 2 * r1], m0 +movu [r0 + 2 * r1 + 32], m0 +movu [r0 + r3], m0 +movu [r0 + r3 + 32], m0 +lea r0, [r0 + 4 * r1] +movu [r0], m0 +movu [r0 + 32], m0 +movu [r0 + r1], m0 +movu [r0 + r1 + 32], m0 +movu [r0 + 2 * r1], m0 +movu [r0 + 2 * r1 + 32], m0 +movu [r0 + r3], m0 +movu [r0 + r3 + 32], m0 +lea r0, [r0 + 4 * r1] +movu [r0], m0 +movu [r0 + 32], m0 +movu [r0 + r1], m0 +movu [r0 + r1 + 32], m0 +movu [r0 + 2 * r1], m0 +movu [r0 + 2 * r1 + 32], m0 +movu [r0 + r3], m0 +movu [r0 + r3 + 32], m0 +RET + +;----------------------------------------------------------------------------- +; void blockcopy_ps_2x4(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride); 
+;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal blockcopy_ps_2x4, 4, 4, 1, dest, destStride, src, srcStride + +add r1, r1 + +movd m0, [r2] +pmovzxbw m0, m0 +movd [r0], m0 + +movd m0, [r2 + r3] +pmovzxbw m0, m0 +movd [r0 + r1], m0 + +movd m0, [r2 + 2 * r3] +pmovzxbw m0, m0 +movd [r0 + 2 * r1], m0 + +lea r2, [r2 + 2 * r3] +lea r0, [r0 + 2 * r1] + +movd m0, [r2 + r3] +pmovzxbw m0, m0 +movd [r0 + r1], m0 + +RET + + +;----------------------------------------------------------------------------- +; void blockcopy_ps_2x8(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride); +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal blockcopy_ps_2x8, 4, 4, 1, dest, destStride, src, srcStride + +add r1, r1 + +movd m0, [r2] +pmovzxbw m0, m0 +movd [r0], m0 + +movd m0, [r2 + r3] +pmovzxbw m0, m0 +movd [r0 + r1], m0 + +movd m0, [r2 + 2 * r3] +pmovzxbw m0, m0 +movd [r0 + 2 * r1], m0 + +lea r2, [r2 + 2 * r3] +lea r0, [r0 + 2 * r1] + +movd m0, [r2 + r3] +pmovzxbw m0, m0 +movd [r0 + r1], m0 + +movd m0, [r2 + 2 * r3] +pmovzxbw m0, m0 +movd [r0 + 2 * r1], m0 + +lea r2, [r2 + 2 * r3] +lea r0, [r0 + 2 * r1] + +movd m0, [r2 + r3] +pmovzxbw m0, m0 +movd [r0 + r1], m0 + +movd m0, [r2 + 2 * r3] +pmovzxbw m0, m0 +movd [r0 + 2 * r1], m0 + +lea r2, [r2 + 2 * r3] +lea r0, [r0 + 2 * r1] + +movd m0, [r2 + r3] +pmovzxbw m0, m0 +movd [r0 + r1], m0 + +RET + + +;----------------------------------------------------------------------------- +; void blockcopy_ps_2x16(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride); +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal blockcopy_ps_2x16, 4, 5, 2, dest, destStride, src, srcStride + add r1, r1 + mov r4d, 16/2 +.loop: + movd m0, [r2] + movd m1, [r2 + r3] + dec r4d + lea r2, [r2 + r3 * 2] + pmovzxbw m0, m0 + pmovzxbw m1, m1 + movd [r0], m0 + movd [r0 + r1], m1 + lea r0, [r0 
+ r1 * 2] + jnz .loop + RET + + +;----------------------------------------------------------------------------- +; void blockcopy_ps_4x2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride); +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal blockcopy_ps_4x2, 4, 4, 1, dest, destStride, src, srcStride + +add r1, r1 + +movd m0, [r2] +pmovzxbw m0, m0 +movh [r0], m0 + +movd m0, [r2 + r3] +pmovzxbw m0, m0 +movh [r0 + r1], m0 + +RET + + +;----------------------------------------------------------------------------- +; void blockcopy_ps_4x4(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride); +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal blockcopy_ps_4x4, 4, 4, 1, dest, destStride, src, srcStride + +add r1, r1 + +movd m0, [r2] +pmovzxbw m0, m0 +movh [r0], m0 + +movd m0, [r2 + r3] +pmovzxbw m0, m0 +movh [r0 + r1], m0 + +movd m0, [r2 + 2 * r3] +pmovzxbw m0, m0 +movh [r0 + 2 * r1], m0 + +lea r2, [r2 + 2 * r3] +lea r0, [r0 + 2 * r1] + +movd m0, [r2 + r3] +pmovzxbw m0, m0 +movh [r0 + r1], m0 + +RET + + +;----------------------------------------------------------------------------- +; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride); +;----------------------------------------------------------------------------- +%macro BLOCKCOPY_PS_W4_H4 2 +INIT_XMM sse4 +cglobal blockcopy_ps_%1x%2, 4, 5, 1, dest, destStride, src, srcStride + +add r1, r1 +mov r4d, %2/4 + +.loop: + movd m0, [r2] + pmovzxbw m0, m0 + movh [r0], m0 + + movd m0, [r2 + r3] + pmovzxbw m0, m0 + movh [r0 + r1], m0 + + movd m0, [r2 + 2 * r3] + pmovzxbw m0, m0 + movh [r0 + 2 * r1], m0 + + lea r2, [r2 + 2 * r3] + lea r0, [r0 + 2 * r1] + + movd m0, [r2 + r3] + pmovzxbw m0, m0 + movh [r0 + r1], m0 + + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loop + +RET +%endmacro + +BLOCKCOPY_PS_W4_H4 4, 8 +BLOCKCOPY_PS_W4_H4 4, 16 + 
BLOCKCOPY_PS_W4_H4 4, 32


;-----------------------------------------------------------------------------
; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
;-----------------------------------------------------------------------------
; Widening block copy (8-bit pixel -> int16_t) for width 6, %2 rows, four
; rows per loop iteration.  Only the destination stride is scaled to bytes
; (add r1, r1) because dest elements are 2 bytes; src is 8-bit so r3 is
; already a byte stride.  Per row: movh stores words 0-3, pextrd of dword 2
; stores words 4-5 (6 pixels total).
%macro BLOCKCOPY_PS_W6_H4 2
INIT_XMM sse4
cglobal blockcopy_ps_%1x%2, 4, 5, 1, dest, destStride, src, srcStride

add r1, r1                          ; destStride: element stride -> byte stride
mov r4d, %2/4                       ; loop counter: %2 rows, 4 per iteration

.loop:
    movh m0, [r2]
    pmovzxbw m0, m0                 ; zero-extend 8 bytes -> 8 words (SSE4.1)
    movh [r0], m0
    pextrd [r0 + 8], m0, 2

    movh m0, [r2 + r3]
    pmovzxbw m0, m0
    movh [r0 + r1], m0
    pextrd [r0 + r1 + 8], m0, 2

    movh m0, [r2 + 2 * r3]
    pmovzxbw m0, m0
    movh [r0 + 2 * r1], m0
    pextrd [r0 + 2 * r1 + 8], m0, 2

    lea r2, [r2 + 2 * r3]           ; advance both bases past rows 0-1
    lea r0, [r0 + 2 * r1]

    movh m0, [r2 + r3]              ; fourth row of the group
    pmovzxbw m0, m0
    movh [r0 + r1], m0
    pextrd [r0 + r1 + 8], m0, 2

    lea r0, [r0 + 2 * r1]           ; advance to the next 4-row group
    lea r2, [r2 + 2 * r3]

    dec r4d
    jnz .loop

RET
%endmacro

BLOCKCOPY_PS_W6_H4 6, 8

BLOCKCOPY_PS_W6_H4 6, 16

;-----------------------------------------------------------------------------
; void blockcopy_ps_8x2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
;-----------------------------------------------------------------------------
; Widening copy of an 8x2 block, fully unrolled (no loop); one movh load,
; pmovzxbw widen, and 16-byte movu store per row.
INIT_XMM sse4
cglobal blockcopy_ps_8x2, 4, 4, 1, dest, destStride, src, srcStride

add r1, r1                          ; destStride: element stride -> byte stride

movh m0, [r2]
pmovzxbw m0, m0                     ; zero-extend 8 bytes -> 8 words (SSE4.1)
movu [r0], m0

movh m0, [r2 + r3]
pmovzxbw m0, m0
movu [r0 + r1], m0

RET

;-----------------------------------------------------------------------------
; void blockcopy_ps_8x4(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
;-----------------------------------------------------------------------------
; Widening copy of an 8x4 block, fully unrolled.  Rows 0-2 are addressed
; from the original base pointers ([r2], [r2+r3], [r2+2*r3]).
INIT_XMM sse4
cglobal blockcopy_ps_8x4, 4, 4, 1, dest, destStride, src, srcStride

add r1, r1                          ; destStride: element stride -> byte stride

movh m0, [r2]
pmovzxbw m0, m0                     ; zero-extend 8 bytes -> 8 words (SSE4.1)
movu [r0], m0

movh m0, [r2 + r3]
pmovzxbw m0, m0
movu [r0 + r1], m0

movh m0, [r2 + 2 * r3]
pmovzxbw m0, m0
movu [r0 + 2 * r1], m0

+lea r2, [r2 + 2 * r3] +lea r0, [r0 + 2 * r1] + +movh m0, [r2 + r3] +pmovzxbw m0, m0 +movu [r0 + r1], m0 + +RET + +;----------------------------------------------------------------------------- +; void blockcopy_ps_8x6(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride); +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal blockcopy_ps_8x6, 4, 4, 1, dest, destStride, src, srcStride + +add r1, r1 + +movh m0, [r2] +pmovzxbw m0, m0 +movu [r0], m0 + +movh m0, [r2 + r3] +pmovzxbw m0, m0 +movu [r0 + r1], m0 + +movh m0, [r2 + 2 * r3] +pmovzxbw m0, m0 +movu [r0 + 2 * r1], m0 + +lea r2, [r2 + 2 * r3] +lea r0, [r0 + 2 * r1] + +movh m0, [r2 + r3] +pmovzxbw m0, m0 +movu [r0 + r1], m0 + +movh m0, [r2 + 2 * r3] +pmovzxbw m0, m0 +movu [r0 + 2 * r1], m0 + +lea r2, [r2 + 2 * r3] +lea r0, [r0 + 2 * r1] + +movh m0, [r2 + r3] +pmovzxbw m0, m0 +movu [r0 + r1], m0 + +RET + +;----------------------------------------------------------------------------- +; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride); +;----------------------------------------------------------------------------- +%macro BLOCKCOPY_PS_W8_H4 2 +INIT_XMM sse4 +cglobal blockcopy_ps_%1x%2, 4, 5, 1, dest, destStride, src, srcStride + +add r1, r1 +mov r4d, %2/4 + +.loop: + movh m0, [r2] + pmovzxbw m0, m0 + movu [r0], m0 + + movh m0, [r2 + r3] + pmovzxbw m0, m0 + movu [r0 + r1], m0 + + movh m0, [r2 + 2 * r3] + pmovzxbw m0, m0 + movu [r0 + 2 * r1], m0 + + lea r2, [r2 + 2 * r3] + lea r0, [r0 + 2 * r1] + + movh m0, [r2 + r3] + pmovzxbw m0, m0 + movu [r0 + r1], m0 + + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loop + +RET +%endmacro + +BLOCKCOPY_PS_W8_H4 8, 8 +BLOCKCOPY_PS_W8_H4 8, 16 +BLOCKCOPY_PS_W8_H4 8, 32 + +BLOCKCOPY_PS_W8_H4 8, 12 +BLOCKCOPY_PS_W8_H4 8, 64 + + +;----------------------------------------------------------------------------- +; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t 
destStride, pixel *src, intptr_t srcStride); +;----------------------------------------------------------------------------- +%macro BLOCKCOPY_PS_W12_H2 2 +INIT_XMM sse4 +cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride + +add r1, r1 +mov r4d, %2/2 +pxor m0, m0 + +.loop: + movu m1, [r2] + pmovzxbw m2, m1 + movu [r0], m2 + punpckhbw m1, m0 + movh [r0 + 16], m1 + + movu m1, [r2 + r3] + pmovzxbw m2, m1 + movu [r0 + r1], m2 + punpckhbw m1, m0 + movh [r0 + r1 + 16], m1 + + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loop + +RET +%endmacro + +BLOCKCOPY_PS_W12_H2 12, 16 + +BLOCKCOPY_PS_W12_H2 12, 32 + +;----------------------------------------------------------------------------- +; void blockcopy_ps_16x4(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride); +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal blockcopy_ps_16x4, 4, 4, 3, dest, destStride, src, srcStride + +add r1, r1 +pxor m0, m0 + +movu m1, [r2] +pmovzxbw m2, m1 +movu [r0], m2 +punpckhbw m1, m0 +movu [r0 + 16], m1 + +movu m1, [r2 + r3] +pmovzxbw m2, m1 +movu [r0 + r1], m2 +punpckhbw m1, m0 +movu [r0 + r1 + 16], m1 + +movu m1, [r2 + 2 * r3] +pmovzxbw m2, m1 +movu [r0 + 2 * r1], m2 +punpckhbw m1, m0 +movu [r0 + 2 * r1 + 16], m1 + +lea r0, [r0 + 2 * r1] +lea r2, [r2 + 2 * r3] + +movu m1, [r2 + r3] +pmovzxbw m2, m1 +movu [r0 + r1], m2 +punpckhbw m1, m0 +movu [r0 + r1 + 16], m1 + +RET + +;----------------------------------------------------------------------------- +; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride); +;----------------------------------------------------------------------------- +%macro BLOCKCOPY_PS_W16_H4 2 +INIT_XMM sse4 +cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride + +add r1, r1 +mov r4d, %2/4 +pxor m0, m0 + +.loop: + movu m1, [r2] + pmovzxbw m2, m1 + movu [r0], m2 + punpckhbw m1, m0 + movu [r0 + 16], m1 + + movu m1, 
[r2 + r3] + pmovzxbw m2, m1 + movu [r0 + r1], m2 + punpckhbw m1, m0 + movu [r0 + r1 + 16], m1 + + movu m1, [r2 + 2 * r3] + pmovzxbw m2, m1 + movu [r0 + 2 * r1], m2 + punpckhbw m1, m0 + movu [r0 + 2 * r1 + 16], m1 + + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + + movu m1, [r2 + r3] + pmovzxbw m2, m1 + movu [r0 + r1], m2 + punpckhbw m1, m0 + movu [r0 + r1 + 16], m1 + + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loop + +RET +%endmacro + +BLOCKCOPY_PS_W16_H4 16, 8 +BLOCKCOPY_PS_W16_H4 16, 12 +BLOCKCOPY_PS_W16_H4 16, 16 +BLOCKCOPY_PS_W16_H4 16, 32 +BLOCKCOPY_PS_W16_H4 16, 64 + +BLOCKCOPY_PS_W16_H4 16, 24 + +;----------------------------------------------------------------------------- +; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride); +;----------------------------------------------------------------------------- +%macro BLOCKCOPY_PS_W24_H2 2 +INIT_XMM sse4 +cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride + +add r1, r1 +mov r4d, %2/2 +pxor m0, m0 + +.loop: + movu m1, [r2] + pmovzxbw m2, m1 + movu [r0], m2 + punpckhbw m1, m0 + movu [r0 + 16], m1 + + movh m1, [r2 + 16] + pmovzxbw m1, m1 + movu [r0 + 32], m1 + + movu m1, [r2 + r3] + pmovzxbw m2, m1 + movu [r0 + r1], m2 + punpckhbw m1, m0 + movu [r0 + r1 + 16], m1 + + movh m1, [r2 + r3 + 16] + pmovzxbw m1, m1 + movu [r0 + r1 + 32], m1 + + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loop + +RET +%endmacro + +BLOCKCOPY_PS_W24_H2 24, 32 + +BLOCKCOPY_PS_W24_H2 24, 64 + +;----------------------------------------------------------------------------- +; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride); +;----------------------------------------------------------------------------- +%macro BLOCKCOPY_PS_W32_H2 2 +INIT_XMM sse4 +cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride + +add r1, r1 +mov r4d, %2/2 +pxor m0, m0 + +.loop: + movu m1, [r2] + pmovzxbw m2, 
m1 + movu [r0], m2 + punpckhbw m1, m0 + movu [r0 + 16], m1 + + movu m1, [r2 + 16] + pmovzxbw m2, m1 + movu [r0 + 32], m2 + punpckhbw m1, m0 + movu [r0 + 48], m1 + + movu m1, [r2 + r3] + pmovzxbw m2, m1 + movu [r0 + r1], m2 + punpckhbw m1, m0 + movu [r0 + r1 + 16], m1 + + movu m1, [r2 + r3 + 16] + pmovzxbw m2, m1 + movu [r0 + r1 + 32], m2 + punpckhbw m1, m0 + movu [r0 + r1 + 48], m1 + + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loop + +RET +%endmacro + +BLOCKCOPY_PS_W32_H2 32, 8 +BLOCKCOPY_PS_W32_H2 32, 16 +BLOCKCOPY_PS_W32_H2 32, 24 +BLOCKCOPY_PS_W32_H2 32, 32 +BLOCKCOPY_PS_W32_H2 32, 64 + +BLOCKCOPY_PS_W32_H2 32, 48 + +;----------------------------------------------------------------------------- +; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride); +;----------------------------------------------------------------------------- +%macro BLOCKCOPY_PS_W48_H2 2 +INIT_XMM sse4 +cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride + +add r1, r1 +mov r4d, %2/2 +pxor m0, m0 + +.loop: + movu m1, [r2] + pmovzxbw m2, m1 + movu [r0], m2 + punpckhbw m1, m0 + movu [r0 + 16], m1 + + movu m1, [r2 + 16] + pmovzxbw m2, m1 + movu [r0 + 32], m2 + punpckhbw m1, m0 + movu [r0 + 48], m1 + + movu m1, [r2 + 32] + pmovzxbw m2, m1 + movu [r0 + 64], m2 + punpckhbw m1, m0 + movu [r0 + 80], m1 + + movu m1, [r2 + r3] + pmovzxbw m2, m1 + movu [r0 + r1], m2 + punpckhbw m1, m0 + movu [r0 + r1 + 16], m1 + + movu m1, [r2 + r3 + 16] + pmovzxbw m2, m1 + movu [r0 + r1 + 32], m2 + punpckhbw m1, m0 + movu [r0 + r1 + 48], m1 + + movu m1, [r2 + r3 + 32] + pmovzxbw m2, m1 + movu [r0 + r1 + 64], m2 + punpckhbw m1, m0 + movu [r0 + r1 + 80], m1 + + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loop + +RET +%endmacro + +BLOCKCOPY_PS_W48_H2 48, 64 + +;----------------------------------------------------------------------------- +; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t 
srcStride); +;----------------------------------------------------------------------------- +%macro BLOCKCOPY_PS_W64_H2 2 +INIT_XMM sse4 +cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride + +add r1, r1 +mov r4d, %2/2 +pxor m0, m0 + +.loop: + movu m1, [r2] + pmovzxbw m2, m1 + movu [r0], m2 + punpckhbw m1, m0 + movu [r0 + 16], m1 + + movu m1, [r2 + 16] + pmovzxbw m2, m1 + movu [r0 + 32], m2 + punpckhbw m1, m0 + movu [r0 + 48], m1 + + movu m1, [r2 + 32] + pmovzxbw m2, m1 + movu [r0 + 64], m2 + punpckhbw m1, m0 + movu [r0 + 80], m1 + + movu m1, [r2 + 48] + pmovzxbw m2, m1 + movu [r0 + 96], m2 + punpckhbw m1, m0 + movu [r0 + 112], m1 + + movu m1, [r2 + r3] + pmovzxbw m2, m1 + movu [r0 + r1], m2 + punpckhbw m1, m0 + movu [r0 + r1 + 16], m1 + + movu m1, [r2 + r3 + 16] + pmovzxbw m2, m1 + movu [r0 + r1 + 32], m2 + punpckhbw m1, m0 + movu [r0 + r1 + 48], m1 + + movu m1, [r2 + r3 + 32] + pmovzxbw m2, m1 + movu [r0 + r1 + 64], m2 + punpckhbw m1, m0 + movu [r0 + r1 + 80], m1 + + movu m1, [r2 + r3 + 48] + pmovzxbw m2, m1 + movu [r0 + r1 + 96], m2 + punpckhbw m1, m0 + movu [r0 + r1 + 112], m1 + + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loop + +RET +%endmacro + +BLOCKCOPY_PS_W64_H2 64, 16 +BLOCKCOPY_PS_W64_H2 64, 32 +BLOCKCOPY_PS_W64_H2 64, 48 +BLOCKCOPY_PS_W64_H2 64, 64 + +;----------------------------------------------------------------------------- +; void blockcopy_ss_2x4(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal blockcopy_ss_2x4, 4, 6, 0 + add r1, r1 + add r3, r3 + + mov r4d, [r2] + mov r5d, [r2 + r3] + mov [r0], r4d + mov [r0 + r1], r5d + + lea r2, [r2 + r3 * 2] + lea r0, [r0 + 2 * r1] + + mov r4d, [r2] + mov r5d, [r2 + r3] + mov [r0], r4d + mov [r0 + r1], r5d + + RET + +;----------------------------------------------------------------------------- +; void blockcopy_ss_2x8(int16_t *dest, intptr_t 
deststride, int16_t *src, intptr_t srcstride) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal blockcopy_ss_2x8, 4, 6, 0 + add r1, r1 + add r3, r3 + + mov r4d, [r2] + mov r5d, [r2 + r3] + mov [r0], r4d + mov [r0 + r1], r5d + + lea r2, [r2 + r3 * 2] + lea r0, [r0 + 2 * r1] + + mov r4d, [r2] + mov r5d, [r2 + r3] + mov [r0], r4d + mov [r0 + r1], r5d + + lea r2, [r2 + r3 * 2] + lea r0, [r0 + 2 * r1] + + mov r4d, [r2] + mov r5d, [r2 + r3] + mov [r0], r4d + mov [r0 + r1], r5d + + lea r2, [r2 + r3 * 2] + lea r0, [r0 + 2 * r1] + + mov r4d, [r2] + mov r5d, [r2 + r3] + mov [r0], r4d + mov [r0 + r1], r5d + + RET + +;----------------------------------------------------------------------------- +; void blockcopy_ss_2x16(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal blockcopy_ss_2x16, 4, 7, 0 + add r1, r1 + add r3, r3 + mov r6d, 16/2 +.loop: + mov r4d, [r2] + mov r5d, [r2 + r3] + dec r6d + lea r2, [r2 + r3 * 2] + mov [r0], r4d + mov [r0 + r1], r5d + lea r0, [r0 + r1 * 2] + jnz .loop + RET + + +;----------------------------------------------------------------------------- +; void blockcopy_ss_4x2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal blockcopy_ss_4x2, 4, 4, 2 + add r1, r1 + add r3, r3 + + movh m0, [r2] + movh m1, [r2 + r3] + + movh [r0], m0 + movh [r0 + r1], m1 + + RET + +;----------------------------------------------------------------------------- +; void blockcopy_ss_4x4(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal blockcopy_ss_4x4, 4, 4, 4 + add r1, r1 + add r3, r3 + movh m0, [r2] + movh m1, [r2 + r3] + lea r2, [r2 + r3 * 2] + movh m2, 
[r2] + movh m3, [r2 + r3] + + movh [r0], m0 + movh [r0 + r1], m1 + lea r0, [r0 + 2 * r1] + movh [r0], m2 + movh [r0 + r1], m3 + RET + +;----------------------------------------------------------------------------- +; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride) +;----------------------------------------------------------------------------- +%macro BLOCKCOPY_SS_W4_H8 2 +INIT_XMM sse2 +cglobal blockcopy_ss_%1x%2, 4, 5, 4 + mov r4d, %2/8 + add r1, r1 + add r3, r3 +.loop: + movh m0, [r2] + movh m1, [r2 + r3] + lea r2, [r2 + r3 * 2] + movh m2, [r2] + movh m3, [r2 + r3] + + movh [r0], m0 + movh [r0 + r1], m1 + lea r0, [r0 + 2 * r1] + movh [r0], m2 + movh [r0 + r1], m3 + + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + movh m0, [r2] + movh m1, [r2 + r3] + lea r2, [r2 + r3 * 2] + movh m2, [r2] + movh m3, [r2 + r3] + + movh [r0], m0 + movh [r0 + r1], m1 + lea r0, [r0 + 2 * r1] + movh [r0], m2 + movh [r0 + r1], m3 + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loop + RET +%endmacro + +BLOCKCOPY_SS_W4_H8 4, 8 +BLOCKCOPY_SS_W4_H8 4, 16 + +BLOCKCOPY_SS_W4_H8 4, 32 + +;----------------------------------------------------------------------------- +; void blockcopy_ss_6x8(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal blockcopy_ss_6x8, 4, 4, 4 + add r1, r1 + add r3, r3 + + movu m0, [r2] + movu m1, [r2 + r3] + pshufd m2, m0, 2 + pshufd m3, m1, 2 + movh [r0], m0 + movd [r0 + 8], m2 + movh [r0 + r1], m1 + movd [r0 + r1 + 8], m3 + + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + + movu m0, [r2] + movu m1, [r2 + r3] + pshufd m2, m0, 2 + pshufd m3, m1, 2 + movh [r0], m0 + movd [r0 + 8], m2 + movh [r0 + r1], m1 + movd [r0 + r1 + 8], m3 + + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + + movu m0, [r2] + movu m1, [r2 + r3] + pshufd m2, m0, 2 + pshufd m3, m1, 2 + movh [r0], m0 + movd [r0 + 8], 
m2 + movh [r0 + r1], m1 + movd [r0 + r1 + 8], m3 + + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + + movu m0, [r2] + movu m1, [r2 + r3] + pshufd m2, m0, 2 + pshufd m3, m1, 2 + movh [r0], m0 + movd [r0 + 8], m2 + movh [r0 + r1], m1 + movd [r0 + r1 + 8], m3 + + RET + +;----------------------------------------------------------------------------- +; void blockcopy_ss_6x16(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal blockcopy_ss_6x16, 4, 5, 4 + add r1, r1 + add r3, r3 + mov r4d, 16/2 +.loop: + movh m0, [r2] + movd m2, [r2 + 8] + movh m1, [r2 + r3] + movd m3, [r2 + r3 + 8] + dec r4d + lea r2, [r2 + r3 * 2] + movh [r0], m0 + movd [r0 + 8], m2 + movh [r0 + r1], m1 + movd [r0 + r1 + 8], m3 + lea r0, [r0 + r1 * 2] + jnz .loop + RET + + +;----------------------------------------------------------------------------- +; void blockcopy_ss_8x2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal blockcopy_ss_8x2, 4, 4, 2 + add r1, r1 + add r3, r3 + + movu m0, [r2] + movu m1, [r2 + r3] + + movu [r0], m0 + movu [r0 + r1], m1 + + RET + +;----------------------------------------------------------------------------- +; void blockcopy_ss_8x4(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal blockcopy_ss_8x4, 4, 4, 4 + add r1, r1 + add r3, r3 + + movu m0, [r2] + movu m1, [r2 + r3] + lea r2, [r2 + r3 * 2] + movu m2, [r2] + movu m3, [r2 + r3] + + movu [r0], m0 + movu [r0 + r1], m1 + lea r0, [r0 + 2 * r1] + movu [r0], m2 + movu [r0 + r1], m3 + RET + +;----------------------------------------------------------------------------- +; void blockcopy_ss_8x6(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t 
; srcstride)  -- continuation of the blockcopy_ss_8x6 prototype above
;-----------------------------------------------------------------------------
; Straight 16-bit copy (int16_t -> int16_t) of an 8x6 block, fully unrolled.
; Both strides are element strides and are scaled to bytes (add r1/r3);
; each row is one 16-byte movu load + store.
INIT_XMM sse2
cglobal blockcopy_ss_8x6, 4, 4, 4

    add r1, r1
    add r3, r3
    movu m0, [r2]
    movu m1, [r2 + r3]
    lea r2, [r2 + r3 * 2]
    movu m2, [r2]
    movu m3, [r2 + r3]

    movu [r0], m0
    movu [r0 + r1], m1
    lea r0, [r0 + 2 * r1]
    movu [r0], m2
    movu [r0 + r1], m3

    lea r2, [r2 + r3 * 2]           ; rows 4-5
    lea r0, [r0 + 2 * r1]

    movu m0, [r2]
    movu m1, [r2 + r3]
    movu [r0], m0
    movu [r0 + r1], m1
    RET

;-----------------------------------------------------------------------------
; void blockcopy_ss_8x12(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
; 8x12 16-bit copy, two rows per iteration (r4d = 12/2 = 6 iterations).
INIT_XMM sse2
cglobal blockcopy_ss_8x12, 4, 5, 2
    add r1, r1
    add r3, r3
    mov r4d, 12/2
.loop:
    movu m0, [r2]
    movu m1, [r2 + r3]
    lea r2, [r2 + 2 * r3]
    dec r4d                         ; decrement early; jnz below consumes ZF
    movu [r0], m0
    movu [r0 + r1], m1
    lea r0, [r0 + 2 * r1]
    jnz .loop
    RET


;-----------------------------------------------------------------------------
; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
; Width-8 16-bit copy, eight rows per loop iteration (%2/8 iterations),
; unrolled as two identical 4-row groups.
%macro BLOCKCOPY_SS_W8_H8 2
INIT_XMM sse2
cglobal blockcopy_ss_%1x%2, 4, 5, 4
    mov r4d, %2/8
    add r1, r1
    add r3, r3
.loop:
    movu m0, [r2]
    movu m1, [r2 + r3]
    lea r2, [r2 + r3 * 2]
    movu m2, [r2]
    movu m3, [r2 + r3]

    movu [r0], m0
    movu [r0 + r1], m1
    lea r0, [r0 + 2 * r1]
    movu [r0], m2
    movu [r0 + r1], m3


    lea r2, [r2 + 2 * r3]           ; second 4-row group
    lea r0, [r0 + 2 * r1]

    movu m0, [r2]
    movu m1, [r2 + r3]
    lea r2, [r2 + r3 * 2]
    movu m2, [r2]
    movu m3, [r2 + r3]

    movu [r0], m0
    movu [r0 + r1], m1
    lea r0, [r0 + 2 * r1]
    movu [r0], m2
    movu [r0 + r1], m3

    dec r4d
    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]
    jnz .loop
RET
%endmacro

BLOCKCOPY_SS_W8_H8 8, 8
BLOCKCOPY_SS_W8_H8 8, 16
BLOCKCOPY_SS_W8_H8 8, 32

BLOCKCOPY_SS_W8_H8 8, 64

;-----------------------------------------------------------------------------
; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
; Width-12 16-bit copy: each 24-byte row is one 16-byte movu plus one
; 8-byte movh; four rows per loop iteration.
%macro BLOCKCOPY_SS_W12_H4 2
INIT_XMM sse2
cglobal blockcopy_ss_%1x%2, 4, 5, 4

    mov r4d, %2/4
    add r1, r1
    add r3, r3
.loop:
    movu m0, [r2]
    movh m1, [r2 + 16]
    movu m2, [r2 + r3]
    movh m3, [r2 + r3 + 16]
    lea r2, [r2 + 2 * r3]

    movu [r0], m0
    movh [r0 + 16], m1
    movu [r0 + r1], m2
    movh [r0 + r1 + 16], m3

    lea r0, [r0 + 2 * r1]           ; second row pair
    movu m0, [r2]
    movh m1, [r2 + 16]
    movu m2, [r2 + r3]
    movh m3, [r2 + r3 + 16]

    movu [r0], m0
    movh [r0 + 16], m1
    movu [r0 + r1], m2
    movh [r0 + r1 + 16], m3

    dec r4d
    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]
    jnz .loop
    RET
%endmacro

BLOCKCOPY_SS_W12_H4 12, 16

BLOCKCOPY_SS_W12_H4 12, 32

;-----------------------------------------------------------------------------
; void blockcopy_ss_16x4(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
; Width-16 16-bit copy (32 bytes per row = two movu); four rows per
; iteration, unrolled as two row pairs.
%macro BLOCKCOPY_SS_W16_H4 2
INIT_XMM sse2
cglobal blockcopy_ss_%1x%2, 4, 5, 4
    mov r4d, %2/4
    add r1, r1
    add r3, r3
.loop:
    movu m0, [r2]
    movu m1, [r2 + 16]
    movu m2, [r2 + r3]
    movu m3, [r2 + r3 + 16]

    movu [r0], m0
    movu [r0 + 16], m1
    movu [r0 + r1], m2
    movu [r0 + r1 + 16], m3

    lea r2, [r2 + 2 * r3]           ; second row pair
    lea r0, [r0 + 2 * r1]

    movu m0, [r2]
    movu m1, [r2 + 16]
    movu m2, [r2 + r3]
    movu m3, [r2 + r3 + 16]

    movu [r0], m0
    movu [r0 + 16], m1
    movu [r0 + r1], m2
    movu [r0 + r1 + 16], m3

    dec r4d
    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]
    jnz .loop
    RET
%endmacro

BLOCKCOPY_SS_W16_H4 16, 4
BLOCKCOPY_SS_W16_H4 16, 12

;-----------------------------------------------------------------------------
; 
void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride) +;----------------------------------------------------------------------------- +%macro BLOCKCOPY_SS_W16_H4_avx 2 +INIT_YMM avx +cglobal blockcopy_ss_%1x%2, 4, 7, 4 + mov r4d, %2/4 + add r1, r1 + add r3, r3 + lea r5, [3 * r3] + lea r6, [3 * r1] +.loop: + movu m0, [r2] + movu m1, [r2 + r3] + movu m2, [r2 + 2 * r3] + movu m3, [r2 + r5] + + movu [r0], m0 + movu [r0 + r1], m1 + movu [r0 + 2 * r1], m2 + movu [r0 + r6], m3 + + lea r0, [r0 + 4 * r1] + lea r2, [r2 + 4 * r3] + dec r4d + jnz .loop + RET +%endmacro + +BLOCKCOPY_SS_W16_H4_avx 16, 4 +BLOCKCOPY_SS_W16_H4_avx 16, 12 +BLOCKCOPY_SS_W16_H4_avx 16, 8 +BLOCKCOPY_SS_W16_H4_avx 16, 16 +BLOCKCOPY_SS_W16_H4_avx 16, 24 +BLOCKCOPY_SS_W16_H4_avx 16, 32 +BLOCKCOPY_SS_W16_H4_avx 16, 64 + +;----------------------------------------------------------------------------- +; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride) +;----------------------------------------------------------------------------- +%macro BLOCKCOPY_SS_W16_H8 2 +INIT_XMM sse2 +cglobal blockcopy_ss_%1x%2, 4, 5, 4 + mov r4d, %2/8 + add r1, r1 + add r3, r3 +.loop: + movu m0, [r2] + movu m1, [r2 + 16] + movu m2, [r2 + r3] + movu m3, [r2 + r3 + 16] + + movu [r0], m0 + movu [r0 + 16], m1 + movu [r0 + r1], m2 + movu [r0 + r1 + 16], m3 + + lea r2, [r2 + 2 * r3] + lea r0, [r0 + 2 * r1] + + movu m0, [r2] + movu m1, [r2 + 16] + movu m2, [r2 + r3] + movu m3, [r2 + r3 + 16] + + movu [r0], m0 + movu [r0 + 16], m1 + movu [r0 + r1], m2 + movu [r0 + r1 + 16], m3 + + lea r2, [r2 + 2 * r3] + lea r0, [r0 + 2 * r1] + + movu m0, [r2] + movu m1, [r2 + 16] + movu m2, [r2 + r3] + movu m3, [r2 + r3 + 16] + + movu [r0], m0 + movu [r0 + 16], m1 + movu [r0 + r1], m2 + movu [r0 + r1 + 16], m3 + + lea r2, [r2 + 2 * r3] + lea r0, [r0 + 2 * r1] + + movu m0, [r2] + movu m1, [r2 + 16] + movu m2, [r2 + r3] + movu m3, [r2 + r3 + 16] + + movu [r0], m0 + movu [r0 + 
16], m1 + movu [r0 + r1], m2 + movu [r0 + r1 + 16], m3 + + dec r4d + lea r2, [r2 + 2 * r3] + lea r0, [r0 + 2 * r1] + jnz .loop + RET +%endmacro + +BLOCKCOPY_SS_W16_H8 16, 8 +BLOCKCOPY_SS_W16_H8 16, 16 +BLOCKCOPY_SS_W16_H8 16, 32 +BLOCKCOPY_SS_W16_H8 16, 64 + +BLOCKCOPY_SS_W16_H8 16, 24 + +;----------------------------------------------------------------------------- +; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride) +;----------------------------------------------------------------------------- +%macro BLOCKCOPY_SS_W24_H4 2 +INIT_XMM sse2 +cglobal blockcopy_ss_%1x%2, 4, 5, 6 + mov r4d, %2/4 + add r1, r1 + add r3, r3 +.loop + movu m0, [r2] + movu m1, [r2 + 16] + movu m2, [r2 + 32] + movu m3, [r2 + r3] + movu m4, [r2 + r3 + 16] + movu m5, [r2 + r3 + 32] + + movu [r0], m0 + movu [r0 + 16], m1 + movu [r0 + 32], m2 + movu [r0 + r1], m3 + movu [r0 + r1 + 16], m4 + movu [r0 + r1 + 32], m5 + + lea r2, [r2 + 2 * r3] + lea r0, [r0 + 2 * r1] + + movu m0, [r2] + movu m1, [r2 + 16] + movu m2, [r2 + 32] + movu m3, [r2 + r3] + movu m4, [r2 + r3 + 16] + movu m5, [r2 + r3 + 32] + + movu [r0], m0 + movu [r0 + 16], m1 + movu [r0 + 32], m2 + movu [r0 + r1], m3 + movu [r0 + r1 + 16], m4 + movu [r0 + r1 + 32], m5 + + dec r4d + lea r2, [r2 + 2 * r3] + lea r0, [r0 + 2 * r1] + jnz .loop + RET +%endmacro + +BLOCKCOPY_SS_W24_H4 24, 32 + +BLOCKCOPY_SS_W24_H4 24, 64 + +;----------------------------------------------------------------------------- +; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride) +;----------------------------------------------------------------------------- +%macro BLOCKCOPY_SS_W32_H4 2 +INIT_XMM sse2 +cglobal blockcopy_ss_%1x%2, 4, 5, 4 + mov r4d, %2/4 + add r1, r1 + add r3, r3 +.loop: + movu m0, [r2] + movu m1, [r2 + 16] + movu m2, [r2 + 32] + movu m3, [r2 + 48] + + movu [r0], m0 + movu [r0 + 16], m1 + movu [r0 + 32], m2 + movu [r0 + 48], m3 + + movu m0, [r2 + r3] + movu m1, [r2 + r3 
+ 16] + movu m2, [r2 + r3 + 32] + movu m3, [r2 + r3 + 48] + + movu [r0 + r1], m0 + movu [r0 + r1 + 16], m1 + movu [r0 + r1 + 32], m2 + movu [r0 + r1 + 48], m3 + + lea r2, [r2 + 2 * r3] + lea r0, [r0 + 2 * r1] + + movu m0, [r2] + movu m1, [r2 + 16] + movu m2, [r2 + 32] + movu m3, [r2 + 48] + + movu [r0], m0 + movu [r0 + 16], m1 + movu [r0 + 32], m2 + movu [r0 + 48], m3 + + movu m0, [r2 + r3] + movu m1, [r2 + r3 + 16] + movu m2, [r2 + r3 + 32] + movu m3, [r2 + r3 + 48] + + movu [r0 + r1], m0 + movu [r0 + r1 + 16], m1 + movu [r0 + r1 + 32], m2 + movu [r0 + r1 + 48], m3 + + dec r4d + lea r2, [r2 + 2 * r3] + lea r0, [r0 + 2 * r1] + jnz .loop + RET +%endmacro + +BLOCKCOPY_SS_W32_H4 32, 8 +BLOCKCOPY_SS_W32_H4 32, 16 +BLOCKCOPY_SS_W32_H4 32, 24 +BLOCKCOPY_SS_W32_H4 32, 32 +BLOCKCOPY_SS_W32_H4 32, 64 + +BLOCKCOPY_SS_W32_H4 32, 48 + +;----------------------------------------------------------------------------- +; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride) +;----------------------------------------------------------------------------- +%macro BLOCKCOPY_SS_W48_H2 2 +INIT_XMM sse2 +cglobal blockcopy_ss_%1x%2, 4, 5, 6 + mov r4d, %2/4 + add r1, r1 + add r3, r3 +.loop: + movu m0, [r2] + movu m1, [r2 + 16] + movu m2, [r2 + 32] + movu m3, [r2 + 48] + movu m4, [r2 + 64] + movu m5, [r2 + 80] + + movu [r0], m0 + movu [r0 + 16], m1 + movu [r0 + 32], m2 + movu [r0 + 48], m3 + movu [r0 + 64], m4 + movu [r0 + 80], m5 + + movu m0, [r2 + r3] + movu m1, [r2 + r3 + 16] + movu m2, [r2 + r3 + 32] + movu m3, [r2 + r3 + 48] + movu m4, [r2 + r3 + 64] + movu m5, [r2 + r3 + 80] + + movu [r0 + r1], m0 + movu [r0 + r1 + 16], m1 + movu [r0 + r1 + 32], m2 + movu [r0 + r1 + 48], m3 + movu [r0 + r1 + 64], m4 + movu [r0 + r1 + 80], m5 + + lea r2, [r2 + 2 * r3] + lea r0, [r0 + 2 * r1] + + movu m0, [r2] + movu m1, [r2 + 16] + movu m2, [r2 + 32] + movu m3, [r2 + 48] + movu m4, [r2 + 64] + movu m5, [r2 + 80] + + movu [r0], m0 + movu [r0 + 16], m1 + movu [r0 + 
32], m2 + movu [r0 + 48], m3 + movu [r0 + 64], m4 + movu [r0 + 80], m5 + + movu m0, [r2 + r3] + movu m1, [r2 + r3 + 16] + movu m2, [r2 + r3 + 32] + movu m3, [r2 + r3 + 48] + movu m4, [r2 + r3 + 64] + movu m5, [r2 + r3 + 80] + + movu [r0 + r1], m0 + movu [r0 + r1 + 16], m1 + movu [r0 + r1 + 32], m2 + movu [r0 + r1 + 48], m3 + movu [r0 + r1 + 64], m4 + movu [r0 + r1 + 80], m5 + + dec r4d + lea r2, [r2 + 2 * r3] + lea r0, [r0 + 2 * r1] + jnz .loop +RET +%endmacro + +BLOCKCOPY_SS_W48_H2 48, 64 + +;----------------------------------------------------------------------------- +; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride) +;----------------------------------------------------------------------------- +%macro BLOCKCOPY_SS_W64_H4 2 +INIT_XMM sse2 +cglobal blockcopy_ss_%1x%2, 4, 5, 6, dest, deststride, src, srcstride + mov r4d, %2/4 + add r1, r1 + add r3, r3 +.loop: + movu m0, [r2] + movu m1, [r2 + 16] + movu m2, [r2 + 32] + movu m3, [r2 + 48] + + movu [r0], m0 + movu [r0 + 16], m1 + movu [r0 + 32], m2 + movu [r0 + 48], m3 + + movu m0, [r2 + 64] + movu m1, [r2 + 80] + movu m2, [r2 + 96] + movu m3, [r2 + 112] + + movu [r0 + 64], m0 + movu [r0 + 80], m1 + movu [r0 + 96], m2 + movu [r0 + 112], m3 + + movu m0, [r2 + r3] + movu m1, [r2 + r3 + 16] + movu m2, [r2 + r3 + 32] + movu m3, [r2 + r3 + 48] + + movu [r0 + r1], m0 + movu [r0 + r1 + 16], m1 + movu [r0 + r1 + 32], m2 + movu [r0 + r1 + 48], m3 + + movu m0, [r2 + r3 + 64] + movu m1, [r2 + r3 + 80] + movu m2, [r2 + r3 + 96] + movu m3, [r2 + r3 + 112] + + movu [r0 + r1 + 64], m0 + movu [r0 + r1 + 80], m1 + movu [r0 + r1 + 96], m2 + movu [r0 + r1 + 112], m3 + + lea r2, [r2 + 2 * r3] + lea r0, [r0 + 2 * r1] + + movu m0, [r2] + movu m1, [r2 + 16] + movu m2, [r2 + 32] + movu m3, [r2 + 48] + + movu [r0], m0 + movu [r0 + 16], m1 + movu [r0 + 32], m2 + movu [r0 + 48], m3 + + movu m0, [r2 + 64] + movu m1, [r2 + 80] + movu m2, [r2 + 96] + movu m3, [r2 + 112] + + movu [r0 + 64], m0 + movu 
[r0 + 80], m1 + movu [r0 + 96], m2 + movu [r0 + 112], m3 + + movu m0, [r2 + r3] + movu m1, [r2 + r3 + 16] + movu m2, [r2 + r3 + 32] + movu m3, [r2 + r3 + 48] + + movu [r0 + r1], m0 + movu [r0 + r1 + 16], m1 + movu [r0 + r1 + 32], m2 + movu [r0 + r1 + 48], m3 + + movu m0, [r2 + r3 + 64] + movu m1, [r2 + r3 + 80] + movu m2, [r2 + r3 + 96] + movu m3, [r2 + r3 + 112] + + movu [r0 + r1 + 64], m0 + movu [r0 + r1 + 80], m1 + movu [r0 + r1 + 96], m2 + movu [r0 + r1 + 112], m3 + + dec r4d + lea r2, [r2 + 2 * r3] + lea r0, [r0 + 2 * r1] + jnz .loop + + RET +%endmacro + +BLOCKCOPY_SS_W64_H4 64, 16 +BLOCKCOPY_SS_W64_H4 64, 32 +BLOCKCOPY_SS_W64_H4 64, 48 +BLOCKCOPY_SS_W64_H4 64, 64 + +;----------------------------------------------------------------------------- +; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride) +;----------------------------------------------------------------------------- +%macro BLOCKCOPY_SS_W64_H4_avx 2 +INIT_YMM avx +cglobal blockcopy_ss_%1x%2, 4, 7, 4, dest, deststride, src, srcstride + mov r4d, %2/4 + add r1, r1 + add r3, r3 + lea r5, [3 * r1] + lea r6, [3 * r3] +.loop: + movu m0, [r2] + movu m1, [r2 + 32] + movu m2, [r2 + 64] + movu m3, [r2 + 96] + + movu [r0], m0 + movu [r0 + 32], m1 + movu [r0 + 64], m2 + movu [r0 + 96], m3 + + movu m0, [r2 + r3] + movu m1, [r2 + r3 + 32] + movu m2, [r2 + r3 + 64] + movu m3, [r2 + r3 + 96] + + movu [r0 + r1], m0 + movu [r0 + r1 + 32], m1 + movu [r0 + r1 + 64], m2 + movu [r0 + r1 + 96], m3 + + movu m0, [r2 + 2 * r3] + movu m1, [r2 + 2 * r3 + 32] + movu m2, [r2 + 2 * r3 + 64] + movu m3, [r2 + 2 * r3 + 96] + + movu [r0 + 2 * r1], m0 + movu [r0 + 2 * r1 + 32], m1 + movu [r0 + 2 * r1 + 64], m2 + movu [r0 + 2 * r1 + 96], m3 + + movu m0, [r2 + r6] + movu m1, [r2 + r6 + 32] + movu m2, [r2 + r6 + 64] + movu m3, [r2 + r6 + 96] + lea r2, [r2 + 4 * r3] + + movu [r0 + r5], m0 + movu [r0 + r5 + 32], m1 + movu [r0 + r5 + 64], m2 + movu [r0 + r5 + 96], m3 + lea r0, [r0 + 4 * r1] + + dec 
r4d + jnz .loop + RET +%endmacro + +BLOCKCOPY_SS_W64_H4_avx 64, 16 +BLOCKCOPY_SS_W64_H4_avx 64, 32 +BLOCKCOPY_SS_W64_H4_avx 64, 48 +BLOCKCOPY_SS_W64_H4_avx 64, 64 + +;----------------------------------------------------------------------------- +; void cvt32to16_shr(short *dst, int *src, intptr_t stride, int shift, int size) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal cvt32to16_shr, 4, 7, 3, dst, src, stride +%define rnd m2 +%define shift m1 + + ; make shift + mov r5d, r3m + movd shift, r5d + + ; make round + dec r5 + xor r6, r6 + bts r6, r5 + + movd rnd, r6d + pshufd rnd, rnd, 0 + + ; register alloc + ; r0 - dst + ; r1 - src + ; r2 - stride * 2 (short*) + ; r3 - lx + ; r4 - size + ; r5 - ly + ; r6 - diff + add r2d, r2d + + mov r4d, r4m + mov r5, r4 + mov r6, r2 + sub r6, r4 + add r6, r6 + + shr r5, 1 +.loop_row: + + mov r3, r4 + shr r3, 2 +.loop_col: + ; row 0 + movu m0, [r1] + paddd m0, rnd + psrad m0, shift + packssdw m0, m0 + movh [r0], m0 + + ; row 1 + movu m0, [r1 + r4 * 4] + paddd m0, rnd + psrad m0, shift + packssdw m0, m0 + movh [r0 + r2], m0 + + ; move col pointer + add r1, 16 + add r0, 8 + + dec r3 + jg .loop_col + + ; update pointer + lea r1, [r1 + r4 * 4] + add r0, r6 + + ; end of loop_row + dec r5 + jg .loop_row + + RET + + +;-------------------------------------------------------------------------------------- +; void cvt16to32_shl(int32_t *dst, int16_t *src, intptr_t stride, int shift, int size); +;-------------------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal cvt16to32_shl, 5, 7, 2, dst, src, stride, shift, size +%define shift m1 + + ; make shift + mov r5d, r3m + movd shift, r5d + + ; register alloc + ; r0 - dst + ; r1 - src + ; r2 - stride + ; r3 - shift + ; r4 - size + + sub r2d, r4d + add r2d, r2d + mov r5d, r4d + shr r4d, 2 +.loop_row: + mov r6d, r4d + +.loop_col: + pmovsxwd m0, [r1] + pslld m0, shift + movu [r0], m0 + + add r1, 8 + add 
r0, 16 + + dec r6d + jnz .loop_col + + add r1, r2 + dec r5d + jnz .loop_row + RET + + +;-------------------------------------------------------------------------------------- +; void cvt16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset); +;-------------------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal cvt16to32_shr_4, 3,3,3 + add r2d, r2d + movd m0, r3m + movd m1, r4m + pshufd m1, m1, 0 + + ; register alloc + ; r0 - dst + ; r1 - src + ; r2 - stride + ; m0 - shift + ; m1 - dword [offset] + + ; Row 0 + pmovsxwd m2, [r1] + paddd m2, m1 + psrad m2, m0 + movu [r0 + 0 * mmsize], m2 + + ; Row 1 + pmovsxwd m2, [r1 + r2] + paddd m2, m1 + psrad m2, m0 + movu [r0 + 1 * mmsize], m2 + + ; Row 2 + lea r1, [r1 + r2 * 2] + pmovsxwd m2, [r1] + paddd m2, m1 + psrad m2, m0 + movu [r0 + 2 * mmsize], m2 + + ; Row 3 + pmovsxwd m2, [r1 + r2] + paddd m2, m1 + psrad m2, m0 + movu [r0 + 3 * mmsize], m2 + RET + + +;-------------------------------------------------------------------------------------- +; void cvt16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset); +;-------------------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal cvt16to32_shr_8, 3,5,3 + add r2d, r2d + movd m0, r3m + movd m1, r4m + pshufd m1, m1, 0 + mov r3d, 8/4 + lea r4, [r2 * 3] + + ; register alloc + ; r0 - dst + ; r1 - src + ; r2 - stride + ; r3 - loop counter + ; r4 - stride * 3 + ; m0 - shift + ; m1 - dword [offset] + +.loop: + ; Row 0 + pmovsxwd m2, [r1] + pmovsxwd m3, [r1 + mmsize/2] + paddd m2, m1 + paddd m3, m1 + psrad m2, m0 + psrad m3, m0 + movu [r0 + 0 * mmsize], m2 + movu [r0 + 1 * mmsize], m3 + + ; Row 1 + pmovsxwd m2, [r1 + r2] + pmovsxwd m3, [r1 + r2 + mmsize/2] + paddd m2, m1 + paddd m3, m1 + psrad m2, m0 + psrad m3, m0 + movu [r0 + 2 * mmsize], m2 + movu [r0 + 3 * mmsize], m3 + + ; Row 2 + pmovsxwd m2, [r1 + r2 * 2] + pmovsxwd m3, [r1 + r2 * 2 + mmsize/2] + 
paddd m2, m1 + paddd m3, m1 + psrad m2, m0 + psrad m3, m0 + movu [r0 + 4 * mmsize], m2 + movu [r0 + 5 * mmsize], m3 + + ; Row 3 + pmovsxwd m2, [r1 + r4] + pmovsxwd m3, [r1 + r4 + mmsize/2] + paddd m2, m1 + paddd m3, m1 + psrad m2, m0 + psrad m3, m0 + movu [r0 + 6 * mmsize], m2 + movu [r0 + 7 * mmsize], m3 + + add r0, 8 * mmsize + lea r1, [r1 + r2 * 4] + dec r3d + jnz .loop + RET + + +;-------------------------------------------------------------------------------------- +; void cvt16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset); +;-------------------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal cvt16to32_shr_16, 3,4,6 + add r2d, r2d + movd m0, r3m + movd m1, r4m + pshufd m1, m1, 0 + mov r3d, 16/2 + + ; register alloc + ; r0 - dst + ; r1 - src + ; r2 - stride + ; r3 - loop counter + ; m0 - shift + ; m1 - dword [offset] + +.loop: + ; Row 0 + pmovsxwd m2, [r1 + 0 * mmsize/2] + pmovsxwd m3, [r1 + 1 * mmsize/2] + pmovsxwd m4, [r1 + 2 * mmsize/2] + pmovsxwd m5, [r1 + 3 * mmsize/2] + paddd m2, m1 + paddd m3, m1 + paddd m4, m1 + paddd m5, m1 + psrad m2, m0 + psrad m3, m0 + psrad m4, m0 + psrad m5, m0 + movu [r0 + 0 * mmsize], m2 + movu [r0 + 1 * mmsize], m3 + movu [r0 + 2 * mmsize], m4 + movu [r0 + 3 * mmsize], m5 + + ; Row 1 + pmovsxwd m2, [r1 + r2 + 0 * mmsize/2] + pmovsxwd m3, [r1 + r2 +1 * mmsize/2] + pmovsxwd m4, [r1 + r2 +2 * mmsize/2] + pmovsxwd m5, [r1 + r2 +3 * mmsize/2] + paddd m2, m1 + paddd m3, m1 + paddd m4, m1 + paddd m5, m1 + psrad m2, m0 + psrad m3, m0 + psrad m4, m0 + psrad m5, m0 + movu [r0 + 4 * mmsize], m2 + movu [r0 + 5 * mmsize], m3 + movu [r0 + 6 * mmsize], m4 + movu [r0 + 7 * mmsize], m5 + + add r0, 8 * mmsize + lea r1, [r1 + r2 * 2] + dec r3d + jnz .loop + RET + + +;-------------------------------------------------------------------------------------- +; void cvt16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset); 
+;-------------------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal cvt16to32_shr_32, 3,4,6 + add r2d, r2d + movd m0, r3m + movd m1, r4m + pshufd m1, m1, 0 + mov r3d, 32/1 + + ; register alloc + ; r0 - dst + ; r1 - src + ; r2 - stride + ; r3 - loop counter + ; m0 - shift + ; m1 - dword [offset] + +.loop: + ; Row 0 + pmovsxwd m2, [r1 + 0 * mmsize/2] + pmovsxwd m3, [r1 + 1 * mmsize/2] + pmovsxwd m4, [r1 + 2 * mmsize/2] + pmovsxwd m5, [r1 + 3 * mmsize/2] + paddd m2, m1 + paddd m3, m1 + paddd m4, m1 + paddd m5, m1 + psrad m2, m0 + psrad m3, m0 + psrad m4, m0 + psrad m5, m0 + movu [r0 + 0 * mmsize], m2 + movu [r0 + 1 * mmsize], m3 + movu [r0 + 2 * mmsize], m4 + movu [r0 + 3 * mmsize], m5 + + pmovsxwd m2, [r1 + 4 * mmsize/2] + pmovsxwd m3, [r1 + 5 * mmsize/2] + pmovsxwd m4, [r1 + 6 * mmsize/2] + pmovsxwd m5, [r1 + 7 * mmsize/2] + paddd m2, m1 + paddd m3, m1 + paddd m4, m1 + paddd m5, m1 + psrad m2, m0 + psrad m3, m0 + psrad m4, m0 + psrad m5, m0 + movu [r0 + 4 * mmsize], m2 + movu [r0 + 5 * mmsize], m3 + movu [r0 + 6 * mmsize], m4 + movu [r0 + 7 * mmsize], m5 + + add r0, 8 * mmsize + add r1, r2 + dec r3d + jnz .loop + RET + + +;-------------------------------------------------------------------------------------- +; void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift) +;-------------------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal cvt32to16_shl_4, 3,3,5 + add r2d, r2d + movd m0, r3m + + ; Row 0-3 + movu m1, [r1 + 0 * mmsize] + movu m2, [r1 + 1 * mmsize] + movu m3, [r1 + 2 * mmsize] + movu m4, [r1 + 3 * mmsize] + packssdw m1, m2 + packssdw m3, m4 + psllw m1, m0 + psllw m3, m0 + movh [r0], m1 + movhps [r0 + r2], m1 + movh [r0 + r2 * 2], m3 + lea r2, [r2 * 3] + movhps [r0 + r2], m3 + RET + + +INIT_YMM avx2 +cglobal cvt32to16_shl_4, 3,3,3 + add r2d, r2d + movd xm0, r3m + + ; Row 0-3 + movu m1, [r1 + 0 * mmsize] + movu m2, [r1 + 1 * mmsize] + packssdw m1, m2 
+ psllw m1, xm0 + vextracti128 xm0, m1, 1 + movq [r0], xm1 + movq [r0 + r2], xm0 + lea r0, [r0 + r2 * 2] + movhps [r0], xm1 + movhps [r0 + r2], xm0 + RET + + +;-------------------------------------------------------------------------------------- +; void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift) +;-------------------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal cvt32to16_shl_8, 3,5,5 + add r2d, r2d + movd m0, r3m + mov r3d, 8/4 + lea r4, [r2 * 3] + +.loop: + ; Row 0-1 + movu m1, [r1 + 0 * mmsize] + movu m2, [r1 + 1 * mmsize] + movu m3, [r1 + 2 * mmsize] + movu m4, [r1 + 3 * mmsize] + packssdw m1, m2 + packssdw m3, m4 + psllw m1, m0 + psllw m3, m0 + movu [r0], m1 + movu [r0 + r2], m3 + + ; Row 2-3 + movu m1, [r1 + 4 * mmsize] + movu m2, [r1 + 5 * mmsize] + movu m3, [r1 + 6 * mmsize] + movu m4, [r1 + 7 * mmsize] + packssdw m1, m2 + packssdw m3, m4 + psllw m1, m0 + psllw m3, m0 + movu [r0 + r2 * 2], m1 + movu [r0 + r4], m3 + + add r1, 8 * mmsize + lea r0, [r0 + r2 * 4] + dec r3d + jnz .loop + RET + + +INIT_YMM avx2 +cglobal cvt32to16_shl_8, 3,4,3 + add r2d, r2d + movd xm0, r3m + lea r3, [r2 * 3] + + ; Row 0-1 + movu xm1, [r1 + 0 * mmsize] + vinserti128 m1, m1, [r1 + 1 * mmsize], 1 + movu xm2, [r1 + 0 * mmsize + mmsize/2] + vinserti128 m2, m2, [r1 + 1 * mmsize + mmsize/2], 1 + packssdw m1, m2 + psllw m1, xm0 + movu [r0], xm1 + vextracti128 [r0 + r2], m1, 1 + + ; Row 2-3 + movu xm1, [r1 + 2 * mmsize] + vinserti128 m1, m1, [r1 + 3 * mmsize], 1 + movu xm2, [r1 + 2 * mmsize + mmsize/2] + vinserti128 m2, m2, [r1 + 3 * mmsize + mmsize/2], 1 + packssdw m1, m2 + psllw m1, xm0 + movu [r0 + r2 * 2], xm1 + vextracti128 [r0 + r3], m1, 1 + + add r1, 4 * mmsize + lea r0, [r0 + r2 * 4] + + ; Row 4-5 + movu m1, [r1 + 0 * mmsize] + movu m2, [r1 + 1 * mmsize] + packssdw m1, m2 + vpermq m1, m1, 11011000b + psllw m1, xm0 + movu [r0], xm1 + vextracti128 [r0 + r2], m1, 1 + + ; Row 6-7 + movu m1, [r1 + 2 * mmsize] 
+ movu m2, [r1 + 3 * mmsize] + packssdw m1, m2 + vpermq m1, m1, 11011000b + psllw m1, xm0 + movu [r0 + r2 * 2], xm1 + vextracti128 [r0 + r3], m1, 1 + RET + +;-------------------------------------------------------------------------------------- +; void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift) +;-------------------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal cvt32to16_shl_16, 3,4,5 + add r2d, r2d + movd m0, r3m + mov r3d, 16/2 + +.loop: + ; Row 0 + movu m1, [r1 + 0 * mmsize] + movu m2, [r1 + 1 * mmsize] + movu m3, [r1 + 2 * mmsize] + movu m4, [r1 + 3 * mmsize] + packssdw m1, m2 + packssdw m3, m4 + psllw m1, m0 + psllw m3, m0 + movu [r0], m1 + movu [r0 + mmsize], m3 + + ; Row 1 + movu m1, [r1 + 4 * mmsize] + movu m2, [r1 + 5 * mmsize] + movu m3, [r1 + 6 * mmsize] + movu m4, [r1 + 7 * mmsize] + packssdw m1, m2 + packssdw m3, m4 + psllw m1, m0 + psllw m3, m0 + movu [r0 + r2], m1 + movu [r0 + r2 + mmsize], m3 + + add r1, 8 * mmsize + lea r0, [r0 + r2 * 2] + dec r3d + jnz .loop + RET + + +INIT_YMM avx2 +cglobal cvt32to16_shl_16, 3,5,3 + add r2d, r2d + movd xm0, r3m + mov r3d, 16/4 + lea r4, [r2 * 3] + +.loop: + ; Row 0 + movu xm1, [r1 + 0 * mmsize] + vinserti128 m1, m1, [r1 + 1 * mmsize], 1 + movu xm2, [r1 + 0 * mmsize + mmsize/2] + vinserti128 m2, m2, [r1 + 1 * mmsize + mmsize/2], 1 + packssdw m1, m2 + psllw m1, xm0 + movu [r0], m1 + + ; Row 1 + movu xm1, [r1 + 2 * mmsize] + vinserti128 m1, m1, [r1 + 3 * mmsize], 1 + movu xm2, [r1 + 2 * mmsize + mmsize/2] + vinserti128 m2, m2, [r1 + 3 * mmsize + mmsize/2], 1 + packssdw m1, m2 + psllw m1, xm0 + movu [r0 + r2], m1 + + add r1, 4 * mmsize + + ; Row 2 + movu xm1, [r1 + 0 * mmsize] + vinserti128 m1, m1, [r1 + 1 * mmsize], 1 + movu xm2, [r1 + 0 * mmsize + mmsize/2] + vinserti128 m2, m2, [r1 + 1 * mmsize + mmsize/2], 1 + packssdw m1, m2 + psllw m1, xm0 + movu [r0 + r2 * 2], m1 + + ; Row 3 + movu m1, [r1 + 2 * mmsize] + movu m2, [r1 + 3 * mmsize] + 
packssdw m1, m2 + psllw m1, xm0 + vpermq m1, m1, 11011000b + movu [r0 + r4], m1 + + add r1, 4 * mmsize + lea r0, [r0 + r2 * 4] + dec r3d + jnz .loop + RET + + +;-------------------------------------------------------------------------------------- +; void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift) +;-------------------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal cvt32to16_shl_32, 3,4,5 + add r2d, r2d + movd m0, r3m + mov r3d, 32/1 + +.loop: + ; Row 0 + movu m1, [r1 + 0 * mmsize] + movu m2, [r1 + 1 * mmsize] + movu m3, [r1 + 2 * mmsize] + movu m4, [r1 + 3 * mmsize] + packssdw m1, m2 + packssdw m3, m4 + psllw m1, m0 + psllw m3, m0 + movu [r0 + 0 * mmsize], m1 + movu [r0 + 1 * mmsize], m3 + + movu m1, [r1 + 4 * mmsize] + movu m2, [r1 + 5 * mmsize] + movu m3, [r1 + 6 * mmsize] + movu m4, [r1 + 7 * mmsize] + packssdw m1, m2 + packssdw m3, m4 + psllw m1, m0 + psllw m3, m0 + movu [r0 + 2 * mmsize], m1 + movu [r0 + 3 * mmsize], m3 + + add r1, 8 * mmsize + add r0, r2 + dec r3d + jnz .loop + RET + + +INIT_YMM avx2 +cglobal cvt32to16_shl_32, 3,4,5 + add r2d, r2d + movd xm0, r3m + mov r3d, 32/2 + +.loop: + ; Row 0 + movu xm1, [r1 + 0 * mmsize] + vinserti128 m1, m1, [r1 + 1 * mmsize], 1 + movu xm2, [r1 + 0 * mmsize + mmsize/2] + vinserti128 m2, m2, [r1 + 1 * mmsize + mmsize/2], 1 + movu xm3, [r1 + 2 * mmsize] + vinserti128 m3, m3, [r1 + 3 * mmsize], 1 + movu xm4, [r1 + 2 * mmsize + mmsize/2] + vinserti128 m4, m4, [r1 + 3 * mmsize + mmsize/2], 1 + packssdw m1, m2 + packssdw m3, m4 + psllw m1, xm0 + psllw m3, xm0 + movu [r0], m1 + movu [r0 + mmsize], m3 + + add r1, 4 * mmsize + + ; Row 1 + movu xm1, [r1 + 0 * mmsize] + vinserti128 m1, m1, [r1 + 1 * mmsize], 1 + movu xm2, [r1 + 0 * mmsize + mmsize/2] + vinserti128 m2, m2, [r1 + 1 * mmsize + mmsize/2], 1 + movu m3, [r1 + 2 * mmsize] + movu m4, [r1 + 3 * mmsize] + packssdw m1, m2 + packssdw m3, m4 + psllw m1, xm0 + psllw m3, xm0 + vpermq m3, m3, 11011000b 
+ movu [r0 + r2], m1 + movu [r0 + r2 + mmsize], m3 + + add r1, 4 * mmsize + lea r0, [r0 + r2 * 2] + dec r3d + jnz .loop + RET + + +;-------------------------------------------------------------------------------------- +; uint32_t copy_cnt(int16_t *dst, int16_t *src, intptr_t stride); +;-------------------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal copy_cnt_4, 3,3,3 + add r2d, r2d + pxor m2, m2 + + ; row 0 & 1 + movh m0, [r1] + movhps m0, [r1 + r2] + mova [r0], m0 + + ; row 2 & 3 + movh m1, [r1 + r2 * 2] + lea r2, [r2 * 3] + movhps m1, [r1 + r2] + mova [r0 + 16], m1 + + packsswb m0, m1 + pcmpeqb m0, m2 + + ; get count + ; CHECK_ME: Intel documents said POPCNT is SSE4.2 instruction, but just implement after Nehalem +%if 0 + pmovmskb eax, m0 + not ax + popcnt ax, ax +%else + mova m1, [pb_1] + paddb m0, m1 + psadbw m0, m2 + pshufd m1, m0, 2 + paddw m0, m1 + movd eax, m0 +%endif + RET + + +;-------------------------------------------------------------------------------------- +; uint32_t copy_cnt(int16_t *dst, int16_t *src, intptr_t stride); +;-------------------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal copy_cnt_8, 3,3,6 + add r2d, r2d + pxor m4, m4 + pxor m5, m5 + + ; row 0 & 1 + movu m0, [r1] + movu m1, [r1 + r2] + movu [r0], m0 + movu [r0 + 16], m1 + + packsswb m0, m1 + pcmpeqb m0, m4 + paddb m5, m0 + + ; row 2 & 3 + lea r1, [r1 + 2 * r2] + movu m0, [r1] + movu m1, [r1 + r2] + movu [r0 + 32], m0 + movu [r0 + 48], m1 + + packsswb m0, m1 + pcmpeqb m0, m4 + paddb m5, m0 + + ; row 4 & 5 + lea r1, [r1 + 2 * r2] + movu m0, [r1] + movu m1, [r1 + r2] + movu [r0 + 64], m0 + movu [r0 + 80], m1 + + packsswb m0, m1 + pcmpeqb m0, m4 + paddb m5, m0 + + ; row 6 & 7 + lea r1, [r1 + 2 * r2] + movu m0, [r1] + movu m1, [r1 + r2] + movu [r0 + 96], m0 + movu [r0 + 112], m1 + + packsswb m0, m1 + pcmpeqb m0, m4 + paddb m5, m0 + + ; get count + mova m0, [pb_4] + paddb m5, m0 + psadbw m5, m4 + 
pshufd m0, m5, 2 + paddw m5, m0 + movd eax, m5 + RET + + +INIT_YMM avx2 +cglobal copy_cnt_8, 3,4,5 + add r2d, r2d + lea r3, [r2 * 3] + + ; row 0 - 1 + movu xm0, [r1] + vinserti128 m0, m0, [r1 + r2], 1 + movu [r0], m0 + + ; row 2 - 3 + movu xm1, [r1 + r2 * 2] + vinserti128 m1, m1, [r1 + r3], 1 + movu [r0 + 32], m1 + lea r1, [r1 + r2 * 4] + + ; row 4 - 5 + movu xm2, [r1] + vinserti128 m2, m2, [r1 + r2], 1 + movu [r0 + 64], m2 + + ; row 6 - 7 + movu xm3, [r1 + r2 * 2] + vinserti128 m3, m3, [r1 + r3], 1 + movu [r0 + 96], m3 + + ; get count + xorpd m4, m4 + vpacksswb m0, m1 + vpacksswb m2, m3 + pminub m0, [pb_1] + pminub m2, [pb_1] + paddb m0, m2 + vextracti128 xm1, m0, 1 + paddb xm0, xm1 + psadbw xm0, xm4 + movhlps xm1, xm0 + paddd xm0, xm1 + movd eax, xm0 + RET + + +;-------------------------------------------------------------------------------------- +; uint32_t copy_cnt(int16_t *dst, int16_t *src, intptr_t stride); +;-------------------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal copy_cnt_16, 3,4,6 + add r2d, r2d + mov r3d, 4 + pxor m4, m4 + pxor m5, m5 + +.loop + ; row 0 + movu m0, [r1] + movu m1, [r1 + 16] + movu [r0], m0 + movu [r0 + 16], m1 + + packsswb m0, m1 + pcmpeqb m0, m4 + paddb m5, m0 + + ; row 1 + movu m0, [r1 + r2] + movu m1, [r1 + r2 + 16] + movu [r0 + 32], m0 + movu [r0 + 48], m1 + + packsswb m0, m1 + pcmpeqb m0, m4 + paddb m5, m0 + + ; row 2 + movu m0, [r1 + 2 * r2] + movu m1, [r1 + 2 * r2 + 16] + movu [r0 + 64], m0 + movu [r0 + 80], m1 + + packsswb m0, m1 + pcmpeqb m0, m4 + paddb m5, m0 + + ; row 3 + lea r1, [r1 + 2 * r2] + movu m0, [r1 + r2] + movu m1, [r1 + r2 + 16] + movu [r0 + 96], m0 + movu [r0 + 112], m1 + + packsswb m0, m1 + pcmpeqb m0, m4 + paddb m5, m0 + + add r0, 128 + lea r1, [r1 + 2 * r2] + dec r3d + jnz .loop + + mova m0, [pb_16] + paddb m5, m0 + psadbw m5, m4 + pshufd m0, m5, 2 + paddw m5, m0 + movd eax, m5 + RET + + +INIT_YMM avx2 +cglobal copy_cnt_16, 3, 5, 5 + add r2d, r2d + lea r3, 
[r2 * 3] + mov r4d, 16/4 + + mova m3, [pb_1] + xorpd m4, m4 + +.loop: + ; row 0 - 1 + movu m0, [r1] + movu [r0], m0 + movu m1, [r1 + r2] + movu [r0 + 32], m1 + + packsswb m0, m1 + pminub m0, m3 + + ; row 2 - 3 + movu m1, [r1 + r2 * 2] + movu [r0 + 64], m1 + movu m2, [r1 + r3] + movu [r0 + 96], m2 + + packsswb m1, m2 + pminub m1, m3 + paddb m0, m1 + paddb m4, m0 + + add r0, 128 + lea r1, [r1 + 4 * r2] + dec r4d + jnz .loop + + ; get count + xorpd m0, m0 + vextracti128 xm1, m4, 1 + paddb xm4, xm1 + psadbw xm4, xm0 + movhlps xm1, xm4 + paddd xm4, xm1 + movd eax, xm4 + RET + +;-------------------------------------------------------------------------------------- +; uint32_t copy_cnt(int32_t *dst, int16_t *src, intptr_t stride); +;-------------------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal copy_cnt_32, 3,4,6 + add r2d, r2d + mov r3d, 16 + pxor m4, m4 + pxor m5, m5 + +.loop + ; row 0 + movu m0, [r1] + movu m1, [r1 + 16] + movu [r0], m0 + movu [r0 + 16], m1 + + packsswb m0, m1 + pcmpeqb m0, m4 + paddb m5, m0 + + movu m0, [r1 + 32] + movu m1, [r1 + 48] + movu [r0 + 32], m0 + movu [r0 + 48], m1 + + packsswb m0, m1 + pcmpeqb m0, m4 + paddb m5, m0 + + ; row 1 + movu m0, [r1 + r2] + movu m1, [r1 + r2 + 16] + movu [r0 + 64], m0 + movu [r0 + 80], m1 + + packsswb m0, m1 + pcmpeqb m0, m4 + paddb m5, m0 + + movu m0, [r1 + r2 + 32] + movu m1, [r1 + r2 + 48] + movu [r0 + 96], m0 + movu [r0 + 112], m1 + + packsswb m0, m1 + pcmpeqb m0, m4 + paddb m5, m0 + + add r0, 128 + lea r1, [r1 + 2 * r2] + dec r3d + jnz .loop + + ; get count + mova m0, [pb_64] + paddb m5, m0 + psadbw m5, m4 + pshufd m0, m5, 2 + paddw m5, m0 + movd eax, m5 + RET + + +INIT_YMM avx2 +cglobal copy_cnt_32, 3, 5, 5 + add r2d, r2d + mov r3d, 32/2 + + mova m3, [pb_1] + xorpd m4, m4 + +.loop: + ; row 0 + movu m0, [r1] + movu [r0], m0 + movu m1, [r1 + 32] + movu [r0 + 32], m1 + + packsswb m0, m1 + pminub m0, m3 + + ; row 1 + movu m1, [r1 + r2] + movu [r0 + 64], m1 + movu 
m2, [r1 + r2 + 32] + movu [r0 + 96], m2 + + packsswb m1, m2 + pminub m1, m3 + paddb m0, m1 + paddb m4, m0 + + add r0, 128 + lea r1, [r1 + 2 * r2] + dec r3d + jnz .loop + + ; get count + xorpd m0, m0 + vextracti128 xm1, m4, 1 + paddb xm4, xm1 + psadbw xm4, xm0 + movhlps xm1, xm4 + paddd xm4, xm1 + movd eax, xm4 + RET + +;----------------------------------------------------------------------------- +; void copy_shr(short *dst, short *src, intptr_t stride, int shift, int size) +;----------------------------------------------------------------------------- + +INIT_XMM sse4 +cglobal copy_shr, 4, 7, 4, dst, src, stride +%define rnd m2 +%define shift m1 + + ; make shift + mov r5d, r3m + movd shift, r5d + + ; make round + dec r5 + xor r6, r6 + bts r6, r5 + + movd rnd, r6d + pshufd rnd, rnd, 0 + + ; register alloc + ; r0 - dst + ; r1 - src + ; r2 - stride * 2 (short*) + ; r3 - lx + ; r4 - size + ; r5 - ly + ; r6 - diff + add r2d, r2d + + mov r4d, r4m + mov r5, r4 ; size + mov r6, r2 ; stride + sub r6, r4 + add r6, r6 + + shr r5, 1 +.loop_row: + + mov r3, r4 + shr r3, 2 +.loop_col: + ; row 0 + movh m3, [r1] + pmovsxwd m0, m3 + paddd m0, rnd + psrad m0, shift + packssdw m0, m0 + movh [r0], m0 + + ; row 1 + movh m3, [r1 + r4 * 2] + pmovsxwd m0, m3 + paddd m0, rnd + psrad m0, shift + packssdw m0, m0 + movh [r0 + r2], m0 + + ; move col pointer + add r1, 8 + add r0, 8 + + dec r3 + jg .loop_col + + ; update pointer + lea r1, [r1 + r4 * 2] + add r0, r6 + + ; end of loop_row + dec r5 + jg .loop_row + + RET + +;-------------------------------------------------------------------------------------- +; void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift) +;-------------------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal copy_shl_4, 3,3,3 + add r2d, r2d + movd m0, r3m + + ; Row 0-3 + movu m1, [r1 + 0 * mmsize] + movu m2, [r1 + 1 * mmsize] + psllw m1, m0 + psllw m2, m0 + movh [r0], m1 + movhps [r0 + r2], m1 + movh [r0 + r2 * 
2], m2 + lea r2, [r2 * 3] + movhps [r0 + r2], m2 + RET + +;-------------------------------------------------------------------------------------- +; void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift) +;-------------------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal copy_shl_8, 3,4,5 + add r2d, r2d + movd m0, r3m + + ; Row 0-3 + movu m1, [r1 + 0 * mmsize] + movu m2, [r1 + 1 * mmsize] + movu m3, [r1 + 2 * mmsize] + movu m4, [r1 + 3 * mmsize] + psllw m1, m0 + psllw m2, m0 + psllw m3, m0 + psllw m4, m0 + movu [r0], m1 + movu [r0 + r2], m2 + movu [r0 + 2 * r2], m3 + lea r0, [r0 + 2 * r2] + movu [r0 + r2], m4 + + ; Row 4-7 + movu m1, [r1 + 4 * mmsize] + movu m2, [r1 + 5 * mmsize] + movu m3, [r1 + 6 * mmsize] + movu m4, [r1 + 7 * mmsize] + psllw m1, m0 + psllw m2, m0 + psllw m3, m0 + psllw m4, m0 + movu [r0 + r2 * 2], m1 + lea r0, [r0 + 2 * r2] + movu [r0 + r2], m2 + movu [r0 + 2 * r2], m3 + lea r0, [r0 + 2 * r2] + movu [r0 + r2], m4 + RET + +;-------------------------------------------------------------------------------------- +; void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift) +;-------------------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal copy_shl_16, 3,4,5 + add r2d, r2d + movd m0, r3m + mov r3d, 256/64 + +.loop: + ; Row 0-3 + movu m1, [r1 + 0 * mmsize] + movu m2, [r1 + 1 * mmsize] + movu m3, [r1 + 2 * mmsize] + movu m4, [r1 + 3 * mmsize] + psllw m1, m0 + psllw m2, m0 + psllw m3, m0 + psllw m4, m0 + movu [r0], m1 + movu [r0 + 16], m2 + movu [r0 + r2], m3 + movu [r0 + r2 + 16], m4 + + ; Row 4-7 + movu m1, [r1 + 4 * mmsize] + movu m2, [r1 + 5 * mmsize] + movu m3, [r1 + 6 * mmsize] + movu m4, [r1 + 7 * mmsize] + psllw m1, m0 + psllw m2, m0 + psllw m3, m0 + psllw m4, m0 + movu [r0 + r2 * 2], m1 + movu [r0 + r2 * 2 + 16], m2 + lea r0, [r0 + r2 * 2] + movu [r0 + r2], m3 + movu [r0 + r2 + 16], m4 + + add r1, 8 * mmsize + lea r0, [r0 + r2 * 
2] + dec r3d + jnz .loop + RET + +;-------------------------------------------------------------------------------------- +; void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift) +;-------------------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal copy_shl_32, 3,4,5 + add r2d, r2d + movd m0, r3m + mov r3d, 1024/64 + +.loop: + ; Row 0-3 + movu m1, [r1 + 0 * mmsize] + movu m2, [r1 + 1 * mmsize] + movu m3, [r1 + 2 * mmsize] + movu m4, [r1 + 3 * mmsize] + psllw m1, m0 + psllw m2, m0 + psllw m3, m0 + psllw m4, m0 + movu [r0], m1 + movu [r0 + 16], m2 + movu [r0 + 32], m3 + movu [r0 + 48], m4 + + ; Row 4-7 + movu m1, [r1 + 4 * mmsize] + movu m2, [r1 + 5 * mmsize] + movu m3, [r1 + 6 * mmsize] + movu m4, [r1 + 7 * mmsize] + psllw m1, m0 + psllw m2, m0 + psllw m3, m0 + psllw m4, m0 + movu [r0 + r2], m1 + movu [r0 + r2 + 16], m2 + movu [r0 + r2 + 32], m3 + movu [r0 + r2 + 48], m4 + + add r1, 8 * mmsize + lea r0, [r0 + r2 * 2] + dec r3d + jnz .loop + RET diff --git a/source/common/x86/blockcopy8.h b/source/common/x86/blockcopy8.h new file mode 100644 index 0000000..115e340 --- /dev/null +++ b/source/common/x86/blockcopy8.h @@ -0,0 +1,216 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#ifndef X265_BLOCKCOPY8_H +#define X265_BLOCKCOPY8_H + +void x265_cvt32to16_shr_sse2(int16_t * dst, int *src, intptr_t, int, int); +void x265_cvt32to16_shl_4_sse2(int16_t * dst, int *src, intptr_t, int); +void x265_cvt32to16_shl_8_sse2(int16_t * dst, int *src, intptr_t, int); +void x265_cvt32to16_shl_16_sse2(int16_t * dst, int *src, intptr_t, int); +void x265_cvt32to16_shl_32_sse2(int16_t * dst, int *src, intptr_t, int); +void x265_cvt32to16_shl_4_avx2(int16_t * dst, int *src, intptr_t, int); +void x265_cvt32to16_shl_8_avx2(int16_t * dst, int *src, intptr_t, int); +void x265_cvt32to16_shl_16_avx2(int16_t * dst, int *src, intptr_t, int); +void x265_cvt32to16_shl_32_avx2(int16_t * dst, int *src, intptr_t, int); +void x265_cvt16to32_shl_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t); +void x265_cvt16to32_shr_4_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t); +void x265_cvt16to32_shr_8_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t); +void x265_cvt16to32_shr_16_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t); +void x265_cvt16to32_shr_32_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t); +void x265_copy_shr_sse4(int16_t * dst, int16_t *src, intptr_t, int, int); +void x265_copy_shl_4_sse2(int16_t * dst, int16_t *src, intptr_t, int); +void x265_copy_shl_8_sse2(int16_t * dst, int16_t *src, intptr_t, int); +void x265_copy_shl_16_sse2(int16_t * dst, int16_t *src, intptr_t, int); +void x265_copy_shl_32_sse2(int16_t * dst, int16_t *src, intptr_t, int); +uint32_t 
x265_copy_cnt_4_sse4(int16_t * dst, int16_t * src, intptr_t); +uint32_t x265_copy_cnt_8_sse4(int16_t * dst, int16_t * src, intptr_t); +uint32_t x265_copy_cnt_16_sse4(int16_t * dst, int16_t * src, intptr_t); +uint32_t x265_copy_cnt_32_sse4(int16_t * dst, int16_t * src, intptr_t); +uint32_t x265_copy_cnt_4_avx2(int16_t * dst, int16_t * src, intptr_t); +uint32_t x265_copy_cnt_8_avx2(int16_t * dst, int16_t * src, intptr_t); +uint32_t x265_copy_cnt_16_avx2(int16_t * dst, int16_t * src, intptr_t); +uint32_t x265_copy_cnt_32_avx2(int16_t * dst, int16_t * src, intptr_t); + +#define SETUP_BLOCKCOPY_FUNC(W, H, cpu) \ + void x265_blockcopy_pp_ ## W ## x ## H ## cpu(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb); \ + void x265_blockcopy_sp_ ## W ## x ## H ## cpu(pixel * a, intptr_t stridea, int16_t * b, intptr_t strideb); \ + void x265_blockcopy_ss_ ## W ## x ## H ## cpu(int16_t * a, intptr_t stridea, int16_t * b, intptr_t strideb); + +#define SETUP_BLOCKCOPY_PS(W, H, cpu) \ + void x265_blockcopy_ps_ ## W ## x ## H ## cpu(int16_t * dst, intptr_t dstStride, pixel * src, intptr_t srcStride); + +#define SETUP_BLOCKCOPY_SP(W, H, cpu) \ + void x265_blockcopy_sp_ ## W ## x ## H ## cpu(pixel * a, intptr_t stridea, int16_t * b, intptr_t strideb); + +#define SETUP_BLOCKCOPY_SS_PP(W, H, cpu) \ + void x265_blockcopy_pp_ ## W ## x ## H ## cpu(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb); \ + void x265_blockcopy_ss_ ## W ## x ## H ## cpu(int16_t * a, intptr_t stridea, int16_t * b, intptr_t strideb); + +#define BLOCKCOPY_COMMON(cpu) \ + SETUP_BLOCKCOPY_FUNC(4, 4, cpu); \ + SETUP_BLOCKCOPY_FUNC(4, 2, cpu); \ + SETUP_BLOCKCOPY_FUNC(8, 8, cpu); \ + SETUP_BLOCKCOPY_FUNC(8, 4, cpu); \ + SETUP_BLOCKCOPY_FUNC(4, 8, cpu); \ + SETUP_BLOCKCOPY_FUNC(8, 6, cpu); \ + SETUP_BLOCKCOPY_FUNC(8, 2, cpu); \ + SETUP_BLOCKCOPY_FUNC(16, 16, cpu); \ + SETUP_BLOCKCOPY_FUNC(16, 8, cpu); \ + SETUP_BLOCKCOPY_FUNC(8, 16, cpu); \ + SETUP_BLOCKCOPY_FUNC(16, 12, cpu); \ + SETUP_BLOCKCOPY_FUNC(12, 
16, cpu); \ + SETUP_BLOCKCOPY_FUNC(16, 4, cpu); \ + SETUP_BLOCKCOPY_FUNC(4, 16, cpu); \ + SETUP_BLOCKCOPY_FUNC(32, 32, cpu); \ + SETUP_BLOCKCOPY_FUNC(32, 16, cpu); \ + SETUP_BLOCKCOPY_FUNC(16, 32, cpu); \ + SETUP_BLOCKCOPY_FUNC(32, 24, cpu); \ + SETUP_BLOCKCOPY_FUNC(24, 32, cpu); \ + SETUP_BLOCKCOPY_FUNC(32, 8, cpu); \ + SETUP_BLOCKCOPY_FUNC(8, 32, cpu); \ + SETUP_BLOCKCOPY_FUNC(64, 64, cpu); \ + SETUP_BLOCKCOPY_FUNC(64, 32, cpu); \ + SETUP_BLOCKCOPY_FUNC(32, 64, cpu); \ + SETUP_BLOCKCOPY_FUNC(64, 48, cpu); \ + SETUP_BLOCKCOPY_FUNC(48, 64, cpu); \ + SETUP_BLOCKCOPY_FUNC(64, 16, cpu); \ + SETUP_BLOCKCOPY_FUNC(16, 64, cpu); + +#define BLOCKCOPY_SP(cpu) \ + SETUP_BLOCKCOPY_SP(2, 4, cpu); \ + SETUP_BLOCKCOPY_SP(2, 8, cpu); \ + SETUP_BLOCKCOPY_SP(6, 8, cpu); \ + \ + SETUP_BLOCKCOPY_SP(2, 16, cpu); \ + SETUP_BLOCKCOPY_SP(4, 32, cpu); \ + SETUP_BLOCKCOPY_SP(6, 16, cpu); \ + SETUP_BLOCKCOPY_SP(8, 12, cpu); \ + SETUP_BLOCKCOPY_SP(8, 64, cpu); \ + SETUP_BLOCKCOPY_SP(12, 32, cpu); \ + SETUP_BLOCKCOPY_SP(16, 24, cpu); \ + SETUP_BLOCKCOPY_SP(24, 64, cpu); \ + SETUP_BLOCKCOPY_SP(32, 48, cpu); + +#define BLOCKCOPY_SS_PP(cpu) \ + SETUP_BLOCKCOPY_SS_PP(2, 4, cpu); \ + SETUP_BLOCKCOPY_SS_PP(2, 8, cpu); \ + SETUP_BLOCKCOPY_SS_PP(6, 8, cpu); \ + \ + SETUP_BLOCKCOPY_SS_PP(2, 16, cpu); \ + SETUP_BLOCKCOPY_SS_PP(4, 32, cpu); \ + SETUP_BLOCKCOPY_SS_PP(6, 16, cpu); \ + SETUP_BLOCKCOPY_SS_PP(8, 12, cpu); \ + SETUP_BLOCKCOPY_SS_PP(8, 64, cpu); \ + SETUP_BLOCKCOPY_SS_PP(12, 32, cpu); \ + SETUP_BLOCKCOPY_SS_PP(16, 24, cpu); \ + SETUP_BLOCKCOPY_SS_PP(24, 64, cpu); \ + SETUP_BLOCKCOPY_SS_PP(32, 48, cpu); + + +#define BLOCKCOPY_PS(cpu) \ + SETUP_BLOCKCOPY_PS(2, 4, cpu); \ + SETUP_BLOCKCOPY_PS(2, 8, cpu); \ + SETUP_BLOCKCOPY_PS(4, 2, cpu); \ + SETUP_BLOCKCOPY_PS(4, 4, cpu); \ + SETUP_BLOCKCOPY_PS(4, 8, cpu); \ + SETUP_BLOCKCOPY_PS(4, 16, cpu); \ + SETUP_BLOCKCOPY_PS(6, 8, cpu); \ + SETUP_BLOCKCOPY_PS(8, 2, cpu); \ + SETUP_BLOCKCOPY_PS(8, 4, cpu); \ + SETUP_BLOCKCOPY_PS(8, 6, cpu); \ + 
SETUP_BLOCKCOPY_PS(8, 8, cpu); \ + SETUP_BLOCKCOPY_PS(8, 16, cpu); \ + SETUP_BLOCKCOPY_PS(8, 32, cpu); \ + SETUP_BLOCKCOPY_PS(12, 16, cpu); \ + SETUP_BLOCKCOPY_PS(16, 4, cpu); \ + SETUP_BLOCKCOPY_PS(16, 8, cpu); \ + SETUP_BLOCKCOPY_PS(16, 12, cpu); \ + SETUP_BLOCKCOPY_PS(16, 16, cpu); \ + SETUP_BLOCKCOPY_PS(16, 32, cpu); \ + SETUP_BLOCKCOPY_PS(24, 32, cpu); \ + SETUP_BLOCKCOPY_PS(32, 8, cpu); \ + SETUP_BLOCKCOPY_PS(32, 16, cpu); \ + SETUP_BLOCKCOPY_PS(32, 24, cpu); \ + SETUP_BLOCKCOPY_PS(32, 32, cpu); \ + SETUP_BLOCKCOPY_PS(16, 64, cpu); \ + SETUP_BLOCKCOPY_PS(32, 64, cpu); \ + SETUP_BLOCKCOPY_PS(48, 64, cpu); \ + SETUP_BLOCKCOPY_PS(64, 16, cpu); \ + SETUP_BLOCKCOPY_PS(64, 32, cpu); \ + SETUP_BLOCKCOPY_PS(64, 48, cpu); \ + SETUP_BLOCKCOPY_PS(64, 64, cpu); \ + \ + SETUP_BLOCKCOPY_PS(2, 16, cpu); \ + SETUP_BLOCKCOPY_PS(4, 32, cpu); \ + SETUP_BLOCKCOPY_PS(6, 16, cpu); \ + SETUP_BLOCKCOPY_PS(8, 12, cpu); \ + SETUP_BLOCKCOPY_PS(8, 64, cpu); \ + SETUP_BLOCKCOPY_PS(12, 32, cpu); \ + SETUP_BLOCKCOPY_PS(16, 24, cpu); \ + SETUP_BLOCKCOPY_PS(24, 64, cpu); \ + SETUP_BLOCKCOPY_PS(32, 48, cpu); + +BLOCKCOPY_COMMON(_sse2); +BLOCKCOPY_SS_PP(_sse2); +BLOCKCOPY_SP(_sse4); +BLOCKCOPY_PS(_sse4); + +BLOCKCOPY_SP(_sse2); + +void x265_blockfill_s_4x4_sse2(int16_t *dst, intptr_t dstride, int16_t val); +void x265_blockfill_s_8x8_sse2(int16_t *dst, intptr_t dstride, int16_t val); +void x265_blockfill_s_16x16_sse2(int16_t *dst, intptr_t dstride, int16_t val); +void x265_blockfill_s_32x32_sse2(int16_t *dst, intptr_t dstride, int16_t val); +void x265_blockcopy_ss_16x4_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride); +void x265_blockcopy_ss_16x8_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride); +void x265_blockcopy_ss_16x12_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride); +void x265_blockcopy_ss_16x16_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride); +void x265_blockcopy_ss_16x24_avx(int16_t 
*dest, intptr_t deststride, int16_t *src, intptr_t srcstride); +void x265_blockcopy_ss_16x32_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride); +void x265_blockcopy_ss_16x64_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride); +void x265_blockcopy_ss_64x16_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride); +void x265_blockcopy_ss_64x32_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride); +void x265_blockcopy_ss_64x48_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride); +void x265_blockcopy_ss_64x64_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride); + +void x265_blockcopy_pp_32x8_avx(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb); +void x265_blockcopy_pp_32x16_avx(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb); +void x265_blockcopy_pp_32x24_avx(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb); +void x265_blockcopy_pp_32x32_avx(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb); +void x265_blockcopy_pp_32x48_avx(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb); +void x265_blockcopy_pp_32x64_avx(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb); + +void x265_blockfill_s_16x16_avx2(int16_t *dst, intptr_t dstride, int16_t val); +void x265_blockfill_s_32x32_avx2(int16_t *dst, intptr_t dstride, int16_t val); + +#undef BLOCKCOPY_COMMON +#undef BLOCKCOPY_SS_PP +#undef BLOCKCOPY_SP +#undef BLOCKCOPY_PS +#undef SETUP_BLOCKCOPY_PS +#undef SETUP_BLOCKCOPY_SP +#undef SETUP_BLOCKCOPY_SS_PP +#undef SETUP_BLOCKCOPY_FUNC + +#endif // ifndef X265_I386_PIXEL_H diff --git a/source/common/x86/const-a.asm b/source/common/x86/const-a.asm new file mode 100644 index 0000000..17c3335 --- /dev/null +++ b/source/common/x86/const-a.asm @@ -0,0 +1,111 @@ +;***************************************************************************** +;* const-a.asm: x86 global constants 
+;***************************************************************************** +;* Copyright (C) 2010-2013 x264 project +;* +;* Authors: Loren Merritt +;* Fiona Glaser +;* Min Chen +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +;* +;* This program is also available under a commercial proprietary license. +;* For more information, contact us at license @ x265.com. 
+;***************************************************************************** + +%include "x86inc.asm" + +SECTION_RODATA 32 + +const pb_1, times 32 db 1 + +const hsub_mul, times 16 db 1, -1 +const pw_1, times 16 dw 1 +const pw_16, times 16 dw 16 +const pw_32, times 16 dw 32 +const pw_128, times 16 dw 128 +const pw_256, times 16 dw 256 +const pw_512, times 16 dw 512 +const pw_1023, times 8 dw 1023 +const pw_1024, times 16 dw 1024 +const pw_4096, times 16 dw 4096 +const pw_00ff, times 16 dw 0x00ff +const pw_pixel_max,times 16 dw ((1 << BIT_DEPTH)-1) +const deinterleave_shufd, dd 0,4,1,5,2,6,3,7 +const pb_unpackbd1, times 2 db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3 +const pb_unpackbd2, times 2 db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7 +const pb_unpackwq1, db 0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3 +const pb_unpackwq2, db 4,5,4,5,4,5,4,5,6,7,6,7,6,7,6,7 +const pw_swap, times 2 db 6,7,4,5,2,3,0,1 + +const pb_4, times 16 db 4 +const pb_16, times 16 db 16 +const pb_64, times 16 db 64 +const pb_01, times 8 db 0,1 +const pb_0, times 16 db 0 +const pb_a1, times 16 db 0xa1 +const pb_3, times 16 db 3 +const pb_8, times 16 db 8 +const pb_32, times 16 db 32 +const pb_128, times 16 db 128 +const pb_shuf8x8c, db 0,0,0,0,2,2,2,2,4,4,4,4,6,6,6,6 + +const pw_2, times 8 dw 2 +const pw_m2, times 8 dw -2 +const pw_4, times 8 dw 4 +const pw_8, times 8 dw 8 +const pw_64, times 8 dw 64 +const pw_256, times 8 dw 256 +const pw_32_0, times 4 dw 32, + times 4 dw 0 +const pw_2000, times 8 dw 0x2000 +const pw_8000, times 8 dw 0x8000 +const pw_3fff, times 8 dw 0x3fff +const pw_ppppmmmm, dw 1,1,1,1,-1,-1,-1,-1 +const pw_ppmmppmm, dw 1,1,-1,-1,1,1,-1,-1 +const pw_pmpmpmpm, dw 1,-1,1,-1,1,-1,1,-1 +const pw_pmmpzzzz, dw 1,-1,-1,1,0,0,0,0 +const pd_1, times 8 dd 1 +const pd_2, times 4 dd 2 +const pd_4, times 4 dd 4 +const pd_8, times 4 dd 8 +const pd_16, times 4 dd 16 +const pd_32, times 4 dd 32 +const pd_64, times 4 dd 64 +const pd_128, times 4 dd 128 +const pd_256, times 4 dd 256 +const pd_512, times 4 dd 512 +const 
pd_1024, times 4 dd 1024 +const pd_2048, times 4 dd 2048 +const pd_ffff, times 4 dd 0xffff +const pd_32767, times 4 dd 32767 +const pd_n32768, times 4 dd 0xffff8000 +const pw_ff00, times 8 dw 0xff00 + +const multi_2Row, dw 1, 2, 3, 4, 1, 2, 3, 4 +const multiL, dw 1, 2, 3, 4, 5, 6, 7, 8 +const multiH, dw 9, 10, 11, 12, 13, 14, 15, 16 +const multiH2, dw 17, 18, 19, 20, 21, 22, 23, 24 +const multiH3, dw 25, 26, 27, 28, 29, 30, 31, 32 + +const popcnt_table +%assign x 0 +%rep 256 +; population count +db ((x>>0)&1)+((x>>1)&1)+((x>>2)&1)+((x>>3)&1)+((x>>4)&1)+((x>>5)&1)+((x>>6)&1)+((x>>7)&1) +%assign x x+1 +%endrep + +const sw_64, dd 64 diff --git a/source/common/x86/cpu-a.asm b/source/common/x86/cpu-a.asm new file mode 100644 index 0000000..460c335 --- /dev/null +++ b/source/common/x86/cpu-a.asm @@ -0,0 +1,197 @@ +;***************************************************************************** +;* cpu-a.asm: x86 cpu utilities +;***************************************************************************** +;* Copyright (C) 2003-2013 x264 project +;* +;* Authors: Laurent Aimar +;* Loren Merritt +;* Fiona Glaser +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +;* +;* This program is also available under a commercial proprietary license. +;* For more information, contact us at license @ x265.com. 
+;***************************************************************************** + +%include "x86inc.asm" + +SECTION .text + +;----------------------------------------------------------------------------- +; void cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx ) +;----------------------------------------------------------------------------- +cglobal cpu_cpuid, 5,7 + push rbx + push r4 + push r3 + push r2 + push r1 + mov eax, r0d + xor ecx, ecx + cpuid + pop r4 + mov [r4], eax + pop r4 + mov [r4], ebx + pop r4 + mov [r4], ecx + pop r4 + mov [r4], edx + pop rbx + RET + +;----------------------------------------------------------------------------- +; void cpu_xgetbv( int op, int *eax, int *edx ) +;----------------------------------------------------------------------------- +cglobal cpu_xgetbv, 3,7 + push r2 + push r1 + mov ecx, r0d + xgetbv + pop r4 + mov [r4], eax + pop r4 + mov [r4], edx + RET + +%if ARCH_X86_64 + +;----------------------------------------------------------------------------- +; void stack_align( void (*func)(void*), void *arg ); +;----------------------------------------------------------------------------- +cglobal stack_align + push rbp + mov rbp, rsp +%if WIN64 + sub rsp, 32 ; shadow space +%endif + and rsp, ~31 + mov rax, r0 + mov r0, r1 + mov r1, r2 + mov r2, r3 + call rax + leave + ret + +%else + +;----------------------------------------------------------------------------- +; int cpu_cpuid_test( void ) +; return 0 if unsupported +;----------------------------------------------------------------------------- +cglobal cpu_cpuid_test + pushfd + push ebx + push ebp + push esi + push edi + pushfd + pop eax + mov ebx, eax + xor eax, 0x200000 + push eax + popfd + pushfd + pop eax + xor eax, ebx + pop edi + pop esi + pop ebp + pop ebx + popfd + ret + +cglobal stack_align + push ebp + mov ebp, esp + sub esp, 12 + and esp, ~31 + mov ecx, [ebp+8] + mov edx, [ebp+12] + mov [esp], edx + mov edx, [ebp+16] + mov [esp+4], edx + mov edx, [ebp+20] 
+ mov [esp+8], edx + call ecx + leave + ret + +%endif + +;----------------------------------------------------------------------------- +; void cpu_emms( void ) +;----------------------------------------------------------------------------- +cglobal cpu_emms + emms + ret + +;----------------------------------------------------------------------------- +; void cpu_sfence( void ) +;----------------------------------------------------------------------------- +cglobal cpu_sfence + sfence + ret + +cextern intel_cpu_indicator_init + +;----------------------------------------------------------------------------- +; void safe_intel_cpu_indicator_init( void ); +;----------------------------------------------------------------------------- +cglobal safe_intel_cpu_indicator_init + push r0 + push r1 + push r2 + push r3 + push r4 + push r5 + push r6 +%if ARCH_X86_64 + push r7 + push r8 + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 +%endif + push rbp + mov rbp, rsp +%if WIN64 + sub rsp, 32 ; shadow space +%endif + and rsp, ~31 + call intel_cpu_indicator_init + leave +%if ARCH_X86_64 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop r8 + pop r7 +%endif + pop r6 + pop r5 + pop r4 + pop r3 + pop r2 + pop r1 + pop r0 + ret diff --git a/source/common/x86/dct8.asm b/source/common/x86/dct8.asm new file mode 100644 index 0000000..5323a42 --- /dev/null +++ b/source/common/x86/dct8.asm @@ -0,0 +1,2684 @@ +;***************************************************************************** +;* Copyright (C) 2013 x265 project +;* +;* Authors: Nabajit Deka +;* Min Chen +;* Li Cao +;* Praveen Kumar Tiwari +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. 
+;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +;* +;* This program is also available under a commercial proprietary license. +;* For more information, contact us at license @ x265.com. +;*****************************************************************************/ + +;TO-DO : Further optimize the routines. + +%include "x86inc.asm" +%include "x86util.asm" +SECTION_RODATA 32 +tab_dct8: dw 64, 64, 64, 64, 64, 64, 64, 64 + dw 89, 75, 50, 18, -18, -50, -75, -89 + dw 83, 36, -36, -83, -83, -36, 36, 83 + dw 75, -18, -89, -50, 50, 89, 18, -75 + dw 64, -64, -64, 64, 64, -64, -64, 64 + dw 50, -89, 18, 75, -75, -18, 89, -50 + dw 36, -83, 83, -36, -36, 83, -83, 36 + dw 18, -50, 75, -89, 89, -75, 50, -18 + +dct8_shuf: times 2 db 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9 + +tab_dct16_1: dw 64, 64, 64, 64, 64, 64, 64, 64 + dw 90, 87, 80, 70, 57, 43, 25, 9 + dw 89, 75, 50, 18, -18, -50, -75, -89 + dw 87, 57, 9, -43, -80, -90, -70, -25 + dw 83, 36, -36, -83, -83, -36, 36, 83 + dw 80, 9, -70, -87, -25, 57, 90, 43 + dw 75, -18, -89, -50, 50, 89, 18, -75 + dw 70, -43, -87, 9, 90, 25, -80, -57 + dw 64, -64, -64, 64, 64, -64, -64, 64 + dw 57, -80, -25, 90, -9, -87, 43, 70 + dw 50, -89, 18, 75, -75, -18, 89, -50 + dw 43, -90, 57, 25, -87, 70, 9, -80 + dw 36, -83, 83, -36, -36, 83, -83, 36 + dw 25, -70, 90, -80, 43, 9, -57, 87 + dw 18, -50, 75, -89, 89, -75, 50, -18 + dw 9, -25, 43, -57, 70, -80, 87, -90 + + +tab_dct16_2: dw 64, 64, 64, 64, 64, 64, 64, 64 + dw -9, -25, -43, -57, -70, -80, -87, -90 + dw -89, -75, -50, -18, 18, 50, 75, 89 + dw 25, 70, 90, 80, 43, 
-9, -57, -87 + dw 83, 36, -36, -83, -83, -36, 36, 83 + dw -43, -90, -57, 25, 87, 70, -9, -80 + dw -75, 18, 89, 50, -50, -89, -18, 75 + dw 57, 80, -25, -90, -9, 87, 43, -70 + dw 64, -64, -64, 64, 64, -64, -64, 64 + dw -70, -43, 87, 9, -90, 25, 80, -57 + dw -50, 89, -18, -75, 75, 18, -89, 50 + dw 80, -9, -70, 87, -25, -57, 90, -43 + dw 36, -83, 83, -36, -36, 83, -83, 36 + dw -87, 57, -9, -43, 80, -90, 70, -25 + dw -18, 50, -75, 89, -89, 75, -50, 18 + dw 90, -87, 80, -70, 57, -43, 25, -9 + +dct16_shuf1: times 2 db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 + +dct16_shuf2: times 2 db 0, 1, 14, 15, 2, 3, 12, 13, 4, 5, 10, 11, 6, 7, 8, 9 + +tab_dct32_1: dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 + dw 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4 + dw 90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90 + dw 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13 + dw 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89 + dw 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22 + dw 87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87 + dw 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31 + dw 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83 + dw 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38 + dw 80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80 + dw 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46 + dw 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75 + dw 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54 + dw 70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70 + dw 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61 + dw 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64 + dw 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 
67 + dw 57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57 + dw 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73 + dw 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50 + dw 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78 + dw 43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43 + dw 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82 + dw 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36 + dw 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85 + dw 25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25 + dw 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88 + dw 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18 + dw 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90 + dw 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9 + dw 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90 + +tab_dct32_2: dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 + dw -4, -13, -22, -31, -38, -46, -54, -61, -67, -73, -78, -82, -85, -88, -90, -90 + dw -90, -87, -80, -70, -57, -43, -25, -9, 9, 25, 43, 57, 70, 80, 87, 90 + dw 13, 38, 61, 78, 88, 90, 85, 73, 54, 31, 4, -22, -46, -67, -82, -90 + dw 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89 + dw -22, -61, -85, -90, -73, -38, 4, 46, 78, 90, 82, 54, 13, -31, -67, -88 + dw -87, -57, -9, 43, 80, 90, 70, 25, -25, -70, -90, -80, -43, 9, 57, 87 + dw 31, 78, 90, 61, 4, -54, -88, -82, -38, 22, 73, 90, 67, 13, -46, -85 + dw 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83 + dw -38, -88, -73, -4, 67, 90, 46, -31, -85, -78, -13, 61, 90, 54, -22, -82 + dw -80, -9, 70, 87, 25, -57, -90, -43, 43, 90, 57, -25, -87, -70, 9, 80 + dw 46, 90, 38, -54, -90, -31, 61, 88, 22, -67, -85, -13, 73, 82, 4, -78 + dw 75, -18, -89, -50, 50, 89, 18, 
-75, -75, 18, 89, 50, -50, -89, -18, 75 + dw -54, -85, 4, 88, 46, -61, -82, 13, 90, 38, -67, -78, 22, 90, 31, -73 + dw -70, 43, 87, -9, -90, -25, 80, 57, -57, -80, 25, 90, 9, -87, -43, 70 + dw 61, 73, -46, -82, 31, 88, -13, -90, -4, 90, 22, -85, -38, 78, 54, -67 + dw 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64 + dw -67, -54, 78, 38, -85, -22, 90, 4, -90, 13, 88, -31, -82, 46, 73, -61 + dw -57, 80, 25, -90, 9, 87, -43, -70, 70, 43, -87, -9, 90, -25, -80, 57 + dw 73, 31, -90, 22, 78, -67, -38, 90, -13, -82, 61, 46, -88, 4, 85, -54 + dw 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50 + dw -78, -4, 82, -73, -13, 85, -67, -22, 88, -61, -31, 90, -54, -38, 90, -46 + dw -43, 90, -57, -25, 87, -70, -9, 80, -80, 9, 70, -87, 25, 57, -90, 43 + dw 82, -22, -54, 90, -61, -13, 78, -85, 31, 46, -90, 67, 4, -73, 88, -38 + dw 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36 + dw -85, 46, 13, -67, 90, -73, 22, 38, -82, 88, -54, -4, 61, -90, 78, -31 + dw -25, 70, -90, 80, -43, -9, 57, -87, 87, -57, 9, 43, -80, 90, -70, 25 + dw 88, -67, 31, 13, -54, 82, -90, 78, -46, 4, 38, -73, 90, -85, 61, -22 + dw 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18 + dw -90, 82, -67, 46, -22, -4, 31, -54, 73, -85, 90, -88, 78, -61, 38, -13 + dw -9, 25, -43, 57, -70, 80, -87, 90, -90, 87, -80, 70, -57, 43, -25, 9 + dw 90, -90, 88, -85, 82, -78, 73, -67, 61, -54, 46, -38, 31, -22, 13, -4 + +avx2_idct8_1: times 4 dw 64, 83, 64, 36 + times 4 dw 64, 36, -64, -83 + times 4 dw 64, -36, -64, 83 + times 4 dw 64, -83, 64, -36 + +avx2_idct8_2: times 4 dw 89, 75, 50, 18 + times 4 dw 75, -18, -89, -50 + times 4 dw 50, -89, 18, 75 + times 4 dw 18, -50, 75, -89 + +idct8_shuf1: dd 0, 2, 4, 6, 1, 3, 5, 7 + +idct8_shuf2: times 2 db 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 + +idct8_shuf3: times 2 db 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 + +tab_idct16_1: dw 90, 87, 80, 70, 57, 43, 25, 9 + dw 87, 57, 9, 
-43, -80, -90, -70, -25 + dw 80, 9, -70, -87, -25, 57, 90, 43 + dw 70, -43, -87, 9, 90, 25, -80, -57 + dw 57, -80, -25, 90, -9, -87, 43, 70 + dw 43, -90, 57, 25, -87, 70, 9, -80 + dw 25, -70, 90, -80, 43, 9, -57, 87 + dw 9, -25, 43, -57, 70, -80, 87, -90 + +tab_idct16_2: dw 64, 89, 83, 75, 64, 50, 36, 18 + dw 64, 75, 36, -18, -64, -89, -83, -50 + dw 64, 50, -36, -89, -64, 18, 83, 75 + dw 64, 18, -83, -50, 64, 75, -36, -89 + dw 64, -18, -83, 50, 64, -75, -36, 89 + dw 64, -50, -36, 89, -64, -18, 83, -75 + dw 64, -75, 36, 18, -64, 89, -83, 50 + dw 64, -89, 83, -75, 64, -50, 36, -18 + +idct16_shuff: dd 0, 4, 2, 6, 1, 5, 3, 7 + +idct16_shuff1: dd 2, 6, 0, 4, 3, 7, 1, 5 + +tab_idct32_1: dw 90 ,90 ,88 ,85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4 + dw 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13 + dw 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22 + dw 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31 + dw 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38 + dw 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46 + dw 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54 + dw 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61 + dw 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67 + dw 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73 + dw 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78 + dw 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82 + dw 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85 + dw 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88 + dw 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90 + dw 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90 + + +tab_idct32_2: dw 64, 89, 83, 75, 64, 50, 36, 18 + dw 64, 75, 36, -18, -64, -89, -83, -50 + dw 64, 50, -36, -89, 
-64, 18, 83, 75 + dw 64, 18, -83, -50, 64, 75, -36, -89 + dw 64, -18, -83, 50, 64, -75, -36, 89 + dw 64, -50, -36, 89, -64, -18, 83, -75 + dw 64, -75, 36, 18, -64, 89, -83, 50 + dw 64, -89, 83, -75, 64, -50, 36, -18 + + +tab_idct32_3: dw 90, 87, 80, 70, 57, 43, 25, 9 + dw 87, 57, 9, -43, -80, -90, -70, -25 + dw 80, 9, -70, -87, -25, 57, 90, 43 + dw 70, -43, -87, 9, 90, 25, -80, -57 + dw 57, -80, -25, 90, -9, -87, 43, 70 + dw 43, -90, 57, 25, -87, 70, 9, -80 + dw 25, -70, 90, -80, 43, 9, -57, 87 + dw 9, -25, 43, -57, 70, -80, 87, -90 + +tab_idct32_4: dw 64, 90, 89, 87, 83, 80, 75, 70, 64, 57, 50, 43, 36, 25, 18, 9 + dw 64, 87, 75, 57, 36, 9, -18, -43, -64, -80, -89, -90, -83, -70, -50, -25 + dw 64, 80, 50, 9, -36, -70, -89, -87, -64, -25, 18, 57, 83, 90, 75, 43 + dw 64, 70, 18, -43, -83, -87, -50, 9, 64, 90, 75, 25, -36, -80, -89, -57 + dw 64, 57, -18, -80, -83, -25, 50, 90, 64, -9, -75, -87, -36, 43, 89, 70 + dw 64, 43, -50, -90, -36, 57, 89, 25, -64, -87, -18, 70, 83, 9, -75, -80 + dw 64, 25, -75, -70, 36, 90, 18, -80, -64, 43, 89, 9, -83, -57, 50, 87 + dw 64, 9, -89, -25, 83, 43, -75, -57, 64, 70, -50, -80, 36, 87, -18, -90 + dw 64, -9, -89, 25, 83, -43, -75, 57, 64, -70, -50, 80, 36, -87, -18, 90 + dw 64, -25, -75, 70, 36, -90, 18, 80, -64, -43, 89, -9, -83, 57, 50, -87 + dw 64, -43, -50, 90, -36, -57, 89, -25, -64, 87, -18, -70, 83, -9, -75, 80 + dw 64, -57, -18, 80, -83, 25, 50, -90, 64, 9, -75, 87, -36, -43, 89, -70 + dw 64, -70, 18, 43, -83, 87, -50, -9, 64, -90, 75, -25, -36, 80, -89, 57 + dw 64, -80, 50, -9, -36, 70, -89, 87, -64, 25, 18, -57, 83, -90, 75, -43 + dw 64, -87, 75, -57, 36, -9, -18, 43, -64, 80, -89, 90, -83, 70, -50, 25 + dw 64, -90, 89, -87, 83, -80, 75, -70, 64, -57, 50, -43, 36, -25, 18, -9 + +avx2_dct4: dw 64, 64, 64, 64, 64, 64, 64, 64, 64, -64, 64, -64, 64, -64, 64, -64 + dw 83, 36, 83, 36, 83, 36, 83, 36, 36, -83, 36, -83, 36, -83, 36, -83 + +avx2_idct4_1: dw 64, 64, 64, 64, 64, 64, 64, 64, 64, -64, 64, -64, 64, -64, 64, -64 + dw 83, 
36, 83, 36, 83, 36, 83, 36, 36, -83, 36, -83, 36 ,-83, 36, -83 + +avx2_idct4_2: dw 64, 64, 64, -64, 83, 36, 36, -83 + +const idct4_shuf1, times 2 db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15 + +idct4_shuf2: times 2 db 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8 ,9 ,10, 11 + +tab_dct4: times 4 dw 64, 64 + times 4 dw 83, 36 + times 4 dw 64, -64 + times 4 dw 36, -83 + +dct4_shuf: db 0, 1, 2, 3, 8, 9, 10, 11, 6, 7, 4, 5, 14, 15, 12, 13 + +tab_dst4: times 2 dw 29, 55, 74, 84 + times 2 dw 74, 74, 0, -74 + times 2 dw 84, -29, -74, 55 + times 2 dw 55, -84, 74, -29 + +tab_idst4: times 4 dw 29, +84 + times 4 dw +74, +55 + times 4 dw 55, -29 + times 4 dw +74, -84 + times 4 dw 74, -74 + times 4 dw 0, +74 + times 4 dw 84, +55 + times 4 dw -74, -29 + +tab_dct8_1: times 2 dw 89, 50, 75, 18 + times 2 dw 75, -89, -18, -50 + times 2 dw 50, 18, -89, 75 + times 2 dw 18, 75, -50, -89 + +tab_dct8_2: times 2 dd 83, 36 + times 2 dd 36, 83 + times 1 dd 89, 75, 50, 18 + times 1 dd 75, -18, -89, -50 + times 1 dd 50, -89, 18, 75 + times 1 dd 18, -50, 75, -89 + +tab_idct8_3: times 4 dw 89, 75 + times 4 dw 50, 18 + times 4 dw 75, -18 + times 4 dw -89, -50 + times 4 dw 50, -89 + times 4 dw 18, 75 + times 4 dw 18, -50 + times 4 dw 75, -89 + +pb_unpackhlw1: db 0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15 + +pb_idct8even: db 0, 1, 8, 9, 4, 5, 12, 13, 0, 1, 8, 9, 4, 5, 12, 13 + +tab_idct8_1: times 1 dw 64, -64, 36, -83, 64, 64, 83, 36 + +tab_idct8_2: times 1 dw 89, 75, 50, 18, 75, -18, -89, -50 + times 1 dw 50, -89, 18, 75, 18, -50, 75, -89 + +pb_idct8odd: db 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 + +SECTION .text +cextern pd_1 +cextern pd_2 +cextern pd_4 +cextern pd_8 +cextern pd_16 +cextern pd_32 +cextern pd_64 +cextern pd_128 +cextern pd_256 +cextern pd_512 +cextern pd_1024 +cextern pd_2048 +cextern pw_ppppmmmm + +;------------------------------------------------------ +;void dct4(int16_t *src, int32_t *dst, intptr_t stride) 
+;------------------------------------------------------ +INIT_XMM sse2 +cglobal dct4, 3, 4, 8 +%if BIT_DEPTH == 10 + %define DCT_SHIFT 3 + mova m7, [pd_4] +%elif BIT_DEPTH == 8 + %define DCT_SHIFT 1 + mova m7, [pd_1] +%else + %error Unsupported BIT_DEPTH! +%endif + add r2d, r2d + lea r3, [tab_dct4] + + mova m4, [r3 + 0 * 16] + mova m5, [r3 + 1 * 16] + mova m6, [r3 + 2 * 16] + movh m0, [r0 + 0 * r2] + movh m1, [r0 + 1 * r2] + punpcklqdq m0, m1 + pshufd m0, m0, 0xD8 + pshufhw m0, m0, 0xB1 + + lea r0, [r0 + 2 * r2] + movh m1, [r0] + movh m2, [r0 + r2] + punpcklqdq m1, m2 + pshufd m1, m1, 0xD8 + pshufhw m1, m1, 0xB1 + + punpcklqdq m2, m0, m1 + punpckhqdq m0, m1 + + paddw m1, m2, m0 + psubw m2, m0 + pmaddwd m0, m1, m4 + paddd m0, m7 + psrad m0, DCT_SHIFT + pmaddwd m3, m2, m5 + paddd m3, m7 + psrad m3, DCT_SHIFT + packssdw m0, m3 + pshufd m0, m0, 0xD8 + pshufhw m0, m0, 0xB1 + pmaddwd m1, m6 + paddd m1, m7 + psrad m1, DCT_SHIFT + pmaddwd m2, [r3 + 3 * 16] + paddd m2, m7 + psrad m2, DCT_SHIFT + packssdw m1, m2 + pshufd m1, m1, 0xD8 + pshufhw m1, m1, 0xB1 + + punpcklqdq m2, m0, m1 + punpckhqdq m0, m1 + + mova m7, [pd_128] + + pmaddwd m1, m2, m4 + pmaddwd m3, m0, m4 + paddd m1, m3 + paddd m1, m7 + psrad m1, 8 + movu [r1 + 0 * 16], m1 + + pmaddwd m1, m2, m5 + pmaddwd m3, m0, m5 + psubd m1, m3 + paddd m1, m7 + psrad m1, 8 + movu [r1 + 1 * 16], m1 + + pmaddwd m1, m2, m6 + pmaddwd m3, m0, m6 + paddd m1, m3 + paddd m1, m7 + psrad m1, 8 + movu [r1 + 2 * 16], m1 + + pmaddwd m2, [r3 + 3 * 16] + pmaddwd m0, [r3 + 3 * 16] + psubd m2, m0 + paddd m2, m7 + psrad m2, 8 + movu [r1 + 3 * 16], m2 + RET + +; DCT 4x4 +; +; Input parameters: +; - r0: source +; - r1: destination +; - r2: source stride +INIT_YMM avx2 +cglobal dct4, 3, 4, 8, src, dst, srcStride +%if BIT_DEPTH == 10 + %define DCT_SHIFT 3 + vbroadcasti128 m7, [pd_4] +%elif BIT_DEPTH == 8 + %define DCT_SHIFT 1 + vbroadcasti128 m7, [pd_1] +%else + %error Unsupported BIT_DEPTH! 
+%endif + add r2d, r2d + lea r3, [avx2_dct4] + + vbroadcasti128 m4, [dct4_shuf] + mova m5, [r3] + mova m6, [r3 + 32] + movq xm0, [r0] + movhps xm0, [r0 + r2] + lea r0, [r0 + 2 * r2] + movq xm1, [r0] + movhps xm1, [r0 + r2] + + vinserti128 m0, m0, xm1, 1 + pshufb m0, m4 + vpermq m1, m0, 11011101b + vpermq m0, m0, 10001000b + paddw m2, m0, m1 + psubw m0, m1 + + pmaddwd m2, m5 + paddd m2, m7 + psrad m2, DCT_SHIFT + + pmaddwd m0, m6 + paddd m0, m7 + psrad m0, DCT_SHIFT + + packssdw m2, m0 + pshufb m2, m4 + vpermq m1, m2, 11011101b + vpermq m2, m2, 10001000b + vbroadcasti128 m7, [pd_128] + + pmaddwd m0, m2, m5 + pmaddwd m3, m1, m5 + paddd m3, m0 + paddd m3, m7 + psrad m3, 8 + + pmaddwd m2, m6 + pmaddwd m1, m6 + psubd m2, m1 + paddd m2, m7 + psrad m2, 8 + + movu [r1], xm3 + movu [r1 + mmsize/2], m2 + vextracti128 [r1 + mmsize], m3, 1 + vextracti128 [r1 + mmsize + mmsize/2], m2, 1 + RET + +;------------------------------------------------------- +;void idct4(int32_t *src, int16_t *dst, intptr_t stride) +;------------------------------------------------------- +INIT_XMM sse2 +cglobal idct4, 3, 4, 7 +%if BIT_DEPTH == 8 + %define IDCT4_OFFSET [pd_2048] + %define IDCT4_SHIFT 12 +%elif BIT_DEPTH == 10 + %define IDCT4_OFFSET [pd_512] + %define IDCT4_SHIFT 10 +%else + %error Unsupported BIT_DEPTH! 
+%endif + add r2d, r2d + lea r3, [tab_dct4] + + mova m6, [pd_64] + + movu m0, [r0 + 0 * 16] + movu m1, [r0 + 1 * 16] + packssdw m0, m1 + + movu m1, [r0 + 2 * 16] + movu m2, [r0 + 3 * 16] + packssdw m1, m2 + + punpcklwd m2, m0, m1 + pmaddwd m3, m2, [r3 + 0 * 16] ; m3 = E1 + paddd m3, m6 + + pmaddwd m2, [r3 + 2 * 16] ; m2 = E2 + paddd m2, m6 + + punpckhwd m0, m1 + pmaddwd m1, m0, [r3 + 1 * 16] ; m1 = O1 + pmaddwd m0, [r3 + 3 * 16] ; m0 = O2 + + paddd m4, m3, m1 + psrad m4, 7 ; m4 = m128iA + paddd m5, m2, m0 + psrad m5, 7 + packssdw m4, m5 ; m4 = m128iA + + psubd m2, m0 + psrad m2, 7 + psubd m3, m1 + psrad m3, 7 + packssdw m2, m3 ; m2 = m128iD + + punpcklwd m1, m4, m2 ; m1 = S0 + punpckhwd m4, m2 ; m4 = S8 + + punpcklwd m0, m1, m4 ; m0 = m128iA + punpckhwd m1, m4 ; m1 = m128iD + + mova m6, IDCT4_OFFSET + + punpcklwd m2, m0, m1 + pmaddwd m3, m2, [r3 + 0 * 16] + paddd m3, m6 ; m3 = E1 + + pmaddwd m2, [r3 + 2 * 16] + paddd m2, m6 ; m2 = E2 + + punpckhwd m0, m1 + pmaddwd m1, m0, [r3 + 1 * 16] ; m1 = O1 + pmaddwd m0, [r3 + 3 * 16] ; m0 = O2 + + paddd m4, m3, m1 + psrad m4, IDCT4_SHIFT ; m4 = m128iA + paddd m5, m2, m0 + psrad m5, IDCT4_SHIFT + packssdw m4, m5 ; m4 = m128iA + + psubd m2, m0 + psrad m2, IDCT4_SHIFT + psubd m3, m1 + psrad m3, IDCT4_SHIFT + packssdw m2, m3 ; m2 = m128iD + + punpcklwd m1, m4, m2 + punpckhwd m4, m2 + + punpcklwd m0, m1, m4 + movlps [r1 + 0 * r2], m0 + movhps [r1 + 1 * r2], m0 + + punpckhwd m1, m4 + movlps [r1 + 2 * r2], m1 + lea r1, [r1 + 2 * r2] + movhps [r1 + r2], m1 + + RET + +;------------------------------------------------------ +;void dst4(int16_t *src, int32_t *dst, intptr_t stride) +;------------------------------------------------------ +INIT_XMM ssse3 +%if ARCH_X86_64 +cglobal dst4, 3, 4, 8+2 + %define coef2 m8 + %define coef3 m9 +%else ; ARCH_X86_64 = 0 +cglobal dst4, 3, 4, 8 + %define coef2 [r3 + 2 * 16] + %define coef3 [r3 + 3 * 16] +%endif ; ARCH_X86_64 +%define coef0 m6 +%define coef1 m7 + +%if BIT_DEPTH == 8 + %define DST_SHIFT 1 
+ mova m5, [pd_1] +%elif BIT_DEPTH == 10 + %define DST_SHIFT 3 + mova m5, [pd_4] +%endif + add r2d, r2d + lea r3, [tab_dst4] + mova coef0, [r3 + 0 * 16] + mova coef1, [r3 + 1 * 16] +%if ARCH_X86_64 + mova coef2, [r3 + 2 * 16] + mova coef3, [r3 + 3 * 16] +%endif + movh m0, [r0 + 0 * r2] ; load + movh m1, [r0 + 1 * r2] + punpcklqdq m0, m1 + lea r0, [r0 + 2 * r2] + movh m1, [r0] + movh m2, [r0 + r2] + punpcklqdq m1, m2 + pmaddwd m2, m0, coef0 ; DST1 + pmaddwd m3, m1, coef0 + phaddd m2, m3 + paddd m2, m5 + psrad m2, DST_SHIFT + pmaddwd m3, m0, coef1 + pmaddwd m4, m1, coef1 + phaddd m3, m4 + paddd m3, m5 + psrad m3, DST_SHIFT + packssdw m2, m3 ; m2 = T70 + pmaddwd m3, m0, coef2 + pmaddwd m4, m1, coef2 + phaddd m3, m4 + paddd m3, m5 + psrad m3, DST_SHIFT + pmaddwd m0, coef3 + pmaddwd m1, coef3 + phaddd m0, m1 + paddd m0, m5 + psrad m0, DST_SHIFT + packssdw m3, m0 ; m3 = T71 + mova m5, [pd_128] + + pmaddwd m0, m2, coef0 ; DST2 + pmaddwd m1, m3, coef0 + phaddd m0, m1 + paddd m0, m5 + psrad m0, 8 + movu [r1 + 0 * 16], m0 + + pmaddwd m0, m2, coef1 + pmaddwd m1, m3, coef1 + phaddd m0, m1 + paddd m0, m5 + psrad m0, 8 + movu [r1 + 1 * 16], m0 + + pmaddwd m0, m2, coef2 + pmaddwd m1, m3, coef2 + phaddd m0, m1 + paddd m0, m5 + psrad m0, 8 + movu [r1 + 2 * 16], m0 + + pmaddwd m2, coef3 + pmaddwd m3, coef3 + phaddd m2, m3 + paddd m2, m5 + psrad m2, 8 + movu [r1 + 3 * 16], m2 + + RET + +;------------------------------------------------------- +;void idst4(int32_t *src, int16_t *dst, intptr_t stride) +;------------------------------------------------------- +INIT_XMM sse2 +cglobal idst4, 3, 4, 7 +%if BIT_DEPTH == 8 + mova m6, [pd_2048] + %define IDCT4_SHIFT 12 +%elif BIT_DEPTH == 10 + mova m6, [pd_512] + %define IDCT4_SHIFT 10 +%else + %error Unsupported BIT_DEPTH! 
+%endif + add r2d, r2d + lea r3, [tab_idst4] + mova m5, [pd_64] + + movu m0, [r0 + 0 * 16] + movu m1, [r0 + 1 * 16] + packssdw m0, m1 + + movu m1, [r0 + 2 * 16] + movu m2, [r0 + 3 * 16] + packssdw m1, m2 + + punpcklwd m2, m0, m1 ; m2 = m128iAC + punpckhwd m0, m1 ; m0 = m128iBD + + pmaddwd m1, m2, [r3 + 0 * 16] + pmaddwd m3, m0, [r3 + 1 * 16] + paddd m1, m3 + paddd m1, m5 + psrad m1, 7 ; m1 = S0 + + pmaddwd m3, m2, [r3 + 2 * 16] + pmaddwd m4, m0, [r3 + 3 * 16] + paddd m3, m4 + paddd m3, m5 + psrad m3, 7 ; m3 = S8 + packssdw m1, m3 ; m1 = m128iA + + pmaddwd m3, m2, [r3 + 4 * 16] + pmaddwd m4, m0, [r3 + 5 * 16] + paddd m3, m4 + paddd m3, m5 + psrad m3, 7 ; m3 = S0 + + pmaddwd m2, [r3 + 6 * 16] + pmaddwd m0, [r3 + 7 * 16] + paddd m2, m0 + paddd m2, m5 + psrad m2, 7 ; m2 = S8 + packssdw m3, m2 ; m3 = m128iD + + punpcklwd m0, m1, m3 + punpckhwd m1, m3 + + punpcklwd m2, m0, m1 + punpckhwd m0, m1 + punpcklwd m1, m2, m0 + punpckhwd m2, m0 + pmaddwd m0, m1, [r3 + 0 * 16] + pmaddwd m3, m2, [r3 + 1 * 16] + paddd m0, m3 + paddd m0, m6 + psrad m0, IDCT4_SHIFT ; m0 = S0 + pmaddwd m3, m1, [r3 + 2 * 16] + pmaddwd m4, m2, [r3 + 3 * 16] + paddd m3, m4 + paddd m3, m6 + psrad m3, IDCT4_SHIFT ; m3 = S8 + packssdw m0, m3 ; m0 = m128iA + pmaddwd m3, m1, [r3 + 4 * 16] + pmaddwd m4, m2, [r3 + 5 * 16] + paddd m3, m4 + paddd m3, m6 + psrad m3, IDCT4_SHIFT ; m3 = S0 + pmaddwd m1, [r3 + 6 * 16] + pmaddwd m2, [r3 + 7 * 16] + paddd m1, m2 + paddd m1, m6 + psrad m1, IDCT4_SHIFT ; m1 = S8 + packssdw m3, m1 ; m3 = m128iD + punpcklwd m1, m0, m3 + punpckhwd m0, m3 + + punpcklwd m2, m1, m0 + movlps [r1 + 0 * r2], m2 + movhps [r1 + 1 * r2], m2 + + punpckhwd m1, m0 + movlps [r1 + 2 * r2], m1 + lea r1, [r1 + 2 * r2] + movhps [r1 + r2], m1 + RET + + +;------------------------------------------------------- +; void dct8(int16_t *src, int32_t *dst, intptr_t stride) +;------------------------------------------------------- +INIT_XMM sse4 +cglobal dct8, 3,6,7,0-16*mmsize + ;------------------------ + ; Stack 
Mapping(dword) + ;------------------------ + ; Row0[0-3] Row1[0-3] + ; ... + ; Row6[0-3] Row7[0-3] + ; Row0[0-3] Row7[0-3] + ; ... + ; Row6[4-7] Row7[4-7] + ;------------------------ +%if BIT_DEPTH == 10 + %define DCT_SHIFT 4 + mova m6, [pd_8] +%elif BIT_DEPTH == 8 + %define DCT_SHIFT 2 + mova m6, [pd_2] +%else + %error Unsupported BIT_DEPTH! +%endif + + add r2, r2 + lea r3, [r2 * 3] + mov r5, rsp +%assign x 0 +%rep 2 + movu m0, [r0] + movu m1, [r0 + r2] + movu m2, [r0 + r2 * 2] + movu m3, [r0 + r3] + + punpcklwd m4, m0, m1 + punpckhwd m0, m1 + punpcklwd m5, m2, m3 + punpckhwd m2, m3 + punpckldq m1, m4, m5 ; m1 = [1 0] + punpckhdq m4, m5 ; m4 = [3 2] + punpckldq m3, m0, m2 + punpckhdq m0, m2 + pshufd m2, m3, 0x4E ; m2 = [4 5] + pshufd m0, m0, 0x4E ; m0 = [6 7] + + paddw m3, m1, m0 + psubw m1, m0 ; m1 = [d1 d0] + paddw m0, m4, m2 + psubw m4, m2 ; m4 = [d3 d2] + punpcklqdq m2, m3, m0 ; m2 = [s2 s0] + punpckhqdq m3, m0 + pshufd m3, m3, 0x4E ; m3 = [s1 s3] + + punpcklwd m0, m1, m4 ; m0 = [d2/d0] + punpckhwd m1, m4 ; m1 = [d3/d1] + punpckldq m4, m0, m1 ; m4 = [d3 d1 d2 d0] + punpckhdq m0, m1 ; m0 = [d3 d1 d2 d0] + + ; odd + lea r4, [tab_dct8_1] + pmaddwd m1, m4, [r4 + 0*16] + pmaddwd m5, m0, [r4 + 0*16] + phaddd m1, m5 + paddd m1, m6 + psrad m1, DCT_SHIFT + %if x == 1 + pshufd m1, m1, 0x1B + %endif + mova [r5 + 1*2*mmsize], m1 ; Row 1 + + pmaddwd m1, m4, [r4 + 1*16] + pmaddwd m5, m0, [r4 + 1*16] + phaddd m1, m5 + paddd m1, m6 + psrad m1, DCT_SHIFT + %if x == 1 + pshufd m1, m1, 0x1B + %endif + mova [r5 + 3*2*mmsize], m1 ; Row 3 + + pmaddwd m1, m4, [r4 + 2*16] + pmaddwd m5, m0, [r4 + 2*16] + phaddd m1, m5 + paddd m1, m6 + psrad m1, DCT_SHIFT + %if x == 1 + pshufd m1, m1, 0x1B + %endif + mova [r5 + 5*2*mmsize], m1 ; Row 5 + + pmaddwd m4, [r4 + 3*16] + pmaddwd m0, [r4 + 3*16] + phaddd m4, m0 + paddd m4, m6 + psrad m4, DCT_SHIFT + %if x == 1 + pshufd m4, m4, 0x1B + %endif + mova [r5 + 7*2*mmsize], m4; Row 7 + + ; even + lea r4, [tab_dct4] + paddw m0, m2, m3 ; m0 = [EE1 EE0] 
+ pshufb m0, [pb_unpackhlw1] + psubw m2, m3 ; m2 = [EO1 EO0] + psignw m2, [pw_ppppmmmm] + pshufb m2, [pb_unpackhlw1] + pmaddwd m3, m0, [r4 + 0*16] + paddd m3, m6 + psrad m3, DCT_SHIFT + %if x == 1 + pshufd m3, m3, 0x1B + %endif + mova [r5 + 0*2*mmsize], m3 ; Row 0 + pmaddwd m0, [r4 + 2*16] + paddd m0, m6 + psrad m0, DCT_SHIFT + %if x == 1 + pshufd m0, m0, 0x1B + %endif + mova [r5 + 4*2*mmsize], m0 ; Row 4 + pmaddwd m3, m2, [r4 + 1*16] + paddd m3, m6 + psrad m3, DCT_SHIFT + %if x == 1 + pshufd m3, m3, 0x1B + %endif + mova [r5 + 2*2*mmsize], m3 ; Row 2 + pmaddwd m2, [r4 + 3*16] + paddd m2, m6 + psrad m2, DCT_SHIFT + %if x == 1 + pshufd m2, m2, 0x1B + %endif + mova [r5 + 6*2*mmsize], m2 ; Row 6 + + %if x != 1 + lea r0, [r0 + r2 * 4] + add r5, mmsize + %endif +%assign x x+1 +%endrep + + mov r2, 2 + mov r0, rsp ; r0 = pointer to Low Part + lea r4, [tab_dct8_2] + mova m6, [pd_256] + +.pass2: +%rep 2 + mova m0, [r0 + 0*2*mmsize] ; [3 2 1 0] + mova m1, [r0 + 1*2*mmsize] + paddd m2, m0, [r0 + (0*2+1)*mmsize] + pshufd m2, m2, 0x9C ; m2 = [s2 s1 s3 s0] + paddd m3, m1, [r0 + (1*2+1)*mmsize] + pshufd m3, m3, 0x9C ; m3 = ^^ + psubd m0, [r0 + (0*2+1)*mmsize] ; m0 = [d3 d2 d1 d0] + psubd m1, [r0 + (1*2+1)*mmsize] ; m1 = ^^ + + ; even + phaddd m4, m2, m3 ; m4 = [EE1 EE0 EE1 EE0] + phsubd m2, m3 ; m2 = [EO1 EO0 EO1 EO0] + + pslld m4, 6 ; m4 = [64*EE1 64*EE0] + pmulld m5, m2, [r4 + 0*16] ; m5 = [36*EO1 83*EO0] + pmulld m2, [r4 + 1*16] ; m2 = [83*EO1 36*EO0] + + phaddd m3, m4, m5 ; m3 = [Row2 Row0] + paddd m3, m6 + psrad m3, 9 + phsubd m4, m2 ; m4 = [Row6 Row4] + paddd m4, m6 + psrad m4, 9 + movh [r1 + 0*2*mmsize], m3 + movhps [r1 + 2*2*mmsize], m3 + movh [r1 + 4*2*mmsize], m4 + movhps [r1 + 6*2*mmsize], m4 + + ; odd + pmulld m2, m0, [r4 + 2*16] + pmulld m3, m1, [r4 + 2*16] + pmulld m4, m0, [r4 + 3*16] + pmulld m5, m1, [r4 + 3*16] + phaddd m2, m3 + phaddd m4, m5 + phaddd m2, m4 ; m2 = [Row3 Row1] + paddd m2, m6 + psrad m2, 9 + movh [r1 + 1*2*mmsize], m2 + movhps [r1 + 3*2*mmsize], m2 
+ + pmulld m2, m0, [r4 + 4*16] + pmulld m3, m1, [r4 + 4*16] + pmulld m4, m0, [r4 + 5*16] + pmulld m5, m1, [r4 + 5*16] + phaddd m2, m3 + phaddd m4, m5 + phaddd m2, m4 ; m2 = [Row7 Row5] + paddd m2, m6 + psrad m2, 9 + movh [r1 + 5*2*mmsize], m2 + movhps [r1 + 7*2*mmsize], m2 + + add r1, mmsize/2 + add r0, 2*2*mmsize +%endrep + + dec r2 + jnz .pass2 + RET + +;------------------------------------------------------- +; void idct8(int32_t *src, int16_t *dst, intptr_t stride) +;------------------------------------------------------- +INIT_XMM ssse3 + +cglobal patial_butterfly_inverse_internal_pass1 + movu m0, [r0] + movu m1, [r0 + 4 * 32] + movu m2, [r0 + 2 * 32] + movu m3, [r0 + 6 * 32] + packssdw m0, m2 + packssdw m1, m3 + punpckhwd m2, m0, m1 ; [2 6] + punpcklwd m0, m1 ; [0 4] + pmaddwd m1, m0, [r6] ; EE[0] + pmaddwd m0, [r6 + 32] ; EE[1] + pmaddwd m3, m2, [r6 + 16] ; EO[0] + pmaddwd m2, [r6 + 48] ; EO[1] + + paddd m4, m1, m3 ; E[0] + psubd m1, m3 ; E[3] + paddd m3, m0, m2 ; E[1] + psubd m0, m2 ; E[2] + + ;E[K] = E[k] + add + mova m5, [pd_64] + paddd m0, m5 + paddd m1, m5 + paddd m3, m5 + paddd m4, m5 + + movu m2, [r0 + 32] + movu m5, [r0 + 5 * 32] + packssdw m2, m5 + movu m5, [r0 + 3 * 32] + movu m6, [r0 + 7 * 32] + packssdw m5, m6 + punpcklwd m6, m2, m5 ;[1 3] + punpckhwd m2, m5 ;[5 7] + + pmaddwd m5, m6, [r4] + pmaddwd m7, m2, [r4 + 16] + paddd m5, m7 ; O[0] + + paddd m7, m4, m5 + psrad m7, 7 + + psubd m4, m5 + psrad m4, 7 + + packssdw m7, m4 + movh [r5 + 0 * 16], m7 + movhps [r5 + 7 * 16], m7 + + pmaddwd m5, m6, [r4 + 32] + pmaddwd m4, m2, [r4 + 48] + paddd m5, m4 ; O[1] + + paddd m4, m3, m5 + psrad m4, 7 + + psubd m3, m5 + psrad m3, 7 + + packssdw m4, m3 + movh [r5 + 1 * 16], m4 + movhps [r5 + 6 * 16], m4 + + pmaddwd m5, m6, [r4 + 64] + pmaddwd m4, m2, [r4 + 80] + paddd m5, m4 ; O[2] + + paddd m4, m0, m5 + psrad m4, 7 + + psubd m0, m5 + psrad m0, 7 + + packssdw m4, m0 + movh [r5 + 2 * 16], m4 + movhps [r5 + 5 * 16], m4 + + pmaddwd m5, m6, [r4 + 96] + pmaddwd m4, 
m2, [r4 + 112] + paddd m5, m4 ; O[3] + + paddd m4, m1, m5 + psrad m4, 7 + + psubd m1, m5 + psrad m1, 7 + + packssdw m4, m1 + movh [r5 + 3 * 16], m4 + movhps [r5 + 4 * 16], m4 + + ret + +%macro PARTIAL_BUTTERFLY_PROCESS_ROW 1 +%if BIT_DEPTH == 10 + %define IDCT_SHIFT 10 +%elif BIT_DEPTH == 8 + %define IDCT_SHIFT 12 +%else + %error Unsupported BIT_DEPTH! +%endif + pshufb m4, %1, [pb_idct8even] + pmaddwd m4, [tab_idct8_1] + phsubd m5, m4 + pshufd m4, m4, 0x4E + phaddd m4, m4 + punpckhqdq m4, m5 ;m4 = dd e[ 0 1 2 3] + paddd m4, m6 + + pshufb %1, %1, [r6] + pmaddwd m5, %1, [r4] + pmaddwd %1, [r4 + 16] + phaddd m5, %1 ; m5 = dd O[0, 1, 2, 3] + + paddd %1, m4, m5 + psrad %1, IDCT_SHIFT + + psubd m4, m5 + psrad m4, IDCT_SHIFT + pshufd m4, m4, 0x1B + + packssdw %1, m4 +%undef IDCT_SHIFT +%endmacro + +cglobal patial_butterfly_inverse_internal_pass2 + + mova m0, [r5] + PARTIAL_BUTTERFLY_PROCESS_ROW m0 + movu [r1], m0 + + mova m2, [r5 + 16] + PARTIAL_BUTTERFLY_PROCESS_ROW m2 + movu [r1 + r2], m2 + + mova m1, [r5 + 32] + PARTIAL_BUTTERFLY_PROCESS_ROW m1 + movu [r1 + 2 * r2], m1 + + mova m3, [r5 + 48] + PARTIAL_BUTTERFLY_PROCESS_ROW m3 + movu [r1 + r3], m3 + + ret + +cglobal idct8, 3,7,8 ;,0-16*mmsize + ; alignment stack to 64-bytes + mov r5, rsp + sub rsp, 16*mmsize + gprsize + and rsp, ~(64-1) + mov [rsp + 16*mmsize], r5 + mov r5, rsp + + lea r4, [tab_idct8_3] + lea r6, [tab_dct4] + + call patial_butterfly_inverse_internal_pass1 + + add r0, 16 + add r5, 8 + + call patial_butterfly_inverse_internal_pass1 + +%if BIT_DEPTH == 10 + mova m6, [pd_512] +%elif BIT_DEPTH == 8 + mova m6, [pd_2048] +%else + %error Unsupported BIT_DEPTH! 
+%endif + add r2, r2 + lea r3, [r2 * 3] + lea r4, [tab_idct8_2] + lea r6, [pb_idct8odd] + sub r5, 8 + + call patial_butterfly_inverse_internal_pass2 + + lea r1, [r1 + 4 * r2] + add r5, 64 + + call patial_butterfly_inverse_internal_pass2 + + ; restore origin stack pointer + mov rsp, [rsp + 16*mmsize] + RET + + +;----------------------------------------------------------------------------- +; void denoise_dct(int32_t *dct, uint32_t *sum, uint16_t *offset, int size) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal denoise_dct, 4, 4, 6 + pxor m5, m5 + shr r3d, 2 +.loop: + mova m0, [r0] + pabsd m1, m0 + mova m2, [r1] + paddd m2, m1 + mova [r1], m2 + pmovzxwd m3, [r2] + psubd m1, m3 + pcmpgtd m4, m1, m5 + pand m1, m4 + psignd m1, m0 + mova [r0], m1 + add r0, 16 + add r1, 16 + add r2, 8 + dec r3d + jnz .loop + RET + +INIT_YMM avx2 +cglobal denoise_dct, 4, 4, 6 + pxor m5, m5 + shr r3d, 3 +.loop: + movu m0, [r0] + pabsd m1, m0 + movu m2, [r1] + paddd m2, m1 + movu [r1], m2 + pmovzxwd m3, [r2] + psubd m1, m3 + pcmpgtd m4, m1, m5 + pand m1, m4 + psignd m1, m0 + movu [r0], m1 + add r0, 32 + add r1, 32 + add r2, 16 + dec r3d + jnz .loop + RET +%if ARCH_X86_64 == 1 +%macro DCT8_PASS_1 4 + vpbroadcastq m0, [r6 + %1] + pmaddwd m2, m%3, m0 + pmaddwd m0, m%4 + phaddd m2, m0 + paddd m2, m5 + psrad m2, DCT_SHIFT + packssdw m2, m2 + vpermq m2, m2, 0x08 + mova [r5 + %2], xm2 +%endmacro + +%macro DCT8_PASS_2 1 + vbroadcasti128 m4, [r6 + %1] + pmaddwd m6, m0, m4 + pmaddwd m7, m1, m4 + pmaddwd m8, m2, m4 + pmaddwd m9, m3, m4 + phaddd m6, m7 + phaddd m8, m9 + phaddd m6, m8 + paddd m6, m5 + psrad m6, DCT_SHIFT2 +%endmacro + +INIT_YMM avx2 +cglobal dct8, 3, 7, 10, 0-8*16 +%if BIT_DEPTH == 10 + %define DCT_SHIFT 4 + vbroadcasti128 m5, [pd_8] +%elif BIT_DEPTH == 8 + %define DCT_SHIFT 2 + vbroadcasti128 m5, [pd_2] +%else + %error Unsupported BIT_DEPTH! 
+%endif +%define DCT_SHIFT2 9 + + add r2d, r2d + lea r3, [r2 * 3] + lea r4, [r0 + r2 * 4] + mov r5, rsp + lea r6, [tab_dct8] + mova m6, [dct8_shuf] + + ;pass1 + mova xm0, [r0] + vinserti128 m0, m0, [r4], 1 + mova xm1, [r0 + r2] + vinserti128 m1, m1, [r4 + r2], 1 + mova xm2, [r0 + r2 * 2] + vinserti128 m2, m2, [r4 + r2 * 2], 1 + mova xm3, [r0 + r3] + vinserti128 m3, m3, [r4 + r3], 1 + + punpcklqdq m4, m0, m1 + punpckhqdq m0, m1 + punpcklqdq m1, m2, m3 + punpckhqdq m2, m3 + + pshufb m0, m6 + pshufb m2, m6 + + paddw m3, m4, m0 + paddw m7, m1, m2 + + psubw m4, m0 + psubw m1, m2 + + DCT8_PASS_1 0 * 16, 0 * 16, 3, 7 + DCT8_PASS_1 1 * 16, 2 * 16, 4, 1 + DCT8_PASS_1 2 * 16, 4 * 16, 3, 7 + DCT8_PASS_1 3 * 16, 6 * 16, 4, 1 + DCT8_PASS_1 4 * 16, 1 * 16, 3, 7 + DCT8_PASS_1 5 * 16, 3 * 16, 4, 1 + DCT8_PASS_1 6 * 16, 5 * 16, 3, 7 + DCT8_PASS_1 7 * 16, 7 * 16, 4, 1 + + ;pass2 + mov r2d, 32 + lea r3, [r2 * 3] + lea r4, [r1 + r2 * 4] + vbroadcasti128 m5, [pd_256] + + mova m0, [r5] + mova m1, [r5 + 32] + mova m2, [r5 + 64] + mova m3, [r5 + 96] + + DCT8_PASS_2 0 * 16 + movu [r1], m6 + DCT8_PASS_2 1 * 16 + movu [r1 + r2], m6 + DCT8_PASS_2 2 * 16 + movu [r1 + r2 * 2], m6 + DCT8_PASS_2 3 * 16 + movu [r1 + r3], m6 + DCT8_PASS_2 4 * 16 + movu [r4], m6 + DCT8_PASS_2 5 * 16 + movu [r4 + r2], m6 + DCT8_PASS_2 6 * 16 + movu [r4 + r2 * 2], m6 + DCT8_PASS_2 7 * 16 + movu [r4 + r3], m6 + RET + +%macro DCT16_PASS_1_E 2 + vpbroadcastq m7, [r7 + %1] + + pmaddwd m4, m0, m7 + pmaddwd m6, m2, m7 + phaddd m4, m6 + + paddd m4, m9 + psrad m4, DCT_SHIFT + + packssdw m4, m4 + vpermq m4, m4, 0x08 + + mova [r5 + %2], xm4 +%endmacro + +%macro DCT16_PASS_1_O 2 + vbroadcasti128 m7, [r7 + %1] + + pmaddwd m10, m0, m7 + pmaddwd m11, m2, m7 + phaddd m10, m11 ; [d0 d0 d1 d1 d4 d4 d5 d5] + + pmaddwd m11, m4, m7 + pmaddwd m12, m6, m7 + phaddd m11, m12 ; [d2 d2 d3 d3 d6 d6 d7 d7] + + phaddd m10, m11 ; [d0 d1 d2 d3 d4 d5 d6 d7] + + paddd m10, m9 + psrad m10, DCT_SHIFT + + packssdw m10, m10 ; [w0 w1 w2 w3 - - - - w4 w5 
w6 w7 - - - -] + vpermq m10, m10, 0x08 + + mova [r5 + %2], xm10 +%endmacro + +%macro DCT16_PASS_2 1 + vbroadcasti128 m8, [r7 + %1] + vbroadcasti128 m13, [r8 + %1] + + pmaddwd m10, m0, m8 + pmaddwd m11, m1, m13 + paddd m10, m11 + + pmaddwd m11, m2, m8 + pmaddwd m12, m3, m13 + paddd m11, m12 + phaddd m10, m11 + + pmaddwd m11, m4, m8 + pmaddwd m12, m5, m13 + paddd m11, m12 + + pmaddwd m12, m6, m8 + pmaddwd m13, m7, m13 + paddd m12, m13 + phaddd m11, m12 + + phaddd m10, m11 + paddd m10, m9 + psrad m10, DCT_SHIFT2 +%endmacro +INIT_YMM avx2 +cglobal dct16, 3, 9, 15, 0-16*mmsize +%if BIT_DEPTH == 10 + %define DCT_SHIFT 5 + vbroadcasti128 m9, [pd_16] +%elif BIT_DEPTH == 8 + %define DCT_SHIFT 3 + vbroadcasti128 m9, [pd_4] +%else + %error Unsupported BIT_DEPTH! +%endif +%define DCT_SHIFT2 10 + + add r2d, r2d + + mova m13, [dct16_shuf1] + mova m14, [dct16_shuf2] + lea r7, [tab_dct16_1 + 8 * 16] + lea r8, [tab_dct16_2 + 8 * 16] + lea r3, [r2 * 3] + mov r5, rsp + mov r4d, 2 ; Each iteration process 8 rows, so 16/8 iterations + +.pass1: + lea r6, [r0 + r2 * 4] + + movu m2, [r0] + movu m1, [r6] + vperm2i128 m0, m2, m1, 0x20 ; [row0lo row4lo] + vperm2i128 m1, m2, m1, 0x31 ; [row0hi row4hi] + + movu m4, [r0 + r2] + movu m3, [r6 + r2] + vperm2i128 m2, m4, m3, 0x20 ; [row1lo row5lo] + vperm2i128 m3, m4, m3, 0x31 ; [row1hi row5hi] + + movu m6, [r0 + r2 * 2] + movu m5, [r6 + r2 * 2] + vperm2i128 m4, m6, m5, 0x20 ; [row2lo row6lo] + vperm2i128 m5, m6, m5, 0x31 ; [row2hi row6hi] + + movu m8, [r0 + r3] + movu m7, [r6 + r3] + vperm2i128 m6, m8, m7, 0x20 ; [row3lo row7lo] + vperm2i128 m7, m8, m7, 0x31 ; [row3hi row7hi] + + pshufb m1, m13 + pshufb m3, m13 + pshufb m5, m13 + pshufb m7, m13 + + paddw m8, m0, m1 ;E + psubw m0, m1 ;O + + paddw m1, m2, m3 ;E + psubw m2, m3 ;O + + paddw m3, m4, m5 ;E + psubw m4, m5 ;O + + paddw m5, m6, m7 ;E + psubw m6, m7 ;O + + DCT16_PASS_1_O -7 * 16, 1 * 32 + DCT16_PASS_1_O -5 * 16, 3 * 32 + DCT16_PASS_1_O -3 * 16, 1 * 32 + 16 + DCT16_PASS_1_O -1 * 16, 3 * 32 + 
16 + DCT16_PASS_1_O 1 * 16, 5 * 32 + DCT16_PASS_1_O 3 * 16, 7 * 32 + DCT16_PASS_1_O 5 * 16, 5 * 32 + 16 + DCT16_PASS_1_O 7 * 16, 7 * 32 + 16 + + pshufb m8, m14 + pshufb m1, m14 + phaddw m0, m8, m1 + + pshufb m3, m14 + pshufb m5, m14 + phaddw m2, m3, m5 + + DCT16_PASS_1_E -8 * 16, 0 * 32 + DCT16_PASS_1_E -4 * 16, 0 * 32 + 16 + DCT16_PASS_1_E 0 * 16, 4 * 32 + DCT16_PASS_1_E 4 * 16, 4 * 32 + 16 + + phsubw m0, m8, m1 + phsubw m2, m3, m5 + + DCT16_PASS_1_E -6 * 16, 2 * 32 + DCT16_PASS_1_E -2 * 16, 2 * 32 + 16 + DCT16_PASS_1_E 2 * 16, 6 * 32 + DCT16_PASS_1_E 6 * 16, 6 * 32 + 16 + + lea r0, [r0 + 8 * r2] + add r5, 256 + + dec r4d + jnz .pass1 + + mov r5, rsp + mov r4d, 2 + mov r2d, 64 + lea r3, [r2 * 3] + vbroadcasti128 m9, [pd_512] + +.pass2: + mova m0, [r5 + 0 * 32] ; [row0lo row4lo] + mova m1, [r5 + 8 * 32] ; [row0hi row4hi] + + mova m2, [r5 + 1 * 32] ; [row1lo row5lo] + mova m3, [r5 + 9 * 32] ; [row1hi row5hi] + + mova m4, [r5 + 2 * 32] ; [row2lo row6lo] + mova m5, [r5 + 10 * 32] ; [row2hi row6hi] + + mova m6, [r5 + 3 * 32] ; [row3lo row7lo] + mova m7, [r5 + 11 * 32] ; [row3hi row7hi] + + DCT16_PASS_2 -8 * 16 + movu [r1], m10 + DCT16_PASS_2 -7 * 16 + movu [r1 + r2], m10 + DCT16_PASS_2 -6 * 16 + movu [r1 + r2 * 2], m10 + DCT16_PASS_2 -5 * 16 + movu [r1 + r3], m10 + + lea r6, [r1 + r2 * 4] + DCT16_PASS_2 -4 * 16 + movu [r6], m10 + DCT16_PASS_2 -3 * 16 + movu [r6 + r2], m10 + DCT16_PASS_2 -2 * 16 + movu [r6 + r2 * 2], m10 + DCT16_PASS_2 -1 * 16 + movu [r6 + r3], m10 + + lea r6, [r6 + r2 * 4] + DCT16_PASS_2 0 * 16 + movu [r6], m10 + DCT16_PASS_2 1 * 16 + movu [r6 + r2], m10 + DCT16_PASS_2 2 * 16 + movu [r6 + r2 * 2], m10 + DCT16_PASS_2 3 * 16 + movu [r6 + r3], m10 + + lea r6, [r6 + r2 * 4] + DCT16_PASS_2 4 * 16 + movu [r6], m10 + DCT16_PASS_2 5 * 16 + movu [r6 + r2], m10 + DCT16_PASS_2 6 * 16 + movu [r6 + r2 * 2], m10 + DCT16_PASS_2 7 * 16 + movu [r6 + r3], m10 + + add r1, 32 + add r5, 128 + + dec r4d + jnz .pass2 + RET + +%macro DCT32_PASS_1 4 + vbroadcasti128 m8, [r7 + 
%1] + + pmaddwd m11, m%3, m8 + pmaddwd m12, m%4, m8 + phaddd m11, m12 + + vbroadcasti128 m8, [r7 + %1 + 32] + vbroadcasti128 m10, [r7 + %1 + 48] + pmaddwd m12, m5, m8 + pmaddwd m13, m6, m10 + phaddd m12, m13 + + pmaddwd m13, m4, m8 + pmaddwd m14, m7, m10 + phaddd m13, m14 + + phaddd m12, m13 + + phaddd m11, m12 + paddd m11, m9 + psrad m11, DCT_SHIFT + + vpermq m11, m11, 0xD8 + packssdw m11, m11 + movq [r5 + %2], xm11 + vextracti128 xm10, m11, 1 + movq [r5 + %2 + 64], xm10 +%endmacro + +%macro DCT32_PASS_2 1 + mova m8, [r7 + %1] + mova m10, [r8 + %1] + pmaddwd m11, m0, m8 + pmaddwd m12, m1, m10 + paddd m11, m12 + + pmaddwd m12, m2, m8 + pmaddwd m13, m3, m10 + paddd m12, m13 + + phaddd m11, m12 + + pmaddwd m12, m4, m8 + pmaddwd m13, m5, m10 + paddd m12, m13 + + pmaddwd m13, m6, m8 + pmaddwd m14, m7, m10 + paddd m13, m14 + + phaddd m12, m13 + + phaddd m11, m12 + vextracti128 xm10, m11, 1 + paddd xm11, xm10 + + paddd xm11, xm9 + psrad xm11, DCT_SHIFT2 + +%endmacro + +INIT_YMM avx2 +cglobal dct32, 3, 9, 16, 0-64*mmsize +%if BIT_DEPTH == 10 + %define DCT_SHIFT 6 + vpbroadcastq m9, [pd_32] +%elif BIT_DEPTH == 8 + %define DCT_SHIFT 4 + vpbroadcastq m9, [pd_8] +%else + %error Unsupported BIT_DEPTH! 
+%endif +%define DCT_SHIFT2 11 + + add r2d, r2d + + lea r7, [tab_dct32_1] + lea r8, [tab_dct32_2] + lea r3, [r2 * 3] + mov r5, rsp + mov r4d, 8 + mova m15, [dct16_shuf1] + +.pass1: + movu m2, [r0] + movu m1, [r0 + 32] + pshufb m1, m15 + vpermq m1, m1, 0x4E + psubw m7, m2, m1 + paddw m2, m1 + + movu m1, [r0 + r2 * 2] + movu m0, [r0 + r2 * 2 + 32] + pshufb m0, m15 + vpermq m0, m0, 0x4E + psubw m8, m1, m0 + paddw m1, m0 + vperm2i128 m0, m2, m1, 0x20 ; [row0lo row2lo] for E + vperm2i128 m3, m2, m1, 0x31 ; [row0hi row2hi] for E + pshufb m3, m15 + psubw m1, m0, m3 + paddw m0, m3 + + vperm2i128 m5, m7, m8, 0x20 ; [row0lo row2lo] for O + vperm2i128 m6, m7, m8, 0x31 ; [row0hi row2hi] for O + + + movu m4, [r0 + r2] + movu m2, [r0 + r2 + 32] + pshufb m2, m15 + vpermq m2, m2, 0x4E + psubw m10, m4, m2 + paddw m4, m2 + + movu m3, [r0 + r3] + movu m2, [r0 + r3 + 32] + pshufb m2, m15 + vpermq m2, m2, 0x4E + psubw m11, m3, m2 + paddw m3, m2 + vperm2i128 m2, m4, m3, 0x20 ; [row1lo row3lo] for E + vperm2i128 m8, m4, m3, 0x31 ; [row1hi row3hi] for E + pshufb m8, m15 + psubw m3, m2, m8 + paddw m2, m8 + + vperm2i128 m4, m10, m11, 0x20 ; [row1lo row3lo] for O + vperm2i128 m7, m10, m11, 0x31 ; [row1hi row3hi] for O + + + DCT32_PASS_1 0 * 32, 0 * 64, 0, 2 + DCT32_PASS_1 2 * 32, 2 * 64, 1, 3 + DCT32_PASS_1 4 * 32, 4 * 64, 0, 2 + DCT32_PASS_1 6 * 32, 6 * 64, 1, 3 + DCT32_PASS_1 8 * 32, 8 * 64, 0, 2 + DCT32_PASS_1 10 * 32, 10 * 64, 1, 3 + DCT32_PASS_1 12 * 32, 12 * 64, 0, 2 + DCT32_PASS_1 14 * 32, 14 * 64, 1, 3 + DCT32_PASS_1 16 * 32, 16 * 64, 0, 2 + DCT32_PASS_1 18 * 32, 18 * 64, 1, 3 + DCT32_PASS_1 20 * 32, 20 * 64, 0, 2 + DCT32_PASS_1 22 * 32, 22 * 64, 1, 3 + DCT32_PASS_1 24 * 32, 24 * 64, 0, 2 + DCT32_PASS_1 26 * 32, 26 * 64, 1, 3 + DCT32_PASS_1 28 * 32, 28 * 64, 0, 2 + DCT32_PASS_1 30 * 32, 30 * 64, 1, 3 + + add r5, 8 + lea r0, [r0 + r2 * 4] + + dec r4d + jnz .pass1 + + mov r2d, 128 + lea r3, [r2 * 3] + mov r5, rsp + mov r4d, 8 + vpbroadcastq m9, [pd_1024] + +.pass2: + mova m0, [r5 + 0 * 
64] + mova m1, [r5 + 0 * 64 + 32] + + mova m2, [r5 + 1 * 64] + mova m3, [r5 + 1 * 64 + 32] + + mova m4, [r5 + 2 * 64] + mova m5, [r5 + 2 * 64 + 32] + + mova m6, [r5 + 3 * 64] + mova m7, [r5 + 3 * 64 + 32] + + DCT32_PASS_2 0 * 32 + movu [r1], xm11 + DCT32_PASS_2 1 * 32 + movu [r1 + r2], xm11 + DCT32_PASS_2 2 * 32 + movu [r1 + r2 * 2], xm11 + DCT32_PASS_2 3 * 32 + movu [r1 + r3], xm11 + + lea r6, [r1 + r2 * 4] + DCT32_PASS_2 4 * 32 + movu [r6], xm11 + DCT32_PASS_2 5 * 32 + movu [r6 + r2], xm11 + DCT32_PASS_2 6 * 32 + movu [r6 + r2 * 2], xm11 + DCT32_PASS_2 7 * 32 + movu [r6 + r3], xm11 + + lea r6, [r6 + r2 * 4] + DCT32_PASS_2 8 * 32 + movu [r6], xm11 + DCT32_PASS_2 9 * 32 + movu [r6 + r2], xm11 + DCT32_PASS_2 10 * 32 + movu [r6 + r2 * 2], xm11 + DCT32_PASS_2 11 * 32 + movu [r6 + r3], xm11 + + lea r6, [r6 + r2 * 4] + DCT32_PASS_2 12 * 32 + movu [r6], xm11 + DCT32_PASS_2 13 * 32 + movu [r6 + r2], xm11 + DCT32_PASS_2 14 * 32 + movu [r6 + r2 * 2], xm11 + DCT32_PASS_2 15 * 32 + movu [r6 + r3], xm11 + + lea r6, [r6 + r2 * 4] + DCT32_PASS_2 16 * 32 + movu [r6], xm11 + DCT32_PASS_2 17 * 32 + movu [r6 + r2], xm11 + DCT32_PASS_2 18 * 32 + movu [r6 + r2 * 2], xm11 + DCT32_PASS_2 19 * 32 + movu [r6 + r3], xm11 + + lea r6, [r6 + r2 * 4] + DCT32_PASS_2 20 * 32 + movu [r6], xm11 + DCT32_PASS_2 21 * 32 + movu [r6 + r2], xm11 + DCT32_PASS_2 22 * 32 + movu [r6 + r2 * 2], xm11 + DCT32_PASS_2 23 * 32 + movu [r6 + r3], xm11 + + lea r6, [r6 + r2 * 4] + DCT32_PASS_2 24 * 32 + movu [r6], xm11 + DCT32_PASS_2 25 * 32 + movu [r6 + r2], xm11 + DCT32_PASS_2 26 * 32 + movu [r6 + r2 * 2], xm11 + DCT32_PASS_2 27 * 32 + movu [r6 + r3], xm11 + + lea r6, [r6 + r2 * 4] + DCT32_PASS_2 28 * 32 + movu [r6], xm11 + DCT32_PASS_2 29 * 32 + movu [r6 + r2], xm11 + DCT32_PASS_2 30 * 32 + movu [r6 + r2 * 2], xm11 + DCT32_PASS_2 31 * 32 + movu [r6 + r3], xm11 + + add r5, 256 + add r1, 16 + + dec r4d + jnz .pass2 + RET + +%macro IDCT8_PASS_1 1 + vpbroadcastd m7, [r5 + %1] + vpbroadcastd m10, [r5 + %1 + 4] + 
pmaddwd m5, m4, m7 + pmaddwd m6, m0, m10 + paddd m5, m6 + + vpbroadcastd m7, [r6 + %1] + vpbroadcastd m10, [r6 + %1 + 4] + pmaddwd m6, m1, m7 + pmaddwd m3, m2, m10 + paddd m6, m3 + + paddd m3, m5, m6 + paddd m3, m11 + psrad m3, IDCT_SHIFT1 + + psubd m5, m6 + paddd m5, m11 + psrad m5, IDCT_SHIFT1 + + vpbroadcastd m7, [r5 + %1 + 32] + vpbroadcastd m10, [r5 + %1 + 36] + pmaddwd m6, m4, m7 + pmaddwd m8, m0, m10 + paddd m6, m8 + + vpbroadcastd m7, [r6 + %1 + 32] + vpbroadcastd m10, [r6 + %1 + 36] + pmaddwd m8, m1, m7 + pmaddwd m9, m2, m10 + paddd m8, m9 + + paddd m9, m6, m8 + paddd m9, m11 + psrad m9, IDCT_SHIFT1 + + psubd m6, m8 + paddd m6, m11 + psrad m6, IDCT_SHIFT1 + + packssdw m3, m9 + vpermq m3, m3, 0xD8 + + packssdw m6, m5 + vpermq m6, m6, 0xD8 +%endmacro + +%macro IDCT8_PASS_2 0 + punpcklqdq m2, m0, m1 + punpckhqdq m0, m1 + + pmaddwd m3, m2, [r5] + pmaddwd m5, m2, [r5 + 32] + pmaddwd m6, m2, [r5 + 64] + pmaddwd m7, m2, [r5 + 96] + phaddd m3, m5 + phaddd m6, m7 + pshufb m3, [idct8_shuf2] + pshufb m6, [idct8_shuf2] + punpcklqdq m7, m3, m6 + punpckhqdq m3, m6 + + pmaddwd m5, m0, [r6] + pmaddwd m6, m0, [r6 + 32] + pmaddwd m8, m0, [r6 + 64] + pmaddwd m9, m0, [r6 + 96] + phaddd m5, m6 + phaddd m8, m9 + pshufb m5, [idct8_shuf2] + pshufb m8, [idct8_shuf2] + punpcklqdq m6, m5, m8 + punpckhqdq m5, m8 + + paddd m8, m7, m6 + paddd m8, m12 + psrad m8, IDCT_SHIFT2 + + psubd m7, m6 + paddd m7, m12 + psrad m7, IDCT_SHIFT2 + + pshufb m7, [idct8_shuf3] + packssdw m8, m7 + + paddd m9, m3, m5 + paddd m9, m12 + psrad m9, IDCT_SHIFT2 + + psubd m3, m5 + paddd m3, m12 + psrad m3, IDCT_SHIFT2 + + pshufb m3, [idct8_shuf3] + packssdw m9, m3 +%endmacro + +INIT_YMM avx2 +cglobal idct8, 3, 7, 13, 0-8*16 +%if BIT_DEPTH == 10 + %define IDCT_SHIFT2 10 + vpbroadcastd m12, [pd_512] +%elif BIT_DEPTH == 8 + %define IDCT_SHIFT2 12 + vpbroadcastd m12, [pd_2048] +%else + %error Unsupported BIT_DEPTH! 
+%endif +%define IDCT_SHIFT1 7 + + vbroadcasti128 m11, [pd_64] + + mov r4, rsp + lea r5, [avx2_idct8_1] + lea r6, [avx2_idct8_2] + + ;pass1 + mova m0, [r0 + 0 * 32] + mova m1, [r0 + 4 * 32] + packssdw m0, m1 ; [0 0 0 0 4 4 4 4 0 0 0 0 4 4 4 4] + mova m1, [r0 + 2 * 32] + mova m2, [r0 + 6 * 32] + packssdw m1, m2 ; [2 2 2 2 6 6 6 6 2 2 2 2 6 6 6 6] + mova m2, [r0 + 1 * 32] + mova m3, [r0 + 5 * 32] + packssdw m2, m3 ; [1 1 1 1 5 5 5 5 1 1 1 1 5 5 5 5] + mova m3, [r0 + 3 * 32] + mova m4, [r0 + 7 * 32] + packssdw m3, m4 ; [3 3 3 3 7 7 7 7 3 3 3 3 7 7 7 7] + + mova m5, [idct8_shuf1] + + punpcklwd m4, m0, m1 ; [0 2 0 2 0 2 0 2 0 2 0 2 0 2 0 2] + punpckhwd m0, m1 ; [4 6 4 6 4 6 4 6 4 6 4 6 4 6 4 6] + vpermd m4, m5, m4 + vpermd m0, m5, m0 + + punpcklwd m1, m2, m3 ; [1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3] + punpckhwd m2, m3 ; [5 7 5 7 5 7 5 7 5 7 5 7 5 7 5 7] + vpermd m1, m5, m1 + vpermd m2, m5, m2 + + IDCT8_PASS_1 0 + mova [r4], m3 + mova [r4 + 96], m6 + + IDCT8_PASS_1 64 + mova [r4 + 32], m3 + mova [r4 + 64], m6 + + ;pass2 + add r2d, r2d + lea r3, [r2 * 3] + + mova m0, [r4] + mova m1, [r4 + 32] + IDCT8_PASS_2 + + vextracti128 xm3, m8, 1 + mova [r1], xm8 + mova [r1 + r2], xm3 + vextracti128 xm3, m9, 1 + mova [r1 + r2 * 2], xm9 + mova [r1 + r3], xm3 + + lea r1, [r1 + r2 * 4] + mova m0, [r4 + 64] + mova m1, [r4 + 96] + IDCT8_PASS_2 + + vextracti128 xm3, m8, 1 + mova [r1], xm8 + mova [r1 + r2], xm3 + vextracti128 xm3, m9, 1 + mova [r1 + r2 * 2], xm9 + mova [r1 + r3], xm3 + RET + +%macro IDCT_PASS1 2 + vbroadcasti128 m5, [tab_idct16_2 + %1 * 16] + + pmaddwd m9, m0, m5 + pmaddwd m10, m7, m5 + phaddd m9, m10 + + pmaddwd m10, m6, m5 + pmaddwd m11, m8, m5 + phaddd m10, m11 + + phaddd m9, m10 + vbroadcasti128 m5, [tab_idct16_1 + %1 * 16] + + pmaddwd m10, m1, m5 + pmaddwd m11, m3, m5 + phaddd m10, m11 + + pmaddwd m11, m4, m5 + pmaddwd m12, m2, m5 + phaddd m11, m12 + + phaddd m10, m11 + + paddd m11, m9, m10 + paddd m11, m14 + psrad m11, IDCT_SHIFT1 + + psubd m9, m10 + paddd m9, m14 + psrad 
m9, IDCT_SHIFT1 + + vbroadcasti128 m5, [tab_idct16_2 + %1 * 16 + 16] + + pmaddwd m10, m0, m5 + pmaddwd m12, m7, m5 + phaddd m10, m12 + + pmaddwd m12, m6, m5 + pmaddwd m13, m8, m5 + phaddd m12, m13 + + phaddd m10, m12 + vbroadcasti128 m5, [tab_idct16_1 + %1 * 16 + 16] + + pmaddwd m12, m1, m5 + pmaddwd m13, m3, m5 + phaddd m12, m13 + + pmaddwd m13, m4, m5 + pmaddwd m5, m2 + phaddd m13, m5 + + phaddd m12, m13 + + paddd m5, m10, m12 + paddd m5, m14 + psrad m5, IDCT_SHIFT1 + + psubd m10, m12 + paddd m10, m14 + psrad m10, IDCT_SHIFT1 + + packssdw m11, m5 + packssdw m9, m10 + + mova m10, [idct16_shuff] + mova m5, [idct16_shuff1] + + vpermd m12, m10, m11 + vpermd m13, m5, m9 + mova [r3 + %1 * 16 * 2], xm12 + mova [r3 + %2 * 16 * 2], xm13 + vextracti128 [r3 + %2 * 16 * 2 + 32], m13, 1 + vextracti128 [r3 + %1 * 16 * 2 + 32], m12, 1 +%endmacro + +;------------------------------------------------------- +; void idct16(int32_t *src, int16_t *dst, intptr_t stride) +;------------------------------------------------------- +INIT_YMM avx2 +cglobal idct16, 3, 7, 16, 0-16*mmsize +%if BIT_DEPTH == 10 + %define IDCT_SHIFT2 10 + vpbroadcastd m15, [pd_512] +%elif BIT_DEPTH == 8 + %define IDCT_SHIFT2 12 + vpbroadcastd m15, [pd_2048] +%else + %error Unsupported BIT_DEPTH! 
+%endif +%define IDCT_SHIFT1 7 + + vbroadcasti128 m14, [pd_64] + + add r2d, r2d + mov r3, rsp + mov r4d, 2 + +.pass1: + movu m0, [r0 + 0 * 64] + movu m1, [r0 + 8 * 64] + packssdw m0, m1 ;[0L 8L 0H 8H] + + movu m1, [r0 + 1 * 64] + movu m2, [r0 + 9 * 64] + packssdw m1, m2 ;[1L 9L 1H 9H] + + movu m2, [r0 + 2 * 64] + movu m3, [r0 + 10 * 64] + packssdw m2, m3 ;[2L 10L 2H 10H] + + movu m3, [r0 + 3 * 64] + movu m4, [r0 + 11 * 64] + packssdw m3, m4 ;[3L 11L 3H 11H] + + movu m4, [r0 + 4 * 64] + movu m5, [r0 + 12 * 64] + packssdw m4, m5 ;[4L 12L 4H 12H] + + movu m5, [r0 + 5 * 64] + movu m6, [r0 + 13 * 64] + packssdw m5, m6 ;[5L 13L 5H 13H] + + movu m6, [r0 + 6 * 64] + movu m7, [r0 + 14 * 64] + packssdw m6, m7 ;[6L 14L 6H 14H] + + movu m7, [r0 + 7 * 64] + movu m8, [r0 + 15 * 64] + packssdw m7, m8 ;[7L 15L 7H 15H] + + punpckhwd m8, m0, m2 ;[8 10] + punpcklwd m0, m2 ;[0 2] + + punpckhwd m2, m1, m3 ;[9 11] + punpcklwd m1, m3 ;[1 3] + + punpckhwd m3, m4, m6 ;[12 14] + punpcklwd m4, m6 ;[4 6] + + punpckhwd m6, m5, m7 ;[13 15] + punpcklwd m5, m7 ;[5 7] + + punpckhdq m7, m0, m4 ;[02 22 42 62 03 23 43 63 06 26 46 66 07 27 47 67] + punpckldq m0, m4 ;[00 20 40 60 01 21 41 61 04 24 44 64 05 25 45 65] + + punpckhdq m4, m8, m3 ;[82 102 122 142 83 103 123 143 86 106 126 146 87 107 127 147] + punpckldq m8, m3 ;[80 100 120 140 81 101 121 141 84 104 124 144 85 105 125 145] + + punpckhdq m3, m1, m5 ;[12 32 52 72 13 33 53 73 16 36 56 76 17 37 57 77] + punpckldq m1, m5 ;[10 30 50 70 11 31 51 71 14 34 54 74 15 35 55 75] + + punpckhdq m5, m2, m6 ;[92 112 132 152 93 113 133 153 96 116 136 156 97 117 137 157] + punpckldq m2, m6 ;[90 110 130 150 91 111 131 151 94 114 134 154 95 115 135 155] + + punpckhqdq m6, m0, m8 ;[01 21 41 61 81 101 121 141 05 25 45 65 85 105 125 145] + punpcklqdq m0, m8 ;[00 20 40 60 80 100 120 140 04 24 44 64 84 104 124 144] + + punpckhqdq m8, m7, m4 ;[03 23 43 63 43 103 123 143 07 27 47 67 87 107 127 147] + punpcklqdq m7, m4 ;[02 22 42 62 82 102 122 142 06 26 46 66 86 106 126 
146] + + punpckhqdq m4, m1, m2 ;[11 31 51 71 91 111 131 151 15 35 55 75 95 115 135 155] + punpcklqdq m1, m2 ;[10 30 50 70 90 110 130 150 14 34 54 74 94 114 134 154] + + punpckhqdq m2, m3, m5 ;[13 33 53 73 93 113 133 153 17 37 57 77 97 117 137 157] + punpcklqdq m3, m5 ;[12 32 52 72 92 112 132 152 16 36 56 76 96 116 136 156] + + IDCT_PASS1 0, 14 + IDCT_PASS1 2, 12 + IDCT_PASS1 4, 10 + IDCT_PASS1 6, 8 + + add r0, 32 + add r3, 16 + dec r4d + jnz .pass1 + + mov r3, rsp + mov r4d, 8 + lea r5, [tab_idct16_2] + lea r6, [tab_idct16_1] + + vbroadcasti128 m7, [r5] + vbroadcasti128 m8, [r5 + 16] + vbroadcasti128 m9, [r5 + 32] + vbroadcasti128 m10, [r5 + 48] + vbroadcasti128 m11, [r5 + 64] + vbroadcasti128 m12, [r5 + 80] + vbroadcasti128 m13, [r5 + 96] + +.pass2: + movu m1, [r3] + vpermq m0, m1, 0xD8 + + pmaddwd m1, m0, m7 + pmaddwd m2, m0, m8 + phaddd m1, m2 + + pmaddwd m2, m0, m9 + pmaddwd m3, m0, m10 + phaddd m2, m3 + + phaddd m1, m2 + + pmaddwd m2, m0, m11 + pmaddwd m3, m0, m12 + phaddd m2, m3 + + vbroadcasti128 m14, [r5 + 112] + pmaddwd m3, m0, m13 + pmaddwd m4, m0, m14 + phaddd m3, m4 + + phaddd m2, m3 + + movu m3, [r3 + 32] + vpermq m0, m3, 0xD8 + + vbroadcasti128 m14, [r6] + pmaddwd m3, m0, m14 + vbroadcasti128 m14, [r6 + 16] + pmaddwd m4, m0, m14 + phaddd m3, m4 + + vbroadcasti128 m14, [r6 + 32] + pmaddwd m4, m0, m14 + vbroadcasti128 m14, [r6 + 48] + pmaddwd m5, m0, m14 + phaddd m4, m5 + + phaddd m3, m4 + + vbroadcasti128 m14, [r6 + 64] + pmaddwd m4, m0, m14 + vbroadcasti128 m14, [r6 + 80] + pmaddwd m5, m0, m14 + phaddd m4, m5 + + vbroadcasti128 m14, [r6 + 96] + pmaddwd m6, m0, m14 + vbroadcasti128 m14, [r6 + 112] + pmaddwd m0, m14 + phaddd m6, m0 + + phaddd m4, m6 + + paddd m5, m1, m3 + paddd m5, m15 + psrad m5, IDCT_SHIFT2 + + psubd m1, m3 + paddd m1, m15 + psrad m1, IDCT_SHIFT2 + + paddd m6, m2, m4 + paddd m6, m15 + psrad m6, IDCT_SHIFT2 + + psubd m2, m4 + paddd m2, m15 + psrad m2, IDCT_SHIFT2 + + packssdw m5, m6 + packssdw m1, m2 + pshufb m2, m1, [dct16_shuf1] + + 
mova [r1], xm5 + mova [r1 + 16], xm2 + vextracti128 [r1 + r2], m5, 1 + vextracti128 [r1 + r2 + 16], m2, 1 + + lea r1, [r1 + 2 * r2] + add r3, 64 + dec r4d + jnz .pass2 + RET + +%macro IDCT32_PASS1 1 + vbroadcasti128 m3, [tab_idct32_1 + %1 * 32] + vbroadcasti128 m13, [tab_idct32_1 + %1 * 32 + 16] + pmaddwd m9, m4, m3 + pmaddwd m10, m8, m13 + phaddd m9, m10 + + pmaddwd m10, m2, m3 + pmaddwd m11, m1, m13 + phaddd m10, m11 + + phaddd m9, m10 + + vbroadcasti128 m3, [tab_idct32_1 + (15 - %1) * 32] + vbroadcasti128 m13, [tab_idct32_1 + (15- %1) * 32 + 16] + pmaddwd m10, m4, m3 + pmaddwd m11, m8, m13 + phaddd m10, m11 + + pmaddwd m11, m2, m3 + pmaddwd m12, m1, m13 + phaddd m11, m12 + + phaddd m10, m11 + phaddd m9, m10 ;[row0s0 row2s0 row0s15 row2s15 row1s0 row3s0 row1s15 row3s15] + + vbroadcasti128 m3, [tab_idct32_2 + %1 * 16] + pmaddwd m10, m0, m3 + pmaddwd m11, m7, m3 + phaddd m10, m11 + phaddd m10, m10 + + vbroadcasti128 m3, [tab_idct32_3 + %1 * 16] + pmaddwd m11, m5, m3 + pmaddwd m12, m6, m3 + phaddd m11, m12 + phaddd m11, m11 + + paddd m12, m10, m11 ;[row0a0 row2a0 NIL NIL row1sa0 row3a0 NIL NIL] + psubd m10, m11 ;[row0a15 row2a15 NIL NIL row1a15 row3a15 NIL NIL] + + punpcklqdq m12, m10 ;[row0a0 row2a0 row0a15 row2a15 row1a0 row3a0 row1a15 row3a15] + paddd m10, m9, m12 + paddd m10, m15 + psrad m10, IDCT_SHIFT1 + + psubd m12, m9 + paddd m12, m15 + psrad m12, IDCT_SHIFT1 + + packssdw m10, m12 + vextracti128 xm12, m10, 1 + movd [r3 + %1 * 64], xm10 + movd [r3 + 32 + %1 * 64], xm12 + pextrd [r4 - %1 * 64], xm10, 1 + pextrd [r4+ 32 - %1 * 64], xm12, 1 + pextrd [r3 + 16 * 64 + %1 *64], xm10, 3 + pextrd [r3 + 16 * 64 + 32 + %1 * 64], xm12, 3 + pextrd [r4 + 16 * 64 - %1 * 64], xm10, 2 + pextrd [r4 + 16 * 64 + 32 - %1 * 64], xm12, 2 +%endmacro + +;------------------------------------------------------- +; void idct32(int32_t *src, int16_t *dst, intptr_t stride) +;------------------------------------------------------- + +; TODO: Reduce PHADDD instruction by PADDD + +INIT_YMM 
avx2 +cglobal idct32, 3, 6, 16, 0-32*64 + +%define IDCT_SHIFT1 7 + + vbroadcasti128 m15, [pd_64] + + mov r3, rsp + lea r4, [r3 + 15 * 64] + mov r5d, 8 + +.pass1: + movu xm0, [r0 + 2 * 128] + movu xm1, [r0 + 18 * 128] + vinserti128 m0, m0, [r0 + 0 * 128], 1 + vinserti128 m1, m1, [r0 + 16 * 128], 1 + + packssdw m0, m1 ;[2 18 0 16] + + movu xm1, [r0 + 1 * 128] + movu xm2, [r0 + 9 * 128] + vinserti128 m1, m1, [r0 + 17 * 128], 1 + vinserti128 m2, m2, [r0 + 25 * 128], 1 + packssdw m1, m2 ;[1 9 17 25] + + movu xm2, [r0 + 6 * 128] + movu xm3, [r0 + 22 * 128] + vinserti128 m2, m2, [r0 + 4 * 128], 1 + vinserti128 m3, m3, [r0 + 20 * 128], 1 + packssdw m2, m3 ;[6 22 4 20] + + movu xm3, [r0 + 3 * 128] + movu xm4, [r0 + 11 * 128] + vinserti128 m3, m3, [r0 + 19 * 128], 1 + vinserti128 m4, m4, [r0 + 27 * 128], 1 + packssdw m3, m4 ;[3 11 19 27] + + movu xm4, [r0 + 10 * 128] + movu xm5, [r0 + 26 * 128] + vinserti128 m4, m4, [r0 + 8 * 128], 1 + vinserti128 m5, m5, [r0 + 24 * 128], 1 + packssdw m4, m5 ;[10 26 8 24] + + movu xm5, [r0 + 5 * 128] + movu xm6, [r0 + 13 * 128] + vinserti128 m5, m5, [r0 + 21 * 128], 1 + vinserti128 m6, m6, [r0 + 29 * 128], 1 + packssdw m5, m6 ;[5 13 21 29] + + movu xm6, [r0 + 14 * 128] + movu xm7, [r0 + 30 * 128] + vinserti128 m6, m6, [r0 + 12 * 128], 1 + vinserti128 m7, m7, [r0 + 28 * 128], 1 + packssdw m6, m7 ;[14 30 12 28] + + movu xm7, [r0 + 7 * 128] + movu xm8, [r0 + 15 * 128] + vinserti128 m7, m7, [r0 + 23 * 128], 1 + vinserti128 m8, m8, [r0 + 31 * 128], 1 + packssdw m7, m8 ;[7 15 23 31] + + punpckhwd m8, m0, m2 ;[18 22 16 20] + punpcklwd m0, m2 ;[2 6 0 4] + + punpckhwd m2, m1, m3 ;[9 11 25 27] + punpcklwd m1, m3 ;[1 3 17 19] + + punpckhwd m3, m4, m6 ;[26 30 24 28] + punpcklwd m4, m6 ;[10 14 8 12] + + punpckhwd m6, m5, m7 ;[13 15 29 31] + punpcklwd m5, m7 ;[5 7 21 23] + + punpckhdq m7, m0, m4 ;[22 62 102 142 23 63 103 143 02 42 82 122 03 43 83 123] + punpckldq m0, m4 ;[20 60 100 140 21 61 101 141 00 40 80 120 01 41 81 121] + + punpckhdq m4, m8, m3 
;[182 222 262 302 183 223 263 303 162 202 242 282 163 203 243 283] + punpckldq m8, m3 ;[180 220 260 300 181 221 261 301 160 200 240 280 161 201 241 281] + + punpckhdq m3, m1, m5 ;[12 32 52 72 13 33 53 73 172 192 212 232 173 193 213 233] + punpckldq m1, m5 ;[10 30 50 70 11 31 51 71 170 190 210 230 171 191 211 231] + + punpckhdq m5, m2, m6 ;[92 112 132 152 93 113 133 153 252 272 292 312 253 273 293 313] + punpckldq m2, m6 ;[90 110 130 150 91 111 131 151 250 270 290 310 251 271 291 311] + + punpckhqdq m6, m0, m8 ;[21 61 101 141 181 221 261 301 01 41 81 121 161 201 241 281] + punpcklqdq m0, m8 ;[20 60 100 140 180 220 260 300 00 40 80 120 160 200 240 280] + + punpckhqdq m8, m7, m4 ;[23 63 103 143 183 223 263 303 03 43 83 123 163 203 243 283] + punpcklqdq m7, m4 ;[22 62 102 142 182 222 262 302 02 42 82 122 162 202 242 282] + + punpckhqdq m4, m1, m2 ;[11 31 51 71 91 111 131 151 171 191 211 231 251 271 291 311] + punpcklqdq m1, m2 ;[10 30 50 70 90 110 130 150 170 190 210 230 250 270 290 310] + + punpckhqdq m2, m3, m5 ;[13 33 53 73 93 113 133 153 173 193 213 233 253 273 293 313] + punpcklqdq m3, m5 ;[12 32 52 72 92 112 132 152 172 192 212 232 252 272 292 312] + + vperm2i128 m5, m0, m6, 0x20 ;[20 60 100 140 180 220 260 300 21 61 101 141 181 221 261 301] + vperm2i128 m0, m0, m6, 0x31 ;[00 40 80 120 160 200 240 280 01 41 81 121 161 201 241 281] + + vperm2i128 m6, m7, m8, 0x20 ;[22 62 102 142 182 222 262 302 23 63 103 143 183 223 263 303] + vperm2i128 m7, m7, m8, 0x31 ;[02 42 82 122 162 202 242 282 03 43 83 123 163 203 243 283] + + vperm2i128 m8, m1, m4, 0x31 ;[170 190 210 230 250 270 290 310 171 191 211 231 251 271 291 311] + vperm2i128 m4, m1, m4, 0x20 ;[10 30 50 70 90 110 130 150 11 31 51 71 91 111 131 151] + + vperm2i128 m1, m3, m2, 0x31 ;[172 192 212 232 252 272 292 312 173 193 213 233 253 273 293 313] + vperm2i128 m2, m3, m2, 0x20 ;[12 32 52 72 92 112 132 152 13 33 53 73 93 113 133 153] + + IDCT32_PASS1 0 + IDCT32_PASS1 1 + IDCT32_PASS1 2 + IDCT32_PASS1 3 + IDCT32_PASS1 4 
+ IDCT32_PASS1 5 + IDCT32_PASS1 6 + IDCT32_PASS1 7 + + add r0, 16 + add r3, 4 + add r4, 4 + dec r5d + jnz .pass1 + +%if BIT_DEPTH == 10 + %define IDCT_SHIFT2 10 + vpbroadcastd m15, [pd_512] +%elif BIT_DEPTH == 8 + %define IDCT_SHIFT2 12 + vpbroadcastd m15, [pd_2048] +%else + %error Unsupported BIT_DEPTH! +%endif + + mov r3, rsp + add r2d, r2d + mov r4d, 32 + + mova m7, [tab_idct32_4] + mova m8, [tab_idct32_4 + 32] + mova m9, [tab_idct32_4 + 64] + mova m10, [tab_idct32_4 + 96] + mova m11, [tab_idct32_4 + 128] + mova m12, [tab_idct32_4 + 160] + mova m13, [tab_idct32_4 + 192] + mova m14, [tab_idct32_4 + 224] +.pass2: + movu m0, [r3] + movu m1, [r3 + 32] + + pmaddwd m2, m0, m7 + pmaddwd m3, m0, m8 + phaddd m2, m3 + + pmaddwd m3, m0, m9 + pmaddwd m4, m0, m10 + phaddd m3, m4 + + phaddd m2, m3 + + pmaddwd m3, m0, m11 + pmaddwd m4, m0, m12 + phaddd m3, m4 + + pmaddwd m4, m0, m13 + pmaddwd m5, m0, m14 + phaddd m4, m5 + + phaddd m3, m4 + + vperm2i128 m4, m2, m3, 0x31 + vperm2i128 m2, m2, m3, 0x20 + paddd m2, m4 + + pmaddwd m3, m0, [tab_idct32_4 + 256] + pmaddwd m4, m0, [tab_idct32_4 + 288] + phaddd m3, m4 + + pmaddwd m4, m0, [tab_idct32_4 + 320] + pmaddwd m5, m0, [tab_idct32_4 + 352] + phaddd m4, m5 + + phaddd m3, m4 + + pmaddwd m4, m0, [tab_idct32_4 + 384] + pmaddwd m5, m0, [tab_idct32_4 + 416] + phaddd m4, m5 + + pmaddwd m5, m0, [tab_idct32_4 + 448] + pmaddwd m0, [tab_idct32_4 + 480] + phaddd m5, m0 + + phaddd m4, m5 + + vperm2i128 m0, m3, m4, 0x31 + vperm2i128 m3, m3, m4, 0x20 + paddd m3, m0 + + pmaddwd m4, m1, [tab_idct32_1] + pmaddwd m0, m1, [tab_idct32_1 + 32] + phaddd m4, m0 + + pmaddwd m5, m1, [tab_idct32_1 + 64] + pmaddwd m0, m1, [tab_idct32_1 + 96] + phaddd m5, m0 + + phaddd m4, m5 + + pmaddwd m5, m1, [tab_idct32_1 + 128] + pmaddwd m0, m1, [tab_idct32_1 + 160] + phaddd m5, m0 + + pmaddwd m6, m1, [tab_idct32_1 + 192] + pmaddwd m0, m1, [tab_idct32_1 + 224] + phaddd m6, m0 + + phaddd m5, m6 + + vperm2i128 m0, m4, m5, 0x31 + vperm2i128 m4, m4, m5, 0x20 + paddd m4, m0 + 
+ pmaddwd m5, m1, [tab_idct32_1 + 256] + pmaddwd m0, m1, [tab_idct32_1 + 288] + phaddd m5, m0 + + pmaddwd m6, m1, [tab_idct32_1 + 320] + pmaddwd m0, m1, [tab_idct32_1 + 352] + phaddd m6, m0 + + phaddd m5, m6 + + pmaddwd m6, m1, [tab_idct32_1 + 384] + pmaddwd m0, m1, [tab_idct32_1 + 416] + phaddd m6, m0 + + pmaddwd m0, m1, [tab_idct32_1 + 448] + pmaddwd m1, [tab_idct32_1 + 480] + phaddd m0, m1 + + phaddd m6, m0 + + vperm2i128 m0, m5, m6, 0x31 + vperm2i128 m5, m5, m6, 0x20 + paddd m5, m0 + + paddd m6, m2, m4 + paddd m6, m15 + psrad m6, IDCT_SHIFT2 + + psubd m2, m4 + paddd m2, m15 + psrad m2, IDCT_SHIFT2 + + paddd m4, m3, m5 + paddd m4, m15 + psrad m4, IDCT_SHIFT2 + + psubd m3, m5 + paddd m3, m15 + psrad m3, IDCT_SHIFT2 + + packssdw m6, m4 + packssdw m2, m3 + + vpermq m6, m6, 0xD8 + vpermq m2, m2, 0x8D + pshufb m2, [dct16_shuf1] + + mova [r1], m6 + mova [r1 + 32], m2 + + add r1, r2 + add r3, 64 + dec r4d + jnz .pass2 + RET + +;------------------------------------------------------- +; void idct4(int32_t *src, int16_t *dst, intptr_t stride) +;------------------------------------------------------- +INIT_YMM avx2 +cglobal idct4, 3, 4, 6 + +%define IDCT_SHIFT1 7 +%if BIT_DEPTH == 10 + %define IDCT_SHIFT2 10 + vpbroadcastd m5, [pd_512] +%elif BIT_DEPTH == 8 + %define IDCT_SHIFT2 12 + vpbroadcastd m5, [pd_2048] +%else + %error Unsupported BIT_DEPTH! 
+%endif + vbroadcasti128 m4, [pd_64] + + add r2d, r2d + lea r3, [r2 * 3] + + movu m0, [r0] ;[00 01 02 03 10 11 12 13] + movu m1, [r0 + 32] ;[20 21 22 23 30 31 32 33] + + packssdw m0, m1 ;[00 01 02 03 20 21 22 23 10 11 12 13 30 31 32 33] + pshufb m0, [idct4_shuf1] ;[00 20 02 22 01 21 03 23 10 30 12 32 11 31 13 33] + vpermq m2, m0, 0x44 ;[00 20 02 22 01 21 03 23 00 20 02 22 01 21 03 23] + vpermq m0, m0, 0xEE ;[10 30 12 32 11 31 13 33 10 30 12 32 11 31 13 33] + + mova m1, [avx2_idct4_1] + mova m3, [avx2_idct4_1 + 32] + pmaddwd m1, m2 + pmaddwd m3, m0 + + paddd m0, m1, m3 + paddd m0, m4 + psrad m0, IDCT_SHIFT1 ;[00 20 10 30 01 21 11 31] + + psubd m1, m3 + paddd m1, m4 + psrad m1, IDCT_SHIFT1 ;[03 23 13 33 02 22 12 32] + + packssdw m0, m1 ;[00 20 10 30 03 23 13 33 01 21 11 31 02 22 12 32] + vmovshdup m1, m0 ;[10 30 10 30 13 33 13 33 11 31 11 31 12 32 12 32] + vmovsldup m0, m0 ;[00 20 00 20 03 23 03 23 01 21 01 21 02 22 02 22] + + vpbroadcastq m2, [avx2_idct4_2] + vpbroadcastq m3, [avx2_idct4_2 + 8] + pmaddwd m0, m2 + pmaddwd m1, m3 + + paddd m2, m0, m1 + paddd m2, m5 + psrad m2, IDCT_SHIFT2 ;[00 01 10 11 30 31 20 21] + + psubd m0, m1 + paddd m0, m5 + psrad m0, IDCT_SHIFT2 ;[03 02 13 12 33 32 23 22] + + pshufb m0, [idct4_shuf2] ;[02 03 12 13 32 33 22 23] + punpcklqdq m1, m2, m0 ;[00 01 02 03 10 11 12 13] + punpckhqdq m2, m0 ;[30 31 32 33 20 21 22 23] + packssdw m1, m2 ;[00 01 02 03 30 31 32 33 10 11 12 13 20 21 22 23] + vextracti128 xm0, m1, 1 + + movq [r1], xm1 + movq [r1 + r2], xm0 + movhps [r1 + 2 * r2], xm0 + movhps [r1 + r3], xm1 + RET +%endif diff --git a/source/common/x86/dct8.h b/source/common/x86/dct8.h new file mode 100644 index 0000000..3b74f2a --- /dev/null +++ b/source/common/x86/dct8.h @@ -0,0 +1,45 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Nabajit Deka + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU 
General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#ifndef X265_DCT8_H +#define X265_DCT8_H +void x265_dct4_sse2(int16_t *src, int32_t *dst, intptr_t stride); +void x265_dst4_ssse3(int16_t *src, int32_t *dst, intptr_t stride); +void x265_dct8_sse4(int16_t *src, int32_t *dst, intptr_t stride); +void x265_dct4_avx2(int16_t *src, int32_t *dst, intptr_t stride); +void x265_dct8_avx2(int16_t *src, int32_t *dst, intptr_t stride); +void x265_dct16_avx2(int16_t *src, int32_t *dst, intptr_t stride); +void x265_dct32_avx2(int16_t *src, int32_t *dst, intptr_t stride); +void x265_idct32_avx2(int32_t *src, int16_t *dst, intptr_t stride); + +void x265_idst4_sse2(int32_t *src, int16_t *dst, intptr_t stride); +void x265_idct4_sse2(int32_t *src, int16_t *dst, intptr_t stride); +void x265_idct4_avx2(int32_t *src, int16_t *dst, intptr_t stride); +void x265_idct8_ssse3(int32_t *src, int16_t *dst, intptr_t stride); +void x265_idct8_avx2(int32_t *src, int16_t *dst, intptr_t stride); +void x265_idct16_avx2(int32_t *src, int16_t *dst, intptr_t stride); + +void x265_denoise_dct_sse4(int32_t *dct, uint32_t *sum, uint16_t *offset, int size); +void x265_denoise_dct_avx2(int32_t *dct, uint32_t *sum, uint16_t *offset, 
int size); + +#endif // ifndef X265_DCT8_H diff --git a/source/common/x86/intrapred.h b/source/common/x86/intrapred.h new file mode 100644 index 0000000..9a71457 --- /dev/null +++ b/source/common/x86/intrapred.h @@ -0,0 +1,164 @@ +/***************************************************************************** + * intrapred.h: Intra Prediction metrics + ***************************************************************************** + * Copyright (C) 2003-2013 x264 project + * + * Authors: Min Chen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. 
+ *****************************************************************************/ + +#ifndef X265_INTRAPRED_H +#define X265_INTRAPRED_H + +void x265_intra_pred_dc4_sse4(pixel* dst, intptr_t dstStride, pixel* above, pixel* left, int, int filter); +void x265_intra_pred_dc8_sse4(pixel* dst, intptr_t dstStride, pixel* above, pixel* left, int, int filter); +void x265_intra_pred_dc16_sse4(pixel* dst, intptr_t dstStride, pixel* above, pixel* left, int, int filter); +void x265_intra_pred_dc32_sse4(pixel* dst, intptr_t dstStride, pixel* above, pixel* left, int, int filter); + +void x265_intra_pred_planar4_sse4(pixel* dst, intptr_t dstStride, pixel* above, pixel* left, int, int); +void x265_intra_pred_planar8_sse4(pixel* dst, intptr_t dstStride, pixel* above, pixel* left, int, int); +void x265_intra_pred_planar16_sse4(pixel* dst, intptr_t dstStride, pixel* above, pixel* left, int, int); +void x265_intra_pred_planar32_sse4(pixel* dst, intptr_t dstStride, pixel* above, pixel* left, int, int); + +#define DECL_ANG(bsize, mode, cpu) \ + void x265_intra_pred_ang ## bsize ## _ ## mode ## _ ## cpu(pixel * dst, intptr_t dstStride, pixel * refLeft, pixel * refAbove, int dirMode, int bFilter); + +DECL_ANG(4, 2, ssse3); +DECL_ANG(4, 3, sse4); +DECL_ANG(4, 4, sse4); +DECL_ANG(4, 5, sse4); +DECL_ANG(4, 6, sse4); +DECL_ANG(4, 7, sse4); +DECL_ANG(4, 8, sse4); +DECL_ANG(4, 9, sse4); +DECL_ANG(4, 10, sse4); +DECL_ANG(4, 11, sse4); +DECL_ANG(4, 12, sse4); +DECL_ANG(4, 13, sse4); +DECL_ANG(4, 14, sse4); +DECL_ANG(4, 15, sse4); +DECL_ANG(4, 16, sse4); +DECL_ANG(4, 17, sse4); +DECL_ANG(4, 18, sse4); +DECL_ANG(4, 26, sse4); +DECL_ANG(8, 2, ssse3); +DECL_ANG(8, 3, sse4); +DECL_ANG(8, 4, sse4); +DECL_ANG(8, 5, sse4); +DECL_ANG(8, 6, sse4); +DECL_ANG(8, 7, sse4); +DECL_ANG(8, 8, sse4); +DECL_ANG(8, 9, sse4); +DECL_ANG(8, 10, sse4); +DECL_ANG(8, 11, sse4); +DECL_ANG(8, 12, sse4); +DECL_ANG(8, 13, sse4); +DECL_ANG(8, 14, sse4); +DECL_ANG(8, 15, sse4); +DECL_ANG(8, 16, sse4); +DECL_ANG(8, 17, sse4); 
+DECL_ANG(8, 18, sse4); +DECL_ANG(8, 19, sse4); +DECL_ANG(8, 20, sse4); +DECL_ANG(8, 21, sse4); +DECL_ANG(8, 22, sse4); +DECL_ANG(8, 23, sse4); +DECL_ANG(8, 24, sse4); +DECL_ANG(8, 25, sse4); +DECL_ANG(8, 26, sse4); +DECL_ANG(8, 27, sse4); +DECL_ANG(8, 28, sse4); +DECL_ANG(8, 29, sse4); +DECL_ANG(8, 30, sse4); +DECL_ANG(8, 31, sse4); +DECL_ANG(8, 32, sse4); +DECL_ANG(8, 33, sse4); + +DECL_ANG(16, 2, ssse3); +DECL_ANG(16, 3, sse4); +DECL_ANG(16, 4, sse4); +DECL_ANG(16, 5, sse4); +DECL_ANG(16, 6, sse4); +DECL_ANG(16, 7, sse4); +DECL_ANG(16, 8, sse4); +DECL_ANG(16, 9, sse4); +DECL_ANG(16, 10, sse4); +DECL_ANG(16, 11, sse4); +DECL_ANG(16, 12, sse4); +DECL_ANG(16, 13, sse4); +DECL_ANG(16, 14, sse4); +DECL_ANG(16, 15, sse4); +DECL_ANG(16, 16, sse4); +DECL_ANG(16, 17, sse4); +DECL_ANG(16, 18, sse4); +DECL_ANG(16, 19, sse4); +DECL_ANG(16, 20, sse4); +DECL_ANG(16, 21, sse4); +DECL_ANG(16, 22, sse4); +DECL_ANG(16, 23, sse4); +DECL_ANG(16, 24, sse4); +DECL_ANG(16, 25, sse4); +DECL_ANG(16, 26, sse4); +DECL_ANG(16, 27, sse4); +DECL_ANG(16, 28, sse4); +DECL_ANG(16, 29, sse4); +DECL_ANG(16, 30, sse4); +DECL_ANG(16, 31, sse4); +DECL_ANG(16, 32, sse4); +DECL_ANG(16, 33, sse4); + +DECL_ANG(32, 2, ssse3); +DECL_ANG(32, 3, sse4); +DECL_ANG(32, 4, sse4); +DECL_ANG(32, 5, sse4); +DECL_ANG(32, 6, sse4); +DECL_ANG(32, 7, sse4); +DECL_ANG(32, 8, sse4); +DECL_ANG(32, 9, sse4); +DECL_ANG(32, 10, sse4); +DECL_ANG(32, 11, sse4); +DECL_ANG(32, 12, sse4); +DECL_ANG(32, 13, sse4); +DECL_ANG(32, 14, sse4); +DECL_ANG(32, 15, sse4); +DECL_ANG(32, 16, sse4); +DECL_ANG(32, 17, sse4); +DECL_ANG(32, 18, sse4); +DECL_ANG(32, 19, sse4); +DECL_ANG(32, 20, sse4); +DECL_ANG(32, 21, sse4); +DECL_ANG(32, 22, sse4); +DECL_ANG(32, 23, sse4); +DECL_ANG(32, 24, sse4); +DECL_ANG(32, 25, sse4); +DECL_ANG(32, 26, sse4); +DECL_ANG(32, 27, sse4); +DECL_ANG(32, 28, sse4); +DECL_ANG(32, 29, sse4); +DECL_ANG(32, 30, sse4); +DECL_ANG(32, 31, sse4); +DECL_ANG(32, 32, sse4); +DECL_ANG(32, 33, sse4); + +#undef DECL_ANG +void 
x265_all_angs_pred_4x4_sse4(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, int bLuma); +void x265_all_angs_pred_8x8_sse4(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, int bLuma); +void x265_all_angs_pred_16x16_sse4(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, int bLuma); +void x265_all_angs_pred_32x32_sse4(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, int bLuma); +#endif // ifndef X265_INTRAPRED_H diff --git a/source/common/x86/intrapred16.asm b/source/common/x86/intrapred16.asm new file mode 100644 index 0000000..236be2c --- /dev/null +++ b/source/common/x86/intrapred16.asm @@ -0,0 +1,12780 @@ +;***************************************************************************** +;* Copyright (C) 2013 x265 project +;* +;* Authors: Dnyaneshwar Gorade +;* Yuvaraj Venkatesh +;* Min Chen +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +;* +;* This program is also available under a commercial proprietary license. +;* For more information, contact us at license @ x265.com. 
+;*****************************************************************************/ + +%include "x86inc.asm" +%include "x86util.asm" + +SECTION_RODATA 32 + +const ang_table +%assign x 0 +%rep 32 + times 4 dw (32-x), x +%assign x x+1 +%endrep + +const shuf_mode_13_23, db 0, 0, 14, 15, 6, 7, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0 +const shuf_mode_14_22, db 14, 15, 10, 11, 4, 5, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0 +const shuf_mode_15_21, db 12, 13, 8, 9, 4, 5, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0 +const shuf_mode_16_20, db 2, 3, 0, 1, 14, 15, 12, 13, 8, 9, 6, 7, 2, 3, 0, 1 +const shuf_mode_17_19, db 0, 1, 14, 15, 12, 13, 10, 11, 6, 7, 4, 5, 2, 3, 0, 1 +const shuf_mode32_18, db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 +const pw_punpcklwd, db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 +const c_mode32_10_0, db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 + +const pw_unpackwdq, times 8 db 0,1 +const pw_ang8_12, db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 13, 0, 1 +const pw_ang8_13, db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 15, 8, 9, 0, 1 +const pw_ang8_14, db 0, 0, 0, 0, 0, 0, 0, 0, 14, 15, 10, 11, 4, 5, 0, 1 +const pw_ang8_15, db 0, 0, 0, 0, 0, 0, 0, 0, 12, 13, 8, 9, 4, 5, 0, 1 +const pw_ang8_16, db 0, 0, 0, 0, 0, 0, 12, 13, 10, 11, 6, 7, 4, 5, 0, 1 +const pw_ang8_17, db 0, 0, 14, 15, 12, 13, 10, 11, 8, 9, 4, 5, 2, 3, 0, 1 +const pw_swap16, db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 + +const pw_ang16_13, db 14, 15, 8, 9, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +const pw_ang16_16, db 0, 0, 0, 0, 0, 0, 10, 11, 8, 9, 6, 7, 2, 3, 0, 1 + +SECTION .text + +cextern pw_1 +cextern pw_8 +cextern pw_1023 +cextern pd_16 +cextern pd_32 +cextern pw_4096 +cextern multiL +cextern multiH +cextern multi_2Row +cextern pw_swap +cextern pb_unpackwq1 +cextern pb_unpackwq2 + +;------------------------------------------------------------------------------------------------------- +; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter) 
+;-------------------------------------------------------------------------------------------------------
+INIT_XMM sse4
+; DC prediction for a 4x4 block of 16-bit pixels.
+; dst stride (r1) stays in pixel units here; the x2 byte scaling is folded
+; into the [r0 + r1 * 2] addressing (unlike dc8/dc16/dc32, which do add r1, r1).
+cglobal intra_pred_dc4, 4,6,2
+    mov             r4d, r5m
+    add             r2, 2                       ; skip left[0] (top-left sample)
+    add             r3, 2                       ; skip above[0]
+
+    movh            m0, [r3]                    ; sumAbove
+    movh            m1, [r2]                    ; sumLeft
+
+    paddw           m0, m1
+    pshufd          m1, m0, 1
+    paddw           m0, m1
+    phaddw          m0, m0                      ; m0 = sum
+
+    test            r4d, r4d                    ; ZF = (filter == 0); consumed by jz below
+                                                ; (SSE ops and movd/movzx/lea do not touch EFLAGS)
+
+    pmulhrsw        m0, [pw_4096]               ; m0 = (sum + 4) / 8
+    movd            r4d, m0                     ; r4d = dc_val
+    movzx           r4d, r4w
+    pshuflw         m0, m0, 0                   ; m0 = word [dc_val ...]
+
+    ; store DC 4x4
+    movh            [r0], m0
+    movh            [r0 + r1 * 2], m0
+    movh            [r0 + r1 * 4], m0
+    lea             r5, [r0 + r1 * 4]
+    movh            [r5 + r1 * 2], m0
+
+    ; do DC filter
+    jz              .end
+    lea             r5d, [r4d * 2 + 2]          ; r5d = DC * 2 + 2
+    add             r4d, r5d                    ; r4d = DC * 3 + 2
+    movd            m0, r4d
+    pshuflw         m0, m0, 0                   ; m0 = pixDCx3
+
+    ; filter top: (above[x] + 3*DC + 2) >> 2
+    movu            m1, [r3]
+    paddw           m1, m0
+    psraw           m1, 2
+    movh            [r0], m1                    ; overwrite top-left pixel, we will update it later
+
+    ; filter top-left: (above[0] + left[0] + 2*DC + 2) >> 2
+    movzx           r3d, word [r3]
+    add             r5d, r3d
+    movzx           r3d, word [r2]
+    add             r3d, r5d
+    shr             r3d, 2
+    mov             [r0], r3w
+
+    ; filter left: (left[y] + 3*DC + 2) >> 2
+    lea             r0, [r0 + r1 * 2]
+    movu            m1, [r2 + 2]
+    paddw           m1, m0
+    psraw           m1, 2
+    movd            r3d, m1
+    mov             [r0], r3w
+    shr             r3d, 16
+    mov             [r0 + r1 * 2], r3w
+    pextrw          [r0 + r1 * 4], m1, 2
+
+.end:
+
+    RET
+
+
+
+;-------------------------------------------------------------------------------------------------------
+; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
+;-------------------------------------------------------------------------------------------------------
+INIT_XMM sse4
+; DC prediction for an 8x8 block of 16-bit pixels.
+cglobal intra_pred_dc8, 4, 7, 2
+    mov             r4d, r5m
+    add             r2, 2
+    add             r3, 2
+    add             r1, r1                      ; stride: pixels -> bytes
+    movu            m0, [r3]
+    movu            m1, [r2]
+
+    paddw           m0, m1
+    movhlps         m1, m0
+    paddw           m0, m1
+    phaddw          m0, m0
+    pmaddwd         m0, [pw_1]                  ; horizontal add without word overflow
+
+    movd            r5d, m0
+    add             r5d, 8
+    shr             r5d, 4                      ; sum = sum / 16
+    movd            m1, r5d
+    pshuflw         m1, m1, 0                   ; m1 = word [dc_val ...]
+    pshufd          m1, m1, 0
+
+    test            r4d, r4d                    ; ZF = (filter == 0); consumed by jz below
+
+    ; store DC 8x8
+    mov             r6, r0
+    movu            [r0], m1
+    movu            [r0 + r1], m1
+    movu            [r0 + r1 * 2], m1
+    lea             r0, [r0 + r1 * 2]
+    movu            [r0 + r1], m1
+    movu            [r0 + r1 * 2], m1
+    lea             r0, [r0 + r1 * 2]
+    movu            [r0 + r1], m1
+    movu            [r0 + r1 * 2], m1
+    lea             r0, [r0 + r1 * 2]
+    movu            [r0 + r1], m1
+
+    ; Do DC Filter
+    jz              .end
+    lea             r4d, [r5d * 2 + 2]          ; r4d = DC * 2 + 2
+    add             r5d, r4d                    ; r5d = DC * 3 + 2
+    movd            m1, r5d
+    pshuflw         m1, m1, 0                   ; m1 = pixDCx3
+    pshufd          m1, m1, 0
+
+    ; filter top
+    movu            m0, [r3]
+    paddw           m0, m1
+    psraw           m0, 2
+    movu            [r6], m0
+
+    ; filter top-left
+    movzx           r3d, word [r3]
+    add             r4d, r3d
+    movzx           r3d, word [r2]
+    add             r3d, r4d
+    shr             r3d, 2
+    mov             [r6], r3w
+
+    ; filter left
+    add             r6, r1
+    movu            m0, [r2 + 2]
+    paddw           m0, m1
+    psraw           m0, 2
+    pextrw          [r6], m0, 0
+    pextrw          [r6 + r1], m0, 1
+    pextrw          [r6 + r1 * 2], m0, 2
+    lea             r6, [r6 + r1 * 2]
+    pextrw          [r6 + r1], m0, 3
+    pextrw          [r6 + r1 * 2], m0, 4
+    lea             r6, [r6 + r1 * 2]
+    pextrw          [r6 + r1], m0, 5
+    pextrw          [r6 + r1 * 2], m0, 6
+
+.end:
+    RET
+
+
+;-------------------------------------------------------------------------------------------------------
+; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
+;-------------------------------------------------------------------------------------------------------
+INIT_XMM sse4
+; DC prediction for a 16x16 block of 16-bit pixels.
+cglobal intra_pred_dc16, 4, 7, 4
+    mov             r4d, r5m
+    add             r2, 2
+    add             r3, 2
+    add             r1, r1                      ; stride: pixels -> bytes
+    movu            m0, [r3]
+    movu            m1, [r3 + 16]
+    movu            m2, [r2]
+    movu            m3, [r2 + 16]
+
+    paddw           m0, m1
+    paddw           m2, m3
+    paddw           m0, m2
+    movhlps         m1, m0
+    paddw           m0, m1
+    phaddw          m0, m0
+    pmaddwd         m0, [pw_1]
+
+    movd            r5d, m0
+    add             r5d, 16
+    shr             r5d, 5                      ; sum = sum / 32 (16 above + 16 left samples)
+    movd            m1, r5d
+    pshuflw         m1, m1, 0                   ; m1 = word [dc_val ...]
+    pshufd          m1, m1, 0
+
+    test            r4d, r4d                    ; ZF = (filter == 0); consumed by jz below
+
+    ; store DC 16x16
+    mov             r6, r0
+    movu            [r0], m1
+    movu            [r0 + 16], m1
+    movu            [r0 + r1], m1
+    movu            [r0 + 16 + r1], m1
+    lea             r0, [r0 + r1 * 2]
+    movu            [r0], m1
+    movu            [r0 + 16], m1
+    movu            [r0 + r1], m1
+    movu            [r0 + 16 + r1], m1
+    lea             r0, [r0 + r1 * 2]
+    movu            [r0], m1
+    movu            [r0 + 16], m1
+    movu            [r0 + r1], m1
+    movu            [r0 + 16 + r1], m1
+    lea             r0, [r0 + r1 * 2]
+    movu            [r0], m1
+    movu            [r0 + 16], m1
+    movu            [r0 + r1], m1
+    movu            [r0 + 16 + r1], m1
+    lea             r0, [r0 + r1 * 2]
+    movu            [r0], m1
+    movu            [r0 + 16], m1
+    movu            [r0 + r1], m1
+    movu            [r0 + 16 + r1], m1
+    lea             r0, [r0 + r1 * 2]
+    movu            [r0], m1
+    movu            [r0 + 16], m1
+    movu            [r0 + r1], m1
+    movu            [r0 + 16 + r1], m1
+    lea             r0, [r0 + r1 * 2]
+    movu            [r0], m1
+    movu            [r0 + 16], m1
+    movu            [r0 + r1], m1
+    movu            [r0 + 16 + r1], m1
+    lea             r0, [r0 + r1 * 2]
+    movu            [r0], m1
+    movu            [r0 + 16], m1
+    movu            [r0 + r1], m1
+    movu            [r0 + 16 + r1], m1
+
+    ; Do DC Filter
+    jz              .end
+    lea             r4d, [r5d * 2 + 2]          ; r4d = DC * 2 + 2
+    add             r5d, r4d                    ; r5d = DC * 3 + 2
+    movd            m1, r5d
+    pshuflw         m1, m1, 0                   ; m1 = pixDCx3
+    pshufd          m1, m1, 0
+
+    ; filter top
+    movu            m2, [r3]
+    paddw           m2, m1
+    psraw           m2, 2
+    movu            [r6], m2
+    movu            m3, [r3 + 16]
+    paddw           m3, m1
+    psraw           m3, 2
+    movu            [r6 + 16], m3
+
+    ; filter top-left
+    movzx           r3d, word [r3]
+    add             r4d, r3d
+    movzx           r3d, word [r2]
+    add             r3d, r4d
+    shr             r3d, 2
+    mov             [r6], r3w
+
+    ; filter left
+    add             r6, r1
+    movu            m2, [r2 + 2]
+    paddw           m2, m1
+    psraw           m2, 2
+
+    pextrw          [r6], m2, 0
+    pextrw          [r6 + r1], m2, 1
+    lea             r6, [r6 + r1 * 2]
+    pextrw          [r6], m2, 2
+    pextrw          [r6 + r1], m2, 3
+    lea             r6, [r6 + r1 * 2]
+    pextrw          [r6], m2, 4
+    pextrw          [r6 + r1], m2, 5
+    lea             r6, [r6 + r1 * 2]
+    pextrw          [r6], m2, 6
+    pextrw          [r6 + r1], m2, 7
+
+    lea             r6, [r6 + r1 * 2]
+    movu            m3, [r2 + 18]
+    paddw           m3, m1
+    psraw           m3, 2
+
+    pextrw          [r6], m3, 0
+    pextrw          [r6 + r1], m3, 1
+    lea             r6, [r6 + r1 * 2]
+    pextrw          [r6], m3, 2
+    pextrw          [r6 + r1], m3, 3
+    lea             r6, [r6 + r1 * 2]
+    pextrw          [r6], m3, 4
+    pextrw          [r6 + r1], m3, 5
+    lea             r6, [r6 + r1 * 2]
+    pextrw          [r6], m3, 6
+
+.end:
+    RET
+
+
+;-------------------------------------------------------------------------------------------------------
+; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
+;-------------------------------------------------------------------------------------------------------
+INIT_XMM sse4
+; DC prediction for a 32x32 block of 16-bit pixels.
+; NOTE: no DC filter path here — r4d/r5m (filter) is loaded but never used;
+; 32x32 DC blocks are not boundary-filtered.
+cglobal intra_pred_dc32, 4, 5, 6
+    mov             r4d, r5m
+    add             r2, 2
+    add             r3, 2
+    add             r1, r1                      ; stride: pixels -> bytes
+    movu            m0, [r3]
+    movu            m1, [r3 + 16]
+    movu            m2, [r3 + 32]
+    movu            m3, [r3 + 48]
+    paddw           m0, m1
+    paddw           m2, m3
+    paddw           m0, m2
+    movu            m1, [r2]
+    movu            m3, [r2 + 16]
+    movu            m4, [r2 + 32]
+    movu            m5, [r2 + 48]
+    paddw           m1, m3
+    paddw           m4, m5
+    paddw           m1, m4
+    paddw           m0, m1
+    movhlps         m1, m0
+    paddw           m0, m1
+    phaddw          m0, m0
+    pmaddwd         m0, [pw_1]
+
+    paddd           m0, [pd_32]                 ; sum = sum + 32
+    psrld           m0, 6                       ; sum = sum / 64
+    pshuflw         m0, m0, 0
+    pshufd          m0, m0, 0
+
+    lea             r2, [r1 * 3]
+    mov             r3d, 4
+.loop:
+    ; store DC 32x32: 8 rows per iteration, 4 iterations
+    movu            [r0 +  0], m0
+    movu            [r0 + 16], m0
+    movu            [r0 + 32], m0
+    movu            [r0 + 48], m0
+    movu            [r0 + r1 +  0], m0
+    movu            [r0 + r1 + 16], m0
+    movu            [r0 + r1 + 32], m0
+    movu            [r0 + r1 + 48], m0
+    movu            [r0 + r1 * 2 +  0], m0
+    movu            [r0 + r1 * 2 + 16], m0
+    movu            [r0 + r1 * 2 + 32], m0
+    movu            [r0 + r1 * 2 + 48], m0
+    movu            [r0 + r2 +  0], m0
+    movu            [r0 + r2 + 16], m0
+    movu            [r0 + r2 + 32], m0
+    movu            [r0 + r2 + 48], m0
+    lea             r0, [r0 + r1 * 4]
+    movu            [r0 +  0], m0
+    movu            [r0 + 16], m0
+    movu            [r0 + 32], m0
+    movu            [r0 + 48], m0
+    movu            [r0 + r1 +  0], m0
+    movu            [r0 + r1 + 16], m0
+    movu            [r0 + r1 + 32], m0
+    movu            [r0 + r1 + 48], m0
+    movu            [r0 + r1 * 2 +  0], m0
+    movu            [r0 + r1 * 2 + 16], m0
+    movu            [r0 + r1 * 2 + 32], m0
+    movu            [r0 + r1 * 2 + 48], m0
+    movu            [r0 + r2 +  0], m0
+    movu            [r0 + r2 + 16], m0
+    movu            [r0 + r2 + 32], m0
+    movu            [r0 + r2 + 48], m0
+    lea             r0, [r0 + r1 * 4]
+    dec             r3d
+    jnz             .loop
+    RET
+
+;-----------------------------------------------------------------------------------------------------------
+; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* left,
pixel* above, int dirMode, int filter)
+;-----------------------------------------------------------------------------------------------------------
+INIT_XMM sse4
+; Planar prediction 4x4, 16-bit pixels.
+cglobal intra_pred_planar4, 4,7,5
+    add             r2, 2
+    add             r3, 2
+    add             r1, r1
+    movh            m0, [r3]                    ; topRow[i] = above[i];
+    punpcklqdq      m0, m0
+
+    pxor            m1, m1
+    movd            m2, [r2 + 8]                ; bottomLeft = left[4]
+    movzx           r6d, word [r3 + 8]          ; topRight   = above[4];
+    pshuflw         m2, m2, 0
+    pshufd          m2, m2, 0
+
+    psubw           m2, m0                      ; bottomRow[i] = bottomLeft - topRow[i]
+    psllw           m0, 2
+    punpcklqdq      m3, m2, m1
+    psubw           m0, m3
+    paddw           m2, m2
+
+; Emit two output rows for left[%1/2] and left[%1/2 + 1].
+%macro COMP_PRED_PLANAR_2ROW 1
+    movzx           r4d, word [r2 + %1]
+    lea             r4d, [r4d * 4 + 4]
+    movd            m3, r4d
+    pshuflw         m3, m3, 0
+
+    movzx           r4d, word [r2 + %1 + 2]
+    lea             r4d, [r4d * 4 + 4]
+    movd            m4, r4d
+    pshuflw         m4, m4, 0
+    punpcklqdq      m3, m4                      ; horPred
+
+    movzx           r4d, word [r2 + %1]
+    mov             r5d, r6d
+    sub             r5d, r4d
+    movd            m4, r5d
+    pshuflw         m4, m4, 0
+
+    movzx           r4d, word [r2 + %1 + 2]
+    mov             r5d, r6d
+    sub             r5d, r4d
+    movd            m1, r5d
+    pshuflw         m1, m1, 0
+    punpcklqdq      m4, m1                      ; rightColumnN
+
+    pmullw          m4, [multi_2Row]
+    paddw           m3, m4
+    paddw           m0, m2
+    paddw           m3, m0
+    psraw           m3, 3
+
+    movh            [r0], m3
+    pshufd          m3, m3, 0xAE
+    movh            [r0 + r1], m3
+    lea             r0, [r0 + 2 * r1]
+%endmacro
+
+    COMP_PRED_PLANAR_2ROW 0
+    COMP_PRED_PLANAR_2ROW 4
+%undef COMP_PRED_PLANAR_2ROW
+    RET
+
+;-----------------------------------------------------------------------------------------------------------
+; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
+;-----------------------------------------------------------------------------------------------------------
+INIT_XMM sse4
+; Planar prediction 8x8, 16-bit pixels.
+cglobal intra_pred_planar8, 4,4,7
+    add             r2, 2
+    add             r3, 2
+    add             r1, r1
+    movu            m1, [r3]                    ; v_topRow
+    movu            m2, [r2]                    ; v_leftColumn
+
+    movd            m3, [r3 + 16]               ; topRight   = above[8];
+    movd            m4, [r2 + 16]               ; bottomLeft = left[8];
+
+    pshuflw         m3, m3, 0
+    pshufd          m3, m3, 0
+    pshuflw         m4, m4, 0
+    pshufd          m4, m4, 0
+
+    psubw           m4, m1                      ; v_bottomRow
+    psubw           m3, m2                      ; v_rightColumn
+
+    psllw           m1, 3                       ; v_topRow
+    psllw           m2, 3                       ; v_leftColumn
+
+    paddw           m6, m2, [pw_8]
+
+; Emit output row %1; broadcast lane %1 of m6/m3 then combine with multiL.
+%macro PRED_PLANAR_ROW8 1
+    %if (%1 < 4)
+        pshuflw     m5, m6, 0x55 * %1
+        pshufd      m5, m5, 0
+        pshuflw     m2, m3, 0x55 * %1
+        pshufd      m2, m2, 0
+    %else
+        pshufhw     m5, m6, 0x55 * (%1 - 4)
+        pshufd      m5, m5, 0xAA
+        pshufhw     m2, m3, 0x55 * (%1 - 4)
+        pshufd      m2, m2, 0xAA
+    %endif
+
+    pmullw          m2, [multiL]
+    paddw           m5, m2
+    paddw           m1, m4
+    paddw           m5, m1
+    psraw           m5, 4
+
+    movu            [r0], m5
+    add             r0, r1
+
+%endmacro
+
+    PRED_PLANAR_ROW8 0
+    PRED_PLANAR_ROW8 1
+    PRED_PLANAR_ROW8 2
+    PRED_PLANAR_ROW8 3
+    PRED_PLANAR_ROW8 4
+    PRED_PLANAR_ROW8 5
+    PRED_PLANAR_ROW8 6
+    PRED_PLANAR_ROW8 7
+
+%undef PRED_PLANAR_ROW8
+    RET
+
+
+;-----------------------------------------------------------------------------------------------------------
+; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
+;-----------------------------------------------------------------------------------------------------------
+INIT_XMM sse4
+%if (BIT_DEPTH == 12)
+
+; 12-bit variant: intermediates exceed 16 bits, so work in dwords.
+; x86-64 keeps the four bottomRow accumulators in xmm8-10/m7; x86 spills
+; three of them to the stack.
+%if (ARCH_X86_64 == 1)
+cglobal intra_pred_planar16, 4,7,8+3
+%define bottomRow0  m7
+%define bottomRow1  m8
+%define bottomRow2  m9
+%define bottomRow3  m10
+%else
+cglobal intra_pred_planar16, 4,7,8, 0-3*mmsize
+%define bottomRow0  [rsp + 0*mmsize]
+%define bottomRow1  [rsp + 1*mmsize]
+%define bottomRow2  [rsp + 2*mmsize]
+%define bottomRow3  m7
+%endif
+
+    add             r2, 2
+    add             r3, 2
+    add             r1, r1
+
+    pxor            m0, m0
+
+    ; bottomRow
+    movzx           r4d, word [r2 + 16*2]
+    movd            m1, r4d
+    pshufd          m1, m1, 0                   ; m1 = bottomLeft
+    movu            m2, [r3]
+    pmovzxwd        m3, m2
+    punpckhwd       m2, m0
+    psubd           m4, m1, m3
+    mova            bottomRow0, m4
+    psubd           m4, m1, m2
+    mova            bottomRow1, m4
+    movu            m2, [r3 + 16]
+    pmovzxwd        m3, m2
+    punpckhwd       m2, m0
+    psubd           m4, m1, m3
+    mova            bottomRow2, m4
+    psubd           m1, m2
+    mova            bottomRow3, m1
+
+    ; topRow
+    pmovzxwd        m0, [r3 + 0*8]
+    pslld           m0, 4
+    pmovzxwd        m1, [r3 + 1*8]
+    pslld           m1, 4
+    pmovzxwd        m2, [r3 + 2*8]
+    pslld           m2, 4
+    pmovzxwd        m3, [r3 + 3*8]
+    pslld           m3, 4
+
+    xor             r6, r6
+.loopH:
+    movzx           r4d, word [r2 + r6*2]
+    movzx           r5d, word [r3 + 16*2]       ; r5 = topRight
+    sub             r5d, r4d
+    movd            m5, r5d
+    pshuflw         m5, m5, 0
+    pmullw          m5, [multiL]
+    pmovsxwd        m5, m5                      ; m5 = rightCol
+    add             r4d, r4d
+    lea             r4d, [r4d * 8 + 16]
+    movd            m4, r4d
+    pshufd          m4, m4, 0                   ; m4 = horPred
+    paddd           m4, m5
+    pshufd          m6, m5, 0xFF                ; m6 = [4 4 4 4]
+
+    ; 0-3
+    paddd           m0, bottomRow0
+    paddd           m5, m0, m4
+    psrad           m5, 5
+    packusdw        m5, m5
+    movh            [r0 + 0*8], m5
+
+    ; 4-7
+    paddd           m4, m6
+    paddd           m1, bottomRow1
+    paddd           m5, m1, m4
+    psrad           m5, 5
+    packusdw        m5, m5
+    movh            [r0 + 1*8], m5
+
+    ; 8-11
+    paddd           m4, m6
+    paddd           m2, bottomRow2
+    paddd           m5, m2, m4
+    psrad           m5, 5
+    packusdw        m5, m5
+    movh            [r0 + 2*8], m5
+
+    ; 12-15
+    paddd           m4, m6
+    paddd           m3, bottomRow3
+    paddd           m5, m3, m4
+    psrad           m5, 5
+    packusdw        m5, m5
+    movh            [r0 + 3*8], m5
+
+    add             r0, r1
+    inc             r6d
+    cmp             r6d, 16
+    jnz             .loopH
+    RET
+
+%else ; BIT_DEPTH == 10
+; 10-bit variant: intermediates fit in 16 bits, so stay in words.
+INIT_XMM sse4
+cglobal intra_pred_planar16, 4,6,7
+    add             r2, 2
+    add             r3, 2
+    add             r1, r1
+
+    movu            m1, [r3]                    ; topRow[0-7]
+    movu            m2, [r3 + 16]               ; topRow[8-15]
+
+    movd            m3, [r2 + 32]
+    pshuflw         m3, m3, 0
+    pshufd          m3, m3, 0
+    movzx           r4d, word [r3 + 32]         ; topRight = above[16]
+
+    psubw           m4, m3, m1                  ; v_bottomRow[0]
+    psubw           m3, m2                      ; v_bottomRow[1]
+
+    psllw           m1, 4
+    psllw           m2, 4
+
+; Emit output row %1.
+%macro PRED_PLANAR_ROW16 1
+    movzx           r5d, word [r2 + %1 * 2]
+    add             r5d, r5d
+    lea             r5d, [r5d * 8 + 16]
+    movd            m5, r5d
+    pshuflw         m5, m5, 0
+    pshufd          m5, m5, 0                   ; horPred
+
+    movzx           r5d, word [r2 + %1 * 2]
+    mov             r3d, r4d
+    sub             r3d, r5d
+    movd            m0, r3d
+    pshuflw         m0, m0, 0
+    pshufd          m0, m0, 0
+
+    pmullw          m6, m0, [multiL]
+    paddw           m6, m5
+    paddw           m1, m4
+    paddw           m6, m1
+    psraw           m6, 5
+
+    pmullw          m0, m0, [multiH]
+    paddw           m5, m0
+    paddw           m2, m3
+    paddw           m5, m2
+    psraw           m5, 5
+
+    movu            [r0], m6
+    movu            [r0 + 16], m5
+    add             r0, r1
+%endmacro
+
+    PRED_PLANAR_ROW16 0
+    PRED_PLANAR_ROW16 1
+    PRED_PLANAR_ROW16 2
+    PRED_PLANAR_ROW16 3
+    PRED_PLANAR_ROW16 4
+    PRED_PLANAR_ROW16 5
+    PRED_PLANAR_ROW16 6
+    PRED_PLANAR_ROW16 7
+    PRED_PLANAR_ROW16 8
+    PRED_PLANAR_ROW16 9
+    PRED_PLANAR_ROW16 10
+    PRED_PLANAR_ROW16 11
+    PRED_PLANAR_ROW16 12
+    PRED_PLANAR_ROW16 13
+    PRED_PLANAR_ROW16 14
+    PRED_PLANAR_ROW16 15
+%undef PRED_PLANAR_ROW16
+    RET
+%endif
+
+;-----------------------------------------------------------------------------------------------------------
+; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
+;-----------------------------------------------------------------------------------------------------------
+INIT_XMM sse4
+; Planar 32x32 works in dwords for all bit depths; x86-64 keeps the eight
+; bottomRow accumulators in xmm8-15, x86 spills all eight to the stack.
+%if (ARCH_X86_64 == 1)
+cglobal intra_pred_planar32, 4,7,8+8, 0-4*mmsize
+  %define bottomRow0    m8
+  %define bottomRow1    m9
+  %define bottomRow2    m10
+  %define bottomRow3    m11
+  %define bottomRow4    m12
+  %define bottomRow5    m13
+  %define bottomRow6    m14
+  %define bottomRow7    m15
+  %define tmp0          [rsp + 0*mmsize]
+  %define tmp1          [rsp + 1*mmsize]
+  %define tmp2          [rsp + 2*mmsize]
+  %define tmp3          [rsp + 3*mmsize]
+%else
+cglobal intra_pred_planar32, 4,7,8, 0-12*mmsize
+  %define bottomRow0    [rsp + 0*mmsize]
+  %define bottomRow1    [rsp + 1*mmsize]
+  %define bottomRow2    [rsp + 2*mmsize]
+  %define bottomRow3    [rsp + 3*mmsize]
+  %define bottomRow4    [rsp + 4*mmsize]
+  %define bottomRow5    [rsp + 5*mmsize]
+  %define bottomRow6    [rsp + 6*mmsize]
+  %define bottomRow7    [rsp + 7*mmsize]
+  %define tmp0          [rsp + 8*mmsize]
+  %define tmp1          [rsp + 9*mmsize]
+  %define tmp2          [rsp + 10*mmsize]
+  %define tmp3          [rsp + 11*mmsize]
+%endif
+
+    add             r2, 2
+    add             r3, 2
+    add             r1, r1
+
+    pxor            m0, m0
+
+    ; bottomRow
+    movzx           r4d, word [r2 + 32*2]
+    movd            m1, r4d
+    pshufd          m1, m1, 0                   ; m1 = bottomLeft
+    movu            m2, [r3]
+    pmovzxwd        m3, m2
+    punpckhwd       m2, m0
+    psubd           m4, m1, m3
+    mova            bottomRow0, m4
+    psubd           m4, m1, m2
+    mova            bottomRow1, m4
+    movu            m2, [r3 + 16]
+    pmovzxwd        m3, m2
+    punpckhwd       m2, m0
+    psubd           m4, m1, m3
+    mova            bottomRow2, m4
+    psubd           m4, m1, m2
+    mova            bottomRow3, m4
+
+    movu            m2, [r3 + 32]
+    pmovzxwd        m3, m2
+    punpckhwd       m2, m0
+    psubd           m4, m1, m3
+    mova            bottomRow4, m4
+    psubd           m4, m1, m2
+    mova            bottomRow5, m4
+    movu            m2, [r3 + 48]
+    pmovzxwd        m3, m2
+    punpckhwd       m2, m0
+    psubd           m4, m1, m3
+    mova            bottomRow6, m4
+    psubd           m1, m2
+    mova            bottomRow7, m1
+
+    ; topRow
+    pmovzxwd        m0, [r3 + 0*8]
+    pslld           m0, 5
+    pmovzxwd        m1, [r3 + 1*8]
+    pslld           m1, 5
+    pmovzxwd        m2, [r3 + 2*8]
+    pslld           m2, 5
+    pmovzxwd        m3, [r3 + 3*8]
+    pslld           m3, 5
+
+    pmovzxwd        m4, [r3 + 4*8]
+    pslld           m4, 5
+    mova            tmp0, m4
+    pmovzxwd        m4, [r3 + 5*8]
+    pslld           m4, 5
+    mova            tmp1, m4
+    pmovzxwd        m4, [r3 + 6*8]
+    pslld           m4, 5
+    mova            tmp2, m4
+    pmovzxwd        m4, [r3 + 7*8]
+    pslld           m4, 5
+    mova            tmp3, m4
+
+    xor             r6, r6
+.loopH:
+    movzx           r4d, word [r2 + r6*2]
+    movzx           r5d, word [r3 + 32*2]       ; r5 = topRight
+    sub             r5d, r4d
+    movd            m5, r5d
+    pshuflw         m5, m5, 0
+    pmullw          m5, [multiL]
+    pmovsxwd        m5, m5                      ; m5 = rightCol
+    shl             r4d, 5
+    add             r4d, 32
+    movd            m4, r4d
+    pshufd          m4, m4, 0                   ; m4 = horPred
+    paddd           m4, m5
+    pshufd          m6, m5, 0xFF                ; m6 = [4 4 4 4]
+
+    ; 0-3
+    paddd           m0, bottomRow0
+    paddd           m5, m0, m4
+    psrad           m5, 6
+    packusdw        m5, m5
+    movh            [r0 + 0*8], m5
+
+    ; 4-7
+    paddd           m4, m6
+    paddd           m1, bottomRow1
+    paddd           m5, m1, m4
+    psrad           m5, 6
+    packusdw        m5, m5
+    movh            [r0 + 1*8], m5
+
+    ; 8-11
+    paddd           m4, m6
+    paddd           m2, bottomRow2
+    paddd           m5, m2, m4
+    psrad           m5, 6
+    packusdw        m5, m5
+    movh            [r0 + 2*8], m5
+
+    ; 12-15
+    paddd           m4, m6
+    paddd           m3, bottomRow3
+    paddd           m5, m3, m4
+    psrad           m5, 6
+    packusdw        m5, m5
+    movh            [r0 + 3*8], m5
+
+    ; 16-19
+    paddd           m4, m6
+    mova            m7, tmp0
+    paddd           m7, bottomRow4
+    mova            tmp0, m7
+    paddd           m7, m4
+    psrad           m7, 6
+    packusdw        m7, m7
+    movh            [r0 + 4*8], m7
+
+    ; 20-23
+    paddd           m4, m6
+    mova            m7, tmp1
+    paddd           m7, bottomRow5
+    mova            tmp1, m7
+    paddd           m7, m4
+    psrad           m7, 6
+    packusdw        m7, m7
+    movh            [r0 + 5*8], m7
+
+    ; 24-27
+    paddd           m4, m6
+    mova            m7, tmp2
+    paddd           m7, bottomRow6
+    mova            tmp2, m7
+    paddd           m7, m4
+    psrad           m7, 6
+    packusdw        m7, m7
+    movh            [r0 + 6*8], m7
+
+    ; 28-31
+    paddd           m4, m6
+    mova            m7, tmp3
+    paddd           m7, bottomRow7
+    mova            tmp3, m7
+    paddd           m7, m4
+    psrad           m7, 6
+    packusdw        m7, m7
+    movh            [r0 + 7*8], m7
+
+    add             r0, r1
+    inc             r6d
+    cmp             r6d, 32
+    jnz             .loopH
+
+    RET
+
+;-----------------------------------------------------------------------------
+; void intraPredAng(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
+;-----------------------------------------------------------------------------
+INIT_XMM ssse3
+; Angular modes 2 and 34 (pure 45-degree copy, no interpolation).
+cglobal intra_pred_ang4_2, 3,3,4
+    cmp             r4m, byte 34
+    cmove           r2, r3mp                    ; mode 34 reads from above instead of left
+    add             r1, r1
+    movu            m0, [r2 + 4]
+    movh            [r0], m0
+    palignr         m1, m0, 2
+    movh            [r0 + r1], m1
+    palignr         m2, m0, 4
+    movh            [r0 + r1 * 2], m2
+    lea             r1, [r1 * 3]
+    psrldq          m0, 6
+    movh            [r0 + r1], m0
+    RET
+
+INIT_XMM sse4
+; Angular modes 3/33. Also hosts .do_filter4x4, the shared interpolate/
+; transpose/store tail that modes 4-17 jump into (fraction weights are
+; preloaded in m0/m1/m6/m7; ZF from the entry cmp decides the transpose).
+cglobal intra_pred_ang4_3, 3,4,8
+    cmp             r4m, byte 33
+    cmove           r2, r3mp
+    lea             r3, [ang_table + 20 * 16]
+    movu            m0, [r2 + 2]                ; [8 7 6 5 4 3 2 1]
+    palignr         m1, m0, 2                   ; [x 8 7 6 5 4 3 2]
+    punpcklwd       m2, m0, m1                  ; [5 4 4 3 3 2 2 1]
+    palignr         m5, m0, 4                   ; [x x 8 7 6 5 4 3]
+    punpcklwd       m3, m1, m5                  ; [6 5 5 4 4 3 3 2]
+    palignr         m1, m0, 6                   ; [x x x 8 7 6 5 4]
+    punpcklwd       m4, m5 ,m1                  ; [7 6 6 5 5 4 4 3]
+    movhlps         m0, m0                      ; [x x x x 8 7 6 5]
+    punpcklwd       m5, m1, m0                  ; [8 7 7 6 6 5 5 4]
+
+    mova            m0, [r3 + 6 * 16]           ; [26]
+    mova            m1, [r3]                    ; [20]
+    mova            m6, [r3 - 6 * 16]           ; [14]
+    mova            m7, [r3 - 12 * 16]          ; [ 8]
+    jmp             .do_filter4x4
+
+ALIGN 16
+.do_filter4x4:
+    pmaddwd         m2, m0
+    paddd           m2, [pd_16]
+    psrld           m2, 5
+
+    pmaddwd         m3, m1
+    paddd           m3, [pd_16]
+    psrld           m3, 5
+    packusdw        m2, m3
+
+    pmaddwd         m4, m6
+    paddd           m4, [pd_16]
+    psrld           m4, 5
+
+    pmaddwd         m5, m7
+    paddd           m5, [pd_16]
+    psrld           m5, 5
+    packusdw        m4, m5
+
+    ; ZF is still from the caller's entry cmp (cmove/SSE ops leave EFLAGS
+    ; untouched): vertical-family modes skip the transpose
+    jz              .store
+
+    ; transpose 4x4
+    punpckhwd       m0, m2, m4
+    punpcklwd       m2, m4
+    punpckhwd       m4, m2, m0
+    punpcklwd       m2, m0
+
+.store:
+    add             r1, r1
+    movh            [r0], m2
+    movhps          [r0 + r1], m2
+    movh            [r0 + r1 * 2], m4
+    lea             r1, [r1 * 3]
+    movhps          [r0 + r1], m4
+    RET
+
+; Angular modes 4/32.
+cglobal intra_pred_ang4_4, 3,4,8
+    cmp             r4m, byte 32
+    cmove           r2, r3mp
+    lea             r3, [ang_table + 18 * 16]
+    movu            m0, [r2 + 2]                ; [8 7 6 5 4 3 2 1]
+    palignr         m1, m0, 2                   ; [x 8 7 6 5 4 3 2]
+    punpcklwd       m2, m0, m1                  ; [5 4 4 3 3 2 2 1]
+    palignr         m6, m0, 4                   ; [x x 8 7 6 5 4 3]
+    punpcklwd       m3, m1, m6                  ; [6 5 5 4 4 3 3 2]
+    mova            m4, m3
+    palignr         m7, m0, 6                   ; [x x x 8 7 6 5 4]
+    punpcklwd       m5, m6, m7                  ; [7 6 6 5 5 4 4 3]
+
+    mova            m0, [r3 + 3 * 16]           ; [21]
+    mova            m1, [r3 - 8 * 16]           ; [10]
+    mova            m6, [r3 + 13 * 16]          ; [31]
+    mova            m7, [r3 + 2 * 16]           ; [20]
+    jmp             mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
+
+; Angular modes 5/31.
+cglobal intra_pred_ang4_5, 3,4,8
+    cmp             r4m, byte 31
+    cmove           r2, r3mp
+    lea             r3, [ang_table + 10 * 16]
+    movu            m0, [r2 + 2]                ; [8 7 6 5 4 3 2 1]
+    palignr         m1, m0, 2                   ; [x 8 7 6 5 4 3 2]
+    punpcklwd       m2, m0, m1                  ; [5 4 4 3 3 2 2 1]
+    palignr         m6, m0, 4                   ; [x x 8 7 6 5 4 3]
+    punpcklwd       m3, m1, m6                  ; [6 5 5 4 4 3 3 2]
+    mova            m4, m3
+    palignr         m7, m0, 6                   ; [x x x 8 7 6 5 4]
+    punpcklwd       m5, m6, m7                  ; [7 6 6 5 5 4 4 3]
+
+    mova            m0, [r3 + 7 * 16]           ; [17]
+    mova            m1, [r3 - 8 * 16]           ; [ 2]
+    mova            m6, [r3 + 9 * 16]           ; [19]
+    mova            m7, [r3 - 6 * 16]           ; [ 4]
+    jmp             mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
+
+; Angular modes 6/30.
+cglobal intra_pred_ang4_6, 3,4,8
+    cmp             r4m, byte 30
+    cmove           r2, r3mp
+    lea             r3, [ang_table + 19 * 16]
+    movu            m0, [r2 + 2]                ; [8 7 6 5 4 3 2 1]
+    palignr         m1, m0, 2                   ; [x 8 7 6 5 4 3 2]
+    punpcklwd       m2, m0, m1                  ; [5 4 4 3 3 2 2 1]
+    mova            m3, m2
+    palignr         m6, m0, 4                   ; [x x 8 7 6 5 4 3]
+    punpcklwd       m4, m1, m6                  ; [6 5 5 4 4 3 3 2]
+    mova            m5, m4
+
+    mova            m0, [r3 - 6 * 16]           ; [13]
+    mova            m1, [r3 + 7 * 16]           ; [26]
+    mova            m6, [r3 - 12 * 16]          ; [ 7]
+    mova            m7, [r3 + 1 * 16]           ; [20]
+    jmp             mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
+
+; Angular modes 7/29.
+cglobal intra_pred_ang4_7, 3,4,8
+    cmp             r4m, byte 29
+    cmove           r2, r3mp
+    lea             r3, [ang_table + 20 * 16]
+    movu            m0, [r2 + 2]                ; [8 7 6 5 4 3 2 1]
+    palignr         m1, m0, 2                   ; [x 8 7 6 5 4 3 2]
+    punpcklwd       m2, m0, m1                  ; [5 4 4 3 3 2 2 1]
+    mova            m3, m2
+    mova            m4, m2
+    palignr         m6, m0, 4                   ; [x x 8 7 6 5 4 3]
+    punpcklwd       m5, m1, m6                  ; [6 5 5 4 4 3 3 2]
+
+    mova            m0, [r3 - 11 * 16]          ; [ 9]
+    mova            m1, [r3 - 2 * 16]           ; [18]
+    mova            m6, [r3 + 7 * 16]           ; [27]
+    mova            m7, [r3 - 16 * 16]          ; [ 4]
+    jmp             mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
+
+; Angular modes 8/28.
+cglobal intra_pred_ang4_8, 3,4,8
+    cmp             r4m, byte 28
+    cmove           r2, r3mp
+    lea             r3, [ang_table + 13 * 16]
+    movu            m0, [r2 + 2]                ; [8 7 6 5 4 3 2 1]
+    palignr         m1, m0, 2                   ; [x 8 7 6 5 4 3 2]
+    punpcklwd       m2, m0, m1                  ; [5 4 4 3 3 2 2 1]
+    mova            m3, m2
+    mova            m4, m2
+    mova            m5, m2
+
+    mova            m0, [r3 - 8 * 16]           ; [ 5]
+    mova            m1, [r3 - 3 * 16]           ; [10]
+    mova            m6, [r3 + 2 * 16]           ; [15]
+    mova            m7, [r3 + 7 * 16]           ; [20]
+    jmp             mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
+
+
+; Angular modes 9/27.
+cglobal intra_pred_ang4_9, 3,4,8
+    cmp             r4m, byte 27
+    cmove           r2, r3mp
+    lea             r3, [ang_table + 4 * 16]
+    movu            m0, [r2 + 2]                ; [8 7 6 5 4 3 2 1]
+    palignr         m1, m0, 2                   ; [x 8 7 6 5 4 3 2]
+    punpcklwd       m2, m0, m1                  ; [5 4 4 3 3 2 2 1]
+    mova            m3, m2
+    mova            m4, m2
+    mova            m5, m2
+
+    mova            m0, [r3 - 2 * 16]           ; [ 2]
+    mova            m1, [r3 - 0 * 16]           ; [ 4]
+    mova            m6, [r3 + 2 * 16]           ; [ 6]
+    mova            m7, [r3 + 4 * 16]           ; [ 8]
+    jmp             mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
+
+; Mode 10 (pure horizontal); optional edge filter clips with pw_1023
+; (10-bit max — NOTE(review): this path presumably assumes 10-bit builds).
+cglobal intra_pred_ang4_10, 3,3,4
+    movh            m0, [r2 + 2]                ; [4 3 2 1]
+    pshufb          m2, m0, [pb_unpackwq2]      ; [4 4 4 4 3 3 3 3]
+    pshufb          m0, [pb_unpackwq1]          ; [2 2 2 2 1 1 1 1]
+    add             r1, r1
+    movhlps         m1, m0                      ; [2 2 2 2]
+    movhlps         m3, m2                      ; [4 4 4 4]
+    movh            [r0 + r1], m1
+    movh            [r0 + r1 * 2], m2
+    lea             r1, [r1 * 3]
+    movh            [r0 + r1], m3
+
+    cmp             r5m, byte 0
+    jz              .quit
+
+    ; filter first row: left[y] + ((above[x] - above[-1]) >> 1)
+    mov             r2, r3mp
+    movu            m1, [r2]                    ; [7 6 5 4 3 2 1 0]
+    pshufb          m2, m1, [pb_unpackwq1]      ; [0 0 0 0]
+    palignr         m1, m1, 2                   ; [4 3 2 1]
+    psubw           m1, m2
+    psraw           m1, 1
+    paddw           m0, m1
+    pxor            m1, m1
+    pmaxsw          m0, m1
+    pminsw          m0, [pw_1023]
+
+.quit:
+    movh            [r0], m0
+    RET
+
+; Mode 26 (pure vertical); optional edge filter on the first column.
+cglobal intra_pred_ang4_26, 4,4,3
+    movh            m0, [r3 + 2]                ; [8 7 6 5 4 3 2 1]
+    add             r1, r1
+    ; store
+    movh            [r0], m0
+    movh            [r0 + r1], m0
+    movh            [r0 + r1 * 2], m0
+    lea             r3, [r1 * 3]
+    movh            [r0 + r3], m0
+
+    ; filter
+    cmp             r5m, byte 0
+    jz              .quit
+
+    pshufb          m0, [pb_unpackwq1]          ; [2 2 2 2 1 1 1 1]
+    movu            m1, [r2]                    ; [7 6 5 4 3 2 1 0]
+    pshufb          m2, m1, [pb_unpackwq1]      ; [0 0 0 0]
+    palignr         m1, m1, 2                   ; [4 3 2 1]
+    psubw           m1, m2
+    psraw           m1, 1
+    paddw           m0, m1
+    pxor            m1, m1
+    pmaxsw          m0, m1
+    pminsw          m0, [pw_1023]
+
+    pextrw          [r0], m0, 0
+    pextrw          [r0 + r1], m0, 1
+    pextrw          [r0 + r1 * 2], m0, 2
+    pextrw          [r0 + r3], m0, 3
+
+.quit:
+    RET
+
+; Angular modes 11/25.
+cglobal intra_pred_ang4_11, 3,4,8
+    cmp             r4m, byte 25
+    cmove           r2, r3mp
+    lea             r3, [ang_table + 24 * 16]
+    movu            m2, [r2]                    ; [x x x 4 3 2 1 0]
+    palignr         m1, m2, 2                   ; [x x x x 4 3 2 1]
+    punpcklwd       m2, m1                      ; [4 3 3 2 2 1 1 0]
+    mova            m3, m2
+    mova            m4, m2
+    mova            m5, m2
+
+    mova            m0, [r3 + 6 * 16]           ; [24]
+    mova            m1, [r3 + 4 * 16]           ; [26]
+    mova            m6, [r3 + 2 * 16]           ; [28]
+    mova            m7, [r3 + 0 * 16]           ; [30]
+    jmp             mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
+
+
+; Angular modes 12/24.
+cglobal intra_pred_ang4_12, 3,4,8
+    cmp             r4m, byte 24
+    cmove           r2, r3mp
+    lea             r3, [ang_table + 20 * 16]
+    movu            m2, [r2]                    ; [x x x 4 3 2 1 0]
+    palignr         m1, m2, 2                   ; [x x x x 4 3 2 1]
+    punpcklwd       m2, m1                      ; [4 3 3 2 2 1 1 0]
+    mova            m3, m2
+    mova            m4, m2
+    mova            m5, m2
+
+    mova            m0, [r3 + 7 * 16]           ; [27]
+    mova            m1, [r3 + 2 * 16]           ; [22]
+    mova            m6, [r3 - 3 * 16]           ; [17]
+    mova            m7, [r3 - 8 * 16]           ; [12]
+    jmp             mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
+
+
+; Angular modes 13/23 (needs one projected reference sample from the other edge).
+cglobal intra_pred_ang4_13, 4,4,8
+    cmp             r4m, byte 23
+    jnz             .load
+    xchg            r2, r3
+.load:
+    movu            m5, [r2 - 2]                ; [x x 4 3 2 1 0 x]
+    palignr         m2, m5, 2                   ; [x x x 4 3 2 1 0]
+    palignr         m0, m5, 4                   ; [x x x x 4 3 2 1]
+    pinsrw          m5, [r3 + 8], 0
+    punpcklwd       m5, m2                      ; [3 2 2 1 1 0 0 x]
+    punpcklwd       m2, m0                      ; [4 3 3 2 2 1 1 0]
+    mova            m3, m2
+    mova            m4, m2
+
+    lea             r3, [ang_table + 21 * 16]
+    mova            m0, [r3 + 2 * 16]           ; [23]
+    mova            m1, [r3 - 7 * 16]           ; [14]
+    mova            m6, [r3 - 16 * 16]          ; [ 5]
+    mova            m7, [r3 + 7 * 16]           ; [28]
+    jmp             mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
+
+; Angular modes 14/22.
+cglobal intra_pred_ang4_14, 4,4,8
+    cmp             r4m, byte 22
+    jnz             .load
+    xchg            r2, r3
+.load:
+    movu            m5, [r2 - 2]                ; [x x 4 3 2 1 0 x]
+    palignr         m2, m5, 2                   ; [x x x 4 3 2 1 0]
+    palignr         m0, m5, 4                   ; [x x x x 4 3 2 1]
+    pinsrw          m5, [r3 + 4], 0
+    punpcklwd       m5, m2                      ; [3 2 2 1 1 0 0 x]
+    punpcklwd       m2, m0                      ; [4 3 3 2 2 1 1 0]
+    mova            m3, m2
+    mova            m4, m5
+
+    lea             r3, [ang_table + 19 * 16]
+    mova            m0, [r3 + 0 * 16]           ; [19]
+    mova            m1, [r3 - 13 * 16]          ; [ 6]
+    mova            m6, [r3 + 6 * 16]           ; [25]
+    mova            m7, [r3 - 7 * 16]           ; [12]
+    jmp             mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
+
+
+; Angular modes 15/21 (two projected reference samples).
+cglobal intra_pred_ang4_15, 4,4,8
+    cmp             r4m, byte 21
+    jnz             .load
+    xchg            r2, r3
+.load:
+    movu            m3, [r2 - 2]                ; [x x 4 3 2 1 0 x]
+    palignr         m2, m3, 2                   ; [x x x 4 3 2 1 0]
+    palignr         m0, m3, 4                   ; [x x x x 4 3 2 1]
+    pinsrw          m3, [r3 + 4], 0
+    pslldq          m5, m3, 2                   ; [x 4 3 2 1 0 x y]
+    pinsrw          m5, [r3 + 8], 0
+    punpcklwd       m5, m3                      ; [2 1 1 0 0 x x y]
+    punpcklwd       m3, m2                      ; [3 2 2 1 1 0 0 x]
+    punpcklwd       m2, m0                      ; [4 3 3 2 2 1 1 0]
+    mova            m4, m3
+
+    lea             r3, [ang_table + 23 * 16]
+    mova            m0, [r3 - 8 * 16]           ; [15]
+    mova            m1, [r3 + 7 * 16]           ; [30]
+    mova            m6, [r3 - 10 * 16]          ; [13]
+    mova            m7, [r3 + 5 * 16]           ; [28]
+    jmp             mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
+
+
+; Angular modes 16/20.
+cglobal intra_pred_ang4_16, 4,4,8
+    cmp             r4m, byte 20
+    jnz             .load
+    xchg            r2, r3
+.load:
+    movu            m3, [r2 - 2]                ; [x x 4 3 2 1 0 x]
+    palignr         m2, m3, 2                   ; [x x x 4 3 2 1 0]
+    palignr         m0, m3, 4                   ; [x x x x 4 3 2 1]
+    pinsrw          m3, [r3 + 4], 0
+    pslldq          m5, m3, 2                   ; [x 4 3 2 1 0 x y]
+    pinsrw          m5, [r3 + 6], 0
+    punpcklwd       m5, m3                      ; [2 1 1 0 0 x x y]
+    punpcklwd       m3, m2                      ; [3 2 2 1 1 0 0 x]
+    punpcklwd       m2, m0                      ; [4 3 3 2 2 1 1 0]
+    mova            m4, m3
+
+    lea             r3, [ang_table + 19 * 16]
+    mova            m0, [r3 - 8 * 16]           ; [11]
+    mova            m1, [r3 + 3 * 16]           ; [22]
+    mova            m6, [r3 - 18 * 16]          ; [ 1]
+    mova            m7, [r3 - 7 * 16]           ; [12]
+    jmp             mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
+
+; Angular modes 17/19 (three projected reference samples).
+cglobal intra_pred_ang4_17, 4,4,8
+    cmp             r4m, byte 19
+    jnz             .load
+    xchg            r2, r3
+.load:
+    movu            m6, [r2 - 2]                ; [- - 4 3 2 1 0 x]
+    palignr         m2, m6, 2                   ; [- - - 4 3 2 1 0]
+    palignr         m1, m6, 4                   ; [- - - - 4 3 2 1]
+    mova            m4, m2
+    punpcklwd       m2, m1                      ; [4 3 3 2 2 1 1 0]
+
+    pinsrw          m6, [r3 + 2], 0
+    punpcklwd       m3, m6, m4                  ; [3 2 2 1 1 0 0 x]
+
+    pslldq          m4, m6, 2                   ; [- 4 3 2 1 0 x y]
+    pinsrw          m4, [r3 + 4], 0
+    pslldq          m5, m4, 2                   ; [4 3 2 1 0 x y z]
+    pinsrw          m5, [r3 + 8], 0
+    punpcklwd       m5, m4                      ; [1 0 0 x x y y z]
+    punpcklwd       m4, m6                      ; [2 1 1 0 0 x x y]
+
+    lea             r3, [ang_table + 14 * 16]
+    mova            m0, [r3 - 8 * 16]           ; [ 6]
+    mova            m1, [r3 - 2 * 16]           ; [12]
+    mova            m6, [r3 + 4 * 16]           ; [18]
+    mova            m7, [r3 + 10 * 16]          ; [24]
+    jmp             mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
+
+
+; Mode 18 (pure diagonal copy of reversed left + above).
+cglobal intra_pred_ang4_18, 4,4,1
+    movh            m0, [r2]
+    pshufb          m0, [pw_swap]
+    movhps          m0, [r3 + 2]
+    add             r1, r1
+    lea             r2, [r1 * 3]
+    movh            [r0 + r2], m0
+    psrldq          m0, 2
+    movh            [r0 + r1 * 2], m0
+    psrldq          m0, 2
+    movh            [r0 + r1], m0
+    psrldq          m0, 2
+    movh            [r0], m0
+    RET
+
+;-----------------------------------------------------------------------------
+; void intraPredAng8(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
+;-----------------------------------------------------------------------------
+INIT_XMM ssse3
+; 8x8 angular modes 2/34 (pure copy).
+cglobal intra_pred_ang8_2, 3,4,3
+    cmp             r4m, byte 34
+    cmove           r2, r3mp
+    add             r1, r1
+    lea             r3, [r1 * 3]
+    movu            m0, [r2 + 4]
+    movu            m1, [r2 + 20]
+    movu            [r0], m0
+    palignr         m2, m1, m0, 2
+    movu            [r0 + r1], m2
+    palignr         m2, m1, m0, 4
+    movu            [r0 + r1 * 2], m2
+    palignr         m2, m1, m0, 6
+    movu            [r0 + r3], m2
+    lea             r0, [r0 + r1 * 4]
+    palignr         m2, m1, m0, 8
+    movu            [r0], m2
+    palignr         m2, m1, m0, 10
+    movu            [r0 + r1], m2
+    palignr         m2, m1, m0, 12
+    movu            [r0 + r1 * 2], m2
+    palignr         m1, m0, 14
+    movu            [r0 + r3], m1
+    RET
+
+INIT_XMM sse4
+; 8x8 angular mode 3 (left-half 8x4 then right-half 8x4, each interpolated,
+; transposed and stored). NOTE: unlike the 4x4 kernels, no mode-33 cmove
+; here; mode 33 is handled by a separate path.
+cglobal intra_pred_ang8_3, 3,5,8
+    lea             r3, [ang_table + 14 * 16]
+    add             r1, r1
+
+    movu            m0, [r2 + 2]                ; [8 7 6 5 4 3 2 1]
+    movu            m1, [r2 + 18]               ; [16 15 14 13 12 11 10 9]
+    palignr         m2, m1, m0, 2               ; [9 8 7 6 5 4 3 2]
+    psrldq          m4, m1, 2                   ; [x 16 15 14 13 12 11 10]
+
+    punpcklwd       m3, m0, m2                  ; [5 4 4 3 3 2 2 1]
+    punpckhwd       m0, m2                      ; [9 8 8 7 7 6 6 5]
+    punpcklwd       m5, m1, m4                  ; [13 12 12 11 11 10 10 9]
+    punpckhwd       m1, m4                      ; [x 16 16 15 15 14 14 13]
+
+    mova            m4, m3
+    pmaddwd         m4, [r3 + 12 * 16]          ; [26]
+    paddd           m4, [pd_16]
+    psrld           m4, 5
+    mova            m2, m0
+    pmaddwd         m2, [r3 + 12 * 16]
+    paddd           m2, [pd_16]
+    psrld           m2, 5
+    packusdw        m4, m2
+
+    palignr         m2, m0, m3, 4               ; [6 5 5 4 4 3 3 2]
+    pmaddwd         m2, [r3 + 6 * 16]           ; [20]
+    paddd           m2, [pd_16]
+    psrld           m2, 5
+    palignr         m6, m5, m0, 4               ; [10 9 9 8 8 7 7 6]
+    pmaddwd         m6, [r3 + 6 * 16]
+    paddd           m6, [pd_16]
+    psrld           m6, 5
+    packusdw        m2, m6
+
+    palignr         m6, m0, m3, 8               ; [7 6 6 5 5 4 4 3]
+    pmaddwd         m6, [r3]                    ; [14]
+    paddd           m6, [pd_16]
+    psrld           m6, 5
+    palignr         m7, m5, m0, 8               ; [11 10 10 9 9 8 8 7]
+    pmaddwd         m7, [r3]
+    paddd           m7, [pd_16]
+    psrld           m7, 5
+    packusdw        m6, m7
+
+    palignr         m7, m0, m3, 12              ; [8 7 7 6 6 5 5 4]
+    pmaddwd         m7, [r3 - 6 * 16]           ; [ 8]
+    paddd           m7, [pd_16]
+    psrld           m7, 5
+    palignr         m3, m5, m0, 12              ; [12 11 11 10 10 9 9 8]
+    pmaddwd         m3, [r3 - 6 * 16]
+    paddd           m3, [pd_16]
+    psrld           m3, 5
+    packusdw        m7, m3
+
+    punpckhwd       m3, m4, m2
+    punpcklwd       m4, m2
+    punpckhwd       m2, m6, m7
+    punpcklwd       m6, m7
+
+    punpckldq       m7, m4, m6
+    punpckhdq       m4, m6
+    punpckldq       m6, m3, m2
+    punpckhdq       m3, m2
+
+    lea             r4, [r1 * 3]
+    movh            [r0], m7
+    movhps          [r0 + r1], m7
+    movh            [r0 + r1 * 2], m4
+    movhps          [r0 + r4], m4
+    lea             r2, [r0 + r1 * 4]
+    movh            [r2], m6
+    movhps          [r2 + r1], m6
+    movh            [r2 + r1 * 2], m3
+    movhps          [r2 + r4], m3
+
+    mova            m4, m0
+    pmaddwd         m4, [r3 - 12 * 16]          ; [ 2]
+    paddd           m4, [pd_16]
+    psrld           m4, 5
+    mova            m2, m5
+    pmaddwd         m2, [r3 - 12 * 16]
+    paddd           m2, [pd_16]
+    psrld           m2, 5
+    packusdw        m4, m2
+
+    mova            m2, m0
+    pmaddwd         m2, [r3 + 14 * 16]          ; [28]
+    paddd           m2, [pd_16]
+    psrld           m2, 5
+    mova            m6, m5
+    pmaddwd         m6, [r3 + 14 * 16]
+    paddd           m6, [pd_16]
+    psrld           m6, 5
+    packusdw        m2, m6
+
+    palignr         m6, m5, m0, 4               ; [10 9 9 8 8 7 7 6]
+    pmaddwd         m6, [r3 + 8 * 16]           ; [22]
+    paddd           m6, [pd_16]
+    psrld           m6, 5
+    palignr         m7, m1, m5, 4               ; [14 13 13 12 12 11 11 10]
+    pmaddwd         m7, [r3 + 8 * 16]
+    paddd           m7, [pd_16]
+    psrld           m7, 5
+    packusdw        m6, m7
+
+    palignr         m7, m5, m0, 8               ; [11 10 10 9 9 8 8 7]
+    pmaddwd         m7, [r3 + 2 * 16]           ; [16]
+    paddd           m7, [pd_16]
+    psrld           m7, 5
+    palignr         m1, m5, 8                   ; [15 14 14 13 13 12 12 11]
+    pmaddwd         m1, [r3 + 2 * 16]
+    paddd           m1, [pd_16]
+    psrld           m1, 5
+    packusdw        m7, m1
+
+    punpckhwd       m3, m4, m2
+    punpcklwd       m4, m2
+    punpckhwd       m2, m6, m7
+    punpcklwd       m6, m7
+
+    punpckldq       m7, m4, m6
+    punpckhdq       m4, m6
+    punpckldq       m6, m3, m2
+    punpckhdq       m3, m2
+
+    movh            [r0 + 8], m7
+    movhps          [r0 + r1 + 8], m7
+    movh            [r0 + r1 * 2 + 8], m4
+    movhps          [r0 + r4 + 8], m4
+    lea             r0, [r0 + r1 * 4]
+    movh            [r0 + 8], m6
+    movhps          [r0 + r1 + 8], m6
+    movh            [r0 + r1 * 2 + 8], m3
+    movhps          [r0 + r4 + 8], m3
+
+    RET
+
+; 8x8 angular mode 4 (same split structure as ang8_3).
+cglobal intra_pred_ang8_4, 3,6,8
+    lea             r3, [ang_table + 19 * 16]
+    add             r1, r1
+
+    movu            m0, [r2 + 2]                ; [8 7 6 5 4 3 2 1]
+    movu            m1, [r2 + 18]               ; [16 15 14 13 12 11 10 9]
+    palignr         m2, m1, m0, 2               ; [9 8 7 6 5 4 3 2]
+    psrldq          m4, m1, 2                   ; [x 16 15 14 13 12 11 10]
+
+    punpcklwd       m3, m0, m2                  ; [5 4 4 3 3 2 2 1]
+    punpckhwd       m0, m2                      ; [9 8 8 7 7 6 6 5]
+    punpcklwd       m5, m1, m4                  ; [13 12 12 11 11 10 10 9]
+
+    mova            m4, m3
+    pmaddwd         m4, [r3 + 2 * 16]           ; [21]
+    paddd           m4, [pd_16]
+    psrld           m4, 5
+    mova            m2, m0
+    pmaddwd         m2, [r3 + 2 * 16]
+    paddd           m2, [pd_16]
+    psrld           m2, 5
+    packusdw        m4, m2
+
+    palignr         m2, m0, m3, 4               ; [6 5 5 4 4 3 3 2]
+    mova            m6, m2
+    pmaddwd         m2, [r3 - 9 * 16]           ; [10]
+    paddd           m2, [pd_16]
+    psrld           m2, 5
+    palignr         m1, m5, m0, 4               ; [10 9 9 8 8 7 7 6]
+    mova            m7, m1
+    pmaddwd         m1, [r3 - 9 * 16]
+    paddd           m1, [pd_16]
+    psrld           m1, 5
+    packusdw        m2, m1
+
+    pmaddwd         m6, [r3 + 12 * 16]          ; [31]
+    paddd           m6, [pd_16]
+    psrld           m6, 5
+    pmaddwd         m7, [r3 + 12 * 16]
+    paddd           m7, [pd_16]
+    psrld           m7, 5
+    packusdw        m6, m7
+
+    palignr         m7, m0, m3, 8               ; [7 6 6 5 5 4 4 3]
+    pmaddwd         m7, [r3 + 1 * 16]           ; [20]
+    paddd           m7, [pd_16]
+    psrld           m7, 5
+    palignr         m1, m5, m0, 8               ; [11 10 10 9 9 8 8 7]
+    pmaddwd         m1, [r3 + 1 * 16]
+    paddd           m1, [pd_16]
+    psrld           m1, 5
+    packusdw        m7, m1
+
+    punpckhwd       m1, m4, m2
+    punpcklwd       m4, m2
+    punpckhwd       m2, m6, m7
+    punpcklwd       m6, m7
+
+    punpckldq       m7, m4, m6
+    punpckhdq       m4, m6
+    punpckldq       m6, m1, m2
+    punpckhdq       m1, m2
+
+    lea             r4, [r1 * 3]
+    movh            [r0], m7
+    movhps          [r0 + r1], m7
+    movh            [r0 + r1 * 2], m4
+    movhps          [r0 + r4], m4
+    lea             r5, [r0 + r1 * 4]
+    movh            [r5], m6
+    movhps          [r5 + r1], m6
+    movh            [r5 + r1 * 2], m1
+    movhps          [r5 + r4], m1
+
+    palignr         m4, m0, m3, 12              ; [8 7 7 6 6 5 5 4]
+    mova            m2, m4
+    pmaddwd         m4, [r3 - 10 * 16]          ; [ 9]
+    paddd           m4, [pd_16]
+    psrld           m4, 5
+    palignr         m3, m5, m0, 12              ; [12 11 11 10 10 9 9 8]
+    mova            m6, m3
+    pmaddwd         m3, [r3 - 10 * 16]
+    paddd           m3, [pd_16]
+    psrld           m3, 5
+    packusdw        m4, m3
+
+    pmaddwd         m2, [r3 + 11 * 16]          ; [30]
+    paddd           m2, [pd_16]
+    psrld           m2, 5
+    pmaddwd         m6, [r3 + 11 * 16]
+    paddd           m6, [pd_16]
+    psrld           m6, 5
+    packusdw        m2, m6
+
+    mova            m6, m0
+    pmaddwd         m6, [r3]                    ; [19]
+    paddd           m6, [pd_16]
+    psrld           m6, 5
+    mova            m7, m5
+    pmaddwd         m7, [r3]
+    paddd           m7, [pd_16]
+    psrld           m7, 5
+    packusdw        m6, m7
+
+    movh            m1, [r2 + 26]               ; [16 15 14 13]
+    palignr         m7, m5, m0, 4               ; [10 9 9 8 8 7 7 6]
+    pmaddwd         m7, [r3 - 11 * 16]          ; [8]
+    paddd           m7, [pd_16]
+    psrld           m7, 5
+    palignr         m1, m5, 4                   ; [14 13 13 12 12 11 11 10]
+    pmaddwd         m1, [r3 - 11 * 16]
+    paddd           m1, [pd_16]
+    psrld           m1, 5
+    packusdw        m7, m1
+
+    punpckhwd       m3, m4, m2
+    punpcklwd       m4, m2
+    punpckhwd       m2, m6, m7
+    punpcklwd       m6, m7
+
+    punpckldq       m7, m4, m6
+    punpckhdq       m4, m6
+    punpckldq       m6, m3, m2
+    punpckhdq       m3, m2
+
+    movh            [r0 + 8], m7
+    movhps          [r0 + r1 + 8], m7
+    movh            [r0 + r1 * 2 + 8], m4
+    movhps          [r0 + r4 + 8], m4
+    lea             r0, [r0 + r1 * 4]
+    movh            [r0 + 8], m6
+    movhps          [r0 + r1 + 8], m6
+    movh            [r0 + r1 * 2 + 8], m3
+    movhps          [r0 + r4 + 8], m3
+
+    RET
+
+; 8x8 angular mode 5.
+cglobal intra_pred_ang8_5, 3,5,8
+    lea             r3, [ang_table + 13 * 16]
+    add             r1, r1
+
+    movu            m0, [r2 + 2]                ; [8 7 6 5 4 3 2 1]
+    movu            m1, [r2 + 18]               ; [16 15 14 13 12 11 10 9]
+    palignr         m2, m1, m0, 2               ; [9 8 7 6 5 4 3 2]
+    psrldq          m4, m1, 2                   ; [x 16 15 14 13 12 11 10]
+
+    punpcklwd       m3, m0, m2                  ; [5 4 4 3 3 2 2 1]
+    punpckhwd       m0, m2                      ; [9 8 8 7 7 6 6 5]
+    punpcklwd       m5, m1, m4                  ; [13 12 12 11 11 10 10 9]
+
+    mova            m4, m3
+    pmaddwd         m4, [r3 + 4 * 16]           ; [17]
+    paddd           m4, [pd_16]
+    psrld           m4, 5
+    mova            m2, m0
+    pmaddwd         m2, [r3 + 4 * 16]
+    paddd           m2, [pd_16]
+    psrld           m2, 5
+    packusdw        m4, m2
+
+    palignr         m2, m0, m3, 4               ; [6 5 5 4 4 3 3 2]
+    mova            m6, m2
+    pmaddwd         m2, [r3 - 11 * 16]          ; [2]
+    paddd           m2, [pd_16]
+    psrld           m2, 5
+    palignr         m1, m5, m0, 4               ; [10 9 9 8 8 7 7 6]
+    mova            m7, m1
+    pmaddwd         m1, [r3 - 11 * 16]
+    paddd           m1, [pd_16]
+    psrld           m1, 5
+    packusdw        m2, m1
+
+    pmaddwd         m6, [r3 + 6 * 16]           ; [19]
+    paddd           m6, [pd_16]
+    psrld           m6, 5
+    pmaddwd         m7, [r3 + 6 * 16]
+    paddd           m7, [pd_16]
+    psrld           m7, 5
+    packusdw        m6, m7
+
+    palignr         m7, m0, m3, 8               ; [7 6 6 5 5 4 4 3]
+    pmaddwd         m7, [r3 - 9 * 16]           ; [4]
+    paddd           m7, [pd_16]
+    psrld           m7, 5
+    palignr         m1, m5, m0, 8               ; [11 10 10 9 9 8 8 7]
+    pmaddwd         m1, [r3 - 9 * 16]
+    paddd           m1, [pd_16]
+    psrld           m1, 5
+    packusdw        m7, m1
+
+    punpckhwd       m1, m4, m2
+    punpcklwd       m4, m2
+    punpckhwd       m2, m6, m7
+    punpcklwd       m6, m7
+
+    punpckldq       m7, m4, m6
+    punpckhdq       m4, m6
+    punpckldq       m6, m1, m2
+    punpckhdq       m1, m2
+
+    lea             r4, [r1 * 3]
+    movh            [r0], m7
+    movhps          [r0 + r1], m7
+    movh            [r0 + r1 * 2], m4
+    movhps          [r0 + r4], m4
+    lea             r2, [r0 + r1 * 4]
+    movh            [r2], m6
+    movhps          [r2 + r1], m6
+    movh            [r2 + r1 * 2], m1
+    movhps          [r2 + r4], m1
+
+    palignr         m4, m0, m3, 8               ; [7 6 6 5 5 4 4 3]
+    pmaddwd         m4, [r3 + 8 * 16]           ; [21]
+    paddd           m4, [pd_16]
+    psrld           m4, 5
+    palignr         m2, m5, m0, 8               ; [11 10 10 9 9 8 8 7]
+    pmaddwd         m2, [r3 + 8 * 16]
+    paddd           m2, [pd_16]
+    psrld           m2, 5
+    packusdw        m4, m2
+
+    palignr         m2, m0, m3, 12              ; [8 7 7 6 6 5 5 4]
+    mova            m6, m2
+    pmaddwd         m2, [r3 - 7 * 16]           ; [6]
+    paddd           m2, [pd_16]
+    psrld           m2, 5
+    palignr         m1, m5, m0, 12              ; [12 11 11 10 10 9 9 8]
+    mova            m7, m1
+    pmaddwd         m1, [r3 - 7 * 16]
+    paddd           m1, [pd_16]
+    psrld           m1, 5
+    packusdw        m2, m1
+
+    pmaddwd         m6, [r3 + 10 * 16]          ; [23]
+    paddd           m6, [pd_16]
+    psrld           m6, 5
+    pmaddwd         m7, [r3 + 10 * 16]
+    paddd           m7, [pd_16]
+    psrld           m7, 5
+    packusdw        m6, m7
+
+    mova            m7, m0
+    pmaddwd         m7, [r3 - 5 * 16]           ; [8]
+    paddd           m7, [pd_16]
+    psrld           m7, 5
+    mova            m1, m5
+    pmaddwd         m1, [r3 - 5 * 16]
+    paddd           m1, [pd_16]
+    psrld           m1, 5
+    packusdw        m7, m1
+
+    punpckhwd       m3, m4, m2
+    punpcklwd       m4, m2
+    punpckhwd       m2, m6, m7
+    punpcklwd       m6, m7
+
+    punpckldq       m7, m4, m6
+    punpckhdq       m4, m6
+    punpckldq       m6, m3, m2
+    punpckhdq       m3, m2
+
+    movh            [r0 + 8], m7
+    movhps          [r0 + r1 + 8], m7
+    movh            [r0 + r1 * 2 + 8], m4
+    movhps          [r0 + r4 + 8], m4
+    lea             r0, [r0 + r1 * 4]
+    movh            [r0 + 8], m6
+    movhps          [r0 + r1 + 8], m6
+    movh            [r0 + r1 * 2 + 8], m3
+    movhps          [r0 + r4 + 8], m3
+
+    RET
+
+; 8x8 angular mode 6 (truncated at end of this chunk; continues below).
+cglobal intra_pred_ang8_6, 3,5,8
+    lea             r3, [ang_table + 14 * 16]
+    add             r1, r1
+
+    movu            m0, [r2 + 2]                ; [8 7 6 5 4 3 2 1]
+    movu            m1, [r2 + 18]               ; [16 15 14 13 12 11 10 9]
+    palignr         m2, m1, m0, 2               ; [9 8 7 6 5 4 3 2]
+    psrldq          m4, m1, 2                   ; [x 16 15 14 13 12 11 10]
+
+    punpcklwd       m3, m0, m2                  ; [5 4 4 3 3 2 2 1]
+    punpckhwd       m0, m2                      ; [9 8 8 7 7 6 6 5]
+    punpcklwd       m5, m1, m4                  ; [13 12 12 11 11 10 10 9]
+
+    mova            m4, m3
+    pmaddwd         m4, [r3 - 1 * 16]           ; [13]
+    paddd           m4, [pd_16]
+    psrld           m4, 5
+    mova            m2, m0
+    pmaddwd         m2, [r3 - 1 * 16]
+    paddd           m2, [pd_16]
+    psrld           m2, 5
+    packusdw        m4, m2
+
+    mova            m2, m3
+    pmaddwd         m2, [r3 + 12 * 16]          ; [26]
+    paddd           m2, [pd_16]
+    psrld           m2, 5
+    mova            m1, m0
+    pmaddwd         m1, [r3 + 12 * 16]
+    paddd           m1, [pd_16]
+    psrld           m1, 5
+    packusdw        m2, m1
+
+    palignr         m6, m0, m3, 4               ; [6 5 5 4 4 3 3 2]
+    mova            m7, m6
+    pmaddwd         m6, [r3 - 7 * 16]           ; [7]
+    paddd           m6, [pd_16]
+    psrld           m6, 5
+    palignr         m1, m5, m0, 4               ; [10 9 9 8 8 7 7 6]
+    pmaddwd         m1, [r3 - 7 * 16]
+    paddd           m1, [pd_16]
+    psrld           m1, 5
+    packusdw        m6, m1
+
+    pmaddwd         m7, [r3 + 6 * 16]           ; [20]
+    paddd           m7, [pd_16]
+    psrld           m7, 5
+    palignr         m1, m5, m0, 4               ; [10 9 9 8 8 7 7 6]
+    pmaddwd         m1, [r3 + 6 * 16]
+    paddd           m1, [pd_16]
+    psrld           m1, 5
+    packusdw        m7, m1
+
+    punpckhwd       m1, m4, m2
+    punpcklwd       m4, m2
+    punpckhwd       m2, m6, m7
+    punpcklwd       m6, m7
+
+    punpckldq       m7, m4, m6
+    punpckhdq       m4, m6
+    punpckldq       m6, m1, m2
+    punpckhdq       m1, m2
+
+    lea             r4, [r1 * 3]
+    movh            [r0], m7
+    movhps          [r0 + r1], m7
+    movh            [r0 + r1 * 2], m4
+    movhps          [r0 + r4], m4
+    lea             r2, [r0 + r1 * 4]
+ movh [r2], m6 + movhps [r2 + r1], m6 + movh [r2 + r1 * 2], m1 + movhps [r2 + r4], m1 + + palignr m4, m0, m3, 8 ; [7 6 6 5 5 4 4 3] + mova m6, m4 + pmaddwd m4, [r3 - 13 * 16] ; [1] + paddd m4, [pd_16] + psrld m4, 5 + palignr m2, m5, m0, 8 ; [11 10 10 9 9 8 8 7] + mova m7, m2 + pmaddwd m2, [r3 - 13 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + pmaddwd m2, m6, [r3] ; [14] + paddd m2, [pd_16] + psrld m2, 5 + pmaddwd m1, m7, [r3] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + pmaddwd m6, [r3 + 13 * 16] ; [27] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m7, [r3 + 13 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + palignr m7, m0, m3, 12 ; [8 7 7 6 6 5 5 4] + pmaddwd m7, [r3 - 6 * 16] ; [8] + paddd m7, [pd_16] + psrld m7, 5 + palignr m5, m0, 12 ; [12 11 11 10 10 9 9 8] + pmaddwd m5, [r3 - 6 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m7, m5 + + punpckhwd m3, m4, m2 + punpcklwd m4, m2 + punpckhwd m2, m6, m7 + punpcklwd m6, m7 + + punpckldq m7, m4, m6 + punpckhdq m4, m6 + punpckldq m6, m3, m2 + punpckhdq m3, m2 + + movh [r0 + 8], m7 + movhps [r0 + r1 + 8], m7 + movh [r0 + r1 * 2 + 8], m4 + movhps [r0 + r4 + 8], m4 + lea r0, [r0 + r1 * 4] + movh [r0 + 8], m6 + movhps [r0 + r1 + 8], m6 + movh [r0 + r1 * 2 + 8], m3 + movhps [r0 + r4 + 8], m3 + + RET + +cglobal intra_pred_ang8_7, 3,5,8 + lea r3, [ang_table + 18 * 16] + add r1, r1 + + movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] + movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9] + palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2] + psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10] + + punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1] + punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5] + punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9] + + mova m4, m3 + pmaddwd m4, [r3 - 9 * 16] ; [9] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 - 9 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r3] ; [18] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r3] + paddd m1, 
[pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m6, m3 + pmaddwd m6, [r3 + 9 * 16] ; [27] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r3 + 9 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + palignr m7, m0, m3, 4 ; [6 5 5 4 4 3 3 2] + pmaddwd m7, [r3 - 14 * 16] ; [4] + paddd m7, [pd_16] + psrld m7, 5 + palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + pmaddwd m1, [r3 - 14 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + punpckhwd m1, m4, m2 + punpcklwd m4, m2 + punpckhwd m2, m6, m7 + punpcklwd m6, m7 + + punpckldq m7, m4, m6 + punpckhdq m4, m6 + punpckldq m6, m1, m2 + punpckhdq m1, m2 + + lea r4, [r1 * 3] + movh [r0], m7 + movhps [r0 + r1], m7 + movh [r0 + r1 * 2], m4 + movhps [r0 + r4], m4 + lea r2, [r0 + r1 * 4] + movh [r2], m6 + movhps [r2 + r1], m6 + movh [r2 + r1 * 2], m1 + movhps [r2 + r4], m1 + + palignr m4, m0, m3, 4 ; [6 5 5 4 4 3 3 2] + mova m6, m4 + pmaddwd m4, [r3 - 5 * 16] ; [13] + paddd m4, [pd_16] + psrld m4, 5 + palignr m2, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + mova m7, m2 + pmaddwd m2, [r3 - 5 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + pmaddwd m2, m6, [r3 + 4 * 16] ; [22] + paddd m2, [pd_16] + psrld m2, 5 + pmaddwd m1, m7, [r3 + 4 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + pmaddwd m6, [r3 + 13 * 16] ; [31] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m7, [r3 + 13 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + palignr m7, m0, m3, 8 ; [7 6 6 5 5 4 4 3] + pmaddwd m7, [r3 - 10 * 16] ; [8] + paddd m7, [pd_16] + psrld m7, 5 + palignr m5, m0, 8 ; [11 10 10 9 9 8 8 7] + pmaddwd m5, [r3 - 10 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m7, m5 + + punpckhwd m3, m4, m2 + punpcklwd m4, m2 + punpckhwd m2, m6, m7 + punpcklwd m6, m7 + + punpckldq m7, m4, m6 + punpckhdq m4, m6 + punpckldq m6, m3, m2 + punpckhdq m3, m2 + + movh [r0 + 8], m7 + movhps [r0 + r1 + 8], m7 + movh [r0 + r1 * 2 + 8], m4 + movhps [r0 + r4 + 8], m4 + lea r0, [r0 + r1 * 4] + movh [r0 + 8], m6 + movhps 
[r0 + r1 + 8], m6 + movh [r0 + r1 * 2 + 8], m3 + movhps [r0 + r4 + 8], m3 + + RET + +cglobal intra_pred_ang8_8, 3,6,7 + lea r3, [ang_table + 17 * 16] + add r1, r1 + + movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] + movu m1, [r2 + 4] ; [9 8 7 6 5 4 3 2] + + punpcklwd m3, m0, m1 ; [5 4 4 3 3 2 2 1] + punpckhwd m0, m1 ; [9 8 8 7 7 6 6 5] + + mova m4, m3 + pmaddwd m4, [r3 - 12 * 16] ; [5] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 - 12 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r3 - 7 * 16] ; [10] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r3 - 7 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m6, m3 + pmaddwd m6, [r3 - 2 * 16] ; [15] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r3 - 2 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + mova m5, m3 + pmaddwd m5, [r3 + 3 * 16] ; [20] + paddd m5, [pd_16] + psrld m5, 5 + mova m1, m0 + pmaddwd m1, [r3 + 3 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m5, m1 + + punpckhwd m1, m4, m2 + punpcklwd m4, m2 + punpckhwd m2, m6, m5 + punpcklwd m6, m5 + + punpckldq m5, m4, m6 + punpckhdq m4, m6 + punpckldq m6, m1, m2 + punpckhdq m1, m2 + + lea r4, [r1 * 3] + movh [r0], m5 + movhps [r0 + r1], m5 + movh [r0 + r1 * 2], m4 + movhps [r0 + r4], m4 + lea r5, [r0 + r1 * 4] + movh [r5], m6 + movhps [r5 + r1], m6 + movh [r5 + r1 * 2], m1 + movhps [r5 + r4], m1 + + mova m4, m3 + pmaddwd m4, [r3 + 8 * 16] ; [25] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 + 8 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r3 + 13 * 16] ; [30] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r3 + 13 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + movh m1, [r2 + 18] ; [12 11 10 9] + + palignr m6, m0, m3, 4 ; [6 5 5 4 4 3 3 2] + mova m5, m6 + pmaddwd m6, [r3 - 14 * 16] ; [3] + paddd m6, [pd_16] + psrld m6, 5 + palignr m1, m0, 4 ; [10 9 9 8 8 7 
7 6] + mova m3, m1 + pmaddwd m1, [r3 - 14 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + pmaddwd m5, [r3 - 9 * 16] ; [8] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m3, [r3 - 9 * 16] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m5, m3 + + punpckhwd m3, m4, m2 + punpcklwd m4, m2 + punpckhwd m2, m6, m5 + punpcklwd m6, m5 + + punpckldq m5, m4, m6 + punpckhdq m4, m6 + punpckldq m6, m3, m2 + punpckhdq m3, m2 + + movh [r0 + 8], m5 + movhps [r0 + r1 + 8], m5 + movh [r0 + r1 * 2 + 8], m4 + movhps [r0 + r4 + 8], m4 + lea r0, [r0 + r1 * 4] + movh [r0 + 8], m6 + movhps [r0 + r1 + 8], m6 + movh [r0 + r1 * 2 + 8], m3 + movhps [r0 + r4 + 8], m3 + + RET + +cglobal intra_pred_ang8_9, 3,5,7 + lea r3, [ang_table + 9 * 16] + add r1, r1 + + movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] + movu m1, [r2 + 4] ; [9 8 7 6 5 4 3 2] + + punpcklwd m3, m0, m1 ; [5 4 4 3 3 2 2 1] + punpckhwd m0, m1 ; [9 8 8 7 7 6 6 5] + + mova m4, m3 + pmaddwd m4, [r3 - 7 * 16] ; [2] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 - 7 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r3 - 5 * 16] ; [4] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r3 - 5 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m6, m3 + pmaddwd m6, [r3 - 3 * 16] ; [6] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r3 - 3 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + mova m5, m3 + pmaddwd m5, [r3 - 1 * 16] ; [8] + paddd m5, [pd_16] + psrld m5, 5 + mova m1, m0 + pmaddwd m1, [r3 - 1 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m5, m1 + + punpckhwd m1, m4, m2 + punpcklwd m4, m2 + punpckhwd m2, m6, m5 + punpcklwd m6, m5 + + punpckldq m5, m4, m6 + punpckhdq m4, m6 + punpckldq m6, m1, m2 + punpckhdq m1, m2 + + lea r4, [r1 * 3] + movh [r0], m5 + movhps [r0 + r1], m5 + movh [r0 + r1 * 2], m4 + movhps [r0 + r4], m4 + lea r2, [r0 + r1 * 4] + movh [r2], m6 + movhps [r2 + r1], m6 + movh [r2 + r1 * 2], m1 + movhps 
[r2 + r4], m1 + + mova m4, m3 + pmaddwd m4, [r3 + 1 * 16] ; [10] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 + 1 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r3 + 3 * 16] ; [12] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r3 + 3 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m6, m3 + pmaddwd m6, [r3 + 5 * 16] ; [14] + paddd m6, [pd_16] + psrld m6, 5 + mova m5, m0 + pmaddwd m5, [r3 + 5 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m6, m5 + + pmaddwd m3, [r3 + 7 * 16] ; [16] + paddd m3, [pd_16] + psrld m3, 5 + pmaddwd m0, [r3 + 7 * 16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m3, m0 + + punpckhwd m5, m4, m2 + punpcklwd m4, m2 + punpckhwd m2, m6, m3 + punpcklwd m6, m3 + + punpckldq m3, m4, m6 + punpckhdq m4, m6 + punpckldq m6, m5, m2 + punpckhdq m5, m2 + + movh [r0 + 8], m3 + movhps [r0 + r1 + 8], m3 + movh [r0 + r1 * 2 + 8], m4 + movhps [r0 + r4 + 8], m4 + lea r0, [r0 + r1 * 4] + movh [r0 + 8], m6 + movhps [r0 + r1 + 8], m6 + movh [r0 + r1 * 2 + 8], m5 + movhps [r0 + r4 + 8], m5 + + RET + +cglobal intra_pred_ang8_10, 4,5,3 + movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1] + pshufb m0, m1, [pw_unpackwdq] ; [1 1 1 1 1 1 1 1] + add r1, r1 + lea r4, [r1 * 3] + + psrldq m1, 2 + pshufb m2, m1, [pw_unpackwdq] ; [2 2 2 2 2 2 2 2] + movu [r0 + r1], m2 + psrldq m1, 2 + pshufb m2, m1, [pw_unpackwdq] ; [3 3 3 3 3 3 3 3] + movu [r0 + r1 * 2], m2 + psrldq m1, 2 + pshufb m2, m1, [pw_unpackwdq] ; [4 4 4 4 4 4 4 4] + movu [r0 + r4], m2 + + lea r2, [r0 + r1 *4] + psrldq m1, 2 + pshufb m2, m1, [pw_unpackwdq] ; [5 5 5 5 5 5 5 5] + movu [r2], m2 + psrldq m1, 2 + pshufb m2, m1, [pw_unpackwdq] ; [6 6 6 6 6 6 6 6] + movu [r2 + r1], m2 + psrldq m1, 2 + pshufb m2, m1, [pw_unpackwdq] ; [7 7 7 7 7 7 7 7] + movu [r2 + r1 * 2], m2 + psrldq m1, 2 + pshufb m2, m1, [pw_unpackwdq] ; [8 8 8 8 8 8 8 8] + movu [r2 + r4], m2 + + cmp r5m, byte 0 + jz .quit + + ; filter + + movh m1, [r3] ; [3 2 1 0] + 
pshufb m2, m1, [pw_unpackwdq] ; [0 0 0 0 0 0 0 0] + movu m1, [r3 + 2] ; [8 7 6 5 4 3 2 1] + psubw m1, m2 + psraw m1, 1 + paddw m0, m1 + pxor m1, m1 + pmaxsw m0, m1 + pminsw m0, [pw_1023] + +.quit: + movu [r0], m0 + RET + +cglobal intra_pred_ang8_11, 3,5,7 + lea r3, [ang_table + 23 * 16] + add r1, r1 + + movu m0, [r2] ; [7 6 5 4 3 2 1 0] + movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1] + + punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] + punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] + + mova m4, m3 + pmaddwd m4, [r3 + 7 * 16] ; [30] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 + 7 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r3 + 5 * 16] ; [28] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r3 + 5 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m6, m3 + pmaddwd m6, [r3 + 3 * 16] ; [26] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r3 + 3 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + mova m5, m3 + pmaddwd m5, [r3 + 1 * 16] ; [24] + paddd m5, [pd_16] + psrld m5, 5 + mova m1, m0 + pmaddwd m1, [r3 + 1 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m5, m1 + + punpckhwd m1, m4, m2 + punpcklwd m4, m2 + punpckhwd m2, m6, m5 + punpcklwd m6, m5 + + punpckldq m5, m4, m6 + punpckhdq m4, m6 + punpckldq m6, m1, m2 + punpckhdq m1, m2 + + lea r4, [r1 * 3] + movh [r0], m5 + movhps [r0 + r1], m5 + movh [r0 + r1 * 2], m4 + movhps [r0 + r4], m4 + lea r2, [r0 + r1 * 4] + movh [r2], m6 + movhps [r2 + r1], m6 + movh [r2 + r1 * 2], m1 + movhps [r2 + r4], m1 + + mova m4, m3 + pmaddwd m4, [r3 - 1 * 16] ; [22] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 - 1 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r3 - 3 * 16] ; [20] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r3 - 3 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m6, m3 + pmaddwd m6, [r3 - 5 * 16] ; [18] + paddd m6, [pd_16] 
+ psrld m6, 5 + mova m5, m0 + pmaddwd m5, [r3 - 5 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m6, m5 + + pmaddwd m3, [r3 - 7 * 16] ; [16] + paddd m3, [pd_16] + psrld m3, 5 + pmaddwd m0, [r3 - 7 * 16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m3, m0 + + punpckhwd m5, m4, m2 + punpcklwd m4, m2 + punpckhwd m2, m6, m3 + punpcklwd m6, m3 + + punpckldq m3, m4, m6 + punpckhdq m4, m6 + punpckldq m6, m5, m2 + punpckhdq m5, m2 + + movh [r0 + 8], m3 + movhps [r0 + r1 + 8], m3 + movh [r0 + r1 * 2 + 8], m4 + movhps [r0 + r4 + 8], m4 + lea r0, [r0 + r1 * 4] + movh [r0 + 8], m6 + movhps [r0 + r1 + 8], m6 + movh [r0 + r1 * 2 + 8], m5 + movhps [r0 + r4 + 8], m5 + + RET + +cglobal intra_pred_ang8_12, 4,6,7 + lea r5, [ang_table + 16 * 16] + add r1, r1 + + movu m0, [r2] ; [7 6 5 4 3 2 1 0] + movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1] + + punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] + punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] + + mova m4, m3 + pmaddwd m4, [r5 + 11 * 16] ; [27] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r5 + 11 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r5 + 6 * 16] ; [22] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r5 + 6 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m6, m3 + pmaddwd m6, [r5 + 1 * 16] ; [17] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r5 + 1 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + mova m5, m3 + pmaddwd m5, [r5 - 4 * 16] ; [12] + paddd m5, [pd_16] + psrld m5, 5 + mova m1, m0 + pmaddwd m1, [r5 - 4 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m5, m1 + + punpckhwd m1, m4, m2 + punpcklwd m4, m2 + punpckhwd m2, m6, m5 + punpcklwd m6, m5 + + punpckldq m5, m4, m6 + punpckhdq m4, m6 + punpckldq m6, m1, m2 + punpckhdq m1, m2 + + lea r4, [r1 * 3] + movh [r0], m5 + movhps [r0 + r1], m5 + movh [r0 + r1 * 2], m4 + movhps [r0 + r4], m4 + lea r2, [r0 + r1 * 4] + movh [r2], m6 + movhps [r2 + r1], m6 + movh [r2 + r1 * 2], 
m1 + movhps [r2 + r4], m1 + + mova m4, m3 + pmaddwd m4, [r5 - 9 * 16] ; [7] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r5 - 9 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r5 - 14 * 16] ; [2] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r5 - 14 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + palignr m0, m3, 12 + movu m1, [r3] + pshufb m1, [pw_ang8_12] + palignr m3, m1, 12 + + mova m6, m3 + pmaddwd m6, [r5 + 13 * 16] ; [29] + paddd m6, [pd_16] + psrld m6, 5 + mova m5, m0 + pmaddwd m5, [r5 + 13 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m6, m5 + + pmaddwd m3, [r5 + 8 * 16] ; [24] + paddd m3, [pd_16] + psrld m3, 5 + pmaddwd m0, [r5 + 8 * 16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m3, m0 + + punpckhwd m5, m4, m2 + punpcklwd m4, m2 + punpckhwd m2, m6, m3 + punpcklwd m6, m3 + + punpckldq m3, m4, m6 + punpckhdq m4, m6 + punpckldq m6, m5, m2 + punpckhdq m5, m2 + + movh [r0 + 8], m3 + movhps [r0 + r1 + 8], m3 + movh [r0 + r1 * 2 + 8], m4 + movhps [r0 + r4 + 8], m4 + lea r0, [r0 + r1 * 4] + movh [r0 + 8], m6 + movhps [r0 + r1 + 8], m6 + movh [r0 + r1 * 2 + 8], m5 + movhps [r0 + r4 + 8], m5 + + RET + +cglobal intra_pred_ang8_13, 4,6,8 + lea r5, [ang_table + 14 * 16] + add r1, r1 + + movu m0, [r2] ; [7 6 5 4 3 2 1 0] + movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1] + + punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] + punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] + + mova m4, m3 + pmaddwd m4, [r5 + 9 * 16] ; [23] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r5 + 9 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r5] ; [14] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r5] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m6, m3 + pmaddwd m6, [r5 - 9 * 16] ; [5] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r5 - 9 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + palignr m0, m3, 12 
+ movu m1, [r3] + pshufb m1, [pw_ang8_13] + palignr m3, m1, 12 + + mova m5, m3 + pmaddwd m5, [r5 + 14 * 16] ; [28] + paddd m5, [pd_16] + psrld m5, 5 + mova m7, m0 + pmaddwd m7, [r5 + 14 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m5, m7 + + punpckhwd m7, m4, m2 + punpcklwd m4, m2 + punpckhwd m2, m6, m5 + punpcklwd m6, m5 + + punpckldq m5, m4, m6 + punpckhdq m4, m6 + punpckldq m6, m7, m2 + punpckhdq m7, m2 + + lea r4, [r1 * 3] + movh [r0], m5 + movhps [r0 + r1], m5 + movh [r0 + r1 * 2], m4 + movhps [r0 + r4], m4 + lea r2, [r0 + r1 * 4] + movh [r2], m6 + movhps [r2 + r1], m6 + movh [r2 + r1 * 2], m7 + movhps [r2 + r4], m7 + + mova m4, m3 + pmaddwd m4, [r5 + 5 * 16] ; [19] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r5 + 5 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r5 - 4 * 16] ; [10] + paddd m2, [pd_16] + psrld m2, 5 + mova m5, m0 + pmaddwd m5, [r5 - 4 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m2, m5 + + mova m6, m3 + pmaddwd m6, [r5 - 13 * 16] ; [1] + paddd m6, [pd_16] + psrld m6, 5 + mova m5, m0 + pmaddwd m5, [r5 - 13 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m6, m5 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + + pmaddwd m3, [r5 + 10 * 16] ; [24] + paddd m3, [pd_16] + psrld m3, 5 + pmaddwd m0, [r5 + 10 * 16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m3, m0 + + punpckhwd m5, m4, m2 + punpcklwd m4, m2 + punpckhwd m2, m6, m3 + punpcklwd m6, m3 + + punpckldq m3, m4, m6 + punpckhdq m4, m6 + punpckldq m6, m5, m2 + punpckhdq m5, m2 + + movh [r0 + 8], m3 + movhps [r0 + r1 + 8], m3 + movh [r0 + r1 * 2 + 8], m4 + movhps [r0 + r4 + 8], m4 + lea r0, [r0 + r1 * 4] + movh [r0 + 8], m6 + movhps [r0 + r1 + 8], m6 + movh [r0 + r1 * 2 + 8], m5 + movhps [r0 + r4 + 8], m5 + + RET + +cglobal intra_pred_ang8_14, 4,6,8 + lea r5, [ang_table + 18 * 16] + add r1, r1 + + movu m0, [r2] ; [7 6 5 4 3 2 1 0] + movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1] + + punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 
0] + punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] + + mova m4, m3 + pmaddwd m4, [r5 + 1 * 16] ; [19] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r5 + 1 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r5 - 12 * 16] ; [6] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r5 - 12 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + palignr m0, m3, 12 + movu m1, [r3] + pshufb m1, [pw_ang8_14] + palignr m3, m1, 12 + + mova m6, m3 + pmaddwd m6, [r5 + 7 * 16] ; [25] + paddd m6, [pd_16] + psrld m6, 5 + mova m5, m0 + pmaddwd m5, [r5 + 7 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m6, m5 + + mova m5, m3 + pmaddwd m5, [r5 - 6 * 16] ; [12] + paddd m5, [pd_16] + psrld m5, 5 + mova m7, m0 + pmaddwd m7, [r5 - 6 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m5, m7 + + punpckhwd m7, m4, m2 + punpcklwd m4, m2 + punpckhwd m2, m6, m5 + punpcklwd m6, m5 + + punpckldq m5, m4, m6 + punpckhdq m4, m6 + punpckldq m6, m7, m2 + punpckhdq m7, m2 + + lea r4, [r1 * 3] + movh [r0], m5 + movhps [r0 + r1], m5 + movh [r0 + r1 * 2], m4 + movhps [r0 + r4], m4 + lea r2, [r0 + r1 * 4] + movh [r2], m6 + movhps [r2 + r1], m6 + movh [r2 + r1 * 2], m7 + movhps [r2 + r4], m7 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + + mova m4, m3 + pmaddwd m4, [r5 + 13 * 16] ; [31] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r5 + 13 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r5] ; [18] + paddd m2, [pd_16] + psrld m2, 5 + mova m5, m0 + pmaddwd m5, [r5] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m2, m5 + + mova m6, m3 + pmaddwd m6, [r5 - 13 * 16] ; [5] + paddd m6, [pd_16] + psrld m6, 5 + mova m5, m0 + pmaddwd m5, [r5 - 13 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m6, m5 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + + pmaddwd m3, [r5 + 6 * 16] ; [24] + paddd m3, [pd_16] + psrld m3, 5 + pmaddwd m0, [r5 + 6 * 16] + paddd m0, [pd_16] + 
psrld m0, 5 + packusdw m3, m0 + + punpckhwd m5, m4, m2 + punpcklwd m4, m2 + punpckhwd m2, m6, m3 + punpcklwd m6, m3 + + punpckldq m3, m4, m6 + punpckhdq m4, m6 + punpckldq m6, m5, m2 + punpckhdq m5, m2 + + movh [r0 + 8], m3 + movhps [r0 + r1 + 8], m3 + movh [r0 + r1 * 2 + 8], m4 + movhps [r0 + r4 + 8], m4 + lea r0, [r0 + r1 * 4] + movh [r0 + 8], m6 + movhps [r0 + r1 + 8], m6 + movh [r0 + r1 * 2 + 8], m5 + movhps [r0 + r4 + 8], m5 + + RET + +cglobal intra_pred_ang8_15, 4,6,8 + lea r5, [ang_table + 20 * 16] + add r1, r1 + + movu m0, [r2] ; [7 6 5 4 3 2 1 0] + movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1] + + punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] + punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] + + mova m4, m3 + pmaddwd m4, [r5 - 5 * 16] ; [15] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r5 - 5 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + palignr m0, m3, 12 + movu m1, [r3] + pshufb m1, [pw_ang8_15] + palignr m3, m1, 12 + + mova m2, m3 + pmaddwd m2, [r5 + 10 * 16] ; [30] + paddd m2, [pd_16] + psrld m2, 5 + mova m5, m0 + pmaddwd m5, [r5 + 10 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m2, m5 + + mova m6, m3 + pmaddwd m6, [r5 - 7 * 16] ; [13] + paddd m6, [pd_16] + psrld m6, 5 + mova m5, m0 + pmaddwd m5, [r5 - 7 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m6, m5 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + + mova m5, m3 + pmaddwd m5, [r5 + 8 * 16] ; [28] + paddd m5, [pd_16] + psrld m5, 5 + mova m7, m0 + pmaddwd m7, [r5 + 8 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m5, m7 + + punpckhwd m7, m4, m2 + punpcklwd m4, m2 + punpckhwd m2, m6, m5 + punpcklwd m6, m5 + + punpckldq m5, m4, m6 + punpckhdq m4, m6 + punpckldq m6, m7, m2 + punpckhdq m7, m2 + + lea r4, [r1 * 3] + movh [r0], m5 + movhps [r0 + r1], m5 + movh [r0 + r1 * 2], m4 + movhps [r0 + r4], m4 + lea r2, [r0 + r1 * 4] + movh [r2], m6 + movhps [r2 + r1], m6 + movh [r2 + r1 * 2], m7 + movhps [r2 + r4], m7 + + mova m4, m3 + pmaddwd m4, [r5 - 9 * 16] ; [11] + paddd 
m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r5 - 9 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + + mova m2, m3 + pmaddwd m2, [r5 + 6 * 16] ; [26] + paddd m2, [pd_16] + psrld m2, 5 + mova m5, m0 + pmaddwd m5, [r5 + 6 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m2, m5 + + mova m6, m3 + pmaddwd m6, [r5 - 11 * 16] ; [9] + paddd m6, [pd_16] + psrld m6, 5 + mova m5, m0 + pmaddwd m5, [r5 - 11 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m6, m5 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + pinsrw m3, [r3 + 16], 0 + + pmaddwd m3, [r5 + 4 * 16] ; [24] + paddd m3, [pd_16] + psrld m3, 5 + pmaddwd m0, [r5 + 4 * 16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m3, m0 + + punpckhwd m5, m4, m2 + punpcklwd m4, m2 + punpckhwd m2, m6, m3 + punpcklwd m6, m3 + + punpckldq m3, m4, m6 + punpckhdq m4, m6 + punpckldq m6, m5, m2 + punpckhdq m5, m2 + + movh [r0 + 8], m3 + movhps [r0 + r1 + 8], m3 + movh [r0 + r1 * 2 + 8], m4 + movhps [r0 + r4 + 8], m4 + lea r0, [r0 + r1 * 4] + movh [r0 + 8], m6 + movhps [r0 + r1 + 8], m6 + movh [r0 + r1 * 2 + 8], m5 + movhps [r0 + r4 + 8], m5 + + RET + +cglobal intra_pred_ang8_16, 4,6,8 + lea r5, [ang_table + 13 * 16] + add r1, r1 + + movu m0, [r2] ; [7 6 5 4 3 2 1 0] + movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1] + + punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] + punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] + + mova m4, m3 + pmaddwd m4, [r5 - 2 * 16] ; [11] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r5 - 2 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + palignr m0, m3, 12 + movu m1, [r3] + pshufb m1, [pw_ang8_16] + palignr m3, m1, 12 + + mova m2, m3 + pmaddwd m2, [r5 + 9 * 16] ; [22] + paddd m2, [pd_16] + psrld m2, 5 + mova m5, m0 + pmaddwd m5, [r5 + 9 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m2, m5 + + mova m6, m3 + pmaddwd m6, [r5 - 12 * 16] ; [1] + paddd m6, [pd_16] + psrld m6, 5 + mova m5, m0 + pmaddwd m5, [r5 - 12 * 
16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m6, m5 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + + mova m5, m3 + pmaddwd m5, [r5 - 1 * 16] ; [12] + paddd m5, [pd_16] + psrld m5, 5 + mova m7, m0 + pmaddwd m7, [r5 - 1 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m5, m7 + + punpckhwd m7, m4, m2 + punpcklwd m4, m2 + punpckhwd m2, m6, m5 + punpcklwd m6, m5 + + punpckldq m5, m4, m6 + punpckhdq m4, m6 + punpckldq m6, m7, m2 + punpckhdq m7, m2 + + lea r4, [r1 * 3] + movh [r0], m5 + movhps [r0 + r1], m5 + movh [r0 + r1 * 2], m4 + movhps [r0 + r4], m4 + lea r2, [r0 + r1 * 4] + movh [r2], m6 + movhps [r2 + r1], m6 + movh [r2 + r1 * 2], m7 + movhps [r2 + r4], m7 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + + mova m4, m3 + pmaddwd m4, [r5 + 10 * 16] ; [23] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r5 + 10 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r5 - 11 * 16] ; [2] + paddd m2, [pd_16] + psrld m2, 5 + mova m5, m0 + pmaddwd m5, [r5 - 11 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m2, m5 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + + mova m6, m3 + pmaddwd m6, [r5] ; [13] + paddd m6, [pd_16] + psrld m6, 5 + mova m5, m0 + pmaddwd m5, [r5] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m6, m5 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + pinsrw m3, [r3 + 16], 0 + + pmaddwd m3, [r5 + 11 * 16] ; [24] + paddd m3, [pd_16] + psrld m3, 5 + pmaddwd m0, [r5 + 11 * 16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m3, m0 + + punpckhwd m5, m4, m2 + punpcklwd m4, m2 + punpckhwd m2, m6, m3 + punpcklwd m6, m3 + + punpckldq m3, m4, m6 + punpckhdq m4, m6 + punpckldq m6, m5, m2 + punpckhdq m5, m2 + + movh [r0 + 8], m3 + movhps [r0 + r1 + 8], m3 + movh [r0 + r1 * 2 + 8], m4 + movhps [r0 + r4 + 8], m4 + lea r0, [r0 + r1 * 4] + movh [r0 + 8], m6 + movhps [r0 + r1 + 8], m6 + movh [r0 + r1 * 2 + 8], m5 + movhps [r0 + r4 + 8], m5 + + RET + +cglobal 
intra_pred_ang8_17, 4,6,8 + lea r5, [ang_table + 17 * 16] + add r1, r1 + + movu m0, [r2] ; [7 6 5 4 3 2 1 0] + movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1] + + punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] + punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] + + mova m4, m3 + pmaddwd m4, [r5 - 11 * 16] ; [6] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r5 - 11 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + palignr m0, m3, 12 + movu m1, [r3] + pshufb m1, [pw_ang8_17] + palignr m3, m1, 12 + + mova m2, m3 + pmaddwd m2, [r5 - 5 * 16] ; [12] + paddd m2, [pd_16] + psrld m2, 5 + mova m5, m0 + pmaddwd m5, [r5 - 5 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m2, m5 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + + mova m6, m3 + pmaddwd m6, [r5 + 1 * 16] ; [18] + paddd m6, [pd_16] + psrld m6, 5 + mova m5, m0 + pmaddwd m5, [r5 + 1 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m6, m5 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + + mova m5, m3 + pmaddwd m5, [r5 + 7 * 16] ; [24] + paddd m5, [pd_16] + psrld m5, 5 + mova m7, m0 + pmaddwd m7, [r5 + 7 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m5, m7 + + punpckhwd m7, m4, m2 + punpcklwd m4, m2 + punpckhwd m2, m6, m5 + punpcklwd m6, m5 + + punpckldq m5, m4, m6 + punpckhdq m4, m6 + punpckldq m6, m7, m2 + punpckhdq m7, m2 + + lea r4, [r1 * 3] + movh [r0], m5 + movhps [r0 + r1], m5 + movh [r0 + r1 * 2], m4 + movhps [r0 + r4], m4 + lea r2, [r0 + r1 * 4] + movh [r2], m6 + movhps [r2 + r1], m6 + movh [r2 + r1 * 2], m7 + movhps [r2 + r4], m7 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + + mova m4, m3 + pmaddwd m4, [r5 + 13 * 16] ; [30] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r5 + 13 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r5 - 13 * 16] ; [4] + paddd m2, [pd_16] + psrld m2, 5 + mova m5, m0 + pmaddwd m5, [r5 - 13 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m2, m5 + + pslldq m1, 2 + palignr m0, m3, 
12 + palignr m3, m1, 12 + + mova m6, m3 + pmaddwd m6, [r5 - 7 * 16] ; [10] + paddd m6, [pd_16] + psrld m6, 5 + mova m5, m0 + pmaddwd m5, [r5 - 7 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m6, m5 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + + pmaddwd m3, [r5 - 1 * 16] ; [16] + paddd m3, [pd_16] + psrld m3, 5 + pmaddwd m0, [r5 - 1 * 16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m3, m0 + + punpckhwd m5, m4, m2 + punpcklwd m4, m2 + punpckhwd m2, m6, m3 + punpcklwd m6, m3 + + punpckldq m3, m4, m6 + punpckhdq m4, m6 + punpckldq m6, m5, m2 + punpckhdq m5, m2 + + movh [r0 + 8], m3 + movhps [r0 + r1 + 8], m3 + movh [r0 + r1 * 2 + 8], m4 + movhps [r0 + r4 + 8], m4 + lea r0, [r0 + r1 * 4] + movh [r0 + 8], m6 + movhps [r0 + r1 + 8], m6 + movh [r0 + r1 * 2 + 8], m5 + movhps [r0 + r4 + 8], m5 + + RET + +cglobal intra_pred_ang8_18, 4,5,3 + add r1, r1 + lea r4, [r1 * 3] + movu m1, [r3] + movu m0, [r2 + 2] + pshufb m0, [pw_swap16] + movu [r0], m1 + palignr m2, m1, m0, 14 + movu [r0 + r1], m2 + palignr m2, m1, m0, 12 + movu [r0 + r1 * 2], m2 + palignr m2, m1, m0, 10 + movu [r0 + r4], m2 + lea r0, [r0 + r1 * 4] + palignr m2, m1, m0, 8 + movu [r0], m2 + palignr m2, m1, m0, 6 + movu [r0 + r1], m2 + palignr m2, m1, m0, 4 + movu [r0 + r1 * 2], m2 + palignr m1, m0, 2 + movu [r0 + r4], m1 + RET + +cglobal intra_pred_ang8_19, 4,6,8 + lea r5, [ang_table + 17 * 16] + add r1, r1 + + movu m0, [r3] ; [7 6 5 4 3 2 1 0] + movu m1, [r3 + 2] ; [8 7 6 5 4 3 2 1] + + punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] + punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] + + mova m4, m3 + pmaddwd m4, [r5 - 11 * 16] ; [6] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r5 - 11 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + palignr m0, m3, 12 + movu m1, [r2] + pshufb m1, [pw_ang8_17] + palignr m3, m1, 12 + + mova m2, m3 + pmaddwd m2, [r5 - 5 * 16] ; [12] + paddd m2, [pd_16] + psrld m2, 5 + mova m5, m0 + pmaddwd m5, [r5 - 5 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw 
m2, m5 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + + mova m6, m3 + pmaddwd m6, [r5 + 1 * 16] ; [18] + paddd m6, [pd_16] + psrld m6, 5 + mova m5, m0 + pmaddwd m5, [r5 + 1 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m6, m5 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + + mova m5, m3 + pmaddwd m5, [r5 + 7 * 16] ; [24] + paddd m5, [pd_16] + psrld m5, 5 + mova m7, m0 + pmaddwd m7, [r5 + 7 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m5, m7 + + lea r4, [r1 * 3] + movu [r0], m4 + movu [r0 + r1], m2 + movu [r0 + r1 * 2], m6 + movu [r0 + r4], m5 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + + mova m4, m3 + pmaddwd m4, [r5 + 13 * 16] ; [30] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r5 + 13 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r5 - 13 * 16] ; [4] + paddd m2, [pd_16] + psrld m2, 5 + mova m5, m0 + pmaddwd m5, [r5 - 13 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m2, m5 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + + mova m6, m3 + pmaddwd m6, [r5 - 7 * 16] ; [10] + paddd m6, [pd_16] + psrld m6, 5 + mova m5, m0 + pmaddwd m5, [r5 - 7 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m6, m5 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + + pmaddwd m3, [r5 - 1 * 16] ; [16] + paddd m3, [pd_16] + psrld m3, 5 + pmaddwd m0, [r5 - 1 * 16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m3, m0 + + lea r0, [r0 + r1 * 4] + movu [r0], m4 + movu [r0 + r1], m2 + movu [r0 + r1 * 2], m6 + movu [r0 + r4], m3 + + RET + +cglobal intra_pred_ang8_20, 4,6,8 + lea r5, [ang_table + 13 * 16] + add r1, r1 + + movu m0, [r3] ; [7 6 5 4 3 2 1 0] + movu m1, [r3 + 2] ; [8 7 6 5 4 3 2 1] + + punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] + punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] + + mova m4, m3 + pmaddwd m4, [r5 - 2 * 16] ; [11] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r5 - 2 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + 
palignr m0, m3, 12 + movu m1, [r2] + pshufb m1, [pw_ang8_16] + palignr m3, m1, 12 + + mova m2, m3 + pmaddwd m2, [r5 + 9 * 16] ; [22] + paddd m2, [pd_16] + psrld m2, 5 + mova m5, m0 + pmaddwd m5, [r5 + 9 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m2, m5 + + mova m6, m3 + pmaddwd m6, [r5 - 12 * 16] ; [1] + paddd m6, [pd_16] + psrld m6, 5 + mova m5, m0 + pmaddwd m5, [r5 - 12 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m6, m5 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + + mova m5, m3 + pmaddwd m5, [r5 - 1 * 16] ; [12] + paddd m5, [pd_16] + psrld m5, 5 + mova m7, m0 + pmaddwd m7, [r5 - 1 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m5, m7 + + lea r4, [r1 * 3] + movu [r0], m4 + movu [r0 + r1], m2 + movu [r0 + r1 * 2], m6 + movu [r0 + r4], m5 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + + mova m4, m3 + pmaddwd m4, [r5 + 10 * 16] ; [23] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r5 + 10 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r5 - 11 * 16] ; [2] + paddd m2, [pd_16] + psrld m2, 5 + mova m5, m0 + pmaddwd m5, [r5 - 11 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m2, m5 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + + mova m6, m3 + pmaddwd m6, [r5] ; [13] + paddd m6, [pd_16] + psrld m6, 5 + mova m5, m0 + pmaddwd m5, [r5] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m6, m5 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + pinsrw m3, [r2 + 16], 0 + + pmaddwd m3, [r5 + 11 * 16] ; [24] + paddd m3, [pd_16] + psrld m3, 5 + pmaddwd m0, [r5 + 11 * 16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m3, m0 + + lea r0, [r0 + r1 * 4] + movu [r0], m4 + movu [r0 + r1], m2 + movu [r0 + r1 * 2], m6 + movu [r0 + r4], m3 + + RET + +cglobal intra_pred_ang8_21, 4,6,8 + lea r5, [ang_table + 20 * 16] + add r1, r1 + + movu m0, [r3] ; [7 6 5 4 3 2 1 0] + movu m1, [r3 + 2] ; [8 7 6 5 4 3 2 1] + + punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] + punpckhwd m0, 
m1 ; [8 7 7 6 6 5 5 4] + + mova m4, m3 + pmaddwd m4, [r5 - 5 * 16] ; [15] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r5 - 5 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + palignr m0, m3, 12 + movu m1, [r2] + pshufb m1, [pw_ang8_15] + palignr m3, m1, 12 + + mova m2, m3 + pmaddwd m2, [r5 + 10 * 16] ; [30] + paddd m2, [pd_16] + psrld m2, 5 + mova m5, m0 + pmaddwd m5, [r5 + 10 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m2, m5 + + mova m6, m3 + pmaddwd m6, [r5 - 7 * 16] ; [13] + paddd m6, [pd_16] + psrld m6, 5 + mova m5, m0 + pmaddwd m5, [r5 - 7 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m6, m5 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + + mova m5, m3 + pmaddwd m5, [r5 + 8 * 16] ; [28] + paddd m5, [pd_16] + psrld m5, 5 + mova m7, m0 + pmaddwd m7, [r5 + 8 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m5, m7 + + lea r4, [r1 * 3] + movu [r0], m4 + movu [r0 + r1], m2 + movu [r0 + r1 * 2], m6 + movu [r0 + r4], m5 + + mova m4, m3 + pmaddwd m4, [r5 - 9 * 16] ; [11] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r5 - 9 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + + mova m2, m3 + pmaddwd m2, [r5 + 6 * 16] ; [26] + paddd m2, [pd_16] + psrld m2, 5 + mova m5, m0 + pmaddwd m5, [r5 + 6 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m2, m5 + + mova m6, m3 + pmaddwd m6, [r5 - 11 * 16] ; [9] + paddd m6, [pd_16] + psrld m6, 5 + mova m5, m0 + pmaddwd m5, [r5 - 11 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m6, m5 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + pinsrw m3, [r2 + 16], 0 + + pmaddwd m3, [r5 + 4 * 16] ; [24] + paddd m3, [pd_16] + psrld m3, 5 + pmaddwd m0, [r5 + 4 * 16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m3, m0 + + lea r0, [r0 + r1 * 4] + movu [r0], m4 + movu [r0 + r1], m2 + movu [r0 + r1 * 2], m6 + movu [r0 + r4], m3 + + RET + +cglobal intra_pred_ang8_22, 4,6,8 + lea r5, [ang_table + 
18 * 16] + add r1, r1 + + movu m0, [r3] ; [7 6 5 4 3 2 1 0] + movu m1, [r3 + 2] ; [8 7 6 5 4 3 2 1] + + punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] + punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] + + mova m4, m3 + pmaddwd m4, [r5 + 1 * 16] ; [19] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r5 + 1 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r5 - 12 * 16] ; [6] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r5 - 12 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + palignr m0, m3, 12 + movu m1, [r2] + pshufb m1, [pw_ang8_14] + palignr m3, m1, 12 + + mova m6, m3 + pmaddwd m6, [r5 + 7 * 16] ; [25] + paddd m6, [pd_16] + psrld m6, 5 + mova m5, m0 + pmaddwd m5, [r5 + 7 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m6, m5 + + mova m5, m3 + pmaddwd m5, [r5 - 6 * 16] ; [12] + paddd m5, [pd_16] + psrld m5, 5 + mova m7, m0 + pmaddwd m7, [r5 - 6 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m5, m7 + + lea r4, [r1 * 3] + movu [r0], m4 + movu [r0 + r1], m2 + movu [r0 + r1 * 2], m6 + movu [r0 + r4], m5 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + + mova m4, m3 + pmaddwd m4, [r5 + 13 * 16] ; [31] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r5 + 13 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r5] ; [18] + paddd m2, [pd_16] + psrld m2, 5 + mova m5, m0 + pmaddwd m5, [r5] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m2, m5 + + mova m6, m3 + pmaddwd m6, [r5 - 13 * 16] ; [5] + paddd m6, [pd_16] + psrld m6, 5 + mova m5, m0 + pmaddwd m5, [r5 - 13 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m6, m5 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + + pmaddwd m3, [r5 + 6 * 16] ; [24] + paddd m3, [pd_16] + psrld m3, 5 + pmaddwd m0, [r5 + 6 * 16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m3, m0 + + lea r0, [r0 + r1 * 4] + movu [r0], m4 + movu [r0 + r1], m2 + movu [r0 + r1 * 2], m6 + movu [r0 + r4], m3 + + RET 
+ +cglobal intra_pred_ang8_23, 4,6,8 + lea r5, [ang_table + 14 * 16] + add r1, r1 + + movu m0, [r3] ; [7 6 5 4 3 2 1 0] + movu m1, [r3 + 2] ; [8 7 6 5 4 3 2 1] + + punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] + punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] + + mova m4, m3 + pmaddwd m4, [r5 + 9 * 16] ; [23] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r5 + 9 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r5] ; [14] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r5] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m6, m3 + pmaddwd m6, [r5 - 9 * 16] ; [5] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r5 - 9 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + palignr m0, m3, 12 + movu m1, [r2] + pshufb m1, [pw_ang8_13] + palignr m3, m1, 12 + + mova m5, m3 + pmaddwd m5, [r5 + 14 * 16] ; [28] + paddd m5, [pd_16] + psrld m5, 5 + mova m7, m0 + pmaddwd m7, [r5 + 14 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m5, m7 + + lea r4, [r1 * 3] + movu [r0], m4 + movu [r0 + r1], m2 + movu [r0 + r1 * 2], m6 + movu [r0 + r4], m5 + + mova m4, m3 + pmaddwd m4, [r5 + 5 * 16] ; [19] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r5 + 5 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r5 - 4 * 16] ; [10] + paddd m2, [pd_16] + psrld m2, 5 + mova m5, m0 + pmaddwd m5, [r5 - 4 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m2, m5 + + mova m6, m3 + pmaddwd m6, [r5 - 13 * 16] ; [1] + paddd m6, [pd_16] + psrld m6, 5 + mova m5, m0 + pmaddwd m5, [r5 - 13 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m6, m5 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + + pmaddwd m3, [r5 + 10 * 16] ; [24] + paddd m3, [pd_16] + psrld m3, 5 + pmaddwd m0, [r5 + 10 * 16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m3, m0 + + lea r0, [r0 + r1 * 4] + movu [r0], m4 + movu [r0 + r1], m2 + movu [r0 + r1 * 2], m6 + movu [r0 + r4], m3 + + RET 
+ +cglobal intra_pred_ang8_24, 4,6,7 + lea r5, [ang_table + 16 * 16] + add r1, r1 + + movu m0, [r3] ; [7 6 5 4 3 2 1 0] + movu m1, [r3 + 2] ; [8 7 6 5 4 3 2 1] + + punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] + punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] + + mova m4, m3 + pmaddwd m4, [r5 + 11 * 16] ; [27] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r5 + 11 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r5 + 6 * 16] ; [22] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r5 + 6 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m6, m3 + pmaddwd m6, [r5 + 1 * 16] ; [17] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r5 + 1 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + mova m5, m3 + pmaddwd m5, [r5 - 4 * 16] ; [12] + paddd m5, [pd_16] + psrld m5, 5 + mova m1, m0 + pmaddwd m1, [r5 - 4 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m5, m1 + + lea r4, [r1 * 3] + movu [r0], m4 + movu [r0 + r1], m2 + movu [r0 + r1 * 2], m6 + movu [r0 + r4], m5 + + mova m4, m3 + pmaddwd m4, [r5 - 9 * 16] ; [7] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r5 - 9 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r5 - 14 * 16] ; [2] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r5 - 14 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + palignr m0, m3, 12 + movu m1, [r2] + pshufb m1, [pw_ang8_12] + palignr m3, m1, 12 + + mova m6, m3 + pmaddwd m6, [r5 + 13 * 16] ; [29] + paddd m6, [pd_16] + psrld m6, 5 + mova m5, m0 + pmaddwd m5, [r5 + 13 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m6, m5 + + pmaddwd m3, [r5 + 8 * 16] ; [24] + paddd m3, [pd_16] + psrld m3, 5 + pmaddwd m0, [r5 + 8 * 16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m3, m0 + + lea r0, [r0 + r1 * 4] + movu [r0], m4 + movu [r0 + r1], m2 + movu [r0 + r1 * 2], m6 + movu [r0 + r4], m3 + + RET + +cglobal intra_pred_ang8_25, 3,5,7 + 
mov r2, r3mp + lea r3, [ang_table + 23 * 16] + add r1, r1 + + movu m0, [r2] ; [7 6 5 4 3 2 1 0] + movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1] + + punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] + punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] + + mova m4, m3 + pmaddwd m4, [r3 + 7 * 16] ; [30] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 + 7 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r3 + 5 * 16] ; [28] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r3 + 5 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m6, m3 + pmaddwd m6, [r3 + 3 * 16] ; [26] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r3 + 3 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + mova m5, m3 + pmaddwd m5, [r3 + 1 * 16] ; [24] + paddd m5, [pd_16] + psrld m5, 5 + mova m1, m0 + pmaddwd m1, [r3 + 1 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m5, m1 + + lea r4, [r1 * 3] + movu [r0], m4 + movu [r0 + r1], m2 + movu [r0 + r1 * 2], m6 + movu [r0 + r4], m5 + + mova m4, m3 + pmaddwd m4, [r3 - 1 * 16] ; [22] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 - 1 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r3 - 3 * 16] ; [20] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r3 - 3 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m6, m3 + pmaddwd m6, [r3 - 5 * 16] ; [18] + paddd m6, [pd_16] + psrld m6, 5 + mova m5, m0 + pmaddwd m5, [r3 - 5 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m6, m5 + + pmaddwd m3, [r3 - 7 * 16] ; [16] + paddd m3, [pd_16] + psrld m3, 5 + pmaddwd m0, [r3 - 7 * 16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m3, m0 + + lea r0, [r0 + r1 * 4] + movu [r0], m4 + movu [r0 + r1], m2 + movu [r0 + r1 * 2], m6 + movu [r0 + r4], m3 + + RET + +cglobal intra_pred_ang8_26, 4,5,3 + movu m0, [r3 + 2] ; [8 7 6 5 4 3 2 1] + add r1, r1 + lea r4, [r1 * 3] + + movu [r0], m0 + movu [r0 + r1], m0 + movu 
[r0 + r1 * 2], m0 + movu [r0 + r4], m0 + + lea r3, [r0 + r1 *4] + movu [r3], m0 + movu [r3 + r1], m0 + movu [r3 + r1 * 2], m0 + movu [r3 + r4], m0 + + cmp r5m, byte 0 + jz .quit + + ; filter + + pshufb m0, [pw_unpackwdq] + movh m1, [r2] ; [3 2 1 0] + pshufb m2, m1, [pw_unpackwdq] ; [0 0 0 0 0 0 0 0] + movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1] + psubw m1, m2 + psraw m1, 1 + paddw m0, m1 + pxor m1, m1 + pmaxsw m0, m1 + pminsw m0, [pw_1023] + pextrw [r0], m0, 0 + pextrw [r0 + r1], m0, 1 + pextrw [r0 + r1 * 2], m0, 2 + pextrw [r0 + r4], m0, 3 + pextrw [r3], m0, 4 + pextrw [r3 + r1], m0, 5 + pextrw [r3 + r1 * 2], m0, 6 + pextrw [r3 + r4], m0, 7 + +.quit: + RET + +cglobal intra_pred_ang8_27, 3,5,7 + mov r2, r3mp + lea r3, [ang_table + 9 * 16] + add r1, r1 + + movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] + movu m1, [r2 + 4] ; [9 8 7 6 5 4 3 2] + + punpcklwd m3, m0, m1 ; [5 4 4 3 3 2 2 1] + punpckhwd m0, m1 ; [9 8 8 7 7 6 6 5] + + mova m4, m3 + pmaddwd m4, [r3 - 7 * 16] ; [2] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 - 7 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r3 - 5 * 16] ; [4] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r3 - 5 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m6, m3 + pmaddwd m6, [r3 - 3 * 16] ; [6] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r3 - 3 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + mova m5, m3 + pmaddwd m5, [r3 - 1 * 16] ; [8] + paddd m5, [pd_16] + psrld m5, 5 + mova m1, m0 + pmaddwd m1, [r3 - 1 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m5, m1 + + lea r4, [r1 * 3] + movu [r0], m4 + movu [r0 + r1], m2 + movu [r0 + r1 * 2], m6 + movu [r0 + r4], m5 + + mova m4, m3 + pmaddwd m4, [r3 + 1 * 16] ; [10] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 + 1 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r3 + 3 * 16] ; [12] + paddd m2, [pd_16] + psrld m2, 
5 + mova m1, m0 + pmaddwd m1, [r3 + 3 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m6, m3 + pmaddwd m6, [r3 + 5 * 16] ; [14] + paddd m6, [pd_16] + psrld m6, 5 + mova m5, m0 + pmaddwd m5, [r3 + 5 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m6, m5 + + pmaddwd m3, [r3 + 7 * 16] ; [16] + paddd m3, [pd_16] + psrld m3, 5 + pmaddwd m0, [r3 + 7 * 16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m3, m0 + + lea r0, [r0 + r1 * 4] + movu [r0], m4 + movu [r0 + r1], m2 + movu [r0 + r1 * 2], m6 + movu [r0 + r4], m3 + + RET + +cglobal intra_pred_ang8_28, 3,5,7 + mov r2, r3mp + lea r3, [ang_table + 17 * 16] + add r1, r1 + + movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] + movu m1, [r2 + 4] ; [9 8 7 6 5 4 3 2] + + punpcklwd m3, m0, m1 ; [5 4 4 3 3 2 2 1] + punpckhwd m0, m1 ; [9 8 8 7 7 6 6 5] + + mova m4, m3 + pmaddwd m4, [r3 - 12 * 16] ; [5] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 - 12 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r3 - 7 * 16] ; [10] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r3 - 7 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m6, m3 + pmaddwd m6, [r3 - 2 * 16] ; [15] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r3 - 2 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + mova m5, m3 + pmaddwd m5, [r3 + 3 * 16] ; [20] + paddd m5, [pd_16] + psrld m5, 5 + mova m1, m0 + pmaddwd m1, [r3 + 3 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m5, m1 + + lea r4, [r1 * 3] + movu [r0], m4 + movu [r0 + r1], m2 + movu [r0 + r1 * 2], m6 + movu [r0 + r4], m5 + + mova m4, m3 + pmaddwd m4, [r3 + 8 * 16] ; [25] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 + 8 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r3 + 13 * 16] ; [30] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r3 + 13 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + movh m1, 
[r2 + 18] ; [16 15 14 13 12 11 10 9] + + palignr m6, m0, m3, 4 ; [6 5 5 4 4 3 3 2] + mova m5, m6 + pmaddwd m6, [r3 - 14 * 16] ; [3] + paddd m6, [pd_16] + psrld m6, 5 + palignr m1, m0, 4 ; [10 9 9 8 8 7 7 6] + mova m3, m1 + pmaddwd m1, [r3 - 14 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + pmaddwd m5, [r3 - 9 * 16] ; [8] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m3, [r3 - 9 * 16] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m5, m3 + + lea r0, [r0 + r1 * 4] + movu [r0], m4 + movu [r0 + r1], m2 + movu [r0 + r1 * 2], m6 + movu [r0 + r4], m5 + + RET + +cglobal intra_pred_ang8_29, 3,5,8 + mov r2, r3mp + lea r3, [ang_table + 18 * 16] + add r1, r1 + + movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] + movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9] + palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2] + psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10] + + punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1] + punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5] + punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9] + + mova m4, m3 + pmaddwd m4, [r3 - 9 * 16] ; [9] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 - 9 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r3] ; [18] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r3] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m6, m3 + pmaddwd m6, [r3 + 9 * 16] ; [27] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r3 + 9 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + palignr m7, m0, m3, 4 ; [6 5 5 4 4 3 3 2] + pmaddwd m7, [r3 - 14 * 16] ; [4] + paddd m7, [pd_16] + psrld m7, 5 + palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + pmaddwd m1, [r3 - 14 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r4, [r1 * 3] + movu [r0], m4 + movu [r0 + r1], m2 + movu [r0 + r1 * 2], m6 + movu [r0 + r4], m7 + + palignr m4, m0, m3, 4 ; [6 5 5 4 4 3 3 2] + mova m6, m4 + pmaddwd m4, [r3 - 5 * 16] ; [13] + paddd m4, [pd_16] + psrld m4, 5 + palignr m2, m5, m0, 4 ; 
[10 9 9 8 8 7 7 6] + mova m7, m2 + pmaddwd m2, [r3 - 5 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + pmaddwd m2, m6, [r3 + 4 * 16] ; [22] + paddd m2, [pd_16] + psrld m2, 5 + pmaddwd m1, m7, [r3 + 4 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + pmaddwd m6, [r3 + 13 * 16] ; [31] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m7, [r3 + 13 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + palignr m7, m0, m3, 8 ; [7 6 6 5 5 4 4 3] + pmaddwd m7, [r3 - 10 * 16] ; [8] + paddd m7, [pd_16] + psrld m7, 5 + palignr m5, m0, 8 ; [11 10 10 9 9 8 8 7] + pmaddwd m5, [r3 - 10 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m7, m5 + + lea r0, [r0 + r1 * 4] + movu [r0], m4 + movu [r0 + r1], m2 + movu [r0 + r1 * 2], m6 + movu [r0 + r4], m7 + + RET + +cglobal intra_pred_ang8_30, 3,5,8 + mov r2, r3mp + lea r3, [ang_table + 14 * 16] + add r1, r1 + + movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] + movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9] + palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2] + psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10] + + punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1] + punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5] + punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9] + + mova m4, m3 + pmaddwd m4, [r3 - 1 * 16] ; [13] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 - 1 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r3 + 12 * 16] ; [26] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r3 + 12 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + palignr m6, m0, m3, 4 ; [6 5 5 4 4 3 3 2] + mova m7, m6 + pmaddwd m6, [r3 - 7 * 16] ; [7] + paddd m6, [pd_16] + psrld m6, 5 + palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + pmaddwd m1, [r3 - 7 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + pmaddwd m7, [r3 + 6 * 16] ; [20] + paddd m7, [pd_16] + psrld m7, 5 + palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + pmaddwd m1, [r3 + 6 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, 
m1 + + lea r4, [r1 * 3] + movu [r0], m4 + movu [r0 + r1], m2 + movu [r0 + r1 * 2], m6 + movu [r0 + r4], m7 + + palignr m4, m0, m3, 8 ; [7 6 6 5 5 4 4 3] + mova m6, m4 + pmaddwd m4, [r3 - 13 * 16] ; [1] + paddd m4, [pd_16] + psrld m4, 5 + palignr m2, m5, m0, 8 ; [11 10 10 9 9 8 8 7] + mova m7, m2 + pmaddwd m2, [r3 - 13 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + pmaddwd m2, m6, [r3] ; [14] + paddd m2, [pd_16] + psrld m2, 5 + pmaddwd m1, m7, [r3] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + pmaddwd m6, [r3 + 13 * 16] ; [27] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m7, [r3 + 13 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + palignr m7, m0, m3, 12 ; [8 7 7 6 6 5 5 4] + pmaddwd m7, [r3 - 6 * 16] ; [8] + paddd m7, [pd_16] + psrld m7, 5 + palignr m5, m0, 12 ; [12 11 11 10 10 9 9 8] + pmaddwd m5, [r3 - 6 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m7, m5 + + lea r0, [r0 + r1 * 4] + movu [r0], m4 + movu [r0 + r1], m2 + movu [r0 + r1 * 2], m6 + movu [r0 + r4], m7 + + RET + +cglobal intra_pred_ang8_31, 3,5,8 + mov r2, r3mp + lea r3, [ang_table + 13 * 16] + add r1, r1 + + movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] + movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9] + palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2] + psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10] + + punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1] + punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5] + punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9] + + mova m4, m3 + pmaddwd m4, [r3 + 4 * 16] ; [17] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 + 4 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + palignr m2, m0, m3, 4 ; [6 5 5 4 4 3 3 2] + mova m6, m2 + pmaddwd m2, [r3 - 11 * 16] ; [2] + paddd m2, [pd_16] + psrld m2, 5 + palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + mova m7, m1 + pmaddwd m1, [r3 - 11 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + pmaddwd m6, [r3 + 6 * 16] ; [19] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m7, [r3 + 6 * 16] + paddd m7, 
[pd_16] + psrld m7, 5 + packusdw m6, m7 + + palignr m7, m0, m3, 8 ; [7 6 6 5 5 4 4 3] + pmaddwd m7, [r3 - 9 * 16] ; [4] + paddd m7, [pd_16] + psrld m7, 5 + palignr m1, m5, m0, 8 ; [11 10 10 9 9 8 8 7] + pmaddwd m1, [r3 - 9 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r4, [r1 * 3] + movu [r0], m4 + movu [r0 + r1], m2 + movu [r0 + r1 * 2], m6 + movu [r0 + r4], m7 + + palignr m4, m0, m3, 8 ; [7 6 6 5 5 4 4 3] + pmaddwd m4, [r3 + 8 * 16] ; [21] + paddd m4, [pd_16] + psrld m4, 5 + palignr m2, m5, m0, 8 ; [11 10 10 9 9 8 8 7] + pmaddwd m2, [r3 + 8 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + palignr m2, m0, m3, 12 ; [8 7 7 6 6 5 5 4] + mova m6, m2 + pmaddwd m2, [r3 - 7 * 16] ; [6] + paddd m2, [pd_16] + psrld m2, 5 + palignr m1, m5, m0, 12 ; [12 11 11 10 10 9 9 8] + mova m7, m1 + pmaddwd m1, [r3 - 7 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + pmaddwd m6, [r3 + 10 * 16] ; [23] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m7, [r3 + 10 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + mova m7, m0 + pmaddwd m7, [r3 - 5 * 16] ; [8] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m5 + pmaddwd m1, [r3 - 5 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r0, [r0 + r1 * 4] + movu [r0], m4 + movu [r0 + r1], m2 + movu [r0 + r1 * 2], m6 + movu [r0 + r4], m7 + + RET + +cglobal intra_pred_ang8_32, 3,6,8 + mov r2, r3mp + lea r3, [ang_table + 19 * 16] + add r1, r1 + + movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] + movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9] + palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2] + psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10] + + punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1] + punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5] + punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9] + + mova m4, m3 + pmaddwd m4, [r3 + 2 * 16] ; [21] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 + 2 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + palignr m2, m0, m3, 4 ; [6 5 5 4 4 3 3 2] + mova m6, m2 + 
pmaddwd m2, [r3 - 9 * 16] ; [10] + paddd m2, [pd_16] + psrld m2, 5 + palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + mova m7, m1 + pmaddwd m1, [r3 - 9 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + pmaddwd m6, [r3 + 12 * 16] ; [31] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m7, [r3 + 12 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + palignr m7, m0, m3, 8 ; [7 6 6 5 5 4 4 3] + pmaddwd m7, [r3 + 1 * 16] ; [20] + paddd m7, [pd_16] + psrld m7, 5 + palignr m1, m5, m0, 8 ; [11 10 10 9 9 8 8 7] + pmaddwd m1, [r3 + 1 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r4, [r1 * 3] + movu [r0], m4 + movu [r0 + r1], m2 + movu [r0 + r1 * 2], m6 + movu [r0 + r4], m7 + + palignr m4, m0, m3, 12 ; [8 7 7 6 6 5 5 4] + mova m2, m4 + pmaddwd m4, [r3 - 10 * 16] ; [ 9] + paddd m4, [pd_16] + psrld m4, 5 + palignr m3, m5, m0, 12 ; [12 11 11 10 10 9 9 8] + mova m6, m3 + pmaddwd m3, [r3 - 10 * 16] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m4, m3 + + pmaddwd m2, [r3 + 11 * 16] ; [30] + paddd m2, [pd_16] + psrld m2, 5 + pmaddwd m6, [r3 + 11 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m2, m6 + + mova m6, m0 + pmaddwd m6, [r3] ; [19] + paddd m6, [pd_16] + psrld m6, 5 + mova m7, m5 + pmaddwd m7, [r3] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + movh m1, [r2 + 26] ; [16 15 14 13] + palignr m7, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + pmaddwd m7, [r3 - 11 * 16] ; [8] + paddd m7, [pd_16] + psrld m7, 5 + palignr m1, m5, 4 ; [14 13 13 12 12 11 11 10] + pmaddwd m1, [r3 - 11 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r0, [r0 + r1 * 4] + movu [r0], m4 + movu [r0 + r1], m2 + movu [r0 + r1 * 2], m6 + movu [r0 + r4], m7 + + RET + +cglobal intra_pred_ang8_33, 3,5,8 + mov r2, r3mp + lea r3, [ang_table + 14 * 16] + add r1, r1 + + movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] + movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9] + palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2] + psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10] + + punpcklwd m3, m0, m2 ; 
[5 4 4 3 3 2 2 1] + punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5] + punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9] + punpckhwd m1, m4 ; [x 16 16 15 15 14 14 13] + + mova m4, m3 + pmaddwd m4, [r3 + 12 * 16] ; [26] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 + 12 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + palignr m2, m0, m3, 4 ; [6 5 5 4 4 3 3 2] + pmaddwd m2, [r3 + 6 * 16] ; [20] + paddd m2, [pd_16] + psrld m2, 5 + palignr m6, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + pmaddwd m6, [r3 + 6 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m2, m6 + + palignr m6, m0, m3, 8 ; [7 6 6 5 5 4 4 3] + pmaddwd m6, [r3] ; [14] + paddd m6, [pd_16] + psrld m6, 5 + palignr m7, m5, m0, 8 ; [11 10 10 9 9 8 8 7] + pmaddwd m7, [r3] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + palignr m7, m0, m3, 12 ; [8 7 7 6 6 5 5 4] + pmaddwd m7, [r3 - 6 * 16] ; [ 8] + paddd m7, [pd_16] + psrld m7, 5 + palignr m3, m5, m0, 12 ; [12 11 11 10 10 9 9 8] + pmaddwd m3, [r3 - 6 * 16] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m7, m3 + + lea r4, [r1 * 3] + movu [r0], m4 + movu [r0 + r1], m2 + movu [r0 + r1 * 2], m6 + movu [r0 + r4], m7 + + mova m4, m0 + pmaddwd m4, [r3 - 12 * 16] ; [ 2] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m5 + pmaddwd m2, [r3 - 12 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m0 + pmaddwd m2, [r3 + 14 * 16] ; [28] + paddd m2, [pd_16] + psrld m2, 5 + mova m6, m5 + pmaddwd m6, [r3 + 14 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m2, m6 + + palignr m6, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + pmaddwd m6, [r3 + 8 * 16] ; [22] + paddd m6, [pd_16] + psrld m6, 5 + palignr m7, m1, m5, 4 ; [14 13 13 12 12 11 11 10] + pmaddwd m7, [r3 + 8 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + palignr m7, m5, m0, 8 ; [11 10 10 9 9 8 8 7] + pmaddwd m7, [r3 + 2 * 16] ; [16] + paddd m7, [pd_16] + psrld m7, 5 + palignr m1, m5, 8 ; [15 14 14 13 13 12 12 11] + pmaddwd m1, [r3 + 2 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw 
m7, m1 + + lea r0, [r0 + r1 * 4] + movu [r0], m4 + movu [r0 + r1], m2 + movu [r0 + r1 * 2], m6 + movu [r0 + r4], m7 + + RET + +;----------------------------------------------------------------------------- +; void intraPredAng16(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +;----------------------------------------------------------------------------- +INIT_XMM ssse3 +cglobal intra_pred_ang16_2, 3,4,5 + cmp r4m, byte 34 + cmove r2, r3mp + add r1, r1 + lea r3, [r1 * 3] + movu m0, [r2 + 4] + movu m1, [r2 + 20] + movu m2, [r2 + 36] + + movu [r0], m0 + movu [r0 + 16], m1 + palignr m3, m1, m0, 2 + palignr m4, m2, m1, 2 + movu [r0 + r1], m3 + movu [r0 + r1 + 16], m4 + palignr m3, m1, m0, 4 + palignr m4, m2, m1, 4 + movu [r0 + r1 * 2], m3 + movu [r0 + r1 * 2 + 16], m4 + palignr m3, m1, m0, 6 + palignr m4, m2, m1, 6 + movu [r0 + r3], m3 + movu [r0 + r3 + 16], m4 + + lea r0, [r0 + r1 * 4] + palignr m3, m1, m0, 8 + palignr m4, m2, m1, 8 + movu [r0], m3 + movu [r0 + 16], m4 + palignr m3, m1, m0, 10 + palignr m4, m2, m1, 10 + movu [r0 + r1], m3 + movu [r0 + r1 + 16], m4 + palignr m3, m1, m0, 12 + palignr m4, m2, m1, 12 + movu [r0 + r1 * 2], m3 + movu [r0 + r1 * 2 + 16], m4 + palignr m3, m1, m0, 14 + palignr m4, m2, m1, 14 + movu [r0 + r3], m3 + movu [r0 + r3 + 16], m4 + + movu m0, [r2 + 52] + lea r0, [r0 + r1 * 4] + movu [r0], m1 + movu [r0 + 16], m2 + palignr m3, m2, m1, 2 + palignr m4, m0, m2, 2 + movu [r0 + r1], m3 + movu [r0 + r1 + 16], m4 + palignr m3, m2, m1, 4 + palignr m4, m0, m2, 4 + movu [r0 + r1 * 2], m3 + movu [r0 + r1 * 2 + 16], m4 + palignr m3, m2, m1, 6 + palignr m4, m0, m2, 6 + movu [r0 + r3], m3 + movu [r0 + r3 + 16], m4 + + lea r0, [r0 + r1 * 4] + palignr m3, m2, m1, 8 + palignr m4, m0, m2, 8 + movu [r0], m3 + movu [r0 + 16], m4 + palignr m3, m2, m1, 10 + palignr m4, m0, m2, 10 + movu [r0 + r1], m3 + movu [r0 + r1 + 16], m4 + palignr m3, m2, m1, 12 + palignr m4, m0, m2, 12 + movu [r0 + r1 * 2], m3 + movu [r0 + r1 
* 2 + 16], m4 + palignr m3, m2, m1, 14 + palignr m4, m0, m2, 14 + movu [r0 + r3], m3 + movu [r0 + r3 + 16], m4 + + RET + +%macro TRANSPOSE_STORE 6 + jnz .skip%6 + punpckhwd %5, %1, %2 + punpcklwd %1, %2 + punpckhwd %2, %3, %4 + punpcklwd %3, %4 + + punpckldq %4, %1, %3 + punpckhdq %1, %3 + punpckldq %3, %5, %2 + punpckhdq %5, %2 + + movh [r0 + %6], %4 + movhps [r0 + r1 + %6], %4 + movh [r0 + r1 * 2 + %6], %1 + movhps [r0 + r4 + %6], %1 + lea r5, [r0 + r1 * 4] + movh [r5 + %6], %3 + movhps [r5 + r1 + %6], %3 + movh [r5 + r1 * 2 + %6], %5 + movhps [r5 + r4 + %6], %5 + jmp .end%6 + +.skip%6: + movu [r5], %1 + movu [r5 + r1], %2 + movu [r5 + r1 * 2], %3 + movu [r5 + r4], %4 +.end%6: +%endmacro + +INIT_XMM sse4 +cglobal ang16_mode_3_33 + test r6d, r6d + movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] + movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9] + palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2] + psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10] + + punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1] + punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5] + punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9] + punpckhwd m1, m4 ; [x 16 16 15 15 14 14 13] + + mova m4, m3 + pmaddwd m4, [r3 + 10 * 16] ; [26] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 + 10 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + palignr m2, m0, m3, 4 ; [6 5 5 4 4 3 3 2] + pmaddwd m2, [r3 + 4 * 16] ; [20] + paddd m2, [pd_16] + psrld m2, 5 + palignr m6, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + pmaddwd m6, [r3 + 4 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m2, m6 + + palignr m6, m0, m3, 8 ; [7 6 6 5 5 4 4 3] + pmaddwd m6, [r3 - 2 * 16] ; [14] + paddd m6, [pd_16] + psrld m6, 5 + palignr m7, m5, m0, 8 ; [11 10 10 9 9 8 8 7] + pmaddwd m7, [r3 - 2 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + palignr m7, m0, m3, 12 ; [8 7 7 6 6 5 5 4] + pmaddwd m7, [r3 - 8 * 16] ; [ 8] + paddd m7, [pd_16] + psrld m7, 5 + palignr m3, m5, m0, 12 ; [12 11 11 10 10 9 9 8] + pmaddwd m3, [r3 - 8 * 16] + paddd m3, [pd_16] + psrld 
m3, 5 + packusdw m7, m3 + + mov r5, r0 + + TRANSPOSE_STORE m4, m2, m6, m7, m3, 0 + + mova m4, m0 + pmaddwd m4, [r3 - 14 * 16] ; [ 2] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m5 + pmaddwd m2, [r3 - 14 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m0 + pmaddwd m2, [r3 + 12 * 16] ; [28] + paddd m2, [pd_16] + psrld m2, 5 + mova m6, m5 + pmaddwd m6, [r3 + 12 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m2, m6 + + palignr m6, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + pmaddwd m6, [r3 + 6 * 16] ; [22] + paddd m6, [pd_16] + psrld m6, 5 + palignr m7, m1, m5, 4 ; [14 13 13 12 12 11 11 10] + pmaddwd m7, [r3 + 6 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + palignr m7, m5, m0, 8 ; [11 10 10 9 9 8 8 7] + pmaddwd m7, [r3] ; [16] + paddd m7, [pd_16] + psrld m7, 5 + palignr m1, m5, 8 ; [15 14 14 13 13 12 12 11] + pmaddwd m1, [r3] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r5, [r0 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m3, 8 + + movu m1, [r2 + 26] ; [20 19 18 17 16 15 14 13] + psrldq m4, m1, 2 ; [x 20 19 18 17 16 15 14] + + punpcklwd m3, m1, m4 ; [17 16 16 15 15 14 14 13] + punpckhwd m1, m4 ; [x 20 20 19 19 18 18 17] + + palignr m4, m5, m0, 12 ; [12 11 11 10 10 9 9 8] + pmaddwd m4, [r3 - 6 * 16] ; [10] + paddd m4, [pd_16] + psrld m4, 5 + palignr m2, m3, m5, 12 ; [15 16 15 14 14 13 13 12] + pmaddwd m2, [r3 - 6 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m5 + pmaddwd m2, [r3 - 12 * 16] ; [4] + paddd m2, [pd_16] + psrld m2, 5 + mova m6, m3 + pmaddwd m6, [r3 - 12 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m2, m6 + + mova m6, m5 + pmaddwd m6, [r3 + 14 * 16] ; [30] + paddd m6, [pd_16] + psrld m6, 5 + mova m7, m3 + pmaddwd m7, [r3 + 14 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + palignr m7, m3, m5, 4 ; [14 13 13 12 12 11 11 10] + pmaddwd m7, [r3 + 8 * 16] ; [24] + paddd m7, [pd_16] + psrld m7, 5 + palignr m0, m1, m3, 4 ; [18 17 17 16 16 15 15 14] + pmaddwd m0, [r3 + 8 * 
16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m7, m0 + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m0, 16 + + palignr m4, m3, m5, 8 ; [15 14 14 13 13 12 12 11] + pmaddwd m4, [r3 + 2 * 16] ; [18] + paddd m4, [pd_16] + psrld m4, 5 + palignr m2, m1, m3, 8 ; [19 18 18 17 17 16 16 15] + pmaddwd m2, [r3 + 2 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + palignr m2, m3, m5, 12 ; [16 15 15 14 14 13 13 12] + pmaddwd m2, [r3 - 4 * 16] ; [12] + paddd m2, [pd_16] + psrld m2, 5 + palignr m6, m1, m3, 12 ; [20 19 19 18 18 17 17 16] + pmaddwd m6, [r3 - 4 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m2, m6 + + pinsrw m1, [r2 + 42], 7 + pmaddwd m3, [r3 - 10 * 16] ; [6] + paddd m3, [pd_16] + psrld m3, 5 + pmaddwd m1, [r3 - 10 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m3, m1 + + movu m7, [r2 + 28] + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m3, m7, m0, 24 + + ret + +cglobal intra_pred_ang16_3, 3,7,8 + xor r6d, r6d + lea r3, [ang_table + 16 * 16] + add r1, r1 + lea r4, [r1 * 3] + + call ang16_mode_3_33 + + lea r2, [r2 + 16] + lea r0, [r0 + r1 * 8] + + call ang16_mode_3_33 + + RET + +cglobal intra_pred_ang16_33, 4,7,8 + xor r6d, r6d + inc r6d + mov r2, r3 + lea r3, [ang_table + 16 * 16] + add r1, r1 + lea r4, [r1 * 3] + + call ang16_mode_3_33 + + lea r2, [r2 + 16] + lea r0, [r0 + 16] + + call ang16_mode_3_33 + + RET + +cglobal ang16_mode_4_32 + test r6d, r6d + movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] + movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9] + palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2] + psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10] + + punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1] + punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5] + punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9] + + mova m4, m3 + pmaddwd m4, [r3 + 3 * 16] ; [21] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 + 3 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + palignr m2, m0, m3, 4 ; [6 5 5 4 4 3 3 2] + mova m6, m2 + pmaddwd m2, [r3 - 8 * 16] ; 
[10] + paddd m2, [pd_16] + psrld m2, 5 + palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + mova m7, m1 + pmaddwd m1, [r3 - 8 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + pmaddwd m6, [r3 + 13 * 16] ; [31] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m7, [r3 + 13 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + palignr m7, m0, m3, 8 ; [7 6 6 5 5 4 4 3] + pmaddwd m7, [r3 + 2 * 16] ; [20] + paddd m7, [pd_16] + psrld m7, 5 + palignr m1, m5, m0, 8 ; [11 10 10 9 9 8 8 7] + pmaddwd m1, [r3 + 2 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + mov r5, r0 + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 0 + + palignr m4, m0, m3, 12 ; [8 7 7 6 6 5 5 4] + mova m2, m4 + pmaddwd m4, [r3 - 9 * 16] ; [9] + paddd m4, [pd_16] + psrld m4, 5 + palignr m7, m5, m0, 12 ; [12 11 11 10 10 9 9 8] + mova m6, m7 + pmaddwd m7, [r3 - 9 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m4, m7 + + pmaddwd m2, [r3 + 12 * 16] ; [30] + paddd m2, [pd_16] + psrld m2, 5 + pmaddwd m6, [r3 + 12 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m2, m6 + + mova m6, m0 + pmaddwd m6, [r3 + 1 * 16] ; [19] + paddd m6, [pd_16] + psrld m6, 5 + mova m7, m5 + pmaddwd m7, [r3 + 1 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + movu m1, [r2 + 26] ; [20 19 18 17 16 15 14 13] + + palignr m7, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + pmaddwd m7, [r3 - 10 * 16] ; [8] + paddd m7, [pd_16] + psrld m7, 5 + palignr m3, m1, m5, 4 ; [14 13 13 12 12 11 11 10] + pmaddwd m3, [r3 - 10 * 16] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m7, m3 + + lea r5, [r0 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m3, 8 + + psrldq m4, m1, 2 ; [x 20 19 18 17 16 15 14] + + punpcklwd m3, m1, m4 ; [17 16 16 15 15 14 14 13] + punpckhwd m1, m4 ; [x 20 20 19 19 18 18 17] + + palignr m4, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + pmaddwd m4, [r3 + 11 * 16] ; [29] + paddd m4, [pd_16] + psrld m4, 5 + palignr m2, m3, m5, 4 ; [14 13 13 12 12 11 11 10] + pmaddwd m2, [r3 + 11 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 
+ + palignr m2, m5, m0, 8 ; [11 10 10 9 9 8 8 7] + pmaddwd m2, [r3] ; [18] + paddd m2, [pd_16] + psrld m2, 5 + palignr m6, m3, m5, 8 ; [15 14 14 13 13 12 12 11] + pmaddwd m6, [r3] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m2, m6 + + palignr m6, m5, m0, 12 ; [12 11 11 10 10 9 9 8] + mova m7, m6 + pmaddwd m6, [r3 - 11 * 16] ; [7] + paddd m6, [pd_16] + psrld m6, 5 + palignr m0, m3, m5, 12 ; [15 16 15 14 14 13 13 12] + pmaddwd m0, [r3 - 11 * 16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m6, m0 + + pmaddwd m7, [r3 + 10 * 16] ; [28] + paddd m7, [pd_16] + psrld m7, 5 + palignr m0, m3, m5, 12 ; [15 16 15 14 14 13 13 12] + pmaddwd m0, [r3 + 10 * 16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m7, m0 + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m0, 16 + + mova m4, m5 + pmaddwd m4, [r3 - 1 * 16] ; [17] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m3 + pmaddwd m2, [r3 - 1 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + palignr m2, m3, m5, 4 ; [14 13 13 12 12 11 11 10] + mova m7, m2 + pmaddwd m2, [r3 - 12 * 16] ; [6] + paddd m2, [pd_16] + psrld m2, 5 + palignr m6, m1, m3, 4 ; [18 17 17 16 16 15 15 14] + mova m0, m6 + pmaddwd m6, [r3 - 12 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m2, m6 + + pmaddwd m7, [r3 + 9 * 16] ; [27] + paddd m7, [pd_16] + psrld m7, 5 + pmaddwd m0, [r3 + 9 * 16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m7, m0 + + palignr m0, m3, m5, 8 ; [15 14 14 13 13 12 12 11] + pmaddwd m0, [r3 - 2 * 16] ; [16] + paddd m0, [pd_16] + psrld m0, 5 + palignr m1, m3, 8 ; [19 18 18 17 17 16 16 15] + pmaddwd m1, [r3 - 2 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m0, m1 + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m7, m0, m3, 24 + + ret + +cglobal intra_pred_ang16_4, 3,7,8 + xor r6d, r6d + lea r3, [ang_table + 18 * 16] + add r1, r1 + lea r4, [r1 * 3] + + call ang16_mode_4_32 + + lea r2, [r2 + 16] + lea r0, [r0 + r1 * 8] + + call ang16_mode_4_32 + + RET + +cglobal intra_pred_ang16_32, 4,7,8 + xor r6d, r6d + inc 
r6d + mov r2, r3 + lea r3, [ang_table + 18 * 16] + add r1, r1 + lea r4, [r1 * 3] + + call ang16_mode_4_32 + + lea r2, [r2 + 16] + lea r0, [r0 + 16] + + call ang16_mode_4_32 + + RET + +cglobal ang16_mode_5_31 + test r6d, r6d + movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] + movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9] + palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2] + psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10] + + punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1] + punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5] + punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9] + + mova m4, m3 + pmaddwd m4, [r3 + 1 * 16] ; [17] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 + 1 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + palignr m2, m0, m3, 4 ; [6 5 5 4 4 3 3 2] + mova m6, m2 + pmaddwd m2, [r3 - 14 * 16] ; [2] + paddd m2, [pd_16] + psrld m2, 5 + palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + mova m7, m1 + pmaddwd m1, [r3 - 14 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + pmaddwd m6, [r3 + 3 * 16] ; [19] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m7, [r3 + 3 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + palignr m7, m0, m3, 8 ; [7 6 6 5 5 4 4 3] + pmaddwd m7, [r3 - 12 * 16] ; [4] + paddd m7, [pd_16] + psrld m7, 5 + palignr m1, m5, m0, 8 ; [11 10 10 9 9 8 8 7] + pmaddwd m1, [r3 - 12 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + mov r5, r0 + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 0 + + palignr m4, m0, m3, 8 ; [7 6 6 5 5 4 4 3] + pmaddwd m4, [r3 + 5 * 16] ; [21] + paddd m4, [pd_16] + psrld m4, 5 + palignr m7, m5, m0, 8 ; [11 10 10 9 9 8 8 7] + pmaddwd m7, [r3 + 5 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m4, m7 + + palignr m2, m0, m3, 12 ; [8 7 7 6 6 5 5 4] + mova m6, m2 + pmaddwd m2, [r3 - 10 * 16] ; [6] + paddd m2, [pd_16] + psrld m2, 5 + palignr m1, m5, m0, 12 ; [12 11 11 10 10 9 9 8] + mova m7, m1 + pmaddwd m1, [r3 - 10 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + pmaddwd m6, [r3 + 7 * 16] ; [23] + paddd 
m6, [pd_16] + psrld m6, 5 + pmaddwd m7, [r3 + 7 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + mova m7, m0 + pmaddwd m7, [r3 - 8 * 16] ; [8] + paddd m7, [pd_16] + psrld m7, 5 + mova m3, m5 + pmaddwd m3, [r3 - 8 * 16] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m7, m3 + + lea r5, [r0 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m3, 8 + + movu m1, [r2 + 26] ; [20 19 18 17 16 15 14 13] + psrldq m4, m1, 2 ; [x 20 19 18 17 16 15 14] + + punpcklwd m3, m1, m4 ; [17 16 16 15 15 14 14 13] + + mova m4, m0 + pmaddwd m4, [r3 + 9 * 16] ; [25] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m5 + pmaddwd m2, [r3 + 9 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + palignr m2, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + mova m6, m2 + pmaddwd m2, [r3 - 6 * 16] ; [10] + paddd m2, [pd_16] + psrld m2, 5 + palignr m7, m3, m5, 4 ; [14 13 13 12 12 11 11 10] + mova m1, m7 + pmaddwd m7, [r3 - 6 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m2, m7 + + pmaddwd m6, [r3 + 11 * 16] ; [27] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m1, [r3 + 11 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + palignr m7, m5, m0, 8 ; [11 10 10 9 9 8 8 7] + pmaddwd m7, [r3 - 4 * 16] ; [12] + paddd m7, [pd_16] + psrld m7, 5 + palignr m1, m3, m5, 8 ; [15 14 14 13 13 12 12 11] + pmaddwd m1, [r3 - 4 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 16 + + palignr m4, m5, m0, 8 ; [11 10 10 9 9 8 8 7] + pmaddwd m4, [r3 + 13 * 16] ; [29] + paddd m4, [pd_16] + psrld m4, 5 + palignr m2, m3, m5, 8 ; [15 14 14 13 13 12 12 11] + pmaddwd m2, [r3 + 13 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + palignr m2, m5, m0, 12 ; [12 11 11 10 10 9 9 8] + mova m7, m2 + pmaddwd m2, [r3 - 2 * 16] ; [14] + paddd m2, [pd_16] + psrld m2, 5 + palignr m6, m3, m5, 12 ; [15 16 15 14 14 13 13 12] + mova m0, m6 + pmaddwd m6, [r3 - 2 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m2, m6 + + pmaddwd m7, [r3 + 15 * 16] ; 
[31] + paddd m7, [pd_16] + psrld m7, 5 + pmaddwd m0, [r3 + 15 * 16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m7, m0 + + pmaddwd m5, [r3] ; [16] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m3, [r3] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m5, m3 + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m7, m5, m3, 24 + + ret + +cglobal intra_pred_ang16_5, 3,7,8 + xor r6d, r6d + lea r3, [ang_table + 16 * 16] + add r1, r1 + lea r4, [r1 * 3] + + call ang16_mode_5_31 + + lea r2, [r2 + 16] + lea r0, [r0 + r1 * 8] + + call ang16_mode_5_31 + + RET + +cglobal intra_pred_ang16_31, 4,7,8 + xor r6d, r6d + inc r6d + mov r2, r3 + lea r3, [ang_table + 16 * 16] + add r1, r1 + lea r4, [r1 * 3] + + call ang16_mode_5_31 + + lea r2, [r2 + 16] + lea r0, [r0 + 16] + + call ang16_mode_5_31 + + RET + +cglobal ang16_mode_6_30 + test r6d, r6d + movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] + movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9] + palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2] + psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10] + + punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1] + punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5] + punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9] + + mova m4, m3 + pmaddwd m4, [r3 - 2 * 16] ; [13] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 - 2 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r3 + 11 * 16] ; [26] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r3 + 11 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + palignr m6, m0, m3, 4 ; [6 5 5 4 4 3 3 2] + mova m7, m6 + pmaddwd m6, [r3 - 8 * 16] ; [7] + paddd m6, [pd_16] + psrld m6, 5 + palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + pmaddwd m1, [r3 - 8 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + pmaddwd m7, [r3 + 5 * 16] ; [20] + paddd m7, [pd_16] + psrld m7, 5 + palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + pmaddwd m1, [r3 + 5 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + mov r5, r0 + + TRANSPOSE_STORE m4, m2, 
m6, m7, m1, 0 + + palignr m4, m0, m3, 8 ; [7 6 6 5 5 4 4 3] + mova m6, m4 + pmaddwd m4, [r3 - 14 * 16] ; [1] + paddd m4, [pd_16] + psrld m4, 5 + palignr m1, m5, m0, 8 ; [11 10 10 9 9 8 8 7] + mova m7, m1 + pmaddwd m1, [r3 - 14 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + mova m2, m6 + pmaddwd m2, [r3 - 1 * 16] ; [14] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m7 + pmaddwd m1, [r3 - 1 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + pmaddwd m6, [r3 + 12 * 16] ; [27] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m7, [r3 + 12 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + palignr m7, m0, m3, 12 ; [8 7 7 6 6 5 5 4] + pmaddwd m7, [r3 - 7 * 16] ; [8] + paddd m7, [pd_16] + psrld m7, 5 + palignr m1, m5, m0, 12 ; [12 11 11 10 10 9 9 8] + pmaddwd m1, [r3 - 7 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r5, [r0 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 8 + + palignr m4, m0, m3, 12 ; [8 7 7 6 6 5 5 4] + pmaddwd m4, [r3 + 6 * 16] ; [21] + paddd m4, [pd_16] + psrld m4, 5 + palignr m2, m5, m0, 12 ; [12 11 11 10 10 9 9 8] + pmaddwd m2, [r3 + 6 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m0 + pmaddwd m2, [r3 - 13 * 16] ; [2] + paddd m2, [pd_16] + psrld m2, 5 + mova m7, m5 + pmaddwd m7, [r3 - 13 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m2, m7 + + mova m6, m0 + pmaddwd m6, [r3] ; [15] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m5 + pmaddwd m1, [r3] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + mova m7, m0 + pmaddwd m7, [r3 + 13 * 16] ; [28] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m5 + pmaddwd m1, [r3 + 13 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 16 + + movh m3, [r2 + 26] ; [16 15 14 13] + + palignr m4, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + mova m2, m4 + pmaddwd m4, [r3 - 6 * 16] ; [9] + paddd m4, [pd_16] + psrld m4, 5 + palignr m1, m3, m5, 4 ; [14 13 13 12 12 11 11 10] + mova m6, 
m1 + pmaddwd m1, [r3 - 6 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pmaddwd m2, [r3 + 7 * 16] ; [22] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m6 + pmaddwd m1, [r3 + 7 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + psrldq m3, 2 + palignr m7, m5, m0, 8 ; [11 10 10 9 9 8 8 7] + mova m5, m7 + pmaddwd m7, [r3 - 12 * 16] ; [3] + paddd m7, [pd_16] + psrld m7, 5 + palignr m3, m6, 4 ; [15 14 14 13 13 12 12 11] + mova m1, m3 + pmaddwd m3, [r3 - 12 * 16] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m7, m3 + + pmaddwd m5, [r3 + 1 * 16] ; [16] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m1, [r3 + 1 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m5, m1 + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m7, m5, m3, 24 + + ret + +cglobal intra_pred_ang16_6, 3,7,8 + xor r6d, r6d + lea r3, [ang_table + 15 * 16] + add r1, r1 + lea r4, [r1 * 3] + + call ang16_mode_6_30 + + lea r2, [r2 + 16] + lea r0, [r0 + r1 * 8] + + call ang16_mode_6_30 + + RET + +cglobal intra_pred_ang16_30, 4,7,8 + xor r6d, r6d + inc r6d + mov r2, r3 + lea r3, [ang_table + 15 * 16] + add r1, r1 + lea r4, [r1 * 3] + + call ang16_mode_6_30 + + lea r2, [r2 + 16] + lea r0, [r0 + 16] + + call ang16_mode_6_30 + + RET + +cglobal ang16_mode_7_29 + test r6d, r6d + movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] + movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9] + palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2] + psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10] + + punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1] + punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5] + punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9] + + mova m4, m3 + pmaddwd m4, [r3 - 8 * 16] ; [9] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 - 8 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r3 + 1 * 16] ; [18] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r3 + 1 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m6, m3 + pmaddwd m6, [r3 + 10 * 16] ; [27] + paddd 
m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r3 + 10 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + palignr m7, m0, m3, 4 ; [6 5 5 4 4 3 3 2] + pmaddwd m7, [r3 - 13 * 16] ; [4] + paddd m7, [pd_16] + psrld m7, 5 + palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + pmaddwd m1, [r3 - 13 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + mov r5, r0 + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 0 + + palignr m4, m0, m3, 4 ; [6 5 5 4 4 3 3 2] + mova m6, m4 + pmaddwd m4, [r3 - 4 * 16] ; [13] + paddd m4, [pd_16] + psrld m4, 5 + palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + mova m7, m1 + pmaddwd m1, [r3 - 4 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + mova m2, m6 + pmaddwd m2, [r3 + 5 * 16] ; [22] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m7 + pmaddwd m1, [r3 + 5 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + pmaddwd m6, [r3 + 14 * 16] ; [31] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m7, [r3 + 14 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + palignr m7, m0, m3, 8 ; [7 6 6 5 5 4 4 3] + pmaddwd m7, [r3 - 9 * 16] ; [8] + paddd m7, [pd_16] + psrld m7, 5 + palignr m1, m5, m0, 8 ; [11 10 10 9 9 8 8 7] + pmaddwd m1, [r3 - 9 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r5, [r0 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 8 + + palignr m4, m0, m3, 8 ; [7 6 6 5 5 4 4 3] + mova m2, m4 + pmaddwd m4, [r3] ; [17] + paddd m4, [pd_16] + psrld m4, 5 + palignr m1, m5, m0, 8 ; [11 10 10 9 9 8 8 7] + mova m7, m1 + pmaddwd m1, [r3] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pmaddwd m2, [r3 + 9 * 16] ; [26] + paddd m2, [pd_16] + psrld m2, 5 + pmaddwd m7, [r3 + 9 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m2, m7 + + palignr m6, m0, m3, 12 ; [8 7 7 6 6 5 5 4] + pmaddwd m6, [r3 - 14 * 16] ; [3] + paddd m6, [pd_16] + psrld m6, 5 + palignr m1, m5, m0, 12 ; [12 11 11 10 10 9 9 8] + pmaddwd m1, [r3 - 14 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + palignr m7, m0, 
m3, 12 ; [8 7 7 6 6 5 5 4] + pmaddwd m7, [r3 - 5 * 16] ; [12] + paddd m7, [pd_16] + psrld m7, 5 + palignr m1, m5, m0, 12 ; [12 11 11 10 10 9 9 8] + pmaddwd m1, [r3 - 5 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 16 + + palignr m4, m0, m3, 12 ; [8 7 7 6 6 5 5 4] + mova m2, m4 + pmaddwd m4, [r3 + 4 * 16] ; [21] + paddd m4, [pd_16] + psrld m4, 5 + palignr m1, m5, m0, 12 ; [12 11 11 10 10 9 9 8] + mova m3, m1 + pmaddwd m1, [r3 + 4 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pmaddwd m2, [r3 + 13 * 16] ; [30] + paddd m2, [pd_16] + psrld m2, 5 + pmaddwd m3, [r3 + 13 * 16] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m2, m3 + + mova m7, m0 + pmaddwd m7, [r3 - 10 * 16] ; [7] + paddd m7, [pd_16] + psrld m7, 5 + mova m3, m5 + pmaddwd m3, [r3 - 10 * 16] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m7, m3 + + pmaddwd m0, [r3 - 1 * 16] ; [16] + paddd m0, [pd_16] + psrld m0, 5 + pmaddwd m5, [r3 - 1 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m0, m5 + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m7, m0, m3, 24 + + ret + +cglobal intra_pred_ang16_7, 3,7,8 + xor r6d, r6d + lea r3, [ang_table + 17 * 16] + add r1, r1 + lea r4, [r1 * 3] + + call ang16_mode_7_29 + + lea r2, [r2 + 16] + lea r0, [r0 + r1 * 8] + + call ang16_mode_7_29 + + RET + +cglobal intra_pred_ang16_29, 4,7,8 + xor r6d, r6d + inc r6d + mov r2, r3 + lea r3, [ang_table + 17 * 16] + add r1, r1 + lea r4, [r1 * 3] + + call ang16_mode_7_29 + + lea r2, [r2 + 16] + lea r0, [r0 + 16] + + call ang16_mode_7_29 + + RET + +cglobal ang16_mode_8_28 + test r6d, r6d + movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] + movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9] + palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2] + psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10] + + punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1] + punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5] + punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9] + + mova m4, m3 + pmaddwd m4, [r3 - 10 * 16] ; [5] + 
paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 - 10 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r3 - 5 * 16] ; [10] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r3 - 5 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m6, m3 + pmaddwd m6, [r3] ; [15] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r3] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + mova m7, m3 + pmaddwd m7, [r3 + 5 * 16] ; [20] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r3 + 5 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + mov r5, r0 + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 0 + + mova m4, m3 + pmaddwd m4, [r3 + 10 * 16] ; [25] + paddd m4, [pd_16] + psrld m4, 5 + mova m1, m0 + pmaddwd m1, [r3 + 10 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + mova m2, m3 + pmaddwd m2, [r3 + 15 * 16] ; [30] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r3 + 15 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + palignr m6, m0, m3, 4 ; [6 5 5 4 4 3 3 2] + pmaddwd m6, [r3 - 12 * 16] ; [3] + paddd m6, [pd_16] + psrld m6, 5 + palignr m7, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + pmaddwd m7, [r3 - 12 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + palignr m7, m0, m3, 4 ; [6 5 5 4 4 3 3 2] + pmaddwd m7, [r3 - 7 * 16] ; [8] + paddd m7, [pd_16] + psrld m7, 5 + palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + pmaddwd m1, [r3 - 7 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r5, [r0 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 8 + + palignr m4, m0, m3, 4 ; [6 5 5 4 4 3 3 2] + mova m7, m4 + pmaddwd m4, [r3 - 2 *16] ; [13] + paddd m4, [pd_16] + psrld m4, 5 + palignr m6, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + mova m1, m6 + pmaddwd m6, [r3 - 2 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m4, m6 + + mova m2, m7 + pmaddwd m2, [r3 + 3 * 16] ; [18] + paddd m2, [pd_16] + psrld m2, 5 + mova m6, m1 + pmaddwd m6, [r3 + 
3 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m2, m6 + + mova m6, m7 + pmaddwd m6, [r3 + 8 * 16] ; [23] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m1, [r3 + 8 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + pmaddwd m7, [r3 + 13 * 16] ; [28] + paddd m7, [pd_16] + psrld m7, 5 + palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + pmaddwd m1, [r3 + 13 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 16 + + palignr m1, m0, m3, 8 ; [7 6 6 5 5 4 4 3] + mova m4, m1 + pmaddwd m4, [r3 - 14 * 16] ; [1] + paddd m4, [pd_16] + psrld m4, 5 + palignr m5, m0, 8 ; [11 10 10 9 9 8 8 7] + mova m0, m5 + pmaddwd m0, [r3 - 14 * 16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m4, m0 + + mova m2, m1 + pmaddwd m2, [r3 - 9 * 16] ; [6] + paddd m2, [pd_16] + psrld m2, 5 + mova m3, m5 + pmaddwd m3, [r3 - 9 * 16] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m2, m3 + + mova m7, m1 + pmaddwd m7, [r3 - 4 * 16] ; [11] + paddd m7, [pd_16] + psrld m7, 5 + mova m3, m5 + pmaddwd m3, [r3 - 4 * 16] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m7, m3 + + pmaddwd m1, [r3 + 1 * 16] ; [16] + paddd m1, [pd_16] + psrld m1, 5 + pmaddwd m5, [r3 + 1 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m1, m5 + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m7, m1, m3, 24 + + ret + +cglobal intra_pred_ang16_8, 3,7,8 + xor r6d, r6d + lea r3, [ang_table + 15 * 16] + add r1, r1 + lea r4, [r1 * 3] + + call ang16_mode_8_28 + + lea r2, [r2 + 16] + lea r0, [r0 + r1 * 8] + + call ang16_mode_8_28 + + RET + +cglobal intra_pred_ang16_28, 4,7,8 + xor r6d, r6d + inc r6d + mov r2, r3 + lea r3, [ang_table + 15 * 16] + add r1, r1 + lea r4, [r1 * 3] + + call ang16_mode_8_28 + + lea r2, [r2 + 16] + lea r0, [r0 + 16] + + call ang16_mode_8_28 + + RET + +cglobal ang16_mode_9_27 + test r6d, r6d + movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] + movu m1, [r2 + 4] ; [9 8 7 6 5 4 3 2] + + punpcklwd m3, m0, m1 ; [5 4 4 3 3 2 2 1] + punpckhwd m0, m1 ; [9 8 8 
7 7 6 6 5] + + mova m4, m3 + pmaddwd m4, [r3 - 14 * 16] ; [2] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 - 14 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r3 - 12 * 16] ; [4] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r3 - 12 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m6, m3 + pmaddwd m6, [r3 - 10 *16] ; [6] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r3 - 10 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + mova m7, m3 + pmaddwd m7, [r3 - 8 * 16] ; [8] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r3 - 8 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + mov r5, r0 + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 0 + + mova m4, m3 + pmaddwd m4, [r3 - 6 * 16] ; [10] + paddd m4, [pd_16] + psrld m4, 5 + mova m1, m0 + pmaddwd m1, [r3 - 6 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + mova m2, m3 + pmaddwd m2, [r3 - 4 * 16] ; [12] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r3 - 4 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m6, m3 + pmaddwd m6, [r3 - 2 * 16] ; [14] + paddd m6, [pd_16] + psrld m6, 5 + mova m7, m0 + pmaddwd m7, [r3 - 2 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + mova m7, m3 + pmaddwd m7, [r3] ; [16] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r3] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r5, [r0 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 8 + + mova m4, m3 + pmaddwd m4, [r3 + 2 *16] ; [18] + paddd m4, [pd_16] + psrld m4, 5 + mova m6, m0 + pmaddwd m6, [r3 + 2 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m4, m6 + + mova m2, m3 + pmaddwd m2, [r3 + 4 * 16] ; [20] + paddd m2, [pd_16] + psrld m2, 5 + mova m6, m0 + pmaddwd m6, [r3 + 4 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m2, m6 + + mova m6, m3 + pmaddwd m6, [r3 + 6 * 16] ; [22] + paddd m6, [pd_16] + psrld m6, 5 + mova 
m1, m0 + pmaddwd m1, [r3 + 6 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + mova m7, m3 + pmaddwd m7, [r3 + 8 * 16] ; [24] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r3 + 8 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 16 + + mova m4, m3 + pmaddwd m4, [r3 + 10 * 16] ; [26] + paddd m4, [pd_16] + psrld m4, 5 + mova m1, m0 + pmaddwd m1, [r3 + 10 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + mova m2, m3 + pmaddwd m2, [r3 + 12 * 16] ; [28] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r3 + 12 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + pmaddwd m3, [r3 + 14 * 16] ; [30] + paddd m3, [pd_16] + psrld m3, 5 + pmaddwd m0, [r3 + 14 * 16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m3, m0 + + movu m7, [r2 + 4] + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m3, m7, m1, 24 + + ret + +cglobal intra_pred_ang16_9, 3,7,8 + xor r6d, r6d + lea r3, [ang_table + 16 * 16] + add r1, r1 + lea r4, [r1 * 3] + + call ang16_mode_9_27 + + lea r2, [r2 + 16] + lea r0, [r0 + r1 * 8] + + call ang16_mode_9_27 + + RET + +cglobal intra_pred_ang16_27, 4,7,8 + xor r6d, r6d + inc r6d + mov r2, r3 + lea r3, [ang_table + 16 * 16] + add r1, r1 + lea r4, [r1 * 3] + + call ang16_mode_9_27 + + lea r2, [r2 + 16] + lea r0, [r0 + 16] + + call ang16_mode_9_27 + + RET + +cglobal ang16_mode_11_25 + test r6d, r6d + movu m0, [r2] ; [7 6 5 4 3 2 1 0] + movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1] + + punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] + punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] + + mova m4, m3 + pmaddwd m4, [r3 + 14 * 16] ; [30] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 + 14 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r3 + 12 * 16] ; [28] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r3 + 12 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m6, m3 + 
pmaddwd m6, [r3 + 10 *16] ; [26] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r3 + 10 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + mova m7, m3 + pmaddwd m7, [r3 + 8 * 16] ; [24] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r3 + 8 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + mov r5, r0 + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 0 + + mova m4, m3 + pmaddwd m4, [r3 + 6 * 16] ; [22] + paddd m4, [pd_16] + psrld m4, 5 + mova m1, m0 + pmaddwd m1, [r3 + 6 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + mova m2, m3 + pmaddwd m2, [r3 + 4 * 16] ; [20] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r3 + 4 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m6, m3 + pmaddwd m6, [r3 + 2 * 16] ; [18] + paddd m6, [pd_16] + psrld m6, 5 + mova m7, m0 + pmaddwd m7, [r3 + 2 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + mova m7, m3 + pmaddwd m7, [r3] ; [16] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r3] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r5, [r0 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 8 + + mova m4, m3 + pmaddwd m4, [r3 - 2 *16] ; [14] + paddd m4, [pd_16] + psrld m4, 5 + mova m6, m0 + pmaddwd m6, [r3 - 2 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m4, m6 + + mova m2, m3 + pmaddwd m2, [r3 - 4 * 16] ; [12] + paddd m2, [pd_16] + psrld m2, 5 + mova m6, m0 + pmaddwd m6, [r3 - 4 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m2, m6 + + mova m6, m3 + pmaddwd m6, [r3 - 6 * 16] ; [10] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r3 - 6 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + mova m7, m3 + pmaddwd m7, [r3 - 8 * 16] ; [8] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r3 - 8 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 16 + + mova m4, m3 + pmaddwd m4, [r3 - 10 * 16] ; [6] + 
paddd m4, [pd_16] + psrld m4, 5 + mova m1, m0 + pmaddwd m1, [r3 - 10 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + mova m2, m3 + pmaddwd m2, [r3 - 12 * 16] ; [4] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r3 - 12 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m7, m3 + pmaddwd m7, [r3 - 14 * 16] ; [2] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r3 - 14 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + movu m3, [r2] + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m7, m3, m1, 24 + + ret + +cglobal intra_pred_ang16_11, 3,7,8 + xor r6d, r6d + lea r3, [ang_table + 16 * 16] + add r1, r1 + lea r4, [r1 * 3] + + call ang16_mode_11_25 + + lea r2, [r2 + 16] + lea r0, [r0 + r1 * 8] + + call ang16_mode_11_25 + + RET + +cglobal intra_pred_ang16_25, 4,7,8 + xor r6d, r6d + inc r6d + mov r2, r3 + lea r3, [ang_table + 16 * 16] + add r1, r1 + lea r4, [r1 * 3] + + call ang16_mode_11_25 + + lea r2, [r2 + 16] + lea r0, [r0 + 16] + + call ang16_mode_11_25 + + RET + +cglobal ang16_mode_12_24 + test r3d, r3d + movu m0, [r2] ; [7 6 5 4 3 2 1 0] + movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1] + + punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] + punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] + + mova m4, m3 + pmaddwd m4, [r6 + 11 * 16] ; [27] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r6 + 11 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r6 + 6 * 16] ; [22] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r6 + 6 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m6, m3 + pmaddwd m6, [r6 + 1 *16] ; [17] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r6 + 1 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + mova m7, m3 + pmaddwd m7, [r6 - 4 * 16] ; [12] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r6 - 4 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + mov r5, r0 + + 
TRANSPOSE_STORE m4, m2, m6, m7, m1, 0 + + mova m4, m3 + pmaddwd m4, [r6 - 9 * 16] ; [7] + paddd m4, [pd_16] + psrld m4, 5 + mova m1, m0 + pmaddwd m1, [r6 - 9 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + mova m2, m3 + pmaddwd m2, [r6 - 14 * 16] ; [2] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r6 - 14 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m6, m3 + pmaddwd m6, [r6 + 13 * 16] ; [29] + paddd m6, [pd_16] + psrld m6, 5 + mova m7, m0 + pmaddwd m7, [r6 + 13 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + mova m7, m3 + pmaddwd m7, [r6 + 8 * 16] ; [24] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r6 + 8 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r5, [r0 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 8 + + mova m4, m3 + pmaddwd m4, [r6 + 3 *16] ; [19] + paddd m4, [pd_16] + psrld m4, 5 + mova m6, m0 + pmaddwd m6, [r6 + 3 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m4, m6 + + mova m2, m3 + pmaddwd m2, [r6 - 2 * 16] ; [14] + paddd m2, [pd_16] + psrld m2, 5 + mova m6, m0 + pmaddwd m6, [r6 - 2 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m2, m6 + + mova m6, m3 + pmaddwd m6, [r6 - 7 * 16] ; [9] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r6 - 7 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + mova m7, m3 + pmaddwd m7, [r6 - 12 * 16] ; [4] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r6 - 12 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 16 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m4, m3 + pmaddwd m4, [r6 + 15 * 16] ; [31] + paddd m4, [pd_16] + psrld m4, 5 + mova m1, m0 + pmaddwd m1, [r6 + 15 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + mova m2, m3 + pmaddwd m2, [r6 + 10 * 16] ; [26] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + 
pmaddwd m1, [r6 + 10 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m7, m3 + pmaddwd m7, [r6 + 5 * 16] ; [21] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r6 + 5 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + pmaddwd m3, [r6] ; [16] + paddd m3, [pd_16] + psrld m3, 5 + pmaddwd m0, [r6] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m3, m0 + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m7, m3, m1, 24 + + ret + +cglobal intra_pred_ang16_12, 4,7,8 + add r1, r1 + lea r4, [r1 * 3] + lea r6, [ang_table + 16 * 16] + movu m5, [r3] + pshufb m5, [pw_ang8_12] + pinsrw m5, [r3 + 26], 5 + xor r3d, r3d + + call ang16_mode_12_24 + + lea r0, [r0 + r1 * 8] + movu m5, [r2 + 2] + lea r2, [r2 + 16] + + call ang16_mode_12_24 + + RET + +cglobal intra_pred_ang16_24, 4,7,8 + xchg r2, r3 + add r1, r1 + lea r4, [r1 * 3] + lea r6, [ang_table + 16 * 16] + movu m5, [r3] + pshufb m5, [pw_ang8_12] + pinsrw m5, [r3 + 26], 5 + xor r3d, r3d + inc r3d + + call ang16_mode_12_24 + + lea r0, [r0 + 16] + movu m5, [r2 + 2] + lea r2, [r2 + 16] + + call ang16_mode_12_24 + + RET + +cglobal ang16_mode_13_23 + test r3d, r3d + movu m0, [r2] ; [7 6 5 4 3 2 1 0] + movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1] + + punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] + punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] + + mova m4, m3 + pmaddwd m4, [r6 + 8 * 16] ; [23] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r6 + 8 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r6 - 1 * 16] ; [14] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r6 - 1 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m6, m3 + pmaddwd m6, [r6 - 10 *16] ; [5] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r6 - 10 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m7, m3 + pmaddwd m7, [r6 + 13 * 16] ; [28] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + 
pmaddwd m1, [r6 + 13 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + mov r5, r0 + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 0 + + mova m4, m3 + pmaddwd m4, [r6 + 4 * 16] ; [19] + paddd m4, [pd_16] + psrld m4, 5 + mova m1, m0 + pmaddwd m1, [r6 + 4 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + mova m2, m3 + pmaddwd m2, [r6 - 5 * 16] ; [10] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r6 - 5 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m6, m3 + pmaddwd m6, [r6 - 14 * 16] ; [1] + paddd m6, [pd_16] + psrld m6, 5 + mova m7, m0 + pmaddwd m7, [r6 - 14 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m7, m3 + pmaddwd m7, [r6 + 9 * 16] ; [24] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r6 + 9 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r5, [r0 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 8 + + mova m4, m3 + pmaddwd m4, [r6] ; [15] + paddd m4, [pd_16] + psrld m4, 5 + mova m6, m0 + pmaddwd m6, [r6] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m4, m6 + + mova m2, m3 + pmaddwd m2, [r6 - 9 * 16] ; [6] + paddd m2, [pd_16] + psrld m2, 5 + mova m6, m0 + pmaddwd m6, [r6 - 9 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m2, m6 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m6, m3 + pmaddwd m6, [r6 + 14 * 16] ; [29] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r6 + 14 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + mova m7, m3 + pmaddwd m7, [r6 + 5 * 16] ; [20] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r6 + 5 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 16 + + mova m4, m3 + pmaddwd m4, [r6 - 4 * 16] ; [11] + paddd m4, [pd_16] + psrld m4, 5 + mova m1, m0 + pmaddwd m1, [r6 - 4 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + mova m2, 
m3 + pmaddwd m2, [r6 - 13 * 16] ; [2] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r6 - 13 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m7, m3 + pmaddwd m7, [r6 + 10 * 16] ; [25] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r6 + 10 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + pmaddwd m3, [r6 + 1 * 16] ; [16] + paddd m3, [pd_16] + psrld m3, 5 + pmaddwd m0, [r6 + 1 *16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m3, m0 + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m7, m3, m1, 24 + + ret + +cglobal intra_pred_ang16_13, 4,7,8 + add r1, r1 + lea r4, [r1 * 3] + lea r6, [ang_table + 15 * 16] + movu m5, [r3] + pshufb m5, [pw_ang16_13] + movu m6, [r3 + 14] + pshufb m6, [pw_ang8_13] + pslldq m6, 2 + palignr m5, m6, 6 + xor r3d, r3d + + call ang16_mode_13_23 + + lea r0, [r0 + r1 * 8] + movu m5, [r2 + 2] + lea r2, [r2 + 16] + + call ang16_mode_13_23 + + RET + +cglobal intra_pred_ang16_23, 4,7,8 + xchg r2, r3 + add r1, r1 + lea r4, [r1 * 3] + lea r6, [ang_table + 15 * 16] + movu m5, [r3] + pshufb m5, [pw_ang16_13] + movu m6, [r3 + 14] + pshufb m6, [pw_ang8_13] + pslldq m6, 2 + palignr m5, m6, 6 + xor r3d, r3d + inc r3d + + call ang16_mode_13_23 + + lea r0, [r0 + 16] + movu m5, [r2 + 2] + lea r2, [r2 + 16] + + call ang16_mode_13_23 + + RET + +cglobal ang16_mode_14_22 + test r3d, r3d + movu m0, [r2] ; [7 6 5 4 3 2 1 0] + movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1] + + punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] + punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] + + mova m4, m3 + pmaddwd m4, [r6 + 1 * 16] ; [19] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r6 + 1 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r6 - 12 * 16] ; [6] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r6 - 12 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + palignr m0, m3, 12 + palignr m3, m5, 12 + 
+ mova m6, m3 + pmaddwd m6, [r6 + 7 * 16] ; [25] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r6 + 7 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + mova m7, m3 + pmaddwd m7, [r6 - 6 * 16] ; [12] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r6 - 6 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + mov r5, r0 + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 0 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m4, m3 + pmaddwd m4, [r6 + 13 * 16] ; [31] + paddd m4, [pd_16] + psrld m4, 5 + mova m1, m0 + pmaddwd m1, [r6 + 13 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + mova m2, m3 + pmaddwd m2, [r6] ; [18] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r6] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m6, m3 + pmaddwd m6, [r6 - 13 * 16] ; [5] + paddd m6, [pd_16] + psrld m6, 5 + mova m7, m0 + pmaddwd m7, [r6 - 13 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m7, m3 + pmaddwd m7, [r6 + 6 * 16] ; [24] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r6 + 6 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r5, [r0 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 8 + + mova m4, m3 + pmaddwd m4, [r6 - 7 * 16] ; [11] + paddd m4, [pd_16] + psrld m4, 5 + mova m6, m0 + pmaddwd m6, [r6 - 7 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m4, m6 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m2, m3 + pmaddwd m2, [r6 + 12 * 16] ; [30] + paddd m2, [pd_16] + psrld m2, 5 + mova m6, m0 + pmaddwd m6, [r6 + 12 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m2, m6 + + mova m6, m3 + pmaddwd m6, [r6 - 1 * 16] ; [17] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r6 - 1 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + mova m7, m3 + pmaddwd m7, [r6 - 14 * 16] ; [4] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + 
pmaddwd m1, [r6 - 14 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 16 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m4, m3 + pmaddwd m4, [r6 + 5 * 16] ; [23] + paddd m4, [pd_16] + psrld m4, 5 + mova m1, m0 + pmaddwd m1, [r6 + 5 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + mova m2, m3 + pmaddwd m2, [r6 - 8 * 16] ; [10] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r6 - 8 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m7, m3 + pmaddwd m7, [r6 + 11 * 16] ; [29] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r6 + 11 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + pmaddwd m3, [r6 - 2 * 16] ; [16] + paddd m3, [pd_16] + psrld m3, 5 + pmaddwd m0, [r6 - 2 *16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m3, m0 + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m7, m3, m1, 24 + + ret + +cglobal intra_pred_ang16_14, 4,7,8 + add r1, r1 + lea r4, [r1 * 3] + lea r6, [ang_table + 18 * 16] + movu m6, [r3] + pshufb m6, [pw_ang8_14] + movu m5, [r3 + 20] + pshufb m5, [pw_ang8_14] + punpckhqdq m5, m6 + xor r3d, r3d + + call ang16_mode_14_22 + + lea r0, [r0 + r1 * 8] + movu m5, [r2 + 2] + lea r2, [r2 + 16] + + call ang16_mode_14_22 + + RET + +cglobal intra_pred_ang16_22, 4,7,8 + xchg r2, r3 + add r1, r1 + lea r4, [r1 * 3] + lea r6, [ang_table + 18 * 16] + movu m6, [r3] + pshufb m6, [pw_ang8_14] + movu m5, [r3 + 20] + pshufb m5, [pw_ang8_14] + punpckhqdq m5, m6 + xor r3d, r3d + inc r3d + + call ang16_mode_14_22 + + lea r0, [r0 + 16] + movu m5, [r2 + 2] + lea r2, [r2 + 16] + + call ang16_mode_14_22 + + RET + +cglobal ang16_mode_15_21 + test r3d, r3d + movu m0, [r2] ; [7 6 5 4 3 2 1 0] + movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1] + + palignr m6, m0, m5, 2 + + punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] + punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] + + mova m4, 
m3 + pmaddwd m4, [r6] ; [15] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r6] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + palignr m0, m3, 12 + palignr m3, m6, 12 + + mova m2, m3 + pmaddwd m2, [r6 + 15 * 16] ; [30] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r6 + 15 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m6, m3 + pmaddwd m6, [r6 - 2 * 16] ; [13] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r6 - 2 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m7, m3 + pmaddwd m7, [r6 + 13 * 16] ; [28] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r6 + 13 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + mov r5, r0 + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 0 + + mova m4, m3 + pmaddwd m4, [r6 - 4 * 16] ; [11] + paddd m4, [pd_16] + psrld m4, 5 + mova m1, m0 + pmaddwd m1, [r6 - 4 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m2, m3 + pmaddwd m2, [r6 + 11 * 16] ; [26] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r6 + 11 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m6, m3 + pmaddwd m6, [r6 - 6 * 16] ; [9] + paddd m6, [pd_16] + psrld m6, 5 + mova m7, m0 + pmaddwd m7, [r6 - 6 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m7, m3 + pmaddwd m7, [r6 + 9 * 16] ; [24] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r6 + 9 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r5, [r0 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 8 + + mova m4, m3 + pmaddwd m4, [r6 - 8 * 16] ; [7] + paddd m4, [pd_16] + psrld m4, 5 + mova m6, m0 + pmaddwd m6, [r6 - 8 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m4, m6 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m2, m3 + pmaddwd 
m2, [r6 + 7 * 16] ; [22] + paddd m2, [pd_16] + psrld m2, 5 + mova m6, m0 + pmaddwd m6, [r6 + 7 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m2, m6 + + mova m6, m3 + pmaddwd m6, [r6 - 10 * 16] ; [5] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r6 - 10 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m7, m3 + pmaddwd m7, [r6 + 5 * 16] ; [20] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r6 + 5 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 16 + + mova m4, m3 + pmaddwd m4, [r6 - 12 * 16] ; [3] + paddd m4, [pd_16] + psrld m4, 5 + mova m1, m0 + pmaddwd m1, [r6 - 12 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m2, m3 + pmaddwd m2, [r6 + 3 * 16] ; [18] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r6 + 3 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m7, m3 + pmaddwd m7, [r6 - 14 * 16] ; [1] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r6 - 14 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + pmaddwd m3, [r6 + 1 * 16] ; [16] + paddd m3, [pd_16] + psrld m3, 5 + pmaddwd m0, [r6 + 1 * 16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m3, m0 + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m7, m3, m1, 24 + + ret + +cglobal intra_pred_ang16_15, 4,7,8 + add r1, r1 + lea r4, [r1 * 3] + lea r6, [ang_table + 15 * 16] + movu m6, [r3 + 4] + pshufb m6, [pw_ang8_15] + movu m5, [r3 + 18] + pshufb m5, [pw_ang8_15] + punpckhqdq m5, m6 + xor r3d, r3d + + call ang16_mode_15_21 + + lea r0, [r0 + r1 * 8] + movu m5, [r2] + lea r2, [r2 + 16] + + call ang16_mode_15_21 + + RET + +cglobal intra_pred_ang16_21, 4,7,8 + xchg r2, r3 + add r1, r1 + lea r4, [r1 * 3] + lea r6, [ang_table + 15 * 16] + movu 
m6, [r3 + 4] + pshufb m6, [pw_ang8_15] + movu m5, [r3 + 18] + pshufb m5, [pw_ang8_15] + punpckhqdq m5, m6 + xor r3d, r3d + inc r3d + + call ang16_mode_15_21 + + lea r0, [r0 + 16] + movu m5, [r2] + lea r2, [r2 + 16] + + call ang16_mode_15_21 + + RET + +cglobal ang16_mode_16_20 + test r4d, r4d + lea r4, [r1 * 3] + movu m0, [r2] ; [7 6 5 4 3 2 1 0] + movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1] + + palignr m6, m0, m5, 2 + + punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] + punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] + + mova m4, m3 + pmaddwd m4, [r6 - 2 * 16] ; [11] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r6 - 2 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + palignr m0, m3, 12 + palignr m3, m6, 12 + + mova m2, m3 + pmaddwd m2, [r6 + 9 * 16] ; [22] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r6 + 9 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m6, m3 + pmaddwd m6, [r6 - 12 * 16] ; [1] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r6 - 12 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m7, m3 + pmaddwd m7, [r6 - 1 * 16] ; [12] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r6 - 1 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + mov r5, r0 + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 0 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m4, m3 + pmaddwd m4, [r6 + 10 * 16] ; [23] + paddd m4, [pd_16] + psrld m4, 5 + mova m1, m0 + pmaddwd m1, [r6 + 10 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + mova m2, m3 + pmaddwd m2, [r6 - 11 * 16] ; [2] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r6 - 11 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m6, m3 + pmaddwd m6, [r6] ; [13] + paddd m6, [pd_16] + psrld m6, 5 + mova m7, m0 + pmaddwd m7, [r6] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 
+ + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m7, m3 + pmaddwd m7, [r6 + 11 * 16] ; [24] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r6 + 11 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r5, [r0 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 8 + + mova m4, m3 + pmaddwd m4, [r6 - 10 * 16] ; [3] + paddd m4, [pd_16] + psrld m4, 5 + mova m6, m0 + pmaddwd m6, [r6 - 10 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m4, m6 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m2, m3 + pmaddwd m2, [r6 + 1 * 16] ; [14] + paddd m2, [pd_16] + psrld m2, 5 + mova m6, m0 + pmaddwd m6, [r6 + 1 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m2, m6 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m6, m3 + pmaddwd m6, [r6 + 12 * 16] ; [25] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r6 + 12 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + mova m7, m3 + pmaddwd m7, [r6 - 9 * 16] ; [4] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r6 - 9 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 16 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m4, m3 + pmaddwd m4, [r6 + 2 * 16] ; [15] + paddd m4, [pd_16] + psrld m4, 5 + mova m1, m0 + pmaddwd m1, [r6 + 2 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + movu m5, [r3] + pshufb m5, [pw_ang8_16] + + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m2, m3 + pmaddwd m2, [r6 + 13 * 16] ; [26] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r6 + 13 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m7, m3 + pmaddwd m7, [r6 - 8 * 16] ; [5] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r6 - 8 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + pmaddwd m3, [r6 + 3 * 16] ; [16] + 
paddd m3, [pd_16] + psrld m3, 5 + pmaddwd m0, [r6 + 3 * 16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m3, m0 + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m7, m3, m1, 24 + + ret + +cglobal intra_pred_ang16_16, 4,7,8,0-(1*mmsize) + add r1, r1 + lea r6, [ang_table + 13 * 16] + movu m6, [r3 + 4] + pshufb m6, [pw_ang16_16] + movu m5, [r3 + 16] + pshufb m5, [pw_ang16_16] + punpckhqdq m5, m6 + mov [rsp], r3 + lea r3, [r3 + 24] + xor r4, r4 + + call ang16_mode_16_20 + + lea r0, [r0 + r1 * 8] + mov r3, [rsp] + movu m5, [r2] + lea r2, [r2 + 16] + xor r4, r4 + + call ang16_mode_16_20 + + RET + +cglobal intra_pred_ang16_20, 4,7,8,0-(1*mmsize) + xchg r2, r3 + add r1, r1 + lea r6, [ang_table + 13 * 16] + movu m6, [r3 + 4] + pshufb m6, [pw_ang16_16] + movu m5, [r3 + 16] + pshufb m5, [pw_ang16_16] + punpckhqdq m5, m6 + mov [rsp], r3 + lea r3, [r3 + 24] + xor r4, r4 + inc r4 + + call ang16_mode_16_20 + + lea r0, [r0 + 16] + mov r3, [rsp] + movu m5, [r2] + lea r2, [r2 + 16] + xor r4, r4 + inc r4 + + call ang16_mode_16_20 + + RET + +cglobal ang16_mode_17_19 + test r4d, r4d + lea r4, [r1 * 3] + movu m0, [r2] ; [7 6 5 4 3 2 1 0] + movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1] + + palignr m6, m0, m5, 2 + + punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] + punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] + + mova m4, m3 + pmaddwd m4, [r6 - 10 * 16] ; [6] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r6 - 10 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + palignr m0, m3, 12 + palignr m3, m6, 12 + + mova m2, m3 + pmaddwd m2, [r6 - 4 * 16] ; [12] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r6 - 4 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m6, m3 + pmaddwd m6, [r6 + 2 * 16] ; [18] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r6 + 2 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m7, m3 + pmaddwd 
m7, [r6 + 8 * 16] ; [24] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r6 + 8 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + mov r5, r0 + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 0 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m4, m3 + pmaddwd m4, [r6 + 14 * 16] ; [30] + paddd m4, [pd_16] + psrld m4, 5 + mova m1, m0 + pmaddwd m1, [r6 + 14 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + mova m2, m3 + pmaddwd m2, [r6 - 12 * 16] ; [4] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r6 - 12 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m6, m3 + pmaddwd m6, [r6 - 6 * 16] ; [10] + paddd m6, [pd_16] + psrld m6, 5 + mova m7, m0 + pmaddwd m7, [r6 - 6 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m7, m3 + pmaddwd m7, [r6] ; [16] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r6] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r5, [r0 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 8 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m4, m3 + pmaddwd m4, [r6 + 6 * 16] ; [22] + paddd m4, [pd_16] + psrld m4, 5 + mova m6, m0 + pmaddwd m6, [r6 + 6 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m4, m6 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m2, m3 + pmaddwd m2, [r6 + 12 * 16] ; [28] + paddd m2, [pd_16] + psrld m2, 5 + mova m6, m0 + pmaddwd m6, [r6 + 12 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m2, m6 + + mova m6, m3 + pmaddwd m6, [r6 - 14 * 16] ; [2] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r6 - 14 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + movu m5, [r3] + pshufb m5, [pw_ang8_17] + + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m7, m3 + pmaddwd m7, [r6 - 8 * 16] ; [8] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 
+ pmaddwd m1, [r6 - 8 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 16 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m4, m3 + pmaddwd m4, [r6 - 2 * 16] ; [14] + paddd m4, [pd_16] + psrld m4, 5 + mova m1, m0 + pmaddwd m1, [r6 - 2 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m2, m3 + pmaddwd m2, [r6 + 4 * 16] ; [20] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r6 + 4 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m7, m3 + pmaddwd m7, [r6 + 10 * 16] ; [26] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r6 + 10 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + pmaddwd m3, [r6 - 16 * 16] + paddd m3, [pd_16] + psrld m3, 5 + pmaddwd m0, [r6 - 16 * 16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m3, m0 + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m7, m3, m1, 24 + + ret + +cglobal intra_pred_ang16_17, 4,7,8,0-(1*mmsize) + add r1, r1 + lea r6, [ang_table + 16 * 16] + movu m6, [r3 + 2] + pshufb m6, [pw_ang16_16] + movu m5, [r3 + 12] + pshufb m5, [pw_ang16_16] + punpckhqdq m5, m6 + mov [rsp], r3 + lea r3, [r3 + 20] + xor r4, r4 + + call ang16_mode_17_19 + + lea r0, [r0 + r1 * 8] + mov r3, [rsp] + movu m5, [r2] + lea r2, [r2 + 16] + xor r4, r4 + + call ang16_mode_17_19 + + RET + +cglobal intra_pred_ang16_19, 4,7,8,0-(1*mmsize) + xchg r2, r3 + add r1, r1 + lea r6, [ang_table + 16 * 16] + movu m6, [r3 + 2] + pshufb m6, [pw_ang16_16] + movu m5, [r3 + 12] + pshufb m5, [pw_ang16_16] + punpckhqdq m5, m6 + mov [rsp], r3 + lea r3, [r3 + 20] + xor r4, r4 + inc r4 + + call ang16_mode_17_19 + + lea r0, [r0 + 16] + mov r3, [rsp] + movu m5, [r2] + lea r2, [r2 + 16] + xor r4, r4 + inc r4 + + call ang16_mode_17_19 + + RET + +cglobal intra_pred_ang16_18, 4,5,4 + add r1, r1 + lea r4, 
[r1 * 3] + movu m1, [r3] + movu m3, [r3 + 16] + movu m0, [r2 + 2] + pshufb m0, [pw_swap16] + movu [r0], m1 + movu [r0 + 16], m3 + palignr m2, m1, m0, 14 + movu [r0 + r1], m2 + palignr m2, m3, m1, 14 + movu [r0 + r1 + 16], m2 + palignr m2, m1, m0, 12 + movu [r0 + r1 * 2], m2 + palignr m2, m3, m1, 12 + movu [r0 + r1 * 2 + 16], m2 + palignr m2, m1, m0, 10 + movu [r0 + r4], m2 + palignr m2, m3, m1, 10 + movu [r0 + r4 + 16], m2 + + lea r0, [r0 + r1 * 4] + palignr m2, m1, m0, 8 + movu [r0], m2 + palignr m2, m3, m1, 8 + movu [r0 + 16], m2 + palignr m2, m1, m0, 6 + movu [r0 + r1], m2 + palignr m2, m3, m1, 6 + movu [r0 + r1 + 16], m2 + palignr m2, m1, m0, 4 + movu [r0 + r1 * 2], m2 + palignr m2, m3, m1, 4 + movu [r0 + r1 * 2 + 16], m2 + palignr m2, m1, m0, 2 + movu [r0 + r4], m2 + palignr m3, m1, 2 + movu [r0 + r4 + 16], m3 + + lea r0, [r0 + r1 * 4] + movu [r0], m0 + movu [r0 + 16], m1 + movu m3, [r2 + 18] + pshufb m3, [pw_swap16] + palignr m2, m0, m3, 14 + movu [r0 + r1], m2 + palignr m2, m1, m0, 14 + movu [r0 + r1 + 16], m2 + palignr m2, m0, m3, 12 + movu [r0 + r1 * 2], m2 + palignr m2, m1, m0, 12 + movu [r0 + r1 * 2 + 16], m2 + palignr m2, m0, m3, 10 + movu [r0 + r4], m2 + palignr m2, m1, m0, 10 + movu [r0 + r4 + 16], m2 + + lea r0, [r0 + r1 * 4] + palignr m2, m0, m3, 8 + movu [r0], m2 + palignr m2, m1, m0, 8 + movu [r0 + 16], m2 + palignr m2, m0, m3, 6 + movu [r0 + r1], m2 + palignr m2, m1, m0, 6 + movu [r0 + r1 + 16], m2 + palignr m2, m0, m3, 4 + movu [r0 + r1 * 2], m2 + palignr m2, m1, m0, 4 + movu [r0 + r1 * 2 + 16], m2 + palignr m2, m0, m3, 2 + movu [r0 + r4], m2 + palignr m1, m0, 2 + movu [r0 + r4 + 16], m1 + + RET + +cglobal intra_pred_ang16_10, 4,5,4 + movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1] + movu m3, [r2 + 18] ; [16 15 14 13 12 11 10 9] + pshufb m0, m1, [pw_unpackwdq] ; [1 1 1 1 1 1 1 1] + add r1, r1 + lea r4, [r1 * 3] + + psrldq m1, 2 + pshufb m2, m1, [pw_unpackwdq] ; [2 2 2 2 2 2 2 2] + movu [r0 + r1], m2 + movu [r0 + r1 + 16], m2 + psrldq m1, 2 + pshufb m2, 
m1, [pw_unpackwdq] ; [3 3 3 3 3 3 3 3] + movu [r0 + r1 * 2], m2 + movu [r0 + r1 * 2 + 16], m2 + psrldq m1, 2 + pshufb m2, m1, [pw_unpackwdq] ; [4 4 4 4 4 4 4 4] + movu [r0 + r4], m2 + movu [r0 + r4 + 16], m2 + + lea r2, [r0 + r1 *4] + psrldq m1, 2 + pshufb m2, m1, [pw_unpackwdq] ; [5 5 5 5 5 5 5 5] + movu [r2], m2 + movu [r2 + 16], m2 + psrldq m1, 2 + pshufb m2, m1, [pw_unpackwdq] ; [6 6 6 6 6 6 6 6] + movu [r2 + r1], m2 + movu [r2 + r1 + 16], m2 + psrldq m1, 2 + pshufb m2, m1, [pw_unpackwdq] ; [7 7 7 7 7 7 7 7] + movu [r2 + r1 * 2], m2 + movu [r2 + r1 * 2 + 16], m2 + psrldq m1, 2 + pshufb m2, m1, [pw_unpackwdq] ; [8 8 8 8 8 8 8 8] + movu [r2 + r4], m2 + movu [r2 + r4 + 16], m2 + + lea r2, [r2 + r1 *4] + pshufb m2, m3, [pw_unpackwdq] ; [9 9 9 9 9 9 9 9] + movu [r2], m2 + movu [r2 + 16], m2 + psrldq m3, 2 + pshufb m2, m3, [pw_unpackwdq] ; [10 10 10 10 10 10 10 10] + movu [r2 + r1], m2 + movu [r2 + r1 + 16], m2 + psrldq m3, 2 + pshufb m2, m3, [pw_unpackwdq] ; [11 11 11 11 11 11 11 11] + movu [r2 + r1 * 2], m2 + movu [r2 + r1 * 2 + 16], m2 + psrldq m3, 2 + pshufb m2, m3, [pw_unpackwdq] ; [12 12 12 12 12 12 12 12] + movu [r2 + r4], m2 + movu [r2 + r4 + 16], m2 + + lea r2, [r2 + r1 *4] + psrldq m3, 2 + pshufb m2, m3, [pw_unpackwdq] ; [13 13 13 13 13 13 13 13] + movu [r2], m2 + movu [r2 + 16], m2 + psrldq m3, 2 + pshufb m2, m3, [pw_unpackwdq] ; [14 14 14 14 14 14 14 14] + movu [r2 + r1], m2 + movu [r2 + r1 + 16], m2 + psrldq m3, 2 + pshufb m2, m3, [pw_unpackwdq] ; [15 15 15 15 15 15 15 15] + movu [r2 + r1 * 2], m2 + movu [r2 + r1 * 2 + 16], m2 + psrldq m3, 2 + pshufb m2, m3, [pw_unpackwdq] ; [16 16 16 16 16 16 16 16] + movu [r2 + r4], m2 + movu [r2 + r4 + 16], m2 + mova m3, m0 + + cmp r5m, byte 0 + jz .quit + + ; filter + + movh m1, [r3] ; [3 2 1 0] + pshufb m2, m1, [pw_unpackwdq] ; [0 0 0 0 0 0 0 0] + movu m1, [r3 + 2] ; [8 7 6 5 4 3 2 1] + movu m3, [r3 + 18] ; [16 15 14 13 12 11 10 9] + psubw m1, m2 + psubw m3, m2 + psraw m1, 1 + psraw m3, 1 + paddw m3, m0 + paddw m0, 
m1 + pxor m1, m1 + pmaxsw m0, m1 + pminsw m0, [pw_1023] + pmaxsw m3, m1 + pminsw m3, [pw_1023] +.quit: + movu [r0], m0 + movu [r0 + 16], m3 + + RET + +cglobal intra_pred_ang16_26, 4,5,4 + movu m0, [r3 + 2] ; [8 7 6 5 4 3 2 1] + movu m3, [r3 + 18] ; [16 15 14 13 12 11 10 9] + add r1, r1 + lea r4, [r1 * 3] + + movu [r0], m0 + movu [r0 + 16], m3 + movu [r0 + r1], m0 + movu [r0 + r1 + 16], m3 + movu [r0 + r1 * 2], m0 + movu [r0 + r1 * 2 + 16], m3 + movu [r0 + r4], m0 + movu [r0 + r4 + 16], m3 + + lea r3, [r0 + r1 *4] + movu [r3], m0 + movu [r3 + 16], m3 + movu [r3 + r1], m0 + movu [r3 + r1 + 16], m3 + movu [r3 + r1 * 2], m0 + movu [r3 + r1 * 2 + 16], m3 + movu [r3 + r4], m0 + movu [r3 + r4 + 16], m3 + + lea r3, [r3 + r1 *4] + movu [r3], m0 + movu [r3 + 16], m3 + movu [r3 + r1], m0 + movu [r3 + r1 + 16], m3 + movu [r3 + r1 * 2], m0 + movu [r3 + r1 * 2 + 16], m3 + movu [r3 + r4], m0 + movu [r3 + r4 + 16], m3 + + lea r3, [r3 + r1 *4] + movu [r3], m0 + movu [r3 + 16], m3 + movu [r3 + r1], m0 + movu [r3 + r1 + 16], m3 + movu [r3 + r1 * 2], m0 + movu [r3 + r1 * 2 + 16], m3 + movu [r3 + r4], m0 + movu [r3 + r4 + 16], m3 + + cmp r5m, byte 0 + jz .quit + + ; filter + + pshufb m0, [pw_unpackwdq] + movh m1, [r2] ; [3 2 1 0] + pshufb m2, m1, [pw_unpackwdq] ; [0 0 0 0 0 0 0 0] + movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1] + movu m3, [r2 + 18] ; [16 15 14 13 12 11 10 9] + psubw m1, m2 + psubw m3, m2 + psraw m1, 1 + psraw m3, 1 + paddw m3, m0 + paddw m0, m1 + pxor m1, m1 + pmaxsw m0, m1 + pminsw m0, [pw_1023] + pmaxsw m3, m1 + pminsw m3, [pw_1023] + pextrw [r0], m0, 0 + pextrw [r0 + r1], m0, 1 + pextrw [r0 + r1 * 2], m0, 2 + pextrw [r0 + r4], m0, 3 + lea r0, [r0 + r1 * 4] + pextrw [r0], m0, 4 + pextrw [r0 + r1], m0, 5 + pextrw [r0 + r1 * 2], m0, 6 + pextrw [r0 + r4], m0, 7 + lea r0, [r0 + r1 * 4] + pextrw [r0], m3, 0 + pextrw [r0 + r1], m3, 1 + pextrw [r0 + r1 * 2], m3, 2 + pextrw [r0 + r4], m3, 3 + pextrw [r3], m3, 4 + pextrw [r3 + r1], m3, 5 + pextrw [r3 + r1 * 2], m3, 6 + pextrw [r3 + 
r4], m3, 7 + +.quit: + RET + +%macro MODE_2_34 0 + movu m0, [r2 + 4] + movu m1, [r2 + 20] + movu m2, [r2 + 36] + movu m3, [r2 + 52] + movu m4, [r2 + 68] + movu [r0], m0 + movu [r0 + 16], m1 + movu [r0 + 32], m2 + movu [r0 + 48], m3 + palignr m5, m1, m0, 2 + movu [r0 + r1], m5 + palignr m5, m2, m1, 2 + movu [r0 + r1 + 16], m5 + palignr m5, m3, m2, 2 + movu [r0 + r1 + 32], m5 + palignr m5, m4, m3, 2 + movu [r0 + r1 + 48], m5 + palignr m5, m1, m0, 4 + movu [r0 + r3], m5 + palignr m5, m2, m1, 4 + movu [r0 + r3 + 16], m5 + palignr m5, m3, m2, 4 + movu [r0 + r3 + 32], m5 + palignr m5, m4, m3, 4 + movu [r0 + r3 + 48], m5 + palignr m5, m1, m0, 6 + movu [r0 + r4], m5 + palignr m5, m2, m1, 6 + movu [r0 + r4 + 16], m5 + palignr m5, m3, m2, 6 + movu [r0 + r4 + 32], m5 + palignr m5, m4, m3, 6 + movu [r0 + r4 + 48], m5 + lea r0, [r0 + r1 * 4] + palignr m5, m1, m0, 8 + movu [r0], m5 + palignr m5, m2, m1, 8 + movu [r0 + 16], m5 + palignr m5, m3, m2, 8 + movu [r0 + 32], m5 + palignr m5, m4, m3, 8 + movu [r0 + 48], m5 + palignr m5, m1, m0, 10 + movu [r0 + r1], m5 + palignr m5, m2, m1, 10 + movu [r0 + r1 + 16], m5 + palignr m5, m3, m2, 10 + movu [r0 + r1 + 32], m5 + palignr m5, m4, m3, 10 + movu [r0 + r1 + 48], m5 + palignr m5, m1, m0, 12 + movu [r0 + r3], m5 + palignr m5, m2, m1, 12 + movu [r0 + r3 + 16], m5 + palignr m5, m3, m2, 12 + movu [r0 + r3 + 32], m5 + palignr m5, m4, m3, 12 + movu [r0 + r3 + 48], m5 + palignr m5, m1, m0, 14 + movu [r0 + r4], m5 + palignr m5, m2, m1, 14 + movu [r0 + r4 + 16], m5 + palignr m5, m3, m2, 14 + movu [r0 + r4 + 32], m5 + palignr m5, m4, m3, 14 + movu [r0 + r4 + 48], m5 + lea r0, [r0 + r1 * 4] + movu m0, [r2 + 84] + movu [r0], m1 + movu [r0 + 16], m2 + movu [r0 + 32], m3 + movu [r0 + 48], m4 + palignr m5, m2, m1, 2 + movu [r0 + r1], m5 + palignr m5, m3, m2, 2 + movu [r0 + r1 + 16], m5 + palignr m5, m4, m3, 2 + movu [r0 + r1 + 32], m5 + palignr m5, m0, m4, 2 + movu [r0 + r1 + 48], m5 + palignr m5, m2, m1, 4 + movu [r0 + r3], m5 + palignr m5, m3, m2, 
4 + movu [r0 + r3 + 16], m5 + palignr m5, m4, m3, 4 + movu [r0 + r3 + 32], m5 + palignr m5, m0, m4, 4 + movu [r0 + r3 + 48], m5 + palignr m5, m2, m1, 6 + movu [r0 + r4], m5 + palignr m5, m3, m2, 6 + movu [r0 + r4 + 16], m5 + palignr m5, m4, m3, 6 + movu [r0 + r4 + 32], m5 + palignr m5, m0, m4, 6 + movu [r0 + r4 + 48], m5 + lea r0, [r0 + r1 * 4] + palignr m5, m2, m1, 8 + movu [r0], m5 + palignr m5, m3, m2, 8 + movu [r0 + 16], m5 + palignr m5, m4, m3, 8 + movu [r0 + 32], m5 + palignr m5, m0, m4, 8 + movu [r0 + 48], m5 + palignr m5, m2, m1, 10 + movu [r0 + r1], m5 + palignr m5, m3, m2, 10 + movu [r0 + r1 + 16], m5 + palignr m5, m4, m3, 10 + movu [r0 + r1 + 32], m5 + palignr m5, m0, m4, 10 + movu [r0 + r1 + 48], m5 + palignr m5, m2, m1, 12 + movu [r0 + r3], m5 + palignr m5, m3, m2, 12 + movu [r0 + r3 + 16], m5 + palignr m5, m4, m3, 12 + movu [r0 + r3 + 32], m5 + palignr m5, m0, m4, 12 + movu [r0 + r3 + 48], m5 + palignr m5, m2, m1, 14 + movu [r0 + r4], m5 + palignr m5, m3, m2, 14 + movu [r0 + r4 + 16], m5 + palignr m5, m4, m3, 14 + movu [r0 + r4 + 32], m5 + palignr m5, m0, m4, 14 + movu [r0 + r4 + 48], m5 + lea r0, [r0 + r1 * 4] +%endmacro +;-------------------------------------------------------------------------------------------------------------------- +; void intraPredAng32_2_34(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +;-------------------------------------------------------------------------------------------------------------------- +INIT_XMM ssse3 +cglobal intra_pred_ang32_2, 3,6,6 + cmp r4m, byte 34 + cmove r2, r3mp + + add r1, r1 + lea r3, [r1 * 2] + lea r4, [r1 * 3] + mov r5, 2 + +.loop: + MODE_2_34 + add r2, 32 + dec r5 + jnz .loop + RET + +%macro TRANSPOSE_STORE_8x8 6 + %if %2 == 1 + ; transpose 4x8 and then store, used by angle BLOCK_16x16 and BLOCK_32x32 + punpckhwd m0, %3, %4 + punpcklwd %3, %4 + punpckhwd %4, %3, m0 + punpcklwd %3, m0 + + punpckhwd m0, %5, %6 + punpcklwd %5, %6 + punpckhwd %6, %5, m0 + 
punpcklwd %5, m0 + + punpckhqdq m0, %3, %5 + punpcklqdq %3, %5 + punpcklqdq %5, %4, %6 + punpckhqdq %4, %6 + + movu [r0 + %1], %3 + movu [r0 + r1 + %1], m0 + movu [r0 + r1 * 2 + %1], %5 + movu [r0 + r5 + %1], %4 + %else + ; store 8x4, used by angle BLOCK_16x16 and BLOCK_32x32 + movh [r0], %3 + movhps [r0 + r1], %3 + movh [r0 + r1 * 2], %4 + movhps [r0 + r5], %4 + lea r0, [r0 + r1 * 4] + movh [r0], %5 + movhps [r0 + r1], %5 + movh [r0 + r1 * 2], %6 + movhps [r0 + r5], %6 + lea r0, [r0 + r1 * 4] + %endif +%endmacro + +%macro MODE_3_33 1 + movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] + movu m3, [r2 + 18] ; [16 15 14 13 12 11 10 9] + mova m7, m0 + + palignr m1, m3, m0, 2 ; [9 8 7 6 5 4 3 2] + punpckhwd m2, m0, m1 ; [9 8 8 7 7 6 6 5] xmm2 + punpcklwd m0, m1 ; [5 4 4 3 3 2 2 1] xmm0 + + palignr m1, m2, m0, 4 ; [6 5 5 4 4 3 3 2] xmm1 + pmaddwd m4, m0, [r3 + 10 * 16] ; [26] + paddd m4, [pd_16] + psrld m4, 5 + + pmaddwd m5, m1, [r3 + 4 * 16] ; [20] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + palignr m5, m2, m0, 8 + pmaddwd m5, [r3 - 2 * 16] ; [14] + paddd m5, [pd_16] + psrld m5, 5 + + palignr m6, m2, m0, 12 + pmaddwd m6, [r3 - 8 * 16] ; [ 8] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + pmaddwd m6, m2, [r3 - 14 * 16] ; [ 2] + paddd m6, [pd_16] + psrld m6, 5 + + pmaddwd m1, m2, [r3 + 12 * 16] ; [28] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + palignr m0, m3, m2, 4 ; [10 9 9 8 8 7 7 6] + pmaddwd m1, m0, [r3 + 6 * 16] ; [22] + paddd m1, [pd_16] + psrld m1, 5 + + psrldq m2, m3, 2 ; [x 16 15 14 13 12 11 10] + palignr m2, m0, 4 ;[11 10 10 9 9 8 8 7] + + pmaddwd m2, [r3] ; [16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m1, m2 + + TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 + + palignr m0, m3, m7, 14 ; [15 14 13 12 11 10 9 8] + movu m3, [r2 + 32] ; [23 22 21 20 19 18 17 16] + palignr m1, m3, m0, 2 ; [16 15 14 13 12 11 10 9] + punpckhwd m7, m0, m1 ; [16 15 15 14 14 13 13 12] + punpcklwd m0, m1 ; [12 11 11 10 10 9 9 8] + + palignr m5, m7, m0, 4 ; [13 12 12 
11 11 10 10 9] + pmaddwd m4, m0, [r3 - 6 * 16] ; [10] + paddd m4, [pd_16] + psrld m4, 5 + + pmaddwd m1, m5, [r3 - 12 * 16] ; [04] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pmaddwd m5, [r3 + 14 * 16] ; [30] + paddd m5, [pd_16] + psrld m5, 5 + + palignr m6, m7, m0, 8 ; [14 13 13 12 12 11 11 10] + pmaddwd m6, [r3 + 8 * 16] ; [24] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + palignr m1, m7, m0, 12 ; [15 14 14 13 13 12 12 11] + pmaddwd m6, m1, [r3 + 2 * 16] ; [18] + paddd m6, [pd_16] + psrld m6, 5 + + pmaddwd m1, m7, [r3 - 4 * 16] ; [12] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + palignr m2, m3, m7, 4 ; [17 16 16 15 15 14 14 13] + pmaddwd m1, m2, [r3 - 10 * 16] ; [6] + paddd m1, [pd_16] + psrld m1, 5 + + packusdw m1, m1 + movhps m1, [r2 + 28] ; [00] + + TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1 + + movu m0, [r2 + 28] ; [35 34 33 32 31 30 29 28] + palignr m1, m0, 2 ; [ x 35 34 33 32 31 30 29] + punpckhwd m2, m0, m1 ; [ x 35 35 34 34 33 33 32] + punpcklwd m0, m1 ; [32 31 31 30 30 29 29 28] + + pmaddwd m4, m0, [r3 + 10 * 16] ; [26] + paddd m4, [pd_16] + psrld m4, 5 + + palignr m1, m2, m0, 4 ; [33 32 32 31 31 30 30 29] + pmaddwd m1, [r3 + 4 * 16] ; [20] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + palignr m5, m2, m0, 8 ; [34 33 33 32 32 31 31 30] + pmaddwd m5, [r3 - 2 * 16] ; [14] + paddd m5, [pd_16] + psrld m5, 5 + + palignr m6, m2, m0, 12 ; [35 34 34 33 33 32 32 31] + pmaddwd m6, [r3 - 8 * 16] ; [ 8] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + pinsrw m2, [r2 + 44], 7 ; [35 34 34 33 33 32 32 31] + pmaddwd m6, m2, [r3 - 14 * 16] ; [ 2] + paddd m6, [pd_16] + psrld m6, 5 + + pmaddwd m2, [r3 + 12 * 16] ; [28] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m6, m2 + + movu m3, [r2 + 38] ; [45 44 43 42 41 40 39 38] + palignr m1, m3, 2 ; [ x 45 44 43 42 41 40 39] + punpckhwd m2, m3, m1 ; [ x 35 35 34 34 33 33 32] + punpcklwd m3, m1 ; [32 31 31 30 30 29 29 28] + + pmaddwd m1, m3, [r3 + 6 * 16] ; [22] + paddd m1, [pd_16] + 
psrld m1, 5 + + palignr m0, m2, m3, 4 + pmaddwd m0, [r3] ; [16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m1, m0 + + TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1 + + palignr m5, m2, m3, 8 + pmaddwd m4, m5, [r3 - 6 * 16] ; [10] + paddd m4, [pd_16] + psrld m4, 5 + + palignr m5, m2, m3, 12 + pmaddwd m1, m5, [r3 - 12 * 16] ; [04] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pmaddwd m5, [r3 + 14 * 16] ; [30] + paddd m5, [pd_16] + psrld m5, 5 + + movu m3, [r2 + 46] + palignr m1, m3, 2 + punpckhwd m2, m3, m1 + punpcklwd m3, m1 + + pmaddwd m6, m3, [r3 + 8 * 16] ; [24] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + palignr m6, m2, m3, 4 + pmaddwd m6, [r3 + 2 * 16] ; [18] + paddd m6, [pd_16] + psrld m6, 5 + + palignr m1, m2, m3, 8 + pmaddwd m1, [r3 - 4 * 16] ; [12] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + palignr m1, m2, m3, 12 + pmaddwd m1, [r3 - 10 * 16] ; [06] + paddd m1, [pd_16] + psrld m1, 5 + + packusdw m1, m1 + movhps m1, [r2 + 54] ; [00] + + TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1 +%endmacro +;------------------------------------------------------------------------------------------------------------------ +; void intraPredAng32_3(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +;------------------------------------------------------------------------------------------------------------------ +INIT_XMM sse4 +cglobal intra_pred_ang32_3, 3,6,8 + lea r3, [ang_table + 16 * 16] + mov r4d, 8 + add r1, r1 + lea r5, [r1 * 3] + +.loop: + MODE_3_33 1 + lea r0, [r0 + r1 * 4 ] + add r2, 8 + dec r4 + jnz .loop + RET + +%macro MODE_4_32 1 + movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] + movu m3, [r2 + 18] ; [16 15 14 13 12 11 10 9] + palignr m1, m3, m0, 2 ; [9 8 7 6 5 4 3 2] + punpckhwd m2, m0, m1 ; [9 8 8 7 7 6 6 5] + punpcklwd m0, m1 ; [5 4 4 3 3 2 2 1] + + pmaddwd m4, m0, [r3 + 5 * 16] ; [21] + paddd m4, [pd_16] + psrld m4, 5 + + palignr m5, m2, m0, 4 ; [6 5 5 4 4 3 3 2] + pmaddwd m1, m5, [r3 - 6 * 16] ; 
[10] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pmaddwd m5, [r3 + 15 * 16] ; [31] + paddd m5, [pd_16] + psrld m5, 5 + + palignr m6, m2, m0, 8 + pmaddwd m6, [r3 + 4 * 16] ; [ 20] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + palignr m1, m2, m0, 12 + pmaddwd m6, m1, [r3 - 7 * 16] ; [ 9] + paddd m6, [pd_16] + psrld m6, 5 + + pmaddwd m1, [r3 + 14 * 16] ; [30] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + pmaddwd m1, m2, [r3 + 3 * 16] ; [19] + paddd m1, [pd_16] + psrld m1, 5 + + palignr m7, m3, m2, 4 ; [10 9 9 8 7 6 5 4] + pmaddwd m0, m7, [r3 - 8 * 16] ; [8] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m1, m0 + + TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 + + pmaddwd m4, m7, [r3 + 13 * 16] ; [29] + paddd m4, [pd_16] + psrld m4, 5 + + movu m0, [r2 + 34] ; [24 23 22 21 20 19 18 17] + + palignr m2, m0, m3, 2 ; [17 16 15 14 13 12 11 10] + palignr m1, m0, m3, 4 ; [18 17 16 15 14 13 12 11] + punpckhwd m3, m2, m1 ; [18 17 17 16 16 15 15 14] + punpcklwd m2, m1 ; [14 13 13 12 12 11 11 10] + + palignr m1, m2, m7, 4 ; [11 10 10 9 9 8 7 6] + pmaddwd m1, [r3 + 2 * 16] ; [18] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + palignr m5, m2, m7, 8 + mova m6, m5 + pmaddwd m5, [r3 - 9 * 16] ; [07] + paddd m5, [pd_16] + psrld m5, 5 + + pmaddwd m6, [r3 + 12 * 16] ; [28] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + palignr m6, m2, m7, 12 + pmaddwd m6, [r3 + 16] ; [17] + paddd m6, [pd_16] + psrld m6, 5 + + pmaddwd m1, m2, [r3 - 10 * 16] ; [06] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + pmaddwd m1, m2, [r3 + 11 * 16] ; [27] + paddd m1, [pd_16] + psrld m1, 5 + + palignr m7, m3, m2, 4 + pmaddwd m7, [r3] ; [16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m1, m7 + mova m7, m0 + + TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1 + + palignr m0, m3, m2, 8 + pmaddwd m4, m0, [r3 - 11 * 16] ; [5] + paddd m4, [pd_16] + psrld m4, 5 + + pmaddwd m1, m0, [r3 + 10 * 16] ; [26] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + palignr m5, m3, 
m2, 12 + pmaddwd m5, [r3 - 16] ; [15] + paddd m5, [pd_16] + psrld m5, 5 + + pmaddwd m1, m3, [r3 - 12 * 16] ; [4] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m5, m1 + + pmaddwd m6, m3, [r3 + 9 * 16] ; [25] + paddd m6, [pd_16] + psrld m6, 5 + + movu m0, [r2 + 50] ; [32 31 30 29 28 27 26 25] + palignr m2, m0, m7, 2 ; [25 24 23 22 21 20 19 18] + palignr m1, m0, m7, 4 ; [26 25 24 23 22 21 20 19] + punpckhwd m7, m2, m1 ; [26 25 25 24 24 23 23 22] + punpcklwd m2, m1 ; [22 21 21 20 20 19 19 18] + + palignr m1, m2, m3, 4 + pmaddwd m1, [r3 - 2 * 16] ; [14] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + palignr m1, m2, m3, 8 + mova m0, m1 + pmaddwd m1, [r3 - 13 * 16] ; [3] + paddd m1, [pd_16] + psrld m1, 5 + + pmaddwd m0, [r3 + 8 * 16] ; [24] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m1, m0 + + TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1 + + palignr m4, m2, m3, 12 + pmaddwd m4, [r3 - 3 * 16] ; [13] + paddd m4, [pd_16] + psrld m4, 5 + + pmaddwd m1, m2, [r3 - 14 * 16] ; [2] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pmaddwd m5, m2, [r3 + 7 * 16] ; [23] + paddd m5, [pd_16] + psrld m5, 5 + + palignr m6, m7, m2, 4 + pmaddwd m6, [r3 - 4 * 16] ; [12] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + palignr m1, m7, m2, 8 + pmaddwd m6, m1, [r3 - 15 * 16] ; [1] + paddd m6, [pd_16] + psrld m6, 5 + + pmaddwd m1, [r3 + 6 * 16] ; [22] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + palignr m1, m7, m2, 12 + pmaddwd m1, [r3 - 5 * 16] ; [11] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m1, m1 + movhps m1, [r2 + 44] ; [00] + + TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1 +%endmacro +;------------------------------------------------------------------------------------------------------------------ +; void intraPredAng32_4(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +;------------------------------------------------------------------------------------------------------------------ +INIT_XMM sse4 +cglobal 
intra_pred_ang32_4, 3,6,8 + lea r3, [ang_table + 16 * 16] + mov r4d, 8 + add r1, r1 + lea r5, [r1 * 3] + +.loop: + MODE_4_32 1 + lea r0, [r0 + r1 * 4 ] + add r2, 8 + dec r4 + jnz .loop + RET + +%macro MODE_5_31 1 + movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] + movu m3, [r2 + 18] ; [16 15 14 13 12 11 10 9] + palignr m1, m3, m0, 2 ; [9 8 7 6 5 4 3 2] + punpckhwd m2, m0, m1 ; [9 8 8 7 7 6 6 5] + punpcklwd m0, m1 ; [5 4 4 3 3 2 2 1] + + pmaddwd m4, m0, [r3 + 16] ; [17] + paddd m4, [pd_16] + psrld m4, 5 + + palignr m1, m2, m0, 4 + mova m5, m1 + pmaddwd m1, [r3 - 14 * 16] ; [2] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pmaddwd m5, [r3 + 3 * 16] ; [19] + paddd m5, [pd_16] + psrld m5, 5 + + palignr m6, m2, m0, 8 + mova m1, m6 + pmaddwd m6, [r3 - 12 * 16] ; [4] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + pmaddwd m6, m1, [r3 + 5 * 16] ; [21] + paddd m6, [pd_16] + psrld m6, 5 + + palignr m1, m2, m0, 12 + mova m7, m1 + pmaddwd m7, [r3 - 10 * 16] ; [6] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + pmaddwd m1, [r3 + 7 * 16] ; [23] + paddd m1, [pd_16] + psrld m1, 5 + + pmaddwd m7, m2, [r3 - 8 * 16] ; [8] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m1, m7 + + TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 + + pmaddwd m4, m2, [r3 + 9 * 16] ; [25] + paddd m4, [pd_16] + psrld m4, 5 + + palignr m7, m3, m2, 4 ; [10 9 9 8 7 6 5 4] + pmaddwd m1, m7, [r3 - 6 * 16] ; [10] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pmaddwd m5, m7, [r3 + 11 * 16] ; [27] + paddd m5, [pd_16] + psrld m5, 5 + + movu m0, [r2 + 34] ; [24 23 22 21 20 19 18 17] + palignr m2, m0, m3, 2 ; [17 16 15 14 13 12 11 10] + palignr m1, m0, m3, 4 ; [18 17 16 15 14 13 12 11] + punpckhwd m3, m2, m1 ; [18 17 17 16 16 15 15 14] + punpcklwd m2, m1 ; [14 13 13 12 12 11 11 10] + + palignr m6, m2, m7, 4 + pmaddwd m1, m6, [r3 - 4 * 16] ; [12] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m5, m1 + + pmaddwd m6, [r3 + 13 * 16] ; [29] + paddd m6, [pd_16] + psrld m6, 5 + + palignr m1, m2, m7, 8 + mova 
m0, m1 + pmaddwd m1, [r3 - 2 * 16] ; [14] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + pmaddwd m1, m0, [r3 + 15 * 16] ; [31] + paddd m1, [pd_16] + psrld m1, 5 + + palignr m0, m2, m7, 12 + pmaddwd m0, [r3] ; [16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m1, m0 + + TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1 + + pmaddwd m4, m2, [r3 - 15 * 16] ; [1] + paddd m4, [pd_16] + psrld m4, 5 + + pmaddwd m1, m2, [r3 + 2 * 16] ; [18] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + palignr m1, m3, m2, 4 + pmaddwd m5, m1, [r3 - 13 * 16] ; [3] + paddd m5, [pd_16] + psrld m5, 5 + + pmaddwd m1, [r3 + 4 * 16] ; [20] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m5, m1 + + palignr m1, m3, m2, 8 + pmaddwd m6, m1, [r3 - 11 * 16] ; [5] + paddd m6, [pd_16] + psrld m6, 5 + + pmaddwd m1, [r3 + 6 * 16] ; [22] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + palignr m7, m3, m2, 12 + pmaddwd m1, m7, [r3 - 9 * 16] ; [7] + paddd m1, [pd_16] + psrld m1, 5 + + pmaddwd m7, [r3 + 8 * 16] ; [24] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m1, m7 + + TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1 + + pmaddwd m4, m3, [r3 - 7 * 16] ; [9] + paddd m4, [pd_16] + psrld m4, 5 + + pmaddwd m1, m3, [r3 + 10 * 16] ; [26] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + movu m0, [r2 + 36] ; [25 24 23 22 21 20 19 18] + palignr m1, m0, 2 ; [x 25 24 23 22 21 20 19] + punpcklwd m0, m1 ; [22 21 21 20 20 19 19 18] + + palignr m1, m0, m3, 4 + pmaddwd m5, m1, [r3 - 5 * 16] ; [11] + paddd m5, [pd_16] + psrld m5, 5 + + pmaddwd m1, [r3 + 12 * 16] ; [28] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m5, m1 + + palignr m1, m0, m3, 8 + pmaddwd m6, m1, [r3 - 3 * 16] ; [13] + paddd m6, [pd_16] + psrld m6, 5 + + pmaddwd m1, [r3 + 14 * 16] ; [30] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + palignr m1, m0, m3, 12 + pmaddwd m1, [r3 - 16] ; [15] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m1, m1 + movhps m1, [r2 + 36] ; [00] + + TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1 +%endmacro 
+;------------------------------------------------------------------------------------------------------------------ +; void intraPredAng32_5(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +;------------------------------------------------------------------------------------------------------------------ +INIT_XMM sse4 +cglobal intra_pred_ang32_5, 3,6,8 + lea r3, [ang_table + 16 * 16] + mov r4d, 8 + add r1, r1 + lea r5, [r1 * 3] + +.loop: + MODE_5_31 1 + lea r0, [r0 + r1 * 4 ] + add r2, 8 + dec r4 + jnz .loop + RET + +%macro MODE_6_30 1 + movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] + movu m3, [r2 + 18] ; [16 15 14 13 12 11 10 9] + palignr m1, m3, m0, 2 ; [9 8 7 6 5 4 3 2] + punpckhwd m2, m0, m1 ; [9 8 8 7 7 6 6 5] + punpcklwd m0, m1 ; [5 4 4 3 3 2 2 1] + + pmaddwd m4, m0, [r3 - 3 * 16] ; [13] + paddd m4, [pd_16] + psrld m4, 5 + + pmaddwd m1, m0, [r3 + 10 * 16] ; [26] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + palignr m1, m2, m0, 4 + pmaddwd m5, m1, [r3 - 9 * 16] ; [7] + paddd m5, [pd_16] + psrld m5, 5 + + pmaddwd m1, [r3 + 4 * 16] ; [20] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m5, m1 + + palignr m1, m2, m0, 8 + pmaddwd m6, m1, [r3 - 15 * 16] ; [1] + paddd m6, [pd_16] + psrld m6, 5 + + pmaddwd m7, m1, [r3 - 2 * 16] ; [14] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + pmaddwd m1, [r3 + 11 * 16] ; [27] + paddd m1, [pd_16] + psrld m1, 5 + + palignr m7, m2, m0, 12 + pmaddwd m0, m7, [r3 - 8 * 16] ; [8] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m1, m0 + + TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 + + pmaddwd m4, m7, [r3 + 5 * 16] ; [21] + paddd m4, [pd_16] + psrld m4, 5 + + pmaddwd m1, m2, [r3 - 14 * 16] ; [2] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pmaddwd m5, m2, [r3 - 16] ; [15] + paddd m5, [pd_16] + psrld m5, 5 + + pmaddwd m6, m2, [r3 + 12 * 16] ; [28] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + palignr m7, m3, m2, 4 + pmaddwd m6, m7, [r3 - 7 * 16] ; [9] + paddd m6, 
[pd_16] + psrld m6, 5 + + pmaddwd m1, m7, [r3 + 6 * 16] ; [22] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + movu m0, [r2 + 34] ; [24 23 22 21 20 19 18 17] + palignr m2, m0, m3, 2 ; [17 16 15 14 13 12 11 10] + palignr m1, m0, m3, 4 ; [18 17 16 15 14 13 12 11] + punpckhwd m3, m2, m1 ; [18 17 17 16 16 15 15 14] + punpcklwd m2, m1 ; [14 13 13 12 12 11 11 10] + + palignr m0, m2, m7, 4 + pmaddwd m1, m0, [r3 - 13 * 16] ; [3] + paddd m1, [pd_16] + psrld m1, 5 + + pmaddwd m0, [r3] ; [16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m1, m0 + + TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1 + + palignr m4, m2, m7, 4 + pmaddwd m4, [r3 + 13 * 16] ; [29] + paddd m4, [pd_16] + psrld m4, 5 + + palignr m5, m2, m7, 8 + pmaddwd m1, m5, [r3 - 6 * 16] ; [10] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pmaddwd m5, [r3 + 7 * 16] ; [23] + paddd m5, [pd_16] + psrld m5, 5 + + palignr m1, m2, m7, 12 + pmaddwd m6, m1, [r3 - 12 * 16] ; [4] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + pmaddwd m6, m1, [r3 + 16] ; [17] + paddd m6, [pd_16] + psrld m6, 5 + + pmaddwd m1, [r3 + 14 * 16] ; [30] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + pmaddwd m1, m2, [r3 - 5 * 16] ; [11] + paddd m1, [pd_16] + psrld m1, 5 + + pmaddwd m0, m2, [r3 + 8 * 16] ; [24] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m1, m0 + + TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1 + + palignr m5, m3, m2, 4 + pmaddwd m4, m5, [r3 - 11 * 16] ; [5] + paddd m4, [pd_16] + psrld m4, 5 + + pmaddwd m1, m5, [r3 + 2 * 16] ; [18] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pmaddwd m5, [r3 + 15 * 16] ; [31] + paddd m5, [pd_16] + psrld m5, 5 + + palignr m6, m3, m2, 8 + pmaddwd m1, m6, [r3 - 4 * 16] ; [12] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m5, m1 + + pmaddwd m6, [r3 + 9 * 16] ; [25] + paddd m6, [pd_16] + psrld m6, 5 + + palignr m1, m3, m2, 12 + pmaddwd m0, m1, [r3 - 10 * 16] ; [6] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m6, m0 + + pmaddwd m1, [r3 + 3 * 16] ; [19] + paddd m1, [pd_16] 
+ psrld m1, 5 + packusdw m1, m1 + movhps m1, [r2 + 28] ; [00] + + TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1 +%endmacro +;------------------------------------------------------------------------------------------------------------------ +; void intraPredAng32_6(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +;------------------------------------------------------------------------------------------------------------------ +INIT_XMM sse4 +cglobal intra_pred_ang32_6, 3,6,8 + lea r3, [ang_table + 16 * 16] + mov r4d, 8 + add r1, r1 + lea r5, [r1 * 3] + +.loop: + MODE_6_30 1 + lea r0, [r0 + r1 * 4 ] + add r2, 8 + dec r4 + jnz .loop + RET + +%macro MODE_7_29 1 + movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] + movd m3, [r2 + 18] ; [16 15 14 13 12 11 10 9] + palignr m1, m3, m0, 2 ; [9 8 7 6 5 4 3 2] + punpckhwd m2, m0, m1 ; [9 8 8 7 7 6 6 5] + punpcklwd m0, m1 ; [5 4 4 3 3 2 2 1] + + pmaddwd m4, m0, [r3 - 7 * 16] ; [9] + paddd m4, [pd_16] + psrld m4, 5 + + pmaddwd m1, m0, [r3 + 2 * 16] ; [18] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pmaddwd m5, m0, [r3 + 11 * 16] ; [27] + paddd m5, [pd_16] + psrld m5, 5 + + palignr m1, m2, m0, 4 + pmaddwd m6, m1, [r3 - 12 * 16] ; [4] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + pmaddwd m6, m1, [r3 - 3 * 16] ; [13] + paddd m6, [pd_16] + psrld m6, 5 + + pmaddwd m7, m1, [r3 + 6 * 16] ; [22] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + pmaddwd m1, [r3 + 15 * 16] ; [31] + paddd m1, [pd_16] + psrld m1, 5 + + mova m3, m0 + palignr m7, m2, m0, 8 + pmaddwd m0, m7, [r3 - 8 * 16] ; [8] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m1, m0 + + TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 + + pmaddwd m4, m7, [r3 + 16] ; [17] + paddd m4, [pd_16] + psrld m4, 5 + + pmaddwd m1, m7, [r3 + 10 * 16] ; [26] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + palignr m1, m2, m3, 12 + pmaddwd m5, m1, [r3 - 13 * 16] ; [3] + paddd m5, [pd_16] + psrld m5, 5 + + pmaddwd m6, m1, [r3 - 4 * 16] ; 
[12] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + pmaddwd m6, m1, [r3 + 5 * 16] ; [21] + paddd m6, [pd_16] + psrld m6, 5 + + pmaddwd m1, [r3 + 14 * 16] ; [30] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + pmaddwd m1, m2, [r3 - 9 * 16] ; [7] + paddd m1, [pd_16] + psrld m1, 5 + + pmaddwd m0, m2, [r3] ; [16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m1, m0 + + TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1 + + pmaddwd m4, m2, [r3 + 9 * 16] ; [25] + paddd m4, [pd_16] + psrld m4, 5 + + movu m7, [r2 + 18] ; [16 15 14 13 12 11 10 9] + palignr m1, m7, 2 ; [x 16 15 14 13 12 11 10] + punpcklwd m7, m1 ; [13 12 12 11 11 10 10 9] + + palignr m6, m7, m2, 4 + pmaddwd m1, m6, [r3 - 14 * 16] ; [2] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pmaddwd m5, m6, [r3 - 5 * 16] ; [11] + paddd m5, [pd_16] + psrld m5, 5 + + pmaddwd m0, m6, [r3 + 4 * 16] ; [20] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m5, m0 + + pmaddwd m6, [r3 + 13 * 16] ; [29] + paddd m6, [pd_16] + psrld m6, 5 + + palignr m0, m7, m2, 8 + pmaddwd m1, m0, [r3 - 10 * 16] ; [6] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + pmaddwd m1, m0, [r3 - 16] ; [15] + paddd m1, [pd_16] + psrld m1, 5 + + pmaddwd m0, [r3 + 8 * 16] ; [24] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m1, m0 + + TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1 + + palignr m0, m7, m2, 12 + pmaddwd m4, m0, [r3 - 15 * 16] ; [1] + paddd m4, [pd_16] + psrld m4, 5 + + pmaddwd m1, m0, [r3 - 6 * 16] ; [10] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pmaddwd m5, m0, [r3 + 3 * 16] ; [19] + paddd m5, [pd_16] + psrld m5, 5 + + pmaddwd m0, [r3 + 12 * 16] ; [28] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m5, m0 + + pmaddwd m6, m7, [r3 - 11 * 16] ; [5] + paddd m6, [pd_16] + psrld m6, 5 + + pmaddwd m0, m7, [r3 - 2 * 16] ; [14] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m6, m0 + + pmaddwd m1, m7, [r3 + 7 * 16] ; [23] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m1, m1 + movhps m1, [r2 + 20] ; [00] + + TRANSPOSE_STORE_8x8 
48, %1, m4, m5, m6, m1 +%endmacro +;------------------------------------------------------------------------------------------------------------------ +; void intraPredAng32_7(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +;------------------------------------------------------------------------------------------------------------------ +INIT_XMM sse4 +cglobal intra_pred_ang32_7, 3,6,8 + lea r3, [ang_table + 16 * 16] + mov r4d, 8 + add r1, r1 + lea r5, [r1 * 3] + +.loop: + MODE_7_29 1 + lea r0, [r0 + r1 * 4 ] + add r2, 8 + dec r4 + jnz .loop + RET + +%macro MODE_8_28 1 + movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] + movd m3, [r2 + 18] ; [16 15 14 13 12 11 10 9] + palignr m1, m3, m0, 2 ; [9 8 7 6 5 4 3 2] + punpckhwd m2, m0, m1 ; [9 8 8 7 7 6 6 5] + punpcklwd m0, m1 ; [5 4 4 3 3 2 2 1] + + pmaddwd m4, m0, [r3 - 11 * 16] ; [5] + paddd m4, [pd_16] + psrld m4, 5 + + pmaddwd m1, m0, [r3 - 6 * 16] ; [10] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pmaddwd m5, m0, [r3 - 16] ; [15] + paddd m5, [pd_16] + psrld m5, 5 + + pmaddwd m6, m0, [r3 + 4 * 16] ; [20] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + pmaddwd m6, m0, [r3 + 9 * 16] ; [25] + paddd m6, [pd_16] + psrld m6, 5 + + pmaddwd m1, m0, [r3 + 14 * 16] ; [30] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + palignr m7, m2, m0, 4 + pmaddwd m1, m7, [r3 - 13 * 16] ; [3] + paddd m1, [pd_16] + psrld m1, 5 + + mova m3, m0 + pmaddwd m0, m7, [r3 - 8 * 16] ; [8] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m1, m0 + + TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 + + pmaddwd m4, m7, [r3 - 3 * 16] ; [13] + paddd m4, [pd_16] + psrld m4, 5 + + pmaddwd m1, m7, [r3 + 2 * 16] ; [18] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pmaddwd m5, m7, [r3 + 7 * 16] ; [23] + paddd m5, [pd_16] + psrld m5, 5 + + pmaddwd m6, m7, [r3 + 12 * 16] ; [28] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + palignr m7, m2, m3, 8 + pmaddwd m6, m7, [r3 - 15 * 16] ; [1] + paddd 
m6, [pd_16] + psrld m6, 5 + + pmaddwd m1, m7, [r3 - 10 * 16] ; [6] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + pmaddwd m1, m7, [r3 - 5 * 16] ; [11] + paddd m1, [pd_16] + psrld m1, 5 + + pmaddwd m0, m7, [r3] ; [16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m1, m0 + + TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1 + + pmaddwd m4, m7, [r3 + 5 * 16] ; [21] + paddd m4, [pd_16] + psrld m4, 5 + + pmaddwd m1, m7, [r3 + 10 * 16] ; [26] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pmaddwd m5, m7, [r3 + 15 * 16] ; [31] + paddd m5, [pd_16] + psrld m5, 5 + + palignr m7, m2, m3, 12 + pmaddwd m0, m7, [r3 - 12 * 16] ; [4] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m5, m0 + + pmaddwd m6, m7, [r3 - 7 * 16] ; [9] + paddd m6, [pd_16] + psrld m6, 5 + + pmaddwd m1, m7, [r3 - 2 * 16] ; [14] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + pmaddwd m1, m7, [r3 + 3 * 16] ; [19] + paddd m1, [pd_16] + psrld m1, 5 + + pmaddwd m0, m7, [r3 + 8 * 16] ; [24] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m1, m0 + + TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1 + + pmaddwd m4, m7, [r3 + 13 * 16] ; [29] + paddd m4, [pd_16] + psrld m4, 5 + + pmaddwd m1, m2, [r3 - 14 * 16] ; [2] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pmaddwd m5, m2, [r3 - 9 * 16] ; [7] + paddd m5, [pd_16] + psrld m5, 5 + + pmaddwd m0, m2, [r3 - 4 * 16] ; [12] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m5, m0 + + pmaddwd m6, m2, [r3 + 16] ; [17] + paddd m6, [pd_16] + psrld m6, 5 + + pmaddwd m0, m2, [r3 + 6 * 16] ; [22] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m6, m0 + + pmaddwd m1, m2, [r3 + 11 * 16] ; [27] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m1, m1 + movhps m1, [r2 + 12] ; [00] + + TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1 +%endmacro +;------------------------------------------------------------------------------------------------------------------ +; void intraPredAng32_8(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) 
+;------------------------------------------------------------------------------------------------------------------ +INIT_XMM sse4 +cglobal intra_pred_ang32_8, 3,6,8 + lea r3, [ang_table + 16 * 16] + mov r4d, 8 + add r1, r1 + lea r5, [r1 * 3] + +.loop: + MODE_8_28 1 + lea r0, [r0 + r1 * 4 ] + add r2, 8 + dec r4 + jnz .loop + RET + +%macro MODE_9_27 1 + movu m3, [r2 + 2] ; [8 7 6 5 4 3 2 1] + palignr m1, m3, 2 ; [9 8 7 6 5 4 3 2] + punpckhwd m2, m3, m1 ; [9 8 8 7 7 6 6 5] + punpcklwd m3, m1 ; [5 4 4 3 3 2 2 1] + + pmaddwd m4, m3, [r3 - 14 * 16] ; [2] + paddd m4, [pd_16] + psrld m4, 5 + + pmaddwd m1, m3, [r3 - 12 * 16] ; [4] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pmaddwd m5, m3, [r3 - 10 * 16] ; [6] + paddd m5, [pd_16] + psrld m5, 5 + + pmaddwd m6, m3, [r3 - 8 * 16] ; [8] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + pmaddwd m6, m3, [r3 - 6 * 16] ; [10] + paddd m6, [pd_16] + psrld m6, 5 + + pmaddwd m1, m3, [r3 - 4 * 16] ; [12] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + pmaddwd m1, m3, [r3 - 2 * 16] ; [14] + paddd m1, [pd_16] + psrld m1, 5 + + pmaddwd m0, m3, [r3] ; [16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m1, m0 + + TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 + + pmaddwd m4, m3, [r3 + 2 * 16] ; [18] + paddd m4, [pd_16] + psrld m4, 5 + + pmaddwd m1, m3, [r3 + 4 * 16] ; [20] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pmaddwd m5, m3, [r3 + 6 * 16] ; [22] + paddd m5, [pd_16] + psrld m5, 5 + + pmaddwd m6, m3, [r3 + 8 * 16] ; [24] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + pmaddwd m6, m3, [r3 + 10 * 16] ; [26] + paddd m6, [pd_16] + psrld m6, 5 + + pmaddwd m1, m3, [r3 + 12 * 16] ; [28] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + pmaddwd m1, m3, [r3 + 14 * 16] ; [30] + paddd m1, [pd_16] + psrld m1, 5 + + packusdw m1, m1 + movhps m1, [r2 + 4] ; [00] + + TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1 + + palignr m7, m2, m3, 4 + pmaddwd m4, m7, [r3 - 14 * 16] ; [2] + paddd m4, [pd_16] + psrld m4, 
5 + + pmaddwd m1, m7, [r3 - 12 * 16] ; [4] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pmaddwd m5, m7, [r3 - 10 * 16] ; [6] + paddd m5, [pd_16] + psrld m5, 5 + + pmaddwd m0, m7, [r3 - 8 * 16] ; [8] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m5, m0 + + pmaddwd m6, m7, [r3 - 6 * 16] ; [10] + paddd m6, [pd_16] + psrld m6, 5 + + pmaddwd m1, m7, [r3 - 4 * 16] ; [12] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + pmaddwd m1, m7, [r3 - 2 * 16] ; [14] + paddd m1, [pd_16] + psrld m1, 5 + + pmaddwd m0, m7, [r3] ; [16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m1, m0 + + TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1 + + pmaddwd m4, m7, [r3 + 2 * 16] ; [18] + paddd m4, [pd_16] + psrld m4, 5 + + pmaddwd m1, m7, [r3 + 4 * 16] ; [20] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pmaddwd m5, m7, [r3 + 6 * 16] ; [22] + paddd m5, [pd_16] + psrld m5, 5 + + pmaddwd m0, m7, [r3 + 8 * 16] ; [24] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m5, m0 + + pmaddwd m6, m7, [r3 + 10 * 16] ; [26] + paddd m6, [pd_16] + psrld m6, 5 + + pmaddwd m0, m7, [r3 + 12 * 16] ; [28] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m6, m0 + + pmaddwd m7, [r3 + 14 * 16] ; [30] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m7, m7 + movhps m7, [r2 + 6] ; [00] + + TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m7 +%endmacro +;------------------------------------------------------------------------------------------------------------------ +; void intraPredAng32_9(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +;------------------------------------------------------------------------------------------------------------------ +INIT_XMM sse4 +cglobal intra_pred_ang32_9, 3,6,8 + lea r3, [ang_table + 16 * 16] + mov r4d, 8 + add r1, r1 + lea r5, [r1 * 3] + +.loop: + MODE_9_27 1 + lea r0, [r0 + r1 * 4 ] + add r2, 8 + dec r4 + jnz .loop + RET + 
+;------------------------------------------------------------------------------------------------------------------ +; void intraPredAng32_10(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +;------------------------------------------------------------------------------------------------------------------ +INIT_XMM sse4 +cglobal intra_pred_ang32_10, 4,7,8 + mov r6d, 4 + add r1, r1 + lea r5, [r1 * 3] + lea r4, [r1 * 2] + lea r3, [r1 * 4] + mova m7, [c_mode32_10_0] + +.loop: + movu m0, [r2 + 2] + pshufb m1, m0, m7 + movu [r0], m1 + movu [r0 + 16], m1 + movu [r0 + 32], m1 + movu [r0 + 48], m1 + + palignr m1, m0, 2 + pshufb m1, m7 + movu [r0 + r1], m1 + movu [r0 + r1 + 16], m1 + movu [r0 + r1 + 32], m1 + movu [r0 + r1 + 48], m1 + + palignr m1, m0, 4 + pshufb m1, m7 + movu [r0 + r4], m1 + movu [r0 + r4 + 16], m1 + movu [r0 + r4 + 32], m1 + movu [r0 + r4 + 48], m1 + + palignr m1, m0, 6 + pshufb m1, m7 + movu [r0 + r5], m1 + movu [r0 + r5 + 16], m1 + movu [r0 + r5 + 32], m1 + movu [r0 + r5 + 48], m1 + + add r0, r3 + + palignr m1, m0, 8 + pshufb m1, m7 + movu [r0], m1 + movu [r0 + 16], m1 + movu [r0 + 32], m1 + movu [r0 + 48], m1 + + palignr m1, m0, 10 + pshufb m1, m7 + movu [r0 + r1], m1 + movu [r0 + r1 + 16], m1 + movu [r0 + r1 + 32], m1 + movu [r0 + r1 + 48], m1 + + palignr m1, m0, 12 + pshufb m1, m7 + movu [r0 + r4], m1 + movu [r0 + r4 + 16], m1 + movu [r0 + r4 + 32], m1 + movu [r0 + r4 + 48], m1 + + palignr m1, m0, 14 + pshufb m1, m7 + movu [r0 + r5], m1 + movu [r0 + r5 + 16], m1 + movu [r0 + r5 + 32], m1 + movu [r0 + r5 + 48], m1 + + add r0, r3 + add r2, 16 + dec r6d + jnz .loop + RET + +%macro MODE_11_25 1 + movu m3, [r2 + 2] ; [7 6 5 4 3 2 1 0] + pshufb m3, [pw_punpcklwd] ; [4 3 3 2 2 1 1 0] + + pmaddwd m4, m3, [r3 + 14 * 16] ; [30] + paddd m4, [pd_16] + psrld m4, 5 + + pmaddwd m1, m3, [r3 + 12 * 16] ; [28] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pmaddwd m5, m3, [r3 + 10 * 16] ; [26] + paddd m5, [pd_16] + 
psrld m5, 5 + + pmaddwd m6, m3, [r3 + 8 * 16] ; [24] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + pmaddwd m6, m3, [r3 + 6 * 16] ; [22] + paddd m6, [pd_16] + psrld m6, 5 + + pmaddwd m1, m3, [r3 + 4 * 16] ; [20] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + pmaddwd m1, m3, [r3 + 2 * 16] ; [18] + paddd m1, [pd_16] + psrld m1, 5 + + pmaddwd m0, m3, [r3] ; [16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m1, m0 + + TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 + + pmaddwd m4, m3, [r3 - 2 * 16] ; [14] + paddd m4, [pd_16] + psrld m4, 5 + + pmaddwd m1, m3, [r3 - 4 * 16] ; [12] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pmaddwd m5, m3, [r3 - 6 * 16] ; [10] + paddd m5, [pd_16] + psrld m5, 5 + + pmaddwd m6, m3, [r3 - 8 * 16] ; [8] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + pmaddwd m6, m3, [r3 - 10 * 16] ; [6] + paddd m6, [pd_16] + psrld m6, 5 + + pmaddwd m1, m3, [r3 - 12 * 16] ; [4] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + pmaddwd m1, m3, [r3 - 14 * 16] ; [2] + paddd m1, [pd_16] + psrld m1, 5 + + packusdw m1, m1 + movhps m1, [r2 + 2] ; [00] + + TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1 + + movu m3, [r2] ; [6 5 4 3 2 1 0 16] + pshufb m3, [pw_punpcklwd] ; [3 2 2 1 1 0 0 16] + + pmaddwd m4, m3, [r3 + 14 * 16] ; [30] + paddd m4, [pd_16] + psrld m4, 5 + + pmaddwd m1, m3, [r3 + 12 * 16] ; [28] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pmaddwd m5, m3, [r3 + 10 * 16] ; [26] + paddd m5, [pd_16] + psrld m5, 5 + + pmaddwd m0, m3, [r3 + 8 * 16] ; [24] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m5, m0 + + pmaddwd m6, m3, [r3 + 6 * 16] ; [22] + paddd m6, [pd_16] + psrld m6, 5 + + pmaddwd m1, m3, [r3 + 4 * 16] ; [20] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + pmaddwd m1, m3, [r3 + 2 * 16] ; [18] + paddd m1, [pd_16] + psrld m1, 5 + + pmaddwd m0, m3, [r3] ; [16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m1, m0 + + TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1 + + pmaddwd m4, m3, [r3 - 2 * 16] ; [14] + 
paddd m4, [pd_16] + psrld m4, 5 + + pmaddwd m1, m3, [r3 - 4 * 16] ; [12] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pmaddwd m5, m3, [r3 - 6 * 16] ; [10] + paddd m5, [pd_16] + psrld m5, 5 + + pmaddwd m6, m3, [r3 - 8 * 16] ; [8] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + pmaddwd m6, m3, [r3 - 10 * 16] ; [6] + paddd m6, [pd_16] + psrld m6, 5 + + pmaddwd m1, m3, [r3 - 12 * 16] ; [4] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + pmaddwd m1, m3, [r3 - 14 * 16] ; [2] + paddd m1, [pd_16] + psrld m1, 5 + + packusdw m1, m1 + movhps m1, [r2] ; [00] + + TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1 +%endmacro +;------------------------------------------------------------------------------------------------------------------ +; void intraPredAng32_11(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +;------------------------------------------------------------------------------------------------------------------ +INIT_XMM sse4 +cglobal intra_pred_ang32_11, 4,6,7,0-(4*mmsize+4) + movu m0, [r2 + 0*mmsize] + movu m1, [r2 + 1*mmsize] + movu m2, [r2 + 2*mmsize] + movu m3, [r2 + 3*mmsize] + movu [rsp + 0*mmsize + 2], m0 + movu [rsp + 1*mmsize + 2], m1 + movu [rsp + 2*mmsize + 2], m2 + movu [rsp + 3*mmsize + 2], m3 + mov r4w, [r3+32] + mov [rsp], r4w + mov r4w, [r2+64] + mov [rsp+66], r4w + + lea r3, [ang_table + 16 * 16] + mov r4d, 8 + mov r2, rsp + add r1, r1 + lea r5, [r1 * 3] + +.loop: + MODE_11_25 1 + lea r0, [r0 + r1 * 4 ] + add r2, 8 + dec r4 + jnz .loop + RET + +%macro MODE_12_24 1 + movu m3, [r2 + 8] ; [7 6 5 4 3 2 1 0] + pshufb m3, m2 ; [4 3 3 2 2 1 1 0] + + pmaddwd m4, m3, [r3 + 11 * 16] ; [27] + paddd m4, [pd_16] + psrld m4, 5 + + pmaddwd m1, m3, [r3 + 6 * 16] ; [22] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pmaddwd m5, m3, [r3 + 16] ; [17] + paddd m5, [pd_16] + psrld m5, 5 + + pmaddwd m6, m3, [r3 - 4 * 16] ; [12] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + pmaddwd m6, m3, 
[r3 - 9 * 16] ; [7] + paddd m6, [pd_16] + psrld m6, 5 + + pmaddwd m1, m3, [r3 - 14 * 16] ; [2] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + movu m3, [r2 + 6] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 + 13 * 16] ; [29] + paddd m1, [pd_16] + psrld m1, 5 + + pmaddwd m0, m3, [r3 + 8 * 16] ; [24] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m1, m0 + + TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 + + pmaddwd m4, m3, [r3 + 3 * 16] ; [19] + paddd m4, [pd_16] + psrld m4, 5 + + pmaddwd m1, m3, [r3 - 2 * 16] ; [14] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pmaddwd m5, m3, [r3 - 7 * 16] ; [9] + paddd m5, [pd_16] + psrld m5, 5 + + pmaddwd m6, m3, [r3 - 12 * 16] ; [4] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + movu m3, [r2 + 4] + pshufb m3, m2 + + pmaddwd m6, m3, [r3 + 15 * 16] ; [31] + paddd m6, [pd_16] + psrld m6, 5 + + pmaddwd m1, m3, [r3 + 10 * 16] ; [26] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + pmaddwd m1, m3, [r3 + 5 * 16] ; [21] + paddd m1, [pd_16] + psrld m1, 5 + + pmaddwd m0, m3, [r3] ; [16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m1, m0 + + TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1 + + pmaddwd m4, m3, [r3 - 5 * 16] ; [11] + paddd m4, [pd_16] + psrld m4, 5 + + pmaddwd m1, m3, [r3 - 10 * 16] ; [6] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pmaddwd m5, m3, [r3 - 15 * 16] ; [1] + paddd m5, [pd_16] + psrld m5, 5 + + movu m3, [r2 + 2] + pshufb m3, m2 + + pmaddwd m0, m3, [r3 + 12 * 16] ; [28] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m5, m0 + + pmaddwd m6, m3, [r3 + 7 * 16] ; [23] + paddd m6, [pd_16] + psrld m6, 5 + + pmaddwd m1, m3, [r3 + 2 * 16] ; [18] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + pmaddwd m1, m3, [r3 - 3 * 16] ; [13] + paddd m1, [pd_16] + psrld m1, 5 + + pmaddwd m0, m3, [r3 - 8 * 16] ; [8] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m1, m0 + + TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1 + + pmaddwd m4, m3, [r3 - 13 * 16] ; [3] + paddd m4, [pd_16] + psrld m4, 5 + + movu m3, [r2] 
+ pshufb m3, m2 + + pmaddwd m1, m3, [r3 + 14 * 16] ; [30] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pmaddwd m5, m3, [r3 + 9 * 16] ; [25] + paddd m5, [pd_16] + psrld m5, 5 + + pmaddwd m6, m3, [r3 + 4 * 16] ; [20] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + pmaddwd m6, m3, [r3 - 16] ; [15] + paddd m6, [pd_16] + psrld m6, 5 + + pmaddwd m1, m3, [r3 - 6 * 16] ; [10] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + pmaddwd m1, m3, [r3 - 11 * 16] ; [5] + paddd m1, [pd_16] + psrld m1, 5 + + packusdw m1, m1 + movhps m1, [r2] ; [00] + + TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1 +%endmacro +;------------------------------------------------------------------------------------------------------------------ +; void intraPredAng32_12(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +;------------------------------------------------------------------------------------------------------------------ +INIT_XMM sse4 +cglobal intra_pred_ang32_12, 4,6,7,0-(4*mmsize+10) + movu m0, [r2 + 0*mmsize] + movu m1, [r2 + 1*mmsize] + movu m2, [r2 + 2*mmsize] + movu m3, [r2 + 3*mmsize] + movu [rsp + 0*mmsize + 8], m0 + movu [rsp + 1*mmsize + 8], m1 + movu [rsp + 2*mmsize + 8], m2 + movu [rsp + 3*mmsize + 8], m3 + + mov r4w, [r2+64] + mov [rsp+72], r4w + mov r4w, [r3+12] + mov [rsp+6], r4w + mov r4w, [r3+26] + mov [rsp+4], r4w + mov r4w, [r3+38] + mov [rsp+2], r4w + mov r4w, [r3+52] + mov [rsp], r4w + + lea r3, [ang_table + 16 * 16] + mov r4d, 8 + mov r2, rsp + add r1, r1 + lea r5, [r1 * 3] + mova m2, [pw_punpcklwd] + +.loop: + MODE_12_24 1 + lea r0, [r0 + r1 * 4 ] + add r2, 8 + dec r4 + jnz .loop + RET + +%macro MODE_13_23 1 + movu m3, [r2 + 16] ; [7 6 5 4 3 2 1 0] + pshufb m3, m2 ; [4 3 3 2 2 1 1 0] + + pmaddwd m4, m3, [r3 + 7 * 16] ; [23] + paddd m4, [pd_16] + psrld m4, 5 + + pmaddwd m1, m3, [r3 - 2 * 16] ; [14] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pmaddwd m5, m3, [r3 - 11 * 16] ; [05] + paddd m5, 
[pd_16] + psrld m5, 5 + + movu m3, [r2 + 14] + pshufb m3, m2 + + pmaddwd m6, m3, [r3 + 12 * 16] ; [28] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + pmaddwd m6, m3, [r3 + 3 * 16] ; [19] + paddd m6, [pd_16] + psrld m6, 5 + + pmaddwd m1, m3, [r3 - 6 * 16] ; [10] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + pmaddwd m1, m3, [r3 - 15 * 16] ; [01] + paddd m1, [pd_16] + psrld m1, 5 + + movu m3, [r2 + 12] + pshufb m3, m2 + + pmaddwd m0, m3, [r3 + 8 * 16] ; [24] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m1, m0 + + TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 + + pmaddwd m4, m3, [r3 - 16] ; [15] + paddd m4, [pd_16] + psrld m4, 5 + + pmaddwd m1, m3, [r3 - 10 * 16] ; [06] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + movu m3, [r2 + 10] + pshufb m3, m2 + + pmaddwd m5, m3, [r3 + 13 * 16] ; [29] + paddd m5, [pd_16] + psrld m5, 5 + + pmaddwd m6, m3, [r3 + 4 * 16] ; [20] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + pmaddwd m6, m3, [r3 - 5 * 16] ; [11] + paddd m6, [pd_16] + psrld m6, 5 + + pmaddwd m1, m3, [r3 - 14 * 16] ; [02] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + movu m3, [r2 + 8] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 + 9 * 16] ; [25] + paddd m1, [pd_16] + psrld m1, 5 + + pmaddwd m0, m3, [r3] ; [16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m1, m0 + + TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1 + + pmaddwd m4, m3, [r3 - 9 * 16] ; [07] + paddd m4, [pd_16] + psrld m4, 5 + + movu m3, [r2 + 6] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 + 14 * 16] ; [30] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pmaddwd m5, m3, [r3 + 5 * 16] ; [21] + paddd m5, [pd_16] + psrld m5, 5 + + pmaddwd m0, m3, [r3 - 4 * 16] ; [12] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m5, m0 + + pmaddwd m6, m3, [r3 - 13 * 16] ; [03] + paddd m6, [pd_16] + psrld m6, 5 + + movu m3, [r2 + 4] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 + 10 * 16] ; [26] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + pmaddwd m1, m3, [r3 + 16] ; [17] + paddd m1, 
[pd_16] + psrld m1, 5 + + pmaddwd m0, m3, [r3 - 8 * 16] ; [08] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m1, m0 + + TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1 + + movu m3, [r2 + 2] + pshufb m3, m2 + + pmaddwd m4, m3, [r3 + 15 * 16] ; [31] + paddd m4, [pd_16] + psrld m4, 5 + + pmaddwd m1, m3, [r3 + 6 * 16] ; [22] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pmaddwd m5, m3, [r3 - 3 * 16] ; [13] + paddd m5, [pd_16] + psrld m5, 5 + + pmaddwd m6, m3, [r3 - 12 * 16] ; [04] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + movu m3, [r2] + pshufb m3, m2 + + pmaddwd m6, m3, [r3 + 11 * 16] ; [27] + paddd m6, [pd_16] + psrld m6, 5 + + pmaddwd m1, m3, [r3 + 2 * 16] ; [18] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + pmaddwd m1, m3, [r3 - 7 * 16] ; [09] + paddd m1, [pd_16] + psrld m1, 5 + + packusdw m1, m1 + movhps m1, [r2] ; [00] + + TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1 +%endmacro +;------------------------------------------------------------------------------------------------------------------ +; void intraPredAng32_13(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +;------------------------------------------------------------------------------------------------------------------ +INIT_XMM sse4 +cglobal intra_pred_ang32_13, 4,6,7,0-(5*mmsize+2) + movu m0, [r2 + 0*mmsize] + movu m1, [r2 + 1*mmsize] + movu m2, [r2 + 2*mmsize] + movu m3, [r2 + 3*mmsize] + movu [rsp + 1*mmsize], m0 + movu [rsp + 2*mmsize], m1 + movu [rsp + 3*mmsize], m2 + movu [rsp + 4*mmsize], m3 + + mov r4w, [r2+64] + mov [rsp+80], r4w + movu m0, [r3 + 8] + movu m1, [r3 + 36] + pshufb m0, [shuf_mode_13_23] + pshufb m1, [shuf_mode_13_23] + movh [rsp + 8], m0 + movh [rsp], m1 + mov r4w, [r3+28] + mov [rsp+8], r4w + mov r4w, [r3+56] + mov [rsp], r4w + + lea r3, [ang_table + 16 * 16] + mov r4d, 8 + mov r2, rsp + add r1, r1 + lea r5, [r1 * 3] + mova m2, [pw_punpcklwd] + +.loop: + MODE_13_23 1 + lea r0, [r0 + r1 * 4 ] + add r2, 8 + dec 
r4 + jnz .loop + RET + +%macro MODE_14_22 1 + movu m3, [r2 + 24] ; [7 6 5 4 3 2 1 0] + pshufb m3, m2 ; [4 3 3 2 2 1 1 0] + + pmaddwd m4, m3, [r3 + 3 * 16] ; [19] + paddd m4, [pd_16] + psrld m4, 5 + + pmaddwd m1, m3, [r3 - 10 * 16] ; [06] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + movu m3, [r2 + 22] + pshufb m3, m2 + + pmaddwd m5, m3, [r3 + 9 * 16] ; [25] + paddd m5, [pd_16] + psrld m5, 5 + + pmaddwd m6, m3, [r3 - 4 * 16] ; [12] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + movu m3, [r2 + 20] + pshufb m3, m2 + + pmaddwd m6, m3, [r3 + 15 * 16] ; [31] + paddd m6, [pd_16] + psrld m6, 5 + + pmaddwd m1, m3, [r3 + 2 * 16] ; [18] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + pmaddwd m1, m3, [r3 - 11 * 16] ; [05] + paddd m1, [pd_16] + psrld m1, 5 + + movu m3, [r2 + 18] + pshufb m3, m2 + + pmaddwd m0, m3, [r3 + 8 * 16] ; [24] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m1, m0 + + TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 + + pmaddwd m4, m3, [r3 - 5 * 16] ; [11] + paddd m4, [pd_16] + psrld m4, 5 + + movu m3, [r2 + 16] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 + 14 * 16] ; [30] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pmaddwd m5, m3, [r3 + 16] ; [17] + paddd m5, [pd_16] + psrld m5, 5 + + pmaddwd m6, m3, [r3 - 12 * 16] ; [04] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + movu m3, [r2 + 14] + pshufb m3, m2 + + pmaddwd m6, m3, [r3 + 7 * 16] ; [23] + paddd m6, [pd_16] + psrld m6, 5 + + pmaddwd m1, m3, [r3 - 6 * 16] ; [10] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + movu m3, [r2 + 12] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 + 13 * 16] ; [29] + paddd m1, [pd_16] + psrld m1, 5 + + pmaddwd m0, m3, [r3] ; [16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m1, m0 + + TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1 + + pmaddwd m4, m3, [r3 - 13 * 16] ; [03] + paddd m4, [pd_16] + psrld m4, 5 + + movu m3, [r2 + 10] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 + 6 * 16] ; [22] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + 
pmaddwd m5, m3, [r3 - 7 * 16] ; [09] + paddd m5, [pd_16] + psrld m5, 5 + + movu m3, [r2 + 8] + pshufb m3, m2 + + pmaddwd m0, m3, [r3 + 12 * 16] ; [28] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m5, m0 + + pmaddwd m6, m3, [r3 - 16] ; [15] + paddd m6, [pd_16] + psrld m6, 5 + + pmaddwd m1, m3, [r3 - 14 * 16] ; [02] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + movu m3, [r2 + 6] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 + 5 * 16] ; [21] + paddd m1, [pd_16] + psrld m1, 5 + + pmaddwd m0, m3, [r3 - 8 * 16] ; [08] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m1, m0 + + TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1 + + movu m3, [r2 + 4] + pshufb m3, m2 + + pmaddwd m4, m3, [r3 + 11 * 16] ; [27] + paddd m4, [pd_16] + psrld m4, 5 + + pmaddwd m1, m3, [r3 - 2 * 16] ; [14] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pmaddwd m5, m3, [r3 - 15 * 16] ; [01] + paddd m5, [pd_16] + psrld m5, 5 + + movu m3, [r2 + 2] + pshufb m3, m2 + + pmaddwd m6, m3, [r3 + 4 * 16] ; [20] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + pmaddwd m6, m3, [r3 - 9 * 16] ; [07] + paddd m6, [pd_16] + psrld m6, 5 + + movu m3, [r2] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 + 10 * 16] ; [26] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + pmaddwd m1, m3, [r3 - 3 * 16] ; [13] + paddd m1, [pd_16] + psrld m1, 5 + + packusdw m1, m1 + movhps m1, [r2] ; [00] + + TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1 +%endmacro +;------------------------------------------------------------------------------------------------------------------ +; void intraPredAng32_14(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +;------------------------------------------------------------------------------------------------------------------ +INIT_XMM sse4 +cglobal intra_pred_ang32_14, 4,6,7,0-(5*mmsize+10) + movu m0, [r2 + 0*mmsize] + movu m1, [r2 + 1*mmsize] + movu m2, [r2 + 2*mmsize] + movu m3, [r2 + 3*mmsize] + movu [rsp + 1*mmsize + 8], m0 + movu [rsp + 2*mmsize + 
8], m1 + movu [rsp + 3*mmsize + 8], m2 + movu [rsp + 4*mmsize + 8], m3 + + mov r4w, [r2 + 64] + mov [rsp + 88], r4w + mov r4w, [r3+4] + mov [rsp+22], r4w + movu m0, [r3 + 10] + movu m1, [r3 + 30] + movu m2, [r3 + 50] + pshufb m0, [shuf_mode_14_22] + pshufb m1, [shuf_mode_14_22] + pshufb m2, [shuf_mode_14_22] + movh [rsp + 14], m0 + movh [rsp + 6], m1 + movh [rsp - 2], m2 + + lea r3, [ang_table + 16 * 16] + mov r4d, 8 + mov r2, rsp + add r1, r1 + lea r5, [r1 * 3] + mova m2, [pw_punpcklwd] + +.loop: + MODE_14_22 1 + lea r0, [r0 + r1 * 4 ] + add r2, 8 + dec r4 + jnz .loop + RET + +%macro MODE_15_21 1 + movu m3, [r2 + 32] ; [7 6 5 4 3 2 1 0] + pshufb m3, m2 ; [4 3 3 2 2 1 1 0] + + pmaddwd m4, m3, [r3 - 16] ; [15] + paddd m4, [pd_16] + psrld m4, 5 + + movu m3, [r2 + 30] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 + 14 * 16] ; [30] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pmaddwd m5, m3, [r3 - 3 * 16] ; [13] + paddd m5, [pd_16] + psrld m5, 5 + + movu m3, [r2 + 28] + pshufb m3, m2 + + pmaddwd m6, m3, [r3 + 12 * 16] ; [28] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + pmaddwd m6, m3, [r3 - 5 * 16] ; [11] + paddd m6, [pd_16] + psrld m6, 5 + + movu m3, [r2 + 26] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 + 10 * 16] ; [26] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + pmaddwd m1, m3, [r3 - 7 * 16] ; [09] + paddd m1, [pd_16] + psrld m1, 5 + + movu m3, [r2 + 24] + pshufb m3, m2 + + pmaddwd m0, m3, [r3 + 8 * 16] ; [24] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m1, m0 + + TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 + + pmaddwd m4, m3, [r3 - 9 * 16] ; [07] + paddd m4, [pd_16] + psrld m4, 5 + + movu m3, [r2 + 22] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 + 6 * 16] ; [22] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pmaddwd m5, m3, [r3 - 11 * 16] ; [05] + paddd m5, [pd_16] + psrld m5, 5 + + movu m3, [r2 + 20] + pshufb m3, m2 + + pmaddwd m6, m3, [r3 + 4 * 16] ; [20] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + pmaddwd m6, m3, [r3 - 13 
* 16] ; [03] + paddd m6, [pd_16] + psrld m6, 5 + + movu m3, [r2 + 18] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 + 2 * 16] ; [18] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + pmaddwd m1, m3, [r3 - 15 * 16] ; [01] + paddd m1, [pd_16] + psrld m1, 5 + + movu m3, [r2 + 16] + pshufb m3, m2 + + pmaddwd m0, m3, [r3] ; [16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m1, m0 + + TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1 + + movu m3, [r2 + 14] + pshufb m3, m2 + + pmaddwd m4, m3, [r3 + 15 * 16] ; [31] + paddd m4, [pd_16] + psrld m4, 5 + + pmaddwd m1, m3, [r3 - 2 * 16] ; [14] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + movu m3, [r2 + 12] + pshufb m3, m2 + + pmaddwd m5, m3, [r3 + 13 * 16] ; [29] + paddd m5, [pd_16] + psrld m5, 5 + + pmaddwd m0, m3, [r3 - 4 * 16] ; [12] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m5, m0 + + movu m3, [r2 + 10] + pshufb m3, m2 + + pmaddwd m6, m3, [r3 + 11 * 16] ; [27] + paddd m6, [pd_16] + psrld m6, 5 + + pmaddwd m1, m3, [r3 - 6 * 16] ; [10] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + movu m3, [r2 + 8] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 + 9 * 16] ; [25] + paddd m1, [pd_16] + psrld m1, 5 + + pmaddwd m0, m3, [r3 - 8 * 16] ; [08] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m1, m0 + + TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1 + + movu m3, [r2 + 6] + pshufb m3, m2 + + pmaddwd m4, m3, [r3 + 7 * 16] ; [23] + paddd m4, [pd_16] + psrld m4, 5 + + pmaddwd m1, m3, [r3 - 10 * 16] ; [06] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + movu m3, [r2 + 4] + pshufb m3, m2 + + pmaddwd m5, m3, [r3 + 5 * 16] ; [21] + paddd m5, [pd_16] + psrld m5, 5 + + pmaddwd m6, m3, [r3 - 12 * 16] ; [04] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + movu m3, [r2 + 2] + pshufb m3, m2 + + pmaddwd m6, m3, [r3 + 3 * 16] ; [19] + paddd m6, [pd_16] + psrld m6, 5 + + pmaddwd m1, m3, [r3 - 14 * 16] ; [02] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + movu m3, [r2] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 + 16] ; [17] + 
paddd m1, [pd_16] + psrld m1, 5 + + packusdw m1, m1 + movhps m1, [r2] ; [00] + + TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1 +%endmacro +;------------------------------------------------------------------------------------------------------------------ +; void intraPredAng32_15(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +;------------------------------------------------------------------------------------------------------------------ +INIT_XMM sse4 +cglobal intra_pred_ang32_15, 4,6,7,0-(6*mmsize+2) + movu m0, [r2 + 0*mmsize] + movu m1, [r2 + 1*mmsize] + movu m2, [r2 + 2*mmsize] + movu m3, [r2 + 3*mmsize] + movu [rsp + 2*mmsize], m0 + movu [rsp + 3*mmsize], m1 + movu [rsp + 4*mmsize], m2 + movu [rsp + 5*mmsize], m3 + + mov r4w, [r2 + 64] + mov [rsp + 96], r4w + movu m0, [r3 + 4] + movu m1, [r3 + 18] + movu m2, [r3 + 34] + movu m3, [r3 + 48] + pshufb m0, [shuf_mode_15_21] + pshufb m1, [shuf_mode_15_21] + pshufb m2, [shuf_mode_15_21] + pshufb m3, [shuf_mode_15_21] + movh [rsp + 24], m0 + movh [rsp + 16], m1 + movh [rsp + 8], m2 + movh [rsp], m3 + + lea r3, [ang_table + 16 * 16] + mov r4d, 8 + mov r2, rsp + add r1, r1 + lea r5, [r1 * 3] + mova m2, [pw_punpcklwd] + +.loop: + MODE_15_21 1 + lea r0, [r0 + r1 * 4 ] + add r2, 8 + dec r4 + jnz .loop + RET + +%macro MODE_16_20 1 + movu m3, [r2 + 40] ; [7 6 5 4 3 2 1 0] + pshufb m3, m2 ; [4 3 3 2 2 1 1 0] + + pmaddwd m4, m3, [r3 - 5 * 16] ; [11] + paddd m4, [pd_16] + psrld m4, 5 + + movu m3, [r2 + 38] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 + 6 * 16] ; [22] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pmaddwd m5, m3, [r3 - 15 * 16] ; [01] + paddd m5, [pd_16] + psrld m5, 5 + + movu m3, [r2 + 36] + pshufb m3, m2 + + pmaddwd m6, m3, [r3 - 4 * 16] ; [12] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + movu m3, [r2 + 34] + pshufb m3, m2 + + pmaddwd m6, m3, [r3 + 7 * 16] ; [23] + paddd m6, [pd_16] + psrld m6, 5 + + pmaddwd m1, m3, [r3 - 14 * 16] ; [02] + paddd m1, 
[pd_16] + psrld m1, 5 + packusdw m6, m1 + + movu m3, [r2 + 32] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 - 3 * 16] ; [13] + paddd m1, [pd_16] + psrld m1, 5 + + movu m3, [r2 + 30] + pshufb m3, m2 + + pmaddwd m0, m3, [r3 + 8 * 16] ; [24] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m1, m0 + + TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 + + pmaddwd m4, m3, [r3 - 13 * 16] ; [03] + paddd m4, [pd_16] + psrld m4, 5 + + movu m3, [r2 + 28] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 - 2 * 16] ; [14] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + movu m3, [r2 + 26] + pshufb m3, m2 + + pmaddwd m5, m3, [r3 + 9 * 16] ; [25] + paddd m5, [pd_16] + psrld m5, 5 + + pmaddwd m6, m3, [r3 - 12 * 16] ; [04] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + movu m3, [r2 + 24] + pshufb m3, m2 + + pmaddwd m6, m3, [r3 - 16] ; [15] + paddd m6, [pd_16] + psrld m6, 5 + + movu m3, [r2 + 22] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 + 10 * 16] ; [26] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + pmaddwd m1, m3, [r3 - 11 * 16] ; [05] + paddd m1, [pd_16] + psrld m1, 5 + + movu m3, [r2 + 20] + pshufb m3, m2 + + pmaddwd m0, m3, [r3] ; [16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m1, m0 + + TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1 + + movu m3, [r2 + 18] + pshufb m3, m2 + + pmaddwd m4, m3, [r3 + 11 * 16] ; [27] + paddd m4, [pd_16] + psrld m4, 5 + + pmaddwd m1, m3, [r3 - 10 * 16] ; [06] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + movu m3, [r2 + 16] + pshufb m3, m2 + + pmaddwd m5, m3, [r3 + 16] ; [17] + paddd m5, [pd_16] + psrld m5, 5 + + movu m3, [r2 + 14] + pshufb m3, m2 + + pmaddwd m0, m3, [r3 + 12 * 16] ; [28] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m5, m0 + + pmaddwd m6, m3, [r3 - 9 * 16] ; [07] + paddd m6, [pd_16] + psrld m6, 5 + + movu m3, [r2 + 12] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 + 2 * 16] ; [18] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + movu m3, [r2 + 10] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 + 13 * 16] ; [29] + paddd m1, [pd_16] + 
psrld m1, 5 + + pmaddwd m0, m3, [r3 - 8 * 16] ; [08] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m1, m0 + + TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1 + + movu m3, [r2 + 8] + pshufb m3, m2 + + pmaddwd m4, m3, [r3 + 3 * 16] ; [19] + paddd m4, [pd_16] + psrld m4, 5 + + movu m3, [r2 + 6] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 + 14 * 16] ; [30] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pmaddwd m5, m3, [r3 - 7 * 16] ; [09] + paddd m5, [pd_16] + psrld m5, 5 + + movu m3, [r2 + 4] + pshufb m3, m2 + + pmaddwd m6, m3, [r3 + 4 * 16] ; [20] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + movu m3, [r2 + 2] + pshufb m3, m2 + + pmaddwd m6, m3, [r3 + 15 * 16] ; [31] + paddd m6, [pd_16] + psrld m6, 5 + + pmaddwd m1, m3, [r3 - 6 * 16] ; [10] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + movu m3, [r2] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 + 5 * 16] ; [21] + paddd m1, [pd_16] + psrld m1, 5 + + packusdw m1, m1 + movhps m1, [r2] ; [00] + + TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1 +%endmacro +;------------------------------------------------------------------------------------------------------------------ +; void intraPredAng32_16(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +;------------------------------------------------------------------------------------------------------------------ +INIT_XMM sse4 +cglobal intra_pred_ang32_16, 4,6,7,0-(6*mmsize+10) + movu m0, [r2 + 0*mmsize] + movu m1, [r2 + 1*mmsize] + movu m2, [r2 + 2*mmsize] + movu m3, [r2 + 3*mmsize] + movu [rsp + 2*mmsize + 8], m0 + movu [rsp + 3*mmsize + 8], m1 + movu [rsp + 4*mmsize + 8], m2 + movu [rsp + 5*mmsize + 8], m3 + + mov r4w, [r2 + 64] + mov [rsp + 104], r4w + movu m0, [r3 + 4] + movu m1, [r3 + 22] + movu m2, [r3 + 40] + movd m3, [r3 + 58] + pshufb m0, [shuf_mode_16_20] + pshufb m1, [shuf_mode_16_20] + pshufb m2, [shuf_mode_16_20] + pshufb m3, [shuf_mode_16_20] + movu [rsp + 24], m0 + movu [rsp + 12], m1 + movu [rsp], m2 + movd 
[rsp], m3 + + lea r3, [ang_table + 16 * 16] + mov r4d, 8 + mov r2, rsp + add r1, r1 + lea r5, [r1 * 3] + mova m2, [pw_punpcklwd] + +.loop: + MODE_16_20 1 + lea r0, [r0 + r1 * 4 ] + add r2, 8 + dec r4 + jnz .loop + RET + +%macro MODE_17_19 1 + movu m3, [r2 + 50] ; [7 6 5 4 3 2 1 0] + pshufb m3, m2 ; [4 3 3 2 2 1 1 0] + + pmaddwd m4, m3, [r3 - 10 * 16] ; [06] + paddd m4, [pd_16] + psrld m4, 5 + + movu m3, [r2 + 48] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 - 4 * 16] ; [12] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + movu m3, [r2 + 46] + pshufb m3, m2 + + pmaddwd m5, m3, [r3 + 2 * 16] ; [18] + paddd m5, [pd_16] + psrld m5, 5 + + movu m3, [r2 + 44] + pshufb m3, m2 + + pmaddwd m6, m3, [r3 + 8 * 16] ; [24] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + movu m3, [r2 + 42] + pshufb m3, m2 + + pmaddwd m6, m3, [r3 + 14 * 16] ; [30] + paddd m6, [pd_16] + psrld m6, 5 + + pmaddwd m1, m3, [r3 - 12 * 16] ; [04] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + movu m3, [r2 + 40] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 - 6 * 16] ; [10] + paddd m1, [pd_16] + psrld m1, 5 + + movu m3, [r2 + 38] + pshufb m3, m2 + + pmaddwd m0, m3, [r3] ; [16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m1, m0 + + TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 + + movu m3, [r2 + 36] + pshufb m3, m2 + + pmaddwd m4, m3, [r3 + 6 * 16] ; [22] + paddd m4, [pd_16] + psrld m4, 5 + + movu m3, [r2 + 34] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 + 12 * 16] ; [28] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pmaddwd m5, m3, [r3 - 14 * 16] ; [02] + paddd m5, [pd_16] + psrld m5, 5 + + movu m3, [r2 + 32] + pshufb m3, m2 + + pmaddwd m6, m3, [r3 - 8 * 16] ; [08] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + movu m3, [r2 + 30] + pshufb m3, m2 + + pmaddwd m6, m3, [r3 - 2 * 16] ; [14] + paddd m6, [pd_16] + psrld m6, 5 + + movu m3, [r2 + 28] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 + 4 * 16] ; [20] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + movu m3, [r2 + 26] + pshufb 
m3, m2 + + pmaddwd m1, m3, [r3 + 10 * 16] ; [26] + paddd m1, [pd_16] + psrld m1, 5 + + packusdw m1, m1 + movhps m1, [r2 + 26] ; [00] + + TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1 + + movu m3, [r2 + 24] + pshufb m3, m2 + + pmaddwd m4, m3, [r3 - 10 * 16] ; [06] + paddd m4, [pd_16] + psrld m4, 5 + + movu m3, [r2 + 22] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 - 4 * 16] ; [12] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + movu m3, [r2 + 20] + pshufb m3, m2 + + pmaddwd m5, m3, [r3 + 2 * 16] ; [18] + paddd m5, [pd_16] + psrld m5, 5 + + movu m3, [r2 + 18] + pshufb m3, m2 + + pmaddwd m0, m3, [r3 + 8 * 16] ; [24] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m5, m0 + + movu m3, [r2 + 16] + pshufb m3, m2 + + pmaddwd m6, m3, [r3 + 14 * 16] ; [30] + paddd m6, [pd_16] + psrld m6, 5 + + pmaddwd m1, m3, [r3 - 12 * 16] ; [04] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + movu m3, [r2 + 14] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 - 6 * 16] ; [10] + paddd m1, [pd_16] + psrld m1, 5 + + movu m3, [r2 + 12] + pshufb m3, m2 + + pmaddwd m0, m3, [r3] ; [16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m1, m0 + + TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1 + + movu m3, [r2 + 10] + pshufb m3, m2 + + pmaddwd m4, m3, [r3 + 6 * 16] ; [22] + paddd m4, [pd_16] + psrld m4, 5 + + movu m3, [r2 + 8] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 + 12 * 16] ; [28] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pmaddwd m5, m3, [r3 - 14 * 16] ; [02] + paddd m5, [pd_16] + psrld m5, 5 + + movu m3, [r2 + 6] + pshufb m3, m2 + + pmaddwd m6, m3, [r3 - 8 * 16] ; [08] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + movu m3, [r2 + 4] + pshufb m3, m2 + + pmaddwd m6, m3, [r3 - 2 * 16] ; [14] + paddd m6, [pd_16] + psrld m6, 5 + + movu m3, [r2 + 2] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 + 4 * 16] ; [20] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + movu m3, [r2] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 + 10 * 16] ; [26] + paddd m1, [pd_16] + psrld m1, 5 + + packusdw m1, m1 + 
movhps m1, [r2] ; [00] + + TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1 +%endmacro +;------------------------------------------------------------------------------------------------------------------ +; void intraPredAng32_17(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +;------------------------------------------------------------------------------------------------------------------ +INIT_XMM sse4 +cglobal intra_pred_ang32_17, 4,6,7,0-(7*mmsize+4) + movu m0, [r2 + 0*mmsize] + movu m1, [r2 + 1*mmsize] + movu m2, [r2 + 2*mmsize] + movu m3, [r2 + 3*mmsize] + movu [rsp + 3*mmsize + 2], m0 + movu [rsp + 4*mmsize + 2], m1 + movu [rsp + 5*mmsize + 2], m2 + movu [rsp + 6*mmsize + 2], m3 + + mov r4w, [r2 + 64] + mov [rsp + 114], r4w + movu m0, [r3 + 8] + movu m1, [r3 + 30] + movu m2, [r3 + 50] + movd m3, [r3 + 2] + pshufb m0, [shuf_mode_17_19] + pshufb m1, [shuf_mode_17_19] + pshufb m2, [shuf_mode_17_19] + pshufb m3, [shuf_mode_16_20] + movd [rsp + 46], m3 + movu [rsp + 30], m0 + movu [rsp + 12], m1 + movu [rsp - 4], m2 + mov r4w, [r3 + 24] + mov [rsp + 30], r4w + mov r4w, [r3 + 28] + mov [rsp + 28], r4w + mov r4w, [r3 + 46] + mov [rsp + 12], r4w + + lea r3, [ang_table + 16 * 16] + mov r4d, 8 + mov r2, rsp + add r1, r1 + lea r5, [r1 * 3] + mova m2, [pw_punpcklwd] + +.loop: + MODE_17_19 1 + lea r0, [r0 + r1 * 4 ] + add r2, 8 + dec r4 + jnz .loop + RET + +;------------------------------------------------------------------------------------------------------------------- +; void intraPredAng32_18(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +;------------------------------------------------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal intra_pred_ang32_18, 4,7,8 + movu m0, [r3] ; [7 6 5 4 3 2 1 0] + movu m1, [r3 + 16] ; [15 14 13 12 11 10 9 8] + movu m2, [r3 + 32] ; [23 22 21 20 19 18 17 16] + movu m3, [r3 + 48] ; [31 30 29 28 27 26 25 24] + 
movu m4, [r2 + 2] ; [8 7 6 5 4 3 2 1] + movu m5, [r2 + 18] ; [16 15 14 13 12 11 10 9] + + add r1, r1 + lea r6, [r1 * 2] + lea r3, [r1 * 3] + lea r4, [r1 * 4] + + movu [r0], m0 + movu [r0 + 16], m1 + movu [r0 + 32], m2 + movu [r0 + 48], m3 + + pshufb m4, [shuf_mode32_18] ; [1 2 3 4 5 6 7 8] + pshufb m5, [shuf_mode32_18] ; [9 10 11 12 13 14 15 16] + + palignr m6, m0, m4, 14 + movu [r0 + r1], m6 + palignr m6, m1, m0, 14 + movu [r0 + r1 + 16], m6 + palignr m6, m2, m1, 14 + movu [r0 + r1 + 32], m6 + palignr m6, m3, m2, 14 + movu [r0 + r1 + 48], m6 + + palignr m6, m0, m4, 12 + movu [r0 + r6], m6 + palignr m6, m1, m0, 12 + movu [r0 + r6 + 16], m6 + palignr m6, m2, m1, 12 + movu [r0 + r6 + 32], m6 + palignr m6, m3, m2, 12 + movu [r0 + r6 + 48], m6 + + palignr m6, m0, m4, 10 + movu [r0 + r3], m6 + palignr m6, m1, m0, 10 + movu [r0 + r3 + 16], m6 + palignr m6, m2, m1, 10 + movu [r0 + r3 + 32], m6 + palignr m6, m3, m2, 10 + movu [r0 + r3 + 48], m6 + + add r0, r4 + + palignr m6, m0, m4, 8 + movu [r0], m6 + palignr m6, m1, m0, 8 + movu [r0 + 16], m6 + palignr m6, m2, m1, 8 + movu [r0 + 32], m6 + palignr m6, m3, m2, 8 + movu [r0 + 48], m6 + + palignr m6, m0, m4, 6 + movu [r0 + r1], m6 + palignr m6, m1, m0, 6 + movu [r0 + r1 + 16], m6 + palignr m6, m2, m1, 6 + movu [r0 + r1 + 32], m6 + palignr m6, m3, m2, 6 + movu [r0 + r1 + 48], m6 + + palignr m6, m0, m4, 4 + movu [r0 + r6], m6 + palignr m6, m1, m0, 4 + movu [r0 + r6 + 16], m6 + palignr m6, m2, m1, 4 + movu [r0 + r6 + 32], m6 + palignr m6, m3, m2, 4 + movu [r0 + r6 + 48], m6 + + palignr m6, m0, m4, 2 + movu [r0 + r3], m6 + palignr m6, m1, m0, 2 + movu [r0 + r3 + 16], m6 + palignr m6, m2, m1, 2 + movu [r0 + r3 + 32], m6 + palignr m6, m3, m2, 2 + movu [r0 + r3 + 48], m6 + + add r0, r4 + + movu [r0], m4 + movu [r0 + 16], m0 + movu [r0 + 32], m1 + movu [r0 + 48], m2 + + palignr m6, m4, m5, 14 + movu [r0 + r1], m6 + palignr m6, m0, m4, 14 + movu [r0 + r1 + 16], m6 + palignr m6, m1, m0, 14 + movu [r0 + r1 + 32], m6 + palignr m6, m2, 
m1, 14 + movu [r0 + r1 + 48], m6 + + palignr m6, m4, m5, 12 + movu [r0 + r6], m6 + palignr m6, m0, m4, 12 + movu [r0 + r6 + 16], m6 + palignr m6, m1, m0, 12 + movu [r0 + r6 + 32], m6 + palignr m6, m2, m1, 12 + movu [r0 + r6 + 48], m6 + + palignr m6, m4, m5, 10 + movu [r0 + r3], m6 + palignr m6, m0, m4, 10 + movu [r0 + r3 + 16], m6 + palignr m6, m1, m0, 10 + movu [r0 + r3 + 32], m6 + palignr m6, m2, m1, 10 + movu [r0 + r3 + 48], m6 + + add r0, r4 + + palignr m6, m4, m5, 8 + movu [r0], m6 + palignr m6, m0, m4, 8 + movu [r0 + 16], m6 + palignr m6, m1, m0, 8 + movu [r0 + 32], m6 + palignr m6, m2, m1, 8 + movu [r0 + 48], m6 + + palignr m6, m4, m5, 6 + movu [r0 + r1], m6 + palignr m6, m0, m4, 6 + movu [r0 + r1 + 16], m6 + palignr m6, m1, m0, 6 + movu [r0 + r1 + 32], m6 + palignr m6, m2, m1, 6 + movu [r0 + r1 + 48], m6 + + palignr m6, m4, m5, 4 + movu [r0 + r6], m6 + palignr m6, m0, m4, 4 + movu [r0 + r6 + 16], m6 + palignr m6, m1, m0, 4 + movu [r0 + r6 + 32], m6 + palignr m6, m2, m1, 4 + movu [r0 + r6 + 48], m6 + + palignr m6, m4, m5, 2 + movu [r0 + r3], m6 + palignr m6, m0, m4, 2 + movu [r0 + r3 + 16], m6 + palignr m6, m1, m0, 2 + movu [r0 + r3 + 32], m6 + palignr m6, m2, m1, 2 + movu [r0 + r3 + 48], m6 + + add r0, r4 + + movu m2, [r2 + 34] + movu m3, [r2 + 50] + pshufb m2, [shuf_mode32_18] + pshufb m3, [shuf_mode32_18] + + movu [r0], m5 + movu [r0 + 16], m4 + movu [r0 + 32], m0 + movu [r0 + 48], m1 + + palignr m6, m5, m2, 14 + movu [r0 + r1], m6 + palignr m6, m4, m5, 14 + movu [r0 + r1 + 16], m6 + palignr m6, m0, m4, 14 + movu [r0 + r1 + 32], m6 + palignr m6, m1, m0, 14 + movu [r0 + r1 + 48], m6 + + palignr m6, m5, m2, 12 + movu [r0 + r6], m6 + palignr m6, m4, m5, 12 + movu [r0 + r6 + 16], m6 + palignr m6, m0, m4, 12 + movu [r0 + r6 + 32], m6 + palignr m6, m1, m0, 12 + movu [r0 + r6 + 48], m6 + + palignr m6, m5, m2, 10 + movu [r0 + r3], m6 + palignr m6, m4, m5, 10 + movu [r0 + r3 + 16], m6 + palignr m6, m0, m4, 10 + movu [r0 + r3 + 32], m6 + palignr m6, m1, m0, 10 + 
movu [r0 + r3 + 48], m6 + + add r0, r4 + + palignr m6, m5, m2, 8 + movu [r0], m6 + palignr m6, m4, m5, 8 + movu [r0 + 16], m6 + palignr m6, m0, m4, 8 + movu [r0 + 32], m6 + palignr m6, m1, m0, 8 + movu [r0 + 48], m6 + + palignr m6, m5, m2, 6 + movu [r0 + r1], m6 + palignr m6, m4, m5, 6 + movu [r0 + r1 + 16], m6 + palignr m6, m0, m4, 6 + movu [r0 + r1 + 32], m6 + palignr m6, m1, m0, 6 + movu [r0 + r1 + 48], m6 + + palignr m6, m5, m2, 4 + movu [r0 + r6], m6 + palignr m6, m4, m5, 4 + movu [r0 + r6 + 16], m6 + palignr m6, m0, m4, 4 + movu [r0 + r6 + 32], m6 + palignr m6, m1, m0, 4 + movu [r0 + r6 + 48], m6 + + palignr m6, m5, m2, 2 + movu [r0 + r3], m6 + palignr m6, m4, m5, 2 + movu [r0 + r3 + 16], m6 + palignr m6, m0, m4, 2 + movu [r0 + r3 + 32], m6 + palignr m6, m1, m0, 2 + movu [r0 + r3 + 48], m6 + + add r0, r4 + + movu [r0], m2 + movu [r0 + 16], m5 + movu [r0 + 32], m4 + movu [r0 + 48], m0 + + palignr m6, m2, m3, 14 + movu [r0 + r1], m6 + palignr m6, m5, m2, 14 + movu [r0 + r1 + 16], m6 + palignr m6, m4, m5, 14 + movu [r0 + r1 + 32], m6 + palignr m6, m0, m4, 14 + movu [r0 + r1 + 48], m6 + + palignr m6, m2, m3, 12 + movu [r0 + r6], m6 + palignr m6, m5, m2, 12 + movu [r0 + r6 + 16], m6 + palignr m6, m4, m5, 12 + movu [r0 + r6 + 32], m6 + palignr m6, m0, m4, 12 + movu [r0 + r6 + 48], m6 + + palignr m6, m2, m3, 10 + movu [r0 + r3], m6 + palignr m6, m5, m2, 10 + movu [r0 + r3 + 16], m6 + palignr m6, m4, m5, 10 + movu [r0 + r3 + 32], m6 + palignr m6, m0, m4, 10 + movu [r0 + r3 + 48], m6 + + add r0, r4 + + palignr m6, m2, m3, 8 + movu [r0], m6 + palignr m6, m5, m2, 8 + movu [r0 + 16], m6 + palignr m6, m4, m5, 8 + movu [r0 + 32], m6 + palignr m6, m0, m4, 8 + movu [r0 + 48], m6 + + palignr m6, m2, m3, 6 + movu [r0 + r1], m6 + palignr m6, m5, m2, 6 + movu [r0 + r1 + 16], m6 + palignr m6, m4, m5, 6 + movu [r0 + r1 + 32], m6 + palignr m6, m0, m4, 6 + movu [r0 + r1 + 48], m6 + + palignr m6, m2, m3, 4 + movu [r0 + r6], m6 + palignr m6, m5, m2, 4 + movu [r0 + r6 + 16], m6 + 
palignr m6, m4, m5, 4 + movu [r0 + r6 + 32], m6 + palignr m6, m0, m4, 4 + movu [r0 + r6 + 48], m6 + + palignr m6, m2, m3, 2 + movu [r0 + r3], m6 + palignr m6, m5, m2, 2 + movu [r0 + r3 + 16], m6 + palignr m6, m4, m5, 2 + movu [r0 + r3 + 32], m6 + palignr m6, m0, m4, 2 + movu [r0 + r3 + 48], m6 + RET + +;------------------------------------------------------------------------------------------------------------------ +; void intraPredAng32_19(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +;------------------------------------------------------------------------------------------------------------------ +INIT_XMM sse4 +cglobal intra_pred_ang32_19, 4,7,7,0-(7*mmsize+4) + xchg r2, r3 + movu m0, [r2 + 0*mmsize] + movu m1, [r2 + 1*mmsize] + movu m2, [r2 + 2*mmsize] + movu m3, [r2 + 3*mmsize] + movu [rsp + 3*mmsize + 2], m0 + movu [rsp + 4*mmsize + 2], m1 + movu [rsp + 5*mmsize + 2], m2 + movu [rsp + 6*mmsize + 2], m3 + + mov r4w, [r2 + 64] + mov [rsp + 114], r4w + movu m0, [r3 + 8] + movu m1, [r3 + 30] + movu m2, [r3 + 50] + movd m3, [r3 + 2] + pshufb m0, [shuf_mode_17_19] + pshufb m1, [shuf_mode_17_19] + pshufb m2, [shuf_mode_17_19] + pshufb m3, [shuf_mode_16_20] + movd [rsp + 46], m3 + movu [rsp + 30], m0 + movu [rsp + 12], m1 + movu [rsp - 4], m2 + mov r4w, [r3 + 24] + mov [rsp + 30], r4w + mov r4w, [r3 + 28] + mov [rsp + 28], r4w + mov r4w, [r3 + 46] + mov [rsp + 12], r4w + + lea r3, [ang_table + 16 * 16] + mov r4d, 8 + mov r2, rsp + add r1, r1 + lea r5, [r1 * 3] + mova m2, [pw_punpcklwd] + mov r6, r0 + +.loop: + MODE_17_19 0 + add r6, 8 + mov r0, r6 + add r2, 8 + dec r4 + jnz .loop + RET + +;------------------------------------------------------------------------------------------------------------------ +; void intraPredAng32_20(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) 
+;------------------------------------------------------------------------------------------------------------------ +INIT_XMM sse4 +cglobal intra_pred_ang32_20, 4,7,7,0-(6*mmsize+10) + xchg r2, r3 + movu m0, [r2 + 0*mmsize] + movu m1, [r2 + 1*mmsize] + movu m2, [r2 + 2*mmsize] + movu m3, [r2 + 3*mmsize] + movu [rsp + 2*mmsize + 8], m0 + movu [rsp + 3*mmsize + 8], m1 + movu [rsp + 4*mmsize + 8], m2 + movu [rsp + 5*mmsize + 8], m3 + + mov r4w, [r2 + 64] + mov [rsp + 104], r4w + movu m0, [r3 + 4] + movu m1, [r3 + 22] + movu m2, [r3 + 40] + movd m3, [r3 + 58] + pshufb m0, [shuf_mode_16_20] + pshufb m1, [shuf_mode_16_20] + pshufb m2, [shuf_mode_16_20] + pshufb m3, [shuf_mode_16_20] + movu [rsp + 24], m0 + movu [rsp + 12], m1 + movu [rsp], m2 + movd [rsp], m3 + + lea r3, [ang_table + 16 * 16] + mov r4d, 8 + mov r2, rsp + add r1, r1 + lea r5, [r1 * 3] + mova m2, [pw_punpcklwd] + mov r6, r0 + +.loop: + MODE_16_20 0 + add r6, 8 + mov r0, r6 + add r2, 8 + dec r4 + jnz .loop + RET + +;------------------------------------------------------------------------------------------------------------------ +; void intraPredAng32_21(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +;------------------------------------------------------------------------------------------------------------------ +INIT_XMM sse4 +cglobal intra_pred_ang32_21, 4,7,7,0-(6*mmsize+2) + xchg r2, r3 + movu m0, [r2 + 0*mmsize] + movu m1, [r2 + 1*mmsize] + movu m2, [r2 + 2*mmsize] + movu m3, [r2 + 3*mmsize] + movu [rsp + 2*mmsize], m0 + movu [rsp + 3*mmsize], m1 + movu [rsp + 4*mmsize], m2 + movu [rsp + 5*mmsize], m3 + + mov r4w, [r2 + 64] + mov [rsp + 96], r4w + movu m0, [r3 + 4] + movu m1, [r3 + 18] + movu m2, [r3 + 34] + movu m3, [r3 + 48] + pshufb m0, [shuf_mode_15_21] + pshufb m1, [shuf_mode_15_21] + pshufb m2, [shuf_mode_15_21] + pshufb m3, [shuf_mode_15_21] + movh [rsp + 24], m0 + movh [rsp + 16], m1 + movh [rsp + 8], m2 + movh [rsp], m3 + + lea r3, [ang_table + 
16 * 16] + mov r4d, 8 + mov r2, rsp + add r1, r1 + lea r5, [r1 * 3] + mova m2, [pw_punpcklwd] + mov r6, r0 + +.loop: + MODE_15_21 0 + add r6, 8 + mov r0, r6 + add r2, 8 + dec r4 + jnz .loop + RET + +;------------------------------------------------------------------------------------------------------------------ +; void intraPredAng32_22(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +;------------------------------------------------------------------------------------------------------------------ +INIT_XMM sse4 +cglobal intra_pred_ang32_22, 4,7,7,0-(5*mmsize+10) + xchg r2, r3 + movu m0, [r2 + 0*mmsize] + movu m1, [r2 + 1*mmsize] + movu m2, [r2 + 2*mmsize] + movu m3, [r2 + 3*mmsize] + movu [rsp + 1*mmsize + 8], m0 + movu [rsp + 2*mmsize + 8], m1 + movu [rsp + 3*mmsize + 8], m2 + movu [rsp + 4*mmsize + 8], m3 + + mov r4w, [r2 + 64] + mov [rsp + 88], r4w + mov r4w, [r3+4] + mov [rsp+22], r4w + movu m0, [r3 + 10] + movu m1, [r3 + 30] + movu m2, [r3 + 50] + pshufb m0, [shuf_mode_14_22] + pshufb m1, [shuf_mode_14_22] + pshufb m2, [shuf_mode_14_22] + movh [rsp + 14], m0 + movh [rsp + 6], m1 + movh [rsp - 2], m2 + + lea r3, [ang_table + 16 * 16] + mov r4d, 8 + mov r2, rsp + add r1, r1 + lea r5, [r1 * 3] + mova m2, [pw_punpcklwd] + mov r6, r0 + +.loop: + MODE_14_22 0 + add r6, 8 + mov r0, r6 + add r2, 8 + dec r4 + jnz .loop + RET + +;------------------------------------------------------------------------------------------------------------------ +; void intraPredAng32_23(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +;------------------------------------------------------------------------------------------------------------------ +INIT_XMM sse4 +cglobal intra_pred_ang32_23, 4,7,7,0-(5*mmsize+2) + xchg r2, r3 + movu m0, [r2 + 0*mmsize] + movu m1, [r2 + 1*mmsize] + movu m2, [r2 + 2*mmsize] + movu m3, [r2 + 3*mmsize] + movu [rsp + 1*mmsize], m0 + movu [rsp + 2*mmsize], m1 + movu [rsp 
+ 3*mmsize], m2 + movu [rsp + 4*mmsize], m3 + + mov r4w, [r2+64] + mov [rsp+80], r4w + movu m0, [r3 + 8] + movu m1, [r3 + 36] + pshufb m0, [shuf_mode_13_23] + pshufb m1, [shuf_mode_13_23] + movh [rsp + 8], m0 + movh [rsp], m1 + mov r4w, [r3+28] + mov [rsp+8], r4w + mov r4w, [r3+56] + mov [rsp], r4w + + lea r3, [ang_table + 16 * 16] + mov r4d, 8 + mov r2, rsp + add r1, r1 + lea r5, [r1 * 3] + mova m2, [pw_punpcklwd] + mov r6, r0 + +.loop: + MODE_13_23 0 + add r6, 8 + mov r0, r6 + add r2, 8 + dec r4 + jnz .loop + RET + +;------------------------------------------------------------------------------------------------------------------ +; void intraPredAng32_24(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +;------------------------------------------------------------------------------------------------------------------ +INIT_XMM sse4 +cglobal intra_pred_ang32_24, 4,7,7,0-(4*mmsize+10) + xchg r2, r3 + movu m0, [r2 + 0*mmsize] + movu m1, [r2 + 1*mmsize] + movu m2, [r2 + 2*mmsize] + movu m3, [r2 + 3*mmsize] + + movu [rsp + 0*mmsize + 8], m0 + movu [rsp + 1*mmsize + 8], m1 + movu [rsp + 2*mmsize + 8], m2 + movu [rsp + 3*mmsize + 8], m3 + + mov r4w, [r2+64] + mov [rsp+72], r4w + mov r4w, [r3+12] + mov [rsp+6], r4w + mov r4w, [r3+26] + mov [rsp+4], r4w + mov r4w, [r3+38] + mov [rsp+2], r4w + mov r4w, [r3+52] + mov [rsp], r4w + + lea r3, [ang_table + 16 * 16] + mov r4d, 8 + mov r2, rsp + add r1, r1 + lea r5, [r1 * 3] + mov r6, r0 + mova m2, [pw_punpcklwd] + +.loop: + MODE_12_24 0 + add r6, 8 + mov r0, r6 + add r2, 8 + dec r4 + jnz .loop + RET + +;------------------------------------------------------------------------------------------------------------------ +; void intraPredAng32_25(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +;------------------------------------------------------------------------------------------------------------------ +INIT_XMM sse4 +cglobal 
intra_pred_ang32_25, 4,7,7,0-(4*mmsize+4) + xchg r2, r3 + movu m0, [r2 + 0*mmsize] + movu m1, [r2 + 1*mmsize] + movu m2, [r2 + 2*mmsize] + movu m3, [r2 + 3*mmsize] + movu [rsp + 0*mmsize + 2], m0 + movu [rsp + 1*mmsize + 2], m1 + movu [rsp + 2*mmsize + 2], m2 + movu [rsp + 3*mmsize + 2], m3 + mov r4w, [r3+32] + mov [rsp], r4w + mov r4w, [r2+64] + mov [rsp+66], r4w + + lea r3, [ang_table + 16 * 16] + mov r4d, 8 + mov r2, rsp + add r1, r1 + lea r5, [r1 * 3] + mov r6, r0 + +.loop: + MODE_11_25 0 + add r6, 8 + mov r0, r6 + add r2, 8 + dec r4 + jnz .loop + RET + +;------------------------------------------------------------------------------------------------------------------ +; void intraPredAng32_26(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +;------------------------------------------------------------------------------------------------------------------ +INIT_XMM sse4 +cglobal intra_pred_ang32_26, 4,7,5 + mov r6d, 4 + add r1, r1 + lea r2, [r1 * 2] + lea r4, [r1 * 3] + lea r5, [r1 * 4] + mova m4, [c_mode32_10_0] + + movu m0, [r3 + 2] + movu m1, [r3 + 18] + movu m2, [r3 + 34] + movu m3, [r3 + 50] + +.loop: + movu [r0], m0 + movu [r0 + 16], m1 + movu [r0 + 32], m2 + movu [r0 + 48], m3 + + movu [r0 + r1], m0 + movu [r0 + r1 + 16], m1 + movu [r0 + r1 + 32], m2 + movu [r0 + r1 + 48], m3 + + movu [r0 + r2], m0 + movu [r0 + r2 + 16], m1 + movu [r0 + r2 + 32], m2 + movu [r0 + r2 + 48], m3 + + movu [r0 + r4], m0 + movu [r0 + r4 + 16], m1 + movu [r0 + r4 + 32], m2 + movu [r0 + r4 + 48], m3 + + add r0, r5 + + movu [r0], m0 + movu [r0 + 16], m1 + movu [r0 + 32], m2 + movu [r0 + 48], m3 + + movu [r0 + r1], m0 + movu [r0 + r1 + 16], m1 + movu [r0 + r1 + 32], m2 + movu [r0 + r1 + 48], m3 + + movu [r0 + r2], m0 + movu [r0 + r2 + 16], m1 + movu [r0 + r2 + 32], m2 + movu [r0 + r2 + 48], m3 + + movu [r0 + r4], m0 + movu [r0 + r4 + 16], m1 + movu [r0 + r4 + 32], m2 + movu [r0 + r4 + 48], m3 + + add r0, r5 + dec r6d + jnz .loop + RET + 
+;------------------------------------------------------------------------------------------------------------------ +; void intraPredAng32_27(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +;------------------------------------------------------------------------------------------------------------------ +INIT_XMM sse4 +cglobal intra_pred_ang32_27, 4,7,8 + xchg r2, r3mp + lea r3, [ang_table + 16 * 16] + add r1, r1 + lea r5, [r1 * 3] + mov r6, r0 + mov r4d, 8 + +.loop: + MODE_9_27 0 + add r6, 8 + mov r0, r6 + add r2, 8 + dec r4 + jnz .loop + RET + +;------------------------------------------------------------------------------------------------------------------ +; void intraPredAng32_28(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +;------------------------------------------------------------------------------------------------------------------ +INIT_XMM sse4 +cglobal intra_pred_ang32_28, 4,7,8 + xchg r2, r3mp + lea r3, [ang_table + 16 * 16] + add r1, r1 + lea r5, [r1 * 3] + mov r6, r0 + mov r4d, 8 + +.loop: + MODE_8_28 0 + add r6, 8 + mov r0, r6 + add r2, 8 + dec r4 + jnz .loop + RET + +;------------------------------------------------------------------------------------------------------------------ +; void intraPredAng32_29(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +;------------------------------------------------------------------------------------------------------------------ +INIT_XMM sse4 +cglobal intra_pred_ang32_29, 4,7,8 + xchg r2, r3mp + lea r3, [ang_table + 16 * 16] + add r1, r1 + lea r5, [r1 * 3] + mov r6, r0 + mov r4d, 8 + +.loop: + MODE_7_29 0 + add r6, 8 + mov r0, r6 + add r2, 8 + dec r4 + jnz .loop + RET + +;------------------------------------------------------------------------------------------------------------------ +; void intraPredAng32_30(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel 
*refAbove, int dirMode, int bFilter) +;------------------------------------------------------------------------------------------------------------------ +INIT_XMM sse4 +cglobal intra_pred_ang32_30, 4,7,8 + xchg r2, r3mp + lea r3, [ang_table + 16 * 16] + add r1, r1 + lea r5, [r1 * 3] + mov r6, r0 + mov r4d, 8 + +.loop: + MODE_6_30 0 + add r6, 8 + mov r0, r6 + add r2, 8 + dec r4 + jnz .loop + RET + +;------------------------------------------------------------------------------------------------------------------ +; void intraPredAng32_31(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +;------------------------------------------------------------------------------------------------------------------ +INIT_XMM sse4 +cglobal intra_pred_ang32_31, 4,7,8 + xchg r2, r3mp + lea r3, [ang_table + 16 * 16] + add r1, r1 + lea r5, [r1 * 3] + mov r6, r0 + mov r4d, 8 + +.loop: + MODE_5_31 0 + add r6, 8 + mov r0, r6 + add r2, 8 + dec r4 + jnz .loop + RET + +;------------------------------------------------------------------------------------------------------------------ +; void intraPredAng32_32(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +;------------------------------------------------------------------------------------------------------------------ +INIT_XMM sse4 +cglobal intra_pred_ang32_32, 4,7,8 + xchg r2, r3mp + lea r3, [ang_table + 16 * 16] + add r1, r1 + lea r5, [r1 * 3] + mov r6, r0 + mov r4d, 8 + +.loop: + MODE_4_32 0 + add r6, 8 + mov r0, r6 + add r2, 8 + dec r4 + jnz .loop + RET + +;------------------------------------------------------------------------------------------------------------------ +; void intraPredAng32_33(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +;------------------------------------------------------------------------------------------------------------------ +INIT_XMM sse4 +cglobal intra_pred_ang32_33, 4,7,8 
+ xchg r2, r3mp + lea r3, [ang_table + 16 * 16] + add r1, r1 + lea r5, [r1 * 3] + mov r6, r0 + mov r4d, 8 +.loop: + MODE_3_33 0 + add r6, 8 + mov r0, r6 + add r2, 8 + dec r4 + jnz .loop + RET diff --git a/source/common/x86/intrapred8.asm b/source/common/x86/intrapred8.asm new file mode 100644 index 0000000..0ababc6 --- /dev/null +++ b/source/common/x86/intrapred8.asm @@ -0,0 +1,31997 @@ +;***************************************************************************** +;* Copyright (C) 2013 x265 project +;* +;* Authors: Min Chen +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +;* +;* This program is also available under a commercial proprietary license. +;* For more information, contact us at license @ x265.com. 
+;*****************************************************************************/ + +%include "x86inc.asm" +%include "x86util.asm" + +SECTION_RODATA 32 + +pb_0_8 times 8 db 0, 8 +pb_unpackbw1 times 2 db 1, 8, 2, 8, 3, 8, 4, 8 +pb_swap8: times 2 db 7, 6, 5, 4, 3, 2, 1, 0 +c_trans_4x4 db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 +tab_Si: db 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 +pb_fact0: db 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0 +c_mode32_12_0: db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 7, 0 +c_mode32_13_0: db 3, 6, 10, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +c_mode32_13_shuf: db 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0 +c_mode32_14_shuf: db 15, 14, 13, 0, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13, 14, 15 +c_mode32_14_0: db 15, 12, 10, 7, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +c_mode32_15_0: db 15, 13, 11, 9, 8, 6, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0 +c_mode32_16_0: db 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, 0, 0, 0, 0, 0 +c_mode32_17_0: db 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, 0, 0, 0 +c_mode32_18_0: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 +c_shuf8_0: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 +c_deinterval8: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 +tab_S1: db 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, 0, 0, 0 +pb_unpackbq: db 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 +c_mode16_12: db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 6 +c_mode16_13: db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4 +c_mode16_14: db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2 +c_mode16_15: db 0, 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 8, 6, 4, 2 +c_mode16_16: db 8, 6, 5, 3, 2, 0, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2 +c_mode16_17: db 4, 2, 1, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1 +c_mode16_18: db 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 +tab_S2: db 0, 1, 3, 5, 7, 9, 11, 13, 0, 0, 0, 0, 0, 0, 0, 0 + +const ang_table +%assign x 0 +%rep 32 + times 8 db (32-x), x +%assign x x+1 +%endrep + +SECTION .text + 
+cextern pw_8 +cextern pw_1024 +cextern pb_unpackbd1 +cextern multiL +cextern multiH +cextern multiH2 +cextern multiH3 +cextern multi_2Row + +;----------------------------------------------------------------------------- +; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal intra_pred_dc4, 4,6,3 + mov r4d, r5m + inc r2 + inc r3 + pxor m0, m0 + movd m1, [r2] + movd m2, [r3] + punpckldq m1, m2 + psadbw m1, m0 ; m1 = sum + + test r4d, r4d + + mov r4d, 4096 + movd m2, r4d + pmulhrsw m1, m2 ; m1 = (sum + 4) / 8 + movd r4d, m1 ; r4d = dc_val + pshufb m1, m0 ; m1 = byte [dc_val ...] + + ; store DC 4x4 + lea r5, [r1 * 3] + movd [r0], m1 + movd [r0 + r1], m1 + movd [r0 + r1 * 2], m1 + movd [r0 + r5], m1 + + ; do DC filter + jz .end + lea r5d, [r4d * 2 + 2] ; r5d = DC * 2 + 2 + add r4d, r5d ; r4d = DC * 3 + 2 + movd m1, r4d + pshuflw m1, m1, 0 ; m1 = pixDCx3 + + ; filter top + pmovzxbw m2, [r3] + paddw m2, m1 + psraw m2, 2 + packuswb m2, m2 + movd [r0], m2 ; overwrite top-left pixel, we will update it later + + ; filter top-left + movzx r3d, byte [r3] + add r5d, r3d + movzx r3d, byte [r2] + add r3d, r5d + shr r3d, 2 + mov [r0], r3b + + ; filter left + add r0, r1 + pmovzxbw m2, [r2 + 1] + paddw m2, m1 + psraw m2, 2 + packuswb m2, m2 + pextrb [r0], m2, 0 + pextrb [r0 + r1], m2, 1 + pextrb [r0 + r1 * 2], m2, 2 + +.end: + RET + + +;------------------------------------------------------------------------------------------- +; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter) +;------------------------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal intra_pred_dc8, 4, 7, 3 + mov r4d, r5m + inc r2 + inc r3 + pxor m0, m0 + movh m1, [r2] + movh m2, [r3] + punpcklqdq m1, m2 + psadbw m1, m0 + pshufd m2, m1, 2 + paddw m1, m2 + + movd r5d, 
m1 + add r5d, 8 + shr r5d, 4 ; sum = sum / 16 + movd m1, r5d + pshufb m1, m0 ; m1 = byte [dc_val ...] + + test r4d, r4d + + ; store DC 8x8 + mov r6, r0 + movh [r0], m1 + movh [r0 + r1], m1 + lea r0, [r0 + r1 * 2] + movh [r0], m1 + movh [r0 + r1], m1 + lea r0, [r0 + r1 * 2] + movh [r0], m1 + movh [r0 + r1], m1 + lea r0, [r0 + r1 * 2] + movh [r0], m1 + movh [r0 + r1], m1 + + ; Do DC Filter + jz .end + lea r4d, [r5d * 2 + 2] ; r4d = DC * 2 + 2 + add r5d, r4d ; r5d = DC * 3 + 2 + movd m1, r5d + pshuflw m1, m1, 0 ; m1 = pixDCx3 + pshufd m1, m1, 0 + + ; filter top + pmovzxbw m2, [r3] + paddw m2, m1 + psraw m2, 2 + packuswb m2, m2 + movh [r6], m2 + + ; filter top-left + movzx r3d, byte [r3] + add r4d, r3d + movzx r3d, byte [r2] + add r3d, r4d + shr r3d, 2 + mov [r6], r3b + + ; filter left + add r6, r1 + pmovzxbw m2, [r2 + 1] + paddw m2, m1 + psraw m2, 2 + packuswb m2, m2 + pextrb [r6], m2, 0 + pextrb [r6 + r1], m2, 1 + pextrb [r6 + 2 * r1], m2, 2 + lea r6, [r6 + r1 * 2] + pextrb [r6 + r1], m2, 3 + pextrb [r6 + r1 * 2], m2, 4 + pextrb [r6 + r1 * 4], m2, 6 + lea r1, [r1 * 3] + pextrb [r6 + r1], m2, 5 + +.end: + RET + +;------------------------------------------------------------------------------------------- +; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter) +;------------------------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal intra_pred_dc16, 5, 7, 4 + mov r4d, r5m + inc r2 + inc r3 + pxor m0, m0 + movu m1, [r2] + movu m2, [r3] + psadbw m1, m0 + psadbw m2, m0 + paddw m1, m2 + pshufd m2, m1, 2 + paddw m1, m2 + + movd r5d, m1 + add r5d, 16 + shr r5d, 5 ; sum = sum / 32 + movd m1, r5d + pshufb m1, m0 ; m1 = byte [dc_val ...] 
+ + test r4d, r4d + + ; store DC 16x16 + mov r6, r0 + movu [r0], m1 + movu [r0 + r1], m1 + lea r0, [r0 + r1 * 2] + movu [r0], m1 + movu [r0 + r1], m1 + lea r0, [r0 + r1 * 2] + movu [r0], m1 + movu [r0 + r1], m1 + lea r0, [r0 + r1 * 2] + movu [r0], m1 + movu [r0 + r1], m1 + lea r0, [r0 + r1 * 2] + movu [r0], m1 + movu [r0 + r1], m1 + lea r0, [r0 + r1 * 2] + movu [r0], m1 + movu [r0 + r1], m1 + lea r0, [r0 + r1 * 2] + movu [r0], m1 + movu [r0 + r1], m1 + lea r0, [r0 + r1 * 2] + movu [r0], m1 + movu [r0 + r1], m1 + + ; Do DC Filter + jz .end + lea r4d, [r5d * 2 + 2] ; r4d = DC * 2 + 2 + add r5d, r4d ; r5d = DC * 3 + 2 + movd m1, r5d + pshuflw m1, m1, 0 ; m1 = pixDCx3 + pshufd m1, m1, 0 + + ; filter top + pmovzxbw m2, [r3] + paddw m2, m1 + psraw m2, 2 + packuswb m2, m2 + movh [r6], m2 + pmovzxbw m3, [r3 + 8] + paddw m3, m1 + psraw m3, 2 + packuswb m3, m3 + movh [r6 + 8], m3 + + ; filter top-left + movzx r3d, byte [r3] + add r4d, r3d + movzx r3d, byte [r2] + add r3d, r4d + shr r3d, 2 + mov [r6], r3b + + ; filter left + add r6, r1 + pmovzxbw m2, [r2 + 1] + paddw m2, m1 + psraw m2, 2 + packuswb m2, m2 + pextrb [r6], m2, 0 + pextrb [r6 + r1], m2, 1 + pextrb [r6 + r1 * 2], m2, 2 + lea r6, [r6 + r1 * 2] + pextrb [r6 + r1], m2, 3 + pextrb [r6 + r1 * 2], m2, 4 + lea r6, [r6 + r1 * 2] + pextrb [r6 + r1], m2, 5 + pextrb [r6 + r1 * 2], m2, 6 + lea r6, [r6 + r1 * 2] + pextrb [r6 + r1], m2, 7 + + pmovzxbw m3, [r2 + 9] + paddw m3, m1 + psraw m3, 2 + packuswb m3, m3 + pextrb [r6 + r1 * 2], m3, 0 + lea r6, [r6 + r1 * 2] + pextrb [r6 + r1], m3, 1 + pextrb [r6 + r1 * 2], m3, 2 + lea r6, [r6 + r1 * 2] + pextrb [r6 + r1], m3, 3 + pextrb [r6 + r1 * 2], m3, 4 + lea r6, [r6 + r1 * 2] + pextrb [r6 + r1], m3, 5 + pextrb [r6 + r1 * 2], m3, 6 + +.end: + RET + +;------------------------------------------------------------------------------------------- +; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter) 
+;------------------------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal intra_pred_dc32, 4, 5, 5 + inc r2 + inc r3 + pxor m0, m0 + movu m1, [r2] + movu m2, [r2 + 16] + movu m3, [r3] + movu m4, [r3 + 16] + psadbw m1, m0 + psadbw m2, m0 + psadbw m3, m0 + psadbw m4, m0 + paddw m1, m2 + paddw m3, m4 + paddw m1, m3 + pshufd m2, m1, 2 + paddw m1, m2 + + movd r4d, m1 + add r4d, 32 + shr r4d, 6 ; sum = sum / 64 + movd m1, r4d + pshufb m1, m0 ; m1 = byte [dc_val ...] + +%rep 2 + ; store DC 16x16 + movu [r0], m1 + movu [r0 + r1], m1 + movu [r0 + 16], m1 + movu [r0 + r1 + 16],m1 + lea r0, [r0 + 2 * r1] + movu [r0], m1 + movu [r0 + r1], m1 + movu [r0 + 16], m1 + movu [r0 + r1 + 16],m1 + lea r0, [r0 + 2 * r1] + movu [r0], m1 + movu [r0 + r1], m1 + movu [r0 + 16], m1 + movu [r0 + r1 + 16],m1 + lea r0, [r0 + 2 * r1] + movu [r0], m1 + movu [r0 + r1], m1 + movu [r0 + 16], m1 + movu [r0 + r1 + 16],m1 + lea r0, [r0 + 2 * r1] + movu [r0], m1 + movu [r0 + r1], m1 + movu [r0 + 16], m1 + movu [r0 + r1 + 16],m1 + lea r0, [r0 + 2 * r1] + movu [r0], m1 + movu [r0 + r1], m1 + movu [r0 + 16], m1 + movu [r0 + r1 + 16],m1 + lea r0, [r0 + 2 * r1] + movu [r0], m1 + movu [r0 + r1], m1 + movu [r0 + 16], m1 + movu [r0 + r1 + 16],m1 + lea r0, [r0 + 2 * r1] + movu [r0], m1 + movu [r0 + r1], m1 + movu [r0 + 16], m1 + movu [r0 + r1 + 16],m1 + lea r0, [r0 + 2 * r1] +%endrep + + RET + +;----------------------------------------------------------------------------------------------------------- +; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter) +;----------------------------------------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal intra_pred_planar4, 4,7,5 + inc r2 + inc r3 + pmovzxbw m0, [r3] ; topRow[i] = above[i]; + punpcklqdq m0, m0 + + pxor m1, m1 + movd m2, [r2 + 4] ; bottomLeft = left[4] + movzx r6d, byte [r3 + 4] ; topRight = above[4]; + pshufb m2, m1 
+ punpcklbw m2, m1 + psubw m2, m0 ; bottomRow[i] = bottomLeft - topRow[i] + psllw m0, 2 + punpcklqdq m3, m2, m1 + psubw m0, m3 + paddw m2, m2 + +%macro COMP_PRED_PLANAR_2ROW 1 + movzx r4d, byte [r2 + %1] + lea r4d, [r4d * 4 + 4] + movd m3, r4d + pshuflw m3, m3, 0 + + movzx r4d, byte [r2 + %1 + 1] + lea r4d, [r4d * 4 + 4] + movd m4, r4d + pshuflw m4, m4, 0 + punpcklqdq m3, m4 ; horPred + + movzx r4d, byte [r2 + %1] + mov r5d, r6d + sub r5d, r4d + movd m4, r5d + pshuflw m4, m4, 0 + + movzx r4d, byte [r2 + %1 + 1] + mov r5d, r6d + sub r5d, r4d + movd m1, r5d + pshuflw m1, m1, 0 + punpcklqdq m4, m1 ; rightColumnN + + pmullw m4, [multi_2Row] + paddw m3, m4 + paddw m0, m2 + paddw m3, m0 + psraw m3, 3 + packuswb m3, m3 + + movd [r0], m3 + pshufd m3, m3, 0x55 + movd [r0 + r1], m3 + lea r0, [r0 + 2 * r1] +%endmacro + + COMP_PRED_PLANAR_2ROW 0 + COMP_PRED_PLANAR_2ROW 2 + + RET + +;----------------------------------------------------------------------------------------------------------- +; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter) +;----------------------------------------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal intra_pred_planar8, 4,4,7 + inc r2 + inc r3 + pxor m0, m0 + pmovzxbw m1, [r3] ; v_topRow + pmovzxbw m2, [r2] ; v_leftColumn + + movd m3, [r3 + 8] ; topRight = above[8]; + movd m4, [r2 + 8] ; bottomLeft = left[8]; + + pshufb m3, m0 + pshufb m4, m0 + punpcklbw m3, m0 ; v_topRight + punpcklbw m4, m0 ; v_bottomLeft + + psubw m4, m1 ; v_bottomRow + psubw m3, m2 ; v_rightColumn + + psllw m1, 3 ; v_topRow + psllw m2, 3 ; v_leftColumn + + paddw m6, m2, [pw_8] + +%macro PRED_PLANAR_ROW8 1 + %if (%1 < 4) + pshuflw m5, m6, 0x55 * %1 + pshufd m5, m5, 0 + pshuflw m2, m3, 0x55 * %1 + pshufd m2, m2, 0 + %else + pshufhw m5, m6, 0x55 * (%1 - 4) + pshufd m5, m5, 0xAA + pshufhw m2, m3, 0x55 * (%1 - 4) + pshufd m2, m2, 0xAA + %endif + + pmullw m2, [multiL] + paddw m5, m2 + 
paddw m1, m4 + paddw m5, m1 + psraw m5, 4 + packuswb m5, m5 + + movh [r0], m5 + lea r0, [r0 + r1] + +%endmacro + + PRED_PLANAR_ROW8 0 + PRED_PLANAR_ROW8 1 + PRED_PLANAR_ROW8 2 + PRED_PLANAR_ROW8 3 + PRED_PLANAR_ROW8 4 + PRED_PLANAR_ROW8 5 + PRED_PLANAR_ROW8 6 + PRED_PLANAR_ROW8 7 + + RET + + +;----------------------------------------------------------------------------------------------------------- +; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter) +;----------------------------------------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal intra_pred_planar16, 4,6,8 + inc r2 + inc r3 + pxor m0, m0 + pmovzxbw m1, [r3] ; topRow[0-7] + pmovzxbw m2, [r3 + 8] ; topRow[8-15] + + movd m3, [r2 + 16] + pshufb m3, m0 + punpcklbw m3, m0 ; v_bottomLeft = left[16] + movzx r4d, byte [r3 + 16] ; topRight = above[16] + + psubw m4, m3, m1 ; v_bottomRow[0] + psubw m5, m3, m2 ; v_bottomRow[1] + + psllw m1, 4 + psllw m2, 4 + +%macro PRED_PLANAR_ROW16 1 + movzx r5d, byte [r2 + %1] + add r5d, r5d + lea r5d, [r5d * 8 + 16] + movd m3, r5d + pshuflw m3, m3, 0 + pshufd m3, m3, 0 ; horPred + + movzx r5d, byte [r2 + %1] + mov r3d, r4d + sub r3d, r5d + movd m6, r3d + pshuflw m6, m6, 0 + pshufd m6, m6, 0 + + pmullw m7, m6, [multiL] + paddw m7, m3 + paddw m1, m4 + paddw m7, m1 + psraw m7, 5 + + pmullw m6, m6, [multiH] + paddw m3, m6 + paddw m2, m5 + paddw m3, m2 + psraw m3, 5 + + packuswb m7, m3 + movu [r0], m7 + lea r0, [r0 + r1] +%endmacro + + PRED_PLANAR_ROW16 0 + PRED_PLANAR_ROW16 1 + PRED_PLANAR_ROW16 2 + PRED_PLANAR_ROW16 3 + PRED_PLANAR_ROW16 4 + PRED_PLANAR_ROW16 5 + PRED_PLANAR_ROW16 6 + PRED_PLANAR_ROW16 7 + PRED_PLANAR_ROW16 8 + PRED_PLANAR_ROW16 9 + PRED_PLANAR_ROW16 10 + PRED_PLANAR_ROW16 11 + PRED_PLANAR_ROW16 12 + PRED_PLANAR_ROW16 13 + PRED_PLANAR_ROW16 14 + PRED_PLANAR_ROW16 15 + + RET + + 
+;----------------------------------------------------------------------------------------------------------- +; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter) +;----------------------------------------------------------------------------------------------------------- +INIT_XMM sse4 +%if ARCH_X86_64 == 1 +cglobal intra_pred_planar32, 4,7,12 + %define bottomRow0 m8 + %define bottomRow1 m9 + %define bottomRow2 m10 + %define bottomRow3 m11 +%else +cglobal intra_pred_planar32, 4,7,8,0-(4*mmsize) + %define bottomRow0 [rsp + 0 * mmsize] + %define bottomRow1 [rsp + 1 * mmsize] + %define bottomRow2 [rsp + 2 * mmsize] + %define bottomRow3 [rsp + 3 * mmsize] +%endif + inc r2 + inc r3 + pxor m3, m3 + movd m0, [r2 + 32] + pshufb m0, m3 + punpcklbw m0, m3 ; v_bottomLeft = left[32] + movzx r4d, byte [r3 + 32] ; topRight = above[32] + + pmovzxbw m1, [r3 + 0] ; topRow[0] + pmovzxbw m2, [r3 + 8] ; topRow[1] + pmovzxbw m3, [r3 +16] ; topRow[2] + pmovzxbw m4, [r3 +24] ; topRow[3] + + psubw m5, m0, m1 ; v_bottomRow[0] + psubw m6, m0, m2 ; v_bottomRow[1] + psubw m7, m0, m3 ; v_bottomRow[2] + psubw m0, m4 ; v_bottomRow[3] + + mova bottomRow0, m5 + mova bottomRow1, m6 + mova bottomRow2, m7 + mova bottomRow3, m0 + + psllw m1, 5 + psllw m2, 5 + psllw m3, 5 + psllw m4, 5 + +%macro COMP_PRED_PLANAR_ROW 1 + movzx r5d, byte [r2] + shl r5d, 5 + add r5d, 32 + movd m5, r5d + pshuflw m5, m5, 0 + pshufd m5, m5, 0 ; horPred + + movzx r5d, byte [r2] + mov r6d, r4d + sub r6d, r5d + movd m6, r6d + pshuflw m6, m6, 0 + pshufd m6, m6, 0 + +%if (%1 == 0) + pmullw m7, m6, [multiL] +%else + pmullw m7, m6, [multiH2] +%endif + + paddw m7, m5 +%if (%1 == 0) + paddw m1, bottomRow0 + paddw m7, m1 +%else + paddw m3, bottomRow2 + paddw m7, m3 +%endif + psraw m7, 6 + +%if (%1 == 0) + pmullw m6, [multiH] +%else + pmullw m6, [multiH3] +%endif + paddw m6, m5 +%if (%1 == 0) + paddw m2, bottomRow1 + paddw m6, m2 +%else + paddw m4, bottomRow3 + paddw m6, m4 
+%endif + psraw m6, 6 + + packuswb m7, m6 + movu [r0 + %1], m7 +%endmacro + + mov r3, 32 +.loop: + COMP_PRED_PLANAR_ROW 0 + COMP_PRED_PLANAR_ROW 16 + inc r2 + lea r0, [r0 + r1] + + dec r3 + jnz .loop +%undef COMP_PRED_PLANAR_ROW + + RET + +;----------------------------------------------------------------------------- +; void intraPredAng(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +;----------------------------------------------------------------------------- +INIT_XMM ssse3 +cglobal intra_pred_ang4_2, 3,3,4 + cmp r4m, byte 34 + cmove r2, r3mp + movh m0, [r2 + 2] + movd [r0], m0 + palignr m1, m0, 1 + movd [r0 + r1], m1 + palignr m2, m0, 2 + movd [r0 + r1 * 2], m2 + lea r1, [r1 * 3] + psrldq m0, 3 + movd [r0 + r1], m0 + RET + + +INIT_XMM sse4 +cglobal intra_pred_ang4_3, 3,4,5 + cmp r4m, byte 33 + cmove r2, r3mp + lea r3, [ang_table + 20 * 16] + movh m0, [r2 + 1] ; [8 7 6 5 4 3 2 1] + palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2] + punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] + palignr m1, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2] + palignr m2, m0, 4 ; [x x x x x x x x 7 6 6 5 5 4 4 3] + palignr m3, m0, 6 ; [x x x x x x x x 8 7 7 6 6 5 5 4] + punpcklqdq m0, m1 + punpcklqdq m2, m3 + + movh m3, [r3 + 6 * 16] ; [26] + movhps m3, [r3] ; [20] + movh m4, [r3 - 6 * 16] ; [14] + movhps m4, [r3 - 12 * 16] ; [ 8] + jmp .do_filter4x4 + + ; NOTE: share path, input is m0=[1 0], m2=[3 2], m3,m4=coef, flag_z=no_transpose +ALIGN 16 +.do_filter4x4: + mova m1, [pw_1024] + + pmaddubsw m0, m3 + pmulhrsw m0, m1 + pmaddubsw m2, m4 + pmulhrsw m2, m1 + packuswb m0, m2 + + ; NOTE: mode 33 doesn't reorde, UNSAFE but I don't use any instruction that affect eflag register before + jz .store + + ; transpose 4x4 + pshufb m0, [c_trans_4x4] + +.store: + ; TODO: use pextrd here after intrinsic ssse3 removed + movd [r0], m0 + pextrd [r0 + r1], m0, 1 + pextrd [r0 + r1 * 2], m0, 2 + lea r1, [r1 * 3] + pextrd [r0 + r1], m0, 3 + RET + + +cglobal 
intra_pred_ang4_4, 3,4,5 + cmp r4m, byte 32 + cmove r2, r3mp + lea r3, [ang_table + 18 * 16] + movh m0, [r2 + 1] ; [8 7 6 5 4 3 2 1] + palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2] + punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] + palignr m1, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2] + palignr m3, m0, 4 ; [x x x x x x x x 7 6 6 5 5 4 4 3] + punpcklqdq m0, m1 + punpcklqdq m2, m1, m3 + + movh m3, [r3 + 3 * 16] ; [21] + movhps m3, [r3 - 8 * 16] ; [10] + movh m4, [r3 + 13 * 16] ; [31] + movhps m4, [r3 + 2 * 16] ; [20] + jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) + + +cglobal intra_pred_ang4_5, 3,4,5 + cmp r4m, byte 31 + cmove r2, r3mp + lea r3, [ang_table + 10 * 16] + movh m0, [r2 + 1] ; [8 7 6 5 4 3 2 1] + palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2] + punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] + palignr m1, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2] + palignr m3, m0, 4 ; [x x x x x x x x 7 6 6 5 5 4 4 3] + punpcklqdq m0, m1 + punpcklqdq m2, m1, m3 + + movh m3, [r3 + 7 * 16] ; [17] + movhps m3, [r3 - 8 * 16] ; [ 2] + movh m4, [r3 + 9 * 16] ; [19] + movhps m4, [r3 - 6 * 16] ; [ 4] + jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) + + +cglobal intra_pred_ang4_6, 3,4,5 + cmp r4m, byte 30 + cmove r2, r3mp + lea r3, [ang_table + 19 * 16] + movh m0, [r2 + 1] ; [8 7 6 5 4 3 2 1] + palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2] + punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] + palignr m2, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2] + punpcklqdq m0, m0 + punpcklqdq m2, m2 + + movh m3, [r3 - 6 * 16] ; [13] + movhps m3, [r3 + 7 * 16] ; [26] + movh m4, [r3 - 12 * 16] ; [ 7] + movhps m4, [r3 + 1 * 16] ; [20] + jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) + + +cglobal intra_pred_ang4_7, 3,4,5 + cmp r4m, byte 29 + cmove r2, r3mp + lea r3, [ang_table + 20 * 16] + movh m0, [r2 + 1] ; [8 7 6 5 4 3 2 1] + palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2] + punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 
3 2 2 1] + palignr m3, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2] + punpcklqdq m2, m0, m3 + punpcklqdq m0, m0 + + movh m3, [r3 - 11 * 16] ; [ 9] + movhps m3, [r3 - 2 * 16] ; [18] + movh m4, [r3 + 7 * 16] ; [27] + movhps m4, [r3 - 16 * 16] ; [ 4] + jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) + + +cglobal intra_pred_ang4_8, 3,4,5 + cmp r4m, byte 28 + cmove r2, r3mp + lea r3, [ang_table + 13 * 16] + movh m0, [r2 + 1] ; [8 7 6 5 4 3 2 1] + palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2] + punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] + punpcklqdq m0, m0 + mova m2, m0 + + movh m3, [r3 - 8 * 16] ; [ 5] + movhps m3, [r3 - 3 * 16] ; [10] + movh m4, [r3 + 2 * 16] ; [15] + movhps m4, [r3 + 7 * 16] ; [20] + jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) + + +cglobal intra_pred_ang4_9, 3,4,5 + cmp r4m, byte 27 + cmove r2, r3mp + lea r3, [ang_table + 4 * 16] + movh m0, [r2 + 1] ; [8 7 6 5 4 3 2 1] + palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2] + punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] + punpcklqdq m0, m0 + mova m2, m0 + + movh m3, [r3 - 2 * 16] ; [ 2] + movhps m3, [r3 - 0 * 16] ; [ 4] + movh m4, [r3 + 2 * 16] ; [ 6] + movhps m4, [r3 + 4 * 16] ; [ 8] + jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) + + +cglobal intra_pred_ang4_10, 3,3,4 + movd m0, [r2 + 1] ; [8 7 6 5 4 3 2 1] + pshufb m0, [pb_unpackbd1] + + pshufd m1, m0, 1 + movhlps m2, m0 + pshufd m3, m0, 3 + movd [r0 + r1], m1 + movd [r0 + r1 * 2], m2 + lea r1, [r1 * 3] + movd [r0 + r1], m3 + + cmp r5m, byte 0 + jz .quit + + ; filter + mov r2, r3mp + pmovzxbw m0, m0 ; [-1 -1 -1 -1] + movh m1, [r2] ; [4 3 2 1 0] + pshufb m2, m1, [pb_0_8] ; [0 0 0 0] + pshufb m1, [pb_unpackbw1] ; [4 3 2 1] + psubw m1, m2 + psraw m1, 1 + paddw m0, m1 + packuswb m0, m0 + +.quit: + movd [r0], m0 + RET + + +INIT_XMM sse4 +cglobal intra_pred_ang4_26, 4,4,3 + movd m0, [r3 + 1] ; [8 7 6 5 4 3 2 1] + + ; store + movd [r0], m0 + movd [r0 + r1], m0 + 
movd [r0 + r1 * 2], m0 + lea r3, [r1 * 3] + movd [r0 + r3], m0 + + ; filter + cmp r5m, byte 0 + jz .quit + + pshufb m0, [pb_0_8] ; [ 1 1 1 1] + movh m1, [r2] ; [-4 -3 -2 -1 0] + pshufb m2, m1, [pb_0_8] ; [0 0 0 0] + pshufb m1, [pb_unpackbw1] ; [-4 -3 -2 -1] + psubw m1, m2 + psraw m1, 1 + paddw m0, m1 + packuswb m0, m0 + + pextrb [r0], m0, 0 + pextrb [r0 + r1], m0, 1 + pextrb [r0 + r1 * 2], m0, 2 + pextrb [r0 + r3], m0, 3 + +.quit: + RET + + +cglobal intra_pred_ang4_11, 3,4,5 + cmp r4m, byte 25 + cmove r2, r3mp + lea r3, [ang_table + 24 * 16] + movh m0, [r2] ; [x x x 4 3 2 1 0] + palignr m1, m0, 1 ; [x x x x 4 3 2 1] + punpcklbw m0, m1 ; [x x x x x x x x 4 3 3 2 2 1 1 0] + punpcklqdq m0, m0 + mova m2, m0 + + movh m3, [r3 + 6 * 16] ; [24] + movhps m3, [r3 + 4 * 16] ; [26] + movh m4, [r3 + 2 * 16] ; [28] + movhps m4, [r3 + 0 * 16] ; [30] + jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) + + +cglobal intra_pred_ang4_12, 3,4,5 + cmp r4m, byte 24 + cmove r2, r3mp + lea r3, [ang_table + 20 * 16] + movh m0, [r2] ; [x x x 4 3 2 1 0] + palignr m1, m0, 1 ; [x x x x 4 3 2 1] + punpcklbw m0, m1 ; [x x x x x x x x 4 3 3 2 2 1 1 0] + punpcklqdq m0, m0 + mova m2, m0 + + movh m3, [r3 + 7 * 16] ; [27] + movhps m3, [r3 + 2 * 16] ; [22] + movh m4, [r3 - 3 * 16] ; [17] + movhps m4, [r3 - 8 * 16] ; [12] + jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) + + +cglobal intra_pred_ang4_13, 4,4,5 + cmp r4m, byte 23 + jnz .load + xchg r2, r3 +.load: + movh m1, [r2 - 1] ; [x x 4 3 2 1 0 x] + palignr m0, m1, 1 ; [x x x 4 3 2 1 0] + palignr m2, m1, 2 ; [x x x x 4 3 2 1] + pinsrb m1, [r3 + 4], 0 + punpcklbw m1, m0 ; [3 2 2 1 1 0 0 x] + punpcklbw m0, m2 ; [4 3 3 2 2 1 1 0] + punpcklqdq m2, m0, m1 + punpcklqdq m0, m0 + + lea r3, [ang_table + 21 * 16] + movh m3, [r3 + 2 * 16] ; [23] + movhps m3, [r3 - 7 * 16] ; [14] + movh m4, [r3 - 16 * 16] ; [ 5] + movhps m4, [r3 + 7 * 16] ; [28] + jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ 
SUFFIX %+ .do_filter4x4) + + +cglobal intra_pred_ang4_14, 4,4,5 + cmp r4m, byte 22 + jnz .load + xchg r2, r3 +.load: + movh m2, [r2 - 1] ; [x x 4 3 2 1 0 x] + palignr m0, m2, 1 ; [x x x 4 3 2 1 0] + palignr m1, m2, 2 ; [x x x x 4 3 2 1] + pinsrb m2, [r3 + 2], 0 + punpcklbw m2, m0 ; [3 2 2 1 1 0 0 x] + punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0] + punpcklqdq m0, m0 + punpcklqdq m2, m2 + + lea r3, [ang_table + 19 * 16] + movh m3, [r3 + 0 * 16] ; [19] + movhps m3, [r3 - 13 * 16] ; [ 6] + movh m4, [r3 + 6 * 16] ; [25] + movhps m4, [r3 - 7 * 16] ; [12] + jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) + + +cglobal intra_pred_ang4_15, 4,4,5 + cmp r4m, byte 21 + jnz .load + xchg r2, r3 +.load: + movh m2, [r2 - 1] ; [x x 4 3 2 1 0 x] + palignr m0, m2, 1 ; [x x x 4 3 2 1 0] + palignr m1, m2, 2 ; [x x x x 4 3 2 1] + pinsrb m2, [r3 + 2], 0 + pslldq m3, m2, 1 ; [x 4 3 2 1 0 x y] + pinsrb m3, [r3 + 4], 0 + punpcklbw m4, m3, m2 ; [2 1 1 0 0 x x y] + punpcklbw m2, m0 ; [3 2 2 1 1 0 0 x] + punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0] + punpcklqdq m0, m2 + punpcklqdq m2, m4 + + lea r3, [ang_table + 23 * 16] + movh m3, [r3 - 8 * 16] ; [15] + movhps m3, [r3 + 7 * 16] ; [30] + movh m4, [r3 - 10 * 16] ; [13] + movhps m4, [r3 + 5 * 16] ; [28] + jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) + + +cglobal intra_pred_ang4_16, 4,4,5 + cmp r4m, byte 20 + jnz .load + xchg r2, r3 +.load: + movh m2, [r2 - 1] ; [x x 4 3 2 1 0 x] + palignr m0, m2, 1 ; [x x x 4 3 2 1 0] + palignr m1, m2, 2 ; [x x x x 4 3 2 1] + pinsrb m2, [r3 + 2], 0 + pslldq m3, m2, 1 ; [x 4 3 2 1 0 x y] + pinsrb m3, [r3 + 3], 0 + punpcklbw m4, m3, m2 ; [2 1 1 0 0 x x y] + punpcklbw m2, m0 ; [3 2 2 1 1 0 0 x] + punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0] + punpcklqdq m0, m2 + punpcklqdq m2, m4 + + lea r3, [ang_table + 19 * 16] + movh m3, [r3 - 8 * 16] ; [11] + movhps m3, [r3 + 3 * 16] ; [22] + movh m4, [r3 - 18 * 16] ; [ 1] + movhps m4, [r3 - 7 * 16] ; [12] + jmp mangle(private_prefix %+ 
_ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) + + +cglobal intra_pred_ang4_17, 4,4,5 + cmp r4m, byte 19 + jnz .load + xchg r2, r3 +.load: + movh m3, [r2 - 1] ; [- - 4 3 2 1 0 x] + palignr m0, m3, 1 ; [- - - 4 3 2 1 0] + palignr m1, m3, 2 ; [- - - - 4 3 2 1] + mova m4, m0 + punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0] + + pinsrb m3, [r3 + 1], 0 + punpcklbw m1, m3, m4 ; [3 2 2 1 1 0 0 x] + punpcklqdq m0, m1 + + pslldq m2, m3, 1 ; [- 4 3 2 1 0 x y] + pinsrb m2, [r3 + 2], 0 + pslldq m1, m2, 1 ; [4 3 2 1 0 x y z] + pinsrb m1, [r3 + 4], 0 + punpcklbw m1, m2 ; [1 0 0 x x y y z] + punpcklbw m2, m3 ; [2 1 1 0 0 x x y] + punpcklqdq m2, m1 + + lea r3, [ang_table + 14 * 16] + movh m3, [r3 - 8 * 16] ; [ 6] + movhps m3, [r3 - 2 * 16] ; [12] + movh m4, [r3 + 4 * 16] ; [18] + movhps m4, [r3 + 10 * 16] ; [24] + jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) + + +cglobal intra_pred_ang4_18, 4,4,1 + mov r2d, [r2] + bswap r2d + movd m0, r2d + pinsrd m0, [r3 + 1], 1 ; [- 3 2 1 0 -1 -2 -3] + lea r2, [r1 * 3] + movd [r0 + r2], m0 + psrldq m0, 1 + movd [r0 + r1 * 2], m0 + psrldq m0, 1 + movd [r0 + r1], m0 + psrldq m0, 1 + movd [r0], m0 + RET +;----------------------------------------------------------------------------- +; void intraPredAng8(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +;----------------------------------------------------------------------------- +INIT_XMM ssse3 +cglobal intra_pred_ang8_2, 3,5,2 + cmp r4m, byte 34 + cmove r2, r3mp + movu m0, [r2 + 2] + lea r4, [r1 * 3] + + movh [r0], m0 + palignr m1, m0, 1 + movh [r0 + r1], m1 + palignr m1, m0, 2 + movh [r0 + r1 * 2], m1 + palignr m1, m0, 3 + movh [r0 + r4], m1 + palignr m1, m0, 4 + lea r0, [r0 + r1 * 4] + movh [r0], m1 + palignr m1, m0, 5 + movh [r0 + r1], m1 + palignr m1, m0, 6 + movh [r0 + r1 * 2], m1 + palignr m1, m0, 7 + movh [r0 + r4], m1 + RET + +INIT_XMM sse4 +cglobal intra_pred_ang8_3, 3,5,8 + cmp r4m, byte 33 + cmove r2, r3mp + lea r3, 
[ang_table + 22 * 16] + lea r4, [ang_table + 8 * 16] + mova m3, [pw_1024] + + movu m0, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] + + punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] + punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] + palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2] + + pmaddubsw m4, m0, [r3 + 4 * 16] ; [26] + pmulhrsw m4, m3 + pmaddubsw m1, [r3 - 2 * 16] ; [20] + pmulhrsw m1, m3 + packuswb m4, m1 + + palignr m5, m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3] + + pmaddubsw m5, [r3 - 8 * 16] ; [14] + pmulhrsw m5, m3 + + palignr m6, m2, m0, 6 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4] + + pmaddubsw m6, [r4] ; [ 8] + pmulhrsw m6, m3 + packuswb m5, m6 + + palignr m1, m2, m0, 8 ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5] + + pmaddubsw m6, m1, [r4 - 6 * 16] ; [ 2] + pmulhrsw m6, m3 + + pmaddubsw m1, [r3 + 6 * 16] ; [28] + pmulhrsw m1, m3 + packuswb m6, m1 + + palignr m1, m2, m0, 10 ; [14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6] + + pmaddubsw m1, [r3] ; [22] + pmulhrsw m1, m3 + + palignr m2, m0, 12 ; [15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7] + + pmaddubsw m2, [r3 - 6 * 16] ; [16] + pmulhrsw m2, m3 + packuswb m1, m2 + jmp .transpose8x8 + +ALIGN 16 +.transpose8x8: + jz .store + + ; transpose 8x8 + punpckhbw m0, m4, m5 + punpcklbw m4, m5 + punpckhbw m2, m4, m0 + punpcklbw m4, m0 + + punpckhbw m0, m6, m1 + punpcklbw m6, m1 + punpckhbw m1, m6, m0 + punpcklbw m6, m0 + + punpckhdq m5, m4, m6 + punpckldq m4, m6 + punpckldq m6, m2, m1 + punpckhdq m2, m1 + mova m1, m2 + +.store: + lea r4, [r1 * 3] + movh [r0], m4 + movhps [r0 + r1], m4 + movh [r0 + r1 * 2], m5 + movhps [r0 + r4], m5 + add r0, r4 + movh [r0 + r1], m6 + movhps [r0 + r1 * 2], m6 + movh [r0 + r4], m1 + movhps [r0 + r1 * 4], m1 + RET + +cglobal intra_pred_ang8_4, 3,5,8 + cmp r4m, byte 32 + cmove r2, r3mp + lea r3, [ang_table + 24 * 16] + lea r4, [ang_table + 10 * 16] + mova m3, [pw_1024] + + movu 
m0, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] + + punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] + punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] + palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2] + mova m5, m1 + + pmaddubsw m4, m0, [r3 - 3 * 16] ; [21] + pmulhrsw m4, m3 + pmaddubsw m1, [r4] ; [10] + pmulhrsw m1, m3 + packuswb m4, m1 + + pmaddubsw m5, [r3 + 7 * 16] ; [31] + pmulhrsw m5, m3 + + palignr m6, m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3] + + pmaddubsw m6, [r3 - 4 * 16] ; [ 20] + pmulhrsw m6, m3 + packuswb m5, m6 + + palignr m1, m2, m0, 6 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4] + + pmaddubsw m6, m1, [r4 - 1 * 16] ; [ 9] + pmulhrsw m6, m3 + + pmaddubsw m1, [r3 + 6 * 16] ; [30] + pmulhrsw m1, m3 + packuswb m6, m1 + + palignr m1, m2, m0, 8 ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5] + + pmaddubsw m1, [r3 - 5 * 16] ; [19] + pmulhrsw m1, m3 + + palignr m2, m0, 10 ; [14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 8] + + pmaddubsw m2, [r4 - 2 * 16] ; [8] + pmulhrsw m2, m3 + packuswb m1, m2 + jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) + +cglobal intra_pred_ang8_5, 3,5,8 + cmp r4m, byte 31 + cmove r2, r3mp + lea r3, [ang_table + 17 * 16] + lea r4, [ang_table + 2 * 16] + mova m3, [pw_1024] + + movu m0, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] + + punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] + punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] + palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2] + mova m5, m1 + + pmaddubsw m4, m0, [r3] ; [17] + pmulhrsw m4, m3 + pmaddubsw m1, [r4] ; [2] + pmulhrsw m1, m3 + packuswb m4, m1 + + pmaddubsw m5, [r3 + 2 * 16] ; [19] + pmulhrsw m5, m3 + + palignr m6, m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3] + mova m1, m6 + + pmaddubsw m1, [r4 + 2 * 16] ; [4] + pmulhrsw m1, m3 + 
packuswb m5, m1 + + pmaddubsw m6, [r3 + 4 * 16] ; [21] + pmulhrsw m6, m3 + + palignr m1, m2, m0, 6 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4] + + mova m7, m1 + pmaddubsw m7, [r4 + 4 * 16] ; [6] + pmulhrsw m7, m3 + packuswb m6, m7 + + pmaddubsw m1, [r3 + 6 * 16] ; [23] + pmulhrsw m1, m3 + + palignr m2, m0, 8 ; [13 12 12 11 11 10 10 9 9 8 8 7 7 8 8 9] + + pmaddubsw m2, [r4 + 6 * 16] ; [8] + pmulhrsw m2, m3 + packuswb m1, m2 + jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) + +cglobal intra_pred_ang8_6, 3,5,8 + cmp r4m, byte 30 + cmove r2, r3mp + lea r3, [ang_table + 20 * 16] + lea r4, [ang_table + 8 * 16] + mova m7, [pw_1024] + + movu m0, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] + + punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] + punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] + mova m1, m0 + + pmaddubsw m4, m0, [r3 - 7 * 16] ; [13] + pmulhrsw m4, m7 + pmaddubsw m1, [r3 + 6 * 16] ; [26] + pmulhrsw m1, m7 + packuswb m4, m1 + + palignr m6, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2] + + pmaddubsw m5, m6, [r4 - 1 * 16] ; [7] + pmulhrsw m5, m7 + + pmaddubsw m6, [r3] ; [20] + pmulhrsw m6, m7 + packuswb m5, m6 + + palignr m1, m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3] + + pmaddubsw m6, m1, [r4 - 7 * 16] ; [1] + pmulhrsw m6, m7 + + mova m3, m1 + pmaddubsw m3, [r3 - 6 * 16] ; [14] + pmulhrsw m3, m7 + packuswb m6, m3 + + pmaddubsw m1, [r3 + 7 * 16] ; [27] + pmulhrsw m1, m7 + + palignr m2, m0, 6 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4] + + pmaddubsw m2, [r4] ; [8] + pmulhrsw m2, m7 + packuswb m1, m2 + jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) + +cglobal intra_pred_ang8_7, 3,5,8 + cmp r4m, byte 29 + cmove r2, r3mp + lea r3, [ang_table + 24 * 16] + lea r4, [ang_table + 6 * 16] + mova m7, [pw_1024] + + movu m0, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + palignr m1, m0, 1 ; [x 16 15 14 13 12 
11 10 9 8 7 6 5 4 3 2] + + punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] + punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] + + pmaddubsw m4, m0, [r4 + 3 * 16] ; [9] + pmulhrsw m4, m7 + pmaddubsw m3, m0, [r3 - 6 * 16] ; [18] + pmulhrsw m3, m7 + packuswb m4, m3 + + pmaddubsw m5, m0, [r3 + 3 * 16] ; [27] + pmulhrsw m5, m7 + + palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2] + + pmaddubsw m6, m1, [r4 - 2 * 16] ; [4] + pmulhrsw m6, m7 + packuswb m5, m6 + + pmaddubsw m6, m1, [r4 + 7 * 16] ; [13] + pmulhrsw m6, m7 + + mova m3, m1 + pmaddubsw m3, [r3 - 2 * 16] ; [22] + pmulhrsw m3, m7 + packuswb m6, m3 + + pmaddubsw m1, [r3 + 7 * 16] ; [31] + pmulhrsw m1, m7 + + palignr m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3] + + pmaddubsw m2, [r4 + 2 * 16] ; [8] + pmulhrsw m2, m7 + packuswb m1, m2 + jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) + +cglobal intra_pred_ang8_8, 3,5,8 + cmp r4m, byte 28 + cmove r2, r3mp + lea r3, [ang_table + 23 * 16] + lea r4, [ang_table + 8 * 16] + mova m7, [pw_1024] + + movu m0, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] + + punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] + punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] + palignr m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2] + + pmaddubsw m4, m0, [r4 - 3 * 16] ; [5] + pmulhrsw m4, m7 + pmaddubsw m3, m0, [r4 + 2 * 16] ; [10] + pmulhrsw m3, m7 + packuswb m4, m3 + + pmaddubsw m5, m0, [r3 - 8 * 16] ; [15] + pmulhrsw m5, m7 + + pmaddubsw m6, m0, [r3 - 3 * 16] ; [20] + pmulhrsw m6, m7 + packuswb m5, m6 + + pmaddubsw m6, m0, [r3 + 2 * 16] ; [25] + pmulhrsw m6, m7 + + pmaddubsw m0, [r3 + 7 * 16] ; [30] + pmulhrsw m0, m7 + packuswb m6, m0 + + pmaddubsw m1, m2, [r4 - 5 * 16] ; [3] + pmulhrsw m1, m7 + + pmaddubsw m2, [r4] ; [8] + pmulhrsw m2, m7 + packuswb m1, m2 + jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ 
.transpose8x8) + +cglobal intra_pred_ang8_9, 3,5,8 + cmp r4m, byte 27 + cmove r2, r3mp + lea r3, [ang_table + 10 * 16] + mova m7, [pw_1024] + + movu m0, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] + + punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] + + pmaddubsw m4, m0, [r3 - 8 * 16] ; [2] + pmulhrsw m4, m7 + pmaddubsw m3, m0, [r3 - 6 * 16] ; [4] + pmulhrsw m3, m7 + packuswb m4, m3 + + pmaddubsw m5, m0, [r3 - 4 * 16] ; [6] + pmulhrsw m5, m7 + + pmaddubsw m6, m0, [r3 - 2 * 16] ; [8] + pmulhrsw m6, m7 + packuswb m5, m6 + + pmaddubsw m6, m0, [r3] ; [10] + pmulhrsw m6, m7 + + pmaddubsw m2, m0, [r3 + 2 * 16] ; [12] + pmulhrsw m2, m7 + packuswb m6, m2 + + pmaddubsw m1, m0, [r3 + 4 * 16] ; [14] + pmulhrsw m1, m7 + + pmaddubsw m0, [r3 + 6 * 16] ; [16] + pmulhrsw m0, m7 + packuswb m1, m0 + jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) + +cglobal intra_pred_ang8_10, 4,5,5 + movh m0, [r2 + 1] + mova m4, [pb_unpackbq] + palignr m1, m0, 2 + pshufb m1, m4 + palignr m2, m0, 4 + pshufb m2, m4 + palignr m3, m0, 6 + pshufb m3, m4 + pshufb m0, m4 + + lea r4, [r1 * 3] + movhps [r0 + r1], m0 + movh [r0 + r1 * 2], m1 + movhps [r0 + r4], m1 + lea r2, [r0 + r1 * 4] + movh [r2], m2 + movhps [r2 + r1], m2 + movh [r2 + r1 * 2], m3 + movhps [r2 + r4], m3 + +; filter + cmp r5m, byte 0 + jz .quit + + pmovzxbw m0, m0 + movu m1, [r3] + palignr m2, m1, 1 + pshufb m1, m4 + pmovzxbw m1, m1 + pmovzxbw m2, m2 + psubw m2, m1 + psraw m2, 1 + paddw m0, m2 + packuswb m0, m0 + +.quit: + movh [r0], m0 + RET + +cglobal intra_pred_ang8_26, 4,5,3 + movh m0, [r3 + 1] + + lea r4, [r1 * 3] + movh [r0], m0 + movh [r0 + r1], m0 + movh [r0 + r1 * 2], m0 + movh [r0 + r4], m0 + lea r3, [r0 + r1 * 4] + movh [r3], m0 + movh [r3 + r1], m0 + movh [r3 + r1 * 2], m0 + movh [r3 + r4], m0 + +; filter + cmp r5m, byte 0 + jz .quit + + pshufb m0, [pb_unpackbq] + pmovzxbw m0, m0 + movu m1, [r2] + palignr m2, m1, 1 + 
pshufb m1, [pb_unpackbq] + pmovzxbw m1, m1 + pmovzxbw m2, m2 + psubw m2, m1 + psraw m2, 1 + paddw m0, m2 + packuswb m0, m0 + pextrb [r0], m0, 0 + pextrb [r0 + r1], m0, 1 + pextrb [r0 + r1 * 2], m0, 2 + pextrb [r0 + r4], m0, 3 + pextrb [r3], m0, 4 + pextrb [r3 + r1], m0, 5 + pextrb [r3 + r1 * 2], m0, 6 + pextrb [r3 + r4], m0, 7 + +.quit: + RET + +cglobal intra_pred_ang8_11, 3,5,8 + cmp r4m, byte 25 + cmove r2, r3mp + lea r3, [ang_table + 23 * 16] + mova m7, [pw_1024] + + movu m0, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] + palignr m1, m0, 1 ; [x 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + + punpcklbw m0, m1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] + + pmaddubsw m4, m0, [r3 + 7 * 16] ; [30] + pmulhrsw m4, m7 + pmaddubsw m3, m0, [r3 + 5 * 16] ; [28] + pmulhrsw m3, m7 + packuswb m4, m3 + + pmaddubsw m5, m0, [r3 + 3 * 16] ; [26] + pmulhrsw m5, m7 + + pmaddubsw m6, m0, [r3 + 1 * 16] ; [24] + pmulhrsw m6, m7 + packuswb m5, m6 + + pmaddubsw m6, m0, [r3 - 1 * 16] ; [22] + pmulhrsw m6, m7 + + pmaddubsw m2, m0, [r3 - 3 * 16] ; [20] + pmulhrsw m2, m7 + packuswb m6, m2 + + pmaddubsw m1, m0, [r3 - 5 * 16] ; [18] + pmulhrsw m1, m7 + + pmaddubsw m0, [r3 - 7 * 16] ; [16] + pmulhrsw m0, m7 + packuswb m1, m0 + jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) + +cglobal intra_pred_ang8_12, 4,5,8 + cmp r4m, byte 24 + mov r4, r2 + cmovz r2, r3 + cmovz r3, r4 + + lea r4, [ang_table + 22 * 16] + mova m7, [pw_1024] + + movu m1, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] + pslldq m0, m1, 1 ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a] + pinsrb m0, [r3 + 6], 0 + punpckhbw m2, m0, m1 ; [15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7] + punpcklbw m0, m1 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a] + palignr m2, m0, 2 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] + + pmaddubsw m4, m2, [r4 + 5 * 16] ; [27] + pmulhrsw m4, m7 + pmaddubsw m3, m2, [r4] ; [22] + pmulhrsw m3, m7 + packuswb m4, m3 + + pmaddubsw m1, m0, [r4 + 7 * 16] ; [29] + pmulhrsw m1, m7 + + pmaddubsw m0, [r4 + 2 * 16] ; [24] + 
pmulhrsw m0, m7 + packuswb m1, m0 + + pmaddubsw m5, m2, [r4 - 5 * 16] ; [17] + pmulhrsw m5, m7 + + lea r4, [ang_table + 7 * 16] + pmaddubsw m6, m2, [r4 + 5 * 16] ; [12] + pmulhrsw m6, m7 + packuswb m5, m6 + + pmaddubsw m6, m2, [r4] ; [7] + pmulhrsw m6, m7 + + pmaddubsw m2, [r4 - 5 * 16] ; [2] + pmulhrsw m2, m7 + packuswb m6, m2 + jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) + +cglobal intra_pred_ang8_13, 4,5,8 + cmp r4m, byte 23 + mov r4, r2 + cmovz r2, r3 + cmovz r3, r4 + + lea r4, [ang_table + 24 * 16] + mova m7, [pw_1024] + + movu m1, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] + pslldq m1, 1 ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a] + pinsrb m1, [r3 + 4], 0 + pslldq m0, m1, 1 ; [13 12 11 10 9 8 7 6 5 4 3 2 1 0 a b] + pinsrb m0, [r3 + 7], 0 + punpckhbw m5, m0, m1 ; [14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6] + punpcklbw m0, m1 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b] + palignr m1, m5, m0, 2 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a] + palignr m5, m0, 4 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] + + pmaddubsw m4, m5, [r4 - 1 * 16] ; [23] + pmulhrsw m4, m7 + + pmaddubsw m6, m1, [r4 + 4 * 16] ; [28] + pmulhrsw m6, m7 + + pmaddubsw m0, [r4] ; [24] + pmulhrsw m0, m7 + + lea r4, [ang_table + 13 * 16] + pmaddubsw m3, m5, [r4 + 1 * 16] ; [14] + pmulhrsw m3, m7 + packuswb m4, m3 + + pmaddubsw m5, [r4 - 8 * 16] ; [5] + pmulhrsw m5, m7 + packuswb m5, m6 + + pmaddubsw m6, m1, [r4 + 6 * 16] ; [19] + pmulhrsw m6, m7 + + pmaddubsw m2, m1, [r4 - 3 * 16] ; [10] + pmulhrsw m2, m7 + packuswb m6, m2 + + pmaddubsw m1, [r4 - 12 * 16] ; [1] + pmulhrsw m1, m7 + packuswb m1, m0 + jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) + +cglobal intra_pred_ang8_14, 4,5,8 + cmp r4m, byte 22 + mov r4, r2 + cmovz r2, r3 + cmovz r3, r4 + + lea r4, [ang_table + 24 * 16] + mova m3, [pw_1024] + + movu m1, [r2 - 2] ; [13 12 11 10 9 8 7 6 5 4 3 2 1 0 a b] + pinsrb m1, [r3 + 2], 1 + pinsrb m1, [r3 + 5], 0 + pslldq m0, m1, 1 ; [12 11 10 9 8 7 6 5 4 3 2 1 0 
a b c] + pinsrb m0, [r3 + 7], 0 + punpckhbw m2, m0, m1 ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5] + punpcklbw m0, m1 ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c] + palignr m1, m2, m0, 2 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b] + palignr m6, m2, m0, 4 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a] + palignr m2, m0, 6 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] + + pmaddubsw m4, m2, [r4 - 5 * 16] ; [19] + pmulhrsw m4, m3 + + pmaddubsw m0, [r4] ; [24] + pmulhrsw m0, m3 + + pmaddubsw m5, m6, [r4 + 1 * 16] ; [25] + pmulhrsw m5, m3 + + lea r4, [ang_table + 12 * 16] + pmaddubsw m6, [r4] ; [12] + pmulhrsw m6, m3 + packuswb m5, m6 + + pmaddubsw m6, m1, [r4 + 19 * 16] ; [31] + pmulhrsw m6, m3 + + pmaddubsw m2, [r4 - 6 * 16] ; [6] + pmulhrsw m2, m3 + packuswb m4, m2 + + pmaddubsw m2, m1, [r4 + 6 * 16] ; [18] + pmulhrsw m2, m3 + packuswb m6, m2 + + pmaddubsw m1, [r4 - 7 * 16] ; [5] + pmulhrsw m1, m3 + packuswb m1, m0 + jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) + +cglobal intra_pred_ang8_15, 4,5,8 + cmp r4m, byte 21 + mov r4, r2 + cmovz r2, r3 + cmovz r3, r4 + + lea r4, [ang_table + 23 * 16] + mova m3, [pw_1024] + + movu m1, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] + movu m2, [r3] + pshufb m2, [c_mode16_15] + palignr m1, m2, 13 ; [12 11 10 9 8 7 6 5 4 3 2 1 0 a b c] + pslldq m0, m1, 1 ; [11 10 9 8 7 6 5 4 3 2 1 0 a b c d] + pinsrb m0, [r3 + 8], 0 + punpckhbw m4, m0, m1 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4] + punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0 0 a a b b c c d] + palignr m1, m4, m0, 2 ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c] + palignr m6, m4, m0, 4 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b] + palignr m5, m4, m0, 6 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a] + palignr m4, m0, 8 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] + + pmaddubsw m4, [r4 - 8 * 16] ; [15] + pmulhrsw m4, m3 + + pmaddubsw m2, m5, [r4 + 7 * 16] ; [30] + pmulhrsw m2, m3 + packuswb m4, m2 + + pmaddubsw m5, [r4 - 10 * 16] ; [13] + pmulhrsw m5, m3 + + pmaddubsw m2, m6, [r4 + 5 * 16] ; [28] + pmulhrsw m2, m3 + packuswb m5, m2 + + 
pmaddubsw m2, m1, [r4 + 3 * 16] ; [26] + pmulhrsw m2, m3 + + pmaddubsw m0, [r4 + 1 * 16] ; [24] + pmulhrsw m0, m3 + + lea r4, [ang_table + 11 * 16] + pmaddubsw m6, [r4] ; [11] + pmulhrsw m6, m3 + packuswb m6, m2 + + pmaddubsw m1, [r4 - 2 * 16] ; [9] + pmulhrsw m1, m3 + packuswb m1, m0 + jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) + +cglobal intra_pred_ang8_16, 4,5,8 + cmp r4m, byte 20 + mov r4, r2 + cmovz r2, r3 + cmovz r3, r4 + + lea r4, [ang_table + 22 * 16] + mova m7, [pw_1024] + + movu m1, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] + movu m2, [r3] + pshufb m2, [c_mode16_16] + palignr m1, m2, 12 ; [11 10 9 8 7 6 5 4 3 2 1 0 a b c d] + pslldq m0, m1, 1 ; [10 9 8 7 6 5 4 3 2 1 0 a b c d e] + pinsrb m0, [r3 + 8], 0 + punpckhbw m4, m0, m1 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3] + punpcklbw m0, m1 ; [3 2 2 1 1 0 0 a a b b c c d d e] + palignr m1, m4, m0, 2 ; [4 3 3 2 2 1 1 0 0 a a b b c c d] + palignr m6, m4, m0, 4 ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c] + palignr m2, m4, m0, 6 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b] + palignr m5, m4, m0, 8 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a] + palignr m4, m0, 10 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] + + pmaddubsw m3, m5, [r4] ; [22] + pmulhrsw m3, m7 + + pmaddubsw m0, [r4 + 2 * 16] ; [24] + pmulhrsw m0, m7 + + lea r4, [ang_table + 9 * 16] + + pmaddubsw m4, [r4 + 2 * 16] ; [11] + pmulhrsw m4, m7 + packuswb m4, m3 + + pmaddubsw m2, [r4 + 3 * 16] ; [12] + pmulhrsw m2, m7 + + pmaddubsw m5, [r4 - 8 * 16] ; [1] + pmulhrsw m5, m7 + packuswb m5, m2 + + mova m2, m6 + pmaddubsw m6, [r4 + 14 * 16] ; [23] + pmulhrsw m6, m7 + + pmaddubsw m2, [r4 - 7 * 16] ; [2] + pmulhrsw m2, m7 + packuswb m6, m2 + + pmaddubsw m1, [r4 + 4 * 16] ; [13] + pmulhrsw m1, m7 + packuswb m1, m0 + jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) + +cglobal intra_pred_ang8_17, 4,5,8 + cmp r4m, byte 19 + mov r4, r2 + cmovz r2, r3 + cmovz r3, r4 + + lea r4, [ang_table + 17 * 16] + mova m3, [pw_1024] + + movu m2, 
[r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] + movu m1, [r3] + pshufb m1, [c_mode16_17] + palignr m2, m1, 11 ; [10 9 8 7 6 5 4 3 2 1 0 a b c d e] + pslldq m0, m2, 1 ; [9 8 7 6 5 4 3 2 1 0 a b c d e f] + pinsrb m0, [r3 + 7], 0 + punpckhbw m1, m0, m2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2] + punpcklbw m0, m2 ; [2 1 1 0 0 a a b b c c d d e e f] + + palignr m5, m1, m0, 8 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b] + palignr m2, m1, m0, 10 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a] + palignr m4, m1, m0, 12 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] + + + pmaddubsw m2, [r4 - 5 * 16] ; [12] + pmulhrsw m2, m3 + + pmaddubsw m4, [r4 - 11 * 16] ; [6] + pmulhrsw m4, m3 + packuswb m4, m2 + + pmaddubsw m5, [r4 + 1 * 16] ; [18] + pmulhrsw m5, m3 + + palignr m2, m1, m0, 6 ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c] + pmaddubsw m2, [r4 + 7 * 16] ; [24] + pmulhrsw m2, m3 + packuswb m5, m2 + + palignr m6, m1, m0, 4 ; [4 3 3 2 2 1 1 0 0 a a b b c c d] + mova m2, m6 + pmaddubsw m6, [r4 + 13 * 16] ; [30] + pmulhrsw m6, m3 + + pmaddubsw m2, [r4 - 13 * 16] ; [4] + pmulhrsw m2, m3 + packuswb m6, m2 + + palignr m1, m0, 2 ; [3 2 2 1 1 0 0 a a b b c c d d e] + pmaddubsw m1, [r4 - 7 * 16] ; [10] + pmulhrsw m1, m3 + + pmaddubsw m0, [r4 - 1 * 16] ; [16] + pmulhrsw m0, m3 + packuswb m1, m0 + jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) + +cglobal intra_pred_ang8_18, 4,4,1 + movu m0, [r2] + pshufb m0, [pb_swap8] + movhps m0, [r3 + 1] + lea r2, [r0 + r1 * 4] + lea r3, [r1 * 3] + movh [r2 + r3], m0 + psrldq m0, 1 + movh [r2 + r1 * 2], m0 + psrldq m0, 1 + movh [r2 + r1], m0 + psrldq m0, 1 + movh [r2], m0 + psrldq m0, 1 + movh [r0 + r3], m0 + psrldq m0, 1 + movh [r0 + r1 * 2], m0 + psrldq m0, 1 + movh [r0 + r1], m0 + psrldq m0, 1 + movh [r0], m0 + RET + + +;----------------------------------------------------------------------------- +; void intraPredAng16(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) 
+;----------------------------------------------------------------------------- +INIT_XMM ssse3 +cglobal intra_pred_ang16_2, 3,3,3 + cmp r4m, byte 34 + cmove r2, r3mp + movu m0, [r2 + 2] + movu m1, [r2 + 18] + movu [r0], m0 + palignr m2, m1, m0, 1 + movu [r0 + r1], m2 + lea r0, [r0 + r1 * 2] + palignr m2, m1, m0, 2 + movu [r0], m2 + palignr m2, m1, m0, 3 + movu [r0 + r1], m2 + lea r0, [r0 + r1 * 2] + palignr m2, m1, m0, 4 + movu [r0], m2 + palignr m2, m1, m0, 5 + movu [r0 + r1], m2 + lea r0, [r0 + r1 * 2] + palignr m2, m1, m0, 6 + movu [r0], m2 + palignr m2, m1, m0, 7 + movu [r0 + r1], m2 + lea r0, [r0 + r1 * 2] + palignr m2, m1, m0, 8 + movu [r0], m2 + palignr m2, m1, m0, 9 + movu [r0 + r1], m2 + lea r0, [r0 + r1 * 2] + palignr m2, m1, m0, 10 + movu [r0], m2 + palignr m2, m1, m0, 11 + movu [r0 + r1], m2 + lea r0, [r0 + r1 * 2] + palignr m2, m1, m0, 12 + movu [r0], m2 + palignr m2, m1, m0, 13 + movu [r0 + r1], m2 + lea r0, [r0 + r1 * 2] + palignr m2, m1, m0, 14 + movu [r0], m2 + palignr m2, m1, m0, 15 + movu [r0 + r1], m2 + RET + +%macro TRANSPOSE_STORE_8x8 6 + %if %2 == 1 + ; transpose 8x8 and then store, used by angle BLOCK_16x16 and BLOCK_32x32 + punpckhbw m0, %3, %4 + punpcklbw %3, %4 + punpckhbw %4, %3, m0 + punpcklbw %3, m0 + + punpckhbw m0, %5, m1 + punpcklbw %5, %6 + punpckhbw %6, %5, m0 + punpcklbw %5, m0 + + punpckhdq m0, %3, %5 + punpckldq %3, %5 + punpckldq %5, %4, %6 + punpckhdq %4, %6 + + movh [r0 + + %1 * 8], %3 + movhps [r0 + r1 + %1 * 8], %3 + movh [r0 + r1*2 + %1 * 8], m0 + movhps [r0 + r5 + %1 * 8], m0 + movh [r6 + %1 * 8], %5 + movhps [r6 + r1 + %1 * 8], %5 + movh [r6 + r1*2 + %1 * 8], %4 + movhps [r6 + r5 + %1 * 8], %4 + %else + ; store 8x8, used by angle BLOCK_16x16 and BLOCK_32x32 + movh [r0 ], %3 + movhps [r0 + r1 ], %3 + movh [r0 + r1 * 2], %4 + movhps [r0 + r5 ], %4 + lea r0, [r0 + r1 * 4] + movh [r0 ], %5 + movhps [r0 + r1 ], %5 + movh [r0 + r1 * 2], %6 + movhps [r0 + r5 ], %6 + lea r0, [r0 + r1 * 4] + %endif +%endmacro + +INIT_XMM sse4 
+cglobal intra_pred_ang16_3, 3,7,8 + + lea r3, [ang_table + 16 * 16] + mov r4d, 2 + lea r5, [r1 * 3] ; r5 -> 3 * stride + lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride + mova m7, [pw_1024] + +.loop: + movu m0, [r2 + 1] + palignr m1, m0, 1 + + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + palignr m1, m2, m0, 2 + + pmaddubsw m4, m0, [r3 + 10 * 16] ; [26] + pmulhrsw m4, m7 + pmaddubsw m1, [r3 + 4 * 16] ; [20] + pmulhrsw m1, m7 + packuswb m4, m1 + + palignr m5, m2, m0, 4 + + pmaddubsw m5, [r3 - 2 * 16] ; [14] + pmulhrsw m5, m7 + + palignr m6, m2, m0, 6 + + pmaddubsw m6, [r3 - 8 * 16] ; [ 8] + pmulhrsw m6, m7 + packuswb m5, m6 + + palignr m1, m2, m0, 8 + + pmaddubsw m6, m1, [r3 - 14 * 16] ; [ 2] + pmulhrsw m6, m7 + + pmaddubsw m1, [r3 + 12 * 16] ; [28] + pmulhrsw m1, m7 + packuswb m6, m1 + + palignr m1, m2, m0, 10 + + pmaddubsw m1, [r3 + 6 * 16] ; [22] + pmulhrsw m1, m7 + + palignr m2, m0, 12 + + pmaddubsw m2, [r3] ; [16] + pmulhrsw m2, m7 + packuswb m1, m2 + + TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 + + movu m0, [r2 + 8] + palignr m1, m0, 1 + + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + palignr m5, m2, m0, 2 + + pmaddubsw m4, m0, [r3 - 6 * 16] ; [10] + pmulhrsw m4, m7 + pmaddubsw m1, m5, [r3 - 12 * 16] ; [04] + pmulhrsw m1, m7 + packuswb m4, m1 + + pmaddubsw m5, [r3 + 14 * 16] ; [30] + pmulhrsw m5, m7 + + palignr m6, m2, m0, 4 + + pmaddubsw m6, [r3 + 8 * 16] ; [24] + pmulhrsw m6, m7 + packuswb m5, m6 + + palignr m1, m2, m0, 6 + + pmaddubsw m6, m1, [r3 + 2 * 16] ; [18] + pmulhrsw m6, m7 + + palignr m1, m2, m0, 8 + + pmaddubsw m1, [r3 - 4 * 16] ; [12] + pmulhrsw m1, m7 + packuswb m6, m1 + + palignr m1, m2, m0, 10 + + pmaddubsw m1, [r3 - 10 * 16] ; [06] + pmulhrsw m1, m7 + packuswb m1, m1 + + movhps m1, [r2 + 14] ; [00] + + TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 + + lea r0, [r6 + r1 * 4] + lea r6, [r6 + r1 * 8] + add r2, 8 + dec r4 + jnz .loop + + RET + +INIT_XMM sse4 +cglobal intra_pred_ang16_33, 3,7,8 + mov r2, r3mp + lea r3, [ang_table + 16 * 16] + mov r4d, 2 + lea r5, [r1 * 
3] + mov r6, r0 + mova m7, [pw_1024] + +.loop: + movu m0, [r2 + 1] + palignr m1, m0, 1 + + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + palignr m1, m2, m0, 2 + + pmaddubsw m4, m0, [r3 + 10 * 16] ; [26] + pmulhrsw m4, m7 + pmaddubsw m1, [r3 + 4 * 16] ; [20] + pmulhrsw m1, m7 + packuswb m4, m1 + + palignr m5, m2, m0, 4 + + pmaddubsw m5, [r3 - 2 * 16] ; [14] + pmulhrsw m5, m7 + + palignr m6, m2, m0, 6 + + pmaddubsw m6, [r3 - 8 * 16] ; [ 8] + pmulhrsw m6, m7 + packuswb m5, m6 + + palignr m1, m2, m0, 8 + + pmaddubsw m6, m1, [r3 - 14 * 16] ; [ 2] + pmulhrsw m6, m7 + + pmaddubsw m1, [r3 + 12 * 16] ; [28] + pmulhrsw m1, m7 + packuswb m6, m1 + + palignr m1, m2, m0, 10 + + pmaddubsw m1, [r3 + 6 * 16] ; [22] + pmulhrsw m1, m7 + + palignr m2, m0, 12 + + pmaddubsw m2, [r3] ; [16] + pmulhrsw m2, m7 + packuswb m1, m2 + + TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 + + movu m0, [r2 + 8] + palignr m1, m0, 1 + + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + palignr m5, m2, m0, 2 + + pmaddubsw m4, m0, [r3 - 6 * 16] ; [10] + pmulhrsw m4, m7 + pmaddubsw m1, m5, [r3 - 12 * 16] ; [04] + pmulhrsw m1, m7 + packuswb m4, m1 + + pmaddubsw m5, [r3 + 14 * 16] ; [30] + pmulhrsw m5, m7 + + palignr m6, m2, m0, 4 + + pmaddubsw m6, [r3 + 8 * 16] ; [24] + pmulhrsw m6, m7 + packuswb m5, m6 + + palignr m1, m2, m0, 6 + + pmaddubsw m6, m1, [r3 + 2 * 16] ; [18] + pmulhrsw m6, m7 + + palignr m1, m2, m0, 8 + + pmaddubsw m1, [r3 - 4 * 16] ; [12] + pmulhrsw m1, m7 + packuswb m6, m1 + + palignr m1, m2, m0, 10 + + pmaddubsw m1, [r3 - 10 * 16] ; [06] + pmulhrsw m1, m7 + packuswb m1, m1 + + movh m2, [r2 + 14] ; [00] + + movh [r0 ], m4 + movhps [r0 + r1 ], m4 + movh [r0 + r1 * 2], m5 + movhps [r0 + r5 ], m5 + lea r0, [r0 + r1 * 4] + movh [r0 ], m6 + movhps [r0 + r1 ], m6 + movh [r0 + r1 * 2], m1 + movh [r0 + r5 ], m2 + + lea r0, [r6 + 8] + add r2, 8 + dec r4 + jnz .loop + + RET + +INIT_XMM sse4 +cglobal intra_pred_ang16_4, 3,7,8 + + lea r3, [ang_table + 16 * 16] + mov r4d, 2 + lea r5, [r1 * 3] ; r5 -> 3 * stride + lea r6, 
[r0 + r1 * 4] ; r6 -> 4 * stride + mova m7, [pw_1024] + +.loop: + movu m0, [r2 + 1] + palignr m1, m0, 1 + + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + palignr m1, m2, m0, 2 + mova m5, m1 + + pmaddubsw m4, m0, [r3 + 5 * 16] ; [21] + pmulhrsw m4, m7 + pmaddubsw m1, [r3 - 6 * 16] ; [10] + pmulhrsw m1, m7 + packuswb m4, m1 + + pmaddubsw m5, [r3 + 15 * 16] ; [31] + pmulhrsw m5, m7 + + palignr m6, m2, m0, 4 + + pmaddubsw m6, [r3 + 4 * 16] ; [ 20] + pmulhrsw m6, m7 + packuswb m5, m6 + + palignr m1, m2, m0, 6 + + pmaddubsw m6, m1, [r3 - 7 * 16] ; [ 9] + pmulhrsw m6, m7 + + pmaddubsw m1, [r3 + 14 * 16] ; [30] + pmulhrsw m1, m7 + packuswb m6, m1 + + palignr m1, m2, m0, 8 + + pmaddubsw m1, [r3 + 3 * 16] ; [19] + pmulhrsw m1, m7 + + palignr m2, m0, 10 + + pmaddubsw m3, m2, [r3 - 8 * 16] ; [8] + pmulhrsw m3, m7 + packuswb m1, m3 + + TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 + + pmaddubsw m4, m2, [r3 + 13 * 16] ; [29] + pmulhrsw m4, m7 + + movu m0, [r2 + 6] + palignr m1, m0, 1 + + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + palignr m1, m2, m0, 2 + + pmaddubsw m1, [r3 + 2 * 16] ; [18] + pmulhrsw m1, m7 + packuswb m4, m1 + + palignr m5, m2, m0, 4 + mova m6, m5 + + pmaddubsw m5, [r3 - 9 * 16] ; [07] + pmulhrsw m5, m7 + + pmaddubsw m6, [r3 + 12 * 16] ; [28] + pmulhrsw m6, m7 + packuswb m5, m6 + + palignr m6, m2, m0, 6 + + pmaddubsw m6, [r3 + 16] ; [17] + pmulhrsw m6, m7 + + palignr m1, m2, m0, 8 + palignr m2, m0, 10 + + pmaddubsw m3, m1, [r3 - 10 * 16] ; [06] + pmulhrsw m3, m7 + packuswb m6, m3 + + pmaddubsw m1, [r3 + 11 * 16] ; [27] + pmulhrsw m1, m7 + + pmaddubsw m2, [r3] ; [16] + pmulhrsw m2, m7 + packuswb m1, m2 + + TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 + + lea r0, [r6 + r1 * 4] + lea r6, [r6 + r1 * 8] + add r2, 8 + dec r4 + jnz .loop + + RET + +INIT_XMM sse4 +cglobal intra_pred_ang16_32, 3,7,8 + mov r2, r3mp + lea r3, [ang_table + 16 * 16] + mov r4d, 2 + lea r5, [r1 * 3] ; r5 -> 3 * stride + mov r6, r0 + mova m7, [pw_1024] + +.loop: + movu m0, [r2 + 1] + palignr m1, m0, 1 + + 
punpckhbw m2, m0, m1 + punpcklbw m0, m1 + palignr m1, m2, m0, 2 + mova m5, m1 + + + pmaddubsw m4, m0, [r3 + 5 * 16] ; [21] + pmulhrsw m4, m7 + pmaddubsw m1, [r3 - 6 * 16] ; [10] + pmulhrsw m1, m7 + packuswb m4, m1 + + pmaddubsw m5, [r3 + 15 * 16] ; [31] + pmulhrsw m5, m7 + + palignr m6, m2, m0, 4 + + pmaddubsw m6, [r3 + 4 * 16] ; [ 20] + pmulhrsw m6, m7 + packuswb m5, m6 + + palignr m1, m2, m0, 6 + + pmaddubsw m6, m1, [r3 - 7 * 16] ; [ 9] + pmulhrsw m6, m7 + + pmaddubsw m1, [r3 + 14 * 16] ; [30] + pmulhrsw m1, m7 + packuswb m6, m1 + + palignr m1, m2, m0, 8 + + pmaddubsw m1, [r3 + 3 * 16] ; [19] + pmulhrsw m1, m7 + + palignr m2, m0, 10 + + pmaddubsw m3, m2, [r3 - 8 * 16] ; [8] + pmulhrsw m3, m7 + packuswb m1, m3 + + TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 + + pmaddubsw m4, m2, [r3 + 13 * 16] ; [29] + pmulhrsw m4, m7 + + movu m0, [r2 + 6] + palignr m1, m0, 1 + + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + palignr m1, m2, m0, 2 + + pmaddubsw m1, [r3 + 2 * 16] ; [18] + pmulhrsw m1, m7 + packuswb m4, m1 + + palignr m5, m2, m0, 4 + mova m6, m5 + + pmaddubsw m5, [r3 - 9 * 16] ; [07] + pmulhrsw m5, m7 + + pmaddubsw m6, [r3 + 12 * 16] ; [28] + pmulhrsw m6, m7 + packuswb m5, m6 + + palignr m6, m2, m0, 6 + + pmaddubsw m6, [r3 + 16] ; [17] + pmulhrsw m6, m7 + + palignr m1, m2, m0, 8 + palignr m2, m0, 10 + + pmaddubsw m3, m1, [r3 - 10 * 16] ; [06] + pmulhrsw m3, m7 + packuswb m6, m3 + + pmaddubsw m1, [r3 + 11 * 16] ; [27] + pmulhrsw m1, m7 + + pmaddubsw m2, [r3] ; [16] + pmulhrsw m2, m7 + packuswb m1, m2 + + TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 + + lea r0, [r6 + 8] + add r2, 8 + dec r4 + jnz .loop + + RET + +INIT_XMM sse4 +cglobal intra_pred_ang16_5, 3,7,8 + + lea r3, [ang_table + 16 * 16] + mov r4d, 2 + lea r5, [r1 * 3] ; r5 -> 3 * stride + lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride + mova m7, [pw_1024] + +.loop: + movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + movu m1, [r2 + 2] ;[17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] + punpckhbw m2, m3, m1 ;[17 16 16 15 15 
14 14 13 13 12 12 11 11 10 10 9] + punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] + + palignr m5, m2, m3, 2 + + pmaddubsw m4, m3, [r3 + 16] ; [17] + pmulhrsw m4, m7 + pmaddubsw m1, m5, [r3 - 14 * 16] ; [2] + pmulhrsw m1, m7 + packuswb m4, m1 + + palignr m6, m2, m3, 4 + + pmaddubsw m5, [r3 + 3 * 16] ; [19] + pmulhrsw m5, m7 + pmaddubsw m1, m6, [r3 - 12 * 16] ; [4] + pmulhrsw m1, m7 + packuswb m5, m1 + + palignr m1, m2, m3, 6 + + pmaddubsw m6, [r3 + 5 * 16] ; [21] + pmulhrsw m6, m7 + pmaddubsw m0, m1, [r3 - 10 * 16] ; [6] + pmulhrsw m0, m7 + packuswb m6, m0 + + palignr m0, m2, m3, 8 + + pmaddubsw m1, [r3 + 7 * 16] ; [23] + pmulhrsw m1, m7 + pmaddubsw m0, [r3 - 8 * 16] ; [8] + pmulhrsw m0, m7 + packuswb m1, m0 + + TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 + + palignr m4, m2, m3, 8 + palignr m5, m2, m3, 10 + + pmaddubsw m4, [r3 + 9 * 16] ; [25] + pmulhrsw m4, m7 + pmaddubsw m1, m5, [r3 - 6 * 16] ; [10] + pmulhrsw m1, m7 + packuswb m4, m1 + + palignr m6, m2, m3, 12 + + pmaddubsw m5, [r3 + 11 * 16] ; [27] + pmulhrsw m5, m7 + pmaddubsw m1, m6, [r3 - 4 * 16] ; [12] + pmulhrsw m1, m7 + packuswb m5, m1 + + palignr m1, m2, m3, 14 + + pmaddubsw m6, [r3 + 13 * 16] ; [29] + pmulhrsw m6, m7 + pmaddubsw m0, m1, [r3 - 2 * 16] ; [14] + pmulhrsw m0, m7 + packuswb m6, m0 + + pmaddubsw m1, [r3 + 15 * 16] ; [31] + pmulhrsw m1, m7 + pmaddubsw m2, [r3] ; [16] + pmulhrsw m2, m7 + packuswb m1, m2 + + TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 + + lea r0, [r6 + r1 * 4] + lea r6, [r6 + r1 * 8] + add r2, 8 + dec r4 + jnz .loop + + RET + +INIT_XMM sse4 +cglobal intra_pred_ang16_31, 3,7,8 + mov r2, r3mp + lea r3, [ang_table + 16 * 16] + mov r4d, 2 + lea r5, [r1 * 3] ; r5 -> 3 * stride + mov r6, r0 + mova m7, [pw_1024] + +.loop: + movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + movu m1, [r2 + 2] ;[17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] + punpckhbw m2, m3, m1 ;[17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] + punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] + + palignr m5, 
m2, m3, 2 + + pmaddubsw m4, m3, [r3 + 16] ; [17] + pmulhrsw m4, m7 + pmaddubsw m1, m5, [r3 - 14 * 16] ; [2] + pmulhrsw m1, m7 + packuswb m4, m1 + + palignr m6, m2, m3, 4 + + pmaddubsw m5, [r3 + 3 * 16] ; [19] + pmulhrsw m5, m7 + pmaddubsw m1, m6, [r3 - 12 * 16] ; [4] + pmulhrsw m1, m7 + packuswb m5, m1 + + palignr m1, m2, m3, 6 + + pmaddubsw m6, [r3 + 5 * 16] ; [21] + pmulhrsw m6, m7 + pmaddubsw m0, m1, [r3 - 10 * 16] ; [6] + pmulhrsw m0, m7 + packuswb m6, m0 + + palignr m0, m2, m3, 8 + + pmaddubsw m1, [r3 + 7 * 16] ; [23] + pmulhrsw m1, m7 + pmaddubsw m0, [r3 - 8 * 16] ; [8] + pmulhrsw m0, m7 + packuswb m1, m0 + + TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 + + palignr m4, m2, m3, 8 + palignr m5, m2, m3, 10 + + pmaddubsw m4, [r3 + 9 * 16] ; [25] + pmulhrsw m4, m7 + pmaddubsw m1, m5, [r3 - 6 * 16] ; [10] + pmulhrsw m1, m7 + packuswb m4, m1 + + palignr m6, m2, m3, 12 + + pmaddubsw m5, [r3 + 11 * 16] ; [27] + pmulhrsw m5, m7 + pmaddubsw m1, m6, [r3 - 4 * 16] ; [12] + pmulhrsw m1, m7 + packuswb m5, m1 + + palignr m1, m2, m3, 14 + + pmaddubsw m6, [r3 + 13 * 16] ; [29] + pmulhrsw m6, m7 + pmaddubsw m0, m1, [r3 - 2 * 16] ; [14] + pmulhrsw m0, m7 + packuswb m6, m0 + + pmaddubsw m1, [r3 + 15 * 16] ; [31] + pmulhrsw m1, m7 + pmaddubsw m2, [r3] ; [16] + pmulhrsw m2, m7 + packuswb m1, m2 + + TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 + + lea r0, [r6 + 8] + add r2, 8 + dec r4 + jnz .loop + + RET + +INIT_XMM sse4 +cglobal intra_pred_ang16_6, 3,7,8 + + lea r3, [ang_table + 16 * 16] + mov r4d, 2 + lea r5, [r1 * 3] ; r5 -> 3 * stride + lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride + mova m7, [pw_1024] + +.loop: + movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + palignr m1, m3, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] + punpckhbw m2, m3, m1 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] + punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] + + pmaddubsw m4, m3, [r3 - 3 * 16] ; [13] + pmulhrsw m4, m7 + pmaddubsw m1, m3, [r3 + 10 * 16] ; [26] + pmulhrsw m1, m7 + packuswb m4, 
m1 + + palignr m6, m2, m3, 2 + + pmaddubsw m5, m6, [r3 - 9 * 16] ; [7] + pmulhrsw m5, m7 + pmaddubsw m6, [r3 + 4 * 16] ; [20] + pmulhrsw m6, m7 + packuswb m5, m6 + + palignr m1, m2, m3, 4 + + pmaddubsw m6, m1, [r3 - 15 * 16] ; [1] + pmulhrsw m6, m7 + pmaddubsw m0, m1, [r3 - 2 * 16] ; [14] + pmulhrsw m0, m7 + packuswb m6, m0 + + palignr m0, m2, m3, 6 + + pmaddubsw m1, [r3 + 11 * 16] ; [27] + pmulhrsw m1, m7 + pmaddubsw m0, [r3 - 8 * 16] ; [8] + pmulhrsw m0, m7 + packuswb m1, m0 + + TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 + + palignr m4, m2, m3, 6 + palignr m6, m2, m3, 8 + + pmaddubsw m4, [r3 + 5 * 16] ; [21] + pmulhrsw m4, m7 + pmaddubsw m1, m6, [r3 - 14 * 16] ; [2] + pmulhrsw m1, m7 + packuswb m4, m1 + + pmaddubsw m5, m6, [r3 - 16] ; [15] + pmulhrsw m5, m7 + pmaddubsw m6, [r3 + 12 * 16] ; [28] + pmulhrsw m6, m7 + packuswb m5, m6 + + palignr m0, m2, m3, 10 + + pmaddubsw m6, m0, [r3 - 7 * 16] ; [9] + pmulhrsw m6, m7 + pmaddubsw m0, [r3 + 6 * 16] ; [22] + pmulhrsw m0, m7 + packuswb m6, m0 + + palignr m2, m3, 12 + + pmaddubsw m1, m2, [r3 - 13 * 16] ; [3] + pmulhrsw m1, m7 + pmaddubsw m2, [r3] ; [16] + pmulhrsw m2, m7 + packuswb m1, m2 + + TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 + + lea r0, [r6 + r1 * 4] + lea r6, [r6 + r1 * 8] + add r2, 8 + dec r4 + jnz .loop + + RET + +INIT_XMM sse4 +cglobal intra_pred_ang16_30, 3,7,8 + mov r2, r3mp + lea r3, [ang_table + 16 * 16] + mov r4d, 2 + lea r5, [r1 * 3] ; r5 -> 3 * stride + mov r6, r0 + mova m7, [pw_1024] + +.loop: + movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + palignr m1, m3, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] + punpckhbw m2, m3, m1 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] + punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] + + pmaddubsw m4, m3, [r3 - 3 * 16] ; [13] + pmulhrsw m4, m7 + pmaddubsw m1, m3, [r3 + 10 * 16] ; [26] + pmulhrsw m1, m7 + packuswb m4, m1 + + palignr m6, m2, m3, 2 + + pmaddubsw m5, m6, [r3 - 9 * 16] ; [7] + pmulhrsw m5, m7 + pmaddubsw m6, [r3 + 4 * 16] ; [20] + 
pmulhrsw m6, m7 + packuswb m5, m6 + + palignr m1, m2, m3, 4 + + pmaddubsw m6, m1, [r3 - 15 * 16] ; [1] + pmulhrsw m6, m7 + pmaddubsw m0, m1, [r3 - 2 * 16] ; [14] + pmulhrsw m0, m7 + packuswb m6, m0 + + palignr m0, m2, m3, 6 + + pmaddubsw m1, [r3 + 11 * 16] ; [27] + pmulhrsw m1, m7 + pmaddubsw m0, [r3 - 8 * 16] ; [8] + pmulhrsw m0, m7 + packuswb m1, m0 + + TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 + + palignr m4, m2, m3, 6 + palignr m6, m2, m3, 8 + + pmaddubsw m4, [r3 + 5 * 16] ; [21] + pmulhrsw m4, m7 + pmaddubsw m1, m6, [r3 - 14 * 16] ; [2] + pmulhrsw m1, m7 + packuswb m4, m1 + + pmaddubsw m5, m6, [r3 - 16] ; [15] + pmulhrsw m5, m7 + pmaddubsw m6, [r3 + 12 * 16] ; [28] + pmulhrsw m6, m7 + packuswb m5, m6 + + palignr m0, m2, m3, 10 + + pmaddubsw m6, m0, [r3 - 7 * 16] ; [9] + pmulhrsw m6, m7 + pmaddubsw m0, [r3 + 6 * 16] ; [22] + pmulhrsw m0, m7 + packuswb m6, m0 + + palignr m2, m3, 12 + + pmaddubsw m1, m2, [r3 - 13 * 16] ; [3] + pmulhrsw m1, m7 + pmaddubsw m2, [r3] ; [16] + pmulhrsw m2, m7 + packuswb m1, m2 + + TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 + + lea r0, [r6 + 8] + add r2, 8 + dec r4 + jnz .loop + + RET + +INIT_XMM sse4 +cglobal intra_pred_ang16_7, 3,7,8 + + lea r3, [ang_table + 16 * 16] + mov r4d, 2 + lea r5, [r1 * 3] ; r5 -> 3 * stride + lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride + mova m7, [pw_1024] + +.loop: + movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + palignr m1, m3, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] + punpckhbw m2, m3, m1 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] + punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] + + pmaddubsw m4, m3, [r3 - 7 * 16] ; [9] + pmulhrsw m4, m7 + pmaddubsw m0, m3, [r3 + 2 * 16] ; [18] + pmulhrsw m0, m7 + packuswb m4, m0 + + palignr m1, m2, m3, 2 + + pmaddubsw m5, m3, [r3 + 11 * 16] ; [27] + pmulhrsw m5, m7 + pmaddubsw m6, m1, [r3 - 12 * 16] ; [4] + pmulhrsw m6, m7 + packuswb m5, m6 + + pmaddubsw m6, m1, [r3 - 3 * 16] ; [13] + pmulhrsw m6, m7 + pmaddubsw m0, m1, [r3 + 6 * 16] ; [22] + 
pmulhrsw m0, m7 + packuswb m6, m0 + + palignr m0, m2, m3, 4 + + pmaddubsw m1, [r3 + 15 * 16] ; [31] + pmulhrsw m1, m7 + pmaddubsw m0, [r3 - 8 * 16] ; [8] + pmulhrsw m0, m7 + packuswb m1, m0 + + TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 + + palignr m1, m2, m3, 4 + + pmaddubsw m4, m1, [r3 + 16] ; [17] + pmulhrsw m4, m7 + pmaddubsw m1, [r3 + 10 * 16] ; [26] + pmulhrsw m1, m7 + packuswb m4, m1 + + palignr m0, m2, m3, 6 + + pmaddubsw m5, m0, [r3 - 13 * 16] ; [03] + pmulhrsw m5, m7 + pmaddubsw m6, m0, [r3 - 4 * 16] ; [12] + pmulhrsw m6, m7 + packuswb m5, m6 + + pmaddubsw m6, m0, [r3 + 5 * 16] ; [21] + pmulhrsw m6, m7 + pmaddubsw m0, [r3 + 14 * 16] ; [30] + pmulhrsw m0, m7 + packuswb m6, m0 + + palignr m2, m3, 8 + + pmaddubsw m1, m2, [r3 - 9 * 16] ; [07] + pmulhrsw m1, m7 + pmaddubsw m2, [r3] ; [16] + pmulhrsw m2, m7 + packuswb m1, m2 + + TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 + + lea r0, [r6 + r1 * 4] + lea r6, [r6 + r1 * 8] + add r2, 8 + dec r4 + jnz .loop + + RET + +INIT_XMM sse4 +cglobal intra_pred_ang16_29, 3,7,8 + mov r2, r3mp + lea r3, [ang_table + 16 * 16] + mov r4d, 2 + lea r5, [r1 * 3] ; r5 -> 3 * stride + mov r6, r0 + mova m7, [pw_1024] + +.loop: + movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + palignr m1, m3, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] + punpckhbw m2, m3, m1 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] + punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] + + pmaddubsw m4, m3, [r3 - 7 * 16] ; [9] + pmulhrsw m4, m7 + pmaddubsw m0, m3, [r3 + 2 * 16] ; [18] + pmulhrsw m0, m7 + packuswb m4, m0 + + palignr m1, m2, m3, 2 + + pmaddubsw m5, m3, [r3 + 11 * 16] ; [27] + pmulhrsw m5, m7 + pmaddubsw m6, m1, [r3 - 12 * 16] ; [4] + pmulhrsw m6, m7 + packuswb m5, m6 + + pmaddubsw m6, m1, [r3 - 3 * 16] ; [13] + pmulhrsw m6, m7 + pmaddubsw m0, m1, [r3 + 6 * 16] ; [22] + pmulhrsw m0, m7 + packuswb m6, m0 + + palignr m0, m2, m3, 4 + + pmaddubsw m1, [r3 + 15 * 16] ; [31] + pmulhrsw m1, m7 + pmaddubsw m0, [r3 - 8 * 16] ; [8] + pmulhrsw m0, m7 + 
packuswb m1, m0 + + TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 + + palignr m1, m2, m3, 4 + + pmaddubsw m4, m1, [r3 + 16] ; [17] + pmulhrsw m4, m7 + pmaddubsw m1, [r3 + 10 * 16] ; [26] + pmulhrsw m1, m7 + packuswb m4, m1 + + palignr m0, m2, m3, 6 + + pmaddubsw m5, m0, [r3 - 13 * 16] ; [03] + pmulhrsw m5, m7 + pmaddubsw m6, m0, [r3 - 4 * 16] ; [12] + pmulhrsw m6, m7 + packuswb m5, m6 + + pmaddubsw m6, m0, [r3 + 5 * 16] ; [21] + pmulhrsw m6, m7 + pmaddubsw m0, [r3 + 14 * 16] ; [30] + pmulhrsw m0, m7 + packuswb m6, m0 + + palignr m2, m3, 8 + + pmaddubsw m1, m2, [r3 - 9 * 16] ; [07] + pmulhrsw m1, m7 + pmaddubsw m2, [r3] ; [16] + pmulhrsw m2, m7 + packuswb m1, m2 + + TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 + + lea r0, [r6 + 8] + add r2, 8 + dec r4 + jnz .loop + + RET + +INIT_XMM sse4 +cglobal intra_pred_ang16_8, 3,7,8 + + lea r3, [ang_table + 16 * 16] + mov r4d, 2 + lea r5, [r1 * 3] ; r5 -> 3 * stride + lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride + mova m7, [pw_1024] + +.loop: + movu m1, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + palignr m3, m1, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] + punpckhbw m0, m1, m3 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] + punpcklbw m1, m3 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] + + pmaddubsw m4, m1, [r3 - 11 * 16] ; [5] + pmulhrsw m4, m7 + pmaddubsw m2, m1, [r3 - 6 * 16] ; [10] + pmulhrsw m2, m7 + packuswb m4, m2 + + pmaddubsw m5, m1, [r3 - 1 * 16] ; [15] + pmulhrsw m5, m7 + pmaddubsw m6, m1, [r3 + 4 * 16] ; [20] + pmulhrsw m6, m7 + packuswb m5, m6 + + pmaddubsw m6, m1, [r3 + 9 * 16] ; [25] + pmulhrsw m6, m7 + pmaddubsw m2, m1, [r3 + 14 * 16] ; [30] + pmulhrsw m2, m7 + packuswb m6, m2 + + palignr m2, m0, m1, 2 + palignr m3, m0, m1, 4 + + pmaddubsw m1, m2, [r3 - 13 * 16] ; [3] + pmulhrsw m1, m7 + pmaddubsw m0, m2, [r3 - 8 * 16] ; [8] + pmulhrsw m0, m7 + packuswb m1, m0 + + TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 + + pmaddubsw m4, m2, [r3 - 3 * 16] ; [13] + pmulhrsw m4, m7 + pmaddubsw m5, m2, [r3 + 2 * 16] ; [18] + pmulhrsw m5, m7 
+ packuswb m4, m5 + + pmaddubsw m5, m2, [r3 + 7 * 16] ; [23] + pmulhrsw m5, m7 + pmaddubsw m2, [r3 + 12 * 16] ; [28] + pmulhrsw m2, m7 + packuswb m5, m2 + + pmaddubsw m6, m3, [r3 - 15 * 16] ; [01] + pmulhrsw m6, m7 + pmaddubsw m1, m3, [r3 - 10 * 16] ; [06] + pmulhrsw m1, m7 + packuswb m6, m1 + + pmaddubsw m1, m3, [r3 - 5 * 16] ; [11] + pmulhrsw m1, m7 + pmaddubsw m3, [r3] ; [16] + pmulhrsw m3, m7 + packuswb m1, m3 + + TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 + + lea r0, [r6 + r1 * 4] + lea r6, [r6 + r1 * 8] + add r2, 8 + dec r4 + jnz .loop + + RET + +INIT_XMM sse4 +cglobal intra_pred_ang16_28, 3,7,8 + mov r2, r3mp + lea r3, [ang_table + 16 * 16] + mov r4d, 2 + lea r5, [r1 * 3] ; r5 -> 3 * stride + mov r6, r0 + mova m7, [pw_1024] + +.loop: + movu m1, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + palignr m3, m1, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] + punpckhbw m0, m1, m3 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] + punpcklbw m1, m3 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] + + pmaddubsw m4, m1, [r3 - 11 * 16] ; [5] + pmulhrsw m4, m7 + pmaddubsw m2, m1, [r3 - 6 * 16] ; [10] + pmulhrsw m2, m7 + packuswb m4, m2 + + pmaddubsw m5, m1, [r3 - 1 * 16] ; [15] + pmulhrsw m5, m7 + pmaddubsw m6, m1, [r3 + 4 * 16] ; [20] + pmulhrsw m6, m7 + packuswb m5, m6 + + pmaddubsw m6, m1, [r3 + 9 * 16] ; [25] + pmulhrsw m6, m7 + pmaddubsw m2, m1, [r3 + 14 * 16] ; [30] + pmulhrsw m2, m7 + packuswb m6, m2 + + palignr m2, m0, m1, 2 + palignr m3, m0, m1, 4 + + pmaddubsw m1, m2, [r3 - 13 * 16] ; [3] + pmulhrsw m1, m7 + pmaddubsw m0, m2, [r3 - 8 * 16] ; [8] + pmulhrsw m0, m7 + packuswb m1, m0 + + TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 + + pmaddubsw m4, m2, [r3 - 3 * 16] ; [13] + pmulhrsw m4, m7 + pmaddubsw m5, m2, [r3 + 2 * 16] ; [18] + pmulhrsw m5, m7 + packuswb m4, m5 + + pmaddubsw m5, m2, [r3 + 7 * 16] ; [23] + pmulhrsw m5, m7 + pmaddubsw m2, [r3 + 12 * 16] ; [28] + pmulhrsw m2, m7 + packuswb m5, m2 + + pmaddubsw m6, m3, [r3 - 15 * 16] ; [01] + pmulhrsw m6, m7 + pmaddubsw m1, 
m3, [r3 - 10 * 16] ; [06] + pmulhrsw m1, m7 + packuswb m6, m1 + + pmaddubsw m1, m3, [r3 - 5 * 16] ; [11] + pmulhrsw m1, m7 + pmaddubsw m3, [r3] ; [16] + pmulhrsw m3, m7 + packuswb m1, m3 + + TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 + + lea r0, [r6 + 8] + add r2, 8 + dec r4 + jnz .loop + + RET + +INIT_XMM sse4 +cglobal intra_pred_ang16_9, 3,7,8 + + lea r3, [ang_table + 16 * 16] + mov r4d, 2 + lea r5, [r1 * 3] ; r5 -> 3 * stride + lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride + mova m7, [pw_1024] + +.loop: + movu m2, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + palignr m3, m2, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] + punpcklbw m2, m3 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] + + pmaddubsw m4, m2, [r3 - 14 * 16] ; [2] + pmulhrsw m4, m7 + pmaddubsw m0, m2, [r3 - 12 * 16] ; [4] + pmulhrsw m0, m7 + packuswb m4, m0 + + pmaddubsw m5, m2, [r3 - 10 * 16] ; [6] + pmulhrsw m5, m7 + pmaddubsw m6, m2, [r3 - 8 * 16] ; [8] + pmulhrsw m6, m7 + packuswb m5, m6 + + pmaddubsw m6, m2, [r3 - 6 * 16] ; [10] + pmulhrsw m6, m7 + pmaddubsw m0, m2, [r3 - 4 * 16] ; [12] + pmulhrsw m0, m7 + packuswb m6, m0 + + pmaddubsw m1, m2, [r3 - 2 * 16] ; [14] + pmulhrsw m1, m7 + pmaddubsw m0, m2, [r3] ; [16] + pmulhrsw m0, m7 + packuswb m1, m0 + + TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 + + pmaddubsw m4, m2, [r3 + 2 * 16] ; [18] + pmulhrsw m4, m7 + pmaddubsw m5, m2, [r3 + 4 * 16] ; [20] + pmulhrsw m5, m7 + packuswb m4, m5 + + pmaddubsw m5, m2, [r3 + 6 * 16] ; [22] + pmulhrsw m5, m7 + pmaddubsw m6, m2, [r3 + 8 * 16] ; [24] + pmulhrsw m6, m7 + packuswb m5, m6 + + pmaddubsw m6, m2, [r3 + 10 * 16] ; [26] + pmulhrsw m6, m7 + pmaddubsw m1, m2, [r3 + 12 * 16] ; [28] + pmulhrsw m1, m7 + packuswb m6, m1 + + pmaddubsw m1, m2, [r3 + 14 * 16] ; [30] + pmulhrsw m1, m7 + packuswb m1, m1 + + punpcklqdq m1, m3 ; [00] + + TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 + + lea r0, [r6 + r1 * 4] + lea r6, [r6 + r1 * 8] + add r2, 8 + dec r4 + jnz .loop + + RET + +INIT_XMM sse4 +cglobal intra_pred_ang16_27, 3,7,8 + mov r2, r3mp 
+ lea r3, [ang_table + 16 * 16] + mov r4d, 2 + lea r5, [r1 * 3] ; r5 -> 3 * stride + mov r6, r0 + mova m7, [pw_1024] + +.loop: + movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + palignr m2, m3, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] + punpcklbw m3, m2 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] + + pmaddubsw m4, m3, [r3 - 14 * 16] ; [2] + pmulhrsw m4, m7 + pmaddubsw m0, m3, [r3 - 12 * 16] ; [4] + pmulhrsw m0, m7 + packuswb m4, m0 + + pmaddubsw m5, m3, [r3 - 10 * 16] ; [6] + pmulhrsw m5, m7 + pmaddubsw m6, m3, [r3 - 8 * 16] ; [8] + pmulhrsw m6, m7 + packuswb m5, m6 + + pmaddubsw m6, m3, [r3 - 6 * 16] ; [10] + pmulhrsw m6, m7 + pmaddubsw m0, m3, [r3 - 4 * 16] ; [12] + pmulhrsw m0, m7 + packuswb m6, m0 + + pmaddubsw m1, m3, [r3 - 2 * 16] ; [14] + pmulhrsw m1, m7 + pmaddubsw m0, m3, [r3] ; [16] + pmulhrsw m0, m7 + packuswb m1, m0 + + TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 + + pmaddubsw m4, m3, [r3 + 2 * 16] ; [18] + pmulhrsw m4, m7 + pmaddubsw m5, m3, [r3 + 4 * 16] ; [20] + pmulhrsw m5, m7 + packuswb m4, m5 + + pmaddubsw m5, m3, [r3 + 6 * 16] ; [22] + pmulhrsw m5, m7 + pmaddubsw m6, m3, [r3 + 8 * 16] ; [24] + pmulhrsw m6, m7 + packuswb m5, m6 + + pmaddubsw m6, m3, [r3 + 10 * 16] ; [26] + pmulhrsw m6, m7 + pmaddubsw m1, m3, [r3 + 12 * 16] ; [28] + pmulhrsw m1, m7 + packuswb m6, m1 + + pmaddubsw m1, m3, [r3 + 14 * 16] ; [30] + pmulhrsw m1, m7 + packuswb m1, m1 + + movh [r0 ], m4 + movhps [r0 + r1 ], m4 + movh [r0 + r1 * 2], m5 + movhps [r0 + r5 ], m5 + lea r0, [r0 + r1 * 4] + movh [r0 ], m6 + movhps [r0 + r1 ], m6 + movh [r0 + r1 * 2], m1 + movh [r0 + r5 ], m2 + + lea r0, [r6 + 8] + add r2, 8 + dec r4 + jnz .loop + + RET + +INIT_XMM sse4 +cglobal intra_pred_ang16_10, 6,6,8 + lea r4, [r1 * 3] + pxor m7, m7 + + movu m0, [r2 + 1] + palignr m1, m0, 1 + pshufb m1, m7 + palignr m2, m0, 2 + pshufb m2, m7 + palignr m3, m0, 3 + pshufb m3, m7 + palignr m4, m0, 4 + pshufb m4, m7 + palignr m5, m0, 5 + pshufb m5, m7 + palignr m6, m0, 6 + pshufb m6, m7 + + movu [r0 + r1], 
m1 + movu [r0 + r1 * 2], m2 + movu [r0 + r4], m3 + lea r2, [r0 + r1 * 4] + movu [r2], m4 + movu [r2 + r1], m5 + movu [r2 + r1 * 2], m6 + + palignr m1, m0, 7 + pshufb m1, m7 + movhlps m2, m0 + pshufb m2, m7 + palignr m3, m0, 9 + pshufb m3, m7 + palignr m4, m0, 10 + pshufb m4, m7 + palignr m5, m0, 11 + pshufb m5, m7 + palignr m6, m0, 12 + pshufb m6, m7 + + movu [r2 + r4], m1 + lea r2, [r2 + r1 * 4] + movu [r2], m2 + movu [r2 + r1], m3 + movu [r2 + r1 * 2], m4 + movu [r2 + r4], m5 + lea r2, [r2 + r1 * 4] + movu [r2], m6 + + palignr m1, m0, 13 + pshufb m1, m7 + palignr m2, m0, 14 + pshufb m2, m7 + palignr m3, m0, 15 + pshufb m3, m7 + pshufb m0, m7 + + movu [r2 + r1], m1 + movu [r2 + r1 * 2], m2 + movu [r2 + r4], m3 + +; filter + cmp r5w, byte 0 + jz .quit + pmovzxbw m0, m0 + mova m1, m0 + movu m2, [r3] + movu m3, [r3 + 1] + + pshufb m2, m7 + pmovzxbw m2, m2 + movhlps m4, m3 + pmovzxbw m3, m3 + pmovzxbw m4, m4 + psubw m3, m2 + psubw m4, m2 + psraw m3, 1 + psraw m4, 1 + paddw m0, m3 + paddw m1, m4 + packuswb m0, m1 + +.quit: + movu [r0], m0 + + RET + +INIT_XMM sse4 +%if ARCH_X86_64 == 1 +cglobal intra_pred_ang16_26, 4,8,5 + mov r7, r5mp + %define bfilter r7w +%else + cglobal intra_pred_ang16_26, 6,7,5,0 - 4 + %define bfilter dword[rsp] + mov bfilter, r5 +%endif + movu m0, [r3 + 1] + + lea r4, [r1 * 3] + lea r3, [r0 + r1 * 4] + lea r5, [r3 + r1 * 4] + lea r6, [r5 + r1 * 4] + + movu [r0], m0 + movu [r0 + r1], m0 + movu [r0 + r1 * 2], m0 + movu [r0 + r4], m0 + movu [r3], m0 + movu [r3 + r1], m0 + movu [r3 + r1 * 2], m0 + movu [r3 + r4], m0 + movu [r5], m0 + movu [r5 + r1], m0 + movu [r5 + r1 * 2], m0 + movu [r5 + r4], m0 + + movu [r6], m0 + movu [r6 + r1], m0 + movu [r6 + r1 * 2], m0 + movu [r6 + r4], m0 + +; filter + cmp bfilter, byte 0 + jz .quit + + pxor m4, m4 + pshufb m0, m4 + pmovzxbw m0, m0 + mova m1, m0 + movu m2, [r2] + movu m3, [r2 + 1] + + pshufb m2, m4 + pmovzxbw m2, m2 + movhlps m4, m3 + pmovzxbw m3, m3 + pmovzxbw m4, m4 + psubw m3, m2 + psubw m4, m2 + psraw 
m3, 1 + psraw m4, 1 + paddw m0, m3 + paddw m1, m4 + packuswb m0, m1 + + pextrb [r0], m0, 0 + pextrb [r0 + r1], m0, 1 + pextrb [r0 + r1 * 2], m0, 2 + pextrb [r0 + r4], m0, 3 + pextrb [r3], m0, 4 + pextrb [r3 + r1], m0, 5 + pextrb [r3 + r1 * 2], m0, 6 + pextrb [r3 + r4], m0, 7 + pextrb [r5], m0, 8 + pextrb [r5 + r1], m0, 9 + pextrb [r5 + r1 * 2], m0, 10 + pextrb [r5 + r4], m0, 11 + pextrb [r6], m0, 12 + pextrb [r6 + r1], m0, 13 + pextrb [r6 + r1 * 2], m0, 14 + pextrb [r6 + r4], m0, 15 + +.quit: + RET + +INIT_XMM sse4 +cglobal intra_pred_ang16_11, 3,7,8 + + lea r3, [ang_table + 16 * 16] + mov r4d, 2 + lea r5, [r1 * 3] ; r5 -> 3 * stride + lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride + mova m7, [pw_1024] + +.loop: + movu m3, [r2] ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] + mova m2, m3 + palignr m1, m3, 1 ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + punpcklbw m3, m1 ;[8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] + + pmaddubsw m4, m3, [r3 + 14 * 16] ; [30] + pmulhrsw m4, m7 + pmaddubsw m0, m3, [r3 + 12 * 16] ; [28] + pmulhrsw m0, m7 + packuswb m4, m0 + + pmaddubsw m5, m3, [r3 + 10 * 16] ; [26] + pmulhrsw m5, m7 + pmaddubsw m6, m3, [r3 + 8 * 16] ; [24] + pmulhrsw m6, m7 + packuswb m5, m6 + + pmaddubsw m6, m3, [r3 + 6 * 16] ; [22] + pmulhrsw m6, m7 + pmaddubsw m0, m3, [r3 + 4 * 16] ; [20] + pmulhrsw m0, m7 + packuswb m6, m0 + + pmaddubsw m1, m3, [r3 + 2 * 16] ; [18] + pmulhrsw m1, m7 + pmaddubsw m0, m3, [r3] ; [16] + pmulhrsw m0, m7 + packuswb m1, m0 + + TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 + + pmaddubsw m4, m3, [r3 - 2 * 16] ; [14] + pmulhrsw m4, m7 + pmaddubsw m5, m3, [r3 - 4 * 16] ; [12] + pmulhrsw m5, m7 + packuswb m4, m5 + + pmaddubsw m5, m3, [r3 - 6 * 16] ; [10] + pmulhrsw m5, m7 + pmaddubsw m6, m3, [r3 - 8 * 16] ; [08] + pmulhrsw m6, m7 + packuswb m5, m6 + + pmaddubsw m6, m3, [r3 - 10 * 16] ; [06] + pmulhrsw m6, m7 + pmaddubsw m1, m3, [r3 - 12 * 16] ; [04] + pmulhrsw m1, m7 + packuswb m6, m1 + + pmaddubsw m1, m3, [r3 - 14 * 16] ; [02] + pmulhrsw m1, m7 + packuswb m1, m1 + punpcklqdq 
m1, m2 ;[00] + + TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 + + lea r0, [r6 + r1 * 4] + lea r6, [r6 + r1 * 8] + add r2, 8 + dec r4 + jnz .loop + + RET + +INIT_XMM sse4 +cglobal intra_pred_ang16_25, 3,7,8 + mov r2, r3mp + lea r3, [ang_table + 16 * 16] + mov r4d, 2 + lea r5, [r1 * 3] ; r5 -> 3 * stride + mov r6, r0 + mova m7, [pw_1024] + +.loop: + movu m3, [r2] ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] + mova m2, m3 + palignr m1, m3, 1 ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + punpcklbw m3, m1 ;[8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] + + pmaddubsw m4, m3, [r3 + 14 * 16] ; [30] + pmulhrsw m4, m7 + pmaddubsw m0, m3, [r3 + 12 * 16] ; [28] + pmulhrsw m0, m7 + packuswb m4, m0 + + pmaddubsw m5, m3, [r3 + 10 * 16] ; [26] + pmulhrsw m5, m7 + pmaddubsw m6, m3, [r3 + 8 * 16] ; [24] + pmulhrsw m6, m7 + packuswb m5, m6 + + pmaddubsw m6, m3, [r3 + 6 * 16] ; [22] + pmulhrsw m6, m7 + pmaddubsw m0, m3, [r3 + 4 * 16] ; [20] + pmulhrsw m0, m7 + packuswb m6, m0 + + pmaddubsw m1, m3, [r3 + 2 * 16] ; [18] + pmulhrsw m1, m7 + pmaddubsw m0, m3, [r3] ; [16] + pmulhrsw m0, m7 + packuswb m1, m0 + + TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 + + pmaddubsw m4, m3, [r3 - 2 * 16] ; [14] + pmulhrsw m4, m7 + pmaddubsw m5, m3, [r3 - 4 * 16] ; [12] + pmulhrsw m5, m7 + packuswb m4, m5 + + pmaddubsw m5, m3, [r3 - 6 * 16] ; [10] + pmulhrsw m5, m7 + pmaddubsw m6, m3, [r3 - 8 * 16] ; [08] + pmulhrsw m6, m7 + packuswb m5, m6 + + pmaddubsw m6, m3, [r3 - 10 * 16] ; [06] + pmulhrsw m6, m7 + pmaddubsw m1, m3, [r3 - 12 * 16] ; [04] + pmulhrsw m1, m7 + packuswb m6, m1 + + pmaddubsw m1, m3, [r3 - 14 * 16] ; [02] + pmulhrsw m1, m7 + packuswb m1, m1 + + movh [r0 ], m4 + movhps [r0 + r1 ], m4 + movh [r0 + r1 * 2], m5 + movhps [r0 + r5 ], m5 + lea r0, [r0 + r1 * 4] + movh [r0 ], m6 + movhps [r0 + r1 ], m6 + movh [r0 + r1 * 2], m1 + movh [r0 + r5 ], m2 + + lea r0, [r6 + 8] + add r2, 8 + dec r4 + jnz .loop + + RET + +INIT_XMM sse4 +cglobal intra_pred_ang16_12, 4,7,8 + + lea r4, [ang_table + 16 * 16] + lea r5, [r1 * 3] ; r5 -> 3 * 
stride + lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride + mova m7, [pw_1024] + + movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] + punpckhbw m0, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] + punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] + movu m2, [r3] + pshufb m2, [c_mode16_12] + + palignr m0, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] + + pmaddubsw m4, m0, [r4 + 11 * 16] ; [27] + pmulhrsw m4, m7 + pmaddubsw m1, m0, [r4 + 6 * 16] ; [22] + pmulhrsw m1, m7 + packuswb m4, m1 + + pmaddubsw m5, m0, [r4 + 1 * 16] ; [17] + pmulhrsw m5, m7 + pmaddubsw m6, m0, [r4 - 4 * 16] ; [12] + pmulhrsw m6, m7 + packuswb m5, m6 + + pmaddubsw m6, m0, [r4 - 9 * 16] ; [7] + pmulhrsw m6, m7 + pmaddubsw m0, [r4 - 14 * 16] ; [2] + pmulhrsw m0, m7 + packuswb m6, m0 + + palignr m3, m2, 15 + + pmaddubsw m1, m3, [r4 + 13 * 16] ; [29] + pmulhrsw m1, m7 + pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] + pmulhrsw m0, m7 + packuswb m1, m0 + + TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 + + pmaddubsw m4, m3, [r4 + 3 * 16] ; [19] + pmulhrsw m4, m7 + pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] + pmulhrsw m5, m7 + packuswb m4, m5 + + pmaddubsw m5, m3, [r4 - 7 * 16] ; [09] + pmulhrsw m5, m7 + pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] + pmulhrsw m6, m7 + packuswb m5, m6 + + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 + 15 * 16] ; [31] + pmulhrsw m6, m7 + pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] + pmulhrsw m1, m7 + packuswb m6, m1 + + pmaddubsw m1, m3, [r4 + 5 * 16] ; [21] + pmulhrsw m1, m7 + pmaddubsw m3, [r4] ; [16] + pmulhrsw m3, m7 + packuswb m1, m3 + + TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 + + lea r0, [r6 + r1 * 4] + lea r6, [r6 + r1 * 8] + + movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] + punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] + movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x] + + pmaddubsw m4, m3, [r4 + 11 * 16] ; [27] + pmulhrsw m4, m7 + pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] + pmulhrsw m5, 
m7 + packuswb m4, m5 + + pmaddubsw m5, m3, [r4 + 1 * 16] ; [17] + pmulhrsw m5, m7 + pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] + pmulhrsw m6, m7 + packuswb m5, m6 + + pmaddubsw m6, m3, [r4 - 9 * 16] ; [7] + pmulhrsw m6, m7 + pmaddubsw m0, m3, [r4 - 14 * 16] ; [2] + pmulhrsw m0, m7 + packuswb m6, m0 + + palignr m3, m2, 14 + + pmaddubsw m1, m3, [r4 + 13 * 16] ; [29] + pmulhrsw m1, m7 + pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] + pmulhrsw m0, m7 + packuswb m1, m0 + + TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 + + pmaddubsw m4, m3, [r4 + 3 * 16] ; [19] + pmulhrsw m4, m7 + pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] + pmulhrsw m5, m7 + packuswb m4, m5 + + pmaddubsw m5, m3, [r4 - 7 * 16] ; [09] + pmulhrsw m5, m7 + pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] + pmulhrsw m6, m7 + packuswb m5, m6 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 + 15 * 16] ; [31] + pmulhrsw m6, m7 + pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] + pmulhrsw m1, m7 + packuswb m6, m1 + + pmaddubsw m1, m3, [r4 + 5 * 16] ; [21] + pmulhrsw m1, m7 + pmaddubsw m3, [r4] ; [16] + pmulhrsw m3, m7 + packuswb m1, m3 + + TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 + + RET + +INIT_XMM sse4 +cglobal intra_pred_ang16_24, 4,7,8 + + lea r4, [ang_table + 16 * 16] + lea r5, [r1 * 3] ; r5 -> 3 * stride + mov r6, r0 + mova m7, [pw_1024] + + movu m3, [r3] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] + punpckhbw m0, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] + punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] + movu m2, [r2] + pshufb m2, [c_mode16_12] + + palignr m0, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] + + pmaddubsw m4, m0, [r4 + 11 * 16] ; [27] + pmulhrsw m4, m7 + pmaddubsw m1, m0, [r4 + 6 * 16] ; [22] + pmulhrsw m1, m7 + packuswb m4, m1 + + pmaddubsw m5, m0, [r4 + 1 * 16] ; [17] + pmulhrsw m5, m7 + pmaddubsw m6, m0, [r4 - 4 * 16] ; [12] + pmulhrsw m6, m7 + packuswb m5, m6 + + pmaddubsw m6, m0, [r4 - 9 * 16] ; [7] + pmulhrsw m6, m7 + pmaddubsw m0, [r4 - 14 * 16] ; [2] + pmulhrsw m0, m7 + packuswb m6, m0 + + 
palignr m3, m2, 15 + + pmaddubsw m1, m3, [r4 + 13 * 16] ; [29] + pmulhrsw m1, m7 + pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] + pmulhrsw m0, m7 + packuswb m1, m0 + + TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 + + pmaddubsw m4, m3, [r4 + 3 * 16] ; [19] + pmulhrsw m4, m7 + pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] + pmulhrsw m5, m7 + packuswb m4, m5 + + pmaddubsw m5, m3, [r4 - 7 * 16] ; [09] + pmulhrsw m5, m7 + pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] + pmulhrsw m6, m7 + packuswb m5, m6 + + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 + 15 * 16] ; [31] + pmulhrsw m6, m7 + pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] + pmulhrsw m1, m7 + packuswb m6, m1 + + pmaddubsw m1, m3, [r4 + 5 * 16] ; [21] + pmulhrsw m1, m7 + pmaddubsw m3, [r4] ; [16] + pmulhrsw m3, m7 + packuswb m1, m3 + + TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 + + lea r0, [r6 + 8] + + movu m1, [r3 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] + punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] + movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x] + + pmaddubsw m4, m3, [r4 + 11 * 16] ; [27] + pmulhrsw m4, m7 + pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] + pmulhrsw m5, m7 + packuswb m4, m5 + + pmaddubsw m5, m3, [r4 + 1 * 16] ; [17] + pmulhrsw m5, m7 + pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] + pmulhrsw m6, m7 + packuswb m5, m6 + + pmaddubsw m6, m3, [r4 - 9 * 16] ; [7] + pmulhrsw m6, m7 + pmaddubsw m0, m3, [r4 - 14 * 16] ; [2] + pmulhrsw m0, m7 + packuswb m6, m0 + + palignr m3, m2, 14 + + pmaddubsw m1, m3, [r4 + 13 * 16] ; [29] + pmulhrsw m1, m7 + pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] + pmulhrsw m0, m7 + packuswb m1, m0 + + TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 + + pmaddubsw m4, m3, [r4 + 3 * 16] ; [19] + pmulhrsw m4, m7 + pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] + pmulhrsw m5, m7 + packuswb m4, m5 + + pmaddubsw m5, m3, [r4 - 7 * 16] ; [09] + pmulhrsw m5, m7 + pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] + pmulhrsw m6, m7 + packuswb m5, m6 + + pslldq m2, 1 + 
palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 + 15 * 16] ; [31] + pmulhrsw m6, m7 + pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] + pmulhrsw m1, m7 + packuswb m6, m1 + + pmaddubsw m1, m3, [r4 + 5 * 16] ; [21] + pmulhrsw m1, m7 + pmaddubsw m3, [r4] ; [16] + pmulhrsw m3, m7 + packuswb m1, m3 + + TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 + + RET + +INIT_XMM sse4 +cglobal intra_pred_ang16_13, 4,7,8 + + lea r4, [ang_table + 16 * 16] + lea r5, [r1 * 3] ; r5 -> 3 * stride + lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride + mova m7, [pw_1024] + + movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] + punpckhbw m5, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] + punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] + movu m2, [r3] + pshufb m2, [c_mode16_13] + + palignr m5, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] + + pmaddubsw m4, m5, [r4 + 7 * 16] ; [23] + pmulhrsw m4, m7 + pmaddubsw m0, m5, [r4 - 2 * 16] ; [14] + pmulhrsw m0, m7 + packuswb m4, m0 + + pmaddubsw m5, [r4 - 11 * 16] ; [05] + pmulhrsw m5, m7 + + palignr m3, m2, 15 + + pmaddubsw m6, m3, [r4 + 12 * 16] ; [28] + pmulhrsw m6, m7 + packuswb m5, m6 + + pmaddubsw m6, m3, [r4 + 3 * 16] ; [19] + pmulhrsw m6, m7 + pmaddubsw m0, m3, [r4 - 6 * 16] ; [10] + pmulhrsw m0, m7 + packuswb m6, m0 + + pmaddubsw m1, m3, [r4 - 15 * 16] ; [01] + pmulhrsw m1, m7 + + palignr m3, m2, 14 + + pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] + pmulhrsw m0, m7 + packuswb m1, m0 + + TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 + + pmaddubsw m4, m3, [r4 - 16] ; [15] + pmulhrsw m4, m7 + pmaddubsw m5, m3, [r4 - 10 * 16] ; [06] + pmulhrsw m5, m7 + packuswb m4, m5 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m5, m3, [r4 + 13 * 16] ; [29] + pmulhrsw m5, m7 + pmaddubsw m6, m3, [r4 + 4 * 16] ; [20] + pmulhrsw m6, m7 + packuswb m5, m6 + + pmaddubsw m6, m3, [r4 - 5 * 16] ; [11] + pmulhrsw m6, m7 + pmaddubsw m1, m3, [r4 - 14 * 16] ; [02] + pmulhrsw m1, m7 + packuswb m6, m1 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m1, m3, [r4 + 9 * 16] ; [25] + 
pmulhrsw m1, m7 + pmaddubsw m3, [r4] ; [16] + pmulhrsw m3, m7 + packuswb m1, m3 + + TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 + + lea r0, [r6 + r1 * 4] + lea r6, [r6 + r1 * 8] + + movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] + punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] + movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x] + + pmaddubsw m4, m3, [r4 + 7 * 16] ; [23] + pmulhrsw m4, m7 + pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] + pmulhrsw m5, m7 + packuswb m4, m5 + + pmaddubsw m5, m3, [r4 - 11 * 16] ; [05] + pmulhrsw m5, m7 + + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 + 12 * 16] ; [28] + pmulhrsw m6, m7 + packuswb m5, m6 + + pmaddubsw m6, m3, [r4 + 3 * 16] ; [19] + pmulhrsw m6, m7 + pmaddubsw m0, m3, [r4 - 6 * 16] ; [10] + pmulhrsw m0, m7 + packuswb m6, m0 + + pmaddubsw m1, m3, [r4 - 15 * 16] ; [01] + pmulhrsw m1, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] + pmulhrsw m0, m7 + packuswb m1, m0 + + TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 + + pmaddubsw m4, m3, [r4 - 16] ; [15] + pmulhrsw m4, m7 + pmaddubsw m5, m3, [r4 - 10 * 16] ; [06] + pmulhrsw m5, m7 + packuswb m4, m5 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m5, m3, [r4 + 13 * 16] ; [29] + pmulhrsw m5, m7 + pmaddubsw m6, m3, [r4 + 4 * 16] ; [20] + pmulhrsw m6, m7 + packuswb m5, m6 + + pmaddubsw m6, m3, [r4 - 5 * 16] ; [11] + pmulhrsw m6, m7 + pmaddubsw m1, m3, [r4 - 14 * 16] ; [02] + pmulhrsw m1, m7 + packuswb m6, m1 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m1, m3, [r4 + 9 * 16] ; [25] + pmulhrsw m1, m7 + pmaddubsw m3, [r4] ; [16] + pmulhrsw m3, m7 + packuswb m1, m3 + + TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 + + RET + +INIT_XMM sse4 +cglobal intra_pred_ang16_23, 4,7,8 + + lea r4, [ang_table + 16 * 16] + lea r5, [r1 * 3] ; r5 -> 3 * stride + mov r6, r0 + mova m7, [pw_1024] + + movu m3, [r3] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] + punpckhbw m5, m3, m3 ; [15 
15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] + punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] + movu m2, [r2] + pshufb m2, [c_mode16_13] + + palignr m5, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] + + pmaddubsw m4, m5, [r4 + 7 * 16] ; [23] + pmulhrsw m4, m7 + pmaddubsw m0, m5, [r4 - 2 * 16] ; [14] + pmulhrsw m0, m7 + packuswb m4, m0 + + pmaddubsw m5, [r4 - 11 * 16] ; [05] + pmulhrsw m5, m7 + + palignr m3, m2, 15 + + pmaddubsw m6, m3, [r4 + 12 * 16] ; [28] + pmulhrsw m6, m7 + packuswb m5, m6 + + pmaddubsw m6, m3, [r4 + 3 * 16] ; [19] + pmulhrsw m6, m7 + pmaddubsw m0, m3, [r4 - 6 * 16] ; [10] + pmulhrsw m0, m7 + packuswb m6, m0 + + pmaddubsw m1, m3, [r4 - 15 * 16] ; [01] + pmulhrsw m1, m7 + + palignr m3, m2, 14 + + pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] + pmulhrsw m0, m7 + packuswb m1, m0 + + TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 + + pmaddubsw m4, m3, [r4 - 16] ; [15] + pmulhrsw m4, m7 + pmaddubsw m5, m3, [r4 - 10 * 16] ; [06] + pmulhrsw m5, m7 + packuswb m4, m5 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m5, m3, [r4 + 13 * 16] ; [29] + pmulhrsw m5, m7 + pmaddubsw m6, m3, [r4 + 4 * 16] ; [20] + pmulhrsw m6, m7 + packuswb m5, m6 + + pmaddubsw m6, m3, [r4 - 5 * 16] ; [11] + pmulhrsw m6, m7 + pmaddubsw m1, m3, [r4 - 14 * 16] ; [02] + pmulhrsw m1, m7 + packuswb m6, m1 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m1, m3, [r4 + 9 * 16] ; [25] + pmulhrsw m1, m7 + pmaddubsw m3, [r4] ; [16] + pmulhrsw m3, m7 + packuswb m1, m3 + + TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 + + lea r0, [r6 + 8] + + movu m1, [r3 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] + punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] + movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x] + + pmaddubsw m4, m3, [r4 + 7 * 16] ; [23] + pmulhrsw m4, m7 + pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] + pmulhrsw m5, m7 + packuswb m4, m5 + + pmaddubsw m5, m3, [r4 - 11 * 16] ; [05] + pmulhrsw m5, m7 + + palignr m3, m2, 14 + + 
pmaddubsw m6, m3, [r4 + 12 * 16] ; [28] + pmulhrsw m6, m7 + packuswb m5, m6 + + pmaddubsw m6, m3, [r4 + 3 * 16] ; [19] + pmulhrsw m6, m7 + pmaddubsw m0, m3, [r4 - 6 * 16] ; [10] + pmulhrsw m0, m7 + packuswb m6, m0 + + pmaddubsw m1, m3, [r4 - 15 * 16] ; [01] + pmulhrsw m1, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] + pmulhrsw m0, m7 + packuswb m1, m0 + + TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 + + pmaddubsw m4, m3, [r4 - 16] ; [15] + pmulhrsw m4, m7 + pmaddubsw m5, m3, [r4 - 10 * 16] ; [06] + pmulhrsw m5, m7 + packuswb m4, m5 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m5, m3, [r4 + 13 * 16] ; [29] + pmulhrsw m5, m7 + pmaddubsw m6, m3, [r4 + 4 * 16] ; [20] + pmulhrsw m6, m7 + packuswb m5, m6 + + pmaddubsw m6, m3, [r4 - 5 * 16] ; [11] + pmulhrsw m6, m7 + pmaddubsw m1, m3, [r4 - 14 * 16] ; [02] + pmulhrsw m1, m7 + packuswb m6, m1 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m1, m3, [r4 + 9 * 16] ; [25] + pmulhrsw m1, m7 + pmaddubsw m3, [r4] ; [16] + pmulhrsw m3, m7 + packuswb m1, m3 + + TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 + + RET + +INIT_XMM sse4 +cglobal intra_pred_ang16_14, 4,7,8 + + lea r4, [ang_table + 16 * 16] + lea r5, [r1 * 3] ; r5 -> 3 * stride + lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride + mova m7, [pw_1024] + + movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] + punpckhbw m5, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] + punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] + movu m2, [r3] + pshufb m2, [c_mode16_14] + + palignr m5, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] + + pmaddubsw m4, m5, [r4 + 3 * 16] ; [19] + pmulhrsw m4, m7 + pmaddubsw m5, [r4 - 10 * 16] ; [06] + pmulhrsw m5, m7 + packuswb m4, m5 + + palignr m3, m2, 15 + + pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] + pmulhrsw m5, m7 + pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] + pmulhrsw m6, m7 + packuswb m5, m6 + + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 + 15 * 16] ; [31] + pmulhrsw m6, m7 + pmaddubsw m0, m3, [r4 + 2 * 16] ; 
[18] + pmulhrsw m0, m7 + packuswb m6, m0 + + pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] + pmulhrsw m1, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] + pmulhrsw m0, m7 + packuswb m1, m0 + + TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 + + pmaddubsw m4, m3, [r4 - 5 * 16] ; [11] + pmulhrsw m4, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] + pmulhrsw m5, m7 + packuswb m4, m5 + + pmaddubsw m5, m3, [r4 + 16] ; [17] + pmulhrsw m5, m7 + pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] + pmulhrsw m6, m7 + packuswb m5, m6 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] + pmulhrsw m6, m7 + pmaddubsw m1, m3, [r4 - 6 * 16] ; [10] + pmulhrsw m1, m7 + packuswb m6, m1 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m1, m3, [r4 + 13 * 16] ; [29] + pmulhrsw m1, m7 + pmaddubsw m3, [r4] ; [16] + pmulhrsw m3, m7 + packuswb m1, m3 + + TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 + + lea r0, [r6 + r1 * 4] + lea r6, [r6 + r1 * 8] + + movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] + punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] + movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x] + + pmaddubsw m4, m3, [r4 + 3 * 16] ; [19] + pmulhrsw m4, m7 + pmaddubsw m5, m3, [r4 - 10 * 16] ; [06] + pmulhrsw m5, m7 + packuswb m4, m5 + + palignr m3, m2, 14 + + pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] + pmulhrsw m5, m7 + pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] + pmulhrsw m6, m7 + packuswb m5, m6 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 + 15 * 16] ; [31] + pmulhrsw m6, m7 + pmaddubsw m0, m3, [r4 + 2 * 16] ; [18] + pmulhrsw m0, m7 + packuswb m6, m0 + + pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] + pmulhrsw m1, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] + pmulhrsw m0, m7 + packuswb m1, m0 + + TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 + + pmaddubsw m4, m3, [r4 - 5 * 16] ; 
[11] + pmulhrsw m4, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] + pmulhrsw m5, m7 + packuswb m4, m5 + + pmaddubsw m5, m3, [r4 + 16] ; [17] + pmulhrsw m5, m7 + pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] + pmulhrsw m6, m7 + packuswb m5, m6 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] + pmulhrsw m6, m7 + pmaddubsw m1, m3, [r4 - 6 * 16] ; [10] + pmulhrsw m1, m7 + packuswb m6, m1 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m1, m3, [r4 + 13 * 16] ; [29] + pmulhrsw m1, m7 + pmaddubsw m3, [r4] ; [16] + pmulhrsw m3, m7 + packuswb m1, m3 + + TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 + + RET + +INIT_XMM sse4 +cglobal intra_pred_ang16_22, 4,7,8 + + lea r4, [ang_table + 16 * 16] + lea r5, [r1 * 3] ; r5 -> 3 * stride + mov r6, r0 + mova m7, [pw_1024] + + movu m3, [r3] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] + punpckhbw m5, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] + punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] + movu m2, [r2] + pshufb m2, [c_mode16_14] + + palignr m5, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] + + pmaddubsw m4, m5, [r4 + 3 * 16] ; [19] + pmulhrsw m4, m7 + pmaddubsw m5, [r4 - 10 * 16] ; [06] + pmulhrsw m5, m7 + packuswb m4, m5 + + palignr m3, m2, 15 + + pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] + pmulhrsw m5, m7 + pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] + pmulhrsw m6, m7 + packuswb m5, m6 + + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 + 15 * 16] ; [31] + pmulhrsw m6, m7 + pmaddubsw m0, m3, [r4 + 2 * 16] ; [18] + pmulhrsw m0, m7 + packuswb m6, m0 + + pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] + pmulhrsw m1, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] + pmulhrsw m0, m7 + packuswb m1, m0 + + TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 + + pmaddubsw m4, m3, [r4 - 5 * 16] ; [11] + pmulhrsw m4, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] + pmulhrsw m5, m7 + packuswb m4, m5 + + pmaddubsw m5, m3, 
[r4 + 16] ; [17] + pmulhrsw m5, m7 + pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] + pmulhrsw m6, m7 + packuswb m5, m6 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] + pmulhrsw m6, m7 + pmaddubsw m1, m3, [r4 - 6 * 16] ; [10] + pmulhrsw m1, m7 + packuswb m6, m1 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m1, m3, [r4 + 13 * 16] ; [29] + pmulhrsw m1, m7 + pmaddubsw m3, [r4] ; [16] + pmulhrsw m3, m7 + packuswb m1, m3 + + TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 + + lea r0, [r6 + 8] + + movu m1, [r3 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] + punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] + movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x] + + pmaddubsw m4, m3, [r4 + 3 * 16] ; [19] + pmulhrsw m4, m7 + pmaddubsw m5, m3, [r4 - 10 * 16] ; [06] + pmulhrsw m5, m7 + packuswb m4, m5 + + palignr m3, m2, 14 + + pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] + pmulhrsw m5, m7 + pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] + pmulhrsw m6, m7 + packuswb m5, m6 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 + 15 * 16] ; [31] + pmulhrsw m6, m7 + pmaddubsw m0, m3, [r4 + 2 * 16] ; [18] + pmulhrsw m0, m7 + packuswb m6, m0 + + pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] + pmulhrsw m1, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] + pmulhrsw m0, m7 + packuswb m1, m0 + + TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 + + pmaddubsw m4, m3, [r4 - 5 * 16] ; [11] + pmulhrsw m4, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] + pmulhrsw m5, m7 + packuswb m4, m5 + + pmaddubsw m5, m3, [r4 + 16] ; [17] + pmulhrsw m5, m7 + pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] + pmulhrsw m6, m7 + packuswb m5, m6 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] + pmulhrsw m6, m7 + pmaddubsw m1, m3, [r4 - 6 * 16] ; [10] + pmulhrsw m1, m7 + packuswb m6, m1 + + pslldq m2, 1 + palignr m3, m2, 14 + + 
pmaddubsw m1, m3, [r4 + 13 * 16] ; [29] + pmulhrsw m1, m7 + pmaddubsw m3, [r4] ; [16] + pmulhrsw m3, m7 + packuswb m1, m3 + + TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 + + RET + +INIT_XMM sse4 +cglobal intra_pred_ang16_15, 4,7,8 + + lea r4, [ang_table + 16 * 16] + lea r5, [r1 * 3] ; r5 -> 3 * stride + lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride + mova m7, [pw_1024] + + movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] + punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] + punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] + movu m2, [r3] + pshufb m2, [c_mode16_15] + + palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] + + pmaddubsw m4, [r4 - 16] ; [15] + pmulhrsw m4, m7 + + palignr m3, m2, 15 + + pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] + pmulhrsw m5, m7 + packuswb m4, m5 + + pmaddubsw m5, m3, [r4 - 3 * 16] ; [13] + pmulhrsw m5, m7 + + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 + 12 * 16] ; [28] + pmulhrsw m6, m7 + packuswb m5, m6 + + pmaddubsw m6, m3, [r4 - 5 * 16] ; [11] + pmulhrsw m6, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m0, m3, [r4 + 10 * 16] ; [26] + pmulhrsw m0, m7 + packuswb m6, m0 + + pmaddubsw m1, m3, [r4 - 7 * 16] ; [09] + pmulhrsw m1, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] + pmulhrsw m0, m7 + packuswb m1, m0 + + TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 + + pmaddubsw m4, m3, [r4 - 9 * 16] ; [07] + pmulhrsw m4, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] + pmulhrsw m5, m7 + packuswb m4, m5 + + pmaddubsw m5, m3, [r4 - 11 * 16] ; [05] + pmulhrsw m5, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 + 4 * 16] ; [20] + pmulhrsw m6, m7 + packuswb m5, m6 + + pmaddubsw m6, m3, [r4 - 13 * 16] ; [03] + pmulhrsw m6, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m1, m3, [r4 + 2 * 16] ; [18] + pmulhrsw m1, m7 + packuswb m6, m1 + + pmaddubsw m1, m3, [r4 - 15 * 16] ; [01] + pmulhrsw m1, m7 + + pslldq m2, 1 + palignr 
m3, m2, 14 + + pmaddubsw m3, [r4] ; [16] + pmulhrsw m3, m7 + packuswb m1, m3 + + TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 + + lea r0, [r6 + r1 * 4] + lea r6, [r6 + r1 * 8] + + movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] + punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] + movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0 0 0 0 0 0 0 15L] + + pmaddubsw m4, m3, [r4 - 16] ; [15] + pmulhrsw m4, m7 + + palignr m3, m2, 14 + + pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] + pmulhrsw m5, m7 + packuswb m4, m5 + + pmaddubsw m5, m3, [r4 - 3 * 16] ; [13] + pmulhrsw m5, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 + 12 * 16] ; [28] + pmulhrsw m6, m7 + packuswb m5, m6 + + pmaddubsw m6, m3, [r4 - 5 * 16] ; [11] + pmulhrsw m6, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m0, m3, [r4 + 10 * 16] ; [26] + pmulhrsw m0, m7 + packuswb m6, m0 + + pmaddubsw m1, m3, [r4 - 7 * 16] ; [09] + pmulhrsw m1, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] + pmulhrsw m0, m7 + packuswb m1, m0 + + TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 + + pmaddubsw m4, m3, [r4 - 9 * 16] ; [07] + pmulhrsw m4, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] + pmulhrsw m5, m7 + packuswb m4, m5 + + pmaddubsw m5, m3, [r4 - 11 * 16] ; [05] + pmulhrsw m5, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 + 4 * 16] ; [20] + pmulhrsw m6, m7 + packuswb m5, m6 + + pmaddubsw m6, m3, [r4 - 13 * 16] ; [03] + pmulhrsw m6, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m1, m3, [r4 + 2 * 16] ; [18] + pmulhrsw m1, m7 + packuswb m6, m1 + + pmaddubsw m1, m3, [r4 - 15 * 16] ; [01] + pmulhrsw m1, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m3, [r4] ; [16] + pmulhrsw m3, m7 + packuswb m1, m3 + + TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 + + RET + +INIT_XMM sse4 +cglobal intra_pred_ang16_21, 4,7,8 + + lea r4, [ang_table + 16 * 
16] + lea r5, [r1 * 3] ; r5 -> 3 * stride + mov r6, r0 + mova m7, [pw_1024] + + movu m3, [r3] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] + punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] + punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] + movu m2, [r2] + pshufb m2, [c_mode16_15] + + palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] + + pmaddubsw m4, [r4 - 16] ; [15] + pmulhrsw m4, m7 + + palignr m3, m2, 15 + + pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] + pmulhrsw m5, m7 + packuswb m4, m5 + + pmaddubsw m5, m3, [r4 - 3 * 16] ; [13] + pmulhrsw m5, m7 + + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 + 12 * 16] ; [28] + pmulhrsw m6, m7 + packuswb m5, m6 + + pmaddubsw m6, m3, [r4 - 5 * 16] ; [11] + pmulhrsw m6, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m0, m3, [r4 + 10 * 16] ; [26] + pmulhrsw m0, m7 + packuswb m6, m0 + + pmaddubsw m1, m3, [r4 - 7 * 16] ; [09] + pmulhrsw m1, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] + pmulhrsw m0, m7 + packuswb m1, m0 + + TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 + + pmaddubsw m4, m3, [r4 - 9 * 16] ; [07] + pmulhrsw m4, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] + pmulhrsw m5, m7 + packuswb m4, m5 + + pmaddubsw m5, m3, [r4 - 11 * 16] ; [05] + pmulhrsw m5, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 + 4 * 16] ; [20] + pmulhrsw m6, m7 + packuswb m5, m6 + + pmaddubsw m6, m3, [r4 - 13 * 16] ; [03] + pmulhrsw m6, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m1, m3, [r4 + 2 * 16] ; [18] + pmulhrsw m1, m7 + packuswb m6, m1 + + pmaddubsw m1, m3, [r4 - 15 * 16] ; [01] + pmulhrsw m1, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m3, [r4] ; [16] + pmulhrsw m3, m7 + packuswb m1, m3 + + TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 + + lea r0, [r6 + 8] + + movu m1, [r3 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] + punpckhbw m3, m1 ; 
[16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] + movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0 0 0 0 0 0 0 15L] + + pmaddubsw m4, m3, [r4 - 16] ; [15] + pmulhrsw m4, m7 + + palignr m3, m2, 14 + + pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] + pmulhrsw m5, m7 + packuswb m4, m5 + + pmaddubsw m5, m3, [r4 - 3 * 16] ; [13] + pmulhrsw m5, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 + 12 * 16] ; [28] + pmulhrsw m6, m7 + packuswb m5, m6 + + pmaddubsw m6, m3, [r4 - 5 * 16] ; [11] + pmulhrsw m6, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m0, m3, [r4 + 10 * 16] ; [26] + pmulhrsw m0, m7 + packuswb m6, m0 + + pmaddubsw m1, m3, [r4 - 7 * 16] ; [09] + pmulhrsw m1, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] + pmulhrsw m0, m7 + packuswb m1, m0 + + TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 + + pmaddubsw m4, m3, [r4 - 9 * 16] ; [07] + pmulhrsw m4, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] + pmulhrsw m5, m7 + packuswb m4, m5 + + pmaddubsw m5, m3, [r4 - 11 * 16] ; [05] + pmulhrsw m5, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 + 4 * 16] ; [20] + pmulhrsw m6, m7 + packuswb m5, m6 + + pmaddubsw m6, m3, [r4 - 13 * 16] ; [03] + pmulhrsw m6, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m1, m3, [r4 + 2 * 16] ; [18] + pmulhrsw m1, m7 + packuswb m6, m1 + + pmaddubsw m1, m3, [r4 - 15 * 16] ; [01] + pmulhrsw m1, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m3, [r4] ; [16] + pmulhrsw m3, m7 + packuswb m1, m3 + + TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 + + RET + +INIT_XMM sse4 +cglobal intra_pred_ang16_16, 4,7,8 + + lea r4, [ang_table + 16 * 16] + lea r5, [r1 * 3] ; r5 -> 3 * stride + lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride + mova m7, [pw_1024] + + movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] + punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] + punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] + movu m2, [r3] + pshufb 
m2, [c_mode16_16] ; [2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8] + palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] + + pmaddubsw m4, [r4 - 5 * 16] ; [11] + pmulhrsw m4, m7 + + palignr m3, m2, 15 + + pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] + pmulhrsw m5, m7 + packuswb m4, m5 + + pmaddubsw m5, m3, [r4 - 15 * 16] ; [01] + pmulhrsw m5, m7 + + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] + pmulhrsw m6, m7 + packuswb m5, m6 + + pslldq m2, 1 ; [3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x] + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] + pmulhrsw m6, m7 + pmaddubsw m0, m3, [r4 - 14 * 16] ; [02] + pmulhrsw m0, m7 + packuswb m6, m0 + + pslldq m2, 1 ; [5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x] + palignr m3, m2, 14 + + pmaddubsw m1, m3, [r4 - 3 * 16] ; [13] + pmulhrsw m1, m7 + + pslldq m2, 1 ; [6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x] + palignr m3, m2, 14 + + pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] + pmulhrsw m0, m7 + packuswb m1, m0 + + TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 + + pmaddubsw m4, m3, [r4 - 13 * 16] ; [03] + pmulhrsw m4, m7 + + pslldq m2, 1 ; [8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x] + palignr m3, m2, 14 + + pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] + pmulhrsw m5, m7 + packuswb m4, m5 + + pslldq m2, 1 ; [9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x] + palignr m3, m2, 14 + + pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] + pmulhrsw m5, m7 + pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] + pmulhrsw m6, m7 + packuswb m5, m6 + + pslldq m2, 1 ; [11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x] + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 - 16] ; [15] + pmulhrsw m6, m7 + + pslldq m2, 1 ; [12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x] + palignr m3, m2, 14 + + pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] + pmulhrsw m1, m7 + packuswb m6, m1 + + pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] + pmulhrsw m1, m7 + + pslldq m2, 1 ; [14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x, x] + palignr m3, m2, 14 + + 
pmaddubsw m3, [r4] ; [16] + pmulhrsw m3, m7 + packuswb m1, m3 + + TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 + + lea r0, [r6 + r1 * 4] + lea r6, [r6 + r1 * 8] + + movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] + punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] + palignr m2, m2, 6 ; [x, x, x, x, x, x, 14, 15, 0, 2, 3, 5, 6, 8, x, x] + movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0, 2, 3, 5, 6, 8, x, x] + + pmaddubsw m4, m3, [r4 - 5 * 16] ; [11] + pmulhrsw m4, m7 + + palignr m3, m2, 14 + + pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] + pmulhrsw m5, m7 + packuswb m4, m5 + + pmaddubsw m5, m3, [r4 - 15 * 16] ; [01] + pmulhrsw m5, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] + pmulhrsw m6, m7 + packuswb m5, m6 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] + pmulhrsw m6, m7 + + pmaddubsw m0, m3, [r4 - 14 * 16] ; [02] + pmulhrsw m0, m7 + packuswb m6, m0 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m1, m3, [r4 - 3 * 16] ; [13] + pmulhrsw m1, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] + pmulhrsw m0, m7 + packuswb m1, m0 + + TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 + + pmaddubsw m4, m3, [r4 - 13 * 16] ; [03] + pmulhrsw m4, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] + pmulhrsw m5, m7 + packuswb m4, m5 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] + pmulhrsw m5, m7 + pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] + pmulhrsw m6, m7 + packuswb m5, m6 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 - 16] ; [15] + pmulhrsw m6, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] + pmulhrsw m1, m7 + packuswb m6, m1 + + pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] + pmulhrsw m1, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m3, [r4] ; [16] + pmulhrsw m3, m7 + packuswb m1, 
m3 + + TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 + + RET + +INIT_XMM sse4 +cglobal intra_pred_ang16_20, 4,7,8 + + lea r4, [ang_table + 16 * 16] + lea r5, [r1 * 3] ; r5 -> 3 * stride + mov r6, r0 + mova m7, [pw_1024] + + movu m3, [r3] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] + punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] + punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] + movu m2, [r2] + pshufb m2, [c_mode16_16] ; [2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8] + palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] + + pmaddubsw m4, [r4 - 5 * 16] ; [11] + pmulhrsw m4, m7 + + palignr m3, m2, 15 + + pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] + pmulhrsw m5, m7 + packuswb m4, m5 + + pmaddubsw m5, m3, [r4 - 15 * 16] ; [01] + pmulhrsw m5, m7 + + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] + pmulhrsw m6, m7 + packuswb m5, m6 + + pslldq m2, 1 ; [3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x] + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] + pmulhrsw m6, m7 + pmaddubsw m0, m3, [r4 - 14 * 16] ; [02] + pmulhrsw m0, m7 + packuswb m6, m0 + + pslldq m2, 1 ; [5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x] + palignr m3, m2, 14 + + pmaddubsw m1, m3, [r4 - 3 * 16] ; [13] + pmulhrsw m1, m7 + + pslldq m2, 1 ; [6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x] + palignr m3, m2, 14 + + pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] + pmulhrsw m0, m7 + packuswb m1, m0 + + TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 + + pmaddubsw m4, m3, [r4 - 13 * 16] ; [03] + pmulhrsw m4, m7 + + pslldq m2, 1 ; [8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x] + palignr m3, m2, 14 + + pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] + pmulhrsw m5, m7 + packuswb m4, m5 + + pslldq m2, 1 ; [9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x] + palignr m3, m2, 14 + + pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] + pmulhrsw m5, m7 + pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] + pmulhrsw m6, m7 + packuswb m5, m6 + + pslldq m2, 1 ; [11, 12, 14, 15, 0, 2, 3, 5, 6, 
8, x, x, x, x, x, x] + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 - 16] ; [15] + pmulhrsw m6, m7 + + pslldq m2, 1 ; [12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x] + palignr m3, m2, 14 + + pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] + pmulhrsw m1, m7 + packuswb m6, m1 + + pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] + pmulhrsw m1, m7 + + pslldq m2, 1 ; [14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x, x] + palignr m3, m2, 14 + + pmaddubsw m3, [r4] ; [16] + pmulhrsw m3, m7 + packuswb m1, m3 + + TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 + + lea r0, [r6 + 8] + + movu m1, [r3 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] + punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] + palignr m2, m2, 6 ; [x, x, x, x, x, x, 14, 15, 0, 2, 3, 5, 6, 8, x, x] + movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0, 2, 3, 5, 6, 8, x, x] + + pmaddubsw m4, m3, [r4 - 5 * 16] ; [11] + pmulhrsw m4, m7 + + palignr m3, m2, 14 + + pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] + pmulhrsw m5, m7 + packuswb m4, m5 + + pmaddubsw m5, m3, [r4 - 15 * 16] ; [01] + pmulhrsw m5, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] + pmulhrsw m6, m7 + packuswb m5, m6 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] + pmulhrsw m6, m7 + + pmaddubsw m0, m3, [r4 - 14 * 16] ; [02] + pmulhrsw m0, m7 + packuswb m6, m0 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m1, m3, [r4 - 3 * 16] ; [13] + pmulhrsw m1, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] + pmulhrsw m0, m7 + packuswb m1, m0 + + TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 + + pmaddubsw m4, m3, [r4 - 13 * 16] ; [03] + pmulhrsw m4, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] + pmulhrsw m5, m7 + packuswb m4, m5 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] + pmulhrsw m5, m7 + pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] + pmulhrsw m6, 
m7 + packuswb m5, m6 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 - 16] ; [15] + pmulhrsw m6, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] + pmulhrsw m1, m7 + packuswb m6, m1 + + pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] + pmulhrsw m1, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m3, [r4] ; [16] + pmulhrsw m3, m7 + packuswb m1, m3 + + TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 + + RET + +INIT_XMM sse4 +cglobal intra_pred_ang16_17, 4,7,8 + + lea r4, [ang_table + 16 * 16] + lea r5, [r1 * 3] ; r5 -> 3 * stride + lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride + mova m7, [pw_1024] + + movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] + punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] + punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] + movu m2, [r3] + pshufb m2, [c_mode16_17] ; [1, 2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4] + palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] + + pmaddubsw m4, [r4 - 10 * 16] ; [06] + pmulhrsw m4, m7 + + palignr m3, m2, 15 + + pmaddubsw m5, m3, [r4 - 4 * 16] ; [12] + pmulhrsw m5, m7 + packuswb m4, m5 + + palignr m3, m2, 14 + + pmaddubsw m5, m3, [r4 + 2 * 16] ; [18] + pmulhrsw m5, m7 + + pslldq m2, 1 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, x] + pinsrb m2, [r3 + 5], 0 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5] + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 + 8 * 16] ; [24] + pmulhrsw m6, m7 + packuswb m5, m6 + + pslldq m2, 1 ; [4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x] + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 + 14 * 16] ; [30] + pmulhrsw m6, m7 + pmaddubsw m0, m3, [r4 - 12 * 16] ; [04] + pmulhrsw m0, m7 + packuswb m6, m0 + + pslldq m2, 1 ; [5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x] + palignr m3, m2, 14 + + pmaddubsw m1, m3, [r4 - 6 * 16] ; [10] + pmulhrsw m1, m7 + + pslldq m2, 1 ; [6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x] + palignr m3, m2, 14 + + pmaddubsw m0, m3, [r4] ; [16] + 
pmulhrsw m0, m7 + packuswb m1, m0 + + TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 + + pslldq m2, 1 ; [7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x] + palignr m3, m2, 14 + + pmaddubsw m4, m3, [r4 + 6 * 16] ; [22] + pmulhrsw m4, m7 + + pslldq m2, 1 ; [9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x] + palignr m3, m2, 14 + + pmaddubsw m5, m3, [r4 + 12 * 16] ; [28] + pmulhrsw m5, m7 + packuswb m4, m5 + + pmaddubsw m5, m3, [r4 - 14 * 16] ; [02] + pmulhrsw m5, m7 + + pslldq m2, 1 ; [10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x] + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 - 8 * 16] ; [08] + pmulhrsw m6, m7 + packuswb m5, m6 + + pslldq m2, 1 ; [11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x] + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 - 2 * 16] ; [14] + pmulhrsw m6, m7 + + pslldq m2, 1 ; [12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x] + palignr m3, m2, 14 + + pmaddubsw m1, m3, [r4 + 4 * 16] ; [20] + pmulhrsw m1, m7 + packuswb m6, m1 + + pslldq m2, 1 ; [14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x, x] + palignr m3, m2, 14 + + pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] + pmulhrsw m1, m7 + pmaddubsw m3, [r4 - 16 * 16] ; [00] + pmulhrsw m3, m7 + packuswb m1, m3 + + TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 + + lea r0, [r6 + r1 * 4] + lea r6, [r6 + r1 * 8] + + movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] + punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] + palignr m2, m2, 6 ; [x, x, x, x, x, x, 14, 15, 0, 1, 2, 4, 5, x, x, x] + movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0, 1, 2, 4, 5, x, x, x] + + pmaddubsw m4, m3, [r4 - 10 * 16] ; [06] + pmulhrsw m4, m7 + + palignr m3, m2, 14 + + pmaddubsw m5, m3, [r4 - 4 * 16] ; [12] + pmulhrsw m5, m7 + packuswb m4, m5 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m5, m3, [r4 + 2 * 16] ; [18] + pmulhrsw m5, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 + 8 * 16] ; [24] + pmulhrsw m6, m7 + packuswb m5, m6 + 
+ pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 + 14 * 16] ; [30] + pmulhrsw m6, m7 + pmaddubsw m0, m3, [r4 - 12 * 16] ; [04] + pmulhrsw m0, m7 + packuswb m6, m0 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m1, m3, [r4 - 6 * 16] ; [10] + pmulhrsw m1, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m0, m3, [r4] ; [16] + pmulhrsw m0, m7 + packuswb m1, m0 + + TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m4, m3, [r4 + 6 * 16] ; [22] + pmulhrsw m4, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m5, m3, [r4 + 12 * 16] ; [28] + pmulhrsw m5, m7 + packuswb m4, m5 + + pmaddubsw m5, m3, [r4 - 14 * 16] ; [02] + pmulhrsw m5, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 - 8 * 16] ; [08] + pmulhrsw m6, m7 + packuswb m5, m6 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 - 2 * 16] ; [14] + pmulhrsw m6, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m1, m3, [r4 + 4 * 16] ; [20] + pmulhrsw m1, m7 + packuswb m6, m1 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] + pmulhrsw m1, m7 + pmaddubsw m3, [r4 - 16 * 16] ; [00] + pmulhrsw m3, m7 + packuswb m1, m3 + + TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 + + RET + +INIT_XMM sse4 +cglobal intra_pred_ang16_19, 4,7,8 + + lea r4, [ang_table + 16 * 16] + lea r5, [r1 * 3] ; r5 -> 3 * stride + mov r6, r0 + mova m7, [pw_1024] + + movu m3, [r3] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] + punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] + punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] + movu m2, [r2] + pshufb m2, [c_mode16_17] ; [1, 2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4] + palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] + + pmaddubsw m4, [r4 - 10 * 16] ; [06] + pmulhrsw m4, m7 + + palignr m3, m2, 15 + + pmaddubsw m5, m3, [r4 - 4 * 16] ; [12] + pmulhrsw m5, m7 + packuswb m4, m5 + + palignr m3, m2, 14 + + pmaddubsw m5, m3, [r4 + 2 * 16] ; [18] + 
pmulhrsw m5, m7 + + pslldq m2, 1 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, x] + pinsrb m2, [r2 + 5], 0 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5] + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 + 8 * 16] ; [24] + pmulhrsw m6, m7 + packuswb m5, m6 + + pslldq m2, 1 ; [4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x] + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 + 14 * 16] ; [30] + pmulhrsw m6, m7 + pmaddubsw m0, m3, [r4 - 12 * 16] ; [04] + pmulhrsw m0, m7 + packuswb m6, m0 + + pslldq m2, 1 ; [5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x] + palignr m3, m2, 14 + + pmaddubsw m1, m3, [r4 - 6 * 16] ; [10] + pmulhrsw m1, m7 + + pslldq m2, 1 ; [6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x] + palignr m3, m2, 14 + + pmaddubsw m0, m3, [r4] ; [16] + pmulhrsw m0, m7 + packuswb m1, m0 + + TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 + + pslldq m2, 1 ; [7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x] + palignr m3, m2, 14 + + pmaddubsw m4, m3, [r4 + 6 * 16] ; [22] + pmulhrsw m4, m7 + + pslldq m2, 1 ; [9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x] + palignr m3, m2, 14 + + pmaddubsw m5, m3, [r4 + 12 * 16] ; [28] + pmulhrsw m5, m7 + packuswb m4, m5 + + pmaddubsw m5, m3, [r4 - 14 * 16] ; [02] + pmulhrsw m5, m7 + + pslldq m2, 1 ; [10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x] + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 - 8 * 16] ; [08] + pmulhrsw m6, m7 + packuswb m5, m6 + + pslldq m2, 1 ; [11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x] + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 - 2 * 16] ; [14] + pmulhrsw m6, m7 + + pslldq m2, 1 ; [12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x] + palignr m3, m2, 14 + + pmaddubsw m1, m3, [r4 + 4 * 16] ; [20] + pmulhrsw m1, m7 + packuswb m6, m1 + + pslldq m2, 1 ; [14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x, x] + palignr m3, m2, 14 + + pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] + pmulhrsw m1, m7 + pmaddubsw m3, [r4 - 16 * 16] ; [00] + pmulhrsw m3, m7 + packuswb m1, m3 + + 
TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 + + lea r0, [r6 + 8] + + movu m1, [r3 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] + punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] + palignr m2, m2, 6 ; [x, x, x, x, x, 14, 15, 0, 1, 2, 4, 5, x, x, x] + movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0, 2, 3, 5, 6, 8, x, x] + + pmaddubsw m4, m3, [r4 - 10 * 16] ; [06] + pmulhrsw m4, m7 + + palignr m3, m2, 14 + + pmaddubsw m5, m3, [r4 - 4 * 16] ; [12] + pmulhrsw m5, m7 + packuswb m4, m5 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m5, m3, [r4 + 2 * 16] ; [18] + pmulhrsw m5, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 + 8 * 16] ; [24] + pmulhrsw m6, m7 + packuswb m5, m6 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 + 14 * 16] ; [30] + pmulhrsw m6, m7 + pmaddubsw m0, m3, [r4 - 12 * 16] ; [04] + pmulhrsw m0, m7 + packuswb m6, m0 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m1, m3, [r4 - 6 * 16] ; [10] + pmulhrsw m1, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m0, m3, [r4] ; [16] + pmulhrsw m0, m7 + packuswb m1, m0 + + TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m4, m3, [r4 + 6 * 16] ; [22] + pmulhrsw m4, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m5, m3, [r4 + 12 * 16] ; [28] + pmulhrsw m5, m7 + packuswb m4, m5 + + pmaddubsw m5, m3, [r4 - 14 * 16] ; [02] + pmulhrsw m5, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 - 8 * 16] ; [08] + pmulhrsw m6, m7 + packuswb m5, m6 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 - 2 * 16] ; [14] + pmulhrsw m6, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m1, m3, [r4 + 4 * 16] ; [20] + pmulhrsw m1, m7 + packuswb m6, m1 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] + pmulhrsw m1, m7 + pmaddubsw m3, [r4 - 16 * 16] ; [00] + pmulhrsw m3, m7 + packuswb m1, m3 + + 
TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 + + RET + +INIT_XMM sse4 +cglobal intra_pred_ang16_18, 4,5,3 + + movu m0, [r3] + movu m1, [r2] + mova m2, [c_mode16_18] + pshufb m1, m2 + + lea r2, [r1 * 2] + lea r3, [r1 * 3] + lea r4, [r1 * 4] + movu [r0], m0 + palignr m2, m0, m1, 15 + movu [r0 + r1], m2 + palignr m2, m0, m1, 14 + movu [r0 + r2], m2 + palignr m2, m0, m1, 13 + movu [r0 + r3], m2 + lea r0, [r0 + r4] + palignr m2, m0, m1, 12 + movu [r0], m2 + palignr m2, m0, m1, 11 + movu [r0 + r1], m2 + palignr m2, m0, m1, 10 + movu [r0 + r2], m2 + palignr m2, m0, m1, 9 + movu [r0 + r3], m2 + lea r0, [r0 + r4] + palignr m2, m0, m1, 8 + movu [r0], m2 + palignr m2, m0, m1, 7 + movu [r0 + r1], m2 + palignr m2, m0, m1, 6 + movu [r0 + r2], m2 + palignr m2, m0, m1, 5 + movu [r0 + r3], m2 + lea r0, [r0 + r4] + palignr m2, m0, m1, 4 + movu [r0], m2 + palignr m2, m0, m1, 3 + movu [r0 + r1], m2 + palignr m2, m0, m1, 2 + movu [r0 + r2], m2 + palignr m0, m1, 1 + movu [r0 + r3], m0 + RET + +;--------------------------------------------------------------------------------------------------------------- +; void intraPredAng32(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +;--------------------------------------------------------------------------------------------------------------- +INIT_XMM ssse3 +cglobal intra_pred_ang32_2, 3,4,4 + cmp r4m, byte 34 + cmove r2, r3mp + movu m0, [r2 + 2] + movu m1, [r2 + 18] + movu m3, [r2 + 34] + + lea r3, [r1 * 3] + + movu [r0], m0 + movu [r0 + 16], m1 + palignr m2, m1, m0, 1 + movu [r0 + r1], m2 + palignr m2, m3, m1, 1 + movu [r0 + r1 + 16], m2 + palignr m2, m1, m0, 2 + movu [r0 + r1 * 2], m2 + palignr m2, m3, m1, 2 + movu [r0 + r1 * 2 + 16], m2 + palignr m2, m1, m0, 3 + movu [r0 + r3], m2 + palignr m2, m3, m1, 3 + movu [r0 + r3 + 16], m2 + + lea r0, [r0 + r1 * 4] + + palignr m2, m1, m0, 4 + movu [r0], m2 + palignr m2, m3, m1, 4 + movu [r0 + 16], m2 + palignr m2, m1, m0, 5 + movu [r0 + r1], m2 + palignr m2, 
m3, m1, 5 + movu [r0 + r1 + 16], m2 + palignr m2, m1, m0, 6 + movu [r0 + r1 * 2], m2 + palignr m2, m3, m1, 6 + movu [r0 + r1 * 2 + 16], m2 + palignr m2, m1, m0, 7 + movu [r0 + r3], m2 + palignr m2, m3, m1, 7 + movu [r0 + r3 + 16], m2 + + lea r0, [r0 + r1 * 4] + + palignr m2, m1, m0, 8 + movu [r0], m2 + palignr m2, m3, m1, 8 + movu [r0 + 16], m2 + palignr m2, m1, m0, 9 + movu [r0 + r1], m2 + palignr m2, m3, m1, 9 + movu [r0 + r1 + 16], m2 + palignr m2, m1, m0, 10 + movu [r0 + r1 * 2], m2 + palignr m2, m3, m1, 10 + movu [r0 + r1 * 2 + 16], m2 + palignr m2, m1, m0, 11 + movu [r0 + r3], m2 + palignr m2, m3, m1, 11 + movu [r0 + r3 + 16], m2 + + lea r0, [r0 + r1 * 4] + + palignr m2, m1, m0, 12 + movu [r0], m2 + palignr m2, m3, m1, 12 + movu [r0 + 16], m2 + palignr m2, m1, m0, 13 + movu [r0 + r1], m2 + palignr m2, m3, m1, 13 + movu [r0 + r1 + 16], m2 + palignr m2, m1, m0, 14 + movu [r0 + r1 * 2], m2 + palignr m2, m3, m1, 14 + movu [r0 + r1 * 2 + 16], m2 + palignr m2, m1, m0, 15 + movu [r0 + r3], m2 + palignr m2, m3, m1, 15 + movu [r0 + r3 + 16], m2 + + lea r0, [r0 + r1 * 4] + + movu [r0], m1 + movu m0, [r2 + 50] + movu [r0 + 16], m3 + palignr m2, m3, m1, 1 + movu [r0 + r1], m2 + palignr m2, m0, m3, 1 + movu [r0 + r1 + 16], m2 + palignr m2, m3, m1, 2 + movu [r0 + r1 * 2], m2 + palignr m2, m0, m3, 2 + movu [r0 + r1 * 2 + 16], m2 + palignr m2, m3, m1, 3 + movu [r0 + r3], m2 + palignr m2, m0, m3, 3 + movu [r0 + r3 + 16], m2 + + lea r0, [r0 + r1 * 4] + + palignr m2, m3, m1, 4 + movu [r0], m2 + palignr m2, m0, m3, 4 + movu [r0 + 16], m2 + palignr m2, m3, m1, 5 + movu [r0 + r1], m2 + palignr m2, m0, m3, 5 + movu [r0 + r1 + 16], m2 + palignr m2, m3, m1, 6 + movu [r0 + r1 * 2], m2 + palignr m2, m0, m3, 6 + movu [r0 + r1 * 2 + 16], m2 + palignr m2, m3, m1, 7 + movu [r0 + r3], m2 + palignr m2, m0, m3, 7 + movu [r0 + r3 + 16], m2 + + lea r0, [r0 + r1 * 4] + + palignr m2, m3, m1, 8 + movu [r0], m2 + palignr m2, m0, m3, 8 + movu [r0 + 16], m2 + palignr m2, m3, m1, 9 + movu [r0 + r1], 
m2 + palignr m2, m0, m3, 9 + movu [r0 + r1 + 16], m2 + palignr m2, m3, m1, 10 + movu [r0 + r1 * 2], m2 + palignr m2, m0, m3, 10 + movu [r0 + r1 * 2 + 16], m2 + palignr m2, m3, m1, 11 + movu [r0 + r3], m2 + palignr m2, m0, m3, 11 + movu [r0 + r3 + 16], m2 + + lea r0, [r0 + r1 * 4] + + palignr m2, m3, m1, 12 + movu [r0], m2 + palignr m2, m0, m3, 12 + movu [r0 + 16], m2 + palignr m2, m3, m1, 13 + movu [r0 + r1], m2 + palignr m2, m0, m3, 13 + movu [r0 + r1 + 16], m2 + palignr m2, m3, m1, 14 + movu [r0 + r1 * 2], m2 + palignr m2, m0, m3, 14 + movu [r0 + r1 * 2 + 16], m2 + palignr m2, m3, m1, 15 + movu [r0 + r3], m2 + palignr m2, m0, m3, 15 + movu [r0 + r3 + 16], m2 + RET + +; Process Intra32x32, input 8x8 in [m0, m1, m2, m3, m4, m5, m6, m7], output 8x8 +%macro PROC32_8x8 10 ; col4, transpose[0/1] c0, c1, c2, c3, c4, c5, c6, c7 + %if %3 == 0 + %else + pshufb m0, [r3] + pmaddubsw m0, [r4 + %3 * 16] + pmulhrsw m0, [pw_1024] + %endif + %if %4 == 0 + pmovzxbw m1, m1 + %else + pshufb m1, [r3] + pmaddubsw m1, [r4 + %4 * 16] + pmulhrsw m1, [pw_1024] + %endif + %if %3 == 0 + packuswb m1, m1 + movlhps m0, m1 + %else + packuswb m0, m1 + %endif + mova m1, [pw_1024] + %if %5 == 0 + %else + pshufb m2, [r3] + pmaddubsw m2, [r4 + %5 * 16] + pmulhrsw m2, m1 + %endif + %if %6 == 0 + pmovzxbw m3, m3 + %else + pshufb m3, [r3] + pmaddubsw m3, [r4 + %6 * 16] + pmulhrsw m3, m1 + %endif + %if %5 == 0 + packuswb m3, m3 + movlhps m2, m3 + %else + packuswb m2, m3 + %endif + %if %7 == 0 + %else + pshufb m4, [r3] + pmaddubsw m4, [r4 + %7 * 16] + pmulhrsw m4, m1 + %endif + %if %8 == 0 + pmovzxbw m5, m5 + %else + pshufb m5, [r3] + pmaddubsw m5, [r4 + %8 * 16] + pmulhrsw m5, m1 + %endif + %if %7 == 0 + packuswb m5, m5 + movlhps m4, m5 + %else + packuswb m4, m5 + %endif + %if %9 == 0 + %else + pshufb m6, [r3] + pmaddubsw m6, [r4 + %9 * 16] + pmulhrsw m6, m1 + %endif + %if %10 == 0 + pmovzxbw m7, m7 + %else + pshufb m7, [r3] + pmaddubsw m7, [r4 + %10 * 16] + pmulhrsw m7, m1 + %endif + %if %9 == 0 + 
packuswb m7, m7 + movlhps m6, m7 + %else + packuswb m6, m7 + %endif + + %if %2 == 1 + ; transpose + punpckhbw m1, m0, m2 + punpcklbw m0, m2 + punpckhbw m3, m0, m1 + punpcklbw m0, m1 + + punpckhbw m1, m4, m6 + punpcklbw m4, m6 + punpckhbw m6, m4, m1 + punpcklbw m4, m1 + + punpckhdq m2, m0, m4 + punpckldq m0, m4 + punpckldq m4, m3, m6 + punpckhdq m3, m6 + + movh [r0 + + %1 * 8], m0 + movhps [r0 + r1 + %1 * 8], m0 + movh [r0 + r1*2 + %1 * 8], m2 + movhps [r0 + r5 + %1 * 8], m2 + movh [r6 + %1 * 8], m4 + movhps [r6 + r1 + %1 * 8], m4 + movh [r6 + r1*2 + %1 * 8], m3 + movhps [r6 + r5 + %1 * 8], m3 + %else + movh [r0 ], m0 + movhps [r0 + r1 ], m0 + movh [r0 + r1 * 2], m2 + movhps [r0 + r5 ], m2 + lea r0, [r0 + r1 * 4] + movh [r0 ], m4 + movhps [r0 + r1 ], m4 + movh [r0 + r1 * 2], m6 + movhps [r0 + r5 ], m6 + %endif +%endmacro + +%macro MODE_3_33 1 + movu m0, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + palignr m1, m0, 1 ; [ x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] + punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] + punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] + palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2] + pmaddubsw m4, m0, [r3 + 10 * 16] ; [26] + pmulhrsw m4, m7 + pmaddubsw m1, [r3 + 4 * 16] ; [20] + pmulhrsw m1, m7 + packuswb m4, m1 + palignr m5, m2, m0, 4 + pmaddubsw m5, [r3 - 2 * 16] ; [14] + pmulhrsw m5, m7 + palignr m6, m2, m0, 6 + pmaddubsw m6, [r3 - 8 * 16] ; [ 8] + pmulhrsw m6, m7 + packuswb m5, m6 + palignr m1, m2, m0, 8 + pmaddubsw m6, m1, [r3 - 14 * 16] ; [ 2] + pmulhrsw m6, m7 + pmaddubsw m1, [r3 + 12 * 16] ; [28] + pmulhrsw m1, m7 + packuswb m6, m1 + palignr m1, m2, m0, 10 + pmaddubsw m1, [r3 + 6 * 16] ; [22] + pmulhrsw m1, m7 + palignr m2, m0, 12 + pmaddubsw m2, [r3] ; [16] + pmulhrsw m2, m7 + packuswb m1, m2 + + TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 + + movu m0, [r2 + 8] + palignr m1, m0, 1 + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + palignr m5, m2, m0, 2 + pmaddubsw m4, m0, [r3 - 6 * 16] ; [10] + 
pmulhrsw m4, m7 + pmaddubsw m1, m5, [r3 - 12 * 16] ; [04] + pmulhrsw m1, m7 + packuswb m4, m1 + pmaddubsw m5, [r3 + 14 * 16] ; [30] + pmulhrsw m5, m7 + palignr m6, m2, m0, 4 + pmaddubsw m6, [r3 + 8 * 16] ; [24] + pmulhrsw m6, m7 + packuswb m5, m6 + palignr m1, m2, m0, 6 + pmaddubsw m6, m1, [r3 + 2 * 16] ; [18] + pmulhrsw m6, m7 + palignr m1, m2, m0, 8 + pmaddubsw m1, [r3 - 4 * 16] ; [12] + pmulhrsw m1, m7 + packuswb m6, m1 + palignr m1, m2, m0, 10 + pmaddubsw m1, [r3 - 10 * 16] ; [06] + pmulhrsw m1, m7 + packuswb m1, m1 + movhps m1, [r2 + 14] ; [00] + + TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1 + + movu m0, [r2 + 14] + palignr m1, m0, 1 + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + palignr m1, m2, m0, 2 + pmaddubsw m4, m0, [r3 + 10 * 16] ; [26] + pmulhrsw m4, m7 + pmaddubsw m1, [r3 + 4 * 16] ; [20] + pmulhrsw m1, m7 + packuswb m4, m1 + palignr m5, m2, m0, 4 + pmaddubsw m5, [r3 - 2 * 16] ; [14] + pmulhrsw m5, m7 + palignr m6, m2, m0, 6 + pmaddubsw m6, [r3 - 8 * 16] ; [ 8] + pmulhrsw m6, m7 + packuswb m5, m6 + palignr m1, m2, m0, 8 + pmaddubsw m6, m1, [r3 - 14 * 16] ; [ 2] + pmulhrsw m6, m7 + pmaddubsw m1, [r3 + 12 * 16] ; [28] + pmulhrsw m1, m7 + packuswb m6, m1 + palignr m1, m2, m0, 10 + pmaddubsw m1, [r3 + 6 * 16] ; [22] + pmulhrsw m1, m7 + palignr m2, m0, 12 + pmaddubsw m2, [r3] ; [16] + pmulhrsw m2, m7 + packuswb m1, m2 + + TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1 + + movu m0, [r2 + 21] + palignr m1, m0, 1 + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + palignr m5, m2, m0, 2 + pmaddubsw m4, m0, [r3 - 6 * 16] ; [10] + pmulhrsw m4, m7 + pmaddubsw m1, m5, [r3 - 12 * 16] ; [04] + pmulhrsw m1, m7 + packuswb m4, m1 + pmaddubsw m5, [r3 + 14 * 16] ; [30] + pmulhrsw m5, m7 + palignr m6, m2, m0, 4 + pmaddubsw m6, [r3 + 8 * 16] ; [24] + pmulhrsw m6, m7 + packuswb m5, m6 + palignr m1, m2, m0, 6 + pmaddubsw m6, m1, [r3 + 2 * 16] ; [18] + pmulhrsw m6, m7 + palignr m1, m2, m0, 8 + pmaddubsw m1, [r3 - 4 * 16] ; [12] + pmulhrsw m1, m7 + packuswb m6, m1 + palignr m1, m2, m0, 10 + 
pmaddubsw m1, [r3 - 10 * 16] ; [06] + pmulhrsw m1, m7 + packuswb m1, m1 + movhps m1, [r2 + 27] ; [00] + + TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1 +%endmacro +;------------------------------------------------------------------------------------------------------------------ +; void intraPredAng32_3(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +;------------------------------------------------------------------------------------------------------------------ +INIT_XMM sse4 +cglobal intra_pred_ang32_3, 3,7,8 + lea r3, [ang_table + 16 * 16] + mov r4d, 4 + lea r5, [r1 * 3] ; r5 -> 3 * stride + lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride + mova m7, [pw_1024] +.loop: + MODE_3_33 1 + lea r0, [r6 + r1 * 4] + lea r6, [r6 + r1 * 8] + add r2, 8 + dec r4 + jnz .loop + RET + +%macro MODE_4_32 1 + movu m0, [r2 + 1] + palignr m1, m0, 1 + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + palignr m1, m2, m0, 2 + mova m5, m1 + pmaddubsw m4, m0, [r3 + 5 * 16] ; [21] + pmulhrsw m4, m7 + pmaddubsw m1, [r3 - 6 * 16] ; [10] + pmulhrsw m1, m7 + packuswb m4, m1 + pmaddubsw m5, [r3 + 15 * 16] ; [31] + pmulhrsw m5, m7 + palignr m6, m2, m0, 4 + pmaddubsw m6, [r3 + 4 * 16] ; [ 20] + pmulhrsw m6, m7 + packuswb m5, m6 + palignr m1, m2, m0, 6 + pmaddubsw m6, m1, [r3 - 7 * 16] ; [ 9] + pmulhrsw m6, m7 + pmaddubsw m1, [r3 + 14 * 16] ; [30] + pmulhrsw m1, m7 + packuswb m6, m1 + palignr m1, m2, m0, 8 + pmaddubsw m1, [r3 + 3 * 16] ; [19] + pmulhrsw m1, m7 + palignr m2, m0, 10 + pmaddubsw m3, m2, [r3 - 8 * 16] ; [8] + pmulhrsw m3, m7 + packuswb m1, m3 + + TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 + + pmaddubsw m4, m2, [r3 + 13 * 16] ; [29] + pmulhrsw m4, m7 + movu m0, [r2 + 6] + palignr m1, m0, 1 + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + palignr m1, m2, m0, 2 + pmaddubsw m1, [r3 + 2 * 16] ; [18] + pmulhrsw m1, m7 + packuswb m4, m1 + palignr m5, m2, m0, 4 + mova m6, m5 + pmaddubsw m5, [r3 - 9 * 16] ; [07] + pmulhrsw m5, m7 + pmaddubsw m6, [r3 + 12 * 16] ; [28] + 
pmulhrsw m6, m7 + packuswb m5, m6 + palignr m6, m2, m0, 6 + pmaddubsw m6, [r3 + 16] ; [17] + pmulhrsw m6, m7 + palignr m1, m2, m0, 8 + pmaddubsw m3, m1, [r3 - 10 * 16] ; [06] + pmulhrsw m3, m7 + packuswb m6, m3 + pmaddubsw m1, [r3 + 11 * 16] ; [27] + pmulhrsw m1, m7 + palignr m2, m0, 10 + pmaddubsw m2, [r3] ; [16] + pmulhrsw m2, m7 + packuswb m1, m2 + + TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1 + + movu m0, [r2 + 12] + palignr m1, m0, 1 + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + mova m1, m0 + pmaddubsw m4, m0, [r3 - 11 * 16] ; [5] + pmulhrsw m4, m7 + pmaddubsw m1, [r3 + 10 * 16] ; [26] + pmulhrsw m1, m7 + packuswb m4, m1 + palignr m5, m2, m0, 2 + pmaddubsw m5, [r3 - 16] ; [15] + pmulhrsw m5, m7 + palignr m6, m2, m0, 4 + mova m1, m6 + pmaddubsw m1, [r3 - 12 * 16] ; [4] + pmulhrsw m1, m7 + packuswb m5, m1 + pmaddubsw m6, [r3 + 9 * 16] ; [25] + pmulhrsw m6, m7 + palignr m1, m2, m0, 6 + pmaddubsw m1, [r3 - 2 * 16] ; [14] + pmulhrsw m1, m7 + packuswb m6, m1 + palignr m1, m2, m0, 8 + mova m2, m1 + pmaddubsw m1, [r3 - 13 * 16] ; [3] + pmulhrsw m1, m7 + pmaddubsw m2, [r3 + 8 * 16] ; [24] + pmulhrsw m2, m7 + packuswb m1, m2 + + TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1 + + movu m0, [r2 + 17] + palignr m1, m0, 1 + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + pmaddubsw m4, m0, [r3 - 3 * 16] ; [13] + pmulhrsw m4, m7 + palignr m5, m2, m0, 2 + pmaddubsw m1, m5, [r3 - 14 * 16] ; [2] + pmulhrsw m1, m7 + packuswb m4, m1 + pmaddubsw m5, [r3 + 7 * 16] ; [23] + pmulhrsw m5, m7 + palignr m6, m2, m0, 4 + pmaddubsw m6, [r3 - 4 * 16] ; [12] + pmulhrsw m6, m7 + packuswb m5, m6 + palignr m6, m2, m0, 6 + mova m1, m6 + pmaddubsw m6, [r3 - 15 * 16] ; [1] + pmulhrsw m6, m7 + pmaddubsw m1, [r3 + 6 * 16] ; [22] + pmulhrsw m1, m7 + packuswb m6, m1 + palignr m1, m2, m0, 8 + pmaddubsw m1, [r3 - 5 * 16] ; [11] + pmulhrsw m1, m7 + packuswb m1, m1 + movhps m1, [r2 + 22] ; [00] + + TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1 +%endmacro 
+;----------------------------------------------------------------------------------------------------------------- +; void intraPredAng32_4(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +;----------------------------------------------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal intra_pred_ang32_4, 3,7,8 + lea r3, [ang_table + 16 * 16] + mov r4d, 4 + lea r5, [r1 * 3] ; r5 -> 3 * stride + lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride + mova m7, [pw_1024] +.loop: + MODE_4_32 1 + lea r0, [r6 + r1 * 4] + lea r6, [r6 + r1 * 8] + add r2, 8 + dec r4 + jnz .loop + RET + +%macro MODE_5_31 1 + movu m0, [r2 + 1] + palignr m1, m0, 1 + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + palignr m1, m2, m0, 2 + mova m5, m1 + pmaddubsw m4, m0, [r3 + 16] ; [17] + pmulhrsw m4, m7 + pmaddubsw m1, [r3 - 14 * 16] ; [2] + pmulhrsw m1, m7 + packuswb m4, m1 + pmaddubsw m5, [r3 + 3 * 16] ; [19] + pmulhrsw m5, m7 + palignr m6, m2, m0, 4 + mova m1, m6 + pmaddubsw m6, [r3 - 12 * 16] ; [4] + pmulhrsw m6, m7 + packuswb m5, m6 + pmaddubsw m6, m1, [r3 + 5 * 16] ; [21] + pmulhrsw m6, m7 + palignr m1, m2, m0, 6 + mova m3, m1 + pmaddubsw m3, [r3 - 10 * 16] ; [6] + pmulhrsw m3, m7 + packuswb m6, m3 + pmaddubsw m1, [r3 + 7 * 16] ; [23] + pmulhrsw m1, m7 + palignr m2, m0, 8 + pmaddubsw m2, [r3 - 8 * 16] ; [8] + pmulhrsw m2, m7 + packuswb m1, m2 + + TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 + + movu m0, [r2 + 5] + palignr m1, m0, 1 + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + palignr m1, m2, m0, 2 + mova m5, m1 + pmaddubsw m4, m0, [r3 + 9 * 16] ; [25] + pmulhrsw m4, m7 + pmaddubsw m1, [r3 - 6 * 16] ; [10] + pmulhrsw m1, m7 + packuswb m4, m1 + pmaddubsw m5, [r3 + 11 * 16] ; [27] + pmulhrsw m5, m7 + palignr m6, m2, m0, 4 + mova m1, m6 + pmaddubsw m6, [r3 - 4 * 16] ; [12] + pmulhrsw m6, m7 + packuswb m5, m6 + pmaddubsw m6, m1, [r3 + 13 * 16] ; [29] + pmulhrsw m6, m7 + palignr m1, m2, m0, 6 + mova m3, m1 + pmaddubsw m3, [r3 - 2 * 
16] ; [14] + pmulhrsw m3, m7 + packuswb m6, m3 + pmaddubsw m1, [r3 + 15 * 16] ; [31] + pmulhrsw m1, m7 + palignr m2, m0, 8 + pmaddubsw m2, [r3] ; [16] + pmulhrsw m2, m7 + packuswb m1, m2 + + TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1 + + movu m0, [r2 + 10] + palignr m1, m0, 1 + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + mova m1, m0 + pmaddubsw m4, m0, [r3 - 15 * 16] ; [1] + pmulhrsw m4, m7 + pmaddubsw m1, [r3 + 2 * 16] ; [18] + pmulhrsw m1, m7 + packuswb m4, m1 + palignr m5, m2, m0, 2 + mova m1, m5 + pmaddubsw m5, [r3 - 13 * 16] ; [3] + pmulhrsw m5, m7 + pmaddubsw m1, [r3 + 4 * 16] ; [20] + pmulhrsw m1, m7 + packuswb m5, m1 + palignr m1, m2, m0, 4 + pmaddubsw m6, m1, [r3 - 11 * 16] ; [5] + pmulhrsw m6, m7 + pmaddubsw m1, [r3 + 6 * 16] ; [22] + pmulhrsw m1, m7 + packuswb m6, m1 + palignr m2, m0, 6 + pmaddubsw m1, m2, [r3 - 9 * 16] ; [7] + pmulhrsw m1, m7 + pmaddubsw m2, [r3 + 8 * 16] ; [24] + pmulhrsw m2, m7 + packuswb m1, m2 + + TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1 + + movu m0, [r2 + 14] + palignr m1, m0, 1 + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + mova m1, m0 + pmaddubsw m4, m0, [r3 - 7 * 16] ; [9] + pmulhrsw m4, m7 + pmaddubsw m1, [r3 + 10 * 16] ; [26] + pmulhrsw m1, m7 + packuswb m4, m1 + palignr m5, m2, m0, 2 + mova m1, m5 + pmaddubsw m5, [r3 - 5 * 16] ; [11] + pmulhrsw m5, m7 + pmaddubsw m1, [r3 + 12 * 16] ; [28] + pmulhrsw m1, m7 + packuswb m5, m1 + palignr m1, m2, m0, 4 + pmaddubsw m6, m1, [r3 - 3 * 16] ; [13] + pmulhrsw m6, m7 + pmaddubsw m1, [r3 + 14 * 16] ; [30] + pmulhrsw m1, m7 + packuswb m6, m1 + palignr m2, m0, 6 + pmaddubsw m1, m2, [r3 - 16] ; [15] + pmulhrsw m1, m7 + packuswb m1, m1 + movhps m1, [r2 + 18] ; [00] + + TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1 +%endmacro +;------------------------------------------------------------------------------------------------------------------ +; void intraPredAng32_5(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) 
+;------------------------------------------------------------------------------------------------------------------ +INIT_XMM sse4 +cglobal intra_pred_ang32_5, 3,7,8 + lea r3, [ang_table + 16 * 16] + mov r4d, 4 + lea r5, [r1 * 3] ; r5 -> 3 * stride + lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride + mova m7, [pw_1024] +.loop: + MODE_5_31 1 + lea r0, [r6 + r1 * 4] + lea r6, [r6 + r1 * 8] + add r2, 8 + dec r4 + jnz .loop + RET + +%macro MODE_6_30 1 + movu m0, [r2 + 1] + palignr m1, m0, 1 + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + mova m1, m0 + pmaddubsw m4, m0, [r3 - 3 * 16] ; [13] + pmulhrsw m4, m7 + pmaddubsw m1, [r3 + 10 * 16] ; [26] + pmulhrsw m1, m7 + packuswb m4, m1 + palignr m6, m2, m0, 2 + pmaddubsw m5, m6, [r3 - 9 * 16] ; [7] + pmulhrsw m5, m7 + pmaddubsw m6, [r3 + 4 * 16] ; [20] + pmulhrsw m6, m7 + packuswb m5, m6 + palignr m1, m2, m0, 4 + pmaddubsw m6, m1, [r3 - 15 * 16] ; [1] + pmulhrsw m6, m7 + pmaddubsw m3, m1, [r3 - 2 * 16] ; [14] + pmulhrsw m3, m7 + packuswb m6, m3 + pmaddubsw m1, [r3 + 11 * 16] ; [27] + pmulhrsw m1, m7 + palignr m2, m0, 6 + pmaddubsw m3, m2, [r3 - 8 * 16] ; [8] + pmulhrsw m3, m7 + packuswb m1, m3 + + TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 + + pmaddubsw m4, m2, [r3 + 5 * 16] ; [21] + pmulhrsw m4, m7 + movu m0, [r2 + 5] + palignr m1, m0, 1 + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + mova m6, m0 + pmaddubsw m1, m6, [r3 - 14 * 16] ; [2] + pmulhrsw m1, m7 + packuswb m4, m1 + pmaddubsw m5, m6, [r3 - 16] ; [15] + pmulhrsw m5, m7 + pmaddubsw m6, [r3 + 12 * 16] ; [28] + pmulhrsw m6, m7 + packuswb m5, m6 + palignr m3, m2, m0, 2 + pmaddubsw m6, m3, [r3 - 7 * 16] ; [9] + pmulhrsw m6, m7 + pmaddubsw m3, [r3 + 6 * 16] ; [22] + pmulhrsw m3, m7 + packuswb m6, m3 + palignr m2, m0, 4 + pmaddubsw m1, m2, [r3 - 13 * 16] ; [3] + pmulhrsw m1, m7 + pmaddubsw m3, m2, [r3] ; [16] + pmulhrsw m3, m7 + packuswb m1, m3 + + TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1 + + pmaddubsw m4, m2, [r3 + 13 * 16] ; [29] + pmulhrsw m4, m7 + movu m0, [r2 + 7] + palignr m1, m0, 1 
+ punpckhbw m2, m0, m1 + punpcklbw m0, m1 + palignr m5, m2, m0, 2 + pmaddubsw m1, m5, [r3 - 6 * 16] ; [10] + pmulhrsw m1, m7 + packuswb m4, m1 + pmaddubsw m5, [r3 + 7 * 16] ; [23] + pmulhrsw m5, m7 + palignr m1, m2, m0, 4 + pmaddubsw m6, m1, [r3 - 12 * 16] ; [4] + pmulhrsw m6, m7 + packuswb m5, m6 + pmaddubsw m6, m1, [r3 + 16] ; [17] + pmulhrsw m6, m7 + pmaddubsw m1, [r3 + 14 * 16] ; [30] + pmulhrsw m1, m7 + packuswb m6, m1 + palignr m2, m2, m0, 6 + pmaddubsw m1, m2, [r3 - 5 * 16] ; [11] + pmulhrsw m1, m7 + pmaddubsw m2, m2, [r3 + 8 * 16] ; [24] + pmulhrsw m2, m7 + packuswb m1, m2 + + TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1 + + movu m0, [r2 + 11] + palignr m1, m0, 1 + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + mova m5, m0 + pmaddubsw m4, m0, [r3 - 11 * 16] ; [5] + pmulhrsw m4, m7 + pmaddubsw m3, m5, [r3 + 2 * 16] ; [18] + pmulhrsw m3, m7 + packuswb m4, m3 + pmaddubsw m5, [r3 + 15 * 16] ; [31] + pmulhrsw m5, m7 + palignr m6, m2, m0, 2 + pmaddubsw m1, m6, [r3 - 4 * 16] ; [12] + pmulhrsw m1, m7 + packuswb m5, m1 + pmaddubsw m6, [r3 + 9 * 16] ; [25] + pmulhrsw m6, m7 + palignr m1, m2, m0, 4 + pmaddubsw m2, m1, [r3 - 10 * 16] ; [6] + pmulhrsw m2, m7 + packuswb m6, m2 + pmaddubsw m1, [r3 + 3 * 16] ; [19] + pmulhrsw m1, m7 + packuswb m1, m1 + movhps m1, [r2 + 14] ; [00] + + TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1 +%endmacro +;------------------------------------------------------------------------------------------------------------------ +; void intraPredAng32_6(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +;------------------------------------------------------------------------------------------------------------------ +INIT_XMM sse4 +cglobal intra_pred_ang32_6, 3,7,8 + lea r3, [ang_table + 16 * 16] + mov r4d, 4 + lea r5, [r1 * 3] ; r5 -> 3 * stride + lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride + mova m7, [pw_1024] +.loop: + MODE_6_30 1 + lea r0, [r6 + r1 * 4] + lea r6, [r6 + r1 * 8] + add r2, 8 + dec r4 + jnz .loop + RET + 
+%macro MODE_7_29 1 + movu m0, [r2 + 1] + palignr m1, m0, 1 + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + mova m5, m0 + pmaddubsw m4, m0, [r3 - 7 * 16] ; [9] + pmulhrsw m4, m7 + pmaddubsw m3, m5, [r3 + 2 * 16] ; [18] + pmulhrsw m3, m7 + packuswb m4, m3 + pmaddubsw m5, [r3 + 11 * 16] ; [27] + pmulhrsw m5, m7 + palignr m1, m2, m0, 2 + palignr m2, m0, 4 + pmaddubsw m6, m1, [r3 - 12 * 16] ; [4] + pmulhrsw m6, m7 + packuswb m5, m6 + pmaddubsw m6, m1, [r3 - 3 * 16] ; [13] + pmulhrsw m6, m7 + pmaddubsw m0, m1, [r3 + 6 * 16] ; [22] + pmulhrsw m0, m7 + packuswb m6, m0 + pmaddubsw m1, [r3 + 15 * 16] ; [31] + pmulhrsw m1, m7 + pmaddubsw m0, m2, [r3 - 8 * 16] ; [8] + pmulhrsw m0, m7 + packuswb m1, m0 + + TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 + + pmaddubsw m4, m2, [r3 + 16] ; [17] + pmulhrsw m4, m7 + pmaddubsw m2, [r3 + 10 * 16] ; [26] + pmulhrsw m2, m7 + packuswb m4, m2 + movu m0, [r2 + 4] + palignr m1, m0, 1 + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + palignr m2, m0, 2 + pmaddubsw m5, m0, [r3 - 13 * 16] ; [03] + pmulhrsw m5, m7 + pmaddubsw m6, m0, [r3 - 4 * 16] ; [12] + pmulhrsw m6, m7 + packuswb m5, m6 + pmaddubsw m6, m0, [r3 + 5 * 16] ; [21] + pmulhrsw m6, m7 + pmaddubsw m0, [r3 + 14 * 16] ; [30] + pmulhrsw m0, m7 + packuswb m6, m0 + pmaddubsw m1, m2, [r3 - 9 * 16] ; [07] + pmulhrsw m1, m7 + pmaddubsw m3, m2, [r3] ; [16] + pmulhrsw m3, m7 + packuswb m1, m3 + + TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1 + + pmaddubsw m4, m2, [r3 + 9 * 16] ; [25] + pmulhrsw m4, m7 + movu m0, [r2 + 6] + palignr m1, m0, 1 + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + palignr m2, m0, 2 + pmaddubsw m1, m0, [r3 - 14 * 16] ; [2] + pmulhrsw m1, m7 + packuswb m4, m1 + pmaddubsw m5, m0, [r3 - 5 * 16] ; [11] + pmulhrsw m5, m7 + pmaddubsw m6, m0, [r3 + 4 * 16] ; [20] + pmulhrsw m6, m7 + packuswb m5, m6 + pmaddubsw m6, m0, [r3 + 13 * 16] ; [29] + pmulhrsw m6, m7 + pmaddubsw m1, m2, [r3 - 10 * 16] ; [6] + pmulhrsw m1, m7 + packuswb m6, m1 + pmaddubsw m1, m2, [r3 - 16] ; [15] + pmulhrsw m1, m7 + pmaddubsw 
m2, m2, [r3 + 8 * 16] ; [24] + pmulhrsw m2, m7 + packuswb m1, m2 + + TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1 + + movu m0, [r2 + 8] + palignr m1, m0, 1 + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + pmaddubsw m4, m0, [r3 - 15 * 16] ; [1] + pmulhrsw m4, m7 + pmaddubsw m3, m0, [r3 - 6 * 16] ; [10] + pmulhrsw m3, m7 + packuswb m4, m3 + pmaddubsw m5, m0, [r3 + 3 * 16] ; [19] + pmulhrsw m5, m7 + pmaddubsw m6, m0, [r3 + 12 * 16] ; [28] + pmulhrsw m6, m7 + packuswb m5, m6 + palignr m2, m0, 2 + pmaddubsw m6, m2, [r3 - 11 * 16] ; [5] + pmulhrsw m6, m7 + pmaddubsw m0, m2, [r3 - 2 * 16] ; [14] + pmulhrsw m0, m7 + packuswb m6, m0 + pmaddubsw m1, m2, [r3 + 7 * 16] ; [23] + pmulhrsw m1, m7 + packuswb m1, m1 + movhps m1, [r2 + 10] ; [0] + + TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1 +%endmacro +;------------------------------------------------------------------------------------------------------------------ +; void intraPredAng32_7(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +;------------------------------------------------------------------------------------------------------------------ +INIT_XMM sse4 +cglobal intra_pred_ang32_7, 3,7,8 + lea r3, [ang_table + 16 * 16] + mov r4d, 4 + lea r5, [r1 * 3] ; r5 -> 3 * stride + lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride + mova m7, [pw_1024] +.loop: + MODE_7_29 1 + lea r0, [r6 + r1 * 4] + lea r6, [r6 + r1 * 8] + add r2, 8 + dec r4 + jnz .loop + RET + +%macro MODE_8_28 1 + movu m0, [r2 + 1] + palignr m1, m0, 1 + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + palignr m2, m0, 2 + pmaddubsw m4, m0, [r3 - 11 * 16] ; [5] + pmulhrsw m4, m7 + pmaddubsw m3, m0, [r3 - 6 * 16] ; [10] + pmulhrsw m3, m7 + packuswb m4, m3 + pmaddubsw m5, m0, [r3 - 1 * 16] ; [15] + pmulhrsw m5, m7 + pmaddubsw m6, m0, [r3 + 4 * 16] ; [20] + pmulhrsw m6, m7 + packuswb m5, m6 + pmaddubsw m6, m0, [r3 + 9 * 16] ; [25] + pmulhrsw m6, m7 + pmaddubsw m0, [r3 + 14 * 16] ; [30] + pmulhrsw m0, m7 + packuswb m6, m0 + pmaddubsw m1, m2, [r3 - 
13 * 16] ; [3] + pmulhrsw m1, m7 + pmaddubsw m0, m2, [r3 - 8 * 16] ; [8] + pmulhrsw m0, m7 + packuswb m1, m0 + + TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 + + pmaddubsw m4, m2, [r3 - 3 * 16] ; [13] + pmulhrsw m4, m7 + pmaddubsw m5, m2, [r3 + 2 * 16] ; [18] + pmulhrsw m5, m7 + packuswb m4, m5 + pmaddubsw m5, m2, [r3 + 7 * 16] ; [23] + pmulhrsw m5, m7 + pmaddubsw m2, [r3 + 12 * 16] ; [28] + pmulhrsw m2, m7 + packuswb m5, m2 + movu m0, [r2 + 3] + palignr m1, m0, 1 + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + pmaddubsw m6, m0, [r3 - 15 * 16] ; [01] + pmulhrsw m6, m7 + pmaddubsw m1, m0, [r3 - 10 * 16] ; [06] + pmulhrsw m1, m7 + packuswb m6, m1 + pmaddubsw m1, m0, [r3 - 5 * 16] ; [11] + pmulhrsw m1, m7 + mova m2, m0 + pmaddubsw m0, [r3] ; [16] + pmulhrsw m0, m7 + packuswb m1, m0 + + TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1 + + pmaddubsw m4, m2, [r3 + 5 * 16] ; [21] + pmulhrsw m4, m7 + pmaddubsw m5, m2, [r3 + 10 * 16] ; [26] + pmulhrsw m5, m7 + packuswb m4, m5 + pmaddubsw m5, m2, [r3 + 15 * 16] ; [31] + pmulhrsw m5, m7 + movu m0, [r2 + 4] + palignr m1, m0, 1 + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + pmaddubsw m2, m0, [r3 - 12 * 16] ; [4] + pmulhrsw m2, m7 + packuswb m5, m2 + pmaddubsw m6, m0, [r3 - 7 * 16] ; [9] + pmulhrsw m6, m7 + pmaddubsw m1, m0, [r3 - 2 * 16] ; [14] + pmulhrsw m1, m7 + packuswb m6, m1 + pmaddubsw m1, m0, [r3 + 3 * 16] ; [19] + pmulhrsw m1, m7 + mova m2, m0 + pmaddubsw m0, [r3 + 8 * 16] ; [24] + pmulhrsw m0, m7 + packuswb m1, m0 + + TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1 + + pmaddubsw m4, m2, [r3 + 13 * 16] ; [29] + pmulhrsw m4, m7 + movu m0, [r2 + 5] + palignr m1, m0, 1 + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + pmaddubsw m1, m0, [r3 - 14 * 16] ; [2] + pmulhrsw m1, m7 + packuswb m4, m1 + pmaddubsw m5, m0, [r3 - 9 * 16] ; [7] + pmulhrsw m5, m7 + pmaddubsw m6, m0, [r3 - 4 * 16] ; [12] + pmulhrsw m6, m7 + packuswb m5, m6 + pmaddubsw m6, m0, [r3 + 16] ; [17] + pmulhrsw m6, m7 + pmaddubsw m1, m0, [r3 + 6 * 16] ; [22] + pmulhrsw m1, m7 + packuswb m6, 
m1 + pmaddubsw m1, m0, [r3 + 11 * 16] ; [27] + pmulhrsw m1, m7 + packuswb m1, m1 + movhps m1, [r2 + 6] ; [00] + + TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1 +%endmacro +;------------------------------------------------------------------------------------------------------------------ +; void intraPredAng32_8(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +;------------------------------------------------------------------------------------------------------------------ +INIT_XMM sse4 +cglobal intra_pred_ang32_8, 3,7,8 + lea r3, [ang_table + 16 * 16] + mov r4d, 4 + lea r5, [r1 * 3] ; r5 -> 3 * stride + lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride + mova m7, [pw_1024] +.loop: + MODE_8_28 1 + lea r0, [r6 + r1 * 4] + lea r6, [r6 + r1 * 8] + add r2, 8 + dec r4 + jnz .loop + RET + +%macro MODE_9_27 1 + movu m2, [r2 + 1] + palignr m1, m2, 1 + punpckhbw m0, m2, m1 + punpcklbw m2, m1 + pmaddubsw m4, m2, [r3 - 14 * 16] ; [2] + pmulhrsw m4, m7 + pmaddubsw m3, m2, [r3 - 12 * 16] ; [4] + pmulhrsw m3, m7 + packuswb m4, m3 + pmaddubsw m5, m2, [r3 - 10 * 16] ; [6] + pmulhrsw m5, m7 + pmaddubsw m6, m2, [r3 - 8 * 16] ; [8] + pmulhrsw m6, m7 + packuswb m5, m6 + pmaddubsw m6, m2, [r3 - 6 * 16] ; [10] + pmulhrsw m6, m7 + pmaddubsw m3, m2, [r3 - 4 * 16] ; [12] + pmulhrsw m3, m7 + packuswb m6, m3 + pmaddubsw m1, m2, [r3 - 2 * 16] ; [14] + pmulhrsw m1, m7 + pmaddubsw m0, m2, [r3] ; [16] + pmulhrsw m0, m7 + packuswb m1, m0 + + TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 + + pmaddubsw m4, m2, [r3 + 2 * 16] ; [18] + pmulhrsw m4, m7 + pmaddubsw m5, m2, [r3 + 4 * 16] ; [20] + pmulhrsw m5, m7 + packuswb m4, m5 + pmaddubsw m5, m2, [r3 + 6 * 16] ; [22] + pmulhrsw m5, m7 + pmaddubsw m6, m2, [r3 + 8 * 16] ; [24] + pmulhrsw m6, m7 + packuswb m5, m6 + pmaddubsw m6, m2, [r3 + 10 * 16] ; [26] + pmulhrsw m6, m7 + pmaddubsw m1, m2, [r3 + 12 * 16] ; [28] + pmulhrsw m1, m7 + packuswb m6, m1 + pmaddubsw m1, m2, [r3 + 14 * 16] ; [30] + pmulhrsw m1, m7 + packuswb m1, m1 + 
movhps m1, [r2 + 2] ; [00] + + TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1 + + movu m2, [r2 + 2] + palignr m1, m2, 1 + punpcklbw m2, m1 + pmaddubsw m4, m2, [r3 - 14 * 16] ; [2] + pmulhrsw m4, m7 + pmaddubsw m3, m2, [r3 - 12 * 16] ; [4] + pmulhrsw m3, m7 + packuswb m4, m3 + pmaddubsw m5, m2, [r3 - 10 * 16] ; [6] + pmulhrsw m5, m7 + pmaddubsw m6, m2, [r3 - 8 * 16] ; [8] + pmulhrsw m6, m7 + packuswb m5, m6 + pmaddubsw m6, m2, [r3 - 6 * 16] ; [10] + pmulhrsw m6, m7 + pmaddubsw m0, m2, [r3 - 4 * 16] ; [12] + pmulhrsw m0, m7 + packuswb m6, m0 + pmaddubsw m1, m2, [r3 - 2 * 16] ; [14] + pmulhrsw m1, m7 + pmaddubsw m0, m2, [r3] ; [16] + pmulhrsw m0, m7 + packuswb m1, m0 + + TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1 + + movu m2, [r2 + 2] + palignr m1, m2, 1 + punpcklbw m2, m1 + pmaddubsw m4, m2, [r3 + 2 * 16] ; [18] + pmulhrsw m4, m7 + pmaddubsw m5, m2, [r3 + 4 * 16] ; [20] + pmulhrsw m5, m7 + packuswb m4, m5 + pmaddubsw m5, m2, [r3 + 6 * 16] ; [22] + pmulhrsw m5, m7 + pmaddubsw m6, m2, [r3 + 8 * 16] ; [24] + pmulhrsw m6, m7 + packuswb m5, m6 + pmaddubsw m6, m2, [r3 + 10 * 16] ; [26] + pmulhrsw m6, m7 + pmaddubsw m1, m2, [r3 + 12 * 16] ; [28] + pmulhrsw m1, m7 + packuswb m6, m1 + pmaddubsw m1, m2, [r3 + 14 * 16] ; [30] + pmulhrsw m1, m7 + packuswb m1, m1 + movhps m1, [r2 + 3] ; [00] + + TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1 +%endmacro +;------------------------------------------------------------------------------------------------------------------ +; void intraPredAng32_9(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +;------------------------------------------------------------------------------------------------------------------ +INIT_XMM sse4 +cglobal intra_pred_ang32_9, 3,7,8 + lea r3, [ang_table + 16 * 16] + mov r4d, 4 + lea r5, [r1 * 3] ; r5 -> 3 * stride + lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride + mova m7, [pw_1024] +.loop: + MODE_9_27 1 + lea r0, [r6 + r1 * 4] + lea r6, [r6 + r1 * 8] + add r2, 8 + dec r4 + jnz .loop 
+ RET + +;------------------------------------------------------------------------------------------------------------------ +; void intraPredAng32_10(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +;------------------------------------------------------------------------------------------------------------------ +INIT_XMM sse4 +cglobal intra_pred_ang32_10, 6,7,8,0-(2*mmsize) +%define m8 [rsp + 0 * mmsize] +%define m9 [rsp + 1 * mmsize] + lea r4, [r1 * 3] + pxor m7, m7 + mov r6, 2 + movu m0, [r3] + movu m1, [r3 + 1] + mova m8, m0 + mova m9, m1 + mov r3d, r5d + +.loop: + movu m0, [r2 + 1] + palignr m1, m0, 1 + pshufb m1, m7 + palignr m2, m0, 2 + pshufb m2, m7 + palignr m3, m0, 3 + pshufb m3, m7 + palignr m4, m0, 4 + pshufb m4, m7 + palignr m5, m0, 5 + pshufb m5, m7 + palignr m6, m0, 6 + pshufb m6, m7 + + movu [r0 + r1], m1 + movu [r0 + r1 + 16], m1 + movu [r0 + r1 * 2], m2 + movu [r0 + r1 * 2 + 16], m2 + movu [r0 + r4], m3 + movu [r0 + r4 + 16], m3 + lea r5, [r0 + r1 * 4] + movu [r5], m4 + movu [r5 + 16], m4 + movu [r5 + r1], m5 + movu [r5 + r1 + 16], m5 + movu [r5 + r1 * 2], m6 + movu [r5 + r1 * 2 + 16], m6 + + palignr m1, m0, 7 + pshufb m1, m7 + movhlps m2, m0 + pshufb m2, m7 + palignr m3, m0, 9 + pshufb m3, m7 + palignr m4, m0, 10 + pshufb m4, m7 + palignr m5, m0, 11 + pshufb m5, m7 + palignr m6, m0, 12 + pshufb m6, m7 + + movu [r5 + r4], m1 + movu [r5 + r4 + 16], m1 + lea r5, [r5 + r1 * 4] + movu [r5], m2 + movu [r5 + 16], m2 + movu [r5 + r1], m3 + movu [r5 + r1 + 16], m3 + movu [r5 + r1 * 2], m4 + movu [r5 + r1 * 2 + 16], m4 + movu [r5 + r4], m5 + movu [r5 + r4 + 16], m5 + lea r5, [r5 + r1 * 4] + movu [r5], m6 + movu [r5 + 16], m6 + + palignr m1, m0, 13 + pshufb m1, m7 + palignr m2, m0, 14 + pshufb m2, m7 + palignr m3, m0, 15 + pshufb m3, m7 + pshufb m0, m7 + + movu [r5 + r1], m1 + movu [r5 + r1 + 16], m1 + movu [r5 + r1 * 2], m2 + movu [r5 + r1 * 2 + 16], m2 + movu [r5 + r4], m3 + movu [r5 + r4 + 16], m3 + +; filter 
+ cmp r3d, byte 0 + jz .quit + movhlps m1, m0 + pmovzxbw m0, m0 + mova m1, m0 + movu m2, m8 + movu m3, m9 + + pshufb m2, m7 + pmovzxbw m2, m2 + movhlps m4, m3 + pmovzxbw m3, m3 + pmovzxbw m4, m4 + psubw m3, m2 + psubw m4, m2 + psraw m3, 1 + psraw m4, 1 + paddw m0, m3 + paddw m1, m4 + packuswb m0, m1 + +.quit: + movu [r0], m0 + movu [r0 + 16], m0 + dec r6 + lea r0, [r5 + r1 * 4] + lea r2, [r2 + 16] + jnz .loop + RET + +;------------------------------------------------------------------------------------------------------------------- +; void intraPredAng32_11(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +;------------------------------------------------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal intra_pred_ang32_11, 4,7,8 + ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line + + mov r6, rsp + sub rsp, 64+gprsize + and rsp, ~63 + mov [rsp+64], r6 + + ; collect reference pixel + movu m0, [r3 + 16] + pxor m1, m1 + pshufb m0, m1 ; [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] + mova [rsp], m0 + movu m0, [r2] + movu m1, [r2 + 16] + movu m2, [r2 + 32] + movu [rsp + 1], m0 + movu [rsp + 1 + 16], m1 + movu [rsp + 1 + 32], m2 + mov [rsp + 63], byte 4 + + ; filter + lea r2, [rsp + 1] ; r2 -> [0] + lea r3, [c_shuf8_0] ; r3 -> shuffle8 + lea r4, [ang_table] ; r4 -> ang_table + lea r5, [r1 * 3] ; r5 -> 3 * stride + lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride + mova m5, [pw_1024] ; m5 -> 1024 + mova m6, [c_deinterval8] ; m6 -> c_deinterval8 + +.loop: + ; Row[0 - 7] + movu m7, [r2] + mova m0, m7 + mova m1, m7 + mova m2, m7 + mova m3, m7 + mova m4, m7 + mova m5, m7 + mova m6, m7 + PROC32_8x8 0, 1, 30,28,26,24,22,20,18,16 + + ; Row[8 - 15] + movu m7, [r2] + mova m0, m7 + mova m1, m7 + mova m2, m7 + mova m3, m7 + mova m4, m7 + mova m5, m7 + mova m6, m7 + PROC32_8x8 1, 1, 14,12,10,8,6,4,2,0 + + ; Row[16 - 23] + movu m7, [r2 - 1] + mova m0, m7 + mova m1, m7 + mova m2, m7 + mova m3, 
m7 + mova m4, m7 + mova m5, m7 + mova m6, m7 + PROC32_8x8 2, 1, 30,28,26,24,22,20,18,16 + + ; Row[24 - 31] + movu m7, [r2 - 1] + mova m0, m7 + mova m1, m7 + mova m2, m7 + mova m3, m7 + mova m4, m7 + mova m5, m7 + mova m6, m7 + PROC32_8x8 3, 1, 14,12,10,8,6,4,2,0 + + lea r0, [r6 + r1 * 4] + lea r6, [r6 + r1 * 8] + add r2, 8 + dec byte [rsp + 63] + jnz .loop + mov rsp, [rsp+64] + RET + +%macro MODE_12_24_ROW0 1 + movu m0, [r3 + 6] + pshufb m0, [c_mode32_12_0] + pinsrb m0, [r3 + 26], 12 + mova above, m0 + movu m2, [r2] + palignr m1, m2, 1 + punpcklbw m2, m1 + pmaddubsw m4, m2, [r4 + 11 * 16] ; [27] + pmulhrsw m4, m7 + pmaddubsw m3, m2, [r4 + 6 * 16] ; [22] + pmulhrsw m3, m7 + packuswb m4, m3 + pmaddubsw m5, m2, [r4 + 16] ; [17] + pmulhrsw m5, m7 + pmaddubsw m6, m2, [r4 - 4 * 16] ; [12] + pmulhrsw m6, m7 + packuswb m5, m6 + pmaddubsw m6, m2, [r4 - 9 * 16] ; [7] + pmulhrsw m6, m7 + pmaddubsw m3, m2, [r4 - 14 * 16] ; [2] + pmulhrsw m3, m7 + packuswb m6, m3 + movu m1, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] + palignr m2, m1, above, 15 ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a] + punpcklbw m2, m1 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a] + pmaddubsw m1, m2, [r4 + 13 * 16] ; [29] + pmulhrsw m1, m7 + pmaddubsw m3, m2, [r4 + 8 * 16] ; [24] + pmulhrsw m3, m7 + packuswb m1, m3 + TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 + pmaddubsw m4, m2, [r4 + 3 * 16] ; [19] + pmulhrsw m4, m7 + pmaddubsw m5, m2, [r4 - 2 * 16] ; [14] + pmulhrsw m5, m7 + packuswb m4, m5 + pmaddubsw m5, m2, [r4 - 7 * 16] ; [09] + pmulhrsw m5, m7 + pmaddubsw m6, m2, [r4 - 12 * 16] ; [04] + pmulhrsw m6, m7 + packuswb m5, m6 + palignr m2, above, 14 ;[6 5 5 4 4 3 3 2 2 1 1 0 0 a a b] + pmaddubsw m6, m2, [r4 + 15 * 16] ; [31] + pmulhrsw m6, m7 + pmaddubsw m1, m2, [r4 + 10 * 16] ; [26] + pmulhrsw m1, m7 + packuswb m6, m1 + pmaddubsw m1, m2, [r4 + 5 * 16] ; [21] + pmulhrsw m1, m7 + pmaddubsw m3, m2, [r4] ; [16] + pmulhrsw m3, m7 + packuswb m1, m3 + TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1 + pmaddubsw m4, m2, [r4 - 5 * 
16] ; [11] + pmulhrsw m4, m7 + pmaddubsw m3, m2, [r4 - 10 * 16] ; [06] + pmulhrsw m3, m7 + packuswb m4, m3 + pmaddubsw m5, m2, [r4 - 15 * 16] ; [1] + pmulhrsw m5, m7 + pslldq m1, above, 1 + palignr m2, m1, 14 + pmaddubsw m6, m2, [r4 + 12 * 16] ; [28] + pmulhrsw m6, m7 + packuswb m5, m6 + pmaddubsw m6, m2, [r4 + 7 * 16] ; [23] + pmulhrsw m6, m7 + pmaddubsw m3, m2, [r4 + 2 * 16] ; [18] + pmulhrsw m3, m7 + packuswb m6, m3 + pmaddubsw m1, m2, [r4 - 3 * 16] ; [13] + pmulhrsw m1, m7 + pmaddubsw m3, m2, [r4 - 8 * 16] ; [8] + pmulhrsw m3, m7 + packuswb m1, m3 + TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1 + pmaddubsw m4, m2, [r4 - 13 * 16] ; [3] + pmulhrsw m4, m7 + pslldq m1, above, 2 + palignr m2, m1, 14 + pmaddubsw m5, m2, [r4 + 14 * 16] ; [30] + pmulhrsw m5, m7 + packuswb m4, m5 + pmaddubsw m5, m2, [r4 + 9 * 16] ; [25] + pmulhrsw m5, m7 + pmaddubsw m6, m2, [r4 + 4 * 16] ; [20] + pmulhrsw m6, m7 + packuswb m5, m6 + pmaddubsw m6, m2, [r4 - 16] ; [15] + pmulhrsw m6, m7 + pmaddubsw m1, m2, [r4 - 6 * 16] ; [10] + pmulhrsw m1, m7 + packuswb m6, m1 + pmaddubsw m1, m2, [r4 - 11 * 16] ; [05] + pmulhrsw m1, m7 + movu m0, [pb_fact0] + pshufb m2, m0 + pmovzxbw m2, m2 + packuswb m1, m2 + TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1 +%endmacro + +%macro MODE_12_24 1 + movu m2, [r2] + palignr m1, m2, 1 + punpckhbw m0, m2, m1 + punpcklbw m2, m1 + palignr m0, m2, 2 + pmaddubsw m4, m0, [r4 + 11 * 16] ; [27] + pmulhrsw m4, m7 + pmaddubsw m3, m0, [r4 + 6 * 16] ; [22] + pmulhrsw m3, m7 + packuswb m4, m3 + pmaddubsw m5, m0, [r4 + 16] ; [17] + pmulhrsw m5, m7 + pmaddubsw m6, m0, [r4 - 4 * 16] ; [12] + pmulhrsw m6, m7 + packuswb m5, m6 + pmaddubsw m6, m0, [r4 - 9 * 16] ; [7] + pmulhrsw m6, m7 + pmaddubsw m3, m0, [r4 - 14 * 16] ; [2] + pmulhrsw m3, m7 + packuswb m6, m3 + pmaddubsw m1, m2, [r4 + 13 * 16] ; [29] + pmulhrsw m1, m7 + pmaddubsw m3, m2, [r4 + 8 * 16] ; [24] + pmulhrsw m3, m7 + packuswb m1, m3 + TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 + pmaddubsw m4, m2, [r4 + 3 * 16] ; [19] + pmulhrsw 
m4, m7 + pmaddubsw m5, m2, [r4 - 2 * 16] ; [14] + pmulhrsw m5, m7 + packuswb m4, m5 + pmaddubsw m5, m2, [r4 - 7 * 16] ; [09] + pmulhrsw m5, m7 + pmaddubsw m6, m2, [r4 - 12 * 16] ; [04] + pmulhrsw m6, m7 + packuswb m5, m6 + movu m0, [r2 - 2] + palignr m1, m0, 1 + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + palignr m2, m0, 2 + pmaddubsw m6, m2, [r4 + 15 * 16] ; [31] + pmulhrsw m6, m7 + pmaddubsw m1, m2, [r4 + 10 * 16] ; [26] + pmulhrsw m1, m7 + packuswb m6, m1 + pmaddubsw m1, m2, [r4 + 5 * 16] ; [21] + pmulhrsw m1, m7 + pmaddubsw m3, m2, [r4] ; [16] + pmulhrsw m3, m7 + packuswb m1, m3 + TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1 + pmaddubsw m4, m2, [r4 - 5 * 16] ; [11] + pmulhrsw m4, m7 + pmaddubsw m3, m2, [r4 - 10 * 16] ; [06] + pmulhrsw m3, m7 + packuswb m4, m3 + pmaddubsw m5, m2, [r4 - 15 * 16] ; [1] + pmulhrsw m5, m7 + movu m0, [r2 - 3] + palignr m1, m0, 1 + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + palignr m2, m0, 2 + pmaddubsw m6, m2, [r4 + 12 * 16] ; [28] + pmulhrsw m6, m7 + packuswb m5, m6 + pmaddubsw m6, m2, [r4 + 7 * 16] ; [23] + pmulhrsw m6, m7 + pmaddubsw m3, m2, [r4 + 2 * 16] ; [18] + pmulhrsw m3, m7 + packuswb m6, m3 + pmaddubsw m1, m2, [r4 - 3 * 16] ; [13] + pmulhrsw m1, m7 + pmaddubsw m3, m2, [r4 - 8 * 16] ; [8] + pmulhrsw m3, m7 + packuswb m1, m3 + TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1 + pmaddubsw m4, m2, [r4 - 13 * 16] ; [3] + pmulhrsw m4, m7 + movu m2, [r2 - 4] + palignr m1, m2, 1 + punpckhbw m0, m2, m1 + punpcklbw m2, m1 + palignr m0, m2, 2 + pmaddubsw m5, m0, [r4 + 14 * 16] ; [30] + pmulhrsw m5, m7 + packuswb m4, m5 + pmaddubsw m5, m0, [r4 + 9 * 16] ; [25] + pmulhrsw m5, m7 + pmaddubsw m6, m0, [r4 + 4 * 16] ; [20] + pmulhrsw m6, m7 + packuswb m5, m6 + pmaddubsw m6, m0, [r4 - 16] ; [15] + pmulhrsw m6, m7 + pmaddubsw m1, m0, [r4 - 6 * 16] ; [10] + pmulhrsw m1, m7 + packuswb m6, m1 + pmaddubsw m1, m0, [r4 - 11 * 16] ; [05] + pmulhrsw m1, m7 + movu m2, [pb_fact0] + pshufb m0, m2 + pmovzxbw m0, m0 + packuswb m1, m0 + TRANSPOSE_STORE_8x8 3, %1, m4, 
m5, m6, m1 +%endmacro +;----------------------------------------------------------------------------------------------------------------- +; void intraPredAng32_12(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +;----------------------------------------------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal intra_pred_ang32_12, 4,7,8,0-(1*mmsize) + %define above [rsp + 0 * mmsize] + + lea r4, [ang_table + 16 * 16] + lea r5, [r1 * 3] ; r5 -> 3 * stride + lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride + mova m7, [pw_1024] + + MODE_12_24_ROW0 1 + lea r0, [r6 + r1 * 4] + lea r6, [r6 + r1 * 8] + add r2, 7 + mov r3, 3 +.loop: + MODE_12_24 1 + lea r0, [r6 + r1 * 4] + lea r6, [r6 + r1 * 8] + add r2, 8 + dec r3 + jnz .loop + RET + +%macro MODE_13_23_ROW0 1 + movu m0, [r3 + 1] + movu m1, [r3 + 15] + pshufb m0, [c_mode32_13_0] + pshufb m1, [c_mode32_13_0] + punpckldq m0, m1 + pshufb m0, [c_mode32_13_shuf] + mova above, m0 + movu m2, [r2] + palignr m1, m2, 1 + punpcklbw m2, m1 + pmaddubsw m4, m2, [r4 + 7 * 16] ; [23] + pmulhrsw m4, m7 + pmaddubsw m3, m2, [r4 - 2 * 16] ; [14] + pmulhrsw m3, m7 + packuswb m4, m3 + pmaddubsw m5, m2, [r4 - 11 * 16] ; [5] + pmulhrsw m5, m7 + movu m1, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] + palignr m2, m1, above, 15 ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a] + punpcklbw m2, m1 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] + pmaddubsw m6, m2, [r4 + 12 * 16] ; [28] + pmulhrsw m6, m7 + packuswb m5, m6 + pmaddubsw m6, m2, [r4 + 3 * 16] ; [19] + pmulhrsw m6, m7 + pmaddubsw m0, m2, [r4 - 6 * 16] ; [10] + pmulhrsw m0, m7 + packuswb m6, m0 + pmaddubsw m1, m2, [r4 - 15 * 16] ; [1] + pmulhrsw m1, m7 + palignr m2, above, 14 + pmaddubsw m3, m2, [r4 + 8 * 16] ; [24] + pmulhrsw m3, m7 + packuswb m1, m3 + TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 + pmaddubsw m4, m2, [r4 - 16] ; [15] + pmulhrsw m4, m7 + pmaddubsw m5, m2, [r4 - 10 * 16] ; [6] + pmulhrsw m5, m7 + packuswb m4, m5 + pslldq m0, 
above, 1 + palignr m2, m0, 14 + pmaddubsw m5, m2, [r4 + 13 * 16] ; [29] + pmulhrsw m5, m7 + pmaddubsw m6, m2, [r4 + 4 * 16] ; [20] + pmulhrsw m6, m7 + packuswb m5, m6 + pmaddubsw m6, m2, [r4 - 5 * 16] ; [11] + pmulhrsw m6, m7 + pmaddubsw m1, m2, [r4 - 14 * 16] ; [2] + pmulhrsw m1, m7 + packuswb m6, m1 + pslldq m0, 1 + palignr m2, m0, 14 + pmaddubsw m1, m2, [r4 + 9 * 16] ; [25] + pmulhrsw m1, m7 + pmaddubsw m0, m2, [r4] ; [16] + pmulhrsw m0, m7 + packuswb m1, m0 + TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1 + pmaddubsw m4, m2, [r4 - 9 * 16] ; [7] + pmulhrsw m4, m7 + pslldq m0, above, 3 + palignr m2, m0, 14 + pmaddubsw m3, m2, [r4 + 14 * 16] ; [30] + pmulhrsw m3, m7 + packuswb m4, m3 + pmaddubsw m5, m2, [r4 + 5 * 16] ; [21] + pmulhrsw m5, m7 + pmaddubsw m6, m2, [r4 - 4 * 16] ; [12] + pmulhrsw m6, m7 + packuswb m5, m6 + pmaddubsw m6, m2, [r4 - 13 * 16] ; [3] + pmulhrsw m6, m7 + pslldq m0, 1 + palignr m2, m0, 14 + pmaddubsw m0, m2, [r4 + 10 * 16] ; [26] + pmulhrsw m0, m7 + packuswb m6, m0 + pmaddubsw m1, m2, [r4 + 16] ; [17] + pmulhrsw m1, m7 + pmaddubsw m0, m2, [r4 - 8 * 16] ; [8] + pmulhrsw m0, m7 + packuswb m1, m0 + TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1 + pslldq m0, above, 5 + palignr m2, m0, 14 + pmaddubsw m4, m2, [r4 + 15 * 16] ; [31] + pmulhrsw m4, m7 + pmaddubsw m5, m2, [r4 + 6 * 16] ; [22] + pmulhrsw m5, m7 + packuswb m4, m5 + pmaddubsw m5, m2, [r4 - 3 * 16] ; [13] + pmulhrsw m5, m7 + pmaddubsw m6, m2, [r4 - 12 * 16] ; [04] + pmulhrsw m6, m7 + packuswb m5, m6 + pslldq m0, 1 + palignr m2, m0, 14 + pmaddubsw m6, m2, [r4 + 11 * 16] ; [27] + pmulhrsw m6, m7 + pmaddubsw m1, m2, [r4 + 2 * 16] ; [18] + pmulhrsw m1, m7 + packuswb m6, m1 + pmaddubsw m1, m2, [r4 - 7 * 16] ; [09] + pmulhrsw m1, m7 + pmaddubsw m3, m2, [r4 - 16 * 16] ; [00] + pmulhrsw m3, m7 + packuswb m1, m3 + TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1 +%endmacro + +%macro MODE_13_23 1 + movu m2, [r2] ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + palignr m1, m2, 1 ; [x ,15, 14, 13, 12, 11, 
10, 9, 8, 7, 6, 5, 4, 3, 2, 1] + punpckhbw m0, m2, m1 ; [x, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8] + punpcklbw m2, m1 ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0] + palignr m0, m2, 2 ; [9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1] + pmaddubsw m4, m0, [r4 + 7 * 16] ; [23] + pmulhrsw m4, m7 + pmaddubsw m3, m0, [r4 - 2 * 16] ; [14] + pmulhrsw m3, m7 + packuswb m4, m3 + pmaddubsw m5, m0, [r4 - 11 * 16] ; [05] + pmulhrsw m5, m7 + pmaddubsw m6, m2, [r4 + 12 * 16] ; [28] + pmulhrsw m6, m7 + packuswb m5, m6 + pmaddubsw m6, m2, [r4 + 3 * 16] ; [19] + pmulhrsw m6, m7 + pmaddubsw m3, m2, [r4 - 6 * 16] ; [10] + pmulhrsw m3, m7 + packuswb m6, m3 + pmaddubsw m1, m2, [r4 - 15 * 16] ; [1] + pmulhrsw m1, m7 + movu m2, [r2 - 2] ; [14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -1] + palignr m3, m2, 1 ; [x, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + punpckhbw m0, m2, m3 + punpcklbw m2, m3 + palignr m0, m2, 2 + pmaddubsw m3, m0, [r4 + 8 * 16] ; [24] + pmulhrsw m3, m7 + packuswb m1, m3 + mova m3, m0 + TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 + pmaddubsw m4, m3, [r4 - 16] ; [15] + pmulhrsw m4, m7 + pmaddubsw m5, m3, [r4 - 10 * 16] ; [6] + pmulhrsw m5, m7 + packuswb m4, m5 + pmaddubsw m5, m2, [r4 + 13 * 16] ; [29] + pmulhrsw m5, m7 + pmaddubsw m6, m2, [r4 + 4 * 16] ; [20] + pmulhrsw m6, m7 + packuswb m5, m6 + pmaddubsw m6, m2, [r4 - 5 * 16] ; [11] + pmulhrsw m6, m7 + pmaddubsw m1, m2, [r4 - 14 * 16] ; [2] + pmulhrsw m1, m7 + packuswb m6, m1 + movu m2, [r2 - 4] ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + palignr m1, m2, 1 ; [x ,15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1] + punpckhbw m0, m2, m1 ; [x, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8] + punpcklbw m2, m1 ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0] + palignr m0, m2, 2 ; [9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1] + pmaddubsw m1, m0, [r4 + 9 * 16] ; [25] + pmulhrsw m1, m7 + pmaddubsw m3, m0, [r4] ; [16] + pmulhrsw m3, m7 + packuswb m1, m3 + mova 
m3, m0 + TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1 + pmaddubsw m4, m3, [r4 - 9 * 16] ; [7] + pmulhrsw m4, m7 + pmaddubsw m3, m2, [r4 + 14 * 16] ; [30] + pmulhrsw m3, m7 + packuswb m4, m3 + pmaddubsw m5, m2, [r4 + 5 * 16] ; [21] + pmulhrsw m5, m7 + pmaddubsw m6, m2, [r4 - 4 * 16] ; [12] + pmulhrsw m6, m7 + packuswb m5, m6 + pmaddubsw m6, m2, [r4 - 13 * 16] ; [3] + pmulhrsw m6, m7 + movu m2, [r2 - 6] ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + palignr m1, m2, 1 ; [x ,15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1] + punpckhbw m0, m2, m1 ; [x, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8] + punpcklbw m2, m1 ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0] + palignr m0, m2, 2 ; [9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1] + pmaddubsw m3, m0, [r4 + 10 * 16] ; [26] + pmulhrsw m3, m7 + packuswb m6, m3 + pmaddubsw m1, m0, [r4 + 16] ; [17] + pmulhrsw m1, m7 + pmaddubsw m3, m0, [r4 - 8 * 16] ; [8] + pmulhrsw m3, m7 + packuswb m1, m3 + TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1 + pmaddubsw m4, m2, [r4 + 15 * 16] ; [31] + pmulhrsw m4, m7 + pmaddubsw m5, m2, [r4 + 6 * 16] ; [22] + pmulhrsw m5, m7 + packuswb m4, m5 + pmaddubsw m5, m2, [r4 - 3 * 16] ; [13] + pmulhrsw m5, m7 + pmaddubsw m6, m2, [r4 - 12 * 16] ; [04] + pmulhrsw m6, m7 + packuswb m5, m6 + movu m2, [r2 - 7] ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + palignr m1, m2, 1 ; [x ,15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1] + punpcklbw m2, m1 ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0] + pmaddubsw m6, m2, [r4 + 11 * 16] ; [27] + pmulhrsw m6, m7 + pmaddubsw m1, m2, [r4 + 2 * 16] ; [18] + pmulhrsw m1, m7 + packuswb m6, m1 + pmaddubsw m1, m2, [r4 - 7 * 16] ; [09] + pmulhrsw m1, m7 + movu m0, [pb_fact0] + pshufb m2, m0 + pmovzxbw m2, m2 + packuswb m1, m2 + TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1 +%endmacro +;----------------------------------------------------------------------------------------------------------------- +; void intraPredAng32_13(pixel* dst, 
intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +;----------------------------------------------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal intra_pred_ang32_13, 4,7,8,0-(1*mmsize) +%define above [rsp + 0 * mmsize] + lea r4, [ang_table + 16 * 16] + lea r5, [r1 * 3] ; r5 -> 3 * stride + lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride + mova m7, [pw_1024] + + MODE_13_23_ROW0 1 + lea r0, [r6 + r1 * 4] + lea r6, [r6 + r1 * 8] + add r2, 7 + mov r3, 3 +.loop: + MODE_13_23 1 + lea r0, [r6 + r1 * 4] + lea r6, [r6 + r1 * 8] + add r2, 8 + dec r3 + jnz .loop + RET + +;------------------------------------------------------------------------------------------------------------------- +; void intraPredAng32_14(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +;------------------------------------------------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal intra_pred_ang32_14, 4,7,8 + ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line + mov r6, rsp + sub rsp, 64+gprsize + and rsp, ~63 + mov [rsp+64], r6 + + ; collect reference pixel + movu m0, [r3] + movu m1, [r3 + 15] + pshufb m0, [c_mode32_14_0] ; [x x x x x x x x x 0 2 5 7 10 12 15] + pshufb m1, [c_mode32_14_0] ; [x x x x x x x x x 15 17 20 22 25 27 30] + pslldq m1, 10 ; [17 20 22 25 27 30 x x x x x x x x x x x] + palignr m0, m1, 10 ; [x x x 0 2 5 7 10 12 15 17 20 22 25 27 30] + mova [rsp], m0 + movu m0, [r2 + 1] + movu m1, [r2 + 1 + 16] + movu [rsp + 13], m0 + movu [rsp + 13 + 16], m1 + mov [rsp + 63], byte 4 + + ; filter + lea r2, [rsp + 13] ; r2 -> [0] + lea r3, [c_shuf8_0] ; r3 -> shuffle8 + lea r4, [ang_table] ; r4 -> ang_table + lea r5, [r1 * 3] ; r5 -> 3 * stride + lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride + mova m5, [pw_1024] ; m5 -> 1024 + mova m6, [c_deinterval8] ; m6 -> c_deinterval8 + +.loop: + ; Row[0 - 7] + movu m7, [r2 - 4] + palignr 
m0, m7, 3 + mova m1, m0 + palignr m2, m7, 2 + mova m3, m2 + palignr m4, m7, 1 + mova m5, m4 + mova m6, m4 + PROC32_8x8 0, 1, 19,6,25,12,31,18,5,24 + + ; Row[8 - 15] + movu m7, [r2 - 7] + palignr m0, m7, 3 + palignr m1, m7, 2 + mova m2, m1 + mova m3, m1 + palignr m4, m7, 1 + mova m5, m4 + mova m6, m7 + PROC32_8x8 1, 1, 11,30,17,4,23,10,29,16 + + ; Row[16 - 23] + movu m7, [r2 - 10] + palignr m0, m7, 3 + palignr m1, m7, 2 + mova m2, m1 + palignr m3, m7, 1 + mova m4, m3 + mova m5, m3 + mova m6, m7 + PROC32_8x8 2, 1, 3,22,9,28,15,2,21,8 + + ; Row[24 - 31] + movu m7, [r2 - 13] + palignr m0, m7, 2 + mova m1, m0 + mova m2, m0 + palignr m3, m7, 1 + mova m4, m3 + mova m5, m7 + mova m6, m7 + PROC32_8x8 3, 1, 27,14,1,20,7,26,13,0 + + lea r0, [r6 + r1 * 4] + lea r6, [r6 + r1 * 8] + add r2, 8 + dec byte [rsp + 63] + jnz .loop + mov rsp, [rsp+64] + RET + +;------------------------------------------------------------------------------------------------------------------- +; void intraPredAng32_15(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +;------------------------------------------------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal intra_pred_ang32_15, 4,7,8 + ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line + mov r6, rsp + sub rsp, 64+gprsize + and rsp, ~63 + mov [rsp+64], r6 + + ; collect reference pixel + movu m0, [r3] + movu m1, [r3 + 15] + pshufb m0, [c_mode32_15_0] ; [x x x x x x x 0 2 4 6 8 9 11 13 15] + pshufb m1, [c_mode32_15_0] ; [x x x x x x x 15 17 19 21 23 24 26 28 30] + mova [rsp], m1 + movu [rsp + 8], m0 + movu m0, [r2 + 1] + movu m1, [r2 + 1 + 16] + movu [rsp + 17], m0 + movu [rsp + 17 + 16], m1 + mov [rsp + 63], byte 4 + + ; filter + lea r2, [rsp + 17] ; r2 -> [0] + lea r3, [c_shuf8_0] ; r3 -> shuffle8 + lea r4, [ang_table] ; r4 -> ang_table + lea r5, [r1 * 3] ; r5 -> 3 * stride + lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride + mova m5, [pw_1024] ; 
m5 -> 1024 + mova m6, [c_deinterval8] ; m6 -> c_deinterval8 + +.loop: + ; Row[0 - 7] + movu m7, [r2 - 5] + palignr m0, m7, 4 + palignr m1, m7, 3 + mova m2, m1 + palignr m3, m7, 2 + mova m4, m3 + palignr m5, m7, 1 + mova m6, m5 + PROC32_8x8 0, 1, 15,30,13,28,11,26,9,24 + + ; Row[8 - 15] + movu m7, [r2 - 9] + palignr m0, m7, 4 + palignr m1, m7, 3 + mova m2, m1 + palignr m3, m7, 2 + mova m4, m3 + palignr m5, m7, 1 + mova m6, m5 + PROC32_8x8 1, 1, 7,22,5,20,3,18,1,16 + + ; Row[16 - 23] + movu m7, [r2 - 13] + palignr m0, m7, 3 + mova m1, m0 + palignr m2, m7, 2 + mova m3, m2 + palignr m4, m7, 1 + mova m5, m4 + mova m6, m7 + PROC32_8x8 2, 1, 31,14,29,12,27,10,25,8 + + ; Row[24 - 31] + movu m7, [r2 - 17] + palignr m0, m7, 3 + mova m1, m0 + palignr m2, m7, 2 + mova m3, m2 + palignr m4, m7, 1 + mova m5, m4 + mova m6, m7 + PROC32_8x8 3, 1, 23,6,21,4,19,2,17,0 + + lea r0, [r6 + r1 * 4] + lea r6, [r6 + r1 * 8] + add r2, 8 + dec byte [rsp + 63] + jnz .loop + mov rsp, [rsp+64] + RET + +;------------------------------------------------------------------------------------------------------------------- +; void intraPredAng32_16(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +;------------------------------------------------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal intra_pred_ang32_16, 4,7,8 + ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line + mov r6, rsp + sub rsp, 64+gprsize + and rsp, ~63 + mov [rsp+64], r6 + + ; collect reference pixel + movu m0, [r3] + movu m1, [r3 + 15] + pshufb m0, [c_mode32_16_0] ; [x x x x x 0 2 3 5 6 8 9 11 12 14 15] + pshufb m1, [c_mode32_16_0] ; [x x x x x 15 17 18 20 21 23 24 26 27 29 30] + mova [rsp], m1 + movu [rsp + 10], m0 + movu m0, [r2 + 1] + movu m1, [r2 + 1 + 16] + movu [rsp + 21], m0 + movu [rsp + 21 + 16], m1 + mov [rsp + 63], byte 4 + + ; filter + lea r2, [rsp + 21] ; r2 -> [0] + lea r3, [c_shuf8_0] ; r3 -> shuffle8 + lea 
r4, [ang_table] ; r4 -> ang_table + lea r5, [r1 * 3] ; r5 -> 3 * stride + lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride + mova m5, [pw_1024] ; m5 -> 1024 + mova m6, [c_deinterval8] ; m6 -> c_deinterval8 + +.loop: + ; Row[0 - 7] + movu m7, [r2 - 6] + palignr m0, m7, 5 + palignr m1, m7, 4 + mova m2, m1 + palignr m3, m7, 3 + palignr m4, m7, 2 + mova m5, m4 + palignr m6, m7, 1 + PROC32_8x8 0, 1, 11,22,1,12,23,2,13,24 + + ; Row[8 - 15] + movu m7, [r2 - 11] + palignr m0, m7, 5 + palignr m1, m7, 4 + palignr m2, m7, 3 + mova m3, m2 + palignr m4, m7, 2 + palignr m5, m7, 1 + mova m6, m5 + PROC32_8x8 1, 1, 3,14,25,4,15,26,5,16 + + ; Row[16 - 23] + movu m7, [r2 - 16] + palignr m0, m7, 4 + mova m1, m0 + palignr m2, m7, 3 + palignr m3, m7, 2 + mova m4, m3 + palignr m5, m7, 1 + mova m6, m7 + PROC32_8x8 2, 1, 27,6,17,28,7,18,29,8 + + ; Row[24 - 31] + movu m7, [r2 - 21] + palignr m0, m7, 4 + palignr m1, m7, 3 + mova m2, m1 + palignr m3, m7, 2 + palignr m4, m7, 1 + mova m5, m4 + mova m6, m7 + PROC32_8x8 3, 1, 19,30,9,20,31,10,21,0 + + lea r0, [r6 + r1 * 4] + lea r6, [r6 + r1 * 8] + add r2, 8 + dec byte [rsp + 63] + jnz .loop + mov rsp, [rsp+64] + RET + +;------------------------------------------------------------------------------------------------------------------ +; void intraPredAng32_17(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +;------------------------------------------------------------------------------------------------------------------ +INIT_XMM sse4 +cglobal intra_pred_ang32_17, 4,7,8 + ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line + mov r6, rsp + sub rsp, 64+gprsize + and rsp, ~63 + mov [rsp+64], r6 + + ; collect reference pixel + movu m0, [r3] + movu m1, [r3 + 16] + pshufb m0, [c_mode32_17_0] + pshufb m1, [c_mode32_17_0] + mova [rsp ], m1 + movu [rsp + 13], m0 + movu m0, [r2 + 1] + movu m1, [r2 + 1 + 16] + movu [rsp + 26], m0 + movu [rsp + 26 + 16], m1 + mov [rsp + 63], byte 4 + + ; filter + 
lea r2, [rsp + 25] ; r2 -> [0] + lea r3, [c_shuf8_0] ; r3 -> shuffle8 + lea r4, [ang_table] ; r4 -> ang_table + lea r5, [r1 * 3] ; r5 -> 3 * stride + lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride + mova m5, [pw_1024] ; m5 -> 1024 + mova m6, [c_deinterval8] ; m6 -> c_deinterval8 + +.loop: + ; Row[0 - 7] + movu m7, [r2 - 6] + palignr m0, m7, 6 + palignr m1, m7, 5 + palignr m2, m7, 4 + palignr m3, m7, 3 + palignr m4, m7, 2 + mova m5, m4 + palignr m6, m7, 1 + PROC32_8x8 0, 1, 6,12,18,24,30,4,10,16 + + ; Row[7 - 15] + movu m7, [r2 - 12] + palignr m0, m7, 5 + palignr m1, m7, 4 + mova m2, m1 + palignr m3, m7, 3 + palignr m4, m7, 2 + palignr m5, m7, 1 + mova m6, m7 + PROC32_8x8 1, 1, 22,28,2,8,14,20,26,0 + + ; Row[16 - 23] + movu m7, [r2 - 19] + palignr m0, m7, 6 + palignr m1, m7, 5 + palignr m2, m7, 4 + palignr m3, m7, 3 + palignr m4, m7, 2 + mova m5, m4 + palignr m6, m7, 1 + PROC32_8x8 2, 1, 6,12,18,24,30,4,10,16 + + ; Row[24 - 31] + movu m7, [r2 - 25] + palignr m0, m7, 5 + palignr m1, m7, 4 + mova m2, m1 + palignr m3, m7, 3 + palignr m4, m7, 2 + palignr m5, m7, 1 + mova m6, m7 + PROC32_8x8 3, 1, 22,28,2,8,14,20,26,0 + + lea r0, [r6 + r1 * 4] + lea r6, [r6 + r1 * 8] + add r2, 8 + dec byte [rsp + 63] + jnz .loop + mov rsp, [rsp+64] + + RET + +;------------------------------------------------------------------------------------------------------------------- +; void intraPredAng32_18(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +;------------------------------------------------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal intra_pred_ang32_18, 4,5,5 + movu m0, [r3] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] + movu m1, [r3 + 16] ; [31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16] + movu m2, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + movu m3, [r2 + 17] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17] + + lea r2, [r1 * 2] + lea r3, [r1 * 3] + lea r4, [r1 * 4] + + movu [r0], m0 + 
movu [r0 + 16], m1 + + pshufb m2, [c_mode32_18_0] ; [1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] + pshufb m3, [c_mode32_18_0] ; [17 18 19 20 21 22 23 24 25 26 27 28 19 30 31 32] + + palignr m4, m0, m2, 15 + movu [r0 + r1], m4 + palignr m4, m1, m0, 15 + movu [r0 + r1 + 16], m4 + palignr m4, m0, m2, 14 + movu [r0 + r2], m4 + palignr m4, m1, m0, 14 + movu [r0 + r2 + 16], m4 + palignr m4, m0, m2, 13 + movu [r0 + r3], m4 + palignr m4, m1, m0, 13 + movu [r0 + r3 + 16], m4 + + lea r0, [r0 + r4] + + palignr m4, m0, m2, 12 + movu [r0], m4 + palignr m4, m1, m0, 12 + movu [r0 + 16], m4 + palignr m4, m0, m2, 11 + movu [r0 + r1], m4 + palignr m4, m1, m0, 11 + movu [r0 + r1 + 16], m4 + palignr m4, m0, m2, 10 + movu [r0 + r2], m4 + palignr m4, m1, m0, 10 + movu [r0 + r2 + 16], m4 + palignr m4, m0, m2, 9 + movu [r0 + r3], m4 + palignr m4, m1, m0, 9 + movu [r0 + r3 + 16], m4 + + lea r0, [r0 + r4] + + palignr m4, m0, m2, 8 + movu [r0], m4 + palignr m4, m1, m0, 8 + movu [r0 + 16], m4 + palignr m4, m0, m2, 7 + movu [r0 + r1], m4 + palignr m4, m1, m0, 7 + movu [r0 + r1 + 16], m4 + palignr m4, m0, m2, 6 + movu [r0 + r2], m4 + palignr m4, m1, m0, 6 + movu [r0 + r2 + 16], m4 + palignr m4, m0, m2, 5 + movu [r0 + r3], m4 + palignr m4, m1, m0, 5 + movu [r0 + r3 + 16], m4 + + lea r0, [r0 + r4] + + palignr m4, m0, m2, 4 + movu [r0], m4 + palignr m4, m1, m0, 4 + movu [r0 + 16], m4 + palignr m4, m0, m2, 3 + movu [r0 + r1], m4 + palignr m4, m1, m0, 3 + movu [r0 + r1 + 16], m4 + palignr m4, m0, m2, 2 + movu [r0 + r2], m4 + palignr m4, m1, m0, 2 + movu [r0 + r2 + 16], m4 + palignr m4, m0, m2, 1 + movu [r0 + r3], m4 + palignr m4, m1, m0, 1 + movu [r0 + r3 + 16], m4 + + lea r0, [r0 + r4] + + movu [r0], m2 + movu [r0 + 16], m0 + palignr m4, m2, m3, 15 + movu [r0 + r1], m4 + palignr m4, m0, m2, 15 + movu [r0 + r1 + 16], m4 + palignr m4, m2, m3, 14 + movu [r0 + r2], m4 + palignr m4, m0, m2, 14 + movu [r0 + r2 + 16], m4 + palignr m4, m2, m3, 13 + movu [r0 + r3], m4 + palignr m4, m0, m2, 13 + movu [r0 + r3 + 
16], m4 + + lea r0, [r0 + r4] + + palignr m4, m2, m3, 12 + movu [r0], m4 + palignr m4, m0, m2, 12 + movu [r0 + 16], m4 + palignr m4, m2, m3, 11 + movu [r0 + r1], m4 + palignr m4, m0, m2, 11 + movu [r0 + r1 + 16], m4 + palignr m4, m2, m3, 10 + movu [r0 + r2], m4 + palignr m4, m0, m2, 10 + movu [r0 + r2 + 16], m4 + palignr m4, m2, m3, 9 + movu [r0 + r3], m4 + palignr m4, m0, m2, 9 + movu [r0 + r3 + 16], m4 + + lea r0, [r0 + r4] + + palignr m4, m2, m3, 8 + movu [r0], m4 + palignr m4, m0, m2, 8 + movu [r0 + 16], m4 + palignr m4, m2, m3, 7 + movu [r0 + r1], m4 + palignr m4, m0, m2, 7 + movu [r0 + r1 + 16], m4 + palignr m4, m2, m3, 6 + movu [r0 + r2], m4 + palignr m4, m0, m2, 6 + movu [r0 + r2 + 16], m4 + palignr m4, m2, m3, 5 + movu [r0 + r3], m4 + palignr m4, m0, m2, 5 + movu [r0 + r3 + 16], m4 + + lea r0, [r0 + r4] + + palignr m4, m2, m3, 4 + movu [r0], m4 + palignr m4, m0, m2, 4 + movu [r0 + 16], m4 + palignr m4, m2, m3, 3 + movu [r0 + r1], m4 + palignr m4, m0, m2, 3 + movu [r0 + r1 + 16], m4 + palignr m4, m2, m3, 2 + movu [r0 + r2], m4 + palignr m4, m0, m2, 2 + movu [r0 + r2 + 16], m4 + palignr m4, m2, m3, 1 + movu [r0 + r3], m4 + palignr m4, m0, m2, 1 + movu [r0 + r3 + 16], m4 + RET + +;------------------------------------------------------------------------------------------------------------------ +; void intraPredAng32_19(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +;------------------------------------------------------------------------------------------------------------------ +INIT_XMM sse4 +cglobal intra_pred_ang32_19, 4,7,8 + ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line + xchg r2, r3 + mov r6, rsp + sub rsp, 64+gprsize + and rsp, ~63 + mov [rsp+64], r6 + + ; collect reference pixel + movu m0, [r3] + movu m1, [r3 + 16] + pshufb m0, [c_mode32_17_0] + pshufb m1, [c_mode32_17_0] + mova [rsp ], m1 + movu [rsp + 13], m0 + movu m0, [r2 + 1] + movu m1, [r2 + 1 + 16] + movu [rsp + 26], 
m0 + movu [rsp + 26 + 16], m1 + mov [rsp + 63], byte 4 + + ; filter + lea r2, [rsp + 25] ; r2 -> [0] + lea r3, [c_shuf8_0] ; r3 -> shuffle8 + lea r4, [ang_table] ; r4 -> ang_table + lea r5, [r1 * 3] ; r5 -> 3 * stride + lea r6, [r0] ; r6 -> r0 + mova m5, [pw_1024] ; m5 -> 1024 + mova m6, [c_deinterval8] ; m6 -> c_deinterval8 + +.loop: + ; Row[0 - 7] + movu m7, [r2 - 6] + palignr m0, m7, 6 + palignr m1, m7, 5 + palignr m2, m7, 4 + palignr m3, m7, 3 + palignr m4, m7, 2 + mova m5, m4 + palignr m6, m7, 1 + PROC32_8x8 0, 0, 6,12,18,24,30,4,10,16 + + ; Row[7 - 15] + movu m7, [r2 - 12] + palignr m0, m7, 5 + palignr m1, m7, 4 + mova m2, m1 + palignr m3, m7, 3 + palignr m4, m7, 2 + palignr m5, m7, 1 + mova m6, m7 + lea r0, [r0 + r1 * 4] + PROC32_8x8 1, 0, 22,28,2,8,14,20,26,0 + + ; Row[16 - 23] + movu m7, [r2 - 19] + palignr m0, m7, 6 + palignr m1, m7, 5 + palignr m2, m7, 4 + palignr m3, m7, 3 + palignr m4, m7, 2 + mova m5, m4 + palignr m6, m7, 1 + lea r0, [r0 + r1 * 4] + PROC32_8x8 2, 0, 6,12,18,24,30,4,10,16 + + ; Row[24 - 31] + movu m7, [r2 - 25] + palignr m0, m7, 5 + palignr m1, m7, 4 + mova m2, m1 + palignr m3, m7, 3 + palignr m4, m7, 2 + palignr m5, m7, 1 + mova m6, m7 + lea r0, [r0 + r1 * 4] + PROC32_8x8 3, 0, 22,28,2,8,14,20,26,0 + + add r6, 8 + mov r0, r6 + add r2, 8 + dec byte [rsp + 63] + jnz .loop + mov rsp, [rsp+64] + RET + +;------------------------------------------------------------------------------------------------------------------- +; void intraPredAng32_20(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +;------------------------------------------------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal intra_pred_ang32_20, 4,7,8 + ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line + xchg r2, r3 + mov r6, rsp + sub rsp, 64+gprsize + and rsp, ~63 + mov [rsp+64], r6 + + ; collect reference pixel + movu m0, [r3] + movu m1, [r3 + 15] + pshufb m0, 
[c_mode32_16_0] ; [x x x x x 0 2 3 5 6 8 9 11 12 14 15] + pshufb m1, [c_mode32_16_0] ; [x x x x x 15 17 18 20 21 23 24 26 27 29 30] + mova [rsp], m1 + movu [rsp + 10], m0 + movu m0, [r2 + 1] + movu m1, [r2 + 1 + 16] + movu [rsp + 21], m0 + movu [rsp + 21 + 16], m1 + mov [rsp + 63], byte 4 + + ; filter + lea r2, [rsp + 21] ; r2 -> [0] + lea r3, [c_shuf8_0] ; r3 -> shuffle8 + lea r4, [ang_table] ; r4 -> ang_table + lea r5, [r1 * 3] ; r5 -> 3 * stride + lea r6, [r0] ; r6 -> r0 + mova m5, [pw_1024] ; m5 -> 1024 + mova m6, [c_deinterval8] ; m6 -> c_deinterval8 + +.loop: + ; Row[0 - 7] + movu m7, [r2 - 6] + palignr m0, m7, 5 + palignr m1, m7, 4 + mova m2, m1 + palignr m3, m7, 3 + palignr m4, m7, 2 + mova m5, m4 + palignr m6, m7, 1 + PROC32_8x8 0, 0, 11,22,1,12,23,2,13,24 + + ; Row[8 - 15] + movu m7, [r2 - 11] + palignr m0, m7, 5 + palignr m1, m7, 4 + palignr m2, m7, 3 + mova m3, m2 + palignr m4, m7, 2 + palignr m5, m7, 1 + mova m6, m5 + lea r0, [r0 + r1 * 4] + PROC32_8x8 1, 0, 3,14,25,4,15,26,5,16 + + ; Row[16 - 23] + movu m7, [r2 - 16] + palignr m0, m7, 4 + mova m1, m0 + palignr m2, m7, 3 + palignr m3, m7, 2 + mova m4, m3 + palignr m5, m7, 1 + mova m6, m7 + lea r0, [r0 + r1 * 4] + PROC32_8x8 2, 0, 27,6,17,28,7,18,29,8 + + ; Row[24 - 31] + movu m7, [r2 - 21] + palignr m0, m7, 4 + palignr m1, m7, 3 + mova m2, m1 + palignr m3, m7, 2 + palignr m4, m7, 1 + mova m5, m4 + mova m6, m7 + lea r0, [r0 + r1 * 4] + PROC32_8x8 3, 0, 19,30,9,20,31,10,21,0 + + add r6, 8 + mov r0, r6 + add r2, 8 + dec byte [rsp + 63] + jnz .loop + mov rsp, [rsp+64] + RET + +;------------------------------------------------------------------------------------------------------------------- +; void intraPredAng32_21(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +;------------------------------------------------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal intra_pred_ang32_21, 4,7,8 + ; NOTE: alignment stack to 64 
bytes, so all of local data in same cache line + xchg r2, r3 + mov r6, rsp + sub rsp, 64+gprsize + and rsp, ~63 + mov [rsp+64], r6 + + ; collect reference pixel + movu m0, [r3] + movu m1, [r3 + 15] + pshufb m0, [c_mode32_15_0] ; [x x x x x x x 0 2 4 6 8 9 11 13 15] + pshufb m1, [c_mode32_15_0] ; [x x x x x x x 15 17 19 21 23 24 26 28 30] + mova [rsp], m1 + movu [rsp + 8], m0 + movu m0, [r2 + 1] + movu m1, [r2 + 1 + 16] + movu [rsp + 17], m0 + movu [rsp + 17 + 16], m1 + mov [rsp + 63], byte 4 + + ; filter + lea r2, [rsp + 17] ; r2 -> [0] + lea r3, [c_shuf8_0] ; r3 -> shuffle8 + lea r4, [ang_table] ; r4 -> ang_table + lea r5, [r1 * 3] ; r5 -> 3 * stride + lea r6, [r0] ; r6 -> r0 + mova m5, [pw_1024] ; m5 -> 1024 + mova m6, [c_deinterval8] ; m6 -> c_deinterval8 + +.loop: + ; Row[0 - 7] + movu m7, [r2 - 5] + palignr m0, m7, 4 + palignr m1, m7, 3 + mova m2, m1 + palignr m3, m7, 2 + mova m4, m3 + palignr m5, m7, 1 + mova m6, m5 + PROC32_8x8 0, 0, 15,30,13,28,11,26,9,24 + + ; Row[8 - 15] + movu m7, [r2 - 9] + palignr m0, m7, 4 + palignr m1, m7, 3 + mova m2, m1 + palignr m3, m7, 2 + mova m4, m3 + palignr m5, m7, 1 + mova m6, m5 + lea r0, [r0 + r1 * 4] + PROC32_8x8 1, 0, 7,22,5,20,3,18,1,16 + + ; Row[16 - 23] + movu m7, [r2 - 13] + palignr m0, m7, 3 + mova m1, m0 + palignr m2, m7, 2 + mova m3, m2 + palignr m4, m7, 1 + mova m5, m4 + mova m6, m7 + lea r0, [r0 + r1 * 4] + PROC32_8x8 2, 0, 31,14,29,12,27,10,25,8 + + ; Row[24 - 31] + movu m7, [r2 - 17] + palignr m0, m7, 3 + mova m1, m0 + palignr m2, m7, 2 + mova m3, m2 + palignr m4, m7, 1 + mova m5, m4 + mova m6, m7 + lea r0, [r0 + r1 * 4] + PROC32_8x8 3, 0, 23,6,21,4,19,2,17,0 + + add r6, 8 + mov r0, r6 + add r2, 8 + dec byte [rsp + 63] + jnz .loop + mov rsp, [rsp+64] + RET + +;------------------------------------------------------------------------------------------------------------------- +; void intraPredAng32_22(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) 
;-------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_22, 4,7,8
    ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line
    ; r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove (see prototype above)

    xchg        r2, r3                      ; swap refLeft/refAbove pointers
    mov         r6, rsp                     ; remember caller rsp
    sub         rsp, 64+gprsize
    and         rsp, ~63                    ; 64-byte align the local reference buffer
    mov         [rsp+64], r6                ; saved rsp lives just past the buffer

    ; collect reference pixels: sparse above samples selected by c_mode32_14_0,
    ; placed in front of a copy of the left reference row
    movu        m0, [r3]
    movu        m1, [r3 + 15]
    pshufb      m0, [c_mode32_14_0]         ; [x x x x x x x x x 0 2 5 7 10 12 15]
    pshufb      m1, [c_mode32_14_0]         ; [x x x x x x x x x 15 17 20 22 25 27 30]
    pslldq      m1, 10                      ; [17 20 22 25 27 30 x x x x x x x x x x x]
    palignr     m0, m1, 10                  ; [x x x 0 2 5 7 10 12 15 17 20 22 25 27 30]
    mova        [rsp], m0
    movu        m0, [r2 + 1]
    movu        m1, [r2 + 1 + 16]
    movu        [rsp + 13], m0
    movu        [rsp + 13 + 16], m1
    mov         [rsp + 63], byte 4          ; loop counter: 4 passes of 8 columns

    ; filter
    lea         r2, [rsp + 13]              ; r2 -> [0]
    lea         r3, [c_shuf8_0]             ; r3 -> shuffle8
    lea         r4, [ang_table]             ; r4 -> ang_table
    lea         r5, [r1 * 3]                ; r5 -> 3 * stride
    lea         r6, [r0]                    ; r6 -> r0
    mova        m5, [pw_1024]               ; m5 -> 1024
    mova        m6, [c_deinterval8]         ; m6 -> c_deinterval8

.loop:
    ; Row[0 - 7]
    ; m0..m6 are the seven shifted source windows consumed by PROC32_8x8
    ; (macro defined earlier in this file; trailing args are presumably the
    ; per-row angle fractions - confirm against the macro definition)
    movu        m7, [r2 - 4]
    palignr     m0, m7, 3
    mova        m1, m0
    palignr     m2, m7, 2
    mova        m3, m2
    palignr     m4, m7, 1
    mova        m5, m4
    mova        m6, m4
    PROC32_8x8  0, 0, 19,6,25,12,31,18,5,24

    ; Row[8 - 15]
    movu        m7, [r2 - 7]
    palignr     m0, m7, 3
    palignr     m1, m7, 2
    mova        m2, m1
    mova        m3, m1
    palignr     m4, m7, 1
    mova        m5, m4
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  1, 0, 11,30,17,4,23,10,29,16

    ; Row[16 - 23]
    movu        m7, [r2 - 10]
    palignr     m0, m7, 3
    palignr     m1, m7, 2
    mova        m2, m1
    palignr     m3, m7, 1
    mova        m4, m3
    mova        m5, m3
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  2, 0, 3,22,9,28,15,2,21,8

    ; Row[24 - 31]
    movu        m7, [r2 - 13]
    palignr     m0, m7, 2
    mova        m1, m0
    mova        m2, m0
    palignr     m3, m7, 1
    mova        m4, m3
    mova        m5, m7
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  3, 0, 27,14,1,20,7,26,13,0

    add         r6, 8                       ; next 8-column strip of dst
    mov         r0, r6
    add         r2, 8                       ; advance reference window
    dec         byte [rsp + 63]
    jnz        .loop
    mov         rsp, [rsp+64]               ; restore caller rsp
    RET

;-----------------------------------------------------------------------------------------------------------------
; void intraPredAng32_23(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;-----------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_23, 4,7,8,0-(1*mmsize)
%define above [rsp + 0 * mmsize]            ; scratch slot; presumably used inside the MODE_13_23* macros - confirm
    xchg        r2, r3                      ; swap refLeft/refAbove
    lea         r4, [ang_table + 16 * 16]
    lea         r5, [r1 * 3]                ; r5 -> 3 * stride
    mov         r6, r0
    mova        m7, [pw_1024]

    ; first 8-column strip uses the special ROW0 variant; the remaining
    ; three strips share the generic macro
    MODE_13_23_ROW0 0
    add         r6, 8
    mov         r0, r6
    add         r2, 7
    mov         r3, 3                       ; 3 remaining strips
.loop:
    MODE_13_23  0
    add         r6, 8
    mov         r0, r6
    add         r2, 8
    dec         r3
    jnz        .loop
    RET

;-----------------------------------------------------------------------------------------------------------------
; void intraPredAng32_24(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;-----------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_24, 4,7,8,0-(1*mmsize)
%define above [rsp + 0 * mmsize]            ; scratch slot; presumably used inside the MODE_12_24* macros - confirm
    xchg        r2, r3                      ; swap refLeft/refAbove
    lea         r4, [ang_table + 16 * 16]
    lea         r5, [r1 * 3]                ; r5 -> 3 * stride
    mov         r6, r0
    mova        m7, [pw_1024]

    ; same strip structure as mode 23: ROW0 variant first, then 3 generic strips
    MODE_12_24_ROW0 0
    add         r6, 8
    mov         r0, r6
    add         r2, 7
    mov         r3, 3
.loop:
    MODE_12_24  0
    add         r6, 8
    mov         r0, r6
    add         r2, 8
    dec         r3
    jnz        .loop
    RET

;-------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_25(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;-------------------------------------------------------------------------------------------------------------------
; (header previously mislabeled intraPredAng32_11; the function below is mode 25)
INIT_XMM sse4
cglobal intra_pred_ang32_25, 4,7,8
    ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line
    xchg        r2, r3                      ; swap refLeft/refAbove
    mov         r6, rsp                     ; remember caller rsp
    sub         rsp, 64+gprsize
    and         rsp, ~63                    ; 64-byte align the local reference buffer
    mov         [rsp+64], r6

    ; collect reference pixel
    movu        m0, [r3 + 16]
    pxor        m1, m1
    pshufb      m0, m1                      ; broadcast refAbove[16] -> [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
    mova        [rsp], m0
    movu        m0, [r2]
    movu        m1, [r2 + 16]
    movu        m2, [r2 + 32]
    movu        [rsp + 1], m0
    movu        [rsp + 1 + 16], m1
    movu        [rsp + 1 + 32], m2
    mov         [rsp + 63], byte 4          ; loop counter: 4 passes of 8 columns

    ; filter
    lea         r2, [rsp + 1]               ; r2 -> [0]
    lea         r3, [c_shuf8_0]             ; r3 -> shuffle8
    lea         r4, [ang_table]             ; r4 -> ang_table
    lea         r5, [r1 * 3]                ; r5 -> 3 * stride
    lea         r6, [r0]                    ; r6 -> r0
    mova        m5, [pw_1024]               ; m5 -> 1024
    mova        m6, [c_deinterval8]         ; m6 -> c_deinterval8

.loop:
    ; Row[0 - 7]
    ; mode 25 is near-vertical: all eight source windows are the same pixels,
    ; only the angle fractions passed to PROC32_8x8 differ per row
    movu        m7, [r2]
    mova        m0, m7
    mova        m1, m7
    mova        m2, m7
    mova        m3, m7
    mova        m4, m7
    mova        m5, m7
    mova        m6, m7
    PROC32_8x8  0, 0, 30,28,26,24,22,20,18,16

    ; Row[8 - 15]
    movu        m7, [r2]
    mova        m0, m7
    mova        m1, m7
    mova        m2, m7
    mova        m3, m7
    mova        m4, m7
    mova        m5, m7
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  1, 0, 14,12,10,8,6,4,2,0

    ; Row[16 - 23]
    movu        m7, [r2 - 1]                ; reference steps back one pixel after 16 rows
    mova        m0, m7
    mova        m1, m7
    mova        m2, m7
    mova        m3, m7
    mova        m4, m7
    mova        m5, m7
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  2, 0, 30,28,26,24,22,20,18,16

    ; Row[24 - 31]
    movu        m7, [r2 - 1]
    mova        m0, m7
    mova        m1, m7
    mova        m2, m7
    mova        m3, m7
    mova        m4, m7
    mova        m5, m7
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  3, 0, 14,12,10,8,6,4,2,0

    add         r6, 8
    mov         r0, r6
    add         r2, 8
    dec         byte [rsp + 63]
    jnz        .loop
    mov         rsp, [rsp+64]               ; restore caller rsp
    RET

;------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_26(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_26, 6,7,7,0-(2*mmsize)
%define m8 [rsp + 0 * mmsize]
%define m9 [rsp + 1 * mmsize]
    lea         r4, [r1 * 3]                ; r4 -> 3 * stride
    mov         r6, 2                       ; two passes of 16 columns
    movu        m0, [r2]                    ; refLeft[0..15], kept in stack slots for the filter pass
    movu        m1, [r2 + 1]
    mova        m8, m0
    mova        m9, m1
    mov         r2d, r5d                    ; r2d = bFilter

.loop:
    ; mode 26 is pure vertical: replicate the above-reference row down 32 rows
    movu        m0, [r3 + 1]

    movu        [r0], m0
    movu        [r0 + r1], m0
    movu        [r0 + r1 * 2], m0
    movu        [r0 + r4], m0
    lea         r5, [r0 + r1 * 4]
    movu        [r5], m0
    movu        [r5 + r1], m0
    movu        [r5 + r1 * 2], m0
    movu        [r5 + r4], m0
    lea         r5, [r5 + r1 * 4]
    movu        [r5], m0
    movu        [r5 + r1], m0
    movu        [r5 + r1 * 2], m0
    movu        [r5 + r4], m0
    lea         r5, [r5 + r1 * 4]
    movu        [r5], m0
    movu        [r5 + r1], m0
    movu        [r5 + r1 * 2], m0
    movu        [r5 + r4], m0
    lea         r5, [r5 + r1 * 4]
    movu        [r5], m0
    movu        [r5 + r1], m0
    movu        [r5 + r1 * 2], m0
    movu        [r5 + r4], m0
    ; NOTE(review): this lea resets r5 back to r0 + 4*stride, so rows 4-19 are
    ; stored twice with identical data before rows 20-31 are written below.
    ; Redundant but harmless (same value both times) - confirm against upstream.
    lea         r5, [r0 + r1 * 4]
    movu        [r5], m0
    movu        [r5 + r1], m0
    movu        [r5 + r1 * 2], m0
    movu        [r5 + r4], m0
    lea         r5, [r5 + r1 * 4]
    movu        [r5], m0
    movu        [r5 + r1], m0
    movu        [r5 + r1 * 2], m0
    movu        [r5 + r4], m0
    lea         r5, [r5 + r1 * 4]
    movu        [r5], m0
    movu        [r5 + r1], m0
    movu        [r5 + r1 * 2], m0
    movu        [r5 + r4], m0
    lea         r5, [r5 + r1 * 4]
    movu        [r5], m0
    movu        [r5 + r1], m0
    movu        [r5 + r1 * 2], m0
    movu        [r5 + r4], m0
    lea         r5, [r5 + r1 * 4]
    movu        [r5], m0
    movu        [r5 + r1], m0
    movu        [r5 + r1 * 2], m0
    movu        [r5 + r4], m0
    lea         r5, [r5 + r1 * 4]
    movu        [r5], m0
    movu        [r5 + r1], m0
    movu        [r5 + r1 * 2], m0
    movu        [r5 + r4], m0
    lea         r5, [r5 + r1 * 4]
    movu        [r5], m0
    movu        [r5 + r1], m0
    movu        [r5 + r1 * 2], m0
    movu        [r5 + r4], m0

; filter
    ; bFilter edge filter: from the registers below this computes, per row y,
    ; broadcast(above) + ((left[y] - left-corner) >> 1) over 16-bit lanes and
    ; writes the first column of the current strip byte-by-byte.
    ; NOTE(review): only 16 rows are written (m0 lanes 0-15; m1's lanes are
    ; packed but never extracted) and the block runs in BOTH 16-column passes
    ; since r2d is never cleared - verify intended behavior against upstream.
    cmp         r2d, byte 0
    jz         .quit

    pxor        m4, m4
    pshufb      m0, m4                      ; broadcast above[0] of this strip
    pmovzxbw    m0, m0
    mova        m1, m0
    movu        m2, m8                      ; saved refLeft[0..15]
    movu        m3, m9                      ; saved refLeft[1..16]

    pshufb      m2, m4                      ; broadcast refLeft[0]
    pmovzxbw    m2, m2
    movhlps     m4, m3
    pmovzxbw    m3, m3
    pmovzxbw    m4, m4
    psubw       m3, m2                      ; left[y] - corner, low 8 rows
    psubw       m4, m2                      ; left[y] - corner, high 8 rows
    psraw       m3, 1
    psraw       m4, 1
    paddw       m0, m3
    paddw       m1, m4
    packuswb    m0, m1                      ; clip to [0,255]

    pextrb      [r0], m0, 0
    pextrb      [r0 + r1], m0, 1
    pextrb      [r0 + r1 * 2], m0, 2
    pextrb      [r0 + r4], m0, 3
    lea         r5, [r0 + r1 * 4]
    pextrb      [r5], m0, 4
    pextrb      [r5 + r1], m0, 5
    pextrb      [r5 + r1 * 2], m0, 6
    pextrb      [r5 + r4], m0, 7
    lea         r5, [r5 + r1 * 4]
    pextrb      [r5], m0, 8
    pextrb      [r5 + r1], m0, 9
    pextrb      [r5 + r1 * 2], m0, 10
    pextrb      [r5 + r4], m0, 11
    lea         r5, [r5 + r1 * 4]
    pextrb      [r5], m0, 12
    pextrb      [r5 + r1], m0, 13
    pextrb      [r5 + r1 * 2], m0, 14
    pextrb      [r5 + r4], m0, 15

.quit:
    lea         r3, [r3 + 16]               ; next 16 above-reference pixels
    add         r0, 16                      ; next 16 dst columns
    dec         r6d
    jnz        .loop
    RET

;------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_27(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_27, 3,7,8
    mov         r2, r3mp                    ; r2 = refAbove (4th stack arg)
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 4                      ; 4 strips of 8 columns
    lea         r5, [r1 * 3]                ; r5 -> 3 * stride
    mov         r6, r0
    mova        m7, [pw_1024]
.loop:
    MODE_9_27   0                           ; shared macro: mode 27 mirrors mode 9
    add         r6, 8
    mov         r0, r6
    add         r2, 8
    dec         r4
    jnz        .loop
    RET

;------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_28(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_28, 3,7,8
    mov         r2, r3mp                    ; r2 = refAbove (4th stack arg)
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 4
    lea         r5, [r1 * 3]                ; r5 -> 3 * stride
    mov         r6, r0
    mova        m7, [pw_1024]
.loop:
    MODE_8_28   0                           ; shared macro: mode 28 mirrors mode 8
    add         r6, 8
    mov         r0, r6
    add         r2, 8
    dec         r4
    jnz        .loop
    RET

;------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_29(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_29, 3,7,8
    mov         r2, r3mp                    ; r2 = refAbove (4th stack arg)
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 4
    lea         r5, [r1 * 3]                ; r5 -> 3 * stride
    mov         r6, r0
    mova        m7, [pw_1024]
.loop:
    MODE_7_29   0                           ; shared macro: mode 29 mirrors mode 7
    add         r6, 8
    mov         r0, r6
    add         r2, 8
    dec         r4
    jnz        .loop
    RET

+;------------------------------------------------------------------------------------------------------------------ +; void intraPredAng32_30(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +;------------------------------------------------------------------------------------------------------------------ +INIT_XMM sse4 +cglobal intra_pred_ang32_30, 3,7,8 + mov r2, r3mp + lea r3, [ang_table + 16 * 16] + mov r4d, 4 + lea r5, [r1 * 3] + mov r6, r0 + mova m7, [pw_1024] +.loop: + MODE_6_30 0 + add r6, 8 + mov r0, r6 + add r2, 8 + dec r4 + jnz .loop + RET + +;------------------------------------------------------------------------------------------------------------------ +; void intraPredAng32_31(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +;------------------------------------------------------------------------------------------------------------------ +INIT_XMM sse4 +cglobal intra_pred_ang32_31, 3,7,8 + mov r2, r3mp + lea r3, [ang_table + 16 * 16] + mov r4d, 4 + lea r5, [r1 * 3] + mov r6, r0 + mova m7, [pw_1024] +.loop: + MODE_5_31 0 + add r6, 8 + mov r0, r6 + add r2, 8 + dec r4 + jnz .loop + RET + +;----------------------------------------------------------------------------------------------------------------- +; void intraPredAng32_32(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) +;----------------------------------------------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal intra_pred_ang32_32, 3,7,8 + mov r2, r3mp + lea r3, [ang_table + 16 * 16] + mov r4d, 4 + lea r5, [r1 * 3] + mov r6, r0 + mova m7, [pw_1024] +.loop: + MODE_4_32 0 + add r6, 8 + mov r0, r6 + add r2, 8 + dec r4 + jnz .loop + RET + +;------------------------------------------------------------------------------------------------------------------ +; void intraPredAng32_33(pixel* dst, intptr_t dstStride, pixel *refLeft, 
pixel *refAbove, int dirMode, int bFilter) +;------------------------------------------------------------------------------------------------------------------ +INIT_XMM sse4 +cglobal intra_pred_ang32_33, 3,7,8 + xchg r2, r3mp + lea r3, [ang_table + 16 * 16] + mov r4d, 4 + lea r5, [r1 * 3] + mov r6, r0 + mova m7, [pw_1024] +.loop: + MODE_3_33 0 + add r6, 8 + mov r0, r6 + add r2, 8 + dec r4 + jnz .loop + RET + +;----------------------------------------------------------------------------- +; void all_angs_pred_4x4(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal all_angs_pred_4x4, 6, 6, 8 + +; mode 2 + +movh m0, [r2 + 2] +movd [r0], m0 + +palignr m1, m0, 1 +movd [r0 + 4], m1 + +palignr m1, m0, 2 +movd [r0 + 8], m1 + +psrldq m0, 3 +movd [r0 + 12], m0 + +; mode 3 + +mova m0, [pw_1024] + +movh m1, [r2 + 1] + +palignr m2, m1, 1 +punpcklbw m1, m2 + +lea r5, [ang_table] + +pmaddubsw m5, m1, [r5 + 26 * 16] +pmulhrsw m5, m0 +packuswb m5, m5 +movd [r0 + 16], m5 + +palignr m2, m1, 2 + +mova m7, [r5 + 20 * 16] + +pmaddubsw m6, m2, m7 +pmulhrsw m6, m0 +packuswb m6, m6 +movd [r0 + 20], m6 + +palignr m3, m1, 4 + +pmaddubsw m4, m3, [r5 + 14 * 16] +pmulhrsw m4, m0 +packuswb m4, m4 +movd [r0 + 24], m4 + +palignr m4, m1, 6 + +pmaddubsw m4, [r5 + 8 * 16] +pmulhrsw m4, m0 +packuswb m4, m4 +movd [r0 + 28], m4 + +; mode 4 + +pmaddubsw m4, m1, [r5 + 21 * 16] +pmulhrsw m4, m0 +packuswb m4, m4 +movd [r0 + 32], m4 + +pmaddubsw m4, m2, [r5 + 10 * 16] +pmulhrsw m4, m0 +packuswb m4, m4 +movd [r0 + 36], m4 + +pmaddubsw m4, m2, [r5 + 31 * 16] +pmulhrsw m4, m0 +packuswb m4, m4 +movd [r0 + 40], m4 + +pmaddubsw m4, m3, m7 +pmulhrsw m4, m0 +packuswb m4, m4 +movd [r0 + 44], m4 + +; mode 5 + +pmaddubsw m4, m1, [r5 + 17 * 16] +pmulhrsw m4, m0 +packuswb m4, m4 +movd [r0 + 48], m4 + +pmaddubsw m4, m2, [r5 + 2 * 16] +pmulhrsw m4, m0 +packuswb m4, m4 +movd [r0 + 52], m4 + 
+pmaddubsw m4, m2, [r5 + 19 * 16] +pmulhrsw m4, m0 +packuswb m4, m4 +movd [r0 + 56], m4 + +pmaddubsw m3, [r5 + 4 * 16] +pmulhrsw m3, m0 +packuswb m3, m3 +movd [r0 + 60], m3 + +; mode 6 + +pmaddubsw m3, m1, [r5 + 13 * 16] +pmulhrsw m3, m0 +packuswb m3, m3 +movd [r0 + 64], m3 + +movd [r0 + 68], m5 + +pmaddubsw m3, m2, [r5 + 7 * 16] +pmulhrsw m3, m0 +packuswb m3, m3 +movd [r0 + 72], m3 + +movd [r0 + 76], m6 + +; mode 7 + +pmaddubsw m3, m1, [r5 + 9 * 16] +pmulhrsw m3, m0 +packuswb m3, m3 +movd [r0 + 80], m3 + +pmaddubsw m3, m1, [r5 + 18 * 16] +pmulhrsw m3, m0 +packuswb m3, m3 +movd [r0 + 84], m3 + +pmaddubsw m3, m1, [r5 + 27 * 16] +pmulhrsw m3, m0 +packuswb m3, m3 +movd [r0 + 88], m3 + +pmaddubsw m2, [r5 + 4 * 16] +pmulhrsw m2, m0 +packuswb m2, m2 +movd [r0 + 92], m2 + +; mode 8 + +pmaddubsw m2, m1, [r5 + 5 * 16] +pmulhrsw m2, m0 +packuswb m2, m2 +movd [r0 + 96], m2 + +pmaddubsw m2, m1, [r5 + 10 * 16] +pmulhrsw m2, m0 +packuswb m2, m2 +movd [r0 + 100], m2 + +pmaddubsw m2, m1, [r5 + 15 * 16] +pmulhrsw m2, m0 +packuswb m2, m2 +movd [r0 + 104], m2 + +pmaddubsw m2, m1, m7 +pmulhrsw m2, m0 +packuswb m2, m2 +movd [r0 + 108], m2 + +; mode 9 + +pmaddubsw m2, m1, [r5 + 2 * 16] +pmulhrsw m2, m0 +packuswb m2, m2 +movd [r0 + 112], m2 + +pmaddubsw m2, m1, [r5 + 4 * 16] +pmulhrsw m2, m0 +packuswb m2, m2 +movd [r0 + 116], m2 + +pmaddubsw m2, m1, [r5 + 6 * 16] +pmulhrsw m2, m0 +packuswb m2, m2 +movd [r0 + 120], m2 + +pmaddubsw m1, [r5 + 8 * 16] +pmulhrsw m1, m0 +packuswb m1, m1 +movd [r0 + 124], m1 + +; mode 10 + +movh m1, [r2] +palignr m2, m1, 1 +pshufd m3, m2, 0 +movu [r0 + 128], m3 + +pxor m3, m3 + +pshufb m4, m2, m3 +punpcklbw m4, m3 + +movh m5, [r1] + +pshufb m6, m5, m3 +punpcklbw m6, m3 + +psrldq m5, 1 +punpcklbw m5, m3 + +psubw m5, m6 +psraw m5, 1 + +paddw m4, m5 + +packuswb m4, m3 + +pextrb [r0 + 128], m4, 0 +pextrb [r0 + 132], m4, 1 +pextrb [r0 + 136], m4, 2 +pextrb [r0 + 140], m4, 3 + +; mode 11 + +palignr m2, m1, 1 +punpcklbw m1, m2 + +pmaddubsw m2, m1, [r5 + 30 * 16] 
+pmulhrsw m2, m0 +packuswb m2, m2 +movd [r0 + 144], m2 + +pmaddubsw m2, m1, [r5 + 28 * 16] +pmulhrsw m2, m0 +packuswb m2, m2 +movd [r0 + 148], m2 + +pmaddubsw m2, m1, [r5 + 26 * 16] +pmulhrsw m2, m0 +packuswb m2, m2 +movd [r0 + 152], m2 + +pmaddubsw m2, m1, [r5 + 24 * 16] +pmulhrsw m2, m0 +packuswb m2, m2 +movd [r0 + 156], m2 + +; mode 12 + +pmaddubsw m2, m1, [r5 + 27 * 16] +pmulhrsw m2, m0 +packuswb m2, m2 +movd [r0 + 160], m2 + +pmaddubsw m2, m1, [r5 + 22 * 16] +pmulhrsw m2, m0 +packuswb m2, m2 +movd [r0 + 164], m2 + +pmaddubsw m2, m1, [r5 + 17 * 16] +pmulhrsw m2, m0 +packuswb m2, m2 +movd [r0 + 168], m2 + +pmaddubsw m2, m1, [r5 + 12 * 16] +pmulhrsw m2, m0 +packuswb m2, m2 +movd [r0 + 172], m2 + +; mode 13 + +pmaddubsw m2, m1, [r5 + 23 * 16] +pmulhrsw m2, m0 +packuswb m2, m2 +movd [r0 + 176], m2 + +pmaddubsw m2, m1, [r5 + 14 * 16] +pmulhrsw m2, m0 +packuswb m2, m2 +movd [r0 + 180], m2 + +pmaddubsw m2, m1, [r5 + 5 * 16] +pmulhrsw m2, m0 +packuswb m2, m2 +movd [r0 + 184], m2 + +pslldq m2, m1, 2 +pinsrb m2, [r1 + 0], 1 +pinsrb m2, [r1 + 4], 0 + +pmaddubsw m3, m2, [r5 + 28 * 16] +pmulhrsw m3, m0 +packuswb m3, m3 +movd [r0 + 188], m3 + +; mode 14 + +pmaddubsw m3, m1, [r5 + 19 * 16] +pmulhrsw m3, m0 +packuswb m3, m3 +movd [r0 + 192], m3 + +pmaddubsw m5, m1, [r5 + 6 * 16] +pmulhrsw m5, m0 +packuswb m5, m5 +movd [r0 + 196], m5 + +pinsrb m2, [r1 + 2], 0 + +pmaddubsw m3, m2, [r5 + 25 * 16] +pmulhrsw m3, m0 +packuswb m3, m3 +movd [r0 + 200], m3 + +pmaddubsw m3, m2, [r5 + 12 * 16] +pmulhrsw m3, m0 +packuswb m3, m3 +movd [r0 + 204], m3 + +; mode 15 + +pmaddubsw m3, m1, [r5 + 15 * 16] +pmulhrsw m3, m0 +packuswb m3, m3 +movd [r0 + 208], m3 + +pmaddubsw m3, m2, [r5 + 30 * 16] +pmulhrsw m3, m0 +packuswb m3, m3 +movd [r0 + 212], m3 + +pmaddubsw m3, m2, [r5 + 13 * 16] +pmulhrsw m3, m0 +packuswb m3, m3 +movd [r0 + 216], m3 + +pslldq m3, m2, 2 +pinsrb m3, [r1 + 2], 1 +pinsrb m3, [r1 + 4], 0 + +pmaddubsw m4, m3, [r5 + 28 * 16] +pmulhrsw m4, m0 +packuswb m4, m4 +movd [r0 + 220], m4 + 
+; mode 16 + +pmaddubsw m4, m1, [r5 + 11 * 16] +pmulhrsw m4, m0 +packuswb m4, m4 +movd [r0 + 224], m4 + +pmaddubsw m4, m2, [r5 + 22 * 16] +pmulhrsw m4, m0 +packuswb m4, m4 +movd [r0 + 228], m4 + +pmaddubsw m4, m2, [r5 + 1 * 16] +pmulhrsw m4, m0 +packuswb m4, m4 +movd [r0 + 232], m4 + +pinsrb m3, [r1 + 3], 0 + +pmaddubsw m3, [r5 + 12 * 16] +pmulhrsw m3, m0 +packuswb m3, m3 +movd [r0 + 236], m3 + +; mode 17 + +movd [r0 + 240], m5 + +pslldq m1, 2 +pinsrb m1, [r1 + 1], 0 +pinsrb m1, [r1 + 0], 1 + +pmaddubsw m2, m1, [r5 + 12 * 16] +pmulhrsw m2, m0 +packuswb m2, m2 +movd [r0 + 244], m2 + +pslldq m1, 2 +pinsrb m1, [r1 + 2], 0 +pinsrb m1, [r1 + 1], 1 + +pmaddubsw m2, m1, [r5 + 18 * 16] +pmulhrsw m2, m0 +packuswb m2, m2 +movd [r0 + 248], m2 + +pslldq m1, 2 +pinsrb m1, [r1 + 4], 0 +pinsrb m1, [r1 + 2], 1 + +pmaddubsw m1, [r5 + 24 * 16] +pmulhrsw m1, m0 +packuswb m1, m1 +movd [r0 + 252], m1 + +; mode 18 + +movh m1, [r1] +movd [r0 + 256], m1 + +pslldq m2, m1, 1 +pinsrb m2, [r2 + 1], 0 +movd [r0 + 260], m2 + +pslldq m3, m2, 1 +pinsrb m3, [r2 + 2], 0 +movd [r0 + 264], m3 + +pslldq m4, m3, 1 +pinsrb m4, [r2 + 3], 0 +movd [r0 + 268], m4 + +; mode 19 + +palignr m4, m1, 1 +punpcklbw m1, m4 + +pmaddubsw m5, m1, [r5 + 6 * 16] +pmulhrsw m5, m0 +packuswb m5, m5 +movd [r0 + 272], m5 + +pslldq m2, m1, 2 +pinsrb m2, [r2 + 1], 0 +pinsrb m2, [r2], 1 + +pmaddubsw m3, m2, [r5 + 12 * 16] +pmulhrsw m3, m0 +packuswb m3, m3 +movd [r0 + 276], m3 + +pslldq m3, m2, 2 +pinsrb m3, [r2 + 1], 1 +pinsrb m3, [r2 + 2], 0 + +pmaddubsw m4, m3, [r5 + 18 * 16] +pmulhrsw m4, m0 +packuswb m4, m4 +movd [r0 + 280], m4 + +pslldq m3, 2 +pinsrb m3, [r2 + 2], 1 +pinsrb m3, [r2 + 4], 0 + +pmaddubsw m3, [r5 + 24 * 16] +pmulhrsw m3, m0 +packuswb m3, m3 +movd [r0 + 284], m3 + +; mode 20 + +pmaddubsw m3, m1, [r5 + 11 * 16] +pmulhrsw m3, m0 +packuswb m3, m3 +movd [r0 + 288], m3 + +pinsrb m2, [r2 + 2], 0 + +pmaddubsw m3, m2, [r5 + 22 * 16] +pmulhrsw m3, m0 +packuswb m3, m3 +movd [r0 + 292], m3 + +pmaddubsw m3, m2, [r5 + 1 * 
16] +pmulhrsw m3, m0 +packuswb m3, m3 +movd [r0 + 296], m3 + +pslldq m3, m2, 2 +pinsrb m3, [r2 + 2], 1 +pinsrb m3, [r2 + 3], 0 + +pmaddubsw m4, m3, [r5 + 12 * 16] +pmulhrsw m4, m0 +packuswb m4, m4 +movd [r0 + 300], m4 + +; mode 21 + +pmaddubsw m4, m1, [r5 + 15 * 16] +pmulhrsw m4, m0 +packuswb m4, m4 +movd [r0 + 304], m4 + +pmaddubsw m4, m2, [r5 + 30 * 16] +pmulhrsw m4, m0 +packuswb m4, m4 +movd [r0 + 308], m4 + +pmaddubsw m4, m2, [r5 + 13 * 16] +pmulhrsw m4, m0 +packuswb m4, m4 +movd [r0 + 312], m4 + +pinsrb m3, [r2 + 4], 0 + +pmaddubsw m3, [r5 + 28 * 16] +pmulhrsw m3, m0 +packuswb m3, m3 +movd [r0 + 316], m3 + +; mode 22 + +pmaddubsw m3, m1, [r5 + 19 * 16] +pmulhrsw m3, m0 +packuswb m3, m3 +movd [r0 + 320], m3 + +movd [r0 + 324], m5 + +pmaddubsw m3, m2, [r5 + 25 * 16] +pmulhrsw m3, m0 +packuswb m3, m3 +movd [r0 + 328], m3 + +pmaddubsw m3, m2, [r5 + 12 * 16] +pmulhrsw m3, m0 +packuswb m3, m3 +movd [r0 + 332], m3 + +; mode 23 + +pmaddubsw m3, m1, [r5 + 23 * 16] +pmulhrsw m3, m0 +packuswb m3, m3 +movd [r0 + 336], m3 + +pmaddubsw m3, m1, [r5 + 14 * 16] +pmulhrsw m3, m0 +packuswb m3, m3 +movd [r0 + 340], m3 + +pmaddubsw m3, m1, [r5 + 5 * 16] +pmulhrsw m3, m0 +packuswb m3, m3 +movd [r0 + 344], m3 + +pinsrb m2, [r2 + 4], 0 + +pmaddubsw m2, [r5 + 28 * 16] +pmulhrsw m2, m0 +packuswb m2, m2 +movd [r0 + 348], m2 + +; mode 24 + +pmaddubsw m2, m1, [r5 + 27 * 16] +pmulhrsw m2, m0 +packuswb m2, m2 +movd [r0 + 352], m2 + +pmaddubsw m2, m1, [r5 + 22 * 16] +pmulhrsw m2, m0 +packuswb m2, m2 +movd [r0 + 356], m2 + +pmaddubsw m2, m1, [r5 + 17 * 16] +pmulhrsw m2, m0 +packuswb m2, m2 +movd [r0 + 360], m2 + +pmaddubsw m2, m1, [r5 + 12 * 16] +pmulhrsw m2, m0 +packuswb m2, m2 +movd [r0 + 364], m2 + +; mode 25 + +pmaddubsw m2, m1, [r5 + 30 * 16] +pmulhrsw m2, m0 +packuswb m2, m2 +movd [r0 + 368], m2 + +pmaddubsw m2, m1, [r5 + 28 * 16] +pmulhrsw m2, m0 +packuswb m2, m2 +movd [r0 + 372], m2 + +pmaddubsw m2, m1, [r5 + 26 * 16] +pmulhrsw m2, m0 +packuswb m2, m2 +movd [r0 + 376], m2 + +pmaddubsw 
m2, m1, [r5 + 24 * 16] +pmulhrsw m2, m0 +packuswb m2, m2 +movd [r0 + 380], m2 + +; mode 26 + +movh m1, [r1 + 1] +pshufd m2, m1, 0 +movu [r0 + 384], m2 + +pxor m2, m2 + +pshufb m3, m1, m2 +punpcklbw m3, m2 + +movh m4, [r2] + +pshufb m5, m4, m2 +punpcklbw m5, m2 + +psrldq m4, 1 +punpcklbw m4, m2 + +psubw m4, m5 +psraw m4, 1 + +paddw m3, m4 + +packuswb m3, m2 + +pextrb [r0 + 384], m3, 0 +pextrb [r0 + 388], m3, 1 +pextrb [r0 + 392], m3, 2 +pextrb [r0 + 396], m3, 3 + +; mode 27 + +palignr m2, m1, 1 +punpcklbw m1, m2 + +pmaddubsw m2, m1, [r5 + 2 * 16] +pmulhrsw m2, m0 +packuswb m2, m2 +movd [r0 + 400], m2 + +pmaddubsw m2, m1, [r5 + 4 * 16] +pmulhrsw m2, m0 +packuswb m2, m2 +movd [r0 + 404], m2 + +pmaddubsw m2, m1, [r5 + 6 * 16] +pmulhrsw m2, m0 +packuswb m2, m2 +movd [r0 + 408], m2 + +pmaddubsw m2, m1, [r5 + 8 * 16] +pmulhrsw m2, m0 +packuswb m2, m2 +movd [r0 + 412], m2 + +; mode 28 + +pmaddubsw m2, m1, [r5 + 5 * 16] +pmulhrsw m2, m0 +packuswb m2, m2 +movd [r0 + 416], m2 + +pmaddubsw m2, m1, [r5 + 10 * 16] +pmulhrsw m2, m0 +packuswb m2, m2 +movd [r0 + 420], m2 + +pmaddubsw m2, m1, [r5 + 15 * 16] +pmulhrsw m2, m0 +packuswb m2, m2 +movd [r0 + 424], m2 + +pmaddubsw m2, m1, m7 +pmulhrsw m2, m0 +packuswb m2, m2 +movd [r0 + 428], m2 + +; mode 29 + +pmaddubsw m2, m1, [r5 + 9 * 16] +pmulhrsw m2, m0 +packuswb m2, m2 +movd [r0 + 432], m2 + +pmaddubsw m2, m1, [r5 + 18 * 16] +pmulhrsw m2, m0 +packuswb m2, m2 +movd [r0 + 436], m2 + +pmaddubsw m2, m1, [r5 + 27 * 16] +pmulhrsw m2, m0 +packuswb m2, m2 +movd [r0 + 440], m2 + +palignr m2, m1, 2 + +pmaddubsw m3, m2, [r5 + 4 * 16] +pmulhrsw m3, m0 +packuswb m3, m3 +movd [r0 + 444], m3 + +; mode 30 + +pmaddubsw m3, m1, [r5 + 13 * 16] +pmulhrsw m3, m0 +packuswb m3, m3 +movd [r0 + 448], m3 + +pmaddubsw m6, m1, [r5 + 26 * 16] +pmulhrsw m6, m0 +packuswb m6, m6 +movd [r0 + 452], m6 + +pmaddubsw m3, m2, [r5 + 7 * 16] +pmulhrsw m3, m0 +packuswb m3, m3 +movd [r0 + 456], m3 + +pmaddubsw m5, m2, m7 +pmulhrsw m5, m0 +packuswb m5, m5 +movd [r0 + 460], 
m5 + +; mode 31 + +pmaddubsw m3, m1, [r5 + 17 * 16] +pmulhrsw m3, m0 +packuswb m3, m3 +movd [r0 + 464], m3 + +pmaddubsw m3, m2, [r5 + 2 * 16] +pmulhrsw m3, m0 +packuswb m3, m3 +movd [r0 + 468], m3 + +pmaddubsw m3, m2, [r5 + 19 * 16] +pmulhrsw m3, m0 +packuswb m3, m3 +movd [r0 + 472], m3 + +palignr m3, m2, 2 + +pmaddubsw m4, m3, [r5 + 4 * 16] +pmulhrsw m4, m0 +packuswb m4, m4 +movd [r0 + 476], m4 + +; mode 32 + +pmaddubsw m4, m1, [r5 + 21 * 16] +pmulhrsw m4, m0 +packuswb m4, m4 +movd [r0 + 480], m4 + +pmaddubsw m4, m2, [r5 + 10 * 16] +pmulhrsw m4, m0 +packuswb m4, m4 +movd [r0 + 484], m4 + +pmaddubsw m4, m2, [r5 + 31 * 16] +pmulhrsw m4, m0 +packuswb m4, m4 +movd [r0 + 488], m4 + +pmaddubsw m4, m3, m7 +pmulhrsw m4, m0 +packuswb m4, m4 +movd [r0 + 492], m4 + +; mode 33 + +movd [r0 + 496], m6 + +movd [r0 + 500], m5 + +pmaddubsw m4, m3, [r5 + 14 * 16] +pmulhrsw m4, m0 +packuswb m4, m4 +movd [r0 + 504], m4 + +psrldq m3, 2 + +pmaddubsw m3, [r5 + 8 * 16] +pmulhrsw m3, m0 +packuswb m3, m3 +movd [r0 + 508], m3 + +; mode 34 + +movh m0, [r1 + 2] +movd [r0 + 512], m0 + +palignr m1, m0, 1 +movd [r0 + 516], m1 + +palignr m1, m0, 2 +movd [r0 + 520], m1 + +palignr m1, m0, 3 +movd [r0 + 524], m1 + +RET + +;----------------------------------------------------------------------------- +; void all_angs_pred_8x8(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal all_angs_pred_8x8, 6, 6, 8, dest, above0, left0, above1, left1, bLuma + +; mode 2 + +movu m0, [r4 + 2] + +palignr m1, m0, 1 +punpcklqdq m2, m0, m1 +movu [r0], m2 + +palignr m1, m0, 2 +palignr m2, m0, 3 +punpcklqdq m1, m2 +movu [r0 + 16], m1 + +palignr m1, m0, 4 +palignr m2, m0, 5 +punpcklqdq m1, m2 +movu [r0 + 32], m1 + +palignr m1, m0, 6 +palignr m2, m0, 7 +punpcklqdq m1, m2 +movu [r0 + 48], m1 + +; mode 3 [row 0, 1] + +mova m7, [pw_1024] +lea r5, [ang_table] + +movu m0, [r2 + 1] + +palignr m1, 
m0, 1 +palignr m2, m0, 2 + +punpcklbw m3, m0, m1 +pmaddubsw m4, m3, [r5 + 26 * 16] +pmulhrsw m4, m7 + +punpcklbw m1, m2 +pmaddubsw m5, m1, [r5 + 20 * 16] +pmulhrsw m5, m7 + +packuswb m4, m5 + +movu [r0 + 64], m4 + +; mode 6 [row 1] + +movh [r0 + 264], m4 + +; mode 6 [row 3] + +movhps [r0 + 280], m4 + +; mode 4 [row 0, 1] + +pmaddubsw m4, m3, [r5 + 21 * 16] +pmulhrsw m4, m7 + +pmaddubsw m5, m1, [r5 + 10 * 16] +pmulhrsw m5, m7 + +packuswb m4, m5 +movu [r0 + 128], m4 + +; mode 5 [row 0, 1] + +pmaddubsw m4, m3, [r5 + 17 * 16] +pmulhrsw m4, m7 + +pmaddubsw m5, m1, [r5 + 2 * 16] +pmulhrsw m5, m7 + +packuswb m4, m5 +movu [r0 + 192], m4 + +; mode 6 [row 0] + +pmaddubsw m4, m3, [r5 + 13 * 16] +pmulhrsw m4, m7 + +pxor m5, m5 + +packuswb m4, m5 +movh [r0 + 256], m4 + +; mode 7 [row 0, 1] + +pmaddubsw m4, m3, [r5 + 9 * 16] +pmulhrsw m4, m7 + +pmaddubsw m5, m3, [r5 + 18 * 16] +pmulhrsw m5, m7 + +packuswb m4, m5 +movu [r0 + 320], m4 + +; mode 8 [row 0, 1] + +pmaddubsw m4, m3, [r5 + 5 * 16] +pmulhrsw m4, m7 + +pmaddubsw m5, m3, [r5 + 10 * 16] +pmulhrsw m5, m7 + +packuswb m4, m5 +movu [r0 + 384], m4 + +; mode 8 [row 2, 3] + +pmaddubsw m4, m3, [r5 + 15 * 16] +pmulhrsw m4, m7 + +pmaddubsw m5, m3, [r5 + 20 * 16] +pmulhrsw m5, m7 + +packuswb m4, m5 +movu [r0 + 400], m4 + +; mode 8 [row 4, 5] + +pmaddubsw m4, m3, [r5 + 25 * 16] +pmulhrsw m4, m7 + +pmaddubsw m5, m3, [r5 + 30 * 16] +pmulhrsw m5, m7 + +packuswb m4, m5 +movu [r0 + 416], m4 + +; mode 8 [row 6, 7] + +pmaddubsw m4, m1, [r5 + 3 * 16] +pmulhrsw m4, m7 + +pmaddubsw m5, m1, [r5 + 8 * 16] +pmulhrsw m5, m7 + +packuswb m4, m5 +movu [r0 + 432], m4 + +; mode 9 [row 0, 1] + +pmaddubsw m4, m3, [r5 + 2 * 16] +pmulhrsw m4, m7 + +pmaddubsw m5, m3, [r5 + 4 * 16] +pmulhrsw m5, m7 + +packuswb m4, m5 +movu [r0 + 448], m4 + +; mode 9 [row 2, 3] + +pmaddubsw m4, m3, [r5 + 6 * 16] +pmulhrsw m4, m7 + +pmaddubsw m5, m3, [r5 + 8 * 16] +pmulhrsw m5, m7 + +packuswb m4, m5 +movu [r0 + 464], m4 + +; mode 9 [row 4, 5] + +pmaddubsw m4, m3, [r5 + 10 * 16] 
+pmulhrsw m4, m7 + +pmaddubsw m5, m3, [r5 + 12 * 16] +pmulhrsw m5, m7 + +packuswb m4, m5 +movu [r0 + 480], m4 + +; mode 9 [row 6, 7] + +pmaddubsw m4, m3, [r5 + 14 * 16] +pmulhrsw m4, m7 + +pmaddubsw m5, m3, [r5 + 16 * 16] +pmulhrsw m5, m7 + +packuswb m4, m5 +movu [r0 + 496], m4 + +; mode 7 [row 2, 3] + +pmaddubsw m4, m3, [r5 + 27 * 16] +pmulhrsw m4, m7 + +pmaddubsw m5, m1, [r5 + 4 * 16] +pmulhrsw m5, m7 + +packuswb m4, m5 +movu [r0 + 336], m4 + +; mode 7 [row 4, 5] + +pmaddubsw m4, m1, [r5 + 13 * 16] +pmulhrsw m4, m7 + +pmaddubsw m5, m1, [r5 + 22 * 16] +pmulhrsw m5, m7 + +packuswb m4, m5 +movu [r0 + 352], m4 + +; mode 6 [row 2] + +pmaddubsw m4, m1, [r5 + 7 * 16] +pmulhrsw m4, m7 + +pxor m5, m5 + +packuswb m4, m5 +movh [r0 + 272], m4 + +; mode 3 [row 2, 3] + +palignr m1, m0, 3 +palignr m3, m0, 4 + +punpcklbw m2, m1 +pmaddubsw m5, m2, [r5 + 14 * 16] +pmulhrsw m5, m7 + +punpcklbw m1, m3 +pmaddubsw m6, m1, [r5 + 8 * 16] +pmulhrsw m6, m7 + +packuswb m5, m6 +movu [r0 + 80], m5 + +; mode 6 [row 7] + +movhps [r0 + 312], m5 + +; mode 6 [row 5] + +movh [r0 + 296], m5 + +; mode 4 [calculate and store row 4, 5] + +pmaddubsw m4, m1, [r5 + 9 * 16] +pmulhrsw m4, m7 + +pmaddubsw m5, m1, [r5 + 30 * 16] +pmulhrsw m5, m7 + +packuswb m4, m5 +movu [r0 + 160], m4 + +; mode 5 [row 4, 5] + +pmaddubsw m4, m2, [r5 + 21 * 16] +pmulhrsw m4, m7 + +pmaddubsw m5, m1, [r5 + 6 * 16] +pmulhrsw m5, m7 + +packuswb m4, m5 +movu [r0 + 224], m4 + +; mode 6 [row 4, 5] + +pmaddubsw m5, m2, [r5 + 1 * 16] +pmulhrsw m5, m7 + +pxor m6, m6 + +packuswb m5, m6 +movh [r0 + 288], m5 + +; mode 6 [row 6, 7] + +pmaddubsw m5, m2, [r5 + 27 * 16] +pmulhrsw m5, m7 + +pxor m6, m6 + +packuswb m5, m6 +movh [r0 + 304], m5 + +; mode 5 [calculate row 6] + +pmaddubsw m6, m1, [r5 + 23 * 16] +pmulhrsw m6, m7 + +; mode 3 [row 4, 5] + +palignr m1, m0, 5 + +punpcklbw m3, m1 +pmaddubsw m4, m3, [r5 + 2 * 16] +pmulhrsw m4, m7 + +pmaddubsw m5, m3, [r5 + 28 * 16] +pmulhrsw m5, m7 + +packuswb m4, m5 +movu [r0 + 96], m4 + +; mode 4 
[calculate row 7] + +pmaddubsw m5, m3, [r5 + 19 * 16] +pmulhrsw m5, m7 + +; mode 5 [calculate row 6] + +pmaddubsw m4, m3, [r5 + 8 * 16] +pmulhrsw m4, m7 + +packuswb m6, m4 +movu [r0 + 240], m6 + +; mode 3 [row 6, 7] + +palignr m2, m0, 6 +palignr m3, m0, 7 + +punpcklbw m1, m2 +pmaddubsw m4, m1, [r5 + 22 * 16] +pmulhrsw m4, m7 + +punpcklbw m2, m3 +pmaddubsw m2, [r5 + 16 * 16] +pmulhrsw m2, m7 + +packuswb m4, m2 +movu [r0 + 112], m4 + +; mode 4 [calculate row 7] + +pmaddubsw m2, m1, [r5 + 8 * 16] +pmulhrsw m2, m7 + +; mode 4 [store row 6 and 7] + +packuswb m5, m2 +movu [r0 + 176], m5 + +; mode 4 [row 2, 3] + +palignr m1, m0, 1 +palignr m2, m0, 2 +palignr m3, m0, 3 + +punpcklbw m1, m2 +pmaddubsw m4, m1, [r5 + 31 * 16] +pmulhrsw m4, m7 + +punpcklbw m2, m3 +pmaddubsw m5, m2, [r5 + 20 * 16] +pmulhrsw m5, m7 + +packuswb m4, m5 +movu [r0 + 144], m4 + +; mode 5 [row 2, 3] + +pmaddubsw m4, m1, [r5 + 19 * 16] +pmulhrsw m4, m7 + +pmaddubsw m5, m2, [r5 + 4 * 16] +pmulhrsw m5, m7 + +packuswb m4, m5 +movu [r0 + 208], m4 + +; mode 7 [row 6, 7] + +pmaddubsw m4, m1, [r5 + 31 * 16] +pmulhrsw m4, m7 + +pmaddubsw m5, m2, [r5 + 8 * 16] +pmulhrsw m5, m7 + +packuswb m4, m5 +movu [r0 + 368], m4 + +; mode 10 + +pshufb m1, m0, [tab_Si] +movu [r0 + 512], m1 +movu [r0 + 528], m1 +movu [r0 + 544], m1 +movu [r0 + 560], m1 + +pxor m0, m0 + +pshufb m1, m1, m0 +punpcklbw m1, m0 + +movu m2, [r1] + +pshufb m3, m2, m0 +punpcklbw m3, m0 + +psrldq m4, m2, 1 +punpcklbw m4, m0 + +movu m2, [r1 + 9] +punpcklbw m2, m0 + +psubw m4, m3 +psubw m2, m3 + +psraw m4, 1 +psraw m2, 1 + +paddw m4, m1 +paddw m2, m1 + +packuswb m4, m2 + +pextrb [r0 + 512], m4, 0 +pextrb [r0 + 520], m4, 1 +pextrb [r0 + 528], m4, 2 +pextrb [r0 + 536], m4, 3 +pextrb [r0 + 544], m4, 4 +pextrb [r0 + 552], m4, 5 +pextrb [r0 + 560], m4, 6 +pextrb [r0 + 568], m4, 7 + +; mode 11 [row 0, 1] + +movu m0, [r2] +palignr m1, m0, 1 +punpcklbw m2, m0, m1 + +pmaddubsw m3, m2, [r5 + 30 * 16] +pmulhrsw m3, m7 + +pmaddubsw m4, m2, [r5 + 28 * 16] +pmulhrsw 
m4, m7 + +packuswb m3, m4 +movu [r0 + 576], m3 + +; mode 11 [row 2, 3] + +pmaddubsw m3, m2, [r5 + 26 * 16] +pmulhrsw m3, m7 + +pmaddubsw m4, m2, [r5 + 24 * 16] +pmulhrsw m4, m7 + +packuswb m3, m4 +movu [r0 + 592], m3 + +; mode 11 [row 4, 5] + +pmaddubsw m3, m2, [r5 + 22 * 16] +pmulhrsw m3, m7 + +pmaddubsw m4, m2, [r5 + 20 * 16] +pmulhrsw m4, m7 + +packuswb m5, m3, m4 +movu [r0 + 608], m5 + +; mode 12 [row 0, 1] + +pmaddubsw m4, m2, [r5 + 27 * 16] +pmulhrsw m4, m7 + +packuswb m4, m3 +movu [r0 + 640], m4 + +; mode 11 [row 6, 7] + +pmaddubsw m3, m2, [r5 + 18 * 16] +pmulhrsw m3, m7 + +pmaddubsw m4, m2, [r5 + 16 * 16] +pmulhrsw m4, m7 + +packuswb m3, m4 +movu [r0 + 624], m3 + +; mode 12 [row 2, 3] + +pmaddubsw m3, m2, [r5 + 17 * 16] +pmulhrsw m3, m7 + +pmaddubsw m4, m2, [r5 + 12 * 16] +pmulhrsw m4, m7 + +packuswb m3, m4 +movu [r0 + 656], m3 + +; mode 12 [row 4, 5] + +pmaddubsw m3, m2, [r5 + 7 * 16] +pmulhrsw m3, m7 + +pmaddubsw m4, m2, [r5 + 2 * 16] +pmulhrsw m4, m7 + +packuswb m3, m4 +movu [r0 + 672], m3 + +; mode 12 [row 6, 7] + +pslldq m3, m2, 2 +pinsrb m3, [r1 + 0], 1 +pinsrb m3, [r1 + 6], 0 + +pmaddubsw m4, m3, [r5 + 29 * 16] +pmulhrsw m4, m7 + +pmaddubsw m5, m3, [r5 + 24 * 16] +pmulhrsw m5, m7 + +packuswb m4, m5 +movu [r0 + 688], m4 + +; mode 13 [row 0, 1] + +pmaddubsw m4, m2, [r5 + 23 * 16] +pmulhrsw m4, m7 + +pmaddubsw m5, m2, [r5 + 14 * 16] +pmulhrsw m5, m7 + +packuswb m4, m5 +movu [r0 + 704], m4 + +; mode 13 [row 2, 3] + +pmaddubsw m4, m2, [r5 + 5 * 16] +pmulhrsw m4, m7 + +pinsrb m3, [r1 + 4], 0 +pmaddubsw m5, m3, [r5 + 28 * 16] +pmulhrsw m5, m7 + +packuswb m4, m5 +movu [r0 + 720], m4 + +; mode 13 [row 4, 5] + +pmaddubsw m4, m3, [r5 + 19 * 16] +pmulhrsw m4, m7 + +pmaddubsw m5, m3, [r5 + 10 * 16] +pmulhrsw m5, m7 + +packuswb m4, m5 +movu [r0 + 736], m4 + +; mode 13 [row 6, 7] + +pmaddubsw m4, m3, [r5 + 1 * 16] +pmulhrsw m4, m7 + +pslldq m5, m3, 2 +pinsrb m5, [r1 + 4], 1 +pinsrb m5, [r1 + 7], 0 + +pmaddubsw m5, [r5 + 24 * 16] +pmulhrsw m5, m7 + +packuswb m4, m5 
+movu [r0 + 752], m4 + +; mode 14 [row 0, 1] + +pmaddubsw m4, m2, [r5 + 19 * 16] +pmulhrsw m4, m7 + +pmaddubsw m5, m2, [r5 + 6 * 16] +pmulhrsw m5, m7 + +packuswb m4, m5 +movu [r0 + 768], m4 + +; mode 14 [row 2, 3] + +pinsrb m3, [r1 + 2], 0 + +pmaddubsw m4, m3, [r5 + 25 * 16] +pmulhrsw m4, m7 + +pmaddubsw m5, m3, [r5 + 12 * 16] +pmulhrsw m5, m7 + +packuswb m4, m5 +movu [r0 + 784], m4 + +; mode 14 [row 4, 5] + +pslldq m1, m3, 2 +pinsrb m1, [r1 + 2], 1 +pinsrb m1, [r1 + 5], 0 + +pmaddubsw m4, m1, [r5 + 31 * 16] +pmulhrsw m4, m7 + +pmaddubsw m5, m1, [r5 + 18 * 16] +pmulhrsw m5, m7 + +packuswb m4, m5 +movu [r0 + 800], m4 + +; mode 14 [row 6, 7] + +pmaddubsw m4, m1, [r5 + 5 * 16] +pmulhrsw m4, m7 + +pslldq m1, 2 +pinsrb m1, [r1 + 5], 1 +pinsrb m1, [r1 + 7], 0 + +pmaddubsw m5, m1, [r5 + 24 * 16] +pmulhrsw m5, m7 + +packuswb m4, m5 +movu [r0 + 816], m4 + +; mode 15 [row 0, 1] + +pmaddubsw m4, m2, [r5 + 15 * 16] +pmulhrsw m4, m7 + +pmaddubsw m5, m3, [r5 + 30 * 16] +pmulhrsw m5, m7 + +packuswb m4, m5 +movu [r0 + 832], m4 + +; mode 15 [row 2, 3] + +pmaddubsw m4, m3, [r5 + 13 * 16] +pmulhrsw m4, m7 + +pslldq m1, m3, 2 +pinsrb m1, [r1 + 2], 1 +pinsrb m1, [r1 + 4], 0 + +pmaddubsw m5, m1, [r5 + 28 * 16] +pmulhrsw m5, m7 + +packuswb m4, m5 +movu [r0 + 848], m4 + +; mode 15 [row 4, 5] + +pmaddubsw m4, m1, [r5 + 11 * 16] +pmulhrsw m4, m7 + +pslldq m1, 2 +pinsrb m1, [r1 + 4], 1 +pinsrb m1, [r1 + 6], 0 + +pmaddubsw m5, m1, [r5 + 26 * 16] +pmulhrsw m5, m7 + +packuswb m4, m5 +movu [r0 + 864], m4 + +; mode 15 [row 6, 7] + +pmaddubsw m4, m1, [r5 + 9 * 16] +pmulhrsw m4, m7 + +pslldq m1, 2 +pinsrb m1, [r1 + 6], 1 +pinsrb m1, [r1 + 8], 0 + +pmaddubsw m1, [r5 + 24 * 16] +pmulhrsw m1, m7 + +packuswb m4, m1 +movu [r0 + 880], m4 + +; mode 16 [row 0, 1] + +pmaddubsw m4, m2, [r5 + 11 * 16] +pmulhrsw m4, m7 + +pmaddubsw m5, m3, [r5 + 22 * 16] +pmulhrsw m5, m7 + +packuswb m4, m5 +movu [r0 + 896], m4 + +; mode 16 [row 2, 3] + +pmaddubsw m4, m3, [r5 + 1 * 16] +pmulhrsw m4, m7 + +pslldq m3, 2 +pinsrb 
m3, [r1 + 2], 1 +pinsrb m3, [r1 + 3], 0 + +pmaddubsw m5, m3, [r5 + 12 * 16] +pmulhrsw m5, m7 + +packuswb m4, m5 +movu [r0 + 912], m4 + +; mode 16 [row 4, 5] + +pslldq m3, 2 +pinsrb m3, [r1 + 3], 1 +pinsrb m3, [r1 + 5], 0 + +pmaddubsw m4, m3, [r5 + 23 * 16] +pmulhrsw m4, m7 + +pmaddubsw m5, m3, [r5 + 2 * 16] +pmulhrsw m5, m7 + +packuswb m4, m5 +movu [r0 + 928], m4 + +; mode 16 [row 6, 7] + +pslldq m3, 2 +pinsrb m3, [r1 + 5], 1 +pinsrb m3, [r1 + 6], 0 + +pmaddubsw m4, m3, [r5 + 13 * 16] +pmulhrsw m4, m7 + +pslldq m3, 2 +pinsrb m3, [r1 + 6], 1 +pinsrb m3, [r1 + 8], 0 + +pmaddubsw m3, [r5 + 24 * 16] +pmulhrsw m3, m7 + +packuswb m4, m3 +movu [r0 + 944], m4 + +; mode 17 [row 0, 1] + +pmaddubsw m4, m2, [r5 + 6 * 16] +pmulhrsw m4, m7 + +pslldq m2, 2 +pinsrb m2, [r1 + 0], 1 +pinsrb m2, [r1 + 1], 0 + +pmaddubsw m3, m2, [r5 + 12 * 16] +pmulhrsw m3, m7 + +packuswb m4, m3 +movu [r0 + 960], m4 + +; mode 17 [row 2, 3] + +pslldq m2, 2 +pinsrb m2, [r1 + 1], 1 +pinsrb m2, [r1 + 2], 0 + +pmaddubsw m4, m2, [r5 + 18 * 16] +pmulhrsw m4, m7 + +pslldq m2, 2 +pinsrb m2, [r1 + 2], 1 +pinsrb m2, [r1 + 4], 0 + +pmaddubsw m3, m2, [r5 + 24 * 16] +pmulhrsw m3, m7 + +packuswb m4, m3 +movu [r0 + 976], m4 + +; mode 17 [row 4, 5] + +pslldq m2, 2 +pinsrb m2, [r1 + 4], 1 +pinsrb m2, [r1 + 5], 0 + +pmaddubsw m4, m2, [r5 + 30 * 16] +pmulhrsw m4, m7 + +pmaddubsw m3, m2, [r5 + 4 * 16] +pmulhrsw m3, m7 + +packuswb m4, m3 +movu [r0 + 992], m4 + +; mode 17 [row 6, 7] + +pslldq m2, 2 +pinsrb m2, [r1 + 5], 1 +pinsrb m2, [r1 + 6], 0 + +pmaddubsw m4, m2, [r5 + 10 * 16] +pmulhrsw m4, m7 + +pslldq m2, 2 +pinsrb m2, [r1 + 6], 1 +pinsrb m2, [r1 + 7], 0 + +pmaddubsw m3, m2, [r5 + 16 * 16] +pmulhrsw m3, m7 + +packuswb m4, m3 +movu [r0 + 1008], m4 + +; mode 18 [row 0, 1, 2, 3, 4, 5, 6, 7] + +movh m1, [r3] +movh [r0 + 1024], m1 + +pslldq m2, m1, 1 +pinsrb m2, [r4 + 1], 0 +movh [r0 + 1032], m2 + +pslldq m2, 1 +pinsrb m2, [r4 + 2], 0 +movh [r0 + 1040], m2 + +pslldq m2, 1 +pinsrb m2, [r4 + 3], 0 +movh [r0 + 1048], m2 + 
+pslldq m2, 1 +pinsrb m2, [r4 + 4], 0 +movh [r0 + 1056], m2 + +pslldq m2, 1 +pinsrb m2, [r4 + 5], 0 +movh [r0 + 1064], m2 + +pslldq m2, 1 +pinsrb m2, [r4 + 6], 0 +movh [r0 + 1072], m2 + +pslldq m2, 1 +pinsrb m2, [r4 + 7], 0 +movh [r0 + 1080], m2 + +; mode 19 [row 0, 1] + +movu m0, [r1] +palignr m1, m0, 1 +punpcklbw m0, m1 + +pmaddubsw m1, m0, [r5 + 6 * 16] +pmulhrsw m1, m7 + +pslldq m2, m0, 2 +pinsrb m2, [r2 + 0], 1 +pinsrb m2, [r2 + 1], 0 + +pmaddubsw m3, m2, [r5 + 12 * 16] +pmulhrsw m3, m7 + +packuswb m1, m3 +movu [r0 + 1088], m1 + +; mode 19 [row 2, 3] + +pslldq m2, 2 +pinsrb m2, [r2 + 1], 1 +pinsrb m2, [r2 + 2], 0 + +pmaddubsw m4, m2, [r5 + 18 * 16] +pmulhrsw m4, m7 + +pslldq m2, 2 +pinsrb m2, [r2 + 2], 1 +pinsrb m2, [r2 + 4], 0 + +pmaddubsw m5, m2, [r5 + 24 * 16] +pmulhrsw m5, m7 + +packuswb m4, m5 +movu [r0 + 1104], m4 + +; mode 19 [row 4, 5] + +pslldq m2, 2 +pinsrb m2, [r2 + 4], 1 +pinsrb m2, [r2 + 5], 0 + +pmaddubsw m4, m2, [r5 + 30 * 16] +pmulhrsw m4, m7 + +pmaddubsw m5, m2, [r5 + 4 * 16] +pmulhrsw m5, m7 + +packuswb m4, m5 +movu [r0 + 1120], m4 + +; mode 19 [row 6, 7] + +pslldq m2, 2 +pinsrb m2, [r2 + 5], 1 +pinsrb m2, [r2 + 6], 0 + +pmaddubsw m4, m2, [r5 + 10 * 16] +pmulhrsw m4, m7 + +pslldq m2, 2 +pinsrb m2, [r2 + 6], 1 +pinsrb m2, [r2 + 7], 0 + +pmaddubsw m2, [r5 + 16 * 16] +pmulhrsw m2, m7 + +packuswb m4, m2 +movu [r0 + 1136], m4 + +; mode 20 [row 0, 1] + +pmaddubsw m3, m0, [r5 + 11 * 16] +pmulhrsw m3, m7 + +pslldq m1, m0, 2 +pinsrb m1, [r2 + 0], 1 +pinsrb m1, [r2 + 2], 0 + +pmaddubsw m4, m1, [r5 + 22 * 16] +pmulhrsw m4, m7 + +packuswb m3, m4 +movu [r0 + 1152], m3 + +; mode 20 [row 2, 3] + +pmaddubsw m3, m1, [r5 + 1 * 16] +pmulhrsw m3, m7 + +pslldq m2, m1, 2 +pinsrb m2, [r2 + 2], 1 +pinsrb m2, [r2 + 3], 0 + +pmaddubsw m4, m2, [r5 + 12 * 16] +pmulhrsw m4, m7 + +packuswb m3, m4 +movu [r0 + 1168], m3 + +; mode 20 [row 4, 5] + +pslldq m2, 2 +pinsrb m2, [r2 + 3], 1 +pinsrb m2, [r2 + 5], 0 + +pmaddubsw m3, m2, [r5 + 23 * 16] +pmulhrsw m3, m7 + +pmaddubsw 
m4, m2, [r5 + 2 * 16] +pmulhrsw m4, m7 + +packuswb m3, m4 +movu [r0 + 1184], m3 + +; mode 20 [row 6, 7] + +pslldq m2, 2 +pinsrb m2, [r2 + 5], 1 +pinsrb m2, [r2 + 6], 0 + +pmaddubsw m3, m2, [r5 + 13 * 16] +pmulhrsw m3, m7 + +pslldq m2, 2 +pinsrb m2, [r2 + 6], 1 +pinsrb m2, [r2 + 8], 0 + +pmaddubsw m4, m2, [r5 + 24 * 16] +pmulhrsw m4, m7 + +packuswb m3, m4 +movu [r0 + 1200], m3 + +; mode 21 [row 0, 1] + +pmaddubsw m2, m0, [r5 + 15 * 16] +pmulhrsw m2, m7 + +pmaddubsw m3, m1, [r5 + 30 * 16] +pmulhrsw m3, m7 + +packuswb m2, m3 +movu [r0 + 1216], m2 + +; mode 21 [row 2, 3] + +pmaddubsw m2, m1, [r5 + 13 * 16] +pmulhrsw m2, m7 + +pslldq m3, m1, 2 +pinsrb m3, [r2 + 2], 1 +pinsrb m3, [r2 + 4], 0 + +pmaddubsw m4, m3, [r5 + 28 * 16] +pmulhrsw m4, m7 + +packuswb m2, m4 +movu [r0 + 1232], m2 + +; mode 21 [row 4, 5] + +pmaddubsw m2, m3, [r5 + 11 * 16] +pmulhrsw m2, m7 + +pslldq m3, 2 +pinsrb m3, [r2 + 4], 1 +pinsrb m3, [r2 + 6], 0 + +pmaddubsw m4, m3, [r5 + 26 * 16] +pmulhrsw m4, m7 + +packuswb m2, m4 +movu [r0 + 1248], m2 + +; mode 21 [row 6, 7] + +pmaddubsw m2, m3, [r5 + 9 * 16] +pmulhrsw m2, m7 + +pslldq m3, 2 +pinsrb m3, [r2 + 6], 1 +pinsrb m3, [r2 + 8], 0 + +pmaddubsw m4, m3, [r5 + 24 * 16] +pmulhrsw m4, m7 + +packuswb m2, m4 +movu [r0 + 1264], m2 + +; mode 22 [row 0, 1] + +pmaddubsw m2, m0, [r5 + 19 * 16] +pmulhrsw m2, m7 + +pmaddubsw m4, m0, [r5 + 6 * 16] +pmulhrsw m4, m7 + +packuswb m2, m4 +movu [r0 + 1280], m2 + +; mode 22 [row 2, 3] + +pmaddubsw m2, m1, [r5 + 25 * 16] +pmulhrsw m2, m7 + +pmaddubsw m3, m1, [r5 + 12 * 16] +pmulhrsw m3, m7 + +packuswb m2, m3 +movu [r0 + 1296], m2 + +; mode 22 [row 4, 5] + +pslldq m1, 2 +pinsrb m1, [r2 + 5], 0 +pinsrb m1, [r2 + 2], 1 + +pmaddubsw m2, m1, [r5 + 31 * 16] +pmulhrsw m2, m7 + +pmaddubsw m3, m1, [r5 + 18 * 16] +pmulhrsw m3, m7 + +packuswb m2, m3 +movu [r0 + 1312], m2 + +; mode 22 [row 6, 7] + +pmaddubsw m2, m1, [r5 + 5 * 16] +pmulhrsw m2, m7 + +pslldq m1, 2 +pinsrb m1, [r2 + 5], 1 +pinsrb m1, [r2 + 7], 0 + +pmaddubsw m1, [r5 + 24 
* 16] +pmulhrsw m1, m7 + +packuswb m2, m1 +movu [r0 + 1328], m2 + +; mode 23 [row 0, 1] + +pmaddubsw m2, m0, [r5 + 23 * 16] +pmulhrsw m2, m7 + +pmaddubsw m3, m0, [r5 + 14 * 16] +pmulhrsw m3, m7 + +packuswb m2, m3 +movu [r0 + 1344], m2 + +; mode 23 [row 2, 3] + +pmaddubsw m2, m0, [r5 + 5 * 16] +pmulhrsw m2, m7 + +pslldq m1, m0, 2 +pinsrb m1, [r2 + 0], 1 +pinsrb m1, [r2 + 4], 0 + +pmaddubsw m3, m1, [r5 + 28 * 16] +pmulhrsw m3, m7 + +packuswb m2, m3 +movu [r0 + 1360], m2 + +; mode 23 [row 4, 5] + +pmaddubsw m2, m1, [r5 + 19 * 16] +pmulhrsw m2, m7 + +pmaddubsw m3, m1, [r5 + 10 * 16] +pmulhrsw m3, m7 + +packuswb m2, m3 +movu [r0 + 1376], m2 + +; mode 23 [row 6, 7] + +pmaddubsw m2, m1, [r5 + 1 * 16] +pmulhrsw m2, m7 + +pslldq m3, m1, 2 +pinsrb m3, [r2 + 4], 1 +pinsrb m3, [r2 + 7], 0 + +pmaddubsw m3, [r5 + 24 * 16] +pmulhrsw m3, m7 + +packuswb m2, m3 +movu [r0 + 1392], m2 + +; mode 24 [row 0, 1] + +pmaddubsw m2, m0, [r5 + 27 * 16] +pmulhrsw m2, m7 + +pmaddubsw m5, m0, [r5 + 22 * 16] +pmulhrsw m5, m7 + +packuswb m2, m5 +movu [r0 + 1408], m2 + +; mode 24 [row 2, 3] + +pmaddubsw m2, m0, [r5 + 17 * 16] +pmulhrsw m2, m7 + +pmaddubsw m3, m0, [r5 + 12 * 16] +pmulhrsw m3, m7 + +packuswb m2, m3 +movu [r0 + 1424], m2 + +; mode 24 [row 4, 5] + +pmaddubsw m2, m0, [r5 + 7 * 16] +pmulhrsw m2, m7 + +pmaddubsw m3, m0, [r5 + 2 * 16] +pmulhrsw m3, m7 + +packuswb m2, m3 +movu [r0 + 1440], m2 + +; mode 24 [row 6, 7] + +pinsrb m1, [r2 + 6], 0 + +pmaddubsw m2, m1, [r5 + 29 * 16] +pmulhrsw m2, m7 + +pmaddubsw m1, [r5 + 24 * 16] +pmulhrsw m1, m7 + +packuswb m2, m1 +movu [r0 + 1456], m2 + +; mode 25 [row 0, 1] + +pmaddubsw m2, m0, [r5 + 30 * 16] +pmulhrsw m2, m7 + +pmaddubsw m1, m0, [r5 + 28 * 16] +pmulhrsw m1, m7 + +packuswb m2, m1 +movu [r0 + 1472], m2 + +; mode 25 [row 2, 3] + +pmaddubsw m2, m0, [r5 + 26 * 16] +pmulhrsw m2, m7 + +pmaddubsw m1, m0, [r5 + 24 * 16] +pmulhrsw m1, m7 + +packuswb m2, m1 +movu [r0 + 1488], m2 + +; mode 25 [row 4, 5] + +pmaddubsw m1, m0, [r5 + 20 * 16] +pmulhrsw m1, 
m7 + +packuswb m5, m1 +movu [r0 + 1504], m5 + +; mode 25 [row 6, 7] + +pmaddubsw m2, m0, [r5 + 18 * 16] +pmulhrsw m2, m7 + +pmaddubsw m1, m0, [r5 + 16 * 16] +pmulhrsw m1, m7 + +packuswb m2, m1 +movu [r0 + 1520], m2 + +; mode 26 + +movu m0, [r1 + 1] + +pshufb m1, m0, [tab_Si] +movu [r0 + 1536], m1 +movu [r0 + 1552], m1 +movu [r0 + 1568], m1 +movu [r0 + 1584], m1 + +pxor m5, m5 + +pshufb m1, m1, m5 +punpcklbw m1, m5 + +movu m2, [r2] + +pshufb m3, m2, m5 +punpcklbw m3, m5 + +psrldq m4, m2, 1 +punpcklbw m4, m5 + +movu m2, [r2 + 9] +punpcklbw m2, m5 + +psubw m4, m3 +psubw m2, m3 + +psraw m4, 1 +psraw m2, 1 + +paddw m4, m1 +paddw m2, m1 + +packuswb m4, m2 + +pextrb [r0 + 1536], m4, 0 +pextrb [r0 + 1544], m4, 1 +pextrb [r0 + 1552], m4, 2 +pextrb [r0 + 1560], m4, 3 +pextrb [r0 + 1568], m4, 4 +pextrb [r0 + 1576], m4, 5 +pextrb [r0 + 1584], m4, 6 +pextrb [r0 + 1592], m4, 7 + +; mode 27 [row 0, 1] + +palignr m6, m0, 1 +punpcklbw m4, m0, m6 + +pmaddubsw m1, m4, [r5 + 2 * 16] +pmulhrsw m1, m7 + +pmaddubsw m2, m4, [r5 + 4 * 16] +pmulhrsw m2, m7 + +packuswb m1, m2 +movu [r0 + 1600], m1 + +; mode 27 [row 2, 3] + +pmaddubsw m1, m4, [r5 + 6 * 16] +pmulhrsw m1, m7 + +pmaddubsw m2, m4, [r5 + 8 * 16] +pmulhrsw m2, m7 + +packuswb m1, m2 +movu [r0 + 1616], m1 + +; mode 27 [row 4, 5] + +pmaddubsw m3, m4, [r5 + 10 * 16] +pmulhrsw m3, m7 + +pmaddubsw m2, m4, [r5 + 12 * 16] +pmulhrsw m2, m7 + +packuswb m1, m3, m2 +movu [r0 + 1632], m1 + +; mode 27 [row 6, 7] + +pmaddubsw m1, m4, [r5 + 14 * 16] +pmulhrsw m1, m7 + +pmaddubsw m2, m4, [r5 + 16 * 16] +pmulhrsw m2, m7 + +packuswb m1, m2 +movu [r0 + 1648], m1 + +; mode 28 [row 0, 1] + +pmaddubsw m1, m4, [r5 + 5 * 16] +pmulhrsw m1, m7 + +packuswb m1, m3 +movu [r0 + 1664], m1 + +; mode 28 [row 2, 3] + +pmaddubsw m1, m4, [r5 + 15 * 16] +pmulhrsw m1, m7 + +pmaddubsw m2, m4, [r5 + 20 * 16] +pmulhrsw m2, m7 + +packuswb m1, m2 +movu [r0 + 1680], m1 + +; mode 28 [row 4, 5] + +pmaddubsw m1, m4, [r5 + 25 * 16] +pmulhrsw m1, m7 + +pmaddubsw m2, m4, [r5 + 30 * 
16] +pmulhrsw m2, m7 + +packuswb m1, m2 +movu [r0 + 1696], m1 + +; mode 28 [row 6, 7] + +palignr m1, m0, 2 +punpcklbw m5, m6, m1 + +pmaddubsw m2, m5, [r5 + 3 * 16] +pmulhrsw m2, m7 + +pmaddubsw m3, m5, [r5 + 8 * 16] +pmulhrsw m3, m7 + +packuswb m2, m3 +movu [r0 + 1712], m2 + +; mode 29 [row 0, 1] + +pmaddubsw m2, m4, [r5 + 9 * 16] +pmulhrsw m2, m7 + +pmaddubsw m3, m4, [r5 + 18 * 16] +pmulhrsw m3, m7 + +packuswb m2, m3 +movu [r0 + 1728], m2 + +; mode 29 [row 2, 3] + +pmaddubsw m2, m4, [r5 + 27 * 16] +pmulhrsw m2, m7 + +pmaddubsw m3, m5, [r5 + 4 * 16] +pmulhrsw m3, m7 + +packuswb m2, m3 +movu [r0 + 1744], m2 + +; mode 29 [row 4, 5] + +pmaddubsw m2, m5, [r5 + 13 * 16] +pmulhrsw m2, m7 + +pmaddubsw m3, m5, [r5 + 22 * 16] +pmulhrsw m3, m7 + +packuswb m2, m3 +movu [r0 + 1760], m2 + +; mode 29 [row 6, 7] + +pmaddubsw m2, m5, [r5 + 31 * 16] +pmulhrsw m2, m7 + +palignr m6, m0, 3 +punpcklbw m1, m6 + +pmaddubsw m3, m1, [r5 + 8 * 16] +pmulhrsw m3, m7 + +packuswb m2, m3 +movu [r0 + 1776], m2 + +; mode 32 [row 2] + +movh [r0 + 1936], m2 + +; mode 30 [row 0, 1] + +pmaddubsw m2, m4, [r5 + 13 * 16] +pmulhrsw m2, m7 + +pmaddubsw m3, m4, [r5 + 26 * 16] +pmulhrsw m3, m7 + +packuswb m2, m3 +movu [r0 + 1792], m2 + +; mode 30 [row 2, 3] + +pmaddubsw m2, m5, [r5 + 7 * 16] +pmulhrsw m2, m7 + +pmaddubsw m3, m5, [r5 + 20 * 16] +pmulhrsw m3, m7 + +packuswb m2, m3 +movu [r0 + 1808], m2 + +; mode 33 [row 1] + +movhps [r0 + 1992], m2 + +; mode 30 [row 4, 5] + +pmaddubsw m2, m1, [r5 + 1 * 16] +pmulhrsw m2, m7 + +pmaddubsw m3, m1, [r5 + 14 * 16] +pmulhrsw m3, m7 + +packuswb m2, m3 +movu [r0 + 1824], m2 + +; mode 33 [row 2] + +movhps [r0 + 2000], m2 + +; mode 30 [row 6, 7] + +pmaddubsw m2, m1, [r5 + 27 * 16] +pmulhrsw m2, m7 + +psrldq m0, 4 +punpcklbw m6, m0 + +pmaddubsw m3, m6, [r5 + 8 * 16] +pmulhrsw m3, m7 + +packuswb m2, m3 +movu [r0 + 1840], m2 + +; mode 33 [row 3] + +movhps [r0 + 2008], m2 + +; mode 31 [row 0, 1] + +pmaddubsw m2, m4, [r5 + 17 * 16] +pmulhrsw m2, m7 + +pmaddubsw m3, m5, [r5 + 
2 * 16] +pmulhrsw m3, m7 + +packuswb m2, m3 +movu [r0 + 1856], m2 + +; mode 31 [row 2, 3] + +pmaddubsw m2, m5, [r5 + 19 * 16] +pmulhrsw m2, m7 + +pmaddubsw m3, m1, [r5 + 4 * 16] +pmulhrsw m3, m7 + +packuswb m2, m3 +movu [r0 + 1872], m2 + +; mode 31 [row 4, 5] + +pmaddubsw m2, m1, [r5 + 21 * 16] +pmulhrsw m2, m7 + +pmaddubsw m3, m6, [r5 + 6 * 16] +pmulhrsw m3, m7 + +packuswb m2, m3 +movu [r0 + 1888], m2 + +; mode 31 [row 6, 7] + +pmaddubsw m2, m6, [r5 + 23 * 16] +pmulhrsw m2, m7 + +movu m3, [r1 + 6] +punpcklbw m0, m3 + +pmaddubsw m3, m0, [r5 + 8 * 16] +pmulhrsw m3, m7 + +packuswb m2, m3 +movu [r0 + 1904], m2 + +; mode 32 [row 0, 1] + +pmaddubsw m2, m4, [r5 + 21 * 16] +pmulhrsw m2, m7 + +pmaddubsw m3, m5, [r5 + 10 * 16] +pmulhrsw m3, m7 + +packuswb m2, m3 +movu [r0 + 1920], m2 + +; mode 32 [row 3] + +pmaddubsw m2, m1, [r5 + 20 * 16] +pmulhrsw m2, m7 + +pxor m3, m3 + +packuswb m2, m3 +movh [r0 + 1944], m2 + +; mode 32 [row 4, 5] + +pmaddubsw m2, m6, [r5 + 9 * 16] +pmulhrsw m2, m7 + +pmaddubsw m3, m6, [r5 + 30 * 16] +pmulhrsw m3, m7 + +packuswb m2, m3 +movu [r0 + 1952], m2 + +; mode 33 [row 4, 5] + +pmaddubsw m2, m0, [r5 + 2 * 16] +pmulhrsw m2, m7 + +pmaddubsw m3, m0, [r5 + 28 * 16] +pmulhrsw m3, m7 + +packuswb m2, m3 +movu [r0 + 2016], m2 + +; mode 32 [row 6] + +pmaddubsw m2, m0, [r5 + 19 * 16] +pmulhrsw m2, m7 + +; mode 32 [row 7] + +movu m0, [r1 + 6] +palignr m3, m0, 1 +punpcklbw m0, m3 + +pmaddubsw m3, m0, [r5 + 8 * 16] +pmulhrsw m3, m7 + +packuswb m2, m3 +movu [r0 + 1968], m2 + +; mode 33 [row 6, 7] + +pmaddubsw m2, m0, [r5 + 22 * 16] +pmulhrsw m2, m7 + +movu m0, [r1 + 7] +palignr m3, m0, 1 +punpcklbw m0, m3 + +pmaddubsw m3, m0, [r5 + 16 * 16] +pmulhrsw m3, m7 + +packuswb m2, m3 +movu [r0 + 2032], m2 + +; mode 33 [row 0] + +pmaddubsw m2, m4, [r5 + 26 * 16] +pmulhrsw m2, m7 + +pxor m3, m3 + +packuswb m2, m3 +movh [r0 + 1984], m2 + +; mode 34 [row 0, 1, 2, 3, 4, 5, 6, 7] + +movu m0, [r3 + 2] +palignr m1, m0, 1 +punpcklqdq m2, m0, m1 +movu [r0 + 2048], m2 + +palignr 
m1, m0, 2 +palignr m2, m0, 3 +punpcklqdq m1, m2 +movu [r0 + 2064], m1 + +palignr m1, m0, 4 +palignr m2, m0, 5 +punpcklqdq m1, m2 +movu [r0 + 2080], m1 + +palignr m1, m0, 6 +palignr m2, m0, 7 +punpcklqdq m1, m2 +movu [r0 + 2096], m1 + +RET + +;----------------------------------------------------------------------------- +; void all_angs_pred_16x16(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal all_angs_pred_16x16, 6, 6, 8, dest, above0, left0, above1, left1, bLuma + +movu m0, [r4 + 2] +movu [r0 + 0 * 16], m0 + +movu m1, m0 + +movu m6, [r4 + 18] +palignr m5, m6, m0, 1 +movu [r0 + 1 * 16], m5 + +movu m4, m5 + +palignr m5, m6, m0, 2 +movu [r0 + 2 * 16], m5 +palignr m5, m6, m0, 3 +movu [r0 + 3 * 16], m5 +palignr m5, m6, m0, 4 +movu [r0 + 4 * 16], m5 +palignr m5, m6, m0, 5 +movu [r0 + 5 * 16], m5 +palignr m5, m6, m0, 6 +movu [r0 + 6 * 16], m5 +palignr m5, m6, m0, 7 +movu [r0 + 7 * 16], m5 + +movu m7, m5 + +palignr m5, m6, m0, 8 +movu [r0 + 8 * 16], m5 + +movu m2, m5 + +palignr m5, m6, m0, 9 +movu [r0 + 9 * 16], m5 + +palignr m3, m6, m0, 10 +movu [r0 + 10 * 16], m3 +palignr m3, m6, m0, 11 +movu [r0 + 11 * 16], m3 +palignr m3, m6, m0, 12 +movu [r0 + 12 * 16], m3 + +; mode 3 [row 15] +movu [r0 + (3-2)*16*16 + 15 * 16], m3 + +palignr m3, m6, m0, 13 +movu [r0 + 13 * 16], m3 +palignr m3, m6, m0, 14 +movu [r0 + 14 * 16], m3 +palignr m3, m6, m0, 15 +movu [r0 + 15 * 16], m3 + +; mode 3 [row 0] +lea r5, [ang_table] +movu m3, [pw_1024] +movu m0, [r4 + 1] +punpcklbw m0, m1 + +; mode 17 [row 8 - second half] +pmaddubsw m1, m0, [r5 + 22 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 248 * 16 + 8], m1 +; mode 17 [row 8 - second half] end + +pmaddubsw m1, m0, [r5 + 26 * 16] +pmulhrsw m1, m3 +punpcklbw m7, m2 +pmaddubsw m2, m7, [r5 + 26 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 16 * 16], m1 + +;mode 6 [row 1] +movu [r0 + 65 * 16], m1 + 
+; mode 4 [row 0] +pmaddubsw m1, m0, [r5 + 21 * 16] +pmulhrsw m1, m3 +pmaddubsw m2, m7, [r5 + 21 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 32 * 16], m1 + +; mode 5 [row 0] +pmaddubsw m1, m0, [r5 + 17 * 16] +pmulhrsw m1, m3 +pmaddubsw m2, m7, [r5 + 17 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 48 * 16], m1 + +; mode 6 [row 0] +pmaddubsw m1, m0, [r5 + 13 * 16] +pmulhrsw m1, m3 +pmaddubsw m2, m7, [r5 + 13 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 64 * 16], m1 + +; mode 7 [row 0] +pmaddubsw m1, m0, [r5 + 9 * 16] +pmulhrsw m1, m3 +pmaddubsw m2, m7, [r5 + 9 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 80 * 16], m1 + +; mode 7 [row 1] +pmaddubsw m1, m0, [r5 + 18 * 16] +pmulhrsw m1, m3 +pmaddubsw m2, m7, [r5 + 18 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 81 * 16], m1 + +; mode 7 [row 2] +pmaddubsw m1, m0, [r5 + 27 * 16] +pmulhrsw m1, m3 +pmaddubsw m2, m7, [r5 + 27 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 82 * 16], m1 + +; mode 8 [row 0] +pmaddubsw m1, m0, [r5 + 5 * 16] +pmulhrsw m1, m3 +pmaddubsw m2, m7, [r5 + 5 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 96 * 16], m1 + +; mode 8 [row 1] +pmaddubsw m1, m0, [r5 + 10 * 16] +pmulhrsw m1, m3 +pmaddubsw m2, m7, [r5 + 10 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 97 * 16], m1 + +; mode 8 [row 2] +pmaddubsw m1, m0, [r5 + 15 * 16] +pmulhrsw m1, m3 +pmaddubsw m2, m7, [r5 + 15 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 98 * 16], m1 + +; mode 8 [row 3] +pmaddubsw m1, m0, [r5 + 20 * 16] +pmulhrsw m1, m3 +pmaddubsw m2, m7, [r5 + 20 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 99 * 16], m1 + +; mode 8 [row 4] +pmaddubsw m1, m0, [r5 + 25 * 16] +pmulhrsw m1, m3 +pmaddubsw m2, m7, [r5 + 25 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 100 * 16], m1 + +; mode 8 [row 5] +pmaddubsw m1, m0, [r5 + 30 * 16] +pmulhrsw m1, m3 +pmaddubsw m2, m7, [r5 + 30 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 101 * 16], m1 + +; mode 15 [row 13 - second half] 
+pmaddubsw m1, m0, [r5 + 18 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 221 * 16 + 8], m1 +; mode 15 [row 13 - second half] end + +; mode 15 [row 14 - second half] +pmaddubsw m1, m0, [r5 + 1 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 222 * 16 + 8], m1 +; mode 15 [row 14 - second half] end + +; mode 16 [row 10 - second half] +pmaddubsw m1, m0, [r5 + 25 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 234 * 16 + 8], m1 +; mode 16 [row 10 - second half] end + +; mode 16 [row 11 - second half] +pmaddubsw m1, m0, [r5 + 4 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 235 * 16 + 8], m1 +; mode 16 [row 11 - second half] end + +; mode 3 [row 1] +movu m6, [r5 + 20 * 16] +movu m0, [r4 + 2] +punpcklbw m0, m4 + +; mode 17 [row 7 - second half] +pmaddubsw m1, m0, [r5 + 16 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 247 * 16 + 8], m1 + +; mode 17 [row 7 - second half] end +pmaddubsw m1, m0, m6 +pmulhrsw m1, m3 +movu m2, [r4 + 10] +punpcklbw m2, m5 +pmaddubsw m4, m2, m6 +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 17 * 16], m1 + +;mode 6 [row 3] +movu [r0 + 67 * 16], m1 + +; mode 4 row [row 1] +pmaddubsw m1, m0, [r5 + 10 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 10 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 33 * 16], m1 + +; mode 4 row [row 2] +pmaddubsw m1, m0, [r5 + 31 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 31 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 34 * 16], m1 + +; mode 7 [row 6] +movu [r0 + 86 * 16], m1 + +; mode 5 row [row 1] +pmaddubsw m1, m0, [r5 + 2 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 2 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 49 * 16], m1 + +; mode 5 row [row 2] +pmaddubsw m1, m0, [r5 + 19 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 19 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 50 * 16], m1 + +; mode 6 [row 2] +pmaddubsw m1, m0, [r5 + 7 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 7 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 66 * 16], m1 + +; mode 7 [row 
3] +pmaddubsw m1, m0, [r5 + 4 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 4 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 83 * 16], m1 + +; mode 7 [row 4] +pmaddubsw m1, m0, [r5 + 13 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 13 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 84 * 16], m1 + +; mode 8 [row 8] +movu [r0 + 104 * 16], m1 + +; mode 7 [row 5] +pmaddubsw m1, m0, [r5 + 22 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 22 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 85 * 16], m1 + +; mode 8 [row 6] +pmaddubsw m1, m0, [r5 + 3 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 3 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 102 * 16], m1 + +; mode 8 [row 7] +pmaddubsw m1, m0, [r5 + 8 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 8 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 103 * 16], m1 + +; mode 8 [row 9] +pmaddubsw m1, m0, [r5 + 18 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 18 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 105 * 16], m1 + +; mode 8 [row 10] +pmaddubsw m1, m0, [r5 + 23 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 23 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 106 * 16], m1 + +; mode 8 [row 11] +pmaddubsw m1, m0, [r5 + 28 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 28 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 107 * 16], m1 + +; mode 3 [row 2] +movu m0, [r4 + 3] +movd m1, [r4 + 19] +palignr m1, m0, 1 +punpcklbw m0, m1 + +; mode 17 [row 6 - second half] +pmaddubsw m1, m0, [r5 + 10 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 246 * 16 + 8], m1 +; mode 17 [row 6 - second half] end + +pmaddubsw m1, m0, [r5 + 14 * 16] +pmulhrsw m1, m3 +movu m2, [r4 + 11] +movd m4, [r4 + 27] +palignr m4, m2, 1 +punpcklbw m2, m4 +pmaddubsw m4, m2, [r5 + 14 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 18 * 16], m1 + +; mode 6 [row 5] +movu [r0 + 69 * 16], m1 + +; mode 4 row [row 3] +pmaddubsw m1, m0, [r5 + 20 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 20 * 16] +pmulhrsw m4, 
m3 +packuswb m1, m4 +movu [r0 + 35 * 16], m1 + +; mode 5 row [row 3] +pmaddubsw m1, m0, [r5 + 4 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 4 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 51 * 16], m1 + +; mode 5 row [row 4] +pmaddubsw m1, m0, [r5 + 21 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 21 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 52 * 16], m1 + +; mode 6 [row 4] +pmaddubsw m1, m0, [r5 + 1 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 1 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 68 * 16], m1 + +; mode 6 [row 6] +pmaddubsw m1, m0, [r5 + 27 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 27 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 70 * 16], m1 + +; mode 7 [row 7] +pmaddubsw m1, m0, [r5 + 8 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 8 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 87 * 16], m1 + +; mode 7 [row 8] +pmaddubsw m1, m0, [r5 + 17 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 17 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 88 * 16], m1 + +; mode 7 [row 9] +pmaddubsw m1, m0, [r5 + 26 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 26 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 89 * 16], m1 + +; mode 8 [row 12] +pmaddubsw m1, m0, [r5 + 1 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 1 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 108 * 16], m1 + +; mode 8 [row 13] +pmaddubsw m1, m0, [r5 + 6 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 6 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 109 * 16], m1 + +; mode 8 [row 14] +pmaddubsw m1, m0, [r5 + 11 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 11 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 110 * 16], m1 + +; mode 8 [row 15] +pmaddubsw m1, m0, [r5 + 16 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 16 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 111 * 16], m1 + +; mode 3 [row 3] +movu m0, [r4 + 4] +movd m1, [r4 + 20] +palignr m1, m0, 1 +punpcklbw m0, m1 + +; mode 17 [row 4 - second half] +pmaddubsw 
m1, m0, [r5 + 30 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 244 * 16 + 8], m1 +; mode 17 [row 4 - second half] end + +; mode 17 [row 5 - second half] +pmaddubsw m1, m0, [r5 + 4 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 245 * 16 + 8], m1 +; mode 17 [row 5 - second half] end + +pmaddubsw m1, m0, [r5 + 8 * 16] +pmulhrsw m1, m3 +movu m2, [r4 + 12] +movd m4, [r4 + 28] +palignr m4, m2, 1 +punpcklbw m2, m4 +pmaddubsw m4, m2, [r5 + 8 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 19 * 16], m1 + +; mode 6 [row 7] +movu [r0 + 71 * 16], m1 + +; mode 4 row [row 4] +pmaddubsw m1, m0, [r5 + 9 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 9 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 36 * 16], m1 + +; mode 4 row [row 5] +pmaddubsw m1, m0, [r5 + 30 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 30 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 37 * 16], m1 + +; mode 7 row [row 13] +movu [r0 + 93 * 16], m1 + +; mode 5 row [row 5] +pmaddubsw m1, m0, [r5 + 6 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 6 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 53 * 16], m1 + +; mode 5 row [row 6] +pmaddubsw m1, m0, [r5 + 23 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 23 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 54 * 16], m1 + +; mode 6 [row 8] +pmaddubsw m1, m0, [r5 + 21 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 21 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 72 * 16], m1 + +; mode 7 [row 12] +movu [r0 + 92 * 16], m1 + +; mode 7 [row 10] +pmaddubsw m1, m0, [r5 + 3 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 3 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 90 * 16], m1 + +; mode 7 [row 11] +pmaddubsw m1, m0, [r5 + 12 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 12 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 91 * 16], m1 + +; mode 3 [row 4] +movu m0, [r4 + 5] +movd m1, [r4 + 20] +palignr m1, m0, 1 +punpcklbw m0, m1 + +; mode 17 [row 3 - second half] +pmaddubsw m1, m0, [r5 + 24 * 16] +pmulhrsw m1, m3 
+packuswb m1, m1 +movh [r0 + 243 * 16 + 8], m1 + +; mode 17 [row 3 - second half] end +pmaddubsw m1, m0, [r5 + 2 * 16] +pmulhrsw m1, m3 +movu m2, [r4 + 13] +movd m4, [r4 + 29] +palignr m4, m2, 1 +punpcklbw m2, m4 +pmaddubsw m4, m2, [r5 + 2 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 20 * 16], m1 + +;mode 6 [row 9] +movu [r0 + 73 * 16], m1 + +; mode 4 row [row 6] +movu m6, [r5 + 19 * 16] +pmaddubsw m1, m0, m6 +pmulhrsw m1, m3 +pmaddubsw m4, m2, m6 +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 38 * 16], m1 + +; mode 3 [row 5] +pmaddubsw m1, m0, [r5 + 28 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 28 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 21 * 16], m1 + +;mode 6 [row 11] +movu [r0 + 75 * 16], m1 + +; mode 5 row [row 7] +pmaddubsw m1, m0, [r5 + 8 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 8 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 55 * 16], m1 + +; mode 5 row [row 8] +pmaddubsw m1, m0, [r5 + 25 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 25 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 56 * 16], m1 + +; mode 6 [row 10] +pmaddubsw m1, m0, [r5 + 15 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 15 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 74 * 16], m1 + +; mode 7 [row 14] +pmaddubsw m1, m0, [r5 + 7 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 7 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 94 * 16], m1 + +; mode 7 [row 15] +pmaddubsw m1, m0, [r5 + 16 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 16 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 95 * 16], m1 + +; mode 3 [row 6] +movu m0, [r4 + 6] +movd m1, [r4 + 22] +palignr m1, m0, 1 +punpcklbw m0, m1 + +; mode 17 [row 2 - second half] +pmaddubsw m1, m0, [r5 + 18 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 242 * 16 + 8], m1 +; mode 17 [row 2 - second half] end + +pmaddubsw m1, m0, [r5 + 22 * 16] +pmulhrsw m1, m3 +movu m2, [r4 + 14] +movd m4, [r4 + 30] +palignr m4, m2, 1 +punpcklbw m2, m4 +pmaddubsw m4, m2, [r5 + 22 * 16] +pmulhrsw m4, 
m3 +packuswb m1, m4 +movu [r0 + 22 * 16], m1 + +; mode 6 [row 13] +movu [r0 + 77 * 16], m1 + +; mode 4 row [row 7] +pmaddubsw m1, m0, [r5 + 8 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 8 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 39 * 16], m1 + +; mode 4 row [row 8] +pmaddubsw m1, m0, [r5 + 29 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 29 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 40 * 16], m1 + +; mode 5 row [row 9] +pmaddubsw m1, m0, [r5 + 10 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 10 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 57 * 16], m1 + +; mode 5 row [row 10] +pmaddubsw m1, m0, [r5 + 27 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 27 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 58 * 16], m1 + +; mode 6 [row 12] +pmaddubsw m1, m0, [r5 + 9 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 9 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 76 * 16], m1 + +; mode 3 [row 7] +movu m0, [r4 + 7] +movd m1, [r4 + 27] +palignr m1, m0, 1 +punpcklbw m0, m1 + +; mode 17 [row 1 - second half] +pmaddubsw m1, m0, [r5 + 12 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 241 * 16 + 8], m1 +; mode 17 [row 1 - second half] end + +pmaddubsw m1, m0, [r5 + 16 * 16] +pmulhrsw m1, m3 +movu m2, [r4 + 15] +movd m4, [r4 + 25] +palignr m4, m2, 1 +punpcklbw m2, m4 +pmaddubsw m4, m2, [r5 + 16 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 23 * 16], m1 + +; mode 6 [row 15] +movu [r0 + 79 * 16], m1 + +; mode 4 row [row 9] +pmaddubsw m1, m0, [r5 + 18 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 18 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 41 * 16], m1 + +; mode 5 row [row 11] +pmaddubsw m1, m0, [r5 + 12 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 12 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 59 * 16], m1 + +; mode 5 row [row 12] +pmaddubsw m1, m0, [r5 + 29 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 29 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 60 * 16], m1 + +; mode 6 [row 14] +pmaddubsw 
m1, m0, [r5 + 3 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 3 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 78 * 16], m1 + +; mode 3 [row 8] +movu m0, [r4 + 8] +movd m1, [r4 + 24] +palignr m1, m0, 1 +punpcklbw m0, m1 +pmaddubsw m1, m0, [r5 + 10 * 16] +pmulhrsw m1, m3 +movu m2, [r4 + 16] +psrldq m4, m2, 1 +pinsrb m4, [r4 + 32], 15 +punpcklbw m2, m4 +pmaddubsw m4, m2, [r5 + 10 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 24 * 16], m1 + +; mode 4 row [row 10] +pmaddubsw m1, m0, [r5 + 7 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 7 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 42 * 16], m1 + +; mode 4 row [row 11] +pmaddubsw m1, m0, [r5 + 28 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 28 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 43 * 16], m1 + +; mode 5 row [row 13] +pmaddubsw m1, m0, [r5 + 14 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 14 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 61 * 16], m1 + +; mode 5 row [row 14] +pmaddubsw m1, m0, [r5 + 31 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 31 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 62 * 16], m1 + +; mode 3 [row 9] +movu m0, [r4 + 9] +movd m1, [r4 + 16] +palignr m1, m0, 1 +punpcklbw m0, m1 +pmaddubsw m1, m0, [r5 + 4 * 16] +pmulhrsw m1, m3 +movu m2, [r4 + 17] +movd m4, [r4 + 33] +palignr m4, m2, 1 +punpcklbw m2, m4 +pmaddubsw m4, m2, [r5 + 4 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 25 * 16], m1 + +; mode 4 row [row 12] +pmaddubsw m1, m0, [r5 + 17 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 17 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 44 * 16], m1 + +; mode 3 [row 10] +pmaddubsw m1, m0, [r5 + 30 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 30 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 26 * 16], m1 + +; mode 5 row [row 15] +pmaddubsw m1, m0, [r5 + 16 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 16 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 63 * 16], m1 + +; mode 3 [row 11] +movu m0, [r4 + 10] +movd m1, [r4 
+ 26] +palignr m1, m0, 1 +punpcklbw m0, m1 +pmaddubsw m1, m0, [r5 + 24 * 16] +pmulhrsw m1, m3 +movu m2, [r4 + 18] +movd m4, [r4 + 34] +palignr m4, m2, 1 +punpcklbw m2, m4 +pmaddubsw m4, m2, [r5 + 24 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 27 * 16], m1 + +; mode 4 row [row 13] +pmaddubsw m1, m0, [r5 + 6 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 6 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 45 * 16], m1 + +; mode 4 row [row 14] +pmaddubsw m1, m0, [r5 + 27 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 27 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 46 * 16], m1 + +; mode 3 [row 12] +movu m0, [r4 + 11] +movd m1, [r4 + 27] +palignr m1, m0, 1 +punpcklbw m0, m1 +pmaddubsw m1, m0, [r5 + 18 * 16] +pmulhrsw m1, m3 +movu m2, [r4 + 19] +movd m4, [r4 + 35] +palignr m4, m2, 1 +punpcklbw m2, m4 +pmaddubsw m4, m2, [r5 + 18 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 28 * 16], m1 + +; mode 4 row [row 15] +pmaddubsw m1, m0, [r5 + 16 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m2, [r5 + 16 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 47 * 16], m1 + +; mode 3 [row 13] +movu m0, [r4 + 12] +movd m1, [r4 + 28] +palignr m1, m0, 1 +punpcklbw m0, m1 +pmaddubsw m1, m0, [r5 + 12 * 16] +pmulhrsw m1, m3 +movu m2, [r4 + 20] +movd m4, [r4 + 36] +palignr m4, m2, 1 +punpcklbw m2, m4 +pmaddubsw m4, m2, [r5 + 12 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 29 * 16], m1 + +; mode 3 [row 14] +movu m0, [r4 + 13] +movd m1, [r4 + 29] +palignr m1, m0, 1 +punpcklbw m0, m1 +pmaddubsw m1, m0, [r5 + 6 * 16] +pmulhrsw m1, m3 +movu m2, [r4 + 21] +movd m4, [r4 + 37] +palignr m4, m2, 1 +punpcklbw m2, m4 +pmaddubsw m4, m2, [r5 + 6 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 30 * 16], m1 + +; mode 9 +movu m0, [r2 + 1] +movd m1, [r2 + 17] +palignr m1, m0, 1 + +; mode 9 [row 15] +movu [r0 + 127 * 16], m1 + +; mode 9 [row 0] +punpcklbw m0, m1 +pmaddubsw m1, m0, [r5 + 2 * 16] +pmulhrsw m1, m3 +movu m7, [r2 + 9] +movd m4, [r4 + 25] +palignr m2, m7, 1 +punpcklbw 
m7, m2 +pmaddubsw m2, m7, [r5 + 2 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 112 * 16], m1 + +; mode 9 [row 1] +pmaddubsw m1, m0, [r5 + 4 * 16] +pmulhrsw m1, m3 +pmaddubsw m2, m7, [r5 + 4 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 113 * 16], m1 + +; mode 9 [row 2] +pmaddubsw m1, m0, [r5 + 6 * 16] +pmulhrsw m1, m3 +pmaddubsw m2, m7, [r5 + 6 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 114 * 16], m1 + +; mode 9 [row 3] +pmaddubsw m1, m0, [r5 + 8 * 16] +pmulhrsw m1, m3 +pmaddubsw m2, m7, [r5 + 8 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 115 * 16], m1 + +; mode 9 [row 4] +pmaddubsw m1, m0, [r5 + 10 * 16] +pmulhrsw m1, m3 +pmaddubsw m2, m7, [r5 + 10 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 116 * 16], m1 + +; mode 9 [row 5] +pmaddubsw m1, m0, [r5 + 12 * 16] +pmulhrsw m1, m3 +pmaddubsw m2, m7, [r5 + 12 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 117 * 16], m1 + +; mode 9 [row 6] +pmaddubsw m1, m0, [r5 + 14 * 16] +pmulhrsw m1, m3 +pmaddubsw m2, m7, [r5 + 14 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 118 * 16], m1 + +; mode 9 [row 7] +pmaddubsw m1, m0, [r5 + 16 * 16] +pmulhrsw m1, m3 +pmaddubsw m2, m7, [r5 + 16 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 119 * 16], m1 + +; mode 9 [row 8] +pmaddubsw m1, m0, [r5 + 18 * 16] +pmulhrsw m1, m3 +pmaddubsw m2, m7, [r5 + 18 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 120 * 16], m1 + +; mode 9 [row 9] +pmaddubsw m1, m0, [r5 + 20 * 16] +pmulhrsw m1, m3 +pmaddubsw m2, m7, [r5 + 20 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 121 * 16], m1 + +; mode 9 [row 10] +pmaddubsw m1, m0, [r5 + 22 * 16] +pmulhrsw m1, m3 +pmaddubsw m2, m7, [r5 + 22 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 122 * 16], m1 + +; mode 9 [row 11] +pmaddubsw m1, m0, [r5 + 24 * 16] +pmulhrsw m1, m3 +pmaddubsw m2, m7, [r5 + 24 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 123 * 16], m1 + +; mode 9 [row 12] +pmaddubsw m1, m0, [r5 + 26 * 16] +pmulhrsw m1, m3 +pmaddubsw m2, 
m7, [r5 + 26 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 124 * 16], m1 + +; mode 9 [row 13] +pmaddubsw m1, m0, [r5 + 28 * 16] +pmulhrsw m1, m3 +pmaddubsw m2, m7, [r5 + 28 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 125 * 16], m1 + +; mode 9 [row 14] +pmaddubsw m1, m0, [r5 + 30 * 16] +pmulhrsw m1, m3 +pmaddubsw m2, m7, [r5 + 30 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 126 * 16], m1 + +; mode 10 +movu m1, [r2 + 1] +movu [r0 + 128 * 16], m1 +movu [r0 + 129 * 16], m1 +movu [r0 + 130 * 16], m1 +movu [r0 + 131 * 16], m1 +movu [r0 + 132 * 16], m1 +movu [r0 + 133 * 16], m1 +movu [r0 + 134 * 16], m1 +movu [r0 + 135 * 16], m1 +movu [r0 + 136 * 16], m1 +movu [r0 + 137 * 16], m1 +movu [r0 + 138 * 16], m1 +movu [r0 + 139 * 16], m1 +movu [r0 + 140 * 16], m1 +movu [r0 + 141 * 16], m1 +movu [r0 + 142 * 16], m1 +movu [r0 + 143 * 16], m1 + +pxor m0, m0 +pshufb m1, m1, m0 +punpcklbw m1, m0 +movu m2, [r1] +pshufb m2, m2, m0 +punpcklbw m2, m0 +movu m4, [r1 + 1] +punpcklbw m5, m4, m0 +punpckhbw m4, m0 +psubw m5, m2 +psubw m4, m2 +psraw m5, 1 +psraw m4, 1 +paddw m5, m1 +paddw m4, m1 +packuswb m5, m4 + +pextrb [r0 + 128 * 16], m5, 0 +pextrb [r0 + 129 * 16], m5, 1 +pextrb [r0 + 130 * 16], m5, 2 +pextrb [r0 + 131 * 16], m5, 3 +pextrb [r0 + 132 * 16], m5, 4 +pextrb [r0 + 133 * 16], m5, 5 +pextrb [r0 + 134 * 16], m5, 6 +pextrb [r0 + 135 * 16], m5, 7 +pextrb [r0 + 136 * 16], m5, 8 +pextrb [r0 + 137 * 16], m5, 9 +pextrb [r0 + 138 * 16], m5, 10 +pextrb [r0 + 139 * 16], m5, 11 +pextrb [r0 + 140 * 16], m5, 12 +pextrb [r0 + 141 * 16], m5, 13 +pextrb [r0 + 142 * 16], m5, 14 +pextrb [r0 + 143 * 16], m5, 15 + +; mode 11 +movu m0, [r2] + +; mode 11 [row 15] +movu [r0 + 159 * 16], m0 + +; mode 11 [row 0] +movu m1, [r2 + 1] +punpcklbw m0, m1 +pmaddubsw m1, m0, [r5 + 30 * 16] +pmulhrsw m1, m3 +movu m7, [r2 + 8] +movu m2, [r2 + 9] +punpcklbw m7, m2 +pmaddubsw m2, m7, [r5 + 30 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 144 * 16], m1 + +; mode 11 [row 1] +pmaddubsw m1, m0, 
[r5 + 28 * 16] +pmulhrsw m1, m3 +pmaddubsw m2, m7, [r5 + 28 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 145 * 16], m1 + +; mode 11 [row 2] +pmaddubsw m1, m0, [r5 + 26 * 16] +pmulhrsw m1, m3 +pmaddubsw m2, m7, [r5 + 26 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 146 * 16], m1 + +; mode 11 [row 3] +pmaddubsw m1, m0, [r5 + 24 * 16] +pmulhrsw m1, m3 +pmaddubsw m2, m7, [r5 + 24 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 147 * 16], m1 + +; mode 11 [row 4] +pmaddubsw m1, m0, [r5 + 22 * 16] +pmulhrsw m1, m3 +pmaddubsw m2, m7, [r5 + 22 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 148 * 16], m1 + +; mode 11 [row 5] +pmaddubsw m1, m0, [r5 + 20 * 16] +pmulhrsw m1, m3 +pmaddubsw m2, m7, [r5 + 20 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 149 * 16], m1 + +; mode 11 [row 6] +pmaddubsw m1, m0, [r5 + 18 * 16] +pmulhrsw m1, m3 +pmaddubsw m2, m7, [r5 + 18 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 150 * 16], m1 + +; mode 11 [row 7] +pmaddubsw m1, m0, [r5 + 16 * 16] +pmulhrsw m1, m3 +pmaddubsw m2, m7, [r5 + 16 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 151 * 16], m1 + +; mode 11 [row 8] +pmaddubsw m1, m0, [r5 + 14 * 16] +pmulhrsw m1, m3 +pmaddubsw m2, m7, [r5 + 14 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 152 * 16], m1 + +; mode 11 [row 9] +pmaddubsw m1, m0, [r5 + 12 * 16] +pmulhrsw m1, m3 +pmaddubsw m2, m7, [r5 + 12 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 153 * 16], m1 + +; mode 11 [row 10] +pmaddubsw m1, m0, [r5 + 10 * 16] +pmulhrsw m1, m3 +pmaddubsw m2, m7, [r5 + 10 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 154 * 16], m1 + +; mode 11 [row 11] +pmaddubsw m1, m0, [r5 + 8 * 16] +pmulhrsw m1, m3 +pmaddubsw m2, m7, [r5 + 8 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 155 * 16], m1 + +; mode 11 [row 12] +pmaddubsw m1, m0, [r5 + 6 * 16] +pmulhrsw m1, m3 +pmaddubsw m2, m7, [r5 + 6 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 156 * 16], m1 + +; mode 11 [row 13] +pmaddubsw m1, m0, [r5 + 4 
* 16] +pmulhrsw m1, m3 +pmaddubsw m2, m7, [r5 + 4 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 157 * 16], m1 + +; mode 11 [row 14] +pmaddubsw m1, m0, [r5 + 2 * 16] +pmulhrsw m1, m3 +pmaddubsw m2, m7, [r5 + 2 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 158 * 16], m1 + +; mode 12 [row 0] +movu m0, [r4] +movu m1, [r4 + 1] +punpcklbw m0, m1 +pmaddubsw m1, m0, [r5 + 27 * 16] +pmulhrsw m1, m3 +movu m7, [r4 + 8] +movd m2, [r4 + 24] +palignr m2, m7, 1 +punpcklbw m7, m2 +pmaddubsw m2, m7, [r5 + 27 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 160 * 16], m1 + +; mode 12 [row 1] +pmaddubsw m1, m0, [r5 + 22 * 16] +pmulhrsw m1, m3 +pmaddubsw m2, m7, [r5 + 22 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 161 * 16], m1 + +; mode 12 [row 2] +pmaddubsw m1, m0, [r5 + 17 * 16] +pmulhrsw m1, m3 +pmaddubsw m2, m7, [r5 + 17 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 162 * 16], m1 + +; mode 12 [row 3] +pmaddubsw m1, m0, [r5 + 12 * 16] +pmulhrsw m1, m3 +pmaddubsw m2, m7, [r5 + 12 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 163 * 16], m1 + +; mode 12 [row 4] +pmaddubsw m1, m0, [r5 + 7 * 16] +pmulhrsw m1, m3 +pmaddubsw m2, m7, [r5 + 7 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 164 * 16], m1 + +; mode 12 [row 5] +pmaddubsw m1, m0, [r5 + 2 * 16] +pmulhrsw m1, m3 +pmaddubsw m2, m7, [r5 + 2 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 165 * 16], m1 + +; mode 13 [row 0] +pmaddubsw m1, m0, [r5 + 23 * 16] +pmulhrsw m1, m3 +pmaddubsw m2, m7, [r5 + 23 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 176 * 16], m1 + +; mode 13 [row 1] +pmaddubsw m1, m0, [r5 + 14 * 16] +pmulhrsw m1, m3 +pmaddubsw m2, m7, [r5 + 14 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 177 * 16], m1 + +; mode 13 [row 2] +pmaddubsw m1, m0, [r5 + 5 * 16] +pmulhrsw m1, m3 +pmaddubsw m2, m7, [r5 + 5 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 178 * 16], m1 + +; mode 14 [row 0] +pmaddubsw m1, m0, [r5 + 19 * 16] +pmulhrsw m1, m3 +pmaddubsw m2, m7, [r5 + 19 * 
16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 192 * 16], m1 + +; mode 14 [row 1] +pmaddubsw m1, m0, [r5 + 6 * 16] +pmulhrsw m1, m3 +pmaddubsw m2, m7, [r5 + 6 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 193 * 16], m1 + +; mode 17 [row 0] +movu [r0 + 240 * 16], m1 + +; mode 15 [row 0] +pmaddubsw m1, m0, [r5 + 15 * 16] +pmulhrsw m1, m3 +pmaddubsw m2, m7, [r5 + 15 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 208 * 16], m1 + +; mode 15 [row 15 - second half] +pmaddubsw m1, m0, [r5 + 16 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 223 * 16 + 8], m1 +; mode 15 [row 15 - second half] end + +; mode 16 [row 0] +pmaddubsw m1, m0, [r5 + 11 * 16] +pmulhrsw m1, m3 +pmaddubsw m2, m7, [r5 + 11 * 16] +pmulhrsw m2, m3 +packuswb m1, m2 +movu [r0 + 224 * 16], m1 + +; mode 17 [row 9 - second half] +pmaddubsw m1, m0, [r5 + 28 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 249 * 16 + 8], m1 +; mode 17 [row 9 - second half] end + +; mode 17 [row 10 - second half] +pmaddubsw m1, m0, [r5 + 2 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 250 * 16 + 8], m1 +; mode 17 [row 10 - second half] end + +; mode 17 [row 1 - first half] +pslldq m6, m0, 2 +pinsrb m6, [r3 + 0], 1 +pinsrb m6, [r3 + 1], 0 +pmaddubsw m1, m6, [r5 + 12 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 241 * 16], m1 + +; mode 17 [row 11 - second half] +pmaddubsw m1, m6, [r5 + 8 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 251 * 16 + 8], m1 +; mode 17 [row 11 - second half] end + +; mode 17 [row 2 - first half] +pslldq m6, 2 +pinsrb m6, [r3 + 1], 1 +pinsrb m6, [r3 + 2], 0 +pmaddubsw m1, m6, [r5 + 18 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 242 * 16], m1 + +; mode 17 [row 12 - second half] +pmaddubsw m1, m6, [r5 + 14 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 252 * 16 + 8], m1 +; mode 17 [row 12 - second half] end + +; mode 17 [row 3 - first half] +pslldq m6, 2 +pinsrb m6, [r3 + 2], 1 +pinsrb m6, [r3 + 4], 0 +pmaddubsw m1, m6, [r5 + 24 * 16] +pmulhrsw m1, m3 
+packuswb m1, m1 +movh [r0 + 243 * 16], m1 + +; mode 17 [row 13 - first half] +pmaddubsw m1, m6, [r5 + 20 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 253 * 16 + 8], m1 + +; mode 17 [row 4 - first half] +pslldq m6, 2 +pinsrb m6, [r3 + 4], 1 +pinsrb m6, [r3 + 5], 0 +pmaddubsw m1, m6, [r5 + 30 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 244 * 16], m1 + +; mode 17 [row 5 - first half] +pmaddubsw m1, m6, [r5 + 4 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 245 * 16], m1 + +; mode 17 [row 14 - second half] +pmaddubsw m1, m6, [r5 + 26 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 254 * 16 + 8], m1 +; mode 17 [row 14 - second half] end + +; mode 17 [row 6 - first half] +pslldq m6, 2 +pinsrb m6, [r3 + 5], 1 +pinsrb m6, [r3 + 6], 0 +pmaddubsw m1, m6, [r5 + 10 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 246 * 16], m1 + +; mode 17 [row 7 - first half] +pslldq m6, 2 +pinsrb m6, [r3 + 6], 1 +pinsrb m6, [r3 + 7], 0 +pmaddubsw m1, m6, [r5 + 16 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 247 * 16], m1 + +; mode 17 [row 8 - first half] +pslldq m6, 2 +pinsrb m6, [r3 + 7], 1 +pinsrb m6, [r3 + 9], 0 +pmaddubsw m1, m6, [r5 + 22 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 248 * 16], m1 + +; mode 17 [row 9 - first half] +pslldq m6, 2 +pinsrb m6, [r3 + 9], 1 +pinsrb m6, [r3 + 10], 0 +pmaddubsw m1, m6, [r5 + 28 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 249 * 16], m1 + +; mode 17 [row 10 - first half] +pmaddubsw m1, m6, [r5 + 2 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 250 * 16], m1 + +; mode 17 [row 11 - first half] +pslldq m6, 2 +pinsrb m6, [r3 + 10], 1 +pinsrb m6, [r3 + 11], 0 +pmaddubsw m1, m6, [r5 + 8 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 251 * 16], m1 + +; mode 17 [row 12 - first half] +pslldq m6, 2 +pinsrb m6, [r3 + 11], 1 +pinsrb m6, [r3 + 12], 0 +pmaddubsw m1, m6, [r5 + 14 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 252 * 16], m1 + +; mode 17 [row 13 - first half] +pslldq m6, 2 +pinsrb m6, 
[r3 + 12], 1 +pinsrb m6, [r3 + 14], 0 +pmaddubsw m1, m6, [r5 + 20 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 253 * 16], m1 + +; mode 17 [row 14 - first half] +pslldq m6, 2 +pinsrb m6, [r3 + 14], 1 +pinsrb m6, [r3 + 15], 0 +pmaddubsw m1, m6, [r5 + 26 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 254 * 16], m1 + +; mode 16 [row 12 - second half] +pmaddubsw m1, m0, [r5 + 15 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 236 * 16 + 8], m1 +; mode 16 [row 12 - second half] + +; mode 12 [row 6] +pslldq m2, m0, 2 +pinsrb m2, [r3 + 0], 1 +pinsrb m2, [r3 + 6], 0 +pmaddubsw m1, m2, [r5 + 29 * 16] +pmulhrsw m1, m3 +movu m0, [r4 + 7] +psrldq m4, m0, 1 +punpcklbw m0, m4 +pmaddubsw m4, m0, [r5 + 29 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 166 * 16], m1 + +; mode 12 [row 7] +pmaddubsw m1, m2, [r5 + 24 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m0, [r5 + 24 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 167 * 16], m1 + +; mode 12 [row 8] +pmaddubsw m1, m2, [r5 + 19 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m0, [r5 + 19 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 168 * 16], m1 + +; mode 12 [row 9] +pmaddubsw m1, m2, [r5 + 14 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m0, [r5 + 14 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 169 * 16], m1 + +; mode 12 [row 10] +pmaddubsw m1, m2, [r5 + 9 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m0, [r5 + 9 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 170 * 16], m1 + +; mode 12 [row 11] +pmaddubsw m1, m2, [r5 + 4 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m0, [r5 + 4 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 171 * 16], m1 + +; mode 13 [row 3] +pinsrb m7, m2, [r3 + 4], 0 +pmaddubsw m1, m7, [r5 + 28 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m0, [r5 + 28 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 179 * 16], m1 + +; mode 13 [row 4] +pmaddubsw m1, m7, [r5 + 19 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m0, [r5 + 19 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 180 * 16], m1 + +; mode 13 [row 5] +pmaddubsw 
m1, m7, [r5 + 10 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m0, [r5 + 10 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 181 * 16], m1 + +; mode 13 [row 6] +pmaddubsw m1, m7, [r5 + 1 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m0, [r5 + 1 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 182 * 16], m1 + +; mode 14 [row 2] +pinsrb m5, m7, [r3 + 2], 0 +pmaddubsw m1, m5, [r5 + 25 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m0, [r5 + 25 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 194 * 16], m1 + +; mode 14 [row 3] +pmaddubsw m1, m5, [r5 + 12 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m0, [r5 + 12 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 195 * 16], m1 + +; mode 15 [row 1] +pmaddubsw m1, m5, [r5 + 30 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m0, [r5 + 30 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 209 * 16], m1 + +; mode 15 [row 2] +pmaddubsw m1, m5, [r5 + 13 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m0, [r5 + 13 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 210 * 16], m1 + +; mode 16 [row 1] +pmaddubsw m1, m5, [r5 + 22 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m0, [r5 + 22 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 225 * 16], m1 + +; mode 16 [row 2] +pmaddubsw m1, m5, [r5 + 1 * 16] +pmulhrsw m1, m3 +pmaddubsw m4, m0, [r5 + 1 * 16] +pmulhrsw m4, m3 +packuswb m1, m4 +movu [r0 + 226 * 16], m1 + +; mode 16 [row 13 - second half] +pmaddubsw m1, m5, [r5 + 26 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 237 * 16 + 8], m1 +; mode 16 [row 13 - second half] + +; mode 16 [row 14 - second half] +pmaddubsw m1, m5, [r5 + 5 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 238 * 16 + 8], m1 +; mode 16 [row 14 - second half] + +; mode 16 [row 3] +pslldq m6, m5, 2 +pinsrb m6, [r3 + 2], 1 +pinsrb m6, [r3 + 3], 0 +pmaddubsw m1, m6, [r5 + 12 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 227 * 16], m1 + +; mode 16 [row 15 - second half] +pmaddubsw m1, m6, [r5 + 16 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 239 * 16 + 8], m1 +; mode 16 [row 15 - second 
half] end + +; mode 16 [row 4- first half] +pslldq m6, 2 +pinsrb m6, [r3 + 3], 1 +pinsrb m6, [r3 + 5], 0 +pmaddubsw m1, m6, [r5 + 23 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 228 * 16], m1 + +; mode 16 [row 5- first half] +pmaddubsw m1, m6, [r5 + 2 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 229 * 16], m1 + +; mode 16 [row 6- first half] +pslldq m6, 2 +pinsrb m6, [r3 + 5], 1 +pinsrb m6, [r3 + 6], 0 +pmaddubsw m1, m6, [r5 + 13 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 230 * 16], m1 + +; mode 16 [row 7- first half] +pslldq m6, 2 +pinsrb m6, [r3 + 6], 1 +pinsrb m6, [r3 + 8], 0 +pmaddubsw m1, m6, [r5 + 24 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 231 * 16], m1 + +; mode 16 [row 8- first half] +pmaddubsw m1, m6, [r5 + 3 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 232 * 16], m1 +; mode 19 [row 0 - second half] end + +; mode 16 [row 9- first half] +pslldq m6, 2 +pinsrb m6, [r3 + 8], 1 +pinsrb m6, [r3 + 9], 0 +pmaddubsw m1, m6, [r5 + 14 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 233 * 16], m1 + +; mode 16 [row 10 - first half] +pslldq m6, 2 +pinsrb m6, [r3 + 9], 1 +pinsrb m6, [r3 + 11], 0 +pmaddubsw m1, m6, [r5 + 25 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 234 * 16], m1 + +; mode 16 [row 11 - first half] +pmaddubsw m1, m6, [r5 + 4 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 235 * 16], m1 + +; mode 16 [row 12 - first half] +pslldq m6, 2 +pinsrb m6, [r3 + 11], 1 +pinsrb m6, [r3 + 12], 0 +pmaddubsw m1, m6, [r5 + 15 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 236 * 16], m1 + +; mode 16 [row 13 - first half] +pslldq m6, 2 +pinsrb m6, [r3 + 12], 1 +pinsrb m6, [r3 + 14], 0 +pmaddubsw m1, m6, [r5 + 26 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 237 * 16], m1 + +; mode 16 [row 14 - first half] +pmaddubsw m1, m6, [r5 + 5 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 238 * 16], m1 + +; mode 16 [row 15 - first half] +pslldq m6, 2 +pinsrb m6, [r3 + 14], 1 +pinsrb m6, [r3 + 15], 0 +pmaddubsw 
m1, m6, [r5 + 16 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 239 * 16], m1 + +; mode 14 [row 4] +pslldq m5, 2 +pinsrb m5, [r3 + 2], 1 +pinsrb m5, [r3 + 5], 0 +movu m4, [r4 + 6] +psrldq m0, m4, 1 +punpcklbw m4, m0 + +; mode 16 [row 3 - second half] +pmaddubsw m1, m4, [r5 + 12 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 227 * 16 + 8], m1 + +; mode 16 [row 3 - second half] end +pmaddubsw m1, m5, [r5 + 31 * 16] +pmulhrsw m1, m3 +pmaddubsw m0, m4, [r5 + 31 * 16] +pmulhrsw m0, m3 +packuswb m1, m0 +movu [r0 + 196 * 16], m1 + +; mode 14 [row 5] +pmaddubsw m1, m5, [r5 + 18 * 16] +pmulhrsw m1, m3 +pmaddubsw m0, m4, [r5 + 18 * 16] +pmulhrsw m0, m3 +packuswb m1, m0 +movu [r0 + 197 * 16], m1 + +; mode 14 [row 6] +pmaddubsw m1, m5, [r5 + 5 * 16] +pmulhrsw m1, m3 +pmaddubsw m0, m4, [r5 + 5 * 16] +pmulhrsw m0, m3 +packuswb m1, m0 +movu [r0 + 198 * 16], m1 + +; mode 15 [row 3] +movu m6, m5 +pinsrb m6, [r3 + 4], 0 +pmaddubsw m1, m6, [r5 + 28 * 16] +pmulhrsw m1, m3 +pmaddubsw m0, m4, [r5 + 28 * 16] +pmulhrsw m0, m3 +packuswb m1, m0 +movu [r0 + 211 * 16], m1 + +; mode 15 [row 4] +pmaddubsw m1, m6, [r5 + 11 * 16] +pmulhrsw m1, m3 +pmaddubsw m0, m4, [r5 + 11 * 16] +pmulhrsw m0, m3 +packuswb m1, m0 +movu [r0 + 212 * 16], m1 + +; mode 15 [row 5 - first half] +pslldq m6, 2 +pinsrb m6, [r3 + 4], 1 +pinsrb m6, [r3 + 6], 0 +pmaddubsw m1, m6, [r5 + 26 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 213 * 16], m1 + +; mode 15 [row 6 - first half] +pmaddubsw m1, m6, [r5 + 9 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 214 * 16], m1 + +; mode 15 [row 7 - first half] +pslldq m6, 2 +pinsrb m6, [r3 + 6], 1 +pinsrb m6, [r3 + 8], 0 +pmaddubsw m1, m6, [r5 + 24 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 215 * 16], m1 + +; mode 15 [row 8 - first half] +pmaddubsw m1, m6, [r5 + 7 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 216 * 16], m1 + +; mode 15 [row 9 - first half] +pslldq m6, 2 +pinsrb m6, [r3 + 8], 1 +pinsrb m6, [r3 + 9], 0 +pmaddubsw m1, m6, [r5 + 22 * 
16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 217 * 16], m1 + +; mode 15 [row 10 - first half] +pmaddubsw m1, m6, [r5 + 5 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 218 * 16], m1 + +; mode 15 [row 11 - first half] +pslldq m6, 2 +pinsrb m6, [r3 + 9], 1 +pinsrb m6, [r3 + 11], 0 +pmaddubsw m1, m6, [r5 + 20 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 219 * 16], m1 + +; mode 15 [row 12 - first half] +pmaddubsw m1, m6, [r5 + 3 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 220 * 16], m1 + +; mode 15 [row 13 - first half] +pslldq m6, 2 +pinsrb m6, [r3 + 11], 1 +pinsrb m6, [r3 + 13], 0 +pmaddubsw m1, m6, [r5 + 18 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 221 * 16], m1 + +; mode 15 [row 14 - first half] +pmaddubsw m1, m6, [r5 + 1 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 222 * 16], m1 + +; mode 15 [row 15 - first half] +pslldq m6, 2 +pinsrb m6, [r3 + 13], 1 +pinsrb m6, [r3 + 15], 0 +pmaddubsw m1, m6, [r5 + 16 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 223 * 16], m1 + +; mode 14 [row 7] +pslldq m5, 2 +pinsrb m5, [r3 + 5], 1 +pinsrb m5, [r3 + 7], 0 +movu m0, [r4 + 5] +psrldq m6, m0, 1 +punpcklbw m0, m6 + +; mode 15 [row 5 - second half] +pmaddubsw m1, m0, [r5 + 26 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 213 * 16 + 8], m1 +; mode 15 [row 5 - second half] end + +; mode 15 [row 6 - second half] +pmaddubsw m1, m0, [r5 + 9 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 214 * 16 + 8], m1 +; mode 15 [row 6 - second half] end + +; mode 16 [row 4 - second half] +pmaddubsw m1, m0, [r5 + 23 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 228 * 16 + 8], m1 +; mode 16 [row 4 - second half] end + +; mode 16 [row 5 - second half] +pmaddubsw m1, m0, [r5 + 2 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 229 * 16 + 8], m1 + +; mode 16 [row 5 - second half] end +pmaddubsw m1, m5, [r5 + 24 * 16] +pmulhrsw m1, m3 +pmaddubsw m6, m0, [r5 + 24 * 16] +pmulhrsw m6, m3 +packuswb m1, m6 +movu [r0 + 199 * 16], m1 + +; mode 
14 [row 8] +pmaddubsw m1, m5, [r5 + 11 * 16] +pmulhrsw m1, m3 +pmaddubsw m6, m0, [r5 + 11 * 16] +pmulhrsw m6, m3 +packuswb m1, m6 +movu [r0 + 200 * 16], m1 + +; mode 14 [row 9] +pslldq m5, 2 +pinsrb m5, [r3 + 7], 1 +pinsrb m5, [r3 + 10], 0 +movu m0, [r4 + 4] +psrldq m6, m0, 1 +punpcklbw m0, m6 + +; mode 15 [row 7 - second half] +pmaddubsw m1, m0, [r5 + 24 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 215 * 16 + 8], m1 +; mode 15 [row 7 - second half] end + +; mode 15 [row 8 - second half] +pmaddubsw m1, m0, [r5 + 7 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 216 * 16 + 8], m1 +; mode 15 [row 8 - second half] end + +; mode 16 [row 6 - second half] +pmaddubsw m1, m0, [r5 + 13 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 230 * 16 + 8], m1 +; mode 16 [row 6 - second half] end + +; mode 15 [row 6 - second half] end +pmaddubsw m1, m5, [r5 + 30 * 16] +pmulhrsw m1, m3 +pmaddubsw m6, m0, [r5 + 30 * 16] +pmulhrsw m6, m3 +packuswb m1, m6 +movu [r0 + 201 * 16], m1 + +; mode 14 [row 10] +pmaddubsw m1, m5, [r5 + 17 * 16] +pmulhrsw m1, m3 +pmaddubsw m6, m0, [r5 + 17 * 16] +pmulhrsw m6, m3 +packuswb m1, m6 +movu [r0 + 202 * 16], m1 + +; mode 14 [row 11] +pmaddubsw m1, m5, [r5 + 4 * 16] +pmulhrsw m1, m3 +pmaddubsw m6, m0, [r5 + 4 * 16] +pmulhrsw m6, m3 +packuswb m1, m6 +movu [r0 + 203 * 16], m1 + +; mode 14 [row 12] +pslldq m5, 2 +pinsrb m5, [r3 + 10], 1 +pinsrb m5, [r3 + 12], 0 +movu m0, [r4 + 3] +psrldq m6, m0, 1 +punpcklbw m0, m6 + +; mode 15 [row 9 - second half] +pmaddubsw m1, m0, [r5 + 22 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 217 * 16 + 8], m1 +; mode 15 [row 9 - second half] end + +; mode 15 [row 10 - second half] +pmaddubsw m1, m0, [r5 + 5 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 218 * 16 + 8], m1 +; mode 15 [row 10 - second half] end + +; mode 16 [row 7 - second half] +pmaddubsw m1, m0, [r5 + 24 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 231 * 16 + 8], m1 +; mode 16 [row 7 - second half] end + +; mode 16 [row 8 - 
second half] +pmaddubsw m1, m0, [r5 + 3 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 232 * 16 + 8], m1 +; mode 16 [row 8 - second half] end + +pmaddubsw m1, m5, [r5 + 23 * 16] +pmulhrsw m1, m3 +pmaddubsw m6, m0, [r5 + 23 * 16] +pmulhrsw m6, m3 +packuswb m1, m6 +movu [r0 + 204 * 16], m1 + +; mode 14 [row 13] +pmaddubsw m1, m5, [r5 + 10 * 16] +pmulhrsw m1, m3 +pmaddubsw m6, m0, [r5 + 10 * 16] +pmulhrsw m6, m3 +packuswb m1, m6 +movu [r0 + 205 * 16], m1 + +; mode 14 [row 14] +pslldq m5, 2 +pinsrb m5, [r3 + 12], 1 +pinsrb m5, [r3 + 15], 0 +movu m0, [r4 + 2] +psrldq m6, m0, 1 +punpcklbw m0, m6 + +; mode 15 [row 11 - second half] +pmaddubsw m1, m0, [r5 + 20 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 219 * 16 + 8], m1 +; mode 15 [row 11 - second half] end + +; mode 15 [row 12 - second half] +pmaddubsw m1, m0, [r5 + 3 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 220 * 16 + 8], m1 +; mode 15 [row 12 - second half] end + +; mode 16 [row 9 - second half] +pmaddubsw m1, m0, [r5 + 14 * 16] +pmulhrsw m1, m3 +packuswb m1, m1 +movh [r0 + 233 * 16 + 8], m1 + +; mode 16 [row 9 - second half] end +pmaddubsw m1, m5, [r5 + 29 * 16] +pmulhrsw m1, m3 +pmaddubsw m6, m0, [r5 + 29 * 16] +pmulhrsw m6, m3 +packuswb m1, m6 +movu [r0 + 206 * 16], m1 + +; mode 14 [row 15] +pmaddubsw m1, m5, [r5 + 16 * 16] +pmulhrsw m1, m3 +pmaddubsw m6, m0, [r5 + 16 * 16] +pmulhrsw m6, m3 +packuswb m1, m6 +movu [r0 + 207 * 16], m1 + +; mode 12 [row 12] +pslldq m0, m2, 2 +pinsrb m0, [r3 + 6], 1 +pinsrb m0, [r3 + 13], 0 +pmaddubsw m1, m0, [r5 + 31 * 16] +pmulhrsw m1, m3 +pmaddubsw m5, m4, [r5 + 31 * 16] +pmulhrsw m5, m3 +packuswb m1, m5 +movu [r0 + 172 * 16], m1 + +; mode 12 [row 13] +pmaddubsw m1, m0, [r5 + 26 * 16] +pmulhrsw m1, m3 +pmaddubsw m5, m4, [r5 + 26 * 16] +pmulhrsw m5, m3 +packuswb m1, m5 +movu [r0 + 173 * 16], m1 + +; mode 12 [row 14] +pmaddubsw m1, m0, [r5 + 21 * 16] +pmulhrsw m1, m3 +pmaddubsw m5, m4, [r5 + 21 * 16] +pmulhrsw m5, m3 +packuswb m1, m5 +movu [r0 + 174 * 16], m1 + 
+; mode 12 [row 15] +pmaddubsw m1, m0, [r5 + 16 * 16] +pmulhrsw m1, m3 +pmaddubsw m5, m4, [r5 + 16 * 16] +pmulhrsw m5, m3 +packuswb m1, m5 +movu [r0 + 175 * 16], m1 + +; mode 13 [row 7] +pslldq m7, 2 +pinsrb m7, [r3 + 4], 1 +pinsrb m7, [r3 + 7], 0 +pmaddubsw m1, m7, [r5 + 24 * 16] +pmulhrsw m1, m3 +pmaddubsw m5, m4, [r5 + 24 * 16] +pmulhrsw m5, m3 +packuswb m1, m5 +movu [r0 + 183 * 16], m1 + +; mode 13 [row 8] +pmaddubsw m1, m7, [r5 + 15 * 16] +pmulhrsw m1, m3 +pmaddubsw m5, m4, [r5 + 15 * 16] +pmulhrsw m5, m3 +packuswb m1, m5 +movu [r0 + 184 * 16], m1 + +; mode 13 [row 9] +pmaddubsw m1, m7, [r5 + 6 * 16] +pmulhrsw m1, m3 +pmaddubsw m5, m4, [r5 + 6 * 16] +pmulhrsw m5, m3 +packuswb m1, m5 +movu [r0 + 185 * 16], m1 + +; mode 13 [row 10] +pslldq m7, 2 +pinsrb m7, [r3 + 7], 1 +pinsrb m7, [r3 + 11], 0 +pmaddubsw m1, m7, [r5 + 29 * 16] +pmulhrsw m1, m3 +movu m4, [r4 + 5] +psrldq m5, m4, 1 +punpcklbw m4, m5 +pmaddubsw m5, m4, [r5 + 29 * 16] +pmulhrsw m5, m3 +packuswb m1, m5 +movu [r0 + 186 * 16], m1 + +; mode 13 [row 11] +pmaddubsw m1, m7, [r5 + 20 * 16] +pmulhrsw m1, m3 +pmaddubsw m5, m4, [r5 + 20 * 16] +pmulhrsw m5, m3 +packuswb m1, m5 +movu [r0 + 187 * 16], m1 + +; mode 13 [row 12] +pmaddubsw m1, m7, [r5 + 11 * 16] +pmulhrsw m1, m3 +pmaddubsw m5, m4, [r5 + 11 * 16] +pmulhrsw m5, m3 +packuswb m1, m5 +movu [r0 + 188 * 16], m1 + +; mode 13 [row 13] +pmaddubsw m1, m7, [r5 + 2 * 16] +pmulhrsw m1, m3 +pmaddubsw m5, m4, [r5 + 2 * 16] +pmulhrsw m5, m3 +packuswb m1, m5 +movu [r0 + 189 * 16], m1 + +; mode 13 [row 14] +pslldq m7, 2 +pinsrb m7, [r3 + 11], 1 +pinsrb m7, [r3 + 14], 0 +pmaddubsw m1, m7, [r5 + 25 * 16] +pmulhrsw m1, m3 +movu m4, [r4 + 4] +psrldq m5, m4, 1 +punpcklbw m4, m5 +pmaddubsw m5, m4, [r5 + 25 * 16] +pmulhrsw m5, m3 +packuswb m1, m5 +movu [r0 + 190 * 16], m1 + +; mode 13 [row 15] +pmaddubsw m1, m7, [r5 + 16 * 16] +pmulhrsw m1, m3 +pmaddubsw m5, m4, [r5 + 16 * 16] +pmulhrsw m5, m3 +packuswb m1, m5 +movu [r0 + 191 * 16], m1 + +; mode 17 [row 15] +movu m0, [r3] 
+pshufb m1, m0, [tab_S1] +movu [r0 + 255 * 16], m1 +movu m2, [r4] +movd [r0 + 255 * 16 + 12], m2 + +; mode 18 [row 0] +movu [r0 + 256 * 16], m0 + +; mode 18 [row 1] +pslldq m4, m0, 1 +pinsrb m4, [r4 + 1], 0 +movu [r0 + 257 * 16], m4 +pslldq m4, 1 +pinsrb m4, [r4 + 2], 0 +movu [r0 + 258 * 16], m4 +pslldq m4, 1 +pinsrb m4, [r4 + 3], 0 +movu [r0 + 259 * 16], m4 +pslldq m4, 1 +pinsrb m4, [r4 + 4], 0 +movu [r0 + 260 * 16], m4 +pslldq m4, 1 +pinsrb m4, [r4 + 5], 0 +movu [r0 + 261 * 16], m4 +pslldq m4, 1 +pinsrb m4, [r4 + 6], 0 +movu [r0 + 262 * 16], m4 +pslldq m4, 1 +pinsrb m4, [r4 + 7], 0 +movu [r0 + 263 * 16], m4 +pslldq m4, 1 +pinsrb m4, [r4 + 8], 0 +movu [r0 + 264 * 16], m4 +pslldq m4, 1 +pinsrb m4, [r4 + 9], 0 +movu [r0 + 265 * 16], m4 +pslldq m4, 1 +pinsrb m4, [r4 + 10], 0 +movu [r0 + 266 * 16], m4 +pslldq m4, 1 +pinsrb m4, [r4 + 11], 0 +movu [r0 + 267 * 16], m4 +pslldq m4, 1 +pinsrb m4, [r4 + 12], 0 +movu [r0 + 268 * 16], m4 +pslldq m4, 1 +pinsrb m4, [r4 + 13], 0 +movu [r0 + 269 * 16], m4 +pslldq m4, 1 +pinsrb m4, [r4 + 14], 0 +movu [r0 + 270 * 16], m4 +pslldq m4, 1 +pinsrb m4, [r4 + 15], 0 +movu [r0 + 271 * 16], m4 + +; mode 19 [row 0] +psrldq m2, m0, 1 +punpcklbw m0, m2 +movu m5, [r3 + 8] +psrldq m6, m5, 1 +punpcklbw m5, m6 +pmaddubsw m4, m0, [r5 + 6 * 16] +pmulhrsw m4, m3 +pmaddubsw m6, m5, [r5 + 6 * 16] +pmulhrsw m6, m3 +packuswb m4, m6 +movu [r0 + 272 * 16], m4 + +; mode 20 [row 0] +pmaddubsw m4, m0, [r5 + 11 * 16] +pmulhrsw m4, m3 +pmaddubsw m6, m5, [r5 + 11 * 16] +pmulhrsw m6, m3 +packuswb m4, m6 +movu [r0 + 288 * 16], m4 + +; mode 21 [row 0] +pmaddubsw m4, m0, [r5 + 15 * 16] +pmulhrsw m4, m3 +pmaddubsw m6, m5, [r5 + 15 * 16] +pmulhrsw m6, m3 +packuswb m4, m6 +movu [r0 + 304 * 16], m4 + +; mode 22 [row 0] +pmaddubsw m4, m0, [r5 + 19 * 16] +pmulhrsw m4, m3 +pmaddubsw m6, m5, [r5 + 19 * 16] +pmulhrsw m6, m3 +packuswb m4, m6 +movu [r0 + 320 * 16], m4 + +; mode 22 [row 1] +pmaddubsw m4, m0, [r5 + 6 * 16] +pmulhrsw m4, m3 +pmaddubsw m6, m5, [r5 + 6 * 16] 
+pmulhrsw m6, m3 +packuswb m4, m6 +movu [r0 + 321 * 16], m4 + +; mode 23 [row 0] +pmaddubsw m4, m0, [r5 + 23 * 16] +pmulhrsw m4, m3 +pmaddubsw m6, m5, [r5 + 23 * 16] +pmulhrsw m6, m3 +packuswb m4, m6 +movu [r0 + 336 * 16], m4 + +; mode 23 [row 1] +pmaddubsw m4, m0, [r5 + 14 * 16] +pmulhrsw m4, m3 +pmaddubsw m6, m5, [r5 + 14 * 16] +pmulhrsw m6, m3 +packuswb m4, m6 +movu [r0 + 337 * 16], m4 + +; mode 23 [row 2] +pmaddubsw m4, m0, [r5 + 5 * 16] +pmulhrsw m4, m3 +pmaddubsw m6, m5, [r5 + 5 * 16] +pmulhrsw m6, m3 +packuswb m4, m6 +movu [r0 + 338 * 16], m4 + +; mode 24 [row 0] +pmaddubsw m4, m0, [r5 + 27 * 16] +pmulhrsw m4, m3 +pmaddubsw m6, m5, [r5 + 27 * 16] +pmulhrsw m6, m3 +packuswb m4, m6 +movu [r0 + 352 * 16], m4 + +; mode 24 [row 1] +pmaddubsw m4, m0, [r5 + 22 * 16] +pmulhrsw m4, m3 +pmaddubsw m6, m5, [r5 + 22 * 16] +pmulhrsw m6, m3 +packuswb m4, m6 +movu [r0 + 353 * 16], m4 + +; mode 24 [row 2] +pmaddubsw m4, m0, [r5 + 17 * 16] +pmulhrsw m4, m3 +pmaddubsw m6, m5, [r5 + 17 * 16] +pmulhrsw m6, m3 +packuswb m4, m6 +movu [r0 + 354 * 16], m4 + +; mode 24 [row 3] +pmaddubsw m4, m0, [r5 + 12 * 16] +pmulhrsw m4, m3 +pmaddubsw m6, m5, [r5 + 12 * 16] +pmulhrsw m6, m3 +packuswb m4, m6 +movu [r0 + 355 * 16], m4 + +; mode 24 [row 4] +pmaddubsw m4, m0, [r5 + 7 * 16] +pmulhrsw m4, m3 +pmaddubsw m6, m5, [r5 + 7 * 16] +pmulhrsw m6, m3 +packuswb m4, m6 +movu [r0 + 356 * 16], m4 + +; mode 24 [row 5] +pmaddubsw m4, m0, [r5 + 2 * 16] +pmulhrsw m4, m3 +pmaddubsw m6, m5, [r5 + 2 * 16] +pmulhrsw m6, m3 +packuswb m4, m6 +movu [r0 + 357 * 16], m4 + +; mode 24 [row 6 - first half] +pslldq m7, m0, 2 +pinsrb m7, [r4 + 0], 1 +pinsrb m7, [r4 + 6], 0 +pmaddubsw m4, m7, [r5 + 29 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 358 * 16], m4 + +; mode 24 [row 7 - first half] +pmaddubsw m4, m7, [r5 + 24 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 359 * 16], m4 + +; mode 24 [row 8 - first half] +pmaddubsw m4, m7, [r5 + 19 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 360 * 16], m4 + 
+; mode 24 [row 9 - first half] +pmaddubsw m4, m7, [r5 + 14 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 361 * 16], m4 + +; mode 24 [row 10 - first half] +pmaddubsw m4, m7, [r5 + 9 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 362 * 16], m4 + +; mode 24 [row 11 - first half] +pmaddubsw m4, m7, [r5 + 4 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 363 * 16], m4 + +; mode 24 [row 12 - first half] +pslldq m7, 2 +pinsrb m7, [r4 + 6], 1 +pinsrb m7, [r4 + 13], 0 +pmaddubsw m4, m7, [r5 + 31 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 364 * 16], m4 + +; mode 24 [row 13 - first half] +pmaddubsw m4, m7, [r5 + 26 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 365 * 16], m4 + +; mode 24 [row 14 - first half] +pmaddubsw m4, m7, [r5 + 21 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 366 * 16], m4 + +; mode 24 [row 15 - first half] +pmaddubsw m4, m7, [r5 + 16 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 367 * 16], m4 + +; mode 23 [row 3 - first half] +pslldq m7, m0, 2 +pinsrb m7, [r4 + 0], 1 +pinsrb m7, [r4 + 4], 0 +pmaddubsw m4, m7, [r5 + 28 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 339 * 16], m4 + +; mode 23 [row 4 - first half] +pmaddubsw m4, m7, [r5 + 19 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 340 * 16], m4 + +; mode 23 [row 5 - first half] +pmaddubsw m4, m7, [r5 + 10 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 341 * 16], m4 + +; mode 23 [row 6 - first half] +pmaddubsw m4, m7, [r5 + 1 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 342 * 16], m4 + +; mode 23 [row 7 - first half] +pslldq m7, 2 +pinsrb m7, [r4 + 4], 1 +pinsrb m7, [r4 + 7], 0 +pmaddubsw m4, m7, [r5 + 24 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 343 * 16], m4 + +; mode 23 [row 8 - first half] +pmaddubsw m4, m7, [r5 + 15 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 344 * 16], m4 + +; mode 23 [row 9 - first half] +pmaddubsw m4, m7, [r5 + 6 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 345 * 16], m4 + +; mode 23 
[row 10 - first half] +pslldq m7, 2 +pinsrb m7, [r4 + 7], 1 +pinsrb m7, [r4 + 11], 0 +pmaddubsw m4, m7, [r5 + 29 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 346 * 16], m4 + +; mode 23 [row 11 - first half] +pmaddubsw m4, m7, [r5 + 20 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 347 * 16], m4 + +; mode 23 [row 12 - first half] +pmaddubsw m4, m7, [r5 + 11 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 348 * 16], m4 + +; mode 23 [row 13 - first half] +pmaddubsw m4, m7, [r5 + 2 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 349 * 16], m4 + +; mode 23 [row 14 - first half] +pslldq m7, 2 +pinsrb m7, [r4 + 11], 1 +pinsrb m7, [r4 + 14], 0 +pmaddubsw m4, m7, [r5 + 25 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 350 * 16], m4 + +; mode 23 [row 15 - first half] +pmaddubsw m4, m7, [r5 + 16 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 351 * 16], m4 + +; mode 21 [row 15 - first half] +pmaddubsw m4, m0, [r5 + 16 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 319 * 16 + 8], m4 +; mode 21 [row 15 - second half] end + +; mode 20 [row 1 - first half] +pslldq m7, m0, 2 +pinsrb m7, [r4 + 0], 1 +pinsrb m7, [r4 + 2], 0 +pmaddubsw m4, m7, [r5 + 22 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 289 * 16], m4 + +; mode 20 [row 2 - first half] +pmaddubsw m4, m7, [r5 + 1 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 290 * 16], m4 + +; mode 21 [row 1 - first half] +pmaddubsw m4, m7, [r5 + 30 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 305 * 16], m4 + +; mode 21 [row 2 - first half] +pmaddubsw m4, m7, [r5 + 13 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 306 * 16], m4 + +; mode 22 [row 2 - first half] +pmaddubsw m4, m7, [r5 + 25 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 322 * 16], m4 + +; mode 22 [row 3 - first half] +pmaddubsw m4, m7, [r5 + 12 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 323 * 16], m4 + +; mode 22 [row 4 - first half] +pslldq m1, m7, 2 +pinsrb m1, [r4 + 2], 1 +pinsrb m1, [r4 + 5], 0 
+pmaddubsw m4, m1, [r5 + 31 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 324 * 16], m4 + +; mode 22 [row 5 - first half] +pmaddubsw m4, m1, [r5 + 18 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 325 * 16], m4 + +; mode 22 [row 6 - first half] +pmaddubsw m4, m1, [r5 + 5 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 326 * 16], m4 + +; mode 22 [row 7 - first half] +pslldq m1, 2 +pinsrb m1, [r4 + 5], 1 +pinsrb m1, [r4 + 7], 0 +pmaddubsw m4, m1, [r5 + 24 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 327 * 16], m4 + +; mode 22 [row 8 - first half] +pmaddubsw m4, m1, [r5 + 11 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 328 * 16], m4 + +; mode 22 [row 9 - first half] +pslldq m1, 2 +pinsrb m1, [r4 + 7], 1 +pinsrb m1, [r4 + 10], 0 +pmaddubsw m4, m1, [r5 + 30 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 329 * 16], m4 + +; mode 22 [row 10 - first half] +pmaddubsw m4, m1, [r5 + 17 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 330 * 16], m4 + +; mode 22 [row 11 - first half] +pmaddubsw m4, m1, [r5 + 4 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 331 * 16], m4 + +; mode 22 [row 12 - first half] +pslldq m1, 2 +pinsrb m1, [r4 + 10], 1 +pinsrb m1, [r4 + 12], 0 +pmaddubsw m4, m1, [r5 + 23 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 332 * 16], m4 + +; mode 22 [row 13 - first half] +pmaddubsw m4, m1, [r5 + 10 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 333 * 16], m4 + +; mode 22 [row 14 - first half] +pslldq m1, 2 +pinsrb m1, [r4 + 12], 1 +pinsrb m1, [r4 + 15], 0 +pmaddubsw m4, m1, [r5 + 29 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 334 * 16], m4 + +; mode 22 [row 15 - first half] +pmaddubsw m4, m1, [r5 + 16 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 335 * 16], m4 + +; mode 21 [row 3 - first half] +pslldq m6, m7, 2 +pinsrb m6, [r4 + 2], 1 +pinsrb m6, [r4 + 4], 0 +pmaddubsw m4, m6, [r5 + 28 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 307 * 16], m4 + +; mode 21 [row 4 - first half] +pmaddubsw 
m4, m6, [r5 + 11 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 308 * 16], m4 + +; mode 21 [row 5 - first half] +pslldq m6, 2 +pinsrb m6, [r4 + 4], 1 +pinsrb m6, [r4 + 6], 0 +pmaddubsw m4, m6, [r5 + 26 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 309 * 16], m4 + +; mode 21 [row 6 - first half] +pmaddubsw m4, m6, [r5 + 9 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 310 * 16], m4 + +; mode 21 [row 7 - first half] +pslldq m6, 2 +pinsrb m6, [r4 + 6], 1 +pinsrb m6, [r4 + 8], 0 +pmaddubsw m4, m6, [r5 + 24 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 311 * 16], m4 + +; mode 21 [row 8 - first half] +pmaddubsw m4, m6, [r5 + 7 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 312 * 16], m4 + +; mode 21 [row 9 - first half] +pslldq m6, 2 +pinsrb m6, [r4 + 8], 1 +pinsrb m6, [r4 + 9], 0 +pmaddubsw m4, m6, [r5 + 22 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 313 * 16], m4 + +; mode 21 [row 10 - first half] +pmaddubsw m4, m6, [r5 + 5 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 314 * 16], m4 + +; mode 21 [row 11 - first half] +pslldq m6, 2 +pinsrb m6, [r4 + 9], 1 +pinsrb m6, [r4 + 11], 0 +pmaddubsw m4, m6, [r5 + 20 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 315 * 16], m4 + +; mode 21 [row 12 - first half] +pmaddubsw m4, m6, [r5 + 3 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 316 * 16], m4 + +; mode 21 [row 13 - first half] +pslldq m6, 2 +pinsrb m6, [r4 + 11], 1 +pinsrb m6, [r4 + 13], 0 +pmaddubsw m4, m6, [r5 + 18 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 317 * 16], m4 + +; mode 21 [row 14 - first half] +pmaddubsw m4, m6, [r5 + 1 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 318 * 16], m4 + +; mode 21 [row 15 - first half] +pslldq m6, 2 +pinsrb m6, [r4 + 13], 1 +pinsrb m6, [r4 + 15], 0 +pmaddubsw m4, m6, [r5 + 16 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 319 * 16], m4 + +; mode 20 [row 13 - second half] +pmaddubsw m4, m7, [r5 + 26 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 301 * 16 + 8], 
m4 +; mode 20 [row 13 - second half] + +; mode 20 [row 14 - second half] +pmaddubsw m4, m7, [r5 + 5 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 302 * 16 + 8], m4 +; mode 20 [row 14 - second half] + +; mode 20 [row 3 - first half] +pslldq m7, 2 +pinsrb m7, [r4 + 2], 1 +pinsrb m7, [r4 + 3], 0 +pmaddubsw m4, m7, [r5 + 12 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 291 * 16], m4 + +; mode 20 [row 15 - second half] +pmaddubsw m4, m7, [r5 + 16 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 303 * 16 + 8], m4 +; mode 20 [row 15 - second half] + +; mode 20 [row 4 - first half] +pslldq m7, 2 +pinsrb m7, [r4 + 3], 1 +pinsrb m7, [r4 + 5], 0 +pmaddubsw m4, m7, [r5 + 23 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 292 * 16], m4 + +; mode 20 [row 5 - first half] +pmaddubsw m4, m7, [r5 + 2 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 293 * 16], m4 + +; mode 20 [row 6 - first half] +pslldq m7, 2 +pinsrb m7, [r4 + 5], 1 +pinsrb m7, [r4 + 6], 0 +pmaddubsw m4, m7, [r5 + 13 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 294 * 16], m4 + +; mode 20 [row 7 - first half] +pslldq m7, 2 +pinsrb m7, [r4 + 6], 1 +pinsrb m7, [r4 + 8], 0 +pmaddubsw m4, m7, [r5 + 24 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 295 * 16], m4 + +; mode 20 [row 8 - first half] +pmaddubsw m4, m7, [r5 + 3 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 296 * 16], m4 + +; mode 20 [row 9 - first half] +pslldq m7, 2 +pinsrb m7, [r4 + 8], 1 +pinsrb m7, [r4 + 9], 0 +pmaddubsw m4, m7, [r5 + 14 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 297 * 16], m4 + +; mode 20 [row 10 - first half] +pslldq m7, 2 +pinsrb m7, [r4 + 9], 1 +pinsrb m7, [r4 + 11], 0 +pmaddubsw m4, m7, [r5 + 25 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 298 * 16], m4 + +; mode 20 [row 11 - first half] +pmaddubsw m4, m7, [r5 + 4 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 299 * 16], m4 + +; mode 20 [row 12 - first half] +movu m1, [r5 + 15 * 16] +pslldq m7, 2 +pinsrb m7, [r4 + 11], 1 
+pinsrb m7, [r4 + 12], 0 +pmaddubsw m4, m7, [r5 + 15 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 300 * 16], m4 + +; mode 20 [row 13 - first half] +pslldq m7, 2 +pinsrb m7, [r4 + 12], 1 +pinsrb m7, [r4 + 14], 0 +pmaddubsw m4, m7, [r5 + 26 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 301 * 16], m4 + +; mode 20 [row 14 - first half] +pmaddubsw m4, m7, [r5 + 5 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 302 * 16], m4 + +; mode 20 [row 15 - first half] +pslldq m7, 2 +pinsrb m7, [r4 + 14], 1 +pinsrb m7, [r4 + 15], 0 +pmaddubsw m4, m7, [r5 + 16 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 303 * 16], m4 + +; mode 19 [row 1] +pslldq m0, 2 +pinsrb m0, [r4 + 0], 1 +pinsrb m0, [r4 + 1], 0 +pslldq m5, 2 +pinsrb m5, [r3 + 8], 1 +pinsrb m5, [r3 + 7], 0 + +; mode 20 [row 1 - second half] +pmaddubsw m4, m5, [r5 + 22 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 289 * 16 + 8], m4 +; mode 20 [row 1 - second half] end + +; mode 20 [row 2 - second half] +pmaddubsw m4, m5, [r5 + 1 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 290 * 16 + 8], m4 +; mode 20 [row 2 - second half] end + +; mode 21 [row 2 - second half] +pmaddubsw m4, m5, [r5 + 30 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 305 * 16 + 8], m4 +; mode 21 [row 2 - second half] end + +; mode 21 [row 3 - second half] +pmaddubsw m4, m5, [r5 + 13 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 306 * 16 + 8], m4 +; mode 21 [row 3 - second half] end + +; mode 21 [row 4 - second half] +pmaddubsw m4, m5, [r5 + 11 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 307 * 16 + 8], m4 +; mode 21 [row 4 - second half] end + +; mode 22 [row 2 - second half] +pmaddubsw m4, m5, [r5 + 25 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 322 * 16 + 8], m4 +; mode 22 [row 2 - second half] end + +; mode 22 [row 3 - second half] +pmaddubsw m4, m5, [r5 + 12 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 323 * 16 + 8], m4 +; mode 22 [row 3 - second half] end + +; mode 23 [row 3 - second 
half] +pmaddubsw m4, m5, [r5 + 28 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 339 * 16 + 8], m4 +; mode 23 [row 3 - second half] end + +; mode 23 [row 4 - second half] +pmaddubsw m4, m5, [r5 + 19 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 340 * 16 + 8], m4 +; mode 23 [row 4 - second half] end + +; mode 23 [row 5 - second half] +pmaddubsw m4, m5, [r5 + 10 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 341 * 16 + 8], m4 +; mode 23 [row 5 - second half] end + +; mode 23 [row 6 - second half] +pmaddubsw m4, m5, [r5 + 1 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 342 * 16 + 8], m4 +; mode 23 [row 6 - second half] end + +; mode 24 [row 6 - second half] +pmaddubsw m4, m5, [r5 + 29 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 358 * 16 + 8], m4 +; mode 24 [row 6 - second half] end + +; mode 24 [row 7 - second half] +pmaddubsw m4, m5, [r5 + 24 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 359 * 16 + 8], m4 +; mode 24 [row 7 - second half] end + +; mode 24 [row 8 - second half] +pmaddubsw m4, m5, [r5 + 19 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 360 * 16 + 8], m4 +; mode 24 [row 8 - second half] end + +; mode 24 [row 9 - second half] +pmaddubsw m4, m5, [r5 + 14 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 361 * 16 + 8], m4 +; mode 24 [row 9 - second half] end + +; mode 24 [row 10 - second half] +pmaddubsw m4, m5, [r5 + 9 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 362 * 16 + 8], m4 +; mode 24 [row 10 - second half] end + +; mode 24 [row 11 - second half] +pmaddubsw m4, m5, [r5 + 4 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 363 * 16 + 8], m4 +; mode 24 [row 11 - second half] end + +pmaddubsw m4, m0, [r5 + 12 * 16] +pmulhrsw m4, m3 +pmaddubsw m6, m5, [r5 + 12 * 16] +pmulhrsw m6, m3 +packuswb m4, m6 +movu [r0 + 273 * 16], m4 + +; mode 19 [row 2] +pslldq m0, 2 +pinsrb m0, [r4 + 1], 1 +pinsrb m0, [r4 + 2], 0 +pslldq m5, 2 +pinsrb m5, [r3 + 7], 1 +pinsrb m5, [r3 + 6], 0 + +; mode 20 [row 3 - second half] 
+pmaddubsw m4, m5, [r5 + 12 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 291 * 16 + 8], m4 +; mode 20 [row 3 - second half] end + +; mode 21 [row 3 - second half] +pmaddubsw m4, m5, [r5 + 28 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 307 * 16 + 8], m4 +; mode 21 [row 3 - second half] end + +; mode 21 [row 4 - second half] +pmaddubsw m4, m5, [r5 + 11 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 308 * 16 + 8], m4 +; mode 21 [row 4 - second half] end + +; mode 22 [row 4 - second half] +pmaddubsw m4, m5, [r5 + 31 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 324 * 16 + 8], m4 +; mode 22 [row 4 - second half] end + +; mode 22 [row 5 - second half] +pmaddubsw m4, m5, [r5 + 18 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 325 * 16 + 8], m4 +; mode 22 [row 5 - second half] end + +; mode 22 [row 6 - second half] +pmaddubsw m4, m5, [r5 + 5 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 326 * 16 + 8], m4 +; mode 22 [row 6 - second half] end + +; mode 23 [row 7 - second half] +pmaddubsw m4, m5, [r5 + 24 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 343 * 16 + 8], m4 +; mode 23 [row 7 - second half] end + +; mode 23 [row 8 - second half] +pmaddubsw m4, m5, [r5 + 15 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 344 * 16 + 8], m4 +; mode 23 [row 8 - second half] end + +; mode 23 [row 9 - second half] +pmaddubsw m4, m5, [r5 + 6 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 345 * 16 + 8], m4 +; mode 23 [row 9 - second half] end + +; mode 24 [row 12 - second half] +pmaddubsw m4, m5, [r5 + 31 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 364 * 16 + 8], m4 +; mode 24 [row 12 - second half] end + +; mode 24 [row 13 - second half] +pmaddubsw m4, m5, [r5 + 26 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 365 * 16 + 8], m4 +; mode 24 [row 13 - second half] end + +; mode 24 [row 14 - second half] +pmaddubsw m4, m5, [r5 + 21 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 366 * 16 + 8], m4 +; mode 24 [row 14 - second 
half] end + +; mode 24 [row 15 - second half] +pmaddubsw m4, m5, [r5 + 16 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 367 * 16 + 8], m4 +; mode 24 [row 15 - second half] end + +pmaddubsw m4, m0, [r5 + 18 * 16] +pmulhrsw m4, m3 +pmaddubsw m6, m5, [r5 + 18 * 16] +pmulhrsw m6, m3 +packuswb m4, m6 +movu [r0 + 274 * 16], m4 + +; mode 19 [row 3] +pslldq m0, 2 +pinsrb m0, [r4 + 2], 1 +pinsrb m0, [r4 + 4], 0 +pslldq m5, 2 +pinsrb m5, [r3 + 6], 1 +pinsrb m5, [r3 + 5], 0 + +; mode 20 [row 4 - second half] +pmaddubsw m4, m5, [r5 + 23 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 292 * 16 + 8], m4 +; mode 20 [row 4 - second half] end + +; mode 20 [row 5 - second half] +pmaddubsw m4, m5, [r5 + 2 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 293 * 16 + 8], m4 +; mode 20 [row 5 - second half] end + +; mode 21 [row 5 - second half] +pmaddubsw m4, m5, [r5 + 26 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 309 * 16 + 8], m4 +; mode 21 [row 5 - second half] end + +; mode 21 [row 6 - second half] +pmaddubsw m4, m5, [r5 + 9 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 310 * 16 + 8], m4 +; mode 21 [row 6 - second half] end + +; mode 22 [row 7 - second half] +pmaddubsw m4, m5, [r5 + 24 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 327 * 16 + 8], m4 +; mode 22 [row 7 - second half] end + +; mode 22 [row 8 - second half] +pmaddubsw m4, m5, [r5 + 11 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 328 * 16 + 8], m4 +; mode 22 [row 7 - second half] end + +; mode 23 [row 10 - second half] +pmaddubsw m4, m5, [r5 + 29 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 346 * 16 + 8], m4 +; mode 23 [row 10 - second half] end + +; mode 23 [row 11 - second half] +pmaddubsw m4, m5, [r5 + 20 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 347 * 16 + 8], m4 +; mode 23 [row 11 - second half] end + +; mode 23 [row 12 - second half] +pmaddubsw m4, m5, [r5 + 11 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 348 * 16 + 8], m4 +; mode 23 [row 12 - second 
half] end + +; mode 23 [row 13 - second half] +pmaddubsw m4, m5, [r5 + 2 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 349 * 16 + 8], m4 +; mode 23 [row 13 - second half] end + +pmaddubsw m4, m0, [r5 + 24 * 16] +pmulhrsw m4, m3 +pmaddubsw m6, m5, [r5 + 24 * 16] +pmulhrsw m6, m3 +packuswb m4, m6 +movu [r0 + 275 * 16], m4 + +; mode 19 [row 4] +pslldq m0, 2 +pinsrb m0, [r4 + 4], 1 +pinsrb m0, [r4 + 5], 0 +pslldq m5, 2 +pinsrb m5, [r3 + 5], 1 +pinsrb m5, [r3 + 4], 0 + +; mode 20 [row 6 - second half] +pmaddubsw m4, m5, [r5 + 13 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 294 * 16 + 8], m4 +; mode 20 [row 6 - second half] end + +; mode 21 [row 7 - second half] +pmaddubsw m4, m5, [r5 + 24 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 311 * 16 + 8], m4 +; mode 21 [row 7 - second half] end + +; mode 21 [row 8 - second half] +pmaddubsw m4, m5, [r5 + 7 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 312 * 16 + 8], m4 +; mode 21 [row 8 - second half] end + +; mode 22 [row 9 - second half] +pmaddubsw m4, m5, [r5 + 30 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 329 * 16 + 8], m4 +; mode 22 [row 9 - second half] end + +; mode 22 [row 10 - second half] +pmaddubsw m4, m5, [r5 + 17 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 330 * 16 + 8], m4 +; mode 22 [row 10 - second half] end + +; mode 22 [row 11 - second half] +pmaddubsw m4, m5, [r5 + 4 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 331 * 16 + 8], m4 +; mode 22 [row 11 - second half] end + +; mode 23 [row 14 - second half] +pmaddubsw m4, m5, [r5 + 25 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 350 * 16 + 8], m4 +; mode 23 [row 14 - second half] end + +; mode 23 [row 15 - second half] +pmaddubsw m4, m5, [r5 + 16 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 351 * 16 + 8], m4 + +; mode 23 [row 15 - second half] end +pmaddubsw m4, m0, [r5 + 30 * 16] +pmulhrsw m4, m3 +pmaddubsw m6, m5, [r5 + 30 * 16] +pmulhrsw m6, m3 +packuswb m4, m6 +movu [r0 + 276 * 16], m4 + +; mode 19 
[row 5] +pmaddubsw m4, m0, [r5 + 4 * 16] +pmulhrsw m4, m3 +pmaddubsw m6, m5, [r5 + 4 * 16] +pmulhrsw m6, m3 +packuswb m4, m6 +movu [r0 + 277 * 16], m4 + +; mode 19 [row 6] +pslldq m0, 2 +pinsrb m0, [r4 + 5], 1 +pinsrb m0, [r4 + 6], 0 +pslldq m5, 2 +pinsrb m5, [r3 + 4], 1 +pinsrb m5, [r3 + 3], 0 + +; mode 20 [row 7 - second half] +pmaddubsw m4, m5, [r5 + 24 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 295 * 16 + 8], m4 +; mode 20 [row 7 - second half] end + +; mode 20 [row 8 - second half] +pmaddubsw m4, m5, [r5 + 3 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 296 * 16 + 8], m4 +; mode 20 [row 8 - second half] end + +; mode 21 [row 9 - second half] +pmaddubsw m4, m5, [r5 + 22 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 313 * 16 + 8], m4 +; mode 21 [row 9 - second half] end + +; mode 21 [row 10 - second half] +pmaddubsw m4, m5, [r5 + 5 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 314 * 16 + 8], m4 +; mode 21 [row 10 - second half] end + +; mode 22 [row 12 - second half] +pmaddubsw m4, m5, [r5 + 23 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 332 * 16 + 8], m4 +; mode 22 [row 12 - second half] end + +; mode 22 [row 12 - second half] +pmaddubsw m4, m5, [r5 + 10 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 333 * 16 + 8], m4 +; mode 22 [row 12 - second half] end + +pmaddubsw m4, m0, [r5 + 10 * 16] +pmulhrsw m4, m3 +pmaddubsw m6, m5, [r5 + 10 * 16] +pmulhrsw m6, m3 +packuswb m4, m6 +movu [r0 + 278 * 16], m4 + +; mode 19 [row 7] +pslldq m0, 2 +pinsrb m0, [r4 + 6], 1 +pinsrb m0, [r4 + 7], 0 +pslldq m5, 2 +pinsrb m5, [r3 + 3], 1 +pinsrb m5, [r3 + 2], 0 + +; mode 20 [row 9 - second half] +pmaddubsw m4, m5, [r5 + 14 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 297 * 16 + 8], m4 +; mode 20 [row 9 - second half] + +; mode 21 [row 11 - second half] +pmaddubsw m4, m5, [r5 + 20 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 315 * 16 + 8], m4 +; mode 21 [row 11 - second half] end + +; mode 21 [row 12 - second half] +pmaddubsw 
m4, m5, [r5 + 3 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 316 * 16 + 8], m4 +; mode 21 [row 12 - second half] end + +; mode 22 [row 14 - second half] +pmaddubsw m4, m5, [r5 + 29 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 334 * 16 + 8], m4 +; mode 22 [row 14 - second half] end + +; mode 22 [row 15 - second half] +pmaddubsw m4, m5, [r5 + 16 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 335 * 16 + 8], m4 +; mode 22 [row 15 - second half] end + +pmaddubsw m4, m0, [r5 + 16 * 16] +pmulhrsw m4, m3 +pmaddubsw m6, m5, [r5 + 16 * 16] +pmulhrsw m6, m3 +packuswb m4, m6 +movu [r0 + 279 * 16], m4 + +; mode 19 [row 8] +pslldq m0, 2 +pinsrb m0, [r4 + 7], 1 +pinsrb m0, [r4 + 9], 0 +pslldq m5, 2 +pinsrb m5, [r3 + 2], 1 +pinsrb m5, [r3 + 1], 0 + +; mode 20 [row 10 - second half] +pmaddubsw m4, m5, [r5 + 25 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 298 * 16 + 8], m4 +; mode 20 [row 10 - second half] end + +; mode 20 [row 11 - second half] +pmaddubsw m4, m5, [r5 + 4 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 299 * 16 + 8], m4 +; mode 20 [row 11 - second half] end + +; mode 21 [row 13 - second half] +pmaddubsw m4, m5, [r5 + 18 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 317 * 16 + 8], m4 +; mode 21 [row 13 - second half] end + +; mode 21 [row 14 - second half] +pmaddubsw m4, m5, [r5 + 1 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 318 * 16 + 8], m4 +; mode 21 [row 14 - second half] end + +pmaddubsw m4, m0, [r5 + 22 * 16] +pmulhrsw m4, m3 +pmaddubsw m6, m5, [r5 + 22 * 16] +pmulhrsw m6, m3 +packuswb m4, m6 +movu [r0 + 280 * 16], m4 + +; mode 19 [row 9] +pslldq m0, 2 +pinsrb m0, [r4 + 9], 1 +pinsrb m0, [r4 + 10], 0 +pslldq m5, 2 +pinsrb m5, [r3 + 1], 1 +pinsrb m5, [r3 + 0], 0 + +; mode 20 [row 12 - second half] +pmaddubsw m4, m5, [r5 + 15 * 16] +pmulhrsw m4, m3 +packuswb m4, m4 +movh [r0 + 300 * 16 + 8], m4 + +; mode 20 [row 12 - second half] end +pmaddubsw m4, m0, [r5 + 28 * 16] +pmulhrsw m4, m3 +pmaddubsw m6, m5, [r5 + 28 * 16] 
+pmulhrsw m6, m3 +packuswb m4, m6 +movu [r0 + 281 * 16], m4 + +; mode 19 [row 10] +pmaddubsw m4, m0, [r5 + 2 * 16] +pmulhrsw m4, m3 +pmaddubsw m6, m5, [r5 + 2 * 16] +pmulhrsw m6, m3 +packuswb m4, m6 +movu [r0 + 282 * 16], m4 + +; mode 19 [row 11] +pslldq m0, 2 +pinsrb m0, [r4 + 10], 1 +pinsrb m0, [r4 + 11], 0 +pmaddubsw m4, m0, [r5 + 8 * 16] +pmulhrsw m4, m3 +pslldq m5, 2 +pinsrb m5, [r4 + 0], 1 +pinsrb m5, [r4 + 1], 0 +pmaddubsw m6, m5, [r5 + 8 * 16] +pmulhrsw m6, m3 +packuswb m4, m6 +movu [r0 + 283 * 16], m4 + +; mode 19 [row 12] +pslldq m0, 2 +pinsrb m0, [r4 + 11], 1 +pinsrb m0, [r4 + 12], 0 +pslldq m5, 2 +pinsrb m5, [r4 + 1], 1 +pinsrb m5, [r4 + 2], 0 +pmaddubsw m4, m0, [r5 + 14 * 16] +pmulhrsw m4, m3 +pmaddubsw m6, m5, [r5 + 14 * 16] +pmulhrsw m6, m3 +packuswb m4, m6 +movu [r0 + 284 * 16], m4 + +; mode 19 [row 13] +pslldq m0, 2 +pinsrb m0, [r4 + 12], 1 +pinsrb m0, [r4 + 14], 0 +pmaddubsw m4, m0, [r5 + 20 * 16] +pmulhrsw m4, m3 +pslldq m5, 2 +pinsrb m5, [r4 + 2], 1 +pinsrb m5, [r4 + 4], 0 +pmaddubsw m6, m5, [r5 + 20 * 16] +pmulhrsw m6, m3 +packuswb m4, m6 +movu [r0 + 285 * 16], m4 + +; mode 19 [row 14] +pslldq m0, 2 +pinsrb m0, [r4 + 14], 1 +pinsrb m0, [r4 + 15], 0 +pmaddubsw m4, m0, [r5 + 26 * 16] +pmulhrsw m4, m3 +pslldq m5, 2 +pinsrb m5, [r4 + 4], 1 +pinsrb m5, [r4 + 5], 0 +pmaddubsw m6, m5, [r5 + 26 * 16] +pmulhrsw m6, m3 +packuswb m4, m6 +movu [r0 + 286 * 16], m4 + +; mode 19 [row 15] +movu m0, [r4] +pshufb m0, [tab_S1] +movu [r0 + 287 * 16], m0 +movd m1, [r3] +movd [r0 + 287 * 16 + 12], m1 + +; mode 25 +movu m1, [r1] + +; mode 26 [all rows] +psrldq m6, m1, 1 +pinsrb m6, [r1 + 16], 15 +movu m7, m6 +movu [r0 + 384 * 16], m6 +movu [r0 + 385 * 16], m6 +movu [r0 + 386 * 16], m6 +movu [r0 + 387 * 16], m6 +movu [r0 + 388 * 16], m6 +movu [r0 + 389 * 16], m6 +movu [r0 + 390 * 16], m6 +movu [r0 + 391 * 16], m6 +movu [r0 + 392 * 16], m6 +movu [r0 + 393 * 16], m6 +movu [r0 + 394 * 16], m6 +movu [r0 + 395 * 16], m6 +movu [r0 + 396 * 16], m6 +movu [r0 + 397 * 16], m6 
+movu [r0 + 398 * 16], m6 +movu [r0 + 399 * 16], m6 + +pxor m0, m0 +pshufb m6, m6, m0 +punpcklbw m6, m0 +movu m2, [r2] +pshufb m2, m2, m0 +punpcklbw m2, m0 +movu m4, [r2 + 1] +punpcklbw m5, m4, m0 +punpckhbw m4, m0 +psubw m5, m2 +psubw m4, m2 +psraw m5, 1 +psraw m4, 1 +paddw m5, m6 +paddw m4, m6 +packuswb m5, m4 + +pextrb [r0 + 384 * 16], m5, 0 +pextrb [r0 + 385 * 16], m5, 1 +pextrb [r0 + 386 * 16], m5, 2 +pextrb [r0 + 387 * 16], m5, 3 +pextrb [r0 + 388 * 16], m5, 4 +pextrb [r0 + 389 * 16], m5, 5 +pextrb [r0 + 390 * 16], m5, 6 +pextrb [r0 + 391 * 16], m5, 7 +pextrb [r0 + 392 * 16], m5, 8 +pextrb [r0 + 393 * 16], m5, 9 +pextrb [r0 + 394 * 16], m5, 10 +pextrb [r0 + 395 * 16], m5, 11 +pextrb [r0 + 396 * 16], m5, 12 +pextrb [r0 + 397 * 16], m5, 13 +pextrb [r0 + 398 * 16], m5, 14 +pextrb [r0 + 399 * 16], m5, 15 + +; mode 25 [row 15] +movu [r0 + 383 * 16], m1 + +; mode 25 [row 0] +psrldq m2, m1, 1 +punpcklbw m1, m2 +movu m2, [r1 + 8] +psrldq m4, m2, 1 +punpcklbw m2, m4 +pmaddubsw m4, m1, [r5 + 30 * 16] +pmulhrsw m4, m3 +pmaddubsw m5, m2, [r5 + 30 * 16] +pmulhrsw m5, m3 +packuswb m4, m5 +movu [r0 + 368 * 16], m4 + +; mode 25 [row 1] +pmaddubsw m4, m1, [r5 + 28 * 16] +pmulhrsw m4, m3 +pmaddubsw m5, m2, [r5 + 28 * 16] +pmulhrsw m5, m3 +packuswb m4, m5 +movu [r0 + 369 * 16], m4 + +; mode 25 [row 2] +pmaddubsw m4, m1, [r5 + 26 * 16] +pmulhrsw m4, m3 +pmaddubsw m5, m2, [r5 + 26 * 16] +pmulhrsw m5, m3 +packuswb m4, m5 +movu [r0 + 370 * 16], m4 + +; mode 25 [row 3] +pmaddubsw m4, m1, [r5 + 24 * 16] +pmulhrsw m4, m3 +pmaddubsw m5, m2, [r5 + 24 * 16] +pmulhrsw m5, m3 +packuswb m4, m5 +movu [r0 + 371 * 16], m4 + +; mode 25 [row 4] +pmaddubsw m4, m1, [r5 + 22 * 16] +pmulhrsw m4, m3 +pmaddubsw m5, m2, [r5 + 22 * 16] +pmulhrsw m5, m3 +packuswb m4, m5 +movu [r0 + 372 * 16], m4 + +; mode 25 [row 5] +pmaddubsw m4, m1, [r5 + 20 * 16] +pmulhrsw m4, m3 +pmaddubsw m5, m2, [r5 + 20 * 16] +pmulhrsw m5, m3 +packuswb m4, m5 +movu [r0 + 373 * 16], m4 + +; mode 25 [row 6] +pmaddubsw m4, m1, [r5 + 
18 * 16] +pmulhrsw m4, m3 +pmaddubsw m5, m2, [r5 + 18 * 16] +pmulhrsw m5, m3 +packuswb m4, m5 +movu [r0 + 374 * 16], m4 + +; mode 25 [row 7] +pmaddubsw m4, m1, [r5 + 16 * 16] +pmulhrsw m4, m3 +pmaddubsw m5, m2, [r5 + 16 * 16] +pmulhrsw m5, m3 +packuswb m4, m5 +movu [r0 + 375 * 16], m4 + +; mode 25 [row 8] +pmaddubsw m4, m1, [r5 + 14 * 16] +pmulhrsw m4, m3 +pmaddubsw m5, m2, [r5 + 14 * 16] +pmulhrsw m5, m3 +packuswb m4, m5 +movu [r0 + 376 * 16], m4 + +; mode 25 [row 9] +pmaddubsw m4, m1, [r5 + 12 * 16] +pmulhrsw m4, m3 +pmaddubsw m5, m2, [r5 + 12 * 16] +pmulhrsw m5, m3 +packuswb m4, m5 +movu [r0 + 377 * 16], m4 + +; mode 25 [row 10] +pmaddubsw m4, m1, [r5 + 10 * 16] +pmulhrsw m4, m3 +pmaddubsw m5, m2, [r5 + 10 * 16] +pmulhrsw m5, m3 +packuswb m4, m5 +movu [r0 + 378 * 16], m4 + +; mode 25 [row 11] +pmaddubsw m4, m1, [r5 + 8 * 16] +pmulhrsw m4, m3 +pmaddubsw m5, m2, [r5 + 8 * 16] +pmulhrsw m5, m3 +packuswb m4, m5 +movu [r0 + 379 * 16], m4 + +; mode 25 [row 12] +pmaddubsw m4, m1, [r5 + 6 * 16] +pmulhrsw m4, m3 +pmaddubsw m5, m2, [r5 + 6 * 16] +pmulhrsw m5, m3 +packuswb m4, m5 +movu [r0 + 380 * 16], m4 + +; mode 25 [row 13] +pmaddubsw m4, m1, [r5 + 4 * 16] +pmulhrsw m4, m3 +pmaddubsw m5, m2, [r5 + 4 * 16] +pmulhrsw m5, m3 +packuswb m4, m5 +movu [r0 + 381 * 16], m4 + +; mode 25 [row 14] +pmaddubsw m4, m1, [r5 + 2 * 16] +pmulhrsw m4, m3 +pmaddubsw m5, m2, [r5 + 2 * 16] +pmulhrsw m5, m3 +packuswb m4, m5 +movu [r0 + 382 * 16], m4 + +; mode 27 [row 15] +psrldq m6, m7, 1 +punpcklbw m7, m6 +pinsrb m6, [r1 + 17], 15 +movu [r0 + 415 * 16], m6 + +; mode 27 [row 0] +movu m4, [r1 + 9] +psrldq m5, m4, 1 +punpcklbw m4, m5 +pmaddubsw m6, m7, [r5 + 2 * 16] +pmulhrsw m6, m3 +pmaddubsw m5, m4, [r5 + 2 * 16] +pmulhrsw m5, m3 +packuswb m6, m5 +movu [r0 + 400 * 16], m6 + +; mode 27 [row 1] +pmaddubsw m6, m7, [r5 + 4 * 16] +pmulhrsw m6, m3 +pmaddubsw m5, m4, [r5 + 4 * 16] +pmulhrsw m5, m3 +packuswb m6, m5 +movu [r0 + 401 * 16], m6 + +; mode 27 [row 2] +pmaddubsw m6, m7, [r5 + 6 * 16] 
+pmulhrsw m6, m3 +pmaddubsw m5, m4, [r5 + 6 * 16] +pmulhrsw m5, m3 +packuswb m6, m5 +movu [r0 + 402 * 16], m6 + +; mode 27 [row 3] +pmaddubsw m6, m7, [r5 + 8 * 16] +pmulhrsw m6, m3 +pmaddubsw m5, m4, [r5 + 8 * 16] +pmulhrsw m5, m3 +packuswb m6, m5 +movu [r0 + 403 * 16], m6 + +; mode 27 [row 4] +pmaddubsw m6, m7, [r5 + 10 * 16] +pmulhrsw m6, m3 +pmaddubsw m5, m4, [r5 + 10 * 16] +pmulhrsw m5, m3 +packuswb m6, m5 +movu [r0 + 404 * 16], m6 + +; mode 27 [row 5] +pmaddubsw m6, m7, [r5 + 12 * 16] +pmulhrsw m6, m3 +pmaddubsw m5, m4, [r5 + 12 * 16] +pmulhrsw m5, m3 +packuswb m6, m5 +movu [r0 + 405 * 16], m6 + +; mode 27 [row 6] +pmaddubsw m6, m7, [r5 + 14 * 16] +pmulhrsw m6, m3 +pmaddubsw m5, m4, [r5 + 14 * 16] +pmulhrsw m5, m3 +packuswb m6, m5 +movu [r0 + 406 * 16], m6 + +; mode 27 [row 7] +pmaddubsw m6, m7, [r5 + 16 * 16] +pmulhrsw m6, m3 +pmaddubsw m5, m4, [r5 + 16 * 16] +pmulhrsw m5, m3 +packuswb m6, m5 +movu [r0 + 407 * 16], m6 + +; mode 27 [row 8] +pmaddubsw m6, m7, [r5 + 18 * 16] +pmulhrsw m6, m3 +pmaddubsw m5, m4, [r5 + 18 * 16] +pmulhrsw m5, m3 +packuswb m6, m5 +movu [r0 + 408 * 16], m6 + +; mode 27 [row 9] +pmaddubsw m6, m7, [r5 + 20 * 16] +pmulhrsw m6, m3 +pmaddubsw m5, m4, [r5 + 20 * 16] +pmulhrsw m5, m3 +packuswb m6, m5 +movu [r0 + 409 * 16], m6 + +; mode 27 [row 10] +pmaddubsw m6, m7, [r5 + 22 * 16] +pmulhrsw m6, m3 +pmaddubsw m5, m4, [r5 + 22 * 16] +pmulhrsw m5, m3 +packuswb m6, m5 +movu [r0 + 410 * 16], m6 + +; mode 27 [row 11] +pmaddubsw m6, m7, [r5 + 24 * 16] +pmulhrsw m6, m3 +pmaddubsw m5, m4, [r5 + 24 * 16] +pmulhrsw m5, m3 +packuswb m6, m5 +movu [r0 + 411 * 16], m6 + +; mode 27 [row 12] +pmaddubsw m6, m7, [r5 + 26 * 16] +pmulhrsw m6, m3 +pmaddubsw m5, m4, [r5 + 26 * 16] +pmulhrsw m5, m3 +packuswb m6, m5 +movu [r0 + 412 * 16], m6 + +; mode 27 [row 13] +pmaddubsw m6, m7, [r5 + 28 * 16] +pmulhrsw m6, m3 +pmaddubsw m5, m4, [r5 + 28 * 16] +pmulhrsw m5, m3 +packuswb m6, m5 +movu [r0 + 413 * 16], m6 + +; mode 27 [row 14] +pmaddubsw m6, m7, [r5 + 30 * 16] 
+pmulhrsw m6, m3 +pmaddubsw m5, m4, [r5 + 30 * 16] +pmulhrsw m5, m3 +packuswb m6, m5 +movu [r0 + 414 * 16], m6 + +; mode 28 [row 0] +movu m1, [r3 + 1] +psrldq m2, m1, 1 +punpcklbw m1, m2 +movu m4, [r3 + 9] +psrldq m5, m4, 1 +punpcklbw m4, m5 +pmaddubsw m2, m1, [r5 + 5 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m4, [r5 + 5 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 416 * 16], m2 + +; mode 28 [row 0] +pmaddubsw m2, m1, [r5 + 5 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m4, [r5 + 5 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 416 * 16], m2 + +; mode 28 [row 1] +pmaddubsw m2, m1, [r5 + 10 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m4, [r5 + 10 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 417 * 16], m2 + +; mode 28 [row 2] +pmaddubsw m2, m1, [r5 + 15 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m4, [r5 + 15 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 418 * 16], m2 + +; mode 28 [row 3] +pmaddubsw m2, m1, [r5 + 20 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m4, [r5 + 20 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 419 * 16], m2 + +; mode 28 [row 4] +pmaddubsw m2, m1, [r5 + 25 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m4, [r5 + 25 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 420 * 16], m2 + +; mode 28 [row 5] +pmaddubsw m2, m1, [r5 + 30 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m4, [r5 + 30 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 421 * 16], m2 + +; mode 29 [row 0] +pmaddubsw m2, m1, [r5 + 9 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m4, [r5 + 9 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 432 * 16], m2 + +; mode 29 [row 1] +pmaddubsw m2, m1, [r5 + 18 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m4, [r5 + 18 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 433 * 16], m2 + +; mode 29 [row 2] +pmaddubsw m2, m1, [r5 + 27 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m4, [r5 + 27 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 434 * 16], m2 + +; mode 30 [row 0] +pmaddubsw m2, m1, [r5 + 13 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m4, [r5 + 13 * 16] +pmulhrsw m5, m3 
+packuswb m2, m5 +movu [r0 + 448 * 16], m2 + +; mode 30 [row 1] +pmaddubsw m2, m1, [r5 + 26 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m4, [r5 + 26 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 449 * 16], m2 + +; mode 33 [row 0] +movu [r0 + 496 * 16], m2 + +; mode 31 [row 0] +pmaddubsw m2, m1, [r5 + 17 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m4, [r5 + 17 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 464 * 16], m2 + +; mode 32 [row 0] +pmaddubsw m2, m1, [r5 + 21 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m4, [r5 + 21 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 480 * 16], m2 + +; mode 28 [row 6] +movd m7, [r3 + 9] +palignr m7, m1, 2 +pmaddubsw m2, m7, [r5 + 3 * 16] +pmulhrsw m2, m3 +movd m6, [r3 + 17] +palignr m6, m4, 2 +pmaddubsw m5, m6, [r5 + 3 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 422 * 16], m2 + +; mode 28 [row 7] +pmaddubsw m2, m7, [r5 + 8 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m6, [r5 + 8 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 423 * 16], m2 + +; mode 28 [row 8] +pmaddubsw m2, m7, [r5 + 13 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m6, [r5 + 13 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 424 * 16], m2 + +; mode 28 [row 9] +pmaddubsw m2, m7, [r5 + 18 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m6, [r5 + 18 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 425 * 16], m2 + +; mode 28 [row 10] +pmaddubsw m2, m7, [r5 + 23 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m6, [r5 + 23 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 426 * 16], m2 + +; mode 29 [row 3] +pmaddubsw m2, m7, [r5 + 4 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m6, [r5 + 4 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 435 * 16], m2 + +; mode 29 [row 4] +pmaddubsw m2, m7, [r5 + 13 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m6, [r5 + 13 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 436 * 16], m2 + +; mode 29 [row 5] +pmaddubsw m2, m7, [r5 + 22 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m6, [r5 + 22 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 437 * 16], m2 + +; mode 
29 [row 6] +pmaddubsw m2, m7, [r5 + 31 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m6, [r5 + 31 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 438 * 16], m2 + +; mode 32 [row 2] +movu [r0 + 482 * 16], m2 + +; mode 30 [row 2] +pmaddubsw m2, m7, [r5 + 7 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m6, [r5 + 7 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 450 * 16], m2 + +; mode 30 [row 3] +pmaddubsw m2, m7, [r5 + 20 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m6, [r5 + 20 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 451 * 16], m2 + +; mode 33 [row 1] +movu [r0 + 497 * 16], m2 + +; mode 31 [row 1] +pmaddubsw m2, m7, [r5 + 2 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m6, [r5 + 2 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 465 * 16], m2 + +; mode 31 [row 2] +pmaddubsw m2, m7, [r5 + 19 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m6, [r5 + 19 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 466 * 16], m2 + +; mode 32 [row 1] +pmaddubsw m2, m7, [r5 + 10 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m6, [r5 + 10 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 481 * 16], m2 + +; mode 28 [row 11] +pmaddubsw m2, m7, [r5 + 28 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m6, [r5 + 28 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 427 * 16], m2 + +; mode 28 [row 12] +movd m1, [r3 + 10] +palignr m1, m7, 2 +pmaddubsw m2, m1, [r5 + 1 * 16] +pmulhrsw m2, m3 +movd m4, [r3 + 18] +palignr m4, m6, 2 +pmaddubsw m5, m4, [r5 + 1 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 428 * 16], m2 + +; mode 30 [row 4] +movu [r0 + 452 * 16], m2 + +; mode 28 [row 13] +pmaddubsw m2, m1, [r5 + 6 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m4, [r5 + 6 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 429 * 16], m2 + +; mode 28 [row 14] +pmaddubsw m2, m1, [r5 + 11 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m4, [r5 + 11 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 430 * 16], m2 + +; mode 28 [row 15] +pmaddubsw m2, m1, [r5 + 16 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m4, [r5 + 16 * 16] +pmulhrsw m5, m3 +packuswb 
m2, m5 +movu [r0 + 431 * 16], m2 + +; mode 29 [row 7] +pmaddubsw m2, m1, [r5 + 8 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m4, [r5 + 8 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 439 * 16], m2 + +; mode 29 [row 8] +pmaddubsw m2, m1, [r5 + 17 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m4, [r5 + 17 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 440 * 16], m2 + +; mode 29 [row 9] +pmaddubsw m2, m1, [r5 + 26 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m4, [r5 + 26 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 441 * 16], m2 + +; mode 30 [row 5] +pmaddubsw m2, m1, [r5 + 14 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m4, [r5 + 14 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 453 * 16], m2 + +; mode 33 [row 2] +movu [r0 + 498 * 16], m2 + +; mode 30 [row 6] +pmaddubsw m2, m1, [r5 + 27 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m4, [r5 + 27 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 454 * 16], m2 + +; mode 31 [row 3] +pmaddubsw m2, m1, [r5 + 4 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m4, [r5 + 4 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 467 * 16], m2 + +; mode 31 [row 4] +pmaddubsw m2, m1, [r5 + 21 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m4, [r5 + 21 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 468 * 16], m2 + +; mode 32 [row 3] +pmaddubsw m2, m1, [r5 + 20 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m4, [r5 + 20 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 483 * 16], m2 + +; mode 29 [row 10] +movd m7, [r3 + 11] +palignr m7, m1, 2 +pmaddubsw m2, m7, [r5 + 3 * 16] +pmulhrsw m2, m3 +movd m6, [r3 + 19] +palignr m6, m4, 2 +pmaddubsw m5, m6, [r5 + 3 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 442 * 16], m2 + +; mode 29 [row 11] +pmaddubsw m2, m7, [r5 + 12 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m6, [r5 + 12 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 443 * 16], m2 + +; mode 29 [row 12] +pmaddubsw m2, m7, [r5 + 21 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m6, [r5 + 21 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 444 * 16], m2 + +; mode 30 [row 
8] +movu [r0 + 456 * 16], m2 + +; mode 29 [row 13] +pmaddubsw m2, m7, [r5 + 30 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m6, [r5 + 30 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 445 * 16], m2 + +; mode 32 [row 5] +movu [r0 + 485 * 16], m2 + +; mode 30 [row 7] +pmaddubsw m2, m7, [r5 + 8 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m6, [r5 + 8 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 455 * 16], m2 + +; mode 33 [row 3] +movu [r0 + 499 * 16], m2 + +; mode 31 [row 5] +pmaddubsw m2, m7, [r5 + 6 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m6, [r5 + 6 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 469 * 16], m2 + +; mode 31 [row 6] +pmaddubsw m2, m7, [r5 + 23 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m6, [r5 + 23 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 470 * 16], m2 + +; mode 32 [row 4] +pmaddubsw m2, m7, [r5 + 9 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m6, [r5 + 9 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 484 * 16], m2 + +movu m1, m7 +movu m4, m6 + +; mode 29 [row 14] +movu m1, [r3 + 12] +palignr m1, m7, 2 +pmaddubsw m2, m1, [r5 + 7 * 16] +pmulhrsw m2, m3 +movd m4, [r3 + 20] +palignr m4, m6, 2 +pmaddubsw m5, m4, [r5 + 7 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 446 * 16], m2 + +; mode 29 [row 15] +pmaddubsw m2, m1, [r5 + 16 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m4, [r5 + 16 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 447 * 16], m2 + +; mode 30 [row 9] +pmaddubsw m2, m1, [r5 + 2 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m4, [r5 + 2 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 457 * 16], m2 + +; mode 33 [row 4] +movu [r0 + 500 * 16], m2 + +; mode 30 [row 10] +pmaddubsw m2, m1, [r5 + 15 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m4, [r5 + 15 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 458 * 16], m2 + +; mode 30 [row 11] +pmaddubsw m2, m1, [r5 + 28 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m4, [r5 + 28 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 459 * 16], m2 + +; mode 33 [row 5] +movu [r0 + 501 * 16], m2 + +; mode 31 [row 7] 
+pmaddubsw m2, m1, [r5 + 8 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m4, [r5 + 8 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 471 * 16], m2 + +; mode 31 [row 8] +pmaddubsw m2, m1, [r5 + 25 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m4, [r5 + 25 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 472 * 16], m2 + +; mode 32 [row 6] +pmaddubsw m2, m1, [r5 + 19 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m4, [r5 + 19 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 486 * 16], m2 + +; mode 30 [row 12] +movd m7, [r3 + 13] +palignr m7, m1, 2 +pmaddubsw m2, m7, [r5 + 9 * 16] +pmulhrsw m2, m3 +movd m6, [r3 + 21] +palignr m6, m4, 2 +pmaddubsw m5, m6, [r5 + 9 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 460 * 16], m2 + +; mode 30 [row 13] +pmaddubsw m2, m7, [r5 + 22 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m6, [r5 + 22 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 461 * 16], m2 + +; mode 33 [row 6] +movu [r0 + 502 * 16], m2 + +; mode 31 [row 9] +pmaddubsw m2, m7, [r5 + 10 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m6, [r5 + 10 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 473 * 16], m2 + +; mode 31 [row 10] +pmaddubsw m2, m7, [r5 + 27 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m6, [r5 + 27 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 474 * 16], m2 + +; mode 32 [row 7] +pmaddubsw m2, m7, [r5 + 8 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m6, [r5 + 8 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 487 * 16], m2 + +; mode 32 [row 8] +pmaddubsw m2, m7, [r5 + 29 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m6, [r5 + 29 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 488 * 16], m2 + + +movu m1, m7 +movu m4, m6 + +; mode 30 [row 14] +movd m1, [r3 + 14] +palignr m1, m7, 2 +pmaddubsw m2, m1, [r5 + 3 * 16] +pmulhrsw m2, m3 +movd m4, [r3 + 22] +palignr m4, m6, 2 +pmaddubsw m5, m4, [r5 + 3 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 462 * 16], m2 + +; mode 30 [row 15] +pmaddubsw m2, m1, [r5 + 16 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m4, [r5 + 16 * 16] +pmulhrsw m5, m3 
+packuswb m2, m5 +movu [r0 + 463 * 16], m2 + +; mode 33 [row 7] +movu [r0 + 503 * 16], m2 + +; mode 31 [row 11] +pmaddubsw m2, m1, [r5 + 12 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m4, [r5 + 12 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 475 * 16], m2 + +; mode 31 [row 12] +pmaddubsw m2, m1, [r5 + 29 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m4, [r5 + 29 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 476 * 16], m2 + +; mode 32 [row 9] +pmaddubsw m2, m1, [r5 + 18 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m4, [r5 + 18 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 489 * 16], m2 + +; mode 31 [row 13] +movd m7, [r3 + 15] +palignr m7, m1, 2 +pmaddubsw m2, m7, [r5 + 14 * 16] +pmulhrsw m2, m3 +movd m6, [r3 + 23] +palignr m6, m4, 2 +pmaddubsw m5, m6, [r5 + 14 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 477 * 16], m2 + +; mode 31 [row 14] +pmaddubsw m2, m7, [r5 + 31 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m6, [r5 + 31 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 478 * 16], m2 + +; mode 32 [row 10] +pmaddubsw m2, m7, [r5 + 7 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m6, [r5 + 7 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 490 * 16], m2 + +; mode 32 [row 11] +pmaddubsw m2, m7, [r5 + 28 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m6, [r5 + 28 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 491 * 16], m2 + +; mode 33 [row 8] +pmaddubsw m2, m7, [r5 + 10 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m6, [r5 + 10 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 504 * 16], m2 + +; mode 31 [row 15] +movd m1, [r3 + 16] +palignr m1, m7, 2 +pmaddubsw m2, m1, [r5 + 16 * 16] +pmulhrsw m2, m3 +movd m4, [r3 + 24] +palignr m4, m6, 2 +pmaddubsw m5, m4, [r5 + 16 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 479 * 16], m2 + +; mode 32 [row 12] +pmaddubsw m2, m1, [r5 + 17 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m4, [r5 + 17 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 492 * 16], m2 + +; mode 33 [row 9] +pmaddubsw m2, m1, [r5 + 4 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, 
m4, [r5 + 4 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 505 * 16], m2 + +; mode 33 [row 10] +pmaddubsw m2, m1, [r5 + 30 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m4, [r5 + 30 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 506 * 16], m2 + +; mode 33 [row 10] +pmaddubsw m2, m1, [r5 + 4 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m4, [r5 + 4 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 505 * 16], m2 + +; mode 32 [row 13] +movd m7, [r3 + 17] +palignr m7, m1, 2 +pmaddubsw m2, m7, [r5 + 6 * 16] +pmulhrsw m2, m3 + +movd m6, [r3 + 25] +palignr m6, m4, 2 +pmaddubsw m5, m6, [r5 + 6 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 493 * 16], m2 + +; mode 32 [row 14] +pmaddubsw m2, m7, [r5 + 27 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m6, [r5 + 27 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 494 * 16], m2 + +; mode 33 [row 11] +pmaddubsw m2, m7, [r5 + 24 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m6, [r5 + 24 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 507 * 16], m2 + +; mode 32 [row 15] +movd m1, [r3 + 18] +palignr m1, m7, 2 +pmaddubsw m2, m1, [r5 + 16 * 16] +pmulhrsw m2, m3 +psrldq m4, 2 +pinsrb m4, [r3 + 26], 14 +pinsrb m4, [r3 + 27], 15 +movd m4, [r3 + 26] +palignr m4, m6, 2 +pmaddubsw m5, m4, [r5 + 16 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 495 * 16], m2 + +; mode 33 [row 12] +pmaddubsw m2, m1, [r5 + 18 * 16] +pmulhrsw m2, m3 +pmaddubsw m5, m4, [r5 + 18 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 508 * 16], m2 + +; mode 33 [row 13] +movd m7, [r3 + 19] +palignr m7, m1, 2 +pmaddubsw m2, m7, [r5 + 12 * 16] +pmulhrsw m2, m3 +movd m6, [r3 + 27] +palignr m6, m4, 2 +pmaddubsw m5, m6, [r5 + 12 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 509 * 16], m2 + +; mode 33 [row 14] +movd m1, [r3 + 20] +palignr m1, m7, 2 +pmaddubsw m2, m1, [r5 + 6 * 16] +pmulhrsw m2, m3 +movd m4, [r3 + 28] +palignr m4, m6, 2 +pmaddubsw m5, m4, [r5 + 6 * 16] +pmulhrsw m5, m3 +packuswb m2, m5 +movu [r0 + 510 * 16], m2 + +; mode 34 [row 0] +movu m1, [r3 + 2] 
+movu [r0 + 512 * 16], m1 +movu m2, [r3 + 18] +palignr m3, m2, m1, 1 +movu [r0 + 513 * 16], m3 +palignr m3, m2, m1, 2 +movu [r0 + 514 * 16], m3 +palignr m3, m2, m1, 3 +movu [r0 + 515 * 16], m3 +palignr m3, m2, m1, 4 +movu [r0 + 516 * 16], m3 +palignr m3, m2, m1, 5 +movu [r0 + 517 * 16], m3 +palignr m3, m2, m1, 6 +movu [r0 + 518 * 16], m3 +palignr m3, m2, m1, 7 +movu [r0 + 519 * 16], m3 +palignr m3, m2, m1, 8 +movu [r0 + 520 * 16], m3 +palignr m3, m2, m1, 9 +movu [r0 + 521 * 16], m3 +palignr m3, m2, m1, 10 +movu [r0 + 522 * 16], m3 +palignr m3, m2, m1, 11 +movu [r0 + 523 * 16], m3 +palignr m3, m2, m1, 12 +movu [r0 + 524 * 16], m3 + +; mode 33 [row 15] +movu [r0 + 511 * 16], m3 + +; mode 34 +palignr m3, m2, m1, 13 +movu [r0 + 525 * 16], m3 +palignr m3, m2, m1, 14 +movu [r0 + 526 * 16], m3 +palignr m3, m2, m1, 15 +movu [r0 + 527 * 16], m3 + +RET + +;----------------------------------------------------------------------------- +; void all_angs_pred_32x32(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal all_angs_pred_32x32, 6, 6, 8, dest, above0, left0, above1, left1, bLuma + +;mode 2[row 0] +movu m0, [r4 + 2] +movu [r0 + 0 * 16], m0 +movu m1, [r4 + 18] +movu [r0 + 1 * 16], m1 + +;mode 9 [row 15] +movu [r0 + 478 * 16], m0 +movu [r0 + 479 * 16], m1 + +;mode 2[row 1] +movu m2, [r4 + 34] +palignr m3, m1, m0, 1 +movu [r0 + 2 * 16], m3 +palignr m4, m2, m1, 1 +movu [r0 + 3 * 16], m4 + +; mode 9 [row 31] +movu [r0 + 510 * 16], m3 +movu [r0 + 511 * 16], m4 + +;mode 2[row 17] +movu [r0 + 34 * 16], m4 +movu m5, [r4 + 35] +movu [r0 + 35 * 16], m5 + +;mode 2[row 2] +palignr m3, m1, m0, 2 +movu [r0 + 4 * 16], m3 +palignr m4, m2, m1, 2 +movu [r0 + 5 * 16], m4 + +;mode 2[row 18] +movu [r0 + 36 * 16], m4 +movu m6, [r4 + 51] +palignr m7, m6, m5, 1 +movu [r0 + 37 * 16], m7 + +;mode 2[row 3] +palignr m3, m1, m0, 3 +movu [r0 + 6 * 16], m3 +palignr m4, 
m2, m1, 3 +movu [r0 + 7 * 16], m4 + +;mode 2[row 19] +movu [r0 + 38 * 16], m4 +palignr m7, m6, m5, 2 +movu [r0 + 39 * 16], m7 + +;mode 2[row 4] +palignr m3, m1, m0, 4 +movu [r0 + 8 * 16], m3 +palignr m4, m2, m1, 4 +movu [r0 + 9 * 16], m4 + +; mode 8 [row 31] +movu [r0 + 446 * 16], m3 +movu [r0 + 447 * 16], m4 + +;mode 2[row 20] +movu [r0 + 40 * 16], m4 +palignr m7, m6, m5, 3 +movu [r0 + 41 * 16], m7 + +; mode 4 [row 31] +movu [r0 + 190 * 16], m4 +movu [r0 + 191 * 16], m7 + +;mode 2[row 5] +palignr m3, m1, m0, 5 +movu [r0 + 10 * 16], m3 +palignr m4, m2, m1, 5 +movu [r0 + 11 * 16], m4 + +;mode 2[row 21] +movu [r0 + 42 * 16], m4 +palignr m7, m6, m5, 4 +movu [r0 + 43 * 16], m7 + +;mode 2[row 6] +palignr m3, m1, m0, 6 +movu [r0 + 12 * 16], m3 +palignr m4, m2, m1, 6 +movu [r0 + 13 * 16], m4 + +;mode 2[row 22] +movu [r0 + 44 * 16], m4 +palignr m7, m6, m5, 5 +movu [r0 + 45 * 16], m7 + +;mode 2[row 7] +palignr m3, m1, m0, 7 +movu [r0 + 14 * 16], m3 +palignr m4, m2, m1, 7 +movu [r0 + 15 * 16], m4 + +;mode 2[row 23] +movu [r0 + 46 * 16], m4 +palignr m7, m6, m5, 6 +movu [r0 + 47 * 16], m7 + +;mode 2[row 8] +palignr m3, m1, m0, 8 +movu [r0 + 16 * 16], m3 +palignr m4, m2, m1, 8 +movu [r0 + 17 * 16], m4 + +;mode 7[row 31] +movu [r0 + 382 * 16], m3 +movu [r0 + 383 * 16], m4 + +;mode 2[row 24] +movu [r0 + 48 * 16], m4 +palignr m7, m6, m5, 7 +movu [r0 + 49 * 16], m7 + +;mode 2[row 9] +palignr m3, m1, m0, 9 +movu [r0 + 18 * 16], m3 +palignr m4, m2, m1, 9 +movu [r0 + 19 * 16], m4 + +;mode 2[row 25] +movu [r0 + 50 * 16], m4 +palignr m7, m6, m5, 8 +movu [r0 + 51 * 16], m7 + +; mode 3 [row 31] +movu [r0 + 126 * 16], m4 +movu [r0 + 127 * 16], m7 + +;mode 2[row 10] +palignr m3, m1, m0, 10 +movu [r0 + 20 * 16], m3 +palignr m4, m2, m1, 10 +movu [r0 + 21 * 16], m4 + +;mode 2[row 26] +movu [r0 + 52 * 16], m4 +palignr m7, m6, m5, 9 +movu [r0 + 53 * 16], m7 + +;mode 2[row 11] +palignr m3, m1, m0, 11 +movu [r0 + 22 * 16], m3 +palignr m4, m2, m1, 11 +movu [r0 + 23 * 16], m4 + +;mode 2[row 27] 
+movu [r0 + 54 * 16], m4 +palignr m7, m6, m5, 10 +movu [r0 + 55 * 16], m7 + +;mode 2[row 12] +palignr m3, m1, m0, 12 +movu [r0 + 24 * 16], m3 +palignr m4, m2, m1, 12 +movu [r0 + 25 * 16], m4 + +; mode 6 [row 31] +movu [r0 + 318 * 16], m3 +movu [r0 + 319 * 16], m4 + +; mode 3 [row 15] +movu [r0 + 94 * 16], m3 +movu [r0 + 95 * 16], m4 + +;mode 2[row 28] +movu [r0 + 56 * 16], m4 +palignr m7, m6, m5, 11 +movu [r0 + 57 * 16], m7 + +;mode 2[row 13] +palignr m3, m1, m0, 13 +movu [r0 + 26 * 16], m3 +palignr m4, m2, m1, 13 +movu [r0 + 27 * 16], m4 + +;mode 2[row 29] +movu [r0 + 58 * 16], m4 +palignr m7, m6, m5, 12 +movu [r0 + 59 * 16], m7 + +;mode 2[row 14] +palignr m3, m1, m0, 14 +movu [r0 + 28 * 16], m3 +palignr m4, m2, m1, 14 +movu [r0 + 29 * 16], m4 + +;mode 2[row 30] +movu [r0 + 60 * 16], m4 +palignr m7, m6, m5, 13 +movu [r0 + 61 * 16], m7 + +;mode 2[row 15] +palignr m3, m1, m0, 15 +movu [r0 + 30 * 16], m3 +palignr m4, m2, m1, 15 +movu [r0 + 31 * 16], m4 + +;mode 2[row 31] +movu [r0 + 62 * 16], m4 +palignr m7, m6, m5, 14 +movu [r0 + 63 * 16], m7 + +;mode 2[row 16] +movu [r0 + 32 * 16], m1 +movu [r0 + 33 * 16], m2 + +; mode 5[row 31] +movu [r0 + 254 * 16], m1 +movu [r0 + 255 * 16], m2 + +; mode 3 [row 0] +lea r5, [ang_table] +movu m6, [r5 + 26 * 16] +movu m7, [pw_1024 ] +movu m1, [r4 + 1 ] +punpcklbw m1, m0 +pmaddubsw m0, m1, m6 +pmulhrsw m0, m7 +movu m2, [r4 + 9] +movd m3, [r4 + 10] +palignr m3, m2, 1 +punpcklbw m2, m3 +pmaddubsw m3, m2, m6 +pmulhrsw m3, m7 +packuswb m0, m3 +movu [r0 + 64 * 16], m0 + +; mode 6 [row 1 - first half] +movu [r0 + 258 * 16], m0 + +; mode 9 [row 12 - first half] +movu [r0 + 472 * 16], m0 + +movu m0, [r4 + 17] +movd m3, [r4 + 18] +palignr m3, m0, 1 +punpcklbw m0, m3 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +movu m4, [r4 + 25] +movd m5, [r4 + 26] +palignr m5, m4, 1 +punpcklbw m4, m5 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 65 * 16], m3 + +; mode 6 [row 1 - second half] +movu [r0 + 259 * 16], m3 + +; mode 9 [row 12 - 
second half] +movu [r0 + 473 * 16], m3 + +; mode 4 [row 0] +movu m6, [r5 + 21 * 16] +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 128 * 16], m3 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 129 * 16], m3 + +; mode 5 [row 0] +movu m6, [r5 + 17 * 16] +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 192 * 16], m3 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 193 * 16], m3 + +; mode 6 [row 0] +movu m6, [r5 + 13 * 16] +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 256 * 16], m3 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 257 * 16], m3 + +; mode 7 [row 0] +movu m6, [r5 + 9 * 16] +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 320 * 16], m3 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 321 * 16], m3 + +; mode 7 [row 1] +movu m6, [r5 + 18 * 16] +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 322 * 16], m3 + +; mode 9 [row 8 - first half] +movu [r0 + 464 * 16], m3 + +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 323 * 16], m3 + +; mode 9 [row 8 - second half] +movu [r0 + 465 * 16], m3 + +; mode 7 [row 2] +movu m6, [r5 + 27 * 16] +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 324 * 16], m3 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 325 * 16], m3 + +; mode 8 [row 0] +movu m6, [r5 + 5 * 16] +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, 
m7 +packuswb m3, m5 +movu [r0 + 384 * 16], m3 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 385 * 16], m3 + +; mode 8 [row 1] +movu m6, [r5 + 10 * 16] +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 386 * 16], m3 + +; mode 9 [row 4 - first half] +movu [r0 + 456 * 16], m3 + +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 387 * 16], m3 + +; mode 9 [row 4 - second half] +movu [r0 + 457 * 16], m3 + +; mode 8 [row 2] +movu m6, [r5 + 15 * 16] +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 388 * 16], m3 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 389 * 16], m3 + +; mode 8 [row 3] +movu m6, [r5 + 20 * 16] +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 390 * 16], m3 + +; mode 9 [row 9 - first half] +movu [r0 + 466 * 16], m3 + +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 391 * 16], m3 + +; mode 9 [row 9 - second half] +movu [r0 + 467 * 16], m3 + +; mode 8 [row 4] +movu m6, [r5 + 25 * 16] +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 392 * 16], m3 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 393 * 16], m3 + +; mode 8 [row 5] +movu m6, [r5 + 30 * 16] +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 394 * 16], m3 + +; mode 9 [row 14 - first half] +movu [r0 + 476 * 16], m3 + +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 395 * 16], m3 + +; mode 9 [row 14 - second half] +movu [r0 + 477 * 16], m3 + +; mode 9 [row 0] +movu m6, 
[r5 + 2 * 16] +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 448 * 16], m3 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 449 * 16], m3 + +; mode 9 [row 1] +movu m6, [r5 + 4 * 16] +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 450 * 16], m3 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 451 * 16], m3 + +; mode 9 [row 2] +movu m6, [r5 + 6 * 16] +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 452 * 16], m3 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 453 * 16], m3 + +; mode 9 [row 3] +movu m6, [r5 + 8 * 16] +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 454 * 16], m3 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 455 * 16], m3 + +; mode 9 [row 5] +movu m6, [r5 + 12 * 16] +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 458 * 16], m3 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 459 * 16], m3 + +; mode 9 [row 6] +movu m6, [r5 + 14 * 16] +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 460 * 16], m3 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 461 * 16], m3 + +; mode 9 [row 7] +movu m6, [r5 + 16 * 16] +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 462 * 16], m3 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 463 * 16], m3 + +; mode 9 [row 10] 
+movu m6, [r5 + 22 * 16] +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 468 * 16], m3 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 469 * 16], m3 + +; mode 9 [row 11] +movu m6, [r5 + 24 * 16] +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 470 * 16], m3 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 471 * 16], m3 + +; mode 9 [row 13] +movu m6, [r5 + 28 * 16] +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 474 * 16], m3 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 475 * 16], m3 + +; mode 3 [row 1] +movu m6, [r5 + 20 * 16] +movu m0, [r4 + 2] +movd m1, [r4 + 3] +palignr m1, m0, 1 +punpcklbw m0, m1 +pmaddubsw m1, m0, m6 +pmulhrsw m1, m7 +movu m2, [r4 + 10] +movd m3, [r4 + 11] +palignr m3, m2, 1 +punpcklbw m2, m3 +pmaddubsw m3, m2, m6 +pmulhrsw m3, m7 +packuswb m1, m3 +movu [r0 + 66 * 16], m1 + +; mode 6 [row 3 - first half] +movu [r0 + 262 * 16], m1 + +; mode 9 [row 25 - first half] +movu [r0 + 498 * 16], m1 + +movu m1, [r4 + 18] +movd m3, [r4 + 19] +palignr m3, m1, 1 +punpcklbw m1, m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +movu m4, [r4 + 26] +movd m5, [r4 + 27] +palignr m5, m4, 1 +punpcklbw m4, m5 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 67 * 16], m3 + +; mode 6 [row 3 - second half] +movu [r0 + 263 * 16], m3 + +; mode 9 [row 25 - second half] +movu [r0 + 499 * 16], m3 + +; mode 4 [row 1] +movu m6, [r5 + 10 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 130 * 16], m3 + +; mode 9 [row 20 - first half] +movu [r0 + 488 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 
+movu [r0 + 131 * 16], m3 + +; mode 9 [row 20 - second half] +movu [r0 + 489 * 16], m3 + +; mode 4 [row 2] +movu m6, [r5 + 31 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 132 * 16], m3 + +; mode 7 [row 6 - first half] +movu [r0 + 332 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 133 * 16], m3 + +; mode 7 [row 6 - second half] +movu [r0 + 333 * 16], m3 + +; mode 5 [row 1] +movu m6, [r5 + 2 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 194 * 16], m3 + +; mode 5 [row 1 - first half] +movu [r0 + 480 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 195 * 16], m3 + +; mode 5 [row 1 - second half] +movu [r0 + 481 * 16], m3 + +; mode 5 [row 2] +movu m6, [r5 + 19 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 196 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 197 * 16], m3 + +; mode 6 [row 2] +movu m6, [r5 + 7 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 260 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 261 * 16], m3 + +; mode 7 [row 3] +movu m6, [r5 + 4 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 326 * 16], m3 + +; mode 9 [row 17 - first half] +movu [r0 + 482 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 327 * 16], m3 + +; mode 9 [row 17 - second half] +movu [r0 + 483 * 16], m3 + +; mode 7 [row 4] +movu m6, [r5 + 13 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw 
m5, m7 +packuswb m3, m5 +movu [r0 + 328 * 16], m3 + +; mode 8 [row 8 - first half] +movu [r0 + 400 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 329 * 16], m3 + +; mode 8 [row 8 - second half] +movu [r0 + 401 * 16], m3 + +; mode 7 [row 5] +movu m6, [r5 + 22 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 330 * 16], m3 + +; mode 9 [row 26 - first half] +movu [r0 + 500 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 331 * 16], m3 + +; mode 9 [row 26 - second half] +movu [r0 + 501 * 16], m3 + +; mode 8 [row 6] +movu m6, [r5 + 3 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 396 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 397 * 16], m3 + +; mode 9 [row 18] +movu m6, [r5 + 6 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 484 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 485 * 16], m3 + +; mode 9 [row 21] +movu m6, [r5 + 12 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 490 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 491 * 16], m3 + +; mode 9 [row 22] +movu m6, [r5 + 14 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 492 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 493 * 16], m3 + +; mode 9 [row 23] +movu m6, [r5 + 16 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu 
[r0 + 494 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 495 * 16], m3 + +; mode 9 [row 27] +movu m6, [r5 + 24 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 502 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 503 * 16], m3 + +; mode 9 [row 28] +movu m6, [r5 + 26 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 504 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 505 * 16], m3 + +; mode 9 [row 30] +movu m6, [r5 + 30 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 508 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 509 * 16], m3 + +; mode 8 [row 7] +movu m6, [r5 + 8 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 398 * 16], m3 + +; mode 9 [row 19 - first half] +movu [r0 + 486 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 399 * 16], m3 + +; mode 9 [row 19 - second half] +movu [r0 + 487 * 16], m3 + +; mode 8 [row 9] +movu m6, [r5 + 18 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 402 * 16], m3 + +; mode 9 [row 24 - first half] +movu [r0 + 496 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 403 * 16], m3 + +; mode 9 [row 24 - second half] +movu [r0 + 497 * 16], m3 + +; mode 8 [row 10] +movu m6, [r5 + 23 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 404 * 16], m3 
+pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 405 * 16], m3 + +; mode 8 [row 11] +movu m6, [r5 + 28 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 406 * 16], m3 + +; mode 9 [row 29 - first half] +movu [r0 + 506 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 407 * 16], m3 + +; mode 9 [row 29 - second half] +movu [r0 + 507 * 16], m3 + +; mode 3 [row 2] +movu m6, [r5 + 14 * 16] +movu m0, [r4 + 3] +movd m1, [r4 + 4] +palignr m1, m0, 1 +punpcklbw m0, m1 +pmaddubsw m1, m0, m6 +pmulhrsw m1, m7 +movu m2, [r4 + 11] +movd m3, [r4 + 12] +palignr m3, m2, 1 +punpcklbw m2, m3 +pmaddubsw m3, m2, m6 +pmulhrsw m3, m7 +packuswb m1, m3 +movu [r0 + 68 * 16], m1 + +; mode 3 [row 2 - first half] +movu [r0 + 266 * 16], m1 + +movu m1, [r4 + 19] +movd m3, [r4 + 20] +palignr m3, m1, 1 +punpcklbw m1, m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +movu m4, [r4 + 27] +movd m5, [r4 + 28] +palignr m5, m4, 1 +punpcklbw m4, m5 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 69 * 16], m3 + +; mode 3 [row 2 - second half] +movu [r0 + 267 * 16], m3 + +; mode 4 [row 3] +movu m6, [r5 + 20 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 134 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 135 * 16], m3 + +; mode 5 [row 3] +movu m6, [r5 + 4 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 198 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 199 * 16], m3 + +; mode 5 [row 4] +movu m6, [r5 + 21 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 200 * 16], m3 + +; mode 8 [row 
16 - first half] +movu [r0 + 416 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 201 * 16], m3 + +; mode 8 [row 16 - second half] +movu [r0 + 417 * 16], m3 + +; mode 6 [row 4] +movu m6, [r5 + 1 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 264 * 16], m3 + +; mode 6 [row 4 - first half] +movu [r0 + 408 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 265 * 16], m3 + +; mode 6 [row 4 - second half] +movu [r0 + 409 * 16], m3 + +; mode 6 [row 6] +movu m6, [r5 + 27 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 268 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 269 * 16], m3 + +; mode 7 [row 7] +movu m6, [r5 + 8 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 334 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 335 * 16], m3 + +; mode 7 [row 8] +movu m6, [r5 + 17 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 336 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 337 * 16], m3 + +; mode 7 [row 9] +movu m6, [r5 + 26 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 338 * 16], m3 + +; mode 8 [row 17 - first half] +movu [r0 + 418 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 339 * 16], m3 + +; mode 8 [row 17 - second half] +movu [r0 + 419 * 16], m3 + +; mode 8 [row 13] +movu m6, [r5 + 6 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw 
m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 410 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 411 * 16], m3 + +; mode 8 [row 14] +movu m6, [r5 + 11 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 412 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 413 * 16], m3 + +; mode 8 [row 15] +movu m6, [r5 + 16 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 414 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 415 * 16], m3 + +; mode 8 [row 18] +movu m6, [r5 + 31 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 420 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 421 * 16], m3 + +; mode 3 [row 3] +movu m6, [r5 + 8 * 16] +movu m0, [r4 + 4] +movd m1, [r4 + 5] +palignr m1, m0, 1 +punpcklbw m0, m1 +pmaddubsw m1, m0, m6 +pmulhrsw m1, m7 +movu m2, [r4 + 12] +movd m3, [r4 + 13] +palignr m3, m2, 1 +punpcklbw m2, m3 +pmaddubsw m3, m2, m6 +pmulhrsw m3, m7 +packuswb m1, m3 +movu [r0 + 70 * 16], m1 + +; mode 6 [row 7 - first half] +movu [r0 + 270 * 16], m1 + +movu m1, [r4 + 20] +movd m3, [r4 + 21] +palignr m3, m1, 1 +punpcklbw m1, m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +movu m4, [r4 + 28] +movd m5, [r4 + 29] +palignr m5, m4, 1 +punpcklbw m4, m5 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 71 * 16], m3 + +; mode 6 [row 7 - second half] +movu [r0 + 271 * 16], m3 + +; mode 4 [row 4] +movu m6, [r5 + 9 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 136 * 16], m3 + +; mode 4 [row 4 - first half] +movu [r0 + 424 * 16], m3 + 
+pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 137 * 16], m3 + +; mode 4 [row 4 - second half] +movu [r0 + 425 * 16], m3 + +; mode 4 [row 5] +movu m6, [r5 + 30 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 138 * 16], m3 + +; mode 7 [row 13 - first half] +movu [r0 + 346 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 139 * 16], m3 + +; mode 7 [row 13 - second half] +movu [r0 + 347 * 16], m3 + +; mode 5 [row 5] +movu m6, [r5 + 6 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 202 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 203 * 16], m3 + +; mode 5 [row 6] +movu m6, [r5 + 23 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 204 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 205 * 16], m3 + +; mode 6 [row 8] +movu m6, [r5 + 21 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 272 * 16], m3 + +; mode 7 [row 12 - first half] +movu [r0 + 344 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 273 * 16], m3 + +; mode 7 [row 12 - second half] +movu [r0 + 345 * 16], m3 + +; mode 7 [row 10] +movu m6, [r5 + 3 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 340 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 341 * 16], m3 + +; mode 7 [row 11] +movu m6, [r5 + 12 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, 
m5 +movu [r0 + 342 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 343 * 16], m3 + +; mode 8 [row 19] +movu m6, [r5 + 4 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 422 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 423 * 16], m3 + +; mode 8 [row 21] +movu m6, [r5 + 14 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 426 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 427 * 16], m3 + +; mode 8 [row 22] +movu m6, [r5 + 19 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 428 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 429 * 16], m3 + +; mode 8 [row 23] +movu m6, [r5 + 24 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 430 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 431 * 16], m3 + +; mode 8 [row 24] +movu m6, [r5 + 29 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 432 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 433 * 16], m3 + +; mode 3 [row 4] +movu m6, [r5 + 2 * 16] +movu m0, [r4 + 5] +movd m1, [r4 + 6] +palignr m1, m0, 1 +punpcklbw m0, m1 +pmaddubsw m1, m0, m6 +pmulhrsw m1, m7 +movu m2, [r4 + 13] +movd m3, [r4 + 14] +palignr m3, m2, 1 +punpcklbw m2, m3 +pmaddubsw m3, m2, m6 +pmulhrsw m3, m7 +packuswb m1, m3 +movu [r0 + 72 * 16], m1 + +; mode 3 [row 4 - first half] +movu [r0 + 274 * 16], m1 + +; mode 8 [row 25 - first half] 
+movu [r0 + 434 * 16], m1 + +movu m1, [r4 + 21] +movd m3, [r4 + 22] +palignr m3, m1, 1 +punpcklbw m1, m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +movu m4, [r4 + 29] +movd m5, [r4 + 30] +palignr m5, m4, 1 +punpcklbw m4, m5 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 73 * 16], m3 + +; mode 3 [row 4 - second half] +movu [r0 + 275 * 16], m3 + +; mode 8 [row 25 - second half] +movu [r0 + 435 * 16], m3 + +; mode 3 [row 5] +movu m6, [r5 + 28 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 74 * 16], m3 + +; mode 3 [row 5 - first half] +movu [r0 + 278 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 75 * 16], m3 + +; mode 3 [row 5 - second half] +movu [r0 + 279 * 16], m3 + +; mode 4 [row 6] +movu m6, [r5 + 19 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 140 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 141 * 16], m3 + +; mode 5 [row 7] +movu m6, [r5 + 8 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 206 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 207 * 16], m3 + +; mode 5 [row 8] +movu m6, [r5 + 25 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 208 * 16], m3 + +; mode 7 [row 16 - first half] +movu [r0 + 352 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 209 * 16], m3 + +; mode 7 [row 16 - second half] +movu [r0 + 353 * 16], m3 + +; mode 6 [row 10] +movu m6, [r5 + 15 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 276 * 16], m3 +pmaddubsw m3, m1, 
m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 277 * 16], m3 + +; mode 7 [row 14] +movu m6, [r5 + 7 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 348 * 16], m3 + +; mode 8 [row 26 - first half] +movu [r0 + 436 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 349 * 16], m3 + +; mode 8 [row 26 - second half] +movu [r0 + 437 * 16], m3 + +; mode 7 [row 15] +movu m6, [r5 + 16 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 350 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 351 * 16], m3 + +; mode 8 [row 27] +movu m6, [r5 + 12 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 438 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 439 * 16], m3 + +; mode 8 [row 28] +movu m6, [r5 + 17 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 440 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 441 * 16], m3 + +; mode 8 [row 29] +movu m6, [r5 + 22 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 442 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 443 * 16], m3 + +; mode 8 [row 30] +movu m6, [r5 + 27 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 444 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 445 * 16], m3 + +; mode 3 [row 6] +movu m6, [r5 + 22 * 16] 
+movu m0, [r4 + 6] +movd m1, [r4 + 7] +palignr m1, m0, 1 +punpcklbw m0, m1 +pmaddubsw m1, m0, m6 +pmulhrsw m1, m7 +movu m2, [r4 + 14] +movd m3, [r4 + 15] +palignr m3, m2, 1 +punpcklbw m2, m3 +pmaddubsw m3, m2, m6 +pmulhrsw m3, m7 +packuswb m1, m3 +movu [r0 + 76 * 16], m1 + +; mode 6 [row 13 - first half] +movu [r0 + 282 * 16], m1 + +movu m1, [r4 + 22] +movd m3, [r4 + 23] +palignr m3, m1, 1 +punpcklbw m1, m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +movu m4, [r4 + 30] +movd m5, [r4 + 31] +palignr m5, m4, 1 +punpcklbw m4, m5 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 77 * 16], m3 + +; mode 6 [row 13 - second half] +movu [r0 + 283 * 16], m3 + +; mode 4 [row 7] +movu m6, [r5 + 8 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 142 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 143 * 16], m3 + +; mode 4 [row 8] +movu m6, [r5 + 29 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 144 * 16], m3 + +; mode 4 [row 8 - first half] +movu [r0 + 360 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 145 * 16], m3 + +; mode 4 [row 8 - second half] +movu [r0 + 361 * 16], m3 + +; mode 5 [row 9] +movu m6, [r5 + 10 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 210 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 211 * 16], m3 + +; mode 5 [row 10] +movu m6, [r5 + 27 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 212 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 213 * 16], m3 + +; mode 7 [row 17] +movu m6, [r5 + 2 * 16] +pmaddubsw m3, 
m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 354 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 355 * 16], m3 + +; mode 7 [row 18] +movu m6, [r5 + 11 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 356 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 357 * 16], m3 + +; mode 7 [row 19] +movu m6, [r5 + 20 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 358 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 359 * 16], m3 + +; mode 6 [row 12] +movu m6, [r5 + 9 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 280 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 281 * 16], m3 + +; mode 3 [row 7] +movu m6, [r5 + 16 * 16] +movu m0, [r4 + 7] +movd m1, [r4 + 8] +palignr m1, m0, 1 +punpcklbw m0, m1 +pmaddubsw m1, m0, m6 +pmulhrsw m1, m7 +movu m2, [r4 + 15] +movd m3, [r4 + 16] +palignr m3, m2, 1 +punpcklbw m2, m3 +pmaddubsw m3, m2, m6 +pmulhrsw m3, m7 +packuswb m1, m3 +movu [r0 + 78 * 16], m1 + +; mode 6 [row 15 - first half] +movu [r0 + 286 * 16], m1 + +movu m1, [r4 + 23] +movd m3, [r4 + 24] +palignr m3, m1, 1 +punpcklbw m1, m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +movu m4, [r4 + 31] +movd m5, [r4 + 32] +palignr m5, m4, 1 +punpcklbw m4, m5 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 79 * 16], m3 + +; mode 6 [row 15 - second half] +movu [r0 + 287 * 16], m3 + +; mode 4 [row 9] +movu m6, [r5 + 18 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 146 * 16], m3 +pmaddubsw m3, m1, m6 
+pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 147 * 16], m3 + +; mode 5 [row 11] +movu m6, [r5 + 12 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 214 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 215 * 16], m3 + +; mode 5 [row 12] +movu m6, [r5 + 29 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 216 * 16], m3 + +; mode 6 [row 16 - first half] +movu [r0 + 288 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 217 * 16], m3 + +; mode 6 [row 16 - second half] +movu [r0 + 289 * 16], m3 + +; mode 6 [row 14] +movu m6, [r5 + 3 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 284 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 285 * 16], m3 + +; mode 7 [row 21] +movu m6, [r5 + 6 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 362 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 363 * 16], m3 + +; mode 7 [row 22] +movu m6, [r5 + 15 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 364 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 365 * 16], m3 + +; mode 7 [row 23] +movu m6, [r5 + 24 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 366 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 367 * 16], m3 + +; mode 3 [row 8] +movu m6, [r5 + 10 * 16] +movu 
m0, [r4 + 8] +movd m1, [r4 + 9] +palignr m1, m0, 1 +punpcklbw m0, m1 +pmaddubsw m1, m0, m6 +pmulhrsw m1, m7 +movu m2, [r4 + 16] +movd m3, [r4 + 17] +palignr m3, m2, 1 +punpcklbw m2, m3 +pmaddubsw m3, m2, m6 +pmulhrsw m3, m7 +packuswb m1, m3 +movu [r0 + 80 * 16], m1 + +; mode 7 [row 25 - first half] +movu [r0 + 290 * 16], m1 + +; mode 6 [row 17 - first half] +movu [r0 + 370 * 16], m1 + +movu m1, [r4 + 24] +movd m3, [r4 + 25] +palignr m3, m1, 1 +punpcklbw m1, m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +movu m4, [r4 + 32] +movd m5, [r4 + 33] +palignr m5, m4, 1 +punpcklbw m4, m5 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 81 * 16], m3 + +; mode 7 [row 25 - second half] +movu [r0 + 291 * 16], m3 + +; mode 6 [row 17 - second half] +movu [r0 + 371 * 16], m3 + +; mode 4 [row 10] +movu m6, [r5 + 7 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 148 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 149 * 16], m3 + +; mode 4 [row 11] +movu m6, [r5 + 28 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 150 * 16], m3 + +; mode 7 [row 27 - first half] +movu [r0 + 374 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 151 * 16], m3 + +; mode 7 [row 27 - second half] +movu [r0 + 375 * 16], m3 + +; mode 5 [row 13] +movu m6, [r5 + 14 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 218 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 219 * 16], m3 + +; mode 5 [row 14] +movu m6, [r5 + 31 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 220 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 
+pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 221 * 16], m3 + +; mode 6 [row 18] +movu m6, [r5 + 23 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 292 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 293 * 16], m3 + +; mode 7 [row 24] +movu m6, [r5 + 1 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 368 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 369 * 16], m3 + +; mode 7 [row 26] +movu m6, [r5 + 19 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 372 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 373 * 16], m3 + +; mode 3 [row 9] +movu m6, [r5 + 4 * 16] +movu m0, [r4 + 9] +movd m1, [r4 + 10] +palignr m1, m0, 1 +punpcklbw m0, m1 +pmaddubsw m1, m0, m6 +pmulhrsw m1, m7 +movu m2, [r4 + 17] +movd m3, [r4 + 18] +palignr m3, m2, 1 +punpcklbw m2, m3 +pmaddubsw m3, m2, m6 +pmulhrsw m3, m7 +packuswb m1, m3 +movu [r0 + 82 * 16], m1 + +; mode 6 [row 19 - first half] +movu [r0 + 294 * 16], m1 + +movu m1, [r4 + 25] +movd m3, [r4 + 26] +palignr m3, m1, 1 +punpcklbw m1, m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +movu m4, [r4 + 33] +movd m5, [r4 + 34] +palignr m5, m4, 1 +punpcklbw m4, m5 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 83 * 16], m3 + +; mode 6 [row 19 - second half] +movu [r0 + 295 * 16], m3 + +; mode 4 [row 12] +movu m6, [r5 + 17 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 152 * 16], m3 + +; mode 4 [row 12 - first half] +movu [r0 + 296 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 153 * 16], m3 + +; mode 4 
[row 12 - second half] +movu [r0 + 297 * 16], m3 + +; mode 3 [row 10] +movu m6, [r5 + 30 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 84 * 16], m3 + +; mode 6 [row 21 - first half] +movu [r0 + 298 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 85 * 16], m3 + +; mode 6 [row 21 - second half] +movu [r0 + 299 * 16], m3 + +; mode 5 [row 15] +movu m6, [r5 + 16 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 222 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 223 * 16], m3 + +; mode 7 [row 28] +movu m6, [r5 + 5 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 376 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 377 * 16], m3 + +; mode 7 [row 29] +movu m6, [r5 + 14 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 378 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 379 * 16], m3 + +; mode 7 [row 30] +movu m6, [r5 + 23 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 380 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 381 * 16], m3 + +; mode 3 [row 11] +movu m6, [r5 + 24 * 16] +movu m0, [r4 + 10] +movd m1, [r4 + 11] +palignr m1, m0, 1 +punpcklbw m0, m1 +pmaddubsw m1, m0, m6 +pmulhrsw m1, m7 +movu m2, [r4 + 18] +movd m3, [r4 + 19] +palignr m3, m2, 1 +punpcklbw m2, m3 +pmaddubsw m3, m2, m6 +pmulhrsw m3, m7 +packuswb m1, m3 +movu [r0 + 86 * 16], m1 + +; mode 6 [row 23 - first half] +movu [r0 + 302 * 16], m1 + +movu 
m1, [r4 + 26] +movd m3, [r4 + 27] +palignr m3, m1, 1 +punpcklbw m1, m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +movu m4, [r4 + 34] +movd m5, [r4 + 35] +palignr m5, m4, 1 +punpcklbw m4, m5 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 87 * 16], m3 + +; mode 6 [row 23 - second half] +movu [r0 + 303 * 16], m3 + +; mode 4 [row 13] +movu m6, [r5 + 6 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 154 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 155 * 16], m3 + +; mode 4 [row 14] +movu m6, [r5 + 27 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 156 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 157 * 16], m3 + +; mode 5 [row 16] +movu m6, [r5 + 1 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 224 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 225 * 16], m3 + +; mode 5 [row 17] +movu m6, [r5 + 18 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 226 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 227 * 16], m3 + +; mode 6 [row 22] +movu m6, [r5 + 11 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 300 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 301 * 16], m3 + +; mode 3 [row 12] +movu m6, [r5 + 18 * 16] +movu m0, [r4 + 11] +movd m1, [r4 + 12] +palignr m1, m0, 1 +punpcklbw m0, m1 +pmaddubsw m1, m0, m6 +pmulhrsw m1, m7 +movu m2, [r4 + 19] +movd m3, [r4 + 20] +palignr m3, m2, 1 +punpcklbw 
m2, m3 +pmaddubsw m3, m2, m6 +pmulhrsw m3, m7 +packuswb m1, m3 +movu [r0 + 88 * 16], m1 + +; mode 6 [row 25 - first half] +movu [r0 + 306 * 16], m1 + +movu m1, [r4 + 27] +movd m3, [r4 + 28] +palignr m3, m1, 1 +punpcklbw m1, m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +movu m4, [r4 + 35] +movd m5, [r4 + 36] +palignr m5, m4, 1 +punpcklbw m4, m5 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 89 * 16], m3 + +; mode 6 [row 25 - second half] +movu [r0 + 307 * 16], m3 + +; mode 4 [row 15] +movu m6, [r5 + 16 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 158 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 159 * 16], m3 + +; mode 5 [row 18] +movu m6, [r5 + 3 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 228 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 229 * 16], m3 + +; mode 5 [row 19] +movu m6, [r5 + 20 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 230 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 231 * 16], m3 + +; mode 6 [row 24] +movu m6, [r5 + 5 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 304 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 305 * 16], m3 + +; mode 6 [row 26] +movu m6, [r5 + 31 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 308 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 309 * 16], m3 + +; mode 3 [row 13] +movu m6, [r5 + 12 * 16] +movu m0, [r4 + 12] +movd m1, 
[r4 + 13] +palignr m1, m0, 1 +punpcklbw m0, m1 +pmaddubsw m1, m0, m6 +pmulhrsw m1, m7 +movu m2, [r4 + 20] +movd m3, [r4 + 21] +palignr m3, m2, 1 +punpcklbw m2, m3 +pmaddubsw m3, m2, m6 +pmulhrsw m3, m7 +packuswb m1, m3 +movu [r0 + 90 * 16], m1 + +movu m1, [r4 + 28] +movd m3, [r4 + 29] +palignr m3, m1, 1 +punpcklbw m1, m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +movu m4, [r4 + 36] +movd m5, [r4 + 37] +palignr m5, m4, 1 +punpcklbw m4, m5 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 91 * 16], m3 + +; mode 4 [row 16] +movu m6, [r5 + 5 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 160 * 16], m3 + +; mode 5 [row 20 - first half] +movu [r0 + 232 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 161 * 16], m3 + +; mode 5 [row 20 - second half] +movu [r0 + 233 * 16], m3 + +; mode 4 [row 17] +movu m6, [r5 + 26 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 162 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 163 * 16], m3 + +; mode 5 [row 21] +movu m6, [r5 + 22 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 234 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 235 * 16], m3 + +; mode 6 [row 27] +movu m6, [r5 + 12 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 310 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 311 * 16], m3 + +; mode 6 [row 28] +movu m6, [r5 + 25 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 312 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 
+pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 313 * 16], m3 + +; mode 3 [row 14] +movu m6, [r5 + 6 * 16] +movu m0, [r4 + 13] +movd m1, [r4 + 14] +palignr m1, m0, 1 +punpcklbw m0, m1 +pmaddubsw m1, m0, m6 +pmulhrsw m1, m7 +movu m2, [r4 + 21] +movd m3, [r4 + 22] +palignr m3, m2, 1 +punpcklbw m2, m3 +pmaddubsw m3, m2, m6 +pmulhrsw m3, m7 +packuswb m1, m3 +movu [r0 + 92 * 16], m1 + +; mode 6 [row 29 - first half] +movu [r0 + 314 * 16], m1 + +movu m1, [r4 + 29] +movd m3, [r4 + 30] +palignr m3, m1, 1 +punpcklbw m1, m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +movu m4, [r4 + 37] +movd m5, [r4 + 38] +palignr m5, m4, 1 +punpcklbw m4, m5 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 93 * 16], m3 + +; mode 6 [row 29 - second half] +movu [r0 + 315 * 16], m3 + +; mode 4 [row 18] +movu m6, [r5 + 15 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 164 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 165 * 16], m3 + +; mode 5 [row 22] +movu m6, [r5 + 7 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 236 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 237 * 16], m3 + +; mode 5 [row 23] +movu m6, [r5 + 24 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 238 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 239 * 16], m3 + +; mode 6 [row 30] +movu m6, [r5 + 19 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 316 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 317 * 16], m3 + +; mode 3 [row 16] +movu m6, [r5 + 26 * 16] 
+movu m0, [r4 + 14] +movd m1, [r4 + 15] +palignr m1, m0, 1 +punpcklbw m0, m1 +pmaddubsw m1, m0, m6 +pmulhrsw m1, m7 +movu m2, [r4 + 22] +movd m3, [r4 + 23] +palignr m3, m2, 1 +punpcklbw m2, m3 +pmaddubsw m3, m2, m6 +pmulhrsw m3, m7 +packuswb m1, m3 +movu [r0 + 96 * 16], m1 + +; mode 5 [row 25 - first half] +movu [r0 + 242 * 16], m1 + +movu m1, [r4 + 30] +movd m3, [r4 + 31] +palignr m3, m1, 1 +punpcklbw m1, m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +movu m4, [r4 + 38] +movd m5, [r4 + 39] +palignr m5, m4, 1 +punpcklbw m4, m5 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 97 * 16], m3 + +; mode 5 [row 25 - second half] +movu [r0 + 243 * 16], m3 + +; mode 4 [row 19] +movu m6, [r5 + 4 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 166 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 167 * 16], m3 + +; mode 4 [row 20] +movu m6, [r5 + 25 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 168 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 169 * 16], m3 + +; mode 5 [row 24] +movu m6, [r5 + 9 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 240 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 241 * 16], m3 + +; mode 3 [row 17] +movu m6, [r5 + 20 * 16] +movu m0, [r4 + 15] +movd m1, [r4 + 16] +palignr m1, m0, 1 +punpcklbw m0, m1 +pmaddubsw m1, m0, m6 +pmulhrsw m1, m7 +movu m2, [r4 + 23] +movd m3, [r4 + 24] +palignr m3, m2, 1 +punpcklbw m2, m3 +pmaddubsw m3, m2, m6 +pmulhrsw m3, m7 +packuswb m1, m3 +movu [r0 + 98 * 16], m1 + +movu m1, [r4 + 31] +movd m3, [r4 + 32] +palignr m3, m1, 1 +punpcklbw m1, m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +movu m4, [r4 + 39] +movd 
m5, [r4 + 40] +palignr m5, m4, 1 +punpcklbw m4, m5 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 99 * 16], m3 + +; mode 4 [row 21] +movu m6, [r5 + 14 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 170 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 171 * 16], m3 + +; mode 5 [row 26] +movu m6, [r5 + 11 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 244 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 245 * 16], m3 + +; mode 5 [row 27] +movu m6, [r5 + 28 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 246 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 247 * 16], m3 + +; mode 3 [row 18] +movu m6, [r5 + 14 * 16] +movu m0, [r4 + 16] +movd m1, [r4 + 17] +palignr m1, m0, 1 +punpcklbw m0, m1 +pmaddubsw m1, m0, m6 +pmulhrsw m1, m7 +movu m2, [r4 + 24] +movd m3, [r4 + 25] +palignr m3, m2, 1 +punpcklbw m2, m3 +pmaddubsw m3, m2, m6 +pmulhrsw m3, m7 +packuswb m1, m3 +movu [r0 + 100 * 16], m1 + +movu m1, [r4 + 32] +movd m3, [r4 + 33] +palignr m3, m1, 1 +punpcklbw m1, m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +movu m4, [r4 + 40] +movd m5, [r4 + 41] +palignr m5, m4, 1 +punpcklbw m4, m5 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 101 * 16], m3 + +; mode 4 [row 22] +movu m6, [r5 + 3 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 172 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 173 * 16], m3 + +; mode 4 [row 23] +movu m6, [r5 + 24 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw 
m5, m7 +packuswb m3, m5 +movu [r0 + 174 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 175 * 16], m3 + +; mode 5 [row 28] +movu m6, [r5 + 13 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 248 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 249 * 16], m3 + +; mode 5 [row 29] +movu m6, [r5 + 30 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 250 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 251 * 16], m3 + +; mode 3 [row 19] +movu m6, [r5 + 8 * 16] +movu m0, [r4 + 17] +movd m1, [r4 + 18] +palignr m1, m0, 1 +punpcklbw m0, m1 +pmaddubsw m1, m0, m6 +pmulhrsw m1, m7 +movu m2, [r4 + 25] +movd m3, [r4 + 26] +palignr m3, m2, 1 +punpcklbw m2, m3 +pmaddubsw m3, m2, m6 +pmulhrsw m3, m7 +packuswb m1, m3 +movu [r0 + 102 * 16], m1 + +movu m1, [r4 + 33] +movd m3, [r4 + 34] +palignr m3, m1, 1 +punpcklbw m1, m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +movu m4, [r4 + 41] +movd m5, [r4 + 42] +palignr m5, m4, 1 +punpcklbw m4, m5 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 103 * 16], m3 + +; mode 4 [row 24] +movu m6, [r5 + 13 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 176 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 177 * 16], m3 + +; mode 5 [row 30] +movu m6, [r5 + 15 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 252 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 253 * 16], m3 + +; mode 3 [row 20] +movu m6, [r5 + 2 * 16] +movu m0, [r4 + 18] +movd m1, [r4 
+ 19] +palignr m1, m0, 1 +punpcklbw m0, m1 +pmaddubsw m1, m0, m6 +pmulhrsw m1, m7 +movu m2, [r4 + 26] +movd m3, [r4 + 27] +palignr m3, m2, 1 +punpcklbw m2, m3 +pmaddubsw m3, m2, m6 +pmulhrsw m3, m7 +packuswb m1, m3 +movu [r0 + 104 * 16], m1 + +movu m1, [r4 + 34] +movd m3, [r4 + 35] +palignr m3, m1, 1 +punpcklbw m1, m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +movu m4, [r4 + 42] +movd m5, [r4 + 43] +palignr m5, m4, 1 +punpcklbw m4, m5 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 105 * 16], m3 + +; mode 4 [row 25] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 178 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 179 * 16], m3 + +; mode 4 [row 26] +movu m6, [r5 + 23 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 180 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 181 * 16], m3 + +; mode 3 [row 21] +movu m6, [r5 + 28 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 106 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 107 * 16], m3 + +; mode 3 [row 22] +movu m6, [r5 + 22 * 16] +movu m0, [r4 + 19] +movd m1, [r4 + 20] +palignr m1, m0, 1 +punpcklbw m0, m1 +pmaddubsw m1, m0, m6 +pmulhrsw m1, m7 +movu m2, [r4 + 27] +movd m3, [r4 + 28] +palignr m3, m2, 1 +punpcklbw m2, m3 +pmaddubsw m3, m2, m6 +pmulhrsw m3, m7 +packuswb m1, m3 +movu [r0 + 108 * 16], m1 + +movu m1, [r4 + 35] +movd m3, [r4 + 36] +palignr m3, m1, 1 +punpcklbw m1, m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +movu m4, [r4 + 43] +movd m5, [r4 + 44] +palignr m5, m4, 1 +punpcklbw m4, m5 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 109 * 16], m3 + +; mode 4 [row 27] +movu m6, [r5 + 12 * 
16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 182 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 183 * 16], m3 + +; mode 3 [row 23] +movu m6, [r5 + 16 * 16] +movu m0, [r4 + 20] +movd m1, [r4 + 21] +palignr m1, m0, 1 +punpcklbw m0, m1 +pmaddubsw m1, m0, m6 +pmulhrsw m1, m7 +movu m2, [r4 + 28] +movd m3, [r4 + 29] +palignr m3, m2, 1 +punpcklbw m2, m3 +pmaddubsw m3, m2, m6 +pmulhrsw m3, m7 +packuswb m1, m3 +movu [r0 + 110 * 16], m1 + +movu m1, [r4 + 36] +movd m3, [r4 + 37] +palignr m3, m1, 1 +punpcklbw m1, m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +movu m4, [r4 + 44] +movd m5, [r4 + 45] +palignr m5, m4, 1 +punpcklbw m4, m5 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 111 * 16], m3 + +; mode 4 [row 28] +movu m6, [r5 + 1 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 184 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 185 * 16], m3 + +; mode 4 [row 29] +movu m6, [r5 + 22 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 186 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 187 * 16], m3 + +; mode 3 [row 24] +movu m6, [r5 + 10 * 16] +movu m0, [r4 + 21] +movd m1, [r4 + 22] +palignr m1, m0, 1 +punpcklbw m0, m1 +pmaddubsw m1, m0, m6 +pmulhrsw m1, m7 +movu m2, [r4 + 29] +movd m3, [r4 + 30] +palignr m3, m2, 1 +punpcklbw m2, m3 +pmaddubsw m3, m2, m6 +pmulhrsw m3, m7 +packuswb m1, m3 +movu [r0 + 112 * 16], m1 + +movu m1, [r4 + 37] +movd m3, [r4 + 38] +palignr m3, m1, 1 +punpcklbw m1, m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +movu m4, [r4 + 45] +movd m5, [r4 + 46] +palignr m5, m4, 1 +punpcklbw m4, m5 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu 
[r0 + 113 * 16], m3 + +; mode 4 [row 30] +movu m6, [r5 + 11 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 188 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 189 * 16], m3 + +; mode 3 [row 25] +movu m6, [r5 + 4 * 16] +movu m0, [r4 + 22] +movd m1, [r4 + 23] +palignr m1, m0, 1 +punpcklbw m0, m1 +pmaddubsw m1, m0, m6 +pmulhrsw m1, m7 +movu m2, [r4 + 30] +movd m3, [r4 + 31] +palignr m3, m2, 1 +punpcklbw m2, m3 +pmaddubsw m3, m2, m6 +pmulhrsw m3, m7 +packuswb m1, m3 +movu [r0 + 114 * 16], m1 + +movu m1, [r4 + 38] +movd m3, [r4 + 39] +palignr m3, m1, 1 +punpcklbw m1, m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +movu m4, [r4 + 46] +movd m5, [r4 + 47] +palignr m5, m4, 1 +punpcklbw m4, m5 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 115 * 16], m3 + +; mode 3 [row 26] +movu m6, [r5 + 30 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 116 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 117 * 16], m3 + +; mode 3 [row 27] +movu m6, [r5 + 24 * 16] +movu m0, [r4 + 23] +movd m1, [r4 + 24] +palignr m1, m0, 1 +punpcklbw m0, m1 +pmaddubsw m1, m0, m6 +pmulhrsw m1, m7 +movu m2, [r4 + 31] +movd m3, [r4 + 32] +palignr m3, m2, 1 +punpcklbw m2, m3 +pmaddubsw m3, m2, m6 +pmulhrsw m3, m7 +packuswb m1, m3 +movu [r0 + 118 * 16], m1 + +movu m1, [r4 + 39] +movd m3, [r4 + 40] +palignr m3, m1, 1 +punpcklbw m1, m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +movu m4, [r4 + 47] +movd m5, [r4 + 48] +palignr m5, m4, 1 +punpcklbw m4, m5 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 119 * 16], m3 + +; mode 3 [row 28] +movu m6, [r5 + 18 * 16] +movu m0, [r4 + 24] +movd m1, [r4 + 25] +palignr m1, m0, 1 +punpcklbw m0, m1 +pmaddubsw m1, m0, m6 +pmulhrsw m1, m7 +movu m2, [r4 + 32] +movd m3, [r4 + 33] 
+palignr m3, m2, 1 +punpcklbw m2, m3 +pmaddubsw m3, m2, m6 +pmulhrsw m3, m7 +packuswb m1, m3 +movu [r0 + 120 * 16], m1 + +movu m1, [r4 + 40] +movd m3, [r4 + 41] +palignr m3, m1, 1 +punpcklbw m1, m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +movu m4, [r4 + 48] +movd m5, [r4 + 49] +palignr m5, m4, 1 +punpcklbw m4, m5 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 121 * 16], m3 + +; mode 3 [row 29] +movu m6, [r5 + 12 * 16] +movu m0, [r4 + 25] +movd m1, [r4 + 26] +palignr m1, m0, 1 +punpcklbw m0, m1 +pmaddubsw m1, m0, m6 +pmulhrsw m1, m7 +movu m2, [r4 + 33] +movd m3, [r4 + 34] +palignr m3, m2, 1 +punpcklbw m2, m3 +pmaddubsw m3, m2, m6 +pmulhrsw m3, m7 +packuswb m1, m3 +movu [r0 + 122 * 16], m1 + +movu m1, [r4 + 41] +movd m3, [r4 + 42] +palignr m3, m1, 1 +punpcklbw m1, m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +movu m4, [r4 + 49] +movd m5, [r4 + 50] +palignr m5, m4, 1 +punpcklbw m4, m5 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 123 * 16], m3 + +; mode 3 [row 30] +movu m6, [r5 + 6 * 16] +movu m0, [r4 + 26] +movd m1, [r4 + 27] +palignr m1, m0, 1 +punpcklbw m0, m1 +pmaddubsw m1, m0, m6 +pmulhrsw m1, m7 +movu m2, [r4 + 34] +movd m3, [r4 + 35] +palignr m3, m2, 1 +punpcklbw m2, m3 +pmaddubsw m3, m2, m6 +pmulhrsw m3, m7 +packuswb m1, m3 +movu [r0 + 124 * 16], m1 + +movu m1, [r4 + 42] +movd m3, [r4 + 43] +palignr m3, m1, 1 +punpcklbw m1, m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +movu m4, [r4 + 50] +movd m5, [r4 + 51] +palignr m5, m4, 1 +punpcklbw m4, m5 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 125 * 16], m3 + +; mode 10 +movu m1, [r2 + 1] +movu m2, [r2 + 17] +movu [r0 + 512 * 16], m1 +movu [r0 + 513 * 16], m2 +movu [r0 + 514 * 16], m1 +movu [r0 + 515 * 16], m2 +movu [r0 + 516 * 16], m1 +movu [r0 + 517 * 16], m2 +movu [r0 + 518 * 16], m1 +movu [r0 + 519 * 16], m2 +movu [r0 + 520 * 16], m1 +movu [r0 + 521 * 16], m2 +movu [r0 + 522 * 16], m1 +movu [r0 + 523 * 16], m2 +movu [r0 + 524 * 16], m1 +movu [r0 + 525 * 
16], m2 +movu [r0 + 526 * 16], m1 +movu [r0 + 527 * 16], m2 + +movu [r0 + 528 * 16], m1 +movu [r0 + 529 * 16], m2 +movu [r0 + 530 * 16], m1 +movu [r0 + 531 * 16], m2 +movu [r0 + 532 * 16], m1 +movu [r0 + 533 * 16], m2 +movu [r0 + 534 * 16], m1 +movu [r0 + 535 * 16], m2 +movu [r0 + 536 * 16], m1 +movu [r0 + 537 * 16], m2 +movu [r0 + 538 * 16], m1 +movu [r0 + 539 * 16], m2 +movu [r0 + 540 * 16], m1 +movu [r0 + 541 * 16], m2 +movu [r0 + 542 * 16], m1 +movu [r0 + 543 * 16], m2 + +movu [r0 + 544 * 16], m1 +movu [r0 + 545 * 16], m2 +movu [r0 + 546 * 16], m1 +movu [r0 + 547 * 16], m2 +movu [r0 + 548 * 16], m1 +movu [r0 + 549 * 16], m2 +movu [r0 + 550 * 16], m1 +movu [r0 + 551 * 16], m2 +movu [r0 + 552 * 16], m1 +movu [r0 + 553 * 16], m2 +movu [r0 + 554 * 16], m1 +movu [r0 + 555 * 16], m2 +movu [r0 + 556 * 16], m1 +movu [r0 + 557 * 16], m2 +movu [r0 + 558 * 16], m1 +movu [r0 + 559 * 16], m2 + +movu [r0 + 560 * 16], m1 +movu [r0 + 561 * 16], m2 +movu [r0 + 562 * 16], m1 +movu [r0 + 563 * 16], m2 +movu [r0 + 564 * 16], m1 +movu [r0 + 565 * 16], m2 +movu [r0 + 566 * 16], m1 +movu [r0 + 567 * 16], m2 +movu [r0 + 568 * 16], m1 +movu [r0 + 569 * 16], m2 +movu [r0 + 570 * 16], m1 +movu [r0 + 571 * 16], m2 +movu [r0 + 572 * 16], m1 +movu [r0 + 573 * 16], m2 +movu [r0 + 574 * 16], m1 +movu [r0 + 575 * 16], m2 + +; mode 11 [row 0] +movu m0, [r4] + +; mode 11 [row 15 - first half] +movu [r0 + 606 * 16], m0 + +movu [r0 + 606 * 16], m0 + +; mode 12 [row 31] +pslldq m6, m0, 4 +pinsrb m6, [r3 + 26], 0 +pinsrb m6, [r3 + 19], 1 +pinsrb m6, [r3 + 13], 2 +pinsrb m6, [r3 + 6], 3 +movu [r0 + 702 * 16], m6 +movu m6, [r4 + 12] +movu [r0 + 703 * 16], m6 + +; mode 11 [row 31] +pslldq m6, m0, 1 +pinsrb m6, [r3 + 16], 0 +movu [r0 + 638 * 16], m6 +movu m6, [r4 + 15] +movu [r0 + 639 * 16], m6 + +movd m1, [r4 + 1] +palignr m1, m0, 1 +punpcklbw m0, m1 +pmaddubsw m1, m0, [r5 + 30 * 16] +pmulhrsw m1, m7 +movu m2, [r4 + 8] +movd m3, [r4 + 9] +palignr m3, m2, 1 +punpcklbw m2, m3 +pmaddubsw m3, m2, [r5 + 30 
* 16] +pmulhrsw m3, m7 +packuswb m1, m3 +movu [r0 + 576 * 16], m1 + +movu m1, [r4 + 16] + +; mode 11 [row 15 - second half] +movu [r0 + 607 * 16], m1 + +movd m3, [r4 + 17] +palignr m3, m1, 1 +punpcklbw m1, m3 +pmaddubsw m3, m1, [r5 + 30 * 16] +pmulhrsw m3, m7 +movu m4, [r4 + 24] +movd m5, [r4 + 25] +palignr m5, m4, 1 +punpcklbw m4, m5 +pmaddubsw m5, m4, [r5 + 30 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 577 * 16], m3 + +; mode 11 [row 1] +pmaddubsw m3, m0, [r5 + 28 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 28 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 578 * 16], m3 +pmaddubsw m3, m1, [r5 + 28 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 28 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 579 * 16], m3 + +; mode 11 [row 2] +pmaddubsw m3, m0, [r5 + 26 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 26 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 580 * 16], m3 +pmaddubsw m3, m1, [r5 + 26 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 26 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 581 * 16], m3 + +; mode 11 [row 3] +pmaddubsw m3, m0, [r5 + 24 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 24 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 582 * 16], m3 +pmaddubsw m3, m1, [r5 + 24 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 24 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 583 * 16], m3 + +; mode 11 [row 4] +pmaddubsw m3, m0, [r5 + 22 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 22 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 584 * 16], m3 + +; mode 12 [row 1 - first half] +movu [r0 + 642 * 16], m3 + +pmaddubsw m3, m1, [r5 + 22 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 22 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 585 * 16], m3 + +; mode 12 [row 1 - second half] +movu [r0 + 643 * 16], m3 + +; mode 11 [row 5] +pmaddubsw m3, m0, [r5 + 20 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 20 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 586 * 16], m3 +pmaddubsw m3, m1, [r5 + 20 * 16] 
+pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 20 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 587 * 16], m3 + +; mode 11 [row 6] +pmaddubsw m3, m0, [r5 + 18 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 18 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 588 * 16], m3 +pmaddubsw m3, m1, [r5 + 18 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 18 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 589 * 16], m3 + +; mode 11 [row 7] +pmaddubsw m3, m0, [r5 + 16 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 16 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 590 * 16], m3 +pmaddubsw m3, m1, [r5 + 16 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 16 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 591 * 16], m3 + +; mode 11 [row 8] +pmaddubsw m3, m0, [r5 + 14 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 14 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 592 * 16], m3 + +; mode 13 [row 1 - first half] +movu [r0 + 706 * 16], m3 + +pmaddubsw m3, m1, [r5 + 14 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 14 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 593 * 16], m3 + +; mode 13 [row 1 - second half] +movu [r0 + 707 * 16], m3 + +; mode 11 [row 9] +pmaddubsw m3, m0, [r5 + 12 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 12 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 594 * 16], m3 + +; mode 12 [row 3 - first half] +movu [r0 + 646 * 16], m3 + +pmaddubsw m3, m1, [r5 + 12 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 12 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 595 * 16], m3 + +; mode 12 [row 3 - second half] +movu [r0 + 647 * 16], m3 + +; mode 11 [row 10] +pmaddubsw m3, m0, [r5 + 10 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 10 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 596 * 16], m3 +pmaddubsw m3, m1, [r5 + 10 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 10 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 597 * 16], m3 + +; mode 11 [row 11] +pmaddubsw m3, m0, [r5 + 8 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, 
m2, [r5 + 8 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 598 * 16], m3 +pmaddubsw m3, m1, [r5 + 8 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 8 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 599 * 16], m3 + +; mode 11 [row 12] +pmaddubsw m3, m0, [r5 + 6 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 6 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 600 * 16], m3 + +; mode 14 [row 1 - first half] +movu [r0 + 770 * 16], m3 + +pmaddubsw m3, m1, [r5 + 6 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 6 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 601 * 16], m3 + +; mode 14 [row 1 - second half] +movu [r0 + 771 * 16], m3 + +; mode 11 [row 13] +pmaddubsw m3, m0, [r5 + 4 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 4 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 602 * 16], m3 +pmaddubsw m3, m1, [r5 + 4 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 4 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 603 * 16], m3 + +; mode 11 [row 14] +pmaddubsw m3, m0, [r5 + 2 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 2 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 604 * 16], m3 + +; mode 13 [row 5 - first half] +movu [r0 + 650 * 16], m3 + +pmaddubsw m3, m1, [r5 + 2 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 2 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 605 * 16], m3 + +; mode 13 [row 5 - second half] +movu [r0 + 651 * 16], m3 + +; mode 12 [row 0] +pmaddubsw m3, m0, [r5 + 27 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 27 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 640 * 16], m3 +pmaddubsw m3, m1, [r5 + 27 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 27 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 641 * 16], m3 + +; mode 12 [row 2] +pmaddubsw m3, m0, [r5 + 17 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 17 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 644 * 16], m3 +pmaddubsw m3, m1, [r5 + 17 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 17 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 
645 * 16], m3 + +; mode 12 [row 4] +pmaddubsw m3, m0, [r5 + 7 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 7 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 648 * 16], m3 +pmaddubsw m3, m1, [r5 + 7 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 7 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 649 * 16], m3 + +; mode 13 [row 0] +pmaddubsw m3, m0, [r5 + 23 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 23 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 704 * 16], m3 +pmaddubsw m3, m1, [r5 + 23 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 23 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 705 * 16], m3 + +; mode 13 [row 2] +pmaddubsw m3, m0, [r5 + 5 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 5 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 708 * 16], m3 +pmaddubsw m3, m1, [r5 + 5 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 5 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 709 * 16], m3 + +; mode 14 [row 0] +pmaddubsw m3, m0, [r5 + 19 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 19 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 768 * 16], m3 +pmaddubsw m3, m1, [r5 + 19 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 19 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 769 * 16], m3 + +; mode 15 [row 0] +pmaddubsw m3, m0, [r5 + 15 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 15 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 832 * 16], m3 +pmaddubsw m3, m1, [r5 + 15 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 15 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 833 * 16], m3 + +; mode 11 [row 16] +pslldq m0, 2 +pinsrb m0, [r4 + 0], 1 +pinsrb m0, [r3 + 16], 0 +pmaddubsw m3, m0, [r5 + 30 * 16] +pmulhrsw m3, m7 +pslldq m2, 2 +pinsrb m2, [r4 + 8], 1 +pinsrb m2, [r4 + 7], 0 +pmaddubsw m5, m2, [r5 + 30 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 608 * 16], m3 +pslldq m1, 2 +pinsrb m1, [r4 + 16], 1 +pinsrb m1, [r4 + 15], 0 +pmaddubsw m3, m1, [r5 + 30 * 16] +pmulhrsw m3, m7 +pslldq m4, 2 +pinsrb m4, [r4 + 
24], 1 +pinsrb m4, [r4 + 23], 0 +pmaddubsw m5, m4, [r5 + 30 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 609 * 16], m3 + +; mode 11 [row 17] +pmaddubsw m3, m0, [r5 + 28 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 28 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 610 * 16], m3 +pmaddubsw m3, m1, [r5 + 28 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 28 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 611 * 16], m3 + +; mode 11 [row 18] +pmaddubsw m3, m0, [r5 + 26 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 26 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 612 * 16], m3 +pmaddubsw m3, m1, [r5 + 26 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 26 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 613 * 16], m3 + +; mode 11 [row 19] +pmaddubsw m3, m0, [r5 + 24 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 24 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 614 * 16], m3 +pmaddubsw m3, m1, [r5 + 24 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 24 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 615 * 16], m3 + +; mode 11 [row 20] +pmaddubsw m3, m0, [r5 + 22 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 22 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 616 * 16], m3 +pmaddubsw m3, m1, [r5 + 22 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 22 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 617 * 16], m3 + +; mode 11 [row 21] +pmaddubsw m3, m0, [r5 + 20 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 20 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 618 * 16], m3 +pmaddubsw m3, m1, [r5 + 20 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 20 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 619 * 16], m3 + +; mode 11 [row 22] +pmaddubsw m3, m0, [r5 + 18 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 18 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 620 * 16], m3 +pmaddubsw m3, m1, [r5 + 18 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 18 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 621 * 16], m3 + 
+; mode 11 [row 23] +pmaddubsw m3, m0, [r5 + 16 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 16 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 622 * 16], m3 +pmaddubsw m3, m1, [r5 + 16 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 16 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 623 * 16], m3 + +; mode 11 [row 24] +pmaddubsw m3, m0, [r5 + 14 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 14 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 624 * 16], m3 +pmaddubsw m3, m1, [r5 + 14 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 14 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 625 * 16], m3 + +; mode 11 [row 25] +pmaddubsw m3, m0, [r5 + 12 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 12 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 626 * 16], m3 +pmaddubsw m3, m1, [r5 + 12 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 12 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 627 * 16], m3 + +; mode 11 [row 26] +pmaddubsw m3, m0, [r5 + 10 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 10 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 628 * 16], m3 +pmaddubsw m3, m1, [r5 + 10 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 10 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 629 * 16], m3 + +; mode 11 [row 27] +pmaddubsw m3, m0, [r5 + 8 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 8 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 630 * 16], m3 +pmaddubsw m3, m1, [r5 + 8 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 8 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 631 * 16], m3 + +; mode 11 [row 28] +pmaddubsw m3, m0, [r5 + 6 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 6 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 632 * 16], m3 +pmaddubsw m3, m1, [r5 + 6 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 6 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 633 * 16], m3 + +; mode 11 [row 29] +pmaddubsw m3, m0, [r5 + 4 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 4 * 16] +pmulhrsw m5, m7 +packuswb m3, 
m5 +movu [r0 + 634 * 16], m3 +pmaddubsw m3, m1, [r5 + 4 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 4 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 635 * 16], m3 + +; mode 11 [row 30] +pmaddubsw m3, m0, [r5 + 2 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 2 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 636 * 16], m3 +pmaddubsw m3, m1, [r5 + 2 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 2 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 637 * 16], m3 + +; mode 12 [row 6] +pinsrb m0, [r3 + 6], 0 +pmaddubsw m3, m0, [r5 + 29 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 29 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 652 * 16], m3 +pmaddubsw m3, m1, [r5 + 29 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 29 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 653 * 16], m3 + +; mode 12 [row 7] +pmaddubsw m3, m0, [r5 + 24 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 24 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 654 * 16], m3 +pmaddubsw m3, m1, [r5 + 24 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 24 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 655 * 16], m3 + +; mode 12 [row 8] +pmaddubsw m3, m0, [r5 + 19 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 19 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 656 * 16], m3 +pmaddubsw m3, m1, [r5 + 19 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 19 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 657 * 16], m3 + +; mode 12 [row 9] +pmaddubsw m3, m0, [r5 + 14 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 14 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 658 * 16], m3 +pmaddubsw m3, m1, [r5 + 14 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 14 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 659 * 16], m3 + +; mode 12 [row 10] +pmaddubsw m3, m0, [r5 + 9 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 9 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 660 * 16], m3 +pmaddubsw m3, m1, [r5 + 9 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 9 * 16] 
+pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 661 * 16], m3 + +; mode 12 [row 11] +pmaddubsw m3, m0, [r5 + 4 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 4 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 662 * 16], m3 +pmaddubsw m3, m1, [r5 + 4 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 4 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 663 * 16], m3 + +; mode 13 [row 3] +movu m6, m0 +pinsrb m6, [r3 + 4], 0 +pmaddubsw m3, m6, [r5 + 28 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 28 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 710 * 16], m3 +pmaddubsw m3, m1, [r5 + 28 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 28 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 711 * 16], m3 + +; mode 13 [row 4] +pmaddubsw m3, m6, [r5 + 19 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 19 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 712 * 16], m3 +pmaddubsw m3, m1, [r5 + 19 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 19 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 713 * 16], m3 + +; mode 13 [row 5] +pmaddubsw m3, m6, [r5 + 10 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 10 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 714 * 16], m3 +pmaddubsw m3, m1, [r5 + 10 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 10 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 715 * 16], m3 + +; mode 13 [row 6] +pmaddubsw m3, m6, [r5 + 1 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 1 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 716 * 16], m3 +pmaddubsw m3, m1, [r5 + 1 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 1 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 717 * 16], m3 + +; mode 14 [row 2] +movu m6, m0 +pinsrb m6, [r4 + 0], 1 +pinsrb m6, [r3 + 2], 0 +pmaddubsw m3, m6, [r5 + 25 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 25 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 772 * 16], m3 +pmaddubsw m3, m1, [r5 + 25 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 25 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu 
[r0 + 773 * 16], m3 + +; mode 14 [row 3] +pmaddubsw m3, m6, [r5 + 12 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 12 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 774 * 16], m3 +pmaddubsw m3, m1, [r5 + 12 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 12 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 775 * 16], m3 + +; mode 15 [row 1] +pmaddubsw m3, m6, [r5 + 30 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 30 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 834 * 16], m3 +pmaddubsw m3, m1, [r5 + 30 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 30 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 835 * 16], m3 + +; mode 15 [row 2] +pmaddubsw m3, m6, [r5 + 13 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 13 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 836 * 16], m3 +pmaddubsw m3, m1, [r5 + 13 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 13 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 837 * 16], m3 + +; mode 15 [row 3] +pslldq m6, 2 +pinsrb m6, [r3 + 2], 1 +pinsrb m6, [r3 + 4], 0 +pmaddubsw m3, m6, [r5 + 28 * 16] +pmulhrsw m3, m7 +pslldq m2, 2 +pinsrb m2, [r4 + 7], 1 +pinsrb m2, [r4 + 6], 0 +pmaddubsw m5, m2, [r5 + 28 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 838 * 16], m3 +pslldq m1, 2 +pinsrb m1, [r4 + 15], 1 +pinsrb m1, [r4 + 14], 0 +pmaddubsw m3, m1, [r5 + 28 * 16] +pmulhrsw m3, m7 +pslldq m4, 2 +pinsrb m4, [r4 + 23], 1 +pinsrb m4, [r4 + 22], 0 +pmaddubsw m5, m4, [r5 + 28 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 839 * 16], m3 + +; mode 15 [row 4] +pmaddubsw m3, m6, [r5 + 11 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 11 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 840 * 16], m3 +pmaddubsw m3, m1, [r5 + 11 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 11 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 841 * 16], m3 + +; mode 15 [row 5, 0-7] +pslldq m6, 2 +pinsrb m6, [r3 + 4], 1 +pinsrb m6, [r3 + 6], 0 +pmaddubsw m3, m6, [r5 + 26 * 16] +pmulhrsw m3, m7 +packuswb m3, m3 +movh [r0 + 842 
* 16], m3 + +; mode 15 [row 6, 0-7] +pmaddubsw m3, m6, [r5 + 9 * 16] +pmulhrsw m3, m7 +packuswb m3, m3 +movh [r0 + 844 * 16], m3 + +; mode 15 [row 7, 0-7] +pslldq m6, 2 +pinsrb m6, [r3 + 6], 1 +pinsrb m6, [r3 + 8], 0 +pmaddubsw m3, m6, [r5 + 24 * 16] +pmulhrsw m3, m7 +packuswb m3, m3 +movh [r0 + 846 * 16], m3 + +; mode 15 [row 8, 0-7] +pmaddubsw m3, m6, [r5 + 7 * 16] +pmulhrsw m3, m7 +packuswb m3, m3 +movh [r0 + 848 * 16], m3 + +; mode 15 [row 9, 0-7] +pslldq m6, 2 +pinsrb m6, [r3 + 8], 1 +pinsrb m6, [r3 + 9], 0 +pmaddubsw m3, m6, [r5 + 22 * 16] +pmulhrsw m3, m7 +packuswb m3, m3 +movh [r0 + 850 * 16], m3 + +; mode 15 [row 10, 0-7] +pmaddubsw m3, m6, [r5 + 5 * 16] +pmulhrsw m3, m7 +packuswb m3, m3 +movh [r0 + 852 * 16], m3 + +; mode 15 [row 11, 0-7] +pslldq m6, 2 +pinsrb m6, [r3 + 9], 1 +pinsrb m6, [r3 + 11], 0 +pmaddubsw m3, m6, [r5 + 20 * 16] +pmulhrsw m3, m7 +packuswb m3, m3 +movh [r0 + 854 * 16], m3 + +; mode 15 [row 12, 0-7] +pmaddubsw m3, m6, [r5 + 3 * 16] +pmulhrsw m3, m7 +packuswb m3, m3 +movh [r0 + 856 * 16], m3 + +; mode 15 [row 13, 0-7] +pslldq m6, 2 +pinsrb m6, [r3 + 11], 1 +pinsrb m6, [r3 + 13], 0 +pmaddubsw m3, m6, [r5 + 18 * 16] +pmulhrsw m3, m7 +packuswb m3, m3 +movh [r0 + 858 * 16], m3 + +; mode 15 [row 14, 0-7] +pmaddubsw m3, m6, [r5 + 1 * 16] +pmulhrsw m3, m7 +packuswb m3, m3 +movh [r0 + 860 * 16], m3 + +; mode 15 [row 15, 0-7] +pslldq m6, 2 +pinsrb m6, [r3 + 13], 1 +pinsrb m6, [r3 + 15], 0 +pmaddubsw m3, m6, [r5 + 16 * 16] +pmulhrsw m3, m7 +packuswb m3, m3 +movh [r0 + 862 * 16], m3 + +; mode 15 [row 16, 0-7] +pslldq m6, 2 +pinsrb m6, [r3 + 15], 1 +pinsrb m6, [r3 + 17], 0 +pmaddubsw m3, m6, [r5 + 31 * 16] +pmulhrsw m3, m7 +packuswb m3, m3 +movh [r0 + 864 * 16], m3 + +; mode 15 [row 17, 0-7] +pmaddubsw m3, m6, [r5 + 14 * 16] +pmulhrsw m3, m7 +packuswb m3, m3 +movh [r0 + 866 * 16], m3 + +; mode 15 [row 18, 0-7] +pslldq m6, 2 +pinsrb m6, [r3 + 17], 1 +pinsrb m6, [r3 + 19], 0 +pmaddubsw m3, m6, [r5 + 29 * 16] +pmulhrsw m3, m7 +packuswb m3, m3 +movh 
[r0 + 868 * 16], m3 + +; mode 15 [row 19, 0-7] +pmaddubsw m3, m6, [r5 + 12 * 16] +pmulhrsw m3, m7 +packuswb m3, m3 +movh [r0 + 870 * 16], m3 + +; mode 15 [row 20, 0-7] +pslldq m6, 2 +pinsrb m6, [r3 + 19], 1 +pinsrb m6, [r3 + 21], 0 +pmaddubsw m3, m6, [r5 + 27 * 16] +pmulhrsw m3, m7 +packuswb m3, m3 +movh [r0 + 872 * 16], m3 + +; mode 15 [row 21, 0-7] +pmaddubsw m3, m6, [r5 + 10 * 16] +pmulhrsw m3, m7 +packuswb m3, m3 +movh [r0 + 874 * 16], m3 + +; mode 15 [row 22, 0-7] +pslldq m6, 2 +pinsrb m6, [r3 + 21], 1 +pinsrb m6, [r3 + 23], 0 +pmaddubsw m3, m6, [r5 + 25 * 16] +pmulhrsw m3, m7 +packuswb m3, m3 +movh [r0 + 876 * 16], m3 + +; mode 15 [row 23, 0-7] +pmaddubsw m3, m6, [r5 + 8 * 16] +pmulhrsw m3, m7 +packuswb m3, m3 +movh [r0 + 878 * 16], m3 + +; mode 15 [row 24, 0-7] +pslldq m6, 2 +pinsrb m6, [r3 + 23], 1 +pinsrb m6, [r3 + 24], 0 +pmaddubsw m3, m6, [r5 + 23 * 16] +pmulhrsw m3, m7 +packuswb m3, m3 +movh [r0 + 880 * 16], m3 + +; mode 15 [row 25, 0-7] +pmaddubsw m3, m6, [r5 + 6 * 16] +pmulhrsw m3, m7 +packuswb m3, m3 +movh [r0 + 882 * 16], m3 + +; mode 15 [row 26, 0-7] +pslldq m6, 2 +pinsrb m6, [r3 + 24], 1 +pinsrb m6, [r3 + 26], 0 +pmaddubsw m3, m6, [r5 + 21 * 16] +pmulhrsw m3, m7 +packuswb m3, m3 +movh [r0 + 884 * 16], m3 + +; mode 15 [row 27, 0-7] +pmaddubsw m3, m6, [r5 + 4 * 16] +pmulhrsw m3, m7 +packuswb m3, m3 +movh [r0 + 886 * 16], m3 + +; mode 15 [row 28, 0-7] +pslldq m6, 2 +pinsrb m6, [r3 + 26], 1 +pinsrb m6, [r3 + 28], 0 +pmaddubsw m3, m6, [r5 + 19 * 16] +pmulhrsw m3, m7 +packuswb m3, m3 +movh [r0 + 888 * 16], m3 + +; mode 15 [row 29, 0-7] +pmaddubsw m3, m6, [r5 + 2 * 16] +pmulhrsw m3, m7 +packuswb m3, m3 +movh [r0 + 890 * 16], m3 + +; mode 15 [row 30, 0-7] +pslldq m6, 2 +pinsrb m6, [r3 + 28], 1 +pinsrb m6, [r3 + 30], 0 +pmaddubsw m3, m6, [r5 + 17 * 16] +pmulhrsw m3, m7 +packuswb m3, m3 +movh [r0 + 892 * 16], m3 + +; mode 15 [row 31, 0-7] +pshufb m3, m6, [tab_S2] +movh [r0 + 894 * 16], m3 + +; mode 12 [row 12] +pslldq m0, 2 +pinsrb m0, [r3 + 6], 1 +pinsrb 
m0, [r3 + 13], 0 +pmaddubsw m3, m0, [r5 + 31 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 31 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 664 * 16], m3 +pmaddubsw m3, m1, [r5 + 31 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 31 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 665 * 16], m3 + +; mode 12 [row 13] +pmaddubsw m3, m0, [r5 + 26 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 26 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 666 * 16], m3 +pmaddubsw m3, m1, [r5 + 26 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 26 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 667 * 16], m3 + +; mode 12 [row 14] +pmaddubsw m3, m0, [r5 + 21 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 21 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 668 * 16], m3 +pmaddubsw m3, m1, [r5 + 21 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 21 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 669 * 16], m3 + +; mode 12 [row 15] +pmaddubsw m3, m0, [r5 + 16 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 16 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 670 * 16], m3 +pmaddubsw m3, m1, [r5 + 16 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 16 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 671 * 16], m3 + +; mode 12 [row 16] +pmaddubsw m3, m0, [r5 + 11 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 11 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 672 * 16], m3 +pmaddubsw m3, m1, [r5 + 11 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 11 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 673 * 16], m3 + +; mode 12 [row 17] +pmaddubsw m3, m0, [r5 + 6 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 6 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 674 * 16], m3 +pmaddubsw m3, m1, [r5 + 6 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 6 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 675 * 16], m3 + +; mode 12 [row 18] +pmaddubsw m3, m0, [r5 + 1 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 1 * 16] +pmulhrsw m5, m7 +packuswb m3, 
m5 +movu [r0 + 676 * 16], m3 +pmaddubsw m3, m1, [r5 + 1 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 1 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 677 * 16], m3 + +; mode 13 [row 7] +movu m6, m0 +pinsrb m6, [r3 + 4], 2 +pinsrb m6, [r3 + 4], 1 +pinsrb m6, [r3 + 7], 0 +pmaddubsw m3, m6, [r5 + 24 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 24 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 718 * 16], m3 +pmaddubsw m3, m1, [r5 + 24 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 24 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 719 * 16], m3 + +; mode 13 [row 8] +pmaddubsw m3, m6, [r5 + 15 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 15 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 720 * 16], m3 +pmaddubsw m3, m1, [r5 + 15 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 15 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 721 * 16], m3 + +; mode 13 [row 9] +pmaddubsw m3, m6, [r5 + 6 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 6 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 722 * 16], m3 +pmaddubsw m3, m1, [r5 + 6 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 6 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 723 * 16], m3 + +; mode 14 [row 4] +pinsrb m6, [r3 + 2], 2 +pinsrb m6, [r3 + 2], 1 +pinsrb m6, [r3 + 5], 0 +pmaddubsw m3, m6, [r5 + 31 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 31 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 776 * 16], m3 +pmaddubsw m3, m1, [r5 + 31 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 31 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 777 * 16], m3 + +; mode 14 [row 5] +pmaddubsw m3, m6, [r5 + 18 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 18 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 778 * 16], m3 +pmaddubsw m3, m1, [r5 + 18 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 18 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 779 * 16], m3 + +; mode 14 [row 6] +pmaddubsw m3, m6, [r5 + 5 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 5 * 16] +pmulhrsw m5, 
m7 +packuswb m3, m5 +movu [r0 + 780 * 16], m3 +pmaddubsw m3, m1, [r5 + 5 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 5 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 781 * 16], m3 + +; mode 14 [row 7] +pslldq m6, 2 +pinsrb m6, [r3 + 5], 1 +pinsrb m6, [r3 + 7], 0 +pmaddubsw m3, m6, [r5 + 24 * 16] +pmulhrsw m3, m7 +pslldq m2, 2 +pinsrw m2, [r4 + 5], 0 +pmaddubsw m5, m2, [r5 + 24 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 782 * 16], m3 +pslldq m1, 2 +pinsrw m1, [r4 + 13], 0 +pmaddubsw m3, m1, [r5 + 24 * 16] +pmulhrsw m3, m7 +pslldq m4, 2 +pinsrw m4, [r4 + 21], 0 +pmaddubsw m5, m4, [r5 + 24 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 783 * 16], m3 + +; mode 14 [row 8] +pmaddubsw m3, m6, [r5 + 11 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 11 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 784 * 16], m3 +pmaddubsw m3, m1, [r5 + 11 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 11 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 785 * 16], m3 + +; mode 15 [row 5, 8-31] +pmaddubsw m5, m2, [r5 + 26 * 16] +pmulhrsw m5, m7 +packuswb m5, m5 +movh [r0 + 842 * 16 + 8], m5 +pmaddubsw m3, m1, [r5 + 26 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 26 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 843 * 16], m3 + +; mode 15 [row 6, 8-31] +pmaddubsw m5, m2, [r5 + 9 * 16] +pmulhrsw m5, m7 +packuswb m5, m5 +movh [r0 + 844 * 16 + 8], m5 +pmaddubsw m3, m1, [r5 + 9 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 9 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 845 * 16], m3 + +; mode 12 [row 19] +pslldq m0, 2 +pinsrb m0, [r3 + 13], 1 +pinsrb m0, [r3 + 19], 0 +pmaddubsw m3, m0, [r5 + 28 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 28 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 678 * 16], m3 +pmaddubsw m3, m1, [r5 + 28 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 28 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 679 * 16], m3 + +; mode 12 [row 20] +pmaddubsw m3, m0, [r5 + 23 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 
23 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 680 * 16], m3 +pmaddubsw m3, m1, [r5 + 23 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 23 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 681 * 16], m3 + +; mode 12 [row 21] +pmaddubsw m3, m0, [r5 + 18 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 18 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 682 * 16], m3 +pmaddubsw m3, m1, [r5 + 18 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 18 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 683 * 16], m3 + +; mode 12 [row 22] +pmaddubsw m3, m0, [r5 + 13 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 13 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 684 * 16], m3 +pmaddubsw m3, m1, [r5 + 13 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 13 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 685 * 16], m3 + +; mode 12 [row 23] +pmaddubsw m3, m0, [r5 + 8 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 8 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 686 * 16], m3 +pmaddubsw m3, m1, [r5 + 8 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 8 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 687 * 16], m3 + +; mode 12 [row 24] +pmaddubsw m3, m0, [r5 + 3 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m2, [r5 + 3 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 688 * 16], m3 +pmaddubsw m3, m1, [r5 + 3 * 16] +pmulhrsw m3, m7 +pmaddubsw m5, m4, [r5 + 3 * 16] +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 689 * 16], m3 + +; mode 13 [row 10] +movu m7, m6 +movu m6, m0 +pinsrb m6, [r3 + 4], 4 +pinsrb m6, [r3 + 4], 3 +pinsrb m6, [r3 + 7], 2 +pinsrb m6, [r3 + 7], 1 +pinsrb m6, [r3 + 11], 0 +pmaddubsw m3, m6, [r5 + 29 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m2, [r5 + 29 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 724 * 16], m3 +pmaddubsw m3, m1, [r5 + 29 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 29 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 725 * 16], m3 + +; mode 13 [row 11] +pmaddubsw m3, m6, [r5 + 20 * 16] 
+pmulhrsw m3, [pw_1024] +pmaddubsw m5, m2, [r5 + 20 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 726 * 16], m3 +pmaddubsw m3, m1, [r5 + 20 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 20 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 727 * 16], m3 + +; mode 13 [row 12] +pmaddubsw m3, m6, [r5 + 11 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m2, [r5 + 11 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 728 * 16], m3 +pmaddubsw m3, m1, [r5 + 11 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 11 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 729 * 16], m3 + +; mode 13 [row 13] +pmaddubsw m3, m6, [r5 + 2 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m2, [r5 + 2 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 730 * 16], m3 +pmaddubsw m3, m1, [r5 + 2 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 2 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 731 * 16], m3 + +; mode 14 [row 9] +pslldq m7, 2 +pinsrb m7, [r3 + 7], 1 +pinsrb m7, [r3 + 10], 0 +pmaddubsw m3, m7, [r5 + 30 * 16] +pmulhrsw m3, [pw_1024] +pslldq m2, 2 +pinsrw m2, [r4 + 4], 0 +pmaddubsw m5, m2, [r5 + 30 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 786 * 16], m3 +pslldq m1, 2 +pinsrw m1, [r4 + 12], 0 +pmaddubsw m3, m1, [r5 + 30 * 16] +pmulhrsw m3, [pw_1024] +pslldq m4, 2 +pinsrb m4, [r4 + 21], 1 +pinsrb m4, [r4 + 20], 0 +pmaddubsw m5, m4, [r5 + 30 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 787 * 16], m3 + +; mode 14 [row 10] +pmaddubsw m3, m7, [r5 + 17 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m2, [r5 + 17 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 788 * 16], m3 +pmaddubsw m3, m1, [r5 + 17 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 17 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 789 * 16], m3 + +; mode 14 [row 11] +pmaddubsw m3, m7, [r5 + 4 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m2, [r5 + 4 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 
+movu [r0 + 790 * 16], m3 +pmaddubsw m3, m1, [r5 + 4 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 4 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 791 * 16], m3 + +movu m6, [pw_1024] + +; mode 15 [row 7, 8-31] +pmaddubsw m5, m2, [r5 + 24 * 16] +pmulhrsw m5, m6 +packuswb m5, m5 +movh [r0 + 846 * 16 + 8], m5 +pmaddubsw m3, m1, [r5 + 24 * 16] +pmulhrsw m3, m6 +pmaddubsw m5, m4, [r5 + 24 * 16] +pmulhrsw m5, m6 +packuswb m3, m5 +movu [r0 + 847 * 16], m3 + +; mode 15 [row 8, 8-31] +pmaddubsw m5, m2, [r5 + 7 * 16] +pmulhrsw m5, m6 +packuswb m5, m5 +movh [r0 + 848 * 16 + 8], m5 +pmaddubsw m3, m1, [r5 + 7 * 16] +pmulhrsw m3, m6 +pmaddubsw m5, m4, [r5 + 7 * 16] +pmulhrsw m5, m6 +packuswb m3, m5 +movu [r0 + 849 * 16], m3 + +; mode 12 [row 25] +pslldq m0, 2 +pinsrb m0, [r3 + 19], 1 +pinsrb m0, [r3 + 26], 0 +pmaddubsw m3, m0, [r5 + 30 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m2, [r5 + 30 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 690 * 16], m3 +pmaddubsw m3, m1, [r5 + 30 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 30 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 691 * 16], m3 + +; mode 12 [row 26] +pmaddubsw m3, m0, [r5 + 25 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m2, [r5 + 25 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 692 * 16], m3 +pmaddubsw m3, m1, [r5 + 25 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 25 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 693 * 16], m3 + +; mode 12 [row 27] +pmaddubsw m3, m0, [r5 + 20 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m2, [r5 + 20 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 694 * 16], m3 +pmaddubsw m3, m1, [r5 + 20 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 20 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 695 * 16], m3 + +; mode 12 [row 28] +pmaddubsw m3, m0, [r5 + 15 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m2, [r5 + 15 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 696 * 
16], m3 +pmaddubsw m3, m1, [r5 + 15 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 15 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 697 * 16], m3 + +; mode 12 [row 29] +pmaddubsw m3, m0, [r5 + 10 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m2, [r5 + 10 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 698 * 16], m3 +pmaddubsw m3, m1, [r5 + 10 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 10 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 699 * 16], m3 + +; mode 12 [row 30] +pmaddubsw m3, m0, [r5 + 5 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m2, [r5 + 5 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 700 * 16], m3 +pmaddubsw m3, m1, [r5 + 5 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 5 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 701 * 16], m3 + +; mode 13 [row 14] +movu m6, m0 +pinsrb m6, [r3 + 4], 6 +pinsrb m6, [r3 + 4], 5 +pinsrb m6, [r3 + 7], 4 +pinsrb m6, [r3 + 7], 3 +pinsrb m6, [r3 + 11], 2 +pinsrb m6, [r3 + 11], 1 +pinsrb m6, [r3 + 14], 0 +pmaddubsw m3, m6, [r5 + 25 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m2, [r5 + 25 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 732 * 16], m3 +pmaddubsw m3, m1, [r5 + 25 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 25 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 733 * 16], m3 + +; mode 13 [row 15] +pmaddubsw m3, m6, [r5 + 16 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m2, [r5 + 16 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 734 * 16], m3 +pmaddubsw m3, m1, [r5 + 16 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 16 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 735 * 16], m3 + +; mode 13 [row 16] +pmaddubsw m3, m6, [r5 + 7 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m2, [r5 + 7 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 736 * 16], m3 +pmaddubsw m3, m1, [r5 + 7 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 7 * 16] +pmulhrsw m5, [pw_1024] 
+packuswb m3, m5 +movu [r0 + 737 * 16], m3 + +; mode 13 [row 17] +pslldq m6, 2 +pinsrb m6, [r3 + 14], 1 +pinsrb m6, [r3 + 18], 0 +pmaddubsw m3, m6, [r5 + 30 * 16] +pmulhrsw m3, [pw_1024] +pslldq m2, 2 +pinsrw m2, [r4 + 3], 0 +pmaddubsw m5, m2, [r5 + 30 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 738 * 16], m3 +pslldq m1, 2 +pinsrw m1, [r4 + 11], 0 +pmaddubsw m3, m1, [r5 + 30 * 16] +pmulhrsw m3, [pw_1024] +pslldq m4, 2 +pinsrw m4, [r4 + 19], 0 +pmaddubsw m5, m4, [r5 + 30 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 739 * 16], m3 + +; mode 13 [row 18] +pmaddubsw m3, m6, [r5 + 21 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m2, [r5 + 21 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 740 * 16], m3 +pmaddubsw m3, m1, [r5 + 21 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 21 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 741 * 16], m3 + +; mode 13 [row 19] +pmaddubsw m3, m6, [r5 + 12 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m2, [r5 + 12 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 742 * 16], m3 +pmaddubsw m3, m1, [r5 + 12 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 12 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 743 * 16], m3 + +; mode 13 [row 20] +pmaddubsw m3, m6, [r5 + 3 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m2, [r5 + 3 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 744 * 16], m3 +pmaddubsw m3, m1, [r5 + 3 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 3 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 745 * 16], m3 + +; mode 14 [row 12] +pslldq m7, 2 +pinsrb m7, [r3 + 10], 1 +pinsrb m7, [r3 + 12], 0 +pmaddubsw m3, m7, [r5 + 23 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m2, [r5 + 23 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 792 * 16], m3 +pmaddubsw m3, m1, [r5 + 23 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 23 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 793 * 16], m3 + +; mode 14 
[row 13] +pmaddubsw m3, m7, [r5 + 10 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m2, [r5 + 10 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 794 * 16], m3 +pmaddubsw m3, m1, [r5 + 10 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 10 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 795 * 16], m3 + +; mode 15 [row 9] +pmaddubsw m5, m2, [r5 + 22 * 16] +pmulhrsw m5, [pw_1024] +packuswb m5, m5 +movu [r0 + 850 * 16 + 8], m5 +pmaddubsw m3, m1, [r5 + 22 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 22 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 851 * 16], m3 + +; mode 15 [row 10] +pmaddubsw m5, m2, [r5 + 5 * 16] +pmulhrsw m5, [pw_1024] +packuswb m5, m5 +movu [r0 + 852 * 16 + 8], m5 +pmaddubsw m3, m1, [r5 + 5 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 5 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 853 * 16], m3 + +; mode 13 [row 21] +pslldq m6, 2 +pinsrb m6, [r3 + 18], 1 +pinsrb m6, [r3 + 21], 0 +pmaddubsw m3, m6, [r5 + 26 * 16] +pmulhrsw m3, [pw_1024] +pslldq m2, 2 +pinsrw m2, [r4 + 2], 0 +pmaddubsw m5, m2, [r5 + 26 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 746 * 16], m3 +pslldq m1, 2 +pinsrw m1, [r4 + 10], 0 +pmaddubsw m3, m1, [r5 + 26 * 16] +pmulhrsw m3, [pw_1024] +pslldq m4, 2 +pinsrw m4, [r4 + 18], 0 +pmaddubsw m5, m4, [r5 + 26 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 747 * 16], m3 + +; mode 13 [row 22] +pmaddubsw m3, m6, [r5 + 17 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m2, [r5 + 17 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 748 * 16], m3 +pmaddubsw m3, m1, [r5 + 17 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 17 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 749 * 16], m3 + +; mode 13 [row 23] +pmaddubsw m3, m6, [r5 + 8 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m2, [r5 + 8 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 750 * 16], m3 +pmaddubsw m3, m1, [r5 + 8 * 16] +pmulhrsw m3, [pw_1024] 
+pmaddubsw m5, m4, [r5 + 8 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 751 * 16], m3 + +; mode 14 [row 14] +pslldq m7, 2 +pinsrb m7, [r3 + 12], 1 +pinsrb m7, [r3 + 15], 0 +pmaddubsw m3, m7, [r5 + 29 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m2, [r5 + 29 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 796 * 16], m3 +pmaddubsw m3, m1, [r5 + 29 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 29 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 797 * 16], m3 + +; mode 14 [row 15] +pmaddubsw m3, m7, [r5 + 16 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m2, [r5 + 16 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 798 * 16], m3 +pmaddubsw m3, m1, [r5 + 16 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 16 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 799 * 16], m3 + +; mode 14 [row 16] +pmaddubsw m3, m7, [r5 + 3 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m2, [r5 + 3 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 800 * 16], m3 +pmaddubsw m3, m1, [r5 + 3 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 3 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 801 * 16], m3 + +; mode 15 [row 11] +pmaddubsw m5, m2, [r5 + 20 * 16] +pmulhrsw m5, [pw_1024] +packuswb m5, m5 +movh [r0 + 854 * 16 + 8], m5 +pmaddubsw m3, m1, [r5 + 20 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 20 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 855 * 16], m3 + +; mode 15 [row 12] +pmaddubsw m5, m2, [r5 + 3 * 16] +pmulhrsw m5, [pw_1024] +packuswb m5, m5 +movh [r0 + 856 * 16 + 8], m5 +pmaddubsw m3, m1, [r5 + 3 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 3 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 857 * 16], m3 + +; mode 13 [row 24] +pslldq m6, 2 +pinsrb m6, [r3 + 21], 1 +pinsrb m6, [r3 + 25], 0 +pmaddubsw m3, m6, [r5 + 31 * 16] +pmulhrsw m3, [pw_1024] +pslldq m2, 2 +pinsrw m2, [r4 + 1], 0 +pmaddubsw m5, m2, [r5 + 31 * 16] +pmulhrsw m5, [pw_1024] +packuswb 
m3, m5 +movu [r0 + 752 * 16], m3 +pslldq m1, 2 +pinsrw m1, [r4 + 9], 0 +pmaddubsw m3, m1, [r5 + 31 * 16] +pmulhrsw m3, [pw_1024] +pslldq m4, 2 +pinsrw m4, [r4 + 17], 0 +pmaddubsw m5, m4, [r5 + 31 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 753 * 16], m3 + +; mode 13 [row 25] +pmaddubsw m3, m6, [r5 + 22 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m2, [r5 + 22 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 754 * 16], m3 +pmaddubsw m3, m1, [r5 + 22 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 22 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 755 * 16], m3 + +; mode 13 [row 26] +pmaddubsw m3, m6, [r5 + 13 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m2, [r5 + 13 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 756 * 16], m3 +pmaddubsw m3, m1, [r5 + 13 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 13 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 757 * 16], m3 + +; mode 13 [row 27] +pmaddubsw m3, m6, [r5 + 4 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m2, [r5 + 4 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 758 * 16], m3 +pmaddubsw m3, m1, [r5 + 4 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 4 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 759 * 16], m3 + +; mode 14 [row 17] +pslldq m7, 2 +pinsrb m7, [r3 + 15], 1 +pinsrb m7, [r3 + 17], 0 +pmaddubsw m3, m7, [r5 + 22 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m2, [r5 + 22 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 802 * 16], m3 +pmaddubsw m3, m1, [r5 + 22 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 22 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 803 * 16], m3 + +; mode 14 [row 18] +pmaddubsw m3, m7, [r5 + 9 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m2, [r5 + 9 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 804 * 16], m3 +pmaddubsw m3, m1, [r5 + 9 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 9 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 
+movu [r0 + 805 * 16], m3 + +; mode 15 [row 13] +pmaddubsw m5, m2, [r5 + 18 * 16] +pmulhrsw m5, [pw_1024] +packuswb m5, m5 +movh [r0 + 858 * 16 + 8], m5 +pmaddubsw m3, m1, [r5 + 18 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 18 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 859 * 16], m3 + +; mode 15 [row 14] +pmaddubsw m5, m2, [r5 + 1 * 16] +pmulhrsw m5, [pw_1024] +packuswb m5, m5 +movh [r0 + 860 * 16 + 8], m5 +pmaddubsw m3, m1, [r5 + 1 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 1 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 861 * 16], m3 + +; mode 13 [row 28] +pslldq m6, 2 +pinsrb m6, [r3 + 25], 1 +pinsrb m6, [r3 + 28], 0 +pmaddubsw m3, m6, [r5 + 27 * 16] +pmulhrsw m3, [pw_1024] +pslldq m2, 2 +pinsrw m2, [r4 + 0], 0 +pmaddubsw m5, m2, [r5 + 27 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 760 * 16], m3 +pslldq m1, 2 +pinsrw m1, [r4 + 8], 0 +pmaddubsw m3, m1, [r5 + 27 * 16] +pmulhrsw m3, [pw_1024] +pslldq m4, 2 +pinsrw m4, [r4 + 16], 0 +pmaddubsw m5, m4, [r5 + 27 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 761 * 16], m3 + +; mode 13 [row 29] +pmaddubsw m3, m6, [r5 + 18 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m2, [r5 + 18 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 762 * 16], m3 +pmaddubsw m3, m1, [r5 + 18 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 18 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 763 * 16], m3 + +; mode 13 [row 30] +pmaddubsw m3, m6, [r5 + 9 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m2, [r5 + 9 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 764 * 16], m3 +pmaddubsw m3, m1, [r5 + 9 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 9 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 765 * 16], m3 + +; mode 14 [row 19] +pslldq m7, 2 +pinsrb m7, [r3 + 17], 1 +pinsrb m7, [r3 + 20], 0 +pmaddubsw m3, m7, [r5 + 28 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m2, [r5 + 28 * 16] +pmulhrsw m5, [pw_1024] 
+packuswb m3, m5 +movu [r0 + 806 * 16], m3 +pmaddubsw m3, m1, [r5 + 28 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 28 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 807 * 16], m3 + +; mode 14 [row 20] +pmaddubsw m3, m7, [r5 + 15 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m2, [r5 + 15 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 808 * 16], m3 +pmaddubsw m3, m1, [r5 + 15 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 15 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 809 * 16], m3 + +; mode 14 [row 21] +pmaddubsw m3, m7, [r5 + 2 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m2, [r5 + 2 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 810 * 16], m3 +pmaddubsw m3, m1, [r5 + 2 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 2 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 811 * 16], m3 + +; mode 15 [row 15] +pmaddubsw m5, m2, [r5 + 16 * 16] +pmulhrsw m5, [pw_1024] +packuswb m5, m5 +movh [r0 + 862 * 16 + 8], m5 +pmaddubsw m3, m1, [r5 + 16 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 16 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 863 * 16], m3 + +; mode 14 [row 22] +pslldq m7, 2 +pinsrb m7, [r3 + 20], 1 +pinsrb m7, [r3 + 22], 0 +pmaddubsw m3, m7, [r5 + 21 * 16] +pmulhrsw m3, [pw_1024] +pslldq m2, 2 +pinsrb m2, [r4 + 0], 1 +pinsrb m2, [r3 + 2], 0 +pmaddubsw m5, m2, [r5 + 21 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 812 * 16], m3 +pslldq m1, 2 +pinsrw m1, [r4 + 7], 0 +pmaddubsw m3, m1, [r5 + 21 * 16] +pmulhrsw m3, [pw_1024] +pslldq m4, 2 +pinsrw m4, [r4 + 15], 0 +pmaddubsw m5, m4, [r5 + 21 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 813 * 16], m3 + +; mode 14 [row 23] +pmaddubsw m3, m7, [r5 + 8 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m2, [r5 + 8 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 814 * 16], m3 +pmaddubsw m3, m1, [r5 + 8 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 8 * 16] +pmulhrsw m5, [pw_1024] 
+packuswb m3, m5 +movu [r0 + 815 * 16], m3 + +; mode 15 [row 16] +pmaddubsw m5, m2, [r5 + 31 * 16] +pmulhrsw m5, [pw_1024] +packuswb m5, m5 +movh [r0 + 864 * 16 + 8], m5 +pmaddubsw m3, m1, [r5 + 31 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 31 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 865 * 16], m3 + +; mode 15 [row 17] +pmaddubsw m5, m2, [r5 + 14 * 16] +pmulhrsw m5, [pw_1024] +packuswb m5, m5 +movh [r0 + 866 * 16 + 8], m5 +pmaddubsw m3, m1, [r5 + 14 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 14 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 867 * 16], m3 + +; mode 14 [row 24] +pslldq m7, 2 +pinsrb m7, [r3 + 22], 1 +pinsrb m7, [r3 + 25], 0 +pmaddubsw m3, m7, [r5 + 27 * 16] +pmulhrsw m3, [pw_1024] +pslldq m2, 2 +pinsrb m2, [r3 + 2], 1 +pinsrb m2, [r3 + 5], 0 +pmaddubsw m5, m2, [r5 + 27 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 816 * 16], m3 +pslldq m1, 2 +pinsrw m1, [r4 + 6], 0 +pmaddubsw m3, m1, [r5 + 27 * 16] +pmulhrsw m3, [pw_1024] +pslldq m4, 2 +pinsrw m4, [r4 + 14], 0 +pmaddubsw m5, m4, [r5 + 27 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 817 * 16], m3 + +; mode 14 [row 25] +pmaddubsw m3, m7, [r5 + 14 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m2, [r5 + 14 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 818 * 16], m3 +pmaddubsw m3, m1, [r5 + 14 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 14 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 819 * 16], m3 + +; mode 14 [row 26] +pmaddubsw m3, m7, [r5 + 1 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m2, [r5 + 1 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 820 * 16], m3 +pmaddubsw m3, m1, [r5 + 1 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 1 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 821 * 16], m3 + +; mode 15 [row 18] +pinsrb m2, [r3 + 4], 0 +pmaddubsw m5, m2, [r5 + 29 * 16] +pmulhrsw m5, [pw_1024] +packuswb m5, m5 +movh [r0 + 868 * 16 + 8], m5 +pmaddubsw m3, 
m1, [r5 + 29 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 29 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 869 * 16], m3 + +; mode 15 [row 19] +pmaddubsw m5, m2, [r5 + 12 * 16] +pmulhrsw m5, [pw_1024] +packuswb m5, m5 +movh [r0 + 870 * 16 + 8], m5 +pmaddubsw m3, m1, [r5 + 12 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 12 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 871 * 16], m3 + +; mode 15 [row 20 - 8 to 15] +pslldq m3, m2, 2 +pinsrb m3, [r3 + 4], 1 +pinsrb m3, [r3 + 6], 0 +pmaddubsw m5, m3, [r5 + 27 * 16] +pmulhrsw m5, [pw_1024] +packuswb m5, m5 +movh [r0 + 872 * 16 + 8], m5 + +; mode 15 [row 21 - 8 to 15] +pmaddubsw m5, m3, [r5 + 10 * 16] +pmulhrsw m5, [pw_1024] +packuswb m5, m5 +movh [r0 + 874 * 16 + 8], m5 + +; mode 15 [row 22 - 8 to 15] +pslldq m3, 2 +pinsrb m3, [r3 + 6], 1 +pinsrb m3, [r3 + 8], 0 +pmaddubsw m5, m3, [r5 + 25 * 16] +pmulhrsw m5, [pw_1024] +packuswb m5, m5 +movh [r0 + 876 * 16 + 8], m5 + +; mode 15 [row 23 - 8 to 15] +pmaddubsw m5, m3, [r5 + 8 * 16] +pmulhrsw m5, [pw_1024] +packuswb m5, m5 +movh [r0 + 878 * 16 + 8], m5 + +; mode 15 [row 24 - 8 to 15] +pslldq m3, 2 +pinsrb m3, [r3 + 8], 1 +pinsrb m3, [r3 + 9], 0 +pmaddubsw m5, m3, [r5 + 23 * 16] +pmulhrsw m5, [pw_1024] +packuswb m5, m5 +movh [r0 + 880 * 16 + 8], m5 + +; mode 15 [row 25 - 8 to 15] +pmaddubsw m5, m3, [r5 + 6 * 16] +pmulhrsw m5, [pw_1024] +packuswb m5, m5 +movh [r0 + 882 * 16 + 8], m5 + +; mode 15 [row 26 - 8 to 15] +pslldq m3, 2 +pinsrb m3, [r3 + 9], 1 +pinsrb m3, [r3 + 11], 0 +pmaddubsw m5, m3, [r5 + 21 * 16] +pmulhrsw m5, [pw_1024] +packuswb m5, m5 +movh [r0 + 884 * 16 + 8], m5 + +; mode 15 [row 27 - 8 to 15] +pmaddubsw m5, m3, [r5 + 4 * 16] +pmulhrsw m5, [pw_1024] +packuswb m5, m5 +movh [r0 + 886 * 16 + 8], m5 + +; mode 15 [row 28 - 8 to 15] +pslldq m3, 2 +pinsrb m3, [r3 + 11], 1 +pinsrb m3, [r3 + 13], 0 +pmaddubsw m5, m3, [r5 + 19 * 16] +pmulhrsw m5, [pw_1024] +packuswb m5, m5 +movh [r0 + 888 * 16 + 8], m5 + +; mode 15 [row 29 - 
8 to 15] +pmaddubsw m5, m3, [r5 + 2 * 16] +pmulhrsw m5, [pw_1024] +packuswb m5, m5 +movh [r0 + 890 * 16 + 8], m5 + +; mode 15 [row 30 - 8 to 15] +pslldq m3, 2 +pinsrb m3, [r3 + 13], 1 +pinsrb m3, [r3 + 15], 0 +pmaddubsw m5, m3, [r5 + 17 * 16] +pmulhrsw m5, [pw_1024] +packuswb m5, m5 +movh [r0 + 892 * 16 + 8], m5 + +; mode 15 [row 31, 8 to 15] +pshufb m5, m3, [tab_S2] +movh [r0 + 894 * 16 + 8], m5 + +; mode 14 [row 27] +pinsrb m2, [r3 + 5], 0 +pslldq m7, 2 +pinsrb m7, [r3 + 25], 1 +pinsrb m7, [r3 + 27], 0 +pmaddubsw m3, m7, [r5 + 20 * 16] +pmulhrsw m3, [pw_1024] +pslldq m2, 2 +pinsrb m2, [r3 + 5], 1 +pinsrb m2, [r3 + 7], 0 +pmaddubsw m5, m2, [r5 + 20 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 822 * 16], m3 +pslldq m1, 2 +pinsrw m1, [r4 + 5], 0 +pmaddubsw m3, m1, [r5 + 20 * 16] +pmulhrsw m3, [pw_1024] +pslldq m4, 2 +pinsrw m4, [r4 + 13], 0 +pmaddubsw m5, m4, [r5 + 20 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 823 * 16], m3 + +; mode 15 [row 20 - 16 to 31] +pmaddubsw m3, m1, [r5 + 27 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 27 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 873 * 16], m3 + +; mode 15 [row 21 - 16 to 31] +pmaddubsw m3, m1, [r5 + 10 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 10 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 875 * 16], m3 + +; mode 14 [row 28] +pmaddubsw m3, m7, [r5 + 7 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m2, [r5 + 7 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 824 * 16], m3 +pmaddubsw m3, m1, [r5 + 7 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 7 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 825 * 16], m3 + +; mode 14 [row 29] +pslldq m7, 2 +pinsrb m7, [r3 + 27], 1 +pinsrb m7, [r3 + 30], 0 +pmaddubsw m3, m7, [r5 + 26 * 16] +pmulhrsw m3, [pw_1024] +pslldq m2, 2 +pinsrb m2, [r3 + 7], 1 +pinsrb m2, [r3 + 10], 0 +pmaddubsw m5, m2, [r5 + 26 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 826 * 16], m3 
+pslldq m1, 2 +pinsrw m1, [r4 + 4], 0 +pmaddubsw m3, m1, [r5 + 26 * 16] +pmulhrsw m3, [pw_1024] +pslldq m4, 2 +pinsrw m4, [r4 + 12], 0 +pmaddubsw m5, m4, [r5 + 26 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 827 * 16], m3 + +; mode 14 [row 30] +pmaddubsw m3, m7, [r5 + 13 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m2, [r5 + 13 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 828 * 16], m3 +pmaddubsw m3, m1, [r5 + 13 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 13 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 829 * 16], m3 + +; mode 15 [row 22] +pmaddubsw m3, m1, [r5 + 25 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 25 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 877 * 16], m3 + +; mode 15 [row 23] +pmaddubsw m3, m1, [r5 + 8 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 8 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 879 * 16], m3 + +; mode 14 [row 31] +pshufb m3, m7, [tab_S2] +movh [r0 + 830 * 16], m3 +pshufb m3, m2, [tab_S2] +movh [r0 + 830 * 16 + 8], m3 +pshufb m3, m1, [tab_S2] +movh [r0 + 831 * 16], m3 +pshufb m3, m4, [tab_S2] +movh [r0 + 831 * 16 + 8], m3 + +; mode 13 [row 31] +pshufb m0, m6, [tab_S2] +movh [r0 + 766 * 16], m0 +movh m0, [r4] +movh [r0 + 766 * 16 + 8], m0 +movu m0, [r4 + 8] +movu [r0 + 767 * 16], m0 + +; mode 15 [row 24] +pslldq m1, 2 +pinsrw m1, [r4 + 3], 0 +pmaddubsw m3, m1, [r5 + 23 * 16] +pmulhrsw m3, [pw_1024] +pslldq m4, 2 +pinsrw m4, [r4 + 11], 0 +pmaddubsw m5, m4, [r5 + 23 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 881 * 16], m3 + +; mode 15 [row 25] +pmaddubsw m3, m1, [r5 + 6 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 6 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 883 * 16], m3 + +; mode 15 [row 26] +pslldq m1, 2 +pinsrw m1, [r4 + 2], 0 +pmaddubsw m3, m1, [r5 + 21 * 16] +pmulhrsw m3, [pw_1024] +pslldq m4, 2 +pinsrw m4, [r4 + 10], 0 +pmaddubsw m5, m4, [r5 + 21 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, 
m5 +movu [r0 + 885 * 16], m3 + +; mode 15 [row 27] +pmaddubsw m3, m1, [r5 + 4 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 4 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 887 * 16], m3 + +; mode 15 [row 28] +pslldq m1, 2 +pinsrw m1, [r4 + 1], 0 +pmaddubsw m3, m1, [r5 + 19 * 16] +pmulhrsw m3, [pw_1024] +pslldq m4, 2 +pinsrw m4, [r4 + 9], 0 +pmaddubsw m5, m4, [r5 + 19 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 889 * 16], m3 + +; mode 15 [row 29] +pmaddubsw m3, m1, [r5 + 2 * 16] +pmulhrsw m3, [pw_1024] +pmaddubsw m5, m4, [r5 + 2 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 891 * 16], m3 + +; mode 15 [row 30] +pslldq m1, 2 +pinsrw m1, [r4 + 0], 0 +pmaddubsw m3, m1, [r5 + 17 * 16] +pmulhrsw m3, [pw_1024] +pslldq m4, 2 +pinsrw m4, [r4 + 8], 0 +pmaddubsw m5, m4, [r5 + 17 * 16] +pmulhrsw m5, [pw_1024] +packuswb m3, m5 +movu [r0 + 893 * 16], m3 + +; mode 15 [row 31] +pshufb m5, m1, [tab_S2] +movh [r0 + 895 * 16], m5 +pshufb m5, m4, [tab_S2] +movh [r0 + 895 * 16 + 8], m5 + +; mode 16 [row 0] +movu m6, [r5 + 11 * 16] +movu m7, [pw_1024] +movh m0, [r4 ] +movh m1, [r4 + 1 ] +punpcklbw m0, m1 +pmaddubsw m1, m0, m6 +pmulhrsw m1, m7 +movh m2, [r4 + 8] +movh m3, [r4 + 9] +punpcklbw m2, m3 +pmaddubsw m3, m2, m6 +pmulhrsw m3, m7 +packuswb m1, m3 +movu [r0 + 896 * 16], m1 + +movh m1, [r4 + 16] +movh m3, [r4 + 17] +punpcklbw m1, m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +movh m4, [r4 + 24] +movh m5, [r4 + 25] +punpcklbw m4, m5 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 897 * 16], m3 + +; mode16 [row 1] +movu m6, [r5 + 22 * 16] +pslldq m0, 2 +pinsrb m0, [r4], 1 +pinsrb m0, [r3 + 2], 0 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pslldq m2, 2 +pinsrw m2, [r4 + 7], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 898 * 16], m3 + +pslldq m1, 2 +pinsrw m1, [r4 + 15], 0 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pslldq m4, 2 +pinsrw m4, [r4 + 23], 0 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, 
m5 +movu [r0 + 899 * 16], m3 + +; mode16 [row 2] +movu m6, [r5 + 1 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 900 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 901 * 16], m3 + +; mode16 [row 3] +movu m6, [r5 + 12 * 16] +pslldq m0, 2 +pinsrb m0, [r3 + 2], 1 +pinsrb m0, [r3 + 3], 0 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pslldq m2, 2 +pinsrw m2, [r4 + 6], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 902 * 16], m3 + +pslldq m1, 2 +pinsrw m1, [r4 + 14], 0 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pslldq m4, 2 +pinsrw m4, [r4 + 22], 0 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 903 * 16], m3 + +; mode16 [row 4] +movu m6, [r5 + 23 * 16] +pslldq m0, 2 +pinsrb m0, [r3 + 3], 1 +pinsrb m0, [r3 + 5], 0 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pslldq m2, 2 +pinsrw m2, [r4 + 5], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 904 * 16], m3 + +pslldq m1, 2 +pinsrw m1, [r4 + 13], 0 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pslldq m4, 2 +pinsrw m4, [r4 + 21], 0 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 905 * 16], m3 + +; mode16 [row 5] +movu m6, [r5 + 2 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 906 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 907 * 16], m3 + +; mode16 [row 6] +movu m6, [r5 + 13 * 16] +pslldq m0, 2 +pinsrb m0, [r3 + 5], 1 +pinsrb m0, [r3 + 6], 0 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pslldq m2, 2 +pinsrb m2, [r4 + 5], 1 +pinsrb m2, [r4 + 4], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 908 * 16], m3 +pslldq m1, 2 +pinsrw m1, [r4 + 12], 0 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pslldq m4, 2 +pinsrw m4, [r4 + 20], 0 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, 
m5 +movu [r0 + 909 * 16], m3 + +; mode16 [row 7] +movu m6, [r5 + 24 * 16] +pslldq m0, 2 +pinsrb m0, [r3 + 6], 1 +pinsrb m0, [r3 + 8], 0 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pslldq m2, 2 +pinsrw m2, [r4 + 3], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 910 * 16], m3 + +pslldq m1, 2 +pinsrw m1, [r4 + 11], 0 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pslldq m4, 2 +pinsrw m4, [r4 + 19], 0 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 911 * 16], m3 + +; mode16 [row 8] +movu m6, [r5 + 3 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 912 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 913 * 16], m3 + +; mode16 [row 9] +movu m6, [r5 + 14 * 16] +pslldq m0, 2 +pinsrb m0, [r3 + 8], 1 +pinsrb m0, [r3 + 9], 0 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pslldq m2, 2 +pinsrw m2, [r4 + 2], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 914 * 16], m3 + +pslldq m1, 2 +pinsrw m1, [r4 + 10], 0 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pslldq m4, 2 +pinsrw m4, [r4 + 18], 0 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 915 * 16], m3 + +; mode16 [row 10] +movu m6, [r5 + 25 * 16] +pslldq m0, 2 +pinsrb m0, [r3 + 9], 1 +pinsrb m0, [r3 + 11], 0 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pslldq m2, 2 +pinsrw m2, [r4 + 1], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 916 * 16], m3 + +pslldq m1, 2 +pinsrw m1, [r4 + 9], 0 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pslldq m4, 2 +pinsrb m4, [r4 + 18], 1 +pinsrb m4, [r4 + 17], 0 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 917 * 16], m3 + +; mode16 [row 11] +movu m6, [r5 + 4 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 918 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb 
m3, m5 +movu [r0 + 919 * 16], m3 + +; mode16 [row 12] +movu m6, [r5 + 15 * 16] +pslldq m0, 2 +pinsrb m0, [r3 + 11], 1 +pinsrb m0, [r3 + 12], 0 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pslldq m2, 2 +pinsrw m2, [r4 + 0], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 920 * 16], m3 + +pslldq m1, 2 +pinsrw m1, [r4 + 8], 0 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pslldq m4, 2 +pinsrw m4, [r4 + 16], 0 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 921 * 16], m3 + +; mode16 [row 13] +movu m6, [r5 + 26 * 16] +pslldq m0, 2 +pinsrb m0, [r3 + 12], 1 +pinsrb m0, [r3 + 14], 0 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pslldq m2, 2 +pinsrb m2, [r4 + 0], 1 +pinsrb m2, [r3 + 2], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 922 * 16], m3 + +pslldq m1, 2 +pinsrw m1, [r4 + 7], 0 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pslldq m4, 2 +pinsrw m4, [r4 + 15], 0 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 923 * 16], m3 + +; mode16 [row 14] +movu m6, [r5 + 5 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 924 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 925 * 16], m3 + +; mode16 [row 15] +movu m6, [r5 + 16 * 16] +pslldq m0, 2 +pinsrb m0, [r3 + 14], 1 +pinsrb m0, [r3 + 15], 0 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pslldq m2, 2 +pinsrb m2, [r3 + 2], 1 +pinsrb m2, [r3 + 3], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 926 * 16], m3 + +pslldq m1, 2 +pinsrw m1, [r4 + 6], 0 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pslldq m4, 2 +pinsrw m4, [r4 + 14], 0 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 927 * 16], m3 + +; mode16 [row 16] +movu m6, [r5 + 27 * 16] +pslldq m0, 2 +pinsrb m0, [r3 + 15], 1 +pinsrb m0, [r3 + 17], 0 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pslldq m2, 2 +pinsrb m2, [r3 + 3], 1 +pinsrb m2, [r3 + 5], 0 +pmaddubsw 
m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 928 * 16], m3 + +pslldq m1, 2 +pinsrw m1, [r4 + 5], 0 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pslldq m4, 2 +pinsrw m4, [r4 + 13], 0 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 929 * 16], m3 + +; mode16 [row 17] +movu m6, [r5 + 6 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 930 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 931 * 16], m3 + +; mode16 [row 18] +movu m6, [r5 + 17 * 16] +pslldq m0, 2 +pinsrb m0, [r3 + 17], 1 +pinsrb m0, [r3 + 18], 0 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pslldq m2, 2 +pinsrb m2, [r3 + 5], 1 +pinsrb m2, [r3 + 6], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 932 * 16], m3 + +pslldq m1, 2 +pinsrw m1, [r4 + 4], 0 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pslldq m4, 2 +pinsrw m4, [r4 + 12], 0 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 933 * 16], m3 + +; mode16 [row 19] +movu m6, [r5 + 28 * 16] +pslldq m0, 2 +pinsrb m0, [r3 + 18], 1 +pinsrb m0, [r3 + 20], 0 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pslldq m2, 2 +pinsrb m2, [r3 + 6], 1 +pinsrb m2, [r3 + 8], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 934 * 16], m3 + +pslldq m1, 2 +pinsrw m1, [r4 + 3], 0 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pslldq m4, 2 +pinsrw m4, [r4 + 11], 0 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 935 * 16], m3 + +; mode16 [row 20] +movu m6, [r5 + 7 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 936 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 937 * 16], m3 + +; mode16 [row 21] +movu m6, [r5 + 18 * 16] +pslldq m0, 2 +pinsrb m0, [r3 + 20], 1 +pinsrb m0, [r3 + 21], 0 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pslldq m2, 2 
+pinsrb m2, [r3 + 8], 1 +pinsrb m2, [r3 + 9], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 938 * 16], m3 + +pslldq m1, 2 +pinsrw m1, [r4 + 2], 0 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pslldq m4, 2 +pinsrw m4, [r4 + 10], 0 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 939 * 16], m3 + +; mode16 [row 22] +movu m6, [r5 + 29 * 16] +pslldq m0, 2 +pinsrb m0, [r3 + 21], 1 +pinsrb m0, [r3 + 23], 0 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pslldq m2, 2 +pinsrb m2, [r3 + 9], 1 +pinsrb m2, [r3 + 11], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 940 * 16], m3 + +pslldq m1, 2 +pinsrw m1, [r4 + 1], 0 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pslldq m4, 2 +pinsrw m4, [r4 + 9], 0 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 941 * 16], m3 + +; mode16 [row 23] +movu m6, [r5 + 8 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 942 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 943 * 16], m3 + +; mode16 [row 24] +movu m6, [r5 + 19 * 16] +pslldq m0, 2 +pinsrb m0, [r3 + 23], 1 +pinsrb m0, [r3 + 24], 0 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pslldq m2, 2 +pinsrb m2, [r3 + 11], 1 +pinsrb m2, [r3 + 12], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 944 * 16], m3 + +pslldq m1, 2 +pinsrw m1, [r4 + 0], 0 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pslldq m4, 2 +pinsrw m4, [r4 + 8], 0 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 945 * 16], m3 + +; mode16 [row 25] +movu m6, [r5 + 30 * 16] +pslldq m0, 2 +pinsrb m0, [r3 + 24], 1 +pinsrb m0, [r3 + 26], 0 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pslldq m2, 2 +pinsrb m2, [r3 + 12], 1 +pinsrb m2, [r3 + 14], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 946 * 16], m3 + +pslldq m1, 2 +pinsrb m1, [r4 + 0], 1 +pinsrb m1, [r3 + 2], 0 +pmaddubsw m3, m1, m6 +pmulhrsw 
m3, m7 +pslldq m4, 2 +pinsrw m4, [r4 + 7], 0 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 947 * 16], m3 + +; mode16 [row 26] +movu m6, [r5 + 9 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 948 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 949 * 16], m3 + +; mode16 [row 27] +movu m6, [r5 + 20 * 16] +pslldq m0, 2 +pinsrb m0, [r3 + 26], 1 +pinsrb m0, [r3 + 27], 0 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pslldq m2, 2 +pinsrb m2, [r3 + 14], 1 +pinsrb m2, [r3 + 15], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 950 * 16], m3 + +pslldq m1, 2 +pinsrb m1, [r3 + 2], 1 +pinsrb m1, [r3 + 3], 0 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pslldq m4, 2 +pinsrw m4, [r4 + 6], 0 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 951 * 16], m3 + +; mode16 [row 28] +movu m6, [r5 + 31 * 16] +pslldq m0, 2 +pinsrb m0, [r3 + 27], 1 +pinsrb m0, [r3 + 29], 0 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pslldq m2, 2 +pinsrb m2, [r3 + 15], 1 +pinsrb m2, [r3 + 17], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 952 * 16], m3 + +pslldq m1, 2 +pinsrb m1, [r3 + 3], 1 +pinsrb m1, [r3 + 5], 0 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pslldq m4, 2 +pinsrw m4, [r4 + 5], 0 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 953 * 16], m3 + +; mode16 [row 29] +movu m6, [r5 + 10 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 954 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 955 * 16], m3 + +; mode16 [row 30] +movu m6, [r5 + 21 * 16] +pslldq m0, 2 +pinsrb m0, [r3 + 29], 1 +pinsrb m0, [r3 + 30], 0 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pslldq m2, 2 +pinsrb m2, [r3 + 17], 1 +pinsrb m2, [r3 + 18], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 
+packuswb m3, m5 +movu [r0 + 956 * 16], m3 + +pslldq m1, 2 +pinsrb m1, [r3 + 5], 1 +pinsrb m1, [r3 + 6], 0 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pslldq m4, 2 +pinsrw m4, [r4 + 4], 0 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 957 * 16], m3 + +; mode16 [row 31] +pshufb m5, m0, [tab_S2] +movh [r0 + 958 * 16], m5 +pshufb m5, m2, [tab_S2] +movh [r0 + 958 * 16 + 8], m5 +pshufb m5, m1, [tab_S2] +movh [r0 + 959 * 16], m5 +pshufb m5, m4, [tab_S2] +movh [r0 + 959 * 16 + 8], m5 + +; mode 17 [row 0] +movu m6, [r5 + 6 * 16] +movu m7, [pw_1024] +movh m0, [r4 ] +movh m1, [r4 + 1 ] +punpcklbw m0, m1 +pmaddubsw m1, m0, m6 +pmulhrsw m1, m7 +movh m2, [r4 + 8] +movh m3, [r4 + 9] +punpcklbw m2, m3 +pmaddubsw m3, m2, m6 +pmulhrsw m3, m7 +packuswb m1, m3 +movu [r0 + 960 * 16], m1 + +movh m1, [r4 + 16] +movh m3, [r4 + 17] +punpcklbw m1, m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +movh m4, [r4 + 24] +movh m5, [r4 + 25] +punpcklbw m4, m5 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 961 * 16], m3 + +; mode17 [row 1] +movu m6, [r5 + 12 * 16] +pslldq m0, 2 +pinsrb m0, [r3 + 0], 1 +pinsrb m0, [r3 + 1], 0 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pslldq m2, 2 +pinsrw m2, [r4 + 7], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 962 * 16], m3 + +pslldq m1, 2 +pinsrw m1, [r4 + 15], 0 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pslldq m4, 2 +pinsrw m4, [r4 + 23], 0 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 963 * 16], m3 + +; mode17 [row 2] +movu m6, [r5 + 18 * 16] +pslldq m0, 2 +pinsrb m0, [r3 + 1], 1 +pinsrb m0, [r3 + 2], 0 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pslldq m2, 2 +pinsrw m2, [r4 + 6], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 964 * 16], m3 + +pslldq m1, 2 +pinsrw m1, [r4 + 14], 0 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pslldq m4, 2 +pinsrw m4, [r4 + 22], 0 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 965 * 16], m3 + +; mode17 [row 3] +movu m6, 
[r5 + 24 * 16] +pslldq m0, 2 +pinsrb m0, [r3 + 2], 1 +pinsrb m0, [r3 + 4], 0 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pslldq m2, 2 +pinsrw m2, [r4 + 5], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 966 * 16], m3 + +pslldq m1, 2 +pinsrw m1, [r4 + 13], 0 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pslldq m4, 2 +pinsrw m4, [r4 + 21], 0 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 967 * 16], m3 + +; mode17 [row 4] +movu m6, [r5 + 30 * 16] +pslldq m0, 2 +pinsrb m0, [r3 + 4], 1 +pinsrb m0, [r3 + 5], 0 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pslldq m2, 2 +pinsrw m2, [r4 + 4], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 968 * 16], m3 + +pslldq m1, 2 +pinsrw m1, [r4 + 12], 0 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pslldq m4, 2 +pinsrw m4, [r4 + 20], 0 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 969 * 16], m3 + +; mode17 [row 5] +movu m6, [r5 + 4 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 970 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 971 * 16], m3 + +; mode17 [row 6] +movu m6, [r5 + 10 * 16] +pslldq m0, 2 +pinsrb m0, [r3 + 5], 1 +pinsrb m0, [r3 + 6], 0 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pslldq m2, 2 +pinsrw m2, [r4 + 3], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 972 * 16], m3 + +pslldq m1, 2 +pinsrw m1, [r4 + 11], 0 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pslldq m4, 2 +pinsrw m4, [r4 + 19], 0 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 973 * 16], m3 + +; mode17 [row 7] +movu m6, [r5 + 16 * 16] +pslldq m0, 2 +pinsrb m0, [r3 + 6], 1 +pinsrb m0, [r3 + 7], 0 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pslldq m2, 2 +pinsrw m2, [r4 + 2], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 974 * 16], m3 + +pslldq m1, 2 +pinsrw m1, [r4 + 10], 0 +pmaddubsw m3, m1, m6 +pmulhrsw 
m3, m7 +pslldq m4, 2 +pinsrw m4, [r4 + 18], 0 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 975 * 16], m3 + +; mode17 [row 8] +movu m6, [r5 + 22 * 16] +pslldq m0, 2 +pinsrb m0, [r3 + 7], 1 +pinsrb m0, [r3 + 9], 0 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pslldq m2, 2 +pinsrw m2, [r4 + 1], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 976 * 16], m3 + +pslldq m1, 2 +pinsrw m1, [r4 + 9], 0 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pslldq m4, 2 +pinsrw m4, [r4 + 17], 0 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 977 * 16], m3 + +; mode17 [row 9] +movu m6, [r5 + 28 * 16] +pslldq m0, 2 +pinsrb m0, [r3 + 9], 1 +pinsrb m0, [r3 + 10], 0 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pslldq m2, 2 +pinsrw m2, [r4 + 0], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 978 * 16], m3 + +pslldq m1, 2 +pinsrw m1, [r4 + 8], 0 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pslldq m4, 2 +pinsrw m4, [r4 + 16], 0 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 979 * 16], m3 + +; mode17 [row 10] +movu m6, [r5 + 2 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 980 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 981 * 16], m3 + +; mode17 [row 11] +movu m6, [r5 + 8 * 16] +pslldq m0, 2 +pinsrb m0, [r3 + 10], 1 +pinsrb m0, [r3 + 11], 0 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pslldq m2, 2 +pinsrb m2, [r4 + 0], 1 +pinsrb m2, [r3 + 1], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 982 * 16], m3 + +pslldq m1, 2 +pinsrw m1, [r4 + 7], 0 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pslldq m4, 2 +pinsrw m4, [r4 + 15], 0 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 983 * 16], m3 + +; mode17 [row 12] +movu m6, [r5 + 14 * 16] +pslldq m0, 2 +pinsrb m0, [r3 + 11], 1 +pinsrb m0, [r3 + 12], 0 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pslldq 
m2, 2 +pinsrb m2, [r3 + 1], 1 +pinsrb m2, [r3 + 2], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 984 * 16], m3 + +pslldq m1, 2 +pinsrw m1, [r4 + 6], 0 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pslldq m4, 2 +pinsrw m4, [r4 + 14], 0 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 985 * 16], m3 + +; mode17 [row 13] +movu m6, [r5 + 20 * 16] +pslldq m0, 2 +pinsrb m0, [r3 + 12], 1 +pinsrb m0, [r3 + 14], 0 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pslldq m2, 2 +pinsrb m2, [r3 + 2], 1 +pinsrb m2, [r3 + 4], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 986 * 16], m3 + +pslldq m1, 2 +pinsrw m1, [r4 + 5], 0 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pslldq m4, 2 +pinsrw m4, [r4 + 13], 0 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 987 * 16], m3 + +; mode17 [row 14] +movu m6, [r5 + 26 * 16] +pslldq m0, 2 +pinsrb m0, [r3 + 14], 1 +pinsrb m0, [r3 + 15], 0 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pslldq m2, 2 +pinsrb m2, [r3 + 4], 1 +pinsrb m2, [r3 + 5], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 988 * 16], m3 + +pslldq m1, 2 +pinsrw m1, [r4 + 4], 0 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pslldq m4, 2 +pinsrw m4, [r4 + 12], 0 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 989 * 16], m3 + +; mode17 [row 15] +pshufb m5, m0, [tab_S2] +movh [r0 + 990 * 16], m5 +pshufb m5, m2, [tab_S2] +movh [r0 + 990 * 16 + 8], m5 +pshufb m5, m1, [tab_S2] +movh [r0 + 991 * 16], m5 +pshufb m5, m4, [tab_S2] +movh [r0 + 991 * 16 + 8], m5 + +; mode17 [row 16] +movu m6, [r5 + 6 * 16] +pslldq m0, 2 +pinsrb m0, [r3 + 15], 1 +pinsrb m0, [r3 + 16], 0 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pslldq m2, 2 +pinsrb m2, [r3 + 5], 1 +pinsrb m2, [r3 + 6], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 992 * 16], m3 + +pslldq m1, 2 +pinsrw m1, [r4 + 3], 0 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pslldq m4, 2 +pinsrw m4, [r4 + 11], 0 +pmaddubsw m5, m4, m6 +pmulhrsw 
m5, m7 +packuswb m3, m5 +movu [r0 + 993 * 16], m3 + +; mode17 [row 17] +movu m6, [r5 + 12 * 16] +pslldq m0, 2 +pinsrb m0, [r3 + 16], 1 +pinsrb m0, [r3 + 17], 0 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pslldq m2, 2 +pinsrb m2, [r3 + 6], 1 +pinsrb m2, [r3 + 7], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 994 * 16], m3 + +pslldq m1, 2 +pinsrw m1, [r4 + 2], 0 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pslldq m4, 2 +pinsrw m4, [r4 + 10], 0 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 995 * 16], m3 + +; mode17 [row 18] +movu m6, [r5 + 18 * 16] +pslldq m0, 2 +pinsrb m0, [r3 + 17], 1 +pinsrb m0, [r3 + 18], 0 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pslldq m2, 2 +pinsrb m2, [r3 + 7], 1 +pinsrb m2, [r3 + 9], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 996 * 16], m3 + +pslldq m1, 2 +pinsrw m1, [r4 + 1], 0 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pslldq m4, 2 +pinsrw m4, [r4 + 9], 0 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 997 * 16], m3 + +; mode17 [row 19] +movu m6, [r5 + 24 * 16] +pslldq m0, 2 +pinsrb m0, [r3 + 18], 1 +pinsrb m0, [r3 + 20], 0 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pslldq m2, 2 +pinsrb m2, [r3 + 9], 1 +pinsrb m2, [r3 + 10], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 998 * 16], m3 + +pslldq m1, 2 +pinsrw m1, [r4 + 0], 0 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pslldq m4, 2 +pinsrw m4, [r4 + 8], 0 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 999 * 16], m3 + +; mode17 [row 20] +movu m6, [r5 + 30 * 16] +pslldq m0, 2 +pinsrb m0, [r3 + 20], 1 +pinsrb m0, [r3 + 21], 0 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pslldq m2, 2 +pinsrb m2, [r3 + 10], 1 +pinsrb m2, [r3 + 11], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1000 * 16], m3 + +pslldq m1, 2 +pinsrb m1, [r4 + 0], 1 +pinsrb m1, [r3 + 1], 0 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pslldq m4, 2 +;pinsrb m4, [r4 + 8], 1 +;pinsrb m4, [r4 + 7], 0 +pinsrw 
m4, [r4 + 7], 0 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1001 * 16], m3 + +; mode17 [row 21] +movu m6, [r5 + 4 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1002 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1003 * 16], m3 + +; mode17 [row 22] +movu m6, [r5 + 10 * 16] +pslldq m0, 2 +pinsrb m0, [r3 + 21], 1 +pinsrb m0, [r3 + 22], 0 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pslldq m2, 2 +pinsrb m2, [r3 + 11], 1 +pinsrb m2, [r3 + 12], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1004 * 16], m3 + +pslldq m1, 2 +pinsrb m1, [r3 + 1], 1 +pinsrb m1, [r3 + 2], 0 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pslldq m4, 2 +pinsrw m4, [r4 + 6], 0 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1005 * 16], m3 + +; mode17 [row 23] +movu m6, [r5 + 16 * 16] +pslldq m0, 2 +pinsrb m0, [r3 + 22], 1 +pinsrb m0, [r3 + 23], 0 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pslldq m2, 2 +pinsrb m2, [r3 + 12], 1 +pinsrb m2, [r3 + 14], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1006 * 16], m3 + +pslldq m1, 2 +pinsrb m1, [r3 + 2], 1 +pinsrb m1, [r3 + 4], 0 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pslldq m4, 2 +pinsrw m4, [r4 + 5], 0 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1007 * 16], m3 + +; mode17 [row 24] +movu m6, [r5 + 22 * 16] +pslldq m0, 2 +pinsrb m0, [r3 + 23], 1 +pinsrb m0, [r3 + 25], 0 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pslldq m2, 2 +pinsrb m2, [r3 + 14], 1 +pinsrb m2, [r3 + 15], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1008 * 16], m3 + +pslldq m1, 2 +pinsrb m1, [r3 + 4], 1 +pinsrb m1, [r3 + 5], 0 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pslldq m4, 2 +pinsrw m4, [r4 + 4], 0 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1009 * 16], m3 + +; mode17 [row 25] +movu m6, [r5 + 28 * 16] 
+pslldq m0, 2 +pinsrb m0, [r3 + 25], 1 +pinsrb m0, [r3 + 26], 0 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pslldq m2, 2 +pinsrb m2, [r3 + 15], 1 +pinsrb m2, [r3 + 16], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1010 * 16], m3 + +pslldq m1, 2 +pinsrb m1, [r3 + 5], 1 +pinsrb m1, [r3 + 6], 0 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pslldq m4, 2 +pinsrw m4, [r4 + 3], 0 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1011 * 16], m3 + +; mode17 [row 26] +movu m6, [r5 + 2 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1012 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1013 * 16], m3 + +; mode17 [row 27] +movu m6, [r5 + 8 * 16] +pslldq m0, 2 +pinsrb m0, [r3 + 26], 1 +pinsrb m0, [r3 + 27], 0 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pslldq m2, 2 +pinsrb m2, [r3 + 16], 1 +pinsrb m2, [r3 + 17], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1014 * 16], m3 + +pslldq m1, 2 +pinsrb m1, [r3 + 6], 1 +pinsrb m1, [r3 + 7], 0 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pslldq m4, 2 +pinsrw m4, [r4 + 2], 0 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1015 * 16], m3 + +; mode17 [row 28] +movu m6, [r5 + 14 * 16] +pslldq m0, 2 +pinsrb m0, [r3 + 27], 1 +pinsrb m0, [r3 + 28], 0 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pslldq m2, 2 +pinsrb m2, [r3 + 17], 1 +pinsrb m2, [r3 + 18], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1016 * 16], m3 + +pslldq m1, 2 +pinsrb m1, [r3 + 7], 1 +pinsrb m1, [r3 + 9], 0 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pslldq m4, 2 +pinsrw m4, [r4 + 1], 0 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1017 * 16], m3 + +; mode17 [row 29] +movu m6, [r5 + 20 * 16] +pslldq m0, 2 +pinsrb m0, [r3 + 28], 1 +pinsrb m0, [r3 + 30], 0 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pslldq m2, 2 +pinsrb m2, [r3 + 18], 1 +pinsrb 
m2, [r3 + 20], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1018 * 16], m3 + +pslldq m1, 2 +pinsrb m1, [r3 + 9], 1 +pinsrb m1, [r3 + 10], 0 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pslldq m4, 2 +pinsrw m4, [r4 + 0], 0 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1019 * 16], m3 + +; mode17 [row 30] +movu m6, [r5 + 26 * 16] +pslldq m0, 2 +pinsrb m0, [r3 + 30], 1 +pinsrb m0, [r3 + 31], 0 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pslldq m2, 2 +pinsrb m2, [r3 + 20], 1 +pinsrb m2, [r3 + 21], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1020 * 16], m3 + +pslldq m1, 2 +pinsrb m1, [r3 + 10], 1 +pinsrb m1, [r3 + 11], 0 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pslldq m4, 2 +pinsrb m4, [r4 + 0], 1 +pinsrb m4, [r3 + 1], 0 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1021 * 16], m3 + +; mode17 [row 31] +pshufb m5, m0, [tab_S2] +movh [r0 + 1022 * 16], m5 +pshufb m5, m2, [tab_S2] +movh [r0 + 1022 * 16 + 8], m5 +pshufb m5, m1, [tab_S2] +movh [r0 + 1023 * 16], m5 +pshufb m5, m4, [tab_S2] +movh [r0 + 1023 * 16 + 8], m5 + +;mode 18[row 0] +movu m0, [r3] +movu [r0 + 1024 * 16], m0 +movu m1, [r3 + 16] +movu [r0 + 1025 * 16], m1 + +;mode 18[row 1] +pslldq m0, 1 +pinsrb m0, [r4 + 1], 0 +movu [r0 + 1026 * 16], m0 +pslldq m1, 1 +pinsrb m1, [r3 + 15], 0 +movu [r0 + 1027 * 16], m1 + +;mode 18[row 2] +pslldq m0, 1 +pinsrb m0, [r4 + 2], 0 +movu [r0 + 1028 * 16], m0 +pslldq m1, 1 +pinsrb m1, [r3 + 14], 0 +movu [r0 + 1029 * 16], m1 + +;mode 18[row 3] +pslldq m0, 1 +pinsrb m0, [r4 + 3], 0 +movu [r0 + 1030 * 16], m0 +pslldq m1, 1 +pinsrb m1, [r3 + 13], 0 +movu [r0 + 1031 * 16], m1 + +;mode 18[row 4] +pslldq m0, 1 +pinsrb m0, [r4 + 4], 0 +movu [r0 + 1032 * 16], m0 +pslldq m1, 1 +pinsrb m1, [r3 + 12], 0 +movu [r0 + 1033 * 16], m1 + +;mode 18[row 5] +pslldq m0, 1 +pinsrb m0, [r4 + 5], 0 +movu [r0 + 1034 * 16], m0 +pslldq m1, 1 +pinsrb m1, [r3 + 11], 0 +movu [r0 + 1035 * 16], m1 + +;mode 18[row 6] +pslldq m0, 1 
+pinsrb m0, [r4 + 6], 0 +movu [r0 + 1036 * 16], m0 +pslldq m1, 1 +pinsrb m1, [r3 + 10], 0 +movu [r0 + 1037 * 16], m1 + +;mode 18[row 7] +pslldq m0, 1 +pinsrb m0, [r4 + 7], 0 +movu [r0 + 1038 * 16], m0 +pslldq m1, 1 +pinsrb m1, [r3 + 9], 0 +movu [r0 + 1039 * 16], m1 + +;mode 18[row 8] +pslldq m0, 1 +pinsrb m0, [r4 + 8], 0 +movu [r0 + 1040 * 16], m0 +pslldq m1, 1 +pinsrb m1, [r3 + 8], 0 +movu [r0 + 1041 * 16], m1 + +;mode 18[row 9] +pslldq m0, 1 +pinsrb m0, [r4 + 9], 0 +movu [r0 + 1042 * 16], m0 +pslldq m1, 1 +pinsrb m1, [r3 + 7], 0 +movu [r0 + 1043 * 16], m1 + +;mode 18[row 10] +pslldq m0, 1 +pinsrb m0, [r4 + 10], 0 +movu [r0 + 1044 * 16], m0 +pslldq m1, 1 +pinsrb m1, [r3 + 6], 0 +movu [r0 + 1045 * 16], m1 + +;mode 18[row 11] +pslldq m0, 1 +pinsrb m0, [r4 + 11], 0 +movu [r0 + 1046 * 16], m0 +pslldq m1, 1 +pinsrb m1, [r3 + 5], 0 +movu [r0 + 1047 * 16], m1 + +;mode 18[row 12] +pslldq m0, 1 +pinsrb m0, [r4 + 12], 0 +movu [r0 + 1048 * 16], m0 +pslldq m1, 1 +pinsrb m1, [r3 + 4], 0 +movu [r0 + 1049 * 16], m1 + +;mode 18[row 13] +pslldq m0, 1 +pinsrb m0, [r4 + 13], 0 +movu [r0 + 1050 * 16], m0 +pslldq m1, 1 +pinsrb m1, [r3 + 3], 0 +movu [r0 + 1051 * 16], m1 + +;mode 18[row 14] +pslldq m0, 1 +pinsrb m0, [r4 + 14], 0 +movu [r0 + 1052 * 16], m0 +pslldq m1, 1 +pinsrb m1, [r3 + 2], 0 +movu [r0 + 1053 * 16], m1 + +;mode 18[row 15] +pslldq m0, 1 +pinsrb m0, [r4 + 15], 0 +movu [r0 + 1054 * 16], m0 +pslldq m1, 1 +pinsrb m1, [r3 + 1], 0 +movu [r0 + 1055 * 16], m1 + +;mode 18[row 16] +pslldq m0, 1 +pinsrb m0, [r4 + 16], 0 +movu [r0 + 1056 * 16], m0 +pslldq m1, 1 +pinsrb m1, [r3 + 0], 0 +movu [r0 + 1057 * 16], m1 + +;mode 18[row 17] +pslldq m0, 1 +pinsrb m0, [r4 + 17], 0 +movu [r0 + 1058 * 16], m0 +pslldq m1, 1 +pinsrb m1, [r4 + 1], 0 +movu [r0 + 1059 * 16], m1 + +;mode 18[row 18] +pslldq m0, 1 +pinsrb m0, [r4 + 18], 0 +movu [r0 + 1060 * 16], m0 +pslldq m1, 1 +pinsrb m1, [r4 + 2], 0 +movu [r0 + 1061 * 16], m1 + +;mode 18[row 19] +pslldq m0, 1 +pinsrb m0, [r4 + 19], 0 +movu [r0 + 1062 
* 16], m0 +pslldq m1, 1 +pinsrb m1, [r4 + 3], 0 +movu [r0 + 1063 * 16], m1 + +;mode 18[row 20] +pslldq m0, 1 +pinsrb m0, [r4 + 20], 0 +movu [r0 + 1064 * 16], m0 +pslldq m1, 1 +pinsrb m1, [r4 + 4], 0 +movu [r0 + 1065 * 16], m1 + +;mode 18[row 21] +pslldq m0, 1 +pinsrb m0, [r4 + 21], 0 +movu [r0 + 1066 * 16], m0 +pslldq m1, 1 +pinsrb m1, [r4 + 5], 0 +movu [r0 + 1067 * 16], m1 + +;mode 18[row 22] +pslldq m0, 1 +pinsrb m0, [r4 + 22], 0 +movu [r0 + 1068 * 16], m0 +pslldq m1, 1 +pinsrb m1, [r4 + 6], 0 +movu [r0 + 1069 * 16], m1 + +;mode 18[row 23] +pslldq m0, 1 +pinsrb m0, [r4 + 23], 0 +movu [r0 + 1070 * 16], m0 +pslldq m1, 1 +pinsrb m1, [r4 + 7], 0 +movu [r0 + 1071 * 16], m1 + +;mode 18[row 24] +pslldq m0, 1 +pinsrb m0, [r4 + 24], 0 +movu [r0 + 1072 * 16], m0 +pslldq m1, 1 +pinsrb m1, [r4 + 8], 0 +movu [r0 + 1073 * 16], m1 + +;mode 18[row 25] +pslldq m0, 1 +pinsrb m0, [r4 + 25], 0 +movu [r0 + 1074 * 16], m0 +pslldq m1, 1 +pinsrb m1, [r4 + 9], 0 +movu [r0 + 1075 * 16], m1 + +;mode 18[row 26] +pslldq m0, 1 +pinsrb m0, [r4 + 26], 0 +movu [r0 + 1076 * 16], m0 +pslldq m1, 1 +pinsrb m1, [r4 + 10], 0 +movu [r0 + 1077 * 16], m1 + +;mode 18[row 27] +pslldq m0, 1 +pinsrb m0, [r4 + 27], 0 +movu [r0 + 1078 * 16], m0 +pslldq m1, 1 +pinsrb m1, [r4 + 11], 0 +movu [r0 + 1079 * 16], m1 + +;mode 18[row 28] +pslldq m0, 1 +pinsrb m0, [r4 + 28], 0 +movu [r0 + 1080 * 16], m0 +pslldq m1, 1 +pinsrb m1, [r4 + 12], 0 +movu [r0 + 1081 * 16], m1 + +;mode 18[row 29] +pslldq m0, 1 +pinsrb m0, [r4 + 29], 0 +movu [r0 + 1082 * 16], m0 +pslldq m1, 1 +pinsrb m1, [r4 + 13], 0 +movu [r0 + 1083 * 16], m1 + +;mode 18[row 30] +pslldq m0, 1 +pinsrb m0, [r4 + 30], 0 +movu [r0 + 1084 * 16], m0 +pslldq m1, 1 +pinsrb m1, [r4 + 14], 0 +movu [r0 + 1085 * 16], m1 + +;mode 18[row 31] +pslldq m0, 1 +pinsrb m0, [r4 + 31], 0 +movu [r0 + 1086 * 16], m0 +pslldq m1, 1 +pinsrb m1, [r4 + 15], 0 +movu [r0 + 1087 * 16], m1 + +; mode 19 [row 0] +movu m6, [r5 + 6 * 16] +movu m0, [r3 ] +movu m1, [r3 + 1 ] +punpcklbw m0, m1 
+pmaddubsw m1, m0, m6 +pmulhrsw m1, m7 +movu m2, [r3 + 8] +movu m3, [r3 + 9] +punpcklbw m2, m3 +pmaddubsw m3, m2, m6 +pmulhrsw m3, m7 +packuswb m1, m3 +movu [r0 + 1088 * 16], m1 + +movu m1, [r3 + 16] +movu m3, [r3 + 17] +punpcklbw m1, m3 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +movu m3, [r3 + 24] +movu m5, [r3 + 25] +punpcklbw m3, m5 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1089 * 16], m4 + +; mode 19 [row 1] +movu m6, [r5 + 12 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 0], 1 +pinsrb m0, [r4 + 1], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrw m2, [r3 + 7], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1090 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 15], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 23], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1091 * 16], m4 + +; mode 19 [row 2] +movu m6, [r5 + 18 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 1], 1 +pinsrb m0, [r4 + 2], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrw m2, [r3 + 6], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1092 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 14], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 22], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1093 * 16], m4 + +; mode 19 [row 3] +movu m6, [r5 + 24 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 2], 1 +pinsrb m0, [r4 + 4], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrw m2, [r3 + 5], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1094 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 13], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 21], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1095 * 16], m4 + +; mode 19 [row 4] +movu m6, [r5 + 30 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 4], 1 +pinsrb m0, [r4 + 5], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrw m2, [r3 
+ 4], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1096 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 12], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 20], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1097 * 16], m4 + +; mode 19 [row 5] +movu m6, [r5 + 4 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1098 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1099 * 16], m4 + +; mode 19 [row 6] +movu m6, [r5 + 10 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 5], 1 +pinsrb m0, [r4 + 6], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrw m2, [r3 + 3], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1100 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 11], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 19], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1101 * 16], m4 + +; mode 19 [row 7] +movu m6, [r5 + 16 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 6], 1 +pinsrb m0, [r4 + 7], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrw m2, [r3 + 2], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1102 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 10], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 18], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1103 * 16], m4 + +; mode 19 [row 8] +movu m6, [r5 + 22 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 7], 1 +pinsrb m0, [r4 + 9], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrw m2, [r3 + 1], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1104 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 9], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 17], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1105 * 16], m4 + +; mode 19 
[row 9] +movu m6, [r5 + 28 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 9], 1 +pinsrb m0, [r4 + 10], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrw m2, [r3 + 0], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1106 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 8], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 16], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1107 * 16], m4 + +; mode 19 [row 10] +movu m6, [r5 + 2 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1108 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1109 * 16], m4 + +; mode 19 [row 11] +movu m6, [r5 + 8 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 10], 1 +pinsrb m0, [r4 + 11], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrb m2, [r3 + 0], 1 +pinsrb m2, [r4 + 1], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1110 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 7], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 15], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1111 * 16], m4 + +; mode 19 [row 12] +movu m6, [r5 + 14 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 11], 1 +pinsrb m0, [r4 + 12], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrb m2, [r4 + 1], 1 +pinsrb m2, [r4 + 2], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1112 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 6], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 14], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1113 * 16], m4 + +; mode 19 [row 13] +movu m6, [r5 + 20 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 12], 1 +pinsrb m0, [r4 + 14], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrb m2, [r4 + 2], 1 +pinsrb m2, [r4 + 4], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 
+movu [r0 + 1114 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 5], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 13], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1115 * 16], m4 + +; mode 19 [row 14] +movu m6, [r5 + 26 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 14], 1 +pinsrb m0, [r4 + 15], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrb m2, [r4 + 4], 1 +pinsrb m2, [r4 + 5], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1116 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 4], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 12], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1117 * 16], m4 + +; mode19 [row 15] +pshufb m5, m0, [tab_S2] +movh [r0 + 1118 * 16], m5 +pshufb m5, m2, [tab_S2] +movh [r0 + 1118 * 16 + 8], m5 +pshufb m5, m1, [tab_S2] +movh [r0 + 1119 * 16], m5 +pshufb m5, m3, [tab_S2] +movh [r0 + 1119 * 16 + 8], m5 + +; mode 19 [row 16] +movu m6, [r5 + 6 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 15], 1 +pinsrb m0, [r4 + 16], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrb m2, [r4 + 5], 1 +pinsrb m2, [r4 + 6], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1120 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 3], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 11], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1121 * 16], m4 + +; mode 19 [row 17] +movu m6, [r5 + 12 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 16], 1 +pinsrb m0, [r4 + 17], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrb m2, [r4 + 6], 1 +pinsrb m2, [r4 + 7], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1122 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 2], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 10], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1123 * 16], m4 + +; mode 19 [row 18] +movu m6, [r5 + 18 * 16] +pslldq 
m0, 2 +pinsrb m0, [r4 + 17], 1 +pinsrb m0, [r4 + 18], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrb m2, [r4 + 7], 1 +pinsrb m2, [r4 + 9], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1124 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 1], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 9], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1125 * 16], m4 + +; mode 19 [row 19] +movu m6, [r5 + 24 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 18], 1 +pinsrb m0, [r4 + 20], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrb m2, [r4 + 9], 1 +pinsrb m2, [r4 + 10], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1126 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 0], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 8], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1127 * 16], m4 + +; mode 19 [row 20] +movu m6, [r5 + 30 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 20], 1 +pinsrb m0, [r4 + 21], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrb m2, [r4 + 10], 1 +pinsrb m2, [r4 + 11], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1128 * 16], m4 +pslldq m1, 2 +pinsrb m1, [r4 + 0], 1 +pinsrb m1, [r4 + 1], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrb m3, [r3 + 8], 1 +pinsrb m3, [r3 + 7], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1129 * 16], m4 + +; mode 19 [row 21] +movu m6, [r5 + 4 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1130 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1131 * 16], m4 + +; mode 19 [row 22] +movu m6, [r5 + 10 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 21], 1 +pinsrb m0, [r4 + 22], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrb m2, [r4 + 11], 1 +pinsrb m2, [r4 + 12], 0 +pmaddubsw m5, m2, 
m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1132 * 16], m4 +pslldq m1, 2 +pinsrb m1, [r4 + 1], 1 +pinsrb m1, [r4 + 2], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 6], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1133 * 16], m4 + +; mode 19 [row 23] +movu m6, [r5 + 16 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 22], 1 +pinsrb m0, [r4 + 23], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrb m2, [r4 + 12], 1 +pinsrb m2, [r4 + 14], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1134 * 16], m4 +pslldq m1, 2 +pinsrb m1, [r4 + 2], 1 +pinsrb m1, [r4 + 4], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 5], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1135 * 16], m4 + +; mode 19 [row 24] +movu m6, [r5 + 22 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 23], 1 +pinsrb m0, [r4 + 25], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrb m2, [r4 + 14], 1 +pinsrb m2, [r4 + 15], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1136 * 16], m4 +pslldq m1, 2 +pinsrb m1, [r4 + 4], 1 +pinsrb m1, [r4 + 5], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 4], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1137 * 16], m4 + +; mode 19 [row 25] +movu m6, [r5 + 28 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 25], 1 +pinsrb m0, [r4 + 26], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrb m2, [r4 + 15], 1 +pinsrb m2, [r4 + 16], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1138 * 16], m4 +pslldq m1, 2 +pinsrb m1, [r4 + 5], 1 +pinsrb m1, [r4 + 6], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 3], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1139 * 16], m4 + +; mode 19 [row 26] +movu m6, [r5 + 2 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 
1140 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1141 * 16], m4 + +; mode 19 [row 27] +movu m6, [r5 + 8 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 26], 1 +pinsrb m0, [r4 + 27], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrb m2, [r4 + 16], 1 +pinsrb m2, [r4 + 17], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1142 * 16], m4 +pslldq m1, 2 +pinsrb m1, [r4 + 6], 1 +pinsrb m1, [r4 + 7], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 2], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1143 * 16], m4 + +; mode 19 [row 28] +movu m6, [r5 + 14 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 27], 1 +pinsrb m0, [r4 + 28], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrb m2, [r4 + 17], 1 +pinsrb m2, [r4 + 18], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1144 * 16], m4 +pslldq m1, 2 +pinsrb m1, [r4 + 7], 1 +pinsrb m1, [r4 + 9], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 1], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1145 * 16], m4 + +; mode 19 [row 29] +movu m6, [r5 + 20 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 28], 1 +pinsrb m0, [r4 + 30], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrb m2, [r4 + 18], 1 +pinsrb m2, [r4 + 20], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1146 * 16], m4 +pslldq m1, 2 +pinsrb m1, [r4 + 9], 1 +pinsrb m1, [r4 + 10], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 0], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1147 * 16], m4 + +; mode 19 [row 30] +movu m6, [r5 + 26 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 30], 1 +pinsrb m0, [r4 + 31], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrb m2, [r4 + 20], 1 +pinsrb m2, [r4 + 21], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1148 * 16], m4 +pslldq 
m1, 2 +pinsrb m1, [r4 + 10], 1 +pinsrb m1, [r4 + 11], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrb m3, [r4 + 0], 1 +pinsrb m3, [r4 + 1], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1149 * 16], m4 + +; mode19 [row 31] +pshufb m5, m0, [tab_S2] +movh [r0 + 1150 * 16], m5 +pshufb m5, m2, [tab_S2] +movh [r0 + 1150 * 16 + 8], m5 +pshufb m5, m1, [tab_S2] +movh [r0 + 1151 * 16], m5 +pshufb m5, m3, [tab_S2] +movh [r0 + 1151 * 16 + 8], m5 + +; mode 20 [row 0] +movu m6, [r5 + 11 * 16] +movu m0, [r3 ] +movu m1, [r3 + 1 ] +punpcklbw m0, m1 +pmaddubsw m1, m0, m6 +pmulhrsw m1, m7 +movu m2, [r3 + 8] +movu m3, [r3 + 9] +punpcklbw m2, m3 +pmaddubsw m3, m2, m6 +pmulhrsw m3, m7 +packuswb m1, m3 +movu [r0 + 1152 * 16], m1 + +movu m1, [r3 + 16] +movu m3, [r3 + 17] +punpcklbw m1, m3 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +movu m3, [r3 + 24] +movu m5, [r3 + 25] +punpcklbw m3, m5 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1153 * 16], m4 + +; mode 20 [row 1] +movu m6, [r5 + 22 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 0], 1 +pinsrb m0, [r4 + 2], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrw m2, [r3 + 7], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1154 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 15], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 23], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1155 * 16], m4 + +; mode 20 [row 2] +movu m6, [r5 + 1 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1156 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1157 * 16], m4 + +; mode 20 [row 3] +movu m6, [r5 + 12 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 2], 1 +pinsrb m0, [r4 + 3], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrw m2, [r3 + 6], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu 
[r0 + 1158 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 14], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 22], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1159 * 16], m4 + +; mode 20 [row 4] +movu m6, [r5 + 23 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 3], 1 +pinsrb m0, [r4 + 5], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrw m2, [r3 + 5], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1160 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 13], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 21], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1161 * 16], m4 + +; mode 20 [row 5] +movu m6, [r5 + 2 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1162 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1163 * 16], m4 + +; mode 20 [row 6] +movu m6, [r5 + 13 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 5], 1 +pinsrb m0, [r4 + 6], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrw m2, [r3 + 4], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1164 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 12], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 20], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1165 * 16], m4 + +; mode 20 [row 7] +movu m6, [r5 + 24 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 6], 1 +pinsrb m0, [r4 + 8], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrw m2, [r3 + 3], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1166 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 11], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 19], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1167 * 16], m4 + +; mode 20 [row 8] +movu m6, [r5 + 3 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 
+pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1168 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1169 * 16], m4 + +; mode 20 [row 9] +movu m6, [r5 + 14 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 8], 1 +pinsrb m0, [r4 + 9], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrb m2, [r3 + 3], 1 +pinsrb m2, [r3 + 2], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1170 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 10], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 18], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1171 * 16], m4 + +; mode 20 [row 10] +movu m6, [r5 + 25 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 9], 1 +pinsrb m0, [r4 + 11], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrw m2, [r3 + 1], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1172 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 9], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 17], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1173 * 16], m4 + +; mode 20 [row 11] +movu m6, [r5 + 4 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1174 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1175 * 16], m4 + +; mode 20 [row 12] +movu m6, [r5 + 15 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 11], 1 +pinsrb m0, [r4 + 12], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrb m2, [r3 + 1], 1 +pinsrb m2, [r3 + 0], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1176 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 8], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 16], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1177 * 16], m4 + +; mode 20 [row 13] +movu m6, [r5 + 26 * 16] 
+pslldq m0, 2 +pinsrb m0, [r4 + 12], 1 +pinsrb m0, [r4 + 14], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrb m2, [r4 + 0], 1 +pinsrb m2, [r4 + 2], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1178 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 7], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 15], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1179 * 16], m4 + +; mode 20 [row 14] +movu m6, [r5 + 5 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1180 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1181 * 16], m4 + +; mode 20 [row 15] +movu m6, [r5 + 16 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 14], 1 +pinsrb m0, [r4 + 15], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrb m2, [r4 + 2], 1 +pinsrb m2, [r4 + 3], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1182 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 6], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 14], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1183 * 16], m4 + +; mode 20 [row 16] +movu m6, [r5 + 27 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 15], 1 +pinsrb m0, [r4 + 17], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrb m2, [r4 + 3], 1 +pinsrb m2, [r4 + 5], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1184 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 5], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 13], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1185 * 16], m4 + +; mode 20 [row 17] +movu m6, [r5 + 6 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1186 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 
1187 * 16], m4 + +; mode 20 [row 18] +movu m6, [r5 + 17 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 17], 1 +pinsrb m0, [r4 + 18], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrb m2, [r4 + 5], 1 +pinsrb m2, [r4 + 6], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1188 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 4], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 12], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1189 * 16], m4 + +; mode 20 [row 19] +movu m6, [r5 + 28 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 18], 1 +pinsrb m0, [r4 + 20], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrb m2, [r4 + 6], 1 +pinsrb m2, [r4 + 8], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1190 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 3], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 11], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1191 * 16], m4 + +; mode 20 [row 20] +movu m6, [r5 + 7 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1192 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1193 * 16], m4 + +; mode 20 [row 21] +movu m6, [r5 + 18 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 20], 1 +pinsrb m0, [r4 + 21], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrb m2, [r4 + 8], 1 +pinsrb m2, [r4 + 9], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1194 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 2], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 10], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1195 * 16], m4 + +; mode 20 [row 22] +movu m6, [r5 + 29 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 21], 1 +pinsrb m0, [r4 + 23], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrb m2, [r4 + 9], 1 +pinsrb m2, [r4 + 11], 0 
+pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1196 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 1], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 9], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1197 * 16], m4 + +; mode 20 [row 23] +movu m6, [r5 + 8 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1198 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1199 * 16], m4 + +; mode 20 [row 24] +movu m6, [r5 + 19 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 23], 1 +pinsrb m0, [r4 + 24], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrb m2, [r4 + 11], 1 +pinsrb m2, [r4 + 12], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1200 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 0], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 8], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1201 * 16], m4 + +; mode 20 [row 25] +movu m6, [r5 + 30 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 24], 1 +pinsrb m0, [r4 + 26], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrb m2, [r4 + 12], 1 +pinsrb m2, [r4 + 14], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1202 * 16], m4 +pslldq m1, 2 +pinsrb m1, [r4 + 0], 1 +pinsrb m1, [r4 + 2], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 7], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1203 * 16], m4 + +; mode 20 [row 26] +movu m6, [r5 + 9 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1204 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1205 * 16], m4 + +; mode 20 [row 27] +movu m6, [r5 + 20 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 26], 1 +pinsrb m0, [r4 + 27], 0 +pmaddubsw 
m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrb m2, [r4 + 14], 1 +pinsrb m2, [r4 + 15], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1206 * 16], m4 +pslldq m1, 2 +pinsrb m1, [r4 + 2], 1 +pinsrb m1, [r4 + 3], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 6], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1207 * 16], m4 + +; mode 20 [row 28] +movu m6, [r5 + 31 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 27], 1 +pinsrb m0, [r4 + 29], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrb m2, [r4 + 15], 1 +pinsrb m2, [r4 + 17], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1208 * 16], m4 +pslldq m1, 2 +pinsrb m1, [r4 + 3], 1 +pinsrb m1, [r4 + 5], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 5], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1209 * 16], m4 + +; mode 20 [row 29] +movu m6, [r5 + 10 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1210 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1211 * 16], m4 + +; mode 20 [row 30] +movu m6, [r5 + 21 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 29], 1 +pinsrb m0, [r4 + 30], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrb m2, [r4 + 17], 1 +pinsrb m2, [r4 + 18], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1212 * 16], m4 +pslldq m1, 2 +pinsrb m1, [r4 + 5], 1 +pinsrb m1, [r4 + 6], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 4], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1213 * 16], m4 + +; mode20 [row 31] +pshufb m5, m0, [tab_S2] +movh [r0 + 1214 * 16], m5 +pshufb m5, m2, [tab_S2] +movh [r0 + 1214 * 16 + 8], m5 +pshufb m5, m1, [tab_S2] +movh [r0 + 1215 * 16], m5 +pshufb m5, m3, [tab_S2] +movh [r0 + 1215 * 16 + 8], m5 + +; mode 21 [row 0] +movu m6, [r5 + 
15 * 16] +movu m0, [r3 ] +movu m1, [r3 + 1 ] +punpcklbw m0, m1 +pmaddubsw m1, m0, m6 +pmulhrsw m1, m7 +movu m2, [r3 + 8] +movu m3, [r3 + 9] +punpcklbw m2, m3 +pmaddubsw m3, m2, m6 +pmulhrsw m3, m7 +packuswb m1, m3 +movu [r0 + 1216 * 16], m1 + +movu m1, [r3 + 16] +movu m3, [r3 + 17] +punpcklbw m1, m3 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +movu m3, [r3 + 24] +movu m5, [r3 + 25] +punpcklbw m3, m5 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1217 * 16], m4 + +; mode 21 [row 1] +movu m6, [r5 + 30 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 0], 1 +pinsrb m0, [r4 + 2], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrw m2, [r3 + 7], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1218 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 15], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 23], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1219 * 16], m4 + +; mode 21 [row 2] +movu m6, [r5 + 13 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1220 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1221 * 16], m4 + +; mode 21 [row 3] +movu m6, [r5 + 28 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 2], 1 +pinsrb m0, [r4 + 4], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrw m2, [r3 + 6], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1222 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 14], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 22], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1223 * 16], m4 + +; mode 21 [row 4] +movu m6, [r5 + 11 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1224 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1225 * 16], m4 + 
+; mode 21 [row 5] +movu m6, [r5 + 26 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 4], 1 +pinsrb m0, [r4 + 6], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrw m2, [r3 + 5], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1226 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 13], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 21], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1227 * 16], m4 + +; mode 21 [row 6] +movu m6, [r5 + 9 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1228 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1229 * 16], m4 + +; mode 21 [row 7] +movu m6, [r5 + 24 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 6], 1 +pinsrb m0, [r4 + 8], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrw m2, [r3 + 4], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1230 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 12], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 20], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1231 * 16], m4 + +; mode 21 [row 8] +movu m6, [r5 + 7 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1232 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1233 * 16], m4 + +; mode 21 [row 9] +movu m6, [r5 + 22 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 8], 1 +pinsrb m0, [r4 + 9], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrw m2, [r3 + 3], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1234 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 11], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 19], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1235 * 16], m4 + +; mode 21 [row 
10] +movu m6, [r5 + 5 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1236 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1237 * 16], m4 + +; mode 21 [row 11] +movu m6, [r5 + 20 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 9], 1 +pinsrb m0, [r4 + 11], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrw m2, [r3 + 2], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1238 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 10], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 18], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1239 * 16], m4 + +; mode 21 [row 12] +movu m6, [r5 + 3 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1240 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1241 * 16], m4 + +; mode 21 [row 13] +movu m6, [r5 + 18 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 11], 1 +pinsrb m0, [r4 + 13], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrw m2, [r3 + 1], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1242 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 9], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 17], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1243 * 16], m4 + +; mode 21 [row 14] +movu m6, [r5 + 1 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1244 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1245 * 16], m4 + +; mode 21 [row 15] +movu m6, [r5 + 16 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 13], 1 +pinsrb m0, [r4 + 15], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrw m2, [r3 + 0], 0 +pmaddubsw m5, m2, 
m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1246 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 8], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 16], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1247 * 16], m4 + +; mode 21 [row 16] +movu m6, [r5 + 31 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 15], 1 +pinsrb m0, [r4 + 17], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrb m2, [r4 + 0], 1 +pinsrb m2, [r4 + 2], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1248 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 7], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 15], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1249 * 16], m4 + +; mode 21 [row 17] +movu m6, [r5 + 14 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1250 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1251 * 16], m4 + +; mode 21 [row 18] +movu m6, [r5 + 29 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 17], 1 +pinsrb m0, [r4 + 19], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrb m2, [r4 + 2], 1 +pinsrb m2, [r4 + 4], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1252 * 16], m4 +pslldq m1, 2 +pinsrb m1, [r3 + 7], 1 +pinsrb m1, [r3 + 6], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrb m3, [r3 + 15], 1 +pinsrb m3, [r3 + 14], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1253 * 16], m4 + +; mode 21 [row 19] +movu m6, [r5 + 12 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1254 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1255 * 16], m4 + +; mode 21 [row 20] +movu m6, [r5 + 27 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 19], 1 +pinsrb m0, [r4 + 21], 0 
+pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrb m2, [r4 + 4], 1 +pinsrb m2, [r4 + 6], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1256 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 5], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 13], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1257 * 16], m4 + +; mode 21 [row 21] +movu m6, [r5 + 10 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1258 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1259 * 16], m4 + +; mode 21 [row 22] +movu m6, [r5 + 25 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 21], 1 +pinsrb m0, [r4 + 23], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrb m2, [r4 + 6], 1 +pinsrb m2, [r4 + 8], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1260 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 4], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 12], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1261 * 16], m4 + +; mode 21 [row 23] +movu m6, [r5 + 8 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1262 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1263 * 16], m4 + +; mode 21 [row 24] +movu m6, [r5 + 23 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 23], 1 +pinsrb m0, [r4 + 24], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrb m2, [r4 + 8], 1 +pinsrb m2, [r4 + 9], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1264 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 3], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 11], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1265 * 16], m4 + +; mode 21 [row 25] +movu m6, [r5 + 6 * 16] 
+pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1266 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1267 * 16], m4 + +; mode 21 [row 26] +movu m6, [r5 + 21 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 24], 1 +pinsrb m0, [r4 + 26], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrb m2, [r4 + 9], 1 +pinsrb m2, [r4 + 11], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1268 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 2], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 10], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1269 * 16], m4 + +; mode 21 [row 27] +movu m6, [r5 + 4 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1270 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1271 * 16], m4 + +; mode 21 [row 28] +movu m6, [r5 + 19 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 26], 1 +pinsrb m0, [r4 + 28], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrb m2, [r4 + 11], 1 +pinsrb m2, [r4 + 13], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1272 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 1], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 9], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1273 * 16], m4 + +; mode 21 [row 29] +movu m6, [r5 + 2 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1274 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1275 * 16], m4 + +; mode 21 [row 30] +movu m6, [r5 + 17 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 28], 1 +pinsrb m0, [r4 + 30], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrb m2, [r4 + 
13], 1 +pinsrb m2, [r4 + 15], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1276 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 0], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 8], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1277 * 16], m4 + +; mode21 [row 31] +pshufb m5, m0, [tab_S2] +movh [r0 + 1278 * 16], m5 +pshufb m5, m2, [tab_S2] +movh [r0 + 1278 * 16 + 8], m5 +pshufb m5, m1, [tab_S2] +movh [r0 + 1279 * 16], m5 +pshufb m5, m3, [tab_S2] +movh [r0 + 1279 * 16 + 8], m5 + +; mode 22 [row 0] +movu m6, [r5 + 19 * 16] +movu m0, [r3 ] +movu m1, [r3 + 1 ] +punpcklbw m0, m1 +pmaddubsw m1, m0, m6 +pmulhrsw m1, m7 +movu m2, [r3 + 8] +movu m3, [r3 + 9] +punpcklbw m2, m3 +pmaddubsw m3, m2, m6 +pmulhrsw m3, m7 +packuswb m1, m3 +movu [r0 + 1280 * 16], m1 + +movu m1, [r3 + 16] +movu m3, [r3 + 17] +punpcklbw m1, m3 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +movu m3, [r3 + 24] +movu m5, [r3 + 25] +punpcklbw m3, m5 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1281 * 16], m4 + +; mode 22 [row 1] +movu m6, [r5 + 6 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1282 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1283 * 16], m4 + +; mode 22 [row 2] +movu m6, [r5 + 25 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 0], 1 +pinsrb m0, [r4 + 2], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrw m2, [r3 + 7], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1284 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 15], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 23], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1285 * 16], m4 + +; mode 22 [row 3] +movu m6, [r5 + 12 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1286 * 16], m4 
+pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1287 * 16], m4 + +; mode 22 [row 4] +movu m6, [r5 + 31 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 2], 1 +pinsrb m0, [r4 + 5], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrw m2, [r3 + 6], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1288 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 14], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 22], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1289 * 16], m4 + +; mode 22 [row 5] +movu m6, [r5 + 18 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1290 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1291 * 16], m4 + +; mode 22 [row 6] +movu m6, [r5 + 5 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1292 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1293 * 16], m4 + +; mode 22 [row 7] +movu m6, [r5 + 24 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 5], 1 +pinsrb m0, [r4 + 7], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrw m2, [r3 + 5], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1294 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 13], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 21], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1295 * 16], m4 + +; mode 22 [row 8] +movu m6, [r5 + 11 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1296 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1297 * 16], m4 + +; mode 22 [row 9] +movu m6, [r5 + 30 * 16] +pslldq m0, 2 +pinsrb m0, 
[r4 + 7], 1 +pinsrb m0, [r4 + 10], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrw m2, [r3 + 4], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1298 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 12], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 20], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1299 * 16], m4 + +; mode 22 [row 10] +movu m6, [r5 + 17 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1300 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1301 * 16], m4 + +; mode 22 [row 11] +movu m6, [r5 + 4 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1302 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1303 * 16], m4 + +; mode 22 [row 12] +movu m6, [r5 + 23 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 10], 1 +pinsrb m0, [r4 + 12], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrw m2, [r3 + 3], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1304 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 11], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 19], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1305 * 16], m4 + +; mode 22 [row 13] +movu m6, [r5 + 10 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1306 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1307 * 16], m4 + +; mode 22 [row 14] +movu m6, [r5 + 29 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 12], 1 +pinsrb m0, [r4 + 15], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrw m2, [r3 + 2], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 
1308 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 10], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 18], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1309 * 16], m4 + +; mode 22 [row 15] +movu m6, [r5 + 16 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1310 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1311 * 16], m4 + +; mode 22 [row 16] +movu m6, [r5 + 3 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1312 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1313 * 16], m4 + +; mode 22 [row 17] +movu m6, [r5 + 22 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 15], 1 +pinsrb m0, [r4 + 17], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrw m2, [r3 + 1], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1314 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 9], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 17], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1315 * 16], m4 + +; mode 22 [row 18] +movu m6, [r5 + 9 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1316 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1317 * 16], m4 + +; mode 22 [row 19] +movu m6, [r5 + 28 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 17], 1 +pinsrb m0, [r4 + 20], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrw m2, [r3 + 0], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1318 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 8], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 16], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 
+movu [r0 + 1319 * 16], m4 + +; mode 22 [row 20] +movu m6, [r5 + 15 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1320 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1321 * 16], m4 + +; mode 22 [row 21] +movu m6, [r5 + 2 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1322 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1323 * 16], m4 + +; mode 22 [row 22] +movu m6, [r5 + 21 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 20], 1 +pinsrb m0, [r4 + 22], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrb m2, [r4 + 0], 1 +pinsrb m2, [r4 + 2], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1324 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 7], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 15], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1325 * 16], m4 + +; mode 22 [row 23] +movu m6, [r5 + 8 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1326 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1327 * 16], m4 + +; mode 22 [row 24] +movu m6, [r5 + 27 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 22], 1 +pinsrb m0, [r4 + 25], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrb m2, [r4 + 2], 1 +pinsrb m2, [r4 + 5], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1328 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 6], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 14], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1329 * 16], m4 + +; mode 22 [row 25] +movu m6, [r5 + 14 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 
+pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1330 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1331 * 16], m4 + +; mode 22 [row 26] +movu m6, [r5 + 1 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1332 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1333 * 16], m4 + +; mode 22 [row 27] +movu m6, [r5 + 20 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 25], 1 +pinsrb m0, [r4 + 27], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrb m2, [r4 + 5], 1 +pinsrb m2, [r4 + 7], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1334 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 5], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 13], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1335 * 16], m4 + +; mode 22 [row 28] +movu m6, [r5 + 7 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1336 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1337 * 16], m4 + +; mode 22 [row 29] +movu m6, [r5 + 26 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 27], 1 +pinsrb m0, [r4 + 30], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrb m2, [r4 + 7], 1 +pinsrb m2, [r4 + 10], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1338 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 4], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 12], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1339 * 16], m4 + +; mode 22 [row 30] +movu m6, [r5 + 13 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1340 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 
+packuswb m4, m5 +movu [r0 + 1341 * 16], m4 + +; mode22 [row 31] +pshufb m5, m0, [tab_S2] +movh [r0 + 1342 * 16], m5 +pshufb m5, m2, [tab_S2] +movh [r0 + 1342 * 16 + 8], m5 +pshufb m5, m1, [tab_S2] +movh [r0 + 1343 * 16], m5 +pshufb m5, m3, [tab_S2] +movh [r0 + 1343 * 16 + 8], m5 + +; mode 23 [row 0] +movu m6, [r5 + 23 * 16] +movu m0, [r3 ] +movu m1, [r3 + 1 ] +punpcklbw m0, m1 +pmaddubsw m1, m0, m6 +pmulhrsw m1, m7 +movu m2, [r3 + 8] +movu m3, [r3 + 9] +punpcklbw m2, m3 +pmaddubsw m3, m2, m6 +pmulhrsw m3, m7 +packuswb m1, m3 +movu [r0 + 1344 * 16], m1 + +movu m1, [r3 + 16] +movu m3, [r3 + 17] +punpcklbw m1, m3 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +movu m3, [r3 + 24] +movu m5, [r3 + 25] +punpcklbw m3, m5 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1345 * 16], m4 + +; mode 23 [row 1] +movu m6, [r5 + 14 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1346 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1347 * 16], m4 + +; mode 23 [row 2] +movu m6, [r5 + 5 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1348 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1349 * 16], m4 + +; mode 23 [row 3] +movu m6, [r5 + 28 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 0], 1 +pinsrb m0, [r4 + 4], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrw m2, [r3 + 7], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1350 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 15], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 23], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1351 * 16], m4 + +; mode 23 [row 4] +movu m6, [r5 + 19 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 
1352 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1353 * 16], m4 + +; mode 23 [row 5] +movu m6, [r5 + 10 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1354 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1355 * 16], m4 + +; mode 23 [row 6] +movu m6, [r5 + 1 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1356 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1357 * 16], m4 + +; mode 23 [row 7] +movu m6, [r5 + 24 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 4], 1 +pinsrb m0, [r4 + 7], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrw m2, [r3 + 6], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1358 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 14], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 22], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1359 * 16], m4 + +; mode 23 [row 8] +movu m6, [r5 + 15 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1360 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1361 * 16], m4 + +; mode 23 [row 9] +movu m6, [r5 + 6 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1362 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1363 * 16], m4 + +; mode 23 [row 10] +movu m6, [r5 + 29 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 7], 1 +pinsrb m0, [r4 + 11], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrw m2, [r3 + 5], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb 
m4, m5 +movu [r0 + 1364 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 13], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 21], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1365 * 16], m4 + +; mode 23 [row 11] +movu m6, [r5 + 20 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1366 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1367 * 16], m4 + +; mode 23 [row 12] +movu m6, [r5 + 11 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1368 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1369 * 16], m4 + +; mode 23 [row 13] +movu m6, [r5 + 2 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1370 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1371 * 16], m4 + +; mode 23 [row 14] +movu m6, [r5 + 25 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 11], 1 +pinsrb m0, [r4 + 14], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrw m2, [r3 + 4], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1372 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 12], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 20], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1373 * 16], m4 + +; mode 23 [row 15] +movu m6, [r5 + 16 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1374 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1375 * 16], m4 + +; mode 23 [row 16] +movu m6, [r5 + 7 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 
+packuswb m4, m5 +movu [r0 + 1376 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1377 * 16], m4 + +; mode 23 [row 17] +movu m6, [r5 + 30 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 14], 1 +pinsrb m0, [r4 + 18], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrw m2, [r3 + 3], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1378 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 11], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 19], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1379 * 16], m4 + +; mode 23 [row 18] +movu m6, [r5 + 21 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1380 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1381 * 16], m4 + +; mode 23 [row 19] +movu m6, [r5 + 12 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1382 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1383 * 16], m4 + +; mode 23 [row 20] +movu m6, [r5 + 3 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1384 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1385 * 16], m4 + +; mode 23 [row 21] +movu m6, [r5 + 26 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 18], 1 +pinsrb m0, [r4 + 21], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrw m2, [r3 + 2], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1386 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 10], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 18], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1387 * 16], m4 + +; mode 23 [row 
22] +movu m6, [r5 + 17 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1388 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1389 * 16], m4 + +; mode 23 [row 23] +movu m6, [r5 + 8 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1390 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1391 * 16], m4 + +; mode 23 [row 24] +movu m6, [r5 + 31 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 21], 1 +pinsrb m0, [r4 + 25], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrw m2, [r3 + 1], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1392 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 9], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 17], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1393 * 16], m4 + +; mode 23 [row 25] +movu m6, [r5 + 22 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1394 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1395 * 16], m4 + +; mode 23 [row 26] +movu m6, [r5 + 13 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1396 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1397 * 16], m4 + +; mode 23 [row 27] +movu m6, [r5 + 4 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1398 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1399 * 16], m4 + +; mode 23 [row 28] +movu m6, [r5 + 27 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 25], 1 +pinsrb 
m0, [r4 + 28], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrw m2, [r3 + 0], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1400 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 8], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 16], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1401 * 16], m4 + +; mode 23 [row 29] +movu m6, [r5 + 18 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1402 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1403 * 16], m4 + +; mode 23 [row 30] +movu m6, [r5 + 9 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1404 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1405 * 16], m4 + +; mode23 [row 31] +pshufb m5, m0, [tab_S2] +movh [r0 + 1406 * 16], m5 +pshufb m5, m2, [tab_S2] +movh [r0 + 1406 * 16 + 8], m5 +pshufb m5, m1, [tab_S2] +movh [r0 + 1407 * 16], m5 +pshufb m5, m3, [tab_S2] +movh [r0 + 1407 * 16 + 8], m5 + +; mode 24 [row 0] +movu m6, [r5 + 27 * 16] +movu m0, [r3 ] +movu m1, [r3 + 1 ] +punpcklbw m0, m1 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +movu m2, [r3 + 8] +movu m3, [r3 + 9] +punpcklbw m2, m3 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1408 * 16], m4 + +movu m1, [r3 + 16] +movu m3, [r3 + 17] +punpcklbw m1, m3 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +movu m3, [r3 + 24] +movu m5, [r3 + 25] +punpcklbw m3, m5 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1409 * 16], m4 + +; mode 24 [row 1] +movu m6, [r5 + 22 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1410 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1411 * 16], 
m4 + +; mode 24 [row 2] +movu m6, [r5 + 17 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1412 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1413 * 16], m4 + +; mode 24 [row 3] +movu m6, [r5 + 12 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1414 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1415 * 16], m4 + +; mode 24 [row 4] +movu m6, [r5 + 7 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1416 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1417 * 16], m4 + +; mode 24 [row 5] +movu m6, [r5 + 2 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1418 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1419 * 16], m4 + +; mode 24 [row 6] +movu m6, [r5 + 29 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 0], 1 +pinsrb m0, [r4 + 6], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrw m2, [r3 + 7], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1420 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 15], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 23], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1421 * 16], m4 + +; mode 24 [row 7] +movu m6, [r5 + 24 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1422 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1423 * 16], m4 + +; mode 24 [row 8] +movu m6, [r5 + 19 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw 
m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1424 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1425 * 16], m4 + +; mode 24 [row 9] +movu m6, [r5 + 14 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1426 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1427 * 16], m4 + +; mode 24 [row 10] +movu m6, [r5 + 9 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1428 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1429 * 16], m4 + +; mode 24 [row 11] +movu m6, [r5 + 4 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1430 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1431 * 16], m4 + +; mode 24 [row 12] +movu m6, [r5 + 31 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 6], 1 +pinsrb m0, [r4 + 13], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrw m2, [r3 + 6], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1432 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 14], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 22], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1433 * 16], m4 + +; mode 24 [row 13] +movu m6, [r5 + 26 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1434 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1435 * 16], m4 + +; mode 24 [row 14] +movu m6, [r5 + 21 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 
1436 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1437 * 16], m4 + +; mode 24 [row 15] +movu m6, [r5 + 16 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1438 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1439 * 16], m4 + +; mode 24 [row 16] +movu m6, [r5 + 11 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1440 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1441 * 16], m4 + +; mode 24 [row 17] +movu m6, [r5 + 6 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1442 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1443 * 16], m4 + +; mode 24 [row 18] +movu m6, [r5 + 1 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1444 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1445 * 16], m4 + +; mode 24 [row 19] +movu m6, [r5 + 28 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 13], 1 +pinsrb m0, [r4 + 19], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrw m2, [r3 + 5], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1446 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 13], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 21], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1447 * 16], m4 + +; mode 24 [row 20] +movu m6, [r5 + 23 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1448 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 
+pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1449 * 16], m4 + +; mode 24 [row 21] +movu m6, [r5 + 18 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1450 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1451 * 16], m4 + +; mode 24 [row 22] +movu m6, [r5 + 13 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1452 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1453 * 16], m4 + +; mode 24 [row 23] +movu m6, [r5 + 8 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1454 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1455 * 16], m4 + +; mode 24 [row 24] +movu m6, [r5 + 3 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1456 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1457 * 16], m4 + +; mode 24 [row 25] +movu m6, [r5 + 30 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 19], 1 +pinsrb m0, [r4 + 26], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrw m2, [r3 + 4], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1458 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 12], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 20], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1459 * 16], m4 + +; mode 24 [row 26] +movu m6, [r5 + 25 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1460 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1461 * 16], m4 + +; mode 24 
[row 27] +movu m6, [r5 + 20 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1462 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1463 * 16], m4 + +; mode 24 [row 28] +movu m6, [r5 + 15 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1464 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1465 * 16], m4 + +; mode 24 [row 29] +movu m6, [r5 + 10 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1466 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1467 * 16], m4 + +; mode 24 [row 30] +movu m6, [r5 + 5 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1468 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1469 * 16], m4 + +; mode 24 [row 31] +pshufb m5, m0, [tab_S2] +movh [r0 + 1470 * 16], m5 +pshufb m5, m2, [tab_S2] +movh [r0 + 1470 * 16 + 8], m5 +pshufb m5, m1, [tab_S2] +movh [r0 + 1471 * 16], m5 +pshufb m5, m3, [tab_S2] +movh [r0 + 1471 * 16 + 8], m5 + +; mode 25 [row 0] +movu m6, [r5 + 30 * 16] +movu m0, [r3 ] +movu m1, [r3 + 1 ] +punpcklbw m0, m1 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +movu m2, [r3 + 8] +movu m3, [r3 + 9] +punpcklbw m2, m3 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1472 * 16], m4 + +movu m1, [r3 + 16] +movu m3, [r3 + 17] +punpcklbw m1, m3 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +movu m3, [r3 + 24] +movu m5, [r3 + 25] +punpcklbw m3, m5 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1473 * 16], m4 + +; mode 25 [row 1] +movu m6, [r5 + 28 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 
+pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1474 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1475 * 16], m4 + +; mode 25 [row 2] +movu m6, [r5 + 26 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1476 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1477 * 16], m4 + +; mode 25 [row 3] +movu m6, [r5 + 24 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1478 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1479 * 16], m4 + +; mode 25 [row 4] +movu m6, [r5 + 22 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1480 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1481 * 16], m4 + +; mode 25 [row 5] +movu m6, [r5 + 20 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1482 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1483 * 16], m4 + +; mode 25 [row 6] +movu m6, [r5 + 18 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1484 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1485 * 16], m4 + +; mode 25 [row 7] +movu m6, [r5 + 16 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1486 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1487 * 16], m4 + +; mode 25 [row 8] +movu m6, [r5 + 14 * 16] 
+pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1488 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1489 * 16], m4 + +; mode 25 [row 9] +movu m6, [r5 + 12 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1490 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1491 * 16], m4 + +; mode 25 [row 10] +movu m6, [r5 + 10 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1492 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1493 * 16], m4 + +; mode 25 [row 11] +movu m6, [r5 + 8 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1494 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1495 * 16], m4 + +; mode 25 [row 12] +movu m6, [r5 + 6 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1496 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1497 * 16], m4 + +; mode 25 [row 13] +movu m6, [r5 + 4 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1498 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1499 * 16], m4 + +; mode 25 [row 14] +movu m6, [r5 + 2 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1500 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1501 * 16], m4 + +; mode 25 
[row 15] +pshufb m5, m0, [tab_S2] +movh [r0 + 1502 * 16], m5 +pshufb m5, m2, [tab_S2] +movh [r0 + 1502 * 16 + 8], m5 +pshufb m5, m1, [tab_S2] +movh [r0 + 1503 * 16], m5 +pshufb m5, m3, [tab_S2] +movh [r0 + 1503 * 16 + 8], m5 + +; mode 25 [row 16] +movu m6, [r5 + 30 * 16] +pslldq m0, 2 +pinsrb m0, [r4 + 0], 1 +pinsrb m0, [r4 + 16], 0 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pslldq m2, 2 +pinsrw m2, [r3 + 7], 0 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1504 * 16], m4 +pslldq m1, 2 +pinsrw m1, [r3 + 15], 0 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pslldq m3, 2 +pinsrw m3, [r3 + 23], 0 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1505 * 16], m4 + +; mode 25 [row 17] +movu m6, [r5 + 28 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1506 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1507 * 16], m4 + +; mode 25 [row 18] +movu m6, [r5 + 26 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1508 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1509 * 16], m4 + +; mode 25 [row 19] +movu m6, [r5 + 24 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1510 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1511 * 16], m4 + +; mode 25 [row 20] +movu m6, [r5 + 22 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1512 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1513 * 16], m4 + +; mode 25 [row 21] +movu m6, [r5 + 20 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, 
m5 +movu [r0 + 1514 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1515 * 16], m4 + +; mode 25 [row 22] +movu m6, [r5 + 18 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1516 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1517 * 16], m4 + +; mode 25 [row 23] +movu m6, [r5 + 16 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1518 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1519 * 16], m4 + +; mode 25 [row 24] +movu m6, [r5 + 14 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1520 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1521 * 16], m4 + +; mode 25 [row 25] +movu m6, [r5 + 12 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1522 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1523 * 16], m4 + +; mode 25 [row 26] +movu m6, [r5 + 10 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1524 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1525 * 16], m4 + +; mode 25 [row 27] +movu m6, [r5 + 8 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1526 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1527 * 16], m4 + +; mode 25 [row 28] +movu m6, [r5 + 6 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, 
m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1528 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1529 * 16], m4 + +; mode 25 [row 29] +movu m6, [r5 + 4 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1530 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1531 * 16], m4 + +; mode 25 [row 30] +movu m6, [r5 + 2 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1532 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1533 * 16], m4 + +; mode 25 [row 31] +pshufb m5, m0, [tab_S2] +movh [r0 + 1534 * 16], m5 +pshufb m5, m2, [tab_S2] +movh [r0 + 1534 * 16 + 8], m5 +pshufb m5, m1, [tab_S2] +movh [r0 + 1535 * 16], m5 +pshufb m5, m3, [tab_S2] +movh [r0 + 1535 * 16 + 8], m5 + +; mode 26 +movu m1, [r1 + 1] +movu m2, [r1 + 17] +movu [r0 + 1536 * 16], m1 +movu [r0 + 1537 * 16], m2 +movu [r0 + 1538 * 16], m1 +movu [r0 + 1539 * 16], m2 +movu [r0 + 1540 * 16], m1 +movu [r0 + 1541 * 16], m2 +movu [r0 + 1542 * 16], m1 +movu [r0 + 1543 * 16], m2 +movu [r0 + 1544 * 16], m1 +movu [r0 + 1545 * 16], m2 +movu [r0 + 1546 * 16], m1 +movu [r0 + 1547 * 16], m2 +movu [r0 + 1548 * 16], m1 +movu [r0 + 1549 * 16], m2 +movu [r0 + 1550 * 16], m1 +movu [r0 + 1551 * 16], m2 + +movu [r0 + 1552 * 16], m1 +movu [r0 + 1553 * 16], m2 +movu [r0 + 1554 * 16], m1 +movu [r0 + 1555 * 16], m2 +movu [r0 + 1556 * 16], m1 +movu [r0 + 1557 * 16], m2 +movu [r0 + 1558 * 16], m1 +movu [r0 + 1559 * 16], m2 +movu [r0 + 1560 * 16], m1 +movu [r0 + 1561 * 16], m2 +movu [r0 + 1562 * 16], m1 +movu [r0 + 1563 * 16], m2 +movu [r0 + 1564 * 16], m1 +movu [r0 + 1565 * 16], m2 +movu [r0 + 1566 * 16], m1 +movu [r0 + 1567 * 16], m2 + +movu [r0 + 1568 * 16], m1 +movu [r0 + 1569 * 16], m2 +movu [r0 + 1570 
* 16], m1 +movu [r0 + 1571 * 16], m2 +movu [r0 + 1572 * 16], m1 +movu [r0 + 1573 * 16], m2 +movu [r0 + 1574 * 16], m1 +movu [r0 + 1575 * 16], m2 +movu [r0 + 1576 * 16], m1 +movu [r0 + 1577 * 16], m2 +movu [r0 + 1578 * 16], m1 +movu [r0 + 1579 * 16], m2 +movu [r0 + 1580 * 16], m1 +movu [r0 + 1581 * 16], m2 +movu [r0 + 1582 * 16], m1 +movu [r0 + 1583 * 16], m2 + +movu [r0 + 1584 * 16], m1 +movu [r0 + 1585 * 16], m2 +movu [r0 + 1586 * 16], m1 +movu [r0 + 1587 * 16], m2 +movu [r0 + 1588 * 16], m1 +movu [r0 + 1589 * 16], m2 +movu [r0 + 1590 * 16], m1 +movu [r0 + 1591 * 16], m2 +movu [r0 + 1592 * 16], m1 +movu [r0 + 1593 * 16], m2 +movu [r0 + 1594 * 16], m1 +movu [r0 + 1595 * 16], m2 +movu [r0 + 1596 * 16], m1 +movu [r0 + 1597 * 16], m2 +movu [r0 + 1598 * 16], m1 +movu [r0 + 1599 * 16], m2 + +; mode 27 [row 0] +movu m6, [r5 + 2 * 16] +movu m0, [r3 + 1 ] +movu m1, [r3 + 2 ] +punpcklbw m0, m1 +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +movu m2, [r3 + 9] +movu m3, [r3 + 10] +punpcklbw m2, m3 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1600 * 16], m4 + +movu m1, [r3 + 17] +movu m3, [r3 + 18] +punpcklbw m1, m3 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +movu m3, [r3 + 25] +movu m5, [r3 + 26] +punpcklbw m3, m5 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1601 * 16], m4 + +; mode 27 [row 1] +movu m6, [r5 + 4 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1602 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1603 * 16], m4 + +; mode 27 [row 2] +movu m6, [r5 + 6 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1604 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1605 * 16], m4 + +; mode 27 [row 3] +movu m6, [r5 + 8 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, 
m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1606 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1607 * 16], m4 + +; mode 27 [row 4] +movu m6, [r5 + 10 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1608 * 16], m4 + +; mode 28 [row 1 -first half] +movu [r0 + 1666 * 16], m4 + +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1609 * 16], m4 + +; mode 28 [row 1 - second half] +movu [r0 + 1667 * 16], m4 + +; mode 27 [row 5] +movu m6, [r5 + 12 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1610 * 16], m4 + +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1611 * 16], m4 + +; mode 27 [row 6] +movu m6, [r5 + 14 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1612 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1613 * 16], m4 + +; mode 27 [row 7] +movu m6, [r5 + 16 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1614 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1615 * 16], m4 + +; mode 27 [row 8] +movu m6, [r5 + 18 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1616 * 16], m4 + +; mode 29 [row 1 - first half] +movu [r0 + 1730 * 16], m4 + +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1617 * 16], m4 + +; mode 29 [row 1 - second half] +movu [r0 + 1731 * 16], m4 + +; mode 27 [row 9] +movu m6, [r5 + 20 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 
+pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1618 * 16], m4 + +; mode 28 [row 3 -first half] +movu [r0 + 1670 * 16], m4 + +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1619 * 16], m4 + +; mode 28 [row 3 -second half] +movu [r0 + 1671 * 16], m4 + +; mode 27 [row 10] +movu m6, [r5 + 22 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1620 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1621 * 16], m4 + +; mode 27 [row 11] +movu m6, [r5 + 24 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1622 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1623 * 16], m4 + +; mode 27 [row 12] +movu m6, [r5 + 26 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1624 * 16], m4 + +; mode 30 [row 1 - first half] +movu [r0 + 1794 * 16], m4 + +; mode 33 [row 0 - first half] +movu [r0 + 1984 * 16], m4 + +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1625 * 16], m4 + +; mode 30 [row 1 - second half] +movu [r0 + 1795 * 16], m4 + +; mode 33 [row 0 - second half] +movu [r0 + 1985 * 16], m4 + +; mode 27 [row 13] +movu m6, [r5 + 28 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1626 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1627 * 16], m4 + +; mode 27 [row 14] +movu m6, [r5 + 30 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1628 * 16], m4 + +; mode 28 [row 5 first half] +movu [r0 + 1674 * 16], m4 + +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw 
m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1629 * 16], m4 + +; mode 28 [row 5 second half] +movu [r0 + 1675 * 16], m4 + +; mode 28 [row 0] +movu m6, [r5 + 5 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1664 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1665 * 16], m4 + +; mode 28 [row 2] +movu m6, [r5 + 15 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1668 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1669 * 16], m4 + +; mode 28 [row 4] +movu m6, [r5 + 25 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1672 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1673 * 16], m4 + +; mode 30 [row 0] +movu m6, [r5 + 13 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1792 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1793 * 16], m4 + +; mode 29 [row 0] +movu m6, [r5 + 9 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1728 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1729 * 16], m4 + +; mode 29 [row 2] +movu m6, [r5 + 27 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1732 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1733 * 16], m4 + +; mode 31 [row 0] +movu m6, [r5 + 17 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 
+packuswb m4, m5 +movu [r0 + 1856 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1857 * 16], m4 + +; mode 32 [row 0] +movu m6, [r5 + 21 * 16] +pmaddubsw m4, m0, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1920 * 16], m4 +pmaddubsw m4, m1, m6 +pmulhrsw m4, m7 +pmaddubsw m5, m3, m6 +pmulhrsw m5, m7 +packuswb m4, m5 +movu [r0 + 1921 * 16], m4 + +; mode 27 [row 15] +movu m0, [r3 + 2] +movd m1, [r3 + 3] +palignr m1, m0, 1 +punpcklbw m0, m1 +movu m2, [r3 + 10] +movd m3, [r3 + 11] +palignr m3, m2, 1 +punpcklbw m2, m3 +movu m1, [r3 + 18] +movd m3, [r3 + 19] +palignr m3, m1, 1 +punpcklbw m1, m3 +movu m4, [r3 + 26] +movd m5, [r3 + 27] +palignr m5, m4, 1 +punpcklbw m4, m5 + +pshufb m5, m0, [tab_S2] +movh [r0 + 1630 * 16], m5 +pshufb m5, m2, [tab_S2] +movh [r0 + 1630 * 16 + 8], m5 +pshufb m5, m1, [tab_S2] +movh [r0 + 1631 * 16], m5 +pshufb m5, m4, [tab_S2] +movh [r0 + 1631 * 16 + 8], m5 + +; mode 27 [row 16] +movu m6, [r5 + 2 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1632 * 16], m3 + +; mode 31 [row 1 - first half] +movu [r0 + 1858 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1633 * 16], m3 + +; mode 31 [row 1 - second half] +movu [r0 + 1859 * 16], m3 + +; mode 27 [row 17] +movu m6, [r5 + 4 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1634 * 16], m3 + +; mode 29 [row 3 - first half] +movu [r0 + 1734 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1635 * 16], m3 + +; mode 29 [row 3 - second half] +movu [r0 + 1735 * 16], m3 + +; mode 27 [row 18] +movu m6, [r5 + 6 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1636 * 16], m3 
+pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1637 * 16], m3 + +; mode 27 [row 19] +movu m6, [r5 + 8 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1638 * 16], m3 + +; mode 28 [row 7 - first half] +movu [r0 + 1678 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1639 * 16], m3 + +; mode 28 [row 7 - second half] +movu [r0 + 1679 * 16], m3 + +; mode 27 [row 20] +movu m6, [r5 + 10 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1640 * 16], m3 + +; mode 32 [row 1 - first half] +movu [r0 + 1922 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1641 * 16], m3 + +; mode 32 [row 1 - second half] +movu [r0 + 1923 * 16], m3 + +; mode 27 [row 21] +movu m6, [r5 + 12 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1642 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1643 * 16], m3 + +; mode 27 [row 22] +movu m6, [r5 + 14 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1644 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1645 * 16], m3 + +; mode 27 [row 23] +movu m6, [r5 + 16 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1646 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1647 * 16], m3 + +; mode 27 [row 24] +movu m6, [r5 + 18 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1648 * 16], m3 + +; 
mode 28 [row 9 - first half] +movu [r0 + 1682 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1649 * 16], m3 + +; mode 28 [row 9 - second half] +movu [r0 + 1683 * 16], m3 + +; mode 27 [row 25] +movu m6, [r5 + 20 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1650 * 16], m3 + +; mode 30 [row 3 - first half] +movu [r0 + 1798 * 16], m3 + +; mode 33 [row 1 - first half] +movu [r0 + 1986 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1651 * 16], m3 + +; mode 30 [row 3 - second half] +movu [r0 + 1799 * 16], m3 + +; mode 33 [row 1 - second half] +movu [r0 + 1987 * 16], m3 + +; mode 27 [row 26] +movu m6, [r5 + 22 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1652 * 16], m3 + +; mode 29 [row 5 - first half] +movu [r0 + 1738 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1653 * 16], m3 + +; mode 29 [row 5 - second half] +movu [r0 + 1739 * 16], m3 + +; mode 27 [row 27] +movu m6, [r5 + 24 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1654 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1655 * 16], m3 + +; mode 27 [row 28] +movu m6, [r5 + 26 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1656 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1657 * 16], m3 + +; mode 27 [row 29] +movu m6, [r5 + 28 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1658 * 16], m3 + +; mode 28 [row 11 - first half] +movu [r0 + 1686 
* 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1659 * 16], m3 + +; mode 28 [row 11 - second half] +movu [r0 + 1687 * 16], m3 + +; mode 27 [row 30] +movu m6, [r5 + 30 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1660 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1661 * 16], m3 + +; mode 28 [row 6] +movu m6, [r5 + 3 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1676 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1677 * 16], m3 + +; mode 28 [row 8] +movu m6, [r5 + 13 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1680 * 16], m3 + +; mode 29 [row 4 - first half] +movu [r0 + 1736 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1681 * 16], m3 + +; mode 29 [row 4 - second half] +movu [r0 + 1737 * 16], m3 + +; mode 28 [row 10] +movu m6, [r5 + 23 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1684 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1685 * 16], m3 + +; mode 29 [row 6] +movu m6, [r5 + 31 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1740 * 16], m3 + +; mode 32 [row 2 - first half] +movu [r0 + 1924 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1741 * 16], m3 + +; mode 32 [row 2 - second half] +movu [r0 + 1925 * 16], m3 + +; mode 30 [row 2] +movu m6, [r5 + 7 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, 
m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1796 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1797 * 16], m3 + +; mode 31 [row 2] +movu m6, [r5 + 19 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1860 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1861 * 16], m3 + +; mode 27 [row 15] +movu m0, [r3 + 3] +movd m1, [r3 + 4] +palignr m1, m0, 1 +punpcklbw m0, m1 +movu m2, [r3 + 11] +movd m3, [r3 + 12] +palignr m3, m2, 1 +punpcklbw m2, m3 +movu m1, [r3 + 19] +movd m3, [r3 + 20] +palignr m3, m1, 1 +punpcklbw m1, m3 +movu m4, [r3 + 27] +movd m5, [r3 + 28] +palignr m5, m4, 1 +punpcklbw m4, m5 + +pshufb m5, m0, [tab_S2] +movh [r0 + 1662 * 16], m5 +pshufb m5, m2, [tab_S2] +movh [r0 + 1662 * 16 + 8], m5 +pshufb m5, m1, [tab_S2] +movh [r0 + 1663 * 16], m5 +pshufb m5, m4, [tab_S2] +movh [r0 + 1663 * 16 + 8], m5 + +; mode 28 [row 12] +movu m6, [r5 + 1 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1688 * 16], m3 + +; mode 30 [row 4 - first half] +movu [r0 + 1800 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1689 * 16], m3 + +; mode 30 [row 4 - second half] +movu [r0 + 1801 * 16], m3 + +; mode 28 [row 13] +movu m6, [r5 + 6 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1690 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1691 * 16], m3 + +; mode 28 [row 14] +movu m6, [r5 + 11 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1692 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 
+movu [r0 + 1693 * 16], m3 + +; mode 28 [row 15] +movu m6, [r5 + 16 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1694 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1695 * 16], m3 + +; mode 28 [row 16] +movu m6, [r5 + 21 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1696 * 16], m3 + +; mode 31 [row 4 - first half] +movu [r0 + 1864 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1697 * 16], m3 + +; mode 31 [row 4 - second half] +movu [r0 + 1865 * 16], m3 + +; mode 28 [row 17] +movu m6, [r5 + 26 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1698 * 16], m3 + +; mode 29 [row 9 - first half] +movu [r0 + 1746 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1699 * 16], m3 + +; mode 29 [row 9 - second half] +movu [r0 + 1747 * 16], m3 + +; mode 28 [row 18] +movu m6, [r5 + 31 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1700 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1701 * 16], m3 + +; mode 29 [row 7] +movu m6, [r5 + 8 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1742 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1743 * 16], m3 + +; mode 29 [row 8] +movu m6, [r5 + 17 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1744 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu 
[r0 + 1745 * 16], m3 + +; mode 30 [row 5] +movu m6, [r5 + 14 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1802 * 16], m3 + +; mode 33 [row 2 - first half] +movu [r0 + 1988 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1803 * 16], m3 + +; mode 33 [row 2 - second half] +movu [r0 + 1989 * 16], m3 + +; mode 30 [row 6] +movu m6, [r5 + 27 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1804 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1805 * 16], m3 + +; mode 31 [row 3] +movu m6, [r5 + 4 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1862 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1863 * 16], m3 + +; mode 32 [row 3] +movu m6, [r5 + 20 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1926 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1927 * 16], m3 + +; mode 28 [row 19] +movu m6, [r5 + 4 * 16] +movu m0, [r3 + 4] +movd m1, [r3 + 5] +palignr m1, m0, 1 +punpcklbw m0, m1 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +movu m2, [r3 + 12] +movd m4, [r3 + 13] +palignr m4, m2, 1 +punpcklbw m2, m4 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1702 * 16], m3 + +movu m1, [r3 + 20] +movd m3, [r3 + 21] +palignr m3, m1, 1 +punpcklbw m1, m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +movu m4, [r3 + 28] +movd m5, [r3 + 29] +palignr m5, m4, 1 +punpcklbw m4, m5 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1703 * 16], m3 + +; mode 28 [row 20] +movu m6, [r5 + 9 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw 
m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1704 * 16], m3 + +; mode 32 [row 4 - first half] +movu [r0 + 1928 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1705 * 16], m3 + +; mode 32 [row 4 - second half] +movu [r0 + 1929 * 16], m3 + +; mode 28 [row 21] +movu m6, [r5 + 14 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1706 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1707 * 16], m3 + +; mode 28 [row 22] +movu m6, [r5 + 19 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1708 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1709 * 16], m3 + +; mode 28 [row 23] +movu m6, [r5 + 24 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1710 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1711 * 16], m3 + +; mode 28 [row 24] +movu m6, [r5 + 29 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1712 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1713 * 16], m3 + +; mode 29 [row 10] +movu m6, [r5 + 3 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1748 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1749 * 16], m3 + +; mode 29 [row 11] +movu m6, [r5 + 12 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1750 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, 
m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1751 * 16], m3 + +; mode 29 [row 12] +movu m6, [r5 + 21 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1752 * 16], m3 + +; mode 30 [row 8 -first half] +movu [r0 + 1808 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1753 * 16], m3 + +; mode 30 [row 8 -second half] +movu [r0 + 1809 * 16], m3 + +; mode 29 [row 13] +movu m6, [r5 + 30 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1754 * 16], m3 + +; mode 32 [row 5 - first half] +movu [r0 + 1930 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1755 * 16], m3 + +; mode 32 [row 5 - second half] +movu [r0 + 1931 * 16], m3 + +; mode 30 [row 7] +movu m6, [r5 + 8 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1806 * 16], m3 + +; mode 33 [row 3 - first half] +movu [r0 + 1990 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1807 * 16], m3 + +; mode 33 [row 3 - second half] +movu [r0 + 1991 * 16], m3 + +; mode 31 [row 5] +movu m6, [r5 + 6 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1866 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1867 * 16], m3 + +; mode 31 [row 6] +movu m6, [r5 + 23 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1868 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1869 * 16], m3 + +; mode 28 [row 25] +movu m6, [r5 + 2 * 16] +movu m0, [r3 + 5] +movd m1, [r3 + 6] +palignr m1, m0, 1 
+punpcklbw m0, m1 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +movu m2, [r3 + 13] +movd m4, [r3 + 14] +palignr m4, m2, 1 +punpcklbw m2, m4 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1714 * 16], m3 + +movu m1, [r3 + 21] +movd m3, [r3 + 22] +palignr m3, m1, 1 +punpcklbw m1, m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +movu m4, [r3 + 29] +movd m5, [r3 + 30] +palignr m5, m4, 1 +punpcklbw m4, m5 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1715 * 16], m3 + +; mode 28 [row 26] +movu m6, [r5 + 7 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1716 * 16], m3 + +; mode 29 [row 14 - first half] +movu [r0 + 1756 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1717 * 16], m3 + +; mode 29 [row 14 - second half] +movu [r0 + 1757 * 16], m3 + +; mode 28 [row 27] +movu m6, [r5 + 12 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1718 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1719 * 16], m3 + +; mode 28 [row 28] +movu m6, [r5 + 17 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1720 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1721 * 16], m3 + +; mode 28 [row 29] +movu m6, [r5 + 22 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1722 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1723 * 16], m3 + +; mode 28 [row 30] +movu m6, [r5 + 27 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1724 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 
+pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1725 * 16], m3 + +; mode 29 [row 15] +movu m6, [r5 + 16 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1758 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1759 * 16], m3 + +; mode 29 [row 16] +movu m6, [r5 + 25 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1760 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1761 * 16], m3 + +; mode 30 [row 9] +movu m6, [r5 + 2 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1810 * 16], m3 + +; mode 33 [row 4 - first half] +movu [r0 + 1992 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1811 * 16], m3 + +; mode 33 [row 4 - second half] +movu [r0 + 1993 * 16], m3 + +; mode 30 [row 10] +movu m6, [r5 + 15 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1812 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1813 * 16], m3 + +; mode 31 [row 7] +movu m6, [r5 + 8 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1870 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1871 * 16], m3 + +; mode 31 [row 8] +movu m6, [r5 + 25 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1872 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1873 * 16], m3 + +; mode 32 [row 6] +movu m6, [r5 + 19 * 16] 
+pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1932 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1933 * 16], m3 + +; mode 30 [row 11] +movu m6, [r5 + 28 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1814 * 16], m3 + +; mode 33 [row 5 - first half] +movu [r0 + 1994 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1815 * 16], m3 + +; mode 33 [row 5 - second half] +movu [r0 + 1995 * 16], m3 + +; mode 28 [row 31] +movu m0, [r3 + 6] +movd m1, [r3 + 7] +palignr m1, m0, 1 +punpcklbw m0, m1 +movu m2, [r3 + 14] +movd m3, [r3 + 15] +palignr m3, m2, 1 +punpcklbw m2, m3 +movu m1, [r3 + 22] +movd m3, [r3 + 23] +palignr m3, m1, 1 +punpcklbw m1, m3 +movu m4, [r3 + 30] +movd m5, [r3 + 31] +palignr m5, m4, 1 +punpcklbw m4, m5 + +pshufb m5, m0, [tab_S2] +movh [r0 + 1726 * 16], m5 +pshufb m5, m2, [tab_S2] +movh [r0 + 1726 * 16 + 8], m5 +pshufb m5, m1, [tab_S2] +movh [r0 + 1727 * 16], m5 +pshufb m5, m4, [tab_S2] +movh [r0 + 1727 * 16 + 8], m5 + +; mode 29 [row 17] +movu m6, [r5 + 2 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1762 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1763 * 16], m3 + +; mode 29 [row 18] +movu m6, [r5 + 11 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1764 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1765 * 16], m3 + +; mode 29 [row 19] +movu m6, [r5 + 20 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1766 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 
+pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1767 * 16], m3 + +; mode 29 [row 20] +movu m6, [r5 + 29 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1768 * 16], m3 + +; mode 32 [row 8 - first halif] +movu [r0 + 1936 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1769 * 16], m3 + +; mode 32 [row 8 - second halif] +movu [r0 + 1937 * 16], m3 + +; mode 30 [row 12] +movu m6, [r5 + 9 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1816 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1817 * 16], m3 + +; mode 30 [row 13] +movu m6, [r5 + 22 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1818 * 16], m3 + +; mode 33 [row 6 - first half] +movu [r0 + 1996 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1819 * 16], m3 + +; mode 33 [row 6 - second half] +movu [r0 + 1997 * 16], m3 + +; mode 31 [row 9] +movu m6, [r5 + 10 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1874 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1875 * 16], m3 + +; mode 31 [row 10] +movu m6, [r5 + 27 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1876 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1877 * 16], m3 + +; mode 32 [row 7] +movu m6, [r5 + 8 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1934 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 
+pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1935 * 16], m3 + +; mode 29 [row 21] +movu m6, [r5 + 6 * 16] +movu m0, [r3 + 7] +movd m1, [r3 + 8] +palignr m1, m0, 1 +punpcklbw m0, m1 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +movu m2, [r3 + 15] +movd m4, [r3 + 16] +palignr m4, m2, 1 +punpcklbw m2, m4 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1770 * 16], m3 + +movu m1, [r3 + 23] +movd m3, [r3 + 24] +palignr m3, m1, 1 +punpcklbw m1, m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +movu m4, [r3 + 31] +movd m5, [r3 + 32] +palignr m5, m4, 1 +punpcklbw m4, m5 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1771 * 16], m3 + +; mode 29 [row 22] +movu m6, [r5 + 15 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1772 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1773 * 16], m3 + +; mode 29 [row 23] +movu m6, [r5 + 24 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1774 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1775 * 16], m3 + +; mode 30 [row 14] +movu m6, [r5 + 3 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1820 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1821 * 16], m3 + +; mode 30 [row 15] +movu m6, [r5 + 16 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1822 * 16], m3 + +; mode 33 [row 7 - first half] +movu [r0 + 1998 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1823 * 16], m3 + +; mode 33 [row 7 - second half] +movu [r0 + 1999 * 16], m3 + +; mode 30 [row 16] +movu 
m6, [r5 + 29 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1824 * 16], m3 + +; mode 31 [row 12 - first half] +movu [r0 + 1880 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1825 * 16], m3 + +; mode 31 [row 12 - second half] +movu [r0 + 1881 * 16], m3 + +; mode 31 [row 11] +movu m6, [r5 + 12 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1878 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1879 * 16], m3 + +; mode 32 [row 9] +movu m6, [r5 + 18 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1938 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1939 * 16], m3 + +; mode 29 [row 24] +movu m6, [r5 + 1 * 16] +movu m0, [r3 + 8] +movd m1, [r3 + 9] +palignr m1, m0, 1 +punpcklbw m0, m1 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +movu m2, [r3 + 16] +movd m4, [r3 + 17] +palignr m4, m2, 1 +punpcklbw m2, m4 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1776 * 16], m3 + +movu m1, [r3 + 24] +movd m3, [r3 + 25] +palignr m3, m1, 1 +punpcklbw m1, m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +movu m4, [r3 + 32] +movd m5, [r3 + 33] +palignr m5, m4, 1 +punpcklbw m4, m5 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1777 * 16], m3 + +; mode 29 [row 25] +movu m6, [r5 + 10 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1778 * 16], m3 + +; mode 30 [row 17 - first half] +movu [r0 + 1826 * 16], m3 + +; mode 33 [row 8 - first half] +movu [r0 + 2000 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1779 * 16], m3 + +; mode 30 
[row 17 - second half] +movu [r0 + 1827 * 16], m3 + +; mode 33 [row 8 - second half] +movu [r0 + 2001 * 16], m3 + +; mode 29 [row 26] +movu m6, [r5 + 19 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1780 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1781 * 16], m3 + +; mode 29 [row 27] +movu m6, [r5 + 28 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1782 * 16], m3 + +; mode 32 [row 11 - first half] +movu [r0 + 1942 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1783 * 16], m3 + +; mode 32 [row 11 - second half] +movu [r0 + 1943 * 16], m3 + +; mode 30 [row 18] +movu m6, [r5 + 23 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1828 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1829 * 16], m3 + +; mode 31 [row 13] +movu m6, [r5 + 14 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1882 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1883 * 16], m3 + +; mode 31 [row 14] +movu m6, [r5 + 31 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1884 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1885 * 16], m3 + +; mode 32 [row 10] +movu m6, [r5 + 7 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1940 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1941 * 16], m3 + +; mode 29 
[row 28] +movu m6, [r5 + 5 * 16] +movu m0, [r3 + 9] +movd m1, [r3 + 10] +palignr m1, m0, 1 +punpcklbw m0, m1 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +movu m2, [r3 + 17] +movd m4, [r3 + 18] +palignr m4, m2, 1 +punpcklbw m2, m4 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1784 * 16], m3 + +movu m1, [r3 + 25] +movd m3, [r3 + 26] +palignr m3, m1, 1 +punpcklbw m1, m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +movu m4, [r3 + 33] +movd m5, [r3 + 34] +palignr m5, m4, 1 +punpcklbw m4, m5 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1785 * 16], m3 + +; mode 29 [row 29] +movu m6, [r5 + 14 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1786 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1787 * 16], m3 + +; mode 29 [row 30] +movu m6, [r5 + 23 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1788 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1789 * 16], m3 + +; mode 30 [row 19] +movu m6, [r5 + 4 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1830 * 16], m3 + +; mode 33 [row 9 - first half] +movu [r0 + 2002 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1831 * 16], m3 + +; mode 33 [row 9 - second half] +movu [r0 + 2003 * 16], m3 + +; mode 30 [row 20] +movu m6, [r5 + 17 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1832 * 16], m3 + +; mode 32 [row 12 - first half] +movu [r0 + 1944 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1833 * 16], m3 + +; mode 32 [row 12 - second half] +movu [r0 + 1945 * 16], 
m3 + +; mode 30 [row 21] +movu m6, [r5 + 30 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1834 * 16], m3 + +; mode 33 [row 10 - first half] +movu [r0 + 2004 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1835 * 16], m3 + +; mode 33 [row 10 - second half] +movu [r0 + 2005 * 16], m3 + +; mode 31 [row 15] +movu m6, [r5 + 16 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1886 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1887 * 16], m3 + +; mode 29 [row 31] +movu m0, [r3 + 10] +movd m1, [r3 + 11] +palignr m1, m0, 1 +punpcklbw m0, m1 +movu m2, [r3 + 18] +movd m3, [r3 + 19] +palignr m3, m2, 1 +punpcklbw m2, m3 +movu m1, [r3 + 26] +movd m3, [r3 + 27] +palignr m3, m1, 1 +punpcklbw m1, m3 +movu m4, [r3 + 34] +movd m5, [r3 + 35] +palignr m5, m4, 1 +punpcklbw m4, m5 + +pshufb m5, m0, [tab_S2] +movh [r0 + 1790 * 16], m5 +pshufb m5, m2, [tab_S2] +movh [r0 + 1790 * 16 + 8], m5 +pshufb m5, m1, [tab_S2] +movh [r0 + 1791 * 16], m5 +pshufb m5, m4, [tab_S2] +movh [r0 + 1791 * 16 + 8], m5 + +; mode 30 [row 22] +movu m6, [r5 + 11 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1836 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1837 * 16], m3 + +; mode 30 [row 23] +movu m6, [r5 + 24 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1838 * 16], m3 + +; mode 33 [row 11 - first half] +movu [r0 + 2006 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1839 * 16], m3 + +; mode 33 [row 11 - second half] +movu [r0 + 2007 * 16], m3 + +; mode 31 [row 16] +movu 
m6, [r5 + 1 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1888 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1889 * 16], m3 + +; mode 31 [row 17] +movu m6, [r5 + 18 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1890 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1891 * 16], m3 + +; mode 32 [row 13] +movu m6, [r5 + 6 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1946 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1947 * 16], m3 + +; mode 32 [row 14] +movu m6, [r5 + 27 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1948 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1949 * 16], m3 + +; mode 30 [row 24] +movu m6, [r5 + 5 * 16] +movu m0, [r3 + 11] +movd m1, [r3 + 12] +palignr m1, m0, 1 +punpcklbw m0, m1 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +movu m2, [r3 + 19] +movd m4, [r3 + 20] +palignr m4, m2, 1 +punpcklbw m2, m4 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1840 * 16], m3 + +movu m1, [r3 + 27] +movd m3, [r3 + 28] +palignr m3, m1, 1 +punpcklbw m1, m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +movu m4, [r3 + 35] +movd m5, [r3 + 36] +palignr m5, m4, 1 +punpcklbw m4, m5 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1841 * 16], m3 + +; mode 30 [row 25] +movu m6, [r5 + 18 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1842 * 16], m3 + +; mode 33 [row 12 - first half] +movu [r0 + 2008 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw 
m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1843 * 16], m3 + +; mode 33 [row 12 - second half] +movu [r0 + 2009 * 16], m3 + +; mode 30 [row 26] +movu m6, [r5 + 31 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1844 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1845 * 16], m3 + +; mode 31 [row 18] +movu m6, [r5 + 3 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1892 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1893 * 16], m3 + +; mode 31 [row 19] +movu m6, [r5 + 20 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1894 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1895 * 16], m3 + +; mode 32 [row 15] +movu m6, [r5 + 16 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1950 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1951 * 16], m3 + +; mode 30 [row 27] +movu m6, [r5 + 12 * 16] +movu m0, [r3 + 12] +movd m1, [r3 + 13] +palignr m1, m0, 1 +punpcklbw m0, m1 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +movu m2, [r3 + 20] +movd m4, [r3 + 21] +palignr m4, m2, 1 +punpcklbw m2, m4 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1846 * 16], m3 + +; mode 33 [row 13 - first half] +movu [r0 + 2010 * 16], m3 + +movu m1, [r3 + 28] +movd m3, [r3 + 29] +palignr m3, m1, 1 +punpcklbw m1, m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +movu m4, [r3 + 36] +movd m5, [r3 + 37] +palignr m5, m4, 1 +punpcklbw m4, m5 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1847 * 16], m3 + +; mode 33 [row 13 
- second half] +movu [r0 + 2011 * 16], m3 + +; mode 30 [row 28] +movu m6, [r5 + 25 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1848 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1849 * 16], m3 + +; mode 31 [row 20] +movu m6, [r5 + 5 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1896 * 16], m3 + +; mode 32 [row 16 - first half] +movu [r0 + 1952 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1897 * 16], m3 + +; mode 32 [row 16 - second half] +movu [r0 + 1953 * 16], m3 + +; mode 31 [row 21] +movu m6, [r5 + 22 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1898 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1899 * 16], m3 + +; mode 32 [row 17] +movu m6, [r5 + 26 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1954 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1955 * 16], m3 + +; mode 30 [row 29] +movu m6, [r5 + 6 * 16] +movu m0, [r3 + 13] +movd m1, [r3 + 14] +palignr m1, m0, 1 +punpcklbw m0, m1 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +movu m2, [r3 + 21] +movd m4, [r3 + 22] +palignr m4, m2, 1 +punpcklbw m2, m4 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1850 * 16], m3 + +; mode 33 [row 14 - first half] +movu [r0 + 2012 * 16], m3 + +movu m1, [r3 + 29] +movd m3, [r3 + 30] +palignr m3, m1, 1 +punpcklbw m1, m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +movu m4, [r3 + 37] +movd m5, [r3 + 38] +palignr m5, m4, 1 +punpcklbw m4, m5 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1851 * 16], m3 + +; 
mode 33 [row 14 - second half] +movu [r0 + 2013 * 16], m3 + +; mode 30 [row 30] +movu m6, [r5 + 19 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1852 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1853 * 16], m3 + +; mode 31 [row 22] +movu m6, [r5 + 7 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1900 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1901 * 16], m3 + +; mode 31 [row 23] +movu m6, [r5 + 24 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1902 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1903 * 16], m3 + +; mode 32 [row 18] +movu m6, [r5 + 15 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1956 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1957 * 16], m3 + +; mode 30 [row 31] +movu m0, [r3 + 14] +movd m1, [r3 + 15] +palignr m1, m0, 1 +punpcklbw m0, m1 +movu m2, [r3 + 22] +movd m3, [r3 + 23] +palignr m3, m2, 1 +punpcklbw m2, m3 +movu m1, [r3 + 30] +movd m3, [r3 + 31] +palignr m3, m1, 1 +punpcklbw m1, m3 +movu m4, [r3 + 38] +movd m5, [r3 + 39] +palignr m5, m4, 1 +punpcklbw m4, m5 + +pshufb m5, m0, [tab_S2] +movh [r0 + 1854 * 16], m5 + +; mode 33 [row 15 - first eight] +movh [r0 + 2014 * 16], m5 + +pshufb m5, m2, [tab_S2] +movh [r0 + 1854 * 16 + 8], m5 + +; mode 33 [row 15 - second eight] +movh [r0 + 2014 * 16 + 8], m5 + +pshufb m5, m1, [tab_S2] +movh [r0 + 1855 * 16], m5 + +; mode 33 [row 15 - third eight] +movh [r0 + 2015 * 16], m5 + +pshufb m5, m4, [tab_S2] +movh [r0 + 1855 * 16 + 8], m5 + +; mode 33 [row 15 - fourth 
eight] +movh [r0 + 2015 * 16 + 8], m5 + +; mode 31 [row 24] +movu m6, [r5 + 9 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1904 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1905 * 16], m3 + +; mode 31 [row 25] +movu m6, [r5 + 26 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1906 * 16], m3 + +; mode 33 [row 16 - first half] +movu [r0 + 2016 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1907 * 16], m3 + +; mode 33 [row 16 - second half] +movu [r0 + 2017 * 16], m3 + +; mode 32 [row 19] +movu m6, [r5 + 4 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1958 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1959 * 16], m3 + +; mode 32 [row 20] +movu m6, [r5 + 25 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1960 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1961 * 16], m3 + +; mode 31 [row 26] +movu m6, [r5 + 11 * 16] +movu m0, [r3 + 15] +movd m1, [r3 + 16] +palignr m1, m0, 1 +punpcklbw m0, m1 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +movu m2, [r3 + 23] +movd m4, [r3 + 24] +palignr m4, m2, 1 +punpcklbw m2, m4 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1908 * 16], m3 + +movu m1, [r3 + 31] +movd m3, [r3 + 32] +palignr m3, m1, 1 +punpcklbw m1, m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +movu m4, [r3 + 39] +movd m5, [r3 + 40] +palignr m5, m4, 1 +punpcklbw m4, m5 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1909 * 16], m3 + +; mode 31 [row 27] +movu m6, [r5 + 28 * 16] +pmaddubsw m3, m0, m6 
+pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1910 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1911 * 16], m3 + +; mode 32 [row 21] +movu m6, [r5 + 14 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1962 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1963 * 16], m3 + +; mode 33 [row 17] +movu m6, [r5 + 20 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 2018 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 2019 * 16], m3 + +; mode 31 [row 28] +movu m6, [r5 + 13 * 16] +movu m0, [r3 + 16] +movd m1, [r3 + 17] +palignr m1, m0, 1 +punpcklbw m0, m1 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +movu m2, [r3 + 24] +movd m4, [r3 + 25] +palignr m4, m2, 1 +punpcklbw m2, m4 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1912 * 16], m3 + +movu m1, [r3 + 32] +movd m3, [r3 + 33] +palignr m3, m1, 1 +punpcklbw m1, m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +movu m4, [r3 + 40] +movd m5, [r3 + 41] +palignr m5, m4, 1 +punpcklbw m4, m5 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1913 * 16], m3 + +; mode 31 [row 29] +movu m6, [r5 + 30 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1914 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1915 * 16], m3 + +; mode 32 [row 22] +movu m6, [r5 + 3 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1964 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1965 * 16], m3 + +; mode 32 
[row 23] +movu m6, [r5 + 24 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1966 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1967 * 16], m3 + +; mode 33 [row 18] +movu m6, [r5 + 14 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 2020 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 2021 * 16], m3 + +; mode 31 [row 30] +movu m6, [r5 + 15 * 16] +movu m0, [r3 + 17] +movd m1, [r3 + 18] +palignr m1, m0, 1 +punpcklbw m0, m1 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +movu m2, [r3 + 25] +movd m4, [r3 + 26] +palignr m4, m2, 1 +punpcklbw m2, m4 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1916 * 16], m3 + +movu m1, [r3 + 33] +movd m3, [r3 + 34] +palignr m3, m1, 1 +punpcklbw m1, m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +movu m4, [r3 + 41] +movd m5, [r3 + 42] +palignr m5, m4, 1 +punpcklbw m4, m5 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1917 * 16], m3 + +; mode 32 [row 24] +movu m6, [r5 + 13 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1968 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1969 * 16], m3 + +; mode 33 [row 19] +movu m6, [r5 + 8 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 2022 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 2023 * 16], m3 + +; mode 31 [row 31] +movu m0, [r3 + 18] +movd m1, [r3 + 19] +palignr m1, m0, 1 +punpcklbw m0, m1 +movu m2, [r3 + 26] +movd m3, [r3 + 27] +palignr m3, m2, 1 +punpcklbw m2, m3 +movu m1, [r3 + 34] +movd m3, [r3 + 35] +palignr m3, m1, 1 +punpcklbw 
m1, m3 +movu m4, [r3 + 42] +movd m5, [r3 + 43] +palignr m5, m4, 1 +punpcklbw m4, m5 + +pshufb m5, m0, [tab_S2] +movh [r0 + 1918 * 16], m5 +pshufb m5, m2, [tab_S2] +movh [r0 + 1918 * 16 + 8], m5 +pshufb m5, m1, [tab_S2] +movh [r0 + 1919 * 16], m5 +pshufb m5, m4, [tab_S2] +movh [r0 + 1919 * 16 + 8], m5 + +; mode 32 [row 25] +movu m6, [r5 + 2 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1970 * 16], m3 + +; mode 33 [row 20 - first half] +movu [r0 + 2024 * 16], m3 + +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1971 * 16], m3 + +; mode 33 [row 20 - second half] +movu [r0 + 2025 * 16], m3 + +; mode 32 [row 26] +movu m6, [r5 + 23 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1972 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1973 * 16], m3 + +; mode 33 [row 21] +movu m6, [r5 + 28 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 2026 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 2027 * 16], m3 + +; mode 32 [row 27] +movu m6, [r5 + 12 * 16] +movu m0, [r3 + 19] +movd m1, [r3 + 20] +palignr m1, m0, 1 +punpcklbw m0, m1 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +movu m2, [r3 + 27] +movd m4, [r3 + 28] +palignr m4, m2, 1 +punpcklbw m2, m4 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1974 * 16], m3 + +movu m1, [r3 + 35] +movd m3, [r3 + 36] +palignr m3, m1, 1 +punpcklbw m1, m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +movu m4, [r3 + 43] +movd m5, [r3 + 44] +palignr m5, m4, 1 +punpcklbw m4, m5 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1975 * 16], m3 + +; mode 33 [row 22] +movu m6, [r5 + 22 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw 
m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 2028 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 2029 * 16], m3 + +; mode 32 [row 28] +movu m6, [r5 + 1 * 16] +movu m0, [r3 + 20] +movd m1, [r3 + 21] +palignr m1, m0, 1 +punpcklbw m0, m1 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +movu m2, [r3 + 28] +movd m4, [r3 + 29] +palignr m4, m2, 1 +punpcklbw m2, m4 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1976 * 16], m3 + +movu m1, [r3 + 36] +movd m3, [r3 + 37] +palignr m3, m1, 1 +punpcklbw m1, m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +movu m4, [r3 + 44] +movd m5, [r3 + 45] +palignr m5, m4, 1 +punpcklbw m4, m5 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1977 * 16], m3 + +; mode 32 [row 29] +movu m6, [r5 + 22 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1978 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1979 * 16], m3 + +; mode 33 [row 23] +movu m6, [r5 + 16 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 2030 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 2031 * 16], m3 + +; mode 32 [row 30] +movu m6, [r5 + 11 * 16] +movu m0, [r3 + 21] +movd m1, [r3 + 22] +palignr m1, m0, 1 +punpcklbw m0, m1 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +movu m2, [r3 + 29] +movd m4, [r3 + 30] +palignr m4, m2, 1 +punpcklbw m2, m4 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1980 * 16], m3 + +movu m1, [r3 + 37] +movd m3, [r3 + 38] +palignr m3, m1, 1 +punpcklbw m1, m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +movu m4, [r3 + 45] +movd m5, [r3 + 46] +palignr m5, m4, 1 +punpcklbw m4, m5 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 1981 * 16], m3 + +; mode 33 [row 24] 
+movu m6, [r5 + 10 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 2032 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 2033 * 16], m3 + +; mode 32 [row 31] +movu m0, [r3 + 22] +movd m1, [r3 + 23] +palignr m1, m0, 1 +punpcklbw m0, m1 +movu m2, [r3 + 30] +movd m3, [r3 + 31] +palignr m3, m2, 1 +punpcklbw m2, m3 +movu m1, [r3 + 38] +movd m3, [r3 + 39] +palignr m3, m1, 1 +punpcklbw m1, m3 +movu m4, [r3 + 46] +movd m5, [r3 + 47] +palignr m5, m4, 1 +punpcklbw m4, m5 + +pshufb m5, m0, [tab_S2] +movh [r0 + 1982 * 16], m5 +pshufb m5, m2, [tab_S2] +movh [r0 + 1982 * 16 + 8], m5 +pshufb m5, m1, [tab_S2] +movh [r0 + 1983 * 16], m5 +pshufb m5, m4, [tab_S2] +movh [r0 + 1983 * 16 + 8], m5 + +; mode 33 [row 25] +movu m6, [r5 + 4 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 2034 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 2035 * 16], m3 + +; mode 33 [row 26] +movu m6, [r5 + 30 * 16] +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 2036 * 16], m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 2037 * 16], m3 + +; mode 33 [row 27] +movu m6, [r5 + 24 * 16] +movu m0, [r3 + 23] +movd m1, [r3 + 24] +palignr m1, m0, 1 +punpcklbw m0, m1 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +movu m2, [r3 + 31] +movd m4, [r3 + 32] +palignr m4, m2, 1 +punpcklbw m2, m4 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 2038 * 16], m3 + +movu m1, [r3 + 39] +movd m3, [r3 + 40] +palignr m3, m1, 1 +punpcklbw m1, m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +movu m4, [r3 + 47] +movd m5, [r3 + 48] +palignr m5, m4, 1 +punpcklbw m4, m5 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 2039 * 16], m3 
+ +; mode 33 [row 28] +movu m6, [r5 + 18 * 16] +movu m0, [r3 + 24] +movd m1, [r3 + 25] +palignr m1, m0, 1 +punpcklbw m0, m1 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +movu m2, [r3 + 32] +movd m4, [r3 + 33] +palignr m4, m2, 1 +punpcklbw m2, m4 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 2040 * 16], m3 + +movu m1, [r3 + 40] +movd m3, [r3 + 41] +palignr m3, m1, 1 +punpcklbw m1, m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +movu m4, [r3 + 48] +movd m5, [r3 + 49] +palignr m5, m4, 1 +punpcklbw m4, m5 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 2041 * 16], m3 + +; mode 33 [row 29] +movu m6, [r5 + 12 * 16] +movu m0, [r3 + 25] +movd m1, [r3 + 26] +palignr m1, m0, 1 +punpcklbw m0, m1 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +movu m2, [r3 + 33] +movd m4, [r3 + 34] +palignr m4, m2, 1 +punpcklbw m2, m4 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 2042 * 16], m3 + +movu m1, [r3 + 41] +movd m3, [r3 + 42] +palignr m3, m1, 1 +punpcklbw m1, m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +movu m4, [r3 + 49] +movd m5, [r3 + 50] +palignr m5, m4, 1 +punpcklbw m4, m5 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 2043 * 16], m3 + +; mode 33 [row 30] +movu m6, [r5 + 6 * 16] +movu m0, [r3 + 26] +movd m1, [r3 + 27] +palignr m1, m0, 1 +punpcklbw m0, m1 +pmaddubsw m3, m0, m6 +pmulhrsw m3, m7 +movu m2, [r3 + 34] +movd m4, [r3 + 35] +palignr m4, m2, 1 +punpcklbw m2, m4 +pmaddubsw m5, m2, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 2044 * 16], m3 + +movu m1, [r3 + 42] +movd m3, [r3 + 43] +palignr m3, m1, 1 +punpcklbw m1, m3 +pmaddubsw m3, m1, m6 +pmulhrsw m3, m7 +movu m4, [r3 + 50] +movd m5, [r3 + 51] +palignr m5, m4, 1 +punpcklbw m4, m5 +pmaddubsw m5, m4, m6 +pmulhrsw m5, m7 +packuswb m3, m5 +movu [r0 + 2045 * 16], m3 + +; mode 33 [row 31] +movu m5, [r3 + 27] +movu [r0 + 2046 * 16], m5 +movu m5, [r3 + 43] +movu [r0 + 2047 * 16], m5 + +;mode 34 [row 0] +movu m0, [r3 + 2] +movu [r0 + 2048 * 16], m0 +movu m1, [r3 + 
18] +movu [r0 + 2049 * 16], m1 + +;mode 34 [row 1] +movu m2, [r3 + 34] +palignr m3, m1, m0, 1 +movu [r0 + 2050 * 16], m3 +palignr m4, m2, m1, 1 +movu [r0 + 2051 * 16], m4 + +;mode 34 [row 2] +palignr m3, m1, m0, 2 +movu [r0 + 2052 * 16], m3 +palignr m4, m2, m1, 2 +movu [r0 + 2053 * 16], m4 + +;mode 34 [row 3] +palignr m3, m1, m0, 3 +movu [r0 + 2054 * 16], m3 +palignr m4, m2, m1, 3 +movu [r0 + 2055 * 16], m4 + +;mode 34 [row 4] +palignr m3, m1, m0, 4 +movu [r0 + 2056 * 16], m3 +palignr m4, m2, m1, 4 +movu [r0 + 2057 * 16], m4 + +;mode 34 [row 5] +palignr m3, m1, m0, 5 +movu [r0 + 2058 * 16], m3 +palignr m4, m2, m1, 5 +movu [r0 + 2059 * 16], m4 + +;mode 34 [row 6] +palignr m3, m1, m0, 6 +movu [r0 + 2060 * 16], m3 +palignr m4, m2, m1, 6 +movu [r0 + 2061 * 16], m4 + +;mode 34 [row 7] +palignr m3, m1, m0, 7 +movu [r0 + 2062 * 16], m3 +palignr m4, m2, m1, 7 +movu [r0 + 2063 * 16], m4 + +;mode 34 [row 8] +palignr m3, m1, m0, 8 +movu [r0 + 2064 * 16], m3 +palignr m4, m2, m1, 8 +movu [r0 + 2065 * 16], m4 + +;mode 34 [row 9] +palignr m3, m1, m0, 9 +movu [r0 + 2066 * 16], m3 +palignr m4, m2, m1, 9 +movu [r0 + 2067 * 16], m4 + +;mode 34 [row 10] +palignr m3, m1, m0, 10 +movu [r0 + 2068 * 16], m3 +palignr m4, m2, m1, 10 +movu [r0 + 2069 * 16], m4 + +;mode 34 [row 11] +palignr m3, m1, m0, 11 +movu [r0 + 2070 * 16], m3 +palignr m4, m2, m1, 11 +movu [r0 + 2071 * 16], m4 + +;mode 34 [row 12] +palignr m3, m1, m0, 12 +movu [r0 + 2072 * 16], m3 +palignr m4, m2, m1, 12 +movu [r0 + 2073 * 16], m4 + +;mode 34 [row 13] +palignr m3, m1, m0, 13 +movu [r0 + 2074 * 16], m3 +palignr m4, m2, m1, 13 +movu [r0 + 2075 * 16], m4 + +;mode 34 [row 14] +palignr m3, m1, m0, 14 +movu [r0 + 2076 * 16], m3 +palignr m4, m2, m1, 14 +movu [r0 + 2077 * 16], m4 + +;mode 34 [row 15] +palignr m3, m1, m0, 15 +movu [r0 + 2078 * 16], m3 +palignr m4, m2, m1, 15 +movu [r0 + 2079 * 16], m4 + +;mode 34 [row 16] +palignr m3, m1, m0, 16 +movu [r0 + 2080 * 16], m3 +palignr m4, m2, m1, 16 +movu [r0 + 2081 * 16], m4 + 
+;mode 34 [row 17] +movu m0, [r3 + 19] +movu [r0 + 2082 * 16], m0 +movu m1, [r3 + 35] +movu [r0 + 2083 * 16], m1 + +;mode 34 [row 18] +movu m2, [r3 + 51] +palignr m3, m1, m0, 1 +movu [r0 + 2084 * 16], m3 +palignr m4, m2, m1, 1 +movu [r0 + 2085 * 16], m4 + +;mode 34 [row 19] +palignr m3, m1, m0, 2 +movu [r0 + 2086 * 16], m3 +palignr m4, m2, m1, 2 +movu [r0 + 2087 * 16], m4 + +;mode 34 [row 20] +palignr m3, m1, m0, 3 +movu [r0 + 2088 * 16], m3 +palignr m4, m2, m1, 3 +movu [r0 + 2089 * 16], m4 + +;mode 34 [row 21] +palignr m3, m1, m0, 4 +movu [r0 + 2090 * 16], m3 +palignr m4, m2, m1, 4 +movu [r0 + 2091 * 16], m4 + +;mode 34 [row 22] +palignr m3, m1, m0, 5 +movu [r0 + 2092 * 16], m3 +palignr m4, m2, m1, 5 +movu [r0 + 2093 * 16], m4 + +;mode 34 [row 23] +palignr m3, m1, m0, 6 +movu [r0 + 2094 * 16], m3 +palignr m4, m2, m1, 6 +movu [r0 + 2095 * 16], m4 + +;mode 34 [row 24] +palignr m3, m1, m0, 7 +movu [r0 + 2096 * 16], m3 +palignr m4, m2, m1, 7 +movu [r0 + 2097 * 16], m4 + +;mode 34 [row 25] +palignr m3, m1, m0, 8 +movu [r0 + 2098 * 16], m3 +palignr m4, m2, m1, 8 +movu [r0 + 2099 * 16], m4 + +;mode 34 [row 26] +palignr m3, m1, m0, 9 +movu [r0 + 2100 * 16], m3 +palignr m4, m2, m1, 9 +movu [r0 + 2101 * 16], m4 + +;mode 34 [row 27] +palignr m3, m1, m0, 10 +movu [r0 + 2102 * 16], m3 +palignr m4, m2, m1, 10 +movu [r0 + 2103 * 16], m4 + +;mode 34 [row 28] +palignr m3, m1, m0, 11 +movu [r0 + 2104 * 16], m3 +palignr m4, m2, m1, 11 +movu [r0 + 2105 * 16], m4 + +;mode 34 [row 29] +palignr m3, m1, m0, 12 +movu [r0 + 2106 * 16], m3 +palignr m4, m2, m1, 12 +movu [r0 + 2107 * 16], m4 + +;mode 34 [row 30] +palignr m3, m1, m0, 13 +movu [r0 + 2108 * 16], m3 +palignr m4, m2, m1, 13 +movu [r0 + 2109 * 16], m4 + +;mode 34 [row 31] +palignr m3, m1, m0, 14 +movu [r0 + 2110 * 16], m3 +palignr m4, m2, m1, 14 +movu [r0 + 2111 * 16], m4 + +RET diff --git a/source/common/x86/ipfilter16.asm b/source/common/x86/ipfilter16.asm new file mode 100644 index 0000000..6089d75 --- /dev/null +++ 
b/source/common/x86/ipfilter16.asm @@ -0,0 +1,2894 @@ +;***************************************************************************** +;* Copyright (C) 2013 x265 project +;* +;* Authors: Nabajit Deka +;* Murugan Vairavel +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +;* +;* This program is also available under a commercial proprietary license. +;* For more information, contact us at license @ x265.com. 
+;*****************************************************************************/ + +%include "x86inc.asm" +%include "x86util.asm" + +SECTION_RODATA 32 + +tab_c_32: times 4 dd 32 +tab_c_n32768: times 4 dd -32768 +tab_c_524800: times 4 dd 524800 +tab_c_n8192: times 8 dw -8192 + +tab_Tm16: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 + +tab_ChromaCoeff: dw 0, 64, 0, 0 + dw -2, 58, 10, -2 + dw -4, 54, 16, -2 + dw -6, 46, 28, -4 + dw -4, 36, 36, -4 + dw -4, 28, 46, -6 + dw -2, 16, 54, -4 + dw -2, 10, 58, -2 + +tab_ChromaCoeffV: times 4 dw 0, 64 + times 4 dw 0, 0 + + times 4 dw -2, 58 + times 4 dw 10, -2 + + times 4 dw -4, 54 + times 4 dw 16, -2 + + times 4 dw -6, 46 + times 4 dw 28, -4 + + times 4 dw -4, 36 + times 4 dw 36, -4 + + times 4 dw -4, 28 + times 4 dw 46, -6 + + times 4 dw -2, 16 + times 4 dw 54, -4 + + times 4 dw -2, 10 + times 4 dw 58, -2 + +tab_LumaCoeff: dw 0, 0, 0, 64, 0, 0, 0, 0 + dw -1, 4, -10, 58, 17, -5, 1, 0 + dw -1, 4, -11, 40, 40, -11, 4, -1 + dw 0, 1, -5, 17, 58, -10, 4, -1 + +tab_LumaCoeffV: times 4 dw 0, 0 + times 4 dw 0, 64 + times 4 dw 0, 0 + times 4 dw 0, 0 + + times 4 dw -1, 4 + times 4 dw -10, 58 + times 4 dw 17, -5 + times 4 dw 1, 0 + + times 4 dw -1, 4 + times 4 dw -11, 40 + times 4 dw 40, -11 + times 4 dw 4, -1 + + times 4 dw 0, 1 + times 4 dw -5, 17 + times 4 dw 58, -10 + times 4 dw 4, -1 + +SECTION .text + +cextern pd_32 +cextern pw_pixel_max +cextern pd_n32768 + +;------------------------------------------------------------------------------------------------------------ +; void interp_8tap_horiz_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------ +%macro FILTER_HOR_LUMA_W4 3 +INIT_XMM sse4 +cglobal interp_8tap_horiz_%3_%1x%2, 4, 7, 8 + + mov r4d, r4m + sub r0, 6 + shl r4d, 4 + add r1, r1 + add r3, r3 + +%ifdef PIC + lea r6, [tab_LumaCoeff] + mova m0, [r6 + r4] +%else + mova m0, 
[tab_LumaCoeff + r4] +%endif + +%ifidn %3, pp + mova m1, [pd_32] + pxor m6, m6 + mova m7, [pw_pixel_max] +%else + mova m1, [pd_n32768] +%endif + + mov r4d, %2 +%ifidn %3, ps + cmp r5m, byte 0 + je .loopH + lea r6, [r1 + 2 * r1] + sub r0, r6 + add r4d, 7 +%endif + +.loopH: + movu m2, [r0] ; m2 = src[0-7] + movu m3, [r0 + 16] ; m3 = src[8-15] + + pmaddwd m4, m2, m0 + palignr m5, m3, m2, 2 ; m5 = src[1-8] + pmaddwd m5, m0 + phaddd m4, m5 + + palignr m5, m3, m2, 4 ; m5 = src[2-9] + pmaddwd m5, m0 + palignr m3, m2, 6 ; m3 = src[3-10] + pmaddwd m3, m0 + phaddd m5, m3 + + phaddd m4, m5 + paddd m4, m1 +%ifidn %3, pp + psrad m4, 6 + packusdw m4, m4 + CLIPW m4, m6, m7 +%else + psrad m4, 2 + packssdw m4, m4 +%endif + + movh [r2], m4 + + add r0, r1 + add r2, r3 + + dec r4d + jnz .loopH + RET +%endmacro + +;------------------------------------------------------------------------------------------------------------ +; void interp_8tap_horiz_pp_4x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx +;------------------------------------------------------------------------------------------------------------ +FILTER_HOR_LUMA_W4 4, 4, pp +FILTER_HOR_LUMA_W4 4, 8, pp +FILTER_HOR_LUMA_W4 4, 16, pp + +;--------------------------------------------------------------------------------------------------------------------------- +; void interp_8tap_horiz_ps_4x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;--------------------------------------------------------------------------------------------------------------------------- +FILTER_HOR_LUMA_W4 4, 4, ps +FILTER_HOR_LUMA_W4 4, 8, ps +FILTER_HOR_LUMA_W4 4, 16, ps + +;------------------------------------------------------------------------------------------------------------ +; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) 
+;------------------------------------------------------------------------------------------------------------ +%macro FILTER_HOR_LUMA_W8 3 +INIT_XMM sse4 +cglobal interp_8tap_horiz_%3_%1x%2, 4, 7, 8 + + add r1, r1 + add r3, r3 + mov r4d, r4m + sub r0, 6 + shl r4d, 4 + +%ifdef PIC + lea r6, [tab_LumaCoeff] + mova m0, [r6 + r4] +%else + mova m0, [tab_LumaCoeff + r4] +%endif + +%ifidn %3, pp + mova m1, [pd_32] + pxor m7, m7 +%else + mova m1, [pd_n32768] +%endif + + mov r4d, %2 +%ifidn %3, ps + cmp r5m, byte 0 + je .loopH + lea r6, [r1 + 2 * r1] + sub r0, r6 + add r4d, 7 +%endif + +.loopH: + movu m2, [r0] ; m2 = src[0-7] + movu m3, [r0 + 16] ; m3 = src[8-15] + + pmaddwd m4, m2, m0 + palignr m5, m3, m2, 2 ; m5 = src[1-8] + pmaddwd m5, m0 + phaddd m4, m5 + + palignr m5, m3, m2, 4 ; m5 = src[2-9] + pmaddwd m5, m0 + palignr m6, m3, m2, 6 ; m6 = src[3-10] + pmaddwd m6, m0 + phaddd m5, m6 + phaddd m4, m5 + paddd m4, m1 + + palignr m5, m3, m2, 8 ; m5 = src[4-11] + pmaddwd m5, m0 + palignr m6, m3, m2, 10 ; m6 = src[5-12] + pmaddwd m6, m0 + phaddd m5, m6 + + palignr m6, m3, m2, 12 ; m6 = src[6-13] + pmaddwd m6, m0 + palignr m3, m2, 14 ; m3 = src[7-14] + pmaddwd m3, m0 + phaddd m6, m3 + phaddd m5, m6 + paddd m5, m1 +%ifidn %3, pp + psrad m4, 6 + psrad m5, 6 + packusdw m4, m5 + CLIPW m4, m7, [pw_pixel_max] +%else + psrad m4, 2 + psrad m5, 2 + packssdw m4, m5 +%endif + + movu [r2], m4 + + add r0, r1 + add r2, r3 + + dec r4d + jnz .loopH + RET +%endmacro + +;------------------------------------------------------------------------------------------------------------ +; void interp_8tap_horiz_pp_8x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx +;------------------------------------------------------------------------------------------------------------ +FILTER_HOR_LUMA_W8 8, 4, pp +FILTER_HOR_LUMA_W8 8, 8, pp +FILTER_HOR_LUMA_W8 8, 16, pp +FILTER_HOR_LUMA_W8 8, 32, pp + 
+;--------------------------------------------------------------------------------------------------------------------------- +; void interp_8tap_horiz_ps_8x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;--------------------------------------------------------------------------------------------------------------------------- +FILTER_HOR_LUMA_W8 8, 4, ps +FILTER_HOR_LUMA_W8 8, 8, ps +FILTER_HOR_LUMA_W8 8, 16, ps +FILTER_HOR_LUMA_W8 8, 32, ps + +;-------------------------------------------------------------------------------------------------------------- +; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;-------------------------------------------------------------------------------------------------------------- +%macro FILTER_HOR_LUMA_W12 3 +INIT_XMM sse4 +cglobal interp_8tap_horiz_%3_%1x%2, 4, 7, 8 + + add r1, r1 + add r3, r3 + mov r4d, r4m + sub r0, 6 + shl r4d, 4 + +%ifdef PIC + lea r6, [tab_LumaCoeff] + mova m0, [r6 + r4] +%else + mova m0, [tab_LumaCoeff + r4] +%endif +%ifidn %3, pp + mova m1, [pd_32] +%else + mova m1, [pd_n32768] +%endif + + mov r4d, %2 +%ifidn %3, ps + cmp r5m, byte 0 + je .loopH + lea r6, [r1 + 2 * r1] + sub r0, r6 + add r4d, 7 +%endif + +.loopH: + movu m2, [r0] ; m2 = src[0-7] + movu m3, [r0 + 16] ; m3 = src[8-15] + + pmaddwd m4, m2, m0 + palignr m5, m3, m2, 2 ; m5 = src[1-8] + pmaddwd m5, m0 + phaddd m4, m5 + + palignr m5, m3, m2, 4 ; m5 = src[2-9] + pmaddwd m5, m0 + palignr m6, m3, m2, 6 ; m6 = src[3-10] + pmaddwd m6, m0 + phaddd m5, m6 + phaddd m4, m5 + paddd m4, m1 + + palignr m5, m3, m2, 8 ; m5 = src[4-11] + pmaddwd m5, m0 + palignr m6, m3, m2, 10 ; m6 = src[5-12] + pmaddwd m6, m0 + phaddd m5, m6 + + palignr m6, m3, m2, 12 ; m6 = src[6-13] + pmaddwd m6, m0 + palignr m7, m3, m2, 14 ; m2 = src[7-14] + pmaddwd m7, m0 + phaddd m6, m7 + phaddd m5, m6 + paddd m5, m1 +%ifidn %3, pp + psrad m4, 6 + psrad m5, 6 + packusdw m4, m5 + 
pxor m5, m5 + CLIPW m4, m5, [pw_pixel_max] +%else + psrad m4, 2 + psrad m5, 2 + packssdw m4, m5 +%endif + + movu [r2], m4 + + movu m2, [r0 + 32] ; m2 = src[16-23] + + pmaddwd m4, m3, m0 ; m3 = src[8-15] + palignr m5, m2, m3, 2 ; m5 = src[9-16] + pmaddwd m5, m0 + phaddd m4, m5 + + palignr m5, m2, m3, 4 ; m5 = src[10-17] + pmaddwd m5, m0 + palignr m2, m3, 6 ; m2 = src[11-18] + pmaddwd m2, m0 + phaddd m5, m2 + phaddd m4, m5 + paddd m4, m1 +%ifidn %3, pp + psrad m4, 6 + packusdw m4, m4 + pxor m5, m5 + CLIPW m4, m5, [pw_pixel_max] +%else + psrad m4, 2 + packssdw m4, m4 +%endif + + movh [r2 + 16], m4 + + add r0, r1 + add r2, r3 + + dec r4d + jnz .loopH + RET +%endmacro + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_horiz_pp_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx +;------------------------------------------------------------------------------------------------------------- +FILTER_HOR_LUMA_W12 12, 16, pp + +;---------------------------------------------------------------------------------------------------------------------------- +; void interp_8tap_horiz_ps_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;---------------------------------------------------------------------------------------------------------------------------- +FILTER_HOR_LUMA_W12 12, 16, ps + +;-------------------------------------------------------------------------------------------------------------- +; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;-------------------------------------------------------------------------------------------------------------- +%macro FILTER_HOR_LUMA_W16 3 +INIT_XMM sse4 +cglobal interp_8tap_horiz_%3_%1x%2, 4, 7, 8 + + add r1, r1 + add r3, r3 + mov r4d, r4m + sub r0, 6 + shl r4d, 4 + +%ifdef PIC + lea r6, [tab_LumaCoeff] + 
mova m0, [r6 + r4] +%else + mova m0, [tab_LumaCoeff + r4] +%endif + +%ifidn %3, pp + mova m1, [pd_32] +%else + mova m1, [pd_n32768] +%endif + + mov r4d, %2 +%ifidn %3, ps + cmp r5m, byte 0 + je .loopH + lea r6, [r1 + 2 * r1] + sub r0, r6 + add r4d, 7 +%endif + +.loopH: +%assign x 0 +%rep %1 / 16 + movu m2, [r0 + x] ; m2 = src[0-7] + movu m3, [r0 + 16 + x] ; m3 = src[8-15] + + pmaddwd m4, m2, m0 + palignr m5, m3, m2, 2 ; m5 = src[1-8] + pmaddwd m5, m0 + phaddd m4, m5 + + palignr m5, m3, m2, 4 ; m5 = src[2-9] + pmaddwd m5, m0 + palignr m6, m3, m2, 6 ; m6 = src[3-10] + pmaddwd m6, m0 + phaddd m5, m6 + phaddd m4, m5 + paddd m4, m1 + + palignr m5, m3, m2, 8 ; m5 = src[4-11] + pmaddwd m5, m0 + palignr m6, m3, m2, 10 ; m6 = src[5-12] + pmaddwd m6, m0 + phaddd m5, m6 + + palignr m6, m3, m2, 12 ; m6 = src[6-13] + pmaddwd m6, m0 + palignr m7, m3, m2, 14 ; m2 = src[7-14] + pmaddwd m7, m0 + phaddd m6, m7 + phaddd m5, m6 + paddd m5, m1 +%ifidn %3, pp + psrad m4, 6 + psrad m5, 6 + packusdw m4, m5 + pxor m5, m5 + CLIPW m4, m5, [pw_pixel_max] +%else + psrad m4, 2 + psrad m5, 2 + packssdw m4, m5 +%endif + movu [r2 + x], m4 + + movu m2, [r0 + 32 + x] ; m2 = src[16-23] + + pmaddwd m4, m3, m0 ; m3 = src[8-15] + palignr m5, m2, m3, 2 ; m5 = src[9-16] + pmaddwd m5, m0 + phaddd m4, m5 + + palignr m5, m2, m3, 4 ; m5 = src[10-17] + pmaddwd m5, m0 + palignr m6, m2, m3, 6 ; m6 = src[11-18] + pmaddwd m6, m0 + phaddd m5, m6 + phaddd m4, m5 + paddd m4, m1 + + palignr m5, m2, m3, 8 ; m5 = src[12-19] + pmaddwd m5, m0 + palignr m6, m2, m3, 10 ; m6 = src[13-20] + pmaddwd m6, m0 + phaddd m5, m6 + + palignr m6, m2, m3, 12 ; m6 = src[14-21] + pmaddwd m6, m0 + palignr m2, m3, 14 ; m3 = src[15-22] + pmaddwd m2, m0 + phaddd m6, m2 + phaddd m5, m6 + paddd m5, m1 +%ifidn %3, pp + psrad m4, 6 + psrad m5, 6 + packusdw m4, m5 + pxor m5, m5 + CLIPW m4, m5, [pw_pixel_max] +%else + psrad m4, 2 + psrad m5, 2 + packssdw m4, m5 +%endif + movu [r2 + 16 + x], m4 + +%assign x x+32 +%endrep + + add r0, r1 + add r2, r3 
+ + dec r4d + jnz .loopH + RET +%endmacro + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_horiz_pp_16x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx +;------------------------------------------------------------------------------------------------------------- +FILTER_HOR_LUMA_W16 16, 4, pp +FILTER_HOR_LUMA_W16 16, 8, pp +FILTER_HOR_LUMA_W16 16, 12, pp +FILTER_HOR_LUMA_W16 16, 16, pp +FILTER_HOR_LUMA_W16 16, 32, pp +FILTER_HOR_LUMA_W16 16, 64, pp + +;---------------------------------------------------------------------------------------------------------------------------- +; void interp_8tap_horiz_ps_16x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;---------------------------------------------------------------------------------------------------------------------------- +FILTER_HOR_LUMA_W16 16, 4, ps +FILTER_HOR_LUMA_W16 16, 8, ps +FILTER_HOR_LUMA_W16 16, 12, ps +FILTER_HOR_LUMA_W16 16, 16, ps +FILTER_HOR_LUMA_W16 16, 32, ps +FILTER_HOR_LUMA_W16 16, 64, ps + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_horiz_pp_32x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx +;------------------------------------------------------------------------------------------------------------- +FILTER_HOR_LUMA_W16 32, 8, pp +FILTER_HOR_LUMA_W16 32, 16, pp +FILTER_HOR_LUMA_W16 32, 24, pp +FILTER_HOR_LUMA_W16 32, 32, pp +FILTER_HOR_LUMA_W16 32, 64, pp + +;---------------------------------------------------------------------------------------------------------------------------- +; void interp_8tap_horiz_ps_32x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt) 
+;---------------------------------------------------------------------------------------------------------------------------- +FILTER_HOR_LUMA_W16 32, 8, ps +FILTER_HOR_LUMA_W16 32, 16, ps +FILTER_HOR_LUMA_W16 32, 24, ps +FILTER_HOR_LUMA_W16 32, 32, ps +FILTER_HOR_LUMA_W16 32, 64, ps + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_horiz_pp_48x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx +;------------------------------------------------------------------------------------------------------------- +FILTER_HOR_LUMA_W16 48, 64, pp + +;---------------------------------------------------------------------------------------------------------------------------- +; void interp_8tap_horiz_ps_48x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;---------------------------------------------------------------------------------------------------------------------------- +FILTER_HOR_LUMA_W16 48, 64, ps + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_horiz_pp_64x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx +;------------------------------------------------------------------------------------------------------------- +FILTER_HOR_LUMA_W16 64, 16, pp +FILTER_HOR_LUMA_W16 64, 32, pp +FILTER_HOR_LUMA_W16 64, 48, pp +FILTER_HOR_LUMA_W16 64, 64, pp + +;---------------------------------------------------------------------------------------------------------------------------- +; void interp_8tap_horiz_ps_64x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;---------------------------------------------------------------------------------------------------------------------------- +FILTER_HOR_LUMA_W16 64, 16, ps +FILTER_HOR_LUMA_W16 64, 32, ps 
+FILTER_HOR_LUMA_W16 64, 48, ps +FILTER_HOR_LUMA_W16 64, 64, ps + +;-------------------------------------------------------------------------------------------------------------- +; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;-------------------------------------------------------------------------------------------------------------- +%macro FILTER_HOR_LUMA_W24 3 +INIT_XMM sse4 +cglobal interp_8tap_horiz_%3_%1x%2, 4, 7, 8 + + add r1, r1 + add r3, r3 + mov r4d, r4m + sub r0, 6 + shl r4d, 4 + +%ifdef PIC + lea r6, [tab_LumaCoeff] + mova m0, [r6 + r4] +%else + mova m0, [tab_LumaCoeff + r4] +%endif +%ifidn %3, pp + mova m1, [pd_32] +%else + mova m1, [pd_n32768] +%endif + + mov r4d, %2 +%ifidn %3, ps + cmp r5m, byte 0 + je .loopH + lea r6, [r1 + 2 * r1] + sub r0, r6 + add r4d, 7 +%endif + +.loopH: + movu m2, [r0] ; m2 = src[0-7] + movu m3, [r0 + 16] ; m3 = src[8-15] + + pmaddwd m4, m2, m0 + palignr m5, m3, m2, 2 ; m5 = src[1-8] + pmaddwd m5, m0 + phaddd m4, m5 + + palignr m5, m3, m2, 4 ; m5 = src[2-9] + pmaddwd m5, m0 + palignr m6, m3, m2, 6 ; m6 = src[3-10] + pmaddwd m6, m0 + phaddd m5, m6 + phaddd m4, m5 + paddd m4, m1 + + palignr m5, m3, m2, 8 ; m5 = src[4-11] + pmaddwd m5, m0 + palignr m6, m3, m2, 10 ; m6 = src[5-12] + pmaddwd m6, m0 + phaddd m5, m6 + + palignr m6, m3, m2, 12 ; m6 = src[6-13] + pmaddwd m6, m0 + palignr m7, m3, m2, 14 ; m7 = src[7-14] + pmaddwd m7, m0 + phaddd m6, m7 + phaddd m5, m6 + paddd m5, m1 +%ifidn %3, pp + psrad m4, 6 + psrad m5, 6 + packusdw m4, m5 + pxor m5, m5 + CLIPW m4, m5, [pw_pixel_max] +%else + psrad m4, 2 + psrad m5, 2 + packssdw m4, m5 +%endif + movu [r2], m4 + + movu m2, [r0 + 32] ; m2 = src[16-23] + + pmaddwd m4, m3, m0 ; m3 = src[8-15] + palignr m5, m2, m3, 2 ; m5 = src[1-8] + pmaddwd m5, m0 + phaddd m4, m5 + + palignr m5, m2, m3, 4 ; m5 = src[2-9] + pmaddwd m5, m0 + palignr m6, m2, m3, 6 ; m6 = src[3-10] + pmaddwd m6, m0 + phaddd m5, m6 + phaddd m4, m5 + paddd 
m4, m1 + + palignr m5, m2, m3, 8 ; m5 = src[4-11] + pmaddwd m5, m0 + palignr m6, m2, m3, 10 ; m6 = src[5-12] + pmaddwd m6, m0 + phaddd m5, m6 + + palignr m6, m2, m3, 12 ; m6 = src[6-13] + pmaddwd m6, m0 + palignr m7, m2, m3, 14 ; m7 = src[7-14] + pmaddwd m7, m0 + phaddd m6, m7 + phaddd m5, m6 + paddd m5, m1 +%ifidn %3, pp + psrad m4, 6 + psrad m5, 6 + packusdw m4, m5 + pxor m5, m5 + CLIPW m4, m5, [pw_pixel_max] +%else + psrad m4, 2 + psrad m5, 2 + packssdw m4, m5 +%endif + movu [r2 + 16], m4 + + movu m3, [r0 + 48] ; m3 = src[24-31] + + pmaddwd m4, m2, m0 ; m2 = src[16-23] + palignr m5, m3, m2, 2 ; m5 = src[1-8] + pmaddwd m5, m0 + phaddd m4, m5 + + palignr m5, m3, m2, 4 ; m5 = src[2-9] + pmaddwd m5, m0 + palignr m6, m3, m2, 6 ; m6 = src[3-10] + pmaddwd m6, m0 + phaddd m5, m6 + phaddd m4, m5 + paddd m4, m1 + + palignr m5, m3, m2, 8 ; m5 = src[4-11] + pmaddwd m5, m0 + palignr m6, m3, m2, 10 ; m6 = src[5-12] + pmaddwd m6, m0 + phaddd m5, m6 + + palignr m6, m3, m2, 12 ; m6 = src[6-13] + pmaddwd m6, m0 + palignr m7, m3, m2, 14 ; m7 = src[7-14] + pmaddwd m7, m0 + phaddd m6, m7 + phaddd m5, m6 + paddd m5, m1 +%ifidn %3, pp + psrad m4, 6 + psrad m5, 6 + packusdw m4, m5 + pxor m5, m5 + CLIPW m4, m5, [pw_pixel_max] +%else + psrad m4, 2 + psrad m5, 2 + packssdw m4, m5 +%endif + movu [r2 + 32], m4 + + add r0, r1 + add r2, r3 + + dec r4d + jnz .loopH + RET +%endmacro + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_horiz_pp_24x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx +;------------------------------------------------------------------------------------------------------------- +FILTER_HOR_LUMA_W24 24, 32, pp + +;---------------------------------------------------------------------------------------------------------------------------- +; void interp_8tap_horiz_ps_24x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int 
isRowExt) +;---------------------------------------------------------------------------------------------------------------------------- +FILTER_HOR_LUMA_W24 24, 32, ps + +%macro FILTER_W2_2 1 + movu m3, [r0] + pshufb m3, m3, m2 + pmaddwd m3, m0 + movu m4, [r0 + r1] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m1 +%ifidn %1, pp + psrad m3, 6 + packusdw m3, m3 + CLIPW m3, m7, m6 +%else + psrad m3, 2 + packssdw m3, m3 +%endif + movd [r2], m3 + pextrd [r2 + r3], m3, 1 +%endmacro + +%macro FILTER_W4_2 1 + movu m3, [r0] + pshufb m3, m3, m2 + pmaddwd m3, m0 + movu m4, [r0 + 4] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m1 + + movu m5, [r0 + r1] + pshufb m5, m5, m2 + pmaddwd m5, m0 + movu m4, [r0 + r1 + 4] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m5, m4 + paddd m5, m1 +%ifidn %1, pp + psrad m3, 6 + psrad m5, 6 + packusdw m3, m5 + CLIPW m3, m7, m6 +%else + psrad m3, 2 + psrad m5, 2 + packssdw m3, m5 +%endif + movh [r2], m3 + movhps [r2 + r3], m3 +%endmacro + +;----------------------------------------------------------------------------- +; void interp_4tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +%macro FILTER_CHROMA_H 6 +INIT_XMM sse4 +cglobal interp_4tap_horiz_%3_%1x%2, 4, %4, %5 + + add r3, r3 + add r1, r1 + sub r0, 2 + mov r4d, r4m + add r4d, r4d + +%ifdef PIC + lea r%6, [tab_ChromaCoeff] + movh m0, [r%6 + r4 * 4] +%else + movh m0, [tab_ChromaCoeff + r4 * 4] +%endif + + punpcklqdq m0, m0 + mova m2, [tab_Tm16] + +%ifidn %3, ps + mova m1, [tab_c_n32768] + cmp r5m, byte 0 + je .skip + sub r0, r1 + movu m3, [r0] + pshufb m3, m3, m2 + pmaddwd m3, m0 + + %if %1 == 4 + movu m4, [r0 + 4] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m3, m4 + %else + phaddd m3, m3 + %endif + + paddd m3, m1 + psrad m3, 2 + packssdw m3, m3 + + %if %1 == 2 + movd [r2], m3 + %else + movh [r2], m3 + %endif + + add r0, r1 + 
add r2, r3 + FILTER_W%1_2 %3 + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + +.skip: + +%else ;%ifidn %3, ps + pxor m7, m7 + mova m6, [pw_pixel_max] + mova m1, [tab_c_32] +%endif ;%ifidn %3, ps + + FILTER_W%1_2 %3 + +%rep (%2/2) - 1 + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + FILTER_W%1_2 %3 +%endrep + +RET +%endmacro + +FILTER_CHROMA_H 2, 4, pp, 6, 8, 5 +FILTER_CHROMA_H 2, 8, pp, 6, 8, 5 +FILTER_CHROMA_H 4, 2, pp, 6, 8, 5 +FILTER_CHROMA_H 4, 4, pp, 6, 8, 5 +FILTER_CHROMA_H 4, 8, pp, 6, 8, 5 +FILTER_CHROMA_H 4, 16, pp, 6, 8, 5 + +FILTER_CHROMA_H 2, 4, ps, 7, 5, 6 +FILTER_CHROMA_H 2, 8, ps, 7, 5, 6 +FILTER_CHROMA_H 4, 2, ps, 7, 6, 6 +FILTER_CHROMA_H 4, 4, ps, 7, 6, 6 +FILTER_CHROMA_H 4, 8, ps, 7, 6, 6 +FILTER_CHROMA_H 4, 16, ps, 7, 6, 6 + +FILTER_CHROMA_H 2, 16, pp, 6, 8, 5 +FILTER_CHROMA_H 4, 32, pp, 6, 8, 5 +FILTER_CHROMA_H 2, 16, ps, 7, 5, 6 +FILTER_CHROMA_H 4, 32, ps, 7, 6, 6 + + +%macro FILTER_W6_1 1 + movu m3, [r0] + pshufb m3, m3, m2 + pmaddwd m3, m0 + movu m4, [r0 + 4] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m1 + + movu m4, [r0 + 8] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m4, m4 + paddd m4, m1 +%ifidn %1, pp + psrad m3, 6 + psrad m4, 6 + packusdw m3, m4 + CLIPW m3, m6, m7 +%else + psrad m3, 2 + psrad m4, 2 + packssdw m3, m4 +%endif + movh [r2], m3 + pextrd [r2 + 8], m3, 2 +%endmacro + +cglobal chroma_filter_pp_6x1_internal + FILTER_W6_1 pp + ret + +cglobal chroma_filter_ps_6x1_internal + FILTER_W6_1 ps + ret + +%macro FILTER_W8_1 1 + movu m3, [r0] + pshufb m3, m3, m2 + pmaddwd m3, m0 + movu m4, [r0 + 4] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m1 + + movu m5, [r0 + 8] + pshufb m5, m5, m2 + pmaddwd m5, m0 + movu m4, [r0 + 12] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m5, m4 + paddd m5, m1 +%ifidn %1, pp + psrad m3, 6 + psrad m5, 6 + packusdw m3, m5 + CLIPW m3, m6, m7 +%else + psrad m3, 2 + psrad m5, 2 + packssdw m3, m5 +%endif + movh [r2], m3 + movhps [r2 + 8], m3 +%endmacro + +cglobal 
chroma_filter_pp_8x1_internal + FILTER_W8_1 pp + ret + +cglobal chroma_filter_ps_8x1_internal + FILTER_W8_1 ps + ret + +%macro FILTER_W12_1 1 + movu m3, [r0] + pshufb m3, m3, m2 + pmaddwd m3, m0 + movu m4, [r0 + 4] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m1 + + movu m5, [r0 + 8] + pshufb m5, m5, m2 + pmaddwd m5, m0 + movu m4, [r0 + 12] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m5, m4 + paddd m5, m1 +%ifidn %1, pp + psrad m3, 6 + psrad m5, 6 + packusdw m3, m5 + CLIPW m3, m6, m7 +%else + psrad m3, 2 + psrad m5, 2 + packssdw m3, m5 +%endif + movh [r2], m3 + movhps [r2 + 8], m3 + + movu m3, [r0 + 16] + pshufb m3, m3, m2 + pmaddwd m3, m0 + movu m4, [r0 + 20] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m1 + +%ifidn %1, pp + psrad m3, 6 + packusdw m3, m3 + CLIPW m3, m6, m7 +%else + psrad m3, 2 + packssdw m3, m3 +%endif + movh [r2 + 16], m3 +%endmacro + +cglobal chroma_filter_pp_12x1_internal + FILTER_W12_1 pp + ret + +cglobal chroma_filter_ps_12x1_internal + FILTER_W12_1 ps + ret + +%macro FILTER_W16_1 1 + movu m3, [r0] + pshufb m3, m3, m2 + pmaddwd m3, m0 + movu m4, [r0 + 4] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m1 + + movu m5, [r0 + 8] + pshufb m5, m5, m2 + pmaddwd m5, m0 + movu m4, [r0 + 12] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m5, m4 + paddd m5, m1 +%ifidn %1, pp + psrad m3, 6 + psrad m5, 6 + packusdw m3, m5 + CLIPW m3, m6, m7 +%else + psrad m3, 2 + psrad m5, 2 + packssdw m3, m5 +%endif + movh [r2], m3 + movhps [r2 + 8], m3 + + movu m3, [r0 + 16] + pshufb m3, m3, m2 + pmaddwd m3, m0 + movu m4, [r0 + 20] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m1 + + movu m5, [r0 + 24] + pshufb m5, m5, m2 + pmaddwd m5, m0 + movu m4, [r0 + 28] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m5, m4 + paddd m5, m1 +%ifidn %1, pp + psrad m3, 6 + psrad m5, 6 + packusdw m3, m5 + CLIPW m3, m6, m7 +%else + psrad m3, 2 + psrad m5, 2 + packssdw m3, m5 +%endif + movh [r2 + 16], m3 + movhps 
[r2 + 24], m3 +%endmacro + +cglobal chroma_filter_pp_16x1_internal + FILTER_W16_1 pp + ret + +cglobal chroma_filter_ps_16x1_internal + FILTER_W16_1 ps + ret + +%macro FILTER_W24_1 1 + movu m3, [r0] + pshufb m3, m3, m2 + pmaddwd m3, m0 + movu m4, [r0 + 4] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m1 + + movu m5, [r0 + 8] + pshufb m5, m5, m2 + pmaddwd m5, m0 + movu m4, [r0 + 12] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m5, m4 + paddd m5, m1 +%ifidn %1, pp + psrad m3, 6 + psrad m5, 6 + packusdw m3, m5 + CLIPW m3, m6, m7 +%else + psrad m3, 2 + psrad m5, 2 + packssdw m3, m5 +%endif + movh [r2], m3 + movhps [r2 + 8], m3 + + movu m3, [r0 + 16] + pshufb m3, m3, m2 + pmaddwd m3, m0 + movu m4, [r0 + 20] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m1 + + movu m5, [r0 + 24] + pshufb m5, m5, m2 + pmaddwd m5, m0 + movu m4, [r0 + 28] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m5, m4 + paddd m5, m1 +%ifidn %1, pp + psrad m3, 6 + psrad m5, 6 + packusdw m3, m5 + CLIPW m3, m6, m7 +%else + psrad m3, 2 + psrad m5, 2 + packssdw m3, m5 +%endif + movh [r2 + 16], m3 + movhps [r2 + 24], m3 + + movu m3, [r0 + 32] + pshufb m3, m3, m2 + pmaddwd m3, m0 + movu m4, [r0 + 36] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m1 + + movu m5, [r0 + 40] + pshufb m5, m5, m2 + pmaddwd m5, m0 + movu m4, [r0 + 44] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m5, m4 + paddd m5, m1 +%ifidn %1, pp + psrad m3, 6 + psrad m5, 6 + packusdw m3, m5 + CLIPW m3, m6, m7 +%else + psrad m3, 2 + psrad m5, 2 + packssdw m3, m5 +%endif + movh [r2 + 32], m3 + movhps [r2 + 40], m3 +%endmacro + +cglobal chroma_filter_pp_24x1_internal + FILTER_W24_1 pp + ret + +cglobal chroma_filter_ps_24x1_internal + FILTER_W24_1 ps + ret + +%macro FILTER_W32_1 1 + movu m3, [r0] + pshufb m3, m3, m2 + pmaddwd m3, m0 + movu m4, [r0 + 4] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m1 + + movu m5, [r0 + 8] + pshufb m5, m5, m2 + pmaddwd m5, m0 + movu m4, [r0 
+ 12] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m5, m4 + paddd m5, m1 +%ifidn %1, pp + psrad m3, 6 + psrad m5, 6 + packusdw m3, m5 + CLIPW m3, m6, m7 +%else + psrad m3, 2 + psrad m5, 2 + packssdw m3, m5 +%endif + movh [r2], m3 + movhps [r2 + 8], m3 + + movu m3, [r0 + 16] + pshufb m3, m3, m2 + pmaddwd m3, m0 + movu m4, [r0 + 20] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m1 + + movu m5, [r0 + 24] + pshufb m5, m5, m2 + pmaddwd m5, m0 + movu m4, [r0 + 28] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m5, m4 + paddd m5, m1 +%ifidn %1, pp + psrad m3, 6 + psrad m5, 6 + packusdw m3, m5 + CLIPW m3, m6, m7 +%else + psrad m3, 2 + psrad m5, 2 + packssdw m3, m5 +%endif + movh [r2 + 16], m3 + movhps [r2 + 24], m3 + + movu m3, [r0 + 32] + pshufb m3, m3, m2 + pmaddwd m3, m0 + movu m4, [r0 + 36] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m1 + + movu m5, [r0 + 40] + pshufb m5, m5, m2 + pmaddwd m5, m0 + movu m4, [r0 + 44] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m5, m4 + paddd m5, m1 +%ifidn %1, pp + psrad m3, 6 + psrad m5, 6 + packusdw m3, m5 + CLIPW m3, m6, m7 +%else + psrad m3, 2 + psrad m5, 2 + packssdw m3, m5 +%endif + movh [r2 + 32], m3 + movhps [r2 + 40], m3 + + movu m3, [r0 + 48] + pshufb m3, m3, m2 + pmaddwd m3, m0 + movu m4, [r0 + 52] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m1 + + movu m5, [r0 + 56] + pshufb m5, m5, m2 + pmaddwd m5, m0 + movu m4, [r0 + 60] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m5, m4 + paddd m5, m1 +%ifidn %1, pp + psrad m3, 6 + psrad m5, 6 + packusdw m3, m5 + CLIPW m3, m6, m7 +%else + psrad m3, 2 + psrad m5, 2 + packssdw m3, m5 +%endif + movh [r2 + 48], m3 + movhps [r2 + 56], m3 +%endmacro + +cglobal chroma_filter_pp_32x1_internal + FILTER_W32_1 pp + ret + +cglobal chroma_filter_ps_32x1_internal + FILTER_W32_1 ps + ret + +%macro FILTER_W8o_1 2 + movu m3, [r0 + %2] + pshufb m3, m3, m2 + pmaddwd m3, m0 + movu m4, [r0 + %2 + 4] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd 
m3, m4 + paddd m3, m1 + + movu m5, [r0 + %2 + 8] + pshufb m5, m5, m2 + pmaddwd m5, m0 + movu m4, [r0 + %2 + 12] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m5, m4 + paddd m5, m1 +%ifidn %1, pp + psrad m3, 6 + psrad m5, 6 + packusdw m3, m5 + CLIPW m3, m6, m7 +%else + psrad m3, 2 + psrad m5, 2 + packssdw m3, m5 +%endif + movh [r2 + %2], m3 + movhps [r2 + %2 + 8], m3 +%endmacro + +%macro FILTER_W48_1 1 + FILTER_W8o_1 %1, 0 + FILTER_W8o_1 %1, 16 + FILTER_W8o_1 %1, 32 + FILTER_W8o_1 %1, 48 + FILTER_W8o_1 %1, 64 + FILTER_W8o_1 %1, 80 +%endmacro + +cglobal chroma_filter_pp_48x1_internal + FILTER_W48_1 pp + ret + +cglobal chroma_filter_ps_48x1_internal + FILTER_W48_1 ps + ret + +%macro FILTER_W64_1 1 + FILTER_W8o_1 %1, 0 + FILTER_W8o_1 %1, 16 + FILTER_W8o_1 %1, 32 + FILTER_W8o_1 %1, 48 + FILTER_W8o_1 %1, 64 + FILTER_W8o_1 %1, 80 + FILTER_W8o_1 %1, 96 + FILTER_W8o_1 %1, 112 +%endmacro + +cglobal chroma_filter_pp_64x1_internal + FILTER_W64_1 pp + ret + +cglobal chroma_filter_ps_64x1_internal + FILTER_W64_1 ps + ret + + +;----------------------------------------------------------------------------- +; void interp_4tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- + +INIT_XMM sse4 +%macro IPFILTER_CHROMA 6 +cglobal interp_4tap_horiz_%3_%1x%2, 4, %5, %6 + + add r3, r3 + add r1, r1 + sub r0, 2 + mov r4d, r4m + add r4d, r4d + +%ifdef PIC + lea r%4, [tab_ChromaCoeff] + movh m0, [r%4 + r4 * 4] +%else + movh m0, [tab_ChromaCoeff + r4 * 4] +%endif + + punpcklqdq m0, m0 + mova m2, [tab_Tm16] + +%ifidn %3, ps + mova m1, [tab_c_n32768] + cmp r5m, byte 0 + je .skip + sub r0, r1 + call chroma_filter_%3_%1x1_internal + add r0, r1 + add r2, r3 + call chroma_filter_%3_%1x1_internal + add r0, r1 + add r2, r3 + call chroma_filter_%3_%1x1_internal + add r0, r1 + add r2, r3 +.skip: +%else + mova m1, [tab_c_32] + pxor m6, m6 + mova m7, [pw_pixel_max] +%endif + + call 
chroma_filter_%3_%1x1_internal +%rep %2 - 1 + add r0, r1 + add r2, r3 + call chroma_filter_%3_%1x1_internal +%endrep +RET +%endmacro +IPFILTER_CHROMA 6, 8, pp, 5, 6, 8 +IPFILTER_CHROMA 8, 2, pp, 5, 6, 8 +IPFILTER_CHROMA 8, 4, pp, 5, 6, 8 +IPFILTER_CHROMA 8, 6, pp, 5, 6, 8 +IPFILTER_CHROMA 8, 8, pp, 5, 6, 8 +IPFILTER_CHROMA 8, 16, pp, 5, 6, 8 +IPFILTER_CHROMA 8, 32, pp, 5, 6, 8 +IPFILTER_CHROMA 12, 16, pp, 5, 6, 8 +IPFILTER_CHROMA 16, 4, pp, 5, 6, 8 +IPFILTER_CHROMA 16, 8, pp, 5, 6, 8 +IPFILTER_CHROMA 16, 12, pp, 5, 6, 8 +IPFILTER_CHROMA 16, 16, pp, 5, 6, 8 +IPFILTER_CHROMA 16, 32, pp, 5, 6, 8 +IPFILTER_CHROMA 24, 32, pp, 5, 6, 8 +IPFILTER_CHROMA 32, 8, pp, 5, 6, 8 +IPFILTER_CHROMA 32, 16, pp, 5, 6, 8 +IPFILTER_CHROMA 32, 24, pp, 5, 6, 8 +IPFILTER_CHROMA 32, 32, pp, 5, 6, 8 + +IPFILTER_CHROMA 6, 8, ps, 6, 7, 6 +IPFILTER_CHROMA 8, 2, ps, 6, 7, 6 +IPFILTER_CHROMA 8, 4, ps, 6, 7, 6 +IPFILTER_CHROMA 8, 6, ps, 6, 7, 6 +IPFILTER_CHROMA 8, 8, ps, 6, 7, 6 +IPFILTER_CHROMA 8, 16, ps, 6, 7, 6 +IPFILTER_CHROMA 8, 32, ps, 6, 7, 6 +IPFILTER_CHROMA 12, 16, ps, 6, 7, 6 +IPFILTER_CHROMA 16, 4, ps, 6, 7, 6 +IPFILTER_CHROMA 16, 8, ps, 6, 7, 6 +IPFILTER_CHROMA 16, 12, ps, 6, 7, 6 +IPFILTER_CHROMA 16, 16, ps, 6, 7, 6 +IPFILTER_CHROMA 16, 32, ps, 6, 7, 6 +IPFILTER_CHROMA 24, 32, ps, 6, 7, 6 +IPFILTER_CHROMA 32, 8, ps, 6, 7, 6 +IPFILTER_CHROMA 32, 16, ps, 6, 7, 6 +IPFILTER_CHROMA 32, 24, ps, 6, 7, 6 +IPFILTER_CHROMA 32, 32, ps, 6, 7, 6 + +IPFILTER_CHROMA 6, 16, pp, 5, 6, 8 +IPFILTER_CHROMA 8, 12, pp, 5, 6, 8 +IPFILTER_CHROMA 8, 64, pp, 5, 6, 8 +IPFILTER_CHROMA 12, 32, pp, 5, 6, 8 +IPFILTER_CHROMA 16, 24, pp, 5, 6, 8 +IPFILTER_CHROMA 16, 64, pp, 5, 6, 8 +IPFILTER_CHROMA 24, 64, pp, 5, 6, 8 +IPFILTER_CHROMA 32, 48, pp, 5, 6, 8 +IPFILTER_CHROMA 32, 64, pp, 5, 6, 8 +IPFILTER_CHROMA 6, 16, ps, 6, 7, 6 +IPFILTER_CHROMA 8, 12, ps, 6, 7, 6 +IPFILTER_CHROMA 8, 64, ps, 6, 7, 6 +IPFILTER_CHROMA 12, 32, ps, 6, 7, 6 +IPFILTER_CHROMA 16, 24, ps, 6, 7, 6 +IPFILTER_CHROMA 16, 64, ps, 6, 7, 6 
+IPFILTER_CHROMA 24, 64, ps, 6, 7, 6 +IPFILTER_CHROMA 32, 48, ps, 6, 7, 6 +IPFILTER_CHROMA 32, 64, ps, 6, 7, 6 + +IPFILTER_CHROMA 48, 64, pp, 5, 6, 8 +IPFILTER_CHROMA 64, 48, pp, 5, 6, 8 +IPFILTER_CHROMA 64, 64, pp, 5, 6, 8 +IPFILTER_CHROMA 64, 32, pp, 5, 6, 8 +IPFILTER_CHROMA 64, 16, pp, 5, 6, 8 +IPFILTER_CHROMA 48, 64, ps, 6, 7, 6 +IPFILTER_CHROMA 64, 48, ps, 6, 7, 6 +IPFILTER_CHROMA 64, 64, ps, 6, 7, 6 +IPFILTER_CHROMA 64, 32, ps, 6, 7, 6 +IPFILTER_CHROMA 64, 16, ps, 6, 7, 6 + + +%macro PROCESS_CHROMA_SP_W4_4R 0 + movq m0, [r0] + movq m1, [r0 + r1] + punpcklwd m0, m1 ;m0=[0 1] + pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1 + + lea r0, [r0 + 2 * r1] + movq m4, [r0] + punpcklwd m1, m4 ;m1=[1 2] + pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2 + + movq m5, [r0 + r1] + punpcklwd m4, m5 ;m4=[2 3] + pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3 + pmaddwd m4, [r6 + 1 * 16] + paddd m0, m4 ;m0=[0+1+2+3] Row1 done + + lea r0, [r0 + 2 * r1] + movq m4, [r0] + punpcklwd m5, m4 ;m5=[3 4] + pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4 + pmaddwd m5, [r6 + 1 * 16] + paddd m1, m5 ;m1 = [1+2+3+4] Row2 + + movq m5, [r0 + r1] + punpcklwd m4, m5 ;m4=[4 5] + pmaddwd m4, [r6 + 1 * 16] + paddd m2, m4 ;m2=[2+3+4+5] Row3 + + movq m4, [r0 + 2 * r1] + punpcklwd m5, m4 ;m5=[5 6] + pmaddwd m5, [r6 + 1 * 16] + paddd m3, m5 ;m3=[3+4+5+6] Row4 +%endmacro + +;----------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_%3_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_CHROMA_SS 4 +INIT_XMM sse2 +cglobal interp_4tap_vert_%3_%1x%2, 5, 7, %4 ,0-gprsize + + add r1d, r1d + add r3d, r3d + sub r0, r1 + shl r4d, 5 + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + lea r6, [r5 + r4] +%else + lea r6, [tab_ChromaCoeffV + r4] +%endif + + mov dword [rsp], %2/4 + +%ifnidn %3, ss + 
%ifnidn %3, ps + mova m7, [pw_pixel_max] + %ifidn %3, pp + mova m6, [tab_c_32] + %else + mova m6, [tab_c_524800] + %endif + %else + mova m6, [tab_c_n32768] + %endif +%endif + +.loopH: + mov r4d, (%1/4) +.loopW: + PROCESS_CHROMA_SP_W4_4R + +%ifidn %3, ss + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 + + packssdw m0, m1 + packssdw m2, m3 +%elifidn %3, ps + paddd m0, m6 + paddd m1, m6 + paddd m2, m6 + paddd m3, m6 + psrad m0, 2 + psrad m1, 2 + psrad m2, 2 + psrad m3, 2 + + packssdw m0, m1 + packssdw m2, m3 +%else + paddd m0, m6 + paddd m1, m6 + paddd m2, m6 + paddd m3, m6 + %ifidn %3, pp + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 + %else + psrad m0, 10 + psrad m1, 10 + psrad m2, 10 + psrad m3, 10 + %endif + packssdw m0, m1 + packssdw m2, m3 + pxor m5, m5 + CLIPW2 m0, m2, m5, m7 +%endif + + movh [r2], m0 + movhps [r2 + r3], m0 + lea r5, [r2 + 2 * r3] + movh [r5], m2 + movhps [r5 + r3], m2 + + lea r5, [4 * r1 - 2 * 4] + sub r0, r5 + add r2, 2 * 4 + + dec r4d + jnz .loopW + + lea r0, [r0 + 4 * r1 - 2 * %1] + lea r2, [r2 + 4 * r3 - 2 * %1] + + dec dword [rsp] + jnz .loopH + + RET +%endmacro + + FILTER_VER_CHROMA_SS 4, 4, ss, 6 + FILTER_VER_CHROMA_SS 4, 8, ss, 6 + FILTER_VER_CHROMA_SS 16, 16, ss, 6 + FILTER_VER_CHROMA_SS 16, 8, ss, 6 + FILTER_VER_CHROMA_SS 16, 12, ss, 6 + FILTER_VER_CHROMA_SS 12, 16, ss, 6 + FILTER_VER_CHROMA_SS 16, 4, ss, 6 + FILTER_VER_CHROMA_SS 4, 16, ss, 6 + FILTER_VER_CHROMA_SS 32, 32, ss, 6 + FILTER_VER_CHROMA_SS 32, 16, ss, 6 + FILTER_VER_CHROMA_SS 16, 32, ss, 6 + FILTER_VER_CHROMA_SS 32, 24, ss, 6 + FILTER_VER_CHROMA_SS 24, 32, ss, 6 + FILTER_VER_CHROMA_SS 32, 8, ss, 6 + + FILTER_VER_CHROMA_SS 4, 4, ps, 7 + FILTER_VER_CHROMA_SS 4, 8, ps, 7 + FILTER_VER_CHROMA_SS 16, 16, ps, 7 + FILTER_VER_CHROMA_SS 16, 8, ps, 7 + FILTER_VER_CHROMA_SS 16, 12, ps, 7 + FILTER_VER_CHROMA_SS 12, 16, ps, 7 + FILTER_VER_CHROMA_SS 16, 4, ps, 7 + FILTER_VER_CHROMA_SS 4, 16, ps, 7 + FILTER_VER_CHROMA_SS 32, 32, ps, 7 + FILTER_VER_CHROMA_SS 32, 16, ps, 
7 + FILTER_VER_CHROMA_SS 16, 32, ps, 7 + FILTER_VER_CHROMA_SS 32, 24, ps, 7 + FILTER_VER_CHROMA_SS 24, 32, ps, 7 + FILTER_VER_CHROMA_SS 32, 8, ps, 7 + + FILTER_VER_CHROMA_SS 4, 4, sp, 8 + FILTER_VER_CHROMA_SS 4, 8, sp, 8 + FILTER_VER_CHROMA_SS 16, 16, sp, 8 + FILTER_VER_CHROMA_SS 16, 8, sp, 8 + FILTER_VER_CHROMA_SS 16, 12, sp, 8 + FILTER_VER_CHROMA_SS 12, 16, sp, 8 + FILTER_VER_CHROMA_SS 16, 4, sp, 8 + FILTER_VER_CHROMA_SS 4, 16, sp, 8 + FILTER_VER_CHROMA_SS 32, 32, sp, 8 + FILTER_VER_CHROMA_SS 32, 16, sp, 8 + FILTER_VER_CHROMA_SS 16, 32, sp, 8 + FILTER_VER_CHROMA_SS 32, 24, sp, 8 + FILTER_VER_CHROMA_SS 24, 32, sp, 8 + FILTER_VER_CHROMA_SS 32, 8, sp, 8 + + FILTER_VER_CHROMA_SS 4, 4, pp, 8 + FILTER_VER_CHROMA_SS 4, 8, pp, 8 + FILTER_VER_CHROMA_SS 16, 16, pp, 8 + FILTER_VER_CHROMA_SS 16, 8, pp, 8 + FILTER_VER_CHROMA_SS 16, 12, pp, 8 + FILTER_VER_CHROMA_SS 12, 16, pp, 8 + FILTER_VER_CHROMA_SS 16, 4, pp, 8 + FILTER_VER_CHROMA_SS 4, 16, pp, 8 + FILTER_VER_CHROMA_SS 32, 32, pp, 8 + FILTER_VER_CHROMA_SS 32, 16, pp, 8 + FILTER_VER_CHROMA_SS 16, 32, pp, 8 + FILTER_VER_CHROMA_SS 32, 24, pp, 8 + FILTER_VER_CHROMA_SS 24, 32, pp, 8 + FILTER_VER_CHROMA_SS 32, 8, pp, 8 + + + FILTER_VER_CHROMA_SS 16, 24, ss, 6 + FILTER_VER_CHROMA_SS 12, 32, ss, 6 + FILTER_VER_CHROMA_SS 4, 32, ss, 6 + FILTER_VER_CHROMA_SS 32, 64, ss, 6 + FILTER_VER_CHROMA_SS 16, 64, ss, 6 + FILTER_VER_CHROMA_SS 32, 48, ss, 6 + FILTER_VER_CHROMA_SS 24, 64, ss, 6 + + FILTER_VER_CHROMA_SS 16, 24, ps, 7 + FILTER_VER_CHROMA_SS 12, 32, ps, 7 + FILTER_VER_CHROMA_SS 4, 32, ps, 7 + FILTER_VER_CHROMA_SS 32, 64, ps, 7 + FILTER_VER_CHROMA_SS 16, 64, ps, 7 + FILTER_VER_CHROMA_SS 32, 48, ps, 7 + FILTER_VER_CHROMA_SS 24, 64, ps, 7 + + FILTER_VER_CHROMA_SS 16, 24, sp, 8 + FILTER_VER_CHROMA_SS 12, 32, sp, 8 + FILTER_VER_CHROMA_SS 4, 32, sp, 8 + FILTER_VER_CHROMA_SS 32, 64, sp, 8 + FILTER_VER_CHROMA_SS 16, 64, sp, 8 + FILTER_VER_CHROMA_SS 32, 48, sp, 8 + FILTER_VER_CHROMA_SS 24, 64, sp, 8 + + FILTER_VER_CHROMA_SS 16, 24, pp, 8 + 
FILTER_VER_CHROMA_SS 12, 32, pp, 8 + FILTER_VER_CHROMA_SS 4, 32, pp, 8 + FILTER_VER_CHROMA_SS 32, 64, pp, 8 + FILTER_VER_CHROMA_SS 16, 64, pp, 8 + FILTER_VER_CHROMA_SS 32, 48, pp, 8 + FILTER_VER_CHROMA_SS 24, 64, pp, 8 + + + FILTER_VER_CHROMA_SS 48, 64, ss, 6 + FILTER_VER_CHROMA_SS 64, 48, ss, 6 + FILTER_VER_CHROMA_SS 64, 64, ss, 6 + FILTER_VER_CHROMA_SS 64, 32, ss, 6 + FILTER_VER_CHROMA_SS 64, 16, ss, 6 + + FILTER_VER_CHROMA_SS 48, 64, ps, 7 + FILTER_VER_CHROMA_SS 64, 48, ps, 7 + FILTER_VER_CHROMA_SS 64, 64, ps, 7 + FILTER_VER_CHROMA_SS 64, 32, ps, 7 + FILTER_VER_CHROMA_SS 64, 16, ps, 7 + + FILTER_VER_CHROMA_SS 48, 64, sp, 8 + FILTER_VER_CHROMA_SS 64, 48, sp, 8 + FILTER_VER_CHROMA_SS 64, 64, sp, 8 + FILTER_VER_CHROMA_SS 64, 32, sp, 8 + FILTER_VER_CHROMA_SS 64, 16, sp, 8 + + FILTER_VER_CHROMA_SS 48, 64, pp, 8 + FILTER_VER_CHROMA_SS 64, 48, pp, 8 + FILTER_VER_CHROMA_SS 64, 64, pp, 8 + FILTER_VER_CHROMA_SS 64, 32, pp, 8 + FILTER_VER_CHROMA_SS 64, 16, pp, 8 + + +%macro PROCESS_CHROMA_SP_W2_4R 1 + movd m0, [r0] + movd m1, [r0 + r1] + punpcklwd m0, m1 ;m0=[0 1] + + lea r0, [r0 + 2 * r1] + movd m2, [r0] + punpcklwd m1, m2 ;m1=[1 2] + punpcklqdq m0, m1 ;m0=[0 1 1 2] + pmaddwd m0, [%1 + 0 *16] ;m0=[0+1 1+2] Row 1-2 + + movd m1, [r0 + r1] + punpcklwd m2, m1 ;m2=[2 3] + + lea r0, [r0 + 2 * r1] + movd m3, [r0] + punpcklwd m1, m3 ;m2=[3 4] + punpcklqdq m2, m1 ;m2=[2 3 3 4] + + pmaddwd m4, m2, [%1 + 1 * 16] ;m4=[2+3 3+4] Row 1-2 + pmaddwd m2, [%1 + 0 * 16] ;m2=[2+3 3+4] Row 3-4 + paddd m0, m4 ;m0=[0+1+2+3 1+2+3+4] Row 1-2 + + movd m1, [r0 + r1] + punpcklwd m3, m1 ;m3=[4 5] + + movd m4, [r0 + 2 * r1] + punpcklwd m1, m4 ;m1=[5 6] + punpcklqdq m3, m1 ;m2=[4 5 5 6] + pmaddwd m3, [%1 + 1 * 16] ;m3=[4+5 5+6] Row 3-4 + paddd m2, m3 ;m2=[2+3+4+5 3+4+5+6] Row 3-4 +%endmacro + +;--------------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vertical_%2_2x%1(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t 
dstStride, int coeffIdx) +;--------------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_CHROMA_W2 3 +INIT_XMM sse4 +cglobal interp_4tap_vert_%2_2x%1, 5, 6, %3 + + add r1d, r1d + add r3d, r3d + sub r0, r1 + shl r4d, 5 + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + lea r5, [r5 + r4] +%else + lea r5, [tab_ChromaCoeffV + r4] +%endif + + mov r4d, (%1/4) +%ifnidn %2, ss + %ifnidn %2, ps + pxor m7, m7 + mova m6, [pw_pixel_max] + %ifidn %2, pp + mova m5, [tab_c_32] + %else + mova m5, [tab_c_524800] + %endif + %else + mova m5, [tab_c_n32768] + %endif +%endif + +.loopH: + PROCESS_CHROMA_SP_W2_4R r5 +%ifidn %2, ss + psrad m0, 6 + psrad m2, 6 + packssdw m0, m2 +%elifidn %2, ps + paddd m0, m5 + paddd m2, m5 + psrad m0, 2 + psrad m2, 2 + packssdw m0, m2 +%else + paddd m0, m5 + paddd m2, m5 + %ifidn %2, pp + psrad m0, 6 + psrad m2, 6 + %else + psrad m0, 10 + psrad m2, 10 + %endif + packusdw m0, m2 + CLIPW m0, m7, m6 +%endif + + movd [r2], m0 + pextrd [r2 + r3], m0, 1 + lea r2, [r2 + 2 * r3] + pextrd [r2], m0, 2 + pextrd [r2 + r3], m0, 3 + + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loopH + + RET +%endmacro + +FILTER_VER_CHROMA_W2 4, ss, 5 +FILTER_VER_CHROMA_W2 8, ss, 5 + +FILTER_VER_CHROMA_W2 4, pp, 8 +FILTER_VER_CHROMA_W2 8, pp, 8 + +FILTER_VER_CHROMA_W2 4, ps, 6 +FILTER_VER_CHROMA_W2 8, ps, 6 + +FILTER_VER_CHROMA_W2 4, sp, 8 +FILTER_VER_CHROMA_W2 8, sp, 8 + +FILTER_VER_CHROMA_W2 16, ss, 5 +FILTER_VER_CHROMA_W2 16, pp, 8 +FILTER_VER_CHROMA_W2 16, ps, 6 +FILTER_VER_CHROMA_W2 16, sp, 8 + + +;--------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_%1_4x2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;--------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_CHROMA_W4 3 +INIT_XMM sse4 +cglobal interp_4tap_vert_%2_4x%1, 5, 
6, %3 + + add r1d, r1d + add r3d, r3d + sub r0, r1 + shl r4d, 5 + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + lea r5, [r5 + r4] +%else + lea r5, [tab_ChromaCoeffV + r4] +%endif + +%ifnidn %2, 2 + mov r4d, %1/2 +%endif + +%ifnidn %2, ss + %ifnidn %2, ps + pxor m6, m6 + mova m5, [pw_pixel_max] + %ifidn %2, pp + mova m4, [tab_c_32] + %else + mova m4, [tab_c_524800] + %endif + %else + mova m4, [tab_c_n32768] + %endif +%endif + +%ifnidn %2, 2 +.loop: +%endif + + movh m0, [r0] + movh m1, [r0 + r1] + punpcklwd m0, m1 ;m0=[0 1] + pmaddwd m0, [r5 + 0 *16] ;m0=[0+1] Row1 + + lea r0, [r0 + 2 * r1] + movh m2, [r0] + punpcklwd m1, m2 ;m1=[1 2] + pmaddwd m1, [r5 + 0 *16] ;m1=[1+2] Row2 + + movh m3, [r0 + r1] + punpcklwd m2, m3 ;m4=[2 3] + pmaddwd m2, [r5 + 1 * 16] + paddd m0, m2 ;m0=[0+1+2+3] Row1 done + + movh m2, [r0 + 2 * r1] + punpcklwd m3, m2 ;m5=[3 4] + pmaddwd m3, [r5 + 1 * 16] + paddd m1, m3 ;m1=[1+2+3+4] Row2 done + +%ifidn %2, ss + psrad m0, 6 + psrad m1, 6 + packssdw m0, m1 +%elifidn %2, ps + paddd m0, m4 + paddd m1, m4 + psrad m0, 2 + psrad m1, 2 + packssdw m0, m1 +%else + paddd m0, m4 + paddd m1, m4 + %ifidn %2, pp + psrad m0, 6 + psrad m1, 6 + %else + psrad m0, 10 + psrad m1, 10 + %endif + packusdw m0, m1 + CLIPW m0, m6, m5 +%endif + + movh [r2], m0 + movhps [r2 + r3], m0 + +%ifnidn %2, 2 + lea r2, [r2 + r3 * 2] + dec r4d + jnz .loop +%endif + + RET +%endmacro + +FILTER_VER_CHROMA_W4 2, ss, 4 +FILTER_VER_CHROMA_W4 2, pp, 7 +FILTER_VER_CHROMA_W4 2, ps, 5 +FILTER_VER_CHROMA_W4 2, sp, 7 + +FILTER_VER_CHROMA_W4 4, ss, 4 +FILTER_VER_CHROMA_W4 4, pp, 7 +FILTER_VER_CHROMA_W4 4, ps, 5 +FILTER_VER_CHROMA_W4 4, sp, 7 + +;------------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vertical_%1_6x8(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------------- +%macro 
FILTER_VER_CHROMA_W6 3 +INIT_XMM sse4 +cglobal interp_4tap_vert_%2_6x%1, 5, 7, %3 + + add r1d, r1d + add r3d, r3d + sub r0, r1 + shl r4d, 5 + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + lea r6, [r5 + r4] +%else + lea r6, [tab_ChromaCoeffV + r4] +%endif + + mov r4d, %1/4 + +%ifnidn %2, ss + %ifnidn %2, ps + mova m7, [pw_pixel_max] + %ifidn %2, pp + mova m6, [tab_c_32] + %else + mova m6, [tab_c_524800] + %endif + %else + mova m6, [tab_c_n32768] + %endif +%endif + +.loopH: + PROCESS_CHROMA_SP_W4_4R + +%ifidn %2, ss + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 + + packssdw m0, m1 + packssdw m2, m3 +%elifidn %2, ps + paddd m0, m6 + paddd m1, m6 + paddd m2, m6 + paddd m3, m6 + psrad m0, 2 + psrad m1, 2 + psrad m2, 2 + psrad m3, 2 + + packssdw m0, m1 + packssdw m2, m3 +%else + paddd m0, m6 + paddd m1, m6 + paddd m2, m6 + paddd m3, m6 + %ifidn %2, pp + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 + %else + psrad m0, 10 + psrad m1, 10 + psrad m2, 10 + psrad m3, 10 + %endif + packssdw m0, m1 + packssdw m2, m3 + pxor m5, m5 + CLIPW2 m0, m2, m5, m7 +%endif + + movh [r2], m0 + movhps [r2 + r3], m0 + lea r5, [r2 + 2 * r3] + movh [r5], m2 + movhps [r5 + r3], m2 + + lea r5, [4 * r1 - 2 * 4] + sub r0, r5 + add r2, 2 * 4 + + PROCESS_CHROMA_SP_W2_4R r6 + +%ifidn %2, ss + psrad m0, 6 + psrad m2, 6 + packssdw m0, m2 +%elifidn %2, ps + paddd m0, m6 + paddd m2, m6 + psrad m0, 2 + psrad m2, 2 + packssdw m0, m2 +%else + paddd m0, m6 + paddd m2, m6 + %ifidn %2, pp + psrad m0, 6 + psrad m2, 6 + %else + psrad m0, 10 + psrad m2, 10 + %endif + packusdw m0, m2 + CLIPW m0, m5, m7 +%endif + + movd [r2], m0 + pextrd [r2 + r3], m0, 1 + lea r2, [r2 + 2 * r3] + pextrd [r2], m0, 2 + pextrd [r2 + r3], m0, 3 + + sub r0, 2 * 4 + lea r2, [r2 + 2 * r3 - 2 * 4] + + dec r4d + jnz .loopH + + RET +%endmacro + +FILTER_VER_CHROMA_W6 8, ss, 6 +FILTER_VER_CHROMA_W6 8, ps, 7 +FILTER_VER_CHROMA_W6 8, sp, 8 +FILTER_VER_CHROMA_W6 8, pp, 8 + +FILTER_VER_CHROMA_W6 16, ss, 6 +FILTER_VER_CHROMA_W6 16, 
ps, 7 +FILTER_VER_CHROMA_W6 16, sp, 8 +FILTER_VER_CHROMA_W6 16, pp, 8 + +%macro PROCESS_CHROMA_SP_W8_2R 0 + movu m1, [r0] + movu m3, [r0 + r1] + punpcklwd m0, m1, m3 + pmaddwd m0, [r5 + 0 * 16] ;m0 = [0l+1l] Row1l + punpckhwd m1, m3 + pmaddwd m1, [r5 + 0 * 16] ;m1 = [0h+1h] Row1h + + movu m4, [r0 + 2 * r1] + punpcklwd m2, m3, m4 + pmaddwd m2, [r5 + 0 * 16] ;m2 = [1l+2l] Row2l + punpckhwd m3, m4 + pmaddwd m3, [r5 + 0 * 16] ;m3 = [1h+2h] Row2h + + lea r0, [r0 + 2 * r1] + movu m5, [r0 + r1] + punpcklwd m6, m4, m5 + pmaddwd m6, [r5 + 1 * 16] ;m6 = [2l+3l] Row1l + paddd m0, m6 ;m0 = [0l+1l+2l+3l] Row1l sum + punpckhwd m4, m5 + pmaddwd m4, [r5 + 1 * 16] ;m6 = [2h+3h] Row1h + paddd m1, m4 ;m1 = [0h+1h+2h+3h] Row1h sum + + movu m4, [r0 + 2 * r1] + punpcklwd m6, m5, m4 + pmaddwd m6, [r5 + 1 * 16] ;m6 = [3l+4l] Row2l + paddd m2, m6 ;m2 = [1l+2l+3l+4l] Row2l sum + punpckhwd m5, m4 + pmaddwd m5, [r5 + 1 * 16] ;m1 = [3h+4h] Row2h + paddd m3, m5 ;m3 = [1h+2h+3h+4h] Row2h sum +%endmacro + +;---------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_%3_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;---------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_CHROMA_W8 4 +INIT_XMM sse2 +cglobal interp_4tap_vert_%3_%1x%2, 5, 6, %4 + + add r1d, r1d + add r3d, r3d + sub r0, r1 + shl r4d, 5 + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + lea r5, [r5 + r4] +%else + lea r5, [tab_ChromaCoeffV + r4] +%endif + + mov r4d, %2/2 + +%ifidn %3, pp + mova m7, [tab_c_32] +%elifidn %3, sp + mova m7, [tab_c_524800] +%elifidn %3, ps + mova m7, [tab_c_n32768] +%endif + +.loopH: + PROCESS_CHROMA_SP_W8_2R + +%ifidn %3, ss + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 + + packssdw m0, m1 + packssdw m2, m3 +%elifidn %3, ps + paddd m0, m7 + paddd m1, m7 + paddd m2, m7 + paddd m3, m7 + psrad m0, 2 + psrad m1, 2 
+ psrad m2, 2 + psrad m3, 2 + + packssdw m0, m1 + packssdw m2, m3 +%else + paddd m0, m7 + paddd m1, m7 + paddd m2, m7 + paddd m3, m7 + %ifidn %3, pp + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 + %else + psrad m0, 10 + psrad m1, 10 + psrad m2, 10 + psrad m3, 10 + %endif + packssdw m0, m1 + packssdw m2, m3 + pxor m5, m5 + mova m6, [pw_pixel_max] + CLIPW2 m0, m2, m5, m6 +%endif + + movu [r2], m0 + movu [r2 + r3], m2 + + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loopH + + RET +%endmacro + +FILTER_VER_CHROMA_W8 8, 2, ss, 7 +FILTER_VER_CHROMA_W8 8, 4, ss, 7 +FILTER_VER_CHROMA_W8 8, 6, ss, 7 +FILTER_VER_CHROMA_W8 8, 8, ss, 7 +FILTER_VER_CHROMA_W8 8, 16, ss, 7 +FILTER_VER_CHROMA_W8 8, 32, ss, 7 + +FILTER_VER_CHROMA_W8 8, 2, sp, 8 +FILTER_VER_CHROMA_W8 8, 4, sp, 8 +FILTER_VER_CHROMA_W8 8, 6, sp, 8 +FILTER_VER_CHROMA_W8 8, 8, sp, 8 +FILTER_VER_CHROMA_W8 8, 16, sp, 8 +FILTER_VER_CHROMA_W8 8, 32, sp, 8 + +FILTER_VER_CHROMA_W8 8, 2, ps, 8 +FILTER_VER_CHROMA_W8 8, 4, ps, 8 +FILTER_VER_CHROMA_W8 8, 6, ps, 8 +FILTER_VER_CHROMA_W8 8, 8, ps, 8 +FILTER_VER_CHROMA_W8 8, 16, ps, 8 +FILTER_VER_CHROMA_W8 8, 32, ps, 8 + +FILTER_VER_CHROMA_W8 8, 2, pp, 8 +FILTER_VER_CHROMA_W8 8, 4, pp, 8 +FILTER_VER_CHROMA_W8 8, 6, pp, 8 +FILTER_VER_CHROMA_W8 8, 8, pp, 8 +FILTER_VER_CHROMA_W8 8, 16, pp, 8 +FILTER_VER_CHROMA_W8 8, 32, pp, 8 + +FILTER_VER_CHROMA_W8 8, 12, ss, 7 +FILTER_VER_CHROMA_W8 8, 64, ss, 7 +FILTER_VER_CHROMA_W8 8, 12, sp, 8 +FILTER_VER_CHROMA_W8 8, 64, sp, 8 +FILTER_VER_CHROMA_W8 8, 12, ps, 8 +FILTER_VER_CHROMA_W8 8, 64, ps, 8 +FILTER_VER_CHROMA_W8 8, 12, pp, 8 +FILTER_VER_CHROMA_W8 8, 64, pp, 8 + + +INIT_XMM sse2 +cglobal chroma_p2s, 3, 7, 3 + + ; load width and height + mov r3d, r3m + mov r4d, r4m + add r1, r1 + + ; load constant + mova m2, [tab_c_n8192] + +.loopH: + + xor r5d, r5d +.loopW: + lea r6, [r0 + r5 * 2] + + movu m0, [r6] + psllw m0, 4 + paddw m0, m2 + + movu m1, [r6 + r1] + psllw m1, 4 + paddw m1, m2 + + add r5d, 8 + cmp r5d, r3d + lea r6, [r2 + r5 * 2] + jg 
.width4 + movu [r6 + FENC_STRIDE / 2 * 0 - 16], m0 + movu [r6 + FENC_STRIDE / 2 * 2 - 16], m1 + je .nextH + jmp .loopW + +.width4: + test r3d, 4 + jz .width2 + test r3d, 2 + movh [r6 + FENC_STRIDE / 2 * 0 - 16], m0 + movh [r6 + FENC_STRIDE / 2 * 2 - 16], m1 + lea r6, [r6 + 8] + pshufd m0, m0, 2 + pshufd m1, m1, 2 + jz .nextH + +.width2: + movd [r6 + FENC_STRIDE / 2 * 0 - 16], m0 + movd [r6 + FENC_STRIDE / 2 * 2 - 16], m1 + +.nextH: + lea r0, [r0 + r1 * 2] + add r2, FENC_STRIDE / 2 * 4 + + sub r4d, 2 + jnz .loopH + + RET + +%macro PROCESS_LUMA_VER_W4_4R 0 + movq m0, [r0] + movq m1, [r0 + r1] + punpcklwd m0, m1 ;m0=[0 1] + pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1 + + lea r0, [r0 + 2 * r1] + movq m4, [r0] + punpcklwd m1, m4 ;m1=[1 2] + pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2 + + movq m5, [r0 + r1] + punpcklwd m4, m5 ;m4=[2 3] + pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3 + pmaddwd m4, [r6 + 1 * 16] + paddd m0, m4 ;m0=[0+1+2+3] Row1 + + lea r0, [r0 + 2 * r1] + movq m4, [r0] + punpcklwd m5, m4 ;m5=[3 4] + pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4 + pmaddwd m5, [r6 + 1 * 16] + paddd m1, m5 ;m1 = [1+2+3+4] Row2 + + movq m5, [r0 + r1] + punpcklwd m4, m5 ;m4=[4 5] + pmaddwd m6, m4, [r6 + 1 * 16] + paddd m2, m6 ;m2=[2+3+4+5] Row3 + pmaddwd m4, [r6 + 2 * 16] + paddd m0, m4 ;m0=[0+1+2+3+4+5] Row1 + + lea r0, [r0 + 2 * r1] + movq m4, [r0] + punpcklwd m5, m4 ;m5=[5 6] + pmaddwd m6, m5, [r6 + 1 * 16] + paddd m3, m6 ;m3=[3+4+5+6] Row4 + pmaddwd m5, [r6 + 2 * 16] + paddd m1, m5 ;m1=[1+2+3+4+5+6] Row2 + + movq m5, [r0 + r1] + punpcklwd m4, m5 ;m4=[6 7] + pmaddwd m6, m4, [r6 + 2 * 16] + paddd m2, m6 ;m2=[2+3+4+5+6+7] Row3 + pmaddwd m4, [r6 + 3 * 16] + paddd m0, m4 ;m0=[0+1+2+3+4+5+6+7] Row1 end + + lea r0, [r0 + 2 * r1] + movq m4, [r0] + punpcklwd m5, m4 ;m5=[7 8] + pmaddwd m6, m5, [r6 + 2 * 16] + paddd m3, m6 ;m3=[3+4+5+6+7+8] Row4 + pmaddwd m5, [r6 + 3 * 16] + paddd m1, m5 ;m1=[1+2+3+4+5+6+7+8] Row2 end + + movq m5, [r0 + r1] + punpcklwd m4, m5 ;m4=[8 9] + pmaddwd m4, [r6 + 3 * 
16] + paddd m2, m4 ;m2=[2+3+4+5+6+7+8+9] Row3 end + + movq m4, [r0 + 2 * r1] + punpcklwd m5, m4 ;m5=[9 10] + pmaddwd m5, [r6 + 3 * 16] + paddd m3, m5 ;m3=[3+4+5+6+7+8+9+10] Row4 end +%endmacro + +;-------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;-------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_LUMA_PP 2 +INIT_XMM sse4 +cglobal interp_8tap_vert_pp_%1x%2, 5, 7, 8 ,0-gprsize + + add r1d, r1d + add r3d, r3d + lea r5, [r1 + 2 * r1] + sub r0, r5 + shl r4d, 6 + +%ifdef PIC + lea r5, [tab_LumaCoeffV] + lea r6, [r5 + r4] +%else + lea r6, [tab_LumaCoeffV + r4] +%endif + + mova m7, [pd_32] + + mov dword [rsp], %2/4 +.loopH: + mov r4d, (%1/4) +.loopW: + PROCESS_LUMA_VER_W4_4R + + paddd m0, m7 + paddd m1, m7 + paddd m2, m7 + paddd m3, m7 + + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 + + packssdw m0, m1 + packssdw m2, m3 + + pxor m1, m1 + CLIPW2 m0, m2, m1, [pw_pixel_max] + + movh [r2], m0 + movhps [r2 + r3], m0 + lea r5, [r2 + 2 * r3] + movh [r5], m2 + movhps [r5 + r3], m2 + + lea r5, [8 * r1 - 2 * 4] + sub r0, r5 + add r2, 2 * 4 + + dec r4d + jnz .loopW + + lea r0, [r0 + 4 * r1 - 2 * %1] + lea r2, [r2 + 4 * r3 - 2 * %1] + + dec dword [rsp] + jnz .loopH + + RET +%endmacro + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- + FILTER_VER_LUMA_PP 4, 4 + FILTER_VER_LUMA_PP 8, 8 + FILTER_VER_LUMA_PP 8, 4 + FILTER_VER_LUMA_PP 4, 8 + FILTER_VER_LUMA_PP 16, 16 + FILTER_VER_LUMA_PP 16, 8 + FILTER_VER_LUMA_PP 8, 16 + FILTER_VER_LUMA_PP 16, 12 
+ FILTER_VER_LUMA_PP 12, 16 + FILTER_VER_LUMA_PP 16, 4 + FILTER_VER_LUMA_PP 4, 16 + FILTER_VER_LUMA_PP 32, 32 + FILTER_VER_LUMA_PP 32, 16 + FILTER_VER_LUMA_PP 16, 32 + FILTER_VER_LUMA_PP 32, 24 + FILTER_VER_LUMA_PP 24, 32 + FILTER_VER_LUMA_PP 32, 8 + FILTER_VER_LUMA_PP 8, 32 + FILTER_VER_LUMA_PP 64, 64 + FILTER_VER_LUMA_PP 64, 32 + FILTER_VER_LUMA_PP 32, 64 + FILTER_VER_LUMA_PP 64, 48 + FILTER_VER_LUMA_PP 48, 64 + FILTER_VER_LUMA_PP 64, 16 + FILTER_VER_LUMA_PP 16, 64 + +;--------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;--------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_LUMA_PS 2 +INIT_XMM sse4 +cglobal interp_8tap_vert_ps_%1x%2, 5, 7, 8 ,0-gprsize + + add r1d, r1d + add r3d, r3d + lea r5, [r1 + 2 * r1] + sub r0, r5 + shl r4d, 6 + +%ifdef PIC + lea r5, [tab_LumaCoeffV] + lea r6, [r5 + r4] +%else + lea r6, [tab_LumaCoeffV + r4] +%endif + + mova m7, [pd_n32768] + + mov dword [rsp], %2/4 +.loopH: + mov r4d, (%1/4) +.loopW: + PROCESS_LUMA_VER_W4_4R + + paddd m0, m7 + paddd m1, m7 + paddd m2, m7 + paddd m3, m7 + + psrad m0, 2 + psrad m1, 2 + psrad m2, 2 + psrad m3, 2 + + packssdw m0, m1 + packssdw m2, m3 + + movh [r2], m0 + movhps [r2 + r3], m0 + lea r5, [r2 + 2 * r3] + movh [r5], m2 + movhps [r5 + r3], m2 + + lea r5, [8 * r1 - 2 * 4] + sub r0, r5 + add r2, 2 * 4 + + dec r4d + jnz .loopW + + lea r0, [r0 + 4 * r1 - 2 * %1] + lea r2, [r2 + 4 * r3 - 2 * %1] + + dec dword [rsp] + jnz .loopH + + RET +%endmacro + +;--------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) 
+;--------------------------------------------------------------------------------------------------------------- + FILTER_VER_LUMA_PS 4, 4 + FILTER_VER_LUMA_PS 8, 8 + FILTER_VER_LUMA_PS 8, 4 + FILTER_VER_LUMA_PS 4, 8 + FILTER_VER_LUMA_PS 16, 16 + FILTER_VER_LUMA_PS 16, 8 + FILTER_VER_LUMA_PS 8, 16 + FILTER_VER_LUMA_PS 16, 12 + FILTER_VER_LUMA_PS 12, 16 + FILTER_VER_LUMA_PS 16, 4 + FILTER_VER_LUMA_PS 4, 16 + FILTER_VER_LUMA_PS 32, 32 + FILTER_VER_LUMA_PS 32, 16 + FILTER_VER_LUMA_PS 16, 32 + FILTER_VER_LUMA_PS 32, 24 + FILTER_VER_LUMA_PS 24, 32 + FILTER_VER_LUMA_PS 32, 8 + FILTER_VER_LUMA_PS 8, 32 + FILTER_VER_LUMA_PS 64, 64 + FILTER_VER_LUMA_PS 64, 32 + FILTER_VER_LUMA_PS 32, 64 + FILTER_VER_LUMA_PS 64, 48 + FILTER_VER_LUMA_PS 48, 64 + FILTER_VER_LUMA_PS 64, 16 + FILTER_VER_LUMA_PS 16, 64 + +;-------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;-------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_LUMA_SP 2 +INIT_XMM sse4 +cglobal interp_8tap_vert_sp_%1x%2, 5, 7, 8 ,0-gprsize + + add r1d, r1d + add r3d, r3d + lea r5, [r1 + 2 * r1] + sub r0, r5 + shl r4d, 6 + +%ifdef PIC + lea r5, [tab_LumaCoeffV] + lea r6, [r5 + r4] +%else + lea r6, [tab_LumaCoeffV + r4] +%endif + + mova m7, [tab_c_524800] + + mov dword [rsp], %2/4 +.loopH: + mov r4d, (%1/4) +.loopW: + PROCESS_LUMA_VER_W4_4R + + paddd m0, m7 + paddd m1, m7 + paddd m2, m7 + paddd m3, m7 + + psrad m0, 10 + psrad m1, 10 + psrad m2, 10 + psrad m3, 10 + + packssdw m0, m1 + packssdw m2, m3 + + pxor m1, m1 + CLIPW2 m0, m2, m1, [pw_pixel_max] + + movh [r2], m0 + movhps [r2 + r3], m0 + lea r5, [r2 + 2 * r3] + movh [r5], m2 + movhps [r5 + r3], m2 + + lea r5, [8 * r1 - 2 * 4] + sub r0, r5 + add r2, 2 * 4 + + dec r4d + jnz .loopW + + lea r0, [r0 + 4 * r1 - 2 * %1] + lea r2, [r2 + 4 
* r3 - 2 * %1] + + dec dword [rsp] + jnz .loopH + + RET +%endmacro + +;-------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;-------------------------------------------------------------------------------------------------------------- + FILTER_VER_LUMA_SP 4, 4 + FILTER_VER_LUMA_SP 8, 8 + FILTER_VER_LUMA_SP 8, 4 + FILTER_VER_LUMA_SP 4, 8 + FILTER_VER_LUMA_SP 16, 16 + FILTER_VER_LUMA_SP 16, 8 + FILTER_VER_LUMA_SP 8, 16 + FILTER_VER_LUMA_SP 16, 12 + FILTER_VER_LUMA_SP 12, 16 + FILTER_VER_LUMA_SP 16, 4 + FILTER_VER_LUMA_SP 4, 16 + FILTER_VER_LUMA_SP 32, 32 + FILTER_VER_LUMA_SP 32, 16 + FILTER_VER_LUMA_SP 16, 32 + FILTER_VER_LUMA_SP 32, 24 + FILTER_VER_LUMA_SP 24, 32 + FILTER_VER_LUMA_SP 32, 8 + FILTER_VER_LUMA_SP 8, 32 + FILTER_VER_LUMA_SP 64, 64 + FILTER_VER_LUMA_SP 64, 32 + FILTER_VER_LUMA_SP 32, 64 + FILTER_VER_LUMA_SP 64, 48 + FILTER_VER_LUMA_SP 48, 64 + FILTER_VER_LUMA_SP 64, 16 + FILTER_VER_LUMA_SP 16, 64 + +;----------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_LUMA_SS 2 +INIT_XMM sse2 +cglobal interp_8tap_vert_ss_%1x%2, 5, 7, 7 ,0-gprsize + + add r1d, r1d + add r3d, r3d + lea r5, [3 * r1] + sub r0, r5 + shl r4d, 6 + +%ifdef PIC + lea r5, [tab_LumaCoeffV] + lea r6, [r5 + r4] +%else + lea r6, [tab_LumaCoeffV + r4] +%endif + + mov dword [rsp], %2/4 +.loopH: + mov r4d, (%1/4) +.loopW: + PROCESS_LUMA_VER_W4_4R + + psrad m0, 6 + psrad m1, 6 + packssdw m0, m1 + movlps [r2], m0 + movhps [r2 + r3], m0 + + psrad m2, 6 + psrad m3, 6 + packssdw m2, m3 + movlps [r2 + 2 * r3], m2 + lea r5, [3 * r3] 
+ movhps [r2 + r5], m2 + + lea r5, [8 * r1 - 2 * 4] + sub r0, r5 + add r2, 2 * 4 + + dec r4d + jnz .loopW + + lea r0, [r0 + 4 * r1 - 2 * %1] + lea r2, [r2 + 4 * r3 - 2 * %1] + + dec dword [rsp] + jnz .loopH + + RET +%endmacro + + FILTER_VER_LUMA_SS 4, 4 + FILTER_VER_LUMA_SS 8, 8 + FILTER_VER_LUMA_SS 8, 4 + FILTER_VER_LUMA_SS 4, 8 + FILTER_VER_LUMA_SS 16, 16 + FILTER_VER_LUMA_SS 16, 8 + FILTER_VER_LUMA_SS 8, 16 + FILTER_VER_LUMA_SS 16, 12 + FILTER_VER_LUMA_SS 12, 16 + FILTER_VER_LUMA_SS 16, 4 + FILTER_VER_LUMA_SS 4, 16 + FILTER_VER_LUMA_SS 32, 32 + FILTER_VER_LUMA_SS 32, 16 + FILTER_VER_LUMA_SS 16, 32 + FILTER_VER_LUMA_SS 32, 24 + FILTER_VER_LUMA_SS 24, 32 + FILTER_VER_LUMA_SS 32, 8 + FILTER_VER_LUMA_SS 8, 32 + FILTER_VER_LUMA_SS 64, 64 + FILTER_VER_LUMA_SS 64, 32 + FILTER_VER_LUMA_SS 32, 64 + FILTER_VER_LUMA_SS 64, 48 + FILTER_VER_LUMA_SS 48, 64 + FILTER_VER_LUMA_SS 64, 16 + FILTER_VER_LUMA_SS 16, 64 + +;-------------------------------------------------------------------------------------------------- +; void filterConvertPelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height) +;-------------------------------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal luma_p2s, 3, 7, 5 + + add r1, r1 + + ; load width and height + mov r3d, r3m + mov r4d, r4m + + ; load constant + mova m4, [tab_c_n8192] + +.loopH: + + xor r5d, r5d +.loopW: + lea r6, [r0 + r5 * 2] + + movu m0, [r6] + psllw m0, 4 + paddw m0, m4 + + movu m1, [r6 + r1] + psllw m1, 4 + paddw m1, m4 + + movu m2, [r6 + r1 * 2] + psllw m2, 4 + paddw m2, m4 + + lea r6, [r6 + r1 * 2] + movu m3, [r6 + r1] + psllw m3, 4 + paddw m3, m4 + + add r5, 8 + cmp r5, r3 + jg .width4 + movu [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0 + movu [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1 + movu [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2 + movu [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3 + je .nextH + jmp .loopW + +.width4: + movh [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0 + movh [r2 
+ r5 * 2 + FENC_STRIDE * 2 - 16], m1 + movh [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2 + movh [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3 + +.nextH: + lea r0, [r0 + r1 * 4] + add r2, FENC_STRIDE * 8 + + sub r4d, 4 + jnz .loopH + + RET diff --git a/source/common/x86/ipfilter8.asm b/source/common/x86/ipfilter8.asm new file mode 100644 index 0000000..52fc42c --- /dev/null +++ b/source/common/x86/ipfilter8.asm @@ -0,0 +1,5599 @@ +;***************************************************************************** +;* Copyright (C) 2013 x265 project +;* +;* Authors: Min Chen +;* Nabajit Deka +;* Praveen Kumar Tiwari +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +;* +;* This program is also available under a commercial proprietary license. +;* For more information, contact us at license @ x265.com. 
+;*****************************************************************************/ + +%include "x86inc.asm" +%include "x86util.asm" + +SECTION_RODATA 32 +tab_Tm: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 + db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 + db 8, 9,10,11, 9,10,11,12,10,11,12,13,11,12,13, 14 + +ALIGN 32 +tab_Lm: db 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8 + db 2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10 + db 4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12 + db 6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14 + +tab_Vm: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 + db 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3 + +tab_Cm: db 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3 + +tab_c_512: times 8 dw 512 +tab_c_526336: times 4 dd 8192*64+2048 + +tab_ChromaCoeff: db 0, 64, 0, 0 + db -2, 58, 10, -2 + db -4, 54, 16, -2 + db -6, 46, 28, -4 + db -4, 36, 36, -4 + db -4, 28, 46, -6 + db -2, 16, 54, -4 + db -2, 10, 58, -2 + +tab_ChromaCoeffV: times 4 dw 0, 64 + times 4 dw 0, 0 + + times 4 dw -2, 58 + times 4 dw 10, -2 + + times 4 dw -4, 54 + times 4 dw 16, -2 + + times 4 dw -6, 46 + times 4 dw 28, -4 + + times 4 dw -4, 36 + times 4 dw 36, -4 + + times 4 dw -4, 28 + times 4 dw 46, -6 + + times 4 dw -2, 16 + times 4 dw 54, -4 + + times 4 dw -2, 10 + times 4 dw 58, -2 + +tab_LumaCoeff: db 0, 0, 0, 64, 0, 0, 0, 0 + db -1, 4, -10, 58, 17, -5, 1, 0 + db -1, 4, -11, 40, 40, -11, 4, -1 + db 0, 1, -5, 17, 58, -10, 4, -1 + +tab_LumaCoeffV: times 4 dw 0, 0 + times 4 dw 0, 64 + times 4 dw 0, 0 + times 4 dw 0, 0 + + times 4 dw -1, 4 + times 4 dw -10, 58 + times 4 dw 17, -5 + times 4 dw 1, 0 + + times 4 dw -1, 4 + times 4 dw -11, 40 + times 4 dw 40, -11 + times 4 dw 4, -1 + + times 4 dw 0, 1 + times 4 dw -5, 17 + times 4 dw 58, -10 + times 4 dw 4, -1 + +tab_LumaCoeffVer: times 8 db 0, 0 + times 8 db 0, 64 + times 8 db 0, 0 + times 8 db 0, 0 + + times 8 db -1, 4 + times 8 db -10, 58 + times 8 db 17, -5 + times 8 db 1, 0 + + times 8 db -1, 4 
+ times 8 db -11, 40 + times 8 db 40, -11 + times 8 db 4, -1 + + times 8 db 0, 1 + times 8 db -5, 17 + times 8 db 58, -10 + times 8 db 4, -1 + +tab_c_128: times 16 db 0x80 +tab_c_64_n64: times 8 db 64, -64 + + +SECTION .text + +cextern idct4_shuf1 +cextern pw_1 +cextern pw_512 +cextern pw_2000 + +%macro FILTER_H4_w2_2 3 + movh %2, [srcq - 1] + pshufb %2, %2, Tm0 + movh %1, [srcq + srcstrideq - 1] + pshufb %1, %1, Tm0 + punpcklqdq %2, %1 + pmaddubsw %2, coef2 + phaddw %2, %2 + pmulhrsw %2, %3 + packuswb %2, %2 + movd r4, %2 + mov [dstq], r4w + shr r4, 16 + mov [dstq + dststrideq], r4w +%endmacro + +;----------------------------------------------------------------------------- +; void interp_4tap_horiz_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal interp_4tap_horiz_pp_2x4, 4, 6, 5, src, srcstride, dst, dststride +%define coef2 m4 +%define Tm0 m3 +%define t2 m2 +%define t1 m1 +%define t0 m0 + +mov r4d, r4m + +%ifdef PIC +lea r5, [tab_ChromaCoeff] +movd coef2, [r5 + r4 * 4] +%else +movd coef2, [tab_ChromaCoeff + r4 * 4] +%endif + +pshufd coef2, coef2, 0 +mova t2, [tab_c_512] +mova Tm0, [tab_Tm] + +%rep 2 +FILTER_H4_w2_2 t0, t1, t2 +lea srcq, [srcq + srcstrideq * 2] +lea dstq, [dstq + dststrideq * 2] +%endrep + +RET + +;----------------------------------------------------------------------------- +; void interp_4tap_horiz_pp_2x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal interp_4tap_horiz_pp_2x8, 4, 6, 5, src, srcstride, dst, dststride +%define coef2 m4 +%define Tm0 m3 +%define t2 m2 +%define t1 m1 +%define t0 m0 + +mov r4d, r4m + +%ifdef PIC +lea r5, [tab_ChromaCoeff] +movd coef2, [r5 + r4 * 4] +%else +movd coef2, [tab_ChromaCoeff + r4 * 4] +%endif + +pshufd coef2, coef2, 0 +mova t2, 
[tab_c_512] +mova Tm0, [tab_Tm] + +%rep 4 +FILTER_H4_w2_2 t0, t1, t2 +lea srcq, [srcq + srcstrideq * 2] +lea dstq, [dstq + dststrideq * 2] +%endrep + +RET + +;----------------------------------------------------------------------------- +; void interp_4tap_horiz_pp_2x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal interp_4tap_horiz_pp_2x16, 4, 6, 5, src, srcstride, dst, dststride +%define coef2 m4 +%define Tm0 m3 +%define t2 m2 +%define t1 m1 +%define t0 m0 + +mov r4d, r4m + +%ifdef PIC +lea r5, [tab_ChromaCoeff] +movd coef2, [r5 + r4 * 4] +%else +movd coef2, [tab_ChromaCoeff + r4 * 4] +%endif + +pshufd coef2, coef2, 0 +mova t2, [tab_c_512] +mova Tm0, [tab_Tm] + +mov r5d, 16/2 + +.loop: +FILTER_H4_w2_2 t0, t1, t2 +lea srcq, [srcq + srcstrideq * 2] +lea dstq, [dstq + dststrideq * 2] +dec r5d +jnz .loop + +RET + +%macro FILTER_H4_w4_2 3 + movh %2, [srcq - 1] + pshufb %2, %2, Tm0 + pmaddubsw %2, coef2 + movh %1, [srcq + srcstrideq - 1] + pshufb %1, %1, Tm0 + pmaddubsw %1, coef2 + phaddw %2, %1 + pmulhrsw %2, %3 + packuswb %2, %2 + movd [dstq], %2 + palignr %2, %2, 4 + movd [dstq + dststrideq], %2 +%endmacro + +;----------------------------------------------------------------------------- +; void interp_4tap_horiz_pp_4x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal interp_4tap_horiz_pp_4x2, 4, 6, 5, src, srcstride, dst, dststride +%define coef2 m4 +%define Tm0 m3 +%define t2 m2 +%define t1 m1 +%define t0 m0 + +mov r4d, r4m + +%ifdef PIC +lea r5, [tab_ChromaCoeff] +movd coef2, [r5 + r4 * 4] +%else +movd coef2, [tab_ChromaCoeff + r4 * 4] +%endif + +pshufd coef2, coef2, 0 +mova t2, [tab_c_512] +mova Tm0, [tab_Tm] + +FILTER_H4_w4_2 t0, t1, t2 + +RET + 
+;----------------------------------------------------------------------------- +; void interp_4tap_horiz_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal interp_4tap_horiz_pp_4x4, 4, 6, 5, src, srcstride, dst, dststride +%define coef2 m4 +%define Tm0 m3 +%define t2 m2 +%define t1 m1 +%define t0 m0 + +mov r4d, r4m + +%ifdef PIC +lea r5, [tab_ChromaCoeff] +movd coef2, [r5 + r4 * 4] +%else +movd coef2, [tab_ChromaCoeff + r4 * 4] +%endif + +pshufd coef2, coef2, 0 +mova t2, [tab_c_512] +mova Tm0, [tab_Tm] + +%rep 2 +FILTER_H4_w4_2 t0, t1, t2 +lea srcq, [srcq + srcstrideq * 2] +lea dstq, [dstq + dststrideq * 2] +%endrep + +RET + +;----------------------------------------------------------------------------- +; void interp_4tap_horiz_pp_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal interp_4tap_horiz_pp_4x8, 4, 6, 5, src, srcstride, dst, dststride +%define coef2 m4 +%define Tm0 m3 +%define t2 m2 +%define t1 m1 +%define t0 m0 + +mov r4d, r4m + +%ifdef PIC +lea r5, [tab_ChromaCoeff] +movd coef2, [r5 + r4 * 4] +%else +movd coef2, [tab_ChromaCoeff + r4 * 4] +%endif + +pshufd coef2, coef2, 0 +mova t2, [tab_c_512] +mova Tm0, [tab_Tm] + +%rep 4 +FILTER_H4_w4_2 t0, t1, t2 +lea srcq, [srcq + srcstrideq * 2] +lea dstq, [dstq + dststrideq * 2] +%endrep + +RET + +;----------------------------------------------------------------------------- +; void interp_4tap_horiz_pp_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal interp_4tap_horiz_pp_4x16, 4, 6, 5, src, srcstride, dst, dststride +%define coef2 m4 +%define Tm0 m3 +%define t2 m2 +%define t1 m1 +%define t0 m0 + +mov r4d, 
r4m + +%ifdef PIC +lea r5, [tab_ChromaCoeff] +movd coef2, [r5 + r4 * 4] +%else +movd coef2, [tab_ChromaCoeff + r4 * 4] +%endif + +pshufd coef2, coef2, 0 +mova t2, [tab_c_512] +mova Tm0, [tab_Tm] + +%rep 8 +FILTER_H4_w4_2 t0, t1, t2 +lea srcq, [srcq + srcstrideq * 2] +lea dstq, [dstq + dststrideq * 2] +%endrep + +RET + +;----------------------------------------------------------------------------- +; void interp_4tap_horiz_pp_4x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal interp_4tap_horiz_pp_4x32, 4, 6, 5, src, srcstride, dst, dststride +%define coef2 m4 +%define Tm0 m3 +%define t2 m2 +%define t1 m1 +%define t0 m0 + +mov r4d, r4m + +%ifdef PIC +lea r5, [tab_ChromaCoeff] +movd coef2, [r5 + r4 * 4] +%else +movd coef2, [tab_ChromaCoeff + r4 * 4] +%endif + +pshufd coef2, coef2, 0 +mova t2, [tab_c_512] +mova Tm0, [tab_Tm] + +mov r5d, 32/2 + +.loop: +FILTER_H4_w4_2 t0, t1, t2 +lea srcq, [srcq + srcstrideq * 2] +lea dstq, [dstq + dststrideq * 2] +dec r5d +jnz .loop + +RET + + +%macro FILTER_H4_w6 3 + movu %1, [srcq - 1] + pshufb %2, %1, Tm0 + pmaddubsw %2, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %2, %1 + pmulhrsw %2, %3 + packuswb %2, %2 + movd [dstq], %2 + pextrw [dstq + 4], %2, 2 +%endmacro + +%macro FILTER_H4_w8 3 + movu %1, [srcq - 1] + pshufb %2, %1, Tm0 + pmaddubsw %2, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %2, %1 + pmulhrsw %2, %3 + packuswb %2, %2 + movh [dstq], %2 +%endmacro + +%macro FILTER_H4_w12 3 + movu %1, [srcq - 1] + pshufb %2, %1, Tm0 + pmaddubsw %2, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %2, %1 + pmulhrsw %2, %3 + movu %1, [srcq - 1 + 8] + pshufb %1, %1, Tm0 + pmaddubsw %1, coef2 + phaddw %1, %1 + pmulhrsw %1, %3 + packuswb %2, %1 + movh [dstq], %2 + pextrd [dstq + 8], %2, 2 +%endmacro + +%macro FILTER_H4_w16 4 + movu %1, [srcq - 1] + pshufb %2, %1, Tm0 + 
pmaddubsw %2, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %2, %1 + movu %1, [srcq - 1 + 8] + pshufb %4, %1, Tm0 + pmaddubsw %4, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %4, %1 + pmulhrsw %2, %3 + pmulhrsw %4, %3 + packuswb %2, %4 + movu [dstq], %2 +%endmacro + +%macro FILTER_H4_w24 4 + movu %1, [srcq - 1] + pshufb %2, %1, Tm0 + pmaddubsw %2, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %2, %1 + movu %1, [srcq - 1 + 8] + pshufb %4, %1, Tm0 + pmaddubsw %4, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %4, %1 + pmulhrsw %2, %3 + pmulhrsw %4, %3 + packuswb %2, %4 + movu [dstq], %2 + movu %1, [srcq - 1 + 16] + pshufb %2, %1, Tm0 + pmaddubsw %2, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %2, %1 + pmulhrsw %2, %3 + packuswb %2, %2 + movh [dstq + 16], %2 +%endmacro + +%macro FILTER_H4_w32 4 + movu %1, [srcq - 1] + pshufb %2, %1, Tm0 + pmaddubsw %2, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %2, %1 + movu %1, [srcq - 1 + 8] + pshufb %4, %1, Tm0 + pmaddubsw %4, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %4, %1 + pmulhrsw %2, %3 + pmulhrsw %4, %3 + packuswb %2, %4 + movu [dstq], %2 + movu %1, [srcq - 1 + 16] + pshufb %2, %1, Tm0 + pmaddubsw %2, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %2, %1 + movu %1, [srcq - 1 + 24] + pshufb %4, %1, Tm0 + pmaddubsw %4, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %4, %1 + pmulhrsw %2, %3 + pmulhrsw %4, %3 + packuswb %2, %4 + movu [dstq + 16], %2 +%endmacro + +%macro FILTER_H4_w16o 5 + movu %1, [srcq + %5 - 1] + pshufb %2, %1, Tm0 + pmaddubsw %2, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %2, %1 + movu %1, [srcq + %5 - 1 + 8] + pshufb %4, %1, Tm0 + pmaddubsw %4, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %4, %1 + pmulhrsw %2, %3 + pmulhrsw %4, %3 + packuswb %2, %4 + movu [dstq + %5], %2 +%endmacro + +%macro FILTER_H4_w48 4 + FILTER_H4_w16o %1, %2, %3, %4, 0 + FILTER_H4_w16o %1, %2, 
%3, %4, 16 + FILTER_H4_w16o %1, %2, %3, %4, 32 +%endmacro + +%macro FILTER_H4_w64 4 + FILTER_H4_w16o %1, %2, %3, %4, 0 + FILTER_H4_w16o %1, %2, %3, %4, 16 + FILTER_H4_w16o %1, %2, %3, %4, 32 + FILTER_H4_w16o %1, %2, %3, %4, 48 +%endmacro + +;----------------------------------------------------------------------------- +; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +%macro IPFILTER_CHROMA 2 +INIT_XMM sse4 +cglobal interp_4tap_horiz_pp_%1x%2, 4, 6, 6, src, srcstride, dst, dststride +%define coef2 m5 +%define Tm0 m4 +%define Tm1 m3 +%define t2 m2 +%define t1 m1 +%define t0 m0 + +mov r4d, r4m + +%ifdef PIC +lea r5, [tab_ChromaCoeff] +movd coef2, [r5 + r4 * 4] +%else +movd coef2, [tab_ChromaCoeff + r4 * 4] +%endif + +mov r5d, %2 + +pshufd coef2, coef2, 0 +mova t2, [tab_c_512] +mova Tm0, [tab_Tm] +mova Tm1, [tab_Tm + 16] + +.loop: +FILTER_H4_w%1 t0, t1, t2 +add srcq, srcstrideq +add dstq, dststrideq + +dec r5d +jnz .loop + +RET +%endmacro + + +IPFILTER_CHROMA 6, 8 +IPFILTER_CHROMA 8, 2 +IPFILTER_CHROMA 8, 4 +IPFILTER_CHROMA 8, 6 +IPFILTER_CHROMA 8, 8 +IPFILTER_CHROMA 8, 16 +IPFILTER_CHROMA 8, 32 +IPFILTER_CHROMA 12, 16 + +IPFILTER_CHROMA 6, 16 +IPFILTER_CHROMA 8, 12 +IPFILTER_CHROMA 8, 64 +IPFILTER_CHROMA 12, 32 + +;----------------------------------------------------------------------------- +; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +%macro IPFILTER_CHROMA_W 2 +INIT_XMM sse4 +cglobal interp_4tap_horiz_pp_%1x%2, 4, 6, 7, src, srcstride, dst, dststride +%define coef2 m6 +%define Tm0 m5 +%define Tm1 m4 +%define t3 m3 +%define t2 m2 +%define t1 m1 +%define t0 m0 + +mov r4d, r4m + +%ifdef PIC +lea r5, [tab_ChromaCoeff] +movd coef2, [r5 + r4 * 4] +%else +movd coef2, [tab_ChromaCoeff + 
r4 * 4] +%endif + +mov r5d, %2 + +pshufd coef2, coef2, 0 +mova t2, [tab_c_512] +mova Tm0, [tab_Tm] +mova Tm1, [tab_Tm + 16] + +.loop: +FILTER_H4_w%1 t0, t1, t2, t3 +add srcq, srcstrideq +add dstq, dststrideq + +dec r5d +jnz .loop + +RET +%endmacro + +IPFILTER_CHROMA_W 16, 4 +IPFILTER_CHROMA_W 16, 8 +IPFILTER_CHROMA_W 16, 12 +IPFILTER_CHROMA_W 16, 16 +IPFILTER_CHROMA_W 16, 32 +IPFILTER_CHROMA_W 32, 8 +IPFILTER_CHROMA_W 32, 16 +IPFILTER_CHROMA_W 32, 24 +IPFILTER_CHROMA_W 24, 32 +IPFILTER_CHROMA_W 32, 32 + +IPFILTER_CHROMA_W 16, 24 +IPFILTER_CHROMA_W 16, 64 +IPFILTER_CHROMA_W 32, 48 +IPFILTER_CHROMA_W 24, 64 +IPFILTER_CHROMA_W 32, 64 + +IPFILTER_CHROMA_W 64, 64 +IPFILTER_CHROMA_W 64, 32 +IPFILTER_CHROMA_W 64, 48 +IPFILTER_CHROMA_W 48, 64 +IPFILTER_CHROMA_W 64, 16 + + +%macro FILTER_H8_W8 7-8 ; t0, t1, t2, t3, coef, c512, src, dst + movu %1, %7 + pshufb %2, %1, [tab_Lm + 0] + pmaddubsw %2, %5 + pshufb %3, %1, [tab_Lm + 16] + pmaddubsw %3, %5 + phaddw %2, %3 + pshufb %4, %1, [tab_Lm + 32] + pmaddubsw %4, %5 + pshufb %1, %1, [tab_Lm + 48] + pmaddubsw %1, %5 + phaddw %4, %1 + phaddw %2, %4 + %if %0 == 8 + pmulhrsw %2, %6 + packuswb %2, %2 + movh %8, %2 + %endif +%endmacro + +%macro FILTER_H8_W4 2 + movu %1, [r0 - 3 + r5] + pshufb %2, %1, [tab_Lm] + pmaddubsw %2, m3 + pshufb m7, %1, [tab_Lm + 16] + pmaddubsw m7, m3 + phaddw %2, m7 + phaddw %2, %2 +%endmacro + +;---------------------------------------------------------------------------------------------------------------------------- +; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;---------------------------------------------------------------------------------------------------------------------------- +%macro IPFILTER_LUMA 3 +INIT_XMM sse4 +cglobal interp_8tap_horiz_%3_%1x%2, 4,7,8 + + mov r4d, r4m + +%ifdef PIC + lea r6, [tab_LumaCoeff] + movh m3, [r6 + r4 * 8] +%else + movh m3, [tab_LumaCoeff + r4 * 8] +%endif + punpcklqdq m3, m3 + +%ifidn 
%3, pp + mova m2, [tab_c_512] +%else + mova m2, [pw_2000] +%endif + + mov r4d, %2 +%ifidn %3, ps + add r3, r3 + cmp r5m, byte 0 + je .loopH + lea r6, [r1 + 2 * r1] + sub r0, r6 + add r4d, 7 +%endif + +.loopH: + xor r5, r5 +%rep %1 / 8 + %ifidn %3, pp + FILTER_H8_W8 m0, m1, m4, m5, m3, m2, [r0 - 3 + r5], [r2 + r5] + %else + FILTER_H8_W8 m0, m1, m4, m5, m3, UNUSED, [r0 - 3 + r5] + psubw m1, m2 + movu [r2 + 2 * r5], m1 + %endif + add r5, 8 +%endrep + +%rep (%1 % 8) / 4 + FILTER_H8_W4 m0, m1 + %ifidn %3, pp + pmulhrsw m1, m2 + packuswb m1, m1 + movd [r2 + r5], m1 + %else + psubw m1, m2 + movh [r2 + 2 * r5], m1 + %endif +%endrep + + add r0, r1 + add r2, r3 + + dec r4d + jnz .loopH + RET +%endmacro + + +INIT_YMM avx2 +cglobal interp_8tap_horiz_pp_4x4, 4,6,6 + mov r4d, r4m + +%ifdef PIC + lea r5, [tab_LumaCoeff] + vpbroadcastq m0, [r5 + r4 * 8] +%else + vpbroadcastq m0, [tab_LumaCoeff + r4 * 8] +%endif + + mova m1, [tab_Lm] + vpbroadcastd m2, [pw_1] + + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 + + sub r0, 3 + ; Row 0-1 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + phaddd m3, m4 ; DWORD [R1D R1C R0D R0C R1B R1A R0B R0A] + + ; Row 2-3 + lea r0, [r0 + r1 * 2] + vbroadcasti128 m4, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + vbroadcasti128 m5, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m5, m1 + pmaddubsw m5, m0 + pmaddwd m5, m2 + phaddd m4, m5 ; DWORD [R3D R3C R2D R2C R3B R3A R2B R2A] + + packssdw m3, m4 ; WORD [R3D R3C R2D R2C R1D R1C R0D R0C R3B R3A R2B R2A R1B R1A R0B R0A] + pmulhrsw m3, [pw_512] + vextracti128 xm4, m3, 1 + packuswb xm3, xm4 ; BYTE [R3D R3C R2D R2C R1D R1C R0D R0C R3B R3A R2B R2A R1B R1A R0B R0A] + pshufb xm3, [idct4_shuf1] ; [row3 row1 row2 row0] + + lea 
r0, [r3 * 3] + movd [r2], xm3 + pextrd [r2+r3], xm3, 2 + pextrd [r2+r3*2], xm3, 1 + pextrd [r2+r0], xm3, 3 + RET + + +;-------------------------------------------------------------------------------------------------------------- +; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;-------------------------------------------------------------------------------------------------------------- + IPFILTER_LUMA 4, 4, pp + IPFILTER_LUMA 4, 8, pp + IPFILTER_LUMA 12, 16, pp + IPFILTER_LUMA 4, 16, pp + +;-------------------------------------------------------------------------------------------------------------- +; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;-------------------------------------------------------------------------------------------------------------- +%macro IPFILTER_LUMA_PP_W8 2 +INIT_XMM sse4 +cglobal interp_8tap_horiz_pp_%1x%2, 4,6,7 + mov r4d, r4m + +%ifdef PIC + lea r5, [tab_LumaCoeff] + movh m3, [r5 + r4 * 8] +%else + movh m3, [tab_LumaCoeff + r4 * 8] +%endif + pshufd m0, m3, 0 ; m0 = coeff-L + pshufd m1, m3, 0x55 ; m1 = coeff-H + lea r5, [tab_Tm] ; r5 = shuffle + mova m2, [pw_512] ; m2 = 512 + + mov r4d, %2 +.loopH: +%assign x 0 +%rep %1 / 8 + movu m3, [r0 - 3 + x] ; m3 = [F E D C B A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m3, [r5 + 0*16] ; m4 = [6 5 4 3 5 4 3 2 4 3 2 1 3 2 1 0] + pshufb m5, m3, [r5 + 1*16] ; m5 = [A 9 8 7 9 8 7 6 8 7 6 5 7 6 5 4] + pshufb m3, [r5 + 2*16] ; m3 = [E D C B D C B A C B A 9 B A 9 8] + pmaddubsw m4, m0 + pmaddubsw m6, m5, m1 + pmaddubsw m5, m0 + pmaddubsw m3, m1 + paddw m4, m6 + paddw m5, m3 + phaddw m4, m5 + pmulhrsw m4, m2 + packuswb m4, m4 + movh [r2 + x], m4 +%assign x x+8 +%endrep + + add r0, r1 + add r2, r3 + + dec r4d + jnz .loopH + RET +%endmacro + +IPFILTER_LUMA_PP_W8 8, 4 +IPFILTER_LUMA_PP_W8 8, 8 +IPFILTER_LUMA_PP_W8 8, 16 +IPFILTER_LUMA_PP_W8 8, 32 +IPFILTER_LUMA_PP_W8 16, 4 
+IPFILTER_LUMA_PP_W8 16, 8 +IPFILTER_LUMA_PP_W8 16, 12 +IPFILTER_LUMA_PP_W8 16, 16 +IPFILTER_LUMA_PP_W8 16, 32 +IPFILTER_LUMA_PP_W8 16, 64 +IPFILTER_LUMA_PP_W8 24, 32 +IPFILTER_LUMA_PP_W8 32, 8 +IPFILTER_LUMA_PP_W8 32, 16 +IPFILTER_LUMA_PP_W8 32, 24 +IPFILTER_LUMA_PP_W8 32, 32 +IPFILTER_LUMA_PP_W8 32, 64 +IPFILTER_LUMA_PP_W8 48, 64 +IPFILTER_LUMA_PP_W8 64, 16 +IPFILTER_LUMA_PP_W8 64, 32 +IPFILTER_LUMA_PP_W8 64, 48 +IPFILTER_LUMA_PP_W8 64, 64 + +;---------------------------------------------------------------------------------------------------------------------------- +; void interp_8tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;---------------------------------------------------------------------------------------------------------------------------- + IPFILTER_LUMA 4, 4, ps + IPFILTER_LUMA 8, 8, ps + IPFILTER_LUMA 8, 4, ps + IPFILTER_LUMA 4, 8, ps + IPFILTER_LUMA 16, 16, ps + IPFILTER_LUMA 16, 8, ps + IPFILTER_LUMA 8, 16, ps + IPFILTER_LUMA 16, 12, ps + IPFILTER_LUMA 12, 16, ps + IPFILTER_LUMA 16, 4, ps + IPFILTER_LUMA 4, 16, ps + IPFILTER_LUMA 32, 32, ps + IPFILTER_LUMA 32, 16, ps + IPFILTER_LUMA 16, 32, ps + IPFILTER_LUMA 32, 24, ps + IPFILTER_LUMA 24, 32, ps + IPFILTER_LUMA 32, 8, ps + IPFILTER_LUMA 8, 32, ps + IPFILTER_LUMA 64, 64, ps + IPFILTER_LUMA 64, 32, ps + IPFILTER_LUMA 32, 64, ps + IPFILTER_LUMA 64, 48, ps + IPFILTER_LUMA 48, 64, ps + IPFILTER_LUMA 64, 16, ps + IPFILTER_LUMA 16, 64, ps + +;----------------------------------------------------------------------------- +; Interpolate HV +;----------------------------------------------------------------------------- +%macro FILTER_HV8_START 7 ; (t0, t1, t2, t3, t4, off_src, off_coeff) -> (t3, t5), (t4, t1), [2] + mova %5, [r0 + (%6 + 0) * 16] + mova %1, [r0 + (%6 + 1) * 16] + mova %2, [r0 + (%6 + 2) * 16] + punpcklwd %3, %5, %1 + punpckhwd %5, %1 + pmaddwd %3, [r5 + (%7) * 16] ; R3 = L[0+1] -- Row 0 + pmaddwd %5, [r5 + (%7) * 16] ; R0 = 
H[0+1] + punpcklwd %4, %1, %2 + punpckhwd %1, %2 + pmaddwd %4, [r5 + (%7) * 16] ; R4 = L[1+2] -- Row 1 + pmaddwd %1, [r5 + (%7) * 16] ; R1 = H[1+2] +%endmacro ; FILTER_HV8_START + +%macro FILTER_HV8_MID 10 ; (Row3, prevRow, sum0L, sum1L, sum0H, sum1H, t6, t7, off_src, off_coeff) -> [6] + mova %8, [r0 + (%9 + 0) * 16] + mova %1, [r0 + (%9 + 1) * 16] + punpcklwd %7, %2, %8 + punpckhwd %2, %8 + pmaddwd %7, [r5 + %10 * 16] + pmaddwd %2, [r5 + %10 * 16] + paddd %3, %7 ; R3 = L[0+1+2+3] -- Row 0 + paddd %5, %2 ; R0 = H[0+1+2+3] + punpcklwd %7, %8, %1 + punpckhwd %8, %1 + pmaddwd %7, [r5 + %10 * 16] + pmaddwd %8, [r5 + %10 * 16] + paddd %4, %7 ; R4 = L[1+2+3+4] -- Row 1 + paddd %6, %8 ; R1 = H[1+2+3+4] +%endmacro ; FILTER_HV8_MID + +; Round and Saturate +%macro FILTER_HV8_END 4 ; output in [1, 3] + paddd %1, [tab_c_526336] + paddd %2, [tab_c_526336] + paddd %3, [tab_c_526336] + paddd %4, [tab_c_526336] + psrad %1, 12 + psrad %2, 12 + psrad %3, 12 + psrad %4, 12 + packssdw %1, %2 + packssdw %3, %4 + + ; TODO: is merge better? 
I think this way is short dependency link + packuswb %1, %3 +%endmacro ; FILTER_HV8_END + +;----------------------------------------------------------------------------- +; void interp_8tap_hv_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY) +;----------------------------------------------------------------------------- +INIT_XMM ssse3 +cglobal interp_8tap_hv_pp_8x8, 4, 7, 8, 0-15*16 +%define coef m7 +%define stk_buf rsp + + mov r4d, r4m + mov r5d, r5m + +%ifdef PIC + lea r6, [tab_LumaCoeff] + movh coef, [r6 + r4 * 8] +%else + movh coef, [tab_LumaCoeff + r4 * 8] +%endif + punpcklqdq coef, coef + + ; move to row -3 + lea r6, [r1 + r1 * 2] + sub r0, r6 + + xor r6, r6 + mov r4, rsp + +.loopH: + FILTER_H8_W8 m0, m1, m2, m3, coef, [tab_c_512], [r0 - 3] + psubw m1, [pw_2000] + mova [r4], m1 + + add r0, r1 + add r4, 16 + inc r6 + cmp r6, 8+7 + jnz .loopH + + ; ready to phase V + ; Here all of mN is free + + ; load coeff table + shl r5, 6 + lea r6, [tab_LumaCoeffV] + lea r5, [r5 + r6] + + ; load intermedia buffer + mov r0, stk_buf + + ; register mapping + ; r0 - src + ; r5 - coeff + ; r6 - loop_i + + ; let's go + xor r6, r6 + + ; TODO: this loop have more than 70 instructions, I think it is more than Intel loop decode cache +.loopV: + + FILTER_HV8_START m1, m2, m3, m4, m0, 0, 0 + FILTER_HV8_MID m6, m2, m3, m4, m0, m1, m7, m5, 3, 1 + FILTER_HV8_MID m5, m6, m3, m4, m0, m1, m7, m2, 5, 2 + FILTER_HV8_MID m6, m5, m3, m4, m0, m1, m7, m2, 7, 3 + FILTER_HV8_END m3, m0, m4, m1 + + movh [r2], m3 + movhps [r2 + r3], m3 + + lea r0, [r0 + 16 * 2] + lea r2, [r2 + r3 * 2] + + inc r6 + cmp r6, 8/2 + jnz .loopV + + RET + +;----------------------------------------------------------------------------- +;void interp_4tap_vert_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal interp_4tap_vert_pp_2x4, 4, 6, 8 + 
+mov r4d, r4m +sub r0, r1 + +%ifdef PIC +lea r5, [tab_ChromaCoeff] +movd m0, [r5 + r4 * 4] +%else +movd m0, [tab_ChromaCoeff + r4 * 4] +%endif +lea r4, [r1 * 3] +lea r5, [r0 + 4 * r1] +pshufb m0, [tab_Cm] +mova m1, [tab_c_512] + +movd m2, [r0] +movd m3, [r0 + r1] +movd m4, [r0 + 2 * r1] +movd m5, [r0 + r4] + +punpcklbw m2, m3 +punpcklbw m6, m4, m5 +punpcklbw m2, m6 + +pmaddubsw m2, m0 + +movd m6, [r5] + +punpcklbw m3, m4 +punpcklbw m7, m5, m6 +punpcklbw m3, m7 + +pmaddubsw m3, m0 + +phaddw m2, m3 + +pmulhrsw m2, m1 + +movd m7, [r5 + r1] + +punpcklbw m4, m5 +punpcklbw m3, m6, m7 +punpcklbw m4, m3 + +pmaddubsw m4, m0 + +movd m3, [r5 + 2 * r1] + +punpcklbw m5, m6 +punpcklbw m7, m3 +punpcklbw m5, m7 + +pmaddubsw m5, m0 + +phaddw m4, m5 + +pmulhrsw m4, m1 +packuswb m2, m4 + +pextrw [r2], m2, 0 +pextrw [r2 + r3], m2, 2 +lea r2, [r2 + 2 * r3] +pextrw [r2], m2, 4 +pextrw [r2 + r3], m2, 6 + +RET + +;----------------------------------------------------------------------------- +; void interp_4tap_vert_pp_2x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +%macro FILTER_V4_W2_H4 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_pp_2x%2, 4, 6, 8 + +mov r4d, r4m +sub r0, r1 + +%ifdef PIC +lea r5, [tab_ChromaCoeff] +movd m0, [r5 + r4 * 4] +%else +movd m0, [tab_ChromaCoeff + r4 * 4] +%endif + +pshufb m0, [tab_Cm] + +mova m1, [tab_c_512] + +mov r4d, %2 +lea r5, [3 * r1] + +.loop: +movd m2, [r0] +movd m3, [r0 + r1] +movd m4, [r0 + 2 * r1] +movd m5, [r0 + r5] + +punpcklbw m2, m3 +punpcklbw m6, m4, m5 +punpcklbw m2, m6 + +pmaddubsw m2, m0 + +lea r0, [r0 + 4 * r1] +movd m6, [r0] + +punpcklbw m3, m4 +punpcklbw m7, m5, m6 +punpcklbw m3, m7 + +pmaddubsw m3, m0 + +phaddw m2, m3 + +pmulhrsw m2, m1 + +movd m7, [r0 + r1] + +punpcklbw m4, m5 +punpcklbw m3, m6, m7 +punpcklbw m4, m3 + +pmaddubsw m4, m0 + +movd m3, [r0 + 2 * r1] + +punpcklbw m5, m6 +punpcklbw m7, m3 +punpcklbw m5, m7 + 
+pmaddubsw m5, m0 + +phaddw m4, m5 + +pmulhrsw m4, m1 +packuswb m2, m4 + +pextrw [r2], m2, 0 +pextrw [r2 + r3], m2, 2 +lea r2, [r2 + 2 * r3] +pextrw [r2], m2, 4 +pextrw [r2 + r3], m2, 6 + +lea r2, [r2 + 2 * r3] + +sub r4, 4 +jnz .loop +RET +%endmacro + +FILTER_V4_W2_H4 2, 8 + +FILTER_V4_W2_H4 2, 16 + +;----------------------------------------------------------------------------- +; void interp_4tap_vert_pp_4x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal interp_4tap_vert_pp_4x2, 4, 6, 6 + +mov r4d, r4m +sub r0, r1 + +%ifdef PIC +lea r5, [tab_ChromaCoeff] +movd m0, [r5 + r4 * 4] +%else +movd m0, [tab_ChromaCoeff + r4 * 4] +%endif + +pshufb m0, [tab_Cm] +lea r5, [r0 + 2 * r1] + +movd m2, [r0] +movd m3, [r0 + r1] +movd m4, [r5] +movd m5, [r5 + r1] + +punpcklbw m2, m3 +punpcklbw m1, m4, m5 +punpcklbw m2, m1 + +pmaddubsw m2, m0 + +movd m1, [r0 + 4 * r1] + +punpcklbw m3, m4 +punpcklbw m5, m1 +punpcklbw m3, m5 + +pmaddubsw m3, m0 + +phaddw m2, m3 + +pmulhrsw m2, [tab_c_512] +packuswb m2, m2 +movd [r2], m2 +pextrd [r2 + r3], m2, 1 + +RET + +;----------------------------------------------------------------------------- +; void interp_4tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal interp_4tap_vert_pp_4x4, 4, 6, 8 + +mov r4d, r4m +sub r0, r1 + +%ifdef PIC +lea r5, [tab_ChromaCoeff] +movd m0, [r5 + r4 * 4] +%else +movd m0, [tab_ChromaCoeff + r4 * 4] +%endif + +pshufb m0, [tab_Cm] +mova m1, [tab_c_512] +lea r5, [r0 + 4 * r1] +lea r4, [r1 * 3] + +movd m2, [r0] +movd m3, [r0 + r1] +movd m4, [r0 + 2 * r1] +movd m5, [r0 + r4] + +punpcklbw m2, m3 +punpcklbw m6, m4, m5 +punpcklbw m2, m6 + +pmaddubsw m2, m0 + +movd m6, [r5] + +punpcklbw m3, m4 +punpcklbw m7, m5, m6 +punpcklbw m3, m7 + +pmaddubsw m3, 
m0 + +phaddw m2, m3 + +pmulhrsw m2, m1 + +movd m7, [r5 + r1] + +punpcklbw m4, m5 +punpcklbw m3, m6, m7 +punpcklbw m4, m3 + +pmaddubsw m4, m0 + +movd m3, [r5 + 2 * r1] + +punpcklbw m5, m6 +punpcklbw m7, m3 +punpcklbw m5, m7 + +pmaddubsw m5, m0 + +phaddw m4, m5 + +pmulhrsw m4, m1 + +packuswb m2, m4 +movd [r2], m2 +pextrd [r2 + r3], m2, 1 +lea r2, [r2 + 2 * r3] +pextrd [r2], m2, 2 +pextrd [r2 + r3], m2, 3 + +RET + +;----------------------------------------------------------------------------- +; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +%macro FILTER_V4_W4_H4 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8 + +mov r4d, r4m +sub r0, r1 + +%ifdef PIC +lea r5, [tab_ChromaCoeff] +movd m0, [r5 + r4 * 4] +%else +movd m0, [tab_ChromaCoeff + r4 * 4] +%endif + +pshufb m0, [tab_Cm] + +mova m1, [tab_c_512] + +mov r4d, %2 + +lea r5, [3 * r1] + +.loop: +movd m2, [r0] +movd m3, [r0 + r1] +movd m4, [r0 + 2 * r1] +movd m5, [r0 + r5] + +punpcklbw m2, m3 +punpcklbw m6, m4, m5 +punpcklbw m2, m6 + +pmaddubsw m2, m0 + +lea r0, [r0 + 4 * r1] +movd m6, [r0] + +punpcklbw m3, m4 +punpcklbw m7, m5, m6 +punpcklbw m3, m7 + +pmaddubsw m3, m0 + +phaddw m2, m3 + +pmulhrsw m2, m1 + +movd m7, [r0 + r1] + +punpcklbw m4, m5 +punpcklbw m3, m6, m7 +punpcklbw m4, m3 + +pmaddubsw m4, m0 + +movd m3, [r0 + 2 * r1] + +punpcklbw m5, m6 +punpcklbw m7, m3 +punpcklbw m5, m7 + +pmaddubsw m5, m0 + +phaddw m4, m5 + +pmulhrsw m4, m1 +packuswb m2, m4 +movd [r2], m2 +pextrd [r2 + r3], m2, 1 +lea r2, [r2 + 2 * r3] +pextrd [r2], m2, 2 +pextrd [r2 + r3], m2, 3 + +lea r2, [r2 + 2 * r3] + +sub r4, 4 +jnz .loop +RET +%endmacro + +FILTER_V4_W4_H4 4, 8 +FILTER_V4_W4_H4 4, 16 + +FILTER_V4_W4_H4 4, 32 + +%macro FILTER_V4_W8_H2 0 +punpcklbw m1, m2 +punpcklbw m7, m3, m0 + +pmaddubsw m1, m6 +pmaddubsw m7, m5 + +paddw m1, m7 + +pmulhrsw m1, m4 +packuswb m1, m1 +%endmacro + 
+%macro FILTER_V4_W8_H3 0 +punpcklbw m2, m3 +punpcklbw m7, m0, m1 + +pmaddubsw m2, m6 +pmaddubsw m7, m5 + +paddw m2, m7 + +pmulhrsw m2, m4 +packuswb m2, m2 +%endmacro + +%macro FILTER_V4_W8_H4 0 +punpcklbw m3, m0 +punpcklbw m7, m1, m2 + +pmaddubsw m3, m6 +pmaddubsw m7, m5 + +paddw m3, m7 + +pmulhrsw m3, m4 +packuswb m3, m3 +%endmacro + +%macro FILTER_V4_W8_H5 0 +punpcklbw m0, m1 +punpcklbw m7, m2, m3 + +pmaddubsw m0, m6 +pmaddubsw m7, m5 + +paddw m0, m7 + +pmulhrsw m0, m4 +packuswb m0, m0 +%endmacro + +%macro FILTER_V4_W8_8x2 2 +FILTER_V4_W8 %1, %2 +movq m0, [r0 + 4 * r1] + +FILTER_V4_W8_H2 + +movh [r2 + r3], m1 +%endmacro + +%macro FILTER_V4_W8_8x4 2 +FILTER_V4_W8_8x2 %1, %2 +;8x3 +lea r6, [r0 + 4 * r1] +movq m1, [r6 + r1] + +FILTER_V4_W8_H3 + +movh [r2 + 2 * r3], m2 + +;8x4 +movq m2, [r6 + 2 * r1] + +FILTER_V4_W8_H4 + +lea r5, [r2 + 2 * r3] +movh [r5 + r3], m3 +%endmacro + +%macro FILTER_V4_W8_8x6 2 +FILTER_V4_W8_8x4 %1, %2 +;8x5 +lea r6, [r6 + 2 * r1] +movq m3, [r6 + r1] + +FILTER_V4_W8_H5 + +movh [r2 + 4 * r3], m0 + +;8x6 +movq m0, [r0 + 8 * r1] + +FILTER_V4_W8_H2 + +lea r5, [r2 + 4 * r3] +movh [r5 + r3], m1 +%endmacro + +;----------------------------------------------------------------------------- +; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +%macro FILTER_V4_W8 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8 + +mov r4d, r4m + +sub r0, r1 +movq m0, [r0] +movq m1, [r0 + r1] +movq m2, [r0 + 2 * r1] +lea r5, [r0 + 2 * r1] +movq m3, [r5 + r1] + +punpcklbw m0, m1 +punpcklbw m4, m2, m3 + +%ifdef PIC +lea r6, [tab_ChromaCoeff] +movd m5, [r6 + r4 * 4] +%else +movd m5, [tab_ChromaCoeff + r4 * 4] +%endif + +pshufb m6, m5, [tab_Vm] +pmaddubsw m0, m6 + +pshufb m5, [tab_Vm + 16] +pmaddubsw m4, m5 + +paddw m0, m4 + +mova m4, [tab_c_512] + +pmulhrsw m0, m4 +packuswb m0, m0 +movh [r2], m0 +%endmacro + 
+;----------------------------------------------------------------------------- +; void interp_4tap_vert_pp_8x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +FILTER_V4_W8_8x2 8, 2 + +RET + +;----------------------------------------------------------------------------- +; void interp_4tap_vert_pp_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +FILTER_V4_W8_8x4 8, 4 + +RET + +;----------------------------------------------------------------------------- +; void interp_4tap_vert_pp_8x6(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +FILTER_V4_W8_8x6 8, 6 + +RET + +;------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_ps_4x2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal interp_4tap_vert_ps_4x2, 4, 6, 6 + +mov r4d, r4m +sub r0, r1 +add r3d, r3d + +%ifdef PIC +lea r5, [tab_ChromaCoeff] +movd m0, [r5 + r4 * 4] +%else +movd m0, [tab_ChromaCoeff + r4 * 4] +%endif + +pshufb m0, [tab_Cm] + +movd m2, [r0] +movd m3, [r0 + r1] +lea r5, [r0 + 2 * r1] +movd m4, [r5] +movd m5, [r5 + r1] + +punpcklbw m2, m3 +punpcklbw m1, m4, m5 +punpcklbw m2, m1 + +pmaddubsw m2, m0 + +movd m1, [r0 + 4 * r1] + +punpcklbw m3, m4 +punpcklbw m5, m1 +punpcklbw m3, m5 + +pmaddubsw m3, m0 + +phaddw m2, m3 + +psubw m2, [pw_2000] +movh [r2], m2 +movhps [r2 + r3], m2 + +RET + +;------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_ps_4x4(pixel *src, 
intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal interp_4tap_vert_ps_4x4, 4, 6, 7 + + mov r4d, r4m + sub r0, r1 + add r3d, r3d + +%ifdef PIC + lea r5, [tab_ChromaCoeff] + movd m0, [r5 + r4 * 4] +%else + movd m0, [tab_ChromaCoeff + r4 * 4] +%endif + + pshufb m0, [tab_Cm] + + lea r4, [r1 * 3] + lea r5, [r0 + 4 * r1] + + movd m2, [r0] + movd m3, [r0 + r1] + movd m4, [r0 + 2 * r1] + movd m5, [r0 + r4] + + punpcklbw m2, m3 + punpcklbw m6, m4, m5 + punpcklbw m2, m6 + + pmaddubsw m2, m0 + + movd m6, [r5] + + punpcklbw m3, m4 + punpcklbw m1, m5, m6 + punpcklbw m3, m1 + + pmaddubsw m3, m0 + + phaddw m2, m3 + + mova m1, [pw_2000] + + psubw m2, m1 + movh [r2], m2 + movhps [r2 + r3], m2 + + movd m2, [r5 + r1] + + punpcklbw m4, m5 + punpcklbw m3, m6, m2 + punpcklbw m4, m3 + + pmaddubsw m4, m0 + + movd m3, [r5 + 2 * r1] + + punpcklbw m5, m6 + punpcklbw m2, m3 + punpcklbw m5, m2 + + pmaddubsw m5, m0 + + phaddw m4, m5 + + psubw m4, m1 + lea r2, [r2 + 2 * r3] + movh [r2], m4 + movhps [r2 + r3], m4 + + RET + +;--------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;--------------------------------------------------------------------------------------------------------------- +%macro FILTER_V_PS_W4_H4 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8 + + mov r4d, r4m + sub r0, r1 + add r3d, r3d + +%ifdef PIC + lea r5, [tab_ChromaCoeff] + movd m0, [r5 + r4 * 4] +%else + movd m0, [tab_ChromaCoeff + r4 * 4] +%endif + + pshufb m0, [tab_Cm] + + mova m1, [pw_2000] + + mov r4d, %2/4 + lea r5, [3 * r1] + +.loop: + movd m2, [r0] + movd m3, [r0 + r1] + movd m4, [r0 + 2 * r1] + movd m5, [r0 + r5] + + punpcklbw m2, m3 + punpcklbw m6, m4, m5 + punpcklbw m2, m6 + + 
pmaddubsw m2, m0 + + lea r0, [r0 + 4 * r1] + movd m6, [r0] + + punpcklbw m3, m4 + punpcklbw m7, m5, m6 + punpcklbw m3, m7 + + pmaddubsw m3, m0 + + phaddw m2, m3 + + psubw m2, m1 + movh [r2], m2 + movhps [r2 + r3], m2 + + movd m2, [r0 + r1] + + punpcklbw m4, m5 + punpcklbw m3, m6, m2 + punpcklbw m4, m3 + + pmaddubsw m4, m0 + + movd m3, [r0 + 2 * r1] + + punpcklbw m5, m6 + punpcklbw m2, m3 + punpcklbw m5, m2 + + pmaddubsw m5, m0 + + phaddw m4, m5 + + psubw m4, m1 + lea r2, [r2 + 2 * r3] + movh [r2], m4 + movhps [r2 + r3], m4 + + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loop + RET +%endmacro + +FILTER_V_PS_W4_H4 4, 8 +FILTER_V_PS_W4_H4 4, 16 + +FILTER_V_PS_W4_H4 4, 32 + +;-------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_ps_8x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;-------------------------------------------------------------------------------------------------------------- +%macro FILTER_V_PS_W8_H8_H16_H2 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 7 + + mov r4d, r4m + sub r0, r1 + add r3d, r3d + +%ifdef PIC + lea r5, [tab_ChromaCoeff] + movd m5, [r5 + r4 * 4] +%else + movd m5, [tab_ChromaCoeff + r4 * 4] +%endif + + pshufb m6, m5, [tab_Vm] + pshufb m5, [tab_Vm + 16] + mova m4, [pw_2000] + + mov r4d, %2/2 + lea r5, [3 * r1] + +.loopH: + movq m0, [r0] + movq m1, [r0 + r1] + movq m2, [r0 + 2 * r1] + movq m3, [r0 + r5] + + punpcklbw m0, m1 + punpcklbw m1, m2 + punpcklbw m2, m3 + + pmaddubsw m0, m6 + pmaddubsw m2, m5 + + paddw m0, m2 + + psubw m0, m4 + movu [r2], m0 + + movq m0, [r0 + 4 * r1] + + punpcklbw m3, m0 + + pmaddubsw m1, m6 + pmaddubsw m3, m5 + + paddw m1, m3 + psubw m1, m4 + + movu [r2 + r3], m1 + + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loopH + + RET +%endmacro + +FILTER_V_PS_W8_H8_H16_H2 8, 2 +FILTER_V_PS_W8_H8_H16_H2 8, 4 +FILTER_V_PS_W8_H8_H16_H2 8, 6 + +FILTER_V_PS_W8_H8_H16_H2 8, 12 
+FILTER_V_PS_W8_H8_H16_H2 8, 64 + +;-------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_ps_8x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;-------------------------------------------------------------------------------------------------------------- +%macro FILTER_V_PS_W8_H8_H16_H32 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8 + + mov r4d, r4m + sub r0, r1 + add r3d, r3d + +%ifdef PIC + lea r5, [tab_ChromaCoeff] + movd m5, [r5 + r4 * 4] +%else + movd m5, [tab_ChromaCoeff + r4 * 4] +%endif + + pshufb m6, m5, [tab_Vm] + pshufb m5, [tab_Vm + 16] + mova m4, [pw_2000] + + mov r4d, %2/4 + lea r5, [3 * r1] + +.loop: + movq m0, [r0] + movq m1, [r0 + r1] + movq m2, [r0 + 2 * r1] + movq m3, [r0 + r5] + + punpcklbw m0, m1 + punpcklbw m1, m2 + punpcklbw m2, m3 + + pmaddubsw m0, m6 + pmaddubsw m7, m2, m5 + + paddw m0, m7 + + psubw m0, m4 + movu [r2], m0 + + lea r0, [r0 + 4 * r1] + movq m0, [r0] + + punpcklbw m3, m0 + + pmaddubsw m1, m6 + pmaddubsw m7, m3, m5 + + paddw m1, m7 + + psubw m1, m4 + movu [r2 + r3], m1 + + movq m1, [r0 + r1] + + punpcklbw m0, m1 + + pmaddubsw m2, m6 + pmaddubsw m0, m5 + + paddw m2, m0 + + psubw m2, m4 + lea r2, [r2 + 2 * r3] + movu [r2], m2 + + movq m2, [r0 + 2 * r1] + + punpcklbw m1, m2 + + pmaddubsw m3, m6 + pmaddubsw m1, m5 + + paddw m3, m1 + psubw m3, m4 + + movu [r2 + r3], m3 + + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loop + RET +%endmacro + +FILTER_V_PS_W8_H8_H16_H32 8, 8 +FILTER_V_PS_W8_H8_H16_H32 8, 16 +FILTER_V_PS_W8_H8_H16_H32 8, 32 + +;------------------------------------------------------------------------------------------------------------ +;void interp_4tap_vert_ps_6x8(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------ +%macro FILTER_V_PS_W6 2 +INIT_XMM sse4 
+cglobal interp_4tap_vert_ps_6x%2, 4, 6, 8 + + mov r4d, r4m + sub r0, r1 + add r3d, r3d + +%ifdef PIC + lea r5, [tab_ChromaCoeff] + movd m5, [r5 + r4 * 4] +%else + movd m5, [tab_ChromaCoeff + r4 * 4] +%endif + + pshufb m6, m5, [tab_Vm] + pshufb m5, [tab_Vm + 16] + mova m4, [pw_2000] + lea r5, [3 * r1] + mov r4d, %2/4 + +.loop: + movq m0, [r0] + movq m1, [r0 + r1] + movq m2, [r0 + 2 * r1] + movq m3, [r0 + r5] + + punpcklbw m0, m1 + punpcklbw m1, m2 + punpcklbw m2, m3 + + pmaddubsw m0, m6 + pmaddubsw m7, m2, m5 + + paddw m0, m7 + psubw m0, m4 + + movh [r2], m0 + pshufd m0, m0, 2 + movd [r2 + 8], m0 + + lea r0, [r0 + 4 * r1] + movq m0, [r0] + punpcklbw m3, m0 + + pmaddubsw m1, m6 + pmaddubsw m7, m3, m5 + + paddw m1, m7 + psubw m1, m4 + + movh [r2 + r3], m1 + pshufd m1, m1, 2 + movd [r2 + r3 + 8], m1 + + movq m1, [r0 + r1] + punpcklbw m0, m1 + + pmaddubsw m2, m6 + pmaddubsw m0, m5 + + paddw m2, m0 + psubw m2, m4 + + lea r2,[r2 + 2 * r3] + movh [r2], m2 + pshufd m2, m2, 2 + movd [r2 + 8], m2 + + movq m2,[r0 + 2 * r1] + punpcklbw m1, m2 + + pmaddubsw m3, m6 + pmaddubsw m1, m5 + + paddw m3, m1 + psubw m3, m4 + + movh [r2 + r3], m3 + pshufd m3, m3, 2 + movd [r2 + r3 + 8], m3 + + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loop + RET +%endmacro + +FILTER_V_PS_W6 6, 8 +FILTER_V_PS_W6 6, 16 + +;--------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_ps_12x16(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;--------------------------------------------------------------------------------------------------------------- +%macro FILTER_V_PS_W12 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_ps_12x%2, 4, 6, 8 + + mov r4d, r4m + sub r0, r1 + add r3d, r3d + +%ifdef PIC + lea r5, [tab_ChromaCoeff] + movd m0, [r5 + r4 * 4] +%else + movd m0, [tab_ChromaCoeff + r4 * 4] +%endif + + pshufb m1, m0, [tab_Vm] + pshufb m0, [tab_Vm + 16] + + mov r4d, %2/2 + +.loop: + movu m2, [r0] + movu 
m3, [r0 + r1] + + punpcklbw m4, m2, m3 + punpckhbw m2, m3 + + pmaddubsw m4, m1 + pmaddubsw m2, m1 + + lea r0, [r0 + 2 * r1] + movu m5, [r0] + movu m7, [r0 + r1] + + punpcklbw m6, m5, m7 + pmaddubsw m6, m0 + paddw m4, m6 + + punpckhbw m6, m5, m7 + pmaddubsw m6, m0 + paddw m2, m6 + + mova m6, [pw_2000] + + psubw m4, m6 + psubw m2, m6 + + movu [r2], m4 + movh [r2 + 16], m2 + + punpcklbw m4, m3, m5 + punpckhbw m3, m5 + + pmaddubsw m4, m1 + pmaddubsw m3, m1 + + movu m2, [r0 + 2 * r1] + + punpcklbw m5, m7, m2 + punpckhbw m7, m2 + + pmaddubsw m5, m0 + pmaddubsw m7, m0 + + paddw m4, m5 + paddw m3, m7 + + psubw m4, m6 + psubw m3, m6 + + movu [r2 + r3], m4 + movh [r2 + r3 + 16], m3 + + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loop + RET +%endmacro + +FILTER_V_PS_W12 12, 16 +FILTER_V_PS_W12 12, 32 + +;--------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_ps_16x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;--------------------------------------------------------------------------------------------------------------- +%macro FILTER_V_PS_W16 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8 + + mov r4d, r4m + sub r0, r1 + add r3d, r3d + +%ifdef PIC + lea r5, [tab_ChromaCoeff] + movd m0, [r5 + r4 * 4] +%else + movd m0, [tab_ChromaCoeff + r4 * 4] +%endif + + pshufb m1, m0, [tab_Vm] + pshufb m0, [tab_Vm + 16] + mov r4d, %2/2 + +.loop: + movu m2, [r0] + movu m3, [r0 + r1] + + punpcklbw m4, m2, m3 + punpckhbw m2, m3 + + pmaddubsw m4, m1 + pmaddubsw m2, m1 + + lea r0, [r0 + 2 * r1] + movu m5, [r0] + movu m7, [r0 + r1] + + punpcklbw m6, m5, m7 + pmaddubsw m6, m0 + paddw m4, m6 + + punpckhbw m6, m5, m7 + pmaddubsw m6, m0 + paddw m2, m6 + + mova m6, [pw_2000] + + psubw m4, m6 + psubw m2, m6 + + movu [r2], m4 + movu [r2 + 16], m2 + + punpcklbw m4, m3, m5 + punpckhbw m3, m5 + + pmaddubsw m4, m1 + pmaddubsw m3, m1 + + movu m5, [r0 + 2 * r1] + + punpcklbw m2, 
m7, m5 + punpckhbw m7, m5 + + pmaddubsw m2, m0 + pmaddubsw m7, m0 + + paddw m4, m2 + paddw m3, m7 + + psubw m4, m6 + psubw m3, m6 + + movu [r2 + r3], m4 + movu [r2 + r3 + 16], m3 + + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loop + RET +%endmacro + +FILTER_V_PS_W16 16, 4 +FILTER_V_PS_W16 16, 8 +FILTER_V_PS_W16 16, 12 +FILTER_V_PS_W16 16, 16 +FILTER_V_PS_W16 16, 32 + +FILTER_V_PS_W16 16, 24 +FILTER_V_PS_W16 16, 64 + +;-------------------------------------------------------------------------------------------------------------- +;void interp_4tap_vert_ps_24x32(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;-------------------------------------------------------------------------------------------------------------- +%macro FILTER_V4_PS_W24 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_ps_24x%2, 4, 6, 8 + + mov r4d, r4m + sub r0, r1 + add r3d, r3d + +%ifdef PIC + lea r5, [tab_ChromaCoeff] + movd m0, [r5 + r4 * 4] +%else + movd m0, [tab_ChromaCoeff + r4 * 4] +%endif + + pshufb m1, m0, [tab_Vm] + pshufb m0, [tab_Vm + 16] + + mov r4d, %2/2 + +.loop: + movu m2, [r0] + movu m3, [r0 + r1] + + punpcklbw m4, m2, m3 + punpckhbw m2, m3 + + pmaddubsw m4, m1 + pmaddubsw m2, m1 + + lea r5, [r0 + 2 * r1] + + movu m5, [r5] + movu m7, [r5 + r1] + + punpcklbw m6, m5, m7 + pmaddubsw m6, m0 + paddw m4, m6 + + punpckhbw m6, m5, m7 + pmaddubsw m6, m0 + paddw m2, m6 + + mova m6, [pw_2000] + + psubw m4, m6 + psubw m2, m6 + + movu [r2], m4 + movu [r2 + 16], m2 + + punpcklbw m4, m3, m5 + punpckhbw m3, m5 + + pmaddubsw m4, m1 + pmaddubsw m3, m1 + + movu m2, [r5 + 2 * r1] + + punpcklbw m5, m7, m2 + punpckhbw m7, m2 + + pmaddubsw m5, m0 + pmaddubsw m7, m0 + + paddw m4, m5 + paddw m3, m7 + + psubw m4, m6 + psubw m3, m6 + + movu [r2 + r3], m4 + movu [r2 + r3 + 16], m3 + + movq m2, [r0 + 16] + movq m3, [r0 + r1 + 16] + movq m4, [r5 + 16] + movq m5, [r5 + r1 + 16] + + punpcklbw m2, m3 + punpcklbw m7, m4, m5 + + pmaddubsw m2, m1 + pmaddubsw m7, m0 + + paddw m2, m7 + 
psubw m2, m6 + + movu [r2 + 32], m2 + + movq m2, [r5 + 2 * r1 + 16] + + punpcklbw m3, m4 + punpcklbw m5, m2 + + pmaddubsw m3, m1 + pmaddubsw m5, m0 + + paddw m3, m5 + psubw m3, m6 + + movu [r2 + r3 + 32], m3 + + mov r0, r5 + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loop + RET +%endmacro + +FILTER_V4_PS_W24 24, 32 + +FILTER_V4_PS_W24 24, 64 + +;--------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_ps_32x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;--------------------------------------------------------------------------------------------------------------- +%macro FILTER_V_PS_W32 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8 + + mov r4d, r4m + sub r0, r1 + add r3d, r3d + +%ifdef PIC + lea r5, [tab_ChromaCoeff] + movd m0, [r5 + r4 * 4] +%else + movd m0, [tab_ChromaCoeff + r4 * 4] +%endif + + pshufb m1, m0, [tab_Vm] + pshufb m0, [tab_Vm + 16] + + mova m7, [pw_2000] + + mov r4d, %2 + +.loop: + movu m2, [r0] + movu m3, [r0 + r1] + + punpcklbw m4, m2, m3 + punpckhbw m2, m3 + + pmaddubsw m4, m1 + pmaddubsw m2, m1 + + lea r5, [r0 + 2 * r1] + movu m3, [r5] + movu m5, [r5 + r1] + + punpcklbw m6, m3, m5 + punpckhbw m3, m5 + + pmaddubsw m6, m0 + pmaddubsw m3, m0 + + paddw m4, m6 + paddw m2, m3 + + psubw m4, m7 + psubw m2, m7 + + movu [r2], m4 + movu [r2 + 16], m2 + + movu m2, [r0 + 16] + movu m3, [r0 + r1 + 16] + + punpcklbw m4, m2, m3 + punpckhbw m2, m3 + + pmaddubsw m4, m1 + pmaddubsw m2, m1 + + movu m3, [r5 + 16] + movu m5, [r5 + r1 + 16] + + punpcklbw m6, m3, m5 + punpckhbw m3, m5 + + pmaddubsw m6, m0 + pmaddubsw m3, m0 + + paddw m4, m6 + paddw m2, m3 + + psubw m4, m7 + psubw m2, m7 + + movu [r2 + 32], m4 + movu [r2 + 48], m2 + + lea r0, [r0 + r1] + lea r2, [r2 + r3] + + dec r4d + jnz .loop + RET +%endmacro + +FILTER_V_PS_W32 32, 8 +FILTER_V_PS_W32 32, 16 +FILTER_V_PS_W32 32, 24 +FILTER_V_PS_W32 32, 32 + +FILTER_V_PS_W32 32, 48 
+FILTER_V_PS_W32 32, 64 + +;----------------------------------------------------------------------------- +; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +%macro FILTER_V4_W8_H8_H16_H32 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8 + +mov r4d, r4m +sub r0, r1 + +%ifdef PIC +lea r5, [tab_ChromaCoeff] +movd m5, [r5 + r4 * 4] +%else +movd m5, [tab_ChromaCoeff + r4 * 4] +%endif + +pshufb m6, m5, [tab_Vm] +pshufb m5, [tab_Vm + 16] +mova m4, [tab_c_512] +lea r5, [r1 * 3] + +mov r4d, %2 + +.loop: +movq m0, [r0] +movq m1, [r0 + r1] +movq m2, [r0 + 2 * r1] +movq m3, [r0 + r5] + +punpcklbw m0, m1 +punpcklbw m1, m2 +punpcklbw m2, m3 + +pmaddubsw m0, m6 +pmaddubsw m7, m2, m5 + +paddw m0, m7 + +pmulhrsw m0, m4 +packuswb m0, m0 +movh [r2], m0 + +lea r0, [r0 + 4 * r1] +movq m0, [r0] + +punpcklbw m3, m0 + +pmaddubsw m1, m6 +pmaddubsw m7, m3, m5 + +paddw m1, m7 + +pmulhrsw m1, m4 +packuswb m1, m1 +movh [r2 + r3], m1 + +movq m1, [r0 + r1] + +punpcklbw m0, m1 + +pmaddubsw m2, m6 +pmaddubsw m0, m5 + +paddw m2, m0 + +pmulhrsw m2, m4 + +movq m7, [r0 + 2 * r1] +punpcklbw m1, m7 + +pmaddubsw m3, m6 +pmaddubsw m1, m5 + +paddw m3, m1 + +pmulhrsw m3, m4 +packuswb m2, m3 + +lea r2, [r2 + 2 * r3] +movh [r2], m2 +movhps [r2 + r3], m2 + +lea r2, [r2 + 2 * r3] + +sub r4, 4 +jnz .loop +RET +%endmacro + +FILTER_V4_W8_H8_H16_H32 8, 8 +FILTER_V4_W8_H8_H16_H32 8, 16 +FILTER_V4_W8_H8_H16_H32 8, 32 + +FILTER_V4_W8_H8_H16_H32 8, 12 +FILTER_V4_W8_H8_H16_H32 8, 64 + + +;----------------------------------------------------------------------------- +;void interp_4tap_vert_pp_6x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +%macro FILTER_V4_W6_H4 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_pp_6x%2, 4, 6, 8 + +mov r4d, r4m +sub r0, r1 + 
+%ifdef PIC +lea r5, [tab_ChromaCoeff] +movd m5, [r5 + r4 * 4] +%else +movd m5, [tab_ChromaCoeff + r4 * 4] +%endif + +pshufb m6, m5, [tab_Vm] +pshufb m5, [tab_Vm + 16] +mova m4, [tab_c_512] + +mov r4d, %2 +lea r5, [3 * r1] + +.loop: +movq m0, [r0] +movq m1, [r0 + r1] +movq m2, [r0 + 2 * r1] +movq m3, [r0 + r5] + +punpcklbw m0, m1 +punpcklbw m1, m2 +punpcklbw m2, m3 + +pmaddubsw m0, m6 +pmaddubsw m7, m2, m5 + +paddw m0, m7 + +pmulhrsw m0, m4 +packuswb m0, m0 +movd [r2], m0 +pextrw [r2 + 4], m0, 2 + +lea r0, [r0 + 4 * r1] + +movq m0, [r0] +punpcklbw m3, m0 + +pmaddubsw m1, m6 +pmaddubsw m7, m3, m5 + +paddw m1, m7 + +pmulhrsw m1, m4 +packuswb m1, m1 +movd [r2 + r3], m1 +pextrw [r2 + r3 + 4], m1, 2 + +movq m1, [r0 + r1] +punpcklbw m7, m0, m1 + +pmaddubsw m2, m6 +pmaddubsw m7, m5 + +paddw m2, m7 + +pmulhrsw m2, m4 +packuswb m2, m2 +lea r2, [r2 + 2 * r3] +movd [r2], m2 +pextrw [r2 + 4], m2, 2 + +movq m2, [r0 + 2 * r1] +punpcklbw m1, m2 + +pmaddubsw m3, m6 +pmaddubsw m1, m5 + +paddw m3, m1 + +pmulhrsw m3, m4 +packuswb m3, m3 + +movd [r2 + r3], m3 +pextrw [r2 + r3 + 4], m3, 2 + +lea r2, [r2 + 2 * r3] + +sub r4, 4 +jnz .loop +RET +%endmacro + +FILTER_V4_W6_H4 6, 8 + +FILTER_V4_W6_H4 6, 16 + +;----------------------------------------------------------------------------- +; void interp_4tap_vert_pp_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +%macro FILTER_V4_W12_H2 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_pp_12x%2, 4, 6, 8 + +mov r4d, r4m +sub r0, r1 + +%ifdef PIC +lea r5, [tab_ChromaCoeff] +movd m0, [r5 + r4 * 4] +%else +movd m0, [tab_ChromaCoeff + r4 * 4] +%endif + +pshufb m1, m0, [tab_Vm] +pshufb m0, [tab_Vm + 16] + +mov r4d, %2 + +.loop: +movu m2, [r0] +movu m3, [r0 + r1] + +punpcklbw m4, m2, m3 +punpckhbw m2, m3 + +pmaddubsw m4, m1 +pmaddubsw m2, m1 + +lea r0, [r0 + 2 * r1] +movu m5, [r0] +movu m7, [r0 + r1] + +punpcklbw m6, m5, m7 +pmaddubsw m6, m0 
+paddw m4, m6 + +punpckhbw m6, m5, m7 +pmaddubsw m6, m0 +paddw m2, m6 + +mova m6, [tab_c_512] + +pmulhrsw m4, m6 +pmulhrsw m2, m6 + +packuswb m4, m2 + +movh [r2], m4 +pextrd [r2 + 8], m4, 2 + +punpcklbw m4, m3, m5 +punpckhbw m3, m5 + +pmaddubsw m4, m1 +pmaddubsw m3, m1 + +movu m5, [r0 + 2 * r1] + +punpcklbw m2, m7, m5 +punpckhbw m7, m5 + +pmaddubsw m2, m0 +pmaddubsw m7, m0 + +paddw m4, m2 +paddw m3, m7 + +pmulhrsw m4, m6 +pmulhrsw m3, m6 + +packuswb m4, m3 + +movh [r2 + r3], m4 +pextrd [r2 + r3 + 8], m4, 2 + +lea r2, [r2 + 2 * r3] + +sub r4, 2 +jnz .loop +RET +%endmacro + +FILTER_V4_W12_H2 12, 16 + +FILTER_V4_W12_H2 12, 32 + +;----------------------------------------------------------------------------- +; void interp_4tap_vert_pp_16x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +%macro FILTER_V4_W16_H2 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_pp_16x%2, 4, 6, 8 + +mov r4d, r4m +sub r0, r1 + +%ifdef PIC +lea r5, [tab_ChromaCoeff] +movd m0, [r5 + r4 * 4] +%else +movd m0, [tab_ChromaCoeff + r4 * 4] +%endif + +pshufb m1, m0, [tab_Vm] +pshufb m0, [tab_Vm + 16] + +mov r4d, %2/2 + +.loop: +movu m2, [r0] +movu m3, [r0 + r1] + +punpcklbw m4, m2, m3 +punpckhbw m2, m3 + +pmaddubsw m4, m1 +pmaddubsw m2, m1 + +lea r0, [r0 + 2 * r1] +movu m5, [r0] +movu m6, [r0 + r1] + +punpckhbw m7, m5, m6 +pmaddubsw m7, m0 +paddw m2, m7 + +punpcklbw m7, m5, m6 +pmaddubsw m7, m0 +paddw m4, m7 + +mova m7, [tab_c_512] + +pmulhrsw m4, m7 +pmulhrsw m2, m7 + +packuswb m4, m2 + +movu [r2], m4 + +punpcklbw m4, m3, m5 +punpckhbw m3, m5 + +pmaddubsw m4, m1 +pmaddubsw m3, m1 + +movu m5, [r0 + 2 * r1] + +punpcklbw m2, m6, m5 +punpckhbw m6, m5 + +pmaddubsw m2, m0 +pmaddubsw m6, m0 + +paddw m4, m2 +paddw m3, m6 + +pmulhrsw m4, m7 +pmulhrsw m3, m7 + +packuswb m4, m3 + +movu [r2 + r3], m4 + +lea r2, [r2 + 2 * r3] + +dec r4d +jnz .loop +RET +%endmacro + +FILTER_V4_W16_H2 16, 4 +FILTER_V4_W16_H2 
16, 8 +FILTER_V4_W16_H2 16, 12 +FILTER_V4_W16_H2 16, 16 +FILTER_V4_W16_H2 16, 32 + +FILTER_V4_W16_H2 16, 24 +FILTER_V4_W16_H2 16, 64 + +;----------------------------------------------------------------------------- +;void interp_4tap_vert_pp_24x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +%macro FILTER_V4_W24 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_pp_24x%2, 4, 6, 8 + +mov r4d, r4m +sub r0, r1 + +%ifdef PIC +lea r5, [tab_ChromaCoeff] +movd m0, [r5 + r4 * 4] +%else +movd m0, [tab_ChromaCoeff + r4 * 4] +%endif + +pshufb m1, m0, [tab_Vm] +pshufb m0, [tab_Vm + 16] + +mov r4d, %2 + +.loop: +movu m2, [r0] +movu m3, [r0 + r1] + +punpcklbw m4, m2, m3 +punpckhbw m2, m3 + +pmaddubsw m4, m1 +pmaddubsw m2, m1 + +lea r5, [r0 + 2 * r1] +movu m5, [r5] +movu m7, [r5 + r1] + +punpcklbw m6, m5, m7 +pmaddubsw m6, m0 +paddw m4, m6 + +punpckhbw m6, m5, m7 +pmaddubsw m6, m0 +paddw m2, m6 + +mova m6, [tab_c_512] + +pmulhrsw m4, m6 +pmulhrsw m2, m6 + +packuswb m4, m2 + +movu [r2], m4 + +punpcklbw m4, m3, m5 +punpckhbw m3, m5 + +pmaddubsw m4, m1 +pmaddubsw m3, m1 + +movu m2, [r5 + 2 * r1] + +punpcklbw m5, m7, m2 +punpckhbw m7, m2 + +pmaddubsw m5, m0 +pmaddubsw m7, m0 + +paddw m4, m5 +paddw m3, m7 + +pmulhrsw m4, m6 +pmulhrsw m3, m6 + +packuswb m4, m3 + +movu [r2 + r3], m4 + +movq m2, [r0 + 16] +movq m3, [r0 + r1 + 16] +movq m4, [r5 + 16] +movq m5, [r5 + r1 + 16] + +punpcklbw m2, m3 +punpcklbw m4, m5 + +pmaddubsw m2, m1 +pmaddubsw m4, m0 + +paddw m2, m4 + +pmulhrsw m2, m6 + +movq m3, [r0 + r1 + 16] +movq m4, [r5 + 16] +movq m5, [r5 + r1 + 16] +movq m7, [r5 + 2 * r1 + 16] + +punpcklbw m3, m4 +punpcklbw m5, m7 + +pmaddubsw m3, m1 +pmaddubsw m5, m0 + +paddw m3, m5 + +pmulhrsw m3, m6 +packuswb m2, m3 + +movh [r2 + 16], m2 +movhps [r2 + r3 + 16], m2 + +mov r0, r5 +lea r2, [r2 + 2 * r3] + +sub r4, 2 +jnz .loop +RET +%endmacro + +FILTER_V4_W24 24, 32 + +FILTER_V4_W24 24, 64 + 
+;----------------------------------------------------------------------------- +; void interp_4tap_vert_pp_32x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +%macro FILTER_V4_W32 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8 + +mov r4d, r4m +sub r0, r1 + +%ifdef PIC +lea r5, [tab_ChromaCoeff] +movd m0, [r5 + r4 * 4] +%else +movd m0, [tab_ChromaCoeff + r4 * 4] +%endif + +pshufb m1, m0, [tab_Vm] +pshufb m0, [tab_Vm + 16] + +mova m7, [tab_c_512] + +mov r4d, %2 + +.loop: +movu m2, [r0] +movu m3, [r0 + r1] + +punpcklbw m4, m2, m3 +punpckhbw m2, m3 + +pmaddubsw m4, m1 +pmaddubsw m2, m1 + +lea r5, [r0 + 2 * r1] +movu m3, [r5] +movu m5, [r5 + r1] + +punpcklbw m6, m3, m5 +punpckhbw m3, m5 + +pmaddubsw m6, m0 +pmaddubsw m3, m0 + +paddw m4, m6 +paddw m2, m3 + +pmulhrsw m4, m7 +pmulhrsw m2, m7 + +packuswb m4, m2 + +movu [r2], m4 + +movu m2, [r0 + 16] +movu m3, [r0 + r1 + 16] + +punpcklbw m4, m2, m3 +punpckhbw m2, m3 + +pmaddubsw m4, m1 +pmaddubsw m2, m1 + +movu m3, [r5 + 16] +movu m5, [r5 + r1 + 16] + +punpcklbw m6, m3, m5 +punpckhbw m3, m5 + +pmaddubsw m6, m0 +pmaddubsw m3, m0 + +paddw m4, m6 +paddw m2, m3 + +pmulhrsw m4, m7 +pmulhrsw m2, m7 + +packuswb m4, m2 + +movu [r2 + 16], m4 + +lea r0, [r0 + r1] +lea r2, [r2 + r3] + +dec r4 +jnz .loop +RET +%endmacro + +FILTER_V4_W32 32, 8 +FILTER_V4_W32 32, 16 +FILTER_V4_W32 32, 24 +FILTER_V4_W32 32, 32 + +FILTER_V4_W32 32, 48 +FILTER_V4_W32 32, 64 + + +;----------------------------------------------------------------------------- +; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +%macro FILTER_V4_W16n_H2 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8 + +mov r4d, r4m +sub r0, r1 + +%ifdef PIC +lea r5, [tab_ChromaCoeff] +movd m0, [r5 + r4 * 4] +%else 
+movd m0, [tab_ChromaCoeff + r4 * 4] +%endif + +pshufb m1, m0, [tab_Vm] +pshufb m0, [tab_Vm + 16] + +mov r4d, %2/2 + +.loop: + +mov r6d, %1/16 + +.loopW: + +movu m2, [r0] +movu m3, [r0 + r1] + +punpcklbw m4, m2, m3 +punpckhbw m2, m3 + +pmaddubsw m4, m1 +pmaddubsw m2, m1 + +lea r5, [r0 + 2 * r1] +movu m5, [r5] +movu m6, [r5 + r1] + +punpckhbw m7, m5, m6 +pmaddubsw m7, m0 +paddw m2, m7 + +punpcklbw m7, m5, m6 +pmaddubsw m7, m0 +paddw m4, m7 + +mova m7, [tab_c_512] + +pmulhrsw m4, m7 +pmulhrsw m2, m7 + +packuswb m4, m2 + +movu [r2], m4 + +punpcklbw m4, m3, m5 +punpckhbw m3, m5 + +pmaddubsw m4, m1 +pmaddubsw m3, m1 + +movu m5, [r5 + 2 * r1] + +punpcklbw m2, m6, m5 +punpckhbw m6, m5 + +pmaddubsw m2, m0 +pmaddubsw m6, m0 + +paddw m4, m2 +paddw m3, m6 + +pmulhrsw m4, m7 +pmulhrsw m3, m7 + +packuswb m4, m3 + +movu [r2 + r3], m4 + +add r0, 16 +add r2, 16 +dec r6d +jnz .loopW + +lea r0, [r0 + r1 * 2 - %1] +lea r2, [r2 + r3 * 2 - %1] + +dec r4d +jnz .loop +RET +%endmacro + +FILTER_V4_W16n_H2 64, 64 +FILTER_V4_W16n_H2 64, 32 +FILTER_V4_W16n_H2 64, 48 +FILTER_V4_W16n_H2 48, 64 +FILTER_V4_W16n_H2 64, 16 + + +;----------------------------------------------------------------------------- +; void filterConvertPelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height) +;----------------------------------------------------------------------------- +INIT_XMM ssse3 +cglobal luma_p2s, 3, 7, 6 + + ; load width and height + mov r3d, r3m + mov r4d, r4m + + ; load constant + mova m4, [tab_c_128] + mova m5, [tab_c_64_n64] + +.loopH: + + xor r5d, r5d +.loopW: + lea r6, [r0 + r5] + + movh m0, [r6] + punpcklbw m0, m4 + pmaddubsw m0, m5 + + movh m1, [r6 + r1] + punpcklbw m1, m4 + pmaddubsw m1, m5 + + movh m2, [r6 + r1 * 2] + punpcklbw m2, m4 + pmaddubsw m2, m5 + + lea r6, [r6 + r1 * 2] + movh m3, [r6 + r1] + punpcklbw m3, m4 + pmaddubsw m3, m5 + + add r5, 8 + cmp r5, r3 + jg .width4 + movu [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0 + movu [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], 
m1 + movu [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2 + movu [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3 + je .nextH + jmp .loopW + +.width4: + movh [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0 + movh [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1 + movh [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2 + movh [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3 + +.nextH: + lea r0, [r0 + r1 * 4] + add r2, FENC_STRIDE * 8 + + sub r4d, 4 + jnz .loopH + + RET + +%macro PROCESS_LUMA_W4_4R 0 + movd m0, [r0] + movd m1, [r0 + r1] + punpcklbw m2, m0, m1 ; m2=[0 1] + + lea r0, [r0 + 2 * r1] + movd m0, [r0] + punpcklbw m1, m0 ; m1=[1 2] + punpcklqdq m2, m1 ; m2=[0 1 1 2] + pmaddubsw m4, m2, [r6 + 0 * 16] ; m4=[0+1 1+2] + + movd m1, [r0 + r1] + punpcklbw m5, m0, m1 ; m2=[2 3] + lea r0, [r0 + 2 * r1] + movd m0, [r0] + punpcklbw m1, m0 ; m1=[3 4] + punpcklqdq m5, m1 ; m5=[2 3 3 4] + pmaddubsw m2, m5, [r6 + 1 * 16] ; m2=[2+3 3+4] + paddw m4, m2 ; m4=[0+1+2+3 1+2+3+4] Row1-2 + pmaddubsw m5, [r6 + 0 * 16] ; m5=[2+3 3+4] Row3-4 + + movd m1, [r0 + r1] + punpcklbw m2, m0, m1 ; m2=[4 5] + lea r0, [r0 + 2 * r1] + movd m0, [r0] + punpcklbw m1, m0 ; m1=[5 6] + punpcklqdq m2, m1 ; m2=[4 5 5 6] + pmaddubsw m1, m2, [r6 + 2 * 16] ; m1=[4+5 5+6] + paddw m4, m1 ; m4=[0+1+2+3+4+5 1+2+3+4+5+6] Row1-2 + pmaddubsw m2, [r6 + 1 * 16] ; m2=[4+5 5+6] + paddw m5, m2 ; m5=[2+3+4+5 3+4+5+6] Row3-4 + + movd m1, [r0 + r1] + punpcklbw m2, m0, m1 ; m2=[6 7] + lea r0, [r0 + 2 * r1] + movd m0, [r0] + punpcklbw m1, m0 ; m1=[7 8] + punpcklqdq m2, m1 ; m2=[6 7 7 8] + pmaddubsw m1, m2, [r6 + 3 * 16] ; m1=[6+7 7+8] + paddw m4, m1 ; m4=[0+1+2+3+4+5+6+7 1+2+3+4+5+6+7+8] Row1-2 end + pmaddubsw m2, [r6 + 2 * 16] ; m2=[6+7 7+8] + paddw m5, m2 ; m5=[2+3+4+5+6+7 3+4+5+6+7+8] Row3-4 + + movd m1, [r0 + r1] + punpcklbw m2, m0, m1 ; m2=[8 9] + movd m0, [r0 + 2 * r1] + punpcklbw m1, m0 ; m1=[9 10] + punpcklqdq m2, m1 ; m2=[8 9 9 10] + pmaddubsw m2, [r6 + 3 * 16] ; m2=[8+9 9+10] + paddw m5, m2 ; m5=[2+3+4+5+6+7+8+9 3+4+5+6+7+8+9+10] Row3-4 end +%endmacro + 
+%macro PROCESS_LUMA_W8_4R 0 + movq m0, [r0] + movq m1, [r0 + r1] + punpcklbw m0, m1 + pmaddubsw m7, m0, [r6 + 0 *16] ;m7=[0+1] Row1 + + lea r0, [r0 + 2 * r1] + movq m0, [r0] + punpcklbw m1, m0 + pmaddubsw m6, m1, [r6 + 0 *16] ;m6=[1+2] Row2 + + movq m1, [r0 + r1] + punpcklbw m0, m1 + pmaddubsw m5, m0, [r6 + 0 *16] ;m5=[2+3] Row3 + pmaddubsw m0, [r6 + 1 * 16] + paddw m7, m0 ;m7=[0+1+2+3] Row1 + + lea r0, [r0 + 2 * r1] + movq m0, [r0] + punpcklbw m1, m0 + pmaddubsw m4, m1, [r6 + 0 *16] ;m4=[3+4] Row4 + pmaddubsw m1, [r6 + 1 * 16] + paddw m6, m1 ;m6 = [1+2+3+4] Row2 + + movq m1, [r0 + r1] + punpcklbw m0, m1 + pmaddubsw m2, m0, [r6 + 1 * 16] + pmaddubsw m0, [r6 + 2 * 16] + paddw m7, m0 ;m7=[0+1+2+3+4+5] Row1 + paddw m5, m2 ;m5=[2+3+4+5] Row3 + + lea r0, [r0 + 2 * r1] + movq m0, [r0] + punpcklbw m1, m0 + pmaddubsw m2, m1, [r6 + 1 * 16] + pmaddubsw m1, [r6 + 2 * 16] + paddw m6, m1 ;m6=[1+2+3+4+5+6] Row2 + paddw m4, m2 ;m4=[3+4+5+6] Row4 + + movq m1, [r0 + r1] + punpcklbw m0, m1 + pmaddubsw m2, m0, [r6 + 2 * 16] + pmaddubsw m0, [r6 + 3 * 16] + paddw m7, m0 ;m7=[0+1+2+3+4+5+6+7] Row1 end + paddw m5, m2 ;m5=[2+3+4+5+6+7] Row3 + + lea r0, [r0 + 2 * r1] + movq m0, [r0] + punpcklbw m1, m0 + pmaddubsw m2, m1, [r6 + 2 * 16] + pmaddubsw m1, [r6 + 3 * 16] + paddw m6, m1 ;m6=[1+2+3+4+5+6+7+8] Row2 end + paddw m4, m2 ;m4=[3+4+5+6+7+8] Row4 + + movq m1, [r0 + r1] + punpcklbw m0, m1 + pmaddubsw m0, [r6 + 3 * 16] + paddw m5, m0 ;m5=[2+3+4+5+6+7+8+9] Row3 end + + movq m0, [r0 + 2 * r1] + punpcklbw m1, m0 + pmaddubsw m1, [r6 + 3 * 16] + paddw m4, m1 ;m4=[3+4+5+6+7+8+9+10] Row4 end +%endmacro + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_%3_4x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_LUMA_4xN 3 +INIT_XMM sse4 +cglobal 
interp_8tap_vert_%3_%1x%2, 5, 7, 6 + lea r5, [3 * r1] + sub r0, r5 + shl r4d, 6 +%ifidn %3,ps + add r3d, r3d +%endif + +%ifdef PIC + lea r5, [tab_LumaCoeffVer] + lea r6, [r5 + r4] +%else + lea r6, [tab_LumaCoeffVer + r4] +%endif + +%ifidn %3,pp + mova m3, [tab_c_512] +%else + mova m3, [pw_2000] +%endif + + mov r4d, %2/4 + lea r5, [4 * r1] + +.loopH: + PROCESS_LUMA_W4_4R + +%ifidn %3,pp + pmulhrsw m4, m3 + pmulhrsw m5, m3 + + packuswb m4, m5 + + movd [r2], m4 + pextrd [r2 + r3], m4, 1 + lea r2, [r2 + 2 * r3] + pextrd [r2], m4, 2 + pextrd [r2 + r3], m4, 3 +%else + psubw m4, m3 + psubw m5, m3 + + movlps [r2], m4 + movhps [r2 + r3], m4 + lea r2, [r2 + 2 * r3] + movlps [r2], m5 + movhps [r2 + r3], m5 +%endif + + sub r0, r5 + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loopH + + RET +%endmacro + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +FILTER_VER_LUMA_4xN 4, 4, pp + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_pp_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +FILTER_VER_LUMA_4xN 4, 8, pp + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_pp_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +FILTER_VER_LUMA_4xN 4, 16, pp + 
+;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_ps_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +FILTER_VER_LUMA_4xN 4, 4, ps + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_ps_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +FILTER_VER_LUMA_4xN 4, 8, ps + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_ps_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +FILTER_VER_LUMA_4xN 4, 16, ps + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_%3_8x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_LUMA_8xN 3 +INIT_XMM sse4 +cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8 + lea r5, [3 * r1] + sub r0, r5 + shl r4d, 6 + +%ifidn %3,ps + add r3d, r3d +%endif + +%ifdef PIC + lea r5, [tab_LumaCoeffVer] + lea r6, [r5 + r4] +%else + lea r6, [tab_LumaCoeffVer + r4] +%endif + + %ifidn %3,pp + mova m3, [tab_c_512] +%else + mova m3, [pw_2000] +%endif + + mov r4d, %2/4 + lea r5, [4 * r1] + +.loopH: + PROCESS_LUMA_W8_4R + +%ifidn %3,pp + pmulhrsw m7, m3 + pmulhrsw m6, m3 + pmulhrsw m5, m3 + pmulhrsw m4, m3 + + packuswb m7, m6 + packuswb m5, m4 + + 
movlps [r2], m7 + movhps [r2 + r3], m7 + lea r2, [r2 + 2 * r3] + movlps [r2], m5 + movhps [r2 + r3], m5 +%else + psubw m7, m3 + psubw m6, m3 + psubw m5, m3 + psubw m4, m3 + + movu [r2], m7 + movu [r2 + r3], m6 + lea r2, [r2 + 2 * r3] + movu [r2], m5 + movu [r2 + r3], m4 +%endif + + sub r0, r5 + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loopH + + RET +%endmacro + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_pp_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +FILTER_VER_LUMA_8xN 8, 4, pp + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_pp_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +FILTER_VER_LUMA_8xN 8, 8, pp + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_pp_8x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +FILTER_VER_LUMA_8xN 8, 16, pp + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_pp_8x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +FILTER_VER_LUMA_8xN 8, 32, pp + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_ps_8x4(pixel *src, intptr_t 
srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +FILTER_VER_LUMA_8xN 8, 4, ps + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_ps_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +FILTER_VER_LUMA_8xN 8, 8, ps + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_ps_8x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +FILTER_VER_LUMA_8xN 8, 16, ps + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_ps_8x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +FILTER_VER_LUMA_8xN 8, 32, ps + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_%3_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_LUMA_12xN 3 +INIT_XMM sse4 +cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8 + lea r5, [3 * r1] + sub r0, r5 + shl r4d, 6 +%ifidn %3,ps + add r3d, r3d +%endif + +%ifdef PIC + lea r5, [tab_LumaCoeffVer] + lea r6, [r5 + r4] +%else + lea r6, [tab_LumaCoeffVer + r4] +%endif + + %ifidn %3,pp + mova m3, [tab_c_512] +%else + mova m3, [pw_2000] +%endif 
+ + mov r4d, %2/4 + +.loopH: + PROCESS_LUMA_W8_4R + +%ifidn %3,pp + pmulhrsw m7, m3 + pmulhrsw m6, m3 + pmulhrsw m5, m3 + pmulhrsw m4, m3 + + packuswb m7, m6 + packuswb m5, m4 + + movlps [r2], m7 + movhps [r2 + r3], m7 + lea r5, [r2 + 2 * r3] + movlps [r5], m5 + movhps [r5 + r3], m5 +%else + psubw m7, m3 + psubw m6, m3 + psubw m5, m3 + psubw m4, m3 + + movu [r2], m7 + movu [r2 + r3], m6 + lea r5, [r2 + 2 * r3] + movu [r5], m5 + movu [r5 + r3], m4 +%endif + + lea r5, [8 * r1 - 8] + sub r0, r5 +%ifidn %3,pp + add r2, 8 +%else + add r2, 16 +%endif + + PROCESS_LUMA_W4_4R + +%ifidn %3,pp + pmulhrsw m4, m3 + pmulhrsw m5, m3 + + packuswb m4, m5 + + movd [r2], m4 + pextrd [r2 + r3], m4, 1 + lea r5, [r2 + 2 * r3] + pextrd [r5], m4, 2 + pextrd [r5 + r3], m4, 3 +%else + psubw m4, m3 + psubw m5, m3 + + movlps [r2], m4 + movhps [r2 + r3], m4 + lea r5, [r2 + 2 * r3] + movlps [r5], m5 + movhps [r5 + r3], m5 +%endif + + lea r5, [4 * r1 + 8] + sub r0, r5 +%ifidn %3,pp + lea r2, [r2 + 4 * r3 - 8] +%else + lea r2, [r2 + 4 * r3 - 16] +%endif + + dec r4d + jnz .loopH + + RET +%endmacro + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_pp_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +FILTER_VER_LUMA_12xN 12, 16, pp + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_ps_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +FILTER_VER_LUMA_12xN 12, 16, ps + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_%3_%1x%2(pixel *src, intptr_t 
srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_LUMA 3 +INIT_XMM sse4 +cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8 ,0-gprsize + lea r5, [3 * r1] + sub r0, r5 + shl r4d, 6 +%ifidn %3,ps + add r3d, r3d +%endif + +%ifdef PIC + lea r5, [tab_LumaCoeffVer] + lea r6, [r5 + r4] +%else + lea r6, [tab_LumaCoeffVer + r4] +%endif + +%ifidn %3,pp + mova m3, [tab_c_512] +%else + mova m3, [pw_2000] +%endif + mov dword [rsp], %2/4 + +.loopH: + mov r4d, (%1/8) +.loopW: + PROCESS_LUMA_W8_4R +%ifidn %3,pp + pmulhrsw m7, m3 + pmulhrsw m6, m3 + pmulhrsw m5, m3 + pmulhrsw m4, m3 + + packuswb m7, m6 + packuswb m5, m4 + + movlps [r2], m7 + movhps [r2 + r3], m7 + lea r5, [r2 + 2 * r3] + movlps [r5], m5 + movhps [r5 + r3], m5 +%else + psubw m7, m3 + psubw m6, m3 + psubw m5, m3 + psubw m4, m3 + + movu [r2], m7 + movu [r2 + r3], m6 + lea r5, [r2 + 2 * r3] + movu [r5], m5 + movu [r5 + r3], m4 +%endif + + lea r5, [8 * r1 - 8] + sub r0, r5 +%ifidn %3,pp + add r2, 8 +%else + add r2, 16 +%endif + dec r4d + jnz .loopW + + lea r0, [r0 + 4 * r1 - %1] +%ifidn %3,pp + lea r2, [r2 + 4 * r3 - %1] +%else + lea r2, [r2 + 4 * r3 - 2 * %1] +%endif + + dec dword [rsp] + jnz .loopH + + RET +%endmacro + +FILTER_VER_LUMA 16, 4, pp +FILTER_VER_LUMA 16, 8, pp +FILTER_VER_LUMA 16, 12, pp +FILTER_VER_LUMA 16, 16, pp +FILTER_VER_LUMA 16, 32, pp +FILTER_VER_LUMA 16, 64, pp +FILTER_VER_LUMA 24, 32, pp +FILTER_VER_LUMA 32, 8, pp +FILTER_VER_LUMA 32, 16, pp +FILTER_VER_LUMA 32, 24, pp +FILTER_VER_LUMA 32, 32, pp +FILTER_VER_LUMA 32, 64, pp +FILTER_VER_LUMA 48, 64, pp +FILTER_VER_LUMA 64, 16, pp +FILTER_VER_LUMA 64, 32, pp +FILTER_VER_LUMA 64, 48, pp +FILTER_VER_LUMA 64, 64, pp + +FILTER_VER_LUMA 16, 4, ps +FILTER_VER_LUMA 16, 8, ps +FILTER_VER_LUMA 16, 12, ps +FILTER_VER_LUMA 16, 16, ps +FILTER_VER_LUMA 16, 32, ps +FILTER_VER_LUMA 16, 64, ps +FILTER_VER_LUMA 24, 32, ps 
+FILTER_VER_LUMA 32, 8, ps +FILTER_VER_LUMA 32, 16, ps +FILTER_VER_LUMA 32, 24, ps +FILTER_VER_LUMA 32, 32, ps +FILTER_VER_LUMA 32, 64, ps +FILTER_VER_LUMA 48, 64, ps +FILTER_VER_LUMA 64, 16, ps +FILTER_VER_LUMA 64, 32, ps +FILTER_VER_LUMA 64, 48, ps +FILTER_VER_LUMA 64, 64, ps + +%macro PROCESS_LUMA_SP_W4_4R 0 + movq m0, [r0] + movq m1, [r0 + r1] + punpcklwd m0, m1 ;m0=[0 1] + pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1 + + lea r0, [r0 + 2 * r1] + movq m4, [r0] + punpcklwd m1, m4 ;m1=[1 2] + pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2 + + movq m5, [r0 + r1] + punpcklwd m4, m5 ;m4=[2 3] + pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3 + pmaddwd m4, [r6 + 1 * 16] + paddd m0, m4 ;m0=[0+1+2+3] Row1 + + lea r0, [r0 + 2 * r1] + movq m4, [r0] + punpcklwd m5, m4 ;m5=[3 4] + pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4 + pmaddwd m5, [r6 + 1 * 16] + paddd m1, m5 ;m1 = [1+2+3+4] Row2 + + movq m5, [r0 + r1] + punpcklwd m4, m5 ;m4=[4 5] + pmaddwd m6, m4, [r6 + 1 * 16] + paddd m2, m6 ;m2=[2+3+4+5] Row3 + pmaddwd m4, [r6 + 2 * 16] + paddd m0, m4 ;m0=[0+1+2+3+4+5] Row1 + + lea r0, [r0 + 2 * r1] + movq m4, [r0] + punpcklwd m5, m4 ;m5=[5 6] + pmaddwd m6, m5, [r6 + 1 * 16] + paddd m3, m6 ;m3=[3+4+5+6] Row4 + pmaddwd m5, [r6 + 2 * 16] + paddd m1, m5 ;m1=[1+2+3+4+5+6] Row2 + + movq m5, [r0 + r1] + punpcklwd m4, m5 ;m4=[6 7] + pmaddwd m6, m4, [r6 + 2 * 16] + paddd m2, m6 ;m2=[2+3+4+5+6+7] Row3 + pmaddwd m4, [r6 + 3 * 16] + paddd m0, m4 ;m0=[0+1+2+3+4+5+6+7] Row1 end + + lea r0, [r0 + 2 * r1] + movq m4, [r0] + punpcklwd m5, m4 ;m5=[7 8] + pmaddwd m6, m5, [r6 + 2 * 16] + paddd m3, m6 ;m3=[3+4+5+6+7+8] Row4 + pmaddwd m5, [r6 + 3 * 16] + paddd m1, m5 ;m1=[1+2+3+4+5+6+7+8] Row2 end + + movq m5, [r0 + r1] + punpcklwd m4, m5 ;m4=[8 9] + pmaddwd m4, [r6 + 3 * 16] + paddd m2, m4 ;m2=[2+3+4+5+6+7+8+9] Row3 end + + movq m4, [r0 + 2 * r1] + punpcklwd m5, m4 ;m5=[9 10] + pmaddwd m5, [r6 + 3 * 16] + paddd m3, m5 ;m3=[3+4+5+6+7+8+9+10] Row4 end +%endmacro + 
+;-------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;-------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_LUMA_SP 2 +INIT_XMM sse4 +cglobal interp_8tap_vert_sp_%1x%2, 5, 7, 8 ,0-gprsize + + add r1d, r1d + lea r5, [r1 + 2 * r1] + sub r0, r5 + shl r4d, 6 + +%ifdef PIC + lea r5, [tab_LumaCoeffV] + lea r6, [r5 + r4] +%else + lea r6, [tab_LumaCoeffV + r4] +%endif + + mova m7, [tab_c_526336] + + mov dword [rsp], %2/4 +.loopH: + mov r4d, (%1/4) +.loopW: + PROCESS_LUMA_SP_W4_4R + + paddd m0, m7 + paddd m1, m7 + paddd m2, m7 + paddd m3, m7 + + psrad m0, 12 + psrad m1, 12 + psrad m2, 12 + psrad m3, 12 + + packssdw m0, m1 + packssdw m2, m3 + + packuswb m0, m2 + + movd [r2], m0 + pextrd [r2 + r3], m0, 1 + lea r5, [r2 + 2 * r3] + pextrd [r5], m0, 2 + pextrd [r5 + r3], m0, 3 + + lea r5, [8 * r1 - 2 * 4] + sub r0, r5 + add r2, 4 + + dec r4d + jnz .loopW + + lea r0, [r0 + 4 * r1 - 2 * %1] + lea r2, [r2 + 4 * r3 - %1] + + dec dword [rsp] + jnz .loopH + + RET +%endmacro + +;-------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;-------------------------------------------------------------------------------------------------------------- + FILTER_VER_LUMA_SP 4, 4 + FILTER_VER_LUMA_SP 8, 8 + FILTER_VER_LUMA_SP 8, 4 + FILTER_VER_LUMA_SP 4, 8 + FILTER_VER_LUMA_SP 16, 16 + FILTER_VER_LUMA_SP 16, 8 + FILTER_VER_LUMA_SP 8, 16 + FILTER_VER_LUMA_SP 16, 12 + FILTER_VER_LUMA_SP 12, 16 + FILTER_VER_LUMA_SP 16, 4 + FILTER_VER_LUMA_SP 4, 16 + FILTER_VER_LUMA_SP 32, 32 + FILTER_VER_LUMA_SP 32, 16 + FILTER_VER_LUMA_SP 16, 32 + FILTER_VER_LUMA_SP 32, 24 + FILTER_VER_LUMA_SP 24, 32 + 
FILTER_VER_LUMA_SP 32, 8 + FILTER_VER_LUMA_SP 8, 32 + FILTER_VER_LUMA_SP 64, 64 + FILTER_VER_LUMA_SP 64, 32 + FILTER_VER_LUMA_SP 32, 64 + FILTER_VER_LUMA_SP 64, 48 + FILTER_VER_LUMA_SP 48, 64 + FILTER_VER_LUMA_SP 64, 16 + FILTER_VER_LUMA_SP 16, 64 + +; TODO: combin of U and V is more performance, but need more register +; TODO: use two path for height alignment to 4 and otherwise may improvement 10% performance, but code is more complex, so I disable it +INIT_XMM ssse3 +cglobal chroma_p2s, 3, 7, 4 + + ; load width and height + mov r3d, r3m + mov r4d, r4m + + ; load constant + mova m2, [tab_c_128] + mova m3, [tab_c_64_n64] + +.loopH: + + xor r5d, r5d +.loopW: + lea r6, [r0 + r5] + + movh m0, [r6] + punpcklbw m0, m2 + pmaddubsw m0, m3 + + movh m1, [r6 + r1] + punpcklbw m1, m2 + pmaddubsw m1, m3 + + add r5d, 8 + cmp r5d, r3d + lea r6, [r2 + r5 * 2] + jg .width4 + movu [r6 + FENC_STRIDE / 2 * 0 - 16], m0 + movu [r6 + FENC_STRIDE / 2 * 2 - 16], m1 + je .nextH + jmp .loopW + +.width4: + test r3d, 4 + jz .width2 + test r3d, 2 + movh [r6 + FENC_STRIDE / 2 * 0 - 16], m0 + movh [r6 + FENC_STRIDE / 2 * 2 - 16], m1 + lea r6, [r6 + 8] + pshufd m0, m0, 2 + pshufd m1, m1, 2 + jz .nextH + +.width2: + movd [r6 + FENC_STRIDE / 2 * 0 - 16], m0 + movd [r6 + FENC_STRIDE / 2 * 2 - 16], m1 + +.nextH: + lea r0, [r0 + r1 * 2] + add r2, FENC_STRIDE / 2 * 4 + + sub r4d, 2 + jnz .loopH + + RET + +%macro PROCESS_CHROMA_SP_W4_4R 0 + movq m0, [r0] + movq m1, [r0 + r1] + punpcklwd m0, m1 ;m0=[0 1] + pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1 + + lea r0, [r0 + 2 * r1] + movq m4, [r0] + punpcklwd m1, m4 ;m1=[1 2] + pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2 + + movq m5, [r0 + r1] + punpcklwd m4, m5 ;m4=[2 3] + pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3 + pmaddwd m4, [r6 + 1 * 16] + paddd m0, m4 ;m0=[0+1+2+3] Row1 done + + lea r0, [r0 + 2 * r1] + movq m4, [r0] + punpcklwd m5, m4 ;m5=[3 4] + pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4 + pmaddwd m5, [r6 + 1 * 16] + paddd m1, m5 ;m1 = [1+2+3+4] Row2 + + 
movq m5, [r0 + r1] + punpcklwd m4, m5 ;m4=[4 5] + pmaddwd m4, [r6 + 1 * 16] + paddd m2, m4 ;m2=[2+3+4+5] Row3 + + movq m4, [r0 + 2 * r1] + punpcklwd m5, m4 ;m5=[5 6] + pmaddwd m5, [r6 + 1 * 16] + paddd m3, m5 ;m3=[3+4+5+6] Row4 +%endmacro + +;-------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;-------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_CHROMA_SP 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_sp_%1x%2, 5, 7, 7 ,0-gprsize + + add r1d, r1d + sub r0, r1 + shl r4d, 5 + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + lea r6, [r5 + r4] +%else + lea r6, [tab_ChromaCoeffV + r4] +%endif + + mova m6, [tab_c_526336] + + mov dword [rsp], %2/4 + +.loopH: + mov r4d, (%1/4) +.loopW: + PROCESS_CHROMA_SP_W4_4R + + paddd m0, m6 + paddd m1, m6 + paddd m2, m6 + paddd m3, m6 + + psrad m0, 12 + psrad m1, 12 + psrad m2, 12 + psrad m3, 12 + + packssdw m0, m1 + packssdw m2, m3 + + packuswb m0, m2 + + movd [r2], m0 + pextrd [r2 + r3], m0, 1 + lea r5, [r2 + 2 * r3] + pextrd [r5], m0, 2 + pextrd [r5 + r3], m0, 3 + + lea r5, [4 * r1 - 2 * 4] + sub r0, r5 + add r2, 4 + + dec r4d + jnz .loopW + + lea r0, [r0 + 4 * r1 - 2 * %1] + lea r2, [r2 + 4 * r3 - %1] + + dec dword [rsp] + jnz .loopH + + RET +%endmacro + + FILTER_VER_CHROMA_SP 4, 4 + FILTER_VER_CHROMA_SP 4, 8 + FILTER_VER_CHROMA_SP 16, 16 + FILTER_VER_CHROMA_SP 16, 8 + FILTER_VER_CHROMA_SP 16, 12 + FILTER_VER_CHROMA_SP 12, 16 + FILTER_VER_CHROMA_SP 16, 4 + FILTER_VER_CHROMA_SP 4, 16 + FILTER_VER_CHROMA_SP 32, 32 + FILTER_VER_CHROMA_SP 32, 16 + FILTER_VER_CHROMA_SP 16, 32 + FILTER_VER_CHROMA_SP 32, 24 + FILTER_VER_CHROMA_SP 24, 32 + FILTER_VER_CHROMA_SP 32, 8 + + FILTER_VER_CHROMA_SP 16, 24 + FILTER_VER_CHROMA_SP 16, 64 + FILTER_VER_CHROMA_SP 12, 32 + FILTER_VER_CHROMA_SP 4, 32 + 
FILTER_VER_CHROMA_SP 32, 64 + FILTER_VER_CHROMA_SP 32, 48 + FILTER_VER_CHROMA_SP 24, 64 + + FILTER_VER_CHROMA_SP 64, 64 + FILTER_VER_CHROMA_SP 64, 32 + FILTER_VER_CHROMA_SP 64, 48 + FILTER_VER_CHROMA_SP 48, 64 + FILTER_VER_CHROMA_SP 64, 16 + + +%macro PROCESS_CHROMA_SP_W2_4R 1 + movd m0, [r0] + movd m1, [r0 + r1] + punpcklwd m0, m1 ;m0=[0 1] + + lea r0, [r0 + 2 * r1] + movd m2, [r0] + punpcklwd m1, m2 ;m1=[1 2] + punpcklqdq m0, m1 ;m0=[0 1 1 2] + pmaddwd m0, [%1 + 0 *16] ;m0=[0+1 1+2] Row 1-2 + + movd m1, [r0 + r1] + punpcklwd m2, m1 ;m2=[2 3] + + lea r0, [r0 + 2 * r1] + movd m3, [r0] + punpcklwd m1, m3 ;m2=[3 4] + punpcklqdq m2, m1 ;m2=[2 3 3 4] + + pmaddwd m4, m2, [%1 + 1 * 16] ;m4=[2+3 3+4] Row 1-2 + pmaddwd m2, [%1 + 0 * 16] ;m2=[2+3 3+4] Row 3-4 + paddd m0, m4 ;m0=[0+1+2+3 1+2+3+4] Row 1-2 + + movd m1, [r0 + r1] + punpcklwd m3, m1 ;m3=[4 5] + + movd m4, [r0 + 2 * r1] + punpcklwd m1, m4 ;m1=[5 6] + punpcklqdq m3, m1 ;m2=[4 5 5 6] + pmaddwd m3, [%1 + 1 * 16] ;m3=[4+5 5+6] Row 3-4 + paddd m2, m3 ;m2=[2+3+4+5 3+4+5+6] Row 3-4 +%endmacro + +;------------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vertical_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_CHROMA_SP_W2_4R 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_sp_%1x%2, 5, 6, 6 + + add r1d, r1d + sub r0, r1 + shl r4d, 5 + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + lea r5, [r5 + r4] +%else + lea r5, [tab_ChromaCoeffV + r4] +%endif + + mova m5, [tab_c_526336] + + mov r4d, (%2/4) + +.loopH: + PROCESS_CHROMA_SP_W2_4R r5 + + paddd m0, m5 + paddd m2, m5 + + psrad m0, 12 + psrad m2, 12 + + packssdw m0, m2 + packuswb m0, m0 + + pextrw [r2], m0, 0 + pextrw [r2 + r3], m0, 1 + lea r2, [r2 + 2 * r3] + pextrw [r2], m0, 2 + pextrw [r2 + r3], m0, 3 + + lea r2, [r2 + 2 * r3] + 
+ dec r4d + jnz .loopH + + RET +%endmacro + +FILTER_VER_CHROMA_SP_W2_4R 2, 4 +FILTER_VER_CHROMA_SP_W2_4R 2, 8 + +FILTER_VER_CHROMA_SP_W2_4R 2, 16 + +;-------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_sp_4x2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;-------------------------------------------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal interp_4tap_vert_sp_4x2, 5, 6, 5 + + add r1d, r1d + sub r0, r1 + shl r4d, 5 + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + lea r5, [r5 + r4] +%else + lea r5, [tab_ChromaCoeffV + r4] +%endif + + mova m4, [tab_c_526336] + + movq m0, [r0] + movq m1, [r0 + r1] + punpcklwd m0, m1 ;m0=[0 1] + pmaddwd m0, [r5 + 0 *16] ;m0=[0+1] Row1 + + lea r0, [r0 + 2 * r1] + movq m2, [r0] + punpcklwd m1, m2 ;m1=[1 2] + pmaddwd m1, [r5 + 0 *16] ;m1=[1+2] Row2 + + movq m3, [r0 + r1] + punpcklwd m2, m3 ;m4=[2 3] + pmaddwd m2, [r5 + 1 * 16] + paddd m0, m2 ;m0=[0+1+2+3] Row1 done + paddd m0, m4 + psrad m0, 12 + + movq m2, [r0 + 2 * r1] + punpcklwd m3, m2 ;m5=[3 4] + pmaddwd m3, [r5 + 1 * 16] + paddd m1, m3 ;m1 = [1+2+3+4] Row2 done + paddd m1, m4 + psrad m1, 12 + + packssdw m0, m1 + packuswb m0, m0 + + movd [r2], m0 + pextrd [r2 + r3], m0, 1 + + RET + +;------------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vertical_sp_6x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_CHROMA_SP_W6_H4 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_sp_6x%2, 5, 7, 7 + + add r1d, r1d + sub r0, r1 + shl r4d, 5 + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + lea r6, [r5 + r4] +%else + lea r6, [tab_ChromaCoeffV + r4] +%endif + + mova m6, [tab_c_526336] + + mov r4d, %2/4 + 
+.loopH: + PROCESS_CHROMA_SP_W4_4R + + paddd m0, m6 + paddd m1, m6 + paddd m2, m6 + paddd m3, m6 + + psrad m0, 12 + psrad m1, 12 + psrad m2, 12 + psrad m3, 12 + + packssdw m0, m1 + packssdw m2, m3 + + packuswb m0, m2 + + movd [r2], m0 + pextrd [r2 + r3], m0, 1 + lea r5, [r2 + 2 * r3] + pextrd [r5], m0, 2 + pextrd [r5 + r3], m0, 3 + + lea r5, [4 * r1 - 2 * 4] + sub r0, r5 + add r2, 4 + + PROCESS_CHROMA_SP_W2_4R r6 + + paddd m0, m6 + paddd m2, m6 + + psrad m0, 12 + psrad m2, 12 + + packssdw m0, m2 + packuswb m0, m0 + + pextrw [r2], m0, 0 + pextrw [r2 + r3], m0, 1 + lea r2, [r2 + 2 * r3] + pextrw [r2], m0, 2 + pextrw [r2 + r3], m0, 3 + + sub r0, 2 * 4 + lea r2, [r2 + 2 * r3 - 4] + + dec r4d + jnz .loopH + + RET +%endmacro + +FILTER_VER_CHROMA_SP_W6_H4 6, 8 + +FILTER_VER_CHROMA_SP_W6_H4 6, 16 + +%macro PROCESS_CHROMA_SP_W8_2R 0 + movu m1, [r0] + movu m3, [r0 + r1] + punpcklwd m0, m1, m3 + pmaddwd m0, [r5 + 0 * 16] ;m0 = [0l+1l] Row1l + punpckhwd m1, m3 + pmaddwd m1, [r5 + 0 * 16] ;m1 = [0h+1h] Row1h + + movu m4, [r0 + 2 * r1] + punpcklwd m2, m3, m4 + pmaddwd m2, [r5 + 0 * 16] ;m2 = [1l+2l] Row2l + punpckhwd m3, m4 + pmaddwd m3, [r5 + 0 * 16] ;m3 = [1h+2h] Row2h + + lea r0, [r0 + 2 * r1] + movu m5, [r0 + r1] + punpcklwd m6, m4, m5 + pmaddwd m6, [r5 + 1 * 16] ;m6 = [2l+3l] Row1l + paddd m0, m6 ;m0 = [0l+1l+2l+3l] Row1l sum + punpckhwd m4, m5 + pmaddwd m4, [r5 + 1 * 16] ;m6 = [2h+3h] Row1h + paddd m1, m4 ;m1 = [0h+1h+2h+3h] Row1h sum + + movu m4, [r0 + 2 * r1] + punpcklwd m6, m5, m4 + pmaddwd m6, [r5 + 1 * 16] ;m6 = [3l+4l] Row2l + paddd m2, m6 ;m2 = [1l+2l+3l+4l] Row2l sum + punpckhwd m5, m4 + pmaddwd m5, [r5 + 1 * 16] ;m1 = [3h+4h] Row2h + paddd m3, m5 ;m3 = [1h+2h+3h+4h] Row2h sum +%endmacro + +;-------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_sp_8x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) 
+;-------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_CHROMA_SP_W8_H2 2 +INIT_XMM sse2 +cglobal interp_4tap_vert_sp_%1x%2, 5, 6, 8 + + add r1d, r1d + sub r0, r1 + shl r4d, 5 + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + lea r5, [r5 + r4] +%else + lea r5, [tab_ChromaCoeffV + r4] +%endif + + mova m7, [tab_c_526336] + + mov r4d, %2/2 +.loopH: + PROCESS_CHROMA_SP_W8_2R + + paddd m0, m7 + paddd m1, m7 + paddd m2, m7 + paddd m3, m7 + + psrad m0, 12 + psrad m1, 12 + psrad m2, 12 + psrad m3, 12 + + packssdw m0, m1 + packssdw m2, m3 + + packuswb m0, m2 + + movlps [r2], m0 + movhps [r2 + r3], m0 + + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loopH + + RET +%endmacro + +FILTER_VER_CHROMA_SP_W8_H2 8, 2 +FILTER_VER_CHROMA_SP_W8_H2 8, 4 +FILTER_VER_CHROMA_SP_W8_H2 8, 6 +FILTER_VER_CHROMA_SP_W8_H2 8, 8 +FILTER_VER_CHROMA_SP_W8_H2 8, 16 +FILTER_VER_CHROMA_SP_W8_H2 8, 32 + +FILTER_VER_CHROMA_SP_W8_H2 8, 12 +FILTER_VER_CHROMA_SP_W8_H2 8, 64 + + +;----------------------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_ps_2x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;----------------------------------------------------------------------------------------------------------------------------- +%macro FILTER_HORIZ_CHROMA_2xN 2 +INIT_XMM sse4 +cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 4, src, srcstride, dst, dststride +%define coef2 m3 +%define Tm0 m2 +%define t1 m1 +%define t0 m0 + + dec srcq + mov r4d, r4m + add dststrided, dststrided + +%ifdef PIC + lea r6, [tab_ChromaCoeff] + movd coef2, [r6 + r4 * 4] +%else + movd coef2, [tab_ChromaCoeff + r4 * 4] +%endif + + pshufd coef2, coef2, 0 + mova t1, [pw_2000] + mova Tm0, [tab_Tm] + + mov r4d, %2 + cmp r5m, byte 0 + je .loopH + sub srcq, srcstrideq + add r4d, 3 + +.loopH: + movh t0, [srcq] + pshufb t0, t0, Tm0 + pmaddubsw t0, coef2 + 
phaddw t0, t0 + psubw t0, t1 + movd [dstq], t0 + + lea srcq, [srcq + srcstrideq] + lea dstq, [dstq + dststrideq] + + dec r4d + jnz .loopH + + RET +%endmacro + +FILTER_HORIZ_CHROMA_2xN 2, 4 +FILTER_HORIZ_CHROMA_2xN 2, 8 + +FILTER_HORIZ_CHROMA_2xN 2, 16 + +;----------------------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_ps_4x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;----------------------------------------------------------------------------------------------------------------------------- +%macro FILTER_HORIZ_CHROMA_4xN 2 +INIT_XMM sse4 +cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 4, src, srcstride, dst, dststride +%define coef2 m3 +%define Tm0 m2 +%define t1 m1 +%define t0 m0 + + dec srcq + mov r4d, r4m + add dststrided, dststrided + +%ifdef PIC + lea r6, [tab_ChromaCoeff] + movd coef2, [r6 + r4 * 4] +%else + movd coef2, [tab_ChromaCoeff + r4 * 4] +%endif + + pshufd coef2, coef2, 0 + mova t1, [pw_2000] + mova Tm0, [tab_Tm] + + mov r4d, %2 + cmp r5m, byte 0 + je .loopH + sub srcq, srcstrideq + add r4d, 3 + +.loopH: + movh t0, [srcq] + pshufb t0, t0, Tm0 + pmaddubsw t0, coef2 + phaddw t0, t0 + psubw t0, t1 + movlps [dstq], t0 + + lea srcq, [srcq + srcstrideq] + lea dstq, [dstq + dststrideq] + + dec r4d + jnz .loopH + RET +%endmacro + +FILTER_HORIZ_CHROMA_4xN 4, 2 +FILTER_HORIZ_CHROMA_4xN 4, 4 +FILTER_HORIZ_CHROMA_4xN 4, 8 +FILTER_HORIZ_CHROMA_4xN 4, 16 + +FILTER_HORIZ_CHROMA_4xN 4, 32 + +%macro PROCESS_CHROMA_W6 3 + movu %1, [srcq] + pshufb %2, %1, Tm0 + pmaddubsw %2, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %2, %1 + psubw %2, %3 + movh [dstq], %2 + pshufd %2, %2, 2 + movd [dstq + 8], %2 +%endmacro + +%macro PROCESS_CHROMA_W12 3 + movu %1, [srcq] + pshufb %2, %1, Tm0 + pmaddubsw %2, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %2, %1 + psubw %2, %3 + movu [dstq], %2 + movu %1, [srcq + 8] + pshufb 
%1, %1, Tm0 + pmaddubsw %1, coef2 + phaddw %1, %1 + psubw %1, %3 + movh [dstq + 16], %1 +%endmacro + +;----------------------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_ps_6x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;----------------------------------------------------------------------------------------------------------------------------- +%macro FILTER_HORIZ_CHROMA 2 +INIT_XMM sse4 +cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 6, src, srcstride, dst, dststride +%define coef2 m5 +%define Tm0 m4 +%define Tm1 m3 +%define t2 m2 +%define t1 m1 +%define t0 m0 + + dec srcq + mov r4d, r4m + add dststrided, dststrided + +%ifdef PIC + lea r6, [tab_ChromaCoeff] + movd coef2, [r6 + r4 * 4] +%else + movd coef2, [tab_ChromaCoeff + r4 * 4] +%endif + + pshufd coef2, coef2, 0 + mova t2, [pw_2000] + mova Tm0, [tab_Tm] + mova Tm1, [tab_Tm + 16] + + mov r4d, %2 + cmp r5m, byte 0 + je .loopH + sub srcq, srcstrideq + add r4d, 3 + +.loopH: + PROCESS_CHROMA_W%1 t0, t1, t2 + add srcq, srcstrideq + add dstq, dststrideq + + dec r4d + jnz .loopH + + RET +%endmacro + +FILTER_HORIZ_CHROMA 6, 8 +FILTER_HORIZ_CHROMA 12, 16 + +FILTER_HORIZ_CHROMA 6, 16 +FILTER_HORIZ_CHROMA 12, 32 + +%macro PROCESS_CHROMA_W8 3 + movu %1, [srcq] + pshufb %2, %1, Tm0 + pmaddubsw %2, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %2, %1 + psubw %2, %3 + movu [dstq], %2 +%endmacro + +;----------------------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_ps_8x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;----------------------------------------------------------------------------------------------------------------------------- +%macro FILTER_HORIZ_CHROMA_8xN 2 +INIT_XMM sse4 +cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 6, src, srcstride, 
dst, dststride +%define coef2 m5 +%define Tm0 m4 +%define Tm1 m3 +%define t2 m2 +%define t1 m1 +%define t0 m0 + + dec srcq + mov r4d, r4m + add dststrided, dststrided + +%ifdef PIC + lea r6, [tab_ChromaCoeff] + movd coef2, [r6 + r4 * 4] +%else + movd coef2, [tab_ChromaCoeff + r4 * 4] +%endif + + pshufd coef2, coef2, 0 + mova t2, [pw_2000] + mova Tm0, [tab_Tm] + mova Tm1, [tab_Tm + 16] + + mov r4d, %2 + cmp r5m, byte 0 + je .loopH + sub srcq, srcstrideq + add r4d, 3 + +.loopH: + PROCESS_CHROMA_W8 t0, t1, t2 + add srcq, srcstrideq + add dstq, dststrideq + + dec r4d + jnz .loopH + + RET +%endmacro + +FILTER_HORIZ_CHROMA_8xN 8, 2 +FILTER_HORIZ_CHROMA_8xN 8, 4 +FILTER_HORIZ_CHROMA_8xN 8, 6 +FILTER_HORIZ_CHROMA_8xN 8, 8 +FILTER_HORIZ_CHROMA_8xN 8, 16 +FILTER_HORIZ_CHROMA_8xN 8, 32 + +FILTER_HORIZ_CHROMA_8xN 8, 12 +FILTER_HORIZ_CHROMA_8xN 8, 64 + +%macro PROCESS_CHROMA_W16 4 + movu %1, [srcq] + pshufb %2, %1, Tm0 + pmaddubsw %2, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %2, %1 + movu %1, [srcq + 8] + pshufb %4, %1, Tm0 + pmaddubsw %4, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %4, %1 + psubw %2, %3 + psubw %4, %3 + movu [dstq], %2 + movu [dstq + 16], %4 +%endmacro + +%macro PROCESS_CHROMA_W24 4 + movu %1, [srcq] + pshufb %2, %1, Tm0 + pmaddubsw %2, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %2, %1 + movu %1, [srcq + 8] + pshufb %4, %1, Tm0 + pmaddubsw %4, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %4, %1 + psubw %2, %3 + psubw %4, %3 + movu [dstq], %2 + movu [dstq + 16], %4 + movu %1, [srcq + 16] + pshufb %2, %1, Tm0 + pmaddubsw %2, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %2, %1 + psubw %2, %3 + movu [dstq + 32], %2 +%endmacro + +%macro PROCESS_CHROMA_W32 4 + movu %1, [srcq] + pshufb %2, %1, Tm0 + pmaddubsw %2, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %2, %1 + movu %1, [srcq + 8] + pshufb %4, %1, Tm0 + pmaddubsw %4, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %4, 
%1 + psubw %2, %3 + psubw %4, %3 + movu [dstq], %2 + movu [dstq + 16], %4 + movu %1, [srcq + 16] + pshufb %2, %1, Tm0 + pmaddubsw %2, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %2, %1 + movu %1, [srcq + 24] + pshufb %4, %1, Tm0 + pmaddubsw %4, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %4, %1 + psubw %2, %3 + psubw %4, %3 + movu [dstq + 32], %2 + movu [dstq + 48], %4 +%endmacro + +%macro PROCESS_CHROMA_W16o 5 + movu %1, [srcq + %5] + pshufb %2, %1, Tm0 + pmaddubsw %2, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %2, %1 + movu %1, [srcq + %5 + 8] + pshufb %4, %1, Tm0 + pmaddubsw %4, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %4, %1 + psubw %2, %3 + psubw %4, %3 + movu [dstq + %5 * 2], %2 + movu [dstq + %5 * 2 + 16], %4 +%endmacro + +%macro PROCESS_CHROMA_W48 4 + PROCESS_CHROMA_W16o %1, %2, %3, %4, 0 + PROCESS_CHROMA_W16o %1, %2, %3, %4, 16 + PROCESS_CHROMA_W16o %1, %2, %3, %4, 32 +%endmacro + +%macro PROCESS_CHROMA_W64 4 + PROCESS_CHROMA_W16o %1, %2, %3, %4, 0 + PROCESS_CHROMA_W16o %1, %2, %3, %4, 16 + PROCESS_CHROMA_W16o %1, %2, %3, %4, 32 + PROCESS_CHROMA_W16o %1, %2, %3, %4, 48 +%endmacro + +;------------------------------------------------------------------------------------------------------------------------------ +; void interp_4tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;------------------------------------------------------------------------------------------------------------------------------ +%macro FILTER_HORIZ_CHROMA_WxN 2 +INIT_XMM sse4 +cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 7, src, srcstride, dst, dststride +%define coef2 m6 +%define Tm0 m5 +%define Tm1 m4 +%define t3 m3 +%define t2 m2 +%define t1 m1 +%define t0 m0 + + dec srcq + mov r4d, r4m + add dststrided, dststrided + +%ifdef PIC + lea r6, [tab_ChromaCoeff] + movd coef2, [r6 + r4 * 4] +%else + movd coef2, [tab_ChromaCoeff + r4 * 4] +%endif + + pshufd coef2, coef2, 
0 + mova t2, [pw_2000] + mova Tm0, [tab_Tm] + mova Tm1, [tab_Tm + 16] + + mov r4d, %2 + cmp r5m, byte 0 + je .loopH + sub srcq, srcstrideq + add r4d, 3 + +.loopH: + PROCESS_CHROMA_W%1 t0, t1, t2, t3 + add srcq, srcstrideq + add dstq, dststrideq + + dec r4d + jnz .loopH + + RET +%endmacro + +FILTER_HORIZ_CHROMA_WxN 16, 4 +FILTER_HORIZ_CHROMA_WxN 16, 8 +FILTER_HORIZ_CHROMA_WxN 16, 12 +FILTER_HORIZ_CHROMA_WxN 16, 16 +FILTER_HORIZ_CHROMA_WxN 16, 32 +FILTER_HORIZ_CHROMA_WxN 24, 32 +FILTER_HORIZ_CHROMA_WxN 32, 8 +FILTER_HORIZ_CHROMA_WxN 32, 16 +FILTER_HORIZ_CHROMA_WxN 32, 24 +FILTER_HORIZ_CHROMA_WxN 32, 32 + +FILTER_HORIZ_CHROMA_WxN 16, 24 +FILTER_HORIZ_CHROMA_WxN 16, 64 +FILTER_HORIZ_CHROMA_WxN 24, 64 +FILTER_HORIZ_CHROMA_WxN 32, 48 +FILTER_HORIZ_CHROMA_WxN 32, 64 + +FILTER_HORIZ_CHROMA_WxN 64, 64 +FILTER_HORIZ_CHROMA_WxN 64, 32 +FILTER_HORIZ_CHROMA_WxN 64, 48 +FILTER_HORIZ_CHROMA_WxN 48, 64 +FILTER_HORIZ_CHROMA_WxN 64, 16 + + +;--------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;--------------------------------------------------------------------------------------------------------------- +%macro FILTER_V_PS_W16n 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_ps_%1x%2, 4, 7, 8 + + mov r4d, r4m + sub r0, r1 + add r3d, r3d + +%ifdef PIC + lea r5, [tab_ChromaCoeff] + movd m0, [r5 + r4 * 4] +%else + movd m0, [tab_ChromaCoeff + r4 * 4] +%endif + + pshufb m1, m0, [tab_Vm] + pshufb m0, [tab_Vm + 16] + mov r4d, %2/2 + +.loop: + + mov r6d, %1/16 + +.loopW: + + movu m2, [r0] + movu m3, [r0 + r1] + + punpcklbw m4, m2, m3 + punpckhbw m2, m3 + + pmaddubsw m4, m1 + pmaddubsw m2, m1 + + lea r5, [r0 + 2 * r1] + movu m5, [r5] + movu m7, [r5 + r1] + + punpcklbw m6, m5, m7 + pmaddubsw m6, m0 + paddw m4, m6 + + punpckhbw m6, m5, m7 + pmaddubsw m6, m0 + paddw m2, m6 + + mova m6, [pw_2000] + + psubw m4, m6 + psubw m2, m6 
+ + movu [r2], m4 + movu [r2 + 16], m2 + + punpcklbw m4, m3, m5 + punpckhbw m3, m5 + + pmaddubsw m4, m1 + pmaddubsw m3, m1 + + movu m5, [r5 + 2 * r1] + + punpcklbw m2, m7, m5 + punpckhbw m7, m5 + + pmaddubsw m2, m0 + pmaddubsw m7, m0 + + paddw m4, m2 + paddw m3, m7 + + psubw m4, m6 + psubw m3, m6 + + movu [r2 + r3], m4 + movu [r2 + r3 + 16], m3 + + add r0, 16 + add r2, 32 + dec r6d + jnz .loopW + + lea r0, [r0 + r1 * 2 - %1] + lea r2, [r2 + r3 * 2 - %1 * 2] + + dec r4d + jnz .loop + RET +%endmacro + +FILTER_V_PS_W16n 64, 64 +FILTER_V_PS_W16n 64, 32 +FILTER_V_PS_W16n 64, 48 +FILTER_V_PS_W16n 48, 64 +FILTER_V_PS_W16n 64, 16 + + +;------------------------------------------------------------------------------------------------------------ +;void interp_4tap_vert_ps_2x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------ +INIT_XMM sse4 +cglobal interp_4tap_vert_ps_2x4, 4, 6, 7 + + mov r4d, r4m + sub r0, r1 + add r3d, r3d + +%ifdef PIC + lea r5, [tab_ChromaCoeff] + movd m0, [r5 + r4 * 4] +%else + movd m0, [tab_ChromaCoeff + r4 * 4] +%endif + + pshufb m0, [tab_Cm] + + lea r5, [3 * r1] + + movd m2, [r0] + movd m3, [r0 + r1] + movd m4, [r0 + 2 * r1] + movd m5, [r0 + r5] + + punpcklbw m2, m3 + punpcklbw m6, m4, m5 + punpcklbw m2, m6 + + pmaddubsw m2, m0 + + lea r0, [r0 + 4 * r1] + movd m6, [r0] + + punpcklbw m3, m4 + punpcklbw m1, m5, m6 + punpcklbw m3, m1 + + pmaddubsw m3, m0 + phaddw m2, m3 + + mova m1, [pw_2000] + + psubw m2, m1 + + movd [r2], m2 + pextrd [r2 + r3], m2, 2 + + movd m2, [r0 + r1] + + punpcklbw m4, m5 + punpcklbw m3, m6, m2 + punpcklbw m4, m3 + + pmaddubsw m4, m0 + + movd m3, [r0 + 2 * r1] + + punpcklbw m5, m6 + punpcklbw m2, m3 + punpcklbw m5, m2 + + pmaddubsw m5, m0 + phaddw m4, m5 + psubw m4, m1 + + lea r2, [r2 + 2 * r3] + movd [r2], m4 + pextrd [r2 + r3], m4, 2 + + RET + 
+;------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_ps_2x8(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +%macro FILTER_V_PS_W2 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_ps_2x%2, 4, 6, 8 + + mov r4d, r4m + sub r0, r1 + add r3d, r3d + +%ifdef PIC + lea r5, [tab_ChromaCoeff] + movd m0, [r5 + r4 * 4] +%else + movd m0, [tab_ChromaCoeff + r4 * 4] +%endif + + pshufb m0, [tab_Cm] + + mova m1, [pw_2000] + lea r5, [3 * r1] + mov r4d, %2/4 +.loop: + movd m2, [r0] + movd m3, [r0 + r1] + movd m4, [r0 + 2 * r1] + movd m5, [r0 + r5] + + punpcklbw m2, m3 + punpcklbw m6, m4, m5 + punpcklbw m2, m6 + + pmaddubsw m2, m0 + + lea r0, [r0 + 4 * r1] + movd m6, [r0] + + punpcklbw m3, m4 + punpcklbw m7, m5, m6 + punpcklbw m3, m7 + + pmaddubsw m3, m0 + + phaddw m2, m3 + psubw m2, m1 + + + movd [r2], m2 + pshufd m2, m2, 2 + movd [r2 + r3], m2 + + movd m2, [r0 + r1] + + punpcklbw m4, m5 + punpcklbw m3, m6, m2 + punpcklbw m4, m3 + + pmaddubsw m4, m0 + + movd m3, [r0 + 2 * r1] + + punpcklbw m5, m6 + punpcklbw m2, m3 + punpcklbw m5, m2 + + pmaddubsw m5, m0 + + phaddw m4, m5 + + psubw m4, m1 + + lea r2, [r2 + 2 * r3] + movd [r2], m4 + pshufd m4 , m4 ,2 + movd [r2 + r3], m4 + + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loop + +RET +%endmacro + +FILTER_V_PS_W2 2, 8 + +FILTER_V_PS_W2 2, 16 + +;----------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_CHROMA_SS 2 +INIT_XMM sse2 +cglobal interp_4tap_vert_ss_%1x%2, 5, 7, 6 ,0-gprsize + + add r1d, r1d + add r3d, r3d + sub r0, 
r1 + shl r4d, 5 + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + lea r6, [r5 + r4] +%else + lea r6, [tab_ChromaCoeffV + r4] +%endif + + mov dword [rsp], %2/4 + +.loopH: + mov r4d, (%1/4) +.loopW: + PROCESS_CHROMA_SP_W4_4R + + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 + + packssdw m0, m1 + packssdw m2, m3 + + movlps [r2], m0 + movhps [r2 + r3], m0 + lea r5, [r2 + 2 * r3] + movlps [r5], m2 + movhps [r5 + r3], m2 + + lea r5, [4 * r1 - 2 * 4] + sub r0, r5 + add r2, 2 * 4 + + dec r4d + jnz .loopW + + lea r0, [r0 + 4 * r1 - 2 * %1] + lea r2, [r2 + 4 * r3 - 2 * %1] + + dec dword [rsp] + jnz .loopH + + RET +%endmacro + + FILTER_VER_CHROMA_SS 4, 4 + FILTER_VER_CHROMA_SS 4, 8 + FILTER_VER_CHROMA_SS 16, 16 + FILTER_VER_CHROMA_SS 16, 8 + FILTER_VER_CHROMA_SS 16, 12 + FILTER_VER_CHROMA_SS 12, 16 + FILTER_VER_CHROMA_SS 16, 4 + FILTER_VER_CHROMA_SS 4, 16 + FILTER_VER_CHROMA_SS 32, 32 + FILTER_VER_CHROMA_SS 32, 16 + FILTER_VER_CHROMA_SS 16, 32 + FILTER_VER_CHROMA_SS 32, 24 + FILTER_VER_CHROMA_SS 24, 32 + FILTER_VER_CHROMA_SS 32, 8 + + FILTER_VER_CHROMA_SS 16, 24 + FILTER_VER_CHROMA_SS 12, 32 + FILTER_VER_CHROMA_SS 4, 32 + FILTER_VER_CHROMA_SS 32, 64 + FILTER_VER_CHROMA_SS 16, 64 + FILTER_VER_CHROMA_SS 32, 48 + FILTER_VER_CHROMA_SS 24, 64 + + FILTER_VER_CHROMA_SS 64, 64 + FILTER_VER_CHROMA_SS 64, 32 + FILTER_VER_CHROMA_SS 64, 48 + FILTER_VER_CHROMA_SS 48, 64 + FILTER_VER_CHROMA_SS 64, 16 + + +;--------------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vertical_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;--------------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_CHROMA_SS_W2_4R 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_ss_%1x%2, 5, 6, 5 + + add r1d, r1d + add r3d, r3d + sub r0, r1 + shl r4d, 5 + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + lea r5, [r5 + r4] +%else + lea r5, 
[tab_ChromaCoeffV + r4] +%endif + + mov r4d, (%2/4) + +.loopH: + PROCESS_CHROMA_SP_W2_4R r5 + + psrad m0, 6 + psrad m2, 6 + + packssdw m0, m2 + + movd [r2], m0 + pextrd [r2 + r3], m0, 1 + lea r2, [r2 + 2 * r3] + pextrd [r2], m0, 2 + pextrd [r2 + r3], m0, 3 + + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loopH + + RET +%endmacro + +FILTER_VER_CHROMA_SS_W2_4R 2, 4 +FILTER_VER_CHROMA_SS_W2_4R 2, 8 + +FILTER_VER_CHROMA_SS_W2_4R 2, 16 + +;--------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_ss_4x2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;--------------------------------------------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal interp_4tap_vert_ss_4x2, 5, 6, 4 + + add r1d, r1d + add r3d, r3d + sub r0, r1 + shl r4d, 5 + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + lea r5, [r5 + r4] +%else + lea r5, [tab_ChromaCoeffV + r4] +%endif + + movq m0, [r0] + movq m1, [r0 + r1] + punpcklwd m0, m1 ;m0=[0 1] + pmaddwd m0, [r5 + 0 *16] ;m0=[0+1] Row1 + + lea r0, [r0 + 2 * r1] + movq m2, [r0] + punpcklwd m1, m2 ;m1=[1 2] + pmaddwd m1, [r5 + 0 *16] ;m1=[1+2] Row2 + + movq m3, [r0 + r1] + punpcklwd m2, m3 ;m4=[2 3] + pmaddwd m2, [r5 + 1 * 16] + paddd m0, m2 ;m0=[0+1+2+3] Row1 done + psrad m0, 6 + + movq m2, [r0 + 2 * r1] + punpcklwd m3, m2 ;m5=[3 4] + pmaddwd m3, [r5 + 1 * 16] + paddd m1, m3 ;m1=[1+2+3+4] Row2 done + psrad m1, 6 + + packssdw m0, m1 + + movlps [r2], m0 + movhps [r2 + r3], m0 + + RET + +;------------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vertical_ss_6x8(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_CHROMA_SS_W6_H4 2 +INIT_XMM sse4 +cglobal 
interp_4tap_vert_ss_6x%2, 5, 7, 6 + + add r1d, r1d + add r3d, r3d + sub r0, r1 + shl r4d, 5 + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + lea r6, [r5 + r4] +%else + lea r6, [tab_ChromaCoeffV + r4] +%endif + + mov r4d, %2/4 + +.loopH: + PROCESS_CHROMA_SP_W4_4R + + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 + + packssdw m0, m1 + packssdw m2, m3 + + movlps [r2], m0 + movhps [r2 + r3], m0 + lea r5, [r2 + 2 * r3] + movlps [r5], m2 + movhps [r5 + r3], m2 + + lea r5, [4 * r1 - 2 * 4] + sub r0, r5 + add r2, 2 * 4 + + PROCESS_CHROMA_SP_W2_4R r6 + + psrad m0, 6 + psrad m2, 6 + + packssdw m0, m2 + + movd [r2], m0 + pextrd [r2 + r3], m0, 1 + lea r2, [r2 + 2 * r3] + pextrd [r2], m0, 2 + pextrd [r2 + r3], m0, 3 + + sub r0, 2 * 4 + lea r2, [r2 + 2 * r3 - 2 * 4] + + dec r4d + jnz .loopH + + RET +%endmacro + +FILTER_VER_CHROMA_SS_W6_H4 6, 8 + +FILTER_VER_CHROMA_SS_W6_H4 6, 16 + + +;---------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_ss_8x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;---------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_CHROMA_SS_W8_H2 2 +INIT_XMM sse2 +cglobal interp_4tap_vert_ss_%1x%2, 5, 6, 7 + + add r1d, r1d + add r3d, r3d + sub r0, r1 + shl r4d, 5 + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + lea r5, [r5 + r4] +%else + lea r5, [tab_ChromaCoeffV + r4] +%endif + + mov r4d, %2/2 +.loopH: + PROCESS_CHROMA_SP_W8_2R + + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 + + packssdw m0, m1 + packssdw m2, m3 + + movu [r2], m0 + movu [r2 + r3], m2 + + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loopH + + RET +%endmacro + +FILTER_VER_CHROMA_SS_W8_H2 8, 2 +FILTER_VER_CHROMA_SS_W8_H2 8, 4 +FILTER_VER_CHROMA_SS_W8_H2 8, 6 +FILTER_VER_CHROMA_SS_W8_H2 8, 8 +FILTER_VER_CHROMA_SS_W8_H2 8, 16 +FILTER_VER_CHROMA_SS_W8_H2 8, 32 + +FILTER_VER_CHROMA_SS_W8_H2 8, 12 
+FILTER_VER_CHROMA_SS_W8_H2 8, 64 + +;----------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_LUMA_SS 2 +INIT_XMM sse2 +cglobal interp_8tap_vert_ss_%1x%2, 5, 7, 7 ,0-gprsize + + add r1d, r1d + add r3d, r3d + lea r5, [3 * r1] + sub r0, r5 + shl r4d, 6 + +%ifdef PIC + lea r5, [tab_LumaCoeffV] + lea r6, [r5 + r4] +%else + lea r6, [tab_LumaCoeffV + r4] +%endif + + mov dword [rsp], %2/4 +.loopH: + mov r4d, (%1/4) +.loopW: + movq m0, [r0] + movq m1, [r0 + r1] + punpcklwd m0, m1 ;m0=[0 1] + pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1 + + lea r0, [r0 + 2 * r1] + movq m4, [r0] + punpcklwd m1, m4 ;m1=[1 2] + pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2 + + movq m5, [r0 + r1] + punpcklwd m4, m5 ;m4=[2 3] + pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3 + pmaddwd m4, [r6 + 1 * 16] + paddd m0, m4 ;m0=[0+1+2+3] Row1 + + lea r0, [r0 + 2 * r1] + movq m4, [r0] + punpcklwd m5, m4 ;m5=[3 4] + pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4 + pmaddwd m5, [r6 + 1 * 16] + paddd m1, m5 ;m1 = [1+2+3+4] Row2 + + movq m5, [r0 + r1] + punpcklwd m4, m5 ;m4=[4 5] + pmaddwd m6, m4, [r6 + 1 * 16] + paddd m2, m6 ;m2=[2+3+4+5] Row3 + pmaddwd m4, [r6 + 2 * 16] + paddd m0, m4 ;m0=[0+1+2+3+4+5] Row1 + + lea r0, [r0 + 2 * r1] + movq m4, [r0] + punpcklwd m5, m4 ;m5=[5 6] + pmaddwd m6, m5, [r6 + 1 * 16] + paddd m3, m6 ;m3=[3+4+5+6] Row4 + pmaddwd m5, [r6 + 2 * 16] + paddd m1, m5 ;m1=[1+2+3+4+5+6] Row2 + + movq m5, [r0 + r1] + punpcklwd m4, m5 ;m4=[6 7] + pmaddwd m6, m4, [r6 + 2 * 16] + paddd m2, m6 ;m2=[2+3+4+5+6+7] Row3 + pmaddwd m4, [r6 + 3 * 16] + paddd m0, m4 ;m0=[0+1+2+3+4+5+6+7] Row1 end + psrad m0, 6 + + lea r0, [r0 + 2 * r1] + movq m4, [r0] + punpcklwd m5, m4 ;m5=[7 8] + pmaddwd m6, m5, [r6 + 2 * 16] + paddd 
m3, m6 ;m3=[3+4+5+6+7+8] Row4 + pmaddwd m5, [r6 + 3 * 16] + paddd m1, m5 ;m1=[1+2+3+4+5+6+7+8] Row2 end + psrad m1, 6 + + packssdw m0, m1 + + movlps [r2], m0 + movhps [r2 + r3], m0 + + movq m5, [r0 + r1] + punpcklwd m4, m5 ;m4=[8 9] + pmaddwd m4, [r6 + 3 * 16] + paddd m2, m4 ;m2=[2+3+4+5+6+7+8+9] Row3 end + psrad m2, 6 + + movq m4, [r0 + 2 * r1] + punpcklwd m5, m4 ;m5=[9 10] + pmaddwd m5, [r6 + 3 * 16] + paddd m3, m5 ;m3=[3+4+5+6+7+8+9+10] Row4 end + psrad m3, 6 + + packssdw m2, m3 + + movlps [r2 + 2 * r3], m2 + lea r5, [3 * r3] + movhps [r2 + r5], m2 + + lea r5, [8 * r1 - 2 * 4] + sub r0, r5 + add r2, 2 * 4 + + dec r4d + jnz .loopW + + lea r0, [r0 + 4 * r1 - 2 * %1] + lea r2, [r2 + 4 * r3 - 2 * %1] + + dec dword [rsp] + jnz .loopH + + RET +%endmacro + + FILTER_VER_LUMA_SS 4, 4 + FILTER_VER_LUMA_SS 8, 8 + FILTER_VER_LUMA_SS 8, 4 + FILTER_VER_LUMA_SS 4, 8 + FILTER_VER_LUMA_SS 16, 16 + FILTER_VER_LUMA_SS 16, 8 + FILTER_VER_LUMA_SS 8, 16 + FILTER_VER_LUMA_SS 16, 12 + FILTER_VER_LUMA_SS 12, 16 + FILTER_VER_LUMA_SS 16, 4 + FILTER_VER_LUMA_SS 4, 16 + FILTER_VER_LUMA_SS 32, 32 + FILTER_VER_LUMA_SS 32, 16 + FILTER_VER_LUMA_SS 16, 32 + FILTER_VER_LUMA_SS 32, 24 + FILTER_VER_LUMA_SS 24, 32 + FILTER_VER_LUMA_SS 32, 8 + FILTER_VER_LUMA_SS 8, 32 + FILTER_VER_LUMA_SS 64, 64 + FILTER_VER_LUMA_SS 64, 32 + FILTER_VER_LUMA_SS 32, 64 + FILTER_VER_LUMA_SS 64, 48 + FILTER_VER_LUMA_SS 48, 64 + FILTER_VER_LUMA_SS 64, 16 + FILTER_VER_LUMA_SS 16, 64 diff --git a/source/common/x86/ipfilter8.h b/source/common/x86/ipfilter8.h new file mode 100644 index 0000000..3949409 --- /dev/null +++ b/source/common/x86/ipfilter8.h @@ -0,0 +1,629 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, 
or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#ifndef X265_IPFILTER8_H +#define X265_IPFILTER8_H + +#define SETUP_LUMA_FUNC_DEF(W, H, cpu) \ + void x265_interp_8tap_horiz_pp_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx); \ + void x265_interp_8tap_horiz_ps_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx, int isRowExt); \ + void x265_interp_8tap_vert_pp_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx); \ + void x265_interp_8tap_vert_ps_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx); + +#define LUMA_FILTERS(cpu) \ + SETUP_LUMA_FUNC_DEF(4, 4, cpu); \ + SETUP_LUMA_FUNC_DEF(8, 8, cpu); \ + SETUP_LUMA_FUNC_DEF(8, 4, cpu); \ + SETUP_LUMA_FUNC_DEF(4, 8, cpu); \ + SETUP_LUMA_FUNC_DEF(16, 16, cpu); \ + SETUP_LUMA_FUNC_DEF(16, 8, cpu); \ + SETUP_LUMA_FUNC_DEF(8, 16, cpu); \ + SETUP_LUMA_FUNC_DEF(16, 12, cpu); \ + SETUP_LUMA_FUNC_DEF(12, 16, cpu); \ + SETUP_LUMA_FUNC_DEF(16, 4, cpu); \ + SETUP_LUMA_FUNC_DEF(4, 16, cpu); \ + SETUP_LUMA_FUNC_DEF(32, 32, cpu); \ + SETUP_LUMA_FUNC_DEF(32, 16, cpu); \ + SETUP_LUMA_FUNC_DEF(16, 32, cpu); \ + SETUP_LUMA_FUNC_DEF(32, 24, cpu); \ + 
SETUP_LUMA_FUNC_DEF(24, 32, cpu); \ + SETUP_LUMA_FUNC_DEF(32, 8, cpu); \ + SETUP_LUMA_FUNC_DEF(8, 32, cpu); \ + SETUP_LUMA_FUNC_DEF(64, 64, cpu); \ + SETUP_LUMA_FUNC_DEF(64, 32, cpu); \ + SETUP_LUMA_FUNC_DEF(32, 64, cpu); \ + SETUP_LUMA_FUNC_DEF(64, 48, cpu); \ + SETUP_LUMA_FUNC_DEF(48, 64, cpu); \ + SETUP_LUMA_FUNC_DEF(64, 16, cpu); \ + SETUP_LUMA_FUNC_DEF(16, 64, cpu) + +#define SETUP_LUMA_SP_FUNC_DEF(W, H, cpu) \ + void x265_interp_8tap_vert_sp_ ## W ## x ## H ## cpu(int16_t * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx); + +#define LUMA_SP_FILTERS(cpu) \ + SETUP_LUMA_SP_FUNC_DEF(4, 4, cpu); \ + SETUP_LUMA_SP_FUNC_DEF(8, 8, cpu); \ + SETUP_LUMA_SP_FUNC_DEF(8, 4, cpu); \ + SETUP_LUMA_SP_FUNC_DEF(4, 8, cpu); \ + SETUP_LUMA_SP_FUNC_DEF(16, 16, cpu); \ + SETUP_LUMA_SP_FUNC_DEF(16, 8, cpu); \ + SETUP_LUMA_SP_FUNC_DEF(8, 16, cpu); \ + SETUP_LUMA_SP_FUNC_DEF(16, 12, cpu); \ + SETUP_LUMA_SP_FUNC_DEF(12, 16, cpu); \ + SETUP_LUMA_SP_FUNC_DEF(16, 4, cpu); \ + SETUP_LUMA_SP_FUNC_DEF(4, 16, cpu); \ + SETUP_LUMA_SP_FUNC_DEF(32, 32, cpu); \ + SETUP_LUMA_SP_FUNC_DEF(32, 16, cpu); \ + SETUP_LUMA_SP_FUNC_DEF(16, 32, cpu); \ + SETUP_LUMA_SP_FUNC_DEF(32, 24, cpu); \ + SETUP_LUMA_SP_FUNC_DEF(24, 32, cpu); \ + SETUP_LUMA_SP_FUNC_DEF(32, 8, cpu); \ + SETUP_LUMA_SP_FUNC_DEF(8, 32, cpu); \ + SETUP_LUMA_SP_FUNC_DEF(64, 64, cpu); \ + SETUP_LUMA_SP_FUNC_DEF(64, 32, cpu); \ + SETUP_LUMA_SP_FUNC_DEF(32, 64, cpu); \ + SETUP_LUMA_SP_FUNC_DEF(64, 48, cpu); \ + SETUP_LUMA_SP_FUNC_DEF(48, 64, cpu); \ + SETUP_LUMA_SP_FUNC_DEF(64, 16, cpu); \ + SETUP_LUMA_SP_FUNC_DEF(16, 64, cpu); + +#define SETUP_LUMA_SS_FUNC_DEF(W, H, cpu) \ + void x265_interp_8tap_vert_ss_ ## W ## x ## H ## cpu(int16_t * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx); + +#define LUMA_SS_FILTERS(cpu) \ + SETUP_LUMA_SS_FUNC_DEF(4, 4, cpu); \ + SETUP_LUMA_SS_FUNC_DEF(8, 8, cpu); \ + SETUP_LUMA_SS_FUNC_DEF(8, 4, cpu); \ + SETUP_LUMA_SS_FUNC_DEF(4, 8, cpu); \ + 
SETUP_LUMA_SS_FUNC_DEF(16, 16, cpu); \ + SETUP_LUMA_SS_FUNC_DEF(16, 8, cpu); \ + SETUP_LUMA_SS_FUNC_DEF(8, 16, cpu); \ + SETUP_LUMA_SS_FUNC_DEF(16, 12, cpu); \ + SETUP_LUMA_SS_FUNC_DEF(12, 16, cpu); \ + SETUP_LUMA_SS_FUNC_DEF(16, 4, cpu); \ + SETUP_LUMA_SS_FUNC_DEF(4, 16, cpu); \ + SETUP_LUMA_SS_FUNC_DEF(32, 32, cpu); \ + SETUP_LUMA_SS_FUNC_DEF(32, 16, cpu); \ + SETUP_LUMA_SS_FUNC_DEF(16, 32, cpu); \ + SETUP_LUMA_SS_FUNC_DEF(32, 24, cpu); \ + SETUP_LUMA_SS_FUNC_DEF(24, 32, cpu); \ + SETUP_LUMA_SS_FUNC_DEF(32, 8, cpu); \ + SETUP_LUMA_SS_FUNC_DEF(8, 32, cpu); \ + SETUP_LUMA_SS_FUNC_DEF(64, 64, cpu); \ + SETUP_LUMA_SS_FUNC_DEF(64, 32, cpu); \ + SETUP_LUMA_SS_FUNC_DEF(32, 64, cpu); \ + SETUP_LUMA_SS_FUNC_DEF(64, 48, cpu); \ + SETUP_LUMA_SS_FUNC_DEF(48, 64, cpu); \ + SETUP_LUMA_SS_FUNC_DEF(64, 16, cpu); \ + SETUP_LUMA_SS_FUNC_DEF(16, 64, cpu); + +#if HIGH_BIT_DEPTH + +#define SETUP_CHROMA_VERT_FUNC_DEF(W, H, cpu) \ + void x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu(int16_t * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx); \ + void x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu(int16_t * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx); \ + void x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx); \ + void x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx); + +#define CHROMA_VERT_FILTERS(cpu) \ + SETUP_CHROMA_VERT_FUNC_DEF(4, 4, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(8, 8, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(8, 4, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(4, 8, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(8, 6, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(8, 2, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(16, 16, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(16, 8, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(8, 16, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(16, 12, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(12, 16, cpu); 
\ + SETUP_CHROMA_VERT_FUNC_DEF(16, 4, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(4, 16, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(32, 32, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(32, 16, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(16, 32, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(32, 24, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(24, 32, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(32, 8, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(8, 32, cpu) + +#define CHROMA_VERT_FILTERS_SSE4(cpu) \ + SETUP_CHROMA_VERT_FUNC_DEF(2, 4, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(2, 8, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(4, 2, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(6, 8, cpu); + +#define CHROMA_VERT_FILTERS_422(cpu) \ + SETUP_CHROMA_VERT_FUNC_DEF(4, 8, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(8, 16, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(8, 8, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(4, 16, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(8, 12, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(8, 4, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(16, 32, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(16, 16, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(8, 32, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(16, 24, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(12, 32, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(16, 8, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(4, 32, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(32, 64, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(32, 32, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(16, 64, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(32, 48, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(24, 64, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(32, 16, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(8, 64, cpu); + +#define CHROMA_VERT_FILTERS_SSE4_422(cpu) \ + SETUP_CHROMA_VERT_FUNC_DEF(2, 8, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(2, 16, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(4, 4, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(6, 16, cpu); + +#define CHROMA_VERT_FILTERS_444(cpu) \ + SETUP_CHROMA_VERT_FUNC_DEF(8, 8, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(8, 4, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(4, 8, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(16, 16, cpu); \ + 
SETUP_CHROMA_VERT_FUNC_DEF(16, 8, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(8, 16, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(16, 12, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(12, 16, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(16, 4, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(4, 16, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(32, 32, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(32, 16, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(16, 32, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(32, 24, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(24, 32, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(32, 8, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(8, 32, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(64, 64, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(64, 32, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(32, 64, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(64, 48, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(48, 64, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(64, 16, cpu); \ + SETUP_CHROMA_VERT_FUNC_DEF(16, 64, cpu) + +#define SETUP_CHROMA_HORIZ_FUNC_DEF(W, H, cpu) \ + void x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx); \ + void x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx, int isRowExt); + +#define CHROMA_HORIZ_FILTERS(cpu) \ + SETUP_CHROMA_HORIZ_FUNC_DEF(4, 4, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(4, 2, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(2, 4, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(8, 8, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(8, 4, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(4, 8, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(8, 6, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(6, 8, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(8, 2, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(2, 8, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(16, 16, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(16, 8, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(8, 16, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(16, 12, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(12, 16, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(16, 4, cpu); \ + 
SETUP_CHROMA_HORIZ_FUNC_DEF(4, 16, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(32, 32, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(32, 16, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(16, 32, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(32, 24, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(24, 32, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(32, 8, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(8, 32, cpu) + +#define CHROMA_HORIZ_FILTERS_422(cpu) \ + SETUP_CHROMA_HORIZ_FUNC_DEF(4, 8, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(4, 4, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(2, 8, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(8, 16, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(8, 8, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(4, 16, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(8, 12, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(6, 16, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(8, 4, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(2, 16, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(16, 32, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(16, 16, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(8, 32, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(16, 24, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(12, 32, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(16, 8, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(4, 32, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(32, 64, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(32, 32, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(16, 64, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(32, 48, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(24, 64, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(32, 16, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(8, 64, cpu) + +#define CHROMA_HORIZ_FILTERS_444(cpu) \ + SETUP_CHROMA_HORIZ_FUNC_DEF(8, 8, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(8, 4, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(4, 8, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(16, 16, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(16, 8, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(8, 16, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(16, 12, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(12, 16, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(16, 4, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(4, 16, cpu); \ + 
SETUP_CHROMA_HORIZ_FUNC_DEF(32, 32, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(32, 16, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(16, 32, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(32, 24, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(24, 32, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(32, 8, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(8, 32, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(64, 64, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(64, 32, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(32, 64, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(64, 48, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(48, 64, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(64, 16, cpu); \ + SETUP_CHROMA_HORIZ_FUNC_DEF(16, 64, cpu) + +void x265_chroma_p2s_sse2(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height); +void x265_luma_p2s_sse2(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height); + +CHROMA_VERT_FILTERS(_sse2); +CHROMA_HORIZ_FILTERS(_sse4); +CHROMA_VERT_FILTERS_SSE4(_sse4); + +CHROMA_VERT_FILTERS_422(_sse2); +CHROMA_HORIZ_FILTERS_422(_sse4); +CHROMA_VERT_FILTERS_SSE4_422(_sse4); + +CHROMA_VERT_FILTERS_444(_sse2); +CHROMA_HORIZ_FILTERS_444(_sse4); + +#undef CHROMA_VERT_FILTERS_SSE4 +#undef CHROMA_VERT_FILTERS +#undef SETUP_CHROMA_VERT_FUNC_DEF +#undef CHROMA_HORIZ_FILTERS +#undef SETUP_CHROMA_HORIZ_FUNC_DEF + +#undef CHROMA_VERT_FILTERS_422 +#undef CHROMA_VERT_FILTERS_SSE4_422 +#undef CHROMA_HORIZ_FILTERS_422 + +#undef CHROMA_VERT_FILTERS_444 +#undef CHROMA_HORIZ_FILTERS_444 + +#else // if HIGH_BIT_DEPTH + +#define SETUP_CHROMA_FUNC_DEF(W, H, cpu) \ + void x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx); \ + void x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx, int isRowExt); \ + void x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx); \ + void x265_interp_4tap_vert_ps_ ## W ## x ## H ## 
cpu(pixel * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx); + +#define CHROMA_FILTERS(cpu) \ + SETUP_CHROMA_FUNC_DEF(4, 4, cpu); \ + SETUP_CHROMA_FUNC_DEF(4, 2, cpu); \ + SETUP_CHROMA_FUNC_DEF(2, 4, cpu); \ + SETUP_CHROMA_FUNC_DEF(8, 8, cpu); \ + SETUP_CHROMA_FUNC_DEF(8, 4, cpu); \ + SETUP_CHROMA_FUNC_DEF(4, 8, cpu); \ + SETUP_CHROMA_FUNC_DEF(8, 6, cpu); \ + SETUP_CHROMA_FUNC_DEF(6, 8, cpu); \ + SETUP_CHROMA_FUNC_DEF(8, 2, cpu); \ + SETUP_CHROMA_FUNC_DEF(2, 8, cpu); \ + SETUP_CHROMA_FUNC_DEF(16, 16, cpu); \ + SETUP_CHROMA_FUNC_DEF(16, 8, cpu); \ + SETUP_CHROMA_FUNC_DEF(8, 16, cpu); \ + SETUP_CHROMA_FUNC_DEF(16, 12, cpu); \ + SETUP_CHROMA_FUNC_DEF(12, 16, cpu); \ + SETUP_CHROMA_FUNC_DEF(16, 4, cpu); \ + SETUP_CHROMA_FUNC_DEF(4, 16, cpu); \ + SETUP_CHROMA_FUNC_DEF(32, 32, cpu); \ + SETUP_CHROMA_FUNC_DEF(32, 16, cpu); \ + SETUP_CHROMA_FUNC_DEF(16, 32, cpu); \ + SETUP_CHROMA_FUNC_DEF(32, 24, cpu); \ + SETUP_CHROMA_FUNC_DEF(24, 32, cpu); \ + SETUP_CHROMA_FUNC_DEF(32, 8, cpu); \ + SETUP_CHROMA_FUNC_DEF(8, 32, cpu) + +#define CHROMA_FILTERS_422(cpu) \ + SETUP_CHROMA_FUNC_DEF(4, 8, cpu); \ + SETUP_CHROMA_FUNC_DEF(4, 4, cpu); \ + SETUP_CHROMA_FUNC_DEF(2, 8, cpu); \ + SETUP_CHROMA_FUNC_DEF(8, 16, cpu); \ + SETUP_CHROMA_FUNC_DEF(8, 8, cpu); \ + SETUP_CHROMA_FUNC_DEF(4, 16, cpu); \ + SETUP_CHROMA_FUNC_DEF(8, 12, cpu); \ + SETUP_CHROMA_FUNC_DEF(6, 16, cpu); \ + SETUP_CHROMA_FUNC_DEF(8, 4, cpu); \ + SETUP_CHROMA_FUNC_DEF(2, 16, cpu); \ + SETUP_CHROMA_FUNC_DEF(16, 32, cpu); \ + SETUP_CHROMA_FUNC_DEF(16, 16, cpu); \ + SETUP_CHROMA_FUNC_DEF(8, 32, cpu); \ + SETUP_CHROMA_FUNC_DEF(16, 24, cpu); \ + SETUP_CHROMA_FUNC_DEF(12, 32, cpu); \ + SETUP_CHROMA_FUNC_DEF(16, 8, cpu); \ + SETUP_CHROMA_FUNC_DEF(4, 32, cpu); \ + SETUP_CHROMA_FUNC_DEF(32, 64, cpu); \ + SETUP_CHROMA_FUNC_DEF(32, 32, cpu); \ + SETUP_CHROMA_FUNC_DEF(16, 64, cpu); \ + SETUP_CHROMA_FUNC_DEF(32, 48, cpu); \ + SETUP_CHROMA_FUNC_DEF(24, 64, cpu); \ + SETUP_CHROMA_FUNC_DEF(32, 16, cpu); \ + 
SETUP_CHROMA_FUNC_DEF(8, 64, cpu); + +#define CHROMA_FILTERS_444(cpu) \ + SETUP_CHROMA_FUNC_DEF(8, 8, cpu); \ + SETUP_CHROMA_FUNC_DEF(8, 4, cpu); \ + SETUP_CHROMA_FUNC_DEF(4, 8, cpu); \ + SETUP_CHROMA_FUNC_DEF(16, 16, cpu); \ + SETUP_CHROMA_FUNC_DEF(16, 8, cpu); \ + SETUP_CHROMA_FUNC_DEF(8, 16, cpu); \ + SETUP_CHROMA_FUNC_DEF(16, 12, cpu); \ + SETUP_CHROMA_FUNC_DEF(12, 16, cpu); \ + SETUP_CHROMA_FUNC_DEF(16, 4, cpu); \ + SETUP_CHROMA_FUNC_DEF(4, 16, cpu); \ + SETUP_CHROMA_FUNC_DEF(32, 32, cpu); \ + SETUP_CHROMA_FUNC_DEF(32, 16, cpu); \ + SETUP_CHROMA_FUNC_DEF(16, 32, cpu); \ + SETUP_CHROMA_FUNC_DEF(32, 24, cpu); \ + SETUP_CHROMA_FUNC_DEF(24, 32, cpu); \ + SETUP_CHROMA_FUNC_DEF(32, 8, cpu); \ + SETUP_CHROMA_FUNC_DEF(8, 32, cpu); \ + SETUP_CHROMA_FUNC_DEF(64, 64, cpu); \ + SETUP_CHROMA_FUNC_DEF(64, 32, cpu); \ + SETUP_CHROMA_FUNC_DEF(32, 64, cpu); \ + SETUP_CHROMA_FUNC_DEF(64, 48, cpu); \ + SETUP_CHROMA_FUNC_DEF(48, 64, cpu); \ + SETUP_CHROMA_FUNC_DEF(64, 16, cpu); \ + SETUP_CHROMA_FUNC_DEF(16, 64, cpu); + +#define SETUP_CHROMA_SP_FUNC_DEF(W, H, cpu) \ + void x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu(int16_t * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx); + +#define CHROMA_SP_FILTERS(cpu) \ + SETUP_CHROMA_SP_FUNC_DEF(8, 2, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(8, 4, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(8, 6, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(8, 8, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(8, 16, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(8, 32, cpu); + +#define CHROMA_SP_FILTERS_SSE4(cpu) \ + SETUP_CHROMA_SP_FUNC_DEF(2, 4, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(2, 8, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(4, 2, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(4, 4, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(4, 8, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(4, 16, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(6, 8, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(16, 16, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(16, 8, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(16, 12, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(12, 16, cpu); \ + 
SETUP_CHROMA_SP_FUNC_DEF(16, 4, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(32, 32, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(32, 16, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(16, 32, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(32, 24, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(24, 32, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(32, 8, cpu); + +#define CHROMA_SP_FILTERS_422(cpu) \ + SETUP_CHROMA_SP_FUNC_DEF(8, 4, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(8, 8, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(8, 12, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(8, 16, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(8, 32, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(8, 64, cpu); + +#define CHROMA_SP_FILTERS_422_SSE4(cpu) \ + SETUP_CHROMA_SP_FUNC_DEF(2, 8, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(2, 16, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(4, 4, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(4, 8, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(4, 16, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(4, 32, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(6, 16, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(16, 32, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(16, 16, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(16, 24, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(12, 32, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(16, 8, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(32, 64, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(32, 32, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(16, 64, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(32, 48, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(24, 64, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(32, 16, cpu); + +#define CHROMA_SP_FILTERS_444(cpu) \ + SETUP_CHROMA_SP_FUNC_DEF(8, 8, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(8, 4, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(4, 8, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(16, 16, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(16, 8, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(8, 16, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(16, 12, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(12, 16, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(16, 4, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(4, 16, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(32, 32, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(32, 16, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(16, 32, cpu); \ + 
SETUP_CHROMA_SP_FUNC_DEF(32, 24, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(24, 32, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(32, 8, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(8, 32, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(64, 64, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(64, 32, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(32, 64, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(64, 48, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(48, 64, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(64, 16, cpu); \ + SETUP_CHROMA_SP_FUNC_DEF(16, 64, cpu); + +#define SETUP_CHROMA_SS_FUNC_DEF(W, H, cpu) \ + void x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu(int16_t * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx); + +#define CHROMA_SS_FILTERS(cpu) \ + SETUP_CHROMA_SS_FUNC_DEF(4, 4, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(4, 2, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(8, 8, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(8, 4, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(4, 8, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(8, 6, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(8, 2, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(16, 16, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(16, 8, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(8, 16, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(16, 12, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(12, 16, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(16, 4, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(4, 16, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(32, 32, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(32, 16, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(16, 32, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(32, 24, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(24, 32, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(32, 8, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(8, 32, cpu); + +#define CHROMA_SS_FILTERS_SSE4(cpu) \ + SETUP_CHROMA_SS_FUNC_DEF(2, 4, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(2, 8, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(6, 8, cpu); + +#define CHROMA_SS_FILTERS_422(cpu) \ + SETUP_CHROMA_SS_FUNC_DEF(4, 8, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(4, 4, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(8, 16, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(8, 8, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(4, 16, cpu); \ + 
SETUP_CHROMA_SS_FUNC_DEF(8, 12, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(8, 4, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(16, 32, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(16, 16, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(8, 32, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(16, 24, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(12, 32, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(16, 8, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(4, 32, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(32, 64, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(32, 32, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(16, 64, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(32, 48, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(24, 64, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(32, 16, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(8, 64, cpu); + +#define CHROMA_SS_FILTERS_422_SSE4(cpu) \ + SETUP_CHROMA_SS_FUNC_DEF(2, 8, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(2, 16, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(6, 16, cpu); + +#define CHROMA_SS_FILTERS_444(cpu) \ + SETUP_CHROMA_SS_FUNC_DEF(8, 8, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(8, 4, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(4, 8, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(16, 16, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(16, 8, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(8, 16, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(16, 12, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(12, 16, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(16, 4, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(4, 16, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(32, 32, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(32, 16, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(16, 32, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(32, 24, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(24, 32, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(32, 8, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(8, 32, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(64, 64, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(64, 32, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(32, 64, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(64, 48, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(48, 64, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(64, 16, cpu); \ + SETUP_CHROMA_SS_FUNC_DEF(16, 64, cpu); + +CHROMA_FILTERS(_sse4); +CHROMA_SP_FILTERS(_sse2); +CHROMA_SP_FILTERS_SSE4(_sse4); 
+CHROMA_SS_FILTERS(_sse2); +CHROMA_SS_FILTERS_SSE4(_sse4); + +CHROMA_FILTERS_422(_sse4); +CHROMA_SP_FILTERS_422(_sse2); +CHROMA_SP_FILTERS_422_SSE4(_sse4); +CHROMA_SS_FILTERS_422(_sse2); +CHROMA_SS_FILTERS_422_SSE4(_sse4); + +CHROMA_FILTERS_444(_sse4); +CHROMA_SP_FILTERS_444(_sse4); +CHROMA_SS_FILTERS_444(_sse2); + +void x265_chroma_p2s_ssse3(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height); + +#undef SETUP_CHROMA_FUNC_DEF +#undef SETUP_CHROMA_SP_FUNC_DEF +#undef SETUP_CHROMA_SS_FUNC_DEF +#undef CHROMA_FILTERS +#undef CHROMA_SP_FILTERS +#undef CHROMA_SS_FILTERS +#undef CHROMA_SS_FILTERS_SSE4 +#undef CHROMA_SP_FILTERS_SSE4 + +#undef CHROMA_FILTERS_422 +#undef CHROMA_SP_FILTERS_422 +#undef CHROMA_SS_FILTERS_422 +#undef CHROMA_SS_FILTERS_422_SSE4 +#undef CHROMA_SP_FILTERS_422_SSE4 + +#undef CHROMA_FILTERS_444 +#undef CHROMA_SP_FILTERS_444 +#undef CHROMA_SS_FILTERS_444 + +#endif // if HIGH_BIT_DEPTH + +LUMA_FILTERS(_sse4); +LUMA_SP_FILTERS(_sse4); +LUMA_SS_FILTERS(_sse2); +LUMA_FILTERS(_avx2); + +void x265_interp_8tap_hv_pp_8x8_ssse3(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int idxX, int idxY); +void x265_luma_p2s_ssse3(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height); + +#undef LUMA_FILTERS +#undef LUMA_SP_FILTERS +#undef LUMA_SS_FILTERS +#undef SETUP_LUMA_FUNC_DEF +#undef SETUP_LUMA_SP_FUNC_DEF +#undef SETUP_LUMA_SS_FUNC_DEF + +#endif // ifndef X265_MC_H diff --git a/source/common/x86/loopfilter.asm b/source/common/x86/loopfilter.asm new file mode 100644 index 0000000..5068167 --- /dev/null +++ b/source/common/x86/loopfilter.asm @@ -0,0 +1,85 @@ +;***************************************************************************** +;* Copyright (C) 2013 x265 project +;* +;* Authors: Min Chen +;* Praveen Kumar Tiwari +;* Nabajit Deka +;* Dnyaneshwar Gorade +;* Murugan Vairavel +;* Yuvaraj Venkatesh +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General 
Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +;* +;* This program is also available under a commercial proprietary license. +;* For more information, contact us at license @ x265.com. +;*****************************************************************************/ + +%include "x86inc.asm" + +SECTION_RODATA 32 + +pw_2: times 16 db 2 + +SECTION .text + +;============================================================================================================ +; void saoCuOrgE0(pixel * rec, int8_t * offsetEo, int lcuWidth, int8_t signLeft) +;============================================================================================================ +INIT_XMM sse4 +cglobal saoCuOrgE0, 4, 4, 8, rec, offsetEo, lcuWidth, signLeft + + neg r3 ; r3 = -iSignLeft + movd m0, r3d + pslldq m0, 15 ; m0 = [iSignLeft x .. x] + pcmpeqb m4, m4 ; m4 = [pb -1] + pxor m5, m5 ; m5 = 0 + movh m6, [r1] ; m6 = m_offsetEo + +.loop: + movu m7, [r0] ; m1 = pRec[x] + mova m1, m7 + movu m2, [r0+1] ; m2 = pRec[x+1] + + psubusb m3, m2, m7 + psubusb m1, m2 + pcmpeqb m3, m5 + pcmpeqb m1, m5 + pcmpeqb m2, m7 + + pabsb m3, m3 ; m1 = (pRec[x] - pRec[x+1]) > 0) ? 
1 : 0 + por m1, m3 ; m1 = iSignRight + pandn m2, m1 + + palignr m3, m2, m0, 15 ; m3 = -iSignLeft + psignb m3, m4 ; m3 = iSignLeft + mova m0, m4 + pslldq m0, 15 + pand m0, m2 ; [pb 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1] + paddb m2, m3 + paddb m2, [pw_2] ; m1 = uiEdgeType + pshufb m3, m6, m2 + pmovzxbw m2, m7 ; rec + punpckhbw m7, m5 + pmovsxbw m1, m3 ; iOffsetEo + punpckhbw m3, m3 + psraw m3, 8 + paddw m2, m1 + paddw m7, m3 + packuswb m2, m7 + movu [r0], m2 + + add r0q, 16 + sub r2d, 16 + jnz .loop + RET diff --git a/source/common/x86/loopfilter.h b/source/common/x86/loopfilter.h new file mode 100644 index 0000000..7f0f409 --- /dev/null +++ b/source/common/x86/loopfilter.h @@ -0,0 +1,29 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Dnyaneshwar Gorade + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. 
+ *****************************************************************************/ + +#ifndef X265_LOOPFILTER_H +#define X265_LOOPFILTER_H + +void x265_saoCuOrgE0_sse4(pixel * rec, int8_t * offsetEo, int endX, int8_t signLeft); + +#endif // ifndef X265_LOOPFILTER_H diff --git a/source/common/x86/mc-a.asm b/source/common/x86/mc-a.asm new file mode 100644 index 0000000..4a7ac5c --- /dev/null +++ b/source/common/x86/mc-a.asm @@ -0,0 +1,3722 @@ +;***************************************************************************** +;* mc-a.asm: x86 motion compensation +;***************************************************************************** +;* Copyright (C) 2003-2013 x264 project +;* +;* Authors: Loren Merritt +;* Fiona Glaser +;* Laurent Aimar +;* Dylan Yudaken +;* Holger Lubitz +;* Min Chen +;* Oskar Arvidsson +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +;* +;* This program is also available under a commercial proprietary license. +;* For more information, contact us at license @ x265.com. 
+;***************************************************************************** + +%include "x86inc.asm" +%include "x86util.asm" + +SECTION_RODATA 32 + +ch_shuf: times 2 db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9 +ch_shuf_adj: times 8 db 0 + times 8 db 2 + times 8 db 4 + times 8 db 6 +sq_1: times 1 dq 1 + +SECTION .text + +cextern pb_0 +cextern pw_1 +cextern pw_4 +cextern pw_8 +cextern pw_32 +cextern pw_64 +cextern pw_128 +cextern pw_256 +cextern pw_512 +cextern pw_1023 +cextern pw_1024 +cextern pw_00ff +cextern pw_pixel_max +cextern sw_64 +cextern pd_32 +cextern deinterleave_shufd + +;==================================================================================================================== +;void addAvg (int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride) +;==================================================================================================================== +; r0 = pSrc0, r1 = pSrc1 +; r2 = pDst, r3 = iStride0 +; r4 = iStride1, r5 = iDstStride +%if HIGH_BIT_DEPTH +INIT_XMM sse4 +cglobal addAvg_2x4, 6,6,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride + add r3, r3 + add r4, r4 + add r5, r5 + + movd m1, [r0] + movd m2, [r0 + r3] + movd m3, [r1] + movd m4, [r1 + r4] + + punpckldq m1, m2 + punpckldq m3, m4 + + lea r0, [r0 + 2 * r3] + lea r1, [r1 + 2 * r4] + + movd m2, [r0] + movd m4, [r0 + r3] + movd m5, [r1] + movd m0, [r1 + r4] + punpckldq m2, m4 + punpckldq m5, m0 + punpcklqdq m1, m2 + punpcklqdq m3, m5 + paddw m1, m3 + pmulhrsw m1, [pw_1024] + paddw m1, [pw_512] + + pxor m0, m0 + pmaxsw m1, m0 + pminsw m1, [pw_1023] + movd [r2], m1 + pextrd [r2 + r5], m1, 1 + lea r2, [r2 + 2 * r5] + pextrd [r2], m1, 2 + pextrd [r2 + r5], m1, 3 + + RET +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal addAvg_2x8, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride + mova m0, [pw_512] + pxor m7, m7 + add r3, r3 + add r4, r4 + add r5, r5 + +%rep 2 + 
movd m1, [r0] + movd m2, [r0 + r3] + movd m3, [r1] + movd m4, [r1 + r4] + + punpckldq m1, m2 + punpckldq m3, m4 + + lea r0, [r0 + 2 * r3] + lea r1, [r1 + 2 * r4] + + movd m2, [r0] + movd m4, [r0 + r3] + movd m5, [r1] + movd m6, [r1 + r4] + + punpckldq m2, m4 + punpckldq m5, m6 + punpcklqdq m1, m2 + punpcklqdq m3, m5 + paddw m1, m3 + pmulhrsw m1, [pw_1024] + paddw m1, m0 + + pmaxsw m1, m7 + pminsw m1, [pw_1023] + movd [r2], m1 + pextrd [r2 + r5], m1, 1 + lea r2, [r2 + 2 * r5] + pextrd [r2], m1, 2 + pextrd [r2 + r5], m1, 3 + + lea r0, [r0 + 2 * r3] + lea r1, [r1 + 2 * r4] + lea r2, [r2 + 2 * r5] +%endrep + RET + +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal addAvg_2x16, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride + mova m6, [pw_1023] + mova m7, [pw_1024] + mov r6d, 16/4 + add r3, r3 + add r4, r4 + add r5, r5 +.loop: + movd m1, [r0] + movd m2, [r0 + r3] + movd m3, [r1] + movd m4, [r1 + r4] + lea r0, [r0 + r3 * 2] + lea r1, [r1 + r4 * 2] + punpckldq m1, m2 + punpckldq m3, m4 + movd m2, [r0] + movd m4, [r0 + r3] + movd m5, [r1] + movd m0, [r1 + r4] + lea r0, [r0 + r3 * 2] + lea r1, [r1 + r4 * 2] + punpckldq m2, m4 + punpckldq m5, m0 + punpcklqdq m1, m2 + punpcklqdq m3, m5 + paddw m1, m3 + pmulhrsw m1, m7 + paddw m1, [pw_512] + pxor m0, m0 + pmaxsw m1, m0 + pminsw m1, m6 + movd [r2], m1 + pextrd [r2 + r5], m1, 1 + lea r2, [r2 + r5 * 2] + pextrd [r2], m1, 2 + pextrd [r2 + r5], m1, 3 + lea r2, [r2 + r5 * 2] + dec r6d + jnz .loop + RET +;----------------------------------------------------------------------------- + +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal addAvg_4x2, 6,6,7, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride + add r3, r3 + add r4, r4 + add r5, r5 + + movh m0, [r0] + movh m1, [r0 + r3] + movh m2, [r1] + movh m3, [r1 + r4] + + punpcklqdq m0, m1 + punpcklqdq m2, m3 + paddw m0, m2 + pmulhrsw m0, [pw_1024] + paddw m0, [pw_512] + 
+ pxor m6, m6 + pmaxsw m0, m6 + pminsw m0, [pw_1023] + movh [r2], m0 + movhps [r2 + r5], m0 + RET +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal addAvg_6x8, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride + mova m4, [pw_512] + mova m5, [pw_1023] + mova m7, [pw_1024] + pxor m6, m6 + add r3, r3 + add r4, r4 + add r5, r5 + +%rep 4 + movu m0, [r0] + movu m2, [r1] + paddw m0, m2 + pmulhrsw m0, m7 + paddw m0, m4 + + pmaxsw m0, m6 + pminsw m0, m5 + movh [r2], m0 + pextrd [r2 + 8], m0, 2 + + movu m1, [r0 + r3] + movu m3, [r1 + r4] + paddw m1, m3 + pmulhrsw m1, m7 + paddw m1, m4 + + pmaxsw m1, m6 + pminsw m1, m5 + movh [r2 + r5], m1 + pextrd [r2 + r5 + 8], m1, 2 + + lea r2, [r2 + 2 * r5] + lea r0, [r0 + 2 * r3] + lea r1, [r1 + 2 * r4] +%endrep + RET +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal addAvg_6x16, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride + mova m4, [pw_512] + mova m5, [pw_1023] + mova m7, [pw_1024] + pxor m6, m6 + mov r6d, 16/2 + add r3, r3 + add r4, r4 + add r5, r5 +.loop: + movu m0, [r0] + movu m2, [r1] + movu m1, [r0 + r3] + movu m3, [r1 + r4] + dec r6d + lea r0, [r0 + r3 * 2] + lea r1, [r1 + r4 * 2] + paddw m0, m2 + paddw m1, m3 + pmulhrsw m0, m7 + pmulhrsw m1, m7 + paddw m0, m4 + paddw m1, m4 + pmaxsw m0, m6 + pmaxsw m1, m6 + pminsw m0, m5 + pminsw m1, m5 + movh [r2], m0 + pextrd [r2 + 8], m0, 2 + movh [r2 + r5], m1 + pextrd [r2 + r5 + 8], m1, 2 + lea r2, [r2 + r5 * 2] + jnz .loop + RET +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal addAvg_8x2, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride + mova m4, [pw_512] + mova m5, [pw_1023] + mova m7, [pw_1024] + pxor m6, m6 + add r3, r3 + add r4, r4 + add r5, r5 + + movu m0, [r0] + movu m2, [r1] + paddw m0, m2 + pmulhrsw m0, m7 + paddw m0, m4 + + pmaxsw m0, m6 + pminsw m0, m5 + movu [r2], m0 + + movu m1, [r0 + 
r3] + movu m3, [r1 + r4] + paddw m1, m3 + pmulhrsw m1, m7 + paddw m1, m4 + + pmaxsw m1, m6 + pminsw m1, m5 + movu [r2 + r5], m1 + RET +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal addAvg_8x6, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride + mova m4, [pw_512] + mova m5, [pw_1023] + mova m7, [pw_1024] + pxor m6, m6 + add r3, r3 + add r4, r4 + add r5, r5 + +%rep 3 + movu m0, [r0] + movu m2, [r1] + paddw m0, m2 + pmulhrsw m0, m7 + paddw m0, m4 + + pmaxsw m0, m6 + pminsw m0, m5 + movu [r2], m0 + + movu m1, [r0 + r3] + movu m3, [r1 + r4] + paddw m1, m3 + pmulhrsw m1, m7 + paddw m1, m4 + + pmaxsw m1, m6 + pminsw m1, m5 + movu [r2 + r5], m1 + + lea r2, [r2 + 2 * r5] + lea r0, [r0 + 2 * r3] + lea r1, [r1 + 2 * r4] +%endrep + RET + +;----------------------------------------------------------------------------- +%macro ADDAVG_W4_H4 1 +INIT_XMM sse4 +cglobal addAvg_4x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride + mova m4, [pw_512] + mova m5, [pw_1023] + mova m7, [pw_1024] + pxor m6, m6 + add r3, r3 + add r4, r4 + add r5, r5 + + mov r6d, %1/4 + +.loop: +%rep 2 + movh m0, [r0] + movh m1, [r0 + r3] + movh m2, [r1] + movh m3, [r1 + r4] + + punpcklqdq m0, m1 + punpcklqdq m2, m3 + + paddw m0, m2 + pmulhrsw m0, m7 + paddw m0, m4 + + pmaxsw m0, m6 + pminsw m0, m5 + + movh [r2], m0 + movhps [r2 + r5], m0 + + lea r2, [r2 + 2 * r5] + lea r0, [r0 + 2 * r3] + lea r1, [r1 + 2 * r4] +%endrep + + dec r6d + jnz .loop + RET +%endmacro + +ADDAVG_W4_H4 4 +ADDAVG_W4_H4 8 +ADDAVG_W4_H4 16 + +ADDAVG_W4_H4 32 + +;----------------------------------------------------------------------------- +%macro ADDAVG_W8_H4 1 +INIT_XMM sse4 +cglobal addAvg_8x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride + mova m4, [pw_512] + mova m5, [pw_1023] + mova m7, [pw_1024] + pxor m6, m6 + add r3, r3 + add r4, r4 + add r5, r5 + mov r6d, %1/4 + +.loop: +%rep 2 + movu m0, [r0] + movu m2, [r1] + paddw m0, m2 + pmulhrsw m0, m7 
+ paddw m0, m4 + pmaxsw m0, m6 + pminsw m0, m5 + movu [r2], m0 + + movu m1, [r0 + r3] + movu m3, [r1 + r4] + paddw m1, m3 + pmulhrsw m1, m7 + paddw m1, m4 + pmaxsw m1, m6 + pminsw m1, m5 + movu [r2 + r5], m1 + + lea r2, [r2 + 2 * r5] + lea r0, [r0 + 2 * r3] + lea r1, [r1 + 2 * r4] +%endrep + dec r6d + jnz .loop + RET +%endmacro + +ADDAVG_W8_H4 4 +ADDAVG_W8_H4 8 +ADDAVG_W8_H4 16 +ADDAVG_W8_H4 32 + +ADDAVG_W8_H4 12 +ADDAVG_W8_H4 64 + +;----------------------------------------------------------------------------- +%macro ADDAVG_W12_H4 1 +INIT_XMM sse4 +cglobal addAvg_12x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride + mova m4, [pw_512] + mova m5, [pw_1023] + mova m7, [pw_1024] + pxor m6, m6 + add r3, r3 + add r4, r4 + add r5, r5 + mov r6d, %1/4 + +.loop: +%rep 2 + movu m0, [r0] + movu m2, [r1] + paddw m0, m2 + pmulhrsw m0, m7 + paddw m0, m4 + pmaxsw m0, m6 + pminsw m0, m5 + movu [r2], m0 + + movh m0, [r0 + 16] + movh m1, [r0 + 16 + r3] + movh m2, [r1 + 16] + movh m3, [r1 + 16 + r4] + + punpcklqdq m0, m1 + punpcklqdq m2, m3 + + paddw m0, m2 + pmulhrsw m0, m7 + paddw m0, m4 + pmaxsw m0, m6 + pminsw m0, m5 + movh [r2 + 16], m0 + movhps [r2 + r5 + 16], m0 + + movu m1, [r0 + r3] + movu m3, [r1 + r4] + paddw m1, m3 + pmulhrsw m1, m7 + paddw m1, m4 + pmaxsw m1, m6 + pminsw m1, m5 + movu [r2 + r5], m1 + + lea r2, [r2 + 2 * r5] + lea r0, [r0 + 2 * r3] + lea r1, [r1 + 2 * r4] +%endrep + dec r6d + jnz .loop + RET +%endmacro + +ADDAVG_W12_H4 16 + +ADDAVG_W12_H4 32 + +;----------------------------------------------------------------------------- +%macro ADDAVG_W16_H4 1 +INIT_XMM sse4 +cglobal addAvg_16x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride + mova m4, [pw_512] + mova m5, [pw_1023] + mova m7, [pw_1024] + pxor m6, m6 + add r3, r3 + add r4, r4 + add r5, r5 + mov r6d, %1/4 + +.loop: +%rep 2 + movu m0, [r0] + movu m2, [r1] + paddw m0, m2 + pmulhrsw m0, m7 + paddw m0, m4 + pmaxsw m0, m6 + pminsw m0, m5 + movu [r2], m0 + + movu m1, [r0 + 16] + movu 
m2, [r1 + 16] + paddw m1, m2 + pmulhrsw m1, m7 + paddw m1, m4 + pmaxsw m1, m6 + pminsw m1, m5 + movu [r2 + 16], m1 + + movu m1, [r0 + r3] + movu m3, [r1 + r4] + paddw m1, m3 + pmulhrsw m1, m7 + paddw m1, m4 + pmaxsw m1, m6 + pminsw m1, m5 + movu [r2 + r5], m1 + + movu m2, [r0 + 16 + r3] + movu m3, [r1 + 16 + r4] + paddw m2, m3 + pmulhrsw m2, m7 + paddw m2, m4 + pmaxsw m2, m6 + pminsw m2, m5 + movu [r2 + r5 + 16], m2 + + lea r2, [r2 + 2 * r5] + lea r0, [r0 + 2 * r3] + lea r1, [r1 + 2 * r4] +%endrep + dec r6d + jnz .loop + RET +%endmacro + +ADDAVG_W16_H4 4 +ADDAVG_W16_H4 8 +ADDAVG_W16_H4 12 +ADDAVG_W16_H4 16 +ADDAVG_W16_H4 32 +ADDAVG_W16_H4 64 + +ADDAVG_W16_H4 24 + +;----------------------------------------------------------------------------- +%macro ADDAVG_W24_H2 2 +INIT_XMM sse4 +cglobal addAvg_%1x%2, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride + mova m4, [pw_512] + mova m5, [pw_1023] + mova m7, [pw_1024] + pxor m6, m6 + add r3, r3 + add r4, r4 + add r5, r5 + + mov r6d, %2/2 + +.loop: + movu m0, [r0] + movu m2, [r1] + paddw m0, m2 + pmulhrsw m0, m7 + paddw m0, m4 + pmaxsw m0, m6 + pminsw m0, m5 + movu [r2], m0 + + movu m1, [r0 + 16] + movu m2, [r1 + 16] + paddw m1, m2 + pmulhrsw m1, m7 + paddw m1, m4 + pmaxsw m1, m6 + pminsw m1, m5 + movu [r2 + 16], m1 + + movu m0, [r0 + 32] + movu m2, [r1 + 32] + paddw m0, m2 + pmulhrsw m0, m7 + paddw m0, m4 + pmaxsw m0, m6 + pminsw m0, m5 + movu [r2 + 32], m0 + + movu m1, [r0 + r3] + movu m3, [r1 + r4] + paddw m1, m3 + pmulhrsw m1, m7 + paddw m1, m4 + pmaxsw m1, m6 + pminsw m1, m5 + movu [r2 + r5], m1 + + movu m2, [r0 + r3 + 16] + movu m3, [r1 + r4 + 16] + paddw m2, m3 + pmulhrsw m2, m7 + paddw m2, m4 + pmaxsw m2, m6 + pminsw m2, m5 + movu [r2 + r5 + 16], m2 + + movu m1, [r0 + r3 + 32] + movu m3, [r1 + r4 + 32] + paddw m1, m3 + pmulhrsw m1, m7 + paddw m1, m4 + pmaxsw m1, m6 + pminsw m1, m5 + movu [r2 + r5 + 32], m1 + + lea r2, [r2 + 2 * r5] + lea r0, [r0 + 2 * r3] + lea r1, [r1 + 2 * r4] + + dec r6d + jnz .loop + 
RET +%endmacro + +ADDAVG_W24_H2 24, 32 + +ADDAVG_W24_H2 24, 64 + +;----------------------------------------------------------------------------- +%macro ADDAVG_W32_H2 1 +INIT_XMM sse4 +cglobal addAvg_32x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride + mova m4, [pw_512] + mova m5, [pw_1023] + mova m7, [pw_1024] + pxor m6, m6 + add r3, r3 + add r4, r4 + add r5, r5 + + mov r6d, %1/2 + +.loop: + movu m0, [r0] + movu m2, [r1] + paddw m0, m2 + pmulhrsw m0, m7 + paddw m0, m4 + pmaxsw m0, m6 + pminsw m0, m5 + movu [r2], m0 + + movu m1, [r0 + 16] + movu m2, [r1 + 16] + paddw m1, m2 + pmulhrsw m1, m7 + paddw m1, m4 + pmaxsw m1, m6 + pminsw m1, m5 + movu [r2 + 16], m1 + + movu m0, [r0 + 32] + movu m2, [r1 + 32] + paddw m0, m2 + pmulhrsw m0, m7 + paddw m0, m4 + pmaxsw m0, m6 + pminsw m0, m5 + movu [r2 + 32], m0 + + movu m1, [r0 + 48] + movu m2, [r1 + 48] + paddw m1, m2 + pmulhrsw m1, m7 + paddw m1, m4 + pmaxsw m1, m6 + pminsw m1, m5 + movu [r2 + 48], m1 + + movu m1, [r0 + r3] + movu m3, [r1 + r4] + paddw m1, m3 + pmulhrsw m1, m7 + paddw m1, m4 + pmaxsw m1, m6 + pminsw m1, m5 + movu [r2 + r5], m1 + + movu m2, [r0 + 16 + r3] + movu m3, [r1 + 16 + r4] + paddw m2, m3 + pmulhrsw m2, m7 + paddw m2, m4 + pmaxsw m2, m6 + pminsw m2, m5 + movu [r2 + r5 + 16], m2 + + movu m1, [r0 + 32 + r3] + movu m3, [r1 + 32 + r4] + paddw m1, m3 + pmulhrsw m1, m7 + paddw m1, m4 + pmaxsw m1, m6 + pminsw m1, m5 + movu [r2 + r5 + 32], m1 + + movu m2, [r0 + 48 + r3] + movu m3, [r1 + 48 + r4] + paddw m2, m3 + pmulhrsw m2, m7 + paddw m2, m4 + pmaxsw m2, m6 + pminsw m2, m5 + movu [r2 + r5 + 48], m2 + + lea r2, [r2 + 2 * r5] + lea r0, [r0 + 2 * r3] + lea r1, [r1 + 2 * r4] + + dec r6d + jnz .loop + RET +%endmacro + +ADDAVG_W32_H2 8 +ADDAVG_W32_H2 16 +ADDAVG_W32_H2 24 +ADDAVG_W32_H2 32 +ADDAVG_W32_H2 64 + +ADDAVG_W32_H2 48 + +;----------------------------------------------------------------------------- +%macro ADDAVG_W48_H2 1 +INIT_XMM sse4 +cglobal addAvg_48x%1, 6,7,8, pSrc0, pSrc1, pDst, 
iStride0, iStride1, iDstStride + mova m4, [pw_512] + mova m5, [pw_1023] + mova m7, [pw_1024] + pxor m6, m6 + add r3, r3 + add r4, r4 + add r5, r5 + + mov r6d, %1/2 + +.loop: + movu m0, [r0] + movu m2, [r1] + paddw m0, m2 + pmulhrsw m0, m7 + paddw m0, m4 + pmaxsw m0, m6 + pminsw m0, m5 + movu [r2], m0 + + movu m1, [r0 + 16] + movu m2, [r1 + 16] + paddw m1, m2 + pmulhrsw m1, m7 + paddw m1, m4 + pmaxsw m1, m6 + pminsw m1, m5 + movu [r2 + 16], m1 + + movu m0, [r0 + 32] + movu m2, [r1 + 32] + paddw m0, m2 + pmulhrsw m0, m7 + paddw m0, m4 + pmaxsw m0, m6 + pminsw m0, m5 + movu [r2 + 32], m0 + + movu m1, [r0 + 48] + movu m2, [r1 + 48] + paddw m1, m2 + pmulhrsw m1, m7 + paddw m1, m4 + pmaxsw m1, m6 + pminsw m1, m5 + movu [r2 + 48], m1 + + movu m0, [r0 + 64] + movu m2, [r1 + 64] + paddw m0, m2 + pmulhrsw m0, m7 + paddw m0, m4 + pmaxsw m0, m6 + pminsw m0, m5 + movu [r2 + 64], m0 + + movu m1, [r0 + 80] + movu m2, [r1 + 80] + paddw m1, m2 + pmulhrsw m1, m7 + paddw m1, m4 + pmaxsw m1, m6 + pminsw m1, m5 + movu [r2 + 80], m1 + + movu m1, [r0 + r3] + movu m3, [r1 + r4] + paddw m1, m3 + pmulhrsw m1, m7 + paddw m1, m4 + pmaxsw m1, m6 + pminsw m1, m5 + movu [r2 + r5], m1 + + movu m2, [r0 + 16 + r3] + movu m3, [r1 + 16 + r4] + paddw m2, m3 + pmulhrsw m2, m7 + paddw m2, m4 + pmaxsw m2, m6 + pminsw m2, m5 + movu [r2 + 16 + r5], m2 + + movu m1, [r0 + 32 + r3] + movu m3, [r1 + 32 + r4] + paddw m1, m3 + pmulhrsw m1, m7 + paddw m1, m4 + pmaxsw m1, m6 + pminsw m1, m5 + movu [r2 + 32 + r5], m1 + + movu m2, [r0 + 48 + r3] + movu m3, [r1 + 48 + r4] + paddw m2, m3 + pmulhrsw m2, m7 + paddw m2, m4 + pmaxsw m2, m6 + pminsw m2, m5 + movu [r2 + 48 + r5], m2 + + movu m1, [r0 + 64 + r3] + movu m3, [r1 + 64 + r4] + paddw m1, m3 + pmulhrsw m1, m7 + paddw m1, m4 + pmaxsw m1, m6 + pminsw m1, m5 + movu [r2 + 64 + r5], m1 + + movu m2, [r0 + 80 + r3] + movu m3, [r1 + 80 + r4] + paddw m2, m3 + pmulhrsw m2, m7 + paddw m2, m4 + pmaxsw m2, m6 + pminsw m2, m5 + movu [r2 + 80 + r5], m2 + + lea r2, [r2 + 2 * r5] + 
lea r0, [r0 + 2 * r3] + lea r1, [r1 + 2 * r4] + + dec r6d + jnz .loop + RET +%endmacro + +ADDAVG_W48_H2 64 + +;----------------------------------------------------------------------------- +%macro ADDAVG_W64_H1 1 +INIT_XMM sse4 +cglobal addAvg_64x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride + mova m4, [pw_512] + mova m5, [pw_1023] + mova m7, [pw_1024] + pxor m6, m6 + add r3, r3 + add r4, r4 + add r5, r5 + mov r6d, %1 + +.loop: + movu m0, [r0] + movu m2, [r1] + paddw m0, m2 + pmulhrsw m0, m7 + paddw m0, m4 + pmaxsw m0, m6 + pminsw m0, m5 + movu [r2], m0 + + movu m1, [r0 + 16] + movu m2, [r1 + 16] + paddw m1, m2 + pmulhrsw m1, m7 + paddw m1, m4 + pmaxsw m1, m6 + pminsw m1, m5 + movu [r2 + 16], m1 + + movu m0, [r0 + 32] + movu m2, [r1 + 32] + paddw m0, m2 + pmulhrsw m0, m7 + paddw m0, m4 + pmaxsw m0, m6 + pminsw m0, m5 + movu [r2 + 32], m0 + + movu m1, [r0 + 48] + movu m2, [r1 + 48] + paddw m1, m2 + pmulhrsw m1, m7 + paddw m1, m4 + pmaxsw m1, m6 + pminsw m1, m5 + movu [r2 + 48], m1 + + movu m0, [r0 + 64] + movu m2, [r1 + 64] + paddw m0, m2 + pmulhrsw m0, m7 + paddw m0, m4 + pmaxsw m0, m6 + pminsw m0, m5 + movu [r2 + 64], m0 + + movu m1, [r0 + 80] + movu m2, [r1 + 80] + paddw m1, m2 + pmulhrsw m1, m7 + paddw m1, m4 + pmaxsw m1, m6 + pminsw m1, m5 + movu [r2 + 80], m1 + + movu m0, [r0 + 96] + movu m2, [r1 + 96] + paddw m0, m2 + pmulhrsw m0, m7 + paddw m0, m4 + pmaxsw m0, m6 + pminsw m0, m5 + movu [r2 + 96], m0 + + movu m1, [r0 + 112] + movu m2, [r1 + 112] + paddw m1, m2 + pmulhrsw m1, m7 + paddw m1, m4 + pmaxsw m1, m6 + pminsw m1, m5 + movu [r2 + 112], m1 + + add r2, r5 + add r0, r3 + add r1, r4 + + dec r6d + jnz .loop + RET +%endmacro + +ADDAVG_W64_H1 16 +ADDAVG_W64_H1 32 +ADDAVG_W64_H1 48 +ADDAVG_W64_H1 64 +;----------------------------------------------------------------------------- +%else ; !HIGH_BIT_DEPTH +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal addAvg_2x4, 6,6,8, src0, src1, dst, 
src0Stride, src1tride, dstStride + + mova m0, [pw_256] + mova m7, [pw_128] + add r3, r3 + add r4, r4 + + movd m1, [r0] + movd m2, [r0 + r3] + movd m3, [r1] + movd m4, [r1 + r4] + + punpckldq m1, m2 + punpckldq m3, m4 + + lea r0, [r0 + 2 * r3] + lea r1, [r1 + 2 * r4] + + movd m2, [r0] + movd m4, [r0 + r3] + movd m5, [r1] + movd m6, [r1 + r4] + + punpckldq m2, m4 + punpckldq m5, m6 + punpcklqdq m1, m2 + punpcklqdq m3, m5 + + paddw m1, m3 + pmulhrsw m1, m0 + paddw m1, m7 + packuswb m1, m1 + + pextrw [r2], m1, 0 + pextrw [r2 + r5], m1, 1 + lea r2, [r2 + 2 * r5] + pextrw [r2], m1, 2 + pextrw [r2 + r5], m1, 3 + + RET +;----------------------------------------------------------------------------- + +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal addAvg_2x8, 6,6,8, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride + + mova m0, [pw_256] + mova m7, [pw_128] + add r3, r3 + add r4, r4 + + movd m1, [r0] + movd m2, [r0 + r3] + movd m3, [r1] + movd m4, [r1 + r4] + + punpckldq m1, m2 + punpckldq m3, m4 + + lea r0, [r0 + 2 * r3] + lea r1, [r1 + 2 * r4] + + movd m2, [r0] + movd m4, [r0 + r3] + movd m5, [r1] + movd m6, [r1 + r4] + + punpckldq m2, m4 + punpckldq m5, m6 + punpcklqdq m1, m2 + punpcklqdq m3, m5 + + paddw m1, m3 + pmulhrsw m1, m0 + paddw m1, m7 + packuswb m1, m1 + + pextrw [r2], m1, 0 + pextrw [r2 + r5], m1, 1 + lea r2, [r2 + 2 * r5] + pextrw [r2], m1, 2 + pextrw [r2 + r5], m1, 3 + + lea r2, [r2 + 2 * r5] + lea r0, [r0 + 2 * r3] + lea r1, [r1 + 2 * r4] + + movd m1, [r0] + movd m2, [r0 + r3] + movd m3, [r1] + movd m4, [r1 + r4] + + punpckldq m1, m2 + punpckldq m3, m4 + + lea r0, [r0 + 2 * r3] + lea r1, [r1 + 2 * r4] + + movd m2, [r0] + movd m4, [r0 + r3] + movd m5, [r1] + movd m6, [r1 + r4] + + punpckldq m2, m4 + punpckldq m5, m6 + punpcklqdq m1, m2 + punpcklqdq m3, m5 + + paddw m1, m3 + pmulhrsw m1, m0 + paddw m1, m7 + packuswb m1, m1 + + pextrw [r2], m1, 0 + pextrw [r2 + r5], m1, 1 + lea r2, [r2 + 2 * r5] + 
pextrw [r2], m1, 2 + pextrw [r2 + r5], m1, 3 + + RET +;----------------------------------------------------------------------------- + +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal addAvg_2x16, 6,7,8, src0, src1, dst, src0Stride, src1tride, dstStride + mova m0, [pw_256] + mova m7, [pw_128] + mov r6d, 16/4 + add r3, r3 + add r4, r4 +.loop: + movd m1, [r0] + movd m2, [r0 + r3] + movd m3, [r1] + movd m4, [r1 + r4] + lea r0, [r0 + r3 * 2] + lea r1, [r1 + r4 * 2] + punpckldq m1, m2 + punpckldq m3, m4 + movd m2, [r0] + movd m4, [r0 + r3] + movd m5, [r1] + movd m6, [r1 + r4] + lea r0, [r0 + r3 * 2] + lea r1, [r1 + r4 * 2] + punpckldq m2, m4 + punpckldq m5, m6 + punpcklqdq m1, m2 + punpcklqdq m3, m5 + paddw m1, m3 + pmulhrsw m1, m0 + paddw m1, m7 + packuswb m1, m1 + pextrw [r2], m1, 0 + pextrw [r2 + r5], m1, 1 + lea r2, [r2 + r5 * 2] + pextrw [r2], m1, 2 + pextrw [r2 + r5], m1, 3 + lea r2, [r2 + r5 * 2] + dec r6d + jnz .loop + RET +;----------------------------------------------------------------------------- + +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal addAvg_4x2, 6,6,4, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride + + mova m1, [pw_256] + mova m3, [pw_128] + add r3, r3 + add r4, r4 + + movh m0, [r0] + movhps m0, [r0 + r3] + movh m2, [r1] + movhps m2, [r1 + r4] + + paddw m0, m2 + pmulhrsw m0, m1 + paddw m0, m3 + + packuswb m0, m0 + movd [r2], m0 + pshufd m0, m0, 1 + movd [r2 + r5], m0 + + RET +;----------------------------------------------------------------------------- + +;----------------------------------------------------------------------------- +%macro ADDAVG_W4_H4 1 +INIT_XMM sse4 +cglobal addAvg_4x%1, 6,7,4, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride + mova m1, [pw_256] + mova m3, [pw_128] + add r3, r3 + add r4, r4 + + mov r6d, %1/4 + +.loop: + movh m0, [r0] + movhps m0, [r0 + r3] + movh m2, [r1] + movhps m2, [r1 + r4] + 
+ paddw m0, m2 + pmulhrsw m0, m1 + paddw m0, m3 + + packuswb m0, m0 + movd [r2], m0 + pshufd m0, m0, 1 + movd [r2 + r5], m0 + + lea r2, [r2 + 2 * r5] + lea r0, [r0 + 2 * r3] + lea r1, [r1 + 2 * r4] + + movh m0, [r0] + movhps m0, [r0 + r3] + movh m2, [r1] + movhps m2, [r1 + r4] + + paddw m0, m2 + pmulhrsw m0, m1 + paddw m0, m3 + + packuswb m0, m0 + movd [r2], m0 + pshufd m0, m0, 1 + movd [r2 + r5], m0 + + lea r2, [r2 + 2 * r5] + lea r0, [r0 + 2 * r3] + lea r1, [r1 + 2 * r4] + + dec r6d + jnz .loop + RET +%endmacro + +ADDAVG_W4_H4 4 +ADDAVG_W4_H4 8 +ADDAVG_W4_H4 16 + +ADDAVG_W4_H4 32 + +;----------------------------------------------------------------------------- + +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal addAvg_6x8, 6,6,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride + + mova m4, [pw_256] + mova m5, [pw_128] + add r3, r3 + add r4, r4 + + movu m0, [r0] + movu m2, [r1] + paddw m0, m2 + pmulhrsw m0, m4 + paddw m0, m5 + packuswb m0, m0 + movd [r2], m0 + pextrw [r2 + 4], m0, 2 + + movu m1, [r0 + r3] + movu m3, [r1 + r4] + paddw m1, m3 + pmulhrsw m1, m4 + paddw m1, m5 + packuswb m1, m1 + movd [r2 + r5], m1 + pextrw [r2 + r5 + 4], m1, 2 + + lea r2, [r2 + 2 * r5] + lea r0, [r0 + 2 * r3] + lea r1, [r1 + 2 * r4] + + movu m0, [r0] + movu m2, [r1] + paddw m0, m2 + pmulhrsw m0, m4 + paddw m0, m5 + packuswb m0, m0 + movd [r2], m0 + pextrw [r2 + 4], m0, 2 + + movu m1, [r0 + r3] + movu m3, [r1 + r4] + paddw m1, m3 + pmulhrsw m1, m4 + paddw m1, m5 + packuswb m1, m1 + movd [r2 + r5], m1 + pextrw [r2 + r5 + 4], m1, 2 + + lea r2, [r2 + 2 * r5] + lea r0, [r0 + 2 * r3] + lea r1, [r1 + 2 * r4] + + movu m0, [r0] + movu m2, [r1] + paddw m0, m2 + pmulhrsw m0, m4 + paddw m0, m5 + packuswb m0, m0 + movd [r2], m0 + pextrw [r2 + 4], m0, 2 + + movu m1, [r0 + r3] + movu m3, [r1 + r4] + paddw m1, m3 + pmulhrsw m1, m4 + paddw m1, m5 + packuswb m1, m1 + movd [r2 + r5], m1 + pextrw [r2 + r5 + 4], m1, 2 + + lea r2, [r2 + 2 * 
r5] + lea r0, [r0 + 2 * r3] + lea r1, [r1 + 2 * r4] + + movu m0, [r0] + movu m2, [r1] + paddw m0, m2 + pmulhrsw m0, m4 + paddw m0, m5 + packuswb m0, m0 + movd [r2], m0 + pextrw [r2 + 4], m0, 2 + + movu m1, [r0 + r3] + movu m3, [r1 + r4] + paddw m1, m3 + pmulhrsw m1, m4 + paddw m1, m5 + packuswb m1, m1 + movd [r2 + r5], m1 + pextrw [r2 + r5 + 4], m1, 2 + + RET +;----------------------------------------------------------------------------- + +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal addAvg_6x16, 6,7,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride + mova m4, [pw_256] + mova m5, [pw_128] + mov r6d, 16/2 + add r3, r3 + add r4, r4 +.loop: + movu m0, [r0] + movu m2, [r1] + movu m1, [r0 + r3] + movu m3, [r1 + r4] + dec r6d + lea r0, [r0 + r3 * 2] + lea r1, [r1 + r4 * 2] + paddw m0, m2 + paddw m1, m3 + pmulhrsw m0, m4 + pmulhrsw m1, m4 + paddw m0, m5 + paddw m1, m5 + packuswb m0, m0 + packuswb m1, m1 + movd [r2], m0 + pextrw [r2 + 4], m0, 2 + movd [r2 + r5], m1 + pextrw [r2 + r5 + 4], m1, 2 + lea r2, [r2 + r5 * 2] + jnz .loop + RET +;----------------------------------------------------------------------------- + +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal addAvg_8x2, 6,6,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride + mova m4, [pw_256] + mova m5, [pw_128] + add r3, r3 + add r4, r4 + + movu m0, [r0] + movu m2, [r1] + paddw m0, m2 + pmulhrsw m0, m4 + paddw m0, m5 + packuswb m0, m0 + movh [r2], m0 + + movu m1, [r0 + r3] + movu m3, [r1 + r4] + paddw m1, m3 + pmulhrsw m1, m4 + paddw m1, m5 + packuswb m1, m1 + movh [r2 + r5], m1 + + RET +;----------------------------------------------------------------------------- + +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal addAvg_8x6, 6,6,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride + + mova m4, [pw_256] + mova m5, [pw_128] 
+ add r3, r3 + add r4, r4 + + movu m0, [r0] + movu m2, [r1] + paddw m0, m2 + pmulhrsw m0, m4 + paddw m0, m5 + packuswb m0, m0 + movh [r2], m0 + + movu m1, [r0 + r3] + movu m3, [r1 + r4] + paddw m1, m3 + pmulhrsw m1, m4 + paddw m1, m5 + packuswb m1, m1 + movh [r2 + r5], m1 + + lea r2, [r2 + 2 * r5] + lea r0, [r0 + 2 * r3] + lea r1, [r1 + 2 * r4] + + movu m0, [r0] + movu m2, [r1] + paddw m0, m2 + pmulhrsw m0, m4 + paddw m0, m5 + packuswb m0, m0 + movh [r2], m0 + + movu m1, [r0 + r3] + movu m3, [r1 + r4] + paddw m1, m3 + pmulhrsw m1, m4 + paddw m1, m5 + packuswb m1, m1 + movh [r2 + r5], m1 + + lea r2, [r2 + 2 * r5] + lea r0, [r0 + 2 * r3] + lea r1, [r1 + 2 * r4] + + movu m0, [r0] + movu m2, [r1] + paddw m0, m2 + pmulhrsw m0, m4 + paddw m0, m5 + packuswb m0, m0 + movh [r2], m0 + + movu m1, [r0 + r3] + movu m3, [r1 + r4] + paddw m1, m3 + pmulhrsw m1, m4 + paddw m1, m5 + packuswb m1, m1 + movh [r2 + r5], m1 + + RET +;----------------------------------------------------------------------------- + +;----------------------------------------------------------------------------- +%macro ADDAVG_W8_H4 1 +INIT_XMM sse4 +cglobal addAvg_8x%1, 6,7,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride + + mova m4, [pw_256] + mova m5, [pw_128] + add r3, r3 + add r4, r4 + + mov r6d, %1/4 + +.loop: + movu m0, [r0] + movu m2, [r1] + paddw m0, m2 + pmulhrsw m0, m4 + paddw m0, m5 + + packuswb m0, m0 + movh [r2], m0 + + movu m1, [r0 + r3] + movu m3, [r1 + r4] + paddw m1, m3 + pmulhrsw m1, m4 + paddw m1, m5 + + packuswb m1, m1 + movh [r2 + r5], m1 + + lea r2, [r2 + 2 * r5] + lea r0, [r0 + 2 * r3] + lea r1, [r1 + 2 * r4] + + movu m0, [r0] + movu m2, [r1] + paddw m0, m2 + pmulhrsw m0, m4 + paddw m0, m5 + + packuswb m0, m0 + movh [r2], m0 + + movu m1, [r0 + r3] + movu m3, [r1 + r4] + paddw m1, m3 + pmulhrsw m1, m4 + paddw m1, m5 + + packuswb m1, m1 + movh [r2 + r5], m1 + + lea r2, [r2 + 2 * r5] + lea r0, [r0 + 2 * r3] + lea r1, [r1 + 2 * r4] + + dec r6d + jnz .loop + RET +%endmacro + 
+ADDAVG_W8_H4 4 +ADDAVG_W8_H4 8 +ADDAVG_W8_H4 16 +ADDAVG_W8_H4 32 + +ADDAVG_W8_H4 12 +ADDAVG_W8_H4 64 + +;----------------------------------------------------------------------------- + + +;----------------------------------------------------------------------------- +%macro ADDAVG_W12_H4 1 +INIT_XMM sse4 +cglobal addAvg_12x%1, 6,7,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride + mova m4, [pw_256] + mova m5, [pw_128] + add r3, r3 + add r4, r4 + + mov r6d, %1/4 + +.loop: + movu m0, [r0] + movu m2, [r1] + paddw m0, m2 + pmulhrsw m0, m4 + paddw m0, m5 + packuswb m0, m0 + movh [r2], m0 + + movh m0, [r0 + 16] + movhps m0, [r0 + 16 + r3] + movh m2, [r1 + 16] + movhps m2, [r1 + 16 + r4] + + paddw m0, m2 + pmulhrsw m0, m4 + paddw m0, m5 + + packuswb m0, m0 + movd [r2 + 8], m0 + pshufd m0, m0, 1 + movd [r2 + 8 + r5], m0 + + movu m1, [r0 + r3] + movu m3, [r1 + r4] + paddw m1, m3 + pmulhrsw m1, m4 + paddw m1, m5 + + packuswb m1, m1 + movh [r2 + r5], m1 + + lea r2, [r2 + 2 * r5] + lea r0, [r0 + 2 * r3] + lea r1, [r1 + 2 * r4] + + movu m0, [r0] + movu m2, [r1] + paddw m0, m2 + pmulhrsw m0, m4 + paddw m0, m5 + + packuswb m0, m0 + movh [r2], m0 + + movh m0, [r0 + 16] + movhps m0, [r0 + 16 + r3] + movh m2, [r1 + 16] + movhps m2, [r1 + 16 + r4] + + paddw m0, m2 + pmulhrsw m0, m4 + paddw m0, m5 + + packuswb m0, m0 + movd [r2 + 8], m0 + pshufd m0, m0, 1 + movd [r2 + 8 + r5], m0 + + movu m1, [r0 + r3] + movu m3, [r1 + r4] + paddw m1, m3 + pmulhrsw m1, m4 + paddw m1, m5 + + packuswb m1, m1 + movh [r2 + r5], m1 + + lea r2, [r2 + 2 * r5] + lea r0, [r0 + 2 * r3] + lea r1, [r1 + 2 * r4] + + dec r6d + jnz .loop + RET +%endmacro + +ADDAVG_W12_H4 16 + +ADDAVG_W12_H4 32 + +;----------------------------------------------------------------------------- + + +;----------------------------------------------------------------------------- +%macro ADDAVG_W16_H4 1 +INIT_XMM sse4 +cglobal addAvg_16x%1, 6,7,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride + mova m4, [pw_256] + 
mova m5, [pw_128] + add r3, r3 + add r4, r4 + + mov r6d, %1/4 + +.loop: + movu m0, [r0] + movu m2, [r1] + paddw m0, m2 + pmulhrsw m0, m4 + paddw m0, m5 + + movu m1, [r0 + 16] + movu m2, [r1 + 16] + paddw m1, m2 + pmulhrsw m1, m4 + paddw m1, m5 + + packuswb m0, m1 + movu [r2], m0 + + movu m1, [r0 + r3] + movu m3, [r1 + r4] + paddw m1, m3 + pmulhrsw m1, m4 + paddw m1, m5 + + movu m2, [r0 + 16 + r3] + movu m3, [r1 + 16 + r4] + paddw m2, m3 + pmulhrsw m2, m4 + paddw m2, m5 + + packuswb m1, m2 + movu [r2 + r5], m1 + + lea r2, [r2 + 2 * r5] + lea r0, [r0 + 2 * r3] + lea r1, [r1 + 2 * r4] + + movu m0, [r0] + movu m2, [r1] + paddw m0, m2 + pmulhrsw m0, m4 + paddw m0, m5 + + movu m1, [r0 + 16] + movu m2, [r1 + 16] + paddw m1, m2 + pmulhrsw m1, m4 + paddw m1, m5 + + packuswb m0, m1 + movu [r2], m0 + + movu m1, [r0 + r3] + movu m3, [r1 + r4] + paddw m1, m3 + pmulhrsw m1, m4 + paddw m1, m5 + + movu m2, [r0 + 16 + r3] + movu m3, [r1 + 16 + r4] + paddw m2, m3 + pmulhrsw m2, m4 + paddw m2, m5 + + packuswb m1, m2 + movu [r2 + r5], m1 + + lea r2, [r2 + 2 * r5] + lea r0, [r0 + 2 * r3] + lea r1, [r1 + 2 * r4] + + dec r6d + jnz .loop + RET +%endmacro + +ADDAVG_W16_H4 4 +ADDAVG_W16_H4 8 +ADDAVG_W16_H4 12 +ADDAVG_W16_H4 16 +ADDAVG_W16_H4 32 +ADDAVG_W16_H4 64 + +ADDAVG_W16_H4 24 + +;----------------------------------------------------------------------------- + + +;----------------------------------------------------------------------------- +%macro ADDAVG_W24_H2 2 +INIT_XMM sse4 +cglobal addAvg_%1x%2, 6,7,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride + mova m4, [pw_256] + mova m5, [pw_128] + add r3, r3 + add r4, r4 + + mov r6d, %2/2 + +.loop: + movu m0, [r0] + movu m2, [r1] + paddw m0, m2 + pmulhrsw m0, m4 + paddw m0, m5 + + movu m1, [r0 + 16] + movu m2, [r1 + 16] + paddw m1, m2 + pmulhrsw m1, m4 + paddw m1, m5 + + packuswb m0, m1 + movu [r2], m0 + + movu m0, [r0 + 32] + movu m2, [r1 + 32] + paddw m0, m2 + pmulhrsw m0, m4 + paddw m0, m5 + + packuswb m0, m0 + movh [r2 + 
16], m0 + + movu m1, [r0 + r3] + movu m3, [r1 + r4] + paddw m1, m3 + pmulhrsw m1, m4 + paddw m1, m5 + + movu m2, [r0 + 16 + r3] + movu m3, [r1 + 16 + r4] + paddw m2, m3 + pmulhrsw m2, m4 + paddw m2, m5 + + packuswb m1, m2 + movu [r2 + r5], m1 + + movu m1, [r0 + 32 + r3] + movu m3, [r1 + 32 + r4] + paddw m1, m3 + pmulhrsw m1, m4 + paddw m1, m5 + + packuswb m1, m1 + movh [r2 + 16 + r5], m1 + + lea r2, [r2 + 2 * r5] + lea r0, [r0 + 2 * r3] + lea r1, [r1 + 2 * r4] + + dec r6d + jnz .loop + RET +%endmacro + +ADDAVG_W24_H2 24, 32 + +ADDAVG_W24_H2 24, 64 + +;----------------------------------------------------------------------------- + +;----------------------------------------------------------------------------- +%macro ADDAVG_W32_H2 1 +INIT_XMM sse4 +cglobal addAvg_32x%1, 6,7,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride + mova m4, [pw_256] + mova m5, [pw_128] + add r3, r3 + add r4, r4 + + mov r6d, %1/2 + +.loop: + movu m0, [r0] + movu m2, [r1] + paddw m0, m2 + pmulhrsw m0, m4 + paddw m0, m5 + + movu m1, [r0 + 16] + movu m2, [r1 + 16] + paddw m1, m2 + pmulhrsw m1, m4 + paddw m1, m5 + + packuswb m0, m1 + movu [r2], m0 + + movu m0, [r0 + 32] + movu m2, [r1 + 32] + paddw m0, m2 + pmulhrsw m0, m4 + paddw m0, m5 + + movu m1, [r0 + 48] + movu m2, [r1 + 48] + paddw m1, m2 + pmulhrsw m1, m4 + paddw m1, m5 + + packuswb m0, m1 + movu [r2 + 16], m0 + + movu m1, [r0 + r3] + movu m3, [r1 + r4] + paddw m1, m3 + pmulhrsw m1, m4 + paddw m1, m5 + + movu m2, [r0 + 16 + r3] + movu m3, [r1 + 16 + r4] + paddw m2, m3 + pmulhrsw m2, m4 + paddw m2, m5 + + packuswb m1, m2 + movu [r2 + r5], m1 + + movu m1, [r0 + 32 + r3] + movu m3, [r1 + 32 + r4] + paddw m1, m3 + pmulhrsw m1, m4 + paddw m1, m5 + + movu m2, [r0 + 48 + r3] + movu m3, [r1 + 48 + r4] + paddw m2, m3 + pmulhrsw m2, m4 + paddw m2, m5 + + packuswb m1, m2 + movu [r2 + 16 + r5], m1 + + lea r2, [r2 + 2 * r5] + lea r0, [r0 + 2 * r3] + lea r1, [r1 + 2 * r4] + + dec r6d + jnz .loop + RET +%endmacro + +ADDAVG_W32_H2 8 
+ADDAVG_W32_H2 16 +ADDAVG_W32_H2 24 +ADDAVG_W32_H2 32 +ADDAVG_W32_H2 64 + +ADDAVG_W32_H2 48 + +;----------------------------------------------------------------------------- + + +;----------------------------------------------------------------------------- +%macro ADDAVG_W48_H2 1 +INIT_XMM sse4 +cglobal addAvg_48x%1, 6,7,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride + + mova m4, [pw_256] + mova m5, [pw_128] + add r3, r3 + add r4, r4 + + mov r6d, %1/2 + +.loop: + movu m0, [r0] + movu m2, [r1] + paddw m0, m2 + pmulhrsw m0, m4 + paddw m0, m5 + + movu m1, [r0 + 16] + movu m2, [r1 + 16] + paddw m1, m2 + pmulhrsw m1, m4 + paddw m1, m5 + + packuswb m0, m1 + movu [r2], m0 + + movu m0, [r0 + 32] + movu m2, [r1 + 32] + paddw m0, m2 + pmulhrsw m0, m4 + paddw m0, m5 + + movu m1, [r0 + 48] + movu m2, [r1 + 48] + paddw m1, m2 + pmulhrsw m1, m4 + paddw m1, m5 + + packuswb m0, m1 + movu [r2 + 16], m0 + + movu m0, [r0 + 64] + movu m2, [r1 + 64] + paddw m0, m2 + pmulhrsw m0, m4 + paddw m0, m5 + + movu m1, [r0 + 80] + movu m2, [r1 + 80] + paddw m1, m2 + pmulhrsw m1, m4 + paddw m1, m5 + + packuswb m0, m1 + movu [r2 + 32], m0 + + movu m1, [r0 + r3] + movu m3, [r1 + r4] + paddw m1, m3 + pmulhrsw m1, m4 + paddw m1, m5 + + movu m2, [r0 + 16 + r3] + movu m3, [r1 + 16 + r4] + paddw m2, m3 + pmulhrsw m2, m4 + paddw m2, m5 + + packuswb m1, m2 + movu [r2 + r5], m1 + + movu m1, [r0 + 32 + r3] + movu m3, [r1 + 32 + r4] + paddw m1, m3 + pmulhrsw m1, m4 + paddw m1, m5 + + movu m2, [r0 + 48 + r3] + movu m3, [r1 + 48 + r4] + paddw m2, m3 + pmulhrsw m2, m4 + paddw m2, m5 + + packuswb m1, m2 + movu [r2 + 16 + r5], m1 + + movu m1, [r0 + 64 + r3] + movu m3, [r1 + 64 + r4] + paddw m1, m3 + pmulhrsw m1, m4 + paddw m1, m5 + + movu m2, [r0 + 80 + r3] + movu m3, [r1 + 80 + r4] + paddw m2, m3 + pmulhrsw m2, m4 + paddw m2, m5 + + packuswb m1, m2 + movu [r2 + 32 + r5], m1 + + lea r2, [r2 + 2 * r5] + lea r0, [r0 + 2 * r3] + lea r1, [r1 + 2 * r4] + + dec r6d + jnz .loop + RET +%endmacro + 
+ADDAVG_W48_H2 64 + +;----------------------------------------------------------------------------- + +;----------------------------------------------------------------------------- +%macro ADDAVG_W64_H1 1 +INIT_XMM sse4 +cglobal addAvg_64x%1, 6,7,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride + mova m4, [pw_256] + mova m5, [pw_128] + add r3, r3 + add r4, r4 + + mov r6d, %1 + +.loop: + movu m0, [r0] + movu m2, [r1] + paddw m0, m2 + pmulhrsw m0, m4 + paddw m0, m5 + + movu m1, [r0 + 16] + movu m2, [r1 + 16] + paddw m1, m2 + pmulhrsw m1, m4 + paddw m1, m5 + + packuswb m0, m1 + movu [r2], m0 + + movu m0, [r0 + 32] + movu m2, [r1 + 32] + paddw m0, m2 + pmulhrsw m0, m4 + paddw m0, m5 + + movu m1, [r0 + 48] + movu m2, [r1 + 48] + paddw m1, m2 + pmulhrsw m1, m4 + paddw m1, m5 + + packuswb m0, m1 + movu [r2 + 16], m0 + + movu m0, [r0 + 64] + movu m2, [r1 + 64] + paddw m0, m2 + pmulhrsw m0, m4 + paddw m0, m5 + + movu m1, [r0 + 80] + movu m2, [r1 + 80] + paddw m1, m2 + pmulhrsw m1, m4 + paddw m1, m5 + + packuswb m0, m1 + movu [r2 + 32], m0 + + movu m0, [r0 + 96] + movu m2, [r1 + 96] + paddw m0, m2 + pmulhrsw m0, m4 + paddw m0, m5 + + movu m1, [r0 + 112] + movu m2, [r1 + 112] + paddw m1, m2 + pmulhrsw m1, m4 + paddw m1, m5 + + packuswb m0, m1 + movu [r2 + 48], m0 + + add r2, r5 + add r0, r3 + add r1, r4 + + dec r6d + jnz .loop + RET +%endmacro + +ADDAVG_W64_H1 16 +ADDAVG_W64_H1 32 +ADDAVG_W64_H1 48 +ADDAVG_W64_H1 64 +;----------------------------------------------------------------------------- +%endif ; HIGH_BIT_DEPTH + +;============================================================================= +; implicit weighted biprediction +;============================================================================= +; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64 +%if WIN64 + DECLARE_REG_TMP 0,1,2,3,4,5,4,5 + %macro AVG_START 0-1 0 + PROLOGUE 6,7,%1 + %endmacro +%elif UNIX64 + DECLARE_REG_TMP 0,1,2,3,4,5,7,8 + %macro AVG_START 0-1 0 + PROLOGUE 6,9,%1 + 
%endmacro +%else + DECLARE_REG_TMP 1,2,3,4,5,6,1,2 + %macro AVG_START 0-1 0 + PROLOGUE 0,7,%1 + mov t0, r0m + mov t1, r1m + mov t2, r2m + mov t3, r3m + mov t4, r4m + mov t5, r5m + %endmacro +%endif + +%macro AVG_END 0 + lea t4, [t4+t5*2*SIZEOF_PIXEL] + lea t2, [t2+t3*2*SIZEOF_PIXEL] + lea t0, [t0+t1*2*SIZEOF_PIXEL] + sub eax, 2 + jg .height_loop + %ifidn movu,movq ; detect MMX + EMMS + %endif + RET +%endmacro + +%if HIGH_BIT_DEPTH + +%macro BIWEIGHT_MMX 2 + movh m0, %1 + movh m1, %2 + punpcklwd m0, m1 + pmaddwd m0, m3 + paddd m0, m4 + psrad m0, 6 +%endmacro + +%macro BIWEIGHT_START_MMX 0 + movzx t6d, word r6m + mov t7d, 64 + sub t7d, t6d + shl t7d, 16 + add t6d, t7d + movd m3, t6d + SPLATD m3, m3 + mova m4, [pd_32] + pxor m5, m5 +%endmacro + +%else ;!HIGH_BIT_DEPTH +%macro BIWEIGHT_MMX 2 + movh m0, %1 + movh m1, %2 + punpcklbw m0, m5 + punpcklbw m1, m5 + pmullw m0, m2 + pmullw m1, m3 + paddw m0, m1 + paddw m0, m4 + psraw m0, 6 +%endmacro + +%macro BIWEIGHT_START_MMX 0 + movd m2, r6m + SPLATW m2, m2 ; weight_dst + mova m3, [pw_64] + psubw m3, m2 ; weight_src + mova m4, [pw_32] ; rounding + pxor m5, m5 +%endmacro +%endif ;HIGH_BIT_DEPTH + +%macro BIWEIGHT_SSSE3 2 + movh m0, %1 + movh m1, %2 + punpcklbw m0, m1 + pmaddubsw m0, m3 + pmulhrsw m0, m4 +%endmacro + +%macro BIWEIGHT_START_SSSE3 0 + movzx t6d, byte r6m ; FIXME x86_64 + mov t7d, 64 + sub t7d, t6d + shl t7d, 8 + add t6d, t7d + mova m4, [pw_512] + movd xm3, t6d +%if cpuflag(avx2) + vpbroadcastw m3, xm3 +%else + SPLATW m3, m3 ; weight_dst,src +%endif +%endmacro + +%if HIGH_BIT_DEPTH +%macro BIWEIGHT_ROW 4 + BIWEIGHT [%2], [%3] +%if %4==mmsize/4 + packssdw m0, m0 + CLIPW m0, m5, m7 + movh [%1], m0 +%else + SWAP 0, 6 + BIWEIGHT [%2+mmsize/2], [%3+mmsize/2] + packssdw m6, m0 + CLIPW m6, m5, m7 + mova [%1], m6 +%endif +%endmacro + +%else ;!HIGH_BIT_DEPTH +%macro BIWEIGHT_ROW 4 + BIWEIGHT [%2], [%3] +%if %4==mmsize/2 + packuswb m0, m0 + movh [%1], m0 +%else + SWAP 0, 6 + BIWEIGHT [%2+mmsize/2], [%3+mmsize/2] + 
packuswb m6, m0 +%if %4 != 12 + mova [%1], m6 +%else ; !w12 + movh [%1], m6 + movhlps m6, m6 + movd [%1+mmsize/2], m6 +%endif ; w12 +%endif +%endmacro + +%endif ;HIGH_BIT_DEPTH + +;----------------------------------------------------------------------------- +; int pixel_avg_weight_w16( pixel *dst, intptr_t, pixel *src1, intptr_t, pixel *src2, intptr_t, int i_weight ) +;----------------------------------------------------------------------------- +%macro AVG_WEIGHT 1-2 0 +cglobal pixel_avg_weight_w%1 + BIWEIGHT_START + AVG_START %2 +%if HIGH_BIT_DEPTH + mova m7, [pw_pixel_max] +%endif +.height_loop: +%if mmsize==16 && %1==mmsize/(2*SIZEOF_PIXEL) + BIWEIGHT [t2], [t4] + SWAP 0, 6 + BIWEIGHT [t2+SIZEOF_PIXEL*t3], [t4+SIZEOF_PIXEL*t5] +%if HIGH_BIT_DEPTH + packssdw m6, m0 + CLIPW m6, m5, m7 +%else ;!HIGH_BIT_DEPTH + packuswb m6, m0 +%endif ;HIGH_BIT_DEPTH + movlps [t0], m6 + movhps [t0+SIZEOF_PIXEL*t1], m6 +%else +%assign x 0 +%rep (%1*SIZEOF_PIXEL+mmsize-1)/mmsize +%assign y mmsize +%if (%1 == 12) && (%1*SIZEOF_PIXEL-x < mmsize) +%assign y (%1*SIZEOF_PIXEL-x) +%endif + BIWEIGHT_ROW t0+x, t2+x, t4+x, y + BIWEIGHT_ROW t0+x+SIZEOF_PIXEL*t1, t2+x+SIZEOF_PIXEL*t3, t4+x+SIZEOF_PIXEL*t5, y +%assign x x+mmsize +%endrep +%endif + AVG_END +%endmacro + +%define BIWEIGHT BIWEIGHT_MMX +%define BIWEIGHT_START BIWEIGHT_START_MMX +INIT_MMX mmx2 +AVG_WEIGHT 4 +AVG_WEIGHT 8 +AVG_WEIGHT 12 +AVG_WEIGHT 16 +AVG_WEIGHT 32 +AVG_WEIGHT 64 +AVG_WEIGHT 24 +AVG_WEIGHT 48 +%if HIGH_BIT_DEPTH +INIT_XMM sse2 +AVG_WEIGHT 4, 8 +AVG_WEIGHT 8, 8 +AVG_WEIGHT 12, 8 +AVG_WEIGHT 16, 8 +AVG_WEIGHT 24, 8 +AVG_WEIGHT 32, 8 +AVG_WEIGHT 48, 8 +AVG_WEIGHT 64, 8 +%else ;!HIGH_BIT_DEPTH +INIT_XMM sse2 +AVG_WEIGHT 8, 7 +AVG_WEIGHT 12, 7 +AVG_WEIGHT 16, 7 +AVG_WEIGHT 32, 7 +AVG_WEIGHT 64, 7 +AVG_WEIGHT 24, 7 +AVG_WEIGHT 48, 7 +%define BIWEIGHT BIWEIGHT_SSSE3 +%define BIWEIGHT_START BIWEIGHT_START_SSSE3 +INIT_MMX ssse3 +AVG_WEIGHT 4 +INIT_XMM ssse3 +AVG_WEIGHT 8, 7 +AVG_WEIGHT 12, 7 +AVG_WEIGHT 16, 7 +AVG_WEIGHT 
32, 7 +AVG_WEIGHT 64, 7 +AVG_WEIGHT 24, 7 +AVG_WEIGHT 48, 7 + +INIT_YMM avx2 +cglobal pixel_avg_weight_w16 + BIWEIGHT_START + AVG_START 5 +.height_loop: + movu xm0, [t2] + movu xm1, [t4] + vinserti128 m0, m0, [t2+t3], 1 + vinserti128 m1, m1, [t4+t5], 1 + SBUTTERFLY bw, 0, 1, 2 + pmaddubsw m0, m3 + pmaddubsw m1, m3 + pmulhrsw m0, m4 + pmulhrsw m1, m4 + packuswb m0, m1 + mova [t0], xm0 + vextracti128 [t0+t1], m0, 1 + AVG_END +%endif ;HIGH_BIT_DEPTH + +;============================================================================= +; P frame explicit weighted prediction +;============================================================================= + +%if HIGH_BIT_DEPTH +; width +%macro WEIGHT_START 1 + mova m0, [r4+ 0] ; 1<= mmsize + WEIGHT_ROWx2 %1+x, %1+r3+x, %2+x, %2+r1+x, %4 + %assign x (x+mmsize) +%else + %assign w %3-x +%if w == 20 + %assign w 16 +%endif + WEIGHT_COL %1+x, %1+r3+x, %2+x, %2+r1+x, w, %4 + %assign x (x+w) +%endif +%if x >= %3 + %exitrep +%endif +%endrep +%endmacro + +%endif ; HIGH_BIT_DEPTH + +;----------------------------------------------------------------------------- +;void mc_weight_wX( pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride, weight_t *weight, int h ) +;----------------------------------------------------------------------------- + +%macro WEIGHTER 1 +cglobal mc_weight_w%1, 6,6,8 + FIX_STRIDES r1, r3 + WEIGHT_START %1 +%if cpuflag(ssse3) && HIGH_BIT_DEPTH == 0 + ; we can merge the shift step into the scale factor + ; if (m3<<7) doesn't overflow an int16_t + cmp byte [r4+1], 0 + jz .fast +%endif +.loop: + WEIGHT_TWO_ROW r2, r0, %1, 0 + lea r0, [r0+r1*2] + lea r2, [r2+r3*2] + sub r5d, 2 + jg .loop + RET +%if cpuflag(ssse3) && HIGH_BIT_DEPTH == 0 +.fast: + psllw m3, 7 +.fastloop: + WEIGHT_TWO_ROW r2, r0, %1, 1 + lea r0, [r0+r1*2] + lea r2, [r2+r3*2] + sub r5d, 2 + jg .fastloop + RET +%endif +%endmacro + +INIT_MMX mmx2 +WEIGHTER 4 +WEIGHTER 8 +WEIGHTER 12 +WEIGHTER 16 +WEIGHTER 20 +INIT_XMM sse2 +WEIGHTER 8 +WEIGHTER 
16 +WEIGHTER 20 +%if HIGH_BIT_DEPTH +WEIGHTER 12 +%else +INIT_MMX ssse3 +WEIGHTER 4 +INIT_XMM ssse3 +WEIGHTER 8 +WEIGHTER 16 +WEIGHTER 20 +INIT_YMM avx2 +WEIGHTER 8 +WEIGHTER 16 +WEIGHTER 20 +%endif + +%macro OFFSET_OP 7 + mov%6 m0, [%1] + mov%6 m1, [%2] +%if HIGH_BIT_DEPTH + p%5usw m0, m2 + p%5usw m1, m2 +%ifidn %5,add + pminsw m0, m3 + pminsw m1, m3 +%endif +%else + p%5usb m0, m2 + p%5usb m1, m2 +%endif + mov%7 [%3], m0 + mov%7 [%4], m1 +%endmacro + +%macro OFFSET_TWO_ROW 4 +%assign x 0 +%rep %3 +%if (%3*SIZEOF_PIXEL-x) >= mmsize + OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, u, a + %assign x (x+mmsize) +%else +%if HIGH_BIT_DEPTH + OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, h, h +%else + OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, d, d +%endif + %exitrep +%endif +%if x >= %3*SIZEOF_PIXEL + %exitrep +%endif +%endrep +%endmacro + +;----------------------------------------------------------------------------- +;void mc_offset_wX( pixel *src, intptr_t i_src_stride, pixel *dst, intptr_t i_dst_stride, weight_t *w, int h ) +;----------------------------------------------------------------------------- +%macro OFFSET 2 +cglobal mc_offset%2_w%1, 6,6 + FIX_STRIDES r1, r3 + mova m2, [r4] +%if HIGH_BIT_DEPTH +%ifidn %2,add + mova m3, [pw_pixel_max] +%endif +%endif +.loop: + OFFSET_TWO_ROW r2, r0, %1, %2 + lea r0, [r0+r1*2] + lea r2, [r2+r3*2] + sub r5d, 2 + jg .loop + RET +%endmacro + +%macro OFFSETPN 1 + OFFSET %1, add + OFFSET %1, sub +%endmacro +INIT_MMX mmx2 +OFFSETPN 4 +OFFSETPN 8 +OFFSETPN 12 +OFFSETPN 16 +OFFSETPN 20 +INIT_XMM sse2 +OFFSETPN 12 +OFFSETPN 16 +OFFSETPN 20 +%if HIGH_BIT_DEPTH +INIT_XMM sse2 +OFFSETPN 8 +%endif + + +;============================================================================= +; pixel avg +;============================================================================= + +;----------------------------------------------------------------------------- +; void pixel_avg_4x4( pixel *dst, intptr_t dst_stride, pixel 
*src1, intptr_t src1_stride, +; pixel *src2, intptr_t src2_stride, int weight ); +;----------------------------------------------------------------------------- +%macro AVGH 2 +cglobal pixel_avg_%1x%2 + mov eax, %2 + cmp dword r6m, 32 + jne pixel_avg_weight_w%1 %+ SUFFIX +%if cpuflag(avx2) && %1 == 16 ; all AVX2 machines can do fast 16-byte unaligned loads + jmp pixel_avg_w%1_avx2 +%else +%if mmsize == 16 && (%1 % 16 == 0) + test dword r4m, 15 + jz pixel_avg_w%1_sse2 +%endif + jmp pixel_avg_w%1_mmx2 +%endif +%endmacro + +;----------------------------------------------------------------------------- +; void pixel_avg_w4( pixel *dst, intptr_t dst_stride, pixel *src1, intptr_t src1_stride, +; pixel *src2, intptr_t src2_stride, int height, int weight ); +;----------------------------------------------------------------------------- + +%macro AVG_FUNC 3-4 +cglobal pixel_avg_w%1 + AVG_START +.height_loop: +%assign x 0 +%rep (%1*SIZEOF_PIXEL+mmsize-1)/mmsize + %2 m0, [t2+x] + %2 m1, [t2+x+SIZEOF_PIXEL*t3] +%if HIGH_BIT_DEPTH + pavgw m0, [t4+x] + pavgw m1, [t4+x+SIZEOF_PIXEL*t5] +%else ;!HIGH_BIT_DEPTH + pavgb m0, [t4+x] + pavgb m1, [t4+x+SIZEOF_PIXEL*t5] +%endif +%if (%1 == 12) && (%1-x/SIZEOF_PIXEL < mmsize) + %4 [t0+x], m0 + %4 [t0+x+SIZEOF_PIXEL*t1], m1 +%else + %3 [t0+x], m0 + %3 [t0+x+SIZEOF_PIXEL*t1], m1 +%endif +%assign x x+mmsize +%endrep + AVG_END +%endmacro + +%if HIGH_BIT_DEPTH + +INIT_MMX mmx2 +AVG_FUNC 4, movq, movq +AVGH 4, 16 +AVGH 4, 8 +AVGH 4, 4 +AVGH 4, 2 + +AVG_FUNC 8, movq, movq +AVGH 8, 32 +AVGH 8, 16 +AVGH 8, 8 +AVGH 8, 4 + +AVG_FUNC 16, movq, movq +AVGH 16, 64 +AVGH 16, 32 +AVGH 16, 16 +AVGH 16, 12 +AVGH 16, 8 +AVGH 16, 4 + +AVG_FUNC 24, movq, movq +AVGH 24, 32 + +AVG_FUNC 32, movq, movq +AVGH 32, 32 +AVGH 32, 24 +AVGH 32, 16 +AVGH 32, 8 + +AVG_FUNC 48, movq, movq +AVGH 48, 64 + +AVG_FUNC 64, movq, movq +AVGH 64, 64 +AVGH 64, 48 +AVGH 64, 32 +AVGH 64, 16 + +AVG_FUNC 12, movq, movq, movq +AVGH 12, 16 + +INIT_XMM sse2 +AVG_FUNC 4, movq, movq +AVGH 4, 
16 +AVGH 4, 8 +AVGH 4, 4 +AVGH 4, 2 + +AVG_FUNC 8, movdqu, movdqa +AVGH 8, 32 +AVGH 8, 16 +AVGH 8, 8 +AVGH 8, 4 + +AVG_FUNC 16, movdqu, movdqa +AVGH 16, 64 +AVGH 16, 32 +AVGH 16, 16 +AVGH 16, 12 +AVGH 16, 8 +AVGH 16, 4 + +AVG_FUNC 24, movdqu, movdqa +AVGH 24, 32 + +AVG_FUNC 32, movdqu, movdqa +AVGH 32, 64 +AVGH 32, 32 +AVGH 32, 24 +AVGH 32, 16 +AVGH 32, 8 + +AVG_FUNC 48, movdqu, movdqa +AVGH 48, 64 + +AVG_FUNC 64, movdqu, movdqa +AVGH 64, 64 +AVGH 64, 48 +AVGH 64, 32 +AVGH 64, 16 + +AVG_FUNC 12, movdqu, movdqa, movq +AVGH 12, 16 + +%else ;!HIGH_BIT_DEPTH + +INIT_MMX mmx2 +AVG_FUNC 4, movd, movd +AVGH 4, 16 +AVGH 4, 8 +AVGH 4, 4 +AVGH 4, 2 + +AVG_FUNC 8, movq, movq +AVGH 8, 32 +AVGH 8, 16 +AVGH 8, 8 +AVGH 8, 4 + +AVG_FUNC 12, movq, movq, movd +AVGH 12, 16 + +AVG_FUNC 16, movq, movq +AVGH 16, 64 +AVGH 16, 32 +AVGH 16, 16 +AVGH 16, 12 +AVGH 16, 8 +AVGH 16, 4 + +AVG_FUNC 32, movq, movq +AVGH 32, 32 +AVGH 32, 24 +AVGH 32, 16 +AVGH 32, 8 + +AVG_FUNC 64, movq, movq +AVGH 64, 64 +AVGH 64, 48 +AVGH 64, 16 + +AVG_FUNC 24, movq, movq +AVGH 24, 32 + +AVG_FUNC 48, movq, movq +AVGH 48, 64 + +INIT_XMM sse2 +AVG_FUNC 64, movdqu, movdqa +AVGH 64, 64 +AVGH 64, 48 +AVGH 64, 32 +AVGH 64, 16 + +AVG_FUNC 32, movdqu, movdqa +AVGH 32, 64 +AVGH 32, 32 +AVGH 32, 24 +AVGH 32, 16 +AVGH 32, 8 + +AVG_FUNC 24, movdqu, movdqa +AVGH 24, 32 + +AVG_FUNC 16, movdqu, movdqa +AVGH 16, 64 +AVGH 16, 32 +AVGH 16, 16 +AVGH 16, 12 +AVGH 16, 8 +AVGH 16, 4 + +AVG_FUNC 48, movdqu, movdqa +AVGH 48, 64 + +AVG_FUNC 12, movdqu, movdqa, movq +AVGH 12, 16 + +AVGH 8, 32 +AVGH 8, 16 +AVGH 8, 8 +AVGH 8, 4 +INIT_XMM ssse3 +AVGH 24, 32 + +AVGH 64, 64 +AVGH 64, 48 +AVGH 64, 32 +AVGH 64, 16 + +AVGH 32, 64 +AVGH 32, 32 +AVGH 32, 24 +AVGH 32, 16 +AVGH 32, 8 + +AVGH 16, 64 +AVGH 16, 32 +AVGH 16, 16 +AVGH 16, 12 +AVGH 16, 8 +AVGH 16, 4 + +AVGH 48, 64 + +AVGH 12, 16 + +AVGH 8, 32 +AVGH 8, 16 +AVGH 8, 8 +AVGH 8, 4 +INIT_MMX ssse3 +AVGH 4, 16 +AVGH 4, 8 +AVGH 4, 4 +AVGH 4, 2 +INIT_XMM avx2 +; TODO: active AVX2 after debug 
+;AVG_FUNC 24, movdqu, movdqa +;AVGH 24, 32 + +;AVG_FUNC 64, movdqu, movdqa +;AVGH 64, 64 +;AVGH 64, 48 +;AVGH 64, 16 + +;AVG_FUNC 32, movdqu, movdqa +;AVGH 32, 64 +;AVGH 32, 32 +;AVGH 32, 24 +;AVGH 32, 16 +;AVGH 32, 8 +AVG_FUNC 16, movdqu, movdqa +AVGH 16, 16 +AVGH 16, 8 + +%endif ;HIGH_BIT_DEPTH + + + +;============================================================================= +; pixel avg2 +;============================================================================= + +%if HIGH_BIT_DEPTH +;----------------------------------------------------------------------------- +; void pixel_avg2_wN( uint16_t *dst, intptr_t dst_stride, +; uint16_t *src1, intptr_t src_stride, +; uint16_t *src2, int height ); +;----------------------------------------------------------------------------- +%macro AVG2_W_ONE 1 +cglobal pixel_avg2_w%1, 6,7,4 + sub r4, r2 + lea r6, [r4+r3*2] +.height_loop: + movu m0, [r2] + movu m1, [r2+r3*2] +%if cpuflag(avx) || mmsize == 8 + pavgw m0, [r2+r4] + pavgw m1, [r2+r6] +%else + movu m2, [r2+r4] + movu m3, [r2+r6] + pavgw m0, m2 + pavgw m1, m3 +%endif + mova [r0], m0 + mova [r0+r1*2], m1 + lea r2, [r2+r3*4] + lea r0, [r0+r1*4] + sub r5d, 2 + jg .height_loop + RET +%endmacro + +%macro AVG2_W_TWO 3 +cglobal pixel_avg2_w%1, 6,7,8 + sub r4, r2 + lea r6, [r4+r3*2] +.height_loop: + movu m0, [r2] + %2 m1, [r2+mmsize] + movu m2, [r2+r3*2] + %2 m3, [r2+r3*2+mmsize] +%if mmsize == 8 + pavgw m0, [r2+r4] + pavgw m1, [r2+r4+mmsize] + pavgw m2, [r2+r6] + pavgw m3, [r2+r6+mmsize] +%else + movu m4, [r2+r4] + %2 m5, [r2+r4+mmsize] + movu m6, [r2+r6] + %2 m7, [r2+r6+mmsize] + pavgw m0, m4 + pavgw m1, m5 + pavgw m2, m6 + pavgw m3, m7 +%endif + mova [r0], m0 + %3 [r0+mmsize], m1 + mova [r0+r1*2], m2 + %3 [r0+r1*2+mmsize], m3 + lea r2, [r2+r3*4] + lea r0, [r0+r1*4] + sub r5d, 2 + jg .height_loop + RET +%endmacro + +INIT_MMX mmx2 +AVG2_W_ONE 4 +AVG2_W_TWO 8, movu, mova +INIT_XMM sse2 +AVG2_W_ONE 8 +AVG2_W_TWO 10, movd, movd +AVG2_W_TWO 16, movu, mova +INIT_YMM avx2 
+AVG2_W_ONE 16 + +INIT_MMX +cglobal pixel_avg2_w10_mmx2, 6,7 + sub r4, r2 + lea r6, [r4+r3*2] +.height_loop: + movu m0, [r2+ 0] + movu m1, [r2+ 8] + movh m2, [r2+16] + movu m3, [r2+r3*2+ 0] + movu m4, [r2+r3*2+ 8] + movh m5, [r2+r3*2+16] + pavgw m0, [r2+r4+ 0] + pavgw m1, [r2+r4+ 8] + pavgw m2, [r2+r4+16] + pavgw m3, [r2+r6+ 0] + pavgw m4, [r2+r6+ 8] + pavgw m5, [r2+r6+16] + mova [r0+ 0], m0 + mova [r0+ 8], m1 + movh [r0+16], m2 + mova [r0+r1*2+ 0], m3 + mova [r0+r1*2+ 8], m4 + movh [r0+r1*2+16], m5 + lea r2, [r2+r3*2*2] + lea r0, [r0+r1*2*2] + sub r5d, 2 + jg .height_loop + RET + +cglobal pixel_avg2_w16_mmx2, 6,7 + sub r4, r2 + lea r6, [r4+r3*2] +.height_loop: + movu m0, [r2+ 0] + movu m1, [r2+ 8] + movu m2, [r2+16] + movu m3, [r2+24] + movu m4, [r2+r3*2+ 0] + movu m5, [r2+r3*2+ 8] + movu m6, [r2+r3*2+16] + movu m7, [r2+r3*2+24] + pavgw m0, [r2+r4+ 0] + pavgw m1, [r2+r4+ 8] + pavgw m2, [r2+r4+16] + pavgw m3, [r2+r4+24] + pavgw m4, [r2+r6+ 0] + pavgw m5, [r2+r6+ 8] + pavgw m6, [r2+r6+16] + pavgw m7, [r2+r6+24] + mova [r0+ 0], m0 + mova [r0+ 8], m1 + mova [r0+16], m2 + mova [r0+24], m3 + mova [r0+r1*2+ 0], m4 + mova [r0+r1*2+ 8], m5 + mova [r0+r1*2+16], m6 + mova [r0+r1*2+24], m7 + lea r2, [r2+r3*2*2] + lea r0, [r0+r1*2*2] + sub r5d, 2 + jg .height_loop + RET + +cglobal pixel_avg2_w18_mmx2, 6,7 + sub r4, r2 +.height_loop: + movu m0, [r2+ 0] + movu m1, [r2+ 8] + movu m2, [r2+16] + movu m3, [r2+24] + movh m4, [r2+32] + pavgw m0, [r2+r4+ 0] + pavgw m1, [r2+r4+ 8] + pavgw m2, [r2+r4+16] + pavgw m3, [r2+r4+24] + pavgw m4, [r2+r4+32] + mova [r0+ 0], m0 + mova [r0+ 8], m1 + mova [r0+16], m2 + mova [r0+24], m3 + movh [r0+32], m4 + lea r2, [r2+r3*2] + lea r0, [r0+r1*2] + dec r5d + jg .height_loop + RET + +%macro PIXEL_AVG_W18 0 +cglobal pixel_avg2_w18, 6,7 + sub r4, r2 +.height_loop: + movu m0, [r2+ 0] + movd xm2, [r2+32] +%if mmsize == 32 + pavgw m0, [r2+r4+ 0] + movd xm1, [r2+r4+32] + pavgw xm2, xm1 +%else + movu m1, [r2+16] + movu m3, [r2+r4+ 0] + movu m4, [r2+r4+16] + 
movd m5, [r2+r4+32] + pavgw m0, m3 + pavgw m1, m4 + pavgw m2, m5 + mova [r0+16], m1 +%endif + mova [r0+ 0], m0 + movd [r0+32], xm2 + lea r2, [r2+r3*2] + lea r0, [r0+r1*2] + dec r5d + jg .height_loop + RET +%endmacro + +INIT_XMM sse2 +PIXEL_AVG_W18 +INIT_YMM avx2 +PIXEL_AVG_W18 + +%endif ; HIGH_BIT_DEPTH + +%if HIGH_BIT_DEPTH == 0 +;----------------------------------------------------------------------------- +; void pixel_avg2_w4( uint8_t *dst, intptr_t dst_stride, +; uint8_t *src1, intptr_t src_stride, +; uint8_t *src2, int height ); +;----------------------------------------------------------------------------- +%macro AVG2_W8 2 +cglobal pixel_avg2_w%1_mmx2, 6,7 + sub r4, r2 + lea r6, [r4+r3] +.height_loop: + %2 mm0, [r2] + %2 mm1, [r2+r3] + pavgb mm0, [r2+r4] + pavgb mm1, [r2+r6] + lea r2, [r2+r3*2] + %2 [r0], mm0 + %2 [r0+r1], mm1 + lea r0, [r0+r1*2] + sub r5d, 2 + jg .height_loop + RET +%endmacro + +INIT_MMX +AVG2_W8 4, movd +AVG2_W8 8, movq + +%macro AVG2_W16 2 +cglobal pixel_avg2_w%1_mmx2, 6,7 + sub r2, r4 + lea r6, [r2+r3] +.height_loop: + movq mm0, [r4] + %2 mm1, [r4+8] + movq mm2, [r4+r3] + %2 mm3, [r4+r3+8] + pavgb mm0, [r4+r2] + pavgb mm1, [r4+r2+8] + pavgb mm2, [r4+r6] + pavgb mm3, [r4+r6+8] + lea r4, [r4+r3*2] + movq [r0], mm0 + %2 [r0+8], mm1 + movq [r0+r1], mm2 + %2 [r0+r1+8], mm3 + lea r0, [r0+r1*2] + sub r5d, 2 + jg .height_loop + RET +%endmacro + +AVG2_W16 12, movd +AVG2_W16 16, movq + +cglobal pixel_avg2_w20_mmx2, 6,7 + sub r2, r4 + lea r6, [r2+r3] +.height_loop: + movq mm0, [r4] + movq mm1, [r4+8] + movd mm2, [r4+16] + movq mm3, [r4+r3] + movq mm4, [r4+r3+8] + movd mm5, [r4+r3+16] + pavgb mm0, [r4+r2] + pavgb mm1, [r4+r2+8] + pavgb mm2, [r4+r2+16] + pavgb mm3, [r4+r6] + pavgb mm4, [r4+r6+8] + pavgb mm5, [r4+r6+16] + lea r4, [r4+r3*2] + movq [r0], mm0 + movq [r0+8], mm1 + movd [r0+16], mm2 + movq [r0+r1], mm3 + movq [r0+r1+8], mm4 + movd [r0+r1+16], mm5 + lea r0, [r0+r1*2] + sub r5d, 2 + jg .height_loop + RET + +INIT_XMM +cglobal 
pixel_avg2_w16_sse2, 6,7 + sub r4, r2 + lea r6, [r4+r3] +.height_loop: + movu m0, [r2] + movu m2, [r2+r3] + movu m1, [r2+r4] + movu m3, [r2+r6] + lea r2, [r2+r3*2] + pavgb m0, m1 + pavgb m2, m3 + mova [r0], m0 + mova [r0+r1], m2 + lea r0, [r0+r1*2] + sub r5d, 2 + jg .height_loop + RET + +cglobal pixel_avg2_w20_sse2, 6,7 + sub r2, r4 + lea r6, [r2+r3] +.height_loop: + movu m0, [r4] + movu m2, [r4+r3] + movu m1, [r4+r2] + movu m3, [r4+r6] + movd mm4, [r4+16] + movd mm5, [r4+r3+16] + pavgb m0, m1 + pavgb m2, m3 + pavgb mm4, [r4+r2+16] + pavgb mm5, [r4+r6+16] + lea r4, [r4+r3*2] + mova [r0], m0 + mova [r0+r1], m2 + movd [r0+16], mm4 + movd [r0+r1+16], mm5 + lea r0, [r0+r1*2] + sub r5d, 2 + jg .height_loop + RET + +INIT_YMM avx2 +cglobal pixel_avg2_w20, 6,7 + sub r2, r4 + lea r6, [r2+r3] +.height_loop: + movu m0, [r4] + movu m1, [r4+r3] + pavgb m0, [r4+r2] + pavgb m1, [r4+r6] + lea r4, [r4+r3*2] + mova [r0], m0 + mova [r0+r1], m1 + lea r0, [r0+r1*2] + sub r5d, 2 + jg .height_loop + RET + +; Cacheline split code for processors with high latencies for loads +; split over cache lines. See sad-a.asm for a more detailed explanation. +; This particular instance is complicated by the fact that src1 and src2 +; can have different alignments. For simplicity and code size, only the +; MMX cacheline workaround is used. As a result, in the case of SSE2 +; pixel_avg, the cacheline check functions calls the SSE2 version if there +; is no cacheline split, and the MMX workaround if there is. 
+ +%macro INIT_SHIFT 2 + and eax, 7 + shl eax, 3 + movd %1, [sw_64] + movd %2, eax + psubw %1, %2 +%endmacro + +%macro AVG_CACHELINE_START 0 + %assign stack_offset 0 + INIT_SHIFT mm6, mm7 + mov eax, r4m + INIT_SHIFT mm4, mm5 + PROLOGUE 6,6 + and r2, ~7 + and r4, ~7 + sub r4, r2 +.height_loop: +%endmacro + +%macro AVG_CACHELINE_LOOP 2 + movq mm1, [r2+%1] + movq mm0, [r2+8+%1] + movq mm3, [r2+r4+%1] + movq mm2, [r2+r4+8+%1] + psrlq mm1, mm7 + psllq mm0, mm6 + psrlq mm3, mm5 + psllq mm2, mm4 + por mm0, mm1 + por mm2, mm3 + pavgb mm2, mm0 + %2 [r0+%1], mm2 +%endmacro + +%macro AVG_CACHELINE_FUNC 2 +pixel_avg2_w%1_cache_mmx2: + AVG_CACHELINE_START + AVG_CACHELINE_LOOP 0, movq +%if %1>8 + AVG_CACHELINE_LOOP 8, movq +%if %1>16 + AVG_CACHELINE_LOOP 16, movd +%endif +%endif + add r2, r3 + add r0, r1 + dec r5d + jg .height_loop + RET +%endmacro + +%macro AVG_CACHELINE_CHECK 3 ; width, cacheline, instruction set +%if %1 == 12 +;w12 isn't needed because w16 is just as fast if there's no cacheline split +%define cachesplit pixel_avg2_w16_cache_mmx2 +%else +%define cachesplit pixel_avg2_w%1_cache_mmx2 +%endif +cglobal pixel_avg2_w%1_cache%2_%3 + mov eax, r2m + and eax, %2-1 + cmp eax, (%2-%1-(%1 % 8)) +%if %1==12||%1==20 + jbe pixel_avg2_w%1_%3 +%else + jb pixel_avg2_w%1_%3 +%endif +%if 0 ; or %1==8 - but the extra branch seems too expensive + ja cachesplit +%if ARCH_X86_64 + test r4b, 1 +%else + test byte r4m, 1 +%endif + jz pixel_avg2_w%1_%3 +%else + or eax, r4m + and eax, 7 + jz pixel_avg2_w%1_%3 + mov eax, r2m +%endif +%if mmsize==16 || (%1==8 && %2==64) + AVG_CACHELINE_FUNC %1, %2 +%else + jmp cachesplit +%endif +%endmacro + +INIT_MMX +AVG_CACHELINE_CHECK 8, 64, mmx2 +AVG_CACHELINE_CHECK 12, 64, mmx2 +%if ARCH_X86_64 == 0 +AVG_CACHELINE_CHECK 16, 64, mmx2 +AVG_CACHELINE_CHECK 20, 64, mmx2 +AVG_CACHELINE_CHECK 8, 32, mmx2 +AVG_CACHELINE_CHECK 12, 32, mmx2 +AVG_CACHELINE_CHECK 16, 32, mmx2 +AVG_CACHELINE_CHECK 20, 32, mmx2 +%endif +INIT_XMM +AVG_CACHELINE_CHECK 16, 64, sse2 
+AVG_CACHELINE_CHECK 20, 64, sse2 + +; computed jump assumes this loop is exactly 48 bytes +%macro AVG16_CACHELINE_LOOP_SSSE3 2 ; alignment +ALIGN 16 +avg_w16_align%1_%2_ssse3: +%if %1==0 && %2==0 + movdqa xmm1, [r2] + pavgb xmm1, [r2+r4] + add r2, r3 +%elif %1==0 + movdqa xmm1, [r2+r4+16] + palignr xmm1, [r2+r4], %2 + pavgb xmm1, [r2] + add r2, r3 +%elif %2&15==0 + movdqa xmm1, [r2+16] + palignr xmm1, [r2], %1 + pavgb xmm1, [r2+r4] + add r2, r3 +%else + movdqa xmm1, [r2+16] + movdqa xmm2, [r2+r4+16] + palignr xmm1, [r2], %1 + palignr xmm2, [r2+r4], %2&15 + add r2, r3 + pavgb xmm1, xmm2 +%endif + movdqa [r0], xmm1 + add r0, r1 + dec r5d + jg avg_w16_align%1_%2_ssse3 + ret +%if %1==0 + ; make sure the first ones don't end up short + ALIGN 16 + times (48-($-avg_w16_align%1_%2_ssse3))>>4 nop +%endif +%endmacro + +cglobal pixel_avg2_w16_cache64_ssse3 +%if 0 ; seems both tests aren't worth it if src1%16==0 is optimized + mov eax, r2m + and eax, 0x3f + cmp eax, 0x30 + jb x265_pixel_avg2_w16_sse2 + or eax, r4m + and eax, 7 + jz x265_pixel_avg2_w16_sse2 +%endif + PROLOGUE 6, 8 + lea r6, [r4+r2] + and r4, ~0xf + and r6, 0x1f + and r2, ~0xf + lea r6, [r6*3] ;(offset + align*2)*3 + sub r4, r2 + shl r6, 4 ;jump = (offset + align*2)*48 +%define avg_w16_addr avg_w16_align1_1_ssse3-(avg_w16_align2_2_ssse3-avg_w16_align1_1_ssse3) +%ifdef PIC + lea r7, [avg_w16_addr] + add r6, r7 +%else + lea r6, [avg_w16_addr + r6] +%endif + TAIL_CALL r6, 1 + +%assign j 0 +%assign k 1 +%rep 16 +AVG16_CACHELINE_LOOP_SSSE3 j, j +AVG16_CACHELINE_LOOP_SSSE3 j, k +%assign j j+1 +%assign k k+1 +%endrep +%endif ; !HIGH_BIT_DEPTH + +;============================================================================= +; pixel copy +;============================================================================= + +%macro COPY1 2 + movu m0, [r2] + movu m1, [r2+r3] + movu m2, [r2+r3*2] + movu m3, [r2+%2] + mova [r0], m0 + mova [r0+r1], m1 + mova [r0+r1*2], m2 + mova [r0+%1], m3 +%endmacro + +%macro COPY2 2-4 0, 1 + 
movu m0, [r2+%3*mmsize] + movu m1, [r2+%4*mmsize] + movu m2, [r2+r3+%3*mmsize] + movu m3, [r2+r3+%4*mmsize] + mova [r0+%3*mmsize], m0 + mova [r0+%4*mmsize], m1 + mova [r0+r1+%3*mmsize], m2 + mova [r0+r1+%4*mmsize], m3 + movu m0, [r2+r3*2+%3*mmsize] + movu m1, [r2+r3*2+%4*mmsize] + movu m2, [r2+%2+%3*mmsize] + movu m3, [r2+%2+%4*mmsize] + mova [r0+r1*2+%3*mmsize], m0 + mova [r0+r1*2+%4*mmsize], m1 + mova [r0+%1+%3*mmsize], m2 + mova [r0+%1+%4*mmsize], m3 +%endmacro + +%macro COPY4 2 + COPY2 %1, %2, 0, 1 + COPY2 %1, %2, 2, 3 +%endmacro + +;----------------------------------------------------------------------------- +; void mc_copy_w4( uint8_t *dst, intptr_t i_dst_stride, +; uint8_t *src, intptr_t i_src_stride, int i_height ) +;----------------------------------------------------------------------------- +INIT_MMX +cglobal mc_copy_w4_mmx, 4,6 + FIX_STRIDES r1, r3 + cmp dword r4m, 4 + lea r5, [r3*3] + lea r4, [r1*3] + je .end +%if HIGH_BIT_DEPTH == 0 + %define mova movd + %define movu movd +%endif + COPY1 r4, r5 + lea r2, [r2+r3*4] + lea r0, [r0+r1*4] +.end: + COPY1 r4, r5 + RET + +%macro MC_COPY 1 +%assign %%w %1*SIZEOF_PIXEL/mmsize +%if %%w > 0 +cglobal mc_copy_w%1, 5,7 + FIX_STRIDES r1, r3 + lea r6, [r3*3] + lea r5, [r1*3] +.height_loop: + COPY %+ %%w r5, r6 + lea r2, [r2+r3*4] + lea r0, [r0+r1*4] + sub r4d, 4 + jg .height_loop + RET +%endif +%endmacro + +INIT_MMX mmx +MC_COPY 8 +MC_COPY 16 +INIT_XMM sse +MC_COPY 8 +MC_COPY 16 +INIT_XMM aligned, sse +MC_COPY 16 +%if HIGH_BIT_DEPTH +INIT_YMM avx +MC_COPY 16 +INIT_YMM aligned, avx +MC_COPY 16 +%endif + +;============================================================================= +; prefetch +;============================================================================= +; assumes 64 byte cachelines +; FIXME doesn't cover all pixels in high depth and/or 4:4:4 + +;----------------------------------------------------------------------------- +; void prefetch_fenc( pixel *pix_y, intptr_t stride_y, +; pixel *pix_uv, 
intptr_t stride_uv, int mb_x ) +;----------------------------------------------------------------------------- + +%macro PREFETCH_FENC 1 +%if ARCH_X86_64 +cglobal prefetch_fenc_%1, 5,5 + FIX_STRIDES r1, r3 + and r4d, 3 + mov eax, r4d + imul r4d, r1d + lea r0, [r0+r4*4+64*SIZEOF_PIXEL] + prefetcht0 [r0] + prefetcht0 [r0+r1] + lea r0, [r0+r1*2] + prefetcht0 [r0] + prefetcht0 [r0+r1] + + imul eax, r3d + lea r2, [r2+rax*2+64*SIZEOF_PIXEL] + prefetcht0 [r2] + prefetcht0 [r2+r3] +%ifidn %1, 422 + lea r2, [r2+r3*2] + prefetcht0 [r2] + prefetcht0 [r2+r3] +%endif + RET + +%else +cglobal prefetch_fenc_%1, 0,3 + mov r2, r4m + mov r1, r1m + mov r0, r0m + FIX_STRIDES r1 + and r2, 3 + imul r2, r1 + lea r0, [r0+r2*4+64*SIZEOF_PIXEL] + prefetcht0 [r0] + prefetcht0 [r0+r1] + lea r0, [r0+r1*2] + prefetcht0 [r0] + prefetcht0 [r0+r1] + + mov r2, r4m + mov r1, r3m + mov r0, r2m + FIX_STRIDES r1 + and r2, 3 + imul r2, r1 + lea r0, [r0+r2*2+64*SIZEOF_PIXEL] + prefetcht0 [r0] + prefetcht0 [r0+r1] +%ifidn %1, 422 + lea r0, [r0+r1*2] + prefetcht0 [r0] + prefetcht0 [r0+r1] +%endif + ret +%endif ; ARCH_X86_64 +%endmacro + +INIT_MMX mmx2 +PREFETCH_FENC 420 +PREFETCH_FENC 422 + +;----------------------------------------------------------------------------- +; void prefetch_ref( pixel *pix, intptr_t stride, int parity ) +;----------------------------------------------------------------------------- +INIT_MMX mmx2 +cglobal prefetch_ref, 3,3 + FIX_STRIDES r1 + dec r2d + and r2d, r1d + lea r0, [r0+r2*8+64*SIZEOF_PIXEL] + lea r2, [r1*3] + prefetcht0 [r0] + prefetcht0 [r0+r1] + prefetcht0 [r0+r1*2] + prefetcht0 [r0+r2] + lea r0, [r0+r1*4] + prefetcht0 [r0] + prefetcht0 [r0+r1] + prefetcht0 [r0+r1*2] + prefetcht0 [r0+r2] + RET diff --git a/source/common/x86/mc-a2.asm b/source/common/x86/mc-a2.asm new file mode 100644 index 0000000..e8541c1 --- /dev/null +++ b/source/common/x86/mc-a2.asm @@ -0,0 +1,1133 @@ +;***************************************************************************** +;* mc-a2.asm: 
x86 motion compensation +;***************************************************************************** +;* Copyright (C) 2005-2013 x264 project +;* +;* Authors: Loren Merritt +;* Fiona Glaser +;* Holger Lubitz +;* Mathieu Monnier +;* Oskar Arvidsson +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +;* +;* This program is also available under a commercial proprietary license. +;* For more information, contact us at license @ x265.com. +;***************************************************************************** + +%include "x86inc.asm" +%include "x86util.asm" + +SECTION_RODATA 32 + +deinterleave_shuf: times 2 db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15 + +%if HIGH_BIT_DEPTH +deinterleave_shuf32a: SHUFFLE_MASK_W 0,2,4,6,8,10,12,14 +deinterleave_shuf32b: SHUFFLE_MASK_W 1,3,5,7,9,11,13,15 +%else +deinterleave_shuf32a: db 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30 +deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31 +%endif +pw_1024: times 16 dw 1024 + +pd_16: times 4 dd 16 +pd_0f: times 4 dd 0xffff +pf_inv256: times 8 dd 0.00390625 + +SECTION .text + +cextern pb_0 +cextern pw_1 +cextern pw_16 +cextern pw_32 +cextern pw_512 +cextern pw_00ff +cextern pw_3fff +cextern pw_pixel_max +cextern pd_ffff + +;The hpel_filter routines use non-temporal writes for output. 
+;The following defines may be uncommented for testing. +;Doing the hpel_filter temporal may be a win if the last level cache +;is big enough (preliminary benching suggests on the order of 4* framesize). + +;%define movntq movq +;%define movntps movaps +;%define sfence + +%if HIGH_BIT_DEPTH == 0 +%undef movntq +%undef movntps +%undef sfence +%endif ; !HIGH_BIT_DEPTH + +;----------------------------------------------------------------------------- +; void plane_copy_core( pixel *dst, intptr_t i_dst, +; pixel *src, intptr_t i_src, int w, int h ) +;----------------------------------------------------------------------------- +; assumes i_dst and w are multiples of 16, and i_dst>w +INIT_MMX +cglobal plane_copy_core_mmx2, 6,7 + FIX_STRIDES r1, r3, r4d +%if HIGH_BIT_DEPTH == 0 + movsxdifnidn r4, r4d +%endif + sub r1, r4 + sub r3, r4 +.loopy: + lea r6d, [r4-63] +.loopx: + prefetchnta [r2+256] + movq m0, [r2 ] + movq m1, [r2+ 8] + movntq [r0 ], m0 + movntq [r0+ 8], m1 + movq m2, [r2+16] + movq m3, [r2+24] + movntq [r0+16], m2 + movntq [r0+24], m3 + movq m4, [r2+32] + movq m5, [r2+40] + movntq [r0+32], m4 + movntq [r0+40], m5 + movq m6, [r2+48] + movq m7, [r2+56] + movntq [r0+48], m6 + movntq [r0+56], m7 + add r2, 64 + add r0, 64 + sub r6d, 64 + jg .loopx + prefetchnta [r2+256] + add r6d, 63 + jle .end16 +.loop16: + movq m0, [r2 ] + movq m1, [r2+8] + movntq [r0 ], m0 + movntq [r0+8], m1 + add r2, 16 + add r0, 16 + sub r6d, 16 + jg .loop16 +.end16: + add r0, r1 + add r2, r3 + dec r5d + jg .loopy + sfence + emms + RET + + +%macro INTERLEAVE 4-5 ; dst, srcu, srcv, is_aligned, nt_hint +%if HIGH_BIT_DEPTH +%assign x 0 +%rep 16/mmsize + mov%4 m0, [%2+(x/2)*mmsize] + mov%4 m1, [%3+(x/2)*mmsize] + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + mov%5a [%1+(x+0)*mmsize], m0 + mov%5a [%1+(x+1)*mmsize], m2 + %assign x (x+2) +%endrep +%else + movq m0, [%2] +%if mmsize==16 +%ifidn %4, a + punpcklbw m0, [%3] +%else + movq m1, [%3] + punpcklbw m0, m1 +%endif + mov%5a [%1], m0 +%else + movq m1, 
[%3] + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + mov%5a [%1+0], m0 + mov%5a [%1+8], m2 +%endif +%endif ; HIGH_BIT_DEPTH +%endmacro + +%macro DEINTERLEAVE 6 ; dstu, dstv, src, dstv==dstu+8, shuffle constant, is aligned +%if HIGH_BIT_DEPTH +%assign n 0 +%rep 16/mmsize + mova m0, [%3+(n+0)*mmsize] + mova m1, [%3+(n+1)*mmsize] + psrld m2, m0, 16 + psrld m3, m1, 16 + pand m0, %5 + pand m1, %5 + packssdw m0, m1 + packssdw m2, m3 + mov%6 [%1+(n/2)*mmsize], m0 + mov%6 [%2+(n/2)*mmsize], m2 + %assign n (n+2) +%endrep +%else ; !HIGH_BIT_DEPTH +%if mmsize==16 + mova m0, [%3] +%if cpuflag(ssse3) + pshufb m0, %5 +%else + mova m1, m0 + pand m0, %5 + psrlw m1, 8 + packuswb m0, m1 +%endif +%if %4 + mova [%1], m0 +%else + movq [%1], m0 + movhps [%2], m0 +%endif +%else + mova m0, [%3] + mova m1, [%3+8] + mova m2, m0 + mova m3, m1 + pand m0, %5 + pand m1, %5 + psrlw m2, 8 + psrlw m3, 8 + packuswb m0, m1 + packuswb m2, m3 + mova [%1], m0 + mova [%2], m2 +%endif ; mmsize == 16 +%endif ; HIGH_BIT_DEPTH +%endmacro + +%macro PLANE_INTERLEAVE 0 +;----------------------------------------------------------------------------- +; void plane_copy_interleave_core( uint8_t *dst, intptr_t i_dst, +; uint8_t *srcu, intptr_t i_srcu, +; uint8_t *srcv, intptr_t i_srcv, int w, int h ) +;----------------------------------------------------------------------------- +; assumes i_dst and w are multiples of 16, and i_dst>2*w +cglobal plane_copy_interleave_core, 6,9 + mov r6d, r6m +%if HIGH_BIT_DEPTH + FIX_STRIDES r1, r3, r5, r6d + movifnidn r1mp, r1 + movifnidn r3mp, r3 + mov r6m, r6d +%endif + lea r0, [r0+r6*2] + add r2, r6 + add r4, r6 +%if ARCH_X86_64 + DECLARE_REG_TMP 7,8 +%else + DECLARE_REG_TMP 1,3 +%endif + mov t1, r1 + shr t1, SIZEOF_PIXEL + sub t1, r6 + mov t0d, r7m +.loopy: + mov r6d, r6m + neg r6 +.prefetch: + prefetchnta [r2+r6] + prefetchnta [r4+r6] + add r6, 64 + jl .prefetch + mov r6d, r6m + neg r6 +.loopx: + INTERLEAVE r0+r6*2+ 0*SIZEOF_PIXEL, r2+r6+0*SIZEOF_PIXEL, r4+r6+0*SIZEOF_PIXEL, u, nt 
+ INTERLEAVE r0+r6*2+16*SIZEOF_PIXEL, r2+r6+8*SIZEOF_PIXEL, r4+r6+8*SIZEOF_PIXEL, u, nt + add r6, 16*SIZEOF_PIXEL + jl .loopx +.pad: +%assign n 0 +%rep SIZEOF_PIXEL +%if mmsize==8 + movntq [r0+r6*2+(n+ 0)], m0 + movntq [r0+r6*2+(n+ 8)], m0 + movntq [r0+r6*2+(n+16)], m0 + movntq [r0+r6*2+(n+24)], m0 +%else + movntdq [r0+r6*2+(n+ 0)], m0 + movntdq [r0+r6*2+(n+16)], m0 +%endif + %assign n n+32 +%endrep + add r6, 16*SIZEOF_PIXEL + cmp r6, t1 + jl .pad + add r0, r1mp + add r2, r3mp + add r4, r5 + dec t0d + jg .loopy + sfence + emms + RET + +;----------------------------------------------------------------------------- +; void store_interleave_chroma( uint8_t *dst, intptr_t i_dst, uint8_t *srcu, uint8_t *srcv, int height ) +;----------------------------------------------------------------------------- +cglobal store_interleave_chroma, 5,5 + FIX_STRIDES r1 +.loop: + INTERLEAVE r0+ 0, r2+ 0, r3+ 0, a + INTERLEAVE r0+r1, r2+FDEC_STRIDEB, r3+FDEC_STRIDEB, a + add r2, FDEC_STRIDEB*2 + add r3, FDEC_STRIDEB*2 + lea r0, [r0+r1*2] + sub r4d, 2 + jg .loop + RET +%endmacro ; PLANE_INTERLEAVE + +%macro DEINTERLEAVE_START 0 +%if HIGH_BIT_DEPTH + mova m4, [pd_ffff] +%elif cpuflag(ssse3) + mova m4, [deinterleave_shuf] +%else + mova m4, [pw_00ff] +%endif ; HIGH_BIT_DEPTH +%endmacro + +%macro PLANE_DEINTERLEAVE 0 +;----------------------------------------------------------------------------- +; void plane_copy_deinterleave( pixel *dstu, intptr_t i_dstu, +; pixel *dstv, intptr_t i_dstv, +; pixel *src, intptr_t i_src, int w, int h ) +;----------------------------------------------------------------------------- +cglobal plane_copy_deinterleave, 6,7 + DEINTERLEAVE_START + mov r6d, r6m + FIX_STRIDES r1, r3, r5, r6d +%if HIGH_BIT_DEPTH + mov r6m, r6d +%endif + add r0, r6 + add r2, r6 + lea r4, [r4+r6*2] +.loopy: + mov r6d, r6m + neg r6 +.loopx: + DEINTERLEAVE r0+r6+0*SIZEOF_PIXEL, r2+r6+0*SIZEOF_PIXEL, r4+r6*2+ 0*SIZEOF_PIXEL, 0, m4, u + DEINTERLEAVE r0+r6+8*SIZEOF_PIXEL, 
r2+r6+8*SIZEOF_PIXEL, r4+r6*2+16*SIZEOF_PIXEL, 0, m4, u + add r6, 16*SIZEOF_PIXEL + jl .loopx + add r0, r1 + add r2, r3 + add r4, r5 + dec dword r7m + jg .loopy + RET + +;----------------------------------------------------------------------------- +; void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, intptr_t i_src, int height ) +;----------------------------------------------------------------------------- +cglobal load_deinterleave_chroma_fenc, 4,4 + DEINTERLEAVE_START + FIX_STRIDES r2 +.loop: + DEINTERLEAVE r0+ 0, r0+FENC_STRIDEB*1/2, r1+ 0, 1, m4, a + DEINTERLEAVE r0+FENC_STRIDEB, r0+FENC_STRIDEB*3/2, r1+r2, 1, m4, a + add r0, FENC_STRIDEB*2 + lea r1, [r1+r2*2] + sub r3d, 2 + jg .loop + RET + +;----------------------------------------------------------------------------- +; void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, intptr_t i_src, int height ) +;----------------------------------------------------------------------------- +cglobal load_deinterleave_chroma_fdec, 4,4 + DEINTERLEAVE_START + FIX_STRIDES r2 +.loop: + DEINTERLEAVE r0+ 0, r0+FDEC_STRIDEB*1/2, r1+ 0, 0, m4, a + DEINTERLEAVE r0+FDEC_STRIDEB, r0+FDEC_STRIDEB*3/2, r1+r2, 0, m4, a + add r0, FDEC_STRIDEB*2 + lea r1, [r1+r2*2] + sub r3d, 2 + jg .loop + RET +%endmacro ; PLANE_DEINTERLEAVE + +%if HIGH_BIT_DEPTH +INIT_MMX mmx2 +PLANE_INTERLEAVE +INIT_MMX mmx +PLANE_DEINTERLEAVE +INIT_XMM sse2 +PLANE_INTERLEAVE +PLANE_DEINTERLEAVE +INIT_XMM avx +PLANE_INTERLEAVE +PLANE_DEINTERLEAVE +%else +INIT_MMX mmx2 +PLANE_INTERLEAVE +INIT_MMX mmx +PLANE_DEINTERLEAVE +INIT_XMM sse2 +PLANE_INTERLEAVE +PLANE_DEINTERLEAVE +INIT_XMM ssse3 +PLANE_DEINTERLEAVE +%endif + +; These functions are not general-use; not only do the SSE ones require aligned input, +; but they also will fail if given a non-mod16 size. +; memzero SSE will fail for non-mod128. 
+ +;----------------------------------------------------------------------------- +; void *memcpy_aligned( void *dst, const void *src, size_t n ); +;----------------------------------------------------------------------------- +%macro MEMCPY 0 +cglobal memcpy_aligned, 3,3 +%if mmsize == 16 + test r2d, 16 + jz .copy2 + mova m0, [r1+r2-16] + mova [r0+r2-16], m0 + sub r2d, 16 +.copy2: +%endif + test r2d, 2*mmsize + jz .copy4start + mova m0, [r1+r2-1*mmsize] + mova m1, [r1+r2-2*mmsize] + mova [r0+r2-1*mmsize], m0 + mova [r0+r2-2*mmsize], m1 + sub r2d, 2*mmsize +.copy4start: + test r2d, r2d + jz .ret +.copy4: + mova m0, [r1+r2-1*mmsize] + mova m1, [r1+r2-2*mmsize] + mova m2, [r1+r2-3*mmsize] + mova m3, [r1+r2-4*mmsize] + mova [r0+r2-1*mmsize], m0 + mova [r0+r2-2*mmsize], m1 + mova [r0+r2-3*mmsize], m2 + mova [r0+r2-4*mmsize], m3 + sub r2d, 4*mmsize + jg .copy4 +.ret: + REP_RET +%endmacro + +INIT_MMX mmx +MEMCPY +INIT_XMM sse +MEMCPY + +;----------------------------------------------------------------------------- +; void *memzero_aligned( void *dst, size_t n ); +;----------------------------------------------------------------------------- +%macro MEMZERO 1 +cglobal memzero_aligned, 2,2 + add r0, r1 + neg r1 +%if mmsize == 8 + pxor m0, m0 +%else + xorps m0, m0 +%endif +.loop: +%assign i 0 +%rep %1 + mova [r0 + r1 + i], m0 +%assign i i+mmsize +%endrep + add r1, mmsize*%1 + jl .loop + RET +%endmacro + +INIT_MMX mmx +MEMZERO 8 +INIT_XMM sse +MEMZERO 8 +INIT_YMM avx +MEMZERO 4 + +%if HIGH_BIT_DEPTH == 0 +;----------------------------------------------------------------------------- +; void integral_init4h( uint16_t *sum, uint8_t *pix, intptr_t stride ) +;----------------------------------------------------------------------------- +%macro INTEGRAL_INIT4H 0 +cglobal integral_init4h, 3,4 + lea r3, [r0+r2*2] + add r1, r2 + neg r2 + pxor m4, m4 +.loop: + mova m0, [r1+r2] +%if mmsize==32 + movu m1, [r1+r2+8] +%else + mova m1, [r1+r2+16] + palignr m1, m0, 8 +%endif + mpsadbw m0, 
m4, 0 + mpsadbw m1, m4, 0 + paddw m0, [r0+r2*2] + paddw m1, [r0+r2*2+mmsize] + mova [r3+r2*2 ], m0 + mova [r3+r2*2+mmsize], m1 + add r2, mmsize + jl .loop + RET +%endmacro + +INIT_XMM sse4 +INTEGRAL_INIT4H +INIT_YMM avx2 +INTEGRAL_INIT4H + +%macro INTEGRAL_INIT8H 0 +cglobal integral_init8h, 3,4 + lea r3, [r0+r2*2] + add r1, r2 + neg r2 + pxor m4, m4 +.loop: + mova m0, [r1+r2] +%if mmsize==32 + movu m1, [r1+r2+8] + mpsadbw m2, m0, m4, 100100b + mpsadbw m3, m1, m4, 100100b +%else + mova m1, [r1+r2+16] + palignr m1, m0, 8 + mpsadbw m2, m0, m4, 100b + mpsadbw m3, m1, m4, 100b +%endif + mpsadbw m0, m4, 0 + mpsadbw m1, m4, 0 + paddw m0, [r0+r2*2] + paddw m1, [r0+r2*2+mmsize] + paddw m0, m2 + paddw m1, m3 + mova [r3+r2*2 ], m0 + mova [r3+r2*2+mmsize], m1 + add r2, mmsize + jl .loop + RET +%endmacro + +INIT_XMM sse4 +INTEGRAL_INIT8H +INIT_XMM avx +INTEGRAL_INIT8H +INIT_YMM avx2 +INTEGRAL_INIT8H +%endif ; !HIGH_BIT_DEPTH + +%macro INTEGRAL_INIT_8V 0 +;----------------------------------------------------------------------------- +; void integral_init8v( uint16_t *sum8, intptr_t stride ) +;----------------------------------------------------------------------------- +cglobal integral_init8v, 3,3 + add r1, r1 + add r0, r1 + lea r2, [r0+r1*8] + neg r1 +.loop: + mova m0, [r2+r1] + mova m1, [r2+r1+mmsize] + psubw m0, [r0+r1] + psubw m1, [r0+r1+mmsize] + mova [r0+r1], m0 + mova [r0+r1+mmsize], m1 + add r1, 2*mmsize + jl .loop + RET +%endmacro + +INIT_MMX mmx +INTEGRAL_INIT_8V +INIT_XMM sse2 +INTEGRAL_INIT_8V +INIT_YMM avx2 +INTEGRAL_INIT_8V + +;----------------------------------------------------------------------------- +; void integral_init4v( uint16_t *sum8, uint16_t *sum4, intptr_t stride ) +;----------------------------------------------------------------------------- +INIT_MMX mmx +cglobal integral_init4v, 3,5 + shl r2, 1 + lea r3, [r0+r2*4] + lea r4, [r0+r2*8] + mova m0, [r0+r2] + mova m4, [r4+r2] +.loop: + mova m1, m4 + psubw m1, m0 + mova m4, [r4+r2-8] + mova m0, 
[r0+r2-8] + paddw m1, m4 + mova m3, [r3+r2-8] + psubw m1, m0 + psubw m3, m0 + mova [r0+r2-8], m1 + mova [r1+r2-8], m3 + sub r2, 8 + jge .loop + RET + +INIT_XMM sse2 +cglobal integral_init4v, 3,5 + shl r2, 1 + add r0, r2 + add r1, r2 + lea r3, [r0+r2*4] + lea r4, [r0+r2*8] + neg r2 +.loop: + mova m0, [r0+r2] + mova m1, [r4+r2] + mova m2, m0 + mova m4, m1 + shufpd m0, [r0+r2+16], 1 + shufpd m1, [r4+r2+16], 1 + paddw m0, m2 + paddw m1, m4 + mova m3, [r3+r2] + psubw m1, m0 + psubw m3, m2 + mova [r0+r2], m1 + mova [r1+r2], m3 + add r2, 16 + jl .loop + RET + +INIT_XMM ssse3 +cglobal integral_init4v, 3,5 + shl r2, 1 + add r0, r2 + add r1, r2 + lea r3, [r0+r2*4] + lea r4, [r0+r2*8] + neg r2 +.loop: + mova m2, [r0+r2] + mova m0, [r0+r2+16] + mova m4, [r4+r2] + mova m1, [r4+r2+16] + palignr m0, m2, 8 + palignr m1, m4, 8 + paddw m0, m2 + paddw m1, m4 + mova m3, [r3+r2] + psubw m1, m0 + psubw m3, m2 + mova [r0+r2], m1 + mova [r1+r2], m3 + add r2, 16 + jl .loop + RET + +INIT_YMM avx2 +cglobal integral_init4v, 3,5 + add r2, r2 + add r0, r2 + add r1, r2 + lea r3, [r0+r2*4] + lea r4, [r0+r2*8] + neg r2 +.loop: + mova m2, [r0+r2] + movu m1, [r4+r2+8] + paddw m0, m2, [r0+r2+8] + paddw m1, [r4+r2] + mova m3, [r3+r2] + psubw m1, m0 + psubw m3, m2 + mova [r0+r2], m1 + mova [r1+r2], m3 + add r2, 32 + jl .loop + RET + +%macro FILT8x4 7 + mova %3, [r0+%7] + mova %4, [r0+r5+%7] + pavgb %3, %4 + pavgb %4, [r0+r5*2+%7] + PALIGNR %1, %3, 1, m6 + PALIGNR %2, %4, 1, m6 +%if cpuflag(xop) + pavgb %1, %3 + pavgb %2, %4 +%else + pavgb %1, %3 + pavgb %2, %4 + psrlw %5, %1, 8 + psrlw %6, %2, 8 + pand %1, m7 + pand %2, m7 +%endif +%endmacro + +%macro FILT32x4U 4 + mova m1, [r0+r5] + pavgb m0, m1, [r0] + movu m3, [r0+r5+1] + pavgb m2, m3, [r0+1] + pavgb m1, [r0+r5*2] + pavgb m3, [r0+r5*2+1] + pavgb m0, m2 + pavgb m1, m3 + + mova m3, [r0+r5+mmsize] + pavgb m2, m3, [r0+mmsize] + movu m5, [r0+r5+1+mmsize] + pavgb m4, m5, [r0+1+mmsize] + pavgb m3, [r0+r5*2+mmsize] + pavgb m5, [r0+r5*2+1+mmsize] + pavgb m2, 
m4 + pavgb m3, m5 + + pshufb m0, m7 + pshufb m1, m7 + pshufb m2, m7 + pshufb m3, m7 + punpckhqdq m4, m0, m2 + punpcklqdq m0, m0, m2 + punpckhqdq m5, m1, m3 + punpcklqdq m2, m1, m3 + vpermq m0, m0, q3120 + vpermq m1, m4, q3120 + vpermq m2, m2, q3120 + vpermq m3, m5, q3120 + mova [%1], m0 + mova [%2], m1 + mova [%3], m2 + mova [%4], m3 +%endmacro + +%macro FILT16x2 4 + mova m3, [r0+%4+mmsize] + mova m2, [r0+%4] + pavgb m3, [r0+%4+r5+mmsize] + pavgb m2, [r0+%4+r5] + PALIGNR %1, m3, 1, m6 + pavgb %1, m3 + PALIGNR m3, m2, 1, m6 + pavgb m3, m2 +%if cpuflag(xop) + vpperm m5, m3, %1, m7 + vpperm m3, m3, %1, m6 +%else + psrlw m5, m3, 8 + psrlw m4, %1, 8 + pand m3, m7 + pand %1, m7 + packuswb m3, %1 + packuswb m5, m4 +%endif + mova [%2], m3 + mova [%3], m5 + mova %1, m2 +%endmacro + +%macro FILT8x2U 3 + mova m3, [r0+%3+8] + mova m2, [r0+%3] + pavgb m3, [r0+%3+r5+8] + pavgb m2, [r0+%3+r5] + mova m1, [r0+%3+9] + mova m0, [r0+%3+1] + pavgb m1, [r0+%3+r5+9] + pavgb m0, [r0+%3+r5+1] + pavgb m1, m3 + pavgb m0, m2 + psrlw m3, m1, 8 + psrlw m2, m0, 8 + pand m1, m7 + pand m0, m7 + packuswb m0, m1 + packuswb m2, m3 + mova [%1], m0 + mova [%2], m2 +%endmacro + +%macro FILT8xU 3 + mova m3, [r0+%3+8] + mova m2, [r0+%3] + pavgw m3, [r0+%3+r5+8] + pavgw m2, [r0+%3+r5] + movu m1, [r0+%3+10] + movu m0, [r0+%3+2] + pavgw m1, [r0+%3+r5+10] + pavgw m0, [r0+%3+r5+2] + pavgw m1, m3 + pavgw m0, m2 + psrld m3, m1, 16 + psrld m2, m0, 16 + pand m1, m7 + pand m0, m7 + packssdw m0, m1 + packssdw m2, m3 + movu [%1], m0 + mova [%2], m2 +%endmacro + +%macro FILT8xA 4 + mova m3, [r0+%4+mmsize] + mova m2, [r0+%4] + pavgw m3, [r0+%4+r5+mmsize] + pavgw m2, [r0+%4+r5] + PALIGNR %1, m3, 2, m6 + pavgw %1, m3 + PALIGNR m3, m2, 2, m6 + pavgw m3, m2 +%if cpuflag(xop) + vpperm m5, m3, %1, m7 + vpperm m3, m3, %1, m6 +%else + psrld m5, m3, 16 + psrld m4, %1, 16 + pand m3, m7 + pand %1, m7 + packssdw m3, %1 + packssdw m5, m4 +%endif + mova [%2], m3 + mova [%3], m5 + mova %1, m2 +%endmacro + 
+;----------------------------------------------------------------------------- +; void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, +; intptr_t src_stride, intptr_t dst_stride, int width, int height ) +;----------------------------------------------------------------------------- +%macro FRAME_INIT_LOWRES 0 +cglobal frame_init_lowres_core, 6,7,(12-4*(BIT_DEPTH/9)) ; 8 for HIGH_BIT_DEPTH, 12 otherwise +%if HIGH_BIT_DEPTH + shl dword r6m, 1 + FIX_STRIDES r5 + shl dword r7m, 1 +%endif +%if mmsize >= 16 + add dword r7m, mmsize-1 + and dword r7m, ~(mmsize-1) +%endif + ; src += 2*(height-1)*stride + 2*width + mov r6d, r8m + dec r6d + imul r6d, r5d + add r6d, r7m + lea r0, [r0+r6*2] + ; dst += (height-1)*stride + width + mov r6d, r8m + dec r6d + imul r6d, r6m + add r6d, r7m + add r1, r6 + add r2, r6 + add r3, r6 + add r4, r6 + ; gap = stride - width + mov r6d, r6m + sub r6d, r7m + PUSH r6 + %define dst_gap [rsp+gprsize] + mov r6d, r5d + sub r6d, r7m + shl r6d, 1 + PUSH r6 + %define src_gap [rsp] +%if HIGH_BIT_DEPTH +%if cpuflag(xop) + mova m6, [deinterleave_shuf32a] + mova m7, [deinterleave_shuf32b] +%else + pcmpeqw m7, m7 + psrld m7, 16 +%endif +.vloop: + mov r6d, r7m +%ifnidn cpuname, mmx2 + mova m0, [r0] + mova m1, [r0+r5] + pavgw m0, m1 + pavgw m1, [r0+r5*2] +%endif +.hloop: + sub r0, mmsize*2 + sub r1, mmsize + sub r2, mmsize + sub r3, mmsize + sub r4, mmsize +%ifidn cpuname, mmx2 + FILT8xU r1, r2, 0 + FILT8xU r3, r4, r5 +%else + FILT8xA m0, r1, r2, 0 + FILT8xA m1, r3, r4, r5 +%endif + sub r6d, mmsize + jg .hloop +%else ; !HIGH_BIT_DEPTH +%if cpuflag(avx2) + mova m7, [deinterleave_shuf] +%elif cpuflag(xop) + mova m6, [deinterleave_shuf32a] + mova m7, [deinterleave_shuf32b] +%else + pcmpeqb m7, m7 + psrlw m7, 8 +%endif +.vloop: + mov r6d, r7m +%ifnidn cpuname, mmx2 +%if mmsize <= 16 + mova m0, [r0] + mova m1, [r0+r5] + pavgb m0, m1 + pavgb m1, [r0+r5*2] +%endif +%endif +.hloop: + sub r0, mmsize*2 + sub r1, mmsize 
+ sub r2, mmsize + sub r3, mmsize + sub r4, mmsize +%if mmsize==32 + FILT32x4U r1, r2, r3, r4 +%elifdef m8 + FILT8x4 m0, m1, m2, m3, m10, m11, mmsize + mova m8, m0 + mova m9, m1 + FILT8x4 m2, m3, m0, m1, m4, m5, 0 +%if cpuflag(xop) + vpperm m4, m2, m8, m7 + vpperm m2, m2, m8, m6 + vpperm m5, m3, m9, m7 + vpperm m3, m3, m9, m6 +%else + packuswb m2, m8 + packuswb m3, m9 + packuswb m4, m10 + packuswb m5, m11 +%endif + mova [r1], m2 + mova [r2], m4 + mova [r3], m3 + mova [r4], m5 +%elifidn cpuname, mmx2 + FILT8x2U r1, r2, 0 + FILT8x2U r3, r4, r5 +%else + FILT16x2 m0, r1, r2, 0 + FILT16x2 m1, r3, r4, r5 +%endif + sub r6d, mmsize + jg .hloop +%endif ; HIGH_BIT_DEPTH +.skip: + mov r6, dst_gap + sub r0, src_gap + sub r1, r6 + sub r2, r6 + sub r3, r6 + sub r4, r6 + dec dword r8m + jg .vloop + ADD rsp, 2*gprsize + emms + RET +%endmacro ; FRAME_INIT_LOWRES + +INIT_MMX mmx2 +FRAME_INIT_LOWRES +%if ARCH_X86_64 == 0 +INIT_MMX cache32, mmx2 +FRAME_INIT_LOWRES +%endif +INIT_XMM sse2 +FRAME_INIT_LOWRES +INIT_XMM ssse3 +FRAME_INIT_LOWRES +INIT_XMM avx +FRAME_INIT_LOWRES +INIT_XMM xop +FRAME_INIT_LOWRES +%if HIGH_BIT_DEPTH==0 +INIT_YMM avx2 +FRAME_INIT_LOWRES +%endif + +;----------------------------------------------------------------------------- +; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs, +; uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ) +;----------------------------------------------------------------------------- +%macro MBTREE 0 +cglobal mbtree_propagate_cost, 7,7,7 + add r6d, r6d + lea r0, [r0+r6*2] + add r1, r6 + add r2, r6 + add r3, r6 + add r4, r6 + neg r6 + pxor xmm4, xmm4 + movss xmm6, [r5] + shufps xmm6, xmm6, 0 + mulps xmm6, [pf_inv256] + movdqa xmm5, [pw_3fff] +.loop: + movq xmm2, [r2+r6] ; intra + movq xmm0, [r4+r6] ; invq + movq xmm3, [r3+r6] ; inter + movq xmm1, [r1+r6] ; prop + punpcklwd xmm2, xmm4 + punpcklwd xmm0, xmm4 + pmaddwd xmm0, xmm2 + pand xmm3, xmm5 + punpcklwd xmm1, xmm4 + punpcklwd 
xmm3, xmm4 +%if cpuflag(fma4) + cvtdq2ps xmm0, xmm0 + cvtdq2ps xmm1, xmm1 + fmaddps xmm0, xmm0, xmm6, xmm1 + cvtdq2ps xmm1, xmm2 + psubd xmm2, xmm3 + cvtdq2ps xmm2, xmm2 + rcpps xmm3, xmm1 + mulps xmm1, xmm3 + mulps xmm0, xmm2 + addps xmm2, xmm3, xmm3 + fnmaddps xmm3, xmm1, xmm3, xmm2 + mulps xmm0, xmm3 +%else + cvtdq2ps xmm0, xmm0 + mulps xmm0, xmm6 ; intra*invq*fps_factor>>8 + cvtdq2ps xmm1, xmm1 ; prop + addps xmm0, xmm1 ; prop + (intra*invq*fps_factor>>8) + cvtdq2ps xmm1, xmm2 ; intra + psubd xmm2, xmm3 ; intra - inter + cvtdq2ps xmm2, xmm2 ; intra - inter + rcpps xmm3, xmm1 ; 1 / intra 1st approximation + mulps xmm1, xmm3 ; intra * (1/intra 1st approx) + mulps xmm1, xmm3 ; intra * (1/intra 1st approx)^2 + mulps xmm0, xmm2 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter) + addps xmm3, xmm3 ; 2 * (1/intra 1st approx) + subps xmm3, xmm1 ; 2nd approximation for 1/intra + mulps xmm0, xmm3 ; / intra +%endif + cvtps2dq xmm0, xmm0 + movdqa [r0+r6*2], xmm0 + add r6, 8 + jl .loop + RET +%endmacro + +INIT_XMM sse2 +MBTREE +; Bulldozer only has a 128-bit float unit, so the AVX version of this function is actually slower. 
+INIT_XMM fma4 +MBTREE + +%macro INT16_UNPACK 1 + vpunpckhwd xm4, xm%1, xm7 + vpunpcklwd xm%1, xm7 + vinsertf128 m%1, m%1, xm4, 1 +%endmacro + +; FIXME: align loads/stores to 16 bytes +%macro MBTREE_AVX 0 +cglobal mbtree_propagate_cost, 7,7,8 + add r6d, r6d + lea r0, [r0+r6*2] + add r1, r6 + add r2, r6 + add r3, r6 + add r4, r6 + neg r6 + mova xm5, [pw_3fff] + vbroadcastss m6, [r5] + mulps m6, [pf_inv256] +%if notcpuflag(avx2) + pxor xm7, xm7 +%endif +.loop: +%if cpuflag(avx2) + pmovzxwd m0, [r2+r6] ; intra + pmovzxwd m1, [r4+r6] ; invq + pmovzxwd m2, [r1+r6] ; prop + pand xm3, xm5, [r3+r6] ; inter + pmovzxwd m3, xm3 + pmaddwd m1, m0 + psubd m4, m0, m3 + cvtdq2ps m0, m0 + cvtdq2ps m1, m1 + cvtdq2ps m2, m2 + cvtdq2ps m4, m4 + fmaddps m1, m1, m6, m2 + rcpps m3, m0 + mulps m2, m0, m3 + mulps m1, m4 + addps m4, m3, m3 + fnmaddps m4, m2, m3, m4 + mulps m1, m4 +%else + movu xm0, [r2+r6] + movu xm1, [r4+r6] + movu xm2, [r1+r6] + pand xm3, xm5, [r3+r6] + INT16_UNPACK 0 + INT16_UNPACK 1 + INT16_UNPACK 2 + INT16_UNPACK 3 + cvtdq2ps m0, m0 + cvtdq2ps m1, m1 + cvtdq2ps m2, m2 + cvtdq2ps m3, m3 + mulps m1, m0 + subps m4, m0, m3 + mulps m1, m6 ; intra*invq*fps_factor>>8 + addps m1, m2 ; prop + (intra*invq*fps_factor>>8) + rcpps m3, m0 ; 1 / intra 1st approximation + mulps m2, m0, m3 ; intra * (1/intra 1st approx) + mulps m2, m3 ; intra * (1/intra 1st approx)^2 + mulps m1, m4 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter) + addps m3, m3 ; 2 * (1/intra 1st approx) + subps m3, m2 ; 2nd approximation for 1/intra + mulps m1, m3 ; / intra +%endif + vcvtps2dq m1, m1 + movu [r0+r6*2], m1 + add r6, 16 + jl .loop + RET +%endmacro + +INIT_YMM avx +MBTREE_AVX +INIT_YMM avx2,fma3 +MBTREE_AVX diff --git a/source/common/x86/mc.h b/source/common/x86/mc.h new file mode 100644 index 0000000..95cb609 --- /dev/null +++ b/source/common/x86/mc.h @@ -0,0 +1,69 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * 
Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#ifndef X265_MC_H +#define X265_MC_H + +#define LOWRES(cpu) \ + void x265_frame_init_lowres_core_ ## cpu(pixel * src0, pixel * dst0, pixel * dsth, pixel * dstv, pixel * dstc, \ + intptr_t src_stride, intptr_t dst_stride, int width, int height); +LOWRES(mmx2) +LOWRES(sse2) +LOWRES(ssse3) +LOWRES(avx) +LOWRES(xop) + +#define DECL_SUF(func, args) \ + void func ## _mmx2 args; \ + void func ## _sse2 args; \ + void func ## _ssse3 args; +DECL_SUF(x265_pixel_avg_64x64, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int)) +DECL_SUF(x265_pixel_avg_64x48, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int)) +DECL_SUF(x265_pixel_avg_64x32, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int)) +DECL_SUF(x265_pixel_avg_64x16, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int)) +DECL_SUF(x265_pixel_avg_48x64, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int)) +DECL_SUF(x265_pixel_avg_32x64, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int)) 
+DECL_SUF(x265_pixel_avg_32x32, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int)) +DECL_SUF(x265_pixel_avg_32x24, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int)) +DECL_SUF(x265_pixel_avg_32x16, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int)) +DECL_SUF(x265_pixel_avg_32x8, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int)) +DECL_SUF(x265_pixel_avg_24x32, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int)) +DECL_SUF(x265_pixel_avg_16x64, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int)) +DECL_SUF(x265_pixel_avg_16x32, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int)) +DECL_SUF(x265_pixel_avg_16x16, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int)) +DECL_SUF(x265_pixel_avg_16x12, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int)) +DECL_SUF(x265_pixel_avg_16x8, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int)) +DECL_SUF(x265_pixel_avg_16x4, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int)) +DECL_SUF(x265_pixel_avg_12x16, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int)) +DECL_SUF(x265_pixel_avg_8x32, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int)) +DECL_SUF(x265_pixel_avg_8x16, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int)) +DECL_SUF(x265_pixel_avg_8x8, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int)) +DECL_SUF(x265_pixel_avg_8x4, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int)) +DECL_SUF(x265_pixel_avg_4x16, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int)) +DECL_SUF(x265_pixel_avg_4x8, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int)) +DECL_SUF(x265_pixel_avg_4x4, (pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int)) + +#undef LOWRES +#undef DECL_SUF + +#endif // ifndef X265_MC_H diff --git a/source/common/x86/pixel-32.asm b/source/common/x86/pixel-32.asm new file mode 100644 index 0000000..a74062d --- 
/dev/null +++ b/source/common/x86/pixel-32.asm @@ -0,0 +1,420 @@ +;***************************************************************************** +;* pixel-32.asm: x86_32 pixel metrics +;***************************************************************************** +;* Copyright (C) 2003-2013 x264 project +;* +;* Authors: Loren Merritt +;* Laurent Aimar +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +;* +;* This program is also available under a commercial proprietary license. +;* For more information, contact us at license @ x265.com. 
+;***************************************************************************** + +%include "x86inc.asm" +%include "x86util.asm" + +cextern pw_ppmmppmm +cextern pw_pmpmpmpm + +SECTION .text +INIT_MMX mmx2 + +%macro LOAD_DIFF_4x8P 1 ; dx + LOAD_DIFF m0, m7, none, [r0+%1], [r2+%1] + LOAD_DIFF m1, m6, none, [r0+%1+r1], [r2+%1+r3] + LOAD_DIFF m2, m7, none, [r0+%1+r1*2], [r2+%1+r3*2] + LOAD_DIFF m3, m6, none, [r0+%1+r4], [r2+%1+r5] + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + LOAD_DIFF m4, m7, none, [r0+%1], [r2+%1] + LOAD_DIFF m5, m6, none, [r0+%1+r1], [r2+%1+r3] + LOAD_DIFF m6, m7, none, [r0+%1+r1*2], [r2+%1+r3*2] + movq [spill], m5 + LOAD_DIFF m7, m5, none, [r0+%1+r4], [r2+%1+r5] + movq m5, [spill] +%endmacro + +%macro SUM4x8_MM 0 + movq [spill], m6 + movq [spill+8], m7 + ABSW2 m0, m1, m0, m1, m6, m7 + ABSW2 m2, m3, m2, m3, m6, m7 + paddw m0, m2 + paddw m1, m3 + movq m6, [spill] + movq m7, [spill+8] + ABSW2 m4, m5, m4, m5, m2, m3 + ABSW2 m6, m7, m6, m7, m2, m3 + paddw m4, m6 + paddw m5, m7 + paddw m0, m4 + paddw m1, m5 + paddw m0, m1 +%endmacro + +;----------------------------------------------------------------------------- +; int pixel_sa8d_8x8( uint8_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +cglobal pixel_sa8d_8x8_internal + push r0 + push r2 + sub esp, 0x74 +%define args esp+0x74 +%define spill esp+0x60 ; +16 +%define trans esp+0 ; +96 + LOAD_DIFF_4x8P 0 + HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7 + + movq [spill], m1 + TRANSPOSE4x4W 4, 5, 6, 7, 1 + movq [trans+0x00], m4 + movq [trans+0x08], m5 + movq [trans+0x10], m6 + movq [trans+0x18], m7 + movq m1, [spill] + TRANSPOSE4x4W 0, 1, 2, 3, 4 + movq [trans+0x20], m0 + movq [trans+0x28], m1 + movq [trans+0x30], m2 + movq [trans+0x38], m3 + + mov r0, [args+4] + mov r2, [args] + LOAD_DIFF_4x8P 4 + HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7 + + movq [spill], m7 + TRANSPOSE4x4W 0, 1, 2, 3, 7 + movq [trans+0x40], m0 + movq [trans+0x48], m1 + movq [trans+0x50], m2 + 
movq [trans+0x58], m3 + movq m7, [spill] + TRANSPOSE4x4W 4, 5, 6, 7, 1 + movq m0, [trans+0x00] + movq m1, [trans+0x08] + movq m2, [trans+0x10] + movq m3, [trans+0x18] + + HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7 + SUM4x8_MM + movq [trans], m0 + + movq m0, [trans+0x20] + movq m1, [trans+0x28] + movq m2, [trans+0x30] + movq m3, [trans+0x38] + movq m4, [trans+0x40] + movq m5, [trans+0x48] + movq m6, [trans+0x50] + movq m7, [trans+0x58] + + HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7 + SUM4x8_MM + + pavgw m0, [trans] + add esp, 0x7c + ret +%undef args +%undef spill +%undef trans + +%macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op + pxor %7, %7 + pshufw %4, %1, q1032 + pshufw %5, %2, q1032 + pshufw %6, %3, q1032 + paddusw %1, %4 + paddusw %2, %5 + paddusw %3, %6 + punpcklwd %1, %7 + punpcklwd %2, %7 + punpcklwd %3, %7 + pshufw %4, %1, q1032 + pshufw %5, %2, q1032 + pshufw %6, %3, q1032 + %8 %1, %4 + %8 %2, %5 + %8 %3, %6 +%endmacro + +%macro LOAD_4x8P 1 ; dx + pxor m7, m7 + movd m6, [r0+%1+7*FENC_STRIDE] + movd m0, [r0+%1+0*FENC_STRIDE] + movd m1, [r0+%1+1*FENC_STRIDE] + movd m2, [r0+%1+2*FENC_STRIDE] + movd m3, [r0+%1+3*FENC_STRIDE] + movd m4, [r0+%1+4*FENC_STRIDE] + movd m5, [r0+%1+5*FENC_STRIDE] + punpcklbw m6, m7 + punpcklbw m0, m7 + punpcklbw m1, m7 + movq [spill], m6 + punpcklbw m2, m7 + punpcklbw m3, m7 + movd m6, [r0+%1+6*FENC_STRIDE] + punpcklbw m4, m7 + punpcklbw m5, m7 + punpcklbw m6, m7 + movq m7, [spill] +%endmacro + +%macro HSUMSUB2 4 + pshufw m4, %1, %3 + pshufw m5, %2, %3 + pmullw %1, %4 + pmullw m5, %4 + paddw %1, m4 + paddw %2, m5 +%endmacro + +;----------------------------------------------------------------------------- +; void intra_sa8d_x3_8x8( uint8_t *fenc, uint8_t edge[36], int *res ) +;----------------------------------------------------------------------------- +cglobal intra_sa8d_x3_8x8, 2,3 + SUB esp, 0x94 +%define edge esp+0x70 ; +32 +%define spill esp+0x60 ; +16 +%define trans esp+0 ; +96 +%define sum esp+0 ; +32 + + pxor m7, m7 + movq m0, [r1+7] + movq m2, 
[r1+16] + movq m1, m0 + movq m3, m2 + punpcklbw m0, m7 + punpckhbw m1, m7 + punpcklbw m2, m7 + punpckhbw m3, m7 + movq m6, [pw_ppmmppmm] + HSUMSUB2 m0, m2, q1032, m6 + HSUMSUB2 m1, m3, q1032, m6 + movq m6, [pw_pmpmpmpm] + HSUMSUB2 m0, m2, q2301, m6 + HSUMSUB2 m1, m3, q2301, m6 + movq m4, m0 + movq m5, m2 + paddw m0, m1 + paddw m2, m3 + psubw m4, m1 + psubw m3, m5 + movq [edge+0], m0 + movq [edge+8], m4 + movq [edge+16], m2 + movq [edge+24], m3 + + LOAD_4x8P 0 + HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7 + + movq [spill], m0 + TRANSPOSE4x4W 4, 5, 6, 7, 0 + movq [trans+0x00], m4 + movq [trans+0x08], m5 + movq [trans+0x10], m6 + movq [trans+0x18], m7 + movq m0, [spill] + TRANSPOSE4x4W 0, 1, 2, 3, 4 + movq [trans+0x20], m0 + movq [trans+0x28], m1 + movq [trans+0x30], m2 + movq [trans+0x38], m3 + + LOAD_4x8P 4 + HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7 + + movq [spill], m7 + TRANSPOSE4x4W 0, 1, 2, 3, 7 + movq [trans+0x40], m0 + movq [trans+0x48], m1 + movq [trans+0x50], m2 + movq [trans+0x58], m3 + movq m7, [spill] + TRANSPOSE4x4W 4, 5, 6, 7, 0 + movq m0, [trans+0x00] + movq m1, [trans+0x08] + movq m2, [trans+0x10] + movq m3, [trans+0x18] + + HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7 + + movq [spill+0], m0 + movq [spill+8], m1 + ABSW2 m2, m3, m2, m3, m0, m1 + ABSW2 m4, m5, m4, m5, m0, m1 + paddw m2, m4 + paddw m3, m5 + ABSW2 m6, m7, m6, m7, m4, m5 + movq m0, [spill+0] + movq m1, [spill+8] + paddw m2, m6 + paddw m3, m7 + paddw m2, m3 + ABSW m1, m1, m4 + paddw m2, m1 ; 7x4 sum + movq m7, m0 + movq m1, [edge+8] ; left bottom + psllw m1, 3 + psubw m7, m1 + ABSW2 m0, m7, m0, m7, m5, m3 + paddw m0, m2 + paddw m7, m2 + movq [sum+0], m0 ; dc + movq [sum+8], m7 ; left + + movq m0, [trans+0x20] + movq m1, [trans+0x28] + movq m2, [trans+0x30] + movq m3, [trans+0x38] + movq m4, [trans+0x40] + movq m5, [trans+0x48] + movq m6, [trans+0x50] + movq m7, [trans+0x58] + + HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7 + + movd [sum+0x10], m0 + movd [sum+0x12], m1 + movd [sum+0x14], m2 + movd [sum+0x16], m3 + movd 
[sum+0x18], m4 + movd [sum+0x1a], m5 + movd [sum+0x1c], m6 + movd [sum+0x1e], m7 + + movq [spill], m0 + movq [spill+8], m1 + ABSW2 m2, m3, m2, m3, m0, m1 + ABSW2 m4, m5, m4, m5, m0, m1 + paddw m2, m4 + paddw m3, m5 + paddw m2, m3 + movq m0, [spill] + movq m1, [spill+8] + ABSW2 m6, m7, m6, m7, m4, m5 + ABSW m1, m1, m3 + paddw m2, m7 + paddw m1, m6 + paddw m2, m1 ; 7x4 sum + movq m1, m0 + + movq m7, [edge+0] + psllw m7, 3 ; left top + + mov r2, [edge+0] + add r2, [edge+16] + lea r2, [4*r2+32] + and r2, 0xffc0 + movd m6, r2 ; dc + + psubw m1, m7 + psubw m0, m6 + ABSW2 m0, m1, m0, m1, m5, m6 + movq m3, [sum+0] ; dc + paddw m0, m2 + paddw m1, m2 + movq m2, m0 + paddw m0, m3 + paddw m1, [sum+8] ; h + psrlq m2, 16 + paddw m2, m3 + + movq m3, [edge+16] ; top left + movq m4, [edge+24] ; top right + psllw m3, 3 + psllw m4, 3 + psubw m3, [sum+16] + psubw m4, [sum+24] + ABSW2 m3, m4, m3, m4, m5, m6 + paddw m2, m3 + paddw m2, m4 ; v + + SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, pavgw + mov r2, r2m + pxor m7, m7 + punpckldq m2, m1 + pavgw m0, m7 + pavgw m2, m7 + movd [r2+8], m0 ; dc + movq [r2+0], m2 ; v, h + ADD esp, 0x94 + RET +%undef edge +%undef spill +%undef trans +%undef sum + + + +;----------------------------------------------------------------------------- +; void pixel_ssim_4x4x2_core( const uint8_t *pix1, intptr_t stride1, +; const uint8_t *pix2, intptr_t stride2, int sums[2][4] ) +;----------------------------------------------------------------------------- +cglobal pixel_ssim_4x4x2_core, 0,5 + mov r1, r1m + mov r3, r3m + mov r4, 4 + pxor m0, m0 +.loop: + mov r0, r0m + mov r2, r2m + add r0, r4 + add r2, r4 + pxor m1, m1 + pxor m2, m2 + pxor m3, m3 + pxor m4, m4 +%rep 4 + movd m5, [r0] + movd m6, [r2] + punpcklbw m5, m0 + punpcklbw m6, m0 + paddw m1, m5 + paddw m2, m6 + movq m7, m5 + pmaddwd m5, m5 + pmaddwd m7, m6 + pmaddwd m6, m6 + paddd m3, m5 + paddd m4, m7 + paddd m3, m6 + add r0, r1 + add r2, r3 +%endrep + mov r0, r4m + lea r0, [r0+r4*4] + pshufw m5, m1, q0032 + 
pshufw m6, m2, q0032 + paddusw m1, m5 + paddusw m2, m6 + punpcklwd m1, m2 + pshufw m2, m1, q0032 + pshufw m5, m3, q0032 + pshufw m6, m4, q0032 + paddusw m1, m2 + paddd m3, m5 + paddd m4, m6 + punpcklwd m1, m0 + punpckldq m3, m4 + movq [r0+0], m1 + movq [r0+8], m3 + sub r4, 4 + jge .loop + emms + RET + diff --git a/source/common/x86/pixel-a.asm b/source/common/x86/pixel-a.asm new file mode 100644 index 0000000..1e4180b --- /dev/null +++ b/source/common/x86/pixel-a.asm @@ -0,0 +1,6581 @@ +;***************************************************************************** +;* pixel.asm: x86 pixel metrics +;***************************************************************************** +;* Copyright (C) 2003-2013 x264 project +;* +;* Authors: Loren Merritt +;* Holger Lubitz +;* Laurent Aimar +;* Alex Izvorski +;* Fiona Glaser +;* Oskar Arvidsson +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +;* +;* This program is also available under a commercial proprietary license. +;* For more information, contact us at license @ x265.com. 
+;***************************************************************************** + +%include "x86inc.asm" +%include "x86util.asm" + +SECTION_RODATA 32 +hmul_16p: times 16 db 1 + times 8 db 1, -1 +hmul_8p: times 8 db 1 + times 4 db 1, -1 + times 8 db 1 + times 4 db 1, -1 +hmul_4p: times 2 db 1, 1, 1, 1, 1, -1, 1, -1 +mask_10: times 4 dw 0, -1 +mask_1100: times 2 dd 0, -1 + +ALIGN 32 +transd_shuf1: SHUFFLE_MASK_W 0, 8, 2, 10, 4, 12, 6, 14 +transd_shuf2: SHUFFLE_MASK_W 1, 9, 3, 11, 5, 13, 7, 15 + +sw_f0: dq 0xfff0, 0 +pd_f0: times 4 dd 0xffff0000 + +pw_76543210: dw 0, 1, 2, 3, 4, 5, 6, 7 + +SECTION .text + +cextern pb_0 +cextern pb_1 +cextern pw_1 +cextern pw_8 +cextern pw_16 +cextern pw_32 +cextern pw_00ff +cextern pw_ppppmmmm +cextern pw_ppmmppmm +cextern pw_pmpmpmpm +cextern pw_pmmpzzzz +cextern pd_1 +cextern popcnt_table + +;============================================================================= +; SATD +;============================================================================= + +%macro JDUP 2 +%if cpuflag(sse4) + ; just use shufps on anything post conroe + shufps %1, %2, 0 +%elif cpuflag(ssse3) && notcpuflag(atom) + ; join 2x 32 bit and duplicate them + ; emulating shufps is faster on conroe + punpcklqdq %1, %2 + movsldup %1, %1 +%else + ; doesn't need to dup. 
sse2 does things by zero extending to words and full h_2d + punpckldq %1, %2 +%endif +%endmacro + +%macro HSUMSUB 5 + pmaddubsw m%2, m%5 + pmaddubsw m%1, m%5 + pmaddubsw m%4, m%5 + pmaddubsw m%3, m%5 +%endmacro + +%macro DIFF_UNPACK_SSE2 5 + punpcklbw m%1, m%5 + punpcklbw m%2, m%5 + punpcklbw m%3, m%5 + punpcklbw m%4, m%5 + psubw m%1, m%2 + psubw m%3, m%4 +%endmacro + +%macro DIFF_SUMSUB_SSSE3 5 + HSUMSUB %1, %2, %3, %4, %5 + psubw m%1, m%2 + psubw m%3, m%4 +%endmacro + +%macro LOAD_DUP_2x4P 4 ; dst, tmp, 2* pointer + movd %1, %3 + movd %2, %4 + JDUP %1, %2 +%endmacro + +%macro LOAD_DUP_4x8P_CONROE 8 ; 4*dst, 4*pointer + movddup m%3, %6 + movddup m%4, %8 + movddup m%1, %5 + movddup m%2, %7 +%endmacro + +%macro LOAD_DUP_4x8P_PENRYN 8 + ; penryn and nehalem run punpcklqdq and movddup in different units + movh m%3, %6 + movh m%4, %8 + punpcklqdq m%3, m%3 + movddup m%1, %5 + punpcklqdq m%4, m%4 + movddup m%2, %7 +%endmacro + +%macro LOAD_SUMSUB_8x2P 9 + LOAD_DUP_4x8P %1, %2, %3, %4, %6, %7, %8, %9 + DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5 +%endmacro + +%macro LOAD_SUMSUB_8x4P_SSSE3 7-11 r0, r2, 0, 0 +; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?] 
+ LOAD_SUMSUB_8x2P %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3] + LOAD_SUMSUB_8x2P %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5] +%if %10 + lea %8, [%8+4*r1] + lea %9, [%9+4*r3] +%endif +%endmacro + +%macro LOAD_SUMSUB_16P_SSSE3 7 ; 2*dst, 2*tmp, mul, 2*ptr + movddup m%1, [%7] + movddup m%2, [%7+8] + mova m%4, [%6] + movddup m%3, m%4 + punpckhqdq m%4, m%4 + DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5 +%endmacro + +%macro LOAD_SUMSUB_16P_SSE2 7 ; 2*dst, 2*tmp, mask, 2*ptr + movu m%4, [%7] + mova m%2, [%6] + DEINTB %1, %2, %3, %4, %5 + psubw m%1, m%3 + psubw m%2, m%4 + SUMSUB_BA w, %1, %2, %3 +%endmacro + +%macro LOAD_SUMSUB_16x4P 10-13 r0, r2, none +; 8x dest, 1x tmp, 1x mul, [2* ptr] [2nd tmp] + LOAD_SUMSUB_16P %1, %5, %2, %3, %10, %11, %12 + LOAD_SUMSUB_16P %2, %6, %3, %4, %10, %11+r1, %12+r3 + LOAD_SUMSUB_16P %3, %7, %4, %9, %10, %11+2*r1, %12+2*r3 + LOAD_SUMSUB_16P %4, %8, %13, %9, %10, %11+r4, %12+r5 +%endmacro + +%macro LOAD_SUMSUB_16x2P_AVX2 9 +; 2*dst, 2*tmp, mul, 4*ptr + vbroadcasti128 m%1, [%6] + vbroadcasti128 m%3, [%7] + vbroadcasti128 m%2, [%8] + vbroadcasti128 m%4, [%9] + DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5 +%endmacro + +%macro LOAD_SUMSUB_16x4P_AVX2 7-11 r0, r2, 0, 0 +; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?] + LOAD_SUMSUB_16x2P_AVX2 %1, %2, %5, %6, %7, %8, %9, %8+r1, %9+r3 + LOAD_SUMSUB_16x2P_AVX2 %3, %4, %5, %6, %7, %8+2*r1, %9+2*r3, %8+r4, %9+r5 +%if %10 + lea %8, [%8+4*r1] + lea %9, [%9+4*r3] +%endif +%endmacro + +%macro LOAD_DUP_4x16P_AVX2 8 ; 4*dst, 4*pointer + mova xm%3, %6 + mova xm%4, %8 + mova xm%1, %5 + mova xm%2, %7 + vpermq m%3, m%3, q0011 + vpermq m%4, m%4, q0011 + vpermq m%1, m%1, q0011 + vpermq m%2, m%2, q0011 +%endmacro + +%macro LOAD_SUMSUB8_16x2P_AVX2 9 +; 2*dst, 2*tmp, mul, 4*ptr + LOAD_DUP_4x16P_AVX2 %1, %2, %3, %4, %6, %7, %8, %9 + DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5 +%endmacro + +%macro LOAD_SUMSUB8_16x4P_AVX2 7-11 r0, r2, 0, 0 +; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?] 
+ LOAD_SUMSUB8_16x2P_AVX2 %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3] + LOAD_SUMSUB8_16x2P_AVX2 %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5] +%if %10 + lea %8, [%8+4*r1] + lea %9, [%9+4*r3] +%endif +%endmacro + +; in: r4=3*stride1, r5=3*stride2 +; in: %2 = horizontal offset +; in: %3 = whether we need to increment pix1 and pix2 +; clobber: m3..m7 +; out: %1 = satd +%macro SATD_4x4_MMX 3 + %xdefine %%n n%1 + %assign offset %2*SIZEOF_PIXEL + LOAD_DIFF m4, m3, none, [r0+ offset], [r2+ offset] + LOAD_DIFF m5, m3, none, [r0+ r1+offset], [r2+ r3+offset] + LOAD_DIFF m6, m3, none, [r0+2*r1+offset], [r2+2*r3+offset] + LOAD_DIFF m7, m3, none, [r0+ r4+offset], [r2+ r5+offset] +%if %3 + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] +%endif + HADAMARD4_2D 4, 5, 6, 7, 3, %%n + paddw m4, m6 + SWAP %%n, 4 +%endmacro + +; in: %1 = horizontal if 0, vertical if 1 +%macro SATD_8x4_SSE 8-9 +%if %1 + HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax +%else + HADAMARD4_V %2, %3, %4, %5, %6 + ; doing the abs first is a slight advantage + ABSW2 m%2, m%4, m%2, m%4, m%6, m%7 + ABSW2 m%3, m%5, m%3, m%5, m%6, m%7 + HADAMARD 1, max, %2, %4, %6, %7 +%endif +%ifnidn %9, swap + paddw m%8, m%2 +%else + SWAP %8, %2 +%endif +%if %1 + paddw m%8, m%4 +%else + HADAMARD 1, max, %3, %5, %6, %7 + paddw m%8, m%3 +%endif +%endmacro + +%macro SATD_8x4_1_SSE 10 +%if %1 + HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax +%else + HADAMARD4_V %2, %3, %4, %5, %6 + ; doing the abs first is a slight advantage + ABSW2 m%2, m%4, m%2, m%4, m%6, m%7 + ABSW2 m%3, m%5, m%3, m%5, m%6, m%7 + HADAMARD 1, max, %2, %4, %6, %7 +%endif + + pxor m%10, m%10 + mova m%9, m%2 + punpcklwd m%9, m%10 + paddd m%8, m%9 + mova m%9, m%2 + punpckhwd m%9, m%10 + paddd m%8, m%9 + +%if %1 + pxor m%10, m%10 + mova m%9, m%4 + punpcklwd m%9, m%10 + paddd m%8, m%9 + mova m%9, m%4 + punpckhwd m%9, m%10 + paddd m%8, m%9 +%else + HADAMARD 1, max, %3, %5, %6, %7 + pxor m%10, m%10 + mova m%9, m%3 + punpcklwd m%9, m%10 + paddd m%8, m%9 + mova m%9, m%3 + 
punpckhwd m%9, m%10 + paddd m%8, m%9 +%endif +%endmacro + +%macro SATD_START_MMX 0 + FIX_STRIDES r1, r3 + lea r4, [3*r1] ; 3*stride1 + lea r5, [3*r3] ; 3*stride2 +%endmacro + +%macro SATD_END_MMX 0 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 + movd eax, m0 +%else ; !HIGH_BIT_DEPTH + pshufw m1, m0, q1032 + paddw m0, m1 + pshufw m1, m0, q2301 + paddw m0, m1 + movd eax, m0 + and eax, 0xffff +%endif ; HIGH_BIT_DEPTH + RET +%endmacro + +; FIXME avoid the spilling of regs to hold 3*stride. +; for small blocks on x86_32, modify pixel pointer instead. + +;----------------------------------------------------------------------------- +; int pixel_satd_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +INIT_MMX mmx2 +cglobal pixel_satd_16x4_internal + SATD_4x4_MMX m2, 0, 0 + SATD_4x4_MMX m1, 4, 0 + paddw m0, m2 + SATD_4x4_MMX m2, 8, 0 + paddw m0, m1 + SATD_4x4_MMX m1, 12, 0 + paddw m0, m2 + paddw m0, m1 + ret + +cglobal pixel_satd_8x8_internal + SATD_4x4_MMX m2, 0, 0 + SATD_4x4_MMX m1, 4, 1 + paddw m0, m2 + paddw m0, m1 +pixel_satd_8x4_internal_mmx2: + SATD_4x4_MMX m2, 0, 0 + SATD_4x4_MMX m1, 4, 0 + paddw m0, m2 + paddw m0, m1 + ret + +%if HIGH_BIT_DEPTH +%macro SATD_MxN_MMX 3 +cglobal pixel_satd_%1x%2, 4,7 + SATD_START_MMX + pxor m0, m0 + call pixel_satd_%1x%3_internal_mmx2 + HADDUW m0, m1 + movd r6d, m0 +%rep %2/%3-1 + pxor m0, m0 + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + call pixel_satd_%1x%3_internal_mmx2 + movd m2, r4 + HADDUW m0, m1 + movd r4, m0 + add r6, r4 + movd r4, m2 +%endrep + movifnidn eax, r6d + RET +%endmacro + +SATD_MxN_MMX 16, 16, 4 +SATD_MxN_MMX 16, 8, 4 +SATD_MxN_MMX 8, 16, 8 +%endif ; HIGH_BIT_DEPTH + +%if HIGH_BIT_DEPTH == 0 +cglobal pixel_satd_16x16, 4,6 + SATD_START_MMX + pxor m0, m0 +%rep 3 + call pixel_satd_16x4_internal_mmx2 + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] +%endrep + call pixel_satd_16x4_internal_mmx2 + HADDUW m0, m1 + movd eax, m0 + RET + +cglobal pixel_satd_16x8, 4,6 + 
SATD_START_MMX + pxor m0, m0 + call pixel_satd_16x4_internal_mmx2 + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + call pixel_satd_16x4_internal_mmx2 + SATD_END_MMX + +cglobal pixel_satd_8x16, 4,6 + SATD_START_MMX + pxor m0, m0 + call pixel_satd_8x8_internal_mmx2 + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + call pixel_satd_8x8_internal_mmx2 + SATD_END_MMX +%endif ; !HIGH_BIT_DEPTH + +cglobal pixel_satd_8x8, 4,6 + SATD_START_MMX + pxor m0, m0 + call pixel_satd_8x8_internal_mmx2 + SATD_END_MMX + +cglobal pixel_satd_8x4, 4,6 + SATD_START_MMX + pxor m0, m0 + call pixel_satd_8x4_internal_mmx2 + SATD_END_MMX + +cglobal pixel_satd_4x16, 4,6 + SATD_START_MMX + SATD_4x4_MMX m0, 0, 1 + SATD_4x4_MMX m1, 0, 1 + paddw m0, m1 + SATD_4x4_MMX m1, 0, 1 + paddw m0, m1 + SATD_4x4_MMX m1, 0, 0 + paddw m0, m1 + SATD_END_MMX + +cglobal pixel_satd_4x8, 4,6 + SATD_START_MMX + SATD_4x4_MMX m0, 0, 1 + SATD_4x4_MMX m1, 0, 0 + paddw m0, m1 + SATD_END_MMX + +cglobal pixel_satd_4x4, 4,6 + SATD_START_MMX + SATD_4x4_MMX m0, 0, 0 + SATD_END_MMX + +%macro SATD_START_SSE2 2-3 0 + FIX_STRIDES r1, r3 +%if HIGH_BIT_DEPTH && %3 + pxor %2, %2 +%elif cpuflag(ssse3) && notcpuflag(atom) +%if mmsize==32 + mova %2, [hmul_16p] +%else + mova %2, [hmul_8p] +%endif +%endif + lea r4, [3*r1] + lea r5, [3*r3] + pxor %1, %1 +%endmacro + +%macro SATD_END_SSE2 1-2 +%if HIGH_BIT_DEPTH + HADDUW %1, xm0 +%if %0 == 2 + paddd %1, %2 +%endif +%else + HADDW %1, xm7 +%endif + movd eax, %1 + RET +%endmacro + +%macro SATD_ACCUM 3 +%if HIGH_BIT_DEPTH + HADDUW %1, %2 + paddd %3, %1 + pxor %1, %1 +%endif +%endmacro + +%macro BACKUP_POINTERS 0 +%if ARCH_X86_64 +%if WIN64 + PUSH r7 +%endif + mov r6, r0 + mov r7, r2 +%endif +%endmacro + +%macro RESTORE_AND_INC_POINTERS 0 +%if ARCH_X86_64 + lea r0, [r6+8*SIZEOF_PIXEL] + lea r2, [r7+8*SIZEOF_PIXEL] +%if WIN64 + POP r7 +%endif +%else + mov r0, r0mp + mov r2, r2mp + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL +%endif +%endmacro + +%macro SATD_4x8_SSE 3-4 +%if HIGH_BIT_DEPTH + movh m0, [r0+0*r1] 
+ movh m4, [r2+0*r3] + movh m1, [r0+1*r1] + movh m5, [r2+1*r3] + movhps m0, [r0+4*r1] + movhps m4, [r2+4*r3] + movh m2, [r0+2*r1] + movh m6, [r2+2*r3] + psubw m0, m4 + movh m3, [r0+r4] + movh m4, [r2+r5] + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + movhps m1, [r0+1*r1] + movhps m5, [r2+1*r3] + movhps m2, [r0+2*r1] + movhps m6, [r2+2*r3] + psubw m1, m5 + movhps m3, [r0+r4] + movhps m4, [r2+r5] + psubw m2, m6 + psubw m3, m4 +%else ; !HIGH_BIT_DEPTH + movd m4, [r2] + movd m5, [r2+r3] + movd m6, [r2+2*r3] + add r2, r5 + movd m0, [r0] + movd m1, [r0+r1] + movd m2, [r0+2*r1] + add r0, r4 + movd m3, [r2+r3] + JDUP m4, m3 + movd m3, [r0+r1] + JDUP m0, m3 + movd m3, [r2+2*r3] + JDUP m5, m3 + movd m3, [r0+2*r1] + JDUP m1, m3 +%if %1==0 && %2==1 + mova m3, [hmul_4p] + DIFFOP 0, 4, 1, 5, 3 +%else + DIFFOP 0, 4, 1, 5, 7 +%endif + movd m5, [r2] + add r2, r5 + movd m3, [r0] + add r0, r4 + movd m4, [r2] + JDUP m6, m4 + movd m4, [r0] + JDUP m2, m4 + movd m4, [r2+r3] + JDUP m5, m4 + movd m4, [r0+r1] + JDUP m3, m4 +%if %1==0 && %2==1 + mova m4, [hmul_4p] + DIFFOP 2, 6, 3, 5, 4 +%else + DIFFOP 2, 6, 3, 5, 7 +%endif +%endif ; HIGH_BIT_DEPTH +%if %0 == 4 + SATD_8x4_1_SSE %1, 0, 1, 2, 3, 4, 5, 7, %3, %4 +%else + SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 7, %3 +%endif +%endmacro + +;----------------------------------------------------------------------------- +; int pixel_satd_8x4( uint8_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +%macro SATDS_SSE2 0 +%define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH) + +%if cpuflag(ssse3) && (vertical==0 || HIGH_BIT_DEPTH) +cglobal pixel_satd_4x4, 4, 6, 6 + SATD_START_MMX + mova m4, [hmul_4p] + LOAD_DUP_2x4P m2, m5, [r2], [r2+r3] + LOAD_DUP_2x4P m3, m5, [r2+2*r3], [r2+r5] + LOAD_DUP_2x4P m0, m5, [r0], [r0+r1] + LOAD_DUP_2x4P m1, m5, [r0+2*r1], [r0+r4] + DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4 + HADAMARD 0, sumsub, 0, 1, 2, 3 + HADAMARD 4, sumsub, 0, 1, 2, 3 + HADAMARD 1, amax, 
0, 1, 2, 3 + HADDW m0, m1 + movd eax, m0 + RET +%endif + +cglobal pixel_satd_4x8, 4, 6, 8 + SATD_START_MMX +%if vertical==0 + mova m7, [hmul_4p] +%endif + SATD_4x8_SSE vertical, 0, swap + HADDW m7, m1 + movd eax, m7 + RET + +cglobal pixel_satd_4x16, 4, 6, 8 + SATD_START_MMX +%if vertical==0 + mova m7, [hmul_4p] +%endif + SATD_4x8_SSE vertical, 0, swap + lea r0, [r0+r1*2*SIZEOF_PIXEL] + lea r2, [r2+r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, add + HADDW m7, m1 + movd eax, m7 + RET + +cglobal pixel_satd_8x8_internal + LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0 + SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6 +%%pixel_satd_8x4_internal: + LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0 + SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6 + ret + +cglobal pixel_satd_8x8_internal2 +%if WIN64 + LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0 + SATD_8x4_1_SSE vertical, 0, 1, 2, 3, 4, 5, 6, 12, 13 +%%pixel_satd_8x4_internal2: + LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0 + SATD_8x4_1_SSE vertical, 0, 1, 2, 3, 4, 5, 6, 12, 13 +%else + LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0 + SATD_8x4_1_SSE vertical, 0, 1, 2, 3, 4, 5, 6, 4, 5 +%%pixel_satd_8x4_internal2: + LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0 + SATD_8x4_1_SSE vertical, 0, 1, 2, 3, 4, 5, 6, 4, 5 +%endif + ret + +; 16x8 regresses on phenom win64, 16x16 is almost the same (too many spilled registers) +; These aren't any faster on AVX systems with fast movddup (Bulldozer, Sandy Bridge) +%if HIGH_BIT_DEPTH == 0 && (WIN64 || UNIX64) && notcpuflag(avx) + +cglobal pixel_satd_16x4_internal2 + LOAD_SUMSUB_16x4P 0, 1, 2, 3, 4, 8, 5, 9, 6, 7, r0, r2, 11 + lea r2, [r2+4*r3] + lea r0, [r0+4*r1] + SATD_8x4_1_SSE 0, 0, 1, 2, 3, 6, 11, 10, 12, 13 + SATD_8x4_1_SSE 0, 4, 8, 5, 9, 6, 3, 10, 12, 13 + ret + +cglobal pixel_satd_16x4, 4,6,14 + SATD_START_SSE2 m10, m7 +%if vertical + mova m7, [pw_00ff] +%endif + call pixel_satd_16x4_internal2 + pxor m9, m9 + movhlps m9, m10 + paddd m10, m9 + pshufd m9, m10, 1 + 
paddd m10, m9 + movd eax, m10 + RET + +cglobal pixel_satd_16x8, 4,6,14 + SATD_START_SSE2 m10, m7 +%if vertical + mova m7, [pw_00ff] +%endif + jmp %%pixel_satd_16x8_internal + +cglobal pixel_satd_16x12, 4,6,14 + SATD_START_SSE2 m10, m7 +%if vertical + mova m7, [pw_00ff] +%endif + call pixel_satd_16x4_internal2 + jmp %%pixel_satd_16x8_internal + +cglobal pixel_satd_16x32, 4,6,14 + SATD_START_SSE2 m10, m7 +%if vertical + mova m7, [pw_00ff] +%endif + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + jmp %%pixel_satd_16x8_internal + +cglobal pixel_satd_16x64, 4,6,14 + SATD_START_SSE2 m10, m7 +%if vertical + mova m7, [pw_00ff] +%endif + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + jmp %%pixel_satd_16x8_internal + +cglobal pixel_satd_16x16, 4,6,14 + SATD_START_SSE2 m10, m7 +%if vertical + mova m7, [pw_00ff] +%endif + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 +%%pixel_satd_16x8_internal: + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + pxor m9, m9 + movhlps m9, m10 + paddd m10, m9 + pshufd m9, m10, 1 + paddd m10, m9 + movd eax, m10 + RET + +cglobal pixel_satd_32x8, 4,8,14 ;if WIN64 && notcpuflag(avx) + SATD_START_SSE2 m10, m7 + mov r6, r0 + mov r7, r2 +%if vertical + mova m7, [pw_00ff] +%endif + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + lea r0, [r6 + 16] + lea r2, [r7 + 16] + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + pxor m9, m9 + 
movhlps m9, m10 + paddd m10, m9 + pshufd m9, m10, 1 + paddd m10, m9 + movd eax, m10 + RET + +cglobal pixel_satd_32x16, 4,8,14 ;if WIN64 && notcpuflag(avx) + SATD_START_SSE2 m10, m7 + mov r6, r0 + mov r7, r2 +%if vertical + mova m7, [pw_00ff] +%endif + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + lea r0, [r6 + 16] + lea r2, [r7 + 16] + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + pxor m9, m9 + movhlps m9, m10 + paddd m10, m9 + pshufd m9, m10, 1 + paddd m10, m9 + movd eax, m10 + RET + +cglobal pixel_satd_32x24, 4,8,14 ;if WIN64 && notcpuflag(avx) + SATD_START_SSE2 m10, m7 + mov r6, r0 + mov r7, r2 +%if vertical + mova m7, [pw_00ff] +%endif + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + lea r0, [r6 + 16] + lea r2, [r7 + 16] + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + pxor m9, m9 + movhlps m9, m10 + paddd m10, m9 + pshufd m9, m10, 1 + paddd m10, m9 + movd eax, m10 + RET + +cglobal pixel_satd_32x32, 4,8,14 ;if WIN64 && notcpuflag(avx) + SATD_START_SSE2 m10, m7 + mov r6, r0 + mov r7, r2 +%if vertical + mova m7, [pw_00ff] +%endif + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + lea r0, [r6 + 16] + lea r2, [r7 + 16] + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call 
pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + pxor m9, m9 + movhlps m9, m10 + paddd m10, m9 + pshufd m9, m10, 1 + paddd m10, m9 + movd eax, m10 + RET + +cglobal pixel_satd_32x64, 4,8,14 ;if WIN64 && notcpuflag(avx) + SATD_START_SSE2 m10, m7 + mov r6, r0 + mov r7, r2 +%if vertical + mova m7, [pw_00ff] +%endif + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + lea r0, [r6 + 16] + lea r2, [r7 + 16] + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + pxor m9, m9 + movhlps m9, m10 + paddd m10, m9 + pshufd m9, m10, 1 + paddd m10, m9 + movd eax, m10 + RET + +cglobal pixel_satd_48x64, 4,8,14 ;if WIN64 && notcpuflag(avx) + SATD_START_SSE2 m10, m7 + mov r6, r0 + mov r7, r2 +%if vertical + mova m7, [pw_00ff] +%endif + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call 
pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + lea r0, [r6 + 16] + lea r2, [r7 + 16] + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + lea r0, [r6 + 32] + lea r2, [r7 + 32] + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + pxor m9, m9 + movhlps m9, m10 + paddd m10, m9 + pshufd m9, m10, 1 + paddd m10, m9 + movd eax, m10 + RET + +cglobal pixel_satd_64x16, 4,8,14 ;if WIN64 && notcpuflag(avx) + SATD_START_SSE2 m10, m7 + mov r6, r0 + mov r7, r2 +%if vertical + mova m7, [pw_00ff] +%endif + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + lea r0, [r6 + 16] + lea r2, [r7 + 16] + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + lea r0, [r6 + 32] + lea r2, [r7 + 
32] + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + lea r0, [r6 + 48] + lea r2, [r7 + 48] + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + pxor m9, m9 + movhlps m9, m10 + paddd m10, m9 + pshufd m9, m10, 1 + paddd m10, m9 + movd eax, m10 + RET + +cglobal pixel_satd_64x32, 4,8,14 ;if WIN64 && notcpuflag(avx) + SATD_START_SSE2 m10, m7 + mov r6, r0 + mov r7, r2 +%if vertical + mova m7, [pw_00ff] +%endif + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + lea r0, [r6 + 16] + lea r2, [r7 + 16] + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + lea r0, [r6 + 32] + lea r2, [r7 + 32] + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + lea r0, [r6 + 48] + lea r2, [r7 + 48] + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + + pxor m9, m9 + movhlps m9, m10 + paddd m10, m9 + pshufd m9, m10, 1 + paddd m10, m9 + movd eax, m10 + RET + +cglobal pixel_satd_64x48, 4,8,14 ;if WIN64 && notcpuflag(avx) + SATD_START_SSE2 m10, m7 + mov r6, r0 + mov r7, r2 +%if vertical + mova m7, 
[pw_00ff] +%endif + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + lea r0, [r6 + 16] + lea r2, [r7 + 16] + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + lea r0, [r6 + 32] + lea r2, [r7 + 32] + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + lea r0, [r6 + 48] + lea r2, [r7 + 48] + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + + pxor m9, m9 + movhlps m9, m10 + paddd m10, m9 + pshufd m9, m10, 1 + paddd m10, m9 + movd eax, m10 + RET + +cglobal pixel_satd_64x64, 4,8,14 ;if WIN64 && notcpuflag(avx) + SATD_START_SSE2 m10, m7 + mov r6, r0 + mov r7, r2 +%if vertical + mova m7, [pw_00ff] +%endif + call 
pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + lea r0, [r6 + 16] + lea r2, [r7 + 16] + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + lea r0, [r6 + 32] + lea r2, [r7 + 32] + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + lea r0, [r6 + 48] + lea r2, [r7 + 48] + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + 
call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + call pixel_satd_16x4_internal2 + + pxor m9, m9 + movhlps m9, m10 + paddd m10, m9 + pshufd m9, m10, 1 + paddd m10, m9 + movd eax, m10 + RET + +%else + +%if WIN64 +cglobal pixel_satd_32x8, 4,8,14 ;if WIN64 && cpuflag(avx) + SATD_START_SSE2 m6, m7 + mov r6, r0 + mov r7, r2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 8*SIZEOF_PIXEL] + lea r2, [r7 + 8*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + lea r0, [r6 + 16*SIZEOF_PIXEL] + lea r2, [r7 + 16*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + lea r0, [r6 + 24*SIZEOF_PIXEL] + lea r2, [r7 + 24*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + pxor m7, m7 + movhlps m7, m6 + paddd m6, m7 + pshufd m7, m6, 1 + paddd m6, m7 + movd eax, m6 + RET +%else +cglobal pixel_satd_32x8, 4,7,8,0-gprsize ;if !WIN64 + SATD_START_SSE2 m6, m7 + mov r6, r0 + mov [rsp], r2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 8*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 8*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + lea r0, [r6 + 16*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 16*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + lea r0, [r6 + 24*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 24*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + pxor m7, m7 + movhlps m7, m6 + paddd m6, m7 + pshufd m7, m6, 1 + paddd m6, m7 + movd eax, m6 + RET +%endif + +%if WIN64 +cglobal pixel_satd_32x16, 4,8,14 ;if WIN64 && cpuflag(avx) + SATD_START_SSE2 m6, m7 + mov r6, r0 + mov r7, r2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 8*SIZEOF_PIXEL] + lea r2, [r7 + 8*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 16*SIZEOF_PIXEL] + lea r2, [r7 + 16*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 24*SIZEOF_PIXEL] + lea r2, [r7 + 24*SIZEOF_PIXEL] + call 
pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + pxor m7, m7 + movhlps m7, m6 + paddd m6, m7 + pshufd m7, m6, 1 + paddd m6, m7 + movd eax, m6 + RET +%else +cglobal pixel_satd_32x16, 4,7,8,0-gprsize ;if !WIN64 + SATD_START_SSE2 m6, m7 + mov r6, r0 + mov [rsp], r2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 8*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 8*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 16*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 16*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 24*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 24*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + pxor m7, m7 + movhlps m7, m6 + paddd m6, m7 + pshufd m7, m6, 1 + paddd m6, m7 + movd eax, m6 + RET +%endif + +%if WIN64 +cglobal pixel_satd_32x24, 4,8,14 ;if WIN64 && cpuflag(avx) + SATD_START_SSE2 m6, m7 + mov r6, r0 + mov r7, r2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 8*SIZEOF_PIXEL] + lea r2, [r7 + 8*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 16*SIZEOF_PIXEL] + lea r2, [r7 + 16*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 24*SIZEOF_PIXEL] + lea r2, [r7 + 24*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + pxor m7, m7 + movhlps m7, m6 + paddd m6, m7 + pshufd m7, m6, 1 + paddd m6, m7 + movd eax, m6 + RET +%else +cglobal pixel_satd_32x24, 4,7,8,0-gprsize ;if !WIN64 + SATD_START_SSE2 m6, m7 + mov r6, r0 + mov [rsp], r2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 8*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 8*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call 
pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 16*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 16*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 24*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 24*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + pxor m7, m7 + movhlps m7, m6 + paddd m6, m7 + pshufd m7, m6, 1 + paddd m6, m7 + movd eax, m6 + RET +%endif + +%if WIN64 +cglobal pixel_satd_32x32, 4,8,14 ;if WIN64 && cpuflag(avx) + SATD_START_SSE2 m6, m7 + mov r6, r0 + mov r7, r2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 8*SIZEOF_PIXEL] + lea r2, [r7 + 8*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 16*SIZEOF_PIXEL] + lea r2, [r7 + 16*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 24*SIZEOF_PIXEL] + lea r2, [r7 + 24*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + pxor m7, m7 + movhlps m7, m6 + paddd m6, m7 + pshufd m7, m6, 1 + paddd m6, m7 + movd eax, m6 + RET +%else +cglobal pixel_satd_32x32, 4,7,8,0-gprsize ;if !WIN64 + SATD_START_SSE2 m6, m7 + mov r6, r0 + mov [rsp], r2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 8*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 8*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 16*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 16*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call 
pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 24*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 24*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + pxor m7, m7 + movhlps m7, m6 + paddd m6, m7 + pshufd m7, m6, 1 + paddd m6, m7 + movd eax, m6 + RET +%endif + +%if WIN64 +cglobal pixel_satd_32x64, 4,8,14 ;if WIN64 && cpuflag(avx) + SATD_START_SSE2 m6, m7 + mov r6, r0 + mov r7, r2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 8*SIZEOF_PIXEL] + lea r2, [r7 + 8*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 16*SIZEOF_PIXEL] + lea r2, [r7 + 16*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 24*SIZEOF_PIXEL] + lea r2, [r7 + 24*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + pxor m7, m7 + movhlps m7, m6 + paddd m6, m7 + pshufd m7, m6, 1 + paddd m6, m7 + movd eax, m6 + RET +%else +cglobal pixel_satd_32x64, 4,7,8,0-gprsize ;if !WIN64 + SATD_START_SSE2 m6, m7 + mov r6, r0 + mov [rsp], r2 + call pixel_satd_8x8_internal2 + call 
pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 8*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 8*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 16*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 16*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 24*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 24*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + pxor m7, m7 + movhlps m7, m6 + paddd m6, m7 + pshufd m7, m6, 1 + paddd m6, m7 + movd eax, m6 + RET +%endif + +%if WIN64 +cglobal pixel_satd_48x64, 4,8,14 ;if WIN64 && cpuflag(avx) + SATD_START_SSE2 m6, m7 + mov r6, r0 + mov r7, r2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 8*SIZEOF_PIXEL] + lea r2, [r7 + 8*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call 
pixel_satd_8x8_internal2 + lea r0, [r6 + 16*SIZEOF_PIXEL] + lea r2, [r7 + 16*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 24*SIZEOF_PIXEL] + lea r2, [r7 + 24*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 32*SIZEOF_PIXEL] + lea r2, [r7 + 32*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 40*SIZEOF_PIXEL] + lea r2, [r7 + 40*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + pxor m7, m7 + movhlps m7, m6 + paddd m6, m7 + pshufd m7, m6, 1 + paddd m6, m7 + movd eax, m6 + RET +%else +cglobal pixel_satd_48x64, 4,7,8,0-gprsize ;if !WIN64 + SATD_START_SSE2 m6, m7 + mov r6, r0 + mov [rsp], r2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 8*SIZEOF_PIXEL] + mov r2, [rsp] + add r2,8*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call 
pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 16*SIZEOF_PIXEL] + mov r2, [rsp] + add r2,16*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 24*SIZEOF_PIXEL] + mov r2, [rsp] + add r2,24*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 32*SIZEOF_PIXEL] + mov r2, [rsp] + add r2,32*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 40*SIZEOF_PIXEL] + mov r2, [rsp] + add r2,40*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + pxor m7, m7 + movhlps m7, m6 + paddd m6, m7 + pshufd m7, m6, 1 + paddd m6, m7 + movd eax, m6 + RET +%endif + + +%if WIN64 +cglobal pixel_satd_64x16, 4,8,14 ;if WIN64 && cpuflag(avx) + SATD_START_SSE2 m6, m7 + mov r6, r0 + mov r7, r2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 8*SIZEOF_PIXEL] + lea r2, [r7 + 8*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 16*SIZEOF_PIXEL] + lea r2, [r7 + 16*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + 
lea r0, [r6 + 24*SIZEOF_PIXEL] + lea r2, [r7 + 24*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 32*SIZEOF_PIXEL] + lea r2, [r7 + 32*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 40*SIZEOF_PIXEL] + lea r2, [r7 + 40*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 48*SIZEOF_PIXEL] + lea r2, [r7 + 48*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 56*SIZEOF_PIXEL] + lea r2, [r7 + 56*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + pxor m7, m7 + movhlps m7, m6 + paddd m6, m7 + pshufd m7, m6, 1 + paddd m6, m7 + movd eax, m6 + RET +%else +cglobal pixel_satd_64x16, 4,7,8,0-gprsize ;if !WIN64 + SATD_START_SSE2 m6, m7 + mov r6, r0 + mov [rsp], r2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 8*SIZEOF_PIXEL] + mov r2, [rsp] + add r2,8*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 16*SIZEOF_PIXEL] + mov r2, [rsp] + add r2,16*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 24*SIZEOF_PIXEL] + mov r2, [rsp] + add r2,24*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 32*SIZEOF_PIXEL] + mov r2, [rsp] + add r2,32*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 40*SIZEOF_PIXEL] + mov r2, [rsp] + add r2,40*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 48*SIZEOF_PIXEL] + mov r2, [rsp] + add r2,48*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 56*SIZEOF_PIXEL] + mov r2, [rsp] + add r2,56*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + pxor m7, m7 + movhlps m7, m6 + paddd m6, m7 + pshufd m7, m6, 1 + paddd m6, m7 + movd eax, m6 + RET 
+%endif + +%if WIN64 +cglobal pixel_satd_64x32, 4,8,14 ;if WIN64 && cpuflag(avx) + SATD_START_SSE2 m6, m7 + mov r6, r0 + mov r7, r2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 8*SIZEOF_PIXEL] + lea r2, [r7 + 8*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 16*SIZEOF_PIXEL] + lea r2, [r7 + 16*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 24*SIZEOF_PIXEL] + lea r2, [r7 + 24*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 32*SIZEOF_PIXEL] + lea r2, [r7 + 32*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 40*SIZEOF_PIXEL] + lea r2, [r7 + 40*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 48*SIZEOF_PIXEL] + lea r2, [r7 + 48*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 56*SIZEOF_PIXEL] + lea r2, [r7 + 56*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + pxor m7, m7 + movhlps m7, m6 + paddd m6, m7 + pshufd m7, m6, 1 + paddd m6, m7 + movd eax, m6 + RET +%else +cglobal pixel_satd_64x32, 4,7,8,0-gprsize ;if !WIN64 + SATD_START_SSE2 m6, m7 + mov r6, r0 + mov [rsp], r2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 8*SIZEOF_PIXEL] + mov 
r2, [rsp] + add r2, 8*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 16*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 16*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 24*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 24*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 32*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 32*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 40*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 40*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 48*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 48*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 56*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 56*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + pxor m7, m7 + movhlps m7, m6 + paddd m6, m7 + pshufd m7, m6, 1 + paddd m6, m7 + movd eax, m6 + RET +%endif + +%if WIN64 +cglobal pixel_satd_64x48, 4,8,14 ;if WIN64 && cpuflag(avx) + SATD_START_SSE2 m6, m7 + mov r6, r0 + mov r7, r2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 8*SIZEOF_PIXEL] + lea r2, [r7 + 8*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + 
call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 16*SIZEOF_PIXEL] + lea r2, [r7 + 16*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 24*SIZEOF_PIXEL] + lea r2, [r7 + 24*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 32*SIZEOF_PIXEL] + lea r2, [r7 + 32*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 40*SIZEOF_PIXEL] + lea r2, [r7 + 40*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 48*SIZEOF_PIXEL] + lea r2, [r7 + 48*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 56*SIZEOF_PIXEL] + lea r2, [r7 + 56*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + pxor m8, m8 + movhlps m8, m6 + paddd m6, m8 + pshufd m8, m6, 1 + paddd m6, m8 + movd eax, m6 + RET +%else +cglobal pixel_satd_64x48, 4,7,8,0-gprsize ;if !WIN64 + SATD_START_SSE2 m6, m7 + mov r6, r0 + mov [rsp], r2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call 
pixel_satd_8x8_internal2 + lea r0, [r6 + 8*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 8*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 16*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 16*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 24*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 24*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 32*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 32*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 40*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 40*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 48*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 48*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 56*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 56*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + pxor m7, m7 + movhlps m7, m6 + paddd m6, m7 + pshufd m7, m6, 1 + paddd m6, m7 + movd eax, m6 + RET 
+%endif + +%if WIN64 +cglobal pixel_satd_64x64, 4,8,14 ;if WIN64 && cpuflag(avx) + SATD_START_SSE2 m6, m7 + mov r6, r0 + mov r7, r2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 8*SIZEOF_PIXEL] + lea r2, [r7 + 8*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 16*SIZEOF_PIXEL] + lea r2, [r7 + 16*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 24*SIZEOF_PIXEL] + lea r2, [r7 + 24*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 32*SIZEOF_PIXEL] + lea r2, [r7 + 32*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 40*SIZEOF_PIXEL] + lea r2, [r7 + 40*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + 
lea r0, [r6 + 48*SIZEOF_PIXEL] + lea r2, [r7 + 48*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 56*SIZEOF_PIXEL] + lea r2, [r7 + 56*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + pxor m8, m8 + movhlps m8, m6 + paddd m6, m8 + pshufd m8, m6, 1 + paddd m6, m8 + movd eax, m6 + RET +%else +cglobal pixel_satd_64x64, 4,7,8,0-gprsize ;if !WIN64 + SATD_START_SSE2 m6, m7 + mov r6, r0 + mov [rsp], r2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 8*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 8*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 16*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 16*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 24*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 24*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call 
pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 32*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 32*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 40*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 40*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 48*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 48*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 56*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 56*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + pxor m7, m7 + movhlps m7, m6 + paddd m6, m7 + pshufd m7, m6, 1 + paddd m6, m7 + movd eax, m6 + RET +%endif + +%if WIN64 +cglobal pixel_satd_16x4, 4,6,14 +%else +cglobal pixel_satd_16x4, 4,6,8 +%endif + SATD_START_SSE2 m6, m7 + BACKUP_POINTERS + call %%pixel_satd_8x4_internal2 + RESTORE_AND_INC_POINTERS + call %%pixel_satd_8x4_internal2 + pxor m7, m7 + movhlps m7, m6 + paddd m6, m7 + pshufd m7, m6, 1 + paddd m6, m7 + movd eax, m6 + RET + +%if WIN64 +cglobal pixel_satd_16x8, 4,6,14 +%else +cglobal pixel_satd_16x8, 4,6,8 +%endif + SATD_START_SSE2 
m6, m7 + BACKUP_POINTERS + call pixel_satd_8x8_internal2 + RESTORE_AND_INC_POINTERS + call pixel_satd_8x8_internal2 + pxor m7, m7 + movhlps m7, m6 + paddd m6, m7 + pshufd m7, m6, 1 + paddd m6, m7 + movd eax, m6 + RET + +%if WIN64 +cglobal pixel_satd_16x12, 4,6,14 +%else +cglobal pixel_satd_16x12, 4,6,8 +%endif + SATD_START_SSE2 m6, m7, 1 + BACKUP_POINTERS + call pixel_satd_8x8_internal2 + call %%pixel_satd_8x4_internal2 + RESTORE_AND_INC_POINTERS + call pixel_satd_8x8_internal2 + call %%pixel_satd_8x4_internal2 + pxor m7, m7 + movhlps m7, m6 + paddd m6, m7 + pshufd m7, m6, 1 + paddd m6, m7 + movd eax, m6 + RET + +%if WIN64 +cglobal pixel_satd_16x16, 4,6,14 +%else +cglobal pixel_satd_16x16, 4,6,8 +%endif + SATD_START_SSE2 m6, m7, 1 + BACKUP_POINTERS + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + RESTORE_AND_INC_POINTERS + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + pxor m7, m7 + movhlps m7, m6 + paddd m6, m7 + pshufd m7, m6, 1 + paddd m6, m7 + movd eax, m6 + RET + +%if WIN64 +cglobal pixel_satd_16x32, 4,6,14 +%else +cglobal pixel_satd_16x32, 4,6,8 +%endif + SATD_START_SSE2 m6, m7, 1 + BACKUP_POINTERS + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + RESTORE_AND_INC_POINTERS + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + pxor m7, m7 + movhlps m7, m6 + paddd m6, m7 + pshufd m7, m6, 1 + paddd m6, m7 + movd eax, m6 + RET + +%if WIN64 +cglobal pixel_satd_16x64, 4,6,14 +%else +cglobal pixel_satd_16x64, 4,6,8 +%endif + SATD_START_SSE2 m6, m7, 1 + BACKUP_POINTERS + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + RESTORE_AND_INC_POINTERS + call pixel_satd_8x8_internal2 + 
call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + pxor m7, m7 + movhlps m7, m6 + paddd m6, m7 + pshufd m7, m6, 1 + paddd m6, m7 + movd eax, m6 + RET +%endif + +%if HIGH_BIT_DEPTH +%if WIN64 +cglobal pixel_satd_12x16, 4,8,8 + SATD_START_MMX + mov r6, r0 + mov r7, r2 + pxor m7, m7 + SATD_4x8_SSE vertical, 0, 4, 5 + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, 4, 5 + lea r0, [r6 + 4*SIZEOF_PIXEL] + lea r2, [r7 + 4*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, 4, 5 + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, 4, 5 + lea r0, [r6 + 8*SIZEOF_PIXEL] + lea r2, [r7 + 8*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, 4, 5 + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, 4, 5 + pxor m1, m1 + movhlps m1, m7 + paddd m7, m1 + pshufd m1, m7, 1 + paddd m7, m1 + movd eax, m7 + RET +%else +cglobal pixel_satd_12x16, 4,7,8,0-gprsize + SATD_START_MMX + mov r6, r0 + mov [rsp], r2 + pxor m7, m7 + SATD_4x8_SSE vertical, 0, 4, 5 + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, 4, 5 + lea r0, [r6 + 4*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 4*SIZEOF_PIXEL + SATD_4x8_SSE vertical, 1, 4, 5 + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, 4, 5 + lea r0, [r6 + 8*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 8*SIZEOF_PIXEL + SATD_4x8_SSE vertical, 1, 4, 5 + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, 4, 5 + pxor m1, m1 + movhlps m1, m7 + paddd m7, m1 + pshufd m1, m7, 1 + paddd m7, m1 + movd eax, m7 + RET +%endif +%else ;HIGH_BIT_DEPTH +%if WIN64 +cglobal pixel_satd_12x16, 4,8,8 + SATD_START_MMX + mov r6, r0 + mov r7, r2 +%if vertical==0 + mova 
m7, [hmul_4p] +%endif + SATD_4x8_SSE vertical, 0, swap + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, add + lea r0, [r6 + 4*SIZEOF_PIXEL] + lea r2, [r7 + 4*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, add + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, add + lea r0, [r6 + 8*SIZEOF_PIXEL] + lea r2, [r7 + 8*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, add + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, add + HADDW m7, m1 + movd eax, m7 + RET +%else +cglobal pixel_satd_12x16, 4,7,8,0-gprsize + SATD_START_MMX + mov r6, r0 + mov [rsp], r2 +%if vertical==0 + mova m7, [hmul_4p] +%endif + SATD_4x8_SSE vertical, 0, swap + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, add + lea r0, [r6 + 4*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 4*SIZEOF_PIXEL + SATD_4x8_SSE vertical, 1, add + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, add + lea r0, [r6 + 8*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 8*SIZEOF_PIXEL + SATD_4x8_SSE vertical, 1, add + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, add + HADDW m7, m1 + movd eax, m7 + RET +%endif +%endif + +%if WIN64 +cglobal pixel_satd_24x32, 4,8,14 + SATD_START_SSE2 m6, m7 + mov r6, r0 + mov r7, r2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 8*SIZEOF_PIXEL] + lea r2, [r7 + 8*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 16*SIZEOF_PIXEL] + lea r2, [r7 + 16*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + pxor m7, m7 + movhlps m7, m6 + paddd m6, m7 + pshufd 
m7, m6, 1 + paddd m6, m7 + movd eax, m6 + RET +%else +cglobal pixel_satd_24x32, 4,7,8,0-gprsize + SATD_START_SSE2 m6, m7 + mov r6, r0 + mov [rsp], r2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 8*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 8*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 16*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 16*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + pxor m7, m7 + movhlps m7, m6 + paddd m6, m7 + pshufd m7, m6, 1 + paddd m6, m7 + movd eax, m6 + RET +%endif ;WIN64 + +%if WIN64 +cglobal pixel_satd_8x32, 4,6,14 +%else +cglobal pixel_satd_8x32, 4,6,8 +%endif + SATD_START_SSE2 m6, m7 +%if vertical + mova m7, [pw_00ff] +%endif + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + pxor m7, m7 + movhlps m7, m6 + paddd m6, m7 + pshufd m7, m6, 1 + paddd m6, m7 + movd eax, m6 + RET + +%if WIN64 +cglobal pixel_satd_8x16, 4,6,14 +%else +cglobal pixel_satd_8x16, 4,6,8 +%endif + SATD_START_SSE2 m6, m7 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + pxor m7, m7 + movhlps m7, m6 + paddd m6, m7 + pshufd m7, m6, 1 + paddd m6, m7 + movd eax, m6 + RET + +cglobal pixel_satd_8x8, 4,6,8 + SATD_START_SSE2 m6, m7 + call pixel_satd_8x8_internal + SATD_END_SSE2 m6 + +%if WIN64 +cglobal pixel_satd_8x4, 4,6,14 +%else +cglobal pixel_satd_8x4, 4,6,8 +%endif + SATD_START_SSE2 m6, m7 + call %%pixel_satd_8x4_internal2 + SATD_END_SSE2 m6 +%endmacro ; SATDS_SSE2 + + +;============================================================================= +; SA8D +;============================================================================= + +%macro SA8D_INTER 0 +%if ARCH_X86_64 + %define lh 
m10 + %define rh m0 +%else + %define lh m0 + %define rh [esp+48] +%endif +%if HIGH_BIT_DEPTH + HADDUW m0, m1 + paddd lh, rh +%else + paddusw lh, rh +%endif ; HIGH_BIT_DEPTH +%endmacro + +%macro SA8D_8x8 0 + call pixel_sa8d_8x8_internal +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%else + HADDW m0, m1 +%endif ; HIGH_BIT_DEPTH + paddd m0, [pd_1] + psrld m0, 1 + paddd m12, m0 +%endmacro + +%macro SA8D_16x16 0 + call pixel_sa8d_8x8_internal ; pix[0] + add r2, 8*SIZEOF_PIXEL + add r0, 8*SIZEOF_PIXEL +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova m10, m0 + call pixel_sa8d_8x8_internal ; pix[8] + lea r2, [r2+8*r3] + lea r0, [r0+8*r1] + SA8D_INTER + call pixel_sa8d_8x8_internal ; pix[8*stride+8] + sub r2, 8*SIZEOF_PIXEL + sub r0, 8*SIZEOF_PIXEL + SA8D_INTER + call pixel_sa8d_8x8_internal ; pix[8*stride] + SA8D_INTER + SWAP 0, 10 +%if HIGH_BIT_DEPTH == 0 + HADDUW m0, m1 +%endif + paddd m0, [pd_1] + psrld m0, 1 + paddd m12, m0 +%endmacro + +%macro AVG_16x16 0 + SA8D_INTER +%if HIGH_BIT_DEPTH == 0 + HADDUW m0, m1 +%endif + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + add r4d, dword [esp+36] + mov dword [esp+36], r4d +%endmacro + +%macro SA8D 0 +; sse2 doesn't seem to like the horizontal way of doing things +%define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH) + +%if ARCH_X86_64 +;----------------------------------------------------------------------------- +; int pixel_sa8d_8x8( uint8_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +cglobal pixel_sa8d_8x8_internal + lea r6, [r0+4*r1] + lea r7, [r2+4*r3] + LOAD_SUMSUB_8x4P 0, 1, 2, 8, 5, 6, 7, r0, r2 + LOAD_SUMSUB_8x4P 4, 5, 3, 9, 11, 6, 7, r6, r7 +%if vertical + HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax +%else ; non-sse2 + HADAMARD8_2D_HMUL 0, 1, 2, 8, 4, 5, 3, 9, 6, 11 +%endif + paddw m0, m1 + paddw m0, m2 + paddw m0, m8 + SAVE_MM_PERMUTATION + ret + +cglobal pixel_sa8d_8x8, 4,8,12 + FIX_STRIDES r1, r3 + lea r4, [3*r1] + lea r5, [3*r3] +%if 
vertical == 0 + mova m7, [hmul_8p] +%endif + call pixel_sa8d_8x8_internal +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%else + HADDW m0, m1 +%endif ; HIGH_BIT_DEPTH + movd eax, m0 + add eax, 1 + shr eax, 1 + RET + +cglobal pixel_sa8d_16x16, 4,8,12 + FIX_STRIDES r1, r3 + lea r4, [3*r1] + lea r5, [3*r3] +%if vertical == 0 + mova m7, [hmul_8p] +%endif + call pixel_sa8d_8x8_internal ; pix[0] + add r2, 8*SIZEOF_PIXEL + add r0, 8*SIZEOF_PIXEL +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova m10, m0 + call pixel_sa8d_8x8_internal ; pix[8] + lea r2, [r2+8*r3] + lea r0, [r0+8*r1] + SA8D_INTER + call pixel_sa8d_8x8_internal ; pix[8*stride+8] + sub r2, 8*SIZEOF_PIXEL + sub r0, 8*SIZEOF_PIXEL + SA8D_INTER + call pixel_sa8d_8x8_internal ; pix[8*stride] + SA8D_INTER + SWAP 0, 10 +%if HIGH_BIT_DEPTH == 0 + HADDUW m0, m1 +%endif + movd eax, m0 + add eax, 1 + shr eax, 1 + RET + +cglobal pixel_sa8d_8x16, 4,8,13 + FIX_STRIDES r1, r3 + lea r4, [3*r1] + lea r5, [3*r3] + pxor m12, m12 +%if vertical == 0 + mova m7, [hmul_8p] +%endif + SA8D_8x8 + lea r0, [r0 + 8*r1] + lea r2, [r2 + 8*r3] + SA8D_8x8 + movd eax, m12 + RET + +cglobal pixel_sa8d_8x32, 4,8,13 + FIX_STRIDES r1, r3 + lea r4, [3*r1] + lea r5, [3*r3] + pxor m12, m12 +%if vertical == 0 + mova m7, [hmul_8p] +%endif + SA8D_8x8 + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + SA8D_8x8 + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + SA8D_8x8 + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + SA8D_8x8 + movd eax, m12 + RET + +cglobal pixel_sa8d_16x8, 4,8,13 + FIX_STRIDES r1, r3 + lea r4, [3*r1] + lea r5, [3*r3] + pxor m12, m12 +%if vertical == 0 + mova m7, [hmul_8p] +%endif + SA8D_8x8 + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL + SA8D_8x8 + movd eax, m12 + RET + +cglobal pixel_sa8d_16x32, 4,8,13 + FIX_STRIDES r1, r3 + lea r4, [3*r1] + lea r5, [3*r3] + pxor m12, m12 +%if vertical == 0 + mova m7, [hmul_8p] +%endif + SA8D_16x16 + lea r0, [r0+8*r1] + lea r2, [r2+8*r3] + SA8D_16x16 + movd eax, m12 + RET + +cglobal pixel_sa8d_16x64, 4,8,13 + 
FIX_STRIDES r1, r3 + lea r4, [3*r1] + lea r5, [3*r3] + pxor m12, m12 +%if vertical == 0 + mova m7, [hmul_8p] +%endif + SA8D_16x16 + lea r0, [r0+8*r1] + lea r2, [r2+8*r3] + SA8D_16x16 + lea r0, [r0+8*r1] + lea r2, [r2+8*r3] + SA8D_16x16 + lea r0, [r0+8*r1] + lea r2, [r2+8*r3] + SA8D_16x16 + movd eax, m12 + RET + +cglobal pixel_sa8d_24x32, 4,8,13 + FIX_STRIDES r1, r3 + lea r4, [3*r1] + lea r5, [3*r3] + pxor m12, m12 +%if vertical == 0 + mova m7, [hmul_8p] +%endif + SA8D_8x8 + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL + SA8D_8x8 + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL + SA8D_8x8 + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + SA8D_8x8 + sub r0, 8*SIZEOF_PIXEL + sub r2, 8*SIZEOF_PIXEL + SA8D_8x8 + sub r0, 8*SIZEOF_PIXEL + sub r2, 8*SIZEOF_PIXEL + SA8D_8x8 + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + SA8D_8x8 + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL + SA8D_8x8 + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL + SA8D_8x8 + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + SA8D_8x8 + sub r0, 8*SIZEOF_PIXEL + sub r2, 8*SIZEOF_PIXEL + SA8D_8x8 + sub r0, 8*SIZEOF_PIXEL + sub r2, 8*SIZEOF_PIXEL + SA8D_8x8 + movd eax, m12 + RET + +cglobal pixel_sa8d_32x8, 4,8,13 + FIX_STRIDES r1, r3 + lea r4, [3*r1] + lea r5, [3*r3] + pxor m12, m12 +%if vertical == 0 + mova m7, [hmul_8p] +%endif + SA8D_8x8 + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL + SA8D_8x8 + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL + SA8D_8x8 + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL + SA8D_8x8 + movd eax, m12 + RET + +cglobal pixel_sa8d_32x16, 4,8,13 + FIX_STRIDES r1, r3 + lea r4, [3*r1] + lea r5, [3*r3] + pxor m12, m12 +%if vertical == 0 + mova m7, [hmul_8p] +%endif + SA8D_16x16 + lea r4, [8*r1] + lea r5, [8*r3] + sub r0, r4 + sub r2, r5 + add r2, 16*SIZEOF_PIXEL + add r0, 16*SIZEOF_PIXEL + lea r4, [3*r1] + lea r5, [3*r3] + SA8D_16x16 + movd eax, m12 + RET + +cglobal pixel_sa8d_32x24, 4,8,13 + FIX_STRIDES r1, r3 + lea r4, [3*r1] + lea r5, [3*r3] + pxor m12, m12 +%if vertical == 0 + 
mova m7, [hmul_8p] +%endif + SA8D_8x8 + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL + SA8D_8x8 + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL + SA8D_8x8 + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL + SA8D_8x8 + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + SA8D_8x8 + sub r0, 8*SIZEOF_PIXEL + sub r2, 8*SIZEOF_PIXEL + SA8D_8x8 + sub r0, 8*SIZEOF_PIXEL + sub r2, 8*SIZEOF_PIXEL + SA8D_8x8 + sub r0, 8*SIZEOF_PIXEL + sub r2, 8*SIZEOF_PIXEL + SA8D_8x8 + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + SA8D_8x8 + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL + SA8D_8x8 + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL + SA8D_8x8 + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL + SA8D_8x8 + movd eax, m12 + RET + +cglobal pixel_sa8d_32x32, 4,8,13 + FIX_STRIDES r1, r3 + lea r4, [3*r1] + lea r5, [3*r3] + pxor m12, m12 +%if vertical == 0 + mova m7, [hmul_8p] +%endif + SA8D_16x16 + lea r4, [8*r1] + lea r5, [8*r3] + sub r0, r4 + sub r2, r5 + add r2, 16*SIZEOF_PIXEL + add r0, 16*SIZEOF_PIXEL + lea r4, [3*r1] + lea r5, [3*r3] + SA8D_16x16 + lea r0, [r0+8*r1] + lea r2, [r2+8*r3] + SA8D_16x16 + lea r4, [8*r1] + lea r5, [8*r3] + sub r0, r4 + sub r2, r5 + sub r2, 16*SIZEOF_PIXEL + sub r0, 16*SIZEOF_PIXEL + lea r4, [3*r1] + lea r5, [3*r3] + SA8D_16x16 + movd eax, m12 + RET + +cglobal pixel_sa8d_32x64, 4,8,13 + FIX_STRIDES r1, r3 + lea r4, [3*r1] + lea r5, [3*r3] + pxor m12, m12 +%if vertical == 0 + mova m7, [hmul_8p] +%endif + SA8D_16x16 + lea r4, [8*r1] + lea r5, [8*r3] + sub r0, r4 + sub r2, r5 + add r2, 16*SIZEOF_PIXEL + add r0, 16*SIZEOF_PIXEL + lea r4, [3*r1] + lea r5, [3*r3] + SA8D_16x16 + lea r0, [r0+8*r1] + lea r2, [r2+8*r3] + SA8D_16x16 + lea r4, [8*r1] + lea r5, [8*r3] + sub r0, r4 + sub r2, r5 + sub r2, 16*SIZEOF_PIXEL + sub r0, 16*SIZEOF_PIXEL + lea r4, [3*r1] + lea r5, [3*r3] + SA8D_16x16 + lea r0, [r0+8*r1] + lea r2, [r2+8*r3] + SA8D_16x16 + lea r4, [8*r1] + lea r5, [8*r3] + sub r0, r4 + sub r2, r5 + add r2, 16*SIZEOF_PIXEL + add r0, 16*SIZEOF_PIXEL + lea r4, 
[3*r1] + lea r5, [3*r3] + SA8D_16x16 + lea r0, [r0+8*r1] + lea r2, [r2+8*r3] + SA8D_16x16 + lea r4, [8*r1] + lea r5, [8*r3] + sub r0, r4 + sub r2, r5 + sub r2, 16*SIZEOF_PIXEL + sub r0, 16*SIZEOF_PIXEL + lea r4, [3*r1] + lea r5, [3*r3] + SA8D_16x16 + movd eax, m12 + RET + +cglobal pixel_sa8d_48x64, 4,8,13 + FIX_STRIDES r1, r3 + lea r4, [3*r1] + lea r5, [3*r3] + pxor m12, m12 +%if vertical == 0 + mova m7, [hmul_8p] +%endif + SA8D_16x16 + lea r4, [8*r1] + lea r5, [8*r3] + sub r0, r4 + sub r2, r5 + add r2, 16*SIZEOF_PIXEL + add r0, 16*SIZEOF_PIXEL + lea r4, [3*r1] + lea r5, [3*r3] + SA8D_16x16 + lea r4, [8*r1] + lea r5, [8*r3] + sub r0, r4 + sub r2, r5 + add r2, 16*SIZEOF_PIXEL + add r0, 16*SIZEOF_PIXEL + lea r4, [3*r1] + lea r5, [3*r3] + SA8D_16x16 + lea r0, [r0+8*r1] + lea r2, [r2+8*r3] + SA8D_16x16 + lea r4, [8*r1] + lea r5, [8*r3] + sub r0, r4 + sub r2, r5 + sub r2, 16*SIZEOF_PIXEL + sub r0, 16*SIZEOF_PIXEL + lea r4, [3*r1] + lea r5, [3*r3] + SA8D_16x16 + lea r4, [8*r1] + lea r5, [8*r3] + sub r0, r4 + sub r2, r5 + sub r2, 16*SIZEOF_PIXEL + sub r0, 16*SIZEOF_PIXEL + lea r4, [3*r1] + lea r5, [3*r3] + SA8D_16x16 + lea r0, [r0+8*r1] + lea r2, [r2+8*r3] + SA8D_16x16 + lea r4, [8*r1] + lea r5, [8*r3] + sub r0, r4 + sub r2, r5 + add r2, 16*SIZEOF_PIXEL + add r0, 16*SIZEOF_PIXEL + lea r4, [3*r1] + lea r5, [3*r3] + SA8D_16x16 + lea r4, [8*r1] + lea r5, [8*r3] + sub r0, r4 + sub r2, r5 + add r2, 16*SIZEOF_PIXEL + add r0, 16*SIZEOF_PIXEL + lea r4, [3*r1] + lea r5, [3*r3] + SA8D_16x16 + lea r0, [r0+8*r1] + lea r2, [r2+8*r3] + SA8D_16x16 + lea r4, [8*r1] + lea r5, [8*r3] + sub r0, r4 + sub r2, r5 + sub r2, 16*SIZEOF_PIXEL + sub r0, 16*SIZEOF_PIXEL + lea r4, [3*r1] + lea r5, [3*r3] + SA8D_16x16 + lea r4, [8*r1] + lea r5, [8*r3] + sub r0, r4 + sub r2, r5 + sub r2, 16*SIZEOF_PIXEL + sub r0, 16*SIZEOF_PIXEL + lea r4, [3*r1] + lea r5, [3*r3] + SA8D_16x16 + movd eax, m12 + RET + +cglobal pixel_sa8d_64x16, 4,8,13 + FIX_STRIDES r1, r3 + lea r4, [3*r1] + lea r5, [3*r3] + pxor m12, m12 
+%if vertical == 0 + mova m7, [hmul_8p] +%endif + SA8D_16x16 + lea r4, [8*r1] + lea r5, [8*r3] + sub r0, r4 + sub r2, r5 + add r2, 16*SIZEOF_PIXEL + add r0, 16*SIZEOF_PIXEL + lea r4, [3*r1] + lea r5, [3*r3] + SA8D_16x16 + lea r4, [8*r1] + lea r5, [8*r3] + sub r0, r4 + sub r2, r5 + add r2, 16*SIZEOF_PIXEL + add r0, 16*SIZEOF_PIXEL + lea r4, [3*r1] + lea r5, [3*r3] + SA8D_16x16 + lea r4, [8*r1] + lea r5, [8*r3] + sub r0, r4 + sub r2, r5 + add r2, 16*SIZEOF_PIXEL + add r0, 16*SIZEOF_PIXEL + lea r4, [3*r1] + lea r5, [3*r3] + SA8D_16x16 + movd eax, m12 + RET + +cglobal pixel_sa8d_64x32, 4,8,13 + FIX_STRIDES r1, r3 + lea r4, [3*r1] + lea r5, [3*r3] + pxor m12, m12 +%if vertical == 0 + mova m7, [hmul_8p] +%endif + SA8D_16x16 + lea r4, [8*r1] + lea r5, [8*r3] + sub r0, r4 + sub r2, r5 + add r2, 16*SIZEOF_PIXEL + add r0, 16*SIZEOF_PIXEL + lea r4, [3*r1] + lea r5, [3*r3] + SA8D_16x16 + lea r4, [8*r1] + lea r5, [8*r3] + sub r0, r4 + sub r2, r5 + add r2, 16*SIZEOF_PIXEL + add r0, 16*SIZEOF_PIXEL + lea r4, [3*r1] + lea r5, [3*r3] + SA8D_16x16 + lea r4, [8*r1] + lea r5, [8*r3] + sub r0, r4 + sub r2, r5 + add r2, 16*SIZEOF_PIXEL + add r0, 16*SIZEOF_PIXEL + lea r4, [3*r1] + lea r5, [3*r3] + SA8D_16x16 + lea r0, [r0+8*r1] + lea r2, [r2+8*r3] + SA8D_16x16 + lea r4, [8*r1] + lea r5, [8*r3] + sub r0, r4 + sub r2, r5 + sub r2, 16*SIZEOF_PIXEL + sub r0, 16*SIZEOF_PIXEL + lea r4, [3*r1] + lea r5, [3*r3] + SA8D_16x16 + lea r4, [8*r1] + lea r5, [8*r3] + sub r0, r4 + sub r2, r5 + sub r2, 16*SIZEOF_PIXEL + sub r0, 16*SIZEOF_PIXEL + lea r4, [3*r1] + lea r5, [3*r3] + SA8D_16x16 + lea r4, [8*r1] + lea r5, [8*r3] + sub r0, r4 + sub r2, r5 + sub r2, 16*SIZEOF_PIXEL + sub r0, 16*SIZEOF_PIXEL + lea r4, [3*r1] + lea r5, [3*r3] + SA8D_16x16 + movd eax, m12 + RET + +cglobal pixel_sa8d_64x48, 4,8,13 + FIX_STRIDES r1, r3 + lea r4, [3*r1] + lea r5, [3*r3] + pxor m12, m12 +%if vertical == 0 + mova m7, [hmul_8p] +%endif + SA8D_16x16 + lea r4, [8*r1] + lea r5, [8*r3] + sub r0, r4 + sub r2, r5 + add r2, 
16*SIZEOF_PIXEL + add r0, 16*SIZEOF_PIXEL + lea r4, [3*r1] + lea r5, [3*r3] + SA8D_16x16 + lea r4, [8*r1] + lea r5, [8*r3] + sub r0, r4 + sub r2, r5 + add r2, 16*SIZEOF_PIXEL + add r0, 16*SIZEOF_PIXEL + lea r4, [3*r1] + lea r5, [3*r3] + SA8D_16x16 + lea r4, [8*r1] + lea r5, [8*r3] + sub r0, r4 + sub r2, r5 + add r2, 16*SIZEOF_PIXEL + add r0, 16*SIZEOF_PIXEL + lea r4, [3*r1] + lea r5, [3*r3] + SA8D_16x16 + lea r0, [r0+8*r1] + lea r2, [r2+8*r3] + SA8D_16x16 + lea r4, [8*r1] + lea r5, [8*r3] + sub r0, r4 + sub r2, r5 + sub r2, 16*SIZEOF_PIXEL + sub r0, 16*SIZEOF_PIXEL + lea r4, [3*r1] + lea r5, [3*r3] + SA8D_16x16 + lea r4, [8*r1] + lea r5, [8*r3] + sub r0, r4 + sub r2, r5 + sub r2, 16*SIZEOF_PIXEL + sub r0, 16*SIZEOF_PIXEL + lea r4, [3*r1] + lea r5, [3*r3] + SA8D_16x16 + lea r4, [8*r1] + lea r5, [8*r3] + sub r0, r4 + sub r2, r5 + sub r2, 16*SIZEOF_PIXEL + sub r0, 16*SIZEOF_PIXEL + lea r4, [3*r1] + lea r5, [3*r3] + SA8D_16x16 + lea r0, [r0+8*r1] + lea r2, [r2+8*r3] + SA8D_16x16 + lea r4, [8*r1] + lea r5, [8*r3] + sub r0, r4 + sub r2, r5 + add r2, 16*SIZEOF_PIXEL + add r0, 16*SIZEOF_PIXEL + lea r4, [3*r1] + lea r5, [3*r3] + SA8D_16x16 + lea r4, [8*r1] + lea r5, [8*r3] + sub r0, r4 + sub r2, r5 + add r2, 16*SIZEOF_PIXEL + add r0, 16*SIZEOF_PIXEL + lea r4, [3*r1] + lea r5, [3*r3] + SA8D_16x16 + lea r4, [8*r1] + lea r5, [8*r3] + sub r0, r4 + sub r2, r5 + add r2, 16*SIZEOF_PIXEL + add r0, 16*SIZEOF_PIXEL + lea r4, [3*r1] + lea r5, [3*r3] + SA8D_16x16 + movd eax, m12 + RET + +cglobal pixel_sa8d_64x64, 4,8,13 + FIX_STRIDES r1, r3 + lea r4, [3*r1] + lea r5, [3*r3] + pxor m12, m12 +%if vertical == 0 + mova m7, [hmul_8p] +%endif + SA8D_16x16 + lea r4, [8*r1] + lea r5, [8*r3] + sub r0, r4 + sub r2, r5 + add r2, 16*SIZEOF_PIXEL + add r0, 16*SIZEOF_PIXEL + lea r4, [3*r1] + lea r5, [3*r3] + SA8D_16x16 + lea r4, [8*r1] + lea r5, [8*r3] + sub r0, r4 + sub r2, r5 + add r2, 16*SIZEOF_PIXEL + add r0, 16*SIZEOF_PIXEL + lea r4, [3*r1] + lea r5, [3*r3] + SA8D_16x16 + lea r4, [8*r1] + lea 
r5, [8*r3] + sub r0, r4 + sub r2, r5 + add r2, 16*SIZEOF_PIXEL + add r0, 16*SIZEOF_PIXEL + lea r4, [3*r1] + lea r5, [3*r3] + SA8D_16x16 + lea r0, [r0+8*r1] + lea r2, [r2+8*r3] + SA8D_16x16 + lea r4, [8*r1] + lea r5, [8*r3] + sub r0, r4 + sub r2, r5 + sub r2, 16*SIZEOF_PIXEL + sub r0, 16*SIZEOF_PIXEL + lea r4, [3*r1] + lea r5, [3*r3] + SA8D_16x16 + lea r4, [8*r1] + lea r5, [8*r3] + sub r0, r4 + sub r2, r5 + sub r2, 16*SIZEOF_PIXEL + sub r0, 16*SIZEOF_PIXEL + lea r4, [3*r1] + lea r5, [3*r3] + SA8D_16x16 + lea r4, [8*r1] + lea r5, [8*r3] + sub r0, r4 + sub r2, r5 + sub r2, 16*SIZEOF_PIXEL + sub r0, 16*SIZEOF_PIXEL + lea r4, [3*r1] + lea r5, [3*r3] + SA8D_16x16 + lea r0, [r0+8*r1] + lea r2, [r2+8*r3] + SA8D_16x16 + lea r4, [8*r1] + lea r5, [8*r3] + sub r0, r4 + sub r2, r5 + add r2, 16*SIZEOF_PIXEL + add r0, 16*SIZEOF_PIXEL + lea r4, [3*r1] + lea r5, [3*r3] + SA8D_16x16 + lea r4, [8*r1] + lea r5, [8*r3] + sub r0, r4 + sub r2, r5 + add r2, 16*SIZEOF_PIXEL + add r0, 16*SIZEOF_PIXEL + lea r4, [3*r1] + lea r5, [3*r3] + SA8D_16x16 + lea r4, [8*r1] + lea r5, [8*r3] + sub r0, r4 + sub r2, r5 + add r2, 16*SIZEOF_PIXEL + add r0, 16*SIZEOF_PIXEL + lea r4, [3*r1] + lea r5, [3*r3] + SA8D_16x16 + lea r0, [r0+8*r1] + lea r2, [r2+8*r3] + SA8D_16x16 + lea r4, [8*r1] + lea r5, [8*r3] + sub r0, r4 + sub r2, r5 + sub r2, 16*SIZEOF_PIXEL + sub r0, 16*SIZEOF_PIXEL + lea r4, [3*r1] + lea r5, [3*r3] + SA8D_16x16 + lea r4, [8*r1] + lea r5, [8*r3] + sub r0, r4 + sub r2, r5 + sub r2, 16*SIZEOF_PIXEL + sub r0, 16*SIZEOF_PIXEL + lea r4, [3*r1] + lea r5, [3*r3] + SA8D_16x16 + lea r4, [8*r1] + lea r5, [8*r3] + sub r0, r4 + sub r2, r5 + sub r2, 16*SIZEOF_PIXEL + sub r0, 16*SIZEOF_PIXEL + lea r4, [3*r1] + lea r5, [3*r3] + SA8D_16x16 + movd eax, m12 + RET + +%else ; ARCH_X86_32 +%if mmsize == 16 +cglobal pixel_sa8d_8x8_internal + %define spill0 [esp+4] + %define spill1 [esp+20] + %define spill2 [esp+36] +%if vertical + LOAD_DIFF_8x4P 0, 1, 2, 3, 4, 5, 6, r0, r2, 1 + HADAMARD4_2D 0, 1, 2, 3, 4 + movdqa 
spill0, m3 + LOAD_DIFF_8x4P 4, 5, 6, 7, 3, 3, 2, r0, r2, 1 + HADAMARD4_2D 4, 5, 6, 7, 3 + HADAMARD2_2D 0, 4, 1, 5, 3, qdq, amax + movdqa m3, spill0 + paddw m0, m1 + HADAMARD2_2D 2, 6, 3, 7, 5, qdq, amax +%else ; mmsize == 8 + mova m7, [hmul_8p] + LOAD_SUMSUB_8x4P 0, 1, 2, 3, 5, 6, 7, r0, r2, 1 + ; could do first HADAMARD4_V here to save spilling later + ; surprisingly, not a win on conroe or even p4 + mova spill0, m2 + mova spill1, m3 + mova spill2, m1 + SWAP 1, 7 + LOAD_SUMSUB_8x4P 4, 5, 6, 7, 2, 3, 1, r0, r2, 1 + HADAMARD4_V 4, 5, 6, 7, 3 + mova m1, spill2 + mova m2, spill0 + mova m3, spill1 + mova spill0, m6 + mova spill1, m7 + HADAMARD4_V 0, 1, 2, 3, 7 + SUMSUB_BADC w, 0, 4, 1, 5, 7 + HADAMARD 2, sumsub, 0, 4, 7, 6 + HADAMARD 2, sumsub, 1, 5, 7, 6 + HADAMARD 1, amax, 0, 4, 7, 6 + HADAMARD 1, amax, 1, 5, 7, 6 + mova m6, spill0 + mova m7, spill1 + paddw m0, m1 + SUMSUB_BADC w, 2, 6, 3, 7, 4 + HADAMARD 2, sumsub, 2, 6, 4, 5 + HADAMARD 2, sumsub, 3, 7, 4, 5 + HADAMARD 1, amax, 2, 6, 4, 5 + HADAMARD 1, amax, 3, 7, 4, 5 +%endif ; sse2/non-sse2 + paddw m0, m2 + paddw m0, m3 + SAVE_MM_PERMUTATION + ret +%endif ; ifndef mmx2 + +cglobal pixel_sa8d_8x8_internal2 + %define spill0 [esp+4] + LOAD_DIFF_8x4P 0, 1, 2, 3, 4, 5, 6, r0, r2, 1 + HADAMARD4_2D 0, 1, 2, 3, 4 + movdqa spill0, m3 + LOAD_DIFF_8x4P 4, 5, 6, 7, 3, 3, 2, r0, r2, 1 + HADAMARD4_2D 4, 5, 6, 7, 3 + HADAMARD2_2D 0, 4, 1, 5, 3, qdq, amax + movdqa m3, spill0 + paddw m0, m1 + HADAMARD2_2D 2, 6, 3, 7, 5, qdq, amax + paddw m0, m2 + paddw m0, m3 + SAVE_MM_PERMUTATION + ret + +cglobal pixel_sa8d_8x8, 4,7 + FIX_STRIDES r1, r3 + mov r6, esp + and esp, ~15 + sub esp, 48 + lea r4, [3*r1] + lea r5, [3*r3] + call pixel_sa8d_8x8_internal +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%else + HADDW m0, m1 +%endif ; HIGH_BIT_DEPTH + movd eax, m0 + add eax, 1 + shr eax, 1 + mov esp, r6 + RET + +cglobal pixel_sa8d_16x16, 4,7 + FIX_STRIDES r1, r3 + mov r6, esp + and esp, ~15 + sub esp, 64 + lea r4, [3*r1] + lea r5, [3*r3] + call 
pixel_sa8d_8x8_internal +%if mmsize == 8 + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] +%endif +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL + SA8D_INTER + mova [esp+48], m0 + call pixel_sa8d_8x8_internal +%if mmsize == 8 + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] +%else + SA8D_INTER +%endif + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal +%if HIGH_BIT_DEPTH + SA8D_INTER +%else ; !HIGH_BIT_DEPTH + paddusw m0, [esp+64-mmsize] +%if mmsize == 16 + HADDUW m0, m1 +%else + mova m2, [esp+48] + pxor m7, m7 + mova m1, m0 + mova m3, m2 + punpcklwd m0, m7 + punpckhwd m1, m7 + punpcklwd m2, m7 + punpckhwd m3, m7 + paddd m0, m1 + paddd m2, m3 + paddd m0, m2 + HADDD m0, m1 +%endif +%endif ; HIGH_BIT_DEPTH + movd eax, m0 + add eax, 1 + shr eax, 1 + mov esp, r6 + RET + +cglobal pixel_sa8d_8x16, 4,7,8 + FIX_STRIDES r1, r3 + mov r6, esp + and esp, ~15 + sub esp, 64 + + lea r4, [r1 + 2*r1] + lea r5, [r3 + 2*r3] + call pixel_sa8d_8x8_internal2 + HADDUW m0, m1 + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + mov dword [esp+36], r4d + + mov r0, [r6+20] + mov r2, [r6+28] + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 + HADDUW m0, m1 + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + add r4d, dword [esp+36] + mov eax, r4d + mov esp, r6 + RET + +cglobal pixel_sa8d_8x32, 4,7,8 + FIX_STRIDES r1, r3 + mov r6, esp + and esp, ~15 + sub esp, 64 + + lea r4, [r1 + 2*r1] + lea r5, [r3 + 2*r3] + call pixel_sa8d_8x8_internal2 + HADDUW m0, m1 + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + mov dword [esp+36], r4d + + mov r0, [r6+20] + mov r2, [r6+28] + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 + HADDUW m0, m1 + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + add r4d, dword [esp+36] + mov dword [esp+36], r4d + + mov r0, [r6+20] + mov r2, [r6+28] + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + 
lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 + HADDUW m0, m1 + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + add r4d, dword [esp+36] + mov dword [esp+36], r4d + + mov r0, [r6+20] + mov r2, [r6+28] + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 + HADDUW m0, m1 + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + add r4d, dword [esp+36] + mov eax, r4d + mov esp, r6 + RET + +cglobal pixel_sa8d_16x8, 4,7,8 + FIX_STRIDES r1, r3 + mov r6, esp + and esp, ~15 + sub esp, 64 + + lea r4, [r1 + 2*r1] + lea r5, [r3 + 2*r3] + call pixel_sa8d_8x8_internal2 + HADDUW m0, m1 + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + mov dword [esp+36], r4d + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 + HADDUW m0, m1 + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + add r4d, dword [esp+36] + mov eax, r4d + mov esp, r6 + RET + +cglobal pixel_sa8d_16x32, 4,7,8 + FIX_STRIDES r1, r3 + mov r6, esp + and esp, ~15 + sub esp, 64 + + lea r4, [r1 + 2*r1] + lea r5, [r3 + 2*r3] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [rsp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER +%if HIGH_BIT_DEPTH == 0 + HADDUW m0, m1 +%endif + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + mov dword [esp+36], r4d + + mov r0, [r6+20] + mov r2, [r6+28] + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + 
mov r0, [r6+20] + mov r2, [r6+28] + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER +%if HIGH_BIT_DEPTH == 0 + HADDUW m0, m1 +%endif + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + add r4d, dword [esp+36] + mov eax, r4d + mov esp, r6 + RET + +cglobal pixel_sa8d_16x64, 4,7,8 + FIX_STRIDES r1, r3 + mov r6, esp + and esp, ~15 + sub esp, 64 + + lea r4, [r1 + 2*r1] + lea r5, [r3 + 2*r3] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [rsp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER +%if HIGH_BIT_DEPTH == 0 + HADDUW m0, m1 +%endif + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + mov dword [esp+36], r4d + + mov r0, [r6+20] + mov r2, [r6+28] + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + mov [r6+20], r0 + mov [r6+28], r2 + + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + AVG_16x16 + + mov r0, [r6+20] + mov r2, [r6+28] + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + mov [r6+20], r0 + mov [r6+28], r2 + + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 
8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + AVG_16x16 + + mov r0, [r6+20] + mov r2, [r6+28] + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + mov [r6+20], r0 + mov [r6+28], r2 + + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER +%if HIGH_BIT_DEPTH == 0 + HADDUW m0, m1 +%endif + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + add r4d, dword [esp+36] + mov eax, r4d + mov esp, r6 + RET + +cglobal pixel_sa8d_24x32, 4,7,8 + FIX_STRIDES r1, r3 + mov r6, esp + and esp, ~15 + sub esp, 64 + + lea r4, [r1 + 2*r1] + lea r5, [r3 + 2*r3] + call pixel_sa8d_8x8_internal2 + HADDUW m0, m1 + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + mov dword [esp+36], r4d + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 + HADDUW m0, m1 + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + add r4d, dword [esp+36] + mov dword [esp+36], r4d + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 16*SIZEOF_PIXEL + add r2, 16*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 + HADDUW m0, m1 + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + add r4d, dword [esp+36] + mov dword [esp+36], r4d + + mov r0, [r6+20] + mov r2, [r6+28] + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + mov [r6+20], r0 + mov [r6+28], r2 + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 + HADDUW m0, m1 + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + add r4d, dword [esp+36] + mov dword [esp+36], r4d + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL + lea 
r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 + HADDUW m0, m1 + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + add r4d, dword [esp+36] + mov dword [esp+36], r4d + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 16*SIZEOF_PIXEL + add r2, 16*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 + HADDUW m0, m1 + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + add r4d, dword [esp+36] + mov dword [esp+36], r4d + + mov r0, [r6+20] + mov r2, [r6+28] + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + mov [r6+20], r0 + mov [r6+28], r2 + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 + HADDUW m0, m1 + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + add r4d, dword [esp+36] + mov dword [esp+36], r4d + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 + HADDUW m0, m1 + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + add r4d, dword [esp+36] + mov dword [esp+36], r4d + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 16*SIZEOF_PIXEL + add r2, 16*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 + HADDUW m0, m1 + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + add r4d, dword [esp+36] + mov dword [esp+36], r4d + + mov r0, [r6+20] + mov r2, [r6+28] + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + mov [r6+20], r0 + mov [r6+28], r2 + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 + HADDUW m0, m1 + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + add r4d, dword [esp+36] + mov dword [esp+36], r4d + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 + HADDUW m0, m1 + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + add r4d, dword [esp+36] + mov dword [esp+36], r4d + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 16*SIZEOF_PIXEL + add r2, 16*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 + HADDUW m0, m1 + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + add r4d, dword [esp+36] + mov eax, r4d + mov esp, r6 + RET + +cglobal 
pixel_sa8d_32x8, 4,7,8 + FIX_STRIDES r1, r3 + mov r6, esp + and esp, ~15 + sub esp, 64 + + lea r4, [r1 + 2*r1] + lea r5, [r3 + 2*r3] + call pixel_sa8d_8x8_internal2 + HADDUW m0, m1 + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + mov dword [esp+36], r4d + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 + HADDUW m0, m1 + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + add r4d, dword [esp+36] + mov dword [esp+36], r4d + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 16*SIZEOF_PIXEL + add r2, 16*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 + HADDUW m0, m1 + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + add r4d, dword [esp+36] + mov dword [esp+36], r4d + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 24*SIZEOF_PIXEL + add r2, 24*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 + HADDUW m0, m1 + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + add r4d, dword [esp+36] + mov eax, r4d + mov esp, r6 + RET + +cglobal pixel_sa8d_32x16, 4,7,8 + FIX_STRIDES r1, r3 + mov r6, esp + and esp, ~15 + sub esp, 64 + + lea r4, [r1 + 2*r1] + lea r5, [r3 + 2*r3] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [rsp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER +%if HIGH_BIT_DEPTH == 0 + HADDUW m0, m1 +%endif + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + mov dword [esp+36], r4d + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 16*SIZEOF_PIXEL + add r2, 16*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 24*SIZEOF_PIXEL + add r2, 24*SIZEOF_PIXEL + 
call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER +%if HIGH_BIT_DEPTH == 0 + HADDUW m0, m1 +%endif + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + add r4d, dword [esp+36] + mov eax, r4d + mov esp, r6 + RET + +cglobal pixel_sa8d_32x24, 4,7,8 + FIX_STRIDES r1, r3 + mov r6, esp + and esp, ~15 + sub esp, 64 + + lea r4, [r1 + 2*r1] + lea r5, [r3 + 2*r3] + call pixel_sa8d_8x8_internal2 + HADDUW m0, m1 + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + mov dword [esp+36], r4d + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 + HADDUW m0, m1 + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + add r4d, dword [esp+36] + mov dword [esp+36], r4d + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 16*SIZEOF_PIXEL + add r2, 16*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 + HADDUW m0, m1 + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + add r4d, dword [esp+36] + mov dword [esp+36], r4d + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 24*SIZEOF_PIXEL + add r2, 24*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 + HADDUW m0, m1 + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + add r4d, dword [esp+36] + mov dword [esp+36], r4d + + mov r0, [r6+20] + mov r2, [r6+28] + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + mov [r6+20], r0 + mov [r6+28], r2 + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 + HADDUW m0, m1 + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + add r4d, dword [esp+36] + mov dword [esp+36], r4d + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 + HADDUW m0, m1 + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + add r4d, dword [esp+36] + mov dword [esp+36], r4d + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 16*SIZEOF_PIXEL + add r2, 16*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 + HADDUW m0, m1 + movd r4d, m0 + add 
r4d, 1 + shr r4d, 1 + add r4d, dword [esp+36] + mov dword [esp+36], r4d + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 24*SIZEOF_PIXEL + add r2, 24*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 + HADDUW m0, m1 + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + add r4d, dword [esp+36] + mov dword [esp+36], r4d + + mov r0, [r6+20] + mov r2, [r6+28] + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + mov [r6+20], r0 + mov [r6+28], r2 + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 + HADDUW m0, m1 + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + add r4d, dword [esp+36] + mov dword [esp+36], r4d + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 + HADDUW m0, m1 + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + add r4d, dword [esp+36] + mov dword [esp+36], r4d + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 16*SIZEOF_PIXEL + add r2, 16*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 + HADDUW m0, m1 + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + add r4d, dword [esp+36] + mov dword [esp+36], r4d + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 24*SIZEOF_PIXEL + add r2, 24*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 + HADDUW m0, m1 + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + add r4d, dword [esp+36] + mov eax, r4d + mov esp, r6 + RET + +cglobal pixel_sa8d_32x32, 4,7,8 + FIX_STRIDES r1, r3 + mov r6, esp + and esp, ~15 + sub esp, 64 + + lea r4, [r1 + 2*r1] + lea r5, [r3 + 2*r3] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [rsp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER +%if HIGH_BIT_DEPTH == 0 + HADDUW m0, m1 +%endif + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + mov dword [esp+36], r4d + + mov 
r0, [r6+20] + mov r2, [r6+28] + add r0, 16*SIZEOF_PIXEL + add r2, 16*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 24*SIZEOF_PIXEL + add r2, 24*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + AVG_16x16 + + mov r0, [r6+20] + mov r2, [r6+28] + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + AVG_16x16 + + mov r0, [r6+20] + mov r2, [r6+28] + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + add r0, 16*SIZEOF_PIXEL + add r2, 16*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + add r0, 24*SIZEOF_PIXEL + add r2, 24*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER +%if HIGH_BIT_DEPTH == 0 + HADDUW m0, m1 +%endif + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + add r4d, dword [esp+36] + mov eax, r4d + mov esp, r6 + RET + +cglobal pixel_sa8d_32x64, 4,7,8 + FIX_STRIDES r1, r3 + mov r6, esp + and esp, ~15 + sub esp, 64 + + lea r4, [r1 + 
2*r1] + lea r5, [r3 + 2*r3] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [rsp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER +%if HIGH_BIT_DEPTH == 0 + HADDUW m0, m1 +%endif + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + mov dword [esp+36], r4d + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 16*SIZEOF_PIXEL + add r2, 16*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 24*SIZEOF_PIXEL + add r2, 24*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + AVG_16x16 + + mov r0, [r6+20] + mov r2, [r6+28] + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + mov [r6+20], r0 + mov [r6+28], r2 + + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + AVG_16x16 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 16*SIZEOF_PIXEL + add r2, 16*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 24*SIZEOF_PIXEL + add r2, 24*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 
+ AVG_16x16 + + mov r0, [r6+20] + mov r2, [r6+28] + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + mov [r6+20], r0 + mov [r6+28], r2 + + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + AVG_16x16 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 16*SIZEOF_PIXEL + add r2, 16*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 24*SIZEOF_PIXEL + add r2, 24*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + AVG_16x16 + + mov r0, [r6+20] + mov r2, [r6+28] + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + mov [r6+20], r0 + mov [r6+28], r2 + + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + AVG_16x16 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 16*SIZEOF_PIXEL + add r2, 16*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 24*SIZEOF_PIXEL + add r2, 24*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + 
SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER +%if HIGH_BIT_DEPTH == 0 + HADDUW m0, m1 +%endif + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + add r4d, dword [esp+36] + mov eax, r4d + mov esp, r6 + RET + +cglobal pixel_sa8d_48x64, 4,7,8 + FIX_STRIDES r1, r3 + mov r6, esp + and esp, ~15 + sub esp, 64 + + lea r4, [r1 + 2*r1] + lea r5, [r3 + 2*r3] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [rsp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER +%if HIGH_BIT_DEPTH == 0 + HADDUW m0, m1 +%endif + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + mov dword [esp+36], r4d + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 16*SIZEOF_PIXEL + add r2, 16*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 24*SIZEOF_PIXEL + add r2, 24*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + AVG_16x16 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 32*SIZEOF_PIXEL + add r2, 32*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 40*SIZEOF_PIXEL + add r2, 40*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + AVG_16x16 + + mov r0, [r6+20] + mov r2, [r6+28] + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + mov [r6+20], r0 + mov [r6+28], r2 + + lea r4, [r1 + 2*r1] + 
call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + AVG_16x16 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 16*SIZEOF_PIXEL + add r2, 16*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 24*SIZEOF_PIXEL + add r2, 24*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + AVG_16x16 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 32*SIZEOF_PIXEL + add r2, 32*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 40*SIZEOF_PIXEL + add r2, 40*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + AVG_16x16 + + mov r0, [r6+20] + mov r2, [r6+28] + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + mov [r6+20], r0 + mov [r6+28], r2 + + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + AVG_16x16 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 16*SIZEOF_PIXEL + add r2, 16*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call 
pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 24*SIZEOF_PIXEL + add r2, 24*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + AVG_16x16 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 32*SIZEOF_PIXEL + add r2, 32*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 40*SIZEOF_PIXEL + add r2, 40*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + AVG_16x16 + + mov r0, [r6+20] + mov r2, [r6+28] + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + mov [r6+20], r0 + mov [r6+28], r2 + + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + AVG_16x16 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 16*SIZEOF_PIXEL + add r2, 16*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 24*SIZEOF_PIXEL + add r2, 24*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + AVG_16x16 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 32*SIZEOF_PIXEL + add r2, 32*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call 
pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 40*SIZEOF_PIXEL + add r2, 40*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER +%if HIGH_BIT_DEPTH == 0 + HADDUW m0, m1 +%endif + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + add r4d, dword [esp+36] + mov eax, r4d + mov esp, r6 + RET + +cglobal pixel_sa8d_64x16, 4,7,8 + FIX_STRIDES r1, r3 + mov r6, esp + and esp, ~15 + sub esp, 64 + + lea r4, [r1 + 2*r1] + lea r5, [r3 + 2*r3] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [rsp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER +%if HIGH_BIT_DEPTH == 0 + HADDUW m0, m1 +%endif + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + mov dword [esp+36], r4d + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 16*SIZEOF_PIXEL + add r2, 16*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 24*SIZEOF_PIXEL + add r2, 24*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + AVG_16x16 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 32*SIZEOF_PIXEL + add r2, 32*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 40*SIZEOF_PIXEL + add r2, 40*SIZEOF_PIXEL + call 
pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + AVG_16x16 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 48*SIZEOF_PIXEL + add r2, 48*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 56*SIZEOF_PIXEL + add r2, 56*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER +%if HIGH_BIT_DEPTH == 0 + HADDUW m0, m1 +%endif + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + add r4d, dword [esp+36] + mov eax, r4d + mov esp, r6 + RET + +cglobal pixel_sa8d_64x32, 4,7,8 + FIX_STRIDES r1, r3 + mov r6, esp + and esp, ~15 + sub esp, 64 + + lea r4, [r1 + 2*r1] + lea r5, [r3 + 2*r3] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [rsp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER +%if HIGH_BIT_DEPTH == 0 + HADDUW m0, m1 +%endif + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + mov dword [esp+36], r4d + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 16*SIZEOF_PIXEL + add r2, 16*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 24*SIZEOF_PIXEL + add r2, 24*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + AVG_16x16 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 32*SIZEOF_PIXEL + add r2, 32*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if 
HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 40*SIZEOF_PIXEL + add r2, 40*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + AVG_16x16 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 48*SIZEOF_PIXEL + add r2, 48*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 56*SIZEOF_PIXEL + add r2, 56*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + AVG_16x16 + + mov r0, [r6+20] + mov r2, [r6+28] + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + mov [r6+20], r0 + mov [r6+28], r2 + + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + AVG_16x16 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 16*SIZEOF_PIXEL + add r2, 16*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 24*SIZEOF_PIXEL + add r2, 24*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + AVG_16x16 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 32*SIZEOF_PIXEL + add r2, 32*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if 
HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 40*SIZEOF_PIXEL + add r2, 40*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + AVG_16x16 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 48*SIZEOF_PIXEL + add r2, 48*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 56*SIZEOF_PIXEL + add r2, 56*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER +%if HIGH_BIT_DEPTH == 0 + HADDUW m0, m1 +%endif + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + add r4d, dword [esp+36] + mov eax, r4d + mov esp, r6 + RET + +cglobal pixel_sa8d_64x48, 4,7,8 + FIX_STRIDES r1, r3 + mov r6, esp + and esp, ~15 + sub esp, 64 + + lea r4, [r1 + 2*r1] + lea r5, [r3 + 2*r3] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [rsp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER +%if HIGH_BIT_DEPTH == 0 + HADDUW m0, m1 +%endif + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + mov dword [esp+36], r4d + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 16*SIZEOF_PIXEL + add r2, 16*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 24*SIZEOF_PIXEL + add r2, 24*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova 
[esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + AVG_16x16 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 32*SIZEOF_PIXEL + add r2, 32*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 40*SIZEOF_PIXEL + add r2, 40*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + AVG_16x16 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 48*SIZEOF_PIXEL + add r2, 48*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 56*SIZEOF_PIXEL + add r2, 56*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + AVG_16x16 + + mov r0, [r6+20] + mov r2, [r6+28] + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + mov [r6+20], r0 + mov [r6+28], r2 + + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + AVG_16x16 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 16*SIZEOF_PIXEL + add r2, 16*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 24*SIZEOF_PIXEL + add r2, 24*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova 
[esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + AVG_16x16 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 32*SIZEOF_PIXEL + add r2, 32*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 40*SIZEOF_PIXEL + add r2, 40*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + AVG_16x16 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 48*SIZEOF_PIXEL + add r2, 48*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 56*SIZEOF_PIXEL + add r2, 56*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + AVG_16x16 + + mov r0, [r6+20] + mov r2, [r6+28] + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + mov [r6+20], r0 + mov [r6+28], r2 + + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + AVG_16x16 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 16*SIZEOF_PIXEL + add r2, 16*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 24*SIZEOF_PIXEL + add r2, 24*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova 
[esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + AVG_16x16 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 32*SIZEOF_PIXEL + add r2, 32*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 40*SIZEOF_PIXEL + add r2, 40*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + AVG_16x16 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 48*SIZEOF_PIXEL + add r2, 48*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 56*SIZEOF_PIXEL + add r2, 56*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER +%if HIGH_BIT_DEPTH == 0 + HADDUW m0, m1 +%endif + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + add r4d, dword [esp+36] + mov eax, r4d + mov esp, r6 + RET + +cglobal pixel_sa8d_64x64, 4,7,8 + FIX_STRIDES r1, r3 + mov r6, esp + and esp, ~15 + sub esp, 64 + + lea r4, [r1 + 2*r1] + lea r5, [r3 + 2*r3] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [rsp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER +%if HIGH_BIT_DEPTH == 0 + HADDUW m0, m1 +%endif + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + mov dword [esp+36], r4d + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 16*SIZEOF_PIXEL + add r2, 16*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], 
m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 24*SIZEOF_PIXEL + add r2, 24*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + AVG_16x16 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 32*SIZEOF_PIXEL + add r2, 32*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 40*SIZEOF_PIXEL + add r2, 40*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + AVG_16x16 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 48*SIZEOF_PIXEL + add r2, 48*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 56*SIZEOF_PIXEL + add r2, 56*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + AVG_16x16 + + mov r0, [r6+20] + mov r2, [r6+28] + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + mov [r6+20], r0 + mov [r6+28], r2 + + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + AVG_16x16 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 16*SIZEOF_PIXEL + add r2, 16*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call 
pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 24*SIZEOF_PIXEL + add r2, 24*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + AVG_16x16 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 32*SIZEOF_PIXEL + add r2, 32*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 40*SIZEOF_PIXEL + add r2, 40*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + AVG_16x16 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 48*SIZEOF_PIXEL + add r2, 48*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 56*SIZEOF_PIXEL + add r2, 56*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + AVG_16x16 + + mov r0, [r6+20] + mov r2, [r6+28] + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + mov [r6+20], r0 + mov [r6+28], r2 + + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + AVG_16x16 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 16*SIZEOF_PIXEL + add r2, 16*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call 
pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 24*SIZEOF_PIXEL + add r2, 24*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + AVG_16x16 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 32*SIZEOF_PIXEL + add r2, 32*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 40*SIZEOF_PIXEL + add r2, 40*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + AVG_16x16 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 48*SIZEOF_PIXEL + add r2, 48*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 56*SIZEOF_PIXEL + add r2, 56*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + AVG_16x16 + + mov r0, [r6+20] + mov r2, [r6+28] + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + lea r0, [r0 + r1*8] + lea r2, [r2 + r3*8] + mov [r6+20], r0 + mov [r6+28], r2 + + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + AVG_16x16 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 16*SIZEOF_PIXEL + add r2, 16*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call 
pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 24*SIZEOF_PIXEL + add r2, 24*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + AVG_16x16 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 32*SIZEOF_PIXEL + add r2, 32*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 40*SIZEOF_PIXEL + add r2, 40*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + AVG_16x16 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 48*SIZEOF_PIXEL + add r2, 48*SIZEOF_PIXEL + lea r4, [r1 + 2*r1] + call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+48], m0 + + mov r0, [r6+20] + mov r2, [r6+28] + add r0, 56*SIZEOF_PIXEL + add r2, 56*SIZEOF_PIXEL + call pixel_sa8d_8x8_internal2 + SA8D_INTER + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal2 + SA8D_INTER +%if HIGH_BIT_DEPTH == 0 + HADDUW m0, m1 +%endif + movd r4d, m0 + add r4d, 1 + shr r4d, 1 + add r4d, dword [esp+36] + mov eax, r4d + mov esp, r6 + RET +%endif ; !ARCH_X86_64 +%endmacro ; SA8D + +;============================================================================= +; INTRA SATD +;============================================================================= +%define TRANS TRANS_SSE2 +%define DIFFOP DIFF_UNPACK_SSE2 +%define LOAD_SUMSUB_8x4P LOAD_DIFF_8x4P +%define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSE2 +%define movdqa movaps ; doesn't hurt pre-nehalem, might as well save size +%define movdqu movups +%define punpcklqdq movlhps +INIT_XMM sse2 +SA8D +SATDS_SSE2 + +%if HIGH_BIT_DEPTH == 0 +INIT_XMM ssse3,atom +SATDS_SSE2 +SA8D +%endif + +%define DIFFOP 
DIFF_SUMSUB_SSSE3 +%define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE +%if HIGH_BIT_DEPTH == 0 +%define LOAD_SUMSUB_8x4P LOAD_SUMSUB_8x4P_SSSE3 +%define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSSE3 +%endif +INIT_XMM ssse3 +SATDS_SSE2 +SA8D +%undef movdqa ; nehalem doesn't like movaps +%undef movdqu ; movups +%undef punpcklqdq ; or movlhps + +%define TRANS TRANS_SSE4 +%define LOAD_DUP_4x8P LOAD_DUP_4x8P_PENRYN +INIT_XMM sse4 +SATDS_SSE2 +SA8D + +; Sandy/Ivy Bridge and Bulldozer do movddup in the load unit, so +; it's effectively free. +%define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE +INIT_XMM avx +SATDS_SSE2 +SA8D + +%define TRANS TRANS_XOP +INIT_XMM xop +SATDS_SSE2 +SA8D + + +%if HIGH_BIT_DEPTH == 0 +%define LOAD_SUMSUB_8x4P LOAD_SUMSUB8_16x4P_AVX2 +%define LOAD_DUP_4x8P LOAD_DUP_4x16P_AVX2 +%define TRANS TRANS_SSE4 + +%macro LOAD_SUMSUB_8x8P_AVX2 7 ; 4*dst, 2*tmp, mul] + movq xm%1, [r0] + movq xm%3, [r2] + movq xm%2, [r0+r1] + movq xm%4, [r2+r3] + vinserti128 m%1, m%1, [r0+4*r1], 1 + vinserti128 m%3, m%3, [r2+4*r3], 1 + vinserti128 m%2, m%2, [r0+r4], 1 + vinserti128 m%4, m%4, [r2+r5], 1 + punpcklqdq m%1, m%1 + punpcklqdq m%3, m%3 + punpcklqdq m%2, m%2 + punpcklqdq m%4, m%4 + DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %7 + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + + movq xm%3, [r0] + movq xm%5, [r2] + movq xm%4, [r0+r1] + movq xm%6, [r2+r3] + vinserti128 m%3, m%3, [r0+4*r1], 1 + vinserti128 m%5, m%5, [r2+4*r3], 1 + vinserti128 m%4, m%4, [r0+r4], 1 + vinserti128 m%6, m%6, [r2+r5], 1 + punpcklqdq m%3, m%3 + punpcklqdq m%5, m%5 + punpcklqdq m%4, m%4 + punpcklqdq m%6, m%6 + DIFF_SUMSUB_SSSE3 %3, %5, %4, %6, %7 +%endmacro + +%macro SATD_START_AVX2 2-3 0 + FIX_STRIDES r1, r3 +%if %3 + mova %2, [hmul_8p] + lea r4, [5*r1] + lea r5, [5*r3] +%else + mova %2, [hmul_16p] + lea r4, [3*r1] + lea r5, [3*r3] +%endif + pxor %1, %1 +%endmacro + +%define TRANS TRANS_SSE4 +INIT_YMM avx2 +cglobal pixel_satd_16x8_internal + LOAD_SUMSUB_16x4P_AVX2 0, 1, 2, 3, 4, 5, 7, r0, r2, 1 + SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6 + 
LOAD_SUMSUB_16x4P_AVX2 0, 1, 2, 3, 4, 5, 7, r0, r2, 0 + SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6 + ret + +cglobal pixel_satd_16x16, 4,6,8 + SATD_START_AVX2 m6, m7 + call pixel_satd_16x8_internal + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] +pixel_satd_16x8_internal: + call pixel_satd_16x8_internal + vextracti128 xm0, m6, 1 + paddw xm0, xm6 + SATD_END_SSE2 xm0 + RET + +cglobal pixel_satd_16x8, 4,6,8 + SATD_START_AVX2 m6, m7 + jmp pixel_satd_16x8_internal + +cglobal pixel_satd_8x8_internal + LOAD_SUMSUB_8x8P_AVX2 0, 1, 2, 3, 4, 5, 7 + SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6 + ret + +cglobal pixel_satd_8x16, 4,6,8 + SATD_START_AVX2 m6, m7, 1 + call pixel_satd_8x8_internal + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + call pixel_satd_8x8_internal + vextracti128 xm0, m6, 1 + paddw xm0, xm6 + SATD_END_SSE2 xm0 + RET + +cglobal pixel_satd_8x8, 4,6,8 + SATD_START_AVX2 m6, m7, 1 + call pixel_satd_8x8_internal + vextracti128 xm0, m6, 1 + paddw xm0, xm6 + SATD_END_SSE2 xm0 + RET + +cglobal pixel_sa8d_8x8_internal + LOAD_SUMSUB_8x8P_AVX2 0, 1, 2, 3, 4, 5, 7 + HADAMARD4_V 0, 1, 2, 3, 4 + HADAMARD 8, sumsub, 0, 1, 4, 5 + HADAMARD 8, sumsub, 2, 3, 4, 5 + HADAMARD 2, sumsub, 0, 1, 4, 5 + HADAMARD 2, sumsub, 2, 3, 4, 5 + HADAMARD 1, amax, 0, 1, 4, 5 + HADAMARD 1, amax, 2, 3, 4, 5 + paddw m6, m0 + paddw m6, m2 + ret + +cglobal pixel_sa8d_8x8, 4,6,8 + SATD_START_AVX2 m6, m7, 1 + call pixel_sa8d_8x8_internal + vextracti128 xm1, m6, 1 + paddw xm6, xm1 + HADDW xm6, xm1 + movd eax, xm6 + add eax, 1 + shr eax, 1 + RET +%endif ; HIGH_BIT_DEPTH + +; Input 16bpp, Output 8bpp +;------------------------------------------------------------------------------------------------------------------------ +;void planecopy_sp(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask) +;------------------------------------------------------------------------------------------------------------------------ +INIT_XMM sse2 
+cglobal downShift_16, 7,7,3 + movd m0, r6d ; m0 = shift + add r1, r1 + dec r5d +.loopH: + xor r6, r6 +.loopW: + movu m1, [r0 + r6 * 2] + movu m2, [r0 + r6 * 2 + 16] + psrlw m1, m0 + psrlw m2, m0 + packuswb m1, m2 + movu [r2 + r6], m1 + + add r6, 16 + cmp r6d, r4d + jl .loopW + + ; move to next row + add r0, r1 + add r2, r3 + dec r5d + jnz .loopH + +;processing last row of every frame [To handle width which not a multiple of 16] + +.loop16: + movu m1, [r0] + movu m2, [r0 + 16] + psrlw m1, m0 + psrlw m2, m0 + packuswb m1, m2 + movu [r2], m1 + + add r0, 2 * mmsize + add r2, mmsize + sub r4d, 16 + jz .end + cmp r4d, 15 + jg .loop16 + + cmp r4d, 8 + jl .process4 + movu m1, [r0] + psrlw m1, m0 + packuswb m1, m1 + movh [r2], m1 + + add r0, mmsize + add r2, 8 + sub r4d, 8 + jz .end + +.process4: + cmp r4d, 4 + jl .process2 + movh m1,[r0] + psrlw m1, m0 + packuswb m1, m1 + movd [r2], m1 + + add r0, 8 + add r2, 4 + sub r4d, 4 + jz .end + +.process2: + cmp r4d, 2 + jl .process1 + movd m1, [r0] + psrlw m1, m0 + packuswb m1, m1 + movd r6, m1 + mov [r2], r6w + + add r0, 4 + add r2, 2 + sub r4d, 2 + jz .end + +.process1: + movd m1, [r0] + psrlw m1, m0 + packuswb m1, m1 + movd r3, m1 + mov [r2], r3b +.end: + RET + +; Input 8bpp, Output 16bpp +;--------------------------------------------------------------------------------------------------------------------- +;void planecopy_cp(uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift) +;--------------------------------------------------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal upShift_8, 7,7,3 + + movd m2, r6d ; m0 = shift + add r3, r3 + dec r5d + +.loopH: + xor r6, r6 +.loopW: + pmovzxbw m0,[r0 + r6] + pmovzxbw m1,[r0 + r6 + 8] + psllw m0, m2 + psllw m1, m2 + movu [r2 + r6 * 2], m0 + movu [r2 + r6 * 2 + 16], m1 + + add r6, 16 + cmp r6d, r4d + jl .loopW + + ; move to next row + add r0, r1 + add r2, r3 + dec r5d + jnz .loopH + +;processing 
last row of every frame [To handle width which not a multiple of 16] + +.loop16: + pmovzxbw m0,[r0] + pmovzxbw m1,[r0 + 8] + psllw m0, m2 + psllw m1, m2 + movu [r2], m0 + movu [r2 + 16], m1 + + add r0, mmsize + add r2, 2 * mmsize + sub r4d, 16 + jz .end + cmp r4d, 15 + jg .loop16 + + cmp r4d, 8 + jl .process4 + pmovzxbw m0,[r0] + psllw m0, m2 + movu [r2], m0 + + add r0, 8 + add r2, mmsize + sub r4d, 8 + jz .end + +.process4: + cmp r4d, 4 + jl .process2 + movd m0,[r0] + pmovzxbw m0,m0 + psllw m0, m2 + movh [r2], m0 + + add r0, 4 + add r2, 8 + sub r4d, 4 + jz .end + +.process2: + cmp r4d, 2 + jl .process1 + movzx r3d, byte [r0] + shl r3d, 2 + mov [r2], r3w + movzx r3d, byte [r0 + 1] + shl r3d, 2 + mov [r2 + 2], r3w + + add r0, 2 + add r2, 4 + sub r4d, 2 + jz .end + +.process1: + movzx r3d, byte [r0] + shl r3d, 2 + mov [r2], r3w +.end: + RET diff --git a/source/common/x86/pixel-util.h b/source/common/x86/pixel-util.h new file mode 100644 index 0000000..90bb4fc --- /dev/null +++ b/source/common/x86/pixel-util.h @@ -0,0 +1,130 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. 
+ * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#ifndef X265_PIXEL_UTIL_H +#define X265_PIXEL_UTIL_H + +void x265_calcRecons4_sse2(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred); +void x265_calcRecons8_sse2(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred); +void x265_calcRecons16_sse2(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred); +void x265_calcRecons32_sse2(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred); +void x265_calcRecons16_sse4(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred); +void x265_calcRecons32_sse4(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred); + +void x265_getResidual4_sse2(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride); +void x265_getResidual8_sse2(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride); +void x265_getResidual16_sse2(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride); +void x265_getResidual16_sse4(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride); +void x265_getResidual32_sse2(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride); +void x265_getResidual32_sse4(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride); + +void x265_transpose4_sse2(pixel *dest, pixel *src, intptr_t stride); +void x265_transpose8_sse2(pixel *dest, pixel *src, intptr_t stride); +void x265_transpose16_sse2(pixel *dest, pixel *src, intptr_t stride); +void x265_transpose32_sse2(pixel *dest, pixel *src, intptr_t stride); +void x265_transpose64_sse2(pixel *dest, pixel *src, intptr_t stride); + +void x265_transpose8_avx2(pixel 
*dest, pixel *src, intptr_t stride); +void x265_transpose16_avx2(pixel *dest, pixel *src, intptr_t stride); +void x265_transpose32_avx2(pixel *dest, pixel *src, intptr_t stride); +void x265_transpose64_avx2(pixel *dest, pixel *src, intptr_t stride); + +uint32_t x265_quant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff); +uint32_t x265_quant_avx2(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff); +uint32_t x265_nquant_sse4(int32_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff); +uint32_t x265_nquant_avx2(int32_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff); +void x265_dequant_normal_sse4(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift); +void x265_dequant_normal_avx2(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift); +int x265_count_nonzero_ssse3(const int16_t *quantCoeff, int numCoeff); + +void x265_weight_pp_sse4(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset); +void x265_weight_pp_avx2(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset); +void x265_weight_sp_sse4(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset); + +void x265_pixel_ssim_4x4x2_core_mmx2(const uint8_t * pix1, intptr_t stride1, + const uint8_t * pix2, intptr_t stride2, int sums[2][4]); +void x265_pixel_ssim_4x4x2_core_sse2(const pixel * pix1, intptr_t stride1, + const pixel * pix2, intptr_t stride2, int sums[2][4]); +void x265_pixel_ssim_4x4x2_core_avx(const pixel * pix1, intptr_t stride1, + const pixel * pix2, intptr_t stride2, int sums[2][4]); +float x265_pixel_ssim_end4_sse2(int sum0[5][4], int sum1[5][4], int width); +float x265_pixel_ssim_end4_avx(int sum0[5][4], int sum1[5][4], int 
width); + +void x265_scale1D_128to64_ssse3(pixel *, pixel *, intptr_t); +void x265_scale1D_128to64_avx2(pixel *, pixel *, intptr_t); +void x265_scale2D_64to32_ssse3(pixel *, pixel *, intptr_t); + +#define SETUP_CHROMA_PIXELSUB_PS_FUNC(W, H, cpu) \ + void x265_pixel_sub_ps_ ## W ## x ## H ## cpu(int16_t * dest, intptr_t destride, pixel * src0, pixel * src1, intptr_t srcstride0, intptr_t srcstride1); \ + void x265_pixel_add_ps_ ## W ## x ## H ## cpu(pixel * dest, intptr_t destride, pixel * src0, int16_t * scr1, intptr_t srcStride0, intptr_t srcStride1); + +#define CHROMA_PIXELSUB_DEF(cpu) \ + SETUP_CHROMA_PIXELSUB_PS_FUNC(4, 4, cpu); \ + SETUP_CHROMA_PIXELSUB_PS_FUNC(8, 8, cpu); \ + SETUP_CHROMA_PIXELSUB_PS_FUNC(16, 16, cpu); \ + SETUP_CHROMA_PIXELSUB_PS_FUNC(32, 32, cpu); + +#define CHROMA_PIXELSUB_DEF_422(cpu) \ + SETUP_CHROMA_PIXELSUB_PS_FUNC(4, 8, cpu); \ + SETUP_CHROMA_PIXELSUB_PS_FUNC(8, 16, cpu); \ + SETUP_CHROMA_PIXELSUB_PS_FUNC(16, 32, cpu); \ + SETUP_CHROMA_PIXELSUB_PS_FUNC(32, 64, cpu); + +#define SETUP_LUMA_PIXELSUB_PS_FUNC(W, H, cpu) \ + void x265_pixel_sub_ps_ ## W ## x ## H ## cpu(int16_t * dest, intptr_t destride, pixel * src0, pixel * src1, intptr_t srcstride0, intptr_t srcstride1); \ + void x265_pixel_add_ps_ ## W ## x ## H ## cpu(pixel * dest, intptr_t destride, pixel * src0, int16_t * scr1, intptr_t srcStride0, intptr_t srcStride1); + +#define LUMA_PIXELSUB_DEF(cpu) \ + SETUP_LUMA_PIXELSUB_PS_FUNC(8, 8, cpu); \ + SETUP_LUMA_PIXELSUB_PS_FUNC(16, 16, cpu); \ + SETUP_LUMA_PIXELSUB_PS_FUNC(32, 32, cpu); \ + SETUP_LUMA_PIXELSUB_PS_FUNC(64, 64, cpu); + +CHROMA_PIXELSUB_DEF(_sse4); +LUMA_PIXELSUB_DEF(_sse4); +CHROMA_PIXELSUB_DEF(_sse2); +LUMA_PIXELSUB_DEF(_sse2); + +CHROMA_PIXELSUB_DEF_422(_sse4); +CHROMA_PIXELSUB_DEF_422(_sse2); + +#define SETUP_LUMA_PIXELVAR_FUNC(W, H, cpu) \ + uint64_t x265_pixel_var_ ## W ## x ## H ## cpu(pixel * pix, intptr_t pixstride); + +#define LUMA_PIXELVAR_DEF(cpu) \ + SETUP_LUMA_PIXELVAR_FUNC(8, 8, cpu); \ + 
SETUP_LUMA_PIXELVAR_FUNC(16, 16, cpu); \ + SETUP_LUMA_PIXELVAR_FUNC(32, 32, cpu); \ + SETUP_LUMA_PIXELVAR_FUNC(64, 64, cpu); + +LUMA_PIXELVAR_DEF(_sse2); + +#undef CHROMA_PIXELSUB_DEF +#undef CHROMA_PIXELSUB_DEF_422 +#undef LUMA_PIXELSUB_DEF +#undef LUMA_PIXELVAR_DEF +#undef SETUP_CHROMA_PIXELSUB_PS_FUNC +#undef SETUP_LUMA_PIXELSUB_PS_FUNC +#undef SETUP_LUMA_PIXELVAR_FUNC + +#endif // ifndef X265_PIXEL_UTIL_H diff --git a/source/common/x86/pixel-util8.asm b/source/common/x86/pixel-util8.asm new file mode 100644 index 0000000..38fb52e --- /dev/null +++ b/source/common/x86/pixel-util8.asm @@ -0,0 +1,5001 @@ +;***************************************************************************** +;* Copyright (C) 2013 x265 project +;* +;* Authors: Min Chen +;* Nabajit Deka +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +;* +;* This program is also available under a commercial proprietary license. +;* For more information, contact us at license @ x265.com. 
+;*****************************************************************************/ + +%include "x86inc.asm" +%include "x86util.asm" + +SECTION_RODATA 32 + +%if BIT_DEPTH == 10 +ssim_c1: times 4 dd 6697.7856 ; .01*.01*1023*1023*64 +ssim_c2: times 4 dd 3797644.4352 ; .03*.03*1023*1023*64*63 +pf_64: times 4 dd 64.0 +pf_128: times 4 dd 128.0 +%elif BIT_DEPTH == 9 +ssim_c1: times 4 dd 1671 ; .01*.01*511*511*64 +ssim_c2: times 4 dd 947556 ; .03*.03*511*511*64*63 +%else ; 8-bit +ssim_c1: times 4 dd 416 ; .01*.01*255*255*64 +ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63 +%endif +mask_ff: times 16 db 0xff + times 16 db 0 +deinterleave_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 +deinterleave_word_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 +hmul_16p: times 16 db 1 + times 8 db 1, -1 +hmulw_16p: times 8 dw 1 + times 4 dw 1, -1 + +trans8_shuf: dd 0, 4, 1, 5, 2, 6, 3, 7 + +SECTION .text + +cextern pw_1 +cextern pb_1 +cextern pw_00ff +cextern pw_2000 +cextern pw_pixel_max +cextern pd_1 +cextern pd_32767 +cextern pd_n32768 + +;----------------------------------------------------------------------------- +; void calcrecon(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +%if HIGH_BIT_DEPTH +%if ARCH_X86_64 == 1 +cglobal calcRecons4, 5,8,4 + %define t7b r7b +%else +cglobal calcRecons4, 5,7,4,0-1 + %define t7b byte [rsp] +%endif + mov r4d, r4m + mov r5d, r5m + mov r6d, r6m + add r4d, r4d + add r5d, r5d + add r6d, r6d + + pxor m4, m4 + mova m5, [pw_pixel_max] + mov t7b, 4/2 +.loop: + movh m0, [r0] + movh m1, [r0 + r4] + punpcklqdq m0, m1 + movh m2, [r1] + movh m3, [r1 + r4] + punpcklqdq m2, m3 + paddw m0, m2 + CLIPW m0, m4, m5 + + ; store recipred[] + movh [r3], m0 + movhps [r3 + r6], m0 + + ; store recqt[] + movh [r2], m0 + movhps [r2 + r5], m0 + + lea r0, [r0 + r4 * 2] + lea r1, [r1 + r4 
* 2] + lea r2, [r2 + r5 * 2] + lea r3, [r3 + r6 * 2] + + dec t7b + jnz .loop + RET +%else ;HIGH_BIT_DEPTH + +%if ARCH_X86_64 == 1 +cglobal calcRecons4, 5,8,4 + %define t7b r7b +%else +cglobal calcRecons4, 5,7,4,0-1 + %define t7b byte [rsp] +%endif + mov r4d, r4m + mov r5d, r5m + mov r6d, r6m + add r5d, r5d + + pxor m0, m0 + mov t7b, 4/2 +.loop: + movd m1, [r0] + movd m2, [r0 + r4] + punpckldq m1, m2 + punpcklbw m1, m0 + movh m2, [r1] + movh m3, [r1 + r4 * 2] + punpcklqdq m2, m3 + paddw m1, m2 + packuswb m1, m1 + + ; store recon[] and recipred[] + movd [r3], m1 + pshufd m2, m1, 1 + movd [r3 + r6], m2 + + ; store recqt[] + punpcklbw m1, m0 + movh [r2], m1 + movhps [r2 + r5], m1 + + lea r0, [r0 + r4 * 2] + lea r1, [r1 + r4 * 4] + lea r2, [r2 + r5 * 2] + lea r3, [r3 + r6 * 2] + + dec t7b + jnz .loop + RET +%endif ;HIGH_BIT_DEPTH + + +INIT_XMM sse2 +%if ARCH_X86_64 == 1 +cglobal calcRecons8, 5,8,4 + %define t7b r7b +%else +cglobal calcRecons8, 5,7,4,0-1 + %define t7b byte [rsp] +%endif + +%if HIGH_BIT_DEPTH + mov r4d, r4m + mov r5d, r5m + mov r6d, r6m + add r4d, r4d + add r5d, r5d + add r6d, r6d + + pxor m4, m4 + mova m5, [pw_pixel_max] + mov t7b, 8/2 +.loop: + movu m0, [r0] + movu m1, [r0 + r4] + movu m2, [r1] + movu m3, [r1 + r4] + paddw m0, m2 + paddw m1, m3 + CLIPW2 m0, m1, m4, m5 + + ; store recipred[] + movu [r3], m0 + movu [r3 + r6], m1 + + ; store recqt[] + movu [r2], m0 + movu [r2 + r5], m1 + + lea r0, [r0 + r4 * 2] + lea r1, [r1 + r4 * 2] + lea r2, [r2 + r5 * 2] + lea r3, [r3 + r6 * 2] + + dec t7b + jnz .loop + RET +%else ;HIGH_BIT_DEPTH + + mov r4d, r4m + mov r5d, r5m + mov r6d, r6m + add r5d, r5d + + pxor m0, m0 + mov t7b, 8/2 +.loop: + movh m1, [r0] + movh m2, [r0 + r4] + punpcklbw m1, m0 + punpcklbw m2, m0 + movu m3, [r1] + movu m4, [r1 + r4 * 2] + paddw m1, m3 + paddw m2, m4 + packuswb m1, m2 + + ; store recon[] and recipred[] + movh [r3], m1 + movhps [r3 + r6], m1 + + ; store recqt[] + punpcklbw m2, m1, m0 + punpckhbw m1, m0 + movu [r2], m2 + movu [r2 + 
r5], m1 + + lea r0, [r0 + r4 * 2] + lea r1, [r1 + r4 * 4] + lea r2, [r2 + r5 * 2] + lea r3, [r3 + r6 * 2] + + dec t7b + jnz .loop + RET +%endif ;HIGH_BIT_DEPTH + + + +%if HIGH_BIT_DEPTH +INIT_XMM sse2 +%if ARCH_X86_64 == 1 +cglobal calcRecons16, 5,8,4 + %define t7b r7b +%else +cglobal calcRecons16, 5,7,4,0-1 + %define t7b byte [rsp] +%endif + + mov r4d, r4m + mov r5d, r5m + mov r6d, r6m + add r4d, r4d + add r5d, r5d + add r6d, r6d + + pxor m4, m4 + mova m5, [pw_pixel_max] + mov t7b, 16/2 +.loop: + movu m0, [r0] + movu m1, [r0 + 16] + movu m2, [r1] + movu m3, [r1 + 16] + paddw m0, m2 + paddw m1, m3 + CLIPW2 m0, m1, m4, m5 + + ; store recipred[] + movu [r3], m0 + movu [r3 + 16], m1 + + ; store recqt[] + movu [r2], m0 + movu [r2 + 16], m1 + + movu m0, [r0 + r4] + movu m1, [r0 + r4 + 16] + movu m2, [r1 + r4] + movu m3, [r1 + r4 + 16] + paddw m0, m2 + paddw m1, m3 + CLIPW2 m0, m1, m4, m5 + + ; store recon[] and recipred[] + movu [r3 + r6], m0 + movu [r3 + r6 + 16], m1 + + ; store recqt[] + movu [r2 + r5], m0 + movu [r2 + r5 + 16], m1 + + lea r0, [r0 + r4 * 2] + lea r1, [r1 + r4 * 2] + lea r2, [r2 + r5 * 2] + lea r3, [r3 + r6 * 2] + + dec t7b + jnz .loop + RET +%else ;HIGH_BIT_DEPTH + +INIT_XMM sse4 +%if ARCH_X86_64 == 1 +cglobal calcRecons16, 5,8,4 + %define t7b r7b +%else +cglobal calcRecons16, 5,7,4,0-1 + %define t7b byte [rsp] +%endif + + mov r4d, r4m + mov r5d, r5m + mov r6d, r6m + add r5d, r5d + + pxor m0, m0 + mov t7b, 16 +.loop: + movu m2, [r0] + pmovzxbw m1, m2 + punpckhbw m2, m0 + paddw m1, [r1] + paddw m2, [r1 + 16] + packuswb m1, m2 + + ; store recon[] and recipred[] + movu [r3], m1 + + ; store recqt[] + pmovzxbw m2, m1 + punpckhbw m1, m0 + movu [r2], m2 + movu [r2 + 16], m1 + + add r2, r5 + add r3, r6 + add r0, r4 + lea r1, [r1 + r4 * 2] + + dec t7b + jnz .loop + RET +%endif ;HIGH_BIT_DEPTH + +%if HIGH_BIT_DEPTH +INIT_XMM sse2 +%if ARCH_X86_64 == 1 +cglobal calcRecons32, 5,8,4 + %define t7b r7b +%else +cglobal calcRecons32, 5,7,4,0-1 + %define t7b byte [rsp] 
+%endif + + mov r4d, r4m + mov r5d, r5m + mov r6d, r6m + add r4d, r4d + add r5d, r5d + add r6d, r6d + + pxor m4, m4 + mova m5, [pw_pixel_max] + mov t7b, 32/2 +.loop: + + movu m0, [r0] + movu m1, [r0 + 16] + movu m2, [r1] + movu m3, [r1 + 16] + paddw m0, m2 + paddw m1, m3 + CLIPW2 m0, m1, m4, m5 + + ; store recipred[] + movu [r3], m0 + movu [r3 + 16], m1 + + ; store recqt[] + movu [r2], m0 + movu [r2 + 16], m1 + + movu m0, [r0 + 32] + movu m1, [r0 + 48] + movu m2, [r1 + 32] + movu m3, [r1 + 48] + paddw m0, m2 + paddw m1, m3 + CLIPW2 m0, m1, m4, m5 + + ; store recon[] and recipred[] + movu [r3 + 32], m0 + movu [r3 + 48], m1 + + ; store recqt[] + movu [r2 + 32], m0 + movu [r2 + 48], m1 + add r2, r5 + + movu m0, [r0 + r4] + movu m1, [r0 + r4 + 16] + movu m2, [r1 + r4] + movu m3, [r1 + r4 + 16] + paddw m0, m2 + paddw m1, m3 + CLIPW2 m0, m1, m4, m5 + + ; store recon[] and recipred[] + movu [r3 + r6], m0 + movu [r3 + r6 + 16], m1 + + ; store recqt[] + movu [r2], m0 + movu [r2 + 16], m1 + + movu m0, [r0 + r4 + 32] + movu m1, [r0 + r4 + 48] + movu m2, [r1 + r4 + 32] + movu m3, [r1 + r4 + 48] + paddw m0, m2 + paddw m1, m3 + CLIPW2 m0, m1, m4, m5 + + ; store recon[] and recipred[] + movu [r3 + r6 + 32], m0 + movu [r3 + r6 + 48], m1 + lea r3, [r3 + r6 * 2] + + ; store recqt[] + movu [r2 + 32], m0 + movu [r2 + 48], m1 + add r2, r5 + + lea r0, [r0 + r4 * 2] + lea r1, [r1 + r4 * 2] + + dec t7b + jnz .loop + RET +%else ;HIGH_BIT_DEPTH +INIT_XMM sse4 +%if ARCH_X86_64 == 1 +cglobal calcRecons32, 5,8,4 + %define t7b r7b +%else +cglobal calcRecons32, 5,7,4,0-1 + %define t7b byte [rsp] +%endif + + mov r4d, r4m + mov r5d, r5m + mov r6d, r6m + add r5d, r5d + + pxor m0, m0 + mov t7b, 32 +.loop: + movu m2, [r0] + movu m4, [r0 + 16] + pmovzxbw m1, m2 + punpckhbw m2, m0 + pmovzxbw m3, m4 + punpckhbw m4, m0 + + paddw m1, [r1 + 0 * 16] + paddw m2, [r1 + 1 * 16] + packuswb m1, m2 + + paddw m3, [r1 + 2 * 16] + paddw m4, [r1 + 3 * 16] + packuswb m3, m4 + + ; store recon[] and recipred[] + movu 
[r3], m1 + movu [r3 + 16], m3 + + ; store recqt[] + pmovzxbw m2, m1 + punpckhbw m1, m0 + movu [r2 + 0 * 16], m2 + movu [r2 + 1 * 16], m1 + pmovzxbw m4, m3 + punpckhbw m3, m0 + movu [r2 + 2 * 16], m4 + movu [r2 + 3 * 16], m3 + + add r2, r5 + add r3, r6 + add r0, r4 + lea r1, [r1 + r4 * 2] + + dec t7b + jnz .loop + RET +%endif ;HIGH_BIT_DEPTH + + +;----------------------------------------------------------------------------- +; void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +%if HIGH_BIT_DEPTH +cglobal getResidual4, 4,4,4 + add r3, r3 + + ; row 0-1 + movh m0, [r0] + movh m1, [r0 + r3] + movh m2, [r1] + movh m3, [r1 + r3] + punpcklqdq m0, m1 + punpcklqdq m2, m3 + psubw m0, m2 + + movh [r2], m0 + movhps [r2 + r3], m0 + lea r0, [r0 + r3 * 2] + lea r1, [r1 + r3 * 2] + lea r2, [r2 + r3 * 2] + + ; row 2-3 + movh m0, [r0] + movh m1, [r0 + r3] + movh m2, [r1] + movh m3, [r1 + r3] + punpcklqdq m0, m1 + punpcklqdq m2, m3 + psubw m0, m2 + + movh [r2], m0 + movhps [r2 + r3], m0 +%else +cglobal getResidual4, 4,4,5 + pxor m0, m0 + + ; row 0-1 + movd m1, [r0] + movd m2, [r0 + r3] + movd m3, [r1] + movd m4, [r1 + r3] + punpckldq m1, m2 + punpcklbw m1, m0 + punpckldq m3, m4 + punpcklbw m3, m0 + psubw m1, m3 + movh [r2], m1 + movhps [r2 + r3 * 2], m1 + lea r0, [r0 + r3 * 2] + lea r1, [r1 + r3 * 2] + lea r2, [r2 + r3 * 4] + + ; row 2-3 + movd m1, [r0] + movd m2, [r0 + r3] + movd m3, [r1] + movd m4, [r1 + r3] + punpckldq m1, m2 + punpcklbw m1, m0 + punpckldq m3, m4 + punpcklbw m3, m0 + psubw m1, m3 + movh [r2], m1 + movhps [r2 + r3 * 2], m1 +%endif + RET + + +INIT_XMM sse2 +%if HIGH_BIT_DEPTH +cglobal getResidual8, 4,4,4 + add r3, r3 + +%assign x 0 +%rep 8/2 + ; row 0-1 + movu m1, [r0] + movu m2, [r0 + r3] + movu m3, [r1] + movu m4, [r1 + r3] + psubw m1, m3 + psubw m2, m4 + movu [r2], m1 + movu [r2 + r3], m2 +%assign x x+1 +%if (x != 4) + lea r0, [r0 + r3 * 2] + 
lea r1, [r1 + r3 * 2] + lea r2, [r2 + r3 * 2] +%endif +%endrep +%else +cglobal getResidual8, 4,4,5 + pxor m0, m0 + +%assign x 0 +%rep 8/2 + ; row 0-1 + movh m1, [r0] + movh m2, [r0 + r3] + movh m3, [r1] + movh m4, [r1 + r3] + punpcklbw m1, m0 + punpcklbw m2, m0 + punpcklbw m3, m0 + punpcklbw m4, m0 + psubw m1, m3 + psubw m2, m4 + movu [r2], m1 + movu [r2 + r3 * 2], m2 +%assign x x+1 +%if (x != 4) + lea r0, [r0 + r3 * 2] + lea r1, [r1 + r3 * 2] + lea r2, [r2 + r3 * 4] +%endif +%endrep +%endif + RET + +%if HIGH_BIT_DEPTH +INIT_XMM sse2 +cglobal getResidual16, 4,5,6 + add r3, r3 + mov r4d, 16/4 +.loop: + ; row 0-1 + movu m0, [r0] + movu m1, [r0 + 16] + movu m2, [r0 + r3] + movu m3, [r0 + r3 + 16] + movu m4, [r1] + movu m5, [r1 + 16] + psubw m0, m4 + psubw m1, m5 + movu m4, [r1 + r3] + movu m5, [r1 + r3 + 16] + psubw m2, m4 + psubw m3, m5 + lea r0, [r0 + r3 * 2] + lea r1, [r1 + r3 * 2] + + movu [r2], m0 + movu [r2 + 16], m1 + movu [r2 + r3], m2 + movu [r2 + r3 + 16], m3 + lea r2, [r2 + r3 * 2] + + ; row 2-3 + movu m0, [r0] + movu m1, [r0 + 16] + movu m2, [r0 + r3] + movu m3, [r0 + r3 + 16] + movu m4, [r1] + movu m5, [r1 + 16] + psubw m0, m4 + psubw m1, m5 + movu m4, [r1 + r3] + movu m5, [r1 + r3 + 16] + psubw m2, m4 + psubw m3, m5 + + movu [r2], m0 + movu [r2 + 16], m1 + movu [r2 + r3], m2 + movu [r2 + r3 + 16], m3 + + dec r4d + + lea r0, [r0 + r3 * 2] + lea r1, [r1 + r3 * 2] + lea r2, [r2 + r3 * 2] + + jnz .loop +%else + +INIT_XMM sse4 +cglobal getResidual16, 4,5,8 + mov r4d, 16/4 + pxor m0, m0 +.loop: + ; row 0-1 + movu m1, [r0] + movu m2, [r0 + r3] + movu m3, [r1] + movu m4, [r1 + r3] + pmovzxbw m5, m1 + punpckhbw m1, m0 + pmovzxbw m6, m2 + punpckhbw m2, m0 + pmovzxbw m7, m3 + punpckhbw m3, m0 + psubw m5, m7 + psubw m1, m3 + pmovzxbw m7, m4 + punpckhbw m4, m0 + psubw m6, m7 + psubw m2, m4 + + movu [r2], m5 + movu [r2 + 16], m1 + movu [r2 + r3 * 2], m6 + movu [r2 + r3 * 2 + 16], m2 + + lea r0, [r0 + r3 * 2] + lea r1, [r1 + r3 * 2] + lea r2, [r2 + r3 * 4] + + ; row 
2-3 + movu m1, [r0] + movu m2, [r0 + r3] + movu m3, [r1] + movu m4, [r1 + r3] + pmovzxbw m5, m1 + punpckhbw m1, m0 + pmovzxbw m6, m2 + punpckhbw m2, m0 + pmovzxbw m7, m3 + punpckhbw m3, m0 + psubw m5, m7 + psubw m1, m3 + pmovzxbw m7, m4 + punpckhbw m4, m0 + psubw m6, m7 + psubw m2, m4 + + movu [r2], m5 + movu [r2 + 16], m1 + movu [r2 + r3 * 2], m6 + movu [r2 + r3 * 2 + 16], m2 + + dec r4d + + lea r0, [r0 + r3 * 2] + lea r1, [r1 + r3 * 2] + lea r2, [r2 + r3 * 4] + + jnz .loop +%endif + + RET + +%if HIGH_BIT_DEPTH +INIT_XMM sse2 +cglobal getResidual32, 4,5,6 + add r3, r3 + mov r4d, 32/2 +.loop: + ; row 0 + movu m0, [r0] + movu m1, [r0 + 16] + movu m2, [r0 + 32] + movu m3, [r0 + 48] + movu m4, [r1] + movu m5, [r1 + 16] + psubw m0, m4 + psubw m1, m5 + movu m4, [r1 + 32] + movu m5, [r1 + 48] + psubw m2, m4 + psubw m3, m5 + + movu [r2], m0 + movu [r2 + 16], m1 + movu [r2 + 32], m2 + movu [r2 + 48], m3 + + ; row 1 + movu m0, [r0 + r3] + movu m1, [r0 + r3 + 16] + movu m2, [r0 + r3 + 32] + movu m3, [r0 + r3 + 48] + movu m4, [r1 + r3] + movu m5, [r1 + r3 + 16] + psubw m0, m4 + psubw m1, m5 + movu m4, [r1 + r3 + 32] + movu m5, [r1 + r3 + 48] + psubw m2, m4 + psubw m3, m5 + + movu [r2 + r3], m0 + movu [r2 + r3 + 16], m1 + movu [r2 + r3 + 32], m2 + movu [r2 + r3 + 48], m3 + + dec r4d + + lea r0, [r0 + r3 * 2] + lea r1, [r1 + r3 * 2] + lea r2, [r2 + r3 * 2] + + jnz .loop + +%else +INIT_XMM sse4 +cglobal getResidual32, 4,5,7 + mov r4d, 32/2 + pxor m0, m0 +.loop: + movu m1, [r0] + movu m2, [r0 + 16] + movu m3, [r1] + movu m4, [r1 + 16] + pmovzxbw m5, m1 + punpckhbw m1, m0 + pmovzxbw m6, m3 + punpckhbw m3, m0 + psubw m5, m6 + psubw m1, m3 + movu [r2 + 0 * 16], m5 + movu [r2 + 1 * 16], m1 + + pmovzxbw m5, m2 + punpckhbw m2, m0 + pmovzxbw m6, m4 + punpckhbw m4, m0 + psubw m5, m6 + psubw m2, m4 + movu [r2 + 2 * 16], m5 + movu [r2 + 3 * 16], m2 + + movu m1, [r0 + r3] + movu m2, [r0 + r3 + 16] + movu m3, [r1 + r3] + movu m4, [r1 + r3 + 16] + pmovzxbw m5, m1 + punpckhbw m1, m0 + pmovzxbw 
m6, m3 + punpckhbw m3, m0 + psubw m5, m6 + psubw m1, m3 + movu [r2 + r3 * 2 + 0 * 16], m5 + movu [r2 + r3 * 2 + 1 * 16], m1 + + pmovzxbw m5, m2 + punpckhbw m2, m0 + pmovzxbw m6, m4 + punpckhbw m4, m0 + psubw m5, m6 + psubw m2, m4 + movu [r2 + r3 * 2 + 2 * 16], m5 + movu [r2 + r3 * 2 + 3 * 16], m2 + + dec r4d + + lea r0, [r0 + r3 * 2] + lea r1, [r1 + r3 * 2] + lea r2, [r2 + r3 * 4] + + jnz .loop +%endif + RET + + +;----------------------------------------------------------------------------- +; uint32_t quant(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff); +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal quant, 5,6,8 + ; fill qbits + movd m4, r4d ; m4 = qbits + + ; fill qbits-8 + sub r4d, 8 + movd m6, r4d ; m6 = qbits8 + + ; fill offset + movd m5, r5m + pshufd m5, m5, 0 ; m5 = add + + lea r5, [pd_1] + + mov r4d, r6m + shr r4d, 3 + pxor m7, m7 ; m7 = numZero +.loop: + ; 4 coeff + movu m0, [r0] ; m0 = level + pabsd m1, m0 + pmulld m1, [r1] ; m0 = tmpLevel1 + paddd m2, m1, m5 + psrad m2, m4 ; m2 = level1 + + pslld m3, m2, 8 + psrad m1, m6 + psubd m1, m3 ; m1 = deltaU1 + + movu [r2], m1 + psignd m3, m2, m0 + pminud m2, [r5] + paddd m7, m2 + packssdw m3, m3 + movh [r3], m3 + + ; 4 coeff + movu m0, [r0 + 16] ; m0 = level + pabsd m1, m0 + pmulld m1, [r1 + 16] ; m0 = tmpLevel1 + paddd m2, m1, m5 + psrad m2, m4 ; m2 = level1 + pslld m3, m2, 8 + psrad m1, m6 + psubd m1, m3 ; m1 = deltaU1 + movu [r2 + 16], m1 + psignd m3, m2, m0 + pminud m2, [r5] + paddd m7, m2 + packssdw m3, m3 + movh [r3 + 8], m3 + + add r0, 32 + add r1, 32 + add r2, 32 + add r3, 16 + + dec r4d + jnz .loop + + pxor m0, m0 + psadbw m7, m0 + movhlps m0, m7 + paddd m7, m0 + movd eax, m7 + RET + + +IACA_START +%if ARCH_X86_64 == 1 +INIT_YMM avx2 +cglobal quant, 5,5,10 + ; fill qbits + movd xm4, r4d ; m4 = qbits + + ; fill qbits-8 + sub r4d, 8 + movd xm6, r4d ; m6 = qbits8 + + ; fill offset + 
vpbroadcastd m5, r5m ; m5 = add + + vpbroadcastw m9, [pw_1] ; m9 = word [1] + + mov r4d, r6m + shr r4d, 4 + pxor m7, m7 ; m7 = numZero +.loop: + ; 8 coeff + movu m0, [r0] ; m0 = level + pabsd m1, m0 + pmulld m1, [r1] ; m0 = tmpLevel1 + paddd m2, m1, m5 + psrad m2, xm4 ; m2 = level1 + + pslld m3, m2, 8 + psrad m1, xm6 + psubd m1, m3 ; m1 = deltaU1 + movu [r2], m1 + psignd m2, m0 + + ; 8 coeff + movu m0, [r0 + mmsize] ; m0 = level + pabsd m1, m0 + pmulld m1, [r1 + mmsize] ; m0 = tmpLevel1 + paddd m3, m1, m5 + psrad m3, xm4 ; m2 = level1 + + pslld m8, m3, 8 + psrad m1, xm6 + psubd m1, m8 ; m1 = deltaU1 + movu [r2 + mmsize], m1 + psignd m3, m0 + + packssdw m2, m3 + vpermq m2, m2, q3120 + movu [r3], m2 + + ; count non-zero coeff + ; TODO: popcnt is faster, but some CPU can't support + pminuw m2, m9 + paddw m7, m2 + + add r0, mmsize*2 + add r1, mmsize*2 + add r2, mmsize*2 + add r3, mmsize + + dec r4d + jnz .loop + + ; sum count + xorpd m0, m0 + psadbw m7, m0 + vextracti128 xm1, m7, 1 + paddd xm7, xm1 + movhlps xm0, xm7 + paddd xm7, xm0 + movd eax, xm7 + RET + +%else ; ARCH_X86_64 == 1 +INIT_YMM avx2 +cglobal quant, 5,6,8 + ; fill qbits + movd xm4, r4d ; m4 = qbits + + ; fill qbits-8 + sub r4d, 8 + movd xm6, r4d ; m6 = qbits8 + + ; fill offset + vpbroadcastd m5, r5m ; m5 = ad + + lea r5, [pd_1] + + mov r4d, r6m + shr r4d, 4 + pxor m7, m7 ; m7 = numZero +.loop: + ; 8 coeff + movu m0, [r0] ; m0 = level + pabsd m1, m0 + pmulld m1, [r1] ; m0 = tmpLevel1 + paddd m2, m1, m5 + psrad m2, xm4 ; m2 = level1 + + pslld m3, m2, 8 + psrad m1, xm6 + psubd m1, m3 ; m1 = deltaU1 + + movu [r2], m1 + psignd m3, m2, m0 + pminud m2, [r5] + paddd m7, m2 + packssdw m3, m3 + vpermq m3, m3, q0020 + movu [r3], xm3 + + ; 8 coeff + movu m0, [r0 + mmsize] ; m0 = level + pabsd m1, m0 + pmulld m1, [r1 + mmsize] ; m0 = tmpLevel1 + paddd m2, m1, m5 + psrad m2, xm4 ; m2 = level1 + + pslld m3, m2, 8 + psrad m1, xm6 + psubd m1, m3 ; m1 = deltaU1 + + movu [r2 + mmsize], m1 + psignd m3, m2, m0 + pminud m2, 
[r5] + paddd m7, m2 + packssdw m3, m3 + vpermq m3, m3, q0020 + movu [r3 + mmsize/2], xm3 + + add r0, mmsize*2 + add r1, mmsize*2 + add r2, mmsize*2 + add r3, mmsize + + dec r4d + jnz .loop + + xorpd m0, m0 + psadbw m7, m0 + vextracti128 xm1, m7, 1 + paddd xm7, xm1 + movhlps xm0, xm7 + paddd xm7, xm0 + movd eax, xm7 + RET +%endif ; ARCH_X86_64 == 1 +IACA_END + + +;----------------------------------------------------------------------------- +; uint32_t nquant(int32_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff); +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal nquant, 3,5,8 + movd m6, r4m + mov r4d, r5m + pxor m7, m7 ; m7 = numZero + movd m5, r3m ; m5 = qbits + pshufd m6, m6, 0 ; m6 = add + mov r3d, r4d ; r3 = numCoeff + shr r4d, 3 + +.loop: + movu m0, [r0] ; m0 = level + movu m1, [r0 + 16] ; m1 = level + + pabsd m2, m0 + pmulld m2, [r1] ; m0 = tmpLevel1 * qcoeff + paddd m2, m6 + psrad m2, m5 ; m0 = level1 + psignd m2, m0 + + pabsd m3, m1 + pmulld m3, [r1 + 16] ; m1 = tmpLevel1 * qcoeff + paddd m3, m6 + psrad m3, m5 ; m1 = level1 + psignd m3, m1 + + packssdw m2, m3 + + movu [r2], m2 + add r0, 32 + add r1, 32 + add r2, 16 + + pxor m4, m4 + pcmpeqw m2, m4 + psubw m7, m2 + + dec r4d + jnz .loop + + packuswb m7, m7 + psadbw m7, m4 + mov eax, r3d + movd r4d, m7 + sub eax, r4d ; numSig + RET + + +INIT_YMM avx2 +cglobal nquant, 3,5,7 + vpbroadcastd m4, r4m + vpbroadcastd m6, [pw_1] + mov r4d, r5m + pxor m5, m5 ; m7 = numZero + movd xm3, r3m ; m5 = qbits + mov r3d, r4d ; r3 = numCoeff + shr r4d, 4 + +.loop: + movu m0, [r0] ; m0 = level + pabsd m1, m0 + pmulld m1, [r1] ; m0 = tmpLevel1 * qcoeff + paddd m1, m4 + psrad m1, xm3 ; m0 = level1 + psignd m1, m0 + + movu m0, [r0 + mmsize] ; m0 = level + pabsd m2, m0 + pmulld m2, [r1 + mmsize] ; m0 = tmpLevel1 * qcoeff + paddd m2, m4 + psrad m2, xm3 ; m0 = level1 + psignd m2, m0 + + packssdw m1, m2 + vpermq m2, m1, q3120 + + movu [r2], m2 + add 
r0, mmsize * 2 + add r1, mmsize * 2 + add r2, mmsize + + pminuw m1, m6 + paddw m5, m1 + + dec r4d + jnz .loop + + pxor m0, m0 + psadbw m5, m0 + vextracti128 xm0, m5, 1 + paddd xm5, xm0 + pshufd xm0, xm5, 2 + paddd xm5, xm0 + movd eax, xm5 + RET + + +;----------------------------------------------------------------------------- +; void dequant_normal(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal dequant_normal, 5,5,5 + mova m2, [pw_1] +%if HIGH_BIT_DEPTH + cmp r3d, 32767 + jle .skip + shr r3d, 2 + sub r4d, 2 +.skip: +%endif + movd m0, r4d ; m0 = shift + add r4d, 15 + bts r3d, r4d + movd m1, r3d + pshufd m1, m1, 0 ; m1 = dword [add scale] + ; m0 = shift + ; m1 = scale + ; m2 = word [1] +.loop: + movu m3, [r0] + punpckhwd m4, m3, m2 + punpcklwd m3, m2 + pmaddwd m3, m1 ; m3 = dword (clipQCoef * scale + add) + pmaddwd m4, m1 + psrad m3, m0 + psrad m4, m0 + packssdw m3, m3 ; OPT_ME: store must be 32 bits + pmovsxwd m3, m3 + packssdw m4, m4 + pmovsxwd m4, m4 + mova [r1], m3 + mova [r1 + 16], m4 + + add r0, 16 + add r1, 32 + + sub r2d, 8 + jnz .loop + RET + + +INIT_YMM avx2 +cglobal dequant_normal, 5,5,7 + vpbroadcastd m2, [pw_1] ; m2 = word [1] + vpbroadcastd m5, [pd_32767] ; m5 = dword [32767] + vpbroadcastd m6, [pd_n32768] ; m6 = dword [-32768] +%if HIGH_BIT_DEPTH + cmp r3d, 32767 + jle .skip + shr r3d, 2 + sub r4d, 2 +.skip: +%endif + movd xm0, r4d ; m0 = shift + add r4d, -1+16 + bts r3d, r4d + vpbroadcastd m1, r3d ; m1 = dword [add scale] + + ; m0 = shift + ; m1 = scale + ; m2 = word [1] + shr r2d, 4 +.loop: + movu m3, [r0] + punpckhwd m4, m3, m2 + punpcklwd m3, m2 + pmaddwd m3, m1 ; m3 = dword (clipQCoef * scale + add) + pmaddwd m4, m1 + psrad m3, xm0 + psrad m4, xm0 + pminsd m3, m5 + pmaxsd m3, m6 + pminsd m4, m5 + pmaxsd m4, m6 + mova [r1 + 0 * mmsize/2], xm3 + mova [r1 + 1 * mmsize/2], xm4 + vextracti128 [r1 + 2 * mmsize/2], m3, 1 + 
vextracti128 [r1 + 3 * mmsize/2], m4, 1 + + add r0, mmsize + add r1, mmsize * 2 + + dec r2d + jnz .loop + RET + + +;----------------------------------------------------------------------------- +; int count_nonzero(const int16_t *quantCoeff, int numCoeff); +;----------------------------------------------------------------------------- +INIT_XMM ssse3 +cglobal count_nonzero, 2,2,3 + pxor m0, m0 + shr r1d, 4 + movd m1, r1d + pshufb m1, m0 + +.loop: + mova m2, [r0 + 0] + packsswb m2, [r0 + 16] + add r0, 32 + pcmpeqb m2, m0 + paddb m1, m2 + dec r1d + jnz .loop + + psadbw m1, m0 + pshufd m0, m1, 2 + paddd m0, m1 + movd eax, m0 + RET + + +;----------------------------------------------------------------------------------------------------------------------------------------------- +;void weight_pp(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset) +;----------------------------------------------------------------------------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal weight_pp, 6, 7, 6 + + shl r5d, 6 ; m0 = [w0<<6] + mov r6d, r6m + shl r6d, 16 + or r6d, r5d ; assuming both (w0<<6) and round are using maximum of 16 bits each. 
+ movd m0, r6d + pshufd m0, m0, 0 ; m0 = [w0<<6, round] + movd m1, r7m + movd m2, r8m + pshufd m2, m2, 0 + mova m5, [pw_1] + sub r2d, r3d + shr r3d, 4 + +.loopH: + mov r5d, r3d + +.loopW: + pmovzxbw m4, [r0] + punpcklwd m3, m4, m5 + pmaddwd m3, m0 + psrad m3, m1 + paddd m3, m2 + + punpckhwd m4, m5 + pmaddwd m4, m0 + psrad m4, m1 + paddd m4, m2 + + packssdw m3, m4 + packuswb m3, m3 + movh [r1], m3 + + pmovzxbw m4, [r0 + 8] + punpcklwd m3, m4, m5 + pmaddwd m3, m0 + psrad m3, m1 + paddd m3, m2 + + punpckhwd m4, m5 + pmaddwd m4, m0 + psrad m4, m1 + paddd m4, m2 + + packssdw m3, m4 + packuswb m3, m3 + movh [r1 + 8], m3 + + add r0, 16 + add r1, 16 + + dec r5d + jnz .loopW + + lea r0, [r0 + r2] + lea r1, [r1 + r2] + + dec r4d + jnz .loopH + RET + + +INIT_YMM avx2 +cglobal weight_pp, 6, 7, 6 + + shl r5d, 6 ; m0 = [w0<<6] + mov r6d, r6m + shl r6d, 16 + or r6d, r5d ; assuming both (w0<<6) and round are using maximum of 16 bits each. + movd xm0, r6d + pshufd xm0, xm0, 0 ; m0 = [w0<<6, round] + vinserti128 m0, m0, xm0, 1 ; document says (pshufd + vinserti128) can be replaced with vpbroadcastd m0, xm0, but having build problem, need to investigate + + movd xm1, r7m + vpbroadcastd m2, r8m + mova m5, [pw_1] + sub r2d, r3d + shr r3d, 4 + +.loopH: + mov r5d, r3d + +.loopW: + pmovzxbw m4, [r0] + punpcklwd m3, m4, m5 + pmaddwd m3, m0 + psrad m3, xm1 + paddd m3, m2 + + punpckhwd m4, m5 + pmaddwd m4, m0 + psrad m4, xm1 + paddd m4, m2 + + packssdw m3, m4 + vextracti128 xm4, m3, 1 + packuswb xm3, xm4 + movu [r1], xm3 + + add r0, 16 + add r1, 16 + + dec r5d + jnz .loopW + + lea r0, [r0 + r2] + lea r1, [r1 + r2] + + dec r4d + jnz .loopH + RET + +;------------------------------------------------------------------------------------------------------------------------------------------------- +;void weight_sp(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset) 
+;------------------------------------------------------------------------------------------------------------------------------------------------- +INIT_XMM sse4 +%if ARCH_X86_64 +cglobal weight_sp, 6, 7+2, 7 + %define tmp_r0 r7 + %define tmp_r1 r8 +%else ; ARCH_X86_64 = 0 +cglobal weight_sp, 6, 7, 7, 0-(2*4) + %define tmp_r0 [(rsp + 0 * 4)] + %define tmp_r1 [(rsp + 1 * 4)] +%endif ; ARCH_X86_64 + + movd m0, r6m ; m0 = [w0] + + movd m1, r7m ; m1 = [round] + punpcklwd m0, m1 + pshufd m0, m0, 0 ; m0 = [w0 round] + + movd m1, r8m ; m1 = [shift] + + movd m2, r9m + pshufd m2, m2, 0 ; m2 =[offset] + + mova m3, [pw_1] + mova m4, [pw_2000] + + add r2d, r2d + +.loopH: + mov r6d, r4d + + ; save old src and dst + mov tmp_r0, r0 + mov tmp_r1, r1 +.loopW: + movu m5, [r0] + paddw m5, m4 + + punpcklwd m6,m5, m3 + pmaddwd m6, m0 + psrad m6, m1 + paddd m6, m2 + + punpckhwd m5, m3 + pmaddwd m5, m0 + psrad m5, m1 + paddd m5, m2 + + packssdw m6, m5 + packuswb m6, m6 + + sub r6d, 8 + jl .width4 + movh [r1], m6 + je .nextH + add r0, 16 + add r1, 8 + + jmp .loopW + +.width4: + cmp r6d, -4 + jl .width2 + movd [r1], m6 + je .nextH + add r1, 4 + pshufd m6, m6, 1 + +.width2: + pextrw [r1], m6, 0 + +.nextH: + mov r0, tmp_r0 + mov r1, tmp_r1 + lea r0, [r0 + r2] + lea r1, [r1 + r3] + + dec r5d + jnz .loopH + + RET + +;----------------------------------------------------------------- +; void transpose_4x4(pixel *dst, pixel *src, intptr_t stride) +;----------------------------------------------------------------- +INIT_XMM sse2 +cglobal transpose4, 3, 3, 4, dest, src, stride +%if HIGH_BIT_DEPTH == 1 + add r2, r2 + movh m0, [r1] + movh m1, [r1 + r2] + movh m2, [r1 + 2 * r2] + lea r1, [r1 + 2 * r2] + movh m3, [r1 + r2] + punpcklwd m0, m1 + punpcklwd m2, m3 + punpckhdq m1, m0, m2 + punpckldq m0, m2 + movu [r0], m0 + movu [r0 + 16], m1 +%else ;HIGH_BIT_DEPTH == 0 + movd m0, [r1] + movd m1, [r1 + r2] + movd m2, [r1 + 2 * r2] + lea r1, [r1 + 2 * r2] + movd m3, [r1 + r2] + + punpcklbw m0, m1 + 
punpcklbw m2, m3 + punpcklwd m0, m2 + movu [r0], m0 +%endif + RET + +;----------------------------------------------------------------- +; void transpose_8x8(pixel *dst, pixel *src, intptr_t stride) +;----------------------------------------------------------------- +%if HIGH_BIT_DEPTH == 1 +%if ARCH_X86_64 == 1 +INIT_YMM avx2 +cglobal transpose8, 3, 5, 5 + add r2, r2 + lea r3, [3 * r2] + lea r4, [r1 + 4 * r2] + movu xm0, [r1] + vinserti128 m0, m0, [r4], 1 + movu xm1, [r1 + r2] + vinserti128 m1, m1, [r4 + r2], 1 + movu xm2, [r1 + 2 * r2] + vinserti128 m2, m2, [r4 + 2 * r2], 1 + movu xm3, [r1 + r3] + vinserti128 m3, m3, [r4 + r3], 1 + + punpcklwd m4, m0, m1 ;[1 - 4][row1row2;row5row6] + punpckhwd m0, m1 ;[5 - 8][row1row2;row5row6] + + punpcklwd m1, m2, m3 ;[1 - 4][row3row4;row7row8] + punpckhwd m2, m3 ;[5 - 8][row3row4;row7row8] + + punpckldq m3, m4, m1 ;[1 - 2][row1row2row3row4;row5row6row7row8] + punpckhdq m4, m1 ;[3 - 4][row1row2row3row4;row5row6row7row8] + + punpckldq m1, m0, m2 ;[5 - 6][row1row2row3row4;row5row6row7row8] + punpckhdq m0, m2 ;[7 - 8][row1row2row3row4;row5row6row7row8] + + vpermq m3, m3, 0xD8 ;[1 ; 2][row1row2row3row4row5row6row7row8] + vpermq m4, m4, 0xD8 ;[3 ; 4][row1row2row3row4row5row6row7row8] + vpermq m1, m1, 0xD8 ;[5 ; 6][row1row2row3row4row5row6row7row8] + vpermq m0, m0, 0xD8 ;[7 ; 8][row1row2row3row4row5row6row7row8] + + movu [r0 + 0 * 32], m3 + movu [r0 + 1 * 32], m4 + movu [r0 + 2 * 32], m1 + movu [r0 + 3 * 32], m0 + RET +%endif + +INIT_XMM sse2 +%macro TRANSPOSE_4x4 1 + movh m0, [r1] + movh m1, [r1 + r2] + movh m2, [r1 + 2 * r2] + lea r1, [r1 + 2 * r2] + movh m3, [r1 + r2] + punpcklwd m0, m1 + punpcklwd m2, m3 + punpckhdq m1, m0, m2 + punpckldq m0, m2 + movh [r0], m0 + movhps [r0 + %1], m0 + movh [r0 + 2 * %1], m1 + lea r0, [r0 + 2 * %1] + movhps [r0 + %1], m1 +%endmacro +cglobal transpose8_internal + TRANSPOSE_4x4 r5 + lea r1, [r1 + 2 * r2] + lea r0, [r3 + 8] + TRANSPOSE_4x4 r5 + lea r1, [r1 + 2 * r2] + neg r2 + lea r1, [r1 + r2 * 8 + 
8] + neg r2 + lea r0, [r3 + 4 * r5] + TRANSPOSE_4x4 r5 + lea r1, [r1 + 2 * r2] + lea r0, [r3 + 8 + 4 * r5] + TRANSPOSE_4x4 r5 + ret +cglobal transpose8, 3, 6, 4, dest, src, stride + add r2, r2 + mov r3, r0 + mov r5, 16 + call transpose8_internal + RET +%else ;HIGH_BIT_DEPTH == 0 +%if ARCH_X86_64 == 1 +INIT_YMM avx2 +cglobal transpose8, 3, 4, 4 + lea r3, [r2 * 3] + movq xm0, [r1] + movhps xm0, [r1 + 2 * r2] + movq xm1, [r1 + r2] + movhps xm1, [r1 + r3] + lea r1, [r1 + 4 * r2] + movq xm2, [r1] + movhps xm2, [r1 + 2 * r2] + movq xm3, [r1 + r2] + movhps xm3, [r1 + r3] + + vinserti128 m0, m0, xm2, 1 ;[row1 row3 row5 row7] + vinserti128 m1, m1, xm3, 1 ;[row2 row4 row6 row8] + + punpcklbw m2, m0, m1 ;[1 - 8; 1 - 8][row1row2; row5row6] + punpckhbw m0, m1 ;[1 - 8; 1 - 8][row3row4; row7row8] + + punpcklwd m1, m2, m0 ;[1 - 4; 1 - 4][row1row2row3row4; row5row6row7row8] + punpckhwd m2, m0 ;[5 - 8; 5 - 8][row1row2row3row4; row5row6row7row8] + + mova m0, [trans8_shuf] + + vpermd m1, m0, m1 ;[1 - 2; 3 - 4][row1row2row3row4row5row6row7row8] + vpermd m2, m0, m2 ;[4 - 5; 6 - 7][row1row2row3row4row5row6row7row8] + + movu [r0], m1 + movu [r0 + 32], m2 + RET +%endif + +INIT_XMM sse2 +cglobal transpose8, 3, 5, 8, dest, src, stride + lea r3, [2 * r2] + lea r4, [3 * r2] + movh m0, [r1] + movh m1, [r1 + r2] + movh m2, [r1 + r3] + movh m3, [r1 + r4] + movh m4, [r1 + 4 * r2] + lea r1, [r1 + 4 * r2] + movh m5, [r1 + r2] + movh m6, [r1 + r3] + movh m7, [r1 + r4] + + punpcklbw m0, m1 + punpcklbw m2, m3 + punpcklbw m4, m5 + punpcklbw m6, m7 + + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + punpckhwd m5, m4, m6 + punpcklwd m4, m6 + punpckhdq m2, m0, m4 + punpckldq m0, m4 + punpckhdq m3, m1, m5 + punpckldq m1, m5 + + movu [r0], m0 + movu [r0 + 16], m2 + movu [r0 + 32], m1 + movu [r0 + 48], m3 + RET +%endif + +%macro TRANSPOSE_8x8 1 + + movh m0, [r1] + movh m1, [r1 + r2] + movh m2, [r1 + 2 * r2] + lea r1, [r1 + 2 * r2] + movh m3, [r1 + r2] + movh m4, [r1 + 2 * r2] + lea r1, [r1 + 2 * r2] + movh m5, [r1 
+ r2] + movh m6, [r1 + 2 * r2] + lea r1, [r1 + 2 * r2] + movh m7, [r1 + r2] + + punpcklbw m0, m1 + punpcklbw m2, m3 + punpcklbw m4, m5 + punpcklbw m6, m7 + + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + punpckhwd m5, m4, m6 + punpcklwd m4, m6 + punpckhdq m2, m0, m4 + punpckldq m0, m4 + punpckhdq m3, m1, m5 + punpckldq m1, m5 + + movh [r0], m0 + movhps [r0 + %1], m0 + movh [r0 + 2 * %1], m2 + lea r0, [r0 + 2 * %1] + movhps [r0 + %1], m2 + movh [r0 + 2 * %1], m1 + lea r0, [r0 + 2 * %1] + movhps [r0 + %1], m1 + movh [r0 + 2 * %1], m3 + lea r0, [r0 + 2 * %1] + movhps [r0 + %1], m3 + +%endmacro + + +;----------------------------------------------------------------- +; void transpose_16x16(pixel *dst, pixel *src, intptr_t stride) +;----------------------------------------------------------------- +%if HIGH_BIT_DEPTH == 1 +%if ARCH_X86_64 == 1 +INIT_YMM avx2 +cglobal transpose16x8_internal + movu m0, [r1] + movu m1, [r1 + r2] + movu m2, [r1 + 2 * r2] + movu m3, [r1 + r3] + lea r1, [r1 + 4 * r2] + + movu m4, [r1] + movu m5, [r1 + r2] + movu m6, [r1 + 2 * r2] + movu m7, [r1 + r3] + + punpcklwd m8, m0, m1 ;[1 - 4; 9 - 12][1 2] + punpckhwd m0, m1 ;[5 - 8; 13 -16][1 2] + + punpcklwd m1, m2, m3 ;[1 - 4; 9 - 12][3 4] + punpckhwd m2, m3 ;[5 - 8; 13 -16][3 4] + + punpcklwd m3, m4, m5 ;[1 - 4; 9 - 12][5 6] + punpckhwd m4, m5 ;[5 - 8; 13 -16][5 6] + + punpcklwd m5, m6, m7 ;[1 - 4; 9 - 12][7 8] + punpckhwd m6, m7 ;[5 - 8; 13 -16][7 8] + + punpckldq m7, m8, m1 ;[1 - 2; 9 - 10][1 2 3 4] + punpckhdq m8, m1 ;[3 - 4; 11 - 12][1 2 3 4] + + punpckldq m1, m3, m5 ;[1 - 2; 9 - 10][5 6 7 8] + punpckhdq m3, m5 ;[3 - 4; 11 - 12][5 6 7 8] + + punpckldq m5, m0, m2 ;[5 - 6; 13 - 14][1 2 3 4] + punpckhdq m0, m2 ;[7 - 8; 15 - 16][1 2 3 4] + + punpckldq m2, m4, m6 ;[5 - 6; 13 - 14][5 6 7 8] + punpckhdq m4, m6 ;[7 - 8; 15 - 16][5 6 7 8] + + punpcklqdq m6, m7, m1 ;[1 ; 9 ][1 2 3 4 5 6 7 8] + punpckhqdq m7, m1 ;[2 ; 10][1 2 3 4 5 6 7 8] + + punpcklqdq m1, m8, m3 ;[3 ; 11][1 2 3 4 5 6 7 8] + punpckhqdq m8, m3 
;[4 ; 12][1 2 3 4 5 6 7 8] + + punpcklqdq m3, m5, m2 ;[5 ; 13][1 2 3 4 5 6 7 8] + punpckhqdq m5, m2 ;[6 ; 14][1 2 3 4 5 6 7 8] + + punpcklqdq m2, m0, m4 ;[7 ; 15][1 2 3 4 5 6 7 8] + punpckhqdq m0, m4 ;[8 ; 16][1 2 3 4 5 6 7 8] + + movu [r0 + 0 * 32], xm6 + vextracti128 [r0 + 8 * 32], m6, 1 + movu [r0 + 1 * 32], xm7 + vextracti128 [r0 + 9 * 32], m7, 1 + movu [r0 + 2 * 32], xm1 + vextracti128 [r0 + 10 * 32], m1, 1 + movu [r0 + 3 * 32], xm8 + vextracti128 [r0 + 11 * 32], m8, 1 + movu [r0 + 4 * 32], xm3 + vextracti128 [r0 + 12 * 32], m3, 1 + movu [r0 + 5 * 32], xm5 + vextracti128 [r0 + 13 * 32], m5, 1 + movu [r0 + 6 * 32], xm2 + vextracti128 [r0 + 14 * 32], m2, 1 + movu [r0 + 7 * 32], xm0 + vextracti128 [r0 + 15 * 32], m0, 1 + ret + +cglobal transpose16, 3, 4, 9 + add r2, r2 + lea r3, [r2 * 3] + call transpose16x8_internal + lea r1, [r1 + 4 * r2] + add r0, 16 + call transpose16x8_internal + RET +%endif +INIT_XMM sse2 +cglobal transpose16, 3, 7, 4, dest, src, stride + add r2, r2 + mov r3, r0 + mov r4, r1 + mov r5, 32 + mov r6, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 16] + mov r3, r0 + call transpose8_internal + lea r1, [r4 + 16] + lea r0, [r6 + 8 * r5] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 8 * r5 + 16] + mov r3, r0 + call transpose8_internal + RET +%else ;HIGH_BIT_DEPTH == 0 +%if ARCH_X86_64 == 1 +INIT_YMM avx2 +cglobal transpose16, 3, 5, 9 + lea r3, [r2 * 3] + lea r4, [r1 + 8 * r2] + + movu xm0, [r1] + movu xm1, [r1 + r2] + movu xm2, [r1 + 2 * r2] + movu xm3, [r1 + r3] + vinserti128 m0, m0, [r4], 1 + vinserti128 m1, m1, [r4 + r2], 1 + vinserti128 m2, m2, [r4 + 2 * r2], 1 + vinserti128 m3, m3, [r4 + r3], 1 + lea r1, [r1 + 4 * r2] + lea r4, [r4 + 4 * r2] + + movu xm4, [r1] + movu xm5, [r1 + r2] + movu xm6, [r1 + 2 * r2] + movu xm7, [r1 + r3] + vinserti128 m4, m4, [r4], 1 + vinserti128 m5, m5, [r4 + r2], 1 + vinserti128 m6, m6, [r4 + 2 * r2], 1 + vinserti128 m7, m7, [r4 + r3], 1 + + punpcklbw 
m8, m0, m1 ;[1 - 8 ; 1 - 8 ][1 2 9 10] + punpckhbw m0, m1 ;[9 - 16; 9 - 16][1 2 9 10] + + punpcklbw m1, m2, m3 ;[1 - 8 ; 1 - 8 ][3 4 11 12] + punpckhbw m2, m3 ;[9 - 16; 9 - 16][3 4 11 12] + + punpcklbw m3, m4, m5 ;[1 - 8 ; 1 - 8 ][5 6 13 14] + punpckhbw m4, m5 ;[9 - 16; 9 - 16][5 6 13 14] + + punpcklbw m5, m6, m7 ;[1 - 8 ; 1 - 8 ][7 8 15 16] + punpckhbw m6, m7 ;[9 - 16; 9 - 16][7 8 15 16] + + punpcklwd m7, m8, m1 ;[1 - 4 ; 1 - 4][1 2 3 4 9 10 11 12] + punpckhwd m8, m1 ;[5 - 8 ; 5 - 8][1 2 3 4 9 10 11 12] + + punpcklwd m1, m3, m5 ;[1 - 4 ; 1 - 4][5 6 7 8 13 14 15 16] + punpckhwd m3, m5 ;[5 - 8 ; 5 - 8][5 6 7 8 13 14 15 16] + + punpcklwd m5, m0, m2 ;[9 - 12; 9 - 12][1 2 3 4 9 10 11 12] + punpckhwd m0, m2 ;[13- 16; 13 - 16][1 2 3 4 9 10 11 12] + + punpcklwd m2, m4, m6 ;[9 - 12; 9 - 12][5 6 7 8 13 14 15 16] + punpckhwd m4, m6 ;[13- 16; 13 - 16][5 6 7 8 13 14 15 16] + + punpckldq m6, m7, m1 ;[1 - 2 ; 1 - 2][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] + punpckhdq m7, m1 ;[3 - 4 ; 3 - 4][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] + + punpckldq m1, m8, m3 ;[5 - 6 ; 5 - 6][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] + punpckhdq m8, m3 ;[7 - 8 ; 7 - 8][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] + + punpckldq m3, m5, m2 ;[9 - 10; 9 - 10][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] + punpckhdq m5, m2 ;[11- 12; 11 - 12][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] + + punpckldq m2, m0, m4 ;[13- 14; 13 - 14][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] + punpckhdq m0, m4 ;[15- 16; 15 - 16][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] + + vpermq m6, m6, 0xD8 + vpermq m7, m7, 0xD8 + vpermq m1, m1, 0xD8 + vpermq m8, m8, 0xD8 + vpermq m3, m3, 0xD8 + vpermq m5, m5, 0xD8 + vpermq m2, m2, 0xD8 + vpermq m0, m0, 0xD8 + + movu [r0 + 0 * 16], m6 + movu [r0 + 2 * 16], m7 + movu [r0 + 4 * 16], m1 + movu [r0 + 6 * 16], m8 + movu [r0 + 8 * 16], m3 + movu [r0 + 10 * 16], m5 + movu [r0 + 12 * 16], m2 + movu [r0 + 14 * 16], m0 + RET +%endif +INIT_XMM sse2 +cglobal transpose16, 3, 5, 8, dest, src, stride + mov r3, r0 + mov r4, r1 
+ TRANSPOSE_8x8 16 + lea r1, [r1 + 2 * r2] + lea r0, [r3 + 8] + TRANSPOSE_8x8 16 + lea r1, [r4 + 8] + lea r0, [r3 + 8 * 16] + TRANSPOSE_8x8 16 + lea r1, [r1 + 2 * r2] + lea r0, [r3 + 8 * 16 + 8] + TRANSPOSE_8x8 16 + RET +%endif + +cglobal transpose16_internal + TRANSPOSE_8x8 r6 + lea r1, [r1 + 2 * r2] + lea r0, [r5 + 8] + TRANSPOSE_8x8 r6 + lea r1, [r1 + 2 * r2] + neg r2 + lea r1, [r1 + r2 * 8] + lea r1, [r1 + r2 * 8 + 8] + neg r2 + lea r0, [r5 + 8 * r6] + TRANSPOSE_8x8 r6 + lea r1, [r1 + 2 * r2] + lea r0, [r5 + 8 * r6 + 8] + TRANSPOSE_8x8 r6 + ret + +;----------------------------------------------------------------- +; void transpose_32x32(pixel *dst, pixel *src, intptr_t stride) +;----------------------------------------------------------------- +%if HIGH_BIT_DEPTH == 1 +%if ARCH_X86_64 == 1 +INIT_YMM avx2 +cglobal transpose8x32_internal + movu m0, [r1] + movu m1, [r1 + 32] + movu m2, [r1 + r2] + movu m3, [r1 + r2 + 32] + movu m4, [r1 + 2 * r2] + movu m5, [r1 + 2 * r2 + 32] + movu m6, [r1 + r3] + movu m7, [r1 + r3 + 32] + lea r1, [r1 + 4 * r2] + + punpcklwd m8, m0, m2 ;[1 - 4; 9 - 12][1 2] + punpckhwd m0, m2 ;[5 - 8; 13 - 16][1 2] + + punpcklwd m2, m4, m6 ;[1 - 4; 9 - 12][3 4] + punpckhwd m4, m6 ;[5 - 8; 13 - 16][3 4] + + punpcklwd m6, m1, m3 ;[17 - 20; 25 - 28][1 2] + punpckhwd m1, m3 ;[21 - 24; 29 - 32][1 2] + + punpcklwd m3, m5, m7 ;[17 - 20; 25 - 28][3 4] + punpckhwd m5, m7 ;[21 - 24; 29 - 32][3 4] + + punpckldq m7, m8, m2 ;[1 - 2; 9 - 10][1 2 3 4] + punpckhdq m8, m2 ;[3 - 4; 11 - 12][1 2 3 4] + + punpckldq m2, m0, m4 ;[5 - 6; 13 - 14][1 2 3 4] + punpckhdq m0, m4 ;[7 - 8; 15 - 16][1 2 3 4] + + punpckldq m4, m6, m3 ;[17 - 18; 25 - 26][1 2 3 4] + punpckhdq m6, m3 ;[19 - 20; 27 - 28][1 2 3 4] + + punpckldq m3, m1, m5 ;[21 - 22; 29 - 30][1 2 3 4] + punpckhdq m1, m5 ;[23 - 24; 31 - 32][1 2 3 4] + + movq [r0 + 0 * 64], xm7 + movhps [r0 + 1 * 64], xm7 + vextracti128 xm5, m7, 1 + movq [r0 + 8 * 64], xm5 + movhps [r0 + 9 * 64], xm5 + + movu m7, [r1] + movu m9, [r1 + 
32] + movu m10, [r1 + r2] + movu m11, [r1 + r2 + 32] + movu m12, [r1 + 2 * r2] + movu m13, [r1 + 2 * r2 + 32] + movu m14, [r1 + r3] + movu m15, [r1 + r3 + 32] + + punpcklwd m5, m7, m10 ;[1 - 4; 9 - 12][5 6] + punpckhwd m7, m10 ;[5 - 8; 13 - 16][5 6] + + punpcklwd m10, m12, m14 ;[1 - 4; 9 - 12][7 8] + punpckhwd m12, m14 ;[5 - 8; 13 - 16][7 8] + + punpcklwd m14, m9, m11 ;[17 - 20; 25 - 28][5 6] + punpckhwd m9, m11 ;[21 - 24; 29 - 32][5 6] + + punpcklwd m11, m13, m15 ;[17 - 20; 25 - 28][7 8] + punpckhwd m13, m15 ;[21 - 24; 29 - 32][7 8] + + punpckldq m15, m5, m10 ;[1 - 2; 9 - 10][5 6 7 8] + punpckhdq m5, m10 ;[3 - 4; 11 - 12][5 6 7 8] + + punpckldq m10, m7, m12 ;[5 - 6; 13 - 14][5 6 7 8] + punpckhdq m7, m12 ;[7 - 8; 15 - 16][5 6 7 8] + + punpckldq m12, m14, m11 ;[17 - 18; 25 - 26][5 6 7 8] + punpckhdq m14, m11 ;[19 - 20; 27 - 28][5 6 7 8] + + punpckldq m11, m9, m13 ;[21 - 22; 29 - 30][5 6 7 8] + punpckhdq m9, m13 ;[23 - 24; 31 - 32][5 6 7 8] + + movq [r0 + 0 * 64 + 8], xm15 + movhps [r0 + 1 * 64 + 8], xm15 + vextracti128 xm13, m15, 1 + movq [r0 + 8 * 64 + 8], xm13 + movhps [r0 + 9 * 64 + 8], xm13 + + punpcklqdq m13, m8, m5 ;[3 ; 11][1 2 3 4 5 6 7 8] + punpckhqdq m8, m5 ;[4 ; 12][1 2 3 4 5 6 7 8] + + punpcklqdq m5, m2, m10 ;[5 ; 13][1 2 3 4 5 6 7 8] + punpckhqdq m2, m10 ;[6 ; 14][1 2 3 4 5 6 7 8] + + punpcklqdq m10, m0, m7 ;[7 ; 15][1 2 3 4 5 6 7 8] + punpckhqdq m0, m7 ;[8 ; 16][1 2 3 4 5 6 7 8] + + punpcklqdq m7, m4, m12 ;[17 ; 25][1 2 3 4 5 6 7 8] + punpckhqdq m4, m12 ;[18 ; 26][1 2 3 4 5 6 7 8] + + punpcklqdq m12, m6, m14 ;[19 ; 27][1 2 3 4 5 6 7 8] + punpckhqdq m6, m14 ;[20 ; 28][1 2 3 4 5 6 7 8] + + punpcklqdq m14, m3, m11 ;[21 ; 29][1 2 3 4 5 6 7 8] + punpckhqdq m3, m11 ;[22 ; 30][1 2 3 4 5 6 7 8] + + punpcklqdq m11, m1, m9 ;[23 ; 31][1 2 3 4 5 6 7 8] + punpckhqdq m1, m9 ;[24 ; 32][1 2 3 4 5 6 7 8] + + movu [r0 + 2 * 64], xm13 + vextracti128 [r0 + 10 * 64], m13, 1 + + movu [r0 + 3 * 64], xm8 + vextracti128 [r0 + 11 * 64], m8, 1 + + movu [r0 + 4 * 64], xm5 + 
vextracti128 [r0 + 12 * 64], m5, 1 + + movu [r0 + 5 * 64], xm2 + vextracti128 [r0 + 13 * 64], m2, 1 + + movu [r0 + 6 * 64], xm10 + vextracti128 [r0 + 14 * 64], m10, 1 + + movu [r0 + 7 * 64], xm0 + vextracti128 [r0 + 15 * 64], m0, 1 + + movu [r0 + 16 * 64], xm7 + vextracti128 [r0 + 24 * 64], m7, 1 + + movu [r0 + 17 * 64], xm4 + vextracti128 [r0 + 25 * 64], m4, 1 + + movu [r0 + 18 * 64], xm12 + vextracti128 [r0 + 26 * 64], m12, 1 + + movu [r0 + 19 * 64], xm6 + vextracti128 [r0 + 27 * 64], m6, 1 + + movu [r0 + 20 * 64], xm14 + vextracti128 [r0 + 28 * 64], m14, 1 + + movu [r0 + 21 * 64], xm3 + vextracti128 [r0 + 29 * 64], m3, 1 + + movu [r0 + 22 * 64], xm11 + vextracti128 [r0 + 30 * 64], m11, 1 + + movu [r0 + 23 * 64], xm1 + vextracti128 [r0 + 31 * 64], m1, 1 + ret + +cglobal transpose32, 3, 4, 16 + add r2, r2 + lea r3, [r2 * 3] + call transpose8x32_internal + add r0, 16 + lea r1, [r1 + 4 * r2] + call transpose8x32_internal + add r0, 16 + lea r1, [r1 + 4 * r2] + call transpose8x32_internal + add r0, 16 + lea r1, [r1 + 4 * r2] + call transpose8x32_internal + RET +%endif +INIT_XMM sse2 +cglobal transpose32, 3, 7, 4, dest, src, stride + add r2, r2 + mov r3, r0 + mov r4, r1 + mov r5, 64 + mov r6, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 16] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 32] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 48] + mov r3, r0 + call transpose8_internal + lea r1, [r4 + 16] + lea r0, [r6 + 8 * 64] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 8 * 64 + 16] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 8 * 64 + 32] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 8 * 64 + 48] + mov r3, r0 + call transpose8_internal + lea r1, [r4 + 32] + lea r0, [r6 + 16 * 64] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, 
[r6 + 16 * 64 + 16] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 16 * 64 + 32] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 16 * 64 + 48] + mov r3, r0 + call transpose8_internal + lea r1, [r4 + 48] + lea r0, [r6 + 24 * 64] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 24 * 64 + 16] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 24 * 64 + 32] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 24 * 64 + 48] + mov r3, r0 + call transpose8_internal + RET +%else ;HIGH_BIT_DEPTH == 0 +INIT_XMM sse2 +cglobal transpose32, 3, 7, 8, dest, src, stride + mov r3, r0 + mov r4, r1 + mov r5, r0 + mov r6, 32 + call transpose16_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r3 + 16] + mov r5, r0 + call transpose16_internal + lea r1, [r4 + 16] + lea r0, [r3 + 16 * 32] + mov r5, r0 + call transpose16_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r3 + 16 * 32 + 16] + mov r5, r0 + call transpose16_internal + RET + +%if ARCH_X86_64 == 1 +INIT_YMM avx2 +cglobal transpose32, 3, 5, 16 + lea r3, [r2 * 3] + mov r4d, 2 + +.loop: + movu m0, [r1] + movu m1, [r1 + r2] + movu m2, [r1 + 2 * r2] + movu m3, [r1 + r3] + lea r1, [r1 + 4 * r2] + + movu m4, [r1] + movu m5, [r1 + r2] + movu m6, [r1 + 2 * r2] + movu m7, [r1 + r3] + + punpcklbw m8, m0, m1 ;[1 - 8 ; 17 - 24][1 2] + punpckhbw m0, m1 ;[9 - 16; 25 - 32][1 2] + + punpcklbw m1, m2, m3 ;[1 - 8 ; 17 - 24][3 4] + punpckhbw m2, m3 ;[9 - 16; 25 - 32][3 4] + + punpcklbw m3, m4, m5 ;[1 - 8 ; 17 - 24][5 6] + punpckhbw m4, m5 ;[9 - 16; 25 - 32][5 6] + + punpcklbw m5, m6, m7 ;[1 - 8 ; 17 - 24][7 8] + punpckhbw m6, m7 ;[9 - 16; 25 - 32][7 8] + + punpcklwd m7, m8, m1 ;[1 - 4 ; 17 - 20][1 2 3 4] + punpckhwd m8, m1 ;[5 - 8 ; 20 - 24][1 2 3 4] + + punpcklwd m1, m3, m5 ;[1 - 4 ; 17 - 20][5 6 7 8] + punpckhwd m3, m5 ;[5 - 8 ; 20 - 24][5 6 7 8] + + punpcklwd m5, m0, m2 ;[9 - 12; 
25 - 28][1 2 3 4] + punpckhwd m0, m2 ;[13- 15; 29 - 32][1 2 3 4] + + punpcklwd m2, m4, m6 ;[9 - 12; 25 - 28][5 6 7 8] + punpckhwd m4, m6 ;[13- 15; 29 - 32][5 6 7 8] + + punpckldq m6, m7, m1 ;[1 - 2 ; 17 - 18][1 2 3 4 5 6 7 8] + punpckhdq m7, m1 ;[3 - 4 ; 19 - 20][1 2 3 4 5 6 7 8] + + punpckldq m1, m8, m3 ;[5 - 6 ; 21 - 22][1 2 3 4 5 6 7 8] + punpckhdq m8, m3 ;[7 - 8 ; 23 - 24][1 2 3 4 5 6 7 8] + + punpckldq m3, m5, m2 ;[9 - 10; 25 - 26][1 2 3 4 5 6 7 8] + punpckhdq m5, m2 ;[11- 12; 27 - 28][1 2 3 4 5 6 7 8] + + punpckldq m2, m0, m4 ;[13- 14; 29 - 30][1 2 3 4 5 6 7 8] + punpckhdq m0, m4 ;[15- 16; 31 - 32][1 2 3 4 5 6 7 8] + + movq [r0 + 0 * 32], xm6 + movhps [r0 + 1 * 32], xm6 + vextracti128 xm4, m6, 1 + movq [r0 + 16 * 32], xm4 + movhps [r0 + 17 * 32], xm4 + + lea r1, [r1 + 4 * r2] + movu m9, [r1] + movu m10, [r1 + r2] + movu m11, [r1 + 2 * r2] + movu m12, [r1 + r3] + lea r1, [r1 + 4 * r2] + + movu m13, [r1] + movu m14, [r1 + r2] + movu m15, [r1 + 2 * r2] + movu m6, [r1 + r3] + + punpcklbw m4, m9, m10 ;[1 - 8 ; 17 - 24][9 10] + punpckhbw m9, m10 ;[9 - 16; 25 - 32][9 10] + + punpcklbw m10, m11, m12 ;[1 - 8 ; 17 - 24][11 12] + punpckhbw m11, m12 ;[9 - 16; 25 - 32][11 12] + + punpcklbw m12, m13, m14 ;[1 - 8 ; 17 - 24][13 14] + punpckhbw m13, m14 ;[9 - 16; 25 - 32][13 14] + + punpcklbw m14, m15, m6 ;[1 - 8 ; 17 - 24][15 16] + punpckhbw m15, m6 ;[9 - 16; 25 - 32][15 16] + + punpcklwd m6, m4, m10 ;[1 - 4 ; 17 - 20][9 10 11 12] + punpckhwd m4, m10 ;[5 - 8 ; 20 - 24][9 10 11 12] + + punpcklwd m10, m12, m14 ;[1 - 4 ; 17 - 20][13 14 15 16] + punpckhwd m12, m14 ;[5 - 8 ; 20 - 24][13 14 15 16] + + punpcklwd m14, m9, m11 ;[9 - 12; 25 - 28][9 10 11 12] + punpckhwd m9, m11 ;[13- 16; 29 - 32][9 10 11 12] + + punpcklwd m11, m13, m15 ;[9 - 12; 25 - 28][13 14 15 16] + punpckhwd m13, m15 ;[13- 16; 29 - 32][13 14 15 16] + + punpckldq m15, m6, m10 ;[1 - 2 ; 17 - 18][9 10 11 12 13 14 15 16] + punpckhdq m6, m10 ;[3 - 4 ; 19 - 20][9 10 11 12 13 14 15 16] + + punpckldq m10, m4, m12 ;[5 - 6 
; 21 - 22][9 10 11 12 13 14 15 16] + punpckhdq m4, m12 ;[7 - 8 ; 23 - 24][9 10 11 12 13 14 15 16] + + punpckldq m12, m14, m11 ;[9 - 10; 25 - 26][9 10 11 12 13 14 15 16] + punpckhdq m14, m11 ;[11- 12; 27 - 28][9 10 11 12 13 14 15 16] + + punpckldq m11, m9, m13 ;[13- 14; 29 - 30][9 10 11 12 13 14 15 16] + punpckhdq m9, m13 ;[15- 16; 31 - 32][9 10 11 12 13 14 15 16] + + + punpcklqdq m13, m7, m6 ;[3 ; 19][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] + punpckhqdq m7, m6 ;[4 ; 20][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] + + punpcklqdq m6, m1, m10 ;[5 ; 21][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] + punpckhqdq m1, m10 ;[6 ; 22][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] + + punpcklqdq m10, m8, m4 ;[7 ; 23][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] + punpckhqdq m8, m4 ;[8 ; 24][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] + + punpcklqdq m4, m3, m12 ;[9 ; 25][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] + punpckhqdq m3, m12 ;[10; 26][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] + + punpcklqdq m12, m5, m14 ;[11; 27][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] + punpckhqdq m5, m14 ;[12; 28][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] + + punpcklqdq m14, m2, m11 ;[13; 29][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] + punpckhqdq m2, m11 ;[14; 30][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] + + punpcklqdq m11, m0, m9 ;[15; 31][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] + punpckhqdq m0, m9 ;[16; 32][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] + + movq [r0 + 0 * 32 + 8], xm15 + movhps [r0 + 1 * 32 + 8], xm15 + vextracti128 xm9, m15, 1 + movq [r0 + 16 * 32 + 8], xm9 + movhps [r0 + 17 * 32 + 8], xm9 + + movu [r0 + 2 * 32], xm13 + vextracti128 [r0 + 18 * 32], m13, 1 + + movu [r0 + 3 * 32], xm7 + vextracti128 [r0 + 19 * 32], m7, 1 + + movu [r0 + 4 * 32], xm6 + vextracti128 [r0 + 20 * 32], m6, 1 + + movu [r0 + 5 * 32], xm1 + vextracti128 [r0 + 21 * 32], m1, 1 + + movu [r0 + 6 * 32], xm10 + vextracti128 [r0 + 22 * 32], m10, 1 + + movu [r0 + 7 * 32], xm8 + vextracti128 [r0 + 23 * 32], m8, 1 + + movu [r0 + 8 * 32], xm4 + vextracti128 
[r0 + 24 * 32], m4, 1 + + movu [r0 + 9 * 32], xm3 + vextracti128 [r0 + 25 * 32], m3, 1 + + movu [r0 + 10 * 32], xm12 + vextracti128 [r0 + 26 * 32], m12, 1 + + movu [r0 + 11 * 32], xm5 + vextracti128 [r0 + 27 * 32], m5, 1 + + movu [r0 + 12 * 32], xm14 + vextracti128 [r0 + 28 * 32], m14, 1 + + movu [r0 + 13 * 32], xm2 + vextracti128 [r0 + 29 * 32], m2, 1 + + movu [r0 + 14 * 32], xm11 + vextracti128 [r0 + 30 * 32], m11, 1 + + movu [r0 + 15 * 32], xm0 + vextracti128 [r0 + 31 * 32], m0, 1 + + add r0, 16 + lea r1, [r1 + 4 * r2] + dec r4d + jnz .loop + RET +%endif +%endif + +;----------------------------------------------------------------- +; void transpose_64x64(pixel *dst, pixel *src, intptr_t stride) +;----------------------------------------------------------------- +%if HIGH_BIT_DEPTH == 1 +%if ARCH_X86_64 == 1 +INIT_YMM avx2 +cglobal transpose8x32_64_internal + movu m0, [r1] + movu m1, [r1 + 32] + movu m2, [r1 + r2] + movu m3, [r1 + r2 + 32] + movu m4, [r1 + 2 * r2] + movu m5, [r1 + 2 * r2 + 32] + movu m6, [r1 + r3] + movu m7, [r1 + r3 + 32] + lea r1, [r1 + 4 * r2] + + punpcklwd m8, m0, m2 ;[1 - 4; 9 - 12][1 2] + punpckhwd m0, m2 ;[5 - 8; 13 - 16][1 2] + + punpcklwd m2, m4, m6 ;[1 - 4; 9 - 12][3 4] + punpckhwd m4, m6 ;[5 - 8; 13 - 16][3 4] + + punpcklwd m6, m1, m3 ;[17 - 20; 25 - 28][1 2] + punpckhwd m1, m3 ;[21 - 24; 29 - 32][1 2] + + punpcklwd m3, m5, m7 ;[17 - 20; 25 - 28][3 4] + punpckhwd m5, m7 ;[21 - 24; 29 - 32][3 4] + + punpckldq m7, m8, m2 ;[1 - 2; 9 - 10][1 2 3 4] + punpckhdq m8, m2 ;[3 - 4; 11 - 12][1 2 3 4] + + punpckldq m2, m0, m4 ;[5 - 6; 13 - 14][1 2 3 4] + punpckhdq m0, m4 ;[7 - 8; 15 - 16][1 2 3 4] + + punpckldq m4, m6, m3 ;[17 - 18; 25 - 26][1 2 3 4] + punpckhdq m6, m3 ;[19 - 20; 27 - 28][1 2 3 4] + + punpckldq m3, m1, m5 ;[21 - 22; 29 - 30][1 2 3 4] + punpckhdq m1, m5 ;[23 - 24; 31 - 32][1 2 3 4] + + movq [r0 + 0 * 128], xm7 + movhps [r0 + 1 * 128], xm7 + vextracti128 xm5, m7, 1 + movq [r0 + 8 * 128], xm5 + movhps [r0 + 9 * 128], xm5 + + movu m7, 
[r1] + movu m9, [r1 + 32] + movu m10, [r1 + r2] + movu m11, [r1 + r2 + 32] + movu m12, [r1 + 2 * r2] + movu m13, [r1 + 2 * r2 + 32] + movu m14, [r1 + r3] + movu m15, [r1 + r3 + 32] + + punpcklwd m5, m7, m10 ;[1 - 4; 9 - 12][5 6] + punpckhwd m7, m10 ;[5 - 8; 13 - 16][5 6] + + punpcklwd m10, m12, m14 ;[1 - 4; 9 - 12][7 8] + punpckhwd m12, m14 ;[5 - 8; 13 - 16][7 8] + + punpcklwd m14, m9, m11 ;[17 - 20; 25 - 28][5 6] + punpckhwd m9, m11 ;[21 - 24; 29 - 32][5 6] + + punpcklwd m11, m13, m15 ;[17 - 20; 25 - 28][7 8] + punpckhwd m13, m15 ;[21 - 24; 29 - 32][7 8] + + punpckldq m15, m5, m10 ;[1 - 2; 9 - 10][5 6 7 8] + punpckhdq m5, m10 ;[3 - 4; 11 - 12][5 6 7 8] + + punpckldq m10, m7, m12 ;[5 - 6; 13 - 14][5 6 7 8] + punpckhdq m7, m12 ;[7 - 8; 15 - 16][5 6 7 8] + + punpckldq m12, m14, m11 ;[17 - 18; 25 - 26][5 6 7 8] + punpckhdq m14, m11 ;[19 - 20; 27 - 28][5 6 7 8] + + punpckldq m11, m9, m13 ;[21 - 22; 29 - 30][5 6 7 8] + punpckhdq m9, m13 ;[23 - 24; 31 - 32][5 6 7 8] + + movq [r0 + 0 * 128 + 8], xm15 + movhps [r0 + 1 * 128 + 8], xm15 + vextracti128 xm13, m15, 1 + movq [r0 + 8 * 128 + 8], xm13 + movhps [r0 + 9 * 128 + 8], xm13 + + punpcklqdq m13, m8, m5 ;[3 ; 11][1 2 3 4 5 6 7 8] + punpckhqdq m8, m5 ;[4 ; 12][1 2 3 4 5 6 7 8] + + punpcklqdq m5, m2, m10 ;[5 ; 13][1 2 3 4 5 6 7 8] + punpckhqdq m2, m10 ;[6 ; 14][1 2 3 4 5 6 7 8] + + punpcklqdq m10, m0, m7 ;[7 ; 15][1 2 3 4 5 6 7 8] + punpckhqdq m0, m7 ;[8 ; 16][1 2 3 4 5 6 7 8] + + punpcklqdq m7, m4, m12 ;[17 ; 25][1 2 3 4 5 6 7 8] + punpckhqdq m4, m12 ;[18 ; 26][1 2 3 4 5 6 7 8] + + punpcklqdq m12, m6, m14 ;[19 ; 27][1 2 3 4 5 6 7 8] + punpckhqdq m6, m14 ;[20 ; 28][1 2 3 4 5 6 7 8] + + punpcklqdq m14, m3, m11 ;[21 ; 29][1 2 3 4 5 6 7 8] + punpckhqdq m3, m11 ;[22 ; 30][1 2 3 4 5 6 7 8] + + punpcklqdq m11, m1, m9 ;[23 ; 31][1 2 3 4 5 6 7 8] + punpckhqdq m1, m9 ;[24 ; 32][1 2 3 4 5 6 7 8] + + movu [r0 + 2 * 128], xm13 + vextracti128 [r0 + 10 * 128], m13, 1 + + movu [r0 + 3 * 128], xm8 + vextracti128 [r0 + 11 * 128], m8, 1 + + 
movu [r0 + 4 * 128], xm5 + vextracti128 [r0 + 12 * 128], m5, 1 + + movu [r0 + 5 * 128], xm2 + vextracti128 [r0 + 13 * 128], m2, 1 + + movu [r0 + 6 * 128], xm10 + vextracti128 [r0 + 14 * 128], m10, 1 + + movu [r0 + 7 * 128], xm0 + vextracti128 [r0 + 15 * 128], m0, 1 + + movu [r0 + 16 * 128], xm7 + vextracti128 [r0 + 24 * 128], m7, 1 + + movu [r0 + 17 * 128], xm4 + vextracti128 [r0 + 25 * 128], m4, 1 + + movu [r0 + 18 * 128], xm12 + vextracti128 [r0 + 26 * 128], m12, 1 + + movu [r0 + 19 * 128], xm6 + vextracti128 [r0 + 27 * 128], m6, 1 + + movu [r0 + 20 * 128], xm14 + vextracti128 [r0 + 28 * 128], m14, 1 + + movu [r0 + 21 * 128], xm3 + vextracti128 [r0 + 29 * 128], m3, 1 + + movu [r0 + 22 * 128], xm11 + vextracti128 [r0 + 30 * 128], m11, 1 + + movu [r0 + 23 * 128], xm1 + vextracti128 [r0 + 31 * 128], m1, 1 + ret + +cglobal transpose64, 3, 6, 16 + add r2, r2 + lea r3, [3 * r2] + lea r4, [r1 + 64] + lea r5, [r0 + 16] + + call transpose8x32_64_internal + mov r1, r4 + lea r0, [r0 + 32 * 128] + call transpose8x32_64_internal + mov r0, r5 + lea r5, [r0 + 16] + lea r4, [r1 + 4 * r2] + lea r1, [r4 - 64] + call transpose8x32_64_internal + mov r1, r4 + lea r0, [r0 + 32 * 128] + call transpose8x32_64_internal + mov r0, r5 + lea r5, [r0 + 16] + lea r4, [r1 + 4 * r2] + lea r1, [r4 - 64] + call transpose8x32_64_internal + mov r1, r4 + lea r0, [r0 + 32 * 128] + call transpose8x32_64_internal + mov r0, r5 + lea r5, [r0 + 16] + lea r4, [r1 + 4 * r2] + lea r1, [r4 - 64] + call transpose8x32_64_internal + mov r1, r4 + lea r0, [r0 + 32 * 128] + call transpose8x32_64_internal + mov r0, r5 + lea r5, [r0 + 16] + lea r4, [r1 + 4 * r2] + lea r1, [r4 - 64] + call transpose8x32_64_internal + mov r1, r4 + lea r0, [r0 + 32 * 128] + call transpose8x32_64_internal + mov r0, r5 + lea r5, [r0 + 16] + lea r4, [r1 + 4 * r2] + lea r1, [r4 - 64] + call transpose8x32_64_internal + mov r1, r4 + lea r0, [r0 + 32 * 128] + call transpose8x32_64_internal + mov r0, r5 + lea r5, [r0 + 16] + lea r4, [r1 + 4 * 
r2] + lea r1, [r4 - 64] + call transpose8x32_64_internal + mov r1, r4 + lea r0, [r0 + 32 * 128] + call transpose8x32_64_internal + mov r0, r5 + lea r4, [r1 + 4 * r2] + lea r1, [r4 - 64] + call transpose8x32_64_internal + mov r1, r4 + lea r0, [r0 + 32 * 128] + call transpose8x32_64_internal + RET +%endif +INIT_XMM sse2 +cglobal transpose64, 3, 7, 4, dest, src, stride + add r2, r2 + mov r3, r0 + mov r4, r1 + mov r5, 128 + mov r6, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 16] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 32] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 48] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 64] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 80] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 96] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 112] + mov r3, r0 + call transpose8_internal + + lea r1, [r4 + 16] + lea r0, [r6 + 8 * 128] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 8 * 128 + 16] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 8 * 128 + 32] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 8 * 128 + 48] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 8 * 128 + 64] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 8 * 128 + 80] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 8 * 128 + 96] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 8 * 128 + 112] + mov r3, r0 + call transpose8_internal + + lea r1, [r4 + 32] + lea r0, [r6 + 16 * 128] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 16 * 128 + 16] 
+ mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 16 * 128 + 32] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 16 * 128 + 48] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 16 * 128 + 64] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 16 * 128 + 80] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 16 * 128 + 96] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 16 * 128 + 112] + mov r3, r0 + call transpose8_internal + + lea r1, [r4 + 48] + lea r0, [r6 + 24 * 128] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 24 * 128 + 16] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 24 * 128 + 32] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 24 * 128 + 48] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 24 * 128 + 64] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 24 * 128 + 80] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 24 * 128 + 96] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 24 * 128 + 112] + mov r3, r0 + call transpose8_internal + + lea r1, [r4 + 64] + lea r0, [r6 + 32 * 128] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 32 * 128 + 16] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 32 * 128 + 32] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 32 * 128 + 48] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 32 * 128 + 64] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 32 * 128 + 80] + mov r3, r0 + call transpose8_internal 
+ lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 32 * 128 + 96] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 32 * 128 + 112] + mov r3, r0 + call transpose8_internal + + lea r1, [r4 + 80] + lea r0, [r6 + 40 * 128] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 40 * 128 + 16] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 40 * 128 + 32] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 40 * 128 + 48] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 40 * 128 + 64] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 40 * 128 + 80] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 40 * 128 + 96] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 40 * 128 + 112] + mov r3, r0 + call transpose8_internal + + lea r1, [r4 + 96] + lea r0, [r6 + 48 * 128] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 48 * 128 + 16] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 48 * 128 + 32] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 48 * 128 + 48] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 48 * 128 + 64] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 48 * 128 + 80] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 48 * 128 + 96] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 48 * 128 + 112] + mov r3, r0 + call transpose8_internal + + lea r1, [r4 + 112] + lea r0, [r6 + 56 * 128] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 56 * 128 + 16] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 56 * 
128 + 32] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 56 * 128 + 48] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 56 * 128 + 64] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 56 * 128 + 80] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 56 * 128 + 96] + mov r3, r0 + call transpose8_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r6 + 56 * 128 + 112] + mov r3, r0 + call transpose8_internal + RET +%else ;HIGH_BIT_DEPTH == 0 +%if ARCH_X86_64 == 1 +INIT_YMM avx2 + +cglobal transpose16x32_avx2 + movu m0, [r1] + movu m1, [r1 + r2] + movu m2, [r1 + 2 * r2] + movu m3, [r1 + r3] + lea r1, [r1 + 4 * r2] + + movu m4, [r1] + movu m5, [r1 + r2] + movu m6, [r1 + 2 * r2] + movu m7, [r1 + r3] + + punpcklbw m8, m0, m1 ;[1 - 8 ; 17 - 24][1 2] + punpckhbw m0, m1 ;[9 - 16; 25 - 32][1 2] + + punpcklbw m1, m2, m3 ;[1 - 8 ; 17 - 24][3 4] + punpckhbw m2, m3 ;[9 - 16; 25 - 32][3 4] + + punpcklbw m3, m4, m5 ;[1 - 8 ; 17 - 24][5 6] + punpckhbw m4, m5 ;[9 - 16; 25 - 32][5 6] + + punpcklbw m5, m6, m7 ;[1 - 8 ; 17 - 24][7 8] + punpckhbw m6, m7 ;[9 - 16; 25 - 32][7 8] + + punpcklwd m7, m8, m1 ;[1 - 4 ; 17 - 20][1 2 3 4] + punpckhwd m8, m1 ;[5 - 8 ; 20 - 24][1 2 3 4] + + punpcklwd m1, m3, m5 ;[1 - 4 ; 17 - 20][5 6 7 8] + punpckhwd m3, m5 ;[5 - 8 ; 20 - 24][5 6 7 8] + + punpcklwd m5, m0, m2 ;[9 - 12; 25 - 28][1 2 3 4] + punpckhwd m0, m2 ;[12- 15; 29 - 32][1 2 3 4] + + punpcklwd m2, m4, m6 ;[9 - 12; 25 - 28][5 6 7 8] + punpckhwd m4, m6 ;[12- 15; 29 - 32][5 6 7 8] + + punpckldq m6, m7, m1 ;[1 - 2 ; 17 - 18][1 2 3 4 5 6 7 8] + punpckhdq m7, m1 ;[3 - 4 ; 19 - 20][1 2 3 4 5 6 7 8] + + punpckldq m1, m8, m3 ;[5 - 6 ; 21 - 22][1 2 3 4 5 6 7 8] + punpckhdq m8, m3 ;[7 - 8 ; 23 - 24][1 2 3 4 5 6 7 8] + + punpckldq m3, m5, m2 ;[9 - 10; 25 - 26][1 2 3 4 5 6 7 8] + punpckhdq m5, m2 ;[11- 12; 27 - 28][1 2 3 4 5 6 7 8] + + punpckldq m2, m0, m4 ;[13- 14; 29 - 
30][1 2 3 4 5 6 7 8] + punpckhdq m0, m4 ;[15- 16; 31 - 32][1 2 3 4 5 6 7 8] + + movq [r0 + 0 * 64], xm6 + movhps [r0 + 1 * 64], xm6 + vextracti128 xm4, m6, 1 + movq [r0 + 16 * 64], xm4 + movhps [r0 + 17 * 64], xm4 + + lea r1, [r1 + 4 * r2] + movu m9, [r1] + movu m10, [r1 + r2] + movu m11, [r1 + 2 * r2] + movu m12, [r1 + r3] + lea r1, [r1 + 4 * r2] + + movu m13, [r1] + movu m14, [r1 + r2] + movu m15, [r1 + 2 * r2] + movu m6, [r1 + r3] + + punpcklbw m4, m9, m10 ;[1 - 8 ; 17 - 24][9 10] + punpckhbw m9, m10 ;[9 - 16; 25 - 32][9 10] + + punpcklbw m10, m11, m12 ;[1 - 8 ; 17 - 24][11 12] + punpckhbw m11, m12 ;[9 - 16; 25 - 32][11 12] + + punpcklbw m12, m13, m14 ;[1 - 8 ; 17 - 24][13 14] + punpckhbw m13, m14 ;[9 - 16; 25 - 32][13 14] + + punpcklbw m14, m15, m6 ;[1 - 8 ; 17 - 24][15 16] + punpckhbw m15, m6 ;[9 - 16; 25 - 32][15 16] + + punpcklwd m6, m4, m10 ;[1 - 4 ; 17 - 20][9 10 11 12] + punpckhwd m4, m10 ;[5 - 8 ; 20 - 24][9 10 11 12] + + punpcklwd m10, m12, m14 ;[1 - 4 ; 17 - 20][13 14 15 16] + punpckhwd m12, m14 ;[5 - 8 ; 20 - 24][13 14 15 16] + + punpcklwd m14, m9, m11 ;[9 - 12; 25 - 28][9 10 11 12] + punpckhwd m9, m11 ;[13- 16; 29 - 32][9 10 11 12] + + punpcklwd m11, m13, m15 ;[9 - 12; 25 - 28][13 14 15 16] + punpckhwd m13, m15 ;[13- 16; 29 - 32][13 14 15 16] + + punpckldq m15, m6, m10 ;[1 - 2 ; 17 - 18][9 10 11 12 13 14 15 16] + punpckhdq m6, m10 ;[3 - 4 ; 19 - 20][9 10 11 12 13 14 15 16] + + punpckldq m10, m4, m12 ;[5 - 6 ; 21 - 22][9 10 11 12 13 14 15 16] + punpckhdq m4, m12 ;[7 - 8 ; 23 - 24][9 10 11 12 13 14 15 16] + + punpckldq m12, m14, m11 ;[9 - 10; 25 - 26][9 10 11 12 13 14 15 16] + punpckhdq m14, m11 ;[11- 12; 27 - 28][9 10 11 12 13 14 15 16] + + punpckldq m11, m9, m13 ;[13- 14; 29 - 30][9 10 11 12 13 14 15 16] + punpckhdq m9, m13 ;[15- 16; 31 - 32][9 10 11 12 13 14 15 16] + + + punpcklqdq m13, m7, m6 ;[3 ; 19][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] + punpckhqdq m7, m6 ;[4 ; 20][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] + + punpcklqdq m6, m1, m10 ;[5 ; 21][1 
2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] + punpckhqdq m1, m10 ;[6 ; 22][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] + + punpcklqdq m10, m8, m4 ;[7 ; 23][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] + punpckhqdq m8, m4 ;[8 ; 24][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] + + punpcklqdq m4, m3, m12 ;[9 ; 25][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] + punpckhqdq m3, m12 ;[10; 26][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] + + punpcklqdq m12, m5, m14 ;[11; 27][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] + punpckhqdq m5, m14 ;[12; 28][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] + + punpcklqdq m14, m2, m11 ;[13; 29][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] + punpckhqdq m2, m11 ;[14; 30][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] + + punpcklqdq m11, m0, m9 ;[15; 31][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] + punpckhqdq m0, m9 ;[16; 32][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] + + movq [r0 + 0 * 64 + 8], xm15 + movhps [r0 + 1 * 64 + 8], xm15 + vextracti128 xm9, m15, 1 + movq [r0 + 16 * 64 + 8], xm9 + movhps [r0 + 17 * 64 + 8], xm9 + + movu [r0 + 2 * 64], xm13 + vextracti128 [r0 + 18 * 64], m13, 1 + + movu [r0 + 3 * 64], xm7 + vextracti128 [r0 + 19 * 64], m7, 1 + + movu [r0 + 4 * 64], xm6 + vextracti128 [r0 + 20 * 64], m6, 1 + + movu [r0 + 5 * 64], xm1 + vextracti128 [r0 + 21 * 64], m1, 1 + + movu [r0 + 6 * 64], xm10 + vextracti128 [r0 + 22 * 64], m10, 1 + + movu [r0 + 7 * 64], xm8 + vextracti128 [r0 + 23 * 64], m8, 1 + + movu [r0 + 8 * 64], xm4 + vextracti128 [r0 + 24 * 64], m4, 1 + + movu [r0 + 9 * 64], xm3 + vextracti128 [r0 + 25 * 64], m3, 1 + + movu [r0 + 10 * 64], xm12 + vextracti128 [r0 + 26 * 64], m12, 1 + + movu [r0 + 11 * 64], xm5 + vextracti128 [r0 + 27 * 64], m5, 1 + + movu [r0 + 12 * 64], xm14 + vextracti128 [r0 + 28 * 64], m14, 1 + + movu [r0 + 13 * 64], xm2 + vextracti128 [r0 + 29 * 64], m2, 1 + + movu [r0 + 14 * 64], xm11 + vextracti128 [r0 + 30 * 64], m11, 1 + + movu [r0 + 15 * 64], xm0 + vextracti128 [r0 + 31 * 64], m0, 1 + ret + +cglobal transpose64, 3, 6, 16 + + lea r3, [r2 * 3] + lea 
r4, [r0 + 16] + + lea r5, [r1 + 32] + call transpose16x32_avx2 + lea r0, [r0 + 32 * 64] + mov r1, r5 + call transpose16x32_avx2 + + mov r0, r4 + lea r5, [r1 + 4 * r2] + + lea r1, [r5 - 32] + call transpose16x32_avx2 + lea r0, [r0 + 32 * 64] + mov r1, r5 + call transpose16x32_avx2 + + lea r0, [r4 + 16] + lea r5, [r1 + 4 * r2] + + lea r1, [r5 - 32] + call transpose16x32_avx2 + lea r0, [r0 + 32 * 64] + mov r1, r5 + call transpose16x32_avx2 + + lea r5, [r1 + 4 * r2] + lea r0, [r4 + 32] + + lea r1, [r5 - 32] + call transpose16x32_avx2 + lea r0, [r0 + 32 * 64] + mov r1, r5 + call transpose16x32_avx2 + RET +%endif + +INIT_XMM sse2 +cglobal transpose64, 3, 7, 8, dest, src, stride + mov r3, r0 + mov r4, r1 + mov r5, r0 + mov r6, 64 + call transpose16_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r3 + 16] + mov r5, r0 + call transpose16_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r3 + 32] + mov r5, r0 + call transpose16_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r3 + 48] + mov r5, r0 + call transpose16_internal + + lea r1, [r4 + 16] + lea r0, [r3 + 16 * 64] + mov r5, r0 + call transpose16_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r3 + 16 * 64 + 16] + mov r5, r0 + call transpose16_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r3 + 16 * 64 + 32] + mov r5, r0 + call transpose16_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r3 + 16 * 64 + 48] + mov r5, r0 + call transpose16_internal + + lea r1, [r4 + 32] + lea r0, [r3 + 32 * 64] + mov r5, r0 + call transpose16_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r3 + 32 * 64 + 16] + mov r5, r0 + call transpose16_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r3 + 32 * 64 + 32] + mov r5, r0 + call transpose16_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r3 + 32 * 64 + 48] + mov r5, r0 + call transpose16_internal + + lea r1, [r4 + 48] + lea r0, [r3 + 48 * 64] + mov r5, r0 + call transpose16_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r3 + 48 * 64 + 16] + mov r5, r0 + call transpose16_internal + lea r1, [r1 
- 8 + 2 * r2] + lea r0, [r3 + 48 * 64 + 32] + mov r5, r0 + call transpose16_internal + lea r1, [r1 - 8 + 2 * r2] + lea r0, [r3 + 48 * 64 + 48] + mov r5, r0 + call transpose16_internal + RET +%endif + + +;============================================================================= +; SSIM +;============================================================================= + +;----------------------------------------------------------------------------- +; void pixel_ssim_4x4x2_core( const uint8_t *pix1, intptr_t stride1, +; const uint8_t *pix2, intptr_t stride2, int sums[2][4] ) +;----------------------------------------------------------------------------- +%macro SSIM_ITER 1 +%if HIGH_BIT_DEPTH + movdqu m5, [r0+(%1&1)*r1] + movdqu m6, [r2+(%1&1)*r3] +%else + movq m5, [r0+(%1&1)*r1] + movq m6, [r2+(%1&1)*r3] + punpcklbw m5, m0 + punpcklbw m6, m0 +%endif +%if %1==1 + lea r0, [r0+r1*2] + lea r2, [r2+r3*2] +%endif +%if %1==0 + movdqa m1, m5 + movdqa m2, m6 +%else + paddw m1, m5 + paddw m2, m6 +%endif + pmaddwd m7, m5, m6 + pmaddwd m5, m5 + pmaddwd m6, m6 + ACCUM paddd, 3, 5, %1 + ACCUM paddd, 4, 7, %1 + paddd m3, m6 +%endmacro + +%macro SSIM 0 +cglobal pixel_ssim_4x4x2_core, 4,4,8 + FIX_STRIDES r1, r3 + pxor m0, m0 + SSIM_ITER 0 + SSIM_ITER 1 + SSIM_ITER 2 + SSIM_ITER 3 + ; PHADDW m1, m2 + ; PHADDD m3, m4 + movdqa m7, [pw_1] + pshufd m5, m3, q2301 + pmaddwd m1, m7 + pmaddwd m2, m7 + pshufd m6, m4, q2301 + packssdw m1, m2 + paddd m3, m5 + pshufd m1, m1, q3120 + paddd m4, m6 + pmaddwd m1, m7 + punpckhdq m5, m3, m4 + punpckldq m3, m4 + +%if UNIX64 + %define t0 r4 +%else + %define t0 rax + mov t0, r4mp +%endif + + movq [t0+ 0], m1 + movq [t0+ 8], m3 + movhps [t0+16], m1 + movq [t0+24], m5 + RET + +;----------------------------------------------------------------------------- +; float pixel_ssim_end( int sum0[5][4], int sum1[5][4], int width ) +;----------------------------------------------------------------------------- +cglobal pixel_ssim_end4, 2,3 + mov r2d, r2m + mova m0, 
[r0+ 0] + mova m1, [r0+16] + mova m2, [r0+32] + mova m3, [r0+48] + mova m4, [r0+64] + paddd m0, [r1+ 0] + paddd m1, [r1+16] + paddd m2, [r1+32] + paddd m3, [r1+48] + paddd m4, [r1+64] + paddd m0, m1 + paddd m1, m2 + paddd m2, m3 + paddd m3, m4 + TRANSPOSE4x4D 0, 1, 2, 3, 4 + +; s1=m0, s2=m1, ss=m2, s12=m3 +%if BIT_DEPTH == 10 + cvtdq2ps m0, m0 + cvtdq2ps m1, m1 + cvtdq2ps m2, m2 + cvtdq2ps m3, m3 + mulps m4, m0, m1 ; s1*s2 + mulps m0, m0 ; s1*s1 + mulps m1, m1 ; s2*s2 + mulps m2, [pf_64] ; ss*64 + mulps m3, [pf_128] ; s12*128 + addps m4, m4 ; s1*s2*2 + addps m0, m1 ; s1*s1 + s2*s2 + subps m2, m0 ; vars + subps m3, m4 ; covar*2 + movaps m1, [ssim_c1] + addps m4, m1 ; s1*s2*2 + ssim_c1 + addps m0, m1 ; s1*s1 + s2*s2 + ssim_c1 + movaps m1, [ssim_c2] + addps m2, m1 ; vars + ssim_c2 + addps m3, m1 ; covar*2 + ssim_c2 +%else + pmaddwd m4, m1, m0 ; s1*s2 + pslld m1, 16 + por m0, m1 + pmaddwd m0, m0 ; s1*s1 + s2*s2 + pslld m4, 1 + pslld m3, 7 + pslld m2, 6 + psubd m3, m4 ; covar*2 + psubd m2, m0 ; vars + mova m1, [ssim_c1] + paddd m0, m1 + paddd m4, m1 + mova m1, [ssim_c2] + paddd m3, m1 + paddd m2, m1 + cvtdq2ps m0, m0 ; (float)(s1*s1 + s2*s2 + ssim_c1) + cvtdq2ps m4, m4 ; (float)(s1*s2*2 + ssim_c1) + cvtdq2ps m3, m3 ; (float)(covar*2 + ssim_c2) + cvtdq2ps m2, m2 ; (float)(vars + ssim_c2) +%endif + mulps m4, m3 + mulps m0, m2 + divps m4, m0 ; ssim + + cmp r2d, 4 + je .skip ; faster only if this is the common case; remove branch if we use ssim on a macroblock level + neg r2 + +%ifdef PIC + lea r3, [mask_ff + 16] + %xdefine %%mask r3 +%else + %xdefine %%mask mask_ff + 16 +%endif +%if cpuflag(avx) + andps m4, [%%mask + r2*4] +%else + movups m0, [%%mask + r2*4] + andps m4, m0 +%endif + +.skip: + movhlps m0, m4 + addps m0, m4 +%if cpuflag(ssse3) + movshdup m4, m0 +%else + pshuflw m4, m0, q0032 +%endif + addss m0, m4 +%if ARCH_X86_64 == 0 + movss r0m, m0 + fld dword r0m +%endif + RET +%endmacro ; SSIM + +INIT_XMM sse2 +SSIM +INIT_XMM avx +SSIM + 
;-----------------------------------------------------------------
; void scale1D_128to64(pixel *dst, pixel *src, intptr_t /*stride*/)
;-----------------------------------------------------------------
; Horizontally downscale one 128-pixel row to 64 pixels: each output is the
; rounded average of an adjacent input pair (pavg computes (a+b+1)>>1).
; palignr by one pixel aligns each pixel with its right neighbour, pshufb
; with the deinterleave mask keeps the even lanes, punpcklqdq packs the two
; 8-result halves.  The stride argument is unused (single row).
; NOTE(review): the top lane of each palignr result comes from the previous
; register contents; it lands in an odd lane that the deinterleave shuffle
; discards — confirm against the shuffle constants.
INIT_XMM ssse3
cglobal scale1D_128to64, 2, 2, 8, dest, src1, stride
%if HIGH_BIT_DEPTH
    mova m7, [deinterleave_word_shuf]

    ; input words 0-63 -> output words 0-31
    movu m0, [r1]
    palignr m1, m0, 2
    movu m2, [r1 + 16]
    palignr m3, m2, 2
    movu m4, [r1 + 32]
    palignr m5, m4, 2
    movu m6, [r1 + 48]
    pavgw m0, m1
    palignr m1, m6, 2
    pavgw m2, m3
    pavgw m4, m5
    pavgw m6, m1
    pshufb m0, m0, m7
    pshufb m2, m2, m7
    pshufb m4, m4, m7
    pshufb m6, m6, m7
    punpcklqdq m0, m2
    movu [r0], m0
    punpcklqdq m4, m6
    movu [r0 + 16], m4

    ; input words 32-63 -> output words 16-31
    movu m0, [r1 + 64]
    palignr m1, m0, 2
    movu m2, [r1 + 80]
    palignr m3, m2, 2
    movu m4, [r1 + 96]
    palignr m5, m4, 2
    movu m6, [r1 + 112]
    pavgw m0, m1
    palignr m1, m6, 2
    pavgw m2, m3
    pavgw m4, m5
    pavgw m6, m1
    pshufb m0, m0, m7
    pshufb m2, m2, m7
    pshufb m4, m4, m7
    pshufb m6, m6, m7
    punpcklqdq m0, m2
    movu [r0 + 32], m0
    punpcklqdq m4, m6
    movu [r0 + 48], m4

    ; third quarter of the row
    movu m0, [r1 + 128]
    palignr m1, m0, 2
    movu m2, [r1 + 144]
    palignr m3, m2, 2
    movu m4, [r1 + 160]
    palignr m5, m4, 2
    movu m6, [r1 + 176]
    pavgw m0, m1
    palignr m1, m6, 2
    pavgw m2, m3
    pavgw m4, m5
    pavgw m6, m1
    pshufb m0, m0, m7
    pshufb m2, m2, m7
    pshufb m4, m4, m7
    pshufb m6, m6, m7

    punpcklqdq m0, m2
    movu [r0 + 64], m0
    punpcklqdq m4, m6
    movu [r0 + 80], m4

    ; last quarter of the row
    movu m0, [r1 + 192]
    palignr m1, m0, 2
    movu m2, [r1 + 208]
    palignr m3, m2, 2
    movu m4, [r1 + 224]
    palignr m5, m4, 2
    movu m6, [r1 + 240]
    pavgw m0, m1
    palignr m1, m6, 2
    pavgw m2, m3
    pavgw m4, m5
    pavgw m6, m1
    pshufb m0, m0, m7
    pshufb m2, m2, m7
    pshufb m4, m4, m7
    pshufb m6, m6, m7

    punpcklqdq m0, m2
    movu [r0 + 96], m0
    punpcklqdq m4, m6
    movu [r0 + 112], m4

%else
    ; 8-bit pixels: same scheme, byte granularity (palignr by 1, pavgb).
    mova m7, [deinterleave_shuf]

    ; input bytes 0-63 -> output bytes 0-31
    movu m0, [r1]
    palignr m1, m0, 1
    movu m2, [r1 + 16]
    palignr m3, m2, 1
    movu m4, [r1 + 32]
    palignr m5, m4, 1
    movu m6, [r1 + 48]

    pavgb m0, m1

    palignr m1, m6, 1

    pavgb m2, m3
    pavgb m4, m5
    pavgb m6, m1

    pshufb m0, m0, m7
    pshufb m2, m2, m7
    pshufb m4, m4, m7
    pshufb m6, m6, m7

    punpcklqdq m0, m2
    movu [r0], m0
    punpcklqdq m4, m6
    movu [r0 + 16], m4

    ; input bytes 64-127 -> output bytes 32-63
    movu m0, [r1 + 64]
    palignr m1, m0, 1
    movu m2, [r1 + 80]
    palignr m3, m2, 1
    movu m4, [r1 + 96]
    palignr m5, m4, 1
    movu m6, [r1 + 112]

    pavgb m0, m1

    palignr m1, m6, 1

    pavgb m2, m3
    pavgb m4, m5
    pavgb m6, m1

    pshufb m0, m0, m7
    pshufb m2, m2, m7
    pshufb m4, m4, m7
    pshufb m6, m6, m7

    punpcklqdq m0, m2
    movu [r0 + 32], m0
    punpcklqdq m4, m6
    movu [r0 + 48], m4
%endif
    RET

%if HIGH_BIT_DEPTH == 1
; AVX2 16-bit variant: phaddw sums adjacent word pairs, pavgw against zero
; halves with rounding ((x+1)>>1); vpermq 0xD8 undoes phaddw's per-lane
; interleaving.
INIT_YMM avx2
cglobal scale1D_128to64, 2, 2, 3
    pxor m2, m2

    movu m0, [r1]
    movu m1, [r1 + 32]
    phaddw m0, m1
    pavgw m0, m2
    vpermq m0, m0, 0xD8
    movu [r0], m0

    movu m0, [r1 + 64]
    movu m1, [r1 + 96]
    phaddw m0, m1
    pavgw m0, m2
    vpermq m0, m0, 0xD8
    movu [r0 + 32], m0

    movu m0, [r1 + 128]
    movu m1, [r1 + 160]
    phaddw m0, m1
    pavgw m0, m2
    vpermq m0, m0, 0xD8
    movu [r0 + 64], m0

    movu m0, [r1 + 192]
    movu m1, [r1 + 224]
    phaddw m0, m1
    pavgw m0, m2
    vpermq m0, m0, 0xD8
    movu [r0 + 96], m0
    RET
%else ; HIGH_BIT_DEPTH == 0
; AVX2 8-bit variant: pmaddubsw with an all-ones byte mask sums adjacent
; byte pairs into words; pavgw against zero halves with rounding, packuswb
; repacks to bytes.
INIT_YMM avx2
cglobal scale1D_128to64, 2, 2, 4
    pxor m2, m2
    mova m3, [pb_1]

    movu m0, [r1]
    pmaddubsw m0, m0, m3
    pavgw m0, m2
    movu m1, [r1 + 32]
    pmaddubsw m1, m1, m3
    pavgw m1, m2
    packuswb m0, m1
    vpermq m0, m0, 0xD8
    movu [r0], m0

    movu m0, [r1 + 64]
    pmaddubsw m0, m0, m3
    pavgw m0, m2
    movu m1, [r1 + 96]
    pmaddubsw m1, m1, m3
    pavgw m1, m2
    packuswb m0, m1
    vpermq m0, m0, 0xD8
    movu [r0 + 32], m0
    RET
%endif

;-----------------------------------------------------------------
; void scale2D_64to32(pixel *dst, pixel *src, intptr_t stride)
;-----------------------------------------------------------------
%if HIGH_BIT_DEPTH
+INIT_XMM ssse3 +cglobal scale2D_64to32, 3, 4, 8, dest, src, stride + mov r3d, 32 + mova m7, [deinterleave_word_shuf] + add r2, r2 +.loop: + movu m0, [r1] ;i + psrld m1, m0, 16 ;j + movu m2, [r1 + r2] ;k + psrld m3, m2, 16 ;l + movu m4, m0 + movu m5, m2 + pxor m4, m1 ;i^j + pxor m5, m3 ;k^l + por m4, m5 ;ij|kl + pavgw m0, m1 ;s + pavgw m2, m3 ;t + movu m5, m0 + pavgw m0, m2 ;(s+t+1)/2 + pxor m5, m2 ;s^t + pand m4, m5 ;(ij|kl)&st + pand m4, [hmulw_16p] + psubw m0, m4 ;Result + movu m1, [r1 + 16] ;i + psrld m2, m1, 16 ;j + movu m3, [r1 + r2 + 16] ;k + psrld m4, m3, 16 ;l + movu m5, m1 + movu m6, m3 + pxor m5, m2 ;i^j + pxor m6, m4 ;k^l + por m5, m6 ;ij|kl + pavgw m1, m2 ;s + pavgw m3, m4 ;t + movu m6, m1 + pavgw m1, m3 ;(s+t+1)/2 + pxor m6, m3 ;s^t + pand m5, m6 ;(ij|kl)&st + pand m5, [hmulw_16p] + psubw m1, m5 ;Result + pshufb m0, m7 + pshufb m1, m7 + + punpcklqdq m0, m1 + movu [r0], m0 + + movu m0, [r1 + 32] ;i + psrld m1, m0, 16 ;j + movu m2, [r1 + r2 + 32] ;k + psrld m3, m2, 16 ;l + movu m4, m0 + movu m5, m2 + pxor m4, m1 ;i^j + pxor m5, m3 ;k^l + por m4, m5 ;ij|kl + pavgw m0, m1 ;s + pavgw m2, m3 ;t + movu m5, m0 + pavgw m0, m2 ;(s+t+1)/2 + pxor m5, m2 ;s^t + pand m4, m5 ;(ij|kl)&st + pand m4, [hmulw_16p] + psubw m0, m4 ;Result + movu m1, [r1 + 48] ;i + psrld m2, m1, 16 ;j + movu m3, [r1 + r2 + 48] ;k + psrld m4, m3, 16 ;l + movu m5, m1 + movu m6, m3 + pxor m5, m2 ;i^j + pxor m6, m4 ;k^l + por m5, m6 ;ij|kl + pavgw m1, m2 ;s + pavgw m3, m4 ;t + movu m6, m1 + pavgw m1, m3 ;(s+t+1)/2 + pxor m6, m3 ;s^t + pand m5, m6 ;(ij|kl)&st + pand m5, [hmulw_16p] + psubw m1, m5 ;Result + pshufb m0, m7 + pshufb m1, m7 + + punpcklqdq m0, m1 + movu [r0 + 16], m0 + + movu m0, [r1 + 64] ;i + psrld m1, m0, 16 ;j + movu m2, [r1 + r2 + 64] ;k + psrld m3, m2, 16 ;l + movu m4, m0 + movu m5, m2 + pxor m4, m1 ;i^j + pxor m5, m3 ;k^l + por m4, m5 ;ij|kl + pavgw m0, m1 ;s + pavgw m2, m3 ;t + movu m5, m0 + pavgw m0, m2 ;(s+t+1)/2 + pxor m5, m2 ;s^t + pand m4, m5 ;(ij|kl)&st + pand m4, 
[hmulw_16p] + psubw m0, m4 ;Result + movu m1, [r1 + 80] ;i + psrld m2, m1, 16 ;j + movu m3, [r1 + r2 + 80] ;k + psrld m4, m3, 16 ;l + movu m5, m1 + movu m6, m3 + pxor m5, m2 ;i^j + pxor m6, m4 ;k^l + por m5, m6 ;ij|kl + pavgw m1, m2 ;s + pavgw m3, m4 ;t + movu m6, m1 + pavgw m1, m3 ;(s+t+1)/2 + pxor m6, m3 ;s^t + pand m5, m6 ;(ij|kl)&st + pand m5, [hmulw_16p] + psubw m1, m5 ;Result + pshufb m0, m7 + pshufb m1, m7 + + punpcklqdq m0, m1 + movu [r0 + 32], m0 + + movu m0, [r1 + 96] ;i + psrld m1, m0, 16 ;j + movu m2, [r1 + r2 + 96] ;k + psrld m3, m2, 16 ;l + movu m4, m0 + movu m5, m2 + pxor m4, m1 ;i^j + pxor m5, m3 ;k^l + por m4, m5 ;ij|kl + pavgw m0, m1 ;s + pavgw m2, m3 ;t + movu m5, m0 + pavgw m0, m2 ;(s+t+1)/2 + pxor m5, m2 ;s^t + pand m4, m5 ;(ij|kl)&st + pand m4, [hmulw_16p] + psubw m0, m4 ;Result + movu m1, [r1 + 112] ;i + psrld m2, m1, 16 ;j + movu m3, [r1 + r2 + 112] ;k + psrld m4, m3, 16 ;l + movu m5, m1 + movu m6, m3 + pxor m5, m2 ;i^j + pxor m6, m4 ;k^l + por m5, m6 ;ij|kl + pavgw m1, m2 ;s + pavgw m3, m4 ;t + movu m6, m1 + pavgw m1, m3 ;(s+t+1)/2 + pxor m6, m3 ;s^t + pand m5, m6 ;(ij|kl)&st + pand m5, [hmulw_16p] + psubw m1, m5 ;Result + pshufb m0, m7 + pshufb m1, m7 + + punpcklqdq m0, m1 + movu [r0 + 48], m0 + lea r0, [r0 + 64] + lea r1, [r1 + 2 * r2] + dec r3d + jnz .loop + RET +%else + +INIT_XMM ssse3 +cglobal scale2D_64to32, 3, 4, 8, dest, src, stride + mov r3d, 32 + mova m7, [deinterleave_shuf] +.loop: + + movu m0, [r1] ;i + psrlw m1, m0, 8 ;j + movu m2, [r1 + r2] ;k + psrlw m3, m2, 8 ;l + movu m4, m0 + movu m5, m2 + + pxor m4, m1 ;i^j + pxor m5, m3 ;k^l + por m4, m5 ;ij|kl + + pavgb m0, m1 ;s + pavgb m2, m3 ;t + movu m5, m0 + pavgb m0, m2 ;(s+t+1)/2 + pxor m5, m2 ;s^t + pand m4, m5 ;(ij|kl)&st + pand m4, [hmul_16p] + psubb m0, m4 ;Result + + movu m1, [r1 + 16] ;i + psrlw m2, m1, 8 ;j + movu m3, [r1 + r2 + 16] ;k + psrlw m4, m3, 8 ;l + movu m5, m1 + movu m6, m3 + + pxor m5, m2 ;i^j + pxor m6, m4 ;k^l + por m5, m6 ;ij|kl + + pavgb m1, m2 ;s + pavgb 
m3, m4 ;t + movu m6, m1 + pavgb m1, m3 ;(s+t+1)/2 + pxor m6, m3 ;s^t + pand m5, m6 ;(ij|kl)&st + pand m5, [hmul_16p] + psubb m1, m5 ;Result + + pshufb m0, m0, m7 + pshufb m1, m1, m7 + + punpcklqdq m0, m1 + movu [r0], m0 + + movu m0, [r1 + 32] ;i + psrlw m1, m0, 8 ;j + movu m2, [r1 + r2 + 32] ;k + psrlw m3, m2, 8 ;l + movu m4, m0 + movu m5, m2 + + pxor m4, m1 ;i^j + pxor m5, m3 ;k^l + por m4, m5 ;ij|kl + + pavgb m0, m1 ;s + pavgb m2, m3 ;t + movu m5, m0 + pavgb m0, m2 ;(s+t+1)/2 + pxor m5, m2 ;s^t + pand m4, m5 ;(ij|kl)&st + pand m4, [hmul_16p] + psubb m0, m4 ;Result + + movu m1, [r1 + 48] ;i + psrlw m2, m1, 8 ;j + movu m3, [r1 + r2 + 48] ;k + psrlw m4, m3, 8 ;l + movu m5, m1 + movu m6, m3 + + pxor m5, m2 ;i^j + pxor m6, m4 ;k^l + por m5, m6 ;ij|kl + + pavgb m1, m2 ;s + pavgb m3, m4 ;t + movu m6, m1 + pavgb m1, m3 ;(s+t+1)/2 + pxor m6, m3 ;s^t + pand m5, m6 ;(ij|kl)&st + pand m5, [hmul_16p] + psubb m1, m5 ;Result + + pshufb m0, m0, m7 + pshufb m1, m1, m7 + + punpcklqdq m0, m1 + movu [r0 + 16], m0 + + lea r0, [r0 + 32] + lea r1, [r1 + 2 * r2] + dec r3d + jnz .loop + RET +%endif + + +;----------------------------------------------------------------------------- +; void pixel_sub_ps_4x4(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1); +;----------------------------------------------------------------------------- +%if HIGH_BIT_DEPTH +INIT_XMM sse2 +cglobal pixel_sub_ps_4x4, 6, 6, 8, dest, deststride, src0, src1, srcstride0, srcstride1 + add r4, r4 + add r5, r5 + add r1, r1 + movh m0, [r2] + movh m2, [r2 + r4] + movh m1, [r3] + movh m3, [r3 + r5] + lea r2, [r2 + r4 * 2] + lea r3, [r3 + r5 * 2] + movh m4, [r2] + movh m6, [r2 + r4] + movh m5, [r3] + movh m7, [r3 + r5] + + psubw m0, m1 + psubw m2, m3 + psubw m4, m5 + psubw m6, m7 + + movh [r0], m0 + movh [r0 + r1], m2 + lea r0, [r0 + r1 * 2] + movh [r0], m4 + movh [r0 + r1], m6 + + RET +%else +INIT_XMM sse4 +cglobal pixel_sub_ps_4x4, 6, 6, 8, dest, deststride, src0, 
src1, srcstride0, srcstride1 + add r1, r1 + movd m0, [r2] + movd m2, [r2 + r4] + movd m1, [r3] + movd m3, [r3 + r5] + lea r2, [r2 + r4 * 2] + lea r3, [r3 + r5 * 2] + movd m4, [r2] + movd m6, [r2 + r4] + movd m5, [r3] + movd m7, [r3 + r5] + punpckldq m0, m2 + punpckldq m1, m3 + punpckldq m4, m6 + punpckldq m5, m7 + pmovzxbw m0, m0 + pmovzxbw m1, m1 + pmovzxbw m4, m4 + pmovzxbw m5, m5 + + psubw m0, m1 + psubw m4, m5 + + movh [r0], m0 + movhps [r0 + r1], m0 + movh [r0 + r1 * 2], m4 + lea r0, [r0 + r1 * 2] + movhps [r0 + r1], m4 + + RET +%endif + + +;----------------------------------------------------------------------------- +; void pixel_sub_ps_4x%2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1); +;----------------------------------------------------------------------------- +%macro PIXELSUB_PS_W4_H4 2 +%if HIGH_BIT_DEPTH +cglobal pixel_sub_ps_4x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1 + mov r6d, %2/4 + add r4, r4 + add r5, r5 + add r1, r1 +.loop: + movh m0, [r2] + movh m2, [r2 + r4] + movh m1, [r3] + movh m3, [r3 + r5] + lea r2, [r2 + r4 * 2] + lea r3, [r3 + r5 * 2] + movh m4, [r2] + movh m6, [r2 + r4] + movh m5, [r3] + movh m7, [r3 + r5] + dec r6d + lea r2, [r2 + r4 * 2] + lea r3, [r3 + r5 * 2] + + psubw m0, m1 + psubw m2, m3 + psubw m4, m5 + psubw m6, m7 + + movh [r0], m0 + movh [r0 + r1], m2 + movh [r0 + r1 * 2], m4 + lea r0, [r0 + r1 * 2] + movh [r0 + r1], m6 + lea r0, [r0 + r1 * 2] + + jnz .loop + RET +%else +cglobal pixel_sub_ps_4x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1 + mov r6d, %2/4 + add r1, r1 +.loop: + movd m0, [r2] + movd m2, [r2 + r4] + movd m1, [r3] + movd m3, [r3 + r5] + lea r2, [r2 + r4 * 2] + lea r3, [r3 + r5 * 2] + movd m4, [r2] + movd m6, [r2 + r4] + movd m5, [r3] + movd m7, [r3 + r5] + dec r6d + lea r2, [r2 + r4 * 2] + lea r3, [r3 + r5 * 2] + punpckldq m0, m2 + punpckldq m1, m3 + punpckldq m4, m6 + punpckldq m5, m7 + pmovzxbw m0, m0 + 
; NOTE(review): this span is the tail of PIXELSUB_PS_W4_H4 (8-bit path), its
; 4x8 instantiations, and the body of PIXELSUB_PS_W8_H4.
; Tail of PIXELSUB_PS_W4_H4, %else (8-bit) branch: zero-extend the remaining
; packed rows to words, subtract, and store two 4-pixel rows per XMM register.
pmovzxbw m1, m1
pmovzxbw m4, m4
pmovzxbw m5, m5

psubw m0, m1
psubw m4, m5

movh [r0], m0
movhps [r0 + r1], m0
movh [r0 + r1 * 2], m4
lea r0, [r0 + r1 * 2]
movhps [r0 + r1], m4
lea r0, [r0 + r1 * 2]

jnz .loop
RET
%endif
%endmacro

; Instantiate the 4-wide residual for 4x8 blocks: sse2 suffices for 16-bit
; pixels; the 8-bit path needs sse4 for pmovzxbw.
%if HIGH_BIT_DEPTH
INIT_XMM sse2
PIXELSUB_PS_W4_H4 4, 8
%else
INIT_XMM sse4
PIXELSUB_PS_W4_H4 4, 8
%endif


;-----------------------------------------------------------------------------
; void pixel_sub_ps_8x%2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
;-----------------------------------------------------------------------------
; Residual: dest[y][x] = (int16_t)(src0[y][x] - src1[y][x]) for an 8-wide,
; %2-tall block, processed four rows per loop iteration.
%macro PIXELSUB_PS_W8_H4 2
%if HIGH_BIT_DEPTH
cglobal pixel_sub_ps_8x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
    mov r6d, %2/4          ; r6d = number of 4-row groups
    add r4, r4             ; pixel strides -> byte strides (16-bit pixels)
    add r5, r5
    add r1, r1             ; dest stride (int16_t units) -> bytes
.loop:
    movu m0, [r2]
    movu m2, [r2 + r4]
    movu m1, [r3]
    movu m3, [r3 + r5]
    lea r2, [r2 + r4 * 2]
    lea r3, [r3 + r5 * 2]
    movu m4, [r2]
    movu m6, [r2 + r4]
    movu m5, [r3]
    movu m7, [r3 + r5]
    dec r6d                ; decrement early; lea/SSE ops below leave EFLAGS intact
    lea r2, [r2 + r4 * 2]
    lea r3, [r3 + r5 * 2]

    psubw m0, m1
    psubw m2, m3
    psubw m4, m5
    psubw m6, m7

    movu [r0], m0
    movu [r0 + r1], m2
    movu [r0 + r1 * 2], m4
    lea r0, [r0 + r1 * 2]
    movu [r0 + r1], m6
    lea r0, [r0 + r1 * 2]

    jnz .loop
    RET
%else
; 8-bit path: load 8 bytes per row, widen with pmovzxbw, then subtract.
cglobal pixel_sub_ps_8x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
    mov r6d, %2/4          ; 4 rows per iteration
    add r1, r1             ; dest stride (int16_t units) -> bytes
.loop:
    movh m0, [r2]
    movh m2, [r2 + r4]
    movh m1, [r3]
    movh m3, [r3 + r5]
    lea r2, [r2 + r4 * 2]
    lea r3, [r3 + r5 * 2]
    movh m4, [r2]
    movh m6, [r2 + r4]
    movh m5, [r3]
    movh m7, [r3 + r5]
    dec r6d                ; flags survive the widen/subtract sequence below
    lea r2, [r2 + r4 * 2]
    lea r3, [r3 + r5 * 2]
    pmovzxbw m0, m0
    pmovzxbw m1, m1
    pmovzxbw m2, m2
    pmovzxbw m3, m3
    pmovzxbw m4, m4
    pmovzxbw m5, m5
    pmovzxbw m6, m6
    pmovzxbw m7, m7

    psubw m0, m1
    psubw m2, m3
    psubw m4, m5
    psubw m6, m7

    movu [r0], m0
    movu [r0 + r1], m2
    movu [r0 + r1 * 2], m4
; NOTE(review): this span is the tail of PIXELSUB_PS_W8_H4 (8-bit path), its
; instantiations, and the first part of PIXELSUB_PS_W16_H4.
; Tail of the 8-bit 8-wide loop: store the last two rows and iterate.
lea r0, [r0 + r1 * 2]
movu [r0 + r1], m6
lea r0, [r0 + r1 * 2]

jnz .loop
RET
%endif
%endmacro

; Instantiate the 8-wide residual for 8x8 and 8x16 blocks.
%if HIGH_BIT_DEPTH
INIT_XMM sse2
PIXELSUB_PS_W8_H4 8, 8
PIXELSUB_PS_W8_H4 8, 16
%else
INIT_XMM sse4
PIXELSUB_PS_W8_H4 8, 8
PIXELSUB_PS_W8_H4 8, 16
%endif


;-----------------------------------------------------------------------------
; void pixel_sub_ps_16x%2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
;-----------------------------------------------------------------------------
; 16-wide residual, four rows per loop iteration: two XMM word-loads per row
; at high bit depth, or one 16-byte load split into low (pmovzxbw) and high
; (punpckhbw vs zero) word halves for 8-bit pixels.
%macro PIXELSUB_PS_W16_H4 2
%if HIGH_BIT_DEPTH
cglobal pixel_sub_ps_16x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
    mov r6d, %2/4          ; r6d = number of 4-row groups
    add r4, r4             ; pixel strides -> byte strides (16-bit pixels)
    add r5, r5
    add r1, r1             ; dest stride (int16_t units) -> bytes
.loop:
    movu m0, [r2]
    movu m2, [r2 + 16]
    movu m1, [r3]
    movu m3, [r3 + 16]
    movu m4, [r2 + r4]
    movu m6, [r2 + r4 + 16]
    movu m5, [r3 + r5]
    movu m7, [r3 + r5 + 16]
    dec r6d                ; set ZF for the jnz at loop end; SIMD ops keep flags
    lea r2, [r2 + r4 * 2]
    lea r3, [r3 + r5 * 2]

    psubw m0, m1
    psubw m2, m3
    psubw m4, m5
    psubw m6, m7

    movu [r0], m0
    movu [r0 + 16], m2
    movu [r0 + r1], m4
    movu [r0 + r1 + 16], m6

    ; rows 3 and 4 of this group
    movu m0, [r2]
    movu m2, [r2 + 16]
    movu m1, [r3]
    movu m3, [r3 + 16]
    movu m4, [r2 + r4]
    movu m5, [r3 + r5]
    movu m6, [r2 + r4 + 16]
    movu m7, [r3 + r5 + 16]
    lea r0, [r0 + r1 * 2]
    lea r2, [r2 + r4 * 2]
    lea r3, [r3 + r5 * 2]

    psubw m0, m1
    psubw m2, m3
    psubw m4, m5
    psubw m6, m7

    movu [r0], m0
    movu [r0 + 16], m2
    movu [r0 + r1], m4
    movu [r0 + r1 + 16], m6
    lea r0, [r0 + r1 * 2]

    jnz .loop
    RET
%else
; 8-bit path: m6 stays zero for the whole loop (high-half unpack), so only
; seven XMM registers are requested from cglobal.
cglobal pixel_sub_ps_16x%2, 6, 7, 7, dest, deststride, src0, src1, srcstride0, srcstride1
    mov r6d, %2/4
    pxor m6, m6            ; zero register for punpckhbw
    add r1, r1             ; dest stride (int16_t units) -> bytes
.loop:
    movu m1, [r2]
    movu m3, [r3]
    pmovzxbw m0, m1        ; low 8 pixels of src0 row -> words
    pmovzxbw m2, m3        ; low 8 pixels of src1 row -> words
    punpckhbw m1, m6       ; high 8 pixels -> words
    punpckhbw m3, m6

    psubw m0, m2
    psubw m1, m3

    movu m5, [r2 + r4]
    movu m3, [r3 + r5]
    lea r2, [r2 + r4 * 2]
    lea r3, [r3 + r5 * 2]
    pmovzxbw m4, m5
    pmovzxbw m2, m3
punpckhbw m5, m6 + punpckhbw m3, m6 + + psubw m4, m2 + psubw m5, m3 + + movu [r0], m0 + movu [r0 + 16], m1 + movu [r0 + r1], m4 + movu [r0 + r1 + 16], m5 + + movu m1, [r2] + movu m3, [r3] + pmovzxbw m0, m1 + pmovzxbw m2, m3 + punpckhbw m1, m6 + punpckhbw m3, m6 + + psubw m0, m2 + psubw m1, m3 + + movu m5, [r2 + r4] + movu m3, [r3 + r5] + dec r6d + lea r2, [r2 + r4 * 2] + lea r3, [r3 + r5 * 2] + lea r0, [r0 + r1 * 2] + pmovzxbw m4, m5 + pmovzxbw m2, m3 + punpckhbw m5, m6 + punpckhbw m3, m6 + + psubw m4, m2 + psubw m5, m3 + + movu [r0], m0 + movu [r0 + 16], m1 + movu [r0 + r1], m4 + movu [r0 + r1 + 16], m5 + lea r0, [r0 + r1 * 2] + + jnz .loop + RET +%endif +%endmacro + +%if HIGH_BIT_DEPTH +INIT_XMM sse2 +PIXELSUB_PS_W16_H4 16, 16 +PIXELSUB_PS_W16_H4 16, 32 +%else +INIT_XMM sse4 +PIXELSUB_PS_W16_H4 16, 16 +PIXELSUB_PS_W16_H4 16, 32 +%endif + + +;----------------------------------------------------------------------------- +; void pixel_sub_ps_32x%2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1); +;----------------------------------------------------------------------------- +%macro PIXELSUB_PS_W32_H2 2 +%if HIGH_BIT_DEPTH +cglobal pixel_sub_ps_32x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1 + mov r6d, %2/2 + add r4, r4 + add r5, r5 + add r1, r1 +.loop: + movu m0, [r2] + movu m2, [r2 + 16] + movu m4, [r2 + 32] + movu m6, [r2 + 48] + movu m1, [r3] + movu m3, [r3 + 16] + movu m5, [r3 + 32] + movu m7, [r3 + 48] + dec r6d + + psubw m0, m1 + psubw m2, m3 + psubw m4, m5 + psubw m6, m7 + + movu [r0], m0 + movu [r0 + 16], m2 + movu [r0 + 32], m4 + movu [r0 + 48], m6 + + movu m0, [r2 + r4] + movu m2, [r2 + r4 + 16] + movu m4, [r2 + r4 + 32] + movu m6, [r2 + r4 + 48] + movu m1, [r3 + r5] + movu m3, [r3 + r5 + 16] + movu m5, [r3 + r5 + 32] + movu m7, [r3 + r5 + 48] + lea r2, [r2 + r4 * 2] + lea r3, [r3 + r5 * 2] + + psubw m0, m1 + psubw m2, m3 + psubw m4, m5 + psubw m6, m7 + + movu [r0 + r1], m0 + movu 
[r0 + r1 + 16], m2 + movu [r0 + r1 + 32], m4 + movu [r0 + r1 + 48], m6 + lea r0, [r0 + r1 * 2] + + jnz .loop + RET +%else +cglobal pixel_sub_ps_32x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1 + mov r6d, %2/2 + add r1, r1 +.loop: + movh m0, [r2] + movh m1, [r2 + 8] + movh m2, [r2 + 16] + movh m6, [r2 + 24] + movh m3, [r3] + movh m4, [r3 + 8] + movh m5, [r3 + 16] + movh m7, [r3 + 24] + dec r6d + pmovzxbw m0, m0 + pmovzxbw m1, m1 + pmovzxbw m2, m2 + pmovzxbw m6, m6 + pmovzxbw m3, m3 + pmovzxbw m4, m4 + pmovzxbw m5, m5 + pmovzxbw m7, m7 + + psubw m0, m3 + psubw m1, m4 + psubw m2, m5 + psubw m6, m7 + + movu [r0], m0 + movu [r0 + 16], m1 + movu [r0 + 32], m2 + movu [r0 + 48], m6 + + movh m0, [r2 + r4] + movh m1, [r2 + r4 + 8] + movh m2, [r2 + r4 + 16] + movh m6, [r2 + r4 + 24] + movh m3, [r3 + r5] + movh m4, [r3 + r5 + 8] + movh m5, [r3 + r5 + 16] + movh m7, [r3 + r5 + 24] + lea r2, [r2 + r4 * 2] + lea r3, [r3 + r5 * 2] + pmovzxbw m0, m0 + pmovzxbw m1, m1 + pmovzxbw m2, m2 + pmovzxbw m6, m6 + pmovzxbw m3, m3 + pmovzxbw m4, m4 + pmovzxbw m5, m5 + pmovzxbw m7, m7 + + psubw m0, m3 + psubw m1, m4 + psubw m2, m5 + psubw m6, m7 + + movu [r0 + r1], m0 + movu [r0 + r1 + 16], m1 + movu [r0 + r1 + 32], m2 + movu [r0 + r1 + 48], m6 + lea r0, [r0 + r1 * 2] + + jnz .loop + RET +%endif +%endmacro + +%if HIGH_BIT_DEPTH +INIT_XMM sse2 +PIXELSUB_PS_W32_H2 32, 32 +PIXELSUB_PS_W32_H2 32, 64 +%else +INIT_XMM sse4 +PIXELSUB_PS_W32_H2 32, 32 +PIXELSUB_PS_W32_H2 32, 64 +%endif + + +;----------------------------------------------------------------------------- +; void pixel_sub_ps_64x%2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1); +;----------------------------------------------------------------------------- +%macro PIXELSUB_PS_W64_H2 2 +%if HIGH_BIT_DEPTH +cglobal pixel_sub_ps_64x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1 + mov r6d, %2/2 + add r4, r4 + add r5, r5 + add r1, r1 +.loop: + movu 
m0, [r2] + movu m2, [r2 + 16] + movu m4, [r2 + 32] + movu m6, [r2 + 48] + movu m1, [r3] + movu m3, [r3 + 16] + movu m5, [r3 + 32] + movu m7, [r3 + 48] + + psubw m0, m1 + psubw m2, m3 + psubw m4, m5 + psubw m6, m7 + + movu [r0], m0 + movu [r0 + 16], m2 + movu [r0 + 32], m4 + movu [r0 + 48], m6 + + movu m0, [r2 + 64] + movu m2, [r2 + 80] + movu m4, [r2 + 96] + movu m6, [r2 + 112] + movu m1, [r3 + 64] + movu m3, [r3 + 80] + movu m5, [r3 + 96] + movu m7, [r3 + 112] + + psubw m0, m1 + psubw m2, m3 + psubw m4, m5 + psubw m6, m7 + + movu [r0 + 64], m0 + movu [r0 + 80], m2 + movu [r0 + 96], m4 + movu [r0 + 112], m6 + + movu m0, [r2 + r4] + movu m2, [r2 + r4 + 16] + movu m4, [r2 + r4 + 32] + movu m6, [r2 + r4 + 48] + movu m1, [r3 + r5] + movu m3, [r3 + r5 + 16] + movu m5, [r3 + r5 + 32] + movu m7, [r3 + r5 + 48] + + psubw m0, m1 + psubw m2, m3 + psubw m4, m5 + psubw m6, m7 + + movu [r0 + r1], m0 + movu [r0 + r1 + 16], m2 + movu [r0 + r1 + 32], m4 + movu [r0 + r1 + 48], m6 + + movu m0, [r2 + r4 + 64] + movu m2, [r2 + r4 + 80] + movu m4, [r2 + r4 + 96] + movu m6, [r2 + r4 + 112] + movu m1, [r3 + r5 + 64] + movu m3, [r3 + r5 + 80] + movu m5, [r3 + r5 + 96] + movu m7, [r3 + r5 + 112] + dec r6d + lea r2, [r2 + r4 * 2] + lea r3, [r3 + r5 * 2] + + psubw m0, m1 + psubw m2, m3 + psubw m4, m5 + psubw m6, m7 + + movu [r0 + r1 + 64], m0 + movu [r0 + r1 + 80], m2 + movu [r0 + r1 + 96], m4 + movu [r0 + r1 + 112], m6 + lea r0, [r0 + r1 * 2] + + jnz .loop + RET +%else +cglobal pixel_sub_ps_64x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1 + mov r6d, %2/2 + pxor m6, m6 + add r1, r1 +.loop: + movu m1, [r2] + movu m5, [r2 + 16] + movu m3, [r3] + movu m7, [r3 + 16] + + pmovzxbw m0, m1 + pmovzxbw m4, m5 + pmovzxbw m2, m3 + punpckhbw m1, m6 + punpckhbw m3, m6 + punpckhbw m5, m6 + + psubw m0, m2 + psubw m1, m3 + pmovzxbw m2, m7 + punpckhbw m7, m6 + psubw m4, m2 + psubw m5, m7 + + movu m3, [r2 + 32] + movu m7, [r3 + 32] + pmovzxbw m2, m3 + punpckhbw m3, m6 + + movu [r0], m0 + 
; NOTE(review): this span is the tail of PIXELSUB_PS_W64_H2 (8-bit path), its
; 64x64 instantiation, and the first of the variance helper macros.
; Tail of the 8-bit 64-wide loop: the register roles rotate each stanza so
; loads for the next 16 pixels overlap the widen/subtract of the previous
; ones; m6 remains the zero register for punpckhbw throughout.
movu [r0 + 16], m1
movu [r0 + 32], m4
movu [r0 + 48], m5

movu m1, [r2 + 48]
movu m5, [r3 + 48]
pmovzxbw m0, m1
pmovzxbw m4, m7
punpckhbw m1, m6
punpckhbw m7, m6

psubw m2, m4
psubw m3, m7

movu [r0 + 64], m2
movu [r0 + 80], m3

movu m7, [r2 + r4]
movu m3, [r3 + r5]
pmovzxbw m2, m5
pmovzxbw m4, m7
punpckhbw m5, m6
punpckhbw m7, m6

psubw m0, m2
psubw m1, m5

movu [r0 + 96], m0
movu [r0 + 112], m1

movu m2, [r2 + r4 + 16]
movu m5, [r3 + r5 + 16]
pmovzxbw m0, m3
pmovzxbw m1, m2
punpckhbw m3, m6
punpckhbw m2, m6

psubw m4, m0
psubw m7, m3

movu [r0 + r1], m4
movu [r0 + r1 + 16], m7

movu m0, [r2 + r4 + 32]
movu m3, [r3 + r5 + 32]
dec r6d                    ; row-pair counter; flags survive the SIMD ops below
pmovzxbw m4, m5
pmovzxbw m7, m0
punpckhbw m5, m6
punpckhbw m0, m6

psubw m1, m4
psubw m2, m5

movu [r0 + r1 + 32], m1
movu [r0 + r1 + 48], m2

movu m4, [r2 + r4 + 48]
movu m5, [r3 + r5 + 48]
lea r2, [r2 + r4 * 2]
lea r3, [r3 + r5 * 2]
pmovzxbw m1, m3
pmovzxbw m2, m4
punpckhbw m3, m6
punpckhbw m4, m6

psubw m7, m1
psubw m0, m3

movu [r0 + r1 + 64], m7
movu [r0 + r1 + 80], m0

pmovzxbw m7, m5
punpckhbw m5, m6
psubw m2, m7
psubw m4, m5

movu [r0 + r1 + 96], m2
movu [r0 + r1 + 112], m4
lea r0, [r0 + r1 * 2]

jnz .loop
RET
%endif
%endmacro

; Instantiate the 64-wide residual (64x64 only).
%if HIGH_BIT_DEPTH
INIT_XMM sse2
PIXELSUB_PS_W64_H2 64, 64
%else
INIT_XMM sse4
PIXELSUB_PS_W64_H2 64, 64
%endif


;=============================================================================
; variance
;=============================================================================

; Initialise the variance accumulators: m5 = running pixel sum,
; m6 = running sum of squares.  %1 selects the 8-bit flavour that needs the
; 0x00ff byte mask in m7 (used with DEINTB elsewhere); otherwise m7 is a
; plain zero register for byte unpacking (skipped for 32-byte registers).
%macro VAR_START 1
    pxor m5, m5 ; sum
    pxor m6, m6 ; sum squared
%if HIGH_BIT_DEPTH == 0
%if %1
    mova m7, [pw_00ff]
%elif mmsize < 32
    pxor m7, m7 ; zero
%endif
%endif ; !HIGH_BIT_DEPTH
%endmacro

; Horizontally reduce the accumulators at the end of a pixel_var_%1x%2 body.
; For wide high-bit-depth blocks, the partial sum banked in r4d by the caller
; is folded in — presumably to keep the word-width sum from overflowing;
; TODO(review) confirm against the 32x32/64x64 callers.
%macro VAR_END 2
%if HIGH_BIT_DEPTH
%if mmsize == 8 && %1*%2 == 256
    HADDUW m5, m2
%else
%if %1 >= 32
    HADDW m5, m2
    movd m7, r4d
    paddd m5, m7
%else
+ HADDW m5, m2 +%endif +%endif +%else ; !HIGH_BIT_DEPTH +%if %1 == 64 + HADDW m5, m2 + movd m7, r4d + paddd m5, m7 +%else + HADDW m5, m2 +%endif +%endif ; HIGH_BIT_DEPTH + HADDD m6, m1 +%if ARCH_X86_64 + punpckldq m5, m6 + movq rax, m5 +%else + movd eax, m5 + movd edx, m6 +%endif + RET +%endmacro + +%macro VAR_CORE 0 + paddw m5, m0 + paddw m5, m3 + paddw m5, m1 + paddw m5, m4 + pmaddwd m0, m0 + pmaddwd m3, m3 + pmaddwd m1, m1 + pmaddwd m4, m4 + paddd m6, m0 + paddd m6, m3 + paddd m6, m1 + paddd m6, m4 +%endmacro + +%macro VAR_2ROW 3 + mov r2d, %2 +.loop%3: +%if HIGH_BIT_DEPTH + movu m0, [r0] + movu m1, [r0+mmsize] + movu m3, [r0+%1] + movu m4, [r0+%1+mmsize] +%else ; !HIGH_BIT_DEPTH + mova m0, [r0] + punpckhbw m1, m0, m7 + mova m3, [r0+%1] + mova m4, m3 + punpcklbw m0, m7 +%endif ; HIGH_BIT_DEPTH +%ifidn %1, r1 + lea r0, [r0+%1*2] +%else + add r0, r1 +%endif +%if HIGH_BIT_DEPTH == 0 + punpcklbw m3, m7 + punpckhbw m4, m7 +%endif ; !HIGH_BIT_DEPTH + VAR_CORE + dec r2d + jg .loop%3 +%endmacro + +;----------------------------------------------------------------------------- +; int pixel_var_wxh( uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +INIT_MMX mmx2 +cglobal pixel_var_16x16, 2,3 + FIX_STRIDES r1 + VAR_START 0 + VAR_2ROW 8*SIZEOF_PIXEL, 16, 1 + VAR_END 16, 16 + +cglobal pixel_var_8x8, 2,3 + FIX_STRIDES r1 + VAR_START 0 + VAR_2ROW r1, 4, 1 + VAR_END 8, 8 + +%if HIGH_BIT_DEPTH +%macro VAR 0 +cglobal pixel_var_16x16, 2,3,8 + FIX_STRIDES r1 + VAR_START 0 + VAR_2ROW r1, 8, 1 + VAR_END 16, 16 + +cglobal pixel_var_8x8, 2,3,8 + lea r2, [r1*3] + VAR_START 0 + movu m0, [r0] + movu m1, [r0+r1*2] + movu m3, [r0+r1*4] + movu m4, [r0+r2*2] + lea r0, [r0+r1*8] + VAR_CORE + movu m0, [r0] + movu m1, [r0+r1*2] + movu m3, [r0+r1*4] + movu m4, [r0+r2*2] + VAR_CORE + VAR_END 8, 8 + +cglobal pixel_var_32x32, 2,6,8 + FIX_STRIDES r1 + mov r3, r0 + VAR_START 0 + VAR_2ROW r1, 8, 1 + HADDW m5, m2 + movd r4d, m5 + pxor m5, m5 + VAR_2ROW 
r1, 8, 2 + HADDW m5, m2 + movd r5d, m5 + add r4, r5 + pxor m5, m5 + lea r0, [r3 + 32] + VAR_2ROW r1, 8, 3 + HADDW m5, m2 + movd r5d, m5 + add r4, r5 + pxor m5, m5 + VAR_2ROW r1, 8, 4 + VAR_END 32, 32 + +cglobal pixel_var_64x64, 2,6,8 + FIX_STRIDES r1 + mov r3, r0 + VAR_START 0 + VAR_2ROW r1, 8, 1 + HADDW m5, m2 + movd r4d, m5 + pxor m5, m5 + VAR_2ROW r1, 8, 2 + HADDW m5, m2 + movd r5d, m5 + add r4, r5 + pxor m5, m5 + VAR_2ROW r1, 8, 3 + HADDW m5, m2 + movd r5d, m5 + add r4, r5 + pxor m5, m5 + VAR_2ROW r1, 8, 4 + HADDW m5, m2 + movd r5d, m5 + add r4, r5 + pxor m5, m5 + lea r0, [r3 + 32] + VAR_2ROW r1, 8, 5 + HADDW m5, m2 + movd r5d, m5 + add r4, r5 + pxor m5, m5 + VAR_2ROW r1, 8, 6 + HADDW m5, m2 + movd r5d, m5 + add r4, r5 + pxor m5, m5 + VAR_2ROW r1, 8, 7 + HADDW m5, m2 + movd r5d, m5 + add r4, r5 + pxor m5, m5 + VAR_2ROW r1, 8, 8 + HADDW m5, m2 + movd r5d, m5 + add r4, r5 + pxor m5, m5 + lea r0, [r3 + 64] + VAR_2ROW r1, 8, 9 + HADDW m5, m2 + movd r5d, m5 + add r4, r5 + pxor m5, m5 + VAR_2ROW r1, 8, 10 + HADDW m5, m2 + movd r5d, m5 + add r4, r5 + pxor m5, m5 + VAR_2ROW r1, 8, 11 + HADDW m5, m2 + movd r5d, m5 + add r4, r5 + pxor m5, m5 + VAR_2ROW r1, 8, 12 + HADDW m5, m2 + movd r5d, m5 + add r4, r5 + pxor m5, m5 + lea r0, [r3 + 96] + VAR_2ROW r1, 8, 13 + HADDW m5, m2 + movd r5d, m5 + add r4, r5 + pxor m5, m5 + VAR_2ROW r1, 8, 14 + HADDW m5, m2 + movd r5d, m5 + add r4, r5 + pxor m5, m5 + VAR_2ROW r1, 8, 15 + HADDW m5, m2 + movd r5d, m5 + add r4, r5 + pxor m5, m5 + VAR_2ROW r1, 8, 16 + VAR_END 64, 64 +%endmacro ; VAR + +INIT_XMM sse2 +VAR +INIT_XMM avx +VAR +INIT_XMM xop +VAR +%endif ; HIGH_BIT_DEPTH + +%if HIGH_BIT_DEPTH == 0 +%macro VAR 0 +cglobal pixel_var_8x8, 2,3,8 + VAR_START 1 + lea r2, [r1 * 3] + movh m0, [r0] + movh m3, [r0 + r1] + movhps m0, [r0 + r1 * 2] + movhps m3, [r0 + r2] + DEINTB 1, 0, 4, 3, 7 + lea r0, [r0 + r1 * 4] + VAR_CORE + movh m0, [r0] + movh m3, [r0 + r1] + movhps m0, [r0 + r1 * 2] + movhps m3, [r0 + r2] + DEINTB 1, 0, 4, 3, 7 + VAR_CORE + 
VAR_END 8, 8 + +cglobal pixel_var_16x16_internal + movu m0, [r0] + movu m3, [r0 + r1] + DEINTB 1, 0, 4, 3, 7 + VAR_CORE + movu m0, [r0 + 2 * r1] + movu m3, [r0 + r2] + DEINTB 1, 0, 4, 3, 7 + lea r0, [r0 + r1 * 4] + VAR_CORE + movu m0, [r0] + movu m3, [r0 + r1] + DEINTB 1, 0, 4, 3, 7 + VAR_CORE + movu m0, [r0 + 2 * r1] + movu m3, [r0 + r2] + DEINTB 1, 0, 4, 3, 7 + lea r0, [r0 + r1 * 4] + VAR_CORE + movu m0, [r0] + movu m3, [r0 + r1] + DEINTB 1, 0, 4, 3, 7 + VAR_CORE + movu m0, [r0 + 2 * r1] + movu m3, [r0 + r2] + DEINTB 1, 0, 4, 3, 7 + lea r0, [r0 + r1 * 4] + VAR_CORE + movu m0, [r0] + movu m3, [r0 + r1] + DEINTB 1, 0, 4, 3, 7 + VAR_CORE + movu m0, [r0 + 2 * r1] + movu m3, [r0 + r2] + DEINTB 1, 0, 4, 3, 7 + VAR_CORE + ret + +cglobal pixel_var_16x16, 2,3,8 + VAR_START 1 + lea r2, [r1 * 3] + call pixel_var_16x16_internal + VAR_END 16, 16 + +cglobal pixel_var_32x32, 2,4,8 + VAR_START 1 + lea r2, [r1 * 3] + mov r3, r0 + call pixel_var_16x16_internal + lea r0, [r0 + r1 * 4] + call pixel_var_16x16_internal + lea r0, [r3 + 16] + call pixel_var_16x16_internal + lea r0, [r0 + r1 * 4] + call pixel_var_16x16_internal + VAR_END 32, 32 + +cglobal pixel_var_64x64, 2,6,8 + VAR_START 1 + lea r2, [r1 * 3] + mov r3, r0 + call pixel_var_16x16_internal + lea r0, [r0 + r1 * 4] + call pixel_var_16x16_internal + lea r0, [r0 + r1 * 4] + call pixel_var_16x16_internal + lea r0, [r0 + r1 * 4] + call pixel_var_16x16_internal + HADDW m5, m2 + movd r4d, m5 + pxor m5, m5 + lea r0, [r3 + 16] + call pixel_var_16x16_internal + lea r0, [r0 + r1 * 4] + call pixel_var_16x16_internal + lea r0, [r0 + r1 * 4] + call pixel_var_16x16_internal + lea r0, [r0 + r1 * 4] + call pixel_var_16x16_internal + HADDW m5, m2 + movd r5d, m5 + add r4, r5 + pxor m5, m5 + lea r0, [r3 + 32] + call pixel_var_16x16_internal + lea r0, [r0 + r1 * 4] + call pixel_var_16x16_internal + lea r0, [r0 + r1 * 4] + call pixel_var_16x16_internal + lea r0, [r0 + r1 * 4] + call pixel_var_16x16_internal + lea r0, [r3 + 48] + HADDW m5, m2 + 
movd r5d, m5 + add r4, r5 + pxor m5, m5 + call pixel_var_16x16_internal + lea r0, [r0 + r1 * 4] + call pixel_var_16x16_internal + lea r0, [r0 + r1 * 4] + call pixel_var_16x16_internal + lea r0, [r0 + r1 * 4] + call pixel_var_16x16_internal + VAR_END 64, 64 +%endmacro ; VAR + +INIT_XMM sse2 +VAR +INIT_XMM avx +VAR +INIT_XMM xop +VAR + +INIT_YMM avx2 +cglobal pixel_var_16x16, 2,4,7 + VAR_START 0 + mov r2d, 4 + lea r3, [r1*3] +.loop: + pmovzxbw m0, [r0] + pmovzxbw m3, [r0+r1] + pmovzxbw m1, [r0+r1*2] + pmovzxbw m4, [r0+r3] + lea r0, [r0+r1*4] + VAR_CORE + dec r2d + jg .loop + vextracti128 xm0, m5, 1 + vextracti128 xm1, m6, 1 + paddw xm5, xm0 + paddd xm6, xm1 + HADDW xm5, xm2 + HADDD xm6, xm1 +%if ARCH_X86_64 + punpckldq xm5, xm6 + movq rax, xm5 +%else + movd eax, xm5 + movd edx, xm6 +%endif + RET +%endif ; !HIGH_BIT_DEPTH + +%macro VAR2_END 3 + HADDW %2, xm1 + movd r1d, %2 + imul r1d, r1d + HADDD %3, xm1 + shr r1d, %1 + movd eax, %3 + movd [r4], %3 + sub eax, r1d ; sqr - (sum * sum >> shift) + RET +%endmacro + diff --git a/source/common/x86/pixel.h b/source/common/x86/pixel.h new file mode 100644 index 0000000..e99b1ee --- /dev/null +++ b/source/common/x86/pixel.h @@ -0,0 +1,227 @@ +/***************************************************************************** + * pixel.h: x86 pixel metrics + ***************************************************************************** + * Copyright (C) 2003-2013 x264 project + * + * Authors: Laurent Aimar + * Loren Merritt + * Fiona Glaser + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#ifndef X265_I386_PIXEL_H +#define X265_I386_PIXEL_H + +#define DECL_PIXELS(ret, name, suffix, args) \ + ret x265_pixel_ ## name ## _16x64_ ## suffix args; \ + ret x265_pixel_ ## name ## _16x32_ ## suffix args; \ + ret x265_pixel_ ## name ## _16x16_ ## suffix args; \ + ret x265_pixel_ ## name ## _16x12_ ## suffix args; \ + ret x265_pixel_ ## name ## _16x8_ ## suffix args; \ + ret x265_pixel_ ## name ## _16x4_ ## suffix args; \ + ret x265_pixel_ ## name ## _8x32_ ## suffix args; \ + ret x265_pixel_ ## name ## _8x16_ ## suffix args; \ + ret x265_pixel_ ## name ## _8x8_ ## suffix args; \ + ret x265_pixel_ ## name ## _8x4_ ## suffix args; \ + ret x265_pixel_ ## name ## _4x16_ ## suffix args; \ + ret x265_pixel_ ## name ## _4x8_ ## suffix args; \ + ret x265_pixel_ ## name ## _4x4_ ## suffix args; \ + ret x265_pixel_ ## name ## _32x8_ ## suffix args; \ + ret x265_pixel_ ## name ## _32x16_ ## suffix args; \ + ret x265_pixel_ ## name ## _32x24_ ## suffix args; \ + ret x265_pixel_ ## name ## _24x32_ ## suffix args; \ + ret x265_pixel_ ## name ## _32x32_ ## suffix args; \ + ret x265_pixel_ ## name ## _32x64_ ## suffix args; \ + ret x265_pixel_ ## name ## _64x16_ ## suffix args; \ + ret x265_pixel_ ## name ## _64x32_ ## suffix args; \ + ret x265_pixel_ ## name ## _64x48_ ## suffix args; \ + ret x265_pixel_ ## name ## _64x64_ ## suffix args; \ + ret x265_pixel_ ## name ## _48x64_ ## suffix args; \ + ret x265_pixel_ ## name ## _24x32_ ## suffix args; \ + ret x265_pixel_ ## name ## _12x16_ ## 
suffix args; \ + +#define DECL_X1(name, suffix) \ + DECL_PIXELS(int, name, suffix, (pixel *, intptr_t, pixel *, intptr_t)) + +#define DECL_X1_SS(name, suffix) \ + DECL_PIXELS(int, name, suffix, (int16_t *, intptr_t, int16_t *, intptr_t)) + +#define DECL_X1_SP(name, suffix) \ + DECL_PIXELS(int, name, suffix, (int16_t *, intptr_t, pixel *, intptr_t)) + +#define DECL_X4(name, suffix) \ + DECL_PIXELS(void, name ## _x3, suffix, (pixel *, pixel *, pixel *, pixel *, intptr_t, int *)) \ + DECL_PIXELS(void, name ## _x4, suffix, (pixel *, pixel *, pixel *, pixel *, pixel *, intptr_t, int *)) + +/* sad-a.asm */ +DECL_X1(sad, mmx2) +DECL_X1(sad, sse2) +DECL_X4(sad, sse2_misalign) +DECL_X1(sad, sse3) +DECL_X1(sad, sse2_aligned) +DECL_X1(sad, ssse3) +DECL_X1(sad, ssse3_aligned) +DECL_X1(sad, avx2) +DECL_X1(sad, avx2_aligned) +DECL_X4(sad, mmx2) +DECL_X4(sad, sse2) +DECL_X4(sad, sse3) +DECL_X4(sad, ssse3) +DECL_X4(sad, avx) +DECL_X4(sad, avx2) +DECL_X1(sad, cache32_mmx2); +DECL_X1(sad, cache64_mmx2); +DECL_X1(sad, cache64_sse2); +DECL_X1(sad, cache64_ssse3); +DECL_X4(sad, cache32_mmx2); +DECL_X4(sad, cache64_mmx2); +DECL_X4(sad, cache64_sse2); +DECL_X4(sad, cache64_ssse3); + +/* pixel-a.asm */ +DECL_X1(satd, mmx2) +DECL_X1(satd, sse2) +DECL_X1(satd, ssse3) +DECL_X1(satd, ssse3_atom) +DECL_X1(satd, sse4) +DECL_X1(satd, avx) +DECL_X1(satd, xop) +DECL_X1(satd, avx2) +int x265_pixel_satd_8x32_sse2(pixel *, intptr_t, pixel *, intptr_t); +int x265_pixel_satd_16x4_sse2(pixel *, intptr_t, pixel *, intptr_t); +int x265_pixel_satd_16x12_sse2(pixel *, intptr_t, pixel *, intptr_t); +int x265_pixel_satd_16x32_sse2(pixel *, intptr_t, pixel *, intptr_t); +int x265_pixel_satd_16x64_sse2(pixel *, intptr_t, pixel *, intptr_t); + +DECL_X1(sa8d, mmx2) +DECL_X1(sa8d, sse2) +DECL_X1(sa8d, ssse3) +DECL_X1(sa8d, ssse3_atom) +DECL_X1(sa8d, sse4) +DECL_X1(sa8d, avx) +DECL_X1(sa8d, xop) +DECL_X1(sa8d, avx2) + +/* ssd-a.asm */ +DECL_X1(ssd, mmx) +DECL_X1(ssd, mmx2) +DECL_X1(ssd, sse2slow) +DECL_X1(ssd, 
sse2) +DECL_X1(ssd, ssse3) +DECL_X1(ssd, avx) +DECL_X1(ssd, xop) +DECL_X1(ssd, avx2) +DECL_X1_SS(ssd_ss, mmx) +DECL_X1_SS(ssd_ss, mmx2) +DECL_X1_SS(ssd_ss, sse2slow) +DECL_X1_SS(ssd_ss, sse2) +DECL_X1_SS(ssd_ss, ssse3) +DECL_X1_SS(ssd_ss, sse4) +DECL_X1_SS(ssd_ss, avx) +DECL_X1_SS(ssd_ss, xop) +DECL_X1_SS(ssd_ss, avx2) +DECL_X1_SP(ssd_sp, sse4) +#define DECL_HEVC_SSD(suffix) \ + int x265_pixel_ssd_32x64_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \ + int x265_pixel_ssd_16x64_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \ + int x265_pixel_ssd_32x32_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \ + int x265_pixel_ssd_32x16_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \ + int x265_pixel_ssd_16x32_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \ + int x265_pixel_ssd_32x24_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \ + int x265_pixel_ssd_24x32_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \ + int x265_pixel_ssd_32x8_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \ + int x265_pixel_ssd_8x32_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \ + int x265_pixel_ssd_16x16_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \ + int x265_pixel_ssd_16x8_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \ + int x265_pixel_ssd_8x16_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \ + int x265_pixel_ssd_16x12_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \ + int x265_pixel_ssd_16x4_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \ + int x265_pixel_ssd_8x8_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); \ + int x265_pixel_ssd_8x4_ ## suffix(pixel *, intptr_t, pixel *, intptr_t); +DECL_HEVC_SSD(sse2) +DECL_HEVC_SSD(ssse3) +DECL_HEVC_SSD(avx) + +int x265_pixel_ssd_12x16_sse4(pixel *, intptr_t, pixel *, intptr_t); +int x265_pixel_ssd_24x32_sse4(pixel *, intptr_t, pixel *, intptr_t); +int x265_pixel_ssd_48x64_sse4(pixel *, intptr_t, pixel *, intptr_t); +int x265_pixel_ssd_64x16_sse4(pixel *, intptr_t, pixel *, intptr_t); +int 
x265_pixel_ssd_64x32_sse4(pixel *, intptr_t, pixel *, intptr_t); +int x265_pixel_ssd_64x48_sse4(pixel *, intptr_t, pixel *, intptr_t); +int x265_pixel_ssd_64x64_sse4(pixel *, intptr_t, pixel *, intptr_t); + +int x265_pixel_ssd_s_4_sse2(int16_t *, intptr_t); +int x265_pixel_ssd_s_8_sse2(int16_t *, intptr_t); +int x265_pixel_ssd_s_16_sse2(int16_t *, intptr_t); +int x265_pixel_ssd_s_32_sse2(int16_t *, intptr_t); +int x265_pixel_ssd_s_32_avx2(int16_t *, intptr_t); + +#define ADDAVG(func) \ + void x265_ ## func ## _sse4(int16_t*, int16_t*, pixel*, intptr_t, intptr_t, intptr_t); +ADDAVG(addAvg_2x4) +ADDAVG(addAvg_2x8) +ADDAVG(addAvg_4x2); +ADDAVG(addAvg_4x4) +ADDAVG(addAvg_4x8) +ADDAVG(addAvg_4x16) +ADDAVG(addAvg_6x8) +ADDAVG(addAvg_8x2) +ADDAVG(addAvg_8x4) +ADDAVG(addAvg_8x6) +ADDAVG(addAvg_8x8) +ADDAVG(addAvg_8x16) +ADDAVG(addAvg_8x32) +ADDAVG(addAvg_12x16) +ADDAVG(addAvg_16x4) +ADDAVG(addAvg_16x8) +ADDAVG(addAvg_16x12) +ADDAVG(addAvg_16x16) +ADDAVG(addAvg_16x32) +ADDAVG(addAvg_16x64) +ADDAVG(addAvg_24x32) +ADDAVG(addAvg_32x8) +ADDAVG(addAvg_32x16) +ADDAVG(addAvg_32x24) +ADDAVG(addAvg_32x32) +ADDAVG(addAvg_32x64) +ADDAVG(addAvg_48x64) +ADDAVG(addAvg_64x16) +ADDAVG(addAvg_64x32) +ADDAVG(addAvg_64x48) +ADDAVG(addAvg_64x64) + +ADDAVG(addAvg_2x16) +ADDAVG(addAvg_4x32) +ADDAVG(addAvg_6x16) +ADDAVG(addAvg_8x12) +ADDAVG(addAvg_8x64) +ADDAVG(addAvg_12x32) +ADDAVG(addAvg_16x24) +ADDAVG(addAvg_24x64) +ADDAVG(addAvg_32x48) + +void x265_downShift_16_sse2(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask); +void x265_upShift_8_sse4(uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift); + +#undef DECL_PIXELS +#undef DECL_HEVC_SSD +#undef DECL_X1 +#undef DECL_X4 + +#endif // ifndef X265_I386_PIXEL_H diff --git a/source/common/x86/pixeladd8.asm b/source/common/x86/pixeladd8.asm new file mode 100644 index 0000000..27b6684 --- /dev/null +++ b/source/common/x86/pixeladd8.asm @@ 
-0,0 +1,740 @@ +;***************************************************************************** +;* Copyright (C) 2013 x265 project +;* +;* Authors: Praveen Kumar Tiwari +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +;* +;* This program is also available under a commercial proprietary license. +;* For more information, contact us at license @ x265.com. 
+;*****************************************************************************/ + +%include "x86inc.asm" +%include "x86util.asm" + +SECTION_RODATA 32 + +SECTION .text + +cextern pw_pixel_max + +;----------------------------------------------------------------------------- +; void pixel_add_ps_4x4(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1) +;----------------------------------------------------------------------------- +%if HIGH_BIT_DEPTH +INIT_XMM sse2 +cglobal pixel_add_ps_4x4, 6, 6, 6, dest, destride, src0, scr1, srcStride0, srcStride1 + mova m1, [pw_pixel_max] + pxor m0, m0 + add r4, r4 + add r5, r5 + add r1, r1 + movh m2, [r2] + movhps m2, [r2 + r4] + movh m3, [r3] + movhps m3, [r3 + r5] + lea r2, [r2 + r4 * 2] + lea r3, [r3 + r5 * 2] + movh m4, [r2] + movhps m4, [r2 + r4] + movh m5, [r3] + movhps m5, [r3 + r5] + + paddw m2, m3 + paddw m4, m5 + CLIPW2 m2, m4, m0, m1 + + movh [r0], m2 + movhps [r0 + r1], m2 + lea r0, [r0 + r1 * 2] + movh [r0], m4 + movhps [r0 + r1], m4 + + RET +%else +INIT_XMM sse4 +cglobal pixel_add_ps_4x4, 6, 6, 8, dest, destride, src0, scr1, srcStride0, srcStride1 + add r5, r5 + pmovzxbw m0, [r2] + pmovzxbw m2, [r2 + r4] + movh m1, [r3] + movh m3, [r3 + r5] + lea r2, [r2 + r4 * 2] + lea r3, [r3 + r5 * 2] + pmovzxbw m4, [r2] + pmovzxbw m6, [r2 + r4] + movh m5, [r3] + movh m7, [r3 + r5] + + paddw m0, m1 + paddw m2, m3 + paddw m4, m5 + paddw m6, m7 + packuswb m0, m0 + packuswb m2, m2 + packuswb m4, m4 + packuswb m6, m6 + + movd [r0], m0 + movd [r0 + r1], m2 + lea r0, [r0 + r1 * 2] + movd [r0], m4 + movd [r0 + r1], m6 + + RET +%endif + + +;----------------------------------------------------------------------------- +; void pixel_add_ps_4x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1) +;----------------------------------------------------------------------------- +%macro PIXEL_ADD_PS_W4_H4 2 +%if HIGH_BIT_DEPTH +INIT_XMM sse2 
+cglobal pixel_add_ps_4x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1 + mova m1, [pw_pixel_max] + pxor m0, m0 + mov r6d, %2/4 + add r4, r4 + add r5, r5 + add r1, r1 +.loop: + movh m2, [r2] + movhps m2, [r2 + r4] + movh m3, [r3] + movhps m3, [r3 + r5] + lea r2, [r2 + r4 * 2] + lea r3, [r3 + r5 * 2] + movh m4, [r2] + movhps m4, [r2 + r4] + movh m5, [r3] + movhps m5, [r3 + r5] + dec r6d + lea r2, [r2 + r4 * 2] + lea r3, [r3 + r5 * 2] + + paddw m2, m3 + paddw m4, m5 + CLIPW2 m2, m4, m0, m1 + + movh [r0], m2 + movhps [r0 + r1], m2 + lea r0, [r0 + r1 * 2] + movh [r0], m4 + movhps [r0 + r1], m4 + lea r0, [r0 + r1 * 2] + + jnz .loop + RET +%else +INIT_XMM sse4 +cglobal pixel_add_ps_4x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1 + mov r6d, %2/4 + add r5, r5 +.loop: + pmovzxbw m0, [r2] + pmovzxbw m2, [r2 + r4] + movh m1, [r3] + movh m3, [r3 + r5] + lea r2, [r2 + r4 * 2] + lea r3, [r3 + r5 * 2] + pmovzxbw m4, [r2] + pmovzxbw m6, [r2 + r4] + movh m5, [r3] + movh m7, [r3 + r5] + dec r6d + lea r2, [r2 + r4 * 2] + lea r3, [r3 + r5 * 2] + + paddw m0, m1 + paddw m2, m3 + paddw m4, m5 + paddw m6, m7 + packuswb m0, m0 + packuswb m2, m2 + packuswb m4, m4 + packuswb m6, m6 + + movd [r0], m0 + movd [r0 + r1], m2 + lea r0, [r0 + r1 * 2] + movd [r0], m4 + movd [r0 + r1], m6 + lea r0, [r0 + r1 * 2] + + jnz .loop + RET +%endif +%endmacro + +PIXEL_ADD_PS_W4_H4 4, 8 + + +;----------------------------------------------------------------------------- +; void pixel_add_ps_8x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1) +;----------------------------------------------------------------------------- +%macro PIXEL_ADD_PS_W8_H4 2 +%if HIGH_BIT_DEPTH +INIT_XMM sse2 +cglobal pixel_add_ps_8x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1 + mova m5, [pw_pixel_max] + pxor m4, m4 + mov r6d, %2/4 + add r4, r4 + add r5, r5 + add r1, r1 +.loop: + movu m0, [r2] + movu m2, [r2 + r4] + movu m1, [r3] + 
movu m3, [r3 + r5] + lea r2, [r2 + r4 * 2] + lea r3, [r3 + r5 * 2] + + paddw m0, m1 + paddw m2, m3 + CLIPW2 m0, m2, m4, m5 + + movu [r0], m0 + movu [r0 + r1], m2 + + movu m0, [r2] + movu m2, [r2 + r4] + movu m1, [r3] + movu m3, [r3 + r5] + dec r6d + lea r0, [r0 + r1 * 2] + lea r2, [r2 + r4 * 2] + lea r3, [r3 + r5 * 2] + + paddw m0, m1 + paddw m2, m3 + CLIPW2 m0, m2, m4, m5 + + movu [r0], m0 + movu [r0 + r1], m2 + lea r0, [r0 + r1 * 2] + + jnz .loop + RET +%else +INIT_XMM sse4 +cglobal pixel_add_ps_8x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1 + mov r6d, %2/4 + add r5, r5 +.loop: + pmovzxbw m0, [r2] + pmovzxbw m2, [r2 + r4] + movu m1, [r3] + movu m3, [r3 + r5] + lea r2, [r2 + r4 * 2] + lea r3, [r3 + r5 * 2] + pmovzxbw m4, [r2] + pmovzxbw m6, [r2 + r4] + movu m5, [r3] + movu m7, [r3 + r5] + dec r6d + lea r2, [r2 + r4 * 2] + lea r3, [r3 + r5 * 2] + + paddw m0, m1 + paddw m2, m3 + paddw m4, m5 + paddw m6, m7 + packuswb m0, m0 + packuswb m2, m2 + packuswb m4, m4 + packuswb m6, m6 + + movh [r0], m0 + movh [r0 + r1], m2 + lea r0, [r0 + r1 * 2] + movh [r0], m4 + movh [r0 + r1], m6 + lea r0, [r0 + r1 * 2] + + jnz .loop + RET +%endif +%endmacro + +PIXEL_ADD_PS_W8_H4 8, 8 +PIXEL_ADD_PS_W8_H4 8, 16 + + +;----------------------------------------------------------------------------- +; void pixel_add_ps_16x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1) +;----------------------------------------------------------------------------- +%macro PIXEL_ADD_PS_W16_H4 2 +%if HIGH_BIT_DEPTH +INIT_XMM sse2 +cglobal pixel_add_ps_16x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1 + mova m5, [pw_pixel_max] + pxor m4, m4 + mov r6d, %2/4 + add r4, r4 + add r5, r5 + add r1, r1 +.loop: + movu m0, [r2] + movu m2, [r2 + 16] + movu m1, [r3] + movu m3, [r3 + 16] + + paddw m0, m1 + paddw m2, m3 + CLIPW2 m0, m2, m4, m5 + + movu [r0], m0 + movu [r0 + 16], m2 + + movu m0, [r2 + r4] + movu m2, [r2 + r4 + 16] + 
movu m1, [r3 + r5] + movu m3, [r3 + r5 + 16] + lea r2, [r2 + r4 * 2] + lea r3, [r3 + r5 * 2] + + paddw m0, m1 + paddw m2, m3 + CLIPW2 m0, m2, m4, m5 + + movu [r0 + r1], m0 + movu [r0 + r1 + 16], m2 + + movu m0, [r2] + movu m2, [r2 + 16] + movu m1, [r3] + movu m3, [r3 + 16] + lea r0, [r0 + r1 * 2] + + paddw m0, m1 + paddw m2, m3 + CLIPW2 m0, m2, m4, m5 + + movu [r0], m0 + movu [r0 + 16], m2 + + movu m0, [r2 + r4] + movu m2, [r2 + r4 + 16] + movu m1, [r3 + r5] + movu m3, [r3 + r5 + 16] + dec r6d + lea r2, [r2 + r4 * 2] + lea r3, [r3 + r5 * 2] + + paddw m0, m1 + paddw m2, m3 + CLIPW2 m0, m2, m4, m5 + + movu [r0 + r1], m0 + movu [r0 + r1 + 16], m2 + lea r0, [r0 + r1 * 2] + + jnz .loop + RET +%else +INIT_XMM sse4 +cglobal pixel_add_ps_16x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1 + mov r6d, %2/4 + add r5, r5 +.loop: + pmovzxbw m0, [r2] + pmovzxbw m1, [r2 + 8] + pmovzxbw m4, [r2 + r4] + pmovzxbw m5, [r2 + r4 + 8] + movu m2, [r3] + movu m3, [r3 + 16] + movu m6, [r3 + r5] + movu m7, [r3 + r5 + 16] + lea r2, [r2 + r4 * 2] + lea r3, [r3 + r5 * 2] + + paddw m0, m2 + paddw m1, m3 + paddw m4, m6 + paddw m5, m7 + packuswb m0, m1 + packuswb m4, m5 + + movu [r0], m0 + movu [r0 + r1], m4 + + pmovzxbw m0, [r2] + pmovzxbw m1, [r2 + 8] + pmovzxbw m4, [r2 + r4] + pmovzxbw m5, [r2 + r4 + 8] + movu m2, [r3] + movu m3, [r3 + 16] + movu m6, [r3 + r5] + movu m7, [r3 + r5 + 16] + dec r6d + lea r0, [r0 + r1 * 2] + lea r2, [r2 + r4 * 2] + lea r3, [r3 + r5 * 2] + + paddw m0, m2 + paddw m1, m3 + paddw m4, m6 + paddw m5, m7 + packuswb m0, m1 + packuswb m4, m5 + + movu [r0], m0 + movu [r0 + r1], m4 + lea r0, [r0 + r1 * 2] + + jnz .loop + RET +%endif +%endmacro + +PIXEL_ADD_PS_W16_H4 16, 16 +PIXEL_ADD_PS_W16_H4 16, 32 + + +;----------------------------------------------------------------------------- +; void pixel_add_ps_32x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1) 
+;----------------------------------------------------------------------------- +%macro PIXEL_ADD_PS_W32_H2 2 +%if HIGH_BIT_DEPTH +INIT_XMM sse2 +cglobal pixel_add_ps_32x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1 + mova m5, [pw_pixel_max] + pxor m4, m4 + mov r6d, %2/2 + add r4, r4 + add r5, r5 + add r1, r1 +.loop: + movu m0, [r2] + movu m2, [r2 + 16] + movu m1, [r3] + movu m3, [r3 + 16] + + paddw m0, m1 + paddw m2, m3 + CLIPW2 m0, m2, m4, m5 + + movu [r0], m0 + movu [r0 + 16], m2 + + movu m0, [r2 + 32] + movu m2, [r2 + 48] + movu m1, [r3 + 32] + movu m3, [r3 + 48] + + paddw m0, m1 + paddw m2, m3 + CLIPW2 m0, m2, m4, m5 + + movu [r0 + 32], m0 + movu [r0 + 48], m2 + + movu m0, [r2 + r4] + movu m2, [r2 + r4 + 16] + movu m1, [r3 + r5] + movu m3, [r3 + r5 + 16] + + paddw m0, m1 + paddw m2, m3 + CLIPW2 m0, m2, m4, m5 + + movu [r0 + r1], m0 + movu [r0 + r1 + 16], m2 + + movu m0, [r2 + r4 + 32] + movu m2, [r2 + r4 + 48] + movu m1, [r3 + r5 + 32] + movu m3, [r3 + r5 + 48] + dec r6d + lea r2, [r2 + r4 * 2] + lea r3, [r3 + r5 * 2] + + paddw m0, m1 + paddw m2, m3 + CLIPW2 m0, m2, m4, m5 + + movu [r0 + r1 + 32], m0 + movu [r0 + r1 + 48], m2 + lea r0, [r0 + r1 * 2] + + jnz .loop + RET +%else +INIT_XMM sse4 +cglobal pixel_add_ps_32x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1 + mov r6d, %2/2 + add r5, r5 +.loop: + pmovzxbw m0, [r2] + pmovzxbw m1, [r2 + 8] + pmovzxbw m2, [r2 + 16] + pmovzxbw m3, [r2 + 24] + movu m4, [r3] + movu m5, [r3 + 16] + movu m6, [r3 + 32] + movu m7, [r3 + 48] + + paddw m0, m4 + paddw m1, m5 + paddw m2, m6 + paddw m3, m7 + packuswb m0, m1 + packuswb m2, m3 + + movu [r0], m0 + movu [r0 + 16], m2 + + pmovzxbw m0, [r2 + r4] + pmovzxbw m1, [r2 + r4 + 8] + pmovzxbw m2, [r2 + r4 + 16] + pmovzxbw m3, [r2 + r4 + 24] + movu m4, [r3 + r5] + movu m5, [r3 + r5 + 16] + movu m6, [r3 + r5 + 32] + movu m7, [r3 + r5 + 48] + dec r6d + lea r2, [r2 + r4 * 2] + lea r3, [r3 + r5 * 2] + + paddw m0, m4 + paddw m1, m5 + paddw m2, m6 + paddw 
m3, m7 + packuswb m0, m1 + packuswb m2, m3 + + movu [r0 + r1], m0 + movu [r0 + r1 + 16], m2 + lea r0, [r0 + r1 * 2] + + jnz .loop + RET +%endif +%endmacro + +PIXEL_ADD_PS_W32_H2 32, 32 +PIXEL_ADD_PS_W32_H2 32, 64 + + +;----------------------------------------------------------------------------- +; void pixel_add_ps_64x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1) +;----------------------------------------------------------------------------- +%macro PIXEL_ADD_PS_W64_H2 2 +%if HIGH_BIT_DEPTH +INIT_XMM sse2 +cglobal pixel_add_ps_64x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1 + mova m5, [pw_pixel_max] + pxor m4, m4 + mov r6d, %2/2 + add r4, r4 + add r5, r5 + add r1, r1 +.loop: + movu m0, [r2] + movu m2, [r2 + 16] + movu m1, [r3] + movu m3, [r3 + 16] + + paddw m0, m1 + paddw m2, m3 + CLIPW2 m0, m2, m4, m5 + + movu [r0], m0 + movu [r0 + 16], m2 + + movu m0, [r2 + 32] + movu m2, [r2 + 48] + movu m1, [r3 + 32] + movu m3, [r3 + 48] + + paddw m0, m1 + paddw m2, m3 + CLIPW2 m0, m2, m4, m5 + + movu [r0 + 32], m0 + movu [r0 + 48], m2 + + movu m0, [r2 + 64] + movu m2, [r2 + 80] + movu m1, [r3 + 64] + movu m3, [r3 + 80] + + paddw m0, m1 + paddw m2, m3 + CLIPW2 m0, m2, m4, m5 + + movu [r0 + 64], m0 + movu [r0 + 80], m2 + + movu m0, [r2 + 96] + movu m2, [r2 + 112] + movu m1, [r3 + 96] + movu m3, [r3 + 112] + + paddw m0, m1 + paddw m2, m3 + CLIPW2 m0, m2, m4, m5 + + movu [r0 + 96], m0 + movu [r0 + 112], m2 + + movu m0, [r2 + r4] + movu m2, [r2 + r4 + 16] + movu m1, [r3 + r5] + movu m3, [r3 + r5 + 16] + + paddw m0, m1 + paddw m2, m3 + CLIPW2 m0, m2, m4, m5 + + movu [r0 + r1], m0 + movu [r0 + r1 + 16], m2 + + movu m0, [r2 + r4 + 32] + movu m2, [r2 + r4 + 48] + movu m1, [r3 + r5 + 32] + movu m3, [r3 + r5 + 48] + + paddw m0, m1 + paddw m2, m3 + CLIPW2 m0, m2, m4, m5 + + movu [r0 + r1 + 32], m0 + movu [r0 + r1 + 48], m2 + + movu m0, [r2 + r4 + 64] + movu m2, [r2 + r4 + 80] + movu m1, [r3 + r5 + 64] + movu 
m3, [r3 + r5 + 80] + + paddw m0, m1 + paddw m2, m3 + CLIPW2 m0, m2, m4, m5 + + movu [r0 + r1 + 64], m0 + movu [r0 + r1 + 80], m2 + + movu m0, [r2 + r4 + 96] + movu m2, [r2 + r4 + 112] + movu m1, [r3 + r5 + 96] + movu m3, [r3 + r5 + 112] + dec r6d + lea r2, [r2 + r4 * 2] + lea r3, [r3 + r5 * 2] + + paddw m0, m1 + paddw m2, m3 + CLIPW2 m0, m2, m4, m5 + + movu [r0 + r1 + 96], m0 + movu [r0 + r1 + 112], m2 + lea r0, [r0 + r1 * 2] + + jnz .loop + RET +%else +INIT_XMM sse4 +cglobal pixel_add_ps_64x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1 + mov r6d, %2/2 + add r5, r5 +.loop: + pmovzxbw m0, [r2] + pmovzxbw m1, [r2 + 8] + pmovzxbw m2, [r2 + 16] + pmovzxbw m3, [r2 + 24] + movu m4, [r3] + movu m5, [r3 + 16] + movu m6, [r3 + 32] + movu m7, [r3 + 48] + + paddw m0, m4 + paddw m1, m5 + paddw m2, m6 + paddw m3, m7 + packuswb m0, m1 + packuswb m2, m3 + + movu [r0], m0 + movu [r0 + 16], m2 + + pmovzxbw m0, [r2 + 32] + pmovzxbw m1, [r2 + 40] + pmovzxbw m2, [r2 + 48] + pmovzxbw m3, [r2 + 56] + movu m4, [r3 + 64] + movu m5, [r3 + 80] + movu m6, [r3 + 96] + movu m7, [r3 + 112] + + paddw m0, m4 + paddw m1, m5 + paddw m2, m6 + paddw m3, m7 + packuswb m0, m1 + packuswb m2, m3 + + movu [r0 + 32], m0 + movu [r0 + 48], m2 + + pmovzxbw m0, [r2 + r4] + pmovzxbw m1, [r2 + r4 + 8] + pmovzxbw m2, [r2 + r4 + 16] + pmovzxbw m3, [r2 + r4 + 24] + movu m4, [r3 + r5] + movu m5, [r3 + r5 + 16] + movu m6, [r3 + r5 + 32] + movu m7, [r3 + r5 + 48] + + paddw m0, m4 + paddw m1, m5 + paddw m2, m6 + paddw m3, m7 + packuswb m0, m1 + packuswb m2, m3 + + movu [r0 + r1], m0 + movu [r0 + r1 + 16], m2 + + pmovzxbw m0, [r2 + r4 + 32] + pmovzxbw m1, [r2 + r4 + 40] + pmovzxbw m2, [r2 + r4 + 48] + pmovzxbw m3, [r2 + r4 + 56] + movu m4, [r3 + r5 + 64] + movu m5, [r3 + r5 + 80] + movu m6, [r3 + r5 + 96] + movu m7, [r3 + r5 + 112] + dec r6d + lea r2, [r2 + r4 * 2] + lea r3, [r3 + r5 * 2] + + paddw m0, m4 + paddw m1, m5 + paddw m2, m6 + paddw m3, m7 + packuswb m0, m1 + packuswb m2, m3 + + movu [r0 + r1 
+ 32], m0 + movu [r0 + r1 + 48], m2 + lea r0, [r0 + r1 * 2] + + jnz .loop + RET +%endif +%endmacro + +PIXEL_ADD_PS_W64_H2 64, 64 diff --git a/source/common/x86/sad-a.asm b/source/common/x86/sad-a.asm new file mode 100644 index 0000000..8a8f4b5 --- /dev/null +++ b/source/common/x86/sad-a.asm @@ -0,0 +1,3712 @@ +;***************************************************************************** +;* sad-a.asm: x86 sad functions +;***************************************************************************** +;* Copyright (C) 2003-2013 x264 project +;* +;* Authors: Loren Merritt +;* Fiona Glaser +;* Laurent Aimar +;* Alex Izvorski +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +;* +;* This program is also available under a commercial proprietary license. +;* For more information, contact us at license @ x265.com. 
+;***************************************************************************** + +%include "x86inc.asm" +%include "x86util.asm" + +SECTION_RODATA 32 + +MSK: db 255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0 +pb_shuf8x8c2: times 2 db 0,0,0,0,8,8,8,8,-1,-1,-1,-1,-1,-1,-1,-1 +hpred_shuf: db 0,0,2,2,8,8,10,10,1,1,3,3,9,9,11,11 + +SECTION .text + +cextern pb_3 +cextern pb_shuf8x8c +cextern pw_8 +cextern sw_64 + +;============================================================================= +; SAD MMX +;============================================================================= + +%macro SAD_INC_2x16P 0 + movq mm1, [r0] + movq mm2, [r0+8] + movq mm3, [r0+r1] + movq mm4, [r0+r1+8] + psadbw mm1, [r2] + psadbw mm2, [r2+8] + psadbw mm3, [r2+r3] + psadbw mm4, [r2+r3+8] + lea r0, [r0+2*r1] + paddw mm1, mm2 + paddw mm3, mm4 + lea r2, [r2+2*r3] + paddw mm0, mm1 + paddw mm0, mm3 +%endmacro + +%macro SAD_INC_2x8P 0 + movq mm1, [r0] + movq mm2, [r0+r1] + psadbw mm1, [r2] + psadbw mm2, [r2+r3] + lea r0, [r0+2*r1] + paddw mm0, mm1 + paddw mm0, mm2 + lea r2, [r2+2*r3] +%endmacro + +%macro SAD_INC_2x4P 0 + movd mm1, [r0] + movd mm2, [r2] + punpckldq mm1, [r0+r1] + punpckldq mm2, [r2+r3] + psadbw mm1, mm2 + paddw mm0, mm1 + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] +%endmacro + +;----------------------------------------------------------------------------- +; int pixel_sad_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +%macro SAD 2 +cglobal pixel_sad_%1x%2_mmx2, 4,4 + pxor mm0, mm0 +%rep %2/2 + SAD_INC_2x%1P +%endrep + movd eax, mm0 + RET +%endmacro + +SAD 16, 16 +SAD 16, 8 +SAD 8, 16 +SAD 8, 8 +SAD 8, 4 +SAD 4, 16 +SAD 4, 8 +SAD 4, 4 + + + +;============================================================================= +; SAD XMM +;============================================================================= + +%macro SAD_END_SSE2 0 + movhlps m1, m0 + paddw m0, m1 + movd eax, m0 + RET +%endmacro 
+ +%macro PROCESS_SAD_12x4 0 + movu m1, [r2] + movu m2, [r0] + pand m1, m4 + pand m2, m4 + psadbw m1, m2 + paddd m0, m1 + lea r2, [r2 + r3] + lea r0, [r0 + r1] + movu m1, [r2] + movu m2, [r0] + pand m1, m4 + pand m2, m4 + psadbw m1, m2 + paddd m0, m1 + lea r2, [r2 + r3] + lea r0, [r0 + r1] + movu m1, [r2] + movu m2, [r0] + pand m1, m4 + pand m2, m4 + psadbw m1, m2 + paddd m0, m1 + lea r2, [r2 + r3] + lea r0, [r0 + r1] + movu m1, [r2] + movu m2, [r0] + pand m1, m4 + pand m2, m4 + psadbw m1, m2 + paddd m0, m1 +%endmacro + +%macro PROCESS_SAD_16x4 0 + movu m1, [r2] + movu m2, [r2 + r3] + psadbw m1, [r0] + psadbw m2, [r0 + r1] + paddd m1, m2 + paddd m0, m1 + lea r2, [r2 + 2 * r3] + lea r0, [r0 + 2 * r1] + movu m1, [r2] + movu m2, [r2 + r3] + psadbw m1, [r0] + psadbw m2, [r0 + r1] + paddd m1, m2 + paddd m0, m1 + lea r2, [r2 + 2 * r3] + lea r0, [r0 + 2 * r1] +%endmacro + +%macro PROCESS_SAD_24x4 0 + movu m1, [r2] + movq m2, [r2 + 16] + lea r2, [r2 + r3] + movu m3, [r2] + movq m4, [r2 + 16] + psadbw m1, [r0] + psadbw m3, [r0 + r1] + paddd m0, m1 + paddd m0, m3 + movq m1, [r0 + 16] + lea r0, [r0 + r1] + movq m3, [r0 + 16] + punpcklqdq m2, m4 + punpcklqdq m1, m3 + psadbw m2, m1 + paddd m0, m2 + lea r2, [r2 + r3] + lea r0, [r0 + r1] + + movu m1, [r2] + movq m2, [r2 + 16] + lea r2, [r2 + r3] + movu m3, [r2] + movq m4, [r2 + 16] + psadbw m1, [r0] + psadbw m3, [r0 + r1] + paddd m0, m1 + paddd m0, m3 + movq m1, [r0 + 16] + lea r0, [r0 + r1] + movq m3, [r0 + 16] + punpcklqdq m2, m4 + punpcklqdq m1, m3 + psadbw m2, m1 + paddd m0, m2 +%endmacro + +%macro PROCESS_SAD_32x4 0 + movu m1, [r2] + movu m2, [r2 + 16] + psadbw m1, [r0] + psadbw m2, [r0 + 16] + paddd m1, m2 + paddd m0, m1 + lea r2, [r2 + r3] + lea r0, [r0 + r1] + movu m1, [r2] + movu m2, [r2 + 16] + psadbw m1, [r0] + psadbw m2, [r0 + 16] + paddd m1, m2 + paddd m0, m1 + lea r2, [r2 + r3] + lea r0, [r0 + r1] + movu m1, [r2] + movu m2, [r2 + 16] + psadbw m1, [r0] + psadbw m2, [r0 + 16] + paddd m1, m2 + paddd m0, m1 + lea r2, 
[r2 + r3] + lea r0, [r0 + r1] + movu m1, [r2] + movu m2, [r2 + 16] + psadbw m1, [r0] + psadbw m2, [r0 + 16] + paddd m1, m2 + paddd m0, m1 + lea r2, [r2 + r3] + lea r0, [r0 + r1] +%endmacro + +%macro PROCESS_SAD_48x4 0 + movu m1, [r2] + movu m2, [r2 + 16] + movu m3, [r2 + 32] + psadbw m1, [r0] + psadbw m2, [r0 + 16] + psadbw m3, [r0 + 32] + paddd m1, m2 + paddd m0, m1 + paddd m0, m3 + lea r2, [r2 + r3] + lea r0, [r0 + r1] + + movu m1, [r2] + movu m2, [r2 + 16] + movu m3, [r2 + 32] + psadbw m1, [r0] + psadbw m2, [r0 + 16] + psadbw m3, [r0 + 32] + paddd m1, m2 + paddd m0, m1 + paddd m0, m3 + lea r2, [r2 + r3] + lea r0, [r0 + r1] + + movu m1, [r2] + movu m2, [r2 + 16] + movu m3, [r2 + 32] + psadbw m1, [r0] + psadbw m2, [r0 + 16] + psadbw m3, [r0 + 32] + paddd m1, m2 + paddd m0, m1 + paddd m0, m3 + lea r2, [r2 + r3] + lea r0, [r0 + r1] + + movu m1, [r2] + movu m2, [r2 + 16] + movu m3, [r2 + 32] + psadbw m1, [r0] + psadbw m2, [r0 + 16] + psadbw m3, [r0 + 32] + paddd m1, m2 + paddd m0, m1 + paddd m0, m3 +%endmacro + +%macro PROCESS_SAD_8x4 0 + movq m1, [r2] + movq m2, [r2 + r3] + lea r2, [r2 + 2 * r3] + movq m3, [r0] + movq m4, [r0 + r1] + lea r0, [r0 + 2 * r1] + punpcklqdq m1, m2 + punpcklqdq m3, m4 + psadbw m1, m3 + paddd m0, m1 + movq m1, [r2] + movq m2, [r2 + r3] + lea r2, [r2 + 2 * r3] + movq m3, [r0] + movq m4, [r0 + r1] + lea r0, [r0 + 2 * r1] + punpcklqdq m1, m2 + punpcklqdq m3, m4 + psadbw m1, m3 + paddd m0, m1 +%endmacro + +%macro PROCESS_SAD_64x4 0 + movu m1, [r2] + movu m2, [r2 + 16] + movu m3, [r2 + 32] + movu m4, [r2 + 48] + psadbw m1, [r0] + psadbw m2, [r0 + 16] + psadbw m3, [r0 + 32] + psadbw m4, [r0 + 48] + paddd m1, m2 + paddd m3, m4 + paddd m0, m1 + paddd m0, m3 + lea r2, [r2 + r3] + lea r0, [r0 + r1] + + movu m1, [r2] + movu m2, [r2 + 16] + movu m3, [r2 + 32] + movu m4, [r2 + 48] + psadbw m1, [r0] + psadbw m2, [r0 + 16] + psadbw m3, [r0 + 32] + psadbw m4, [r0 + 48] + paddd m1, m2 + paddd m3, m4 + paddd m0, m1 + paddd m0, m3 + lea r2, [r2 + r3] + lea 
r0, [r0 + r1] + + movu m1, [r2] + movu m2, [r2 + 16] + movu m3, [r2 + 32] + movu m4, [r2 + 48] + psadbw m1, [r0] + psadbw m2, [r0 + 16] + psadbw m3, [r0 + 32] + psadbw m4, [r0 + 48] + paddd m1, m2 + paddd m3, m4 + paddd m0, m1 + paddd m0, m3 + lea r2, [r2 + r3] + lea r0, [r0 + r1] + + movu m1, [r2] + movu m2, [r2 + 16] + movu m3, [r2 + 32] + movu m4, [r2 + 48] + psadbw m1, [r0] + psadbw m2, [r0 + 16] + psadbw m3, [r0 + 32] + psadbw m4, [r0 + 48] + paddd m1, m2 + paddd m3, m4 + paddd m0, m1 + paddd m0, m3 + lea r2, [r2 + r3] + lea r0, [r0 + r1] +%endmacro + +%macro SAD_W16 0 +;----------------------------------------------------------------------------- +; int pixel_sad_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +cglobal pixel_sad_16x16, 4,4,8 + movu m0, [r2] + movu m1, [r2+r3] + lea r2, [r2+2*r3] + movu m2, [r2] + movu m3, [r2+r3] + lea r2, [r2+2*r3] + psadbw m0, [r0] + psadbw m1, [r0+r1] + lea r0, [r0+2*r1] + movu m4, [r2] + paddw m0, m1 + psadbw m2, [r0] + psadbw m3, [r0+r1] + lea r0, [r0+2*r1] + movu m5, [r2+r3] + lea r2, [r2+2*r3] + paddw m2, m3 + movu m6, [r2] + movu m7, [r2+r3] + lea r2, [r2+2*r3] + paddw m0, m2 + psadbw m4, [r0] + psadbw m5, [r0+r1] + lea r0, [r0+2*r1] + movu m1, [r2] + paddw m4, m5 + psadbw m6, [r0] + psadbw m7, [r0+r1] + lea r0, [r0+2*r1] + movu m2, [r2+r3] + lea r2, [r2+2*r3] + paddw m6, m7 + movu m3, [r2] + paddw m0, m4 + movu m4, [r2+r3] + lea r2, [r2+2*r3] + paddw m0, m6 + psadbw m1, [r0] + psadbw m2, [r0+r1] + lea r0, [r0+2*r1] + movu m5, [r2] + paddw m1, m2 + psadbw m3, [r0] + psadbw m4, [r0+r1] + lea r0, [r0+2*r1] + movu m6, [r2+r3] + lea r2, [r2+2*r3] + paddw m3, m4 + movu m7, [r2] + paddw m0, m1 + movu m1, [r2+r3] + paddw m0, m3 + psadbw m5, [r0] + psadbw m6, [r0+r1] + lea r0, [r0+2*r1] + paddw m5, m6 + psadbw m7, [r0] + psadbw m1, [r0+r1] + paddw m7, m1 + paddw m0, m5 + paddw m0, m7 + SAD_END_SSE2 + 
+;----------------------------------------------------------------------------- +; int pixel_sad_16x8( uint8_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +cglobal pixel_sad_16x8, 4,4 + movu m0, [r2] + movu m2, [r2+r3] + lea r2, [r2+2*r3] + movu m3, [r2] + movu m4, [r2+r3] + psadbw m0, [r0] + psadbw m2, [r0+r1] + lea r0, [r0+2*r1] + psadbw m3, [r0] + psadbw m4, [r0+r1] + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + paddw m0, m2 + paddw m3, m4 + paddw m0, m3 + movu m1, [r2] + movu m2, [r2+r3] + lea r2, [r2+2*r3] + movu m3, [r2] + movu m4, [r2+r3] + psadbw m1, [r0] + psadbw m2, [r0+r1] + lea r0, [r0+2*r1] + psadbw m3, [r0] + psadbw m4, [r0+r1] + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + paddw m1, m2 + paddw m3, m4 + paddw m0, m1 + paddw m0, m3 + SAD_END_SSE2 + +;----------------------------------------------------------------------------- +; int pixel_sad_16x12( uint8_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +cglobal pixel_sad_16x12, 4,4,3 + pxor m0, m0 + + PROCESS_SAD_16x4 + PROCESS_SAD_16x4 + PROCESS_SAD_16x4 + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET + +;----------------------------------------------------------------------------- +; int pixel_sad_16x32( uint8_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +cglobal pixel_sad_16x32, 4,5,3 + pxor m0, m0 + mov r4d, 4 +.loop: + PROCESS_SAD_16x4 + PROCESS_SAD_16x4 + dec r4d + jnz .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET + +;----------------------------------------------------------------------------- +; int pixel_sad_16x64( uint8_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +cglobal pixel_sad_16x64, 4,5,3 + pxor m0, m0 + mov r4d, 8 +.loop: + PROCESS_SAD_16x4 + PROCESS_SAD_16x4 + dec r4d + jnz .loop + + movhlps 
m1, m0 + paddd m0, m1 + movd eax, m0 + RET + +;----------------------------------------------------------------------------- +; int pixel_sad_16x4( uint8_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +cglobal pixel_sad_16x4, 4,4,3 + + movu m0, [r2] + movu m1, [r2 + r3] + psadbw m0, [r0] + psadbw m1, [r0 + r1] + paddd m0, m1 + lea r2, [r2 + 2 * r3] + lea r0, [r0 + 2 * r1] + movu m1, [r2] + movu m2, [r2 + r3] + psadbw m1, [r0] + psadbw m2, [r0 + r1] + paddd m1, m2 + paddd m0, m1 + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET + +;----------------------------------------------------------------------------- +; int pixel_sad_32x8( uint8_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +cglobal pixel_sad_32x8, 4,4,3 + pxor m0, m0 + + PROCESS_SAD_32x4 + PROCESS_SAD_32x4 + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET + +;----------------------------------------------------------------------------- +; int pixel_sad_32x24( uint8_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +cglobal pixel_sad_32x24, 4,5,3 + pxor m0, m0 + mov r4d, 3 +.loop: + PROCESS_SAD_32x4 + PROCESS_SAD_32x4 + dec r4d + jnz .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET + +;----------------------------------------------------------------------------- +; int pixel_sad_32x32( uint8_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +cglobal pixel_sad_32x32, 4,5,3 + pxor m0, m0 + mov r4d, 4 +.loop: + PROCESS_SAD_32x4 + PROCESS_SAD_32x4 + dec r4d + jnz .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET + +;----------------------------------------------------------------------------- +; int pixel_sad_32x16( uint8_t *, intptr_t, uint8_t *, intptr_t ) 
+;----------------------------------------------------------------------------- +cglobal pixel_sad_32x16, 4,4,3 + pxor m0, m0 + + PROCESS_SAD_32x4 + PROCESS_SAD_32x4 + PROCESS_SAD_32x4 + PROCESS_SAD_32x4 + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET + +;----------------------------------------------------------------------------- +; int pixel_sad_32x64( uint8_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +cglobal pixel_sad_32x64, 4,5,3 + pxor m0, m0 + mov r4d, 8 +.loop: + PROCESS_SAD_32x4 + PROCESS_SAD_32x4 + dec r4d + jnz .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET + +;----------------------------------------------------------------------------- +; int pixel_sad_8x32( uint8_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +cglobal pixel_sad_8x32, 4,5,3 + pxor m0, m0 + mov r4d, 4 +.loop: + PROCESS_SAD_8x4 + PROCESS_SAD_8x4 + dec r4d + jnz .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET + +;----------------------------------------------------------------------------- +; int pixel_sad_64x16( uint8_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +cglobal pixel_sad_64x16, 4,4,5 + pxor m0, m0 + + PROCESS_SAD_64x4 + PROCESS_SAD_64x4 + PROCESS_SAD_64x4 + PROCESS_SAD_64x4 + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET + +;----------------------------------------------------------------------------- +; int pixel_sad_64x32( uint8_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +cglobal pixel_sad_64x32, 4,5,5 + pxor m0, m0 + mov r4, 4 + +.loop: + PROCESS_SAD_64x4 + PROCESS_SAD_64x4 + + dec r4 + jnz .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET + +;----------------------------------------------------------------------------- +; int 
pixel_sad_64x48( uint8_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +cglobal pixel_sad_64x48, 4,5,5 + pxor m0, m0 + mov r4, 6 + +.loop: + PROCESS_SAD_64x4 + PROCESS_SAD_64x4 + dec r4d + jnz .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET + +;----------------------------------------------------------------------------- +; int pixel_sad_64x64( uint8_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +cglobal pixel_sad_64x64, 4,5,5 + pxor m0, m0 + mov r4, 8 + +.loop: + PROCESS_SAD_64x4 + PROCESS_SAD_64x4 + dec r4 + jnz .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET + +;----------------------------------------------------------------------------- +; int pixel_sad_48x64( uint8_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +cglobal pixel_sad_48x64, 4,5,5 + pxor m0, m0 + mov r4, 64 + +.loop: + PROCESS_SAD_48x4 + lea r2, [r2 + r3] + lea r0, [r0 + r1] + + PROCESS_SAD_48x4 + lea r2, [r2 + r3] + lea r0, [r0 + r1] + + sub r4, 8 + cmp r4, 8 + +jnz .loop + PROCESS_SAD_48x4 + lea r2, [r2 + r3] + lea r0, [r0 + r1] + PROCESS_SAD_48x4 + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET + +;----------------------------------------------------------------------------- +; int pixel_sad_24x32( uint8_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +cglobal pixel_sad_24x32, 4,5,4 + pxor m0, m0 + mov r4, 32 + +.loop: + PROCESS_SAD_24x4 + lea r2, [r2 + r3] + lea r0, [r0 + r1] + PROCESS_SAD_24x4 + lea r2, [r2 + r3] + lea r0, [r0 + r1] + sub r4, 8 + cmp r4, 8 +jnz .loop + PROCESS_SAD_24x4 + lea r2, [r2 + r3] + lea r0, [r0 + r1] + PROCESS_SAD_24x4 + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET + +;----------------------------------------------------------------------------- +; int 
pixel_sad_12x16( uint8_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +cglobal pixel_sad_12x16, 4,4,4 + mova m4, [MSK] + pxor m0, m0 + + PROCESS_SAD_12x4 + lea r2, [r2 + r3] + lea r0, [r0 + r1] + PROCESS_SAD_12x4 + lea r2, [r2 + r3] + lea r0, [r0 + r1] + PROCESS_SAD_12x4 + lea r2, [r2 + r3] + lea r0, [r0 + r1] + PROCESS_SAD_12x4 + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET + +%endmacro + +INIT_XMM sse2 +SAD_W16 +INIT_XMM sse3 +SAD_W16 +INIT_XMM sse2, aligned +SAD_W16 + +%macro SAD_INC_4x8P_SSE 1 + movq m1, [r0] + movq m2, [r0+r1] + lea r0, [r0+2*r1] + movq m3, [r2] + movq m4, [r2+r3] + lea r2, [r2+2*r3] + movhps m1, [r0] + movhps m2, [r0+r1] + movhps m3, [r2] + movhps m4, [r2+r3] + lea r0, [r0+2*r1] + psadbw m1, m3 + psadbw m2, m4 + lea r2, [r2+2*r3] + ACCUM paddw, 0, 1, %1 + paddw m0, m2 +%endmacro + +INIT_XMM +;Even on Nehalem, no sizes other than 8x16 benefit from this method. +cglobal pixel_sad_8x16_sse2, 4,4 + SAD_INC_4x8P_SSE 0 + SAD_INC_4x8P_SSE 1 + SAD_INC_4x8P_SSE 1 + SAD_INC_4x8P_SSE 1 + SAD_END_SSE2 + RET + +;============================================================================= +; SAD x3/x4 MMX +;============================================================================= + +%macro SAD_X3_START_1x8P 0 + movq mm3, [r0] + movq mm0, [r1] + movq mm1, [r2] + movq mm2, [r3] + psadbw mm0, mm3 + psadbw mm1, mm3 + psadbw mm2, mm3 +%endmacro + +%macro SAD_X3_1x8P 2 + movq mm3, [r0+%1] + movq mm4, [r1+%2] + movq mm5, [r2+%2] + movq mm6, [r3+%2] + psadbw mm4, mm3 + psadbw mm5, mm3 + psadbw mm6, mm3 + paddw mm0, mm4 + paddw mm1, mm5 + paddw mm2, mm6 +%endmacro + +%macro SAD_X3_START_2x4P 3 + movd mm3, [r0] + movd %1, [r1] + movd %2, [r2] + movd %3, [r3] + punpckldq mm3, [r0+FENC_STRIDE] + punpckldq %1, [r1+r4] + punpckldq %2, [r2+r4] + punpckldq %3, [r3+r4] + psadbw %1, mm3 + psadbw %2, mm3 + psadbw %3, mm3 +%endmacro + +%macro SAD_X3_2x16P 1 +%if %1 + SAD_X3_START_1x8P +%else + 
SAD_X3_1x8P 0, 0 +%endif + SAD_X3_1x8P 8, 8 + SAD_X3_1x8P FENC_STRIDE, r4 + SAD_X3_1x8P FENC_STRIDE+8, r4+8 + add r0, 2*FENC_STRIDE + lea r1, [r1+2*r4] + lea r2, [r2+2*r4] + lea r3, [r3+2*r4] +%endmacro + +%macro SAD_X3_2x8P 1 +%if %1 + SAD_X3_START_1x8P +%else + SAD_X3_1x8P 0, 0 +%endif + SAD_X3_1x8P FENC_STRIDE, r4 + add r0, 2*FENC_STRIDE + lea r1, [r1+2*r4] + lea r2, [r2+2*r4] + lea r3, [r3+2*r4] +%endmacro + +%macro SAD_X3_2x4P 1 +%if %1 + SAD_X3_START_2x4P mm0, mm1, mm2 +%else + SAD_X3_START_2x4P mm4, mm5, mm6 + paddw mm0, mm4 + paddw mm1, mm5 + paddw mm2, mm6 +%endif + add r0, 2*FENC_STRIDE + lea r1, [r1+2*r4] + lea r2, [r2+2*r4] + lea r3, [r3+2*r4] +%endmacro + +%macro SAD_X4_START_1x8P 0 + movq mm7, [r0] + movq mm0, [r1] + movq mm1, [r2] + movq mm2, [r3] + movq mm3, [r4] + psadbw mm0, mm7 + psadbw mm1, mm7 + psadbw mm2, mm7 + psadbw mm3, mm7 +%endmacro + +%macro SAD_X4_1x8P 2 + movq mm7, [r0+%1] + movq mm4, [r1+%2] + movq mm5, [r2+%2] + movq mm6, [r3+%2] + psadbw mm4, mm7 + psadbw mm5, mm7 + psadbw mm6, mm7 + psadbw mm7, [r4+%2] + paddw mm0, mm4 + paddw mm1, mm5 + paddw mm2, mm6 + paddw mm3, mm7 +%endmacro + +%macro SAD_X4_START_2x4P 0 + movd mm7, [r0] + movd mm0, [r1] + movd mm1, [r2] + movd mm2, [r3] + movd mm3, [r4] + punpckldq mm7, [r0+FENC_STRIDE] + punpckldq mm0, [r1+r5] + punpckldq mm1, [r2+r5] + punpckldq mm2, [r3+r5] + punpckldq mm3, [r4+r5] + psadbw mm0, mm7 + psadbw mm1, mm7 + psadbw mm2, mm7 + psadbw mm3, mm7 +%endmacro + +%macro SAD_X4_INC_2x4P 0 + movd mm7, [r0] + movd mm4, [r1] + movd mm5, [r2] + punpckldq mm7, [r0+FENC_STRIDE] + punpckldq mm4, [r1+r5] + punpckldq mm5, [r2+r5] + psadbw mm4, mm7 + psadbw mm5, mm7 + paddw mm0, mm4 + paddw mm1, mm5 + movd mm4, [r3] + movd mm5, [r4] + punpckldq mm4, [r3+r5] + punpckldq mm5, [r4+r5] + psadbw mm4, mm7 + psadbw mm5, mm7 + paddw mm2, mm4 + paddw mm3, mm5 +%endmacro + +%macro SAD_X4_2x16P 1 +%if %1 + SAD_X4_START_1x8P +%else + SAD_X4_1x8P 0, 0 +%endif + SAD_X4_1x8P 8, 8 + SAD_X4_1x8P FENC_STRIDE, r5 + 
SAD_X4_1x8P FENC_STRIDE+8, r5+8 + add r0, 2*FENC_STRIDE + lea r1, [r1+2*r5] + lea r2, [r2+2*r5] + lea r3, [r3+2*r5] + lea r4, [r4+2*r5] +%endmacro + +%macro SAD_X4_2x8P 1 +%if %1 + SAD_X4_START_1x8P +%else + SAD_X4_1x8P 0, 0 +%endif + SAD_X4_1x8P FENC_STRIDE, r5 + add r0, 2*FENC_STRIDE + lea r1, [r1+2*r5] + lea r2, [r2+2*r5] + lea r3, [r3+2*r5] + lea r4, [r4+2*r5] +%endmacro + +%macro SAD_X4_2x4P 1 +%if %1 + SAD_X4_START_2x4P +%else + SAD_X4_INC_2x4P +%endif + add r0, 2*FENC_STRIDE + lea r1, [r1+2*r5] + lea r2, [r2+2*r5] + lea r3, [r3+2*r5] + lea r4, [r4+2*r5] +%endmacro + +%macro SAD_X3_END 0 +%if UNIX64 + movd [r5+0], mm0 + movd [r5+4], mm1 + movd [r5+8], mm2 +%else + mov r0, r5mp + movd [r0+0], mm0 + movd [r0+4], mm1 + movd [r0+8], mm2 +%endif + RET +%endmacro + +%macro SAD_X4_END 0 + mov r0, r6mp + movd [r0+0], mm0 + movd [r0+4], mm1 + movd [r0+8], mm2 + movd [r0+12], mm3 + RET +%endmacro + +%macro SAD_X3_12x4 0 + mova m3, [r0] + movu m5, [r1] + pand m3, m4 + pand m5, m4 + psadbw m5, m3 + paddd m0, m5 + movu m5, [r2] + pand m5, m4 + psadbw m5, m3 + paddd m1, m5 + movu m5, [r3] + pand m5, m4 + psadbw m5, m3 + paddd m2, m5 + mova m3, [r0 + FENC_STRIDE] + movu m5, [r1 + r4] + pand m3, m4 + pand m5, m4 + psadbw m5, m3 + paddd m0, m5 + movu m5, [r2 + r4] + pand m5, m4 + psadbw m5, m3 + paddd m1, m5 + movu m5, [r3 + r4] + pand m5, m4 + psadbw m5, m3 + paddd m2, m5 + mova m3, [r0 + FENC_STRIDE * 2] + movu m5, [r1 + r4 * 2] + pand m3, m4 + pand m5, m4 + psadbw m5, m3 + paddd m0, m5 + movu m5, [r2 + r4 * 2] + pand m5, m4 + psadbw m5, m3 + paddd m1, m5 + movu m5, [r3 + r4 * 2] + pand m5, m4 + psadbw m5, m3 + paddd m2, m5 + lea r1, [r1 + r4 * 2] + lea r2, [r2 + r4 * 2] + lea r3, [r3 + r4 * 2] + mova m3, [r0 + FENC_STRIDE + FENC_STRIDE * 2] + movu m5, [r1 + r4] + pand m3, m4 + pand m5, m4 + psadbw m5, m3 + paddd m0, m5 + movu m5, [r2 + r4] + pand m5, m4 + psadbw m5, m3 + paddd m1, m5 + movu m5, [r3 + r4] + pand m5, m4 + psadbw m5, m3 + paddd m2, m5 + lea r0, [r0 + 
FENC_STRIDE * 4] + lea r1, [r1 + r4 * 2] + lea r2, [r2 + r4 * 2] + lea r3, [r3 + r4 * 2] +%endmacro + +%macro SAD_X4_12x4 0 + mova m4, [r0] + movu m5, [r1] + pand m4, m6 + pand m5, m6 + psadbw m5, m4 + paddd m0, m5 + movu m5, [r2] + pand m5, m6 + psadbw m5, m4 + paddd m1, m5 + movu m5, [r3] + pand m5, m6 + psadbw m5, m4 + paddd m2, m5 + movu m5, [r4] + pand m5, m6 + psadbw m5, m4 + paddd m3, m5 + mova m4, [r0 + FENC_STRIDE] + movu m5, [r1 + r5] + pand m4, m6 + pand m5, m6 + psadbw m5, m4 + paddd m0, m5 + movu m5, [r2 + r5] + pand m5, m6 + psadbw m5, m4 + paddd m1, m5 + movu m5, [r3 + r5] + pand m5, m6 + psadbw m5, m4 + paddd m2, m5 + movu m5, [r4 + r5] + pand m5, m6 + psadbw m5, m4 + paddd m3, m5 + mova m4, [r0 + FENC_STRIDE * 2] + movu m5, [r1 + r5 * 2] + pand m4, m6 + pand m5, m6 + psadbw m5, m4 + paddd m0, m5 + movu m5, [r2 + r5 * 2] + pand m5, m6 + psadbw m5, m4 + paddd m1, m5 + movu m5, [r3 + r5 * 2] + pand m5, m6 + psadbw m5, m4 + paddd m2, m5 + movu m5, [r4 + r5 * 2] + pand m5, m6 + psadbw m5, m4 + paddd m3, m5 + lea r1, [r1 + r5 * 2] + lea r2, [r2 + r5 * 2] + lea r3, [r3 + r5 * 2] + lea r4, [r4 + r5 * 2] + mova m4, [r0 + FENC_STRIDE + FENC_STRIDE * 2] + movu m5, [r1 + r5] + pand m4, m6 + pand m5, m6 + psadbw m5, m4 + paddd m0, m5 + movu m5, [r2 + r5] + pand m5, m6 + psadbw m5, m4 + paddd m1, m5 + movu m5, [r3 + r5] + pand m5, m6 + psadbw m5, m4 + paddd m2, m5 + movu m5, [r4 + r5] + pand m5, m6 + psadbw m5, m4 + paddd m3, m5 + lea r0, [r0 + FENC_STRIDE * 4] + lea r1, [r1 + r5 * 2] + lea r2, [r2 + r5 * 2] + lea r3, [r3 + r5 * 2] + lea r4, [r4 + r5 * 2] +%endmacro + +%macro SAD_X3_24x4 0 + mova m3, [r0] + mova m4, [r0 + 16] + movu m5, [r1] + movu m6, [r1 + 16] + psadbw m5, m3 + psadbw m6, m4 + pshufd m6, m6, 84 + paddd m5, m6 + paddd m0, m5 + movu m5, [r2] + movu m6, [r2 + 16] + psadbw m5, m3 + psadbw m6, m4 + pshufd m6, m6, 84 + paddd m5, m6 + paddd m1, m5 + movu m5, [r3] + movu m6, [r3 + 16] + psadbw m5, m3 + psadbw m6, m4 + pshufd m6, m6, 84 + paddd m5, m6 
+ paddd m2, m5 + + mova m3, [r0 + FENC_STRIDE] + mova m4, [r0 + 16 + FENC_STRIDE] + movu m5, [r1 + r4] + movu m6, [r1 + 16 + r4] + psadbw m5, m3 + psadbw m6, m4 + pshufd m6, m6, 84 + paddd m5, m6 + paddd m0, m5 + movu m5, [r2 + r4] + movu m6, [r2 + 16 + r4] + psadbw m5, m3 + psadbw m6, m4 + pshufd m6, m6, 84 + paddd m5, m6 + paddd m1, m5 + movu m5, [r3 + r4] + movu m6, [r3 + 16 + r4] + psadbw m5, m3 + psadbw m6, m4 + pshufd m6, m6, 84 + paddd m5, m6 + paddd m2, m5 + + mova m3, [r0 + FENC_STRIDE * 2] + mova m4, [r0 + 16 + FENC_STRIDE * 2] + movu m5, [r1 + r4 * 2] + movu m6, [r1 + 16 + r4 * 2] + psadbw m5, m3 + psadbw m6, m4 + pshufd m6, m6, 84 + paddd m5, m6 + paddd m0, m5 + movu m5, [r2 + r4 * 2] + movu m6, [r2 + 16 + r4 * 2] + psadbw m5, m3 + psadbw m6, m4 + pshufd m6, m6, 84 + paddd m5, m6 + paddd m1, m5 + movu m5, [r3 + r4 * 2] + movu m6, [r3 + 16 + r4 * 2] + psadbw m5, m3 + psadbw m6, m4 + pshufd m6, m6, 84 + paddd m5, m6 + paddd m2, m5 + lea r0, [r0 + FENC_STRIDE * 2] + lea r1, [r1 + r4 * 2] + lea r2, [r2 + r4 * 2] + lea r3, [r3 + r4 * 2] + + mova m3, [r0 + FENC_STRIDE] + mova m4, [r0 + 16 + FENC_STRIDE] + movu m5, [r1 + r4] + movu m6, [r1 + 16 + r4] + psadbw m5, m3 + psadbw m6, m4 + pshufd m6, m6, 84 + paddd m5, m6 + paddd m0, m5 + movu m5, [r2 + r4] + movu m6, [r2 + 16 + r4] + psadbw m5, m3 + psadbw m6, m4 + pshufd m6, m6, 84 + paddd m5, m6 + paddd m1, m5 + movu m5, [r3 + r4] + movu m6, [r3 + 16 + r4] + psadbw m5, m3 + psadbw m6, m4 + pshufd m6, m6, 84 + paddd m5, m6 + paddd m2, m5 + lea r0, [r0 + FENC_STRIDE * 2] + lea r1, [r1 + r4 * 2] + lea r2, [r2 + r4 * 2] + lea r3, [r3 + r4 * 2] +%endmacro + +%macro SAD_X4_24x4 0 + mova m4, [r0] + mova m5, [r0 + 16] + movu m6, [r1] + movu m7, [r1 + 16] + psadbw m6, m4 + psadbw m7, m5 + pshufd m7, m7, 84 + paddd m6, m7 + paddd m0, m6 + movu m6, [r2] + movu m7, [r2 + 16] + psadbw m6, m4 + psadbw m7, m5 + pshufd m7, m7, 84 + paddd m6, m7 + paddd m1, m6 + movu m6, [r3] + movu m7, [r3 + 16] + psadbw m6, m4 + psadbw m7, m5 + 
pshufd m7, m7, 84 + paddd m6, m7 + paddd m2, m6 + movu m6, [r4] + movu m7, [r4 + 16] + psadbw m6, m4 + psadbw m7, m5 + pshufd m7, m7, 84 + paddd m6, m7 + paddd m3, m6 + + mova m4, [r0 + FENC_STRIDE] + mova m5, [r0 + 16 + FENC_STRIDE] + movu m6, [r1 + r5] + movu m7, [r1 + 16 + r5] + psadbw m6, m4 + psadbw m7, m5 + pshufd m7, m7, 84 + paddd m6, m7 + paddd m0, m6 + movu m6, [r2 + r5] + movu m7, [r2 + 16 + r5] + psadbw m6, m4 + psadbw m7, m5 + pshufd m7, m7, 84 + paddd m6, m7 + paddd m1, m6 + movu m6, [r3 + r5] + movu m7, [r3 + 16 + r5] + psadbw m6, m4 + psadbw m7, m5 + pshufd m7, m7, 84 + paddd m6, m7 + paddd m2, m6 + movu m6, [r4 + r5] + movu m7, [r4 + 16 + r5] + psadbw m6, m4 + psadbw m7, m5 + pshufd m7, m7, 84 + paddd m6, m7 + paddd m3, m6 + + mova m4, [r0 + FENC_STRIDE * 2] + mova m5, [r0 + 16 + FENC_STRIDE * 2] + movu m6, [r1 + r5 * 2] + movu m7, [r1 + 16 + r5 * 2] + psadbw m6, m4 + psadbw m7, m5 + pshufd m7, m7, 84 + paddd m6, m7 + paddd m0, m6 + movu m6, [r2 + r5 * 2] + movu m7, [r2 + 16 + r5 * 2] + psadbw m6, m4 + psadbw m7, m5 + pshufd m7, m7, 84 + paddd m6, m7 + paddd m1, m6 + movu m6, [r3 + r5 * 2] + movu m7, [r3 + 16 + r5 * 2] + psadbw m6, m4 + psadbw m7, m5 + pshufd m7, m7, 84 + paddd m6, m7 + paddd m2, m6 + movu m6, [r4 + r5 * 2] + movu m7, [r4 + 16 + r5 * 2] + psadbw m6, m4 + psadbw m7, m5 + pshufd m7, m7, 84 + paddd m6, m7 + paddd m3, m6 + lea r0, [r0 + FENC_STRIDE * 2] + lea r1, [r1 + r5 * 2] + lea r2, [r2 + r5 * 2] + lea r3, [r3 + r5 * 2] + lea r4, [r4 + r5 * 2] + mova m4, [r0 + FENC_STRIDE] + mova m5, [r0 + 16 + FENC_STRIDE] + movu m6, [r1 + r5] + movu m7, [r1 + 16 + r5] + psadbw m6, m4 + psadbw m7, m5 + pshufd m7, m7, 84 + paddd m6, m7 + paddd m0, m6 + movu m6, [r2 + r5] + movu m7, [r2 + 16 + r5] + psadbw m6, m4 + psadbw m7, m5 + pshufd m7, m7, 84 + paddd m6, m7 + paddd m1, m6 + movu m6, [r3 + r5] + movu m7, [r3 + 16 + r5] + psadbw m6, m4 + psadbw m7, m5 + pshufd m7, m7, 84 + paddd m6, m7 + paddd m2, m6 + movu m6, [r4 + r5] + movu m7, [r4 + 16 + 
r5] + psadbw m6, m4 + psadbw m7, m5 + pshufd m7, m7, 84 + paddd m6, m7 + paddd m3, m6 + lea r0, [r0 + FENC_STRIDE * 2] + lea r1, [r1 + r5 * 2] + lea r2, [r2 + r5 * 2] + lea r3, [r3 + r5 * 2] + lea r4, [r4 + r5 * 2] +%endmacro + +%macro SAD_X3_32x4 0 + mova m3, [r0] + mova m4, [r0 + 16] + movu m5, [r1] + movu m6, [r1 + 16] + psadbw m5, m3 + psadbw m6, m4 + paddd m5, m6 + paddd m0, m5 + movu m5, [r2] + movu m6, [r2 + 16] + psadbw m5, m3 + psadbw m6, m4 + paddd m5, m6 + paddd m1, m5 + movu m5, [r3] + movu m6, [r3 + 16] + psadbw m5, m3 + psadbw m6, m4 + paddd m5, m6 + paddd m2, m5 + lea r0, [r0 + FENC_STRIDE] + lea r1, [r1 + r4] + lea r2, [r2 + r4] + lea r3, [r3 + r4] + mova m3, [r0] + mova m4, [r0 + 16] + movu m5, [r1] + movu m6, [r1 + 16] + psadbw m5, m3 + psadbw m6, m4 + paddd m5, m6 + paddd m0, m5 + movu m5, [r2] + movu m6, [r2 + 16] + psadbw m5, m3 + psadbw m6, m4 + paddd m5, m6 + paddd m1, m5 + movu m5, [r3] + movu m6, [r3 + 16] + psadbw m5, m3 + psadbw m6, m4 + paddd m5, m6 + paddd m2, m5 + lea r0, [r0 + FENC_STRIDE] + lea r1, [r1 + r4] + lea r2, [r2 + r4] + lea r3, [r3 + r4] + mova m3, [r0] + mova m4, [r0 + 16] + movu m5, [r1] + movu m6, [r1 + 16] + psadbw m5, m3 + psadbw m6, m4 + paddd m5, m6 + paddd m0, m5 + movu m5, [r2] + movu m6, [r2 + 16] + psadbw m5, m3 + psadbw m6, m4 + paddd m5, m6 + paddd m1, m5 + movu m5, [r3] + movu m6, [r3 + 16] + psadbw m5, m3 + psadbw m6, m4 + paddd m5, m6 + paddd m2, m5 + lea r0, [r0 + FENC_STRIDE] + lea r1, [r1 + r4] + lea r2, [r2 + r4] + lea r3, [r3 + r4] + mova m3, [r0] + mova m4, [r0 + 16] + movu m5, [r1] + movu m6, [r1 + 16] + psadbw m5, m3 + psadbw m6, m4 + paddd m5, m6 + paddd m0, m5 + movu m5, [r2] + movu m6, [r2 + 16] + psadbw m5, m3 + psadbw m6, m4 + paddd m5, m6 + paddd m1, m5 + movu m5, [r3] + movu m6, [r3 + 16] + psadbw m5, m3 + psadbw m6, m4 + paddd m5, m6 + paddd m2, m5 + lea r0, [r0 + FENC_STRIDE] + lea r1, [r1 + r4] + lea r2, [r2 + r4] + lea r3, [r3 + r4] +%endmacro + +%macro SAD_X4_32x4 0 + mova m4, [r0] + mova 
m5, [r0 + 16] + movu m6, [r1] + movu m7, [r1 + 16] + psadbw m6, m4 + psadbw m7, m5 + paddd m6, m7 + paddd m0, m6 + movu m6, [r2] + movu m7, [r2 + 16] + psadbw m6, m4 + psadbw m7, m5 + paddd m6, m7 + paddd m1, m6 + movu m6, [r3] + movu m7, [r3 + 16] + psadbw m6, m4 + psadbw m7, m5 + paddd m6, m7 + paddd m2, m6 + movu m6, [r4] + movu m7, [r4 + 16] + psadbw m6, m4 + psadbw m7, m5 + paddd m6, m7 + paddd m3, m6 + lea r0, [r0 + FENC_STRIDE] + lea r1, [r1 + r5] + lea r2, [r2 + r5] + lea r3, [r3 + r5] + lea r4, [r4 + r5] + mova m4, [r0] + mova m5, [r0 + 16] + movu m6, [r1] + movu m7, [r1 + 16] + psadbw m6, m4 + psadbw m7, m5 + paddd m6, m7 + paddd m0, m6 + movu m6, [r2] + movu m7, [r2 + 16] + psadbw m6, m4 + psadbw m7, m5 + paddd m6, m7 + paddd m1, m6 + movu m6, [r3] + movu m7, [r3 + 16] + psadbw m6, m4 + psadbw m7, m5 + paddd m6, m7 + paddd m2, m6 + movu m6, [r4] + movu m7, [r4 + 16] + psadbw m6, m4 + psadbw m7, m5 + paddd m6, m7 + paddd m3, m6 + lea r0, [r0 + FENC_STRIDE] + lea r1, [r1 + r5] + lea r2, [r2 + r5] + lea r3, [r3 + r5] + lea r4, [r4 + r5] + mova m4, [r0] + mova m5, [r0 + 16] + movu m6, [r1] + movu m7, [r1 + 16] + psadbw m6, m4 + psadbw m7, m5 + paddd m6, m7 + paddd m0, m6 + movu m6, [r2] + movu m7, [r2 + 16] + psadbw m6, m4 + psadbw m7, m5 + paddd m6, m7 + paddd m1, m6 + movu m6, [r3] + movu m7, [r3 + 16] + psadbw m6, m4 + psadbw m7, m5 + paddd m6, m7 + paddd m2, m6 + movu m6, [r4] + movu m7, [r4 + 16] + psadbw m6, m4 + psadbw m7, m5 + paddd m6, m7 + paddd m3, m6 + lea r0, [r0 + FENC_STRIDE] + lea r1, [r1 + r5] + lea r2, [r2 + r5] + lea r3, [r3 + r5] + lea r4, [r4 + r5] + mova m4, [r0] + mova m5, [r0 + 16] + movu m6, [r1] + movu m7, [r1 + 16] + psadbw m6, m4 + psadbw m7, m5 + paddd m6, m7 + paddd m0, m6 + movu m6, [r2] + movu m7, [r2 + 16] + psadbw m6, m4 + psadbw m7, m5 + paddd m6, m7 + paddd m1, m6 + movu m6, [r3] + movu m7, [r3 + 16] + psadbw m6, m4 + psadbw m7, m5 + paddd m6, m7 + paddd m2, m6 + movu m6, [r4] + movu m7, [r4 + 16] + psadbw m6, m4 + psadbw 
m7, m5 + paddd m6, m7 + paddd m3, m6 + lea r0, [r0 + FENC_STRIDE] + lea r1, [r1 + r5] + lea r2, [r2 + r5] + lea r3, [r3 + r5] + lea r4, [r4 + r5] +%endmacro + +%macro SAD_X3_48x4 0 + mova m3, [r0] + mova m4, [r0 + 16] + mova m5, [r0 + 32] + movu m6, [r1] + psadbw m6, m3 + paddd m0, m6 + movu m6, [r1 + 16] + psadbw m6, m4 + paddd m0, m6 + movu m6, [r1 + 32] + psadbw m6, m5 + paddd m0, m6 + movu m6, [r2] + psadbw m6, m3 + paddd m1, m6 + movu m6, [r2 + 16] + psadbw m6, m4 + paddd m1, m6 + movu m6, [r2 + 32] + psadbw m6, m5 + paddd m1, m6 + movu m6, [r3] + psadbw m6, m3 + paddd m2, m6 + movu m6, [r3 + 16] + psadbw m6, m4 + paddd m2, m6 + movu m6, [r3 + 32] + psadbw m6, m5 + paddd m2, m6 + + mova m3, [r0 + FENC_STRIDE] + mova m4, [r0 + 16 + FENC_STRIDE] + mova m5, [r0 + 32 + FENC_STRIDE] + movu m6, [r1 + r4] + psadbw m6, m3 + paddd m0, m6 + movu m6, [r1 + 16 + r4] + psadbw m6, m4 + paddd m0, m6 + movu m6, [r1 + 32 + r4] + psadbw m6, m5 + paddd m0, m6 + movu m6, [r2 + r4] + psadbw m6, m3 + paddd m1, m6 + movu m6, [r2 + 16 + r4] + psadbw m6, m4 + paddd m1, m6 + movu m6, [r2 + 32 + r4] + psadbw m6, m5 + paddd m1, m6 + movu m6, [r3 + r4] + psadbw m6, m3 + paddd m2, m6 + movu m6, [r3 + 16 + r4] + psadbw m6, m4 + paddd m2, m6 + movu m6, [r3 + 32 + r4] + psadbw m6, m5 + paddd m2, m6 + + mova m3, [r0 + FENC_STRIDE * 2] + mova m4, [r0 + 16 + FENC_STRIDE * 2] + mova m5, [r0 + 32 + FENC_STRIDE * 2] + movu m6, [r1 + r4 * 2] + psadbw m6, m3 + paddd m0, m6 + movu m6, [r1 + 16 + r4 * 2] + psadbw m6, m4 + paddd m0, m6 + movu m6, [r1 + 32 + r4 * 2] + psadbw m6, m5 + paddd m0, m6 + movu m6, [r2 + r4 * 2] + psadbw m6, m3 + paddd m1, m6 + movu m6, [r2 + 16 + r4 * 2] + psadbw m6, m4 + paddd m1, m6 + movu m6, [r2 + 32 + r4 * 2] + psadbw m6, m5 + paddd m1, m6 + movu m6, [r3 + r4 * 2] + psadbw m6, m3 + paddd m2, m6 + movu m6, [r3 + 16 + r4 * 2] + psadbw m6, m4 + paddd m2, m6 + movu m6, [r3 + 32 + r4 * 2] + psadbw m6, m5 + paddd m2, m6 + + lea r0, [r0 + FENC_STRIDE * 2] + lea r1, [r1 + r4 * 2] 
+ lea r2, [r2 + r4 * 2] + lea r3, [r3 + r4 * 2] + mova m3, [r0 + FENC_STRIDE] + mova m4, [r0 + 16 + FENC_STRIDE] + mova m5, [r0 + 32 + FENC_STRIDE] + movu m6, [r1 + r4] + psadbw m6, m3 + paddd m0, m6 + movu m6, [r1 + 16 + r4] + psadbw m6, m4 + paddd m0, m6 + movu m6, [r1 + 32 + r4] + psadbw m6, m5 + paddd m0, m6 + movu m6, [r2 + r4] + psadbw m6, m3 + paddd m1, m6 + movu m6, [r2 + 16 + r4] + psadbw m6, m4 + paddd m1, m6 + movu m6, [r2 + 32 + r4] + psadbw m6, m5 + paddd m1, m6 + movu m6, [r3 + r4] + psadbw m6, m3 + paddd m2, m6 + movu m6, [r3 + 16 + r4] + psadbw m6, m4 + paddd m2, m6 + movu m6, [r3 + 32 + r4] + psadbw m6, m5 + paddd m2, m6 + lea r0, [r0 + FENC_STRIDE * 2] + lea r1, [r1 + r4 * 2] + lea r2, [r2 + r4 * 2] + lea r3, [r3 + r4 * 2] +%endmacro + +%macro SAD_X4_48x4 0 + mova m4, [r0] + mova m5, [r0 + 16] + mova m6, [r0 + 32] + movu m7, [r1] + psadbw m7, m4 + paddd m0, m7 + movu m7, [r1 + 16] + psadbw m7, m5 + paddd m0, m7 + movu m7, [r1 + 32] + psadbw m7, m6 + paddd m0, m7 + movu m7, [r2] + psadbw m7, m4 + paddd m1, m7 + movu m7, [r2 + 16] + psadbw m7, m5 + paddd m1, m7 + movu m7, [r2 + 32] + psadbw m7, m6 + paddd m1, m7 + movu m7, [r3] + psadbw m7, m4 + paddd m2, m7 + movu m7, [r3 + 16] + psadbw m7, m5 + paddd m2, m7 + movu m7, [r3 + 32] + psadbw m7, m6 + paddd m2, m7 + movu m7, [r4] + psadbw m7, m4 + paddd m3, m7 + movu m7, [r4 + 16] + psadbw m7, m5 + paddd m3, m7 + movu m7, [r4 + 32] + psadbw m7, m6 + paddd m3, m7 + + mova m4, [r0 + FENC_STRIDE] + mova m5, [r0 + 16 + FENC_STRIDE] + mova m6, [r0 + 32 + FENC_STRIDE] + movu m7, [r1 + r5] + psadbw m7, m4 + paddd m0, m7 + movu m7, [r1 + 16 + r5] + psadbw m7, m5 + paddd m0, m7 + movu m7, [r1 + 32 + r5] + psadbw m7, m6 + paddd m0, m7 + movu m7, [r2 + r5] + psadbw m7, m4 + paddd m1, m7 + movu m7, [r2 + 16 + r5] + psadbw m7, m5 + paddd m1, m7 + movu m7, [r2 + 32 + r5] + psadbw m7, m6 + paddd m1, m7 + movu m7, [r3 + r5] + psadbw m7, m4 + paddd m2, m7 + movu m7, [r3 + 16 + r5] + psadbw m7, m5 + paddd m2, m7 + movu 
m7, [r3 + 32 + r5] + psadbw m7, m6 + paddd m2, m7 + movu m7, [r4 + r5] + psadbw m7, m4 + paddd m3, m7 + movu m7, [r4 + 16 + r5] + psadbw m7, m5 + paddd m3, m7 + movu m7, [r4 + 32 + r5] + psadbw m7, m6 + paddd m3, m7 + + mova m4, [r0 + FENC_STRIDE * 2] + mova m5, [r0 + 16 + FENC_STRIDE * 2] + mova m6, [r0 + 32 + FENC_STRIDE * 2] + movu m7, [r1 + r5 * 2] + psadbw m7, m4 + paddd m0, m7 + movu m7, [r1 + 16 + r5 * 2] + psadbw m7, m5 + paddd m0, m7 + movu m7, [r1 + 32 + r5 * 2] + psadbw m7, m6 + paddd m0, m7 + movu m7, [r2 + r5 * 2] + psadbw m7, m4 + paddd m1, m7 + movu m7, [r2 + 16 + r5 * 2] + psadbw m7, m5 + paddd m1, m7 + movu m7, [r2 + 32 + r5 * 2] + psadbw m7, m6 + paddd m1, m7 + movu m7, [r3 + r5 * 2] + psadbw m7, m4 + paddd m2, m7 + movu m7, [r3 + 16 + r5 * 2] + psadbw m7, m5 + paddd m2, m7 + movu m7, [r3 + 32 + r5 * 2] + psadbw m7, m6 + paddd m2, m7 + movu m7, [r4 + r5 * 2] + psadbw m7, m4 + paddd m3, m7 + movu m7, [r4 + 16 + r5 * 2] + psadbw m7, m5 + paddd m3, m7 + movu m7, [r4 + 32 + r5 * 2] + psadbw m7, m6 + paddd m3, m7 + + lea r0, [r0 + FENC_STRIDE * 2] + lea r1, [r1 + r5 * 2] + lea r2, [r2 + r5 * 2] + lea r3, [r3 + r5 * 2] + lea r4, [r4 + r5 * 2] + mova m4, [r0 + FENC_STRIDE] + mova m5, [r0 + 16 + FENC_STRIDE] + mova m6, [r0 + 32 + FENC_STRIDE] + movu m7, [r1 + r5] + psadbw m7, m4 + paddd m0, m7 + movu m7, [r1 + 16 + r5] + psadbw m7, m5 + paddd m0, m7 + movu m7, [r1 + 32 + r5] + psadbw m7, m6 + paddd m0, m7 + movu m7, [r2 + r5] + psadbw m7, m4 + paddd m1, m7 + movu m7, [r2 + 16 + r5] + psadbw m7, m5 + paddd m1, m7 + movu m7, [r2 + 32 + r5] + psadbw m7, m6 + paddd m1, m7 + movu m7, [r3 + r5] + psadbw m7, m4 + paddd m2, m7 + movu m7, [r3 + 16 + r5] + psadbw m7, m5 + paddd m2, m7 + movu m7, [r3 + 32 + r5] + psadbw m7, m6 + paddd m2, m7 + movu m7, [r4 + r5] + psadbw m7, m4 + paddd m3, m7 + movu m7, [r4 + 16 + r5] + psadbw m7, m5 + paddd m3, m7 + movu m7, [r4 + 32 + r5] + psadbw m7, m6 + paddd m3, m7 + lea r0, [r0 + FENC_STRIDE * 2] + lea r1, [r1 + r5 * 2] + lea 
r2, [r2 + r5 * 2] + lea r3, [r3 + r5 * 2] + lea r4, [r4 + r5 * 2] +%endmacro + +%macro SAD_X3_64x4 0 + mova m3, [r0] + mova m4, [r0 + 16] + movu m5, [r1] + psadbw m5, m3 + paddd m0, m5 + movu m5, [r1 + 16] + psadbw m5, m4 + paddd m0, m5 + movu m5, [r2] + psadbw m5, m3 + paddd m1, m5 + movu m5, [r2 + 16] + psadbw m5, m4 + paddd m1, m5 + movu m5, [r3] + psadbw m5, m3 + paddd m2, m5 + movu m5, [r3 + 16] + psadbw m5, m4 + paddd m2, m5 + mova m3, [r0 + 32] + mova m4, [r0 + 48] + movu m5, [r1 + 32] + psadbw m5, m3 + paddd m0, m5 + movu m5, [r1 + 48] + psadbw m5, m4 + paddd m0, m5 + movu m5, [r2 + 32] + psadbw m5, m3 + paddd m1, m5 + movu m5, [r2 + 48] + psadbw m5, m4 + paddd m1, m5 + movu m5, [r3 + 32] + psadbw m5, m3 + paddd m2, m5 + movu m5, [r3 + 48] + psadbw m5, m4 + paddd m2, m5 + + mova m3, [r0 + FENC_STRIDE] + mova m4, [r0 + 16 + FENC_STRIDE] + movu m5, [r1 + r4] + psadbw m5, m3 + paddd m0, m5 + movu m5, [r1 + 16 + r4] + psadbw m5, m4 + paddd m0, m5 + movu m5, [r2 + r4] + psadbw m5, m3 + paddd m1, m5 + movu m5, [r2 + 16 + r4] + psadbw m5, m4 + paddd m1, m5 + movu m5, [r3 + r4] + psadbw m5, m3 + paddd m2, m5 + movu m5, [r3 + 16 + r4] + psadbw m5, m4 + paddd m2, m5 + mova m3, [r0 + 32 + FENC_STRIDE] + mova m4, [r0 + 48 + FENC_STRIDE] + movu m5, [r1 + 32 + r4] + psadbw m5, m3 + paddd m0, m5 + movu m5, [r1 + 48 + r4] + psadbw m5, m4 + paddd m0, m5 + movu m5, [r2 + 32 + r4] + psadbw m5, m3 + paddd m1, m5 + movu m5, [r2 + 48 + r4] + psadbw m5, m4 + paddd m1, m5 + movu m5, [r3 + 32 + r4] + psadbw m5, m3 + paddd m2, m5 + movu m5, [r3 + 48 + r4] + psadbw m5, m4 + paddd m2, m5 + + mova m3, [r0 + FENC_STRIDE * 2] + mova m4, [r0 + 16 + FENC_STRIDE * 2] + movu m5, [r1 + r4 * 2] + psadbw m5, m3 + paddd m0, m5 + movu m5, [r1 + 16 + r4 * 2] + psadbw m5, m4 + paddd m0, m5 + movu m5, [r2 + r4 * 2] + psadbw m5, m3 + paddd m1, m5 + movu m5, [r2 + 16 + r4 * 2] + psadbw m5, m4 + paddd m1, m5 + movu m5, [r3 + r4 * 2] + psadbw m5, m3 + paddd m2, m5 + movu m5, [r3 + 16 + r4 * 2] + psadbw 
m5, m4 + paddd m2, m5 + mova m3, [r0 + 32 + FENC_STRIDE * 2] + mova m4, [r0 + 48 + FENC_STRIDE * 2] + movu m5, [r1 + 32 + r4 * 2] + psadbw m5, m3 + paddd m0, m5 + movu m5, [r1 + 48 + r4 * 2] + psadbw m5, m4 + paddd m0, m5 + movu m5, [r2 + 32 + r4 * 2] + psadbw m5, m3 + paddd m1, m5 + movu m5, [r2 + 48 + r4 * 2] + psadbw m5, m4 + paddd m1, m5 + movu m5, [r3 + 32 + r4 * 2] + psadbw m5, m3 + paddd m2, m5 + movu m5, [r3 + 48 + r4 * 2] + psadbw m5, m4 + paddd m2, m5 + + lea r0, [r0 + FENC_STRIDE * 2] + lea r1, [r1 + r4 * 2] + lea r2, [r2 + r4 * 2] + lea r3, [r3 + r4 * 2] + mova m3, [r0 + FENC_STRIDE] + mova m4, [r0 + 16 + FENC_STRIDE] + movu m5, [r1 + r4] + psadbw m5, m3 + paddd m0, m5 + movu m5, [r1 + 16 + r4] + psadbw m5, m4 + paddd m0, m5 + movu m5, [r2 + r4] + psadbw m5, m3 + paddd m1, m5 + movu m5, [r2 + 16 + r4] + psadbw m5, m4 + paddd m1, m5 + movu m5, [r3 + r4] + psadbw m5, m3 + paddd m2, m5 + movu m5, [r3 + 16 + r4] + psadbw m5, m4 + paddd m2, m5 + mova m3, [r0 + 32 + FENC_STRIDE] + mova m4, [r0 + 48 + FENC_STRIDE] + movu m5, [r1 + 32 + r4] + psadbw m5, m3 + paddd m0, m5 + movu m5, [r1 + 48 + r4] + psadbw m5, m4 + paddd m0, m5 + movu m5, [r2 + 32 + r4] + psadbw m5, m3 + paddd m1, m5 + movu m5, [r2 + 48 + r4] + psadbw m5, m4 + paddd m1, m5 + movu m5, [r3 + 32 + r4] + psadbw m5, m3 + paddd m2, m5 + movu m5, [r3 + 48 + r4] + psadbw m5, m4 + paddd m2, m5 + lea r0, [r0 + FENC_STRIDE * 2] + lea r1, [r1 + r4 * 2] + lea r2, [r2 + r4 * 2] + lea r3, [r3 + r4 * 2] +%endmacro + +%macro SAD_X4_64x4 0 + mova m4, [r0] + mova m5, [r0 + 16] + movu m6, [r1] + psadbw m6, m4 + paddd m0, m6 + movu m6, [r1 + 16] + psadbw m6, m5 + paddd m0, m6 + movu m6, [r2] + psadbw m6, m4 + paddd m1, m6 + movu m6, [r2 + 16] + psadbw m6, m5 + paddd m1, m6 + movu m6, [r3] + psadbw m6, m4 + paddd m2, m6 + movu m6, [r3 + 16] + psadbw m6, m5 + paddd m2, m6 + movu m6, [r4] + psadbw m6, m4 + paddd m3, m6 + movu m6, [r4 + 16] + psadbw m6, m5 + paddd m3, m6 + mova m4, [r0 + 32] + mova m5, [r0 + 48] + movu 
m6, [r1 + 32] + psadbw m6, m4 + paddd m0, m6 + movu m6, [r1 + 48] + psadbw m6, m5 + paddd m0, m6 + movu m6, [r2 + 32] + psadbw m6, m4 + paddd m1, m6 + movu m6, [r2 + 48] + psadbw m6, m5 + paddd m1, m6 + movu m6, [r3 + 32] + psadbw m6, m4 + paddd m2, m6 + movu m6, [r3 + 48] + psadbw m6, m5 + paddd m2, m6 + movu m6, [r4 + 32] + psadbw m6, m4 + paddd m3, m6 + movu m6, [r4 + 48] + psadbw m6, m5 + paddd m3, m6 + + mova m4, [r0 + FENC_STRIDE] + mova m5, [r0 + 16 + FENC_STRIDE] + movu m6, [r1 + r5] + psadbw m6, m4 + paddd m0, m6 + movu m6, [r1 + 16 + r5] + psadbw m6, m5 + paddd m0, m6 + movu m6, [r2 + r5] + psadbw m6, m4 + paddd m1, m6 + movu m6, [r2 + 16 + r5] + psadbw m6, m5 + paddd m1, m6 + movu m6, [r3 + r5] + psadbw m6, m4 + paddd m2, m6 + movu m6, [r3 + 16 + r5] + psadbw m6, m5 + paddd m2, m6 + movu m6, [r4 + r5] + psadbw m6, m4 + paddd m3, m6 + movu m6, [r4 + 16 + r5] + psadbw m6, m5 + paddd m3, m6 + mova m4, [r0 + 32 + FENC_STRIDE] + mova m5, [r0 + 48 + FENC_STRIDE] + movu m6, [r1 + 32 + r5] + psadbw m6, m4 + paddd m0, m6 + movu m6, [r1 + 48 + r5] + psadbw m6, m5 + paddd m0, m6 + movu m6, [r2 + 32 + r5] + psadbw m6, m4 + paddd m1, m6 + movu m6, [r2 + 48 + r5] + psadbw m6, m5 + paddd m1, m6 + movu m6, [r3 + 32 + r5] + psadbw m6, m4 + paddd m2, m6 + movu m6, [r3 + 48 + r5] + psadbw m6, m5 + paddd m2, m6 + movu m6, [r4 + 32 + r5] + psadbw m6, m4 + paddd m3, m6 + movu m6, [r4 + 48 + r5] + psadbw m6, m5 + paddd m3, m6 + + mova m4, [r0 + FENC_STRIDE * 2] + mova m5, [r0 + 16 + FENC_STRIDE * 2] + movu m6, [r1 + r5 * 2] + psadbw m6, m4 + paddd m0, m6 + movu m6, [r1 + 16 + r5 * 2] + psadbw m6, m5 + paddd m0, m6 + movu m6, [r2 + r5 * 2] + psadbw m6, m4 + paddd m1, m6 + movu m6, [r2 + 16 + r5 * 2] + psadbw m6, m5 + paddd m1, m6 + movu m6, [r3 + r5 * 2] + psadbw m6, m4 + paddd m2, m6 + movu m6, [r3 + 16 + r5 * 2] + psadbw m6, m5 + paddd m2, m6 + movu m6, [r4 + r5 * 2] + psadbw m6, m4 + paddd m3, m6 + movu m6, [r4 + 16 + r5 * 2] + psadbw m6, m5 + paddd m3, m6 + mova m4, [r0 + 
32 + FENC_STRIDE * 2] + mova m5, [r0 + 48 + FENC_STRIDE * 2] + movu m6, [r1 + 32 + r5 * 2] + psadbw m6, m4 + paddd m0, m6 + movu m6, [r1 + 48 + r5 * 2] + psadbw m6, m5 + paddd m0, m6 + movu m6, [r2 + 32 + r5 * 2] + psadbw m6, m4 + paddd m1, m6 + movu m6, [r2 + 48 + r5 * 2] + psadbw m6, m5 + paddd m1, m6 + movu m6, [r3 + 32 + r5 * 2] + psadbw m6, m4 + paddd m2, m6 + movu m6, [r3 + 48 + r5 * 2] + psadbw m6, m5 + paddd m2, m6 + movu m6, [r4 + 32 + r5 * 2] + psadbw m6, m4 + paddd m3, m6 + movu m6, [r4 + 48 + r5 * 2] + psadbw m6, m5 + paddd m3, m6 + + lea r0, [r0 + FENC_STRIDE * 2] + lea r1, [r1 + r5 * 2] + lea r2, [r2 + r5 * 2] + lea r3, [r3 + r5 * 2] + lea r4, [r4 + r5 * 2] + mova m4, [r0 + FENC_STRIDE] + mova m5, [r0 + 16 + FENC_STRIDE] + movu m6, [r1 + r5] + psadbw m6, m4 + paddd m0, m6 + movu m6, [r1 + 16 + r5] + psadbw m6, m5 + paddd m0, m6 + movu m6, [r2 + r5] + psadbw m6, m4 + paddd m1, m6 + movu m6, [r2 + 16 + r5] + psadbw m6, m5 + paddd m1, m6 + movu m6, [r3 + r5] + psadbw m6, m4 + paddd m2, m6 + movu m6, [r3 + 16 + r5] + psadbw m6, m5 + paddd m2, m6 + movu m6, [r4 + r5] + psadbw m6, m4 + paddd m3, m6 + movu m6, [r4 + 16 + r5] + psadbw m6, m5 + paddd m3, m6 + mova m4, [r0 + 32 + FENC_STRIDE] + mova m5, [r0 + 48 + FENC_STRIDE] + movu m6, [r1 + 32 + r5] + psadbw m6, m4 + paddd m0, m6 + movu m6, [r1 + 48 + r5] + psadbw m6, m5 + paddd m0, m6 + movu m6, [r2 + 32 + r5] + psadbw m6, m4 + paddd m1, m6 + movu m6, [r2 + 48 + r5] + psadbw m6, m5 + paddd m1, m6 + movu m6, [r3 + 32 + r5] + psadbw m6, m4 + paddd m2, m6 + movu m6, [r3 + 48 + r5] + psadbw m6, m5 + paddd m2, m6 + movu m6, [r4 + 32 + r5] + psadbw m6, m4 + paddd m3, m6 + movu m6, [r4 + 48 + r5] + psadbw m6, m5 + paddd m3, m6 + lea r0, [r0 + FENC_STRIDE * 2] + lea r1, [r1 + r5 * 2] + lea r2, [r2 + r5 * 2] + lea r3, [r3 + r5 * 2] + lea r4, [r4 + r5 * 2] +%endmacro + +;----------------------------------------------------------------------------- +; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t 
*pix1, +; uint8_t *pix2, intptr_t i_stride, int scores[3] ) +;----------------------------------------------------------------------------- +%macro SAD_X 3 +cglobal pixel_sad_x%1_%2x%3_mmx2, %1+2, %1+2 + SAD_X%1_2x%2P 1 +%rep %3/2-1 + SAD_X%1_2x%2P 0 +%endrep + SAD_X%1_END +%endmacro + +INIT_MMX +SAD_X 3, 16, 16 +SAD_X 3, 16, 8 +SAD_X 3, 8, 16 +SAD_X 3, 8, 8 +SAD_X 3, 8, 4 +SAD_X 3, 4, 16 +SAD_X 3, 4, 8 +SAD_X 3, 4, 4 +SAD_X 4, 16, 16 +SAD_X 4, 16, 8 +SAD_X 4, 8, 16 +SAD_X 4, 8, 8 +SAD_X 4, 8, 4 +SAD_X 4, 4, 16 +SAD_X 4, 4, 8 +SAD_X 4, 4, 4 + + + +;============================================================================= +; SAD x3/x4 XMM +;============================================================================= + +%macro SAD_X3_START_1x16P_SSE2 0 + mova m2, [r0] +%if cpuflag(avx) + psadbw m0, m2, [r1] + psadbw m1, m2, [r2] + psadbw m2, [r3] +%else + movu m0, [r1] + movu m1, [r2] + movu m3, [r3] + psadbw m0, m2 + psadbw m1, m2 + psadbw m2, m3 +%endif +%endmacro + +%macro SAD_X3_1x16P_SSE2 2 + mova m3, [r0+%1] +%if cpuflag(avx) + psadbw m4, m3, [r1+%2] + psadbw m5, m3, [r2+%2] + psadbw m3, [r3+%2] +%else + movu m4, [r1+%2] + movu m5, [r2+%2] + movu m6, [r3+%2] + psadbw m4, m3 + psadbw m5, m3 + psadbw m3, m6 +%endif + paddd m0, m4 + paddd m1, m5 + paddd m2, m3 +%endmacro + +%if ARCH_X86_64 + DECLARE_REG_TMP 6 +%else + DECLARE_REG_TMP 5 +%endif + +%macro SAD_X3_4x16P_SSE2 2 +%if %1==0 + lea t0, [r4*3] + SAD_X3_START_1x16P_SSE2 +%else + SAD_X3_1x16P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r4*0 +%endif + SAD_X3_1x16P_SSE2 FENC_STRIDE*(1+(%1&1)*4), r4*1 + SAD_X3_1x16P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r4*2 + SAD_X3_1x16P_SSE2 FENC_STRIDE*(3+(%1&1)*4), t0 +%if %1 != %2-1 +%if (%1&1) != 0 + add r0, 8*FENC_STRIDE +%endif + lea r1, [r1+4*r4] + lea r2, [r2+4*r4] + lea r3, [r3+4*r4] +%endif +%endmacro + +%macro SAD_X3_START_2x8P_SSE2 0 + movq m3, [r0] + movq m0, [r1] + movq m1, [r2] + movq m2, [r3] + movhps m3, [r0+FENC_STRIDE] + movhps m0, [r1+r4] + movhps m1, [r2+r4] + movhps 
m2, [r3+r4] + psadbw m0, m3 + psadbw m1, m3 + psadbw m2, m3 +%endmacro + +%macro SAD_X3_2x8P_SSE2 4 + movq m6, [r0+%1] + movq m3, [r1+%2] + movq m4, [r2+%2] + movq m5, [r3+%2] + movhps m6, [r0+%3] + movhps m3, [r1+%4] + movhps m4, [r2+%4] + movhps m5, [r3+%4] + psadbw m3, m6 + psadbw m4, m6 + psadbw m5, m6 + paddd m0, m3 + paddd m1, m4 + paddd m2, m5 +%endmacro + +%macro SAD_X4_START_2x8P_SSE2 0 + movq m4, [r0] + movq m0, [r1] + movq m1, [r2] + movq m2, [r3] + movq m3, [r4] + movhps m4, [r0+FENC_STRIDE] + movhps m0, [r1+r5] + movhps m1, [r2+r5] + movhps m2, [r3+r5] + movhps m3, [r4+r5] + psadbw m0, m4 + psadbw m1, m4 + psadbw m2, m4 + psadbw m3, m4 +%endmacro + +%macro SAD_X4_2x8P_SSE2 4 + movq m6, [r0+%1] + movq m4, [r1+%2] + movq m5, [r2+%2] + movhps m6, [r0+%3] + movhps m4, [r1+%4] + movhps m5, [r2+%4] + psadbw m4, m6 + psadbw m5, m6 + paddd m0, m4 + paddd m1, m5 + movq m4, [r3+%2] + movq m5, [r4+%2] + movhps m4, [r3+%4] + movhps m5, [r4+%4] + psadbw m4, m6 + psadbw m5, m6 + paddd m2, m4 + paddd m3, m5 +%endmacro + +%macro SAD_X4_START_1x16P_SSE2 0 + mova m3, [r0] +%if cpuflag(avx) + psadbw m0, m3, [r1] + psadbw m1, m3, [r2] + psadbw m2, m3, [r3] + psadbw m3, [r4] +%else + movu m0, [r1] + movu m1, [r2] + movu m2, [r3] + movu m4, [r4] + psadbw m0, m3 + psadbw m1, m3 + psadbw m2, m3 + psadbw m3, m4 +%endif +%endmacro + +%macro SAD_X4_1x16P_SSE2 2 + mova m6, [r0+%1] +%if cpuflag(avx) + psadbw m4, m6, [r1+%2] + psadbw m5, m6, [r2+%2] +%else + movu m4, [r1+%2] + movu m5, [r2+%2] + psadbw m4, m6 + psadbw m5, m6 +%endif + paddd m0, m4 + paddd m1, m5 +%if cpuflag(avx) + psadbw m4, m6, [r3+%2] + psadbw m5, m6, [r4+%2] +%else + movu m4, [r3+%2] + movu m5, [r4+%2] + psadbw m4, m6 + psadbw m5, m6 +%endif + paddd m2, m4 + paddd m3, m5 +%endmacro + +%macro SAD_X4_4x16P_SSE2 2 +%if %1==0 + lea r6, [r5*3] + SAD_X4_START_1x16P_SSE2 +%else + SAD_X4_1x16P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r5*0 +%endif + SAD_X4_1x16P_SSE2 FENC_STRIDE*(1+(%1&1)*4), r5*1 + SAD_X4_1x16P_SSE2 
FENC_STRIDE*(2+(%1&1)*4), r5*2 + SAD_X4_1x16P_SSE2 FENC_STRIDE*(3+(%1&1)*4), r6 +%if %1 != %2-1 +%if (%1&1) != 0 + add r0, 8*FENC_STRIDE +%endif + lea r1, [r1+4*r5] + lea r2, [r2+4*r5] + lea r3, [r3+4*r5] + lea r4, [r4+4*r5] +%endif +%endmacro + +%macro SAD_X3_4x8P_SSE2 2 +%if %1==0 + lea t0, [r4*3] + SAD_X3_START_2x8P_SSE2 +%else + SAD_X3_2x8P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r4*0, FENC_STRIDE*(1+(%1&1)*4), r4*1 +%endif + SAD_X3_2x8P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r4*2, FENC_STRIDE*(3+(%1&1)*4), t0 +%if %1 != %2-1 +%if (%1&1) != 0 + add r0, 8*FENC_STRIDE +%endif + lea r1, [r1+4*r4] + lea r2, [r2+4*r4] + lea r3, [r3+4*r4] +%endif +%endmacro + +%macro SAD_X4_4x8P_SSE2 2 +%if %1==0 + lea r6, [r5*3] + SAD_X4_START_2x8P_SSE2 +%else + SAD_X4_2x8P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r5*0, FENC_STRIDE*(1+(%1&1)*4), r5*1 +%endif + SAD_X4_2x8P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r5*2, FENC_STRIDE*(3+(%1&1)*4), r6 +%if %1 != %2-1 +%if (%1&1) != 0 + add r0, 8*FENC_STRIDE +%endif + lea r1, [r1+4*r5] + lea r2, [r2+4*r5] + lea r3, [r3+4*r5] + lea r4, [r4+4*r5] +%endif +%endmacro + +%macro SAD_X3_END_SSE2 1 + movifnidn r5, r5mp + movhlps m3, m0 + movhlps m4, m1 + movhlps m5, m2 + paddd m0, m3 + paddd m1, m4 + paddd m2, m5 + movd [r5+0], m0 + movd [r5+4], m1 + movd [r5+8], m2 + RET +%endmacro + +%macro SAD_X4_END_SSE2 1 + mov r0, r6mp + psllq m1, 32 + psllq m3, 32 + paddd m0, m1 + paddd m2, m3 + movhlps m1, m0 + movhlps m3, m2 + paddd m0, m1 + paddd m2, m3 + movq [r0+0], m0 + movq [r0+8], m2 + RET +%endmacro + +%macro SAD_X3_START_2x16P_AVX2 0 + movu m3, [r0] ; assumes FENC_STRIDE == 16 + movu xm0, [r1] + movu xm1, [r2] + movu xm2, [r3] + vinserti128 m0, m0, [r1+r4], 1 + vinserti128 m1, m1, [r2+r4], 1 + vinserti128 m2, m2, [r3+r4], 1 + psadbw m0, m3 + psadbw m1, m3 + psadbw m2, m3 +%endmacro + +%macro SAD_X3_2x16P_AVX2 3 + movu m3, [r0+%1] ; assumes FENC_STRIDE == 16 + movu xm4, [r1+%2] + movu xm5, [r2+%2] + movu xm6, [r3+%2] + vinserti128 m4, m4, [r1+%3], 1 + vinserti128 m5, m5, [r2+%3], 1 + 
vinserti128 m6, m6, [r3+%3], 1 + psadbw m4, m3 + psadbw m5, m3 + psadbw m6, m3 + paddw m0, m4 + paddw m1, m5 + paddw m2, m6 +%endmacro + +%macro SAD_X3_4x16P_AVX2 2 +%if %1==0 + lea t0, [r4*3] + SAD_X3_START_2x16P_AVX2 +%else + SAD_X3_2x16P_AVX2 FENC_STRIDE*(0+(%1&1)*4), r4*0, r4*1 +%endif + SAD_X3_2x16P_AVX2 FENC_STRIDE*(2+(%1&1)*4), r4*2, t0 +%if %1 != %2-1 +%if (%1&1) != 0 + add r0, 8*FENC_STRIDE +%endif + lea r1, [r1+4*r4] + lea r2, [r2+4*r4] + lea r3, [r3+4*r4] +%endif +%endmacro + +%macro SAD_X4_START_2x16P_AVX2 0 + vbroadcasti128 m4, [r0] + vbroadcasti128 m5, [r0+FENC_STRIDE] + movu xm0, [r1] + movu xm1, [r2] + movu xm2, [r1+r5] + movu xm3, [r2+r5] + vinserti128 m0, m0, [r3], 1 + vinserti128 m1, m1, [r4], 1 + vinserti128 m2, m2, [r3+r5], 1 + vinserti128 m3, m3, [r4+r5], 1 + psadbw m0, m4 + psadbw m1, m4 + psadbw m2, m5 + psadbw m3, m5 + paddw m0, m2 + paddw m1, m3 +%endmacro + +%macro SAD_X4_2x16P_AVX2 4 + vbroadcasti128 m6, [r0+%1] + vbroadcasti128 m7, [r0+%3] + movu xm2, [r1+%2] + movu xm3, [r2+%2] + movu xm4, [r1+%4] + movu xm5, [r2+%4] + vinserti128 m2, m2, [r3+%2], 1 + vinserti128 m3, m3, [r4+%2], 1 + vinserti128 m4, m4, [r3+%4], 1 + vinserti128 m5, m5, [r4+%4], 1 + psadbw m2, m6 + psadbw m3, m6 + psadbw m4, m7 + psadbw m5, m7 + paddd m0, m2 + paddd m1, m3 + paddd m0, m4 + paddd m1, m5 +%endmacro + +%macro SAD_X4_4x16P_AVX2 2 +%if %1==0 + lea r6, [r5*3] + SAD_X4_START_2x16P_AVX2 +%else + SAD_X4_2x16P_AVX2 FENC_STRIDE*(0+(%1&1)*4), r5*0, FENC_STRIDE*(1+(%1&1)*4), r5*1 +%endif + SAD_X4_2x16P_AVX2 FENC_STRIDE*(2+(%1&1)*4), r5*2, FENC_STRIDE*(3+(%1&1)*4), r6 +%if %1 != %2-1 +%if (%1&1) != 0 + add r0, 8*FENC_STRIDE +%endif + lea r1, [r1+4*r5] + lea r2, [r2+4*r5] + lea r3, [r3+4*r5] + lea r4, [r4+4*r5] +%endif +%endmacro + +%macro SAD_X3_END_AVX2 0 + movifnidn r5, r5mp + packssdw m0, m1 ; 0 0 1 1 0 0 1 1 + packssdw m2, m2 ; 2 2 _ _ 2 2 _ _ + phaddd m0, m2 ; 0 1 2 _ 0 1 2 _ + vextracti128 xm1, m0, 1 + paddd xm0, xm1 ; 0 1 2 _ + mova [r5], xm0 + RET +%endmacro 
+ +%macro SAD_X4_END_AVX2 0 + mov r0, r6mp + pshufd m0, m0, 0x8 + pshufd m1, m1, 0x8 + vextracti128 xm2, m0, 1 + vextracti128 xm3, m1, 1 + punpcklqdq xm0, xm1 + punpcklqdq xm2, xm3 + phaddd xm0, xm2 ; 0 1 2 3 + mova [r0], xm0 + RET +%endmacro + +;----------------------------------------------------------------------------- +; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, +; uint8_t *pix2, intptr_t i_stride, int scores[3] ) +;----------------------------------------------------------------------------- +%macro SAD_X_SSE2 4 +cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,%4 +%assign x 0 +%rep %3/4 + SAD_X%1_4x%2P_SSE2 x, %3/4 +%assign x x+1 +%endrep +%if %3 == 64 + SAD_X%1_END_SSE2 1 +%else + SAD_X%1_END_SSE2 0 +%endif +%endmacro + +%macro SAD_X3_W12 0 +cglobal pixel_sad_x3_12x16, 5, 7, 8 + mova m4, [MSK] + pxor m0, m0 + pxor m1, m1 + pxor m2, m2 + + SAD_X3_12x4 + SAD_X3_12x4 + SAD_X3_12x4 + SAD_X3_12x4 + SAD_X3_END_SSE2 1 +%endmacro + +%macro SAD_X4_W12 0 +cglobal pixel_sad_x4_12x16, 6, 8, 8 + mova m6, [MSK] + pxor m0, m0 + pxor m1, m1 + pxor m2, m2 + pxor m3, m3 + + SAD_X4_12x4 + SAD_X4_12x4 + SAD_X4_12x4 + SAD_X4_12x4 + SAD_X4_END_SSE2 1 +%endmacro + +%macro SAD_X3_W24 0 +cglobal pixel_sad_x3_24x32, 5, 7, 8 + pxor m0, m0 + pxor m1, m1 + pxor m2, m2 + mov r6, 32 + +.loop: + SAD_X3_24x4 + SAD_X3_24x4 + SAD_X3_24x4 + SAD_X3_24x4 + + sub r6, 16 + cmp r6, 0 +jnz .loop + SAD_X3_END_SSE2 1 +%endmacro + +%macro SAD_X4_W24 0 +%if ARCH_X86_64 == 1 +cglobal pixel_sad_x4_24x32, 6, 8, 8 +%define count r7 +%else +cglobal pixel_sad_x4_24x32, 6, 7, 8, 0-4 +%define count dword [rsp] +%endif + pxor m0, m0 + pxor m1, m1 + pxor m2, m2 + pxor m3, m3 + mov count, 32 + +.loop: + SAD_X4_24x4 + SAD_X4_24x4 + SAD_X4_24x4 + SAD_X4_24x4 + + sub count, 16 + jnz .loop + SAD_X4_END_SSE2 1 + +%endmacro + +%macro SAD_X3_W32 0 +cglobal pixel_sad_x3_32x8, 5, 6, 8 + pxor m0, m0 + pxor m1, m1 + pxor m2, m2 + + SAD_X3_32x4 + SAD_X3_32x4 + SAD_X3_END_SSE2 1 + +cglobal 
pixel_sad_x3_32x16, 5, 6, 8 + pxor m0, m0 + pxor m1, m1 + pxor m2, m2 + + SAD_X3_32x4 + SAD_X3_32x4 + SAD_X3_32x4 + SAD_X3_32x4 + SAD_X3_END_SSE2 1 + +cglobal pixel_sad_x3_32x24, 5, 6, 8 + pxor m0, m0 + pxor m1, m1 + pxor m2, m2 + + SAD_X3_32x4 + SAD_X3_32x4 + SAD_X3_32x4 + SAD_X3_32x4 + SAD_X3_32x4 + SAD_X3_32x4 + SAD_X3_END_SSE2 1 + +cglobal pixel_sad_x3_32x32, 5, 7, 8 + pxor m0, m0 + pxor m1, m1 + pxor m2, m2 + mov r6, 32 + +.loop: + SAD_X3_32x4 + SAD_X3_32x4 + SAD_X3_32x4 + SAD_X3_32x4 + + sub r6, 16 + cmp r6, 0 +jnz .loop + SAD_X3_END_SSE2 1 + +cglobal pixel_sad_x3_32x64, 5, 7, 8 + pxor m0, m0 + pxor m1, m1 + pxor m2, m2 + mov r6, 64 + +.loop1: + SAD_X3_32x4 + SAD_X3_32x4 + SAD_X3_32x4 + SAD_X3_32x4 + + sub r6, 16 + cmp r6, 0 +jnz .loop1 + SAD_X3_END_SSE2 1 +%endmacro + +%macro SAD_X4_W32 0 +cglobal pixel_sad_x4_32x8, 6, 7, 8 + pxor m0, m0 + pxor m1, m1 + pxor m2, m2 + pxor m3, m3 + + SAD_X4_32x4 + SAD_X4_32x4 + SAD_X4_END_SSE2 1 + +cglobal pixel_sad_x4_32x16, 6, 7, 8 + pxor m0, m0 + pxor m1, m1 + pxor m2, m2 + pxor m3, m3 + + SAD_X4_32x4 + SAD_X4_32x4 + SAD_X4_32x4 + SAD_X4_32x4 + SAD_X4_END_SSE2 1 + +cglobal pixel_sad_x4_32x24, 6, 7, 8 + pxor m0, m0 + pxor m1, m1 + pxor m2, m2 + pxor m3, m3 + + SAD_X4_32x4 + SAD_X4_32x4 + SAD_X4_32x4 + SAD_X4_32x4 + SAD_X4_32x4 + SAD_X4_32x4 + SAD_X4_END_SSE2 1 + +%if ARCH_X86_64 == 1 +cglobal pixel_sad_x4_32x32, 6, 8, 8 +%define count r7 +%else +cglobal pixel_sad_x4_32x32, 6, 7, 8, 0-4 +%define count dword [rsp] +%endif + pxor m0, m0 + pxor m1, m1 + pxor m2, m2 + pxor m3, m3 + mov count, 32 + +.loop: + SAD_X4_32x4 + SAD_X4_32x4 + SAD_X4_32x4 + SAD_X4_32x4 + + sub count, 16 + jnz .loop + SAD_X4_END_SSE2 1 + +%if ARCH_X86_64 == 1 +cglobal pixel_sad_x4_32x64, 6, 8, 8 +%define count r7 +%else +cglobal pixel_sad_x4_32x64, 6, 7, 8, 0-4 +%define count dword [rsp] +%endif + pxor m0, m0 + pxor m1, m1 + pxor m2, m2 + pxor m3, m3 + mov count, 64 + +.loop: + SAD_X4_32x4 + SAD_X4_32x4 + SAD_X4_32x4 + SAD_X4_32x4 + + sub count, 16 + jnz 
.loop + SAD_X4_END_SSE2 1 + +%endmacro + +%macro SAD_X3_W48 0 +cglobal pixel_sad_x3_48x64, 5, 7, 8 + pxor m0, m0 + pxor m1, m1 + pxor m2, m2 + mov r6, 64 + +.loop: + SAD_X3_48x4 + SAD_X3_48x4 + SAD_X3_48x4 + SAD_X3_48x4 + + sub r6, 16 + jnz .loop + SAD_X3_END_SSE2 1 +%endmacro + +%macro SAD_X4_W48 0 +%if ARCH_X86_64 == 1 +cglobal pixel_sad_x4_48x64, 6, 8, 8 +%define count r7 +%else +cglobal pixel_sad_x4_48x64, 6, 7, 8, 0-4 +%define count dword [rsp] +%endif + pxor m0, m0 + pxor m1, m1 + pxor m2, m2 + pxor m3, m3 + mov count, 64 + +.loop: + SAD_X4_48x4 + SAD_X4_48x4 + SAD_X4_48x4 + SAD_X4_48x4 + + sub count, 16 + jnz .loop + SAD_X4_END_SSE2 1 +%endmacro + +%macro SAD_X3_W64 0 +cglobal pixel_sad_x3_64x16, 5, 7, 7 + pxor m0, m0 + pxor m1, m1 + pxor m2, m2 + mov r6, 16 + +.loop: + SAD_X3_64x4 + SAD_X3_64x4 + + sub r6, 8 + jnz .loop + SAD_X3_END_SSE2 1 + +cglobal pixel_sad_x3_64x32, 5, 7, 7 + pxor m0, m0 + pxor m1, m1 + pxor m2, m2 + mov r6, 32 + +.loop: + SAD_X3_64x4 + SAD_X3_64x4 + + sub r6, 8 + jnz .loop + SAD_X3_END_SSE2 1 + +cglobal pixel_sad_x3_64x48, 5, 7, 7 + pxor m0, m0 + pxor m1, m1 + pxor m2, m2 + mov r6, 48 + +.loop: + SAD_X3_64x4 + SAD_X3_64x4 + + sub r6, 8 + jnz .loop + SAD_X3_END_SSE2 1 + +cglobal pixel_sad_x3_64x64, 5, 7, 7 + pxor m0, m0 + pxor m1, m1 + pxor m2, m2 + mov r6, 64 + +.loop: + SAD_X3_64x4 + SAD_X3_64x4 + + sub r6, 8 + jnz .loop + SAD_X3_END_SSE2 1 +%endmacro + +%macro SAD_X4_W64 0 +%if ARCH_X86_64 == 1 +cglobal pixel_sad_x4_64x16, 6, 8, 8 +%define count r7 +%else +cglobal pixel_sad_x4_64x16, 6, 7, 8, 0-4 +%define count dword [rsp] +%endif + pxor m0, m0 + pxor m1, m1 + pxor m2, m2 + pxor m3, m3 + mov count, 16 + +.loop: + SAD_X4_64x4 + SAD_X4_64x4 + + sub count, 8 + jnz .loop + SAD_X4_END_SSE2 1 + +%if ARCH_X86_64 == 1 +cglobal pixel_sad_x4_64x32, 6, 8, 8 +%define count r7 +%else +cglobal pixel_sad_x4_64x32, 6, 7, 8, 0-4 +%define count dword [rsp] +%endif + pxor m0, m0 + pxor m1, m1 + pxor m2, m2 + pxor m3, m3 + mov count, 32 + +.loop: + 
SAD_X4_64x4 + SAD_X4_64x4 + + sub count, 8 + jnz .loop + SAD_X4_END_SSE2 1 + +%if ARCH_X86_64 == 1 +cglobal pixel_sad_x4_64x48, 6, 8, 8 +%define count r7 +%else +cglobal pixel_sad_x4_64x48, 6, 7, 8, 0-4 +%define count dword [rsp] +%endif + pxor m0, m0 + pxor m1, m1 + pxor m2, m2 + pxor m3, m3 + mov count, 48 + +.loop: + SAD_X4_64x4 + SAD_X4_64x4 + + sub count, 8 + jnz .loop + SAD_X4_END_SSE2 1 + +%if ARCH_X86_64 == 1 +cglobal pixel_sad_x4_64x64, 6, 8, 8 +%define count r7 +%else +cglobal pixel_sad_x4_64x64, 6, 7, 8, 0-4 +%define count dword [rsp] +%endif + pxor m0, m0 + pxor m1, m1 + pxor m2, m2 + pxor m3, m3 + mov count, 64 + +.loop: + SAD_X4_64x4 + SAD_X4_64x4 + + sub count, 8 + jnz .loop + SAD_X4_END_SSE2 1 +%endmacro + +INIT_XMM sse2 +SAD_X_SSE2 3, 16, 16, 7 +SAD_X_SSE2 3, 16, 8, 7 +SAD_X_SSE2 3, 8, 16, 7 +SAD_X_SSE2 3, 8, 8, 7 +SAD_X_SSE2 3, 8, 4, 7 +SAD_X_SSE2 4, 16, 16, 7 +SAD_X_SSE2 4, 16, 8, 7 +SAD_X_SSE2 4, 8, 16, 7 +SAD_X_SSE2 4, 8, 8, 7 +SAD_X_SSE2 4, 8, 4, 7 + +INIT_XMM sse3 +SAD_X_SSE2 3, 16, 16, 7 +SAD_X_SSE2 3, 16, 8, 7 +SAD_X_SSE2 3, 16, 4, 7 +SAD_X_SSE2 4, 16, 16, 7 +SAD_X_SSE2 4, 16, 8, 7 +SAD_X_SSE2 4, 16, 4, 7 + +INIT_XMM ssse3 +SAD_X3_W12 +SAD_X3_W32 +SAD_X3_W24 +SAD_X3_W48 +SAD_X3_W64 +SAD_X_SSE2 3, 16, 64, 7 +SAD_X_SSE2 3, 16, 32, 7 +SAD_X_SSE2 3, 16, 16, 7 +SAD_X_SSE2 3, 16, 12, 7 +SAD_X_SSE2 3, 16, 8, 7 +SAD_X_SSE2 3, 8, 32, 7 +SAD_X_SSE2 3, 8, 16, 7 +SAD_X4_W12 +SAD_X4_W24 +SAD_X4_W32 +SAD_X4_W48 +SAD_X4_W64 +SAD_X_SSE2 4, 16, 64, 7 +SAD_X_SSE2 4, 16, 32, 7 +SAD_X_SSE2 4, 16, 16, 7 +SAD_X_SSE2 4, 16, 12, 7 +SAD_X_SSE2 4, 16, 8, 7 +SAD_X_SSE2 4, 8, 32, 7 +SAD_X_SSE2 4, 8, 16, 7 +SAD_X_SSE2 4, 8, 8, 7 +SAD_X_SSE2 4, 8, 4, 7 + +INIT_XMM avx +SAD_X3_W12 +SAD_X3_W32 +SAD_X3_W24 +SAD_X3_W48 +SAD_X3_W64 +SAD_X_SSE2 3, 16, 64, 7 +SAD_X_SSE2 3, 16, 32, 6 +SAD_X_SSE2 3, 16, 16, 6 +SAD_X_SSE2 3, 16, 12, 6 +SAD_X_SSE2 3, 16, 8, 6 +SAD_X_SSE2 3, 16, 4, 6 +SAD_X4_W12 +SAD_X4_W24 +SAD_X4_W32 +SAD_X4_W48 +SAD_X4_W64 +SAD_X_SSE2 4, 16, 64, 7 +SAD_X_SSE2 4, 
16, 32, 7 +SAD_X_SSE2 4, 16, 16, 7 +SAD_X_SSE2 4, 16, 12, 7 +SAD_X_SSE2 4, 16, 8, 7 +SAD_X_SSE2 4, 16, 4, 7 + +%macro SAD_X_AVX2 4 +cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,%4 +%assign x 0 +%rep %3/4 + SAD_X%1_4x%2P_AVX2 x, %3/4 +%assign x x+1 +%endrep + SAD_X%1_END_AVX2 +%endmacro + +INIT_YMM avx2 +SAD_X_AVX2 3, 16, 32, 7 +SAD_X_AVX2 3, 16, 16, 7 +SAD_X_AVX2 3, 16, 12, 7 +SAD_X_AVX2 3, 16, 8, 7 +SAD_X_AVX2 4, 16, 32, 8 +SAD_X_AVX2 4, 16, 16, 8 +SAD_X_AVX2 4, 16, 12, 8 +SAD_X_AVX2 4, 16, 8, 8 + +;============================================================================= +; SAD cacheline split +;============================================================================= + +; Core2 (Conroe) can load unaligned data just as quickly as aligned data... +; unless the unaligned data spans the border between 2 cachelines, in which +; case it's really slow. The exact numbers may differ, but all Intel cpus prior +; to Nehalem have a large penalty for cacheline splits. +; (8-byte alignment exactly half way between two cachelines is ok though.) +; LDDQU was supposed to fix this, but it only works on Pentium 4. +; So in the split case we load aligned data and explicitly perform the +; alignment between registers. Like on archs that have only aligned loads, +; except complicated by the fact that PALIGNR takes only an immediate, not +; a variable alignment. +; It is also possible to hoist the realignment to the macroblock level (keep +; 2 copies of the reference frame, offset by 32 bytes), but the extra memory +; needed for that method makes it often slower. + +; sad 16x16 costs on Core2: +; good offsets: 49 cycles (50/64 of all mvs) +; cacheline split: 234 cycles (14/64 of all mvs. ammortized: +40 cycles) +; page split: 3600 cycles (14/4096 of all mvs. 
ammortized: +11.5 cycles) +; cache or page split with palignr: 57 cycles (ammortized: +2 cycles) + +; computed jump assumes this loop is exactly 80 bytes +%macro SAD16_CACHELINE_LOOP_SSE2 1 ; alignment +ALIGN 16 +sad_w16_align%1_sse2: + movdqa xmm1, [r2+16] + movdqa xmm2, [r2+r3+16] + movdqa xmm3, [r2] + movdqa xmm4, [r2+r3] + pslldq xmm1, 16-%1 + pslldq xmm2, 16-%1 + psrldq xmm3, %1 + psrldq xmm4, %1 + por xmm1, xmm3 + por xmm2, xmm4 + psadbw xmm1, [r0] + psadbw xmm2, [r0+r1] + paddw xmm0, xmm1 + paddw xmm0, xmm2 + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + dec r4 + jg sad_w16_align%1_sse2 + ret +%endmacro + +; computed jump assumes this loop is exactly 64 bytes +%macro SAD16_CACHELINE_LOOP_SSSE3 1 ; alignment +ALIGN 16 +sad_w16_align%1_ssse3: + movdqa xmm1, [r2+16] + movdqa xmm2, [r2+r3+16] + palignr xmm1, [r2], %1 + palignr xmm2, [r2+r3], %1 + psadbw xmm1, [r0] + psadbw xmm2, [r0+r1] + paddw xmm0, xmm1 + paddw xmm0, xmm2 + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + dec r4 + jg sad_w16_align%1_ssse3 + ret +%endmacro + +%macro SAD16_CACHELINE_FUNC 2 ; cpu, height +cglobal pixel_sad_16x%2_cache64_%1 + mov eax, r2m + and eax, 0x37 + cmp eax, 0x30 + jle pixel_sad_16x%2_sse2 + PROLOGUE 4,6 + mov r4d, r2d + and r4d, 15 +%ifidn %1, ssse3 + shl r4d, 6 ; code size = 64 +%else + lea r4, [r4*5] + shl r4d, 4 ; code size = 80 +%endif +%define sad_w16_addr (sad_w16_align1_%1 + (sad_w16_align1_%1 - sad_w16_align2_%1)) +%ifdef PIC + lea r5, [sad_w16_addr] + add r5, r4 +%else + lea r5, [sad_w16_addr + r4] +%endif + and r2, ~15 + mov r4d, %2/2 + pxor xmm0, xmm0 + call r5 + movhlps xmm1, xmm0 + paddw xmm0, xmm1 + movd eax, xmm0 + RET +%endmacro + +%macro SAD_CACHELINE_START_MMX2 4 ; width, height, iterations, cacheline + mov eax, r2m + and eax, 0x17|%1|(%4>>1) + cmp eax, 0x10|%1|(%4>>1) + jle pixel_sad_%1x%2_mmx2 + and eax, 7 + shl eax, 3 + movd mm6, [sw_64] + movd mm7, eax + psubw mm6, mm7 + PROLOGUE 4,5 + and r2, ~7 + mov r4d, %3 + pxor mm0, mm0 +%endmacro + +%macro 
SAD16_CACHELINE_FUNC_MMX2 2 ; height, cacheline +cglobal pixel_sad_16x%1_cache%2_mmx2 + SAD_CACHELINE_START_MMX2 16, %1, %1, %2 +.loop: + movq mm1, [r2] + movq mm2, [r2+8] + movq mm3, [r2+16] + movq mm4, mm2 + psrlq mm1, mm7 + psllq mm2, mm6 + psllq mm3, mm6 + psrlq mm4, mm7 + por mm1, mm2 + por mm3, mm4 + psadbw mm1, [r0] + psadbw mm3, [r0+8] + paddw mm0, mm1 + paddw mm0, mm3 + add r2, r3 + add r0, r1 + dec r4 + jg .loop + movd eax, mm0 + RET +%endmacro + +%macro SAD8_CACHELINE_FUNC_MMX2 2 ; height, cacheline +cglobal pixel_sad_8x%1_cache%2_mmx2 + SAD_CACHELINE_START_MMX2 8, %1, %1/2, %2 +.loop: + movq mm1, [r2+8] + movq mm2, [r2+r3+8] + movq mm3, [r2] + movq mm4, [r2+r3] + psllq mm1, mm6 + psllq mm2, mm6 + psrlq mm3, mm7 + psrlq mm4, mm7 + por mm1, mm3 + por mm2, mm4 + psadbw mm1, [r0] + psadbw mm2, [r0+r1] + paddw mm0, mm1 + paddw mm0, mm2 + lea r2, [r2+2*r3] + lea r0, [r0+2*r1] + dec r4 + jg .loop + movd eax, mm0 + RET +%endmacro + +; sad_x3/x4_cache64: check each mv. +; if they're all within a cacheline, use normal sad_x3/x4. +; otherwise, send them individually to sad_cache64. 
+%macro CHECK_SPLIT 3 ; pix, width, cacheline + mov eax, %1 + and eax, 0x17|%2|(%3>>1) + cmp eax, 0x10|%2|(%3>>1) + jg .split +%endmacro + +%macro SADX3_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name +cglobal pixel_sad_x3_%1x%2_cache%3_%6 + CHECK_SPLIT r1m, %1, %3 + CHECK_SPLIT r2m, %1, %3 + CHECK_SPLIT r3m, %1, %3 + jmp pixel_sad_x3_%1x%2_%4 +.split: +%if ARCH_X86_64 + PROLOGUE 6,9 + push r3 + push r2 +%if WIN64 + movsxd r4, r4d + sub rsp, 40 ; shadow space and alignment +%endif + mov r2, r1 + mov r1, FENC_STRIDE + mov r3, r4 + mov r7, r0 + mov r8, r5 + call pixel_sad_%1x%2_cache%3_%5 + mov [r8], eax +%if WIN64 + mov r2, [rsp+40+0*8] +%else + pop r2 +%endif + mov r0, r7 + call pixel_sad_%1x%2_cache%3_%5 + mov [r8+4], eax +%if WIN64 + mov r2, [rsp+40+1*8] +%else + pop r2 +%endif + mov r0, r7 + call pixel_sad_%1x%2_cache%3_%5 + mov [r8+8], eax +%if WIN64 + add rsp, 40+2*8 +%endif + RET +%else + push edi + mov edi, [esp+28] + push dword [esp+24] + push dword [esp+16] + push dword 16 + push dword [esp+20] + call pixel_sad_%1x%2_cache%3_%5 + mov ecx, [esp+32] + mov [edi], eax + mov [esp+8], ecx + call pixel_sad_%1x%2_cache%3_%5 + mov ecx, [esp+36] + mov [edi+4], eax + mov [esp+8], ecx + call pixel_sad_%1x%2_cache%3_%5 + mov [edi+8], eax + add esp, 16 + pop edi + ret +%endif +%endmacro + +%macro SADX4_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name +cglobal pixel_sad_x4_%1x%2_cache%3_%6 + CHECK_SPLIT r1m, %1, %3 + CHECK_SPLIT r2m, %1, %3 + CHECK_SPLIT r3m, %1, %3 + CHECK_SPLIT r4m, %1, %3 + jmp pixel_sad_x4_%1x%2_%4 +.split: +%if ARCH_X86_64 + PROLOGUE 6,9 + mov r8, r6mp + push r4 + push r3 + push r2 +%if WIN64 + sub rsp, 32 ; shadow space +%endif + mov r2, r1 + mov r1, FENC_STRIDE + mov r3, r5 + mov r7, r0 + call pixel_sad_%1x%2_cache%3_%5 + mov [r8], eax +%if WIN64 + mov r2, [rsp+32+0*8] +%else + pop r2 +%endif + mov r0, r7 + call pixel_sad_%1x%2_cache%3_%5 + mov [r8+4], eax +%if WIN64 + mov r2, [rsp+32+1*8] +%else + 
pop r2 +%endif + mov r0, r7 + call pixel_sad_%1x%2_cache%3_%5 + mov [r8+8], eax +%if WIN64 + mov r2, [rsp+32+2*8] +%else + pop r2 +%endif + mov r0, r7 + call pixel_sad_%1x%2_cache%3_%5 + mov [r8+12], eax +%if WIN64 + add rsp, 32+3*8 +%endif + RET +%else + push edi + mov edi, [esp+32] + push dword [esp+28] + push dword [esp+16] + push dword 16 + push dword [esp+20] + call pixel_sad_%1x%2_cache%3_%5 + mov ecx, [esp+32] + mov [edi], eax + mov [esp+8], ecx + call pixel_sad_%1x%2_cache%3_%5 + mov ecx, [esp+36] + mov [edi+4], eax + mov [esp+8], ecx + call pixel_sad_%1x%2_cache%3_%5 + mov ecx, [esp+40] + mov [edi+8], eax + mov [esp+8], ecx + call pixel_sad_%1x%2_cache%3_%5 + mov [edi+12], eax + add esp, 16 + pop edi + ret +%endif +%endmacro + +%macro SADX34_CACHELINE_FUNC 1+ + SADX3_CACHELINE_FUNC %1 + SADX4_CACHELINE_FUNC %1 +%endmacro + + +; instantiate the aligned sads + +INIT_MMX +%if ARCH_X86_64 == 0 +SAD16_CACHELINE_FUNC_MMX2 8, 32 +SAD16_CACHELINE_FUNC_MMX2 16, 32 +SAD8_CACHELINE_FUNC_MMX2 4, 32 +SAD8_CACHELINE_FUNC_MMX2 8, 32 +SAD8_CACHELINE_FUNC_MMX2 16, 32 +SAD16_CACHELINE_FUNC_MMX2 8, 64 +SAD16_CACHELINE_FUNC_MMX2 16, 64 +%endif ; !ARCH_X86_64 +SAD8_CACHELINE_FUNC_MMX2 4, 64 +SAD8_CACHELINE_FUNC_MMX2 8, 64 +SAD8_CACHELINE_FUNC_MMX2 16, 64 + +%if ARCH_X86_64 == 0 +SADX34_CACHELINE_FUNC 16, 16, 32, mmx2, mmx2, mmx2 +SADX34_CACHELINE_FUNC 16, 8, 32, mmx2, mmx2, mmx2 +SADX34_CACHELINE_FUNC 8, 16, 32, mmx2, mmx2, mmx2 +SADX34_CACHELINE_FUNC 8, 8, 32, mmx2, mmx2, mmx2 +SADX34_CACHELINE_FUNC 16, 16, 64, mmx2, mmx2, mmx2 +SADX34_CACHELINE_FUNC 16, 8, 64, mmx2, mmx2, mmx2 +%endif ; !ARCH_X86_64 +SADX34_CACHELINE_FUNC 8, 16, 64, mmx2, mmx2, mmx2 +SADX34_CACHELINE_FUNC 8, 8, 64, mmx2, mmx2, mmx2 + +%if ARCH_X86_64 == 0 +SAD16_CACHELINE_FUNC sse2, 8 +SAD16_CACHELINE_FUNC sse2, 16 +%assign i 1 +%rep 15 +SAD16_CACHELINE_LOOP_SSE2 i +%assign i i+1 +%endrep +SADX34_CACHELINE_FUNC 16, 16, 64, sse2, sse2, sse2 +SADX34_CACHELINE_FUNC 16, 8, 64, sse2, sse2, sse2 +%endif ; 
!ARCH_X86_64 +SADX34_CACHELINE_FUNC 8, 16, 64, sse2, mmx2, sse2 + +SAD16_CACHELINE_FUNC ssse3, 8 +SAD16_CACHELINE_FUNC ssse3, 16 +%assign i 1 +%rep 15 +SAD16_CACHELINE_LOOP_SSSE3 i +%assign i i+1 +%endrep +SADX34_CACHELINE_FUNC 16, 16, 64, sse2, ssse3, ssse3 +SADX34_CACHELINE_FUNC 16, 8, 64, sse2, ssse3, ssse3 + diff --git a/source/common/x86/sad16-a.asm b/source/common/x86/sad16-a.asm new file mode 100644 index 0000000..20884d4 --- /dev/null +++ b/source/common/x86/sad16-a.asm @@ -0,0 +1,833 @@ +;***************************************************************************** +;* sad16-a.asm: x86 high depth sad functions +;***************************************************************************** +;* Copyright (C) 2010-2013 x264 project +;* +;* Authors: Oskar Arvidsson +;* Henrik Gramner +;* Dnyaneshwar Gorade +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +;* +;* This program is also available under a commercial proprietary license. +;* For more information, contact us at license @ x265.com. 
+;***************************************************************************** + +%include "x86inc.asm" +%include "x86util.asm" + +SECTION .text + +cextern pw_1 + +;============================================================================= +; SAD MMX +;============================================================================= + +%macro SAD_INC_1x16P_MMX 0 + movu m1, [r0+ 0] + movu m2, [r0+ 8] + movu m3, [r0+16] + movu m4, [r0+24] + psubw m1, [r2+ 0] + psubw m2, [r2+ 8] + psubw m3, [r2+16] + psubw m4, [r2+24] + ABSW2 m1, m2, m1, m2, m5, m6 + ABSW2 m3, m4, m3, m4, m7, m5 + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + paddw m1, m2 + paddw m3, m4 + paddw m0, m1 + paddw m0, m3 +%endmacro + +%macro SAD_INC_2x8P_MMX 0 + movu m1, [r0+0] + movu m2, [r0+8] + movu m3, [r0+2*r1+0] + movu m4, [r0+2*r1+8] + psubw m1, [r2+0] + psubw m2, [r2+8] + psubw m3, [r2+2*r3+0] + psubw m4, [r2+2*r3+8] + ABSW2 m1, m2, m1, m2, m5, m6 + ABSW2 m3, m4, m3, m4, m7, m5 + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + paddw m1, m2 + paddw m3, m4 + paddw m0, m1 + paddw m0, m3 +%endmacro + +%macro SAD_INC_2x4P_MMX 0 + movu m1, [r0] + movu m2, [r0+2*r1] + psubw m1, [r2] + psubw m2, [r2+2*r3] + ABSW2 m1, m2, m1, m2, m3, m4 + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + paddw m0, m1 + paddw m0, m2 +%endmacro + +;----------------------------------------------------------------------------- +; int pixel_sad_NxM( uint16_t *, intptr_t, uint16_t *, intptr_t ) +;----------------------------------------------------------------------------- +%macro SAD_MMX 3 +cglobal pixel_sad_%1x%2, 4,5-(%2&4/4) + pxor m0, m0 +%if %2 == 4 + SAD_INC_%3x%1P_MMX + SAD_INC_%3x%1P_MMX +%else + mov r4d, %2/%3 +.loop: + SAD_INC_%3x%1P_MMX + dec r4d + jg .loop +%endif +%if %1*%2 == 256 + HADDUW m0, m1 +%else + HADDW m0, m1 +%endif + movd eax, m0 + RET +%endmacro + +INIT_MMX mmx2 +SAD_MMX 16, 16, 1 +SAD_MMX 16, 8, 1 +SAD_MMX 8, 16, 2 +SAD_MMX 8, 8, 2 +SAD_MMX 8, 4, 2 +SAD_MMX 4, 8, 2 +SAD_MMX 4, 4, 2 +SAD_MMX 4, 16, 2 +INIT_MMX ssse3 +SAD_MMX 4, 
8, 2 +SAD_MMX 4, 4, 2 + +;============================================================================= +; SAD XMM +;============================================================================= + +%macro SAD_1x32 0 + movu m1, [r2+ 0] + movu m2, [r2+16] + movu m3, [r2+32] + movu m4, [r2+48] + psubw m1, [r0+0] + psubw m2, [r0+16] + psubw m3, [r0+32] + psubw m4, [r0+48] + ABSW2 m1, m2, m1, m2, m5, m6 + pmaddwd m1, [pw_1] + pmaddwd m2, [pw_1] + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + ABSW2 m3, m4, m3, m4, m7, m5 + pmaddwd m3, [pw_1] + pmaddwd m4, [pw_1] + paddd m1, m2 + paddd m3, m4 + paddd m0, m1 + paddd m0, m3 +%endmacro + +%macro SAD_1x24 0 + movu m1, [r2+ 0] + movu m2, [r2+16] + movu m3, [r2+32] + psubw m1, [r0+0] + psubw m2, [r0+16] + psubw m3, [r0+32] + ABSW2 m1, m2, m1, m2, m4, m6 + pmaddwd m1, [pw_1] + pmaddwd m2, [pw_1] + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + pxor m4, m4 + psubw m4, m3 + pmaxsw m3, m4 + pmaddwd m3, [pw_1] + paddd m1, m2 + paddd m0, m1 + paddd m0, m3 +%endmacro + +%macro SAD_1x48 0 + movu m1, [r2+ 0] + movu m2, [r2+16] + movu m3, [r2+32] + movu m4, [r2+48] + psubw m1, [r0+0] + psubw m2, [r0+16] + psubw m3, [r0+32] + psubw m4, [r0+48] + ABSW2 m1, m2, m1, m2, m5, m6 + pmaddwd m1, [pw_1] + pmaddwd m2, [pw_1] + ABSW2 m3, m4, m3, m4, m7, m5 + pmaddwd m3, [pw_1] + pmaddwd m4, [pw_1] + paddd m1, m2 + paddd m3, m4 + paddd m0, m1 + paddd m0, m3 + movu m1, [r2+64] + movu m2, [r2+80] + psubw m1, [r0+64] + psubw m2, [r0+80] + ABSW2 m1, m2, m1, m2, m3, m4 + pmaddwd m1, [pw_1] + pmaddwd m2, [pw_1] + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + paddd m0, m1 + paddd m0, m2 +%endmacro + +%macro SAD_1x64 0 + movu m1, [r2+ 0] + movu m2, [r2+16] + movu m3, [r2+32] + movu m4, [r2+48] + psubw m1, [r0+0] + psubw m2, [r0+16] + psubw m3, [r0+32] + psubw m4, [r0+48] + ABSW2 m1, m2, m1, m2, m5, m6 + pmaddwd m1, [pw_1] + pmaddwd m2, [pw_1] + ABSW2 m3, m4, m3, m4, m7, m5 + pmaddwd m3, [pw_1] + pmaddwd m4, [pw_1] + paddd m1, m2 + paddd m3, m4 + paddd m0, m1 + paddd m0, m3 + 
movu m1, [r2+64] + movu m2, [r2+80] + movu m3, [r2+96] + movu m4, [r2+112] + psubw m1, [r0+64] + psubw m2, [r0+80] + psubw m3, [r0+96] + psubw m4, [r0+112] + ABSW2 m1, m2, m1, m2, m5, m6 + pmaddwd m1, [pw_1] + pmaddwd m2, [pw_1] + ABSW2 m3, m4, m3, m4, m7, m5 + pmaddwd m3, [pw_1] + pmaddwd m4, [pw_1] + paddd m1, m2 + paddd m3, m4 + paddd m0, m1 + paddd m0, m3 + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] +%endmacro + +%macro SAD_1x12 0 + movu m1, [r2+0] + movh m2, [r2+16] + psubw m1, [r0+0] + movh m3, [r0+16] + psubw m2, m3 + ABSW2 m1, m2, m1, m2, m4, m6 + pmaddwd m1, [pw_1] + pmaddwd m2, [pw_1] + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + paddd m1, m2 + paddd m0, m1 +%endmacro + +%macro SAD_INC_2ROW 1 +%if 2*%1 > mmsize + movu m1, [r2+ 0] + movu m2, [r2+16] + movu m3, [r2+2*r3+ 0] + movu m4, [r2+2*r3+16] + psubw m1, [r0+ 0] + psubw m2, [r0+16] + psubw m3, [r0+2*r1+ 0] + psubw m4, [r0+2*r1+16] + ABSW2 m1, m2, m1, m2, m5, m6 + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + ABSW2 m3, m4, m3, m4, m7, m5 + paddw m1, m2 + paddw m3, m4 + paddw m3, m1 + pmaddwd m3, [pw_1] + paddd m0, m3 +%else + movu m1, [r2] + movu m2, [r2+2*r3] + psubw m1, [r0] + psubw m2, [r0+2*r1] + ABSW2 m1, m2, m1, m2, m3, m4 + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + paddw m2, m1 + pmaddwd m2, [pw_1] + paddd m0, m2 +%endif +%endmacro + +;----------------------------------------------------------------------------- +; int pixel_sad_NxM( uint16_t *, intptr_t, uint16_t *, intptr_t ) +;----------------------------------------------------------------------------- +%macro SAD 2 +cglobal pixel_sad_%1x%2, 4,5-(%2&4/4),8*(%1/mmsize) + pxor m0, m0 +%if %2 == 4 + SAD_INC_2ROW %1 + SAD_INC_2ROW %1 +%else + mov r4d, %2/2 +.loop: + SAD_INC_2ROW %1 + dec r4d + jg .loop +%endif + + HADDD m0, m1 + movd eax, xm0 + RET +%endmacro + +INIT_XMM sse2 +SAD 16, 4 +SAD 16, 8 +SAD 16, 12 +SAD 16, 16 +SAD 16, 32 +SAD 16, 64 + +INIT_XMM sse2 +SAD 8, 4 +SAD 8, 8 +SAD 8, 16 +SAD 8, 32 + 
+;------------------------------------------------------------------ +; int pixel_sad_32xN( uint16_t *, intptr_t, uint16_t *, intptr_t ) +;------------------------------------------------------------------ +%macro SAD_32 2 +cglobal pixel_sad_%1x%2, 4,5,8 + pxor m0, m0 + mov r4d, %2/4 +.loop: + SAD_1x32 + SAD_1x32 + SAD_1x32 + SAD_1x32 + dec r4d + jnz .loop + + HADDD m0, m1 + movd eax, xm0 + RET +%endmacro + +INIT_XMM sse2 +SAD_32 32, 8 +SAD_32 32, 16 +SAD_32 32, 24 +SAD_32 32, 32 +SAD_32 32, 64 + +;------------------------------------------------------------------ +; int pixel_sad_64xN( uint16_t *, intptr_t, uint16_t *, intptr_t ) +;------------------------------------------------------------------ +%macro SAD_64 2 +cglobal pixel_sad_%1x%2, 4,5,8 + pxor m0, m0 + mov r4d, %2/4 +.loop: + SAD_1x64 + SAD_1x64 + SAD_1x64 + SAD_1x64 + dec r4d + jnz .loop + + HADDD m0, m1 + movd eax, xmm0 + RET +%endmacro + +INIT_XMM sse2 +SAD_64 64, 16 +SAD_64 64, 32 +SAD_64 64, 48 +SAD_64 64, 64 + +;------------------------------------------------------------------ +; int pixel_sad_48xN( uint16_t *, intptr_t, uint16_t *, intptr_t ) +;------------------------------------------------------------------ +%macro SAD_48 2 +cglobal pixel_sad_%1x%2, 4,5,8 + pxor m0, m0 + mov r4d, %2/4 +.loop: + SAD_1x48 + SAD_1x48 + SAD_1x48 + SAD_1x48 + dec r4d + jnz .loop + + HADDD m0, m1 + movd eax, xmm0 + RET +%endmacro + +INIT_XMM sse2 +SAD_48 48, 64 + +;------------------------------------------------------------------ +; int pixel_sad_24xN( uint16_t *, intptr_t, uint16_t *, intptr_t ) +;------------------------------------------------------------------ +%macro SAD_24 2 +cglobal pixel_sad_%1x%2, 4,5,8 + pxor m0, m0 + mov r4d, %2/4 +.loop: + SAD_1x24 + SAD_1x24 + SAD_1x24 + SAD_1x24 + dec r4d + jnz .loop + + HADDD m0, m1 + movd eax, xmm0 + RET +%endmacro + +INIT_XMM sse2 +SAD_24 24, 32 + +;------------------------------------------------------------------ +; int pixel_sad_12xN( uint16_t *, intptr_t, 
uint16_t *, intptr_t ) +;------------------------------------------------------------------ +%macro SAD_12 2 +cglobal pixel_sad_%1x%2, 4,5,8 + pxor m0, m0 + mov r4d, %2/4 +.loop: + SAD_1x12 + SAD_1x12 + SAD_1x12 + SAD_1x12 + dec r4d + jnz .loop + + HADDD m0, m1 + movd eax, xmm0 + RET +%endmacro + +INIT_XMM sse2 +SAD_12 12, 16 + + +;============================================================================= +; SAD x3/x4 +;============================================================================= + +%macro SAD_X3_INC_P 0 + add r0, 4*FENC_STRIDE + lea r1, [r1+4*r4] + lea r2, [r2+4*r4] + lea r3, [r3+4*r4] +%endmacro + +%macro SAD_X3_ONE_START 0 + mova m3, [r0] + movu m0, [r1] + movu m1, [r2] + movu m2, [r3] + psubw m0, m3 + psubw m1, m3 + psubw m2, m3 + ABSW2 m0, m1, m0, m1, m4, m5 + ABSW m2, m2, m6 + pmaddwd m0, [pw_1] + pmaddwd m1, [pw_1] + pmaddwd m2, [pw_1] +%endmacro + +%macro SAD_X3_ONE 2 + mova m6, [r0+%1] + movu m3, [r1+%2] + movu m4, [r2+%2] + movu m5, [r3+%2] + psubw m3, m6 + psubw m4, m6 + psubw m5, m6 + ABSW2 m3, m4, m3, m4, m7, m6 + ABSW m5, m5, m6 + pmaddwd m3, [pw_1] + pmaddwd m4, [pw_1] + pmaddwd m5, [pw_1] + paddd m0, m3 + paddd m1, m4 + paddd m2, m5 +%endmacro + +%macro SAD_X3_END 2 +%if mmsize == 8 && %1*%2 == 256 + HADDUW m0, m3 + HADDUW m1, m4 + HADDUW m2, m5 +%else + HADDD m0, m3 + HADDD m1, m4 + HADDD m2, m5 +%endif +%if UNIX64 + movd [r5+0], xm0 + movd [r5+4], xm1 + movd [r5+8], xm2 +%else + mov r0, r5mp + movd [r0+0], xm0 + movd [r0+4], xm1 + movd [r0+8], xm2 +%endif + RET +%endmacro + +%macro SAD_X4_INC_P 0 + add r0, 4*FENC_STRIDE + lea r1, [r1+4*r5] + lea r2, [r2+4*r5] + lea r3, [r3+4*r5] + lea r4, [r4+4*r5] +%endmacro + +%macro SAD_X4_ONE_START 0 + mova m4, [r0] + movu m0, [r1] + movu m1, [r2] + movu m2, [r3] + movu m3, [r4] + psubw m0, m4 + psubw m1, m4 + psubw m2, m4 + psubw m3, m4 + ABSW2 m0, m1, m0, m1, m5, m6 + ABSW2 m2, m3, m2, m3, m4, m7 + pmaddwd m0, [pw_1] + pmaddwd m1, [pw_1] + pmaddwd m2, [pw_1] + pmaddwd m3, [pw_1] 
+%endmacro + +%macro SAD_X4_ONE 2 + mova m4, [r0+%1] + movu m5, [r1+%2] + movu m6, [r2+%2] +%if num_mmregs > 8 + movu m7, [r3+%2] + movu m8, [r4+%2] + psubw m5, m4 + psubw m6, m4 + psubw m7, m4 + psubw m8, m4 + ABSW2 m5, m6, m5, m6, m9, m10 + ABSW2 m7, m8, m7, m8, m9, m10 + pmaddwd m5, [pw_1] + pmaddwd m6, [pw_1] + pmaddwd m7, [pw_1] + pmaddwd m8, [pw_1] + paddd m0, m5 + paddd m1, m6 + paddd m2, m7 + paddd m3, m8 +%elif cpuflag(ssse3) + movu m7, [r3+%2] + psubw m5, m4 + psubw m6, m4 + psubw m7, m4 + movu m4, [r4+%2] + pabsw m5, m5 + psubw m4, [r0+%1] + pabsw m6, m6 + pabsw m7, m7 + pabsw m4, m4 + pmaddwd m5, [pw_1] + pmaddwd m6, [pw_1] + pmaddwd m7, [pw_1] + pmaddwd m4, [pw_1] + paddd m0, m5 + paddd m1, m6 + paddd m2, m7 + paddd m3, m4 +%else ; num_mmregs == 8 && !ssse3 + psubw m5, m4 + psubw m6, m4 + ABSW m5, m5, m7 + ABSW m6, m6, m7 + pmaddwd m5, [pw_1] + pmaddwd m6, [pw_1] + paddd m0, m5 + paddd m1, m6 + movu m5, [r3+%2] + movu m6, [r4+%2] + psubw m5, m4 + psubw m6, m4 + ABSW2 m5, m6, m5, m6, m7, m4 + pmaddwd m5, [pw_1] + pmaddwd m6, [pw_1] + paddd m2, m5 + paddd m3, m6 +%endif +%endmacro + +%macro SAD_X4_END 2 +%if mmsize == 8 && %1*%2 == 256 + HADDUW m0, m4 + HADDUW m1, m5 + HADDUW m2, m6 + HADDUW m3, m7 +%else + HADDD m0, m4 + HADDD m1, m5 + HADDD m2, m6 + HADDD m3, m7 +%endif + mov r0, r6mp + movd [r0+ 0], xm0 + movd [r0+ 4], xm1 + movd [r0+ 8], xm2 + movd [r0+12], xm3 + RET +%endmacro + +%macro SAD_X_2xNP 4 + %assign x %3 +%rep %4 + SAD_X%1_ONE x*mmsize, x*mmsize + SAD_X%1_ONE 2*FENC_STRIDE+x*mmsize, 2*%2+x*mmsize + %assign x x+1 +%endrep +%endmacro + +%macro PIXEL_VSAD 0 +cglobal pixel_vsad, 3,3,8 + mova m0, [r0] + mova m1, [r0+16] + mova m2, [r0+2*r1] + mova m3, [r0+2*r1+16] + lea r0, [r0+4*r1] + psubw m0, m2 + psubw m1, m3 + ABSW2 m0, m1, m0, m1, m4, m5 + paddw m0, m1 + sub r2d, 2 + je .end +.loop: + mova m4, [r0] + mova m5, [r0+16] + mova m6, [r0+2*r1] + mova m7, [r0+2*r1+16] + lea r0, [r0+4*r1] + psubw m2, m4 + psubw m3, m5 + psubw m4, m6 + psubw m5, 
m7 + ABSW m2, m2, m1 + ABSW m3, m3, m1 + ABSW m4, m4, m1 + ABSW m5, m5, m1 + paddw m0, m2 + paddw m0, m3 + paddw m0, m4 + paddw m0, m5 + mova m2, m6 + mova m3, m7 + sub r2d, 2 + jg .loop +.end: +%if BIT_DEPTH == 9 + HADDW m0, m1 ; max sum: 62(pixel diffs)*511(pixel_max)=31682 +%else + HADDUW m0, m1 ; max sum: 62(pixel diffs)*1023(pixel_max)=63426 +%endif + movd eax, m0 + RET +%endmacro +INIT_XMM sse2 +PIXEL_VSAD +INIT_XMM ssse3 +PIXEL_VSAD +INIT_XMM xop +PIXEL_VSAD + +INIT_YMM avx2 +cglobal pixel_vsad, 3,3 + mova m0, [r0] + mova m1, [r0+2*r1] + lea r0, [r0+4*r1] + psubw m0, m1 + pabsw m0, m0 + sub r2d, 2 + je .end +.loop: + mova m2, [r0] + mova m3, [r0+2*r1] + lea r0, [r0+4*r1] + psubw m1, m2 + psubw m2, m3 + pabsw m1, m1 + pabsw m2, m2 + paddw m0, m1 + paddw m0, m2 + mova m1, m3 + sub r2d, 2 + jg .loop +.end: +%if BIT_DEPTH == 9 + HADDW m0, m1 +%else + HADDUW m0, m1 +%endif + movd eax, xm0 + RET + +;----------------------------------------------------------------------------- +; void pixel_sad_xN_WxH( uint16_t *fenc, uint16_t *pix0, uint16_t *pix1, +; uint16_t *pix2, intptr_t i_stride, int scores[3] ) +;----------------------------------------------------------------------------- +%macro SAD_X 3 +cglobal pixel_sad_x%1_%2x%3, 6,7,XMM_REGS + %assign regnum %1+1 + %xdefine STRIDE r %+ regnum + mov r6, %3/2-1 + SAD_X%1_ONE_START + SAD_X%1_ONE 2*FENC_STRIDE, 2*STRIDE + SAD_X_2xNP %1, STRIDE, 1, %2/(mmsize/2)-1 +.loop: + SAD_X%1_INC_P + SAD_X_2xNP %1, STRIDE, 0, %2/(mmsize/2) + dec r6 + jg .loop +%if %1 == 4 + mov r6, r6m +%endif + SAD_X%1_END %2, %3 +%endmacro + +INIT_MMX mmx2 +%define XMM_REGS 0 +SAD_X 3, 16, 16 +SAD_X 3, 16, 8 +SAD_X 3, 12, 16 +SAD_X 3, 8, 16 +SAD_X 3, 8, 8 +SAD_X 3, 8, 4 +SAD_X 3, 4, 16 +SAD_X 3, 4, 8 +SAD_X 3, 4, 4 +SAD_X 4, 16, 16 +SAD_X 4, 16, 8 +SAD_X 4, 12, 16 +SAD_X 4, 8, 16 +SAD_X 4, 8, 8 +SAD_X 4, 8, 4 +SAD_X 4, 4, 16 +SAD_X 4, 4, 8 +SAD_X 4, 4, 4 +INIT_MMX ssse3 +SAD_X 3, 4, 8 +SAD_X 3, 4, 4 +SAD_X 4, 4, 8 +SAD_X 4, 4, 4 +INIT_XMM ssse3 
+%define XMM_REGS 7 +SAD_X 3, 16, 16 +SAD_X 3, 16, 8 +SAD_X 3, 8, 16 +SAD_X 3, 8, 8 +SAD_X 3, 8, 4 +%define XMM_REGS 9 +SAD_X 4, 16, 16 +SAD_X 4, 16, 8 +SAD_X 4, 8, 16 +SAD_X 4, 8, 8 +SAD_X 4, 8, 4 +INIT_XMM sse2 +%define XMM_REGS 8 +SAD_X 3, 64, 64 +SAD_X 3, 64, 48 +SAD_X 3, 64, 32 +SAD_X 3, 64, 16 +SAD_X 3, 48, 64 +SAD_X 3, 32, 64 +SAD_X 3, 32, 32 +SAD_X 3, 32, 24 +SAD_X 3, 32, 16 +SAD_X 3, 32, 8 +SAD_X 3, 24, 32 +SAD_X 3, 16, 64 +SAD_X 3, 16, 32 +SAD_X 3, 16, 16 +SAD_X 3, 16, 12 +SAD_X 3, 16, 8 +SAD_X 3, 16, 4 +SAD_X 3, 8, 32 +SAD_X 3, 8, 16 +SAD_X 3, 8, 8 +SAD_X 3, 8, 4 +%define XMM_REGS 11 +SAD_X 4, 64, 64 +SAD_X 4, 64, 48 +SAD_X 4, 64, 32 +SAD_X 4, 64, 16 +SAD_X 4, 48, 64 +SAD_X 4, 32, 64 +SAD_X 4, 32, 32 +SAD_X 4, 32, 24 +SAD_X 4, 32, 16 +SAD_X 4, 32, 8 +SAD_X 4, 24, 32 +SAD_X 4, 16, 64 +SAD_X 4, 16, 32 +SAD_X 4, 16, 16 +SAD_X 4, 16, 12 +SAD_X 4, 16, 8 +SAD_X 4, 16, 4 +SAD_X 4, 8, 32 +SAD_X 4, 8, 16 +SAD_X 4, 8, 8 +SAD_X 4, 8, 4 +INIT_YMM avx2 +%define XMM_REGS 7 +SAD_X 3, 16, 16 +SAD_X 3, 16, 8 +%define XMM_REGS 9 +SAD_X 4, 16, 16 +SAD_X 4, 16, 8 + diff --git a/source/common/x86/ssd-a.asm b/source/common/x86/ssd-a.asm new file mode 100644 index 0000000..03c4c9c --- /dev/null +++ b/source/common/x86/ssd-a.asm @@ -0,0 +1,2595 @@ +;***************************************************************************** +;* ssd-a.asm: x86 ssd functions +;***************************************************************************** +;* Copyright (C) 2003-2013 x264 project +;* +;* Authors: Loren Merritt +;* Fiona Glaser +;* Laurent Aimar +;* Alex Izvorski +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. 
+;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +;* +;* This program is also available under a commercial proprietary license. +;* For more information, contact us at license @ x265.com. +;***************************************************************************** + +%include "x86inc.asm" +%include "x86util.asm" + +SECTION_RODATA 32 + +SECTION .text + +cextern pw_00ff +cextern hsub_mul + +;============================================================================= +; SSD +;============================================================================= + +%if HIGH_BIT_DEPTH +;----------------------------------------------------------------------------- +; int pixel_ssd_WxH( uint16_t *, intptr_t, uint16_t *, intptr_t ) +;----------------------------------------------------------------------------- +%macro SSD_ONE 2 +cglobal pixel_ssd_ss_%1x%2, 4,7,8 + FIX_STRIDES r1, r3 +%if mmsize == %1*2 + %define offset0_1 r1 + %define offset0_2 r1*2 + %define offset0_3 r5 + %define offset1_1 r3 + %define offset1_2 r3*2 + %define offset1_3 r6 + lea r5, [3*r1] + lea r6, [3*r3] +%elif mmsize == %1 + %define offset0_1 mmsize + %define offset0_2 r1 + %define offset0_3 r1+mmsize + %define offset1_1 mmsize + %define offset1_2 r3 + %define offset1_3 r3+mmsize +%elif mmsize == %1/2 + %define offset0_1 mmsize + %define offset0_2 mmsize*2 + %define offset0_3 mmsize*3 + %define offset1_1 mmsize + %define offset1_2 mmsize*2 + %define offset1_3 mmsize*3 +%endif + %assign %%n %2/(2*mmsize/%1) +%if %%n > 1 + mov r4d, %%n +%endif + pxor m0, m0 +.loop: + movu m1, [r0] + movu m2, 
[r0+offset0_1] + movu m3, [r0+offset0_2] + movu m4, [r0+offset0_3] + movu m6, [r2] + movu m7, [r2+offset1_1] + psubw m1, m6 + psubw m2, m7 + movu m6, [r2+offset1_2] + movu m7, [r2+offset1_3] + psubw m3, m6 + psubw m4, m7 +%if %%n > 1 + lea r0, [r0+r1*(%2/%%n)] + lea r2, [r2+r3*(%2/%%n)] +%endif + pmaddwd m1, m1 + pmaddwd m2, m2 + pmaddwd m3, m3 + pmaddwd m4, m4 + paddd m1, m2 + paddd m3, m4 + paddd m0, m1 + paddd m0, m3 +%if %%n > 1 + dec r4d + jg .loop +%endif + HADDD m0, m5 + movd eax, xm0 +%ifidn movu,movq ; detect MMX + EMMS +%endif + RET +%endmacro + +%macro SSD_TWO 2 +cglobal pixel_ssd_ss_%1x%2, 4,7,8 + FIX_STRIDES r1, r3 + pxor m0, m0 + mov r4d, %2/2 + lea r5, [r1 * 2] + lea r6, [r3 * 2] +.loop: + movu m1, [r0] + movu m2, [r0 + 16] + movu m3, [r0 + 32] + movu m4, [r0 + 48] + movu m6, [r2] + movu m7, [r2 + 16] + psubw m1, m6 + psubw m2, m7 + movu m6, [r2 + 32] + movu m7, [r2 + 48] + psubw m3, m6 + psubw m4, m7 + pmaddwd m1, m1 + pmaddwd m2, m2 + pmaddwd m3, m3 + pmaddwd m4, m4 + paddd m1, m2 + paddd m3, m4 + paddd m0, m1 + paddd m0, m3 + movu m1, [r0 + 64] + movu m2, [r0 + 80] + movu m6, [r2 + 64] + movu m7, [r2 + 80] + psubw m1, m6 + psubw m2, m7 + pmaddwd m1, m1 + pmaddwd m2, m2 + paddd m1, m2 + paddd m0, m1 +%if %1 == 64 + movu m3, [r0 + 96] + movu m4, [r0 + 112] + movu m6, [r2 + 96] + movu m7, [r2 + 112] + psubw m3, m6 + psubw m4, m7 + pmaddwd m3, m3 + pmaddwd m4, m4 + paddd m3, m4 + paddd m0, m3 +%endif + movu m1, [r0 + r1] + movu m2, [r0 + r1 + 16] + movu m3, [r0 + r1 + 32] + movu m4, [r0 + r1 + 48] + movu m6, [r2 + r3] + movu m7, [r2 + r3 + 16] + psubw m1, m6 + psubw m2, m7 + movu m6, [r2 + r3 + 32] + movu m7, [r2 + r3 + 48] + psubw m3, m6 + psubw m4, m7 + pmaddwd m1, m1 + pmaddwd m2, m2 + pmaddwd m3, m3 + pmaddwd m4, m4 + paddd m1, m2 + paddd m3, m4 + paddd m0, m1 + paddd m0, m3 + movu m1, [r0 + r1 + 64] + movu m2, [r0 + r1 + 80] + movu m6, [r2 + r3 + 64] + movu m7, [r2 + r3 + 80] + psubw m1, m6 + psubw m2, m7 + pmaddwd m1, m1 + pmaddwd m2, m2 + paddd 
m1, m2 + paddd m0, m1 +%if %1 == 64 + movu m3, [r0 + r1 + 96] + movu m4, [r0 + r1 + 112] + movu m6, [r2 + r3 + 96] + movu m7, [r2 + r3 + 112] + psubw m3, m6 + psubw m4, m7 + pmaddwd m3, m3 + pmaddwd m4, m4 + paddd m3, m4 + paddd m0, m3 +%endif + lea r0, [r0 + r5] + lea r2, [r2 + r6] + dec r4d + jnz .loop + HADDD m0, m5 + movd eax, xm0 + RET +%endmacro +%macro SSD_24 2 +cglobal pixel_ssd_ss_%1x%2, 4,7,8 + FIX_STRIDES r1, r3 + pxor m0, m0 + mov r4d, %2/2 + lea r5, [r1 * 2] + lea r6, [r3 * 2] +.loop: + movu m1, [r0] + movu m2, [r0 + 16] + movu m3, [r0 + 32] + movu m5, [r2] + movu m6, [r2 + 16] + movu m7, [r2 + 32] + psubw m1, m5 + psubw m2, m6 + psubw m3, m7 + pmaddwd m1, m1 + pmaddwd m2, m2 + pmaddwd m3, m3 + paddd m1, m2 + paddd m0, m1 + movu m1, [r0 + r1] + movu m2, [r0 + r1 + 16] + movu m4, [r0 + r1 + 32] + movu m5, [r2 + r3] + movu m6, [r2 + r3 + 16] + movu m7, [r2 + r3 + 32] + psubw m1, m5 + psubw m2, m6 + psubw m4, m7 + pmaddwd m1, m1 + pmaddwd m2, m2 + pmaddwd m4, m4 + paddd m1, m2 + paddd m3, m4 + paddd m0, m1 + paddd m0, m3 + lea r0, [r0 + r5] + lea r2, [r2 + r6] + dec r4d + jnz .loop + HADDD m0, m5 + movd eax, xm0 + RET +%endmacro +%macro SSD_12 2 +cglobal pixel_ssd_ss_%1x%2, 4,7,8 + FIX_STRIDES r1, r3 + pxor m0, m0 + mov r4d, %2/4 + lea r5, [r1 * 2] + lea r6, [r3 * 2] +.loop: + movu m1, [r0] + movh m2, [r0 + 16] + movu m3, [r0 + r1] + punpcklqdq m2, [r0 + r1 + 16] + movu m7, [r2] + psubw m1, m7 + movh m4, [r2 + 16] + movu m7, [r2 + r3] + psubw m3, m7 + punpcklqdq m4, [r2 + r3 + 16] + psubw m2, m4 + pmaddwd m1, m1 + pmaddwd m2, m2 + pmaddwd m3, m3 + paddd m1, m2 + paddd m0, m1 + + movu m1, [r0 + r5] + movh m2, [r0 + r5 + 16] + lea r0, [r0 + r5] + movu m6, [r0 + r1] + punpcklqdq m2, [r0 + r1 + 16] + movu m7, [r2 + r6] + psubw m1, m7 + movh m4, [r2 + r6 + 16] + lea r2, [r2 + r6] + movu m7, [r2 + r3] + psubw m6, m7 + punpcklqdq m4, [r2 + r3 + 16] + psubw m2, m4 + pmaddwd m1, m1 + pmaddwd m2, m2 + pmaddwd m6, m6 + paddd m1, m2 + paddd m3, m6 + paddd m0, m1 + 
paddd m0, m3 + lea r0, [r0 + r5] + lea r2, [r2 + r6] + dec r4d + jnz .loop + HADDD m0, m5 + movd eax, xm0 + RET +%endmacro +INIT_MMX mmx2 +SSD_ONE 4, 4 +SSD_ONE 4, 8 +SSD_ONE 4, 16 +SSD_ONE 8, 4 +SSD_ONE 8, 8 +SSD_ONE 8, 16 +SSD_ONE 16, 8 +SSD_ONE 16, 16 +INIT_XMM sse2 +SSD_ONE 8, 4 +SSD_ONE 8, 8 +SSD_ONE 8, 16 +SSD_ONE 8, 32 +SSD_12 12, 16 +SSD_ONE 16, 4 +SSD_ONE 16, 8 +SSD_ONE 16, 12 +SSD_ONE 16, 16 +SSD_ONE 16, 32 +SSD_ONE 16, 64 +SSD_24 24, 32 +SSD_ONE 32, 8 +SSD_ONE 32, 16 +SSD_ONE 32, 24 +SSD_ONE 32, 32 +SSD_ONE 32, 64 +SSD_TWO 48, 64 +SSD_TWO 64, 16 +SSD_TWO 64, 32 +SSD_TWO 64, 48 +SSD_TWO 64, 64 +INIT_YMM avx2 +SSD_ONE 16, 8 +SSD_ONE 16, 16 +%endif ; HIGH_BIT_DEPTH + +;----------------------------------------------------------------------------- +; int pixel_ssd_WxH( uint16_t *, intptr_t, uint16_t *, intptr_t ) +;----------------------------------------------------------------------------- +%if HIGH_BIT_DEPTH == 0 +%macro SSD_SS 2 +cglobal pixel_ssd_ss_%1x%2, 4,7,6 + FIX_STRIDES r1, r3 +%if mmsize == %1*4 || mmsize == %1*2 + %define offset0_1 r1*2 + %define offset0_2 r1*4 + %define offset0_3 r5 + %define offset1_1 r3*2 + %define offset1_2 r3*4 + %define offset1_3 r6 + lea r5, [4*r1] + lea r6, [4*r3] + lea r5, [r5 + 2*r1] + lea r6, [r6 + 2*r3] +%elif mmsize == %1 + %define offset0_1 16 + %define offset0_2 r1*2 + %define offset0_3 r1*2+16 + %define offset1_1 16 + %define offset1_2 r3*2 + %define offset1_3 r3*2+16 +%endif +%if %1 == 4 + %assign %%n %2/(mmsize/%1) +%else + %assign %%n %2/(2*mmsize/%1) +%endif +%if %%n > 1 + mov r4d, %%n +%endif + pxor m0, m0 +.loop: +%if %1 == 4 + movh m1, [r0] + movh m2, [r2] + psubw m1, m2 + pmaddwd m1, m1 + paddd m0, m1 + movh m1, [r0 + offset0_1] + movh m2, [r2 + offset1_1] + psubw m1, m2 + pmaddwd m1, m1 + paddd m0, m1 + movh m1, [r0 + offset0_2] + movh m2, [r2 + offset1_2] + psubw m1, m2 + pmaddwd m1, m1 + paddd m0, m1 + movh m1, [r0 + offset0_3] + movh m2, [r2 + offset1_3] + psubw m1, m2 + pmaddwd m1, m1 + paddd m0, m1 
+%else + movu m1, [r0] + movu m2, [r2] + psubw m1, m2 + pmaddwd m1, m1 + paddd m0, m1 + movu m1, [r0 + offset0_1] + movu m2, [r2 + offset1_1] + psubw m1, m2 + pmaddwd m1, m1 + paddd m0, m1 + movu m1, [r0 + offset0_2] + movu m2, [r2 + offset1_2] + psubw m1, m2 + pmaddwd m1, m1 + paddd m0, m1 + movu m1, [r0 + offset0_3] + movu m2, [r2 + offset1_3] + psubw m1, m2 + pmaddwd m1, m1 + paddd m0, m1 +%endif + lea r0, [r0+r1*(%2/%%n)*2] + lea r2, [r2+r3*(%2/%%n)*2] +%if %%n > 1 + dec r4d + jg .loop +%endif +%if %1 == 4 + %if notcpuflag(ssse3) + pshufd m1, m0, 1 + paddd m0, m1 + %else + phaddd m0, m0 + %endif +%else + HADDD m0, m1 +%endif + movd eax, m0 + RET +%endmacro +%macro SSD_SS_ONE 0 +SSD_SS 4, 4 +SSD_SS 4, 8 +SSD_SS 4, 16 +SSD_SS 8, 4 +SSD_SS 8, 8 +SSD_SS 8, 16 +SSD_SS 8, 32 +SSD_SS 16, 4 +SSD_SS 16, 8 +SSD_SS 16, 12 +SSD_SS 16, 16 +SSD_SS 16, 32 +SSD_SS 16, 64 +%endmacro + +%macro SSD_SS_12x16 0 +cglobal pixel_ssd_ss_12x16, 4,7,6 + FIX_STRIDES r1, r3 + mov r4d, 8 + pxor m0, m0 +.loop: + movu m1, [r0] + movu m2, [r2] + psubw m1, m2 + pmaddwd m1, m1 + paddd m0, m1 + movu m1, [r0 + 16] + movu m2, [r2 + 16] + psubw m1, m2 + pmaddwd m1, m1 + pslldq m1, 8 + psrldq m1, 8 + paddd m0, m1 + lea r0, [r0 + 2*r1] + lea r2, [r2 + 2*r3] + movu m1, [r0] + movu m2, [r2] + psubw m1, m2 + pmaddwd m1, m1 + paddd m0, m1 + movu m1, [r0 + 16] + movu m2, [r2 + 16] + psubw m1, m2 + pmaddwd m1, m1 + pslldq m1, 8 + psrldq m1, 8 + paddd m0, m1 + lea r0, [r0 + 2*r1] + lea r2, [r2 + 2*r3] + dec r4d + jnz .loop + HADDD m0, m1 + movd eax, m0 + RET +%endmacro + +%macro SSD_SS_32 1 +cglobal pixel_ssd_ss_32x%1, 4,7,6 + FIX_STRIDES r1, r3 + mov r4d, %1/2 + pxor m0, m0 +.loop: + movu m1, [r0] + movu m2, [r2] + psubw m1, m2 + pmaddwd m1, m1 + paddd m0, m1 + movu m1, [r0 + 16] + movu m2, [r2 + 16] + psubw m1, m2 + pmaddwd m1, m1 + paddd m0, m1 + movu m1, [r0 + 32] + movu m2, [r2 + 32] + psubw m1, m2 + pmaddwd m1, m1 + paddd m0, m1 + movu m1, [r0 + 48] + movu m2, [r2 + 48] + psubw m1, m2 + pmaddwd m1, m1 
+ paddd m0, m1 + lea r0, [r0 + 2*r1] + lea r2, [r2 + 2*r3] + movu m1, [r0] + movu m2, [r2] + psubw m1, m2 + pmaddwd m1, m1 + paddd m0, m1 + movu m1, [r0 + 16] + movu m2, [r2 + 16] + psubw m1, m2 + pmaddwd m1, m1 + paddd m0, m1 + movu m1, [r0 + 32] + movu m2, [r2 + 32] + psubw m1, m2 + pmaddwd m1, m1 + paddd m0, m1 + movu m1, [r0 + 48] + movu m2, [r2 + 48] + psubw m1, m2 + pmaddwd m1, m1 + paddd m0, m1 + lea r0, [r0 + 2*r1] + lea r2, [r2 + 2*r3] + dec r4d + jnz .loop + HADDD m0, m1 + movd eax, m0 + RET +%endmacro + +%macro SSD_SS_32xN 0 +SSD_SS_32 8 +SSD_SS_32 16 +SSD_SS_32 24 +SSD_SS_32 32 +SSD_SS_32 64 +%endmacro + +%macro SSD_SS_24 0 +cglobal pixel_ssd_ss_24x32, 4,7,6 + FIX_STRIDES r1, r3 + mov r4d, 16 + pxor m0, m0 +.loop: + movu m1, [r0] + movu m2, [r2] + psubw m1, m2 + pmaddwd m1, m1 + paddd m0, m1 + movu m1, [r0 + 16] + movu m2, [r2 + 16] + psubw m1, m2 + pmaddwd m1, m1 + paddd m0, m1 + movu m1, [r0 + 32] + movu m2, [r2 + 32] + psubw m1, m2 + pmaddwd m1, m1 + paddd m0, m1 + lea r0, [r0 + 2*r1] + lea r2, [r2 + 2*r3] + movu m1, [r0] + movu m2, [r2] + psubw m1, m2 + pmaddwd m1, m1 + paddd m0, m1 + movu m1, [r0 + 16] + movu m2, [r2 + 16] + psubw m1, m2 + pmaddwd m1, m1 + paddd m0, m1 + movu m1, [r0 + 32] + movu m2, [r2 + 32] + psubw m1, m2 + pmaddwd m1, m1 + paddd m0, m1 + lea r0, [r0 + 2*r1] + lea r2, [r2 + 2*r3] + dec r4d + jnz .loop + HADDD m0, m1 + movd eax, m0 + RET +%endmacro + +%macro SSD_SS_48 0 +cglobal pixel_ssd_ss_48x64, 4,7,6 + FIX_STRIDES r1, r3 + mov r4d, 32 + pxor m0, m0 +.loop: + movu m1, [r0] + movu m2, [r2] + psubw m1, m2 + pmaddwd m1, m1 + paddd m0, m1 + movu m1, [r0 + 16] + movu m2, [r2 + 16] + psubw m1, m2 + pmaddwd m1, m1 + paddd m0, m1 + movu m1, [r0 + 32] + movu m2, [r2 + 32] + psubw m1, m2 + pmaddwd m1, m1 + paddd m0, m1 + movu m1, [r0 + 48] + movu m2, [r2 + 48] + psubw m1, m2 + pmaddwd m1, m1 + paddd m0, m1 + movu m1, [r0 + 64] + movu m2, [r2 + 64] + psubw m1, m2 + pmaddwd m1, m1 + paddd m0, m1 + movu m1, [r0 + 80] + movu m2, [r2 + 80] + 
psubw m1, m2 + pmaddwd m1, m1 + paddd m0, m1 + lea r0, [r0 + 2*r1] + lea r2, [r2 + 2*r3] + movu m1, [r0] + movu m2, [r2] + psubw m1, m2 + pmaddwd m1, m1 + paddd m0, m1 + movu m1, [r0 + 16] + movu m2, [r2 + 16] + psubw m1, m2 + pmaddwd m1, m1 + paddd m0, m1 + movu m1, [r0 + 32] + movu m2, [r2 + 32] + psubw m1, m2 + pmaddwd m1, m1 + paddd m0, m1 + movu m1, [r0 + 48] + movu m2, [r2 + 48] + psubw m1, m2 + pmaddwd m1, m1 + paddd m0, m1 + movu m1, [r0 + 64] + movu m2, [r2 + 64] + psubw m1, m2 + pmaddwd m1, m1 + paddd m0, m1 + movu m1, [r0 + 80] + movu m2, [r2 + 80] + psubw m1, m2 + pmaddwd m1, m1 + paddd m0, m1 + lea r0, [r0 + 2*r1] + lea r2, [r2 + 2*r3] + dec r4d + jnz .loop + HADDD m0, m1 + movd eax, m0 + RET +%endmacro + +%macro SSD_SS_64 1 +cglobal pixel_ssd_ss_64x%1, 4,7,6 + FIX_STRIDES r1, r3 + mov r4d, %1/2 + pxor m0, m0 +.loop: + movu m1, [r0] + movu m2, [r2] + psubw m1, m2 + pmaddwd m1, m1 + paddd m0, m1 + movu m1, [r0 + 16] + movu m2, [r2 + 16] + psubw m1, m2 + pmaddwd m1, m1 + paddd m0, m1 + movu m1, [r0 + 32] + movu m2, [r2 + 32] + psubw m1, m2 + pmaddwd m1, m1 + paddd m0, m1 + movu m1, [r0 + 48] + movu m2, [r2 + 48] + psubw m1, m2 + pmaddwd m1, m1 + paddd m0, m1 + movu m1, [r0 + 64] + movu m2, [r2 + 64] + psubw m1, m2 + pmaddwd m1, m1 + paddd m0, m1 + movu m1, [r0 + 80] + movu m2, [r2 + 80] + psubw m1, m2 + pmaddwd m1, m1 + paddd m0, m1 + movu m1, [r0 + 96] + movu m2, [r2 + 96] + psubw m1, m2 + pmaddwd m1, m1 + paddd m0, m1 + movu m1, [r0 + 112] + movu m2, [r2 + 112] + psubw m1, m2 + pmaddwd m1, m1 + paddd m0, m1 + lea r0, [r0 + 2*r1] + lea r2, [r2 + 2*r3] + movu m1, [r0] + movu m2, [r2] + psubw m1, m2 + pmaddwd m1, m1 + paddd m0, m1 + movu m1, [r0 + 16] + movu m2, [r2 + 16] + psubw m1, m2 + pmaddwd m1, m1 + paddd m0, m1 + movu m1, [r0 + 32] + movu m2, [r2 + 32] + psubw m1, m2 + pmaddwd m1, m1 + paddd m0, m1 + movu m1, [r0 + 48] + movu m2, [r2 + 48] + psubw m1, m2 + pmaddwd m1, m1 + paddd m0, m1 + movu m1, [r0 + 64] + movu m2, [r2 + 64] + psubw m1, m2 + 
pmaddwd m1, m1 + paddd m0, m1 + movu m1, [r0 + 80] + movu m2, [r2 + 80] + psubw m1, m2 + pmaddwd m1, m1 + paddd m0, m1 + movu m1, [r0 + 96] + movu m2, [r2 + 96] + psubw m1, m2 + pmaddwd m1, m1 + paddd m0, m1 + movu m1, [r0 + 112] + movu m2, [r2 + 112] + psubw m1, m2 + pmaddwd m1, m1 + paddd m0, m1 + lea r0, [r0 + 2*r1] + lea r2, [r2 + 2*r3] + dec r4d + jnz .loop + HADDD m0, m1 + movd eax, m0 + RET +%endmacro + +%macro SSD_SS_64xN 0 +SSD_SS_64 16 +SSD_SS_64 32 +SSD_SS_64 48 +SSD_SS_64 64 +%endmacro + +INIT_XMM sse2 +SSD_SS_ONE +SSD_SS_12x16 +SSD_SS_24 +SSD_SS_32xN +SSD_SS_48 +SSD_SS_64xN +INIT_XMM sse4 +SSD_SS_ONE +SSD_SS_12x16 +SSD_SS_24 +SSD_SS_32xN +SSD_SS_48 +SSD_SS_64xN +INIT_XMM avx +SSD_SS_ONE +SSD_SS_12x16 +SSD_SS_24 +SSD_SS_32xN +SSD_SS_48 +SSD_SS_64xN +%endif ; !HIGH_BIT_DEPTH + +%if HIGH_BIT_DEPTH == 0 +%macro SSD_LOAD_FULL 5 + mova m1, [t0+%1] + mova m2, [t2+%2] + mova m3, [t0+%3] + mova m4, [t2+%4] +%if %5==1 + add t0, t1 + add t2, t3 +%elif %5==2 + lea t0, [t0+2*t1] + lea t2, [t2+2*t3] +%endif +%endmacro + +%macro LOAD 5 + movh m%1, %3 + movh m%2, %4 +%if %5 + lea t0, [t0+2*t1] +%endif +%endmacro + +%macro JOIN 7 + movh m%3, %5 + movh m%4, %6 +%if %7 + lea t2, [t2+2*t3] +%endif + punpcklbw m%1, m7 + punpcklbw m%3, m7 + psubw m%1, m%3 + punpcklbw m%2, m7 + punpcklbw m%4, m7 + psubw m%2, m%4 +%endmacro + +%macro JOIN_SSE2 7 + movh m%3, %5 + movh m%4, %6 +%if %7 + lea t2, [t2+2*t3] +%endif + punpcklqdq m%1, m%2 + punpcklqdq m%3, m%4 + DEINTB %2, %1, %4, %3, 7 + psubw m%2, m%4 + psubw m%1, m%3 +%endmacro + +%macro JOIN_SSSE3 7 + movh m%3, %5 + movh m%4, %6 +%if %7 + lea t2, [t2+2*t3] +%endif + punpcklbw m%1, m%3 + punpcklbw m%2, m%4 +%endmacro + +%macro LOAD_AVX2 5 + mova xm%1, %3 + vinserti128 m%1, m%1, %4, 1 +%if %5 + lea t0, [t0+2*t1] +%endif +%endmacro + +%macro JOIN_AVX2 7 + mova xm%2, %5 + vinserti128 m%2, m%2, %6, 1 +%if %7 + lea t2, [t2+2*t3] +%endif + SBUTTERFLY bw, %1, %2, %3 +%endmacro + +%macro SSD_LOAD_HALF 5 + LOAD 1, 2, [t0+%1], [t0+%3], 1 + 
JOIN 1, 2, 3, 4, [t2+%2], [t2+%4], 1 + LOAD 3, 4, [t0+%1], [t0+%3], %5 + JOIN 3, 4, 5, 6, [t2+%2], [t2+%4], %5 +%endmacro + +%macro SSD_CORE 7-8 +%ifidn %8, FULL + mova m%6, m%2 + mova m%7, m%4 + psubusb m%2, m%1 + psubusb m%4, m%3 + psubusb m%1, m%6 + psubusb m%3, m%7 + por m%1, m%2 + por m%3, m%4 + punpcklbw m%2, m%1, m%5 + punpckhbw m%1, m%5 + punpcklbw m%4, m%3, m%5 + punpckhbw m%3, m%5 +%endif + pmaddwd m%1, m%1 + pmaddwd m%2, m%2 + pmaddwd m%3, m%3 + pmaddwd m%4, m%4 +%endmacro + +%macro SSD_CORE_SSE2 7-8 +%ifidn %8, FULL + DEINTB %6, %1, %7, %2, %5 + psubw m%6, m%7 + psubw m%1, m%2 + SWAP %6, %2, %1 + DEINTB %6, %3, %7, %4, %5 + psubw m%6, m%7 + psubw m%3, m%4 + SWAP %6, %4, %3 +%endif + pmaddwd m%1, m%1 + pmaddwd m%2, m%2 + pmaddwd m%3, m%3 + pmaddwd m%4, m%4 +%endmacro + +%macro SSD_CORE_SSSE3 7-8 +%ifidn %8, FULL + punpckhbw m%6, m%1, m%2 + punpckhbw m%7, m%3, m%4 + punpcklbw m%1, m%2 + punpcklbw m%3, m%4 + SWAP %6, %2, %3 + SWAP %7, %4 +%endif + pmaddubsw m%1, m%5 + pmaddubsw m%2, m%5 + pmaddubsw m%3, m%5 + pmaddubsw m%4, m%5 + pmaddwd m%1, m%1 + pmaddwd m%2, m%2 + pmaddwd m%3, m%3 + pmaddwd m%4, m%4 +%endmacro + +%macro SSD_ITER 6 + SSD_LOAD_%1 %2,%3,%4,%5,%6 + SSD_CORE 1, 2, 3, 4, 7, 5, 6, %1 + paddd m1, m2 + paddd m3, m4 + paddd m0, m1 + paddd m0, m3 +%endmacro + +;----------------------------------------------------------------------------- +; int pixel_ssd_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +%macro SSD 2 +%if %1 != %2 + %assign function_align 8 +%else + %assign function_align 16 +%endif +cglobal pixel_ssd_%1x%2, 0,0,0 + mov al, %1*%2/mmsize/2 + +%if %1 != %2 + jmp mangle(x265_pixel_ssd_%1x%1 %+ SUFFIX %+ .startloop) +%else + +.startloop: +%if ARCH_X86_64 + DECLARE_REG_TMP 0,1,2,3 + PROLOGUE 0,0,8 +%else + PROLOGUE 0,5 + DECLARE_REG_TMP 1,2,3,4 + mov t0, r0m + mov t1, r1m + mov t2, r2m + mov t3, r3m +%endif + +%if cpuflag(ssse3) + mova m7, [hsub_mul] 
+%elifidn cpuname, sse2 + mova m7, [pw_00ff] +%elif %1 >= mmsize + pxor m7, m7 +%endif + pxor m0, m0 + +ALIGN 16 +.loop: +%if %1 > mmsize + SSD_ITER FULL, 0, 0, mmsize, mmsize, 1 +%elif %1 == mmsize + SSD_ITER FULL, 0, 0, t1, t3, 2 +%else + SSD_ITER HALF, 0, 0, t1, t3, 2 +%endif + dec al + jg .loop +%if mmsize==32 + vextracti128 xm1, m0, 1 + paddd xm0, xm1 + HADDD xm0, xm1 + movd eax, xm0 +%else + HADDD m0, m1 + movd eax, m0 +%endif +%if (mmsize == 8) + emms +%endif + RET +%endif +%endmacro + +%macro HEVC_SSD 0 +SSD 32, 64 +SSD 16, 64 +SSD 32, 32 +SSD 32, 16 +SSD 16, 32 +SSD 32, 8 +SSD 8, 32 +SSD 32, 24 +SSD 24, 24 ; not used, but resolves x265_pixel_ssd_24x24_sse2.startloop symbol +SSD 8, 4 +SSD 8, 8 +SSD 16, 16 +SSD 16, 12 +SSD 16, 8 +SSD 8, 16 +SSD 16, 4 +%endmacro + +INIT_MMX mmx +SSD 16, 16 +SSD 16, 8 +SSD 8, 8 +SSD 8, 16 +SSD 4, 4 +SSD 8, 4 +SSD 4, 8 +SSD 4, 16 +INIT_XMM sse2slow +SSD 16, 16 +SSD 8, 8 +SSD 16, 8 +SSD 8, 16 +SSD 8, 4 +INIT_XMM sse2 +%define SSD_CORE SSD_CORE_SSE2 +%define JOIN JOIN_SSE2 +HEVC_SSD +INIT_XMM ssse3 +%define SSD_CORE SSD_CORE_SSSE3 +%define JOIN JOIN_SSSE3 +HEVC_SSD +INIT_XMM avx +HEVC_SSD +INIT_MMX ssse3 +SSD 4, 4 +SSD 4, 8 +SSD 4, 16 +INIT_XMM xop +SSD 16, 16 +SSD 8, 8 +SSD 16, 8 +SSD 8, 16 +SSD 8, 4 +%define LOAD LOAD_AVX2 +%define JOIN JOIN_AVX2 +INIT_YMM avx2 +SSD 16, 16 +SSD 16, 8 +%assign function_align 16 +%endif ; !HIGH_BIT_DEPTH + +;----------------------------------------------------------------------------- +; int pixel_ssd_12x16( uint8_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal pixel_ssd_12x16, 4, 5, 7, src1, stride1, src2, stride2 + + pxor m6, m6 + mov r4d, 4 + +.loop: + movu m0, [r0] + movu m1, [r2] + movu m2, [r0 + r1] + movu m3, [r2 + r3] + + punpckhdq m4, m0, m2 + punpckhdq m5, m1, m3 + + pmovzxbw m0, m0 + pmovzxbw m1, m1 + pmovzxbw m2, m2 + pmovzxbw m3, m3 + pmovzxbw m4, m4 + pmovzxbw m5, m5 + + psubw m0, m1 + psubw 
m2, m3 + psubw m4, m5 + + pmaddwd m0, m0 + pmaddwd m2, m2 + pmaddwd m4, m4 + + paddd m0, m2 + paddd m6, m4 + paddd m6, m0 + + movu m0, [r0 + 2 * r1] + movu m1, [r2 + 2 * r3] + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + movu m2, [r0 + r1] + movu m3, [r2 + r3] + + punpckhdq m4, m0, m2 + punpckhdq m5, m1, m3 + + pmovzxbw m0, m0 + pmovzxbw m1, m1 + pmovzxbw m2, m2 + pmovzxbw m3, m3 + pmovzxbw m4, m4 + pmovzxbw m5, m5 + + psubw m0, m1 + psubw m2, m3 + psubw m4, m5 + + pmaddwd m0, m0 + pmaddwd m2, m2 + pmaddwd m4, m4 + + paddd m0, m2 + paddd m6, m4 + paddd m6, m0 + + dec r4d + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + jnz .loop + + HADDD m6, m1 + movd eax, m6 + + RET + +;----------------------------------------------------------------------------- +; int pixel_ssd_24x32( uint8_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal pixel_ssd_24x32, 4, 5, 8, src1, stride1, src2, stride2 + + pxor m7, m7 + pxor m6, m6 + mov r4d, 16 + +.loop: + movu m1, [r0] + pmovzxbw m0, m1 + punpckhbw m1, m6 + pmovzxbw m2, [r0 + 16] + movu m4, [r2] + pmovzxbw m3, m4 + punpckhbw m4, m6 + pmovzxbw m5, [r2 + 16] + + psubw m0, m3 + psubw m1, m4 + psubw m2, m5 + + pmaddwd m0, m0 + pmaddwd m1, m1 + pmaddwd m2, m2 + + paddd m0, m1 + paddd m7, m2 + paddd m7, m0 + + movu m1, [r0 + r1] + pmovzxbw m0, m1 + punpckhbw m1, m6 + pmovzxbw m2, [r0 + r1 + 16] + movu m4, [r2 + r3] + pmovzxbw m3, m4 + punpckhbw m4, m6 + pmovzxbw m5, [r2 + r3 + 16] + + psubw m0, m3 + psubw m1, m4 + psubw m2, m5 + + pmaddwd m0, m0 + pmaddwd m1, m1 + pmaddwd m2, m2 + + paddd m0, m1 + paddd m7, m2 + paddd m7, m0 + + dec r4d + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + jnz .loop + + HADDD m7, m1 + movd eax, m7 + + RET + +%macro PIXEL_SSD_16x4 0 + movu m1, [r0] + pmovzxbw m0, m1 + punpckhbw m1, m6 + movu m3, [r2] + pmovzxbw m2, m3 + punpckhbw m3, m6 + + psubw m0, m2 + psubw m1, m3 + + movu m5, [r0 + r1] + pmovzxbw m4, m5 + 
punpckhbw m5, m6 + movu m3, [r2 + r3] + pmovzxbw m2, m3 + punpckhbw m3, m6 + + psubw m4, m2 + psubw m5, m3 + + pmaddwd m0, m0 + pmaddwd m1, m1 + pmaddwd m4, m4 + pmaddwd m5, m5 + + paddd m0, m1 + paddd m4, m5 + paddd m4, m0 + paddd m7, m4 + + movu m1, [r0 + r6] + pmovzxbw m0, m1 + punpckhbw m1, m6 + movu m3, [r2 + 2 * r3] + pmovzxbw m2, m3 + punpckhbw m3, m6 + + psubw m0, m2 + psubw m1, m3 + + lea r0, [r0 + r6] + lea r2, [r2 + 2 * r3] + movu m5, [r0 + r1] + pmovzxbw m4, m5 + punpckhbw m5, m6 + movu m3, [r2 + r3] + pmovzxbw m2, m3 + punpckhbw m3, m6 + + psubw m4, m2 + psubw m5, m3 + + pmaddwd m0, m0 + pmaddwd m1, m1 + pmaddwd m4, m4 + pmaddwd m5, m5 + + paddd m0, m1 + paddd m4, m5 + paddd m4, m0 + paddd m7, m4 +%endmacro + +cglobal pixel_ssd_16x16_internal + PIXEL_SSD_16x4 + lea r0, [r0 + r6] + lea r2, [r2 + 2 * r3] + PIXEL_SSD_16x4 + lea r0, [r0 + r6] + lea r2, [r2 + 2 * r3] + PIXEL_SSD_16x4 + lea r0, [r0 + r6] + lea r2, [r2 + 2 * r3] + PIXEL_SSD_16x4 + ret + +;----------------------------------------------------------------------------- +; int pixel_ssd_48x64( uint8_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal pixel_ssd_48x64, 4, 7, 8, src1, stride1, src2, stride2 + + pxor m7, m7 + pxor m6, m6 + mov r4, r0 + mov r5, r2 + lea r6, [r1 * 2] + + call pixel_ssd_16x16_internal + lea r0, [r0 + r6] + lea r2, [r2 + 2 * r3] + call pixel_ssd_16x16_internal + lea r0, [r0 + r6] + lea r2, [r2 + 2 * r3] + call pixel_ssd_16x16_internal + lea r0, [r0 + r6] + lea r2, [r2 + 2 * r3] + call pixel_ssd_16x16_internal + lea r0, [r4 + 16] + lea r2, [r5 + 16] + call pixel_ssd_16x16_internal + lea r0, [r0 + r6] + lea r2, [r2 + 2 * r3] + call pixel_ssd_16x16_internal + lea r0, [r0 + r6] + lea r2, [r2 + 2 * r3] + call pixel_ssd_16x16_internal + lea r0, [r0 + r6] + lea r2, [r2 + 2 * r3] + call pixel_ssd_16x16_internal + lea r0, [r4 + 32] + lea r2, [r5 + 32] + call pixel_ssd_16x16_internal + lea r0, 
[r0 + r6] + lea r2, [r2 + 2 * r3] + call pixel_ssd_16x16_internal + lea r0, [r0 + r6] + lea r2, [r2 + 2 * r3] + call pixel_ssd_16x16_internal + lea r0, [r0 + r6] + lea r2, [r2 + 2 * r3] + call pixel_ssd_16x16_internal + + HADDD m7, m1 + movd eax, m7 + + RET + +;----------------------------------------------------------------------------- +; int pixel_ssd_64x16( uint8_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal pixel_ssd_64x16, 4, 7, 8, src1, stride1, src2, stride2 + + pxor m7, m7 + pxor m6, m6 + mov r4, r0 + mov r5, r2 + lea r6, [r1 * 2] + + call pixel_ssd_16x16_internal + lea r0, [r4 + 16] + lea r2, [r5 + 16] + call pixel_ssd_16x16_internal + lea r0, [r4 + 32] + lea r2, [r5 + 32] + call pixel_ssd_16x16_internal + lea r0, [r4 + 48] + lea r2, [r5 + 48] + call pixel_ssd_16x16_internal + + HADDD m7, m1 + movd eax, m7 + + RET + +;----------------------------------------------------------------------------- +; int pixel_ssd_64x32( uint8_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal pixel_ssd_64x32, 4, 7, 8, src1, stride1, src2, stride2 + + pxor m7, m7 + pxor m6, m6 + mov r4, r0 + mov r5, r2 + lea r6, [r1 * 2] + + call pixel_ssd_16x16_internal + lea r0, [r0 + r6] + lea r2, [r2 + 2 * r3] + call pixel_ssd_16x16_internal + lea r0, [r4 + 16] + lea r2, [r5 + 16] + call pixel_ssd_16x16_internal + lea r0, [r0 + r6] + lea r2, [r2 + 2 * r3] + call pixel_ssd_16x16_internal + lea r0, [r4 + 32] + lea r2, [r5 + 32] + call pixel_ssd_16x16_internal + lea r0, [r0 + r6] + lea r2, [r2 + 2 * r3] + call pixel_ssd_16x16_internal + lea r0, [r4 + 48] + lea r2, [r5 + 48] + call pixel_ssd_16x16_internal + lea r0, [r0 + r6] + lea r2, [r2 + 2 * r3] + call pixel_ssd_16x16_internal + + HADDD m7, m1 + movd eax, m7 + + RET + +;----------------------------------------------------------------------------- +; int 
pixel_ssd_64x48( uint8_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal pixel_ssd_64x48, 4, 7, 8, src1, stride1, src2, stride2 + + pxor m7, m7 + pxor m6, m6 + mov r4, r0 + mov r5, r2 + lea r6, [r1 * 2] + + call pixel_ssd_16x16_internal + lea r0, [r0 + r6] + lea r2, [r2 + 2 * r3] + call pixel_ssd_16x16_internal + lea r0, [r0 + r6] + lea r2, [r2 + 2 * r3] + call pixel_ssd_16x16_internal + lea r0, [r4 + 16] + lea r2, [r5 + 16] + call pixel_ssd_16x16_internal + lea r0, [r0 + r6] + lea r2, [r2 + 2 * r3] + call pixel_ssd_16x16_internal + lea r0, [r0 + r6] + lea r2, [r2 + 2 * r3] + call pixel_ssd_16x16_internal + lea r0, [r4 + 32] + lea r2, [r5 + 32] + call pixel_ssd_16x16_internal + lea r0, [r0 + r6] + lea r2, [r2 + 2 * r3] + call pixel_ssd_16x16_internal + lea r0, [r0 + r6] + lea r2, [r2 + 2 * r3] + call pixel_ssd_16x16_internal + lea r0, [r4 + 48] + lea r2, [r5 + 48] + call pixel_ssd_16x16_internal + lea r0, [r0 + r6] + lea r2, [r2 + 2 * r3] + call pixel_ssd_16x16_internal + lea r0, [r0 + r6] + lea r2, [r2 + 2 * r3] + call pixel_ssd_16x16_internal + + HADDD m7, m1 + movd eax, m7 + + RET + +;----------------------------------------------------------------------------- +; int pixel_ssd_64x64( uint8_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal pixel_ssd_64x64, 4, 7, 8, src1, stride1, src2, stride2 + + pxor m7, m7 + pxor m6, m6 + mov r4, r0 + mov r5, r2 + lea r6, [r1 * 2] + + call pixel_ssd_16x16_internal + lea r0, [r0 + r6] + lea r2, [r2 + 2 * r3] + call pixel_ssd_16x16_internal + lea r0, [r0 + r6] + lea r2, [r2 + 2 * r3] + call pixel_ssd_16x16_internal + lea r0, [r0 + r6] + lea r2, [r2 + 2 * r3] + call pixel_ssd_16x16_internal + lea r0, [r4 + 16] + lea r2, [r5 + 16] + call pixel_ssd_16x16_internal + lea r0, [r0 + r6] + lea r2, [r2 + 2 * r3] + call pixel_ssd_16x16_internal + lea r0, [r0 
+ r6] + lea r2, [r2 + 2 * r3] + call pixel_ssd_16x16_internal + lea r0, [r0 + r6] + lea r2, [r2 + 2 * r3] + call pixel_ssd_16x16_internal + lea r0, [r4 + 32] + lea r2, [r5 + 32] + call pixel_ssd_16x16_internal + lea r0, [r0 + r6] + lea r2, [r2 + 2 * r3] + call pixel_ssd_16x16_internal + lea r0, [r0 + r6] + lea r2, [r2 + 2 * r3] + call pixel_ssd_16x16_internal + lea r0, [r0 + r6] + lea r2, [r2 + 2 * r3] + call pixel_ssd_16x16_internal + lea r0, [r4 + 48] + lea r2, [r5 + 48] + call pixel_ssd_16x16_internal + lea r0, [r0 + r6] + lea r2, [r2 + 2 * r3] + call pixel_ssd_16x16_internal + lea r0, [r0 + r6] + lea r2, [r2 + 2 * r3] + call pixel_ssd_16x16_internal + lea r0, [r0 + r6] + lea r2, [r2 + 2 * r3] + call pixel_ssd_16x16_internal + + HADDD m7, m1 + movd eax, m7 + + RET + +;----------------------------------------------------------------------------- +; int pixel_ssd_sp ( int16_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- + +cglobal pixel_ssd_sp_4x4_internal + movh m0, [r0] + movh m1, [r0 + r1] + punpcklqdq m0, m1 + movd m2, [r2] + movd m3, [r2 + r3] + punpckldq m2, m3 + pmovzxbw m2, m2 + psubw m0, m2 + movh m4, [r0 + 2 * r1] + movh m5, [r0 + r4] + punpcklqdq m4, m5 + movd m6, [r2 + 2 * r3] + lea r2, [r2 + 2 * r3] + movd m1, [r2 + r3] + punpckldq m6, m1 + pmovzxbw m6, m6 + psubw m4, m6 + pmaddwd m0, m0 + pmaddwd m4, m4 + paddd m0, m4 + paddd m7, m0 + ret + +;----------------------------------------------------------------------------- +; int pixel_ssd_sp_4x4( int16_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal pixel_ssd_sp_4x4, 4, 5, 8, src1, stride1, src2, stride2 + pxor m7, m7 + add r1, r1 + lea r4, [r1 * 3] + call pixel_ssd_sp_4x4_internal + HADDD m7, m1 + movd eax, m7 + RET + +;----------------------------------------------------------------------------- +; int pixel_ssd_sp_4x8( int16_t *, intptr_t, 
uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal pixel_ssd_sp_4x8, 4, 5, 8, src1, stride1, src2, stride2 + pxor m7, m7 + add r1, r1 + lea r4, [r1 * 3] + call pixel_ssd_sp_4x4_internal + lea r0, [r0 + 4 * r1] + lea r2, [r2 + 2 * r3] + call pixel_ssd_sp_4x4_internal + HADDD m7, m1 + movd eax, m7 + RET + +;----------------------------------------------------------------------------- +; int pixel_ssd_sp_4x16( int16_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal pixel_ssd_sp_4x16, 4, 5, 8, src1, stride1, src2, stride2 + pxor m7, m7 + add r1, r1 + lea r4, [r1 * 3] + call pixel_ssd_sp_4x4_internal + lea r0, [r0 + 4 * r1] + lea r2, [r2 + 2 * r3] + call pixel_ssd_sp_4x4_internal + lea r0, [r0 + 4 * r1] + lea r2, [r2 + 2 * r3] + call pixel_ssd_sp_4x4_internal + lea r0, [r0 + 4 * r1] + lea r2, [r2 + 2 * r3] + call pixel_ssd_sp_4x4_internal + HADDD m7, m1 + movd eax, m7 + RET + +cglobal pixel_ssd_sp_8x4_internal + movu m0, [r0] + movu m1, [r0 + r1] + movh m2, [r2] + movh m3, [r2 + r3] + pmovzxbw m2, m2 + pmovzxbw m3, m3 + + psubw m0, m2 + psubw m1, m3 + + movu m4, [r0 + 2 * r1] + movu m5, [r0 + r4] + movh m2, [r2 + 2 * r3] + movh m3, [r2 + r5] + pmovzxbw m2, m2 + pmovzxbw m3, m3 + + psubw m4, m2 + psubw m5, m3 + + pmaddwd m0, m0 + pmaddwd m1, m1 + pmaddwd m4, m4 + pmaddwd m5, m5 + + paddd m0, m1 + paddd m4, m5 + paddd m4, m0 + paddd m7, m4 + ret + +;----------------------------------------------------------------------------- +; int pixel_ssd_sp_8x4( int16_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal pixel_ssd_sp_8x4, 4, 6, 8, src1, stride1, src2, stride2 + pxor m7, m7 + add r1, r1 + lea r4, [r1 * 3] + lea r5, [r3 * 3] + call pixel_ssd_sp_8x4_internal + HADDD m7, m1 + movd eax, m7 + RET + 
+;----------------------------------------------------------------------------- +; int pixel_ssd_sp_8x8( int16_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal pixel_ssd_sp_8x8, 4, 6, 8, src1, stride1, src2, stride2 + pxor m7, m7 + add r1, r1 + lea r4, [r1 * 3] + lea r5, [r3 * 3] + call pixel_ssd_sp_8x4_internal + lea r0, [r0 + 4 * r1] + lea r2, [r2 + 4 * r3] + call pixel_ssd_sp_8x4_internal + HADDD m7, m1 + movd eax, m7 + RET + +;----------------------------------------------------------------------------- +; int pixel_ssd_sp_8x16( int16_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal pixel_ssd_sp_8x16, 4, 6, 8, src1, stride1, src2, stride2 + pxor m7, m7 + add r1, r1 + lea r4, [r1 * 3] + lea r5, [r3 * 3] + call pixel_ssd_sp_8x4_internal + lea r0, [r0 + 4 * r1] + lea r2, [r2 + 4 * r3] + call pixel_ssd_sp_8x4_internal + lea r0, [r0 + 4 * r1] + lea r2, [r2 + 4 * r3] + call pixel_ssd_sp_8x4_internal + lea r0, [r0 + 4 * r1] + lea r2, [r2 + 4 * r3] + call pixel_ssd_sp_8x4_internal + HADDD m7, m1 + movd eax, m7 + RET + +;----------------------------------------------------------------------------- +; int pixel_ssd_sp_8x32( int16_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal pixel_ssd_sp_8x32, 4, 6, 8, src1, stride1, src2, stride2 + pxor m7, m7 + add r1, r1 + lea r4, [r1 * 3] + lea r5, [r3 * 3] + call pixel_ssd_sp_8x4_internal + lea r0, [r0 + 4 * r1] + lea r2, [r2 + 4 * r3] + call pixel_ssd_sp_8x4_internal + lea r0, [r0 + 4 * r1] + lea r2, [r2 + 4 * r3] + call pixel_ssd_sp_8x4_internal + lea r0, [r0 + 4 * r1] + lea r2, [r2 + 4 * r3] + call pixel_ssd_sp_8x4_internal + lea r0, [r0 + 4 * r1] + lea r2, [r2 + 4 * r3] + call pixel_ssd_sp_8x4_internal + lea r0, [r0 + 4 * r1] + lea r2, [r2 + 4 * r3] + 
call pixel_ssd_sp_8x4_internal + lea r0, [r0 + 4 * r1] + lea r2, [r2 + 4 * r3] + call pixel_ssd_sp_8x4_internal + lea r0, [r0 + 4 * r1] + lea r2, [r2 + 4 * r3] + call pixel_ssd_sp_8x4_internal + HADDD m7, m1 + movd eax, m7 + RET + +;----------------------------------------------------------------------------- +; int pixel_ssd_sp_12x16( int16_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal pixel_ssd_sp_12x16, 4, 7, 8, src1, stride1, src2, stride2 + pxor m7, m7 + add r1, r1 + lea r4, [r1 * 3] + mov r5, r0 + mov r6, r2 + call pixel_ssd_sp_4x4_internal + lea r0, [r0 + 4 * r1] + lea r2, [r2 + 2 * r3] + call pixel_ssd_sp_4x4_internal + lea r0, [r0 + 4 * r1] + lea r2, [r2 + 2 * r3] + call pixel_ssd_sp_4x4_internal + lea r0, [r0 + 4 * r1] + lea r2, [r2 + 2 * r3] + call pixel_ssd_sp_4x4_internal + lea r0, [r5 + 8] + lea r2, [r6 + 4] + lea r5, [r3 * 3] + call pixel_ssd_sp_8x4_internal + lea r0, [r0 + 4 * r1] + lea r2, [r2 + 4 * r3] + call pixel_ssd_sp_8x4_internal + lea r0, [r0 + 4 * r1] + lea r2, [r2 + 4 * r3] + call pixel_ssd_sp_8x4_internal + lea r0, [r0 + 4 * r1] + lea r2, [r2 + 4 * r3] + call pixel_ssd_sp_8x4_internal + HADDD m7, m1 + movd eax, m7 + RET + +%macro PIXEL_SSD_SP_16x4 0 + movu m0, [r0] + movu m1, [r0 + 16] + movu m3, [r2] + pmovzxbw m2, m3 + punpckhbw m3, m6 + + psubw m0, m2 + psubw m1, m3 + + movu m4, [r0 + r1] + movu m5, [r0 + r1 +16] + movu m3, [r2 + r3] + pmovzxbw m2, m3 + punpckhbw m3, m6 + + psubw m4, m2 + psubw m5, m3 + + pmaddwd m0, m0 + pmaddwd m1, m1 + pmaddwd m4, m4 + pmaddwd m5, m5 + + paddd m0, m1 + paddd m4, m5 + paddd m4, m0 + paddd m7, m4 + + movu m0, [r0 + 2 * r1] + movu m1, [r0 + 2 * r1 + 16] + movu m3, [r2 + 2 * r3] + pmovzxbw m2, m3 + punpckhbw m3, m6 + + psubw m0, m2 + psubw m1, m3 + + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + movu m4, [r0 + r1] + movu m5, [r0 + r1 + 16] + movu m3, [r2 + r3] + pmovzxbw m2, m3 + punpckhbw m3, m6 + + psubw 
m4, m2 + psubw m5, m3 + + pmaddwd m0, m0 + pmaddwd m1, m1 + pmaddwd m4, m4 + pmaddwd m5, m5 + + paddd m0, m1 + paddd m4, m5 + paddd m4, m0 + paddd m7, m4 +%endmacro + +;----------------------------------------------------------------------------- +; int pixel_ssd_sp_16x4( int16_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal pixel_ssd_sp_16x4, 4, 6, 8, src1, stride1, src2, stride2 + + pxor m6, m6 + pxor m7, m7 + add r1, r1 + PIXEL_SSD_SP_16x4 + HADDD m7, m1 + movd eax, m7 + + RET + +;----------------------------------------------------------------------------- +; int pixel_ssd_sp_16x8( int16_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal pixel_ssd_sp_16x8, 4, 4, 8, src1, stride1, src2, stride2 + + pxor m6, m6 + pxor m7, m7 + add r1, r1 + PIXEL_SSD_SP_16x4 + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + PIXEL_SSD_SP_16x4 + HADDD m7, m1 + movd eax, m7 + RET + +;----------------------------------------------------------------------------- +; int pixel_ssd_sp_16x12( int16_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal pixel_ssd_sp_16x12, 4, 6, 8, src1, stride1, src2, stride2 + + pxor m6, m6 + pxor m7, m7 + add r1, r1 + lea r4, [r1 * 2] + lea r5, [r3 * 2] + PIXEL_SSD_SP_16x4 + lea r0, [r0 + r4] + lea r2, [r2 + r5] + PIXEL_SSD_SP_16x4 + lea r0, [r0 + r4] + lea r2, [r2 + r5] + PIXEL_SSD_SP_16x4 + HADDD m7, m1 + movd eax, m7 + RET + +;----------------------------------------------------------------------------- +; int pixel_ssd_sp_16x16( int16_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal pixel_ssd_sp_16x16, 4, 6, 8, src1, stride1, src2, stride2 + + pxor m6, m6 + pxor m7, m7 + add r1, r1 + lea r4, [r1 
* 2] + lea r5, [r3 * 2] + PIXEL_SSD_SP_16x4 + lea r0, [r0 + r4] + lea r2, [r2 + r5] + PIXEL_SSD_SP_16x4 + lea r0, [r0 + r4] + lea r2, [r2 + r5] + PIXEL_SSD_SP_16x4 + lea r0, [r0 + r4] + lea r2, [r2 + r5] + PIXEL_SSD_SP_16x4 + HADDD m7, m1 + movd eax, m7 + RET + +cglobal pixel_ssd_sp_16x16_internal + PIXEL_SSD_SP_16x4 + lea r0, [r0 + r4] + lea r2, [r2 + 2 * r3] + PIXEL_SSD_SP_16x4 + lea r0, [r0 + r4] + lea r2, [r2 + 2 * r3] + PIXEL_SSD_SP_16x4 + lea r0, [r0 + r4] + lea r2, [r2 + 2 * r3] + PIXEL_SSD_SP_16x4 + ret + +;----------------------------------------------------------------------------- +; int pixel_ssd_sp_16x32( int16_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal pixel_ssd_sp_16x32, 4, 5, 8, src1, stride1, src2, stride2 + + pxor m6, m6 + pxor m7, m7 + add r1, r1 + lea r4, [r1 * 2] + call pixel_ssd_sp_16x16_internal + lea r0, [r0 + r4] + lea r2, [r2 + 2 * r3] + call pixel_ssd_sp_16x16_internal + HADDD m7, m1 + movd eax, m7 + RET + +;----------------------------------------------------------------------------- +; int pixel_ssd_sp_16x64( int16_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal pixel_ssd_sp_16x64, 4, 6, 8, src1, stride1, src2, stride2 + + pxor m6, m6 + pxor m7, m7 + add r1, r1 + lea r4, [r1 * 2] + lea r5, [r3 * 2] + call pixel_ssd_sp_16x16_internal + lea r0, [r0 + r4] + lea r2, [r2 + r5] + call pixel_ssd_sp_16x16_internal + lea r0, [r0 + r4] + lea r2, [r2 + r5] + call pixel_ssd_sp_16x16_internal + lea r0, [r0 + r4] + lea r2, [r2 + r5] + call pixel_ssd_sp_16x16_internal + + HADDD m7, m1 + movd eax, m7 + RET + +;----------------------------------------------------------------------------- +; int pixel_ssd_sp_24x32( int16_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal 
pixel_ssd_sp_24x32, 4, 7, 8, src1, stride1, src2, stride2 + pxor m6, m6 + pxor m7, m7 + add r1, r1 + lea r4, [r1 * 2] + mov r5, r0 + mov r6, r2 + call pixel_ssd_sp_16x16_internal + lea r0, [r0 + r4] + lea r2, [r2 + 2 * r3] + call pixel_ssd_sp_16x16_internal + lea r0, [r5 + 32] + lea r2, [r6 + 16] + lea r4, [r1 * 3] + lea r5, [r3 * 3] + call pixel_ssd_sp_8x4_internal + lea r0, [r0 + 4 * r1] + lea r2, [r2 + 4 * r3] + call pixel_ssd_sp_8x4_internal + lea r0, [r0 + 4 * r1] + lea r2, [r2 + 4 * r3] + call pixel_ssd_sp_8x4_internal + lea r0, [r0 + 4 * r1] + lea r2, [r2 + 4 * r3] + call pixel_ssd_sp_8x4_internal + lea r0, [r0 + 4 * r1] + lea r2, [r2 + 4 * r3] + call pixel_ssd_sp_8x4_internal + lea r0, [r0 + 4 * r1] + lea r2, [r2 + 4 * r3] + call pixel_ssd_sp_8x4_internal + lea r0, [r0 + 4 * r1] + lea r2, [r2 + 4 * r3] + call pixel_ssd_sp_8x4_internal + lea r0, [r0 + 4 * r1] + lea r2, [r2 + 4 * r3] + call pixel_ssd_sp_8x4_internal + HADDD m7, m1 + movd eax, m7 + RET + +;----------------------------------------------------------------------------- +; int pixel_ssd_32x8( uint8_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal pixel_ssd_sp_32x8, 4, 7, 8, src1, stride1, src2, stride2 + + pxor m7, m7 + pxor m6, m6 + mov r5, r0 + mov r6, r2 + add r1, r1 + lea r4, [r1 * 2] + PIXEL_SSD_SP_16x4 + lea r0, [r0 + r4] + lea r2, [r2 + 2 * r3] + PIXEL_SSD_SP_16x4 + lea r0, [r5 + 32] + lea r2, [r6 + 16] + PIXEL_SSD_SP_16x4 + lea r0, [r0 + r4] + lea r2, [r2 + 2 * r3] + PIXEL_SSD_SP_16x4 + HADDD m7, m1 + movd eax, m7 + RET + +;----------------------------------------------------------------------------- +; int pixel_ssd_32x16( uint8_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal pixel_ssd_sp_32x16, 4, 7, 8, src1, stride1, src2, stride2 + + pxor m7, m7 + pxor m6, m6 + mov r5, r0 + mov r6, r2 + add r1, r1 + lea 
r4, [r1 * 2] + call pixel_ssd_sp_16x16_internal + lea r0, [r5 + 32] + lea r2, [r6 + 16] + call pixel_ssd_sp_16x16_internal + HADDD m7, m1 + movd eax, m7 + RET + +;----------------------------------------------------------------------------- +; int pixel_ssd_32x24( uint8_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal pixel_ssd_sp_32x24, 4, 7, 8, src1, stride1, src2, stride2 + + pxor m7, m7 + pxor m6, m6 + mov r5, r0 + mov r6, r2 + add r1, r1 + lea r4, [r1 * 2] + call pixel_ssd_sp_16x16_internal + lea r0, [r0 + r4] + lea r2, [r2 + 2 * r3] + PIXEL_SSD_SP_16x4 + lea r0, [r0 + r4] + lea r2, [r2 + 2 * r3] + PIXEL_SSD_SP_16x4 + lea r0, [r5 + 32] + lea r2, [r6 + 16] + call pixel_ssd_sp_16x16_internal + lea r0, [r0 + r4] + lea r2, [r2 + 2 * r3] + PIXEL_SSD_SP_16x4 + lea r0, [r0 + r4] + lea r2, [r2 + 2 * r3] + PIXEL_SSD_SP_16x4 + HADDD m7, m1 + movd eax, m7 + RET + +;----------------------------------------------------------------------------- +; int pixel_ssd_32x32( uint8_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal pixel_ssd_sp_32x32, 4, 7, 8, src1, stride1, src2, stride2 + + pxor m7, m7 + pxor m6, m6 + mov r5, r0 + mov r6, r2 + add r1, r1 + lea r4, [r1 * 2] + call pixel_ssd_sp_16x16_internal + lea r0, [r0 + r4] + lea r2, [r2 + 2 * r3] + call pixel_ssd_sp_16x16_internal + lea r0, [r5 + 32] + lea r2, [r6 + 16] + call pixel_ssd_sp_16x16_internal + lea r0, [r0 + r4] + lea r2, [r2 + 2 * r3] + call pixel_ssd_sp_16x16_internal + HADDD m7, m1 + movd eax, m7 + RET + +;----------------------------------------------------------------------------- +; int pixel_ssd_32x64( uint8_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal pixel_ssd_sp_32x64, 4, 7, 8, src1, stride1, src2, stride2 + + pxor m7, m7 + pxor 
m6, m6 + mov r5, r0 + mov r6, r2 + add r1, r1 + lea r4, [r1 * 2] + call pixel_ssd_sp_16x16_internal + lea r0, [r0 + r4] + lea r2, [r2 + 2 * r3] + call pixel_ssd_sp_16x16_internal + lea r0, [r0 + r4] + lea r2, [r2 + 2 * r3] + call pixel_ssd_sp_16x16_internal + lea r0, [r0 + r4] + lea r2, [r2 + 2 * r3] + call pixel_ssd_sp_16x16_internal + lea r0, [r5 + 32] + lea r2, [r6 + 16] + call pixel_ssd_sp_16x16_internal + lea r0, [r0 + r4] + lea r2, [r2 + 2 * r3] + call pixel_ssd_sp_16x16_internal + lea r0, [r0 + r4] + lea r2, [r2 + 2 * r3] + call pixel_ssd_sp_16x16_internal + lea r0, [r0 + r4] + lea r2, [r2 + 2 * r3] + call pixel_ssd_sp_16x16_internal + HADDD m7, m1 + movd eax, m7 + RET + +;----------------------------------------------------------------------------- +; int pixel_ssd_48x64( uint8_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal pixel_ssd_sp_48x64, 4, 7, 8, src1, stride1, src2, stride2 + + pxor m7, m7 + pxor m6, m6 + mov r5, r0 + mov r6, r2 + add r1, r1 + lea r4, [r1 * 2] + call pixel_ssd_sp_16x16_internal + lea r0, [r0 + r4] + lea r2, [r2 + 2 * r3] + call pixel_ssd_sp_16x16_internal + lea r0, [r0 + r4] + lea r2, [r2 + 2 * r3] + call pixel_ssd_sp_16x16_internal + lea r0, [r0 + r4] + lea r2, [r2 + 2 * r3] + call pixel_ssd_sp_16x16_internal + lea r0, [r5 + 32] + lea r2, [r6 + 16] + call pixel_ssd_sp_16x16_internal + lea r0, [r0 + r4] + lea r2, [r2 + 2 * r3] + call pixel_ssd_sp_16x16_internal + lea r0, [r0 + r4] + lea r2, [r2 + 2 * r3] + call pixel_ssd_sp_16x16_internal + lea r0, [r0 + r4] + lea r2, [r2 + 2 * r3] + call pixel_ssd_sp_16x16_internal + lea r0, [r5 + 64] + lea r2, [r6 + 32] + call pixel_ssd_sp_16x16_internal + lea r0, [r0 + r4] + lea r2, [r2 + 2 * r3] + call pixel_ssd_sp_16x16_internal + lea r0, [r0 + r4] + lea r2, [r2 + 2 * r3] + call pixel_ssd_sp_16x16_internal + lea r0, [r0 + r4] + lea r2, [r2 + 2 * r3] + call pixel_ssd_sp_16x16_internal + HADDD m7, m1 + 
movd eax, m7 + RET + +;----------------------------------------------------------------------------- +; int pixel_ssd_64x16( uint8_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal pixel_ssd_sp_64x16, 4, 7, 8, src1, stride1, src2, stride2 + + pxor m7, m7 + pxor m6, m6 + mov r5, r0 + mov r6, r2 + add r1, r1 + lea r4, [r1 * 2] + call pixel_ssd_sp_16x16_internal + lea r0, [r5 + 32] + lea r2, [r6 + 16] + call pixel_ssd_sp_16x16_internal + lea r0, [r5 + 64] + lea r2, [r6 + 32] + call pixel_ssd_sp_16x16_internal + lea r0, [r5 + 96] + lea r2, [r6 + 48] + call pixel_ssd_sp_16x16_internal + HADDD m7, m1 + movd eax, m7 + RET + +;----------------------------------------------------------------------------- +; int pixel_ssd_64x32( uint8_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal pixel_ssd_sp_64x32, 4, 7, 8, src1, stride1, src2, stride2 + + pxor m7, m7 + pxor m6, m6 + mov r5, r0 + mov r6, r2 + add r1, r1 + lea r4, [r1 * 2] + call pixel_ssd_sp_16x16_internal + lea r0, [r0 + r4] + lea r2, [r2 + 2 * r3] + call pixel_ssd_sp_16x16_internal + lea r0, [r5 + 32] + lea r2, [r6 + 16] + call pixel_ssd_sp_16x16_internal + lea r0, [r0 + r4] + lea r2, [r2 + 2 * r3] + call pixel_ssd_sp_16x16_internal + lea r0, [r5 + 64] + lea r2, [r6 + 32] + call pixel_ssd_sp_16x16_internal + lea r0, [r0 + r4] + lea r2, [r2 + 2 * r3] + call pixel_ssd_sp_16x16_internal + lea r0, [r5 + 96] + lea r2, [r6 + 48] + call pixel_ssd_sp_16x16_internal + lea r0, [r0 + r4] + lea r2, [r2 + 2 * r3] + call pixel_ssd_sp_16x16_internal + HADDD m7, m1 + movd eax, m7 + RET + +;----------------------------------------------------------------------------- +; int pixel_ssd_64x48( uint8_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal pixel_ssd_sp_64x48, 
4, 7, 8, src1, stride1, src2, stride2 + + pxor m7, m7 + pxor m6, m6 + mov r5, r0 + mov r6, r2 + add r1, r1 + lea r4, [r1 * 2] + call pixel_ssd_sp_16x16_internal + lea r0, [r0 + r4] + lea r2, [r2 + 2 * r3] + call pixel_ssd_sp_16x16_internal + lea r0, [r0 + r4] + lea r2, [r2 + 2 * r3] + call pixel_ssd_sp_16x16_internal + lea r0, [r5 + 32] + lea r2, [r6 + 16] + call pixel_ssd_sp_16x16_internal + lea r0, [r0 + r4] + lea r2, [r2 + 2 * r3] + call pixel_ssd_sp_16x16_internal + lea r0, [r0 + r4] + lea r2, [r2 + 2 * r3] + call pixel_ssd_sp_16x16_internal + lea r0, [r5 + 64] + lea r2, [r6 + 32] + call pixel_ssd_sp_16x16_internal + lea r0, [r0 + r4] + lea r2, [r2 + 2 * r3] + call pixel_ssd_sp_16x16_internal + lea r0, [r0 + r4] + lea r2, [r2 + 2 * r3] + call pixel_ssd_sp_16x16_internal + lea r0, [r5 + 96] + lea r2, [r6 + 48] + call pixel_ssd_sp_16x16_internal + lea r0, [r0 + r4] + lea r2, [r2 + 2 * r3] + call pixel_ssd_sp_16x16_internal + lea r0, [r0 + r4] + lea r2, [r2 + 2 * r3] + call pixel_ssd_sp_16x16_internal + HADDD m7, m1 + movd eax, m7 + RET + +;----------------------------------------------------------------------------- +; int pixel_ssd_64x64( uint8_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal pixel_ssd_sp_64x64, 4, 7, 8, src1, stride1, src2, stride2 + + pxor m7, m7 + pxor m6, m6 + mov r5, r0 + mov r6, r2 + add r1, r1 + lea r4, [r1 * 2] + call pixel_ssd_sp_16x16_internal + lea r0, [r0 + r4] + lea r2, [r2 + 2 * r3] + call pixel_ssd_sp_16x16_internal + lea r0, [r0 + r4] + lea r2, [r2 + 2 * r3] + call pixel_ssd_sp_16x16_internal + lea r0, [r0 + r4] + lea r2, [r2 + 2 * r3] + call pixel_ssd_sp_16x16_internal + lea r0, [r5 + 32] + lea r2, [r6 + 16] + call pixel_ssd_sp_16x16_internal + lea r0, [r0 + r4] + lea r2, [r2 + 2 * r3] + call pixel_ssd_sp_16x16_internal + lea r0, [r0 + r4] + lea r2, [r2 + 2 * r3] + call pixel_ssd_sp_16x16_internal + lea r0, [r0 + r4] + lea r2, [r2 + 2 * 
r3] + call pixel_ssd_sp_16x16_internal + lea r0, [r5 + 64] + lea r2, [r6 + 32] + call pixel_ssd_sp_16x16_internal + lea r0, [r0 + r4] + lea r2, [r2 + 2 * r3] + call pixel_ssd_sp_16x16_internal + lea r0, [r0 + r4] + lea r2, [r2 + 2 * r3] + call pixel_ssd_sp_16x16_internal + lea r0, [r0 + r4] + lea r2, [r2 + 2 * r3] + call pixel_ssd_sp_16x16_internal + lea r0, [r5 + 96] + lea r2, [r6 + 48] + call pixel_ssd_sp_16x16_internal + lea r0, [r0 + r4] + lea r2, [r2 + 2 * r3] + call pixel_ssd_sp_16x16_internal + lea r0, [r0 + r4] + lea r2, [r2 + 2 * r3] + call pixel_ssd_sp_16x16_internal + lea r0, [r0 + r4] + lea r2, [r2 + 2 * r3] + call pixel_ssd_sp_16x16_internal + HADDD m7, m1 + movd eax, m7 + RET + + +;----------------------------------------------------------------------------- +; int pixel_ssd_s( int16_t *ref, intptr_t i_stride ) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal pixel_ssd_s_4, 2,2,2 + add r1, r1 + movh m0, [r0] + movhps m0, [r0 + r1] + + lea r0, [r0 + r1 * 2] + movh m1, [r0] + movhps m1, [r0 + r1] + + pmaddwd m0, m0 + pmaddwd m1, m1 + paddd m0, m1 + + ; calculate sum and return + HADDD m0, m1 + movd eax, m0 + RET + + +INIT_XMM sse2 +cglobal pixel_ssd_s_8, 2,3,5 + add r1, r1 + lea r2, [r1 * 3] + movu m0, [r0] + movu m1, [r0 + r1] + movu m2, [r0 + r1 * 2] + movu m3, [r0 + r2] + + pmaddwd m0, m0 + pmaddwd m1, m1 + pmaddwd m2, m2 + pmaddwd m3, m3 + paddd m0, m1 + paddd m2, m3 + paddd m0, m2 + + lea r0, [r0 + r1 * 4] + movu m4, [r0] + movu m1, [r0 + r1] + movu m2, [r0 + r1 * 2] + movu m3, [r0 + r2] + + pmaddwd m4, m4 + pmaddwd m1, m1 + pmaddwd m2, m2 + pmaddwd m3, m3 + paddd m4, m1 + paddd m2, m3 + paddd m4, m2 + paddd m0, m4 + + ; calculate sum and return + HADDD m0, m1 + movd eax, m0 + RET + + +INIT_XMM sse2 +cglobal pixel_ssd_s_16, 2,3,5 + add r1, r1 + + mov r2d, 4 + pxor m0, m0 +.loop: + movu m1, [r0] + movu m2, [r0 + mmsize] + movu m3, [r0 + r1] + movu m4, [r0 + r1 + mmsize] + lea r0, [r0 + r1 * 2] + 
+ pmaddwd m1, m1 + pmaddwd m2, m2 + pmaddwd m3, m3 + pmaddwd m4, m4 + paddd m1, m2 + paddd m3, m4 + paddd m1, m3 + paddd m0, m1 + + movu m1, [r0] + movu m2, [r0 + mmsize] + movu m3, [r0 + r1] + movu m4, [r0 + r1 + mmsize] + lea r0, [r0 + r1 * 2] + + pmaddwd m1, m1 + pmaddwd m2, m2 + pmaddwd m3, m3 + pmaddwd m4, m4 + paddd m1, m2 + paddd m3, m4 + paddd m1, m3 + paddd m0, m1 + + dec r2d + jnz .loop + + ; calculate sum and return + HADDD m0, m1 + movd eax, m0 + RET + + +INIT_XMM sse2 +cglobal pixel_ssd_s_32, 2,3,5 + add r1, r1 + + mov r2d, 16 + pxor m0, m0 +.loop: + movu m1, [r0 + 0 * mmsize] + movu m2, [r0 + 1 * mmsize] + movu m3, [r0 + 2 * mmsize] + movu m4, [r0 + 3 * mmsize] + add r0, r1 + + pmaddwd m1, m1 + pmaddwd m2, m2 + pmaddwd m3, m3 + pmaddwd m4, m4 + paddd m1, m2 + paddd m3, m4 + paddd m1, m3 + paddd m0, m1 + + movu m1, [r0 + 0 * mmsize] + movu m2, [r0 + 1 * mmsize] + movu m3, [r0 + 2 * mmsize] + movu m4, [r0 + 3 * mmsize] + add r0, r1 + + pmaddwd m1, m1 + pmaddwd m2, m2 + pmaddwd m3, m3 + pmaddwd m4, m4 + paddd m1, m2 + paddd m3, m4 + paddd m1, m3 + paddd m0, m1 + + dec r2d + jnz .loop + + ; calculate sum and return + HADDD m0, m1 + movd eax, m0 + RET + + +INIT_YMM avx2 +cglobal pixel_ssd_s_32, 2,4,5 + add r1, r1 + lea r3, [r1 * 3] + + mov r2d, 8 + pxor m0, m0 +.loop: + movu m1, [r0 + 0 * mmsize] + movu m2, [r0 + 1 * mmsize] + movu m3, [r0 + r1 + 0 * mmsize] + movu m4, [r0 + r1 + 1 * mmsize] + + pmaddwd m1, m1 + pmaddwd m2, m2 + pmaddwd m3, m3 + pmaddwd m4, m4 + paddd m1, m2 + paddd m3, m4 + paddd m1, m3 + paddd m0, m1 + + movu m1, [r0 + r1 * 2 + 0 * mmsize] + movu m2, [r0 + r1 * 2 + 1 * mmsize] + movu m3, [r0 + r3 + 0 * mmsize] + movu m4, [r0 + r3 + 1 * mmsize] + lea r0, [r0 + 4 * r1] + + pmaddwd m1, m1 + pmaddwd m2, m2 + pmaddwd m3, m3 + pmaddwd m4, m4 + paddd m1, m2 + paddd m3, m4 + paddd m1, m3 + paddd m0, m1 + + dec r2d + jnz .loop + + ; calculate sum and return + HADDD m0, m1 + movd eax, xm0 + RET diff --git a/source/common/x86/x86inc.asm 
b/source/common/x86/x86inc.asm new file mode 100644 index 0000000..5b44bb9 --- /dev/null +++ b/source/common/x86/x86inc.asm @@ -0,0 +1,1494 @@ +;***************************************************************************** +;* x86inc.asm: x264asm abstraction layer +;***************************************************************************** +;* Copyright (C) 2005-2014 x264 project +;* 2013-2014 x265 project +;* +;* Authors: Loren Merritt +;* Anton Mitrofanov +;* Fiona Glaser +;* Henrik Gramner +;* Min Chen +;* +;* Permission to use, copy, modify, and/or distribute this software for any +;* purpose with or without fee is hereby granted, provided that the above +;* copyright notice and this permission notice appear in all copies. +;* +;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +;***************************************************************************** + +; This is a header file for the x264ASM assembly language, which uses +; NASM/YASM syntax combined with a large number of macros to provide easy +; abstraction between different calling conventions (x86_32, win64, linux64). +; It also has various other useful features to simplify writing the kind of +; DSP functions that are most often used in x264. + +; Unlike the rest of x264, this file is available under an ISC license, as it +; has significant usefulness outside of x264 and we want it to be available +; to the largest audience possible. 
Of course, if you modify it for your own +; purposes to add a new feature, we strongly encourage contributing a patch +; as this feature might be useful for others as well. Send patches or ideas +; to x264-devel@videolan.org . + +%ifndef private_prefix + %define private_prefix x265 +%endif + +%ifndef public_prefix + %define public_prefix private_prefix +%endif + +%ifndef STACK_ALIGNMENT + %if ARCH_X86_64 + %define STACK_ALIGNMENT 16 + %else + %define STACK_ALIGNMENT 4 + %endif +%endif + +%define WIN64 0 +%define UNIX64 0 +%if ARCH_X86_64 + %ifidn __OUTPUT_FORMAT__,win32 + %define WIN64 1 + %elifidn __OUTPUT_FORMAT__,win64 + %define WIN64 1 + %elifidn __OUTPUT_FORMAT__,x64 + %define WIN64 1 + %else + %define UNIX64 1 + %endif +%endif + +%ifdef PREFIX + %define mangle(x) _ %+ x +%else + %define mangle(x) x +%endif + +%macro SECTION_RODATA 0-1 16 + SECTION .rodata align=%1 +%endmacro + +%macro SECTION_TEXT 0-1 16 + SECTION .text align=%1 +%endmacro + +%if WIN64 + %define PIC +%elif ARCH_X86_64 == 0 +; x86_32 doesn't require PIC. +; Some distros prefer shared objects to be PIC, but nothing breaks if +; the code contains a few textrels, so we'll skip that complexity. + %undef PIC +%endif +%ifdef PIC + default rel +%endif + +; Macros to eliminate most code duplication between x86_32 and x86_64: +; Currently this works only for leaf functions which load all their arguments +; into registers at the start, and make no other use of the stack. Luckily that +; covers most of x264's asm. + +; PROLOGUE: +; %1 = number of arguments. loads them from stack if needed. +; %2 = number of registers used. pushes callee-saved regs if needed. +; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed. +; %4 = (optional) stack size to be allocated. The stack will be aligned before +; allocating the specified stack size. 
If the required stack alignment is +; larger than the known stack alignment the stack will be manually aligned +; and an extra register will be allocated to hold the original stack +; pointer (to not invalidate r0m etc.). To prevent the use of an extra +; register as stack pointer, request a negative stack size. +; %4+/%5+ = list of names to define to registers +; PROLOGUE can also be invoked by adding the same options to cglobal + +; e.g. +; cglobal foo, 2,3,7,0x40, dst, src, tmp +; declares a function (foo) that automatically loads two arguments (dst and +; src) into registers, uses one additional register (tmp) plus 7 vector +; registers (m0-m6) and allocates 0x40 bytes of stack space. + +; TODO Some functions can use some args directly from the stack. If they're the +; last args then you can just not declare them, but if they're in the middle +; we need more flexible macro. + +; RET: +; Pops anything that was pushed by PROLOGUE, and returns. + +; REP_RET: +; Use this instead of RET if it's a branch target. 
+ +; registers: +; rN and rNq are the native-size register holding function argument N +; rNd, rNw, rNb are dword, word, and byte size +; rNh is the high 8 bits of the word size +; rNm is the original location of arg N (a register or on the stack), dword +; rNmp is native size + +%macro DECLARE_REG 2-3 + %define r%1q %2 + %define r%1d %2d + %define r%1w %2w + %define r%1b %2b + %define r%1h %2h + %if %0 == 2 + %define r%1m %2d + %define r%1mp %2 + %elif ARCH_X86_64 ; memory + %define r%1m [rstk + stack_offset + %3] + %define r%1mp qword r %+ %1 %+ m + %else + %define r%1m [rstk + stack_offset + %3] + %define r%1mp dword r %+ %1 %+ m + %endif + %define r%1 %2 +%endmacro + +%macro DECLARE_REG_SIZE 3 + %define r%1q r%1 + %define e%1q r%1 + %define r%1d e%1 + %define e%1d e%1 + %define r%1w %1 + %define e%1w %1 + %define r%1h %3 + %define e%1h %3 + %define r%1b %2 + %define e%1b %2 +%if ARCH_X86_64 == 0 + %define r%1 e%1 +%endif +%endmacro + +DECLARE_REG_SIZE ax, al, ah +DECLARE_REG_SIZE bx, bl, bh +DECLARE_REG_SIZE cx, cl, ch +DECLARE_REG_SIZE dx, dl, dh +DECLARE_REG_SIZE si, sil, null +DECLARE_REG_SIZE di, dil, null +DECLARE_REG_SIZE bp, bpl, null + +; t# defines for when per-arch register allocation is more complex than just function arguments + +%macro DECLARE_REG_TMP 1-* + %assign %%i 0 + %rep %0 + CAT_XDEFINE t, %%i, r%1 + %assign %%i %%i+1 + %rotate 1 + %endrep +%endmacro + +%macro DECLARE_REG_TMP_SIZE 0-* + %rep %0 + %define t%1q t%1 %+ q + %define t%1d t%1 %+ d + %define t%1w t%1 %+ w + %define t%1h t%1 %+ h + %define t%1b t%1 %+ b + %rotate 1 + %endrep +%endmacro + +DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 + +%if ARCH_X86_64 + %define gprsize 8 +%else + %define gprsize 4 +%endif + +%macro PUSH 1 + push %1 + %ifidn rstk, rsp + %assign stack_offset stack_offset+gprsize + %endif +%endmacro + +%macro POP 1 + pop %1 + %ifidn rstk, rsp + %assign stack_offset stack_offset-gprsize + %endif +%endmacro + +%macro PUSH_IF_USED 1-* + %rep %0 + %if %1 < 
regs_used + PUSH r%1 + %endif + %rotate 1 + %endrep +%endmacro + +%macro POP_IF_USED 1-* + %rep %0 + %if %1 < regs_used + pop r%1 + %endif + %rotate 1 + %endrep +%endmacro + +%macro LOAD_IF_USED 1-* + %rep %0 + %if %1 < num_args + mov r%1, r %+ %1 %+ mp + %endif + %rotate 1 + %endrep +%endmacro + +%macro SUB 2 + sub %1, %2 + %ifidn %1, rstk + %assign stack_offset stack_offset+(%2) + %endif +%endmacro + +%macro ADD 2 + add %1, %2 + %ifidn %1, rstk + %assign stack_offset stack_offset-(%2) + %endif +%endmacro + +%macro movifnidn 2 + %ifnidn %1, %2 + mov %1, %2 + %endif +%endmacro + +%macro movsxdifnidn 2 + %ifnidn %1, %2 + movsxd %1, %2 + %endif +%endmacro + +%macro ASSERT 1 + %if (%1) == 0 + %error assert failed + %endif +%endmacro + +%macro DEFINE_ARGS 0-* + %ifdef n_arg_names + %assign %%i 0 + %rep n_arg_names + CAT_UNDEF arg_name %+ %%i, q + CAT_UNDEF arg_name %+ %%i, d + CAT_UNDEF arg_name %+ %%i, w + CAT_UNDEF arg_name %+ %%i, h + CAT_UNDEF arg_name %+ %%i, b + CAT_UNDEF arg_name %+ %%i, m + CAT_UNDEF arg_name %+ %%i, mp + CAT_UNDEF arg_name, %%i + %assign %%i %%i+1 + %endrep + %endif + + %xdefine %%stack_offset stack_offset + %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine + %assign %%i 0 + %rep %0 + %xdefine %1q r %+ %%i %+ q + %xdefine %1d r %+ %%i %+ d + %xdefine %1w r %+ %%i %+ w + %xdefine %1h r %+ %%i %+ h + %xdefine %1b r %+ %%i %+ b + %xdefine %1m r %+ %%i %+ m + %xdefine %1mp r %+ %%i %+ mp + CAT_XDEFINE arg_name, %%i, %1 + %assign %%i %%i+1 + %rotate 1 + %endrep + %xdefine stack_offset %%stack_offset + %assign n_arg_names %0 +%endmacro + +%define required_stack_alignment ((mmsize + 15) & ~15) + +%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only) + %ifnum %1 + %if %1 != 0 + %assign %%pad 0 + %assign stack_size %1 + %if stack_size < 0 + %assign stack_size -stack_size + %endif + %if WIN64 + %assign %%pad %%pad + 32 ; shadow space + %if mmsize != 8 + %assign xmm_regs_used %2 + %if 
xmm_regs_used > 8 + %assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers + %endif + %endif + %endif + %if required_stack_alignment <= STACK_ALIGNMENT + ; maintain the current stack alignment + %assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1)) + SUB rsp, stack_size_padded + %else + %assign %%reg_num (regs_used - 1) + %xdefine rstk r %+ %%reg_num + ; align stack, and save original stack location directly above + ; it, i.e. in [rsp+stack_size_padded], so we can restore the + ; stack in a single instruction (i.e. mov rsp, rstk or mov + ; rsp, [rsp+stack_size_padded]) + %if %1 < 0 ; need to store rsp on stack + %xdefine rstkm [rsp + stack_size + %%pad] + %assign %%pad %%pad + gprsize + %else ; can keep rsp in rstk during whole function + %xdefine rstkm rstk + %endif + %assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1)) + mov rstk, rsp + and rsp, ~(required_stack_alignment-1) + sub rsp, stack_size_padded + movifnidn rstkm, rstk + %endif + WIN64_PUSH_XMM + %endif + %endif +%endmacro + +%macro SETUP_STACK_POINTER 1 + %ifnum %1 + %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT + %if %1 > 0 + %assign regs_used (regs_used + 1) + %elif ARCH_X86_64 && regs_used == num_args && num_args <= 4 + UNIX64 * 2 + %warning "Stack pointer will overwrite register argument" + %endif + %endif + %endif +%endmacro + +%macro DEFINE_ARGS_INTERNAL 3+ + %ifnum %2 + DEFINE_ARGS %3 + %elif %1 == 4 + DEFINE_ARGS %2 + %elif %1 > 4 + DEFINE_ARGS %2, %3 + %endif +%endmacro + +%if WIN64 ; Windows x64 ;================================================= + +DECLARE_REG 0, rcx +DECLARE_REG 1, rdx +DECLARE_REG 2, R8 +DECLARE_REG 3, R9 +DECLARE_REG 4, R10, 40 +DECLARE_REG 5, R11, 48 +DECLARE_REG 6, rax, 56 +DECLARE_REG 7, rdi, 64 +DECLARE_REG 8, rsi, 72 +DECLARE_REG 9, rbx, 80 +DECLARE_REG 10, rbp, 88 +DECLARE_REG 11, R12, 96 +DECLARE_REG 12, R13, 104 +DECLARE_REG 13, R14, 
112 +DECLARE_REG 14, R15, 120 + +%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... + %assign num_args %1 + %assign regs_used %2 + ASSERT regs_used >= num_args + SETUP_STACK_POINTER %4 + ASSERT regs_used <= 15 + PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14 + ALLOC_STACK %4, %3 + %if mmsize != 8 && stack_size == 0 + WIN64_SPILL_XMM %3 + %endif + LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 + DEFINE_ARGS_INTERNAL %0, %4, %5 +%endmacro + +%macro WIN64_PUSH_XMM 0 + ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated. + %if xmm_regs_used > 6 + movaps [rstk + stack_offset + 8], xmm6 + %endif + %if xmm_regs_used > 7 + movaps [rstk + stack_offset + 24], xmm7 + %endif + %if xmm_regs_used > 8 + %assign %%i 8 + %rep xmm_regs_used-8 + movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i + %assign %%i %%i+1 + %endrep + %endif +%endmacro + +%macro WIN64_SPILL_XMM 1 + %assign xmm_regs_used %1 + ASSERT xmm_regs_used <= 16 + %if xmm_regs_used > 8 + ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack. 
+ %assign %%pad (xmm_regs_used-8)*16 + 32 + %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1)) + SUB rsp, stack_size_padded + %endif + WIN64_PUSH_XMM +%endmacro + +%macro WIN64_RESTORE_XMM_INTERNAL 1 + %assign %%pad_size 0 + %if xmm_regs_used > 8 + %assign %%i xmm_regs_used + %rep xmm_regs_used-8 + %assign %%i %%i-1 + movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32] + %endrep + %endif + %if stack_size_padded > 0 + %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT + mov rsp, rstkm + %else + add %1, stack_size_padded + %assign %%pad_size stack_size_padded + %endif + %endif + %if xmm_regs_used > 7 + movaps xmm7, [%1 + stack_offset - %%pad_size + 24] + %endif + %if xmm_regs_used > 6 + movaps xmm6, [%1 + stack_offset - %%pad_size + 8] + %endif +%endmacro + +%macro WIN64_RESTORE_XMM 1 + WIN64_RESTORE_XMM_INTERNAL %1 + %assign stack_offset (stack_offset-stack_size_padded) + %assign xmm_regs_used 0 +%endmacro + +%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0 + +%macro RET 0 + WIN64_RESTORE_XMM_INTERNAL rsp + POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7 +%if mmsize == 32 + vzeroupper +%endif + AUTO_REP_RET +%endmacro + +%elif ARCH_X86_64 ; *nix x64 ;============================================= + +DECLARE_REG 0, rdi +DECLARE_REG 1, rsi +DECLARE_REG 2, rdx +DECLARE_REG 3, rcx +DECLARE_REG 4, R8 +DECLARE_REG 5, R9 +DECLARE_REG 6, rax, 8 +DECLARE_REG 7, R10, 16 +DECLARE_REG 8, R11, 24 +DECLARE_REG 9, rbx, 32 +DECLARE_REG 10, rbp, 40 +DECLARE_REG 11, R12, 48 +DECLARE_REG 12, R13, 56 +DECLARE_REG 13, R14, 64 +DECLARE_REG 14, R15, 72 + +%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names... 
+ %assign num_args %1 + %assign regs_used %2 + ASSERT regs_used >= num_args + SETUP_STACK_POINTER %4 + ASSERT regs_used <= 15 + PUSH_IF_USED 9, 10, 11, 12, 13, 14 + ALLOC_STACK %4 + LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14 + DEFINE_ARGS_INTERNAL %0, %4, %5 +%endmacro + +%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0 + +%macro RET 0 +%if stack_size_padded > 0 +%if required_stack_alignment > STACK_ALIGNMENT + mov rsp, rstkm +%else + add rsp, stack_size_padded +%endif +%endif + POP_IF_USED 14, 13, 12, 11, 10, 9 +%if mmsize == 32 + vzeroupper +%endif + AUTO_REP_RET +%endmacro + +%else ; X86_32 ;============================================================== + +DECLARE_REG 0, eax, 4 +DECLARE_REG 1, ecx, 8 +DECLARE_REG 2, edx, 12 +DECLARE_REG 3, ebx, 16 +DECLARE_REG 4, esi, 20 +DECLARE_REG 5, edi, 24 +DECLARE_REG 6, ebp, 28 +%define rsp esp + +%macro DECLARE_ARG 1-* + %rep %0 + %define r%1m [rstk + stack_offset + 4*%1 + 4] + %define r%1mp dword r%1m + %rotate 1 + %endrep +%endmacro + +DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 + +%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names... 
+ %assign num_args %1 + %assign regs_used %2 + ASSERT regs_used >= num_args + %if num_args > 7 + %assign num_args 7 + %endif + %if regs_used > 7 + %assign regs_used 7 + %endif + SETUP_STACK_POINTER %4 + ASSERT regs_used <= 7 + PUSH_IF_USED 3, 4, 5, 6 + ALLOC_STACK %4 + LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6 + DEFINE_ARGS_INTERNAL %0, %4, %5 +%endmacro + +%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0 + +%macro RET 0 +%if stack_size_padded > 0 +%if required_stack_alignment > STACK_ALIGNMENT + mov rsp, rstkm +%else + add rsp, stack_size_padded +%endif +%endif + POP_IF_USED 6, 5, 4, 3 +%if mmsize == 32 + vzeroupper +%endif + AUTO_REP_RET +%endmacro + +%endif ;====================================================================== + +%if WIN64 == 0 +%macro WIN64_SPILL_XMM 1 +%endmacro +%macro WIN64_RESTORE_XMM 1 +%endmacro +%macro WIN64_PUSH_XMM 0 +%endmacro +%endif + +; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either +; a branch or a branch target. So switch to a 2-byte form of ret in that case. +; We can automatically detect "follows a branch", but not a branch target. +; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.) +%macro REP_RET 0 + %if has_epilogue + RET + %else + rep ret + %endif +%endmacro + +%define last_branch_adr $$ +%macro AUTO_REP_RET 0 + %ifndef cpuflags + times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ != last_branch_adr. 
+ %elif notcpuflag(ssse3) + times ((last_branch_adr-$)>>31)+1 rep + %endif + ret +%endmacro + +%macro BRANCH_INSTR 0-* + %rep %0 + %macro %1 1-2 %1 + %2 %1 + %%branch_instr: + %xdefine last_branch_adr %%branch_instr + %endmacro + %rotate 1 + %endrep +%endmacro + +BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp + +%macro TAIL_CALL 2 ; callee, is_nonadjacent + %if has_epilogue + call %1 + RET + %elif %2 + jmp %1 + %endif +%endmacro + +;============================================================================= +; arch-independent part +;============================================================================= + +%assign function_align 16 + +; Begin a function. +; Applies any symbol mangling needed for C linkage, and sets up a define such that +; subsequent uses of the function name automatically refer to the mangled version. +; Appends cpuflags to the function name if cpuflags has been specified. +; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX +; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2). 
+%macro cglobal 1-2+ "" ; name, [PROLOGUE args] + cglobal_internal 1, %1 %+ SUFFIX, %2 +%endmacro +%macro cvisible 1-2+ "" ; name, [PROLOGUE args] + cglobal_internal 0, %1 %+ SUFFIX, %2 +%endmacro +%macro cglobal_internal 2-3+ + %if %1 + %xdefine %%FUNCTION_PREFIX private_prefix + %xdefine %%VISIBILITY hidden + %else + %xdefine %%FUNCTION_PREFIX public_prefix + %xdefine %%VISIBILITY + %endif + %ifndef cglobaled_%2 + %xdefine %2 mangle(%%FUNCTION_PREFIX %+ _ %+ %2) + %xdefine %2.skip_prologue %2 %+ .skip_prologue + CAT_XDEFINE cglobaled_, %2, 1 + %endif + %xdefine current_function %2 + %ifidn __OUTPUT_FORMAT__,elf + global %2:function %%VISIBILITY + %else + global %2 + %endif + align function_align + %2: + RESET_MM_PERMUTATION ; needed for x86-64, also makes disassembly somewhat nicer + %xdefine rstk rsp ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required + %assign stack_offset 0 ; stack pointer offset relative to the return address + %assign stack_size 0 ; amount of stack space that can be freely used inside a function + %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding + %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 + %ifnidn %3, "" + PROLOGUE %3 + %endif +%endmacro + +%macro cextern 1 + %xdefine %1 mangle(private_prefix %+ _ %+ %1) + CAT_XDEFINE cglobaled_, %1, 1 + extern %1 +%endmacro + +; like cextern, but without the prefix +%macro cextern_naked 1 + %xdefine %1 mangle(%1) + CAT_XDEFINE cglobaled_, %1, 1 + extern %1 +%endmacro + +%macro const 1-2+ + %xdefine %1 mangle(private_prefix %+ _ %+ %1) + %ifidn __OUTPUT_FORMAT__,elf + global %1:data hidden + %else + global %1 + %endif + %1: %2 +%endmacro + +; This is needed for ELF, otherwise the GNU linker assumes the stack is +; executable by default. 
+%ifidn __OUTPUT_FORMAT__,elf +SECTION .note.GNU-stack noalloc noexec nowrite progbits +%endif + +; cpuflags + +%assign cpuflags_mmx (1<<0) +%assign cpuflags_mmx2 (1<<1) | cpuflags_mmx +%assign cpuflags_3dnow (1<<2) | cpuflags_mmx +%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow +%assign cpuflags_sse (1<<4) | cpuflags_mmx2 +%assign cpuflags_sse2 (1<<5) | cpuflags_sse +%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2 +%assign cpuflags_sse3 (1<<7) | cpuflags_sse2 +%assign cpuflags_ssse3 (1<<8) | cpuflags_sse3 +%assign cpuflags_sse4 (1<<9) | cpuflags_ssse3 +%assign cpuflags_sse42 (1<<10)| cpuflags_sse4 +%assign cpuflags_avx (1<<11)| cpuflags_sse42 +%assign cpuflags_xop (1<<12)| cpuflags_avx +%assign cpuflags_fma4 (1<<13)| cpuflags_avx +%assign cpuflags_avx2 (1<<14)| cpuflags_avx +%assign cpuflags_fma3 (1<<15)| cpuflags_avx + +%assign cpuflags_cache32 (1<<16) +%assign cpuflags_cache64 (1<<17) +%assign cpuflags_slowctz (1<<18) +%assign cpuflags_lzcnt (1<<19) +%assign cpuflags_aligned (1<<20) ; not a cpu feature, but a function variant +%assign cpuflags_atom (1<<21) +%assign cpuflags_bmi1 (1<<22)|cpuflags_lzcnt +%assign cpuflags_bmi2 (1<<23)|cpuflags_bmi1 + +%define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x)) +%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x)) + +; Takes an arbitrary number of cpuflags from the above list. +; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu. +; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co. 
+%macro INIT_CPUFLAGS 0-* + %xdefine SUFFIX + %undef cpuname + %assign cpuflags 0 + + %if %0 >= 1 + %rep %0 + %ifdef cpuname + %xdefine cpuname cpuname %+ _%1 + %else + %xdefine cpuname %1 + %endif + %assign cpuflags cpuflags | cpuflags_%1 + %rotate 1 + %endrep + %xdefine SUFFIX _ %+ cpuname + + %if cpuflag(avx) + %assign avx_enabled 1 + %endif + %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2)) + %define mova movaps + %define movu movups + %define movnta movntps + %endif + %if cpuflag(aligned) + %define movu mova + %elif cpuflag(sse3) && notcpuflag(ssse3) + %define movu lddqu + %endif + %endif + + %if ARCH_X86_64 || cpuflag(sse2) + CPU amdnop + %else + CPU basicnop + %endif +%endmacro + +; Merge mmx and sse* +; m# is a simd register of the currently selected size +; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m# +; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m# +; (All 3 remain in sync through SWAP.) 
+ +%macro CAT_XDEFINE 3 + %xdefine %1%2 %3 +%endmacro + +%macro CAT_UNDEF 2 + %undef %1%2 +%endmacro + +%macro INIT_MMX 0-1+ + %assign avx_enabled 0 + %define RESET_MM_PERMUTATION INIT_MMX %1 + %define mmsize 8 + %define num_mmregs 8 + %define mova movq + %define movu movq + %define movh movd + %define movnta movntq + %assign %%i 0 + %rep 8 + CAT_XDEFINE m, %%i, mm %+ %%i + CAT_XDEFINE nmm, %%i, %%i + %assign %%i %%i+1 + %endrep + %rep 8 + CAT_UNDEF m, %%i + CAT_UNDEF nmm, %%i + %assign %%i %%i+1 + %endrep + INIT_CPUFLAGS %1 +%endmacro + +%macro INIT_XMM 0-1+ + %assign avx_enabled 0 + %define RESET_MM_PERMUTATION INIT_XMM %1 + %define mmsize 16 + %define num_mmregs 8 + %if ARCH_X86_64 + %define num_mmregs 16 + %endif + %define mova movdqa + %define movu movdqu + %define movh movq + %define movnta movntdq + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE m, %%i, xmm %+ %%i + CAT_XDEFINE nxmm, %%i, %%i + %assign %%i %%i+1 + %endrep + INIT_CPUFLAGS %1 +%endmacro + +%macro INIT_YMM 0-1+ + %assign avx_enabled 1 + %define RESET_MM_PERMUTATION INIT_YMM %1 + %define mmsize 32 + %define num_mmregs 8 + %if ARCH_X86_64 + %define num_mmregs 16 + %endif + %define mova movdqa + %define movu movdqu + %undef movh + %define movnta movntdq + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE m, %%i, ymm %+ %%i + CAT_XDEFINE nymm, %%i, %%i + %assign %%i %%i+1 + %endrep + INIT_CPUFLAGS %1 +%endmacro + +INIT_XMM + +%macro DECLARE_MMCAST 1 + %define mmmm%1 mm%1 + %define mmxmm%1 mm%1 + %define mmymm%1 mm%1 + %define xmmmm%1 mm%1 + %define xmmxmm%1 xmm%1 + %define xmmymm%1 xmm%1 + %define ymmmm%1 mm%1 + %define ymmxmm%1 xmm%1 + %define ymmymm%1 ymm%1 + %define ymm%1xmm xmm%1 + %define xmm%1ymm ymm%1 + %define xm%1 xmm %+ m%1 + %define ym%1 ymm %+ m%1 +%endmacro + +%assign i 0 +%rep 16 + DECLARE_MMCAST i +%assign i i+1 +%endrep + +; I often want to use macros that permute their arguments. e.g. 
there's no +; efficient way to implement butterfly or transpose or dct without swapping some +; arguments. +; +; I would like to not have to manually keep track of the permutations: +; If I insert a permutation in the middle of a function, it should automatically +; change everything that follows. For more complex macros I may also have multiple +; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations. +; +; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that +; permutes its arguments. It's equivalent to exchanging the contents of the +; registers, except that this way you exchange the register names instead, so it +; doesn't cost any cycles. + +%macro PERMUTE 2-* ; takes a list of pairs to swap +%rep %0/2 + %xdefine %%tmp%2 m%2 + %rotate 2 +%endrep +%rep %0/2 + %xdefine m%1 %%tmp%2 + CAT_XDEFINE n, m%1, %1 + %rotate 2 +%endrep +%endmacro + +%macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs) +%ifnum %1 ; SWAP 0, 1, ... + SWAP_INTERNAL_NUM %1, %2 +%else ; SWAP m0, m1, ... + SWAP_INTERNAL_NAME %1, %2 +%endif +%endmacro + +%macro SWAP_INTERNAL_NUM 2-* + %rep %0-1 + %xdefine %%tmp m%1 + %xdefine m%1 m%2 + %xdefine m%2 %%tmp + CAT_XDEFINE n, m%1, %1 + CAT_XDEFINE n, m%2, %2 + %rotate 1 + %endrep +%endmacro + +%macro SWAP_INTERNAL_NAME 2-* + %xdefine %%args n %+ %1 + %rep %0-1 + %xdefine %%args %%args, n %+ %2 + %rotate 1 + %endrep + SWAP_INTERNAL_NUM %%args +%endmacro + +; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later +; calls to that function will automatically load the permutation, so values can +; be returned in mmregs. 
+%macro SAVE_MM_PERMUTATION 0-1 + %if %0 + %xdefine %%f %1_m + %else + %xdefine %%f current_function %+ _m + %endif + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE %%f, %%i, m %+ %%i + %assign %%i %%i+1 + %endrep +%endmacro + +%macro LOAD_MM_PERMUTATION 1 ; name to load from + %ifdef %1_m0 + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE m, %%i, %1_m %+ %%i + CAT_XDEFINE n, m %+ %%i, %%i + %assign %%i %%i+1 + %endrep + %endif +%endmacro + +; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't +%macro call 1 + call_internal %1, %1 %+ SUFFIX +%endmacro +%macro call_internal 2 + %xdefine %%i %1 + %ifndef cglobaled_%1 + %ifdef cglobaled_%2 + %xdefine %%i %2 + %endif + %endif + call %%i + LOAD_MM_PERMUTATION %%i +%endmacro + +; Substitutions that reduce instruction size but are functionally equivalent +%macro add 2 + %ifnum %2 + %if %2==128 + sub %1, -128 + %else + add %1, %2 + %endif + %else + add %1, %2 + %endif +%endmacro + +%macro sub 2 + %ifnum %2 + %if %2==128 + add %1, -128 + %else + sub %1, %2 + %endif + %else + sub %1, %2 + %endif +%endmacro + +;============================================================================= +; AVX abstraction layer +;============================================================================= + +%assign i 0 +%rep 16 + %if i < 8 + CAT_XDEFINE sizeofmm, i, 8 + %endif + CAT_XDEFINE sizeofxmm, i, 16 + CAT_XDEFINE sizeofymm, i, 32 +%assign i i+1 +%endrep +%undef i + +%macro CHECK_AVX_INSTR_EMU 3-* + %xdefine %%opcode %1 + %xdefine %%dst %2 + %rep %0-2 + %ifidn %%dst, %3 + %error non-avx emulation of ``%%opcode'' is not supported + %endif + %rotate 1 + %endrep +%endmacro + +;%1 == instruction +;%2 == minimal instruction set +;%3 == 1 if float, 0 if int +;%4 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise +;%5 == 1 if commutative (i.e. 
doesn't matter which src arg is which), 0 if not +;%6+: operands +%macro RUN_AVX_INSTR 6-9+ + %ifnum sizeof%7 + %assign __sizeofreg sizeof%7 + %elifnum sizeof%6 + %assign __sizeofreg sizeof%6 + %else + %assign __sizeofreg mmsize + %endif + %assign __emulate_avx 0 + %if avx_enabled && __sizeofreg >= 16 + %xdefine __instr v%1 + %else + %xdefine __instr %1 + %if %0 >= 8+%4 + %assign __emulate_avx 1 + %endif + %endif + %ifnidn %2, fnord + %ifdef cpuname + %if notcpuflag(%2) + %error use of ``%1'' %2 instruction in cpuname function: current_function + %endif + %endif + %endif + + %if __emulate_avx + %xdefine __src1 %7 + %xdefine __src2 %8 + %ifnidn %6, %7 + %if %0 >= 9 + CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, %8, %9 + %else + CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, %8 + %endif + %if %5 && %4 == 0 + %ifnid %8 + ; 3-operand AVX instructions with a memory arg can only have it in src2, + ; whereas SSE emulation prefers to have it in src1 (i.e. the mov). + ; So, if the instruction is commutative with a memory arg, swap them. + %xdefine __src1 %8 + %xdefine __src2 %7 + %endif + %endif + %if __sizeofreg == 8 + MOVQ %6, __src1 + %elif %3 + MOVAPS %6, __src1 + %else + MOVDQA %6, __src1 + %endif + %endif + %if %0 >= 9 + %1 %6, __src2, %9 + %else + %1 %6, __src2 + %endif + %elif %0 >= 9 + __instr %6, %7, %8, %9 + %elif %0 == 8 + __instr %6, %7, %8 + %elif %0 == 7 + __instr %6, %7 + %else + __instr %6 + %endif +%endmacro + +;%1 == instruction +;%2 == minimal instruction set +;%3 == 1 if float, 0 if int +;%4 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise +;%5 == 1 if commutative (i.e. 
doesn't matter which src arg is which), 0 if not +%macro AVX_INSTR 1-5 fnord, 0, 1, 0 + %macro %1 1-10 fnord, fnord, fnord, fnord, %1, %2, %3, %4, %5 + %ifidn %2, fnord + RUN_AVX_INSTR %6, %7, %8, %9, %10, %1 + %elifidn %3, fnord + RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2 + %elifidn %4, fnord + RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3 + %elifidn %5, fnord + RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4 + %else + RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4, %5 + %endif + %endmacro +%endmacro + +; Instructions with both VEX and non-VEX encodings +; Non-destructive instructions are written without parameters +AVX_INSTR addpd, sse2, 1, 0, 1 +AVX_INSTR addps, sse, 1, 0, 1 +AVX_INSTR addsd, sse2, 1, 0, 1 +AVX_INSTR addss, sse, 1, 0, 1 +AVX_INSTR addsubpd, sse3, 1, 0, 0 +AVX_INSTR addsubps, sse3, 1, 0, 0 +AVX_INSTR aesdec, fnord, 0, 0, 0 +AVX_INSTR aesdeclast, fnord, 0, 0, 0 +AVX_INSTR aesenc, fnord, 0, 0, 0 +AVX_INSTR aesenclast, fnord, 0, 0, 0 +AVX_INSTR aesimc +AVX_INSTR aeskeygenassist +AVX_INSTR andnpd, sse2, 1, 0, 0 +AVX_INSTR andnps, sse, 1, 0, 0 +AVX_INSTR andpd, sse2, 1, 0, 1 +AVX_INSTR andps, sse, 1, 0, 1 +AVX_INSTR blendpd, sse4, 1, 0, 0 +AVX_INSTR blendps, sse4, 1, 0, 0 +AVX_INSTR blendvpd, sse4, 1, 0, 0 +AVX_INSTR blendvps, sse4, 1, 0, 0 +AVX_INSTR cmppd, sse2, 1, 1, 0 +AVX_INSTR cmpps, sse, 1, 1, 0 +AVX_INSTR cmpsd, sse2, 1, 1, 0 +AVX_INSTR cmpss, sse, 1, 1, 0 +AVX_INSTR comisd, sse2 +AVX_INSTR comiss, sse +AVX_INSTR cvtdq2pd, sse2 +AVX_INSTR cvtdq2ps, sse2 +AVX_INSTR cvtpd2dq, sse2 +AVX_INSTR cvtpd2ps, sse2 +AVX_INSTR cvtps2dq, sse2 +AVX_INSTR cvtps2pd, sse2 +AVX_INSTR cvtsd2si, sse2 +AVX_INSTR cvtsd2ss, sse2 +AVX_INSTR cvtsi2sd, sse2 +AVX_INSTR cvtsi2ss, sse +AVX_INSTR cvtss2sd, sse2 +AVX_INSTR cvtss2si, sse +AVX_INSTR cvttpd2dq, sse2 +AVX_INSTR cvttps2dq, sse2 +AVX_INSTR cvttsd2si, sse2 +AVX_INSTR cvttss2si, sse +AVX_INSTR divpd, sse2, 1, 0, 0 +AVX_INSTR divps, sse, 1, 0, 0 +AVX_INSTR divsd, sse2, 1, 0, 0 +AVX_INSTR divss, sse, 1, 0, 0 
+AVX_INSTR dppd, sse4, 1, 1, 0 +AVX_INSTR dpps, sse4, 1, 1, 0 +AVX_INSTR extractps, sse4 +AVX_INSTR haddpd, sse3, 1, 0, 0 +AVX_INSTR haddps, sse3, 1, 0, 0 +AVX_INSTR hsubpd, sse3, 1, 0, 0 +AVX_INSTR hsubps, sse3, 1, 0, 0 +AVX_INSTR insertps, sse4, 1, 1, 0 +AVX_INSTR lddqu, sse3 +AVX_INSTR ldmxcsr, sse +AVX_INSTR maskmovdqu, sse2 +AVX_INSTR maxpd, sse2, 1, 0, 1 +AVX_INSTR maxps, sse, 1, 0, 1 +AVX_INSTR maxsd, sse2, 1, 0, 1 +AVX_INSTR maxss, sse, 1, 0, 1 +AVX_INSTR minpd, sse2, 1, 0, 1 +AVX_INSTR minps, sse, 1, 0, 1 +AVX_INSTR minsd, sse2, 1, 0, 1 +AVX_INSTR minss, sse, 1, 0, 1 +AVX_INSTR movapd, sse2 +AVX_INSTR movaps, sse +AVX_INSTR movd +AVX_INSTR movddup, sse3 +AVX_INSTR movdqa, sse2 +AVX_INSTR movdqu, sse2 +AVX_INSTR movhlps, sse, 1, 0, 0 +AVX_INSTR movhpd, sse2, 1, 0, 0 +AVX_INSTR movhps, sse, 1, 0, 0 +AVX_INSTR movlhps, sse, 1, 0, 0 +AVX_INSTR movlpd, sse2, 1, 0, 0 +AVX_INSTR movlps, sse, 1, 0, 0 +AVX_INSTR movmskpd, sse2 +AVX_INSTR movmskps, sse +AVX_INSTR movntdq, sse2 +AVX_INSTR movntdqa, sse4 +AVX_INSTR movntpd, sse2 +AVX_INSTR movntps, sse +AVX_INSTR movq +AVX_INSTR movsd, sse2, 1, 0, 0 +AVX_INSTR movshdup, sse3 +AVX_INSTR movsldup, sse3 +AVX_INSTR movss, sse, 1, 0, 0 +AVX_INSTR movupd, sse2 +AVX_INSTR movups, sse +AVX_INSTR mpsadbw, sse4 +AVX_INSTR mulpd, sse2, 1, 0, 1 +AVX_INSTR mulps, sse, 1, 0, 1 +AVX_INSTR mulsd, sse2, 1, 0, 1 +AVX_INSTR mulss, sse, 1, 0, 1 +AVX_INSTR orpd, sse2, 1, 0, 1 +AVX_INSTR orps, sse, 1, 0, 1 +AVX_INSTR pabsb, ssse3 +AVX_INSTR pabsd, ssse3 +AVX_INSTR pabsw, ssse3 +AVX_INSTR packsswb, mmx, 0, 0, 0 +AVX_INSTR packssdw, mmx, 0, 0, 0 +AVX_INSTR packuswb, mmx, 0, 0, 0 +AVX_INSTR packusdw, sse4, 0, 0, 0 +AVX_INSTR paddb, mmx, 0, 0, 1 +AVX_INSTR paddw, mmx, 0, 0, 1 +AVX_INSTR paddd, mmx, 0, 0, 1 +AVX_INSTR paddq, sse2, 0, 0, 1 +AVX_INSTR paddsb, mmx, 0, 0, 1 +AVX_INSTR paddsw, mmx, 0, 0, 1 +AVX_INSTR paddusb, mmx, 0, 0, 1 +AVX_INSTR paddusw, mmx, 0, 0, 1 +AVX_INSTR palignr, ssse3 +AVX_INSTR pand, mmx, 0, 0, 1 +AVX_INSTR pandn, mmx, 
0, 0, 0 +AVX_INSTR pavgb, mmx2, 0, 0, 1 +AVX_INSTR pavgw, mmx2, 0, 0, 1 +AVX_INSTR pblendvb, sse4, 0, 0, 0 +AVX_INSTR pblendw, sse4 +AVX_INSTR pclmulqdq +AVX_INSTR pcmpestri, sse42 +AVX_INSTR pcmpestrm, sse42 +AVX_INSTR pcmpistri, sse42 +AVX_INSTR pcmpistrm, sse42 +AVX_INSTR pcmpeqb, mmx, 0, 0, 1 +AVX_INSTR pcmpeqw, mmx, 0, 0, 1 +AVX_INSTR pcmpeqd, mmx, 0, 0, 1 +AVX_INSTR pcmpeqq, sse4, 0, 0, 1 +AVX_INSTR pcmpgtb, mmx, 0, 0, 0 +AVX_INSTR pcmpgtw, mmx, 0, 0, 0 +AVX_INSTR pcmpgtd, mmx, 0, 0, 0 +AVX_INSTR pcmpgtq, sse42, 0, 0, 0 +AVX_INSTR pextrb, sse4 +AVX_INSTR pextrd, sse4 +AVX_INSTR pextrq, sse4 +AVX_INSTR pextrw, mmx2 +AVX_INSTR phaddw, ssse3, 0, 0, 0 +AVX_INSTR phaddd, ssse3, 0, 0, 0 +AVX_INSTR phaddsw, ssse3, 0, 0, 0 +AVX_INSTR phminposuw, sse4 +AVX_INSTR phsubw, ssse3, 0, 0, 0 +AVX_INSTR phsubd, ssse3, 0, 0, 0 +AVX_INSTR phsubsw, ssse3, 0, 0, 0 +AVX_INSTR pinsrb, sse4 +AVX_INSTR pinsrd, sse4 +AVX_INSTR pinsrq, sse4 +AVX_INSTR pinsrw, mmx2 +AVX_INSTR pmaddwd, mmx, 0, 0, 1 +AVX_INSTR pmaddubsw, ssse3, 0, 0, 0 +AVX_INSTR pmaxsb, sse4, 0, 0, 1 +AVX_INSTR pmaxsw, mmx2, 0, 0, 1 +AVX_INSTR pmaxsd, sse4, 0, 0, 1 +AVX_INSTR pmaxub, mmx2, 0, 0, 1 +AVX_INSTR pmaxuw, sse4, 0, 0, 1 +AVX_INSTR pmaxud, sse4, 0, 0, 1 +AVX_INSTR pminsb, sse4, 0, 0, 1 +AVX_INSTR pminsw, mmx2, 0, 0, 1 +AVX_INSTR pminsd, sse4, 0, 0, 1 +AVX_INSTR pminub, mmx2, 0, 0, 1 +AVX_INSTR pminuw, sse4, 0, 0, 1 +AVX_INSTR pminud, sse4, 0, 0, 1 +AVX_INSTR pmovmskb, mmx2 +AVX_INSTR pmovsxbw, sse4 +AVX_INSTR pmovsxbd, sse4 +AVX_INSTR pmovsxbq, sse4 +AVX_INSTR pmovsxwd, sse4 +AVX_INSTR pmovsxwq, sse4 +AVX_INSTR pmovsxdq, sse4 +AVX_INSTR pmovzxbw, sse4 +AVX_INSTR pmovzxbd, sse4 +AVX_INSTR pmovzxbq, sse4 +AVX_INSTR pmovzxwd, sse4 +AVX_INSTR pmovzxwq, sse4 +AVX_INSTR pmovzxdq, sse4 +AVX_INSTR pmuldq, sse4, 0, 0, 1 +AVX_INSTR pmulhrsw, ssse3, 0, 0, 1 +AVX_INSTR pmulhuw, mmx2, 0, 0, 1 +AVX_INSTR pmulhw, mmx, 0, 0, 1 +AVX_INSTR pmullw, mmx, 0, 0, 1 +AVX_INSTR pmulld, sse4, 0, 0, 1 +AVX_INSTR pmuludq, sse2, 0, 0, 1 
+AVX_INSTR por, mmx, 0, 0, 1 +AVX_INSTR psadbw, mmx2, 0, 0, 1 +AVX_INSTR pshufb, ssse3, 0, 0, 0 +AVX_INSTR pshufd, sse2 +AVX_INSTR pshufhw, sse2 +AVX_INSTR pshuflw, sse2 +AVX_INSTR psignb, ssse3, 0, 0, 0 +AVX_INSTR psignw, ssse3, 0, 0, 0 +AVX_INSTR psignd, ssse3, 0, 0, 0 +AVX_INSTR psllw, mmx, 0, 0, 0 +AVX_INSTR pslld, mmx, 0, 0, 0 +AVX_INSTR psllq, mmx, 0, 0, 0 +AVX_INSTR pslldq, sse2, 0, 0, 0 +AVX_INSTR psraw, mmx, 0, 0, 0 +AVX_INSTR psrad, mmx, 0, 0, 0 +AVX_INSTR psrlw, mmx, 0, 0, 0 +AVX_INSTR psrld, mmx, 0, 0, 0 +AVX_INSTR psrlq, mmx, 0, 0, 0 +AVX_INSTR psrldq, sse2, 0, 0, 0 +AVX_INSTR psubb, mmx, 0, 0, 0 +AVX_INSTR psubw, mmx, 0, 0, 0 +AVX_INSTR psubd, mmx, 0, 0, 0 +AVX_INSTR psubq, sse2, 0, 0, 0 +AVX_INSTR psubsb, mmx, 0, 0, 0 +AVX_INSTR psubsw, mmx, 0, 0, 0 +AVX_INSTR psubusb, mmx, 0, 0, 0 +AVX_INSTR psubusw, mmx, 0, 0, 0 +AVX_INSTR ptest, sse4 +AVX_INSTR punpckhbw, mmx, 0, 0, 0 +AVX_INSTR punpckhwd, mmx, 0, 0, 0 +AVX_INSTR punpckhdq, mmx, 0, 0, 0 +AVX_INSTR punpckhqdq, sse2, 0, 0, 0 +AVX_INSTR punpcklbw, mmx, 0, 0, 0 +AVX_INSTR punpcklwd, mmx, 0, 0, 0 +AVX_INSTR punpckldq, mmx, 0, 0, 0 +AVX_INSTR punpcklqdq, sse2, 0, 0, 0 +AVX_INSTR pxor, mmx, 0, 0, 1 +AVX_INSTR rcpps, sse, 1, 0, 0 +AVX_INSTR rcpss, sse, 1, 0, 0 +AVX_INSTR roundpd, sse4 +AVX_INSTR roundps, sse4 +AVX_INSTR roundsd, sse4 +AVX_INSTR roundss, sse4 +AVX_INSTR rsqrtps, sse, 1, 0, 0 +AVX_INSTR rsqrtss, sse, 1, 0, 0 +AVX_INSTR shufpd, sse2, 1, 1, 0 +AVX_INSTR shufps, sse, 1, 1, 0 +AVX_INSTR sqrtpd, sse2, 1, 0, 0 +AVX_INSTR sqrtps, sse, 1, 0, 0 +AVX_INSTR sqrtsd, sse2, 1, 0, 0 +AVX_INSTR sqrtss, sse, 1, 0, 0 +AVX_INSTR stmxcsr, sse +AVX_INSTR subpd, sse2, 1, 0, 0 +AVX_INSTR subps, sse, 1, 0, 0 +AVX_INSTR subsd, sse2, 1, 0, 0 +AVX_INSTR subss, sse, 1, 0, 0 +AVX_INSTR ucomisd, sse2 +AVX_INSTR ucomiss, sse +AVX_INSTR unpckhpd, sse2, 1, 0, 0 +AVX_INSTR unpckhps, sse, 1, 0, 0 +AVX_INSTR unpcklpd, sse2, 1, 0, 0 +AVX_INSTR unpcklps, sse, 1, 0, 0 +AVX_INSTR xorpd, sse2, 1, 0, 1 +AVX_INSTR xorps, sse, 1, 0, 
1 + +; 3DNow instructions, for sharing code between AVX, SSE and 3DN +AVX_INSTR pfadd, 3dnow, 1, 0, 1 +AVX_INSTR pfsub, 3dnow, 1, 0, 0 +AVX_INSTR pfmul, 3dnow, 1, 0, 1 + +; base-4 constants for shuffles +%assign i 0 +%rep 256 + %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3) + %if j < 10 + CAT_XDEFINE q000, j, i + %elif j < 100 + CAT_XDEFINE q00, j, i + %elif j < 1000 + CAT_XDEFINE q0, j, i + %else + CAT_XDEFINE q, j, i + %endif +%assign i i+1 +%endrep +%undef i +%undef j + +%macro FMA_INSTR 3 + %macro %1 4-7 %1, %2, %3 + %if cpuflag(xop) + v%5 %1, %2, %3, %4 + %elifnidn %1, %4 + %6 %1, %2, %3 + %7 %1, %4 + %else + %error non-xop emulation of ``%5 %1, %2, %3, %4'' is not supported + %endif + %endmacro +%endmacro + +FMA_INSTR pmacsww, pmullw, paddw +FMA_INSTR pmacsdd, pmulld, paddd ; sse4 emulation +FMA_INSTR pmacsdql, pmuldq, paddq ; sse4 emulation +FMA_INSTR pmadcswd, pmaddwd, paddd + +; convert FMA4 to FMA3 if possible +%macro FMA4_INSTR 4 + %macro %1 4-8 %1, %2, %3, %4 + %if cpuflag(fma4) + v%5 %1, %2, %3, %4 + %elifidn %1, %2 + v%6 %1, %4, %3 ; %1 = %1 * %3 + %4 + %elifidn %1, %3 + v%7 %1, %2, %4 ; %1 = %2 * %1 + %4 + %elifidn %1, %4 + v%8 %1, %2, %3 ; %1 = %2 * %3 + %1 + %else + %error fma3 emulation of ``%5 %1, %2, %3, %4'' is not supported + %endif + %endmacro +%endmacro + +FMA4_INSTR fmaddpd, fmadd132pd, fmadd213pd, fmadd231pd +FMA4_INSTR fmaddps, fmadd132ps, fmadd213ps, fmadd231ps +FMA4_INSTR fmaddsd, fmadd132sd, fmadd213sd, fmadd231sd +FMA4_INSTR fmaddss, fmadd132ss, fmadd213ss, fmadd231ss + +FMA4_INSTR fmaddsubpd, fmaddsub132pd, fmaddsub213pd, fmaddsub231pd +FMA4_INSTR fmaddsubps, fmaddsub132ps, fmaddsub213ps, fmaddsub231ps +FMA4_INSTR fmsubaddpd, fmsubadd132pd, fmsubadd213pd, fmsubadd231pd +FMA4_INSTR fmsubaddps, fmsubadd132ps, fmsubadd213ps, fmsubadd231ps + +FMA4_INSTR fmsubpd, fmsub132pd, fmsub213pd, fmsub231pd +FMA4_INSTR fmsubps, fmsub132ps, fmsub213ps, fmsub231ps +FMA4_INSTR fmsubsd, fmsub132sd, fmsub213sd, fmsub231sd +FMA4_INSTR 
fmsubss, fmsub132ss, fmsub213ss, fmsub231ss + +FMA4_INSTR fnmaddpd, fnmadd132pd, fnmadd213pd, fnmadd231pd +FMA4_INSTR fnmaddps, fnmadd132ps, fnmadd213ps, fnmadd231ps +FMA4_INSTR fnmaddsd, fnmadd132sd, fnmadd213sd, fnmadd231sd +FMA4_INSTR fnmaddss, fnmadd132ss, fnmadd213ss, fnmadd231ss + +FMA4_INSTR fnmsubpd, fnmsub132pd, fnmsub213pd, fnmsub231pd +FMA4_INSTR fnmsubps, fnmsub132ps, fnmsub213ps, fnmsub231ps +FMA4_INSTR fnmsubsd, fnmsub132sd, fnmsub213sd, fnmsub231sd +FMA4_INSTR fnmsubss, fnmsub132ss, fnmsub213ss, fnmsub231ss + +; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug +%if ARCH_X86_64 == 0 +%macro vpbroadcastq 2 +%if sizeof%1 == 16 + movddup %1, %2 +%else + vbroadcastsd %1, %2 +%endif +%endmacro +%endif + +; workaround: vpbroadcastd with register, the yasm will generate wrong code +%macro vpbroadcastd 2 + %ifid %2 + movd %1 %+ xmm, %2 + vpbroadcastd %1, %1 %+ xmm + %else + vpbroadcastd %1, %2 + %endif +%endmacro diff --git a/source/common/x86/x86util.asm b/source/common/x86/x86util.asm new file mode 100644 index 0000000..f2c8715 --- /dev/null +++ b/source/common/x86/x86util.asm @@ -0,0 +1,893 @@ +;***************************************************************************** +;* x86util.asm: x86 utility macros +;***************************************************************************** +;* Copyright (C) 2008-2013 x264 project +;* +;* Authors: Holger Lubitz +;* Loren Merritt +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. 
+;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +;* +;* This program is also available under a commercial proprietary license. +;* For more information, contact us at license @ x265.com. +;***************************************************************************** + +%assign FENC_STRIDE 64 +%assign FDEC_STRIDE 32 + +%assign SIZEOF_PIXEL 1 +%assign SIZEOF_DCTCOEF 2 +%define pixel byte +%define vpbroadcastdct vpbroadcastw +%define vpbroadcastpix vpbroadcastb +%if HIGH_BIT_DEPTH + %assign SIZEOF_PIXEL 2 + %assign SIZEOF_DCTCOEF 4 + %define pixel word + %define vpbroadcastdct vpbroadcastd + %define vpbroadcastpix vpbroadcastw +%endif + +%assign FENC_STRIDEB SIZEOF_PIXEL*FENC_STRIDE +%assign FDEC_STRIDEB SIZEOF_PIXEL*FDEC_STRIDE + +%assign PIXEL_MAX ((1 << BIT_DEPTH)-1) + +%macro FIX_STRIDES 1-* +%if HIGH_BIT_DEPTH +%rep %0 + add %1, %1 + %rotate 1 +%endrep +%endif +%endmacro + + +%macro SBUTTERFLY 4 +%ifidn %1, dqqq + vperm2i128 m%4, m%2, m%3, q0301 ; punpckh + vinserti128 m%2, m%2, xm%3, 1 ; punpckl +%elif avx_enabled && mmsize >= 16 + punpckh%1 m%4, m%2, m%3 + punpckl%1 m%2, m%3 +%else + mova m%4, m%2 + punpckl%1 m%2, m%3 + punpckh%1 m%4, m%3 +%endif + SWAP %3, %4 +%endmacro + +%macro SBUTTERFLY2 4 + punpckl%1 m%4, m%2, m%3 + punpckh%1 m%2, m%2, m%3 + SWAP %2, %4, %3 +%endmacro + +%macro TRANSPOSE4x4W 5 + SBUTTERFLY wd, %1, %2, %5 + SBUTTERFLY wd, %3, %4, %5 + SBUTTERFLY dq, %1, %3, %5 + SBUTTERFLY dq, %2, %4, %5 + SWAP %2, %3 +%endmacro + +%macro TRANSPOSE2x4x4W 5 + SBUTTERFLY wd, %1, %2, %5 + SBUTTERFLY wd, %3, %4, %5 + SBUTTERFLY dq, %1, %3, %5 + SBUTTERFLY dq, %2, %4, %5 + SBUTTERFLY qdq, %1, %2, %5 + SBUTTERFLY qdq, %3, %4, %5 +%endmacro + +%macro TRANSPOSE4x4D 5 + SBUTTERFLY dq, %1, %2, %5 + SBUTTERFLY dq, %3, %4, %5 + SBUTTERFLY qdq, %1, %3, %5 + SBUTTERFLY qdq, %2, %4, %5 + SWAP %2, %3 
+%endmacro + +%macro TRANSPOSE8x8W 9-11 +%if ARCH_X86_64 + SBUTTERFLY wd, %1, %2, %9 + SBUTTERFLY wd, %3, %4, %9 + SBUTTERFLY wd, %5, %6, %9 + SBUTTERFLY wd, %7, %8, %9 + SBUTTERFLY dq, %1, %3, %9 + SBUTTERFLY dq, %2, %4, %9 + SBUTTERFLY dq, %5, %7, %9 + SBUTTERFLY dq, %6, %8, %9 + SBUTTERFLY qdq, %1, %5, %9 + SBUTTERFLY qdq, %2, %6, %9 + SBUTTERFLY qdq, %3, %7, %9 + SBUTTERFLY qdq, %4, %8, %9 + SWAP %2, %5 + SWAP %4, %7 +%else +; in: m0..m7, unless %11 in which case m6 is in %9 +; out: m0..m7, unless %11 in which case m4 is in %10 +; spills into %9 and %10 +%if %0<11 + movdqa %9, m%7 +%endif + SBUTTERFLY wd, %1, %2, %7 + movdqa %10, m%2 + movdqa m%7, %9 + SBUTTERFLY wd, %3, %4, %2 + SBUTTERFLY wd, %5, %6, %2 + SBUTTERFLY wd, %7, %8, %2 + SBUTTERFLY dq, %1, %3, %2 + movdqa %9, m%3 + movdqa m%2, %10 + SBUTTERFLY dq, %2, %4, %3 + SBUTTERFLY dq, %5, %7, %3 + SBUTTERFLY dq, %6, %8, %3 + SBUTTERFLY qdq, %1, %5, %3 + SBUTTERFLY qdq, %2, %6, %3 + movdqa %10, m%2 + movdqa m%3, %9 + SBUTTERFLY qdq, %3, %7, %2 + SBUTTERFLY qdq, %4, %8, %2 + SWAP %2, %5 + SWAP %4, %7 +%if %0<11 + movdqa m%5, %10 +%endif +%endif +%endmacro + +%macro WIDEN_SXWD 2 + punpckhwd m%2, m%1 + psrad m%2, 16 +%if cpuflag(sse4) + pmovsxwd m%1, m%1 +%else + punpcklwd m%1, m%1 + psrad m%1, 16 +%endif +%endmacro + +%macro ABSW 2-3 ; dst, src, tmp (tmp used only if dst==src) +%if cpuflag(ssse3) + pabsw %1, %2 +%elifidn %3, sign ; version for pairing with PSIGNW: modifies src + pxor %1, %1 + pcmpgtw %1, %2 + pxor %2, %1 + psubw %2, %1 + SWAP %1, %2 +%elifidn %1, %2 + pxor %3, %3 + psubw %3, %1 + pmaxsw %1, %3 +%elifid %2 + pxor %1, %1 + psubw %1, %2 + pmaxsw %1, %2 +%elif %0 == 2 + pxor %1, %1 + psubw %1, %2 + pmaxsw %1, %2 +%else + mova %1, %2 + pxor %3, %3 + psubw %3, %1 + pmaxsw %1, %3 +%endif +%endmacro + +%macro ABSW2 6 ; dst1, dst2, src1, src2, tmp, tmp +%if cpuflag(ssse3) + pabsw %1, %3 + pabsw %2, %4 +%elifidn %1, %3 + pxor %5, %5 + pxor %6, %6 + psubw %5, %1 + psubw %6, %2 + pmaxsw %1, %5 + pmaxsw 
%2, %6 +%else + pxor %1, %1 + pxor %2, %2 + psubw %1, %3 + psubw %2, %4 + pmaxsw %1, %3 + pmaxsw %2, %4 +%endif +%endmacro + +%macro ABSB 2 +%if cpuflag(ssse3) + pabsb %1, %1 +%else + pxor %2, %2 + psubb %2, %1 + pminub %1, %2 +%endif +%endmacro + +%macro ABSD 2-3 +%if cpuflag(ssse3) + pabsd %1, %2 +%else + %define %%s %2 +%if %0 == 3 + mova %3, %2 + %define %%s %3 +%endif + pxor %1, %1 + pcmpgtd %1, %%s + pxor %%s, %1 + psubd %%s, %1 + SWAP %1, %%s +%endif +%endmacro + +%macro PSIGN 3-4 +%if cpuflag(ssse3) && %0 == 4 + psign%1 %2, %3, %4 +%elif cpuflag(ssse3) + psign%1 %2, %3 +%elif %0 == 4 + pxor %2, %3, %4 + psub%1 %2, %4 +%else + pxor %2, %3 + psub%1 %2, %3 +%endif +%endmacro + +%define PSIGNW PSIGN w, +%define PSIGND PSIGN d, + +%macro SPLATB_LOAD 3 +%if cpuflag(ssse3) + movd %1, [%2-3] + pshufb %1, %3 +%else + movd %1, [%2-3] ;to avoid crossing a cacheline + punpcklbw %1, %1 + SPLATW %1, %1, 3 +%endif +%endmacro + +%imacro SPLATW 2-3 0 +%if cpuflag(avx2) && %3 == 0 + vpbroadcastw %1, %2 +%else + PSHUFLW %1, %2, (%3)*q1111 +%if mmsize == 16 + punpcklqdq %1, %1 +%endif +%endif +%endmacro + +%imacro SPLATD 2-3 0 +%if mmsize == 16 + pshufd %1, %2, (%3)*q1111 +%else + pshufw %1, %2, (%3)*q0101 + ((%3)+1)*q1010 +%endif +%endmacro + +%macro CLIPW 3 ;(dst, min, max) + pmaxsw %1, %2 + pminsw %1, %3 +%endmacro + +%macro CLIPW2 4 ;(dst0, dst1, min, max) + pmaxsw %1, %3 + pmaxsw %2, %3 + pminsw %1, %4 + pminsw %2, %4 +%endmacro + +%macro HADDD 2 ; sum junk +%if sizeof%1 == 32 +%define %2 xmm%2 + vextracti128 %2, %1, 1 +%define %1 xmm%1 + paddd %1, %2 +%endif +%if mmsize >= 16 +%if cpuflag(xop) && sizeof%1 == 16 + vphadddq %1, %1 +%endif + movhlps %2, %1 + paddd %1, %2 +%endif +%if notcpuflag(xop) + PSHUFLW %2, %1, q0032 + paddd %1, %2 +%endif +%undef %1 +%undef %2 +%endmacro + +%macro HADDW 2 ; reg, tmp +%if cpuflag(xop) && sizeof%1 == 16 + vphaddwq %1, %1 + movhlps %2, %1 + paddd %1, %2 +%else + pmaddwd %1, [pw_1] + HADDD %1, %2 +%endif +%endmacro + +%macro HADDUWD 2 
+%if cpuflag(xop) && sizeof%1 == 16 + vphadduwd %1, %1 +%else + psrld %2, %1, 16 + pslld %1, 16 + psrld %1, 16 + paddd %1, %2 +%endif +%endmacro + +%macro HADDUW 2 +%if cpuflag(xop) && sizeof%1 == 16 + vphadduwq %1, %1 + movhlps %2, %1 + paddd %1, %2 +%else + HADDUWD %1, %2 + HADDD %1, %2 +%endif +%endmacro + +%macro PALIGNR 4-5 ; [dst,] src1, src2, imm, tmp +; AVX2 version uses a precalculated extra input that +; can be re-used across calls +%if sizeof%1==32 + ; %3 = abcdefgh ijklmnop (lower address) + ; %2 = ABCDEFGH IJKLMNOP (higher address) +; vperm2i128 %5, %2, %3, q0003 ; %5 = ijklmnop ABCDEFGH +%if %4 < 16 + palignr %1, %5, %3, %4 ; %1 = bcdefghi jklmnopA +%else + palignr %1, %2, %5, %4-16 ; %1 = pABCDEFG HIJKLMNO +%endif +%elif cpuflag(ssse3) + %if %0==5 + palignr %1, %2, %3, %4 + %else + palignr %1, %2, %3 + %endif +%else + %define %%dst %1 + %if %0==5 + %ifnidn %1, %2 + mova %%dst, %2 + %endif + %rotate 1 + %endif + %ifnidn %4, %2 + mova %4, %2 + %endif + %if mmsize==8 + psllq %%dst, (8-%3)*8 + psrlq %4, %3*8 + %else + pslldq %%dst, 16-%3 + psrldq %4, %3 + %endif + por %%dst, %4 +%endif +%endmacro + +%macro PSHUFLW 1+ + %if mmsize == 8 + pshufw %1 + %else + pshuflw %1 + %endif +%endmacro + +; shift a mmxreg by n bytes, or a xmmreg by 2*n bytes +; values shifted in are undefined +; faster if dst==src +%define PSLLPIX PSXLPIX l, -1, ;dst, src, shift +%define PSRLPIX PSXLPIX r, 1, ;dst, src, shift +%macro PSXLPIX 5 + %if mmsize == 8 + %if %5&1 + ps%1lq %3, %4, %5*8 + %else + pshufw %3, %4, (q3210<<8>>(8+%2*%5))&0xff + %endif + %else + ps%1ldq %3, %4, %5*2 + %endif +%endmacro + +%macro DEINTB 5 ; mask, reg1, mask, reg2, optional src to fill masks from +%ifnum %5 + pand m%3, m%5, m%4 ; src .. y6 .. y4 + pand m%1, m%5, m%2 ; dst .. y6 .. y4 +%else + mova m%1, %5 + pand m%3, m%1, m%4 ; src .. y6 .. y4 + pand m%1, m%1, m%2 ; dst .. y6 .. y4 +%endif + psrlw m%2, 8 ; dst .. y7 .. y5 + psrlw m%4, 8 ; src .. y7 .. 
y5 +%endmacro + +%macro SUMSUB_BA 3-4 +%if %0==3 + padd%1 m%2, m%3 + padd%1 m%3, m%3 + psub%1 m%3, m%2 +%elif avx_enabled + padd%1 m%4, m%2, m%3 + psub%1 m%3, m%2 + SWAP %2, %4 +%else + mova m%4, m%2 + padd%1 m%2, m%3 + psub%1 m%3, m%4 +%endif +%endmacro + +%macro SUMSUB_BADC 5-6 +%if %0==6 + SUMSUB_BA %1, %2, %3, %6 + SUMSUB_BA %1, %4, %5, %6 +%else + padd%1 m%2, m%3 + padd%1 m%4, m%5 + padd%1 m%3, m%3 + padd%1 m%5, m%5 + psub%1 m%3, m%2 + psub%1 m%5, m%4 +%endif +%endmacro + +%macro HADAMARD4_V 4+ + SUMSUB_BADC w, %1, %2, %3, %4 + SUMSUB_BADC w, %1, %3, %2, %4 +%endmacro + +%macro HADAMARD8_V 8+ + SUMSUB_BADC w, %1, %2, %3, %4 + SUMSUB_BADC w, %5, %6, %7, %8 + SUMSUB_BADC w, %1, %3, %2, %4 + SUMSUB_BADC w, %5, %7, %6, %8 + SUMSUB_BADC w, %1, %5, %2, %6 + SUMSUB_BADC w, %3, %7, %4, %8 +%endmacro + +%macro TRANS_SSE2 5-6 +; TRANSPOSE2x2 +; %1: transpose width (d/q) - use SBUTTERFLY qdq for dq +; %2: ord/unord (for compat with sse4, unused) +; %3/%4: source regs +; %5/%6: tmp regs +%ifidn %1, d +%define mask [mask_10] +%define shift 16 +%elifidn %1, q +%define mask [mask_1100] +%define shift 32 +%endif +%if %0==6 ; less dependency if we have two tmp + mova m%5, mask ; ff00 + mova m%6, m%4 ; x5x4 + psll%1 m%4, shift ; x4.. + pand m%6, m%5 ; x5.. + pandn m%5, m%3 ; ..x0 + psrl%1 m%3, shift ; ..x1 + por m%4, m%5 ; x4x0 + por m%3, m%6 ; x5x1 +%else ; more dependency, one insn less. sometimes faster, sometimes not + mova m%5, m%4 ; x5x4 + psll%1 m%4, shift ; x4.. + pxor m%4, m%3 ; (x4^x1)x0 + pand m%4, mask ; (x4^x1).. 
+ pxor m%3, m%4 ; x4x0 + psrl%1 m%4, shift ; ..(x1^x4) + pxor m%5, m%4 ; x5x1 + SWAP %4, %3, %5 +%endif +%endmacro + +%macro TRANS_SSE4 5-6 ; see above +%ifidn %1, d +%ifidn %2, ord + psrl%1 m%5, m%3, 16 + pblendw m%5, m%4, q2222 + psll%1 m%4, 16 + pblendw m%4, m%3, q1111 + SWAP %3, %5 +%else +%if avx_enabled + pblendw m%5, m%3, m%4, q2222 + SWAP %3, %5 +%else + mova m%5, m%3 + pblendw m%3, m%4, q2222 +%endif + psll%1 m%4, 16 + psrl%1 m%5, 16 + por m%4, m%5 +%endif +%elifidn %1, q + shufps m%5, m%3, m%4, q3131 + shufps m%3, m%3, m%4, q2020 + SWAP %4, %5 +%endif +%endmacro + +%macro TRANS_XOP 5-6 +%ifidn %1, d + vpperm m%5, m%3, m%4, [transd_shuf1] + vpperm m%3, m%3, m%4, [transd_shuf2] +%elifidn %1, q + shufps m%5, m%3, m%4, q3131 + shufps m%3, m%4, q2020 +%endif + SWAP %4, %5 +%endmacro + +%macro HADAMARD 5-6 +; %1=distance in words (0 for vertical pass, 1/2/4 for horizontal passes) +; %2=sumsub/max/amax (sum and diff / maximum / maximum of absolutes) +; %3/%4: regs +; %5(%6): tmpregs +%if %1!=0 ; have to reorder stuff for horizontal op + %ifidn %2, sumsub + %define ORDER ord + ; sumsub needs order because a-b != b-a unless a=b + %else + %define ORDER unord + ; if we just max, order doesn't matter (allows pblendw+or in sse4) + %endif + %if %1==1 + TRANS d, ORDER, %3, %4, %5, %6 + %elif %1==2 + %if mmsize==8 + SBUTTERFLY dq, %3, %4, %5 + %else + TRANS q, ORDER, %3, %4, %5, %6 + %endif + %elif %1==4 + SBUTTERFLY qdq, %3, %4, %5 + %elif %1==8 + SBUTTERFLY dqqq, %3, %4, %5 + %endif +%endif +%ifidn %2, sumsub + SUMSUB_BA w, %3, %4, %5 +%else + %ifidn %2, amax + %if %0==6 + ABSW2 m%3, m%4, m%3, m%4, m%5, m%6 + %else + ABSW m%3, m%3, m%5 + ABSW m%4, m%4, m%5 + %endif + %endif + pmaxsw m%3, m%4 +%endif +%endmacro + + +%macro HADAMARD2_2D 6-7 sumsub + HADAMARD 0, sumsub, %1, %2, %5 + HADAMARD 0, sumsub, %3, %4, %5 + SBUTTERFLY %6, %1, %2, %5 +%ifnum %7 + HADAMARD 0, amax, %1, %2, %5, %7 +%else + HADAMARD 0, %7, %1, %2, %5 +%endif + SBUTTERFLY %6, %3, %4, %5 +%ifnum %7 + 
HADAMARD 0, amax, %3, %4, %5, %7 +%else + HADAMARD 0, %7, %3, %4, %5 +%endif +%endmacro + +%macro HADAMARD4_2D 5-6 sumsub + HADAMARD2_2D %1, %2, %3, %4, %5, wd + HADAMARD2_2D %1, %3, %2, %4, %5, dq, %6 + SWAP %2, %3 +%endmacro + +%macro HADAMARD4_2D_SSE 5-6 sumsub + HADAMARD 0, sumsub, %1, %2, %5 ; 1st V row 0 + 1 + HADAMARD 0, sumsub, %3, %4, %5 ; 1st V row 2 + 3 + SBUTTERFLY wd, %1, %2, %5 ; %1: m0 1+0 %2: m1 1+0 + SBUTTERFLY wd, %3, %4, %5 ; %3: m0 3+2 %4: m1 3+2 + HADAMARD2_2D %1, %3, %2, %4, %5, dq + SBUTTERFLY qdq, %1, %2, %5 + HADAMARD 0, %6, %1, %2, %5 ; 2nd H m1/m0 row 0+1 + SBUTTERFLY qdq, %3, %4, %5 + HADAMARD 0, %6, %3, %4, %5 ; 2nd H m1/m0 row 2+3 +%endmacro + +%macro HADAMARD8_2D 9-10 sumsub + HADAMARD2_2D %1, %2, %3, %4, %9, wd + HADAMARD2_2D %5, %6, %7, %8, %9, wd + HADAMARD2_2D %1, %3, %2, %4, %9, dq + HADAMARD2_2D %5, %7, %6, %8, %9, dq + HADAMARD2_2D %1, %5, %3, %7, %9, qdq, %10 + HADAMARD2_2D %2, %6, %4, %8, %9, qdq, %10 +%ifnidn %10, amax + SWAP %2, %5 + SWAP %4, %7 +%endif +%endmacro + +; doesn't include the "pmaddubsw hmul_8p" pass +%macro HADAMARD8_2D_HMUL 10 + HADAMARD4_V %1, %2, %3, %4, %9 + HADAMARD4_V %5, %6, %7, %8, %9 + SUMSUB_BADC w, %1, %5, %2, %6, %9 + HADAMARD 2, sumsub, %1, %5, %9, %10 + HADAMARD 2, sumsub, %2, %6, %9, %10 + SUMSUB_BADC w, %3, %7, %4, %8, %9 + HADAMARD 2, sumsub, %3, %7, %9, %10 + HADAMARD 2, sumsub, %4, %8, %9, %10 + HADAMARD 1, amax, %1, %5, %9, %10 + HADAMARD 1, amax, %2, %6, %9, %5 + HADAMARD 1, amax, %3, %7, %9, %5 + HADAMARD 1, amax, %4, %8, %9, %5 +%endmacro + +%macro SUMSUB2_AB 4 +%if cpuflag(xop) + pmacs%1%1 m%4, m%3, [p%1_m2], m%2 + pmacs%1%1 m%2, m%2, [p%1_2], m%3 +%elifnum %3 + psub%1 m%4, m%2, m%3 + psub%1 m%4, m%3 + padd%1 m%2, m%2 + padd%1 m%2, m%3 +%else + mova m%4, m%2 + padd%1 m%2, m%2 + padd%1 m%2, %3 + psub%1 m%4, %3 + psub%1 m%4, %3 +%endif +%endmacro + +%macro SUMSUBD2_AB 5 +%ifnum %4 + psra%1 m%5, m%2, 1 ; %3: %3>>1 + psra%1 m%4, m%3, 1 ; %2: %2>>1 + padd%1 m%4, m%2 ; %3: %3>>1+%2 + psub%1 
m%5, m%3 ; %2: %2>>1-%3 + SWAP %2, %5 + SWAP %3, %4 +%else + mova %5, m%2 + mova %4, m%3 + psra%1 m%3, 1 ; %3: %3>>1 + psra%1 m%2, 1 ; %2: %2>>1 + padd%1 m%3, %5 ; %3: %3>>1+%2 + psub%1 m%2, %4 ; %2: %2>>1-%3 +%endif +%endmacro + +%macro DCT4_1D 5 +%ifnum %5 + SUMSUB_BADC w, %4, %1, %3, %2, %5 + SUMSUB_BA w, %3, %4, %5 + SUMSUB2_AB w, %1, %2, %5 + SWAP %1, %3, %4, %5, %2 +%else + SUMSUB_BADC w, %4, %1, %3, %2 + SUMSUB_BA w, %3, %4 + mova [%5], m%2 + SUMSUB2_AB w, %1, [%5], %2 + SWAP %1, %3, %4, %2 +%endif +%endmacro + +%macro IDCT4_1D 6-7 +%ifnum %6 + SUMSUBD2_AB %1, %3, %5, %7, %6 + ; %3: %3>>1-%5 %5: %3+%5>>1 + SUMSUB_BA %1, %4, %2, %7 + ; %4: %2+%4 %2: %2-%4 + SUMSUB_BADC %1, %5, %4, %3, %2, %7 + ; %5: %2+%4 + (%3+%5>>1) + ; %4: %2+%4 - (%3+%5>>1) + ; %3: %2-%4 + (%3>>1-%5) + ; %2: %2-%4 - (%3>>1-%5) +%else +%ifidn %1, w + SUMSUBD2_AB %1, %3, %5, [%6], [%6+16] +%else + SUMSUBD2_AB %1, %3, %5, [%6], [%6+32] +%endif + SUMSUB_BA %1, %4, %2 + SUMSUB_BADC %1, %5, %4, %3, %2 +%endif + SWAP %2, %5, %4 + ; %2: %2+%4 + (%3+%5>>1) row0 + ; %3: %2-%4 + (%3>>1-%5) row1 + ; %4: %2-%4 - (%3>>1-%5) row2 + ; %5: %2+%4 - (%3+%5>>1) row3 +%endmacro + + +%macro LOAD_DIFF 5-6 1 +%if HIGH_BIT_DEPTH +%if %6 ; %5 aligned? 
+ mova %1, %4 + psubw %1, %5 +%else + movu %1, %4 + movu %2, %5 + psubw %1, %2 +%endif +%else ; !HIGH_BIT_DEPTH +%ifidn %3, none + movh %1, %4 + movh %2, %5 + punpcklbw %1, %2 + punpcklbw %2, %2 + psubw %1, %2 +%else + movh %1, %4 + punpcklbw %1, %3 + movh %2, %5 + punpcklbw %2, %3 + psubw %1, %2 +%endif +%endif ; HIGH_BIT_DEPTH +%endmacro + +%macro LOAD_DIFF8x4 8 ; 4x dst, 1x tmp, 1x mul, 2x ptr +%if BIT_DEPTH == 8 && cpuflag(ssse3) + movh m%2, [%8+%1*FDEC_STRIDE] + movh m%1, [%7+%1*FENC_STRIDE] + punpcklbw m%1, m%2 + movh m%3, [%8+%2*FDEC_STRIDE] + movh m%2, [%7+%2*FENC_STRIDE] + punpcklbw m%2, m%3 + movh m%4, [%8+%3*FDEC_STRIDE] + movh m%3, [%7+%3*FENC_STRIDE] + punpcklbw m%3, m%4 + movh m%5, [%8+%4*FDEC_STRIDE] + movh m%4, [%7+%4*FENC_STRIDE] + punpcklbw m%4, m%5 + pmaddubsw m%1, m%6 + pmaddubsw m%2, m%6 + pmaddubsw m%3, m%6 + pmaddubsw m%4, m%6 +%else + LOAD_DIFF m%1, m%5, m%6, [%7+%1*FENC_STRIDEB], [%8+%1*FDEC_STRIDEB] + LOAD_DIFF m%2, m%5, m%6, [%7+%2*FENC_STRIDEB], [%8+%2*FDEC_STRIDEB] + LOAD_DIFF m%3, m%5, m%6, [%7+%3*FENC_STRIDEB], [%8+%3*FDEC_STRIDEB] + LOAD_DIFF m%4, m%5, m%6, [%7+%4*FENC_STRIDEB], [%8+%4*FDEC_STRIDEB] +%endif +%endmacro + +%macro STORE_DCT 6 + movq [%5+%6+ 0], m%1 + movq [%5+%6+ 8], m%2 + movq [%5+%6+16], m%3 + movq [%5+%6+24], m%4 + movhps [%5+%6+32], m%1 + movhps [%5+%6+40], m%2 + movhps [%5+%6+48], m%3 + movhps [%5+%6+56], m%4 +%endmacro + +%macro STORE_IDCT 4 + movhps [r0-4*FDEC_STRIDE], %1 + movh [r0-3*FDEC_STRIDE], %1 + movhps [r0-2*FDEC_STRIDE], %2 + movh [r0-1*FDEC_STRIDE], %2 + movhps [r0+0*FDEC_STRIDE], %3 + movh [r0+1*FDEC_STRIDE], %3 + movhps [r0+2*FDEC_STRIDE], %4 + movh [r0+3*FDEC_STRIDE], %4 +%endmacro + +%macro LOAD_DIFF_8x4P 7-11 r0,r2,0,1 ; 4x dest, 2x temp, 2x pointer, increment, aligned? 
+ LOAD_DIFF m%1, m%5, m%7, [%8], [%9], %11 + LOAD_DIFF m%2, m%6, m%7, [%8+r1], [%9+r3], %11 + LOAD_DIFF m%3, m%5, m%7, [%8+2*r1], [%9+2*r3], %11 + LOAD_DIFF m%4, m%6, m%7, [%8+r4], [%9+r5], %11 +%if %10 + lea %8, [%8+4*r1] + lea %9, [%9+4*r3] +%endif +%endmacro + +; 2xdst, 2xtmp, 2xsrcrow +%macro LOAD_DIFF16x2_AVX2 6 + pmovzxbw m%1, [r1+%5*FENC_STRIDE] + pmovzxbw m%2, [r1+%6*FENC_STRIDE] + pmovzxbw m%3, [r2+(%5-4)*FDEC_STRIDE] + pmovzxbw m%4, [r2+(%6-4)*FDEC_STRIDE] + psubw m%1, m%3 + psubw m%2, m%4 +%endmacro + +%macro DIFFx2 6-7 + movh %3, %5 + punpcklbw %3, %4 + psraw %1, 6 + paddsw %1, %3 + movh %3, %6 + punpcklbw %3, %4 + psraw %2, 6 + paddsw %2, %3 + packuswb %2, %1 +%endmacro + +; (high depth) in: %1, %2, min to clip, max to clip, mem128 +; in: %1, tmp, %3, mem64 +%macro STORE_DIFF 4-5 +%if HIGH_BIT_DEPTH + psrad %1, 6 + psrad %2, 6 + packssdw %1, %2 + paddw %1, %5 + CLIPW %1, %3, %4 + mova %5, %1 +%else + movh %2, %4 + punpcklbw %2, %3 + psraw %1, 6 + paddsw %1, %2 + packuswb %1, %1 + movh %4, %1 +%endif +%endmacro + +%macro SHUFFLE_MASK_W 8 + %rep 8 + %if %1>=0x80 + db %1, %1 + %else + db %1*2 + db %1*2+1 + %endif + %rotate 1 + %endrep +%endmacro + +; instruction, accum, input, iteration (zero to swap, nonzero to add) +%macro ACCUM 4 +%if %4 + %1 m%2, m%3 +%else + SWAP %2, %3 +%endif +%endmacro + +; IACA support +%macro IACA_START 0 + mov ebx, 111 + db 0x64, 0x67, 0x90 +%endmacro + +%macro IACA_END 0 + mov ebx, 222 + db 0x64, 0x67, 0x90 +%endmacro diff --git a/source/common/yuv.cpp b/source/common/yuv.cpp new file mode 100644 index 0000000..fffc215 --- /dev/null +++ b/source/common/yuv.cpp @@ -0,0 +1,184 @@ +/***************************************************************************** + * Copyright (C) 2014 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the 
License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + + +#include "common.h" +#include "yuv.h" +#include "shortyuv.h" +#include "picyuv.h" +#include "primitives.h" + +using namespace x265; + +Yuv::Yuv() +{ + m_buf[0] = NULL; + m_buf[1] = NULL; + m_buf[2] = NULL; +} + +bool Yuv::create(uint32_t size, int csp) +{ + m_csp = csp; + m_hChromaShift = CHROMA_H_SHIFT(csp); + m_vChromaShift = CHROMA_V_SHIFT(csp); + + // set width and height + m_size = size; + m_csize = size >> m_hChromaShift; + m_part = partitionFromSizes(size, size); + + size_t sizeL = size * size; + size_t sizeC = sizeL >> (m_vChromaShift + m_hChromaShift); + + X265_CHECK((sizeC & 15) == 0, "invalid size"); + + // memory allocation (padded for SIMD reads) + CHECKED_MALLOC(m_buf[0], pixel, sizeL + sizeC * 2 + 8); + m_buf[1] = m_buf[0] + sizeL; + m_buf[2] = m_buf[0] + sizeL + sizeC; + return true; + +fail: + return false; +} + +void Yuv::destroy() +{ + X265_FREE(m_buf[0]); +} + +void Yuv::copyToPicYuv(PicYuv& dstPic, uint32_t cuAddr, uint32_t absPartIdx) const +{ + pixel* dstY = dstPic.getLumaAddr(cuAddr, absPartIdx); + + primitives.luma_copy_pp[m_part](dstY, dstPic.m_stride, m_buf[0], m_size); + + pixel* dstU = dstPic.getCbAddr(cuAddr, absPartIdx); + pixel* dstV = dstPic.getCrAddr(cuAddr, absPartIdx); + 
primitives.chroma[m_csp].copy_pp[m_part](dstU, dstPic.m_strideC, m_buf[1], m_csize); + primitives.chroma[m_csp].copy_pp[m_part](dstV, dstPic.m_strideC, m_buf[2], m_csize); +} + +void Yuv::copyFromPicYuv(const PicYuv& srcPic, uint32_t cuAddr, uint32_t absPartIdx) +{ + /* We cheat with const_cast internally because the get methods are not capable of + * returning const buffers and the primitives are not const aware, but we know + * this function does not modify srcPic */ + PicYuv& srcPicSafe = const_cast(srcPic); + pixel* srcY = srcPicSafe.getLumaAddr(cuAddr, absPartIdx); + + primitives.luma_copy_pp[m_part](m_buf[0], m_size, srcY, srcPic.m_stride); + + pixel* srcU = srcPicSafe.getCbAddr(cuAddr, absPartIdx); + pixel* srcV = srcPicSafe.getCrAddr(cuAddr, absPartIdx); + primitives.chroma[m_csp].copy_pp[m_part](m_buf[1], m_csize, srcU, srcPicSafe.m_strideC); + primitives.chroma[m_csp].copy_pp[m_part](m_buf[2], m_csize, srcV, srcPicSafe.m_strideC); +} + +void Yuv::copyFromYuv(const Yuv& srcYuv) +{ + X265_CHECK(m_size <= srcYuv.m_size, "invalid size\n"); + + primitives.luma_copy_pp[m_part](m_buf[0], m_size, srcYuv.m_buf[0], srcYuv.m_size); + primitives.chroma[m_csp].copy_pp[m_part](m_buf[1], m_csize, srcYuv.m_buf[1], srcYuv.m_csize); + primitives.chroma[m_csp].copy_pp[m_part](m_buf[2], m_csize, srcYuv.m_buf[2], srcYuv.m_csize); +} + +void Yuv::copyToPartYuv(Yuv& dstYuv, uint32_t absPartIdx) const +{ + pixel* dstY = dstYuv.getLumaAddr(absPartIdx); + primitives.luma_copy_pp[m_part](dstY, dstYuv.m_size, m_buf[0], m_size); + + pixel* dstU = dstYuv.getCbAddr(absPartIdx); + pixel* dstV = dstYuv.getCrAddr(absPartIdx); + primitives.chroma[m_csp].copy_pp[m_part](dstU, dstYuv.m_csize, m_buf[1], m_csize); + primitives.chroma[m_csp].copy_pp[m_part](dstV, dstYuv.m_csize, m_buf[2], m_csize); +} + +void Yuv::copyPartToYuv(Yuv& dstYuv, uint32_t absPartIdx) const +{ + pixel* srcY = m_buf[0] + getAddrOffset(absPartIdx, m_size); + pixel* dstY = dstYuv.m_buf[0]; + + 
primitives.luma_copy_pp[dstYuv.m_part](dstY, dstYuv.m_size, srcY, m_size); + + pixel* srcU = m_buf[1] + getChromaAddrOffset(absPartIdx); + pixel* srcV = m_buf[2] + getChromaAddrOffset(absPartIdx); + pixel* dstU = dstYuv.m_buf[1]; + pixel* dstV = dstYuv.m_buf[2]; + primitives.chroma[m_csp].copy_pp[dstYuv.m_part](dstU, dstYuv.m_csize, srcU, m_csize); + primitives.chroma[m_csp].copy_pp[dstYuv.m_part](dstV, dstYuv.m_csize, srcV, m_csize); +} + +void Yuv::addClip(const Yuv& srcYuv0, const ShortYuv& srcYuv1, uint32_t log2SizeL) +{ + primitives.luma_add_ps[log2SizeL - 2](m_buf[0], m_size, srcYuv0.m_buf[0], srcYuv1.m_buf[0], srcYuv0.m_size, srcYuv1.m_size); + primitives.chroma[m_csp].add_ps[log2SizeL - 2](m_buf[1], m_csize, srcYuv0.m_buf[1], srcYuv1.m_buf[1], srcYuv0.m_csize, srcYuv1.m_csize); + primitives.chroma[m_csp].add_ps[log2SizeL - 2](m_buf[2], m_csize, srcYuv0.m_buf[2], srcYuv1.m_buf[2], srcYuv0.m_csize, srcYuv1.m_csize); +} + +void Yuv::addAvg(const ShortYuv& srcYuv0, const ShortYuv& srcYuv1, uint32_t absPartIdx, uint32_t width, uint32_t height, bool bLuma, bool bChroma) +{ + int part = partitionFromSizes(width, height); + + if (bLuma) + { + int16_t* srcY0 = const_cast(srcYuv0).getLumaAddr(absPartIdx); + int16_t* srcY1 = const_cast(srcYuv1).getLumaAddr(absPartIdx); + pixel* dstY = getLumaAddr(absPartIdx); + + primitives.luma_addAvg[part](srcY0, srcY1, dstY, srcYuv0.m_size, srcYuv1.m_size, m_size); + } + if (bChroma) + { + int16_t* srcU0 = const_cast(srcYuv0).getCbAddr(absPartIdx); + int16_t* srcV0 = const_cast(srcYuv0).getCrAddr(absPartIdx); + int16_t* srcU1 = const_cast(srcYuv1).getCbAddr(absPartIdx); + int16_t* srcV1 = const_cast(srcYuv1).getCrAddr(absPartIdx); + pixel* dstU = getCbAddr(absPartIdx); + pixel* dstV = getCrAddr(absPartIdx); + + primitives.chroma[m_csp].addAvg[part](srcU0, srcU1, dstU, srcYuv0.m_csize, srcYuv1.m_csize, m_csize); + primitives.chroma[m_csp].addAvg[part](srcV0, srcV1, dstV, srcYuv0.m_csize, srcYuv1.m_csize, m_csize); + } +} + +void 
Yuv::copyPartToPartLuma(Yuv& dstYuv, uint32_t absPartIdx, uint32_t log2Size) const +{ + const pixel* src = getLumaAddr(absPartIdx); + pixel* dst = dstYuv.getLumaAddr(absPartIdx); + primitives.square_copy_pp[log2Size - 2](dst, dstYuv.m_size, const_cast(src), m_size); +} + +void Yuv::copyPartToPartChroma(Yuv& dstYuv, uint32_t absPartIdx, uint32_t log2SizeL) const +{ + int part = partitionFromLog2Size(log2SizeL); + const pixel* srcU = getCbAddr(absPartIdx); + const pixel* srcV = getCrAddr(absPartIdx); + pixel* dstU = dstYuv.getCbAddr(absPartIdx); + pixel* dstV = dstYuv.getCrAddr(absPartIdx); + + primitives.chroma[m_csp].copy_pp[part](dstU, dstYuv.m_csize, const_cast(srcU), m_csize); + primitives.chroma[m_csp].copy_pp[part](dstV, dstYuv.m_csize, const_cast(srcV), m_csize); +} diff --git a/source/common/yuv.h b/source/common/yuv.h new file mode 100644 index 0000000..a02987c --- /dev/null +++ b/source/common/yuv.h @@ -0,0 +1,109 @@ +/***************************************************************************** + * Copyright (C) 2014 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. 
+ *****************************************************************************/ + +#ifndef X265_YUV_H +#define X265_YUV_H + +#include "common.h" +#include "primitives.h" + +namespace x265 { +// private namespace + +class ShortYuv; +class PicYuv; + +/* A Yuv instance holds pixels for a square CU (64x64 down to 8x8) for all three planes + * these are typically used to hold fenc, predictions, or reconstructed blocks */ +class Yuv +{ +public: + + pixel* m_buf[3]; + + uint32_t m_size; + uint32_t m_csize; + int m_part; // cached partition enum size + + int m_csp; + int m_hChromaShift; + int m_vChromaShift; + + Yuv(); + + bool create(uint32_t size, int csp); + void destroy(); + + // Copy YUV buffer to picture buffer + void copyToPicYuv(PicYuv& destPicYuv, uint32_t cuAddr, uint32_t absPartIdx) const; + + // Copy YUV buffer from picture buffer + void copyFromPicYuv(const PicYuv& srcPicYuv, uint32_t cuAddr, uint32_t absPartIdx); + + // Copy from same size YUV buffer + void copyFromYuv(const Yuv& srcYuv); + + // Copy Small YUV buffer to the part of other Big YUV buffer + void copyToPartYuv(Yuv& dstYuv, uint32_t absPartIdx) const; + + // Copy the part of Big YUV buffer to other Small YUV buffer + void copyPartToYuv(Yuv& dstYuv, uint32_t absPartIdx) const; + + // Clip(srcYuv0 + srcYuv1) -> m_buf .. 
aka recon = clip(pred + residual) + void addClip(const Yuv& srcYuv0, const ShortYuv& srcYuv1, uint32_t log2SizeL); + + // (srcYuv0 + srcYuv1)/2 for YUV partition (bidir averaging) + void addAvg(const ShortYuv& srcYuv0, const ShortYuv& srcYuv1, uint32_t absPartIdx, uint32_t width, uint32_t height, bool bLuma, bool bChroma); + + void copyPartToPartLuma(Yuv& dstYuv, uint32_t absPartIdx, uint32_t log2Size) const; + void copyPartToPartChroma(Yuv& dstYuv, uint32_t absPartIdx, uint32_t log2SizeL) const; + + pixel* getLumaAddr(uint32_t absPartIdx) { return m_buf[0] + getAddrOffset(absPartIdx, m_size); } + pixel* getCbAddr(uint32_t absPartIdx) { return m_buf[1] + getChromaAddrOffset(absPartIdx); } + pixel* getCrAddr(uint32_t absPartIdx) { return m_buf[2] + getChromaAddrOffset(absPartIdx); } + pixel* getChromaAddr(uint32_t chromaId, uint32_t absPartIdx) { return m_buf[chromaId] + getChromaAddrOffset(absPartIdx); } + + const pixel* getLumaAddr(uint32_t absPartIdx) const { return m_buf[0] + getAddrOffset(absPartIdx, m_size); } + const pixel* getCbAddr(uint32_t absPartIdx) const { return m_buf[1] + getChromaAddrOffset(absPartIdx); } + const pixel* getCrAddr(uint32_t absPartIdx) const { return m_buf[2] + getChromaAddrOffset(absPartIdx); } + const pixel* getChromaAddr(uint32_t chromaId, uint32_t absPartIdx) const { return m_buf[chromaId] + getChromaAddrOffset(absPartIdx); } + + int getChromaAddrOffset(uint32_t absPartIdx) const + { + int blkX = g_zscanToPelX[absPartIdx] >> m_hChromaShift; + int blkY = g_zscanToPelY[absPartIdx] >> m_vChromaShift; + + return blkX + blkY * m_csize; + } + + static int getAddrOffset(uint32_t absPartIdx, uint32_t width) + { + int blkX = g_zscanToPelX[absPartIdx]; + int blkY = g_zscanToPelY[absPartIdx]; + + return blkX + blkY * width; + } +}; +} + +#endif diff --git a/source/compat/getopt/LGPL.txt b/source/compat/getopt/LGPL.txt new file mode 100644 index 0000000..602bfc9 --- /dev/null +++ b/source/compat/getopt/LGPL.txt @@ -0,0 +1,504 @@ + GNU LESSER 
GENERAL PUBLIC LICENSE + Version 2.1, February 1999 + + Copyright (C) 1991, 1999 Free Software Foundation, Inc. + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + +[This is the first released version of the Lesser GPL. It also counts + as the successor of the GNU Library Public License, version 2, hence + the version number 2.1.] + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +Licenses are intended to guarantee your freedom to share and change +free software--to make sure the software is free for all its users. + + This license, the Lesser General Public License, applies to some +specially designated software packages--typically libraries--of the +Free Software Foundation and other authors who decide to use it. You +can use it too, but we suggest you first think carefully about whether +this license or the ordinary General Public License is the better +strategy to use in any particular case, based on the explanations below. + + When we speak of free software, we are referring to freedom of use, +not price. Our General Public Licenses are designed to make sure that +you have the freedom to distribute copies of free software (and charge +for this service if you wish); that you receive source code or can get +it if you want it; that you can change the software and use pieces of +it in new free programs; and that you are informed that you can do +these things. + + To protect your rights, we need to make restrictions that forbid +distributors to deny you these rights or to ask you to surrender these +rights. These restrictions translate to certain responsibilities for +you if you distribute copies of the library or if you modify it. 
+ + For example, if you distribute copies of the library, whether gratis +or for a fee, you must give the recipients all the rights that we gave +you. You must make sure that they, too, receive or can get the source +code. If you link other code with the library, you must provide +complete object files to the recipients, so that they can relink them +with the library after making changes to the library and recompiling +it. And you must show them these terms so they know their rights. + + We protect your rights with a two-step method: (1) we copyright the +library, and (2) we offer you this license, which gives you legal +permission to copy, distribute and/or modify the library. + + To protect each distributor, we want to make it very clear that +there is no warranty for the free library. Also, if the library is +modified by someone else and passed on, the recipients should know +that what they have is not the original version, so that the original +author's reputation will not be affected by problems that might be +introduced by others. + + Finally, software patents pose a constant threat to the existence of +any free program. We wish to make sure that a company cannot +effectively restrict the users of a free program by obtaining a +restrictive license from a patent holder. Therefore, we insist that +any patent license obtained for a version of the library must be +consistent with the full freedom of use specified in this license. + + Most GNU software, including some libraries, is covered by the +ordinary GNU General Public License. This license, the GNU Lesser +General Public License, applies to certain designated libraries, and +is quite different from the ordinary General Public License. We use +this license for certain libraries in order to permit linking those +libraries into non-free programs. 
+ + When a program is linked with a library, whether statically or using +a shared library, the combination of the two is legally speaking a +combined work, a derivative of the original library. The ordinary +General Public License therefore permits such linking only if the +entire combination fits its criteria of freedom. The Lesser General +Public License permits more lax criteria for linking other code with +the library. + + We call this license the "Lesser" General Public License because it +does Less to protect the user's freedom than the ordinary General +Public License. It also provides other free software developers Less +of an advantage over competing non-free programs. These disadvantages +are the reason we use the ordinary General Public License for many +libraries. However, the Lesser license provides advantages in certain +special circumstances. + + For example, on rare occasions, there may be a special need to +encourage the widest possible use of a certain library, so that it becomes +a de-facto standard. To achieve this, non-free programs must be +allowed to use the library. A more frequent case is that a free +library does the same job as widely used non-free libraries. In this +case, there is little to gain by limiting the free library to free +software only, so we use the Lesser General Public License. + + In other cases, permission to use a particular library in non-free +programs enables a greater number of people to use a large body of +free software. For example, permission to use the GNU C Library in +non-free programs enables many more people to use the whole GNU +operating system, as well as its variant, the GNU/Linux operating +system. + + Although the Lesser General Public License is Less protective of the +users' freedom, it does ensure that the user of a program that is +linked with the Library has the freedom and the wherewithal to run +that program using a modified version of the Library. 
+ + The precise terms and conditions for copying, distribution and +modification follow. Pay close attention to the difference between a +"work based on the library" and a "work that uses the library". The +former contains code derived from the library, whereas the latter must +be combined with the library in order to run. + + GNU LESSER GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License Agreement applies to any software library or other +program which contains a notice placed by the copyright holder or +other authorized party saying it may be distributed under the terms of +this Lesser General Public License (also called "this License"). +Each licensee is addressed as "you". + + A "library" means a collection of software functions and/or data +prepared so as to be conveniently linked with application programs +(which use some of those functions and data) to form executables. + + The "Library", below, refers to any such software library or work +which has been distributed under these terms. A "work based on the +Library" means either the Library or any derivative work under +copyright law: that is to say, a work containing the Library or a +portion of it, either verbatim or with modifications and/or translated +straightforwardly into another language. (Hereinafter, translation is +included without limitation in the term "modification".) + + "Source code" for a work means the preferred form of the work for +making modifications to it. For a library, complete source code means +all the source code for all modules it contains, plus any associated +interface definition files, plus the scripts used to control compilation +and installation of the library. + + Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. 
The act of +running a program using the Library is not restricted, and output from +such a program is covered only if its contents constitute a work based +on the Library (independent of the use of the Library in a tool for +writing it). Whether that is true depends on what the Library does +and what the program that uses the Library does. + + 1. You may copy and distribute verbatim copies of the Library's +complete source code as you receive it, in any medium, provided that +you conspicuously and appropriately publish on each copy an +appropriate copyright notice and disclaimer of warranty; keep intact +all the notices that refer to this License and to the absence of any +warranty; and distribute a copy of this License along with the +Library. + + You may charge a fee for the physical act of transferring a copy, +and you may at your option offer warranty protection in exchange for a +fee. + + 2. You may modify your copy or copies of the Library or any portion +of it, thus forming a work based on the Library, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) The modified work must itself be a software library. + + b) You must cause the files modified to carry prominent notices + stating that you changed the files and the date of any change. + + c) You must cause the whole of the work to be licensed at no + charge to all third parties under the terms of this License. + + d) If a facility in the modified Library refers to a function or a + table of data to be supplied by an application program that uses + the facility, other than as an argument passed when the facility + is invoked, then you must make a good faith effort to ensure that, + in the event an application does not supply such function or + table, the facility still operates, and performs whatever part of + its purpose remains meaningful. 
+ + (For example, a function in a library to compute square roots has + a purpose that is entirely well-defined independent of the + application. Therefore, Subsection 2d requires that any + application-supplied function or table used by this function must + be optional: if the application does not supply it, the square + root function must still compute square roots.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Library, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Library, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote +it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Library. + +In addition, mere aggregation of another work not based on the Library +with the Library (or with a work based on the Library) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may opt to apply the terms of the ordinary GNU General Public +License instead of this License to a given copy of the Library. To do +this, you must alter all the notices that refer to this License, so +that they refer to the ordinary GNU General Public License, version 2, +instead of to this License. (If a newer version than version 2 of the +ordinary GNU General Public License has appeared, then you can specify +that version instead if you wish.) Do not make any other change in +these notices. 
+ + Once this change is made in a given copy, it is irreversible for +that copy, so the ordinary GNU General Public License applies to all +subsequent copies and derivative works made from that copy. + + This option is useful when you wish to copy part of the code of +the Library into a program that is not a library. + + 4. You may copy and distribute the Library (or a portion or +derivative of it, under Section 2) in object code or executable form +under the terms of Sections 1 and 2 above provided that you accompany +it with the complete corresponding machine-readable source code, which +must be distributed under the terms of Sections 1 and 2 above on a +medium customarily used for software interchange. + + If distribution of object code is made by offering access to copy +from a designated place, then offering equivalent access to copy the +source code from the same place satisfies the requirement to +distribute the source code, even though third parties are not +compelled to copy the source along with the object code. + + 5. A program that contains no derivative of any portion of the +Library, but is designed to work with the Library by being compiled or +linked with it, is called a "work that uses the Library". Such a +work, in isolation, is not a derivative work of the Library, and +therefore falls outside the scope of this License. + + However, linking a "work that uses the Library" with the Library +creates an executable that is a derivative of the Library (because it +contains portions of the Library), rather than a "work that uses the +library". The executable is therefore covered by this License. +Section 6 states terms for distribution of such executables. + + When a "work that uses the Library" uses material from a header file +that is part of the Library, the object code for the work may be a +derivative work of the Library even though the source code is not. 
+Whether this is true is especially significant if the work can be +linked without the Library, or if the work is itself a library. The +threshold for this to be true is not precisely defined by law. + + If such an object file uses only numerical parameters, data +structure layouts and accessors, and small macros and small inline +functions (ten lines or less in length), then the use of the object +file is unrestricted, regardless of whether it is legally a derivative +work. (Executables containing this object code plus portions of the +Library will still fall under Section 6.) + + Otherwise, if the work is a derivative of the Library, you may +distribute the object code for the work under the terms of Section 6. +Any executables containing that work also fall under Section 6, +whether or not they are linked directly with the Library itself. + + 6. As an exception to the Sections above, you may also combine or +link a "work that uses the Library" with the Library to produce a +work containing portions of the Library, and distribute that work +under terms of your choice, provided that the terms permit +modification of the work for the customer's own use and reverse +engineering for debugging such modifications. + + You must give prominent notice with each copy of the work that the +Library is used in it and that the Library and its use are covered by +this License. You must supply a copy of this License. If the work +during execution displays copyright notices, you must include the +copyright notice for the Library among them, as well as a reference +directing the user to the copy of this License. 
Also, you must do one +of these things: + + a) Accompany the work with the complete corresponding + machine-readable source code for the Library including whatever + changes were used in the work (which must be distributed under + Sections 1 and 2 above); and, if the work is an executable linked + with the Library, with the complete machine-readable "work that + uses the Library", as object code and/or source code, so that the + user can modify the Library and then relink to produce a modified + executable containing the modified Library. (It is understood + that the user who changes the contents of definitions files in the + Library will not necessarily be able to recompile the application + to use the modified definitions.) + + b) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (1) uses at run time a + copy of the library already present on the user's computer system, + rather than copying library functions into the executable, and (2) + will operate properly with a modified version of the library, if + the user installs one, as long as the modified version is + interface-compatible with the version that the work was made with. + + c) Accompany the work with a written offer, valid for at + least three years, to give the same user the materials + specified in Subsection 6a, above, for a charge no more + than the cost of performing this distribution. + + d) If distribution of the work is made by offering access to copy + from a designated place, offer equivalent access to copy the above + specified materials from the same place. + + e) Verify that the user has already received a copy of these + materials or that you have already sent this user a copy. + + For an executable, the required form of the "work that uses the +Library" must include any data and utility programs needed for +reproducing the executable from it. 
However, as a special exception, +the materials to be distributed need not include anything that is +normally distributed (in either source or binary form) with the major +components (compiler, kernel, and so on) of the operating system on +which the executable runs, unless that component itself accompanies +the executable. + + It may happen that this requirement contradicts the license +restrictions of other proprietary libraries that do not normally +accompany the operating system. Such a contradiction means you cannot +use both them and the Library together in an executable that you +distribute. + + 7. You may place library facilities that are a work based on the +Library side-by-side in a single library together with other library +facilities not covered by this License, and distribute such a combined +library, provided that the separate distribution of the work based on +the Library and of the other library facilities is otherwise +permitted, and provided that you do these two things: + + a) Accompany the combined library with a copy of the same work + based on the Library, uncombined with any other library + facilities. This must be distributed under the terms of the + Sections above. + + b) Give prominent notice with the combined library of the fact + that part of it is a work based on the Library, and explaining + where to find the accompanying uncombined form of the same work. + + 8. You may not copy, modify, sublicense, link with, or distribute +the Library except as expressly provided under this License. Any +attempt otherwise to copy, modify, sublicense, link with, or +distribute the Library is void, and will automatically terminate your +rights under this License. However, parties who have received copies, +or rights, from you under this License will not have their licenses +terminated so long as such parties remain in full compliance. + + 9. You are not required to accept this License, since you have not +signed it. 
However, nothing else grants you permission to modify or +distribute the Library or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Library (or any work based on the +Library), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Library or works based on it. + + 10. Each time you redistribute the Library (or any work based on the +Library), the recipient automatically receives a license from the +original licensor to copy, distribute, link with or modify the Library +subject to these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties with +this License. + + 11. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Library at all. For example, if a patent +license would not permit royalty-free redistribution of the Library by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Library. + +If any portion of this section is held invalid or unenforceable under any +particular circumstance, the balance of the section is intended to apply, +and the section as a whole is intended to apply in other circumstances. 
+ +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 12. If the distribution and/or use of the Library is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Library under this License may add +an explicit geographical distribution limitation excluding those countries, +so that distribution is permitted only in or among countries not thus +excluded. In such case, this License incorporates the limitation as if +written in the body of this License. + + 13. The Free Software Foundation may publish revised and/or new +versions of the Lesser General Public License from time to time. +Such new versions will be similar in spirit to the present version, +but may differ in detail to address new problems or concerns. + +Each version is given a distinguishing version number. If the Library +specifies a version number of this License which applies to it and +"any later version", you have the option of following the terms and +conditions either of that version or of any later version published by +the Free Software Foundation. If the Library does not specify a +license version number, you may choose any version ever published by +the Free Software Foundation. + + 14. 
If you wish to incorporate parts of the Library into other free +programs whose distribution conditions are incompatible with these, +write to the author to ask for permission. For software which is +copyrighted by the Free Software Foundation, write to the Free +Software Foundation; we sometimes make exceptions for this. Our +decision will be guided by the two goals of preserving the free status +of all derivatives of our free software and of promoting the sharing +and reuse of software generally. + + NO WARRANTY + + 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO +WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. +EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR +OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY +KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE +LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME +THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN +WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY +AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU +FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR +CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE +LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING +RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A +FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF +SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH +DAMAGES. 
+ + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Libraries + + If you develop a new library, and you want it to be of the greatest +possible use to the public, we recommend making it free software that +everyone can redistribute and change. You can do so by permitting +redistribution under these terms (or, alternatively, under the terms of the +ordinary General Public License). + + To apply these terms, attach the following notices to the library. It is +safest to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least the +"copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +Also add information on how to contact you by electronic and paper mail. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the library, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the + library `Frob' (a library for tweaking knobs) written by James Random Hacker. + + , 1 April 1990 + Ty Coon, President of Vice + +That's all there is to it! 
+ + diff --git a/source/compat/getopt/getopt.c b/source/compat/getopt/getopt.c new file mode 100644 index 0000000..5427773 --- /dev/null +++ b/source/compat/getopt/getopt.c @@ -0,0 +1,1066 @@ +/* Getopt for GNU. + NOTE: getopt is now part of the C library, so if you don't know what + "Keep this file name-space clean" means, talk to drepper@gnu.org + before changing it! + Copyright (C) 1987,88,89,90,91,92,93,94,95,96,98,99,2000,2001 + Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +/* This tells Alpha OSF/1 not to define a getopt prototype in . + Ditto for AIX 3.2 and . */ +#ifndef _NO_PROTO +# define _NO_PROTO +#endif + +#ifdef HAVE_CONFIG_H +# include +#endif + +#if !defined __STDC__ || !__STDC__ +/* This is a separate conditional since some stdc systems + reject `defined (const)'. */ +# ifndef const +# define const +# endif +#endif + +#include + +/* Comment out all this code if we are using the GNU C Library, and are not + actually compiling the library itself. This code is part of the GNU C + Library, but also included in many other GNU distributions. Compiling + and linking in this code is a waste when using the GNU C library + (especially if it is a shared library). 
Rather than having every GNU + program understand `configure --with-gnu-libc' and omit the object files, + it is simpler to just do this in the source for each such file. */ + +#define GETOPT_INTERFACE_VERSION 2 +#if !defined _LIBC && defined __GLIBC__ && __GLIBC__ >= 2 +# include +# if _GNU_GETOPT_INTERFACE_VERSION == GETOPT_INTERFACE_VERSION +# define ELIDE_CODE +# endif +#endif + +#ifndef ELIDE_CODE + + +/* This needs to come after some library #include + to get __GNU_LIBRARY__ defined. */ +#ifdef __GNU_LIBRARY__ +/* Don't include stdlib.h for non-GNU C libraries because some of them + contain conflicting prototypes for getopt. */ +# include +# include +#endif /* GNU C library. */ + +#ifdef VMS +# include +# if HAVE_STRING_H - 0 +# include +# endif +#endif + +#ifndef _ +/* This is for other GNU distributions with internationalized messages. */ +# if defined HAVE_LIBINTL_H || defined _LIBC +# include +# ifndef _ +# define _(msgid) gettext (msgid) +# endif +# else +# define _(msgid) (msgid) +# endif +#endif + +/* This version of `getopt' appears to the caller like standard Unix `getopt' + but it behaves differently for the user, since it allows the user + to intersperse the options with the other arguments. + + As `getopt' works, it permutes the elements of ARGV so that, + when it is done, all the options precede everything else. Thus + all application programs are extended to handle flexible argument order. + + Setting the environment variable POSIXLY_CORRECT disables permutation. + Then the behavior is completely standard. + + GNU application programs can use a third alternative mode in which + they can distinguish the relative order of options and other arguments. */ + +#include "getopt.h" + +/* For communication from `getopt' to the caller. + When `getopt' finds an option that takes an argument, + the argument value is returned here. + Also, when `ordering' is RETURN_IN_ORDER, + each non-option ARGV-element is returned here. 
*/ + +char *optarg; + +/* Index in ARGV of the next element to be scanned. + This is used for communication to and from the caller + and for communication between successive calls to `getopt'. + + On entry to `getopt', zero means this is the first call; initialize. + + When `getopt' returns -1, this is the index of the first of the + non-option elements that the caller should itself scan. + + Otherwise, `optind' communicates from one call to the next + how much of ARGV has been scanned so far. */ + +/* 1003.2 says this must be 1 before any call. */ +int optind = 1; + +/* Formerly, initialization of getopt depended on optind==0, which + causes problems with re-calling getopt as programs generally don't + know that. */ + +int __getopt_initialized; + +/* The next char to be scanned in the option-element + in which the last option character we returned was found. + This allows us to pick up the scan where we left off. + + If this is zero, or a null string, it means resume the scan + by advancing to the next ARGV-element. */ + +static char *nextchar; + +/* Callers store zero here to inhibit the error message + for unrecognized options. */ + +int opterr = 1; + +/* Set to an option character which was unrecognized. + This must be initialized on some systems to avoid linking in the + system's own getopt implementation. */ + +int optopt = '?'; + +/* Describe how to deal with options that follow non-option ARGV-elements. + + If the caller did not specify anything, + the default is REQUIRE_ORDER if the environment variable + POSIXLY_CORRECT is defined, PERMUTE otherwise. + + REQUIRE_ORDER means don't recognize them as options; + stop option processing when the first non-option is seen. + This is what Unix does. + This mode of operation is selected by either setting the environment + variable POSIXLY_CORRECT, or using `+' as the first character + of the list of option characters. + + PERMUTE is the default. 
We permute the contents of ARGV as we scan, + so that eventually all the non-options are at the end. This allows options + to be given in any order, even with programs that were not written to + expect this. + + RETURN_IN_ORDER is an option available to programs that were written + to expect options and other ARGV-elements in any order and that care about + the ordering of the two. We describe each non-option ARGV-element + as if it were the argument of an option with character code 1. + Using `-' as the first character of the list of option characters + selects this mode of operation. + + The special argument `--' forces an end of option-scanning regardless + of the value of `ordering'. In the case of RETURN_IN_ORDER, only + `--' can cause `getopt' to return -1 with `optind' != ARGC. */ + +static enum +{ + REQUIRE_ORDER, PERMUTE, RETURN_IN_ORDER +} ordering; + +/* Value of POSIXLY_CORRECT environment variable. */ +static char *posixly_correct; + +#ifdef __GNU_LIBRARY__ +/* We want to avoid inclusion of string.h with non-GNU libraries + because there are many ways it can cause trouble. + On some systems, it contains special magic macros that don't work + in GCC. */ +# include +# define my_index strchr +#else + +# if HAVE_STRING_H +# include +# else +# include +# endif + +/* Avoid depending on library functions or files + whose names are inconsistent. */ + +#ifndef getenv +extern char *getenv (); +#endif + +static char * +my_index (str, chr) + const char *str; + int chr; +{ + while (*str) + { + if (*str == chr) + return (char *) str; + str++; + } + return 0; +} + +/* If using GCC, we can safely declare strlen this way. + If not using GCC, it is ok not to declare it. */ +#ifdef __GNUC__ +/* Note that Motorola Delta 68k R3V7 comes with GCC but not stddef.h. + That was relevant to code that was here before. 
*/ +# if (!defined __STDC__ || !__STDC__) && !defined strlen +/* gcc with -traditional declares the built-in strlen to return int, + and has done so at least since version 2.4.5. -- rms. */ +extern int strlen (const char *); +# endif /* not __STDC__ */ +#endif /* __GNUC__ */ + +#endif /* not __GNU_LIBRARY__ */ + +/* Handle permutation of arguments. */ + +/* Describe the part of ARGV that contains non-options that have + been skipped. `first_nonopt' is the index in ARGV of the first of them; + `last_nonopt' is the index after the last of them. */ + +static int first_nonopt; +static int last_nonopt; + +#ifdef _LIBC +/* Stored original parameters. + XXX This is no good solution. We should rather copy the args so + that we can compare them later. But we must not use malloc(3). */ +extern int __libc_argc; +extern char **__libc_argv; + +/* Bash 2.0 gives us an environment variable containing flags + indicating ARGV elements that should not be considered arguments. */ + +# ifdef USE_NONOPTION_FLAGS +/* Defined in getopt_init.c */ +extern char *__getopt_nonoption_flags; + +static int nonoption_flags_max_len; +static int nonoption_flags_len; +# endif + +# ifdef USE_NONOPTION_FLAGS +# define SWAP_FLAGS(ch1, ch2) \ + if (nonoption_flags_len > 0) \ + { \ + char __tmp = __getopt_nonoption_flags[ch1]; \ + __getopt_nonoption_flags[ch1] = __getopt_nonoption_flags[ch2]; \ + __getopt_nonoption_flags[ch2] = __tmp; \ + } +# else +# define SWAP_FLAGS(ch1, ch2) +# endif +#else /* !_LIBC */ +# define SWAP_FLAGS(ch1, ch2) +#endif /* _LIBC */ + +/* Exchange two adjacent subsequences of ARGV. + One subsequence is elements [first_nonopt,last_nonopt) + which contains all the non-options that have been skipped so far. + The other is elements [last_nonopt,optind), which contains all + the options processed since those non-options were skipped. + + `first_nonopt' and `last_nonopt' are relocated so that they describe + the new indices of the non-options in ARGV after they are moved. 
*/ + +#if defined __STDC__ && __STDC__ +static void exchange (char **); +#endif + +static void +exchange (argv) + char **argv; +{ + int bottom = first_nonopt; + int middle = last_nonopt; + int top = optind; + char *tem; + + /* Exchange the shorter segment with the far end of the longer segment. + That puts the shorter segment into the right place. + It leaves the longer segment in the right place overall, + but it consists of two parts that need to be swapped next. */ + +#if defined _LIBC && defined USE_NONOPTION_FLAGS + /* First make sure the handling of the `__getopt_nonoption_flags' + string can work normally. Our top argument must be in the range + of the string. */ + if (nonoption_flags_len > 0 && top >= nonoption_flags_max_len) + { + /* We must extend the array. The user plays games with us and + presents new arguments. */ + char *new_str = malloc (top + 1); + if (new_str == NULL) + nonoption_flags_len = nonoption_flags_max_len = 0; + else + { + memset (__mempcpy (new_str, __getopt_nonoption_flags, + nonoption_flags_max_len), + '\0', top + 1 - nonoption_flags_max_len); + nonoption_flags_max_len = top + 1; + __getopt_nonoption_flags = new_str; + } + } +#endif + + while (top > middle && middle > bottom) + { + if (top - middle > middle - bottom) + { + /* Bottom segment is the short one. */ + int len = middle - bottom; + register int i; + + /* Swap it with the top part of the top segment. */ + for (i = 0; i < len; i++) + { + tem = argv[bottom + i]; + argv[bottom + i] = argv[top - (middle - bottom) + i]; + argv[top - (middle - bottom) + i] = tem; + SWAP_FLAGS (bottom + i, top - (middle - bottom) + i); + } + /* Exclude the moved bottom segment from further swapping. */ + top -= len; + } + else + { + /* Top segment is the short one. */ + int len = top - middle; + register int i; + + /* Swap it with the bottom part of the bottom segment. 
*/ + for (i = 0; i < len; i++) + { + tem = argv[bottom + i]; + argv[bottom + i] = argv[middle + i]; + argv[middle + i] = tem; + SWAP_FLAGS (bottom + i, middle + i); + } + /* Exclude the moved top segment from further swapping. */ + bottom += len; + } + } + + /* Update records for the slots the non-options now occupy. */ + + first_nonopt += (optind - last_nonopt); + last_nonopt = optind; +} + +/* Initialize the internal data when the first call is made. */ + +#if defined __STDC__ && __STDC__ +static const char *_getopt_initialize (int, char *const *, const char *); +#endif +static const char * +_getopt_initialize (argc, argv, optstring) + int argc; + char *const *argv; + const char *optstring; +{ + /* Start processing options with ARGV-element 1 (since ARGV-element 0 + is the program name); the sequence of previously skipped + non-option ARGV-elements is empty. */ + + first_nonopt = last_nonopt = optind; + + nextchar = NULL; + + posixly_correct = getenv ("POSIXLY_CORRECT"); + + /* Determine how to handle the ordering of options and nonoptions. 
*/ + + if (optstring[0] == '-') + { + ordering = RETURN_IN_ORDER; + ++optstring; + } + else if (optstring[0] == '+') + { + ordering = REQUIRE_ORDER; + ++optstring; + } + else if (posixly_correct != NULL) + ordering = REQUIRE_ORDER; + else + ordering = PERMUTE; + +#if defined _LIBC && defined USE_NONOPTION_FLAGS + if (posixly_correct == NULL + && argc == __libc_argc && argv == __libc_argv) + { + if (nonoption_flags_max_len == 0) + { + if (__getopt_nonoption_flags == NULL + || __getopt_nonoption_flags[0] == '\0') + nonoption_flags_max_len = -1; + else + { + const char *orig_str = __getopt_nonoption_flags; + int len = nonoption_flags_max_len = strlen (orig_str); + if (nonoption_flags_max_len < argc) + nonoption_flags_max_len = argc; + __getopt_nonoption_flags = + (char *) malloc (nonoption_flags_max_len); + if (__getopt_nonoption_flags == NULL) + nonoption_flags_max_len = -1; + else + memset (__mempcpy (__getopt_nonoption_flags, orig_str, len), + '\0', nonoption_flags_max_len - len); + } + } + nonoption_flags_len = nonoption_flags_max_len; + } + else + nonoption_flags_len = 0; +#endif + + return optstring; +} + +/* Scan elements of ARGV (whose length is ARGC) for option characters + given in OPTSTRING. + + If an element of ARGV starts with '-', and is not exactly "-" or "--", + then it is an option element. The characters of this element + (aside from the initial '-') are option characters. If `getopt' + is called repeatedly, it returns successively each of the option characters + from each of the option elements. + + If `getopt' finds another option character, it returns that character, + updating `optind' and `nextchar' so that the next call to `getopt' can + resume the scan with the following option character or ARGV-element. + + If there are no more option characters, `getopt' returns -1. + Then `optind' is the index in ARGV of the first ARGV-element + that is not an option. (The ARGV-elements have been permuted + so that those that are not options now come last.) 
+ + OPTSTRING is a string containing the legitimate option characters. + If an option character is seen that is not listed in OPTSTRING, + return '?' after printing an error message. If you set `opterr' to + zero, the error message is suppressed but we still return '?'. + + If a char in OPTSTRING is followed by a colon, that means it wants an arg, + so the following text in the same ARGV-element, or the text of the following + ARGV-element, is returned in `optarg'. Two colons mean an option that + wants an optional arg; if there is text in the current ARGV-element, + it is returned in `optarg', otherwise `optarg' is set to zero. + + If OPTSTRING starts with `-' or `+', it requests different methods of + handling the non-option ARGV-elements. + See the comments about RETURN_IN_ORDER and REQUIRE_ORDER, above. + + Long-named options begin with `--' instead of `-'. + Their names may be abbreviated as long as the abbreviation is unique + or is an exact match for some defined option. If they have an + argument, it follows the option name in the same ARGV-element, separated + from the option name by a `=', or else the in next ARGV-element. + When `getopt' finds a long-named option, it returns 0 if that option's + `flag' field is nonzero, the value of the option's `val' field + if the `flag' field is zero. + + The elements of ARGV aren't really const, because we permute them. + But we pretend they're const in the prototype to be compatible + with other systems. + + LONGOPTS is a vector of `struct option' terminated by an + element containing a name which is zero. + + LONGIND returns the index in LONGOPT of the long-named option found. + It is only valid when a long-named option has been found by the most + recent call. + + If LONG_ONLY is nonzero, '-' as well as '--' can introduce + long-named options. 
*/ + +int +_getopt_internal (argc, argv, optstring, longopts, longind, long_only) + int argc; + char *const *argv; + const char *optstring; + const struct option *longopts; + int32_t *longind; + int long_only; +{ + int print_errors = opterr; + if (optstring[0] == ':') + print_errors = 0; + + if (argc < 1) + return -1; + + optarg = NULL; + + if (optind == 0 || !__getopt_initialized) + { + if (optind == 0) + optind = 1; /* Don't scan ARGV[0], the program name. */ + optstring = _getopt_initialize (argc, argv, optstring); + __getopt_initialized = 1; + } + + /* Test whether ARGV[optind] points to a non-option argument. + Either it does not have option syntax, or there is an environment flag + from the shell indicating it is not an option. The later information + is only used when the used in the GNU libc. */ +#if defined _LIBC && defined USE_NONOPTION_FLAGS +# define NONOPTION_P (argv[optind][0] != '-' || argv[optind][1] == '\0' \ + || (optind < nonoption_flags_len \ + && __getopt_nonoption_flags[optind] == '1')) +#else +# define NONOPTION_P (argv[optind][0] != '-' || argv[optind][1] == '\0') +#endif + + if (nextchar == NULL || *nextchar == '\0') + { + /* Advance to the next ARGV-element. */ + + /* Give FIRST_NONOPT & LAST_NONOPT rational values if OPTIND has been + moved back by the user (who may also have changed the arguments). */ + if (last_nonopt > optind) + last_nonopt = optind; + if (first_nonopt > optind) + first_nonopt = optind; + + if (ordering == PERMUTE) + { + /* If we have just processed some options following some non-options, + exchange them so that the options come first. */ + + if (first_nonopt != last_nonopt && last_nonopt != optind) + exchange ((char **) argv); + else if (last_nonopt != optind) + first_nonopt = optind; + + /* Skip any additional non-options + and extend the range of non-options previously skipped. 
*/ + + while (optind < argc && NONOPTION_P) + optind++; + last_nonopt = optind; + } + + /* The special ARGV-element `--' means premature end of options. + Skip it like a null option, + then exchange with previous non-options as if it were an option, + then skip everything else like a non-option. */ + + if (optind != argc && !strcmp (argv[optind], "--")) + { + optind++; + + if (first_nonopt != last_nonopt && last_nonopt != optind) + exchange ((char **) argv); + else if (first_nonopt == last_nonopt) + first_nonopt = optind; + last_nonopt = argc; + + optind = argc; + } + + /* If we have done all the ARGV-elements, stop the scan + and back over any non-options that we skipped and permuted. */ + + if (optind == argc) + { + /* Set the next-arg-index to point at the non-options + that we previously skipped, so the caller will digest them. */ + if (first_nonopt != last_nonopt) + optind = first_nonopt; + return -1; + } + + /* If we have come to a non-option and did not permute it, + either stop the scan or describe it to the caller and pass it by. */ + + if (NONOPTION_P) + { + if (ordering == REQUIRE_ORDER) + return -1; + optarg = argv[optind++]; + return 1; + } + + /* We have found another option-ARGV-element. + Skip the initial punctuation. */ + + nextchar = (argv[optind] + 1 + + (longopts != NULL && argv[optind][1] == '-')); + } + + /* Decode the current option-ARGV-element. */ + + /* Check whether the ARGV-element is a long option. + + If long_only and the ARGV-element has the form "-f", where f is + a valid short option, don't consider it an abbreviated form of + a long option that starts with f. Otherwise there would be no + way to give the -f short option. + + On the other hand, if there's a long option "fubar" and + the ARGV-element is "-fu", do consider that an abbreviation of + the long option, just like "--fu", and not "-f" with arg "u". + + This distinction seems to be the most useful approach. 
*/ + + if (longopts != NULL + && (argv[optind][1] == '-' + || (long_only && (argv[optind][2] || !my_index (optstring, argv[optind][1]))))) + { + char *nameend; + const struct option *p; + const struct option *pfound = NULL; + int exact = 0; + int ambig = 0; + int indfound = -1; + int option_index; + + for (nameend = nextchar; *nameend && *nameend != '='; nameend++) + /* Do nothing. */ ; + + /* Test all long options for either exact match + or abbreviated matches. */ + for (p = longopts, option_index = 0; p->name; p++, option_index++) + if (!strncmp (p->name, nextchar, nameend - nextchar)) + { + if ((unsigned int) (nameend - nextchar) + == (unsigned int) strlen (p->name)) + { + /* Exact match found. */ + pfound = p; + indfound = option_index; + exact = 1; + break; + } + else if (pfound == NULL) + { + /* First nonexact match found. */ + pfound = p; + indfound = option_index; + } + else if (long_only + || pfound->has_arg != p->has_arg + || pfound->flag != p->flag + || pfound->val != p->val) + /* Second or later nonexact match found. */ + ambig = 1; + } + + if (ambig && !exact) + { + if (print_errors) + fprintf (stderr, _("%s: option `%s' is ambiguous\n"), + argv[0], argv[optind]); + nextchar += strlen (nextchar); + optind++; + optopt = 0; + return '?'; + } + + if (pfound != NULL) + { + option_index = indfound; + optind++; + if (*nameend) + { + /* Don't test has_arg with >, because some C compilers don't + allow it to be used on enums. 
*/ + if (pfound->has_arg) + optarg = nameend + 1; + else + { + if (print_errors) + { + if (argv[optind - 1][1] == '-') + /* --option */ + fprintf (stderr, + _("%s: option `--%s' doesn't allow an argument\n"), + argv[0], pfound->name); + else + /* +option or -option */ + fprintf (stderr, + _("%s: option `%c%s' doesn't allow an argument\n"), + argv[0], argv[optind - 1][0], pfound->name); + } + + nextchar += strlen (nextchar); + + optopt = pfound->val; + return '?'; + } + } + else if (pfound->has_arg == 1) + { + if (optind < argc) + optarg = argv[optind++]; + else + { + if (print_errors) + fprintf (stderr, + _("%s: option `%s' requires an argument\n"), + argv[0], argv[optind - 1]); + nextchar += strlen (nextchar); + optopt = pfound->val; + return optstring[0] == ':' ? ':' : '?'; + } + } + nextchar += strlen (nextchar); + if (longind != NULL) + *longind = option_index; + if (pfound->flag) + { + *(pfound->flag) = pfound->val; + return 0; + } + return pfound->val; + } + + /* Can't find it as a long option. If this is not getopt_long_only, + or the option starts with '--' or is not a valid short + option, then it's an error. + Otherwise interpret it as a short option. */ + if (!long_only || argv[optind][1] == '-' + || my_index (optstring, *nextchar) == NULL) + { + if (print_errors) + { + if (argv[optind][1] == '-') + /* --option */ + fprintf (stderr, _("%s: unrecognized option `--%s'\n"), + argv[0], nextchar); + else + /* +option or -option */ + fprintf (stderr, _("%s: unrecognized option `%c%s'\n"), + argv[0], argv[optind][0], nextchar); + } + nextchar = (char *) ""; + optind++; + optopt = 0; + return '?'; + } + } + + /* Look at and handle the next short option-character. */ + + { + char c = *nextchar++; + char *temp = my_index (optstring, c); + + /* Increment `optind' when we start to process its last character. 
*/ + if (*nextchar == '\0') + ++optind; + + if (temp == NULL || c == ':') + { + if (print_errors) + { + if (posixly_correct) + /* 1003.2 specifies the format of this message. */ + fprintf (stderr, _("%s: illegal option -- %c\n"), + argv[0], c); + else + fprintf (stderr, _("%s: invalid option -- %c\n"), + argv[0], c); + } + optopt = c; + return '?'; + } + /* Convenience. Treat POSIX -W foo same as long option --foo */ + if (temp[0] == 'W' && temp[1] == ';') + { + char *nameend; + const struct option *p; + const struct option *pfound = NULL; + int exact = 0; + int ambig = 0; + int indfound = 0; + int option_index; + + /* This is an option that requires an argument. */ + if (*nextchar != '\0') + { + optarg = nextchar; + /* If we end this ARGV-element by taking the rest as an arg, + we must advance to the next element now. */ + optind++; + } + else if (optind == argc) + { + if (print_errors) + { + /* 1003.2 specifies the format of this message. */ + fprintf (stderr, _("%s: option requires an argument -- %c\n"), + argv[0], c); + } + optopt = c; + if (optstring[0] == ':') + c = ':'; + else + c = '?'; + return c; + } + else + /* We already incremented `optind' once; + increment it again when taking next ARGV-elt as argument. */ + optarg = argv[optind++]; + + /* optarg is now the argument, see if it's in the + table of longopts. */ + + for (nextchar = nameend = optarg; *nameend && *nameend != '='; nameend++) + /* Do nothing. */ ; + + /* Test all long options for either exact match + or abbreviated matches. */ + for (p = longopts, option_index = 0; p->name; p++, option_index++) + if (!strncmp (p->name, nextchar, nameend - nextchar)) + { + if ((unsigned int) (nameend - nextchar) == strlen (p->name)) + { + /* Exact match found. */ + pfound = p; + indfound = option_index; + exact = 1; + break; + } + else if (pfound == NULL) + { + /* First nonexact match found. */ + pfound = p; + indfound = option_index; + } + else + /* Second or later nonexact match found. 
*/ + ambig = 1; + } + if (ambig && !exact) + { + if (print_errors) + fprintf (stderr, _("%s: option `-W %s' is ambiguous\n"), + argv[0], argv[optind]); + nextchar += strlen (nextchar); + optind++; + return '?'; + } + if (pfound != NULL) + { + option_index = indfound; + if (*nameend) + { + /* Don't test has_arg with >, because some C compilers don't + allow it to be used on enums. */ + if (pfound->has_arg) + optarg = nameend + 1; + else + { + if (print_errors) + fprintf (stderr, _("\ +%s: option `-W %s' doesn't allow an argument\n"), + argv[0], pfound->name); + + nextchar += strlen (nextchar); + return '?'; + } + } + else if (pfound->has_arg == 1) + { + if (optind < argc) + optarg = argv[optind++]; + else + { + if (print_errors) + fprintf (stderr, + _("%s: option `%s' requires an argument\n"), + argv[0], argv[optind - 1]); + nextchar += strlen (nextchar); + return optstring[0] == ':' ? ':' : '?'; + } + } + nextchar += strlen (nextchar); + if (longind != NULL) + *longind = option_index; + if (pfound->flag) + { + *(pfound->flag) = pfound->val; + return 0; + } + return pfound->val; + } + nextchar = NULL; + return 'W'; /* Let the application handle it. */ + } + if (temp[1] == ':') + { + if (temp[2] == ':') + { + /* This is an option that accepts an argument optionally. */ + if (*nextchar != '\0') + { + optarg = nextchar; + optind++; + } + else + optarg = NULL; + nextchar = NULL; + } + else + { + /* This is an option that requires an argument. */ + if (*nextchar != '\0') + { + optarg = nextchar; + /* If we end this ARGV-element by taking the rest as an arg, + we must advance to the next element now. */ + optind++; + } + else if (optind == argc) + { + if (print_errors) + { + /* 1003.2 specifies the format of this message. 
*/ + fprintf (stderr, + _("%s: option requires an argument -- %c\n"), + argv[0], c); + } + optopt = c; + if (optstring[0] == ':') + c = ':'; + else + c = '?'; + } + else + /* We already incremented `optind' once; + increment it again when taking next ARGV-elt as argument. */ + optarg = argv[optind++]; + nextchar = NULL; + } + } + return c; + } +} + +int +getopt (argc, argv, optstring) + int argc; + char *const *argv; + const char *optstring; +{ + return _getopt_internal (argc, argv, optstring, + (const struct option *) 0, + (int32_t *) 0, + 0); +} + +int +getopt_long (argc, argv, options, long_options, opt_index) + int argc; + char *const *argv; + const char *options; + const struct option *long_options; + int32_t *opt_index; +{ + return _getopt_internal (argc, argv, options, long_options, opt_index, 0); +} + +#endif /* Not ELIDE_CODE. */ + +#ifdef TEST + +/* Compile with -DTEST to make an executable for use in testing + the above definition of `getopt'. */ + +int +main (argc, argv) + int argc; + char **argv; +{ + int c; + int digit_optind = 0; + + while (1) + { + int this_option_optind = optind ? optind : 1; + + c = getopt (argc, argv, "abc:d:0123456789"); + if (c == -1) + break; + + switch (c) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + if (digit_optind != 0 && digit_optind != this_option_optind) + printf ("digits occur in two different argv-elements.\n"); + digit_optind = this_option_optind; + printf ("option %c\n", c); + break; + + case 'a': + printf ("option a\n"); + break; + + case 'b': + printf ("option b\n"); + break; + + case 'c': + printf ("option c with value `%s'\n", optarg); + break; + + case '?': + break; + + default: + printf ("?? 
getopt returned character code 0%o ??\n", c); + } + } + + if (optind < argc) + { + printf ("non-option ARGV-elements: "); + while (optind < argc) + printf ("%s ", argv[optind++]); + printf ("\n"); + } + + exit (0); +} + +#endif /* TEST */ diff --git a/source/compat/getopt/getopt.h b/source/compat/getopt/getopt.h new file mode 100644 index 0000000..886794a --- /dev/null +++ b/source/compat/getopt/getopt.h @@ -0,0 +1,182 @@ +/* Declarations for getopt. + Copyright (C) 1989-1994, 1996-1999, 2001 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#ifndef _GETOPT_H + +#ifndef __need_getopt +# define _GETOPT_H 1 +#endif + +#include + +/* If __GNU_LIBRARY__ is not already defined, either we are being used + standalone, or this is the first header included in the source file. + If we are being used with glibc, we need to include , but + that does not exist if we are standalone. So: if __GNU_LIBRARY__ is + not defined, include , which will pull in for us + if it's from glibc. (Why ctype.h? It's guaranteed to exist and it + doesn't flood the namespace with stuff the way some other headers do.) 
*/ +#if !defined __GNU_LIBRARY__ +# include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* For communication from `getopt' to the caller. + When `getopt' finds an option that takes an argument, + the argument value is returned here. + Also, when `ordering' is RETURN_IN_ORDER, + each non-option ARGV-element is returned here. */ + +extern char *optarg; + +/* Index in ARGV of the next element to be scanned. + This is used for communication to and from the caller + and for communication between successive calls to `getopt'. + + On entry to `getopt', zero means this is the first call; initialize. + + When `getopt' returns -1, this is the index of the first of the + non-option elements that the caller should itself scan. + + Otherwise, `optind' communicates from one call to the next + how much of ARGV has been scanned so far. */ + +extern int optind; + +/* Callers store zero here to inhibit the error message `getopt' prints + for unrecognized options. */ + +extern int opterr; + +/* Set to an option character which was unrecognized. */ + +extern int optopt; + +#ifndef __need_getopt +/* Describe the long-named options requested by the application. + The LONG_OPTIONS argument to getopt_long or getopt_long_only is a vector + of `struct option' terminated by an element containing a name which is + zero. + + The field `has_arg' is: + no_argument (or 0) if the option does not take an argument, + required_argument (or 1) if the option requires an argument, + optional_argument (or 2) if the option takes an optional argument. + + If the field `flag' is not NULL, it points to a variable that is set + to the value given in the field `val' when the option is found, but + left unchanged if the option is not found. 
+ + To have a long-named option do something other than set an `int' to + a compiled-in constant, such as set a value from `optarg', set the + option's `flag' field to zero and its `val' field to a nonzero + value (the equivalent single-letter option character, if there is + one). For long options that have a zero `flag' field, `getopt' + returns the contents of the `val' field. */ + +struct option +{ +# if (defined __STDC__ && __STDC__) || defined __cplusplus + const char *name; +# else + char *name; +# endif + /* has_arg can't be an enum because some compilers complain about + type mismatches in all the code that assumes it is an int. */ + int has_arg; + int32_t *flag; + int val; +}; + +/* Names for the values of the `has_arg' field of `struct option'. */ + +# define no_argument 0 +# define required_argument 1 +# define optional_argument 2 +#endif /* need getopt */ + + +/* Get definitions and prototypes for functions to process the + arguments in ARGV (ARGC of them, minus the program name) for + options given in OPTS. + + Return the option character from OPTS just read. Return -1 when + there are no more options. For unrecognized options, or options + missing arguments, `optopt' is set to the option letter, and '?' is + returned. + + The OPTS string is a list of characters which are recognized option + letters, optionally followed by colons, specifying that that letter + takes an argument, to be placed in `optarg'. + + If a letter in OPTS is followed by two colons, its argument is + optional. This behavior is specific to the GNU `getopt'. + + The argument `--' causes premature termination of argument + scanning, explicitly telling `getopt' that there are no more + options. + + If OPTS begins with `--', then non-option arguments are treated as + arguments to the option '\0'. This behavior is specific to the GNU + `getopt'. 
*/ + +#if (defined __STDC__ && __STDC__) || defined __cplusplus +# ifdef __GNU_LIBRARY__ +/* Many other libraries have conflicting prototypes for getopt, with + differences in the consts, in stdlib.h. To avoid compilation + errors, only prototype getopt for the GNU C library. */ +extern int getopt (int __argc, char *const *__argv, const char *__shortopts); +# else /* not __GNU_LIBRARY__ */ +extern int getopt (); +# endif /* __GNU_LIBRARY__ */ + +# ifndef __need_getopt +extern int getopt_long (int __argc, char *const *__argv, const char *__shortopts, + const struct option *__longopts, int32_t *__longind); +extern int getopt_long_only (int __argc, char *const *__argv, + const char *__shortopts, + const struct option *__longopts, int32_t *__longind); + +/* Internal only. Users should not call this directly. */ +extern int _getopt_internal (int __argc, char *const *__argv, + const char *__shortopts, + const struct option *__longopts, int32_t *__longind, + int __long_only); +# endif +#else /* not __STDC__ */ +extern int getopt (); +# ifndef __need_getopt +extern int getopt_long (); +extern int getopt_long_only (); + +extern int _getopt_internal (); +# endif +#endif /* __STDC__ */ + +#ifdef __cplusplus +} +#endif + +/* Make sure we later can get all the definitions and declarations. */ +#undef __need_getopt + +#endif /* getopt.h */ diff --git a/source/compat/msvc/stdint.h b/source/compat/msvc/stdint.h new file mode 100644 index 0000000..00878e5 --- /dev/null +++ b/source/compat/msvc/stdint.h @@ -0,0 +1,24 @@ +#pragma once + +#ifndef _MSC_VER +#error "Use this header only with Microsoft Visual C++ compilers!" 
+#endif + +#include // for intptr_t +#if !defined(UINT64_MAX) +#include +#define UINT64_MAX _UI64_MAX +#endif + +/* a minimal set of C99 types for use with MSVC (VC9) */ + +typedef signed char int8_t; +typedef short int int16_t; +typedef int int32_t; +typedef __int64 int64_t; + +typedef unsigned char uint8_t; +typedef unsigned short int uint16_t; +typedef unsigned int uint32_t; +typedef unsigned __int64 uint64_t; + diff --git a/source/encoder/CMakeLists.txt b/source/encoder/CMakeLists.txt new file mode 100644 index 0000000..020364f --- /dev/null +++ b/source/encoder/CMakeLists.txt @@ -0,0 +1,25 @@ +# vim: syntax=cmake + +if(GCC) + add_definitions(-Wno-uninitialized) +endif() + +add_library(encoder OBJECT ../x265.h + analysis.cpp analysis.h + search.cpp search.h + bitcost.cpp bitcost.h rdcost.h + motion.cpp motion.h + slicetype.cpp slicetype.h + frameencoder.cpp frameencoder.h + framefilter.cpp framefilter.h + level.cpp level.h + nal.cpp nal.h + sei.cpp sei.h + sao.cpp sao.h + entropy.cpp entropy.h + dpb.cpp dpb.h + ratecontrol.cpp ratecontrol.h + reference.cpp reference.h + encoder.cpp encoder.h + api.cpp + weightPrediction.cpp) diff --git a/source/encoder/analysis.cpp b/source/encoder/analysis.cpp new file mode 100644 index 0000000..c62f5f0 --- /dev/null +++ b/source/encoder/analysis.cpp @@ -0,0 +1,1867 @@ +/***************************************************************************** +* Copyright (C) 2013 x265 project +* +* Authors: Deepthi Nandakumar +* Steve Borho +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation; either version 2 of the License, or +* (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. 
+* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software +* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +* +* This program is also available under a commercial proprietary license. +* For more information, contact us at license @ x265.com. +*****************************************************************************/ + +#include "common.h" +#include "frame.h" +#include "framedata.h" +#include "picyuv.h" +#include "primitives.h" +#include "threading.h" + +#include "analysis.h" +#include "rdcost.h" +#include "encoder.h" + +#include "PPA/ppa.h" + +using namespace x265; + +/* An explanation of rate distortion levels (--rd-level) + * + * rd-level 0 generates no recon per CU (NO RDO or Quant) + * + * sa8d selection between merge / skip / inter / intra and split + * no recon pixels generated until CTU analysis is complete, requiring + * intra predictions to use source pixels + * + * rd-level 1 uses RDO for merge and skip, sa8d for all else + * + * RDO selection between merge and skip + * sa8d selection between (merge/skip) / inter modes / intra and split + * intra prediction uses reconstructed pixels + * + * rd-level 2 uses RDO for merge/skip and split + * + * RDO selection between merge and skip + * sa8d selection between (merge/skip) / inter modes / intra + * RDO split decisions + * + * rd-level 3 uses RDO for merge/skip/best inter/intra + * + * RDO selection between merge and skip + * sa8d selection of best inter mode + * RDO selection between (merge/skip) / best inter mode / intra / split + * + * rd-level 4 enables RDOQuant + * + * rd-level 5,6 does RDO for each inter mode + */ + +Analysis::Analysis() +{ + m_totalNumJobs = m_numAcquiredJobs = m_numCompletedJobs = 0; +} + +bool Analysis::create(ThreadLocalData *tld) +{ + m_tld = tld; + m_bTryLossless = m_param->bCULossless && !m_param->bLossless && m_param->rdLevel >= 2; + + int csp = m_param->internalCsp; + 
uint32_t cuSize = g_maxCUSize; + + bool ok = true; + for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++, cuSize >>= 1) + { + ModeDepth &md = m_modeDepth[depth]; + + md.cuMemPool.create(depth, csp, MAX_PRED_TYPES); + ok &= md.fencYuv.create(cuSize, csp); + + for (int j = 0; j < MAX_PRED_TYPES; j++) + { + md.pred[j].cu.initialize(md.cuMemPool, depth, csp, j); + ok &= md.pred[j].predYuv.create(cuSize, csp); + ok &= md.pred[j].reconYuv.create(cuSize, csp); + md.pred[j].fencYuv = &md.fencYuv; + } + } + + return ok; +} + +void Analysis::destroy() +{ + for (uint32_t i = 0; i <= g_maxCUDepth; i++) + { + m_modeDepth[i].cuMemPool.destroy(); + m_modeDepth[i].fencYuv.destroy(); + + for (int j = 0; j < MAX_PRED_TYPES; j++) + { + m_modeDepth[i].pred[j].predYuv.destroy(); + m_modeDepth[i].pred[j].reconYuv.destroy(); + } + } +} + +Search::Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext) +{ + m_slice = ctu.m_slice; + m_frame = &frame; + + invalidateContexts(0); + m_quant.setQPforQuant(ctu); + m_rqt[0].cur.load(initialContext); + m_modeDepth[0].fencYuv.copyFromPicYuv(*m_frame->m_origPicYuv, ctu.m_cuAddr, 0); + + uint32_t numPartition = ctu.m_numPartitions; + if (m_slice->m_sliceType == I_SLICE) + { + uint32_t zOrder = 0; + if (m_param->analysisMode == X265_ANALYSIS_LOAD) + compressIntraCU(ctu, cuGeom, m_frame->m_intraData, zOrder); + else + { + compressIntraCU(ctu, cuGeom, NULL, zOrder); + + if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_frame->m_intraData) + { + CUData *bestCU = &m_modeDepth[0].bestMode->cu; + memcpy(&m_frame->m_intraData->depth[ctu.m_cuAddr * numPartition], bestCU->m_cuDepth, sizeof(uint8_t) * numPartition); + memcpy(&m_frame->m_intraData->modes[ctu.m_cuAddr * numPartition], bestCU->m_lumaIntraDir, sizeof(uint8_t) * numPartition); + memcpy(&m_frame->m_intraData->partSizes[ctu.m_cuAddr * numPartition], bestCU->m_partSize, sizeof(uint8_t) * numPartition); + 
m_frame->m_intraData->cuAddr[ctu.m_cuAddr] = ctu.m_cuAddr; + m_frame->m_intraData->poc[ctu.m_cuAddr] = m_frame->m_poc; + } + } + } + else + { + if (!m_param->rdLevel) + { + /* In RD Level 0/1, copy source pixels into the reconstructed block so + * they are available for intra predictions */ + m_modeDepth[0].fencYuv.copyToPicYuv(*m_frame->m_reconPicYuv, ctu.m_cuAddr, 0); + + compressInterCU_rd0_4(ctu, cuGeom); // TODO: this really wants to be compressInterCU_rd0_1 + + /* generate residual for entire CTU at once and copy to reconPic */ + encodeResidue(ctu, cuGeom); + } + else if (m_param->bDistributeModeAnalysis && m_param->rdLevel >= 2) + compressInterCU_dist(ctu, cuGeom); + else if (m_param->rdLevel <= 4) + compressInterCU_rd0_4(ctu, cuGeom); + else + compressInterCU_rd5_6(ctu, cuGeom); + } + + return *m_modeDepth[0].bestMode; +} + +void Analysis::tryLossless(const CUGeom& cuGeom) +{ + ModeDepth& md = m_modeDepth[cuGeom.depth]; + + if (!md.bestMode->distortion) + /* already lossless */ + return; + else if (md.bestMode->cu.m_predMode[0] == MODE_INTRA) + { + md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom); + PartSize size = (PartSize)md.pred[PRED_LOSSLESS].cu.m_partSize[0]; + uint8_t* modes = md.pred[PRED_LOSSLESS].cu.m_lumaIntraDir; + checkIntra(md.pred[PRED_LOSSLESS], cuGeom, size, modes); + checkBestMode(md.pred[PRED_LOSSLESS], cuGeom.depth); + } + else + { + md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom); + md.pred[PRED_LOSSLESS].predYuv.copyFromYuv(md.bestMode->predYuv); + encodeResAndCalcRdInterCU(md.pred[PRED_LOSSLESS], cuGeom); + checkBestMode(md.pred[PRED_LOSSLESS], cuGeom.depth); + } +} + +void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, x265_intra_data* shared, uint32_t& zOrder) +{ + uint32_t depth = cuGeom.depth; + ModeDepth& md = m_modeDepth[depth]; + md.bestMode = NULL; + + bool mightSplit = !(cuGeom.flags & CUGeom::LEAF); + bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY); 
+ + if (shared) + { + uint8_t* sharedDepth = &shared->depth[parentCTU.m_cuAddr * parentCTU.m_numPartitions]; + char* sharedPartSizes = &shared->partSizes[parentCTU.m_cuAddr * parentCTU.m_numPartitions]; + uint8_t* sharedModes = &shared->modes[parentCTU.m_cuAddr * parentCTU.m_numPartitions]; + + if (mightNotSplit && depth == sharedDepth[zOrder] && zOrder == cuGeom.encodeIdx) + { + m_quant.setQPforQuant(parentCTU); + + PartSize size = (PartSize)sharedPartSizes[zOrder]; + Mode& mode = size == SIZE_2Nx2N ? md.pred[PRED_INTRA] : md.pred[PRED_INTRA_NxN]; + mode.cu.initSubCU(parentCTU, cuGeom); + checkIntra(mode, cuGeom, size, sharedModes); + checkBestMode(mode, depth); + + if (m_bTryLossless) + tryLossless(cuGeom); + + if (mightSplit) + addSplitFlagCost(*md.bestMode, cuGeom.depth); + + // increment zOrder offset to point to next best depth in sharedDepth buffer + zOrder += g_depthInc[g_maxCUDepth - 1][sharedDepth[zOrder]]; + mightSplit = false; + } + } + else if (mightNotSplit) + { + m_quant.setQPforQuant(parentCTU); + + md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom); + checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N, NULL); + checkBestMode(md.pred[PRED_INTRA], depth); + + if (depth == g_maxCUDepth) + { + md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom); + checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN, NULL); + checkBestMode(md.pred[PRED_INTRA_NxN], depth); + } + + if (m_bTryLossless) + tryLossless(cuGeom); + + if (mightSplit) + addSplitFlagCost(*md.bestMode, cuGeom.depth); + } + + if (mightSplit) + { + Mode* splitPred = &md.pred[PRED_SPLIT]; + splitPred->initCosts(); + CUData* splitCU = &splitPred->cu; + splitCU->initSubCU(parentCTU, cuGeom); + + uint32_t nextDepth = depth + 1; + ModeDepth& nd = m_modeDepth[nextDepth]; + invalidateContexts(nextDepth); + Entropy* nextContext = &m_rqt[depth].cur; + + for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++) + { + const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx); + if 
(childCuData.flags & CUGeom::PRESENT) + { + m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childCuData.encodeIdx); + m_rqt[nextDepth].cur.load(*nextContext); + compressIntraCU(parentCTU, childCuData, shared, zOrder); + + // Save best CU and pred data for this sub CU + splitCU->copyPartFrom(nd.bestMode->cu, childCuData, subPartIdx); + splitPred->addSubCosts(*nd.bestMode); + nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childCuData.numPartitions * subPartIdx); + nextContext = &nd.bestMode->contexts; + } + else + { + /* record the depth of this non-present sub-CU */ + splitCU->setEmptyPart(childCuData, subPartIdx); + zOrder += g_depthInc[g_maxCUDepth - 1][nextDepth]; + } + } + nextContext->store(splitPred->contexts); + if (mightNotSplit) + addSplitFlagCost(*splitPred, cuGeom.depth); + else + updateModeCost(*splitPred); + checkBestMode(*splitPred, depth); + } + + checkDQP(md.bestMode->cu, cuGeom); + + /* Copy best data to encData CTU and recon */ + md.bestMode->cu.copyToPic(depth); + if (md.bestMode != &md.pred[PRED_SPLIT]) + md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPicYuv, parentCTU.m_cuAddr, cuGeom.encodeIdx); +} + +bool Analysis::findJob(int threadId) +{ + /* try to acquire a CU mode to analyze */ + if (m_totalNumJobs > m_numAcquiredJobs) + { + /* ATOMIC_INC returns the incremented value */ + int id = ATOMIC_INC(&m_numAcquiredJobs); + if (m_totalNumJobs >= id) + { + parallelModeAnalysis(threadId, id - 1); + + if (ATOMIC_INC(&m_numCompletedJobs) == m_totalNumJobs) + m_modeCompletionEvent.trigger(); + return true; + } + } + + if (m_totalNumME > m_numAcquiredME) + { + int id = ATOMIC_INC(&m_numAcquiredME); + if (m_totalNumME >= id) + { + parallelME(threadId, id - 1); + + if (ATOMIC_INC(&m_numCompletedME) == m_totalNumME) + m_meCompletionEvent.trigger(); + return true; + } + } + + return false; +} + +void Analysis::parallelME(int threadId, int meId) +{ + Analysis* slave; + + if (threadId == -1) + slave = this; + else + { + slave = 
&m_tld[threadId].analysis; + slave->setQP(*m_slice, m_rdCost.m_qp); + slave->m_slice = m_slice; + slave->m_frame = m_frame; + + PicYuv* fencPic = m_frame->m_origPicYuv; + pixel* pu = fencPic->getLumaAddr(m_curMECu->m_cuAddr, m_curGeom->encodeIdx + m_puAbsPartIdx); + slave->m_me.setSourcePlane(fencPic->m_picOrg[0], fencPic->m_stride); + slave->m_me.setSourcePU(pu - fencPic->m_picOrg[0], m_puWidth, m_puHeight); + + slave->prepMotionCompensation(*m_curMECu, *m_curGeom, m_curPart); + } + + if (meId < m_slice->m_numRefIdx[0]) + slave->singleMotionEstimation(*this, *m_curMECu, *m_curGeom, m_curPart, 0, meId); + else + slave->singleMotionEstimation(*this, *m_curMECu, *m_curGeom, m_curPart, 1, meId - m_slice->m_numRefIdx[0]); +} + +void Analysis::parallelModeAnalysis(int threadId, int jobId) +{ + Analysis* slave; + + if (threadId == -1) + slave = this; + else + { + slave = &m_tld[threadId].analysis; + slave->m_slice = m_slice; + slave->m_frame = m_frame; + slave->setQP(*m_slice, m_rdCost.m_qp); + slave->invalidateContexts(0); + if (jobId) + slave->m_me.setSourcePlane(m_frame->m_origPicYuv->m_picOrg[0], m_frame->m_origPicYuv->m_stride); + } + + ModeDepth& md = m_modeDepth[m_curGeom->depth]; + + if (m_param->rdLevel <= 4) + { + switch (jobId) + { + case 0: + if (slave != this) + slave->m_rqt[m_curGeom->depth].cur.load(m_rqt[m_curGeom->depth].cur); + slave->checkIntraInInter_rd0_4(md.pred[PRED_INTRA], *m_curGeom); + if (m_param->rdLevel > 2) + slave->encodeIntraInInter(md.pred[PRED_INTRA], *m_curGeom); + break; + + case 1: + slave->checkInter_rd0_4(md.pred[PRED_2Nx2N], *m_curGeom, SIZE_2Nx2N); + break; + + case 2: + slave->checkInter_rd0_4(md.pred[PRED_Nx2N], *m_curGeom, SIZE_Nx2N); + break; + + case 3: + slave->checkInter_rd0_4(md.pred[PRED_2NxN], *m_curGeom, SIZE_2NxN); + break; + + case 4: + slave->checkInter_rd0_4(md.pred[PRED_2NxnU], *m_curGeom, SIZE_2NxnU); + break; + + case 5: + slave->checkInter_rd0_4(md.pred[PRED_2NxnD], *m_curGeom, SIZE_2NxnD); + break; + + case 6: 
+ slave->checkInter_rd0_4(md.pred[PRED_nLx2N], *m_curGeom, SIZE_nLx2N); + break; + + case 7: + slave->checkInter_rd0_4(md.pred[PRED_nRx2N], *m_curGeom, SIZE_nRx2N); + break; + + default: + X265_CHECK(0, "invalid job ID for parallel mode analysis\n"); + break; + } + } + else + { + bool bMergeOnly = m_curGeom->log2CUSize == 6; + if (slave != this) + { + slave->m_rqt[m_curGeom->depth].cur.load(m_rqt[m_curGeom->depth].cur); + slave->m_quant.setQPforQuant(md.pred[PRED_2Nx2N].cu); + } + + switch (jobId) + { + case 0: + slave->checkIntra(md.pred[PRED_INTRA], *m_curGeom, SIZE_2Nx2N, NULL); + if (m_curGeom->depth == g_maxCUDepth && m_curGeom->log2CUSize > m_slice->m_sps->quadtreeTULog2MinSize) + slave->checkIntra(md.pred[PRED_INTRA_NxN], *m_curGeom, SIZE_NxN, NULL); + break; + + case 1: + slave->checkInter_rd5_6(md.pred[PRED_2Nx2N], *m_curGeom, SIZE_2Nx2N, false); + break; + + case 2: + slave->checkInter_rd5_6(md.pred[PRED_Nx2N], *m_curGeom, SIZE_Nx2N, false); + break; + + case 3: + slave->checkInter_rd5_6(md.pred[PRED_2NxN], *m_curGeom, SIZE_2NxN, false); + break; + + case 4: + slave->checkInter_rd5_6(md.pred[PRED_2NxnU], *m_curGeom, SIZE_2NxnU, bMergeOnly); + break; + + case 5: + slave->checkInter_rd5_6(md.pred[PRED_2NxnD], *m_curGeom, SIZE_2NxnD, bMergeOnly); + break; + + case 6: + slave->checkInter_rd5_6(md.pred[PRED_nLx2N], *m_curGeom, SIZE_nLx2N, bMergeOnly); + break; + + case 7: + slave->checkInter_rd5_6(md.pred[PRED_nRx2N], *m_curGeom, SIZE_nRx2N, bMergeOnly); + break; + + default: + X265_CHECK(0, "invalid job ID for parallel mode analysis\n"); + break; + } + } +} + +void Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeom) +{ + uint32_t depth = cuGeom.depth; + uint32_t cuAddr = parentCTU.m_cuAddr; + ModeDepth& md = m_modeDepth[depth]; + md.bestMode = NULL; + + bool mightSplit = !(cuGeom.flags & CUGeom::LEAF); + bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY); + uint32_t minDepth = m_param->rdLevel <= 4 ? 
topSkipMinDepth(parentCTU, cuGeom) : 0; + + X265_CHECK(m_param->rdLevel >= 2, "compressInterCU_dist does not support RD 0 or 1\n"); + + if (mightNotSplit && depth >= minDepth) + { + int bTryAmp = m_slice->m_sps->maxAMPDepth > depth && (cuGeom.log2CUSize < 6 || m_param->rdLevel > 4); + int bTryIntra = m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames; + + /* Initialize all prediction CUs based on parentCTU */ + md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom); + md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom); + md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom); + if (m_param->bEnableRectInter) + { + md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom); + md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom); + } + if (bTryAmp) + { + md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom); + md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom); + md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom); + md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom); + } + if (bTryIntra) + { + md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom); + if (depth == g_maxCUDepth && cuGeom.log2CUSize > m_slice->m_sps->quadtreeTULog2MinSize) + md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom); + } + + m_totalNumJobs = 2 + m_param->bEnableRectInter * 2 + bTryAmp * 4; + m_numAcquiredJobs = !bTryIntra; + m_numCompletedJobs = m_numAcquiredJobs; + m_curGeom = &cuGeom; + m_bJobsQueued = true; + JobProvider::enqueue(); + + for (int i = 0; i < m_totalNumJobs - m_numCompletedJobs; i++) + m_pool->pokeIdleThread(); + + /* participate in processing jobs, until all are distributed */ + while (findJob(-1)) + ; + + JobProvider::dequeue(); + m_bJobsQueued = false; + + /* the master worker thread (this one) does merge analysis. 
By doing + * merge after all the other jobs are at least started, we usually avoid + * blocking on another thread */ + + if (m_param->rdLevel <= 4) + { + checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom); + + m_modeCompletionEvent.wait(); + + /* select best inter mode based on sa8d cost */ + Mode *bestInter = &md.pred[PRED_2Nx2N]; + + if (m_param->bEnableRectInter) + { + if (md.pred[PRED_Nx2N].sa8dCost < bestInter->sa8dCost) + bestInter = &md.pred[PRED_Nx2N]; + if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost) + bestInter = &md.pred[PRED_2NxN]; + } + + if (bTryAmp) + { + if (md.pred[PRED_2NxnU].sa8dCost < bestInter->sa8dCost) + bestInter = &md.pred[PRED_2NxnU]; + if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost) + bestInter = &md.pred[PRED_2NxnD]; + if (md.pred[PRED_nLx2N].sa8dCost < bestInter->sa8dCost) + bestInter = &md.pred[PRED_nLx2N]; + if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost) + bestInter = &md.pred[PRED_nRx2N]; + } + + if (m_param->rdLevel > 2) + { + /* encode best inter */ + for (uint32_t puIdx = 0; puIdx < bestInter->cu.getNumPartInter(); puIdx++) + { + prepMotionCompensation(bestInter->cu, cuGeom, puIdx); + motionCompensation(bestInter->predYuv, false, true); + } + encodeResAndCalcRdInterCU(*bestInter, cuGeom); + + /* RD selection between merge, inter and intra */ + checkBestMode(*bestInter, depth); + + if (bTryIntra) + checkBestMode(md.pred[PRED_INTRA], depth); + } + else /* m_param->rdLevel == 2 */ + { + if (!md.bestMode || bestInter->sa8dCost < md.bestMode->sa8dCost) + md.bestMode = bestInter; + + if (bTryIntra && md.pred[PRED_INTRA].sa8dCost < md.bestMode->sa8dCost) + { + md.bestMode = &md.pred[PRED_INTRA]; + encodeIntraInInter(*md.bestMode, cuGeom); + } + else if (!md.bestMode->cu.m_mergeFlag[0]) + { + /* finally code the best mode selected from SA8D costs */ + for (uint32_t puIdx = 0; puIdx < md.bestMode->cu.getNumPartInter(); puIdx++) + { + prepMotionCompensation(md.bestMode->cu, cuGeom, puIdx); + 
motionCompensation(md.bestMode->predYuv, false, true); + } + encodeResAndCalcRdInterCU(*md.bestMode, cuGeom); + } + } + } + else + { + checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom); + m_modeCompletionEvent.wait(); + + checkBestMode(md.pred[PRED_2Nx2N], depth); + + if (m_param->bEnableRectInter) + { + checkBestMode(md.pred[PRED_Nx2N], depth); + checkBestMode(md.pred[PRED_2NxN], depth); + } + + if (bTryAmp) + { + checkBestMode(md.pred[PRED_2NxnU], depth); + checkBestMode(md.pred[PRED_2NxnD], depth); + checkBestMode(md.pred[PRED_nLx2N], depth); + checkBestMode(md.pred[PRED_nRx2N], depth); + } + + if (bTryIntra) + { + checkBestMode(md.pred[PRED_INTRA], depth); + if (depth == g_maxCUDepth && cuGeom.log2CUSize > m_slice->m_sps->quadtreeTULog2MinSize) + checkBestMode(md.pred[PRED_INTRA_NxN], depth); + } + } + + if (md.bestMode->rdCost == MAX_INT64 && !bTryIntra) + { + md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom); + checkIntraInInter_rd0_4(md.pred[PRED_INTRA], cuGeom); + encodeIntraInInter(md.pred[PRED_INTRA], cuGeom); + checkBestMode(md.pred[PRED_INTRA], depth); + } + + if (m_bTryLossless) + tryLossless(cuGeom); + + if (mightSplit) + addSplitFlagCost(*md.bestMode, cuGeom.depth); + } + + bool bNoSplit = false; + if (md.bestMode) + { + bNoSplit = !!md.bestMode->cu.isSkipped(0); + if (mightSplit && depth && depth >= minDepth && !bNoSplit && m_param->rdLevel <= 4) + bNoSplit = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode); + } + + if (mightSplit && !bNoSplit) + { + Mode* splitPred = &md.pred[PRED_SPLIT]; + splitPred->initCosts(); + CUData* splitCU = &splitPred->cu; + splitCU->initSubCU(parentCTU, cuGeom); + + uint32_t nextDepth = depth + 1; + ModeDepth& nd = m_modeDepth[nextDepth]; + invalidateContexts(nextDepth); + Entropy* nextContext = &m_rqt[depth].cur; + + for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++) + { + const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx); + if (childCuData.flags & 
CUGeom::PRESENT) + { + m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childCuData.encodeIdx); + m_rqt[nextDepth].cur.load(*nextContext); + compressInterCU_dist(parentCTU, childCuData); + + // Save best CU and pred data for this sub CU + splitCU->copyPartFrom(nd.bestMode->cu, childCuData, subPartIdx); + splitPred->addSubCosts(*nd.bestMode); + + nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childCuData.numPartitions * subPartIdx); + nextContext = &nd.bestMode->contexts; + } + else + splitCU->setEmptyPart(childCuData, subPartIdx); + } + nextContext->store(splitPred->contexts); + + if (mightNotSplit) + addSplitFlagCost(*splitPred, cuGeom.depth); + else + updateModeCost(*splitPred); + + checkBestMode(*splitPred, depth); + } + + if (!depth || md.bestMode->cu.m_predMode[0] != MODE_INTRA) + { + /* early-out statistics */ + FrameData& curEncData = const_cast(*m_frame->m_encData); + FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr]; + uint64_t temp = cuStat.avgCost[depth] * cuStat.count[depth]; + cuStat.count[depth] += 1; + cuStat.avgCost[depth] = (temp + md.bestMode->rdCost) / cuStat.count[depth]; + } + + checkDQP(md.bestMode->cu, cuGeom); + + /* Copy best data to encData CTU and recon */ + md.bestMode->cu.copyToPic(depth); + if (md.bestMode != &md.pred[PRED_SPLIT]) + md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPicYuv, cuAddr, cuGeom.encodeIdx); +} + +void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom) +{ + uint32_t depth = cuGeom.depth; + uint32_t cuAddr = parentCTU.m_cuAddr; + ModeDepth& md = m_modeDepth[depth]; + md.bestMode = NULL; + + bool mightSplit = !(cuGeom.flags & CUGeom::LEAF); + bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY); + uint32_t minDepth = topSkipMinDepth(parentCTU, cuGeom); + + if (mightNotSplit && depth >= minDepth) + { + bool bTryIntra = m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames; + + /* Initialize all prediction CUs based on parentCTU */ + 
md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom); + md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom); + md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom); + if (m_param->bEnableRectInter) + { + md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom); + md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom); + } + if (m_slice->m_sps->maxAMPDepth > depth && cuGeom.log2CUSize < 6) + { + md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom); + md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom); + md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom); + md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom); + } + + /* Compute Merge Cost */ + checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom); + + bool earlyskip = false; + if (m_param->rdLevel) + earlyskip = m_param->bEnableEarlySkip && md.bestMode && md.bestMode->cu.isSkipped(0); // TODO: sa8d threshold per depth + + if (!earlyskip) + { + checkInter_rd0_4(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N); + Mode *bestInter = &md.pred[PRED_2Nx2N]; + + if (m_param->bEnableRectInter) + { + checkInter_rd0_4(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N); + if (md.pred[PRED_Nx2N].sa8dCost < bestInter->sa8dCost) + bestInter = &md.pred[PRED_Nx2N]; + checkInter_rd0_4(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN); + if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost) + bestInter = &md.pred[PRED_2NxN]; + } + + if (m_slice->m_sps->maxAMPDepth > depth && cuGeom.log2CUSize < 6) + { + bool bHor = false, bVer = false; + if (bestInter->cu.m_partSize[0] == SIZE_2NxN) + bHor = true; + else if (bestInter->cu.m_partSize[0] == SIZE_Nx2N) + bVer = true; + else if (bestInter->cu.m_partSize[0] == SIZE_2Nx2N && + md.bestMode && md.bestMode->cu.getQtRootCbf(0)) + { + bHor = true; + bVer = true; + } + + if (bHor) + { + checkInter_rd0_4(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU); + if (md.pred[PRED_2NxnU].sa8dCost < bestInter->sa8dCost) + bestInter = &md.pred[PRED_2NxnU]; + checkInter_rd0_4(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD); + if 
(md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost) + bestInter = &md.pred[PRED_2NxnD]; + } + if (bVer) + { + checkInter_rd0_4(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N); + if (md.pred[PRED_nLx2N].sa8dCost < bestInter->sa8dCost) + bestInter = &md.pred[PRED_nLx2N]; + checkInter_rd0_4(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N); + if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost) + bestInter = &md.pred[PRED_nRx2N]; + } + } + + if (m_param->rdLevel >= 3) + { + /* Calculate RD cost of best inter option */ + for (uint32_t puIdx = 0; puIdx < bestInter->cu.getNumPartInter(); puIdx++) + { + prepMotionCompensation(bestInter->cu, cuGeom, puIdx); + motionCompensation(bestInter->predYuv, false, true); + } + + encodeResAndCalcRdInterCU(*bestInter, cuGeom); + + if (!md.bestMode || bestInter->rdCost < md.bestMode->rdCost) + md.bestMode = bestInter; + + if ((bTryIntra && md.bestMode->cu.getQtRootCbf(0)) || + md.bestMode->sa8dCost == MAX_INT64) + { + md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom); + checkIntraInInter_rd0_4(md.pred[PRED_INTRA], cuGeom); + encodeIntraInInter(md.pred[PRED_INTRA], cuGeom); + if (md.pred[PRED_INTRA].rdCost < md.bestMode->rdCost) + md.bestMode = &md.pred[PRED_INTRA]; + } + } + else + { + /* SA8D choice between merge/skip, inter, and intra */ + if (!md.bestMode || bestInter->sa8dCost < md.bestMode->sa8dCost) + md.bestMode = bestInter; + + if (bTryIntra || md.bestMode->sa8dCost == MAX_INT64) + { + md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom); + checkIntraInInter_rd0_4(md.pred[PRED_INTRA], cuGeom); + if (md.pred[PRED_INTRA].sa8dCost < md.bestMode->sa8dCost) + md.bestMode = &md.pred[PRED_INTRA]; + } + + /* finally code the best mode selected by SA8D costs: + * RD level 2 - fully encode the best mode + * RD level 1 - generate recon pixels + * RD level 0 - generate chroma prediction */ + if (md.bestMode->cu.m_mergeFlag[0] && md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N) + { + /* prediction already generated for this CU, and if rd level + * is not 
0, it is already fully encoded */ + } + else if (md.bestMode->cu.m_predMode[0] == MODE_INTER) + { + for (uint32_t puIdx = 0; puIdx < md.bestMode->cu.getNumPartInter(); puIdx++) + { + prepMotionCompensation(md.bestMode->cu, cuGeom, puIdx); + motionCompensation(md.bestMode->predYuv, false, true); + } + if (m_param->rdLevel == 2) + encodeResAndCalcRdInterCU(*md.bestMode, cuGeom); + else if (m_param->rdLevel == 1) + { + m_rqt[cuGeom.depth].tmpResiYuv.subtract(md.fencYuv, md.bestMode->predYuv, cuGeom.log2CUSize); + generateCoeffRecon(*md.bestMode, cuGeom); + } + } + else + { + if (m_param->rdLevel == 2) + encodeIntraInInter(*md.bestMode, cuGeom); + else if (m_param->rdLevel == 1) + generateCoeffRecon(*md.bestMode, cuGeom); + } + } + } // !earlyskip + + if (m_bTryLossless) + tryLossless(cuGeom); + + if (mightSplit) + addSplitFlagCost(*md.bestMode, cuGeom.depth); + } + + bool bNoSplit = false; + if (md.bestMode) + { + bNoSplit = !!md.bestMode->cu.isSkipped(0); + if (mightSplit && depth && depth >= minDepth && !bNoSplit) + bNoSplit = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode); + } + + if (mightSplit && !bNoSplit) + { + Mode* splitPred = &md.pred[PRED_SPLIT]; + splitPred->initCosts(); + CUData* splitCU = &splitPred->cu; + splitCU->initSubCU(parentCTU, cuGeom); + + uint32_t nextDepth = depth + 1; + ModeDepth& nd = m_modeDepth[nextDepth]; + invalidateContexts(nextDepth); + Entropy* nextContext = &m_rqt[depth].cur; + + for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++) + { + const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx); + if (childCuData.flags & CUGeom::PRESENT) + { + m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childCuData.encodeIdx); + m_rqt[nextDepth].cur.load(*nextContext); + compressInterCU_rd0_4(parentCTU, childCuData); + + // Save best CU and pred data for this sub CU + splitCU->copyPartFrom(nd.bestMode->cu, childCuData, subPartIdx); + splitPred->addSubCosts(*nd.bestMode); + + if (m_param->rdLevel) + 
nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childCuData.numPartitions * subPartIdx); + else + nd.bestMode->predYuv.copyToPartYuv(splitPred->predYuv, childCuData.numPartitions * subPartIdx); + if (m_param->rdLevel > 1) + nextContext = &nd.bestMode->contexts; + } + else + splitCU->setEmptyPart(childCuData, subPartIdx); + } + nextContext->store(splitPred->contexts); + + if (mightNotSplit) + addSplitFlagCost(*splitPred, cuGeom.depth); + else if (m_param->rdLevel <= 1) + splitPred->sa8dCost = m_rdCost.calcRdSADCost(splitPred->distortion, splitPred->sa8dBits); + else + updateModeCost(*splitPred); + + if (!md.bestMode) + md.bestMode = splitPred; + else if (m_param->rdLevel >= 1) + { + if (splitPred->rdCost < md.bestMode->rdCost) + md.bestMode = splitPred; + } + else + { + if (splitPred->sa8dCost < md.bestMode->sa8dCost) + md.bestMode = splitPred; + } + } + + if (!depth || md.bestMode->cu.m_predMode[0] != MODE_INTRA) + { + /* early-out statistics */ + FrameData& curEncData = const_cast(*m_frame->m_encData); + FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr]; + uint64_t temp = cuStat.avgCost[depth] * cuStat.count[depth]; + cuStat.count[depth] += 1; + cuStat.avgCost[depth] = (temp + md.bestMode->rdCost) / cuStat.count[depth]; + } + + checkDQP(md.bestMode->cu, cuGeom); + + /* Copy best data to encData CTU and recon */ + md.bestMode->cu.copyToPic(depth); + if (md.bestMode != &md.pred[PRED_SPLIT] && m_param->rdLevel) + md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPicYuv, cuAddr, cuGeom.encodeIdx); +} + +void Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom) +{ + uint32_t depth = cuGeom.depth; + ModeDepth& md = m_modeDepth[depth]; + md.bestMode = NULL; + + bool mightSplit = !(cuGeom.flags & CUGeom::LEAF); + bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY); + + if (mightNotSplit) + { + for (int i = 0; i < MAX_PRED_TYPES; i++) + md.pred[i].cu.initSubCU(parentCTU, cuGeom); + + 
checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom); + bool earlySkip = m_param->bEnableEarlySkip && md.bestMode && !md.bestMode->cu.getQtRootCbf(0); + + if (!earlySkip) + { + checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, false); + checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth); + + if (m_param->bEnableRectInter) + { + // Nx2N rect + if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0)) + { + checkInter_rd5_6(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N, false); + checkBestMode(md.pred[PRED_Nx2N], cuGeom.depth); + } + if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0)) + { + checkInter_rd5_6(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, false); + checkBestMode(md.pred[PRED_2NxN], cuGeom.depth); + } + } + + // Try AMP (SIZE_2NxnU, SIZE_2NxnD, SIZE_nLx2N, SIZE_nRx2N) + if (m_slice->m_sps->maxAMPDepth > depth) + { + bool bMergeOnly = cuGeom.log2CUSize == 6; + + bool bHor = false, bVer = false; + if (md.bestMode->cu.m_partSize[0] == SIZE_2NxN) + bHor = true; + else if (md.bestMode->cu.m_partSize[0] == SIZE_Nx2N) + bVer = true; + else if (md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N && !md.bestMode->cu.m_mergeFlag[0] && !md.bestMode->cu.isSkipped(0)) + { + bHor = true; + bVer = true; + } + + if (bHor) + { + if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0)) + { + checkInter_rd5_6(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU, bMergeOnly); + checkBestMode(md.pred[PRED_2NxnU], cuGeom.depth); + } + if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0)) + { + checkInter_rd5_6(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, bMergeOnly); + checkBestMode(md.pred[PRED_2NxnD], cuGeom.depth); + } + } + if (bVer) + { + if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0)) + { + checkInter_rd5_6(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N, bMergeOnly); + checkBestMode(md.pred[PRED_nLx2N], cuGeom.depth); + } + if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0)) + { + 
checkInter_rd5_6(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, bMergeOnly); + checkBestMode(md.pred[PRED_nRx2N], cuGeom.depth); + } + } + } + + if ((m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames) && + (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))) + { + checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N, NULL); + checkBestMode(md.pred[PRED_INTRA], depth); + + if (depth == g_maxCUDepth && cuGeom.log2CUSize > m_slice->m_sps->quadtreeTULog2MinSize) + { + checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN, NULL); + checkBestMode(md.pred[PRED_INTRA_NxN], depth); + } + } + } + + if (m_bTryLossless) + tryLossless(cuGeom); + + if (mightSplit) + addSplitFlagCost(*md.bestMode, cuGeom.depth); + } + + // estimate split cost + if (mightSplit && (!md.bestMode || !md.bestMode->cu.isSkipped(0))) + { + Mode* splitPred = &md.pred[PRED_SPLIT]; + splitPred->initCosts(); + CUData* splitCU = &splitPred->cu; + splitCU->initSubCU(parentCTU, cuGeom); + + uint32_t nextDepth = depth + 1; + ModeDepth& nd = m_modeDepth[nextDepth]; + invalidateContexts(nextDepth); + Entropy* nextContext = &m_rqt[depth].cur; + + for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++) + { + const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx); + if (childCuData.flags & CUGeom::PRESENT) + { + m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childCuData.encodeIdx); + m_rqt[nextDepth].cur.load(*nextContext); + compressInterCU_rd5_6(parentCTU, childCuData); + + // Save best CU and pred data for this sub CU + splitCU->copyPartFrom(nd.bestMode->cu, childCuData, subPartIdx); + splitPred->addSubCosts(*nd.bestMode); + nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childCuData.numPartitions * subPartIdx); + nextContext = &nd.bestMode->contexts; + } + else + splitCU->setEmptyPart(childCuData, subPartIdx); + } + nextContext->store(splitPred->contexts); + if (mightNotSplit) + addSplitFlagCost(*splitPred, cuGeom.depth); + else + updateModeCost(*splitPred); + + 
checkBestMode(*splitPred, depth); + } + + checkDQP(md.bestMode->cu, cuGeom); + + /* Copy best data to encData CTU and recon */ + md.bestMode->cu.copyToPic(depth); + if (md.bestMode != &md.pred[PRED_SPLIT]) + md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPicYuv, parentCTU.m_cuAddr, cuGeom.encodeIdx); +} + +/* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */ +void Analysis::checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGeom) +{ + uint32_t depth = cuGeom.depth; + ModeDepth& md = m_modeDepth[depth]; + Yuv *fencYuv = &md.fencYuv; + + /* Note that these two Mode instances are named MERGE and SKIP but they may + * hold the reverse when the function returns. We toggle between the two modes */ + Mode* tempPred = &merge; + Mode* bestPred = &skip; + + X265_CHECK(m_slice->m_sliceType != I_SLICE, "Evaluating merge in I slice\n"); + + tempPred->cu.setPartSizeSubParts(SIZE_2Nx2N); + tempPred->cu.setPredModeSubParts(MODE_INTER); + tempPred->cu.m_mergeFlag[0] = true; + + bestPred->cu.setPartSizeSubParts(SIZE_2Nx2N); + bestPred->cu.setPredModeSubParts(MODE_INTER); + bestPred->cu.m_mergeFlag[0] = true; + + MVField mvFieldNeighbours[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists + uint8_t interDirNeighbours[MRG_MAX_NUM_CANDS]; + uint32_t maxNumMergeCand = tempPred->cu.getInterMergeCandidates(0, 0, mvFieldNeighbours, interDirNeighbours); + + bestPred->sa8dCost = MAX_INT64; + int bestSadCand = -1; + int sizeIdx = cuGeom.log2CUSize - 2; + for (uint32_t i = 0; i < maxNumMergeCand; ++i) + { + if (m_bFrameParallel && + (mvFieldNeighbours[i][0].mv.y >= (m_param->searchRange + 1) * 4 || + mvFieldNeighbours[i][1].mv.y >= (m_param->searchRange + 1) * 4)) + continue; + + tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i; // merge candidate ID is stored in L0 MVP idx + tempPred->cu.m_interDir[0] = interDirNeighbours[i]; + tempPred->cu.m_mv[0][0] = mvFieldNeighbours[i][0].mv; + tempPred->cu.m_refIdx[0][0] = 
(char)mvFieldNeighbours[i][0].refIdx; + tempPred->cu.m_mv[1][0] = mvFieldNeighbours[i][1].mv; + tempPred->cu.m_refIdx[1][0] = (char)mvFieldNeighbours[i][1].refIdx; + + // do MC only for Luma part + prepMotionCompensation(tempPred->cu, cuGeom, 0); + motionCompensation(tempPred->predYuv, true, false); + + tempPred->sa8dBits = getTUBits(i, maxNumMergeCand); + tempPred->distortion = primitives.sa8d[sizeIdx](fencYuv->m_buf[0], fencYuv->m_size, tempPred->predYuv.m_buf[0], tempPred->predYuv.m_size); + tempPred->sa8dCost = m_rdCost.calcRdSADCost(tempPred->distortion, tempPred->sa8dBits); + + if (tempPred->sa8dCost < bestPred->sa8dCost) + { + bestSadCand = i; + std::swap(tempPred, bestPred); + } + } + + /* force mode decision to take inter or intra */ + if (bestSadCand < 0) + return; + + /* calculate the motion compensation for chroma for the best mode selected */ + prepMotionCompensation(bestPred->cu, cuGeom, 0); + motionCompensation(bestPred->predYuv, false, true); + + if (m_param->rdLevel) + { + if (m_param->bLossless) + bestPred->rdCost = MAX_INT64; + else + encodeResAndCalcRdSkipCU(*bestPred); + + /* Encode with residual */ + tempPred->cu.m_mvpIdx[0][0] = (uint8_t)bestSadCand; + tempPred->cu.setPUInterDir(interDirNeighbours[bestSadCand], 0, 0); + tempPred->cu.setPUMv(0, mvFieldNeighbours[bestSadCand][0].mv, 0, 0); + tempPred->cu.setPURefIdx(0, (char)mvFieldNeighbours[bestSadCand][0].refIdx, 0, 0); + tempPred->cu.setPUMv(1, mvFieldNeighbours[bestSadCand][1].mv, 0, 0); + tempPred->cu.setPURefIdx(1, (char)mvFieldNeighbours[bestSadCand][1].refIdx, 0, 0); + tempPred->sa8dCost = bestPred->sa8dCost; + tempPred->predYuv.copyFromYuv(bestPred->predYuv); + + encodeResAndCalcRdInterCU(*tempPred, cuGeom); + + md.bestMode = tempPred->rdCost < bestPred->rdCost ? 
tempPred : bestPred; + } + else + md.bestMode = bestPred; + + /* broadcast sets of MV field data */ + bestPred->cu.setPUInterDir(interDirNeighbours[bestSadCand], 0, 0); + bestPred->cu.setPUMv(0, mvFieldNeighbours[bestSadCand][0].mv, 0, 0); + bestPred->cu.setPURefIdx(0, (char)mvFieldNeighbours[bestSadCand][0].refIdx, 0, 0); + bestPred->cu.setPUMv(1, mvFieldNeighbours[bestSadCand][1].mv, 0, 0); + bestPred->cu.setPURefIdx(1, (char)mvFieldNeighbours[bestSadCand][1].refIdx, 0, 0); +} + +/* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */ +void Analysis::checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGeom) +{ + uint32_t depth = cuGeom.depth; + + /* Note that these two Mode instances are named MERGE and SKIP but they may + * hold the reverse when the function returns. We toggle between the two modes */ + Mode* tempPred = &merge; + Mode* bestPred = &skip; + + merge.cu.setPredModeSubParts(MODE_INTER); + merge.cu.setPartSizeSubParts(SIZE_2Nx2N); + merge.cu.m_mergeFlag[0] = true; + + skip.cu.setPredModeSubParts(MODE_INTER); + skip.cu.setPartSizeSubParts(SIZE_2Nx2N); + skip.cu.m_mergeFlag[0] = true; + + MVField mvFieldNeighbours[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists + uint8_t interDirNeighbours[MRG_MAX_NUM_CANDS]; + uint32_t maxNumMergeCand = merge.cu.getInterMergeCandidates(0, 0, mvFieldNeighbours, interDirNeighbours); + + bool foundCbf0Merge = false; + bool triedPZero = false, triedBZero = false; + bestPred->rdCost = MAX_INT64; + for (uint32_t i = 0; i < maxNumMergeCand; i++) + { + if (m_bFrameParallel && + (mvFieldNeighbours[i][0].mv.y >= (m_param->searchRange + 1) * 4 || + mvFieldNeighbours[i][1].mv.y >= (m_param->searchRange + 1) * 4)) + continue; + + /* the merge candidate list is packed with MV(0,0) ref 0 when it is not full */ + if (interDirNeighbours[i] == 1 && !mvFieldNeighbours[i][0].mv.word && !mvFieldNeighbours[i][0].refIdx) + { + if (triedPZero) + continue; + triedPZero = true; + } + else if 
(interDirNeighbours[i] == 3 && + !mvFieldNeighbours[i][0].mv.word && !mvFieldNeighbours[i][0].refIdx && + !mvFieldNeighbours[i][1].mv.word && !mvFieldNeighbours[i][1].refIdx) + { + if (triedBZero) + continue; + triedBZero = true; + } + + tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i; /* merge candidate ID is stored in L0 MVP idx */ + tempPred->cu.m_interDir[0] = interDirNeighbours[i]; + tempPred->cu.m_mv[0][0] = mvFieldNeighbours[i][0].mv; + tempPred->cu.m_refIdx[0][0] = (char)mvFieldNeighbours[i][0].refIdx; + tempPred->cu.m_mv[1][0] = mvFieldNeighbours[i][1].mv; + tempPred->cu.m_refIdx[1][0] = (char)mvFieldNeighbours[i][1].refIdx; + tempPred->cu.setSkipFlagSubParts(false); /* must be cleared between encode iterations */ + + prepMotionCompensation(tempPred->cu, cuGeom, 0); + motionCompensation(tempPred->predYuv, true, true); + + uint8_t hasCbf = true; + bool swapped = false; + if (!foundCbf0Merge) + { + /* if the best prediction has CBF (not a skip) then try merge with residual */ + + encodeResAndCalcRdInterCU(*tempPred, cuGeom); + hasCbf = tempPred->cu.getQtRootCbf(0); + foundCbf0Merge = !hasCbf; + + if (tempPred->rdCost < bestPred->rdCost) + { + std::swap(tempPred, bestPred); + swapped = true; + } + } + if (!m_param->bLossless && hasCbf) + { + /* try merge without residual (skip), if not lossless coding */ + + if (swapped) + { + tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i; + tempPred->cu.m_interDir[0] = interDirNeighbours[i]; + tempPred->cu.m_mv[0][0] = mvFieldNeighbours[i][0].mv; + tempPred->cu.m_refIdx[0][0] = (char)mvFieldNeighbours[i][0].refIdx; + tempPred->cu.m_mv[1][0] = mvFieldNeighbours[i][1].mv; + tempPred->cu.m_refIdx[1][0] = (char)mvFieldNeighbours[i][1].refIdx; + tempPred->cu.setSkipFlagSubParts(false); + tempPred->predYuv.copyFromYuv(bestPred->predYuv); + } + + encodeResAndCalcRdSkipCU(*tempPred); + + if (tempPred->rdCost < bestPred->rdCost) + std::swap(tempPred, bestPred); + } + } + + if (bestPred->rdCost < MAX_INT64) + { + m_modeDepth[depth].bestMode = 
bestPred; + + /* broadcast sets of MV field data */ + uint32_t bestCand = bestPred->cu.m_mvpIdx[0][0]; + bestPred->cu.setPUInterDir(interDirNeighbours[bestCand], 0, 0); + bestPred->cu.setPUMv(0, mvFieldNeighbours[bestCand][0].mv, 0, 0); + bestPred->cu.setPURefIdx(0, (char)mvFieldNeighbours[bestCand][0].refIdx, 0, 0); + bestPred->cu.setPUMv(1, mvFieldNeighbours[bestCand][1].mv, 0, 0); + bestPred->cu.setPURefIdx(1, (char)mvFieldNeighbours[bestCand][1].refIdx, 0, 0); + } +} + +void Analysis::checkInter_rd0_4(Mode& interMode, const CUGeom& cuGeom, PartSize partSize) +{ + interMode.initCosts(); + interMode.cu.setPartSizeSubParts(partSize); + interMode.cu.setPredModeSubParts(MODE_INTER); + + if (predInterSearch(interMode, cuGeom, false, false)) + { + /* predInterSearch sets interMode.sa8dBits */ + const Yuv& fencYuv = *interMode.fencYuv; + Yuv& predYuv = interMode.predYuv; + interMode.distortion = primitives.sa8d[cuGeom.log2CUSize - 2](fencYuv.m_buf[0], fencYuv.m_size, predYuv.m_buf[0], predYuv.m_size); + interMode.sa8dCost = m_rdCost.calcRdSADCost(interMode.distortion, interMode.sa8dBits); + } + else + { + interMode.distortion = MAX_UINT; + interMode.sa8dCost = MAX_INT64; + } +} + +void Analysis::checkInter_rd5_6(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, bool bMergeOnly) +{ + interMode.initCosts(); + interMode.cu.setPartSizeSubParts(partSize); + interMode.cu.setPredModeSubParts(MODE_INTER); + + if (predInterSearch(interMode, cuGeom, bMergeOnly, true)) + { + /* predInterSearch sets interMode.sa8dBits, but this is ignored */ + encodeResAndCalcRdInterCU(interMode, cuGeom); + } + else + { + interMode.distortion = MAX_UINT; + interMode.rdCost = MAX_INT64; + } +} + +/* Note that this function does not save the best intra prediction, it must + * be generated later. 
It records the best mode in the cu */ +void Analysis::checkIntraInInter_rd0_4(Mode& intraMode, const CUGeom& cuGeom) +{ + CUData& cu = intraMode.cu; + uint32_t depth = cu.m_cuDepth[0]; + + cu.setPartSizeSubParts(SIZE_2Nx2N); + cu.setPredModeSubParts(MODE_INTRA); + + uint32_t initTrDepth = 0; + uint32_t log2TrSize = cu.m_log2CUSize[0] - initTrDepth; + uint32_t tuSize = 1 << log2TrSize; + const uint32_t absPartIdx = 0; + + // Reference sample smoothing + initAdiPattern(cu, cuGeom, absPartIdx, initTrDepth, ALL_IDX); + + pixel* fenc = m_modeDepth[depth].fencYuv.m_buf[0]; + uint32_t stride = m_modeDepth[depth].fencYuv.m_size; + + pixel *above = m_refAbove + tuSize - 1; + pixel *aboveFiltered = m_refAboveFlt + tuSize - 1; + pixel *left = m_refLeft + tuSize - 1; + pixel *leftFiltered = m_refLeftFlt + tuSize - 1; + int sad, bsad; + uint32_t bits, bbits, mode, bmode; + uint64_t cost, bcost; + + // 33 Angle modes once + ALIGN_VAR_32(pixel, bufScale[32 * 32]); + ALIGN_VAR_32(pixel, bufTrans[32 * 32]); + ALIGN_VAR_32(pixel, tmp[33 * 32 * 32]); + int scaleTuSize = tuSize; + int scaleStride = stride; + int costShift = 0; + int sizeIdx = log2TrSize - 2; + + if (tuSize > 32) + { + // origin is 64x64, we scale to 32x32 and setup required parameters + primitives.scale2D_64to32(bufScale, fenc, stride); + fenc = bufScale; + + // reserve space in case primitives need to store data in above + // or left buffers + pixel _above[4 * 32 + 1]; + pixel _left[4 * 32 + 1]; + pixel *aboveScale = _above + 2 * 32; + pixel *leftScale = _left + 2 * 32; + aboveScale[0] = leftScale[0] = above[0]; + primitives.scale1D_128to64(aboveScale + 1, above + 1, 0); + primitives.scale1D_128to64(leftScale + 1, left + 1, 0); + + scaleTuSize = 32; + scaleStride = 32; + costShift = 2; + sizeIdx = 5 - 2; // log2(scaleTuSize) - 2 + + // Filtered and Unfiltered refAbove and refLeft pointing to above and left. 
+ above = aboveScale; + left = leftScale; + aboveFiltered = aboveScale; + leftFiltered = leftScale; + } + + pixelcmp_t sa8d = primitives.sa8d[sizeIdx]; + int predsize = scaleTuSize * scaleTuSize; + + m_entropyCoder.loadIntraDirModeLuma(m_rqt[depth].cur); + + /* there are three cost tiers for intra modes: + * pred[0] - mode probable, least cost + * pred[1], pred[2] - less probable, slightly more cost + * non-mpm modes - all cost the same (rbits) */ + uint64_t mpms; + uint32_t preds[3]; + uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, preds, mpms); + + // DC + primitives.intra_pred[DC_IDX][sizeIdx](tmp, scaleStride, left, above, 0, (scaleTuSize <= 16)); + bsad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift; + bmode = mode = DC_IDX; + bbits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits; + bcost = m_rdCost.calcRdSADCost(bsad, bbits); + + pixel *abovePlanar = above; + pixel *leftPlanar = left; + + if (tuSize & (8 | 16 | 32)) + { + abovePlanar = aboveFiltered; + leftPlanar = leftFiltered; + } + + // PLANAR + primitives.intra_pred[PLANAR_IDX][sizeIdx](tmp, scaleStride, leftPlanar, abovePlanar, 0, 0); + sad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift; + mode = PLANAR_IDX; + bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits; + cost = m_rdCost.calcRdSADCost(sad, bits); + COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits); + + // Transpose NxN + primitives.transpose[sizeIdx](bufTrans, fenc, scaleStride); + + primitives.intra_pred_allangs[sizeIdx](tmp, above, left, aboveFiltered, leftFiltered, (scaleTuSize <= 16)); + + bool modeHor; + pixel *cmp; + intptr_t srcStride; + +#define TRY_ANGLE(angle) \ + modeHor = angle < 18; \ + cmp = modeHor ? bufTrans : fenc; \ + srcStride = modeHor ? scaleTuSize : scaleStride; \ + sad = sa8d(cmp, srcStride, &tmp[(angle - 2) * predsize], scaleTuSize) << costShift; \ + bits = (mpms & ((uint64_t)1 << angle)) ? 
m_entropyCoder.bitsIntraModeMPM(preds, angle) : rbits; \ + cost = m_rdCost.calcRdSADCost(sad, bits) + + if (m_param->bEnableFastIntra) + { + int asad = 0; + uint32_t lowmode, highmode, amode = 5, abits = 0; + uint64_t acost = MAX_INT64; + + /* pick the best angle, sampling at distance of 5 */ + for (mode = 5; mode < 35; mode += 5) + { + TRY_ANGLE(mode); + COPY4_IF_LT(acost, cost, amode, mode, asad, sad, abits, bits); + } + + /* refine best angle at distance 2, then distance 1 */ + for (uint32_t dist = 2; dist >= 1; dist--) + { + lowmode = amode - dist; + highmode = amode + dist; + + X265_CHECK(lowmode >= 2 && lowmode <= 34, "low intra mode out of range\n"); + TRY_ANGLE(lowmode); + COPY4_IF_LT(acost, cost, amode, lowmode, asad, sad, abits, bits); + + X265_CHECK(highmode >= 2 && highmode <= 34, "high intra mode out of range\n"); + TRY_ANGLE(highmode); + COPY4_IF_LT(acost, cost, amode, highmode, asad, sad, abits, bits); + } + + if (amode == 33) + { + TRY_ANGLE(34); + COPY4_IF_LT(acost, cost, amode, 34, asad, sad, abits, bits); + } + + COPY4_IF_LT(bcost, acost, bmode, amode, bsad, asad, bbits, abits); + } + else // calculate and search all intra prediction angles for lowest cost + { + for (mode = 2; mode < 35; mode++) + { + TRY_ANGLE(mode); + COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits); + } + } + + cu.setLumaIntraDirSubParts((uint8_t)bmode, absPartIdx, depth + initTrDepth); + intraMode.initCosts(); + intraMode.totalBits = bbits; + intraMode.distortion = bsad; + intraMode.sa8dCost = bcost; + intraMode.sa8dBits = bbits; +} + +void Analysis::encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom) +{ + CUData& cu = intraMode.cu; + Yuv* reconYuv = &intraMode.reconYuv; + Yuv* fencYuv = &m_modeDepth[cuGeom.depth].fencYuv; + + X265_CHECK(cu.m_partSize[0] == SIZE_2Nx2N, "encodeIntraInInter does not expect NxN intra\n"); + X265_CHECK(!m_slice->isIntra(), "encodeIntraInInter does not expect to be used in I slices\n"); + + m_quant.setQPforQuant(cu); + + 
uint32_t tuDepthRange[2]; + cu.getIntraTUQtDepthRange(tuDepthRange, 0); + + m_entropyCoder.load(m_rqt[cuGeom.depth].cur); + + Cost icosts; + codeIntraLumaQT(intraMode, cuGeom, 0, 0, false, icosts, tuDepthRange); + extractIntraResultQT(cu, *reconYuv, 0, 0); + + intraMode.distortion = icosts.distortion; + intraMode.distortion += estIntraPredChromaQT(intraMode, cuGeom); + + m_entropyCoder.resetBits(); + if (m_slice->m_pps->bTransquantBypassEnabled) + m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]); + m_entropyCoder.codeSkipFlag(cu, 0); + m_entropyCoder.codePredMode(cu.m_predMode[0]); + m_entropyCoder.codePartSize(cu, 0, cuGeom.depth); + m_entropyCoder.codePredInfo(cu, 0); + intraMode.mvBits += m_entropyCoder.getNumberOfWrittenBits(); + + bool bCodeDQP = m_slice->m_pps->bUseDQP; + m_entropyCoder.codeCoeff(cu, 0, cuGeom.depth, bCodeDQP, tuDepthRange); + + intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits(); + intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits; + if (m_rdCost.m_psyRd) + intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size); + + m_entropyCoder.store(intraMode.contexts); + updateModeCost(intraMode); +} + +void Analysis::encodeResidue(const CUData& ctu, const CUGeom& cuGeom) +{ + if (cuGeom.depth < ctu.m_cuDepth[cuGeom.encodeIdx] && cuGeom.depth < g_maxCUDepth) + { + for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++) + { + const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx); + if (childCuData.flags & CUGeom::PRESENT) + encodeResidue(ctu, childCuData); + } + return; + } + + uint32_t absPartIdx = cuGeom.encodeIdx; + int sizeIdx = cuGeom.log2CUSize - 2; + + Yuv& fencYuv = m_modeDepth[0].fencYuv; + + /* reuse the bestMode data structures at the current depth */ + Mode *bestMode = m_modeDepth[cuGeom.depth].bestMode; + Yuv& reconYuv = bestMode->reconYuv; + CUData& cu = bestMode->cu; + + cu.copyFromPic(ctu, cuGeom); + 
m_quant.setQPforQuant(cu); + + if (cu.m_predMode[0] == MODE_INTRA) + { + uint32_t tuDepthRange[2]; + cu.getIntraTUQtDepthRange(tuDepthRange, 0); + + uint32_t initTrDepth = cu.m_partSize[0] == SIZE_NxN; + residualTransformQuantIntra(*bestMode, cuGeom, initTrDepth, 0, tuDepthRange); + getBestIntraModeChroma(*bestMode, cuGeom); + residualQTIntraChroma(*bestMode, cuGeom, 0, 0); + } + else if (cu.m_predMode[0] == MODE_INTER) + { + X265_CHECK(!ctu.m_skipFlag[absPartIdx], "skip not expected prior to transform\n"); + + /* Calculate residual for current CU part into depth sized resiYuv */ + + ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv; + + /* at RD 0, the prediction pixels are accumulated into the top depth predYuv */ + Yuv& predYuv = m_modeDepth[0].bestMode->predYuv; + pixel* predY = predYuv.getLumaAddr(absPartIdx); + pixel* predU = predYuv.getCbAddr(absPartIdx); + pixel* predV = predYuv.getCrAddr(absPartIdx); + + primitives.luma_sub_ps[sizeIdx](resiYuv.m_buf[0], resiYuv.m_size, + fencYuv.getLumaAddr(absPartIdx), predY, + fencYuv.m_size, predYuv.m_size); + + primitives.chroma[m_csp].sub_ps[sizeIdx](resiYuv.m_buf[1], resiYuv.m_csize, + fencYuv.getCbAddr(absPartIdx), predU, + fencYuv.m_csize, predYuv.m_csize); + + primitives.chroma[m_csp].sub_ps[sizeIdx](resiYuv.m_buf[2], resiYuv.m_csize, + fencYuv.getCrAddr(absPartIdx), predV, + fencYuv.m_csize, predYuv.m_csize); + + uint32_t tuDepthRange[2]; + cu.getInterTUQtDepthRange(tuDepthRange, 0); + + residualTransformQuantInter(*bestMode, cuGeom, 0, cuGeom.depth, tuDepthRange); + + if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N && !cu.getQtRootCbf(0)) + cu.setSkipFlagSubParts(true); + + PicYuv& reconPicYuv = *m_frame->m_reconPicYuv; + if (cu.getQtRootCbf(0)) // TODO: split to each component + { + /* residualTransformQuantInter() wrote transformed residual back into + * resiYuv. 
Generate the recon pixels by adding it to the prediction */ + + primitives.luma_add_ps[sizeIdx](reconYuv.m_buf[0], reconYuv.m_size, + predY, resiYuv.m_buf[0], predYuv.m_size, resiYuv.m_size); + primitives.chroma[m_csp].add_ps[sizeIdx](reconYuv.m_buf[1], reconYuv.m_csize, + predU, resiYuv.m_buf[1], predYuv.m_csize, resiYuv.m_csize); + primitives.chroma[m_csp].add_ps[sizeIdx](reconYuv.m_buf[2], reconYuv.m_csize, + predV, resiYuv.m_buf[2], predYuv.m_csize, resiYuv.m_csize); + + /* copy the reconstructed part to the recon pic for later intra + * predictions */ + reconYuv.copyToPicYuv(*m_frame->m_reconPicYuv, cu.m_cuAddr, absPartIdx); + } + else + { + /* copy the prediction pixels to the recon pic for later intra + * predictions */ + + primitives.luma_copy_pp[sizeIdx](reconPicYuv.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPicYuv.m_stride, + predY, predYuv.m_size); + primitives.chroma[m_csp].copy_pp[sizeIdx](reconPicYuv.getCbAddr(cu.m_cuAddr, absPartIdx), reconPicYuv.m_strideC, + predU, predYuv.m_csize); + primitives.chroma[m_csp].copy_pp[sizeIdx](reconPicYuv.getCrAddr(cu.m_cuAddr, absPartIdx), reconPicYuv.m_strideC, + predV, predYuv.m_csize); + } + } + /* else if (cu.m_predMode[0] == MODE_NONE) {} */ + + checkDQP(cu, cuGeom); + cu.updatePic(cuGeom.depth); +} + +/* check whether current try is the best with identifying the depth of current try */ +void Analysis::checkBestMode(Mode& mode, uint32_t depth) +{ + ModeDepth& md = m_modeDepth[depth]; + if (md.bestMode) + { + if (mode.rdCost < md.bestMode->rdCost) + md.bestMode = &mode; + } + else + md.bestMode = &mode; +} + +void Analysis::addSplitFlagCost(Mode& mode, uint32_t depth) +{ + if (m_param->rdLevel >= 3) + { + /* code the split flag (0 or 1) and update bit costs */ + mode.contexts.resetBits(); + mode.contexts.codeSplitFlag(mode.cu, 0, depth); + uint32_t bits = mode.contexts.getNumberOfWrittenBits(); + mode.mvBits += bits; + mode.totalBits += bits; + updateModeCost(mode); + } + else if (m_param->rdLevel <= 1) + { + 
mode.sa8dBits++; + mode.sa8dCost = m_rdCost.calcRdSADCost(mode.distortion, mode.sa8dBits); + } + else + { + mode.mvBits++; + mode.totalBits++; + updateModeCost(mode); + } +} + +void Analysis::checkDQP(CUData& cu, const CUGeom& cuGeom) +{ + if (m_slice->m_pps->bUseDQP && cuGeom.depth <= m_slice->m_pps->maxCuDQPDepth) + { + if (cu.m_cuDepth[0] > cuGeom.depth) // detect splits + { + bool hasResidual = false; + for (uint32_t absPartIdx = 0; absPartIdx < cu.m_numPartitions; absPartIdx++) + { + if (cu.getQtRootCbf(absPartIdx)) + { + hasResidual = true; + break; + } + } + if (hasResidual) + cu.setQPSubCUs(cu.getRefQP(0), 0, cuGeom.depth); + else + cu.setQPSubParts(cu.getRefQP(0), 0, cuGeom.depth); + } + else + { + if (!cu.getCbf(0, TEXT_LUMA, 0) && !cu.getCbf(0, TEXT_CHROMA_U, 0) && !cu.getCbf(0, TEXT_CHROMA_V, 0)) + cu.setQPSubParts(cu.getRefQP(0), 0, cuGeom.depth); + } + } +} + +uint32_t Analysis::topSkipMinDepth(const CUData& parentCTU, const CUGeom& cuGeom) +{ + /* Do not attempt to code a block larger than the largest block in the + * co-located CTUs in L0 and L1 */ + int currentQP = parentCTU.m_qp[0]; + int previousQP = currentQP; + uint32_t minDepth0 = 4, minDepth1 = 4; + uint32_t sum = 0; + int numRefs = 0; + if (m_slice->m_numRefIdx[0]) + { + numRefs++; + const CUData& cu = *m_slice->m_refPicList[0][0]->m_encData->getPicCTU(parentCTU.m_cuAddr); + previousQP = cu.m_qp[0]; + if (!cu.m_cuDepth[cuGeom.encodeIdx]) + return 0; + for (uint32_t i = 0; i < cuGeom.numPartitions && minDepth0; i += 4) + { + uint32_t d = cu.m_cuDepth[cuGeom.encodeIdx + i]; + minDepth0 = X265_MIN(d, minDepth0); + sum += d; + } + } + if (m_slice->m_numRefIdx[1]) + { + numRefs++; + const CUData& cu = *m_slice->m_refPicList[1][0]->m_encData->getPicCTU(parentCTU.m_cuAddr); + if (!cu.m_cuDepth[cuGeom.encodeIdx]) + return 0; + for (uint32_t i = 0; i < cuGeom.numPartitions; i += 4) + { + uint32_t d = cu.m_cuDepth[cuGeom.encodeIdx + i]; + minDepth1 = X265_MIN(d, minDepth1); + sum += d; + } + } + if 
(!numRefs) + return 0; + + uint32_t minDepth = X265_MIN(minDepth0, minDepth1); + uint32_t thresh = minDepth * numRefs * (cuGeom.numPartitions >> 2); + + /* allow block size growth if QP is raising or avg depth is + * less than 1.5 of min depth */ + if (minDepth && currentQP >= previousQP && (sum <= thresh + (thresh >> 1))) + minDepth -= 1; + + return minDepth; +} + +/* returns true if recursion should be stopped */ +bool Analysis::recursionDepthCheck(const CUData& parentCTU, const CUGeom& cuGeom, const Mode& bestMode) +{ + /* early exit when the RD cost of best mode at depth n is less than the sum + * of average of RD cost of the neighbor CU's(above, aboveleft, aboveright, + * left, colocated) and avg cost of that CU at depth "n" with weightage for + * each quantity */ + + uint32_t depth = cuGeom.depth; + FrameData& curEncData = const_cast(*m_frame->m_encData); + FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr]; + uint64_t cuCost = cuStat.avgCost[depth] * cuStat.count[depth]; + uint64_t cuCount = cuStat.count[depth]; + + uint64_t neighCost = 0, neighCount = 0; + const CUData* above = parentCTU.m_cuAbove; + if (above) + { + FrameData::RCStatCU& astat = curEncData.m_cuStat[above->m_cuAddr]; + neighCost += astat.avgCost[depth] * astat.count[depth]; + neighCount += astat.count[depth]; + + const CUData* aboveLeft = parentCTU.m_cuAboveLeft; + if (aboveLeft) + { + FrameData::RCStatCU& lstat = curEncData.m_cuStat[aboveLeft->m_cuAddr]; + neighCost += lstat.avgCost[depth] * lstat.count[depth]; + neighCount += lstat.count[depth]; + } + + const CUData* aboveRight = parentCTU.m_cuAboveRight; + if (aboveRight) + { + FrameData::RCStatCU& rstat = curEncData.m_cuStat[aboveRight->m_cuAddr]; + neighCost += rstat.avgCost[depth] * rstat.count[depth]; + neighCount += rstat.count[depth]; + } + } + const CUData* left = parentCTU.m_cuLeft; + if (left) + { + FrameData::RCStatCU& nstat = curEncData.m_cuStat[left->m_cuAddr]; + neighCost += nstat.avgCost[depth] * 
nstat.count[depth]; + neighCount += nstat.count[depth]; + } + + // give 60% weight to all CU's and 40% weight to neighbour CU's + if (neighCost + cuCount) + { + uint64_t avgCost = ((3 * cuCost) + (2 * neighCost)) / ((3 * cuCount) + (2 * neighCount)); + uint64_t curCost = m_param->rdLevel > 1 ? bestMode.rdCost : bestMode.sa8dCost; + if (curCost < avgCost && avgCost) + return true; + } + + return false; +} diff --git a/source/encoder/analysis.h b/source/encoder/analysis.h new file mode 100644 index 0000000..404cc90 --- /dev/null +++ b/source/encoder/analysis.h @@ -0,0 +1,132 @@ +/***************************************************************************** +* Copyright (C) 2013 x265 project +* +* Authors: Deepthi Nandakumar +* Steve Borho +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation; either version 2 of the License, or +* (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software +* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +* +* This program is also available under a commercial proprietary license. +* For more information, contact us at license @ x265.com. 
+*****************************************************************************/ + +#ifndef X265_ANALYSIS_H +#define X265_ANALYSIS_H + +#include "common.h" +#include "predict.h" +#include "quant.h" +#include "yuv.h" +#include "shortyuv.h" +#include "cudata.h" + +#include "entropy.h" +#include "search.h" + +namespace x265 { +// private namespace + +class Entropy; + +class Analysis : public Search +{ +public: + + enum { + PRED_MERGE, + PRED_SKIP, + PRED_INTRA, + PRED_2Nx2N, + PRED_Nx2N, + PRED_2NxN, + PRED_SPLIT, + PRED_2NxnU, + PRED_2NxnD, + PRED_nLx2N, + PRED_nRx2N, + PRED_INTRA_NxN, /* 4x4 intra PU blocks for 8x8 CU */ + PRED_LOSSLESS, /* lossless encode of best mode */ + MAX_PRED_TYPES + }; + + struct ModeDepth + { + Mode pred[MAX_PRED_TYPES]; + Mode* bestMode; + Yuv fencYuv; + CUDataMemPool cuMemPool; + }; + + ModeDepth m_modeDepth[NUM_CU_DEPTH]; + bool m_bTryLossless; + + Analysis(); + bool create(ThreadLocalData* tld); + void destroy(); + Search::Mode& compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext); + +protected: + + /* mode analysis distribution */ + int m_totalNumJobs; + volatile int m_numAcquiredJobs; + volatile int m_numCompletedJobs; + Event m_modeCompletionEvent; + bool findJob(int threadId); + void parallelModeAnalysis(int threadId, int jobId); + void parallelME(int threadId, int meId); + + /* full analysis for an I-slice CU */ + void compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, x265_intra_data* sdata, uint32_t &zOrder); + + /* full analysis for a P or B slice CU */ + void compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeom); + void compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom); + void compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom); + + /* measure merge and skip */ + void checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGeom); + void checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGeom); + + /* measure 
inter options */ + void checkInter_rd0_4(Mode& interMode, const CUGeom& cuGeom, PartSize partSize); + void checkInter_rd5_6(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, bool bMergeOnly); + + /* measure intra options */ + void checkIntraInInter_rd0_4(Mode& intraMode, const CUGeom& cuGeom); + void encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom); + + /* encode current bestMode losslessly, pick best RD cost */ + void tryLossless(const CUGeom& cuGeom); + + void checkDQP(CUData& cu, const CUGeom& cuGeom); + void addSplitFlagCost(Mode& mode, uint32_t depth); + void checkBestMode(Mode& mode, uint32_t depth); + uint32_t topSkipMinDepth(const CUData& parentCTU, const CUGeom& cuGeom); + bool recursionDepthCheck(const CUData& parentCTU, const CUGeom& cuGeom, const Mode& bestMode); + + void encodeResidue(const CUData& parentCTU, const CUGeom& cuGeom); +}; + +struct ThreadLocalData +{ + Analysis analysis; + + void destroy() { analysis.destroy(); } +}; + +} + +#endif // ifndef X265_ANALYSIS_H diff --git a/source/encoder/api.cpp b/source/encoder/api.cpp new file mode 100644 index 0000000..66f8e28 --- /dev/null +++ b/source/encoder/api.cpp @@ -0,0 +1,249 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#include "common.h" +#include "bitstream.h" +#include "param.h" + +#include "encoder.h" +#include "entropy.h" +#include "level.h" +#include "nal.h" +#include "bitcost.h" + +using namespace x265; + +extern "C" +x265_encoder *x265_encoder_open(x265_param *p) +{ + if (!p) + return NULL; + + x265_param *param = X265_MALLOC(x265_param, 1); + if (!param) + return NULL; + + memcpy(param, p, sizeof(x265_param)); + x265_log(param, X265_LOG_INFO, "HEVC encoder version %s\n", x265_version_str); + x265_log(param, X265_LOG_INFO, "build info %s\n", x265_build_info_str); + + x265_setup_primitives(param, param->cpuid); + + if (x265_check_params(param)) + return NULL; + + if (x265_set_globals(param)) + return NULL; + + Encoder *encoder = new Encoder; + if (!param->rc.bEnableSlowFirstPass) + x265_param_apply_fastfirstpass(param); + + // may change params for auto-detect, etc + encoder->configure(param); + + // may change rate control and CPB params + if (!enforceLevel(*param, encoder->m_vps)) + { + delete encoder; + return NULL; + } + + // will detect and set profile/tier/level in VPS + determineLevel(*param, encoder->m_vps); + + encoder->create(); + encoder->init(); + + x265_print_params(param); + + return encoder; +} + +extern "C" +int x265_encoder_headers(x265_encoder *enc, x265_nal **pp_nal, uint32_t *pi_nal) +{ + if (pp_nal && enc) + { + Encoder *encoder = static_cast(enc); + Entropy sbacCoder; + Bitstream bs; + encoder->getStreamHeaders(encoder->m_nalList, sbacCoder, bs); + *pp_nal = &encoder->m_nalList.m_nal[0]; + if (pi_nal) *pi_nal = 
encoder->m_nalList.m_numNal; + return encoder->m_nalList.m_occupancy; + } + + return -1; +} + +extern "C" +void x265_encoder_parameters(x265_encoder *enc, x265_param *out) +{ + if (enc && out) + { + Encoder *encoder = static_cast(enc); + memcpy(out, encoder->m_param, sizeof(x265_param)); + } +} + +extern "C" +int x265_encoder_encode(x265_encoder *enc, x265_nal **pp_nal, uint32_t *pi_nal, x265_picture *pic_in, x265_picture *pic_out) +{ + if (!enc) + return -1; + + Encoder *encoder = static_cast(enc); + int numEncoded; + + // While flushing, we cannot return 0 until the entire stream is flushed + do + { + numEncoded = encoder->encode(pic_in, pic_out); + } + while (numEncoded == 0 && !pic_in && encoder->m_numDelayedPic); + + // do not allow reuse of these buffers for more than one picture. The + // encoder now owns these analysisData buffers. + if (pic_in) + { + pic_in->analysisData.intraData = NULL; + pic_in->analysisData.interData = NULL; + } + + if (pp_nal && numEncoded > 0) + { + *pp_nal = &encoder->m_nalList.m_nal[0]; + if (pi_nal) *pi_nal = encoder->m_nalList.m_numNal; + } + else if (pi_nal) + *pi_nal = 0; + + return numEncoded; +} + +extern "C" +void x265_encoder_get_stats(x265_encoder *enc, x265_stats *outputStats, uint32_t statsSizeBytes) +{ + if (enc && outputStats) + { + Encoder *encoder = static_cast(enc); + encoder->fetchStats(outputStats, statsSizeBytes); + } +} + +extern "C" +void x265_encoder_log(x265_encoder* enc, int argc, char **argv) +{ + if (enc) + { + Encoder *encoder = static_cast(enc); + encoder->writeLog(argc, argv); + } +} + +extern "C" +void x265_encoder_close(x265_encoder *enc) +{ + if (enc) + { + Encoder *encoder = static_cast(enc); + + encoder->printSummary(); + encoder->destroy(); + delete encoder; + } +} + +extern "C" +void x265_cleanup(void) +{ + destroyROM(); + BitCost::destroy(); +} + +extern "C" +x265_picture *x265_picture_alloc() +{ + return (x265_picture*)x265_malloc(sizeof(x265_picture)); +} + +extern "C" +void 
x265_picture_init(x265_param *param, x265_picture *pic) +{ + memset(pic, 0, sizeof(x265_picture)); + + pic->bitDepth = param->internalBitDepth; + pic->colorSpace = param->internalCsp; + pic->forceqp = X265_QP_AUTO; + if (param->analysisMode) + { + uint32_t numPartitions = 1 << (g_maxFullDepth * 2); + uint32_t widthInCU = (param->sourceWidth + g_maxCUSize - 1) >> g_maxLog2CUSize; + uint32_t heightInCU = (param->sourceHeight + g_maxCUSize - 1) >> g_maxLog2CUSize; + + uint32_t numCUsInFrame = widthInCU * heightInCU; + pic->analysisData.numCUsInFrame = numCUsInFrame; + pic->analysisData.numPartitions = numPartitions; + } +} + +extern "C" +void x265_picture_free(x265_picture *p) +{ + return x265_free(p); +} + +int x265_alloc_analysis_data(x265_picture* pic) +{ + CHECKED_MALLOC(pic->analysisData.interData, x265_inter_data, pic->analysisData.numCUsInFrame * 85); + CHECKED_MALLOC(pic->analysisData.intraData, x265_intra_data, 1); + pic->analysisData.intraData->cuAddr = NULL; + pic->analysisData.intraData->depth = NULL; + pic->analysisData.intraData->modes = NULL; + pic->analysisData.intraData->partSizes = NULL; + pic->analysisData.intraData->poc = NULL; + CHECKED_MALLOC(pic->analysisData.intraData->depth, uint8_t, pic->analysisData.numPartitions * pic->analysisData.numCUsInFrame); + CHECKED_MALLOC(pic->analysisData.intraData->modes, uint8_t, pic->analysisData.numPartitions * pic->analysisData.numCUsInFrame); + CHECKED_MALLOC(pic->analysisData.intraData->partSizes, char, pic->analysisData.numPartitions * pic->analysisData.numCUsInFrame); + CHECKED_MALLOC(pic->analysisData.intraData->cuAddr, uint32_t, pic->analysisData.numCUsInFrame); + CHECKED_MALLOC(pic->analysisData.intraData->poc, int, pic->analysisData.numCUsInFrame); + return 0; + +fail: + x265_free_analysis_data(pic); + return -1; +} + +void x265_free_analysis_data(x265_picture* pic) +{ + X265_FREE(pic->analysisData.interData); + pic->analysisData.interData = NULL; + X265_FREE(pic->analysisData.intraData->depth); + 
X265_FREE(pic->analysisData.intraData->modes); + X265_FREE(pic->analysisData.intraData->partSizes); + X265_FREE(pic->analysisData.intraData->cuAddr); + X265_FREE(pic->analysisData.intraData->poc); + X265_FREE(pic->analysisData.intraData); + pic->analysisData.intraData = NULL; +} diff --git a/source/encoder/bitcost.cpp b/source/encoder/bitcost.cpp new file mode 100644 index 0000000..c0a4fab --- /dev/null +++ b/source/encoder/bitcost.cpp @@ -0,0 +1,91 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. 
+ *****************************************************************************/ + +#include "common.h" +#include "primitives.h" +#include "bitcost.h" + +using namespace x265; + +void BitCost::setQP(unsigned int qp) +{ + if (!s_costs[qp]) + { + ScopedLock s(s_costCalcLock); + + // Now that we have acquired the lock, check again if another thread calculated + // this row while we were blocked + if (!s_costs[qp]) + { + x265_emms(); // just to be safe + + CalculateLogs(); + s_costs[qp] = new uint16_t[4 * BC_MAX_MV + 1] + 2 * BC_MAX_MV; + double lambda = x265_lambda_tab[qp]; + + // estimate same cost for negative and positive MVD + for (int i = 0; i <= 2 * BC_MAX_MV; i++) + s_costs[qp][i] = s_costs[qp][-i] = (uint16_t)X265_MIN(s_bitsizes[i] * lambda + 0.5f, (1 << 16) - 1); + } + } + + m_cost = s_costs[qp]; +} + +/*** + * Class static data and methods + */ + +uint16_t *BitCost::s_costs[BC_MAX_QP]; + +float *BitCost::s_bitsizes; + +Lock BitCost::s_costCalcLock; + +void BitCost::CalculateLogs() +{ + if (!s_bitsizes) + { + s_bitsizes = new float[2 * BC_MAX_MV + 1]; + s_bitsizes[0] = 0.718f; + float log2_2 = 2.0f / log(2.0f); // 2 x 1/log(2) + for (int i = 1; i <= 2 * BC_MAX_MV; i++) + s_bitsizes[i] = log((float)(i + 1)) * log2_2 + 1.718f; + } +} + +void BitCost::destroy() +{ + for (int i = 0; i < BC_MAX_QP; i++) + { + if (s_costs[i]) + { + delete [] (s_costs[i] - 2 * BC_MAX_MV); + + s_costs[i] = 0; + } + } + + delete [] s_bitsizes; + s_bitsizes = 0; +} diff --git a/source/encoder/bitcost.h b/source/encoder/bitcost.h new file mode 100644 index 0000000..d28486b --- /dev/null +++ b/source/encoder/bitcost.h @@ -0,0 +1,93 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the 
License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#ifndef X265_BITCOST_H +#define X265_BITCOST_H + +#include "common.h" +#include "threading.h" +#include "mv.h" + +namespace x265 { +// private x265 namespace + +class BitCost +{ +public: + + BitCost() : m_cost_mvx(0), m_cost_mvy(0), m_cost(0) {} + + void setQP(unsigned int qp); + + void setMVP(const MV& mvp) { m_mvp = mvp; m_cost_mvx = m_cost - mvp.x; m_cost_mvy = m_cost - mvp.y; } + + // return bit cost of motion vector difference, multiplied by lambda + inline uint16_t mvcost(const MV& mv) const { return m_cost_mvx[mv.x] + m_cost_mvy[mv.y]; } + + // return bit cost of motion vector difference, without lambda + inline uint32_t bitcost(const MV& mv) const + { + return (uint32_t)(s_bitsizes[abs(mv.x - m_mvp.x)] + + s_bitsizes[abs(mv.y - m_mvp.y)] + 0.5f); + } + + static inline uint32_t bitcost(const MV& mv, const MV& mvp) + { + return (uint32_t)(s_bitsizes[abs(mv.x - mvp.x)] + + s_bitsizes[abs(mv.y - mvp.y)] + 0.5f); + } + + static void destroy(); + +protected: + + uint16_t *m_cost_mvx; + + uint16_t *m_cost_mvy; + + uint16_t *m_cost; + + MV m_mvp; + + BitCost& operator =(const BitCost&); + +private: + + /* default log2_max_mv_length_horizontal and log2_max_mv_length_horizontal + * are 15, specified in quarter-pel luma sample units. 
making the maximum + * signaled ful-pel motion distance 4096, max qpel is 32768 */ + enum { BC_MAX_MV = (1 << 15) }; + + enum { BC_MAX_QP = 82 }; + + static float *s_bitsizes; + + static uint16_t *s_costs[BC_MAX_QP]; + + static Lock s_costCalcLock; + + static void CalculateLogs(); +}; +} + +#endif // ifndef X265_BITCOST_H diff --git a/source/encoder/dpb.cpp b/source/encoder/dpb.cpp new file mode 100644 index 0000000..1c82a76 --- /dev/null +++ b/source/encoder/dpb.cpp @@ -0,0 +1,298 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. 
+ *****************************************************************************/ + +#include "common.h" +#include "frame.h" +#include "framedata.h" +#include "picyuv.h" +#include "slice.h" + +#include "dpb.h" + +using namespace x265; + +DPB::~DPB() +{ + while (!m_freeList.empty()) + { + Frame* curFrame = m_freeList.popFront(); + curFrame->destroy(); + delete curFrame; + } + + while (!m_picList.empty()) + { + Frame* curFrame = m_picList.popFront(); + curFrame->destroy(); + delete curFrame; + } + + while (m_picSymFreeList) + { + FrameData* next = m_picSymFreeList->m_freeListNext; + m_picSymFreeList->destroy(); + + m_picSymFreeList->m_reconPicYuv->destroy(); + delete m_picSymFreeList->m_reconPicYuv; + + delete m_picSymFreeList; + m_picSymFreeList = next; + } +} + +// move unreferenced pictures from picList to freeList for recycle +void DPB::recycleUnreferenced() +{ + Frame *iterFrame = m_picList.first(); + + while (iterFrame) + { + Frame *curFrame = iterFrame; + iterFrame = iterFrame->m_next; + if (!curFrame->m_encData->m_bHasReferences && !curFrame->m_countRefEncoders) + { + curFrame->m_reconRowCount.set(0); + curFrame->m_bChromaExtended = false; + + // iterator is invalidated by remove, restart scan + m_picList.remove(*curFrame); + iterFrame = m_picList.first(); + + m_freeList.pushBack(*curFrame); + curFrame->m_encData->m_freeListNext = m_picSymFreeList; + m_picSymFreeList = curFrame->m_encData; + curFrame->m_encData = NULL; + curFrame->m_reconPicYuv = NULL; + } + } +} + +void DPB::prepareEncode(Frame *newFrame) +{ + Slice* slice = newFrame->m_encData->m_slice; + slice->m_poc = newFrame->m_poc; + + int pocCurr = slice->m_poc; + int type = newFrame->m_lowres.sliceType; + bool bIsKeyFrame = newFrame->m_lowres.bKeyframe; + + slice->m_nalUnitType = getNalUnitType(pocCurr, bIsKeyFrame); + if (slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_IDR_W_RADL) + m_lastIDR = pocCurr; + slice->m_lastIDR = m_lastIDR; + slice->m_sliceType = IS_X265_TYPE_B(type) ? 
B_SLICE : (type == X265_TYPE_P) ? P_SLICE : I_SLICE; + + if (type == X265_TYPE_B) + { + // change from _R "referenced" to _N "non-referenced" NAL unit type + switch (slice->m_nalUnitType) + { + case NAL_UNIT_CODED_SLICE_TRAIL_R: + slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_TRAIL_N; + break; + case NAL_UNIT_CODED_SLICE_RADL_R: + slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_RADL_N; + break; + case NAL_UNIT_CODED_SLICE_RASL_R: + slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_RASL_N; + break; + default: + break; + } + } + + /* m_bHasReferences starts out as true for non-B pictures, and is set to false + * once no more pictures reference it */ + newFrame->m_encData->m_bHasReferences = IS_REFERENCED(newFrame); + + m_picList.pushFront(*newFrame); + + // Do decoding refresh marking if any + decodingRefreshMarking(pocCurr, slice->m_nalUnitType); + + computeRPS(pocCurr, slice->isIRAP(), &slice->m_rps, slice->m_sps->maxDecPicBuffering); + + // Mark pictures in m_piclist as unreferenced if they are not included in RPS + applyReferencePictureSet(&slice->m_rps, pocCurr); + + slice->m_numRefIdx[0] = X265_MIN(m_maxRefL0, slice->m_rps.numberOfNegativePictures); // Ensuring L0 contains just the -ve POC + slice->m_numRefIdx[1] = X265_MIN(m_maxRefL1, slice->m_rps.numberOfPositivePictures); + slice->setRefPicList(m_picList); + + X265_CHECK(slice->m_sliceType != B_SLICE || slice->m_numRefIdx[1], "B slice without L1 references (non-fatal)\n"); + + if (slice->m_sliceType == B_SLICE) + { + /* TODO: the lookahead should be able to tell which reference picture + * had the least motion residual. 
We should be able to use that here to + * select a colocation reference list and index */ + slice->m_colFromL0Flag = false; + slice->m_colRefIdx = 0; + slice->m_bCheckLDC = false; + } + else + { + slice->m_bCheckLDC = true; + slice->m_colFromL0Flag = true; + slice->m_colRefIdx = 0; + } + slice->m_sLFaseFlag = (SLFASE_CONSTANT & (1 << (pocCurr % 31))) > 0; + + /* Increment reference count of all motion-referenced frames to prevent them + * from being recycled. These counts are decremented at the end of + * compressFrame() */ + int numPredDir = slice->isInterP() ? 1 : slice->isInterB() ? 2 : 0; + for (int l = 0; l < numPredDir; l++) + { + for (int ref = 0; ref < slice->m_numRefIdx[l]; ref++) + { + Frame *refpic = slice->m_refPicList[l][ref]; + ATOMIC_INC(&refpic->m_countRefEncoders); + } + } +} + +void DPB::computeRPS(int curPoc, bool isRAP, RPS * rps, unsigned int maxDecPicBuffer) +{ + unsigned int poci = 0, numNeg = 0, numPos = 0; + + Frame* iterPic = m_picList.first(); + + while (iterPic && (poci < maxDecPicBuffer - 1)) + { + if ((iterPic->m_poc != curPoc) && iterPic->m_encData->m_bHasReferences) + { + rps->poc[poci] = iterPic->m_poc; + rps->deltaPOC[poci] = rps->poc[poci] - curPoc; + (rps->deltaPOC[poci] < 0) ? numNeg++ : numPos++; + rps->bUsed[poci] = !isRAP; + poci++; + } + iterPic = iterPic->m_next; + } + + rps->numberOfPictures = poci; + rps->numberOfPositivePictures = numPos; + rps->numberOfNegativePictures = numNeg; + + rps->sortDeltaPOC(); +} + +/* Marking reference pictures when an IDR/CRA is encountered. 
*/ +void DPB::decodingRefreshMarking(int pocCurr, NalUnitType nalUnitType) +{ + if (nalUnitType == NAL_UNIT_CODED_SLICE_IDR_W_RADL) + { + /* If the nal_unit_type is IDR, all pictures in the reference picture + * list are marked as "unused for reference" */ + Frame* iterFrame = m_picList.first(); + while (iterFrame) + { + if (iterFrame->m_poc != pocCurr) + iterFrame->m_encData->m_bHasReferences = false; + iterFrame = iterFrame->m_next; + } + } + else // CRA or No DR + { + if (m_bRefreshPending && pocCurr > m_pocCRA) + { + /* If the bRefreshPending flag is true (a deferred decoding refresh + * is pending) and the current temporal reference is greater than + * the temporal reference of the latest CRA picture (pocCRA), mark + * all reference pictures except the latest CRA picture as "unused + * for reference" and set the bRefreshPending flag to false */ + Frame* iterFrame = m_picList.first(); + while (iterFrame) + { + if (iterFrame->m_poc != pocCurr && iterFrame->m_poc != m_pocCRA) + iterFrame->m_encData->m_bHasReferences = false; + iterFrame = iterFrame->m_next; + } + + m_bRefreshPending = false; + } + if (nalUnitType == NAL_UNIT_CODED_SLICE_CRA) + { + /* If the nal_unit_type is CRA, set the bRefreshPending flag to true + * and pocCRA to the temporal reference of the current picture */ + m_bRefreshPending = true; + m_pocCRA = pocCurr; + } + } + + /* Note that the current picture is already placed in the reference list and + * its marking is not changed. 
If the current picture has a nal_ref_idc + * that is not 0, it will remain marked as "used for reference" */ +} + +/** Function for applying picture marking based on the Reference Picture Set */ +void DPB::applyReferencePictureSet(RPS *rps, int curPoc) +{ + // loop through all pictures in the reference picture buffer + Frame* iterFrame = m_picList.first(); + while (iterFrame) + { + if (iterFrame->m_poc != curPoc && iterFrame->m_encData->m_bHasReferences) + { + // loop through all pictures in the Reference Picture Set + // to see if the picture should be kept as reference picture + bool referenced = false; + for (int i = 0; i < rps->numberOfPositivePictures + rps->numberOfNegativePictures; i++) + { + if (iterFrame->m_poc == curPoc + rps->deltaPOC[i]) + { + referenced = true; + break; + } + } + if (!referenced) + iterFrame->m_encData->m_bHasReferences = false; + } + iterFrame = iterFrame->m_next; + } +} + +/* deciding the nal_unit_type */ +NalUnitType DPB::getNalUnitType(int curPOC, bool bIsKeyFrame) +{ + if (!curPOC) + return NAL_UNIT_CODED_SLICE_IDR_W_RADL; + + if (bIsKeyFrame) + return m_bOpenGOP ? NAL_UNIT_CODED_SLICE_CRA : NAL_UNIT_CODED_SLICE_IDR_W_RADL; + + if (m_pocCRA && curPOC < m_pocCRA) + // All leading pictures are being marked as TFD pictures here since + // current encoder uses all reference pictures while encoding leading + // pictures. An encoder can ensure that a leading picture can be still + // decodable when random accessing to a CRA/CRANT/BLA/BLANT picture by + // controlling the reference pictures used for encoding that leading + // picture. Such a leading picture need not be marked as a TFD picture. 
+ return NAL_UNIT_CODED_SLICE_RASL_R; + + if (m_lastIDR && curPOC < m_lastIDR) + return NAL_UNIT_CODED_SLICE_RADL_R; + + return NAL_UNIT_CODED_SLICE_TRAIL_R; +} diff --git a/source/encoder/dpb.h b/source/encoder/dpb.h new file mode 100644 index 0000000..b4bfd4b --- /dev/null +++ b/source/encoder/dpb.h @@ -0,0 +1,78 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#ifndef X265_DPB_H +#define X265_DPB_H + +#include "piclist.h" + +namespace x265 { +// private namespace for x265 + +class Frame; +class FrameData; +class Slice; + +class DPB +{ +public: + + int m_lastIDR; + int m_pocCRA; + bool m_bRefreshPending; + int m_maxRefL0; + int m_maxRefL1; + int m_bOpenGOP; + PicList m_picList; + PicList m_freeList; + FrameData* m_picSymFreeList; + + DPB(x265_param *param) + { + m_lastIDR = 0; + m_pocCRA = 0; + m_bRefreshPending = false; + m_picSymFreeList = NULL; + m_maxRefL0 = param->maxNumReferences; + m_maxRefL1 = param->bBPyramid ? 
2 : 1; + m_bOpenGOP = param->bOpenGOP; + } + + ~DPB(); + + void prepareEncode(Frame*); + + void recycleUnreferenced(); + +protected: + + void computeRPS(int curPoc, bool isRAP, RPS * rps, unsigned int maxDecPicBuffer); + + void applyReferencePictureSet(RPS *rps, int curPoc); + void decodingRefreshMarking(int pocCurr, NalUnitType nalUnitType); + + NalUnitType getNalUnitType(int curPoc, bool bIsKeyFrame); +}; +} + +#endif // X265_DPB_H diff --git a/source/encoder/encoder.cpp b/source/encoder/encoder.cpp new file mode 100644 index 0000000..44e82af --- /dev/null +++ b/source/encoder/encoder.cpp @@ -0,0 +1,1492 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. 
+ *****************************************************************************/ + +#include "common.h" +#include "primitives.h" +#include "threadpool.h" +#include "param.h" +#include "frame.h" +#include "framedata.h" +#include "picyuv.h" + +#include "bitcost.h" +#include "encoder.h" +#include "slicetype.h" +#include "frameencoder.h" +#include "ratecontrol.h" +#include "dpb.h" +#include "nal.h" + +#include "x265.h" + +namespace x265 { +const char g_sliceTypeToChar[] = {'B', 'P', 'I'}; +} + +static const char *summaryCSVHeader = + "Command, Date/Time, Elapsed Time, FPS, Bitrate, " + "Y PSNR, U PSNR, V PSNR, Global PSNR, SSIM, SSIM (dB), " + "I count, I ave-QP, I kpbs, I-PSNR Y, I-PSNR U, I-PSNR V, I-SSIM (dB), " + "P count, P ave-QP, P kpbs, P-PSNR Y, P-PSNR U, P-PSNR V, P-SSIM (dB), " + "B count, B ave-QP, B kpbs, B-PSNR Y, B-PSNR U, B-PSNR V, B-SSIM (dB), " + "Version\n"; + +using namespace x265; + +Encoder::Encoder() +{ + m_aborted = false; + m_encodedFrameNum = 0; + m_pocLast = -1; + m_curEncoder = 0; + m_numLumaWPFrames = 0; + m_numChromaWPFrames = 0; + m_numLumaWPBiFrames = 0; + m_numChromaWPBiFrames = 0; + m_lookahead = NULL; + m_frameEncoder = NULL; + m_rateControl = NULL; + m_dpb = NULL; + m_exportedPic = NULL; + m_numDelayedPic = 0; + m_outputCount = 0; + m_csvfpt = NULL; + m_param = NULL; + m_cuOffsetY = NULL; + m_cuOffsetC = NULL; + m_buOffsetY = NULL; + m_buOffsetC = NULL; + m_threadPool = 0; + m_numThreadLocalData = 0; +} + +void Encoder::create() +{ + if (!primitives.sad[0]) + { + // this should be an impossible condition when using our public API, and indicates a serious bug. 
+ x265_log(m_param, X265_LOG_ERROR, "Primitives must be initialized before encoder is created\n"); + abort(); + } + + x265_param* p = m_param; + + int rows = (p->sourceHeight + p->maxCUSize - 1) >> g_log2Size[p->maxCUSize]; + + // Do not allow WPP if only one row, it is pointless and unstable + if (rows == 1) + p->bEnableWavefront = 0; + + int poolThreadCount = p->poolNumThreads ? p->poolNumThreads : getCpuCount(); + + // Trim the thread pool if --wpp, --pme, and --pmode are disabled + if (!p->bEnableWavefront && !p->bDistributeModeAnalysis && !p->bDistributeMotionEstimation) + poolThreadCount = 0; + + if (poolThreadCount > 1) + { + m_threadPool = ThreadPool::allocThreadPool(poolThreadCount); + poolThreadCount = m_threadPool->getThreadCount(); + } + else + poolThreadCount = 0; + + if (!poolThreadCount) + { + // issue warnings if any of these features were requested + if (p->bEnableWavefront) + x265_log(p, X265_LOG_WARNING, "No thread pool allocated, --wpp disabled\n"); + if (p->bDistributeMotionEstimation) + x265_log(p, X265_LOG_WARNING, "No thread pool allocated, --pme disabled\n"); + if (p->bDistributeModeAnalysis) + x265_log(p, X265_LOG_WARNING, "No thread pool allocated, --pmode disabled\n"); + + // disable all pool features if the thread pool is disabled or unusable. 
+ p->bEnableWavefront = p->bDistributeModeAnalysis = p->bDistributeMotionEstimation = 0; + } + + if (!p->frameNumThreads) + { + // auto-detect frame threads + int cpuCount = getCpuCount(); + if (!p->bEnableWavefront) + p->frameNumThreads = X265_MIN(cpuCount, (rows + 1) / 2); + else if (cpuCount > 32) + p->frameNumThreads = 6; // dual-socket 10-core IvyBridge or higher + else if (cpuCount >= 16) + p->frameNumThreads = 5; // 8 HT cores, or dual socket + else if (cpuCount >= 8) + p->frameNumThreads = 3; // 4 HT cores + else if (cpuCount >= 4) + p->frameNumThreads = 2; // Dual or Quad core + else + p->frameNumThreads = 1; + } + + x265_log(p, X265_LOG_INFO, "WPP streams / frame threads / pool : %d / %d / %d%s%s\n", + p->bEnableWavefront ? rows : 0, p->frameNumThreads, poolThreadCount, + p->bDistributeMotionEstimation ? " / pme" : "", p->bDistributeModeAnalysis ? " / pmode" : ""); + + m_frameEncoder = new FrameEncoder[m_param->frameNumThreads]; + for (int i = 0; i < m_param->frameNumThreads; i++) + m_frameEncoder[i].setThreadPool(m_threadPool); + + if (!m_scalingList.init()) + { + x265_log(m_param, X265_LOG_ERROR, "Unable to allocate scaling list arrays\n"); + m_aborted = true; + } + else if (!m_param->scalingLists || !strcmp(m_param->scalingLists, "off")) + m_scalingList.m_bEnabled = false; + else if (!strcmp(m_param->scalingLists, "default")) + m_scalingList.setDefaultScalingList(); + else if (m_scalingList.parseScalingList(m_param->scalingLists)) + m_aborted = true; + m_scalingList.setupQuantMatrices(); + + /* Allocate thread local data, one for each thread pool worker and + * if --no-wpp, one for each frame encoder */ + m_numThreadLocalData = poolThreadCount; + if (!m_param->bEnableWavefront) + m_numThreadLocalData += m_param->frameNumThreads; + m_threadLocalData = new ThreadLocalData[m_numThreadLocalData]; + for (int i = 0; i < m_numThreadLocalData; i++) + { + m_threadLocalData[i].analysis.setThreadPool(m_threadPool); + 
m_threadLocalData[i].analysis.initSearch(*m_param, m_scalingList); + m_threadLocalData[i].analysis.create(m_threadLocalData); + } + + if (!m_param->bEnableWavefront) + for (int i = 0; i < m_param->frameNumThreads; i++) + m_frameEncoder[i].m_tld = &m_threadLocalData[poolThreadCount + i]; + + m_lookahead = new Lookahead(m_param, m_threadPool); + m_dpb = new DPB(m_param); + m_rateControl = new RateControl(m_param); + + initSPS(&m_sps); + initPPS(&m_pps); + + /* Try to open CSV file handle */ + if (m_param->csvfn) + { + m_csvfpt = fopen(m_param->csvfn, "r"); + if (m_csvfpt) + { + // file already exists, re-open for append + fclose(m_csvfpt); + m_csvfpt = fopen(m_param->csvfn, "ab"); + } + else + { + // new CSV file, write header + m_csvfpt = fopen(m_param->csvfn, "wb"); + if (m_csvfpt) + { + if (m_param->logLevel >= X265_LOG_DEBUG) + { + fprintf(m_csvfpt, "Encode Order, Type, POC, QP, Bits, "); + if (m_param->rc.rateControlMode == X265_RC_CRF) + fprintf(m_csvfpt, "RateFactor, "); + fprintf(m_csvfpt, "Y PSNR, U PSNR, V PSNR, YUV PSNR, SSIM, SSIM (dB), " + "Encoding time, Elapsed time, List 0, List 1\n"); + } + else + fputs(summaryCSVHeader, m_csvfpt); + } + } + } + + m_aborted |= parseLambdaFile(m_param); +} + +void Encoder::destroy() +{ + if (m_exportedPic) + { + ATOMIC_DEC(&m_exportedPic->m_countRefEncoders); + m_exportedPic = NULL; + } + + if (m_rateControl) + m_rateControl->terminate(); // unblock all blocked RC calls + + if (m_frameEncoder) + { + for (int i = 0; i < m_param->frameNumThreads; i++) + { + // Ensure frame encoder is idle before destroying it + m_frameEncoder[i].getEncodedPicture(m_nalList); + m_frameEncoder[i].destroy(); + } + + delete [] m_frameEncoder; + } + + for (int i = 0; i < m_numThreadLocalData; i++) + m_threadLocalData[i].destroy(); + + delete [] m_threadLocalData; + + if (m_lookahead) + { + m_lookahead->destroy(); + delete m_lookahead; + } + + delete m_dpb; + if (m_rateControl) + { + m_rateControl->destroy(); + delete m_rateControl; + } + // 
thread pool release should always happen last + if (m_threadPool) + m_threadPool->release(); + + X265_FREE(m_cuOffsetY); + X265_FREE(m_cuOffsetC); + X265_FREE(m_buOffsetY); + X265_FREE(m_buOffsetC); + + if (m_csvfpt) + fclose(m_csvfpt); + free(m_param->rc.statFileName); // alloc'd by strdup + + X265_FREE(m_param); +} + +void Encoder::init() +{ + if (m_frameEncoder) + { + int numRows = (m_param->sourceHeight + g_maxCUSize - 1) / g_maxCUSize; + int numCols = (m_param->sourceWidth + g_maxCUSize - 1) / g_maxCUSize; + for (int i = 0; i < m_param->frameNumThreads; i++) + { + if (!m_frameEncoder[i].init(this, numRows, numCols, i)) + { + x265_log(m_param, X265_LOG_ERROR, "Unable to initialize frame encoder, aborting\n"); + m_aborted = true; + } + } + } + if (m_param->bEmitHRDSEI) + m_rateControl->initHRD(&m_sps); + if (!m_rateControl->init(&m_sps)) + m_aborted = true; + m_lookahead->init(); + m_encodeStartTime = x265_mdate(); +} + +void Encoder::updateVbvPlan(RateControl* rc) +{ + for (int i = 0; i < m_param->frameNumThreads; i++) + { + FrameEncoder *encoder = &m_frameEncoder[i]; + if (encoder->m_rce.isActive && encoder->m_rce.poc != rc->m_curSlice->m_poc) + { + int64_t bits = (int64_t) X265_MAX(encoder->m_rce.frameSizeEstimated, encoder->m_rce.frameSizePlanned); + rc->m_bufferFill -= bits; + rc->m_bufferFill = X265_MAX(rc->m_bufferFill, 0); + rc->m_bufferFill += encoder->m_rce.bufferRate; + rc->m_bufferFill = X265_MIN(rc->m_bufferFill, rc->m_bufferSize); + if (rc->m_2pass) + rc->m_predictedBits += bits; + } + } +} + +/** + * Feed one new input frame into the encoder, get one frame out. If pic_in is + * NULL, a flush condition is implied and pic_in must be NULL for all subsequent + * calls for this encoder instance. 
+ * + * pic_in input original YUV picture or NULL + * pic_out pointer to reconstructed picture struct + * + * returns 0 if no frames are currently available for output + * 1 if frame was output, m_nalList contains access unit + * negative on malloc error or abort */ +int Encoder::encode(const x265_picture* pic_in, x265_picture* pic_out) +{ + if (m_aborted) + return -1; + + if (m_exportedPic) + { + ATOMIC_DEC(&m_exportedPic->m_countRefEncoders); + m_exportedPic = NULL; + m_dpb->recycleUnreferenced(); + } + + if (pic_in) + { + if (pic_in->colorSpace != m_param->internalCsp) + { + x265_log(m_param, X265_LOG_ERROR, "Unsupported color space (%d) on input\n", + pic_in->colorSpace); + return -1; + } + if (pic_in->bitDepth < 8 || pic_in->bitDepth > 16) + { + x265_log(m_param, X265_LOG_ERROR, "Input bit depth (%d) must be between 8 and 16\n", + pic_in->bitDepth); + return -1; + } + + Frame *inFrame; + if (m_dpb->m_freeList.empty()) + { + inFrame = new Frame; + if (inFrame->create(m_param)) + { + /* the first PicYuv created is asked to generate the CU and block unit offset + * arrays which are then shared with all subsequent PicYuv (orig and recon) + * allocated by this top level encoder */ + if (m_cuOffsetY) + { + inFrame->m_origPicYuv->m_cuOffsetC = m_cuOffsetC; + inFrame->m_origPicYuv->m_cuOffsetY = m_cuOffsetY; + inFrame->m_origPicYuv->m_buOffsetC = m_buOffsetC; + inFrame->m_origPicYuv->m_buOffsetY = m_buOffsetY; + } + else + { + if (!inFrame->m_origPicYuv->createOffsets(m_sps)) + { + m_aborted = true; + x265_log(m_param, X265_LOG_ERROR, "memory allocation failure, aborting encode\n"); + inFrame->destroy(); + delete inFrame; + return -1; + } + else + { + m_cuOffsetC = inFrame->m_origPicYuv->m_cuOffsetC; + m_cuOffsetY = inFrame->m_origPicYuv->m_cuOffsetY; + m_buOffsetC = inFrame->m_origPicYuv->m_buOffsetC; + m_buOffsetY = inFrame->m_origPicYuv->m_buOffsetY; + } + } + } + else + { + m_aborted = true; + x265_log(m_param, X265_LOG_ERROR, "memory allocation failure, aborting 
encode\n"); + inFrame->destroy(); + delete inFrame; + return -1; + } + } + else + inFrame = m_dpb->m_freeList.popBack(); + + /* Copy input picture into a Frame and PicYuv, send to lookahead */ + inFrame->m_poc = ++m_pocLast; + inFrame->m_origPicYuv->copyFromPicture(*pic_in, m_sps.conformanceWindow.rightOffset, m_sps.conformanceWindow.bottomOffset); + inFrame->m_intraData = pic_in->analysisData.intraData; + inFrame->m_interData = pic_in->analysisData.interData; + inFrame->m_userData = pic_in->userData; + inFrame->m_pts = pic_in->pts; + inFrame->m_forceqp = pic_in->forceqp; + + if (m_pocLast == 0) + m_firstPts = inFrame->m_pts; + if (m_bframeDelay && m_pocLast == m_bframeDelay) + m_bframeDelayTime = inFrame->m_pts - m_firstPts; + + /* Encoder holds a reference count until stats collection is finished */ + ATOMIC_INC(&inFrame->m_countRefEncoders); + bool bEnableWP = m_param->bEnableWeightedPred || m_param->bEnableWeightedBiPred; + if (m_param->rc.aqMode || bEnableWP) + { + if (m_param->rc.cuTree && m_param->rc.bStatRead) + { + if (!m_rateControl->cuTreeReadFor2Pass(inFrame)) + { + m_aborted = 1; + return -1; + } + } + else + m_rateControl->calcAdaptiveQuantFrame(inFrame); + } + + /* Use the frame types from the first pass, if available */ + int sliceType = (m_param->rc.bStatRead) ? m_rateControl->rateControlSliceType(inFrame->m_poc) : pic_in->sliceType; + m_lookahead->addPicture(inFrame, sliceType); + m_numDelayedPic++; + } + else + m_lookahead->flush(); + + FrameEncoder *curEncoder = &m_frameEncoder[m_curEncoder]; + m_curEncoder = (m_curEncoder + 1) % m_param->frameNumThreads; + int ret = 0; + + // getEncodedPicture() should block until the FrameEncoder has completed + // encoding the frame. This is how back-pressure through the API is + // accomplished when the encoder is full. 
+ Frame *outFrame = curEncoder->getEncodedPicture(m_nalList); + + if (outFrame) + { + Slice *slice = outFrame->m_encData->m_slice; + if (pic_out) + { + PicYuv *recpic = outFrame->m_reconPicYuv; + pic_out->poc = slice->m_poc; + pic_out->bitDepth = X265_DEPTH; + pic_out->userData = outFrame->m_userData; + pic_out->colorSpace = m_param->internalCsp; + + pic_out->pts = outFrame->m_pts; + pic_out->dts = outFrame->m_dts; + + switch (slice->m_sliceType) + { + case I_SLICE: + pic_out->sliceType = outFrame->m_lowres.bKeyframe ? X265_TYPE_IDR : X265_TYPE_I; + break; + case P_SLICE: + pic_out->sliceType = X265_TYPE_P; + break; + case B_SLICE: + pic_out->sliceType = X265_TYPE_B; + break; + } + + pic_out->planes[0] = recpic->m_picOrg[0]; + pic_out->stride[0] = (int)(recpic->m_stride * sizeof(pixel)); + pic_out->planes[1] = recpic->m_picOrg[1]; + pic_out->stride[1] = (int)(recpic->m_strideC * sizeof(pixel)); + pic_out->planes[2] = recpic->m_picOrg[2]; + pic_out->stride[2] = (int)(recpic->m_strideC * sizeof(pixel)); + } + + if (m_param->analysisMode) + { + pic_out->analysisData.interData = outFrame->m_interData; + pic_out->analysisData.intraData = outFrame->m_intraData; + pic_out->analysisData.numCUsInFrame = slice->m_sps->numCUsInFrame; + pic_out->analysisData.numPartitions = slice->m_sps->numPartitions; + } + + if (slice->m_sliceType == P_SLICE) + { + if (slice->m_weightPredTable[0][0][0].bPresentFlag) + m_numLumaWPFrames++; + if (slice->m_weightPredTable[0][0][1].bPresentFlag || + slice->m_weightPredTable[0][0][2].bPresentFlag) + m_numChromaWPFrames++; + } + else if (slice->m_sliceType == B_SLICE) + { + bool bLuma = false, bChroma = false; + for (int l = 0; l < 2; l++) + { + if (slice->m_weightPredTable[l][0][0].bPresentFlag) + bLuma = true; + if (slice->m_weightPredTable[l][0][1].bPresentFlag || + slice->m_weightPredTable[l][0][2].bPresentFlag) + bChroma = true; + } + + if (bLuma) + m_numLumaWPBiFrames++; + if (bChroma) + m_numChromaWPBiFrames++; + } + if (m_aborted) + return 
-1; + + finishFrameStats(outFrame, curEncoder, curEncoder->m_accessUnitBits); + // Allow this frame to be recycled if no frame encoders are using it for reference + if (!pic_out) + { + ATOMIC_DEC(&outFrame->m_countRefEncoders); + m_dpb->recycleUnreferenced(); + } + else + m_exportedPic = outFrame; + + m_numDelayedPic--; + + ret = 1; + } + + // pop a single frame from decided list, then provide to frame encoder + // curEncoder is guaranteed to be idle at this point + Frame* frameEnc = m_lookahead->getDecidedPicture(); + if (frameEnc) + { + // give this picture a FrameData instance before encoding + if (m_dpb->m_picSymFreeList) + { + frameEnc->m_encData = m_dpb->m_picSymFreeList; + m_dpb->m_picSymFreeList = m_dpb->m_picSymFreeList->m_freeListNext; + frameEnc->reinit(m_sps); + } + else + { + frameEnc->allocEncodeData(m_param, m_sps); + Slice* slice = frameEnc->m_encData->m_slice; + slice->m_sps = &m_sps; + slice->m_pps = &m_pps; + slice->m_maxNumMergeCand = m_param->maxNumMergeCand; + slice->m_endCUAddr = slice->realEndAddress(m_sps.numCUsInFrame * NUM_CU_PARTITIONS); + frameEnc->m_reconPicYuv->m_cuOffsetC = m_cuOffsetC; + frameEnc->m_reconPicYuv->m_cuOffsetY = m_cuOffsetY; + frameEnc->m_reconPicYuv->m_buOffsetC = m_buOffsetC; + frameEnc->m_reconPicYuv->m_buOffsetY = m_buOffsetY; + } + curEncoder->m_rce.encodeOrder = m_encodedFrameNum++; + if (m_bframeDelay) + { + int64_t *prevReorderedPts = m_prevReorderedPts; + frameEnc->m_dts = m_encodedFrameNum > m_bframeDelay + ? 
prevReorderedPts[(m_encodedFrameNum - m_bframeDelay) % m_bframeDelay] + : frameEnc->m_reorderedPts - m_bframeDelayTime; + prevReorderedPts[m_encodedFrameNum % m_bframeDelay] = frameEnc->m_reorderedPts; + } + else + frameEnc->m_dts = frameEnc->m_reorderedPts; + + // determine references, setup RPS, etc + m_dpb->prepareEncode(frameEnc); + + if (m_param->rc.rateControlMode != X265_RC_CQP) + m_lookahead->getEstimatedPictureCost(frameEnc); + + // Allow FrameEncoder::compressFrame() to start in the frame encoder thread + if (!curEncoder->startCompressFrame(frameEnc)) + m_aborted = true; + } + else if (m_encodedFrameNum) + m_rateControl->setFinalFrameCount(m_encodedFrameNum); + + return ret; +} + +void EncStats::addPsnr(double psnrY, double psnrU, double psnrV) +{ + m_psnrSumY += psnrY; + m_psnrSumU += psnrU; + m_psnrSumV += psnrV; +} + +void EncStats::addBits(uint64_t bits) +{ + m_accBits += bits; + m_numPics++; +} + +void EncStats::addSsim(double ssim) +{ + m_globalSsim += ssim; +} + +void EncStats::addQP(double aveQp) +{ + m_totalQp += aveQp; +} + +char* Encoder::statsCSVString(EncStats& stat, char* buffer) +{ + if (!stat.m_numPics) + { + sprintf(buffer, "-, -, -, -, -, -, -, "); + return buffer; + } + + double fps = (double)m_param->fpsNum / m_param->fpsDenom; + double scale = fps / 1000 / (double)stat.m_numPics; + + int len = sprintf(buffer, "%-6u, ", stat.m_numPics); + + len += sprintf(buffer + len, "%2.2lf, ", stat.m_totalQp / (double)stat.m_numPics); + len += sprintf(buffer + len, "%-8.2lf, ", stat.m_accBits * scale); + if (m_param->bEnablePsnr) + { + len += sprintf(buffer + len, "%.3lf, %.3lf, %.3lf, ", + stat.m_psnrSumY / (double)stat.m_numPics, + stat.m_psnrSumU / (double)stat.m_numPics, + stat.m_psnrSumV / (double)stat.m_numPics); + } + else + len += sprintf(buffer + len, "-, -, -, "); + + if (m_param->bEnableSsim) + sprintf(buffer + len, "%.3lf, ", x265_ssim2dB(stat.m_globalSsim / (double)stat.m_numPics)); + else + sprintf(buffer + len, "-, "); + return 
buffer; +} + +char* Encoder::statsString(EncStats& stat, char* buffer) +{ + double fps = (double)m_param->fpsNum / m_param->fpsDenom; + double scale = fps / 1000 / (double)stat.m_numPics; + + int len = sprintf(buffer, "%6u, ", stat.m_numPics); + + len += sprintf(buffer + len, "Avg QP:%2.2lf", stat.m_totalQp / (double)stat.m_numPics); + len += sprintf(buffer + len, " kb/s: %-8.2lf", stat.m_accBits * scale); + if (m_param->bEnablePsnr) + { + len += sprintf(buffer + len, " PSNR Mean: Y:%.3lf U:%.3lf V:%.3lf", + stat.m_psnrSumY / (double)stat.m_numPics, + stat.m_psnrSumU / (double)stat.m_numPics, + stat.m_psnrSumV / (double)stat.m_numPics); + } + if (m_param->bEnableSsim) + { + sprintf(buffer + len, " SSIM Mean: %.6lf (%.3lfdB)", + stat.m_globalSsim / (double)stat.m_numPics, + x265_ssim2dB(stat.m_globalSsim / (double)stat.m_numPics)); + } + return buffer; +} + +void Encoder::printSummary() +{ + if (m_param->logLevel < X265_LOG_INFO) + return; + + char buffer[200]; + if (m_analyzeI.m_numPics) + x265_log(m_param, X265_LOG_INFO, "frame I: %s\n", statsString(m_analyzeI, buffer)); + if (m_analyzeP.m_numPics) + x265_log(m_param, X265_LOG_INFO, "frame P: %s\n", statsString(m_analyzeP, buffer)); + if (m_analyzeB.m_numPics) + x265_log(m_param, X265_LOG_INFO, "frame B: %s\n", statsString(m_analyzeB, buffer)); + if (m_analyzeAll.m_numPics) + x265_log(m_param, X265_LOG_INFO, "global : %s\n", statsString(m_analyzeAll, buffer)); + if (m_param->bEnableWeightedPred && m_analyzeP.m_numPics) + { + x265_log(m_param, X265_LOG_INFO, "Weighted P-Frames: Y:%.1f%% UV:%.1f%%\n", + (float)100.0 * m_numLumaWPFrames / m_analyzeP.m_numPics, + (float)100.0 * m_numChromaWPFrames / m_analyzeP.m_numPics); + } + if (m_param->bEnableWeightedBiPred && m_analyzeB.m_numPics) + { + x265_log(m_param, X265_LOG_INFO, "Weighted B-Frames: Y:%.1f%% UV:%.1f%%\n", + (float)100.0 * m_numLumaWPBiFrames / m_analyzeB.m_numPics, + (float)100.0 * m_numChromaWPBiFrames / m_analyzeB.m_numPics); + } + int pWithB = 0; + for 
(int i = 0; i <= m_param->bframes; i++) + pWithB += m_lookahead->m_histogram[i]; + + if (pWithB) + { + int p = 0; + for (int i = 0; i <= m_param->bframes; i++) + p += sprintf(buffer + p, "%.1f%% ", 100. * m_lookahead->m_histogram[i] / pWithB); + + x265_log(m_param, X265_LOG_INFO, "consecutive B-frames: %s\n", buffer); + } + if (m_param->bLossless) + { + float frameSize = (float)(m_param->sourceWidth - m_sps.conformanceWindow.rightOffset) * + (m_param->sourceHeight - m_sps.conformanceWindow.bottomOffset); + float uncompressed = frameSize * X265_DEPTH * m_analyzeAll.m_numPics; + + x265_log(m_param, X265_LOG_INFO, "lossless compression ratio %.2f::1\n", uncompressed / m_analyzeAll.m_accBits); + } + + if (!m_param->bLogCuStats) + return; + + for (int sliceType = 2; sliceType >= 0; sliceType--) + { + if (sliceType == P_SLICE && !m_analyzeP.m_numPics) + continue; + if (sliceType == B_SLICE && !m_analyzeB.m_numPics) + continue; + + StatisticLog finalLog; + for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++) + { + for (int i = 0; i < m_param->frameNumThreads; i++) + { + StatisticLog& enclog = m_frameEncoder[i].m_sliceTypeLog[sliceType]; + if (!depth) + finalLog.totalCu += enclog.totalCu; + finalLog.cntIntra[depth] += enclog.cntIntra[depth]; + for (int m = 0; m < INTER_MODES; m++) + { + if (m < INTRA_MODES) + finalLog.cuIntraDistribution[depth][m] += enclog.cuIntraDistribution[depth][m]; + finalLog.cuInterDistribution[depth][m] += enclog.cuInterDistribution[depth][m]; + } + + if (depth == g_maxCUDepth) + finalLog.cntIntraNxN += enclog.cntIntraNxN; + if (sliceType != I_SLICE) + { + finalLog.cntTotalCu[depth] += enclog.cntTotalCu[depth]; + finalLog.cntInter[depth] += enclog.cntInter[depth]; + finalLog.cntSkipCu[depth] += enclog.cntSkipCu[depth]; + } + } + + uint64_t cntInter, cntSkipCu, cntIntra = 0, cntIntraNxN = 0, encCu = 0; + uint64_t cuInterDistribution[INTER_MODES], cuIntraDistribution[INTRA_MODES]; + + // check for 0/0, if true assign 0 else calculate percentage + 
for (int n = 0; n < INTER_MODES; n++) + { + if (!finalLog.cntInter[depth]) + cuInterDistribution[n] = 0; + else + cuInterDistribution[n] = (finalLog.cuInterDistribution[depth][n] * 100) / finalLog.cntInter[depth]; + + if (n < INTRA_MODES) + { + if (!finalLog.cntIntra[depth]) + { + cntIntraNxN = 0; + cuIntraDistribution[n] = 0; + } + else + { + cntIntraNxN = (finalLog.cntIntraNxN * 100) / finalLog.cntIntra[depth]; + cuIntraDistribution[n] = (finalLog.cuIntraDistribution[depth][n] * 100) / finalLog.cntIntra[depth]; + } + } + } + + if (!finalLog.totalCu) + encCu = 0; + else if (sliceType == I_SLICE) + { + cntIntra = (finalLog.cntIntra[depth] * 100) / finalLog.totalCu; + cntIntraNxN = (finalLog.cntIntraNxN * 100) / finalLog.totalCu; + } + else + encCu = ((finalLog.cntIntra[depth] + finalLog.cntInter[depth]) * 100) / finalLog.totalCu; + + if (sliceType == I_SLICE) + { + cntInter = 0; + cntSkipCu = 0; + } + else if (!finalLog.cntTotalCu[depth]) + { + cntInter = 0; + cntIntra = 0; + cntSkipCu = 0; + } + else + { + cntInter = (finalLog.cntInter[depth] * 100) / finalLog.cntTotalCu[depth]; + cntIntra = (finalLog.cntIntra[depth] * 100) / finalLog.cntTotalCu[depth]; + cntSkipCu = (finalLog.cntSkipCu[depth] * 100) / finalLog.cntTotalCu[depth]; + } + + // print statistics + int cuSize = g_maxCUSize >> depth; + char stats[256] = { 0 }; + int len = 0; + if (sliceType != I_SLICE) + len += sprintf(stats + len, " EncCU "X265_LL "%% Merge "X265_LL "%%", encCu, cntSkipCu); + + if (cntInter) + { + len += sprintf(stats + len, " Inter "X265_LL "%%", cntInter); + if (m_param->bEnableAMP) + len += sprintf(stats + len, "(%dx%d "X265_LL "%% %dx%d "X265_LL "%% %dx%d "X265_LL "%% AMP "X265_LL "%%)", + cuSize, cuSize, cuInterDistribution[0], + cuSize / 2, cuSize, cuInterDistribution[2], + cuSize, cuSize / 2, cuInterDistribution[1], + cuInterDistribution[3]); + else if (m_param->bEnableRectInter) + len += sprintf(stats + len, "(%dx%d "X265_LL "%% %dx%d "X265_LL "%% %dx%d "X265_LL "%%)", + cuSize, 
cuSize, cuInterDistribution[0], + cuSize / 2, cuSize, cuInterDistribution[2], + cuSize, cuSize / 2, cuInterDistribution[1]); + } + if (cntIntra) + { + len += sprintf(stats + len, " Intra "X265_LL "%%(DC "X265_LL "%% P "X265_LL "%% Ang "X265_LL "%%", + cntIntra, cuIntraDistribution[0], + cuIntraDistribution[1], cuIntraDistribution[2]); + if (sliceType != I_SLICE) + { + if (depth == g_maxCUDepth) + len += sprintf(stats + len, " %dx%d "X265_LL "%%", cuSize / 2, cuSize / 2, cntIntraNxN); + } + + len += sprintf(stats + len, ")"); + if (sliceType == I_SLICE) + { + if (depth == g_maxCUDepth) + len += sprintf(stats + len, " %dx%d: "X265_LL "%%", cuSize / 2, cuSize / 2, cntIntraNxN); + } + } + const char slicechars[] = "BPI"; + if (stats[0]) + x265_log(m_param, X265_LOG_INFO, "%c%-2d:%s\n", slicechars[sliceType], cuSize, stats); + } + } +} + +void Encoder::fetchStats(x265_stats *stats, size_t statsSizeBytes) +{ + if (statsSizeBytes >= sizeof(stats)) + { + stats->globalPsnrY = m_analyzeAll.m_psnrSumY; + stats->globalPsnrU = m_analyzeAll.m_psnrSumU; + stats->globalPsnrV = m_analyzeAll.m_psnrSumV; + stats->encodedPictureCount = m_analyzeAll.m_numPics; + stats->totalWPFrames = m_numLumaWPFrames; + stats->accBits = m_analyzeAll.m_accBits; + stats->elapsedEncodeTime = (double)(x265_mdate() - m_encodeStartTime) / 1000000; + if (stats->encodedPictureCount > 0) + { + stats->globalSsim = m_analyzeAll.m_globalSsim / stats->encodedPictureCount; + stats->globalPsnr = (stats->globalPsnrY * 6 + stats->globalPsnrU + stats->globalPsnrV) / (8 * stats->encodedPictureCount); + stats->elapsedVideoTime = (double)stats->encodedPictureCount * m_param->fpsDenom / m_param->fpsNum; + stats->bitrate = (0.001f * stats->accBits) / stats->elapsedVideoTime; + } + else + { + stats->globalSsim = 0; + stats->globalPsnr = 0; + stats->bitrate = 0; + stats->elapsedVideoTime = 0; + } + } + + /* If new statistics are added to x265_stats, we must check here whether the + * structure provided by the user is the new 
structure or an older one (for + * future safety) */ +} + +void Encoder::writeLog(int argc, char **argv) +{ + if (m_csvfpt) + { + if (m_param->logLevel >= X265_LOG_DEBUG) + { + // adding summary to a per-frame csv log file needs a summary header + fprintf(m_csvfpt, "\nSummary\n"); + fputs(summaryCSVHeader, m_csvfpt); + } + // CLI arguments or other + for (int i = 1; i < argc; i++) + { + if (i) fputc(' ', m_csvfpt); + fputs(argv[i], m_csvfpt); + } + + // current date and time + time_t now; + struct tm* timeinfo; + time(&now); + timeinfo = localtime(&now); + char buffer[200]; + strftime(buffer, 128, "%c", timeinfo); + fprintf(m_csvfpt, ", %s, ", buffer); + + x265_stats stats; + fetchStats(&stats, sizeof(stats)); + + // elapsed time, fps, bitrate + fprintf(m_csvfpt, "%.2f, %.2f, %.2f,", + stats.elapsedEncodeTime, stats.encodedPictureCount / stats.elapsedEncodeTime, stats.bitrate); + + if (m_param->bEnablePsnr) + fprintf(m_csvfpt, " %.3lf, %.3lf, %.3lf, %.3lf,", + stats.globalPsnrY / stats.encodedPictureCount, stats.globalPsnrU / stats.encodedPictureCount, + stats.globalPsnrV / stats.encodedPictureCount, stats.globalPsnr); + else + fprintf(m_csvfpt, " -, -, -, -,"); + if (m_param->bEnableSsim) + fprintf(m_csvfpt, " %.6f, %6.3f,", stats.globalSsim, x265_ssim2dB(stats.globalSsim)); + else + fprintf(m_csvfpt, " -, -,"); + + fputs(statsCSVString(m_analyzeI, buffer), m_csvfpt); + fputs(statsCSVString(m_analyzeP, buffer), m_csvfpt); + fputs(statsCSVString(m_analyzeB, buffer), m_csvfpt); + fprintf(m_csvfpt, " %s\n", x265_version_str); + } +} + +/** + * Produce an ascii(hex) representation of picture digest. + * + * Returns: a statically allocated null-terminated string. DO NOT FREE. 
+ */ +static const char*digestToString(const unsigned char digest[3][16], int numChar) +{ + const char* hex = "0123456789abcdef"; + static char string[99]; + int cnt = 0; + + for (int yuvIdx = 0; yuvIdx < 3; yuvIdx++) + { + for (int i = 0; i < numChar; i++) + { + string[cnt++] = hex[digest[yuvIdx][i] >> 4]; + string[cnt++] = hex[digest[yuvIdx][i] & 0xf]; + } + + string[cnt++] = ','; + } + + string[cnt - 1] = '\0'; + return string; +} + +void Encoder::finishFrameStats(Frame* curFrame, FrameEncoder *curEncoder, uint64_t bits) +{ + PicYuv* reconPic = curFrame->m_reconPicYuv; + + //===== calculate PSNR ===== + int width = reconPic->m_picWidth - m_sps.conformanceWindow.rightOffset; + int height = reconPic->m_picHeight - m_sps.conformanceWindow.bottomOffset; + int size = width * height; + + int maxvalY = 255 << (X265_DEPTH - 8); + int maxvalC = 255 << (X265_DEPTH - 8); + double refValueY = (double)maxvalY * maxvalY * size; + double refValueC = (double)maxvalC * maxvalC * size / 4.0; + uint64_t ssdY, ssdU, ssdV; + + ssdY = curEncoder->m_SSDY; + ssdU = curEncoder->m_SSDU; + ssdV = curEncoder->m_SSDV; + double psnrY = (ssdY ? 10.0 * log10(refValueY / (double)ssdY) : 99.99); + double psnrU = (ssdU ? 10.0 * log10(refValueC / (double)ssdU) : 99.99); + double psnrV = (ssdV ? 
10.0 * log10(refValueC / (double)ssdV) : 99.99); + + FrameData& curEncData = *curFrame->m_encData; + Slice* slice = curEncData.m_slice; + + //===== add bits, psnr and ssim ===== + m_analyzeAll.addBits(bits); + m_analyzeAll.addQP(curEncData.m_avgQpAq); + + if (m_param->bEnablePsnr) + m_analyzeAll.addPsnr(psnrY, psnrU, psnrV); + + double ssim = 0.0; + if (m_param->bEnableSsim && curEncoder->m_ssimCnt) + { + ssim = curEncoder->m_ssim / curEncoder->m_ssimCnt; + m_analyzeAll.addSsim(ssim); + } + if (slice->isIntra()) + { + m_analyzeI.addBits(bits); + m_analyzeI.addQP(curEncData.m_avgQpAq); + if (m_param->bEnablePsnr) + m_analyzeI.addPsnr(psnrY, psnrU, psnrV); + if (m_param->bEnableSsim) + m_analyzeI.addSsim(ssim); + } + else if (slice->isInterP()) + { + m_analyzeP.addBits(bits); + m_analyzeP.addQP(curEncData.m_avgQpAq); + if (m_param->bEnablePsnr) + m_analyzeP.addPsnr(psnrY, psnrU, psnrV); + if (m_param->bEnableSsim) + m_analyzeP.addSsim(ssim); + } + else if (slice->isInterB()) + { + m_analyzeB.addBits(bits); + m_analyzeB.addQP(curEncData.m_avgQpAq); + if (m_param->bEnablePsnr) + m_analyzeB.addPsnr(psnrY, psnrU, psnrV); + if (m_param->bEnableSsim) + m_analyzeB.addSsim(ssim); + } + + // if debug log level is enabled, per frame logging is performed + if (m_param->logLevel >= X265_LOG_DEBUG) + { + char c = (slice->isIntra() ? 'I' : slice->isInterP() ? 
'P' : 'B'); + int poc = slice->m_poc; + if (!IS_REFERENCED(curFrame)) + c += 32; // lower case if unreferenced + + char buf[1024]; + int p; + p = sprintf(buf, "POC:%d %c QP %2.2lf(%d) %10d bits", poc, c, curEncData.m_avgQpAq, slice->m_sliceQp, (int)bits); + if (m_param->rc.rateControlMode == X265_RC_CRF) + p += sprintf(buf + p, " RF:%.3lf", curEncData.m_rateFactor); + if (m_param->bEnablePsnr) + p += sprintf(buf + p, " [Y:%6.2lf U:%6.2lf V:%6.2lf]", psnrY, psnrU, psnrV); + if (m_param->bEnableSsim) + p += sprintf(buf + p, " [SSIM: %.3lfdB]", x265_ssim2dB(ssim)); + + if (!slice->isIntra()) + { + int numLists = slice->isInterP() ? 1 : 2; + for (int list = 0; list < numLists; list++) + { + p += sprintf(buf + p, " [L%d ", list); + for (int ref = 0; ref < slice->m_numRefIdx[list]; ref++) + { + int k = slice->m_refPOCList[list][ref] - slice->m_lastIDR; + p += sprintf(buf + p, "%d ", k); + } + + p += sprintf(buf + p, "]"); + } + } + + // per frame CSV logging if the file handle is valid + if (m_csvfpt) + { + fprintf(m_csvfpt, "%d, %c-SLICE, %4d, %2.2lf, %10d,", m_outputCount++, c, poc, curEncData.m_avgQpAq, (int)bits); + if (m_param->rc.rateControlMode == X265_RC_CRF) + fprintf(m_csvfpt, "%.3lf,", curEncData.m_rateFactor); + double psnr = (psnrY * 6 + psnrU + psnrV) / 8; + if (m_param->bEnablePsnr) + fprintf(m_csvfpt, "%.3lf, %.3lf, %.3lf, %.3lf,", psnrY, psnrU, psnrV, psnr); + else + fprintf(m_csvfpt, " -, -, -, -,"); + if (m_param->bEnableSsim) + fprintf(m_csvfpt, " %.6f, %6.3f,", ssim, x265_ssim2dB(ssim)); + else + fprintf(m_csvfpt, " -, -,"); + fprintf(m_csvfpt, " %.3lf, %.3lf", curEncoder->m_frameTime, curEncoder->m_elapsedCompressTime); + if (!slice->isIntra()) + { + int numLists = slice->isInterP() ? 
1 : 2; + for (int list = 0; list < numLists; list++) + { + fprintf(m_csvfpt, ", "); + for (int ref = 0; ref < slice->m_numRefIdx[list]; ref++) + { + int k = slice->m_refPOCList[list][ref] - slice->m_lastIDR; + fprintf(m_csvfpt, " %d", k); + } + } + + if (numLists == 1) + fprintf(m_csvfpt, ", -"); + } + else + fprintf(m_csvfpt, ", -, -"); + fprintf(m_csvfpt, "\n"); + } + + if (m_param->decodedPictureHashSEI && m_param->logLevel >= X265_LOG_FULL) + { + const char* digestStr = NULL; + if (m_param->decodedPictureHashSEI == 1) + { + digestStr = digestToString(curEncoder->m_seiReconPictureDigest.m_digest, 16); + p += sprintf(buf + p, " [MD5:%s]", digestStr); + } + else if (m_param->decodedPictureHashSEI == 2) + { + digestStr = digestToString(curEncoder->m_seiReconPictureDigest.m_digest, 2); + p += sprintf(buf + p, " [CRC:%s]", digestStr); + } + else if (m_param->decodedPictureHashSEI == 3) + { + digestStr = digestToString(curEncoder->m_seiReconPictureDigest.m_digest, 4); + p += sprintf(buf + p, " [Checksum:%s]", digestStr); + } + } + x265_log(m_param, X265_LOG_DEBUG, "%s\n", buf); + fflush(stderr); + } +} + +#if defined(_MSC_VER) +#pragma warning(disable: 4800) // forcing int to bool +#pragma warning(disable: 4127) // conditional expression is constant +#endif + +void Encoder::getStreamHeaders(NALList& list, Entropy& sbacCoder, Bitstream& bs) +{ + sbacCoder.setBitstream(&bs); + + /* headers for start of bitstream */ + bs.resetBits(); + sbacCoder.codeVPS(m_vps); + bs.writeByteAlignment(); + list.serialize(NAL_UNIT_VPS, bs); + + bs.resetBits(); + sbacCoder.codeSPS(m_sps, m_scalingList, m_vps.ptl); + bs.writeByteAlignment(); + list.serialize(NAL_UNIT_SPS, bs); + + bs.resetBits(); + sbacCoder.codePPS(m_pps); + bs.writeByteAlignment(); + list.serialize(NAL_UNIT_PPS, bs); + + if (m_param->bEmitInfoSEI) + { + char *opts = x265_param2string(m_param); + if (opts) + { + char *buffer = X265_MALLOC(char, strlen(opts) + strlen(x265_version_str) + + strlen(x265_build_info_str) + 200); 
+ if (buffer) + { + sprintf(buffer, "x265 (build %d) - %s:%s - H.265/HEVC codec - " + "Copyright 2013-2014 (c) Multicoreware Inc - " + "http://x265.org - options: %s", + X265_BUILD, x265_version_str, x265_build_info_str, opts); + + bs.resetBits(); + SEIuserDataUnregistered idsei; + idsei.m_userData = (uint8_t*)buffer; + idsei.m_userDataLength = (uint32_t)strlen(buffer); + idsei.write(bs, m_sps); + bs.writeByteAlignment(); + list.serialize(NAL_UNIT_PREFIX_SEI, bs); + + X265_FREE(buffer); + } + + X265_FREE(opts); + } + } + + if (m_param->bEmitHRDSEI || !!m_param->interlaceMode) + { + /* Picture Timing and Buffering Period SEI require the SPS to be "activated" */ + SEIActiveParameterSets sei; + sei.m_selfContainedCvsFlag = true; + sei.m_noParamSetUpdateFlag = true; + + bs.resetBits(); + sei.write(bs, m_sps); + bs.writeByteAlignment(); + list.serialize(NAL_UNIT_PREFIX_SEI, bs); + } +} + +void Encoder::initSPS(SPS *sps) +{ + m_vps.ptl.progressiveSourceFlag = !m_param->interlaceMode; + m_vps.ptl.interlacedSourceFlag = !!m_param->interlaceMode; + m_vps.ptl.nonPackedConstraintFlag = false; + m_vps.ptl.frameOnlyConstraintFlag = false; + + sps->conformanceWindow = m_conformanceWindow; + sps->chromaFormatIdc = m_param->internalCsp; + sps->picWidthInLumaSamples = m_param->sourceWidth; + sps->picHeightInLumaSamples = m_param->sourceHeight; + sps->numCuInWidth = (m_param->sourceWidth + g_maxCUSize - 1) / g_maxCUSize; + sps->numCuInHeight = (m_param->sourceHeight + g_maxCUSize - 1) / g_maxCUSize; + sps->numCUsInFrame = sps->numCuInWidth * sps->numCuInHeight; + sps->numPartitions = NUM_CU_PARTITIONS; + sps->numPartInCUSize = 1 << g_maxFullDepth; + + sps->log2MinCodingBlockSize = g_maxLog2CUSize - g_maxCUDepth; + sps->log2DiffMaxMinCodingBlockSize = g_maxCUDepth; + + sps->quadtreeTULog2MaxSize = X265_MIN(g_maxLog2CUSize, 5); + sps->quadtreeTULog2MinSize = 2; + sps->quadtreeTUMaxDepthInter = m_param->tuQTMaxInterDepth; + sps->quadtreeTUMaxDepthIntra = m_param->tuQTMaxIntraDepth; + + 
sps->bUseSAO = m_param->bEnableSAO; + + sps->bUseAMP = m_param->bEnableAMP; + sps->maxAMPDepth = m_param->bEnableAMP ? g_maxCUDepth : 0; + + sps->maxDecPicBuffering = m_vps.maxDecPicBuffering; + sps->numReorderPics = m_vps.numReorderPics; + + sps->bUseStrongIntraSmoothing = m_param->bEnableStrongIntraSmoothing; + sps->bTemporalMVPEnabled = m_param->bEnableTemporalMvp; + + VUI& vui = sps->vuiParameters; + vui.aspectRatioInfoPresentFlag = !!m_param->vui.aspectRatioIdc; + vui.aspectRatioIdc = m_param->vui.aspectRatioIdc; + vui.sarWidth = m_param->vui.sarWidth; + vui.sarHeight = m_param->vui.sarHeight; + + vui.overscanInfoPresentFlag = m_param->vui.bEnableOverscanInfoPresentFlag; + vui.overscanAppropriateFlag = m_param->vui.bEnableOverscanAppropriateFlag; + + vui.videoSignalTypePresentFlag = m_param->vui.bEnableVideoSignalTypePresentFlag; + vui.videoFormat = m_param->vui.videoFormat; + vui.videoFullRangeFlag = m_param->vui.bEnableVideoFullRangeFlag; + + vui.colourDescriptionPresentFlag = m_param->vui.bEnableColorDescriptionPresentFlag; + vui.colourPrimaries = m_param->vui.colorPrimaries; + vui.transferCharacteristics = m_param->vui.transferCharacteristics; + vui.matrixCoefficients = m_param->vui.matrixCoeffs; + + vui.chromaLocInfoPresentFlag = m_param->vui.bEnableChromaLocInfoPresentFlag; + vui.chromaSampleLocTypeTopField = m_param->vui.chromaSampleLocTypeTopField; + vui.chromaSampleLocTypeBottomField = m_param->vui.chromaSampleLocTypeBottomField; + + vui.defaultDisplayWindow.bEnabled = m_param->vui.bEnableDefaultDisplayWindowFlag; + vui.defaultDisplayWindow.rightOffset = m_param->vui.defDispWinRightOffset; + vui.defaultDisplayWindow.topOffset = m_param->vui.defDispWinTopOffset; + vui.defaultDisplayWindow.bottomOffset = m_param->vui.defDispWinBottomOffset; + vui.defaultDisplayWindow.leftOffset = m_param->vui.defDispWinLeftOffset; + + vui.frameFieldInfoPresentFlag = !!m_param->interlaceMode; + vui.fieldSeqFlag = !!m_param->interlaceMode; + + vui.hrdParametersPresentFlag 
= m_param->bEmitHRDSEI; + + vui.timingInfo.numUnitsInTick = m_param->fpsDenom; + vui.timingInfo.timeScale = m_param->fpsNum; +} + +void Encoder::initPPS(PPS *pps) +{ + bool bIsVbv = m_param->rc.vbvBufferSize > 0 && m_param->rc.vbvMaxBitrate > 0; + + if (!m_param->bLossless && (m_param->rc.aqMode || bIsVbv)) + { + pps->bUseDQP = true; + pps->maxCuDQPDepth = 0; /* TODO: make configurable? */ + } + else + { + pps->bUseDQP = false; + pps->maxCuDQPDepth = 0; + } + + pps->chromaCbQpOffset = m_param->cbQpOffset; + pps->chromaCrQpOffset = m_param->crQpOffset; + + pps->bConstrainedIntraPred = m_param->bEnableConstrainedIntra; + pps->bUseWeightPred = m_param->bEnableWeightedPred; + pps->bUseWeightedBiPred = m_param->bEnableWeightedBiPred; + pps->bTransquantBypassEnabled = m_param->bCULossless || m_param->bLossless; + pps->bTransformSkipEnabled = m_param->bEnableTransformSkip; + pps->bSignHideEnabled = m_param->bEnableSignHiding; + + /* If offsets are ever configured, enable bDeblockingFilterControlPresent and set + * deblockingFilterBetaOffsetDiv2 / deblockingFilterTcOffsetDiv2 */ + bool bDeblockOffsetInPPS = 0; + pps->bDeblockingFilterControlPresent = !m_param->bEnableLoopFilter || bDeblockOffsetInPPS; + pps->bPicDisableDeblockingFilter = !m_param->bEnableLoopFilter; + pps->deblockingFilterBetaOffsetDiv2 = 0; + pps->deblockingFilterTcOffsetDiv2 = 0; + + pps->bEntropyCodingSyncEnabled = m_param->bEnableWavefront; +} + +void Encoder::configure(x265_param *p) +{ + this->m_param = p; + + if (p->keyframeMax < 0) + { + /* A negative max GOP size indicates the user wants only one I frame at + * the start of the stream. 
Set an infinite GOP distance and disable + * adaptive I frame placement */ + p->keyframeMax = INT_MAX; + p->scenecutThreshold = 0; + } + else if (p->keyframeMax <= 1) + { + // disable lookahead for all-intra encodes + p->bFrameAdaptive = 0; + p->bframes = 0; + } + if (!p->keyframeMin) + { + double fps = (double)p->fpsNum / p->fpsDenom; + p->keyframeMin = X265_MIN((int)fps, p->keyframeMax / 10); + } + p->keyframeMin = X265_MAX(1, X265_MIN(p->keyframeMin, p->keyframeMax / 2 + 1)); + + if (p->bBPyramid && !p->bframes) + p->bBPyramid = 0; + + /* Disable features which are not supported by the current RD level */ + if (p->rdLevel < 4) + { + if (p->psyRdoq > 0) /* impossible */ + x265_log(p, X265_LOG_WARNING, "--psy-rdoq disabled, requires --rdlevel 4 or higher\n"); + p->psyRdoq = 0; + } + if (p->rdLevel < 3) + { + if (p->bCULossless) /* impossible */ + x265_log(p, X265_LOG_WARNING, "--cu-lossless disabled, requires --rdlevel 3 or higher\n"); + if (p->bEnableTransformSkip) /* impossible */ + x265_log(p, X265_LOG_WARNING, "--tskip disabled, requires --rdlevel 3 or higher\n"); + p->bCULossless = p->bEnableTransformSkip = 0; + } + if (p->rdLevel < 2) + { + if (p->bDistributeModeAnalysis) /* not useful */ + x265_log(p, X265_LOG_WARNING, "--pmode disabled, requires --rdlevel 2 or higher\n"); + p->bDistributeModeAnalysis = 0; + + if (p->psyRd > 0) /* impossible */ + x265_log(p, X265_LOG_WARNING, "--psy-rd disabled, requires --rdlevel 2 or higher\n"); + p->psyRd = 0; + + if (p->bEnableRectInter) /* broken, not very useful */ + x265_log(p, X265_LOG_WARNING, "--rect disabled, requires --rdlevel 2 or higher\n"); + p->bEnableRectInter = 0; + } + + if (!p->bEnableRectInter) /* not useful */ + p->bEnableAMP = false; + + /* In 444, chroma gets twice as much resolution, so halve quality when psy-rd is enabled */ + if (p->internalCsp == X265_CSP_I444 && p->psyRd) + { + p->cbQpOffset += 6; + p->crQpOffset += 6; + } + + if (p->bLossless) + { + p->rc.rateControlMode = X265_RC_CQP; + 
p->rc.qp = 4; // An oddity, QP=4 is more lossless than QP=0 and gives better lambdas + p->bEnableSsim = 0; + p->bEnablePsnr = 0; + } + + if (p->rc.rateControlMode == X265_RC_CQP) + { + p->rc.aqMode = X265_AQ_NONE; + p->rc.bitrate = 0; + p->rc.cuTree = 0; + p->rc.aqStrength = 0; + } + + if (p->rc.aqMode == 0 && p->rc.cuTree) + { + p->rc.aqMode = X265_AQ_VARIANCE; + p->rc.aqStrength = 0.0; + } + + if (p->lookaheadDepth == 0 && p->rc.cuTree && !p->rc.bStatRead) + { + x265_log(p, X265_LOG_WARNING, "cuTree disabled, requires lookahead to be enabled\n"); + p->rc.cuTree = 0; + } + + if (p->rc.aqStrength == 0 && p->rc.cuTree == 0) + p->rc.aqMode = X265_AQ_NONE; + + if (p->rc.aqMode == X265_AQ_NONE && p->rc.cuTree == 0) + p->rc.aqStrength = 0; + + if (p->internalCsp != X265_CSP_I420) + { + x265_log(p, X265_LOG_WARNING, "!! HEVC Range Extension specifications are not finalized !!\n"); + x265_log(p, X265_LOG_WARNING, "!! This output bitstream may not be compliant with the final spec !!\n"); + } + + if (p->scalingLists && p->internalCsp == X265_CSP_I444) + { + x265_log(p, X265_LOG_WARNING, "Scaling lists are not yet supported for 4:4:4 color space\n"); + p->scalingLists = 0; + } + + if (p->interlaceMode) + x265_log(p, X265_LOG_WARNING, "Support for interlaced video is experimental\n"); + + if (p->rc.rfConstantMin > p->rc.rfConstant) + { + x265_log(m_param, X265_LOG_WARNING, "CRF min must be less than CRF\n"); + p->rc.rfConstantMin = 0; + } + + m_bframeDelay = p->bframes ? (p->bBPyramid ? 2 : 1) : 0; + + p->bFrameBias = X265_MIN(X265_MAX(-90, p->bFrameBias), 100); + + if (p->logLevel < X265_LOG_INFO) + { + /* don't measure these metrics if they will not be reported */ + p->bEnablePsnr = 0; + p->bEnableSsim = 0; + } + /* Warn users trying to measure PSNR/SSIM with psy opts on. */ + if (p->bEnablePsnr || p->bEnableSsim) + { + const char *s = NULL; + + if (p->psyRd || p->psyRdoq) + { + s = p->bEnablePsnr ? 
"psnr" : "ssim"; + x265_log(p, X265_LOG_WARNING, "--%s used with psy on: results will be invalid!\n", s); + } + else if (!p->rc.aqMode && p->bEnableSsim) + { + x265_log(p, X265_LOG_WARNING, "--ssim used with AQ off: results will be invalid!\n"); + s = "ssim"; + } + else if (p->rc.aqStrength > 0 && p->bEnablePsnr) + { + x265_log(p, X265_LOG_WARNING, "--psnr used with AQ on: results will be invalid!\n"); + s = "psnr"; + } + if (s) + x265_log(p, X265_LOG_WARNING, "--tune %s should be used if attempting to benchmark %s!\n", s, s); + } + + //========= set default display window ================================== + m_conformanceWindow.bEnabled = false; + m_conformanceWindow.rightOffset = 0; + m_conformanceWindow.topOffset = 0; + m_conformanceWindow.bottomOffset = 0; + m_conformanceWindow.leftOffset = 0; + + //======== set pad size if width is not multiple of the minimum CU size ========= + const uint32_t minCUSize = MIN_CU_SIZE; + if (p->sourceWidth & (minCUSize - 1)) + { + uint32_t rem = p->sourceWidth & (minCUSize - 1); + uint32_t padsize = minCUSize - rem; + p->sourceWidth += padsize; + + /* set the confirmation window offsets */ + m_conformanceWindow.bEnabled = true; + m_conformanceWindow.rightOffset = padsize; + } + + //======== set pad size if height is not multiple of the minimum CU size ========= + if (p->sourceHeight & (minCUSize - 1)) + { + uint32_t rem = p->sourceHeight & (minCUSize - 1); + uint32_t padsize = minCUSize - rem; + p->sourceHeight += padsize; + + /* set the confirmation window offsets */ + m_conformanceWindow.bEnabled = true; + m_conformanceWindow.bottomOffset = padsize; + } +} diff --git a/source/encoder/encoder.h b/source/encoder/encoder.h new file mode 100644 index 0000000..8a387c2 --- /dev/null +++ b/source/encoder/encoder.h @@ -0,0 +1,175 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute 
it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#ifndef X265_ENCODER_H +#define X265_ENCODER_H + +#include "common.h" +#include "slice.h" +#include "scalinglist.h" +#include "x265.h" +#include "nal.h" + +struct x265_encoder {}; + +namespace x265 { +// private namespace +extern const char g_sliceTypeToChar[3]; + +class Entropy; + +struct EncStats +{ + double m_psnrSumY; + double m_psnrSumU; + double m_psnrSumV; + double m_globalSsim; + double m_totalQp; + uint64_t m_accBits; + uint32_t m_numPics; + + EncStats() + { + m_psnrSumY = m_psnrSumU = m_psnrSumV = m_globalSsim = 0; + m_accBits = 0; + m_numPics = 0; + m_totalQp = 0; + } + + void addQP(double aveQp); + + void addPsnr(double psnrY, double psnrU, double psnrV); + + void addBits(uint64_t bits); + + void addSsim(double ssim); +}; + +class FrameEncoder; +class DPB; +class Lookahead; +class RateControl; +class ThreadPool; +struct ThreadLocalData; + +class Encoder : public x265_encoder +{ +private: + + int m_pocLast; // time index (POC) + int m_encodedFrameNum; + int m_outputCount; + + int m_bframeDelay; + int64_t m_firstPts; + int64_t m_bframeDelayTime; + int64_t m_prevReorderedPts[2]; + 
+ ThreadPool* m_threadPool; + FrameEncoder* m_frameEncoder; + DPB* m_dpb; + + Frame* m_exportedPic; + + int m_curEncoder; + + /* cached PicYuv offset arrays, shared by all instances of + * PicYuv created by this encoder */ + intptr_t* m_cuOffsetY; + intptr_t* m_cuOffsetC; + intptr_t* m_buOffsetY; + intptr_t* m_buOffsetC; + + /* Collect statistics globally */ + EncStats m_analyzeAll; + EncStats m_analyzeI; + EncStats m_analyzeP; + EncStats m_analyzeB; + FILE* m_csvfpt; + int64_t m_encodeStartTime; + + // weighted prediction + int m_numLumaWPFrames; // number of P frames with weighted luma reference + int m_numChromaWPFrames; // number of P frames with weighted chroma reference + int m_numLumaWPBiFrames; // number of B frames with weighted luma reference + int m_numChromaWPBiFrames; // number of B frames with weighted chroma reference + +public: + + int m_conformanceMode; + VPS m_vps; + SPS m_sps; + PPS m_pps; + NALList m_nalList; + ScalingList m_scalingList; // quantization matrix information + int m_numThreadLocalData; + + int m_lastBPSEI; + uint32_t m_numDelayedPic; + + x265_param* m_param; + RateControl* m_rateControl; + ThreadLocalData* m_threadLocalData; + Lookahead* m_lookahead; + Window m_conformanceWindow; + + bool m_aborted; // fatal error detected + + Encoder(); + + ~Encoder() {} + + void create(); + void destroy(); + void init(); + + int encode(const x265_picture* pic, x265_picture *pic_out); + + void getStreamHeaders(NALList& list, Entropy& sbacCoder, Bitstream& bs); + + void fetchStats(x265_stats* stats, size_t statsSizeBytes); + + void writeLog(int argc, char **argv); + + void printSummary(); + + char* statsString(EncStats&, char*); + + char* statsCSVString(EncStats& stat, char* buffer); + + void setThreadPool(ThreadPool* p) { m_threadPool = p; } + + void configure(x265_param *param); + + void updateVbvPlan(RateControl* rc); + +protected: + + void initSPS(SPS *sps); + void initPPS(PPS *pps); + + void finishFrameStats(Frame* pic, FrameEncoder 
*curEncoder, uint64_t bits); +}; +} + +#endif // ifndef X265_ENCODER_H diff --git a/source/encoder/entropy.cpp b/source/encoder/entropy.cpp new file mode 100644 index 0000000..13eaf57 --- /dev/null +++ b/source/encoder/entropy.cpp @@ -0,0 +1,2172 @@ +/***************************************************************************** +* Copyright (C) 2013 x265 project +* +* Authors: Steve Borho +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation; either version 2 of the License, or +* (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software +* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +* +* This program is also available under a commercial proprietary license. +* For more information, contact us at license @ x265.com. 
+*****************************************************************************/ + +#include "common.h" +#include "framedata.h" +#include "scalinglist.h" +#include "quant.h" +#include "contexts.h" +#include "picyuv.h" + +#include "sao.h" +#include "entropy.h" + +#define CU_DQP_TU_CMAX 5 // max number bins for truncated unary +#define CU_DQP_EG_k 0 // exp-golomb order +#define START_VALUE 8 // start value for dpcm mode + +static const uint32_t g_puOffset[8] = { 0, 8, 4, 4, 2, 10, 1, 5 }; + +namespace x265 { + +Entropy::Entropy() +{ + markValid(); + m_fracBits = 0; + X265_CHECK(sizeof(m_contextState) >= sizeof(m_contextState[0]) * MAX_OFF_CTX_MOD, "context state table is too small\n"); +} + +void Entropy::codeVPS(const VPS& vps) +{ + WRITE_CODE(0, 4, "vps_video_parameter_set_id"); + WRITE_CODE(3, 2, "vps_reserved_three_2bits"); + WRITE_CODE(0, 6, "vps_reserved_zero_6bits"); + WRITE_CODE(0, 3, "vps_max_sub_layers_minus1"); + WRITE_FLAG(1, "vps_temporal_id_nesting_flag"); + WRITE_CODE(0xffff, 16, "vps_reserved_ffff_16bits"); + + codeProfileTier(vps.ptl); + + WRITE_FLAG(true, "vps_sub_layer_ordering_info_present_flag"); + WRITE_UVLC(vps.maxDecPicBuffering - 1, "vps_max_dec_pic_buffering_minus1[i]"); + WRITE_UVLC(vps.numReorderPics, "vps_num_reorder_pics[i]"); + + WRITE_UVLC(0, "vps_max_latency_increase_plus1[i]"); + WRITE_CODE(0, 6, "vps_max_nuh_reserved_zero_layer_id"); + WRITE_UVLC(0, "vps_max_op_sets_minus1"); + WRITE_FLAG(0, "vps_timing_info_present_flag"); /* we signal timing info in SPS-VUI */ + WRITE_FLAG(0, "vps_extension_flag"); +} + +void Entropy::codeSPS(const SPS& sps, const ScalingList& scalingList, const ProfileTierLevel& ptl) +{ + WRITE_CODE(0, 4, "sps_video_parameter_set_id"); + WRITE_CODE(0, 3, "sps_max_sub_layers_minus1"); + WRITE_FLAG(1, "sps_temporal_id_nesting_flag"); + + codeProfileTier(ptl); + + WRITE_UVLC(0, "sps_seq_parameter_set_id"); + WRITE_UVLC(sps.chromaFormatIdc, "chroma_format_idc"); + + if (sps.chromaFormatIdc == X265_CSP_I444) + 
WRITE_FLAG(0, "separate_colour_plane_flag"); + + WRITE_UVLC(sps.picWidthInLumaSamples, "pic_width_in_luma_samples"); + WRITE_UVLC(sps.picHeightInLumaSamples, "pic_height_in_luma_samples"); + + const Window& conf = sps.conformanceWindow; + WRITE_FLAG(conf.bEnabled, "conformance_window_flag"); + if (conf.bEnabled) + { + int hShift = CHROMA_H_SHIFT(sps.chromaFormatIdc), vShift = CHROMA_V_SHIFT(sps.chromaFormatIdc); + WRITE_UVLC(conf.leftOffset >> hShift, "conf_win_left_offset"); + WRITE_UVLC(conf.rightOffset >> hShift, "conf_win_right_offset"); + WRITE_UVLC(conf.topOffset >> vShift, "conf_win_top_offset"); + WRITE_UVLC(conf.bottomOffset >> vShift, "conf_win_bottom_offset"); + } + + WRITE_UVLC(X265_DEPTH - 8, "bit_depth_luma_minus8"); + WRITE_UVLC(X265_DEPTH - 8, "bit_depth_chroma_minus8"); + WRITE_UVLC(BITS_FOR_POC - 4, "log2_max_pic_order_cnt_lsb_minus4"); + WRITE_FLAG(true, "sps_sub_layer_ordering_info_present_flag"); + + WRITE_UVLC(sps.maxDecPicBuffering - 1, "sps_max_dec_pic_buffering_minus1[i]"); + WRITE_UVLC(sps.numReorderPics, "sps_num_reorder_pics[i]"); + WRITE_UVLC(0, "sps_max_latency_increase_plus1[i]"); + + WRITE_UVLC(sps.log2MinCodingBlockSize - 3, "log2_min_coding_block_size_minus3"); + WRITE_UVLC(sps.log2DiffMaxMinCodingBlockSize, "log2_diff_max_min_coding_block_size"); + WRITE_UVLC(sps.quadtreeTULog2MinSize - 2, "log2_min_transform_block_size_minus2"); + WRITE_UVLC(sps.quadtreeTULog2MaxSize - sps.quadtreeTULog2MinSize, "log2_diff_max_min_transform_block_size"); + WRITE_UVLC(sps.quadtreeTUMaxDepthInter - 1, "max_transform_hierarchy_depth_inter"); + WRITE_UVLC(sps.quadtreeTUMaxDepthIntra - 1, "max_transform_hierarchy_depth_intra"); + WRITE_FLAG(scalingList.m_bEnabled, "scaling_list_enabled_flag"); + if (scalingList.m_bEnabled) + { + WRITE_FLAG(scalingList.m_bDataPresent, "sps_scaling_list_data_present_flag"); + if (scalingList.m_bDataPresent) + codeScalingList(scalingList); + } + WRITE_FLAG(sps.bUseAMP, "amp_enabled_flag"); + WRITE_FLAG(sps.bUseSAO, 
"sample_adaptive_offset_enabled_flag"); + + WRITE_FLAG(0, "pcm_enabled_flag"); + WRITE_UVLC(0, "num_short_term_ref_pic_sets"); + WRITE_FLAG(0, "long_term_ref_pics_present_flag"); + + WRITE_FLAG(sps.bTemporalMVPEnabled, "sps_temporal_mvp_enable_flag"); + WRITE_FLAG(sps.bUseStrongIntraSmoothing, "sps_strong_intra_smoothing_enable_flag"); + + WRITE_FLAG(1, "vui_parameters_present_flag"); + codeVUI(sps.vuiParameters); + + WRITE_FLAG(0, "sps_extension_flag"); +} + +void Entropy::codePPS(const PPS& pps) +{ + WRITE_UVLC(0, "pps_pic_parameter_set_id"); + WRITE_UVLC(0, "pps_seq_parameter_set_id"); + WRITE_FLAG(0, "dependent_slice_segments_enabled_flag"); + WRITE_FLAG(0, "output_flag_present_flag"); + WRITE_CODE(0, 3, "num_extra_slice_header_bits"); + WRITE_FLAG(pps.bSignHideEnabled, "sign_data_hiding_flag"); + WRITE_FLAG(0, "cabac_init_present_flag"); + WRITE_UVLC(0, "num_ref_idx_l0_default_active_minus1"); + WRITE_UVLC(0, "num_ref_idx_l1_default_active_minus1"); + + WRITE_SVLC(0, "init_qp_minus26"); + WRITE_FLAG(pps.bConstrainedIntraPred, "constrained_intra_pred_flag"); + WRITE_FLAG(pps.bTransformSkipEnabled, "transform_skip_enabled_flag"); + + WRITE_FLAG(pps.bUseDQP, "cu_qp_delta_enabled_flag"); + if (pps.bUseDQP) + WRITE_UVLC(pps.maxCuDQPDepth, "diff_cu_qp_delta_depth"); + + WRITE_SVLC(pps.chromaCbQpOffset, "pps_cb_qp_offset"); + WRITE_SVLC(pps.chromaCrQpOffset, "pps_cr_qp_offset"); + WRITE_FLAG(0, "pps_slice_chroma_qp_offsets_present_flag"); + + WRITE_FLAG(pps.bUseWeightPred, "weighted_pred_flag"); + WRITE_FLAG(pps.bUseWeightedBiPred, "weighted_bipred_flag"); + WRITE_FLAG(pps.bTransquantBypassEnabled, "transquant_bypass_enable_flag"); + WRITE_FLAG(0, "tiles_enabled_flag"); + WRITE_FLAG(pps.bEntropyCodingSyncEnabled, "entropy_coding_sync_enabled_flag"); + WRITE_FLAG(1, "loop_filter_across_slices_enabled_flag"); + + WRITE_FLAG(pps.bDeblockingFilterControlPresent, "deblocking_filter_control_present_flag"); + if (pps.bDeblockingFilterControlPresent) + { + WRITE_FLAG(0, 
"deblocking_filter_override_enabled_flag"); + WRITE_FLAG(pps.bPicDisableDeblockingFilter, "pps_disable_deblocking_filter_flag"); + if (!pps.bPicDisableDeblockingFilter) + { + WRITE_SVLC(pps.deblockingFilterBetaOffsetDiv2, "pps_beta_offset_div2"); + WRITE_SVLC(pps.deblockingFilterTcOffsetDiv2, "pps_tc_offset_div2"); + } + } + + WRITE_FLAG(0, "pps_scaling_list_data_present_flag"); + WRITE_FLAG(0, "lists_modification_present_flag"); + WRITE_UVLC(0, "log2_parallel_merge_level_minus2"); + WRITE_FLAG(0, "slice_segment_header_extension_present_flag"); + WRITE_FLAG(0, "pps_extension_flag"); +} + +void Entropy::codeProfileTier(const ProfileTierLevel& ptl) +{ + WRITE_CODE(0, 2, "XXX_profile_space[]"); + WRITE_FLAG(ptl.tierFlag, "XXX_tier_flag[]"); + WRITE_CODE(ptl.profileIdc, 5, "XXX_profile_idc[]"); + for (int j = 0; j < 32; j++) + WRITE_FLAG(ptl.profileCompatibilityFlag[j], "XXX_profile_compatibility_flag[][j]"); + + WRITE_FLAG(ptl.progressiveSourceFlag, "general_progressive_source_flag"); + WRITE_FLAG(ptl.interlacedSourceFlag, "general_interlaced_source_flag"); + WRITE_FLAG(ptl.nonPackedConstraintFlag, "general_non_packed_constraint_flag"); + WRITE_FLAG(ptl.frameOnlyConstraintFlag, "general_frame_only_constraint_flag"); + + if (ptl.profileIdc == Profile::MAINREXT || ptl.profileIdc == Profile::HIGHTHROUGHPUTREXT) + { + uint32_t bitDepthConstraint = ptl.bitDepthConstraint; + int csp = ptl.chromaFormatConstraint; + WRITE_FLAG(bitDepthConstraint<=12, "general_max_12bit_constraint_flag"); + WRITE_FLAG(bitDepthConstraint<=10, "general_max_10bit_constraint_flag"); + WRITE_FLAG(bitDepthConstraint<= 8 && csp != X265_CSP_I422 , "general_max_8bit_constraint_flag"); + WRITE_FLAG(csp == X265_CSP_I422 || csp == X265_CSP_I420 || csp == X265_CSP_I400, "general_max_422chroma_constraint_flag"); + WRITE_FLAG(csp == X265_CSP_I420 || csp == X265_CSP_I400, "general_max_420chroma_constraint_flag"); + WRITE_FLAG(csp == X265_CSP_I400, "general_max_monochrome_constraint_flag"); + 
WRITE_FLAG(ptl.intraConstraintFlag, "general_intra_constraint_flag"); + WRITE_FLAG(0, "general_one_picture_only_constraint_flag"); + WRITE_FLAG(ptl.lowerBitRateConstraintFlag, "general_lower_bit_rate_constraint_flag"); + WRITE_CODE(0 , 16, "XXX_reserved_zero_35bits[0..15]"); + WRITE_CODE(0 , 16, "XXX_reserved_zero_35bits[16..31]"); + WRITE_CODE(0 , 3, "XXX_reserved_zero_35bits[32..34]"); + } + else + { + WRITE_CODE(0, 16, "XXX_reserved_zero_44bits[0..15]"); + WRITE_CODE(0, 16, "XXX_reserved_zero_44bits[16..31]"); + WRITE_CODE(0, 12, "XXX_reserved_zero_44bits[32..43]"); + } + + WRITE_CODE(ptl.levelIdc, 8, "general_level_idc"); +} + +void Entropy::codeVUI(const VUI& vui) +{ + WRITE_FLAG(vui.aspectRatioInfoPresentFlag, "aspect_ratio_info_present_flag"); + if (vui.aspectRatioInfoPresentFlag) + { + WRITE_CODE(vui.aspectRatioIdc, 8, "aspect_ratio_idc"); + if (vui.aspectRatioIdc == 255) + { + WRITE_CODE(vui.sarWidth, 16, "sar_width"); + WRITE_CODE(vui.sarHeight, 16, "sar_height"); + } + } + + WRITE_FLAG(vui.overscanInfoPresentFlag, "overscan_info_present_flag"); + if (vui.overscanInfoPresentFlag) + WRITE_FLAG(vui.overscanAppropriateFlag, "overscan_appropriate_flag"); + + WRITE_FLAG(vui.videoSignalTypePresentFlag, "video_signal_type_present_flag"); + if (vui.videoSignalTypePresentFlag) + { + WRITE_CODE(vui.videoFormat, 3, "video_format"); + WRITE_FLAG(vui.videoFullRangeFlag, "video_full_range_flag"); + WRITE_FLAG(vui.colourDescriptionPresentFlag, "colour_description_present_flag"); + if (vui.colourDescriptionPresentFlag) + { + WRITE_CODE(vui.colourPrimaries, 8, "colour_primaries"); + WRITE_CODE(vui.transferCharacteristics, 8, "transfer_characteristics"); + WRITE_CODE(vui.matrixCoefficients, 8, "matrix_coefficients"); + } + } + + WRITE_FLAG(vui.chromaLocInfoPresentFlag, "chroma_loc_info_present_flag"); + if (vui.chromaLocInfoPresentFlag) + { + WRITE_UVLC(vui.chromaSampleLocTypeTopField, "chroma_sample_loc_type_top_field"); + WRITE_UVLC(vui.chromaSampleLocTypeBottomField, 
"chroma_sample_loc_type_bottom_field"); + } + + WRITE_FLAG(0, "neutral_chroma_indication_flag"); + WRITE_FLAG(vui.fieldSeqFlag, "field_seq_flag"); + WRITE_FLAG(vui.frameFieldInfoPresentFlag, "frame_field_info_present_flag"); + + WRITE_FLAG(vui.defaultDisplayWindow.bEnabled, "default_display_window_flag"); + if (vui.defaultDisplayWindow.bEnabled) + { + WRITE_UVLC(vui.defaultDisplayWindow.leftOffset, "def_disp_win_left_offset"); + WRITE_UVLC(vui.defaultDisplayWindow.rightOffset, "def_disp_win_right_offset"); + WRITE_UVLC(vui.defaultDisplayWindow.topOffset, "def_disp_win_top_offset"); + WRITE_UVLC(vui.defaultDisplayWindow.bottomOffset, "def_disp_win_bottom_offset"); + } + + WRITE_FLAG(1, "vui_timing_info_present_flag"); + WRITE_CODE(vui.timingInfo.numUnitsInTick, 32, "vui_num_units_in_tick"); + WRITE_CODE(vui.timingInfo.timeScale, 32, "vui_time_scale"); + WRITE_FLAG(0, "vui_poc_proportional_to_timing_flag"); + + WRITE_FLAG(vui.hrdParametersPresentFlag, "vui_hrd_parameters_present_flag"); + if (vui.hrdParametersPresentFlag) + codeHrdParameters(vui.hrdParameters); + + WRITE_FLAG(0, "bitstream_restriction_flag"); +} + +void Entropy::codeScalingList(const ScalingList& scalingList) +{ + for (int sizeId = 0; sizeId < ScalingList::NUM_SIZES; sizeId++) + { + for (int listId = 0; listId < ScalingList::NUM_LISTS; listId++) + { + int predList = scalingList.checkPredMode(sizeId, listId); + WRITE_FLAG(predList < 0, "scaling_list_pred_mode_flag"); + if (predList >= 0) + WRITE_UVLC(listId - predList, "scaling_list_pred_matrix_id_delta"); + else // DPCM Mode + codeScalingList(scalingList, sizeId, listId); + } + } +} + +void Entropy::codeScalingList(const ScalingList& scalingList, uint32_t sizeId, uint32_t listId) +{ + int coefNum = X265_MIN(ScalingList::MAX_MATRIX_COEF_NUM, (int)ScalingList::s_numCoefPerSize[sizeId]); + const uint16_t* scan = (sizeId == 0 ? 
g_scan4x4[SCAN_DIAG] : g_scan8x8diag); + int nextCoef = START_VALUE; + int32_t *src = scalingList.m_scalingListCoef[sizeId][listId]; + int data; + + if (sizeId > BLOCK_8x8) + { + WRITE_SVLC(scalingList.m_scalingListDC[sizeId][listId] - 8, "scaling_list_dc_coef_minus8"); + nextCoef = scalingList.m_scalingListDC[sizeId][listId]; + } + for (int i = 0; i < coefNum; i++) + { + data = src[scan[i]] - nextCoef; + nextCoef = src[scan[i]]; + if (data > 127) + data = data - 256; + if (data < -128) + data = data + 256; + + WRITE_SVLC(data, "scaling_list_delta_coef"); + } +} + +void Entropy::codeHrdParameters(const HRDInfo& hrd) +{ + WRITE_FLAG(1, "nal_hrd_parameters_present_flag"); + WRITE_FLAG(0, "vcl_hrd_parameters_present_flag"); + WRITE_FLAG(0, "sub_pic_hrd_params_present_flag"); + + WRITE_CODE(hrd.bitRateScale, 4, "bit_rate_scale"); + WRITE_CODE(hrd.cpbSizeScale, 4, "cpb_size_scale"); + + WRITE_CODE(hrd.initialCpbRemovalDelayLength - 1, 5, "initial_cpb_removal_delay_length_minus1"); + WRITE_CODE(hrd.cpbRemovalDelayLength - 1, 5, "au_cpb_removal_delay_length_minus1"); + WRITE_CODE(hrd.dpbOutputDelayLength - 1, 5, "dpb_output_delay_length_minus1"); + + WRITE_FLAG(1, "fixed_pic_rate_general_flag"); + WRITE_UVLC(0, "elemental_duration_in_tc_minus1"); + WRITE_UVLC(0, "cpb_cnt_minus1"); + + WRITE_UVLC(hrd.bitRateValue - 1, "bit_rate_value_minus1"); + WRITE_UVLC(hrd.cpbSizeValue - 1, "cpb_size_value_minus1"); + WRITE_FLAG(hrd.cbrFlag, "cbr_flag"); +} + +void Entropy::codeAUD(const Slice& slice) +{ + int picType; + + switch (slice.m_sliceType) + { + case I_SLICE: + picType = 0; + break; + case P_SLICE: + picType = 1; + break; + case B_SLICE: + picType = 2; + break; + default: + picType = 7; + break; + } + + WRITE_CODE(picType, 3, "pic_type"); +} + +void Entropy::codeSliceHeader(const Slice& slice, FrameData& encData) +{ + WRITE_FLAG(1, "first_slice_segment_in_pic_flag"); + if (slice.getRapPicFlag()) + WRITE_FLAG(0, "no_output_of_prior_pics_flag"); + + WRITE_UVLC(0, 
"slice_pic_parameter_set_id"); + + /* x265 does not use dependent slices, so always write all this data */ + + WRITE_UVLC(slice.m_sliceType, "slice_type"); + + if (!slice.getIdrPicFlag()) + { + int picOrderCntLSB = (slice.m_poc - slice.m_lastIDR + (1 << BITS_FOR_POC)) % (1 << BITS_FOR_POC); + WRITE_CODE(picOrderCntLSB, BITS_FOR_POC, "pic_order_cnt_lsb"); + +#if _DEBUG || CHECKED_BUILD + // check for bitstream restriction stating that: + // If the current picture is a BLA or CRA picture, the value of NumPocTotalCurr shall be equal to 0. + // Ideally this process should not be repeated for each slice in a picture + if (slice.isIRAP()) + for (int picIdx = 0; picIdx < slice.m_rps.numberOfPictures; picIdx++) + X265_CHECK(!slice.m_rps.bUsed[picIdx], "pic unused failure\n"); +#endif + + WRITE_FLAG(0, "short_term_ref_pic_set_sps_flag"); + codeShortTermRefPicSet(slice.m_rps); + + if (slice.m_sps->bTemporalMVPEnabled) + WRITE_FLAG(1, "slice_temporal_mvp_enable_flag"); + } + const SAOParam *saoParam = encData.m_saoParam; + if (slice.m_sps->bUseSAO) + { + WRITE_FLAG(saoParam->bSaoFlag[0], "slice_sao_luma_flag"); + WRITE_FLAG(saoParam->bSaoFlag[1], "slice_sao_chroma_flag"); + } + + // check if numRefIdx match the defaults (1, hard-coded in PPS). 
If not, override + // TODO: this might be a place to optimize a few bits per slice, by using param->refs for L0 default + + if (!slice.isIntra()) + { + bool overrideFlag = (slice.m_numRefIdx[0] != 1 || (slice.isInterB() && slice.m_numRefIdx[1] != 1)); + WRITE_FLAG(overrideFlag, "num_ref_idx_active_override_flag"); + if (overrideFlag) + { + WRITE_UVLC(slice.m_numRefIdx[0] - 1, "num_ref_idx_l0_active_minus1"); + if (slice.isInterB()) + WRITE_UVLC(slice.m_numRefIdx[1] - 1, "num_ref_idx_l1_active_minus1"); + else + { + X265_CHECK(slice.m_numRefIdx[1] == 0, "expected no L1 references for P slice\n"); + } + } + } + else + { + X265_CHECK(!slice.m_numRefIdx[0] && !slice.m_numRefIdx[1], "expected no references for I slice\n"); + } + + if (slice.isInterB()) + WRITE_FLAG(0, "mvd_l1_zero_flag"); + + if (slice.m_sps->bTemporalMVPEnabled) + { + if (slice.m_sliceType == B_SLICE) + WRITE_FLAG(slice.m_colFromL0Flag, "collocated_from_l0_flag"); + + if (slice.m_sliceType != I_SLICE && + ((slice.m_colFromL0Flag && slice.m_numRefIdx[0] > 1) || + (!slice.m_colFromL0Flag && slice.m_numRefIdx[1] > 1))) + { + WRITE_UVLC(slice.m_colRefIdx, "collocated_ref_idx"); + } + } + if ((slice.m_pps->bUseWeightPred && slice.m_sliceType == P_SLICE) || (slice.m_pps->bUseWeightedBiPred && slice.m_sliceType == B_SLICE)) + codePredWeightTable(slice); + + X265_CHECK(slice.m_maxNumMergeCand <= MRG_MAX_NUM_CANDS, "too many merge candidates\n"); + if (!slice.isIntra()) + WRITE_UVLC(MRG_MAX_NUM_CANDS - slice.m_maxNumMergeCand, "five_minus_max_num_merge_cand"); + + int code = slice.m_sliceQp - 26; + WRITE_SVLC(code, "slice_qp_delta"); + + bool isSAOEnabled = slice.m_sps->bUseSAO ? 
saoParam->bSaoFlag[0] || saoParam->bSaoFlag[1] : false; + bool isDBFEnabled = !slice.m_pps->bPicDisableDeblockingFilter; + + if (isSAOEnabled || isDBFEnabled) + WRITE_FLAG(slice.m_sLFaseFlag, "slice_loop_filter_across_slices_enabled_flag"); +} + +/** write wavefront substreams sizes for the slice header */ +void Entropy::codeSliceHeaderWPPEntryPoints(const Slice& slice, const uint32_t *substreamSizes, uint32_t maxOffset) +{ + uint32_t offsetLen = 1; + while (maxOffset >= (1U << offsetLen)) + { + offsetLen++; + X265_CHECK(offsetLen < 32, "offsetLen is too large\n"); + } + + uint32_t numRows = slice.m_sps->numCuInHeight - 1; + WRITE_UVLC(numRows, "num_entry_point_offsets"); + if (numRows > 0) + WRITE_UVLC(offsetLen - 1, "offset_len_minus1"); + + for (uint32_t i = 0; i < numRows; i++) + WRITE_CODE(substreamSizes[i] - 1, offsetLen, "entry_point_offset_minus1"); +} + +void Entropy::codeShortTermRefPicSet(const RPS& rps) +{ + WRITE_UVLC(rps.numberOfNegativePictures, "num_negative_pics"); + WRITE_UVLC(rps.numberOfPositivePictures, "num_positive_pics"); + int prev = 0; + for (int j = 0; j < rps.numberOfNegativePictures; j++) + { + WRITE_UVLC(prev - rps.deltaPOC[j] - 1, "delta_poc_s0_minus1"); + prev = rps.deltaPOC[j]; + WRITE_FLAG(rps.bUsed[j], "used_by_curr_pic_s0_flag"); + } + + prev = 0; + for (int j = rps.numberOfNegativePictures; j < rps.numberOfNegativePictures + rps.numberOfPositivePictures; j++) + { + WRITE_UVLC(rps.deltaPOC[j] - prev - 1, "delta_poc_s1_minus1"); + prev = rps.deltaPOC[j]; + WRITE_FLAG(rps.bUsed[j], "used_by_curr_pic_s1_flag"); + } +} + +void Entropy::encodeCTU(const CUData& ctu, const CUGeom& cuGeom) +{ + bool bEncodeDQP = ctu.m_slice->m_pps->bUseDQP; + encodeCU(ctu, cuGeom, 0, 0, bEncodeDQP); +} + +/* encode a CU block recursively */ +void Entropy::encodeCU(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, bool& bEncodeDQP) +{ + const Slice* slice = cu.m_slice; + + if (depth <= slice->m_pps->maxCuDQPDepth && 
slice->m_pps->bUseDQP) + bEncodeDQP = true; + + int cuSplitFlag = !(cuGeom.flags & CUGeom::LEAF); + int cuUnsplitFlag = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY); + + if (!cuUnsplitFlag) + { + uint32_t qNumParts = (NUM_CU_PARTITIONS >> (depth << 1)) >> 2; + for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++, absPartIdx += qNumParts) + { + const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx); + if (childCuData.flags & CUGeom::PRESENT) + encodeCU(cu, childCuData, absPartIdx, depth + 1, bEncodeDQP); + } + return; + } + + // We need to split, so don't try these modes. + if (cuSplitFlag) + codeSplitFlag(cu, absPartIdx, depth); + + if (depth < cu.m_cuDepth[absPartIdx] && depth < g_maxCUDepth) + { + uint32_t qNumParts = (NUM_CU_PARTITIONS >> (depth << 1)) >> 2; + + for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++, absPartIdx += qNumParts) + { + const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx); + encodeCU(cu, childCuData, absPartIdx, depth + 1, bEncodeDQP); + } + return; + } + + if (slice->m_pps->bTransquantBypassEnabled) + codeCUTransquantBypassFlag(cu.m_tqBypass[absPartIdx]); + + if (!slice->isIntra()) + codeSkipFlag(cu, absPartIdx); + + if (cu.isSkipped(absPartIdx)) + { + codeMergeIndex(cu, absPartIdx); + finishCU(cu, absPartIdx, depth); + return; + } + + if (!slice->isIntra()) + codePredMode(cu.m_predMode[absPartIdx]); + + codePartSize(cu, absPartIdx, depth); + + // prediction Info ( Intra : direction mode, Inter : Mv, reference idx ) + codePredInfo(cu, absPartIdx); + + uint32_t tuDepthRange[2]; + if (cu.isIntra(absPartIdx)) + cu.getIntraTUQtDepthRange(tuDepthRange, absPartIdx); + else + cu.getInterTUQtDepthRange(tuDepthRange, absPartIdx); + + // Encode Coefficients, allow codeCoeff() to modify bEncodeDQP + codeCoeff(cu, absPartIdx, depth, bEncodeDQP, tuDepthRange); + + // --- write terminating bit --- + finishCU(cu, absPartIdx, depth); +} + +/* finish encoding a cu and handle end-of-slice conditions */ 
+void Entropy::finishCU(const CUData& cu, uint32_t absPartIdx, uint32_t depth) +{ + const Slice* slice = cu.m_slice; + X265_CHECK(cu.m_slice->m_endCUAddr == cu.m_slice->realEndAddress(slice->m_endCUAddr), "real end address expected\n"); + uint32_t realEndAddress = slice->m_endCUAddr; + uint32_t cuAddr = cu.getSCUAddr() + absPartIdx; + + uint32_t granularityMask = g_maxCUSize - 1; + uint32_t cuSize = 1 << cu.m_log2CUSize[absPartIdx]; + uint32_t rpelx = cu.m_cuPelX + g_zscanToPelX[absPartIdx] + cuSize; + uint32_t bpely = cu.m_cuPelY + g_zscanToPelY[absPartIdx] + cuSize; + bool granularityBoundary = (((rpelx & granularityMask) == 0 || (rpelx == slice->m_sps->picWidthInLumaSamples )) && + ((bpely & granularityMask) == 0 || (bpely == slice->m_sps->picHeightInLumaSamples))); + + if (granularityBoundary) + { + // Encode slice finish + bool bTerminateSlice = false; + if (cuAddr + (NUM_CU_PARTITIONS >> (depth << 1)) == realEndAddress) + bTerminateSlice = true; + + // The 1-terminating bit is added to all streams, so don't add it here when it's 1. 
+ if (!bTerminateSlice) + encodeBinTrm(0); + + if (!m_bitIf) + resetBits(); // TODO: most likely unnecessary + } +} + +void Entropy::encodeTransform(const CUData& cu, CoeffCodeState& state, uint32_t offsetLuma, uint32_t offsetChroma, uint32_t absPartIdx, + uint32_t absPartIdxStep, uint32_t depth, uint32_t log2TrSize, uint32_t trIdx, bool& bCodeDQP, uint32_t depthRange[2]) +{ + const bool subdiv = cu.m_tuDepth[absPartIdx] + cu.m_cuDepth[absPartIdx] > (uint8_t)depth; + uint32_t hChromaShift = cu.m_hChromaShift; + uint32_t vChromaShift = cu.m_vChromaShift; + uint32_t cbfY = cu.getCbf(absPartIdx, TEXT_LUMA, trIdx); + uint32_t cbfU = cu.getCbf(absPartIdx, TEXT_CHROMA_U, trIdx); + uint32_t cbfV = cu.getCbf(absPartIdx, TEXT_CHROMA_V, trIdx); + + if (!trIdx) + state.bakAbsPartIdxCU = absPartIdx; + + if (log2TrSize == 2 && cu.m_chromaFormat != X265_CSP_I444) + { + uint32_t partNum = NUM_CU_PARTITIONS >> ((depth - 1) << 1); + if (!(absPartIdx & (partNum - 1))) + { + state.bakAbsPartIdx = absPartIdx; + state.bakChromaOffset = offsetChroma; + } + else if ((absPartIdx & (partNum - 1)) == (partNum - 1)) + { + cbfU = cu.getCbf(state.bakAbsPartIdx, TEXT_CHROMA_U, trIdx); + cbfV = cu.getCbf(state.bakAbsPartIdx, TEXT_CHROMA_V, trIdx); + } + } + + /* in each of these conditions, the subdiv flag is implied and not signaled, + * so we have checks to make sure the implied value matches our intentions */ + if (cu.m_predMode[absPartIdx] == MODE_INTRA && cu.m_partSize[absPartIdx] == SIZE_NxN && depth == cu.m_cuDepth[absPartIdx]) + { + X265_CHECK(subdiv, "intra NxN requires TU depth below CU depth\n"); + } + else if (cu.m_predMode[absPartIdx] == MODE_INTER && (cu.m_partSize[absPartIdx] != SIZE_2Nx2N) && depth == cu.m_cuDepth[absPartIdx] && + cu.m_slice->m_sps->quadtreeTUMaxDepthInter == 1) + { + X265_CHECK(subdiv, "inter TU must be smaller than CU when not 2Nx2N part size: log2TrSize %d, depthRange[0] %d\n", log2TrSize, depthRange[0]); + } + else if (log2TrSize > depthRange[1]) + { + 
X265_CHECK(subdiv, "TU is larger than the max allowed, it should have been split\n"); + } + else if (log2TrSize == cu.m_slice->m_sps->quadtreeTULog2MinSize || log2TrSize == depthRange[0]) + { + X265_CHECK(!subdiv, "min sized TU cannot be subdivided\n"); + } + else + { + X265_CHECK(log2TrSize > depthRange[0], "transform size failure\n"); + codeTransformSubdivFlag(subdiv, 5 - log2TrSize); + } + + const uint32_t trDepthCurr = depth - cu.m_cuDepth[absPartIdx]; + const bool bFirstCbfOfCU = trDepthCurr == 0; + + bool mCodeAll = true; + const uint32_t numPels = 1 << (log2TrSize * 2 - hChromaShift - vChromaShift); + if (numPels < (MIN_TU_SIZE * MIN_TU_SIZE)) + mCodeAll = false; + + if (bFirstCbfOfCU || mCodeAll) + { + uint32_t tuSize = 1 << log2TrSize; + if (bFirstCbfOfCU || cu.getCbf(absPartIdx, TEXT_CHROMA_U, trDepthCurr - 1)) + codeQtCbf(cu, absPartIdx, absPartIdxStep, (tuSize >> hChromaShift), (tuSize >> vChromaShift), TEXT_CHROMA_U, trDepthCurr, (subdiv == 0)); + if (bFirstCbfOfCU || cu.getCbf(absPartIdx, TEXT_CHROMA_V, trDepthCurr - 1)) + codeQtCbf(cu, absPartIdx, absPartIdxStep, (tuSize >> hChromaShift), (tuSize >> vChromaShift), TEXT_CHROMA_V, trDepthCurr, (subdiv == 0)); + } + else + { + X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_U, trDepthCurr) == cu.getCbf(absPartIdx, TEXT_CHROMA_U, trDepthCurr - 1), "chroma xform size match failure\n"); + X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_V, trDepthCurr) == cu.getCbf(absPartIdx, TEXT_CHROMA_V, trDepthCurr - 1), "chroma xform size match failure\n"); + } + + if (subdiv) + { + log2TrSize--; + uint32_t numCoeff = 1 << (log2TrSize * 2); + uint32_t numCoeffC = (numCoeff >> (hChromaShift + vChromaShift)); + trIdx++; + ++depth; + absPartIdxStep >>= 2; + const uint32_t partNum = NUM_CU_PARTITIONS >> (depth << 1); + + encodeTransform(cu, state, offsetLuma, offsetChroma, absPartIdx, absPartIdxStep, depth, log2TrSize, trIdx, bCodeDQP, depthRange); + + absPartIdx += partNum; + offsetLuma += numCoeff; + offsetChroma += 
numCoeffC; + encodeTransform(cu, state, offsetLuma, offsetChroma, absPartIdx, absPartIdxStep, depth, log2TrSize, trIdx, bCodeDQP, depthRange); + + absPartIdx += partNum; + offsetLuma += numCoeff; + offsetChroma += numCoeffC; + encodeTransform(cu, state, offsetLuma, offsetChroma, absPartIdx, absPartIdxStep, depth, log2TrSize, trIdx, bCodeDQP, depthRange); + + absPartIdx += partNum; + offsetLuma += numCoeff; + offsetChroma += numCoeffC; + encodeTransform(cu, state, offsetLuma, offsetChroma, absPartIdx, absPartIdxStep, depth, log2TrSize, trIdx, bCodeDQP, depthRange); + } + else + { + if (cu.m_predMode[absPartIdx] != MODE_INTRA && depth == cu.m_cuDepth[absPartIdx] && !cu.getCbf(absPartIdx, TEXT_CHROMA_U, 0) && !cu.getCbf(absPartIdx, TEXT_CHROMA_V, 0)) + { + X265_CHECK(cu.getCbf(absPartIdx, TEXT_LUMA, 0), "CBF should have been set\n"); + } + else + codeQtCbf(cu, absPartIdx, TEXT_LUMA, cu.m_tuDepth[absPartIdx]); + + if (cbfY || cbfU || cbfV) + { + // dQP: only for CTU once + if (cu.m_slice->m_pps->bUseDQP) + { + if (bCodeDQP) + { + codeDeltaQP(cu, state.bakAbsPartIdxCU); + bCodeDQP = false; + } + } + } + if (cbfY) + codeCoeffNxN(cu, cu.m_trCoeff[0] + offsetLuma, absPartIdx, log2TrSize, TEXT_LUMA); + + int chFmt = cu.m_chromaFormat; + if (log2TrSize == 2 && chFmt != X265_CSP_I444) + { + uint32_t partNum = NUM_CU_PARTITIONS >> ((depth - 1) << 1); + if ((absPartIdx & (partNum - 1)) == (partNum - 1)) + { + const uint32_t log2TrSizeC = 2; + const bool splitIntoSubTUs = (chFmt == X265_CSP_I422); + + uint32_t curPartNum = NUM_CU_PARTITIONS >> ((depth - 1) << 1); + + for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++) + { + TURecurse tuIterator(splitIntoSubTUs ? 
VERTICAL_SPLIT : DONT_SPLIT, curPartNum, state.bakAbsPartIdx); + const coeff_t* coeffChroma = cu.m_trCoeff[chromaId]; + do + { + uint32_t cbf = cu.getCbf(tuIterator.absPartIdxTURelCU, (TextType)chromaId, trIdx + splitIntoSubTUs); + if (cbf) + { + uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2); + codeCoeffNxN(cu, coeffChroma + state.bakChromaOffset + subTUOffset, tuIterator.absPartIdxTURelCU, log2TrSizeC, (TextType)chromaId); + } + } + while (tuIterator.isNextSection()); + } + } + } + else + { + uint32_t log2TrSizeC = log2TrSize - hChromaShift; + const bool splitIntoSubTUs = (chFmt == X265_CSP_I422); + uint32_t curPartNum = NUM_CU_PARTITIONS >> (depth << 1); + for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++) + { + TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, curPartNum, absPartIdx); + const coeff_t* coeffChroma = cu.m_trCoeff[chromaId]; + do + { + uint32_t cbf = cu.getCbf(tuIterator.absPartIdxTURelCU, (TextType)chromaId, trIdx + splitIntoSubTUs); + if (cbf) + { + uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2); + codeCoeffNxN(cu, coeffChroma + offsetChroma + subTUOffset, tuIterator.absPartIdxTURelCU, log2TrSizeC, (TextType)chromaId); + } + } + while (tuIterator.isNextSection()); + } + } + } +} + +void Entropy::codePredInfo(const CUData& cu, uint32_t absPartIdx) +{ + if (cu.isIntra(absPartIdx)) // If it is intra mode, encode intra prediction mode. 
+ { + codeIntraDirLumaAng(cu, absPartIdx, true); + if (cu.m_chromaFormat != X265_CSP_I400) + { + uint32_t chromaDirMode[NUM_CHROMA_MODE]; + cu.getAllowedChromaDir(absPartIdx, chromaDirMode); + + codeIntraDirChroma(cu, absPartIdx, chromaDirMode); + + if ((cu.m_chromaFormat == X265_CSP_I444) && (cu.m_partSize[absPartIdx] == SIZE_NxN)) + { + uint32_t partOffset = (NUM_CU_PARTITIONS >> (cu.m_cuDepth[absPartIdx] << 1)) >> 2; + for (uint32_t i = 1; i <= 3; i++) + { + uint32_t offset = absPartIdx + i * partOffset; + cu.getAllowedChromaDir(offset, chromaDirMode); + codeIntraDirChroma(cu, offset, chromaDirMode); + } + } + } + } + else // if it is inter mode, encode motion vector and reference index + codePUWise(cu, absPartIdx); +} + +/** encode motion information for every PU block */ +void Entropy::codePUWise(const CUData& cu, uint32_t absPartIdx) +{ + PartSize partSize = (PartSize)cu.m_partSize[absPartIdx]; + uint32_t numPU = (partSize == SIZE_2Nx2N ? 1 : (partSize == SIZE_NxN ? 4 : 2)); + uint32_t depth = cu.m_cuDepth[absPartIdx]; + uint32_t puOffset = (g_puOffset[uint32_t(partSize)] << (g_maxFullDepth - depth) * 2) >> 4; + + for (uint32_t puIdx = 0, subPartIdx = absPartIdx; puIdx < numPU; puIdx++, subPartIdx += puOffset) + { + codeMergeFlag(cu, subPartIdx); + if (cu.m_mergeFlag[subPartIdx]) + codeMergeIndex(cu, subPartIdx); + else + { + if (cu.m_slice->isInterB()) + codeInterDir(cu, subPartIdx); + + uint32_t interDir = cu.m_interDir[subPartIdx]; + for (uint32_t list = 0; list < 2; list++) + { + if (interDir & (1 << list)) + { + X265_CHECK(cu.m_slice->m_numRefIdx[list] > 0, "numRefs should have been > 0\n"); + + codeRefFrmIdxPU(cu, subPartIdx, list); + codeMvd(cu, subPartIdx, list); + codeMVPIdx(cu.m_mvpIdx[list][subPartIdx]); + } + } + } + } +} + +/** encode reference frame index for a PU block */ +void Entropy::codeRefFrmIdxPU(const CUData& cu, uint32_t absPartIdx, int list) +{ + X265_CHECK(!cu.isIntra(absPartIdx), "intra block not expected\n"); + + if 
(cu.m_slice->m_numRefIdx[list] > 1) + codeRefFrmIdx(cu, absPartIdx, list); +} + +void Entropy::codeCoeff(const CUData& cu, uint32_t absPartIdx, uint32_t depth, bool& bCodeDQP, uint32_t depthRange[2]) +{ + if (!cu.isIntra(absPartIdx)) + { + if (!(cu.m_mergeFlag[absPartIdx] && cu.m_partSize[absPartIdx] == SIZE_2Nx2N)) + codeQtRootCbf(cu.getQtRootCbf(absPartIdx)); + if (!cu.getQtRootCbf(absPartIdx)) + return; + } + + uint32_t log2CUSize = cu.m_log2CUSize[absPartIdx]; + uint32_t lumaOffset = absPartIdx << (LOG2_UNIT_SIZE * 2); + uint32_t chromaOffset = lumaOffset >> (cu.m_hChromaShift + cu.m_vChromaShift); + uint32_t absPartIdxStep = NUM_CU_PARTITIONS >> (depth << 1); + CoeffCodeState state; + encodeTransform(cu, state, lumaOffset, chromaOffset, absPartIdx, absPartIdxStep, depth, log2CUSize, 0, bCodeDQP, depthRange); +} + +void Entropy::codeSaoOffset(const SaoCtuParam& ctuParam, int plane) +{ + int typeIdx = ctuParam.typeIdx; + + if (plane != 2) + { + encodeBin(typeIdx >= 0, m_contextState[OFF_SAO_TYPE_IDX_CTX]); + if (typeIdx >= 0) + encodeBinEP(typeIdx < SAO_BO ? 
1 : 0); + } + + if (typeIdx >= 0) + { + enum { OFFSET_THRESH = 1 << X265_MIN(X265_DEPTH - 5, 5) }; + if (typeIdx == SAO_BO) + { + for (int i = 0; i < SAO_BO_LEN; i++) + codeSaoMaxUvlc(abs(ctuParam.offset[i]), OFFSET_THRESH - 1); + + for (int i = 0; i < SAO_BO_LEN; i++) + if (ctuParam.offset[i] != 0) + encodeBinEP(ctuParam.offset[i] < 0); + + encodeBinsEP(ctuParam.bandPos, 5); + } + else // if (typeIdx < SAO_BO) + { + codeSaoMaxUvlc(ctuParam.offset[0], OFFSET_THRESH - 1); + codeSaoMaxUvlc(ctuParam.offset[1], OFFSET_THRESH - 1); + codeSaoMaxUvlc(-ctuParam.offset[2], OFFSET_THRESH - 1); + codeSaoMaxUvlc(-ctuParam.offset[3], OFFSET_THRESH - 1); + if (plane != 2) + encodeBinsEP((uint32_t)(typeIdx), 2); + } + } +} + +/** initialize context model with respect to QP and initialization value */ +uint8_t sbacInit(int qp, int initValue) +{ + qp = Clip3(0, 51, qp); + + int slope = (initValue >> 4) * 5 - 45; + int offset = ((initValue & 15) << 3) - 16; + int initState = X265_MIN(X265_MAX(1, (((slope * qp) >> 4) + offset)), 126); + uint32_t mpState = (initState >= 64); + uint32_t state = ((mpState ? 
(initState - 64) : (63 - initState)) << 1) + mpState; + + return (uint8_t)state; +} + +static void initBuffer(uint8_t* contextModel, SliceType sliceType, int qp, uint8_t* ctxModel, int size) +{ + ctxModel += sliceType * size; + + for (int n = 0; n < size; n++) + contextModel[n] = sbacInit(qp, ctxModel[n]); +} + +void Entropy::resetEntropy(const Slice& slice) +{ + int qp = slice.m_sliceQp; + SliceType sliceType = slice.m_sliceType; + + initBuffer(&m_contextState[OFF_SPLIT_FLAG_CTX], sliceType, qp, (uint8_t*)INIT_SPLIT_FLAG, NUM_SPLIT_FLAG_CTX); + initBuffer(&m_contextState[OFF_SKIP_FLAG_CTX], sliceType, qp, (uint8_t*)INIT_SKIP_FLAG, NUM_SKIP_FLAG_CTX); + initBuffer(&m_contextState[OFF_MERGE_FLAG_EXT_CTX], sliceType, qp, (uint8_t*)INIT_MERGE_FLAG_EXT, NUM_MERGE_FLAG_EXT_CTX); + initBuffer(&m_contextState[OFF_MERGE_IDX_EXT_CTX], sliceType, qp, (uint8_t*)INIT_MERGE_IDX_EXT, NUM_MERGE_IDX_EXT_CTX); + initBuffer(&m_contextState[OFF_PART_SIZE_CTX], sliceType, qp, (uint8_t*)INIT_PART_SIZE, NUM_PART_SIZE_CTX); + initBuffer(&m_contextState[OFF_PRED_MODE_CTX], sliceType, qp, (uint8_t*)INIT_PRED_MODE, NUM_PRED_MODE_CTX); + initBuffer(&m_contextState[OFF_ADI_CTX], sliceType, qp, (uint8_t*)INIT_INTRA_PRED_MODE, NUM_ADI_CTX); + initBuffer(&m_contextState[OFF_CHROMA_PRED_CTX], sliceType, qp, (uint8_t*)INIT_CHROMA_PRED_MODE, NUM_CHROMA_PRED_CTX); + initBuffer(&m_contextState[OFF_DELTA_QP_CTX], sliceType, qp, (uint8_t*)INIT_DQP, NUM_DELTA_QP_CTX); + initBuffer(&m_contextState[OFF_INTER_DIR_CTX], sliceType, qp, (uint8_t*)INIT_INTER_DIR, NUM_INTER_DIR_CTX); + initBuffer(&m_contextState[OFF_REF_NO_CTX], sliceType, qp, (uint8_t*)INIT_REF_PIC, NUM_REF_NO_CTX); + initBuffer(&m_contextState[OFF_MV_RES_CTX], sliceType, qp, (uint8_t*)INIT_MVD, NUM_MV_RES_CTX); + initBuffer(&m_contextState[OFF_QT_CBF_CTX], sliceType, qp, (uint8_t*)INIT_QT_CBF, NUM_QT_CBF_CTX); + initBuffer(&m_contextState[OFF_TRANS_SUBDIV_FLAG_CTX], sliceType, qp, (uint8_t*)INIT_TRANS_SUBDIV_FLAG, NUM_TRANS_SUBDIV_FLAG_CTX); 
+ initBuffer(&m_contextState[OFF_QT_ROOT_CBF_CTX], sliceType, qp, (uint8_t*)INIT_QT_ROOT_CBF, NUM_QT_ROOT_CBF_CTX); + initBuffer(&m_contextState[OFF_SIG_CG_FLAG_CTX], sliceType, qp, (uint8_t*)INIT_SIG_CG_FLAG, 2 * NUM_SIG_CG_FLAG_CTX); + initBuffer(&m_contextState[OFF_SIG_FLAG_CTX], sliceType, qp, (uint8_t*)INIT_SIG_FLAG, NUM_SIG_FLAG_CTX); + initBuffer(&m_contextState[OFF_CTX_LAST_FLAG_X], sliceType, qp, (uint8_t*)INIT_LAST, NUM_CTX_LAST_FLAG_XY); + initBuffer(&m_contextState[OFF_CTX_LAST_FLAG_Y], sliceType, qp, (uint8_t*)INIT_LAST, NUM_CTX_LAST_FLAG_XY); + initBuffer(&m_contextState[OFF_ONE_FLAG_CTX], sliceType, qp, (uint8_t*)INIT_ONE_FLAG, NUM_ONE_FLAG_CTX); + initBuffer(&m_contextState[OFF_ABS_FLAG_CTX], sliceType, qp, (uint8_t*)INIT_ABS_FLAG, NUM_ABS_FLAG_CTX); + initBuffer(&m_contextState[OFF_MVP_IDX_CTX], sliceType, qp, (uint8_t*)INIT_MVP_IDX, NUM_MVP_IDX_CTX); + initBuffer(&m_contextState[OFF_SAO_MERGE_FLAG_CTX], sliceType, qp, (uint8_t*)INIT_SAO_MERGE_FLAG, NUM_SAO_MERGE_FLAG_CTX); + initBuffer(&m_contextState[OFF_SAO_TYPE_IDX_CTX], sliceType, qp, (uint8_t*)INIT_SAO_TYPE_IDX, NUM_SAO_TYPE_IDX_CTX); + initBuffer(&m_contextState[OFF_TRANSFORMSKIP_FLAG_CTX], sliceType, qp, (uint8_t*)INIT_TRANSFORMSKIP_FLAG, 2 * NUM_TRANSFORMSKIP_FLAG_CTX); + initBuffer(&m_contextState[OFF_TQUANT_BYPASS_FLAG_CTX], sliceType, qp, (uint8_t*)INIT_CU_TRANSQUANT_BYPASS_FLAG, NUM_TQUANT_BYPASS_FLAG_CTX); + // new structure + + start(); +} + +/* code explicit wp tables */ +void Entropy::codePredWeightTable(const Slice& slice) +{ + const WeightParam *wp; + bool bChroma = true; // 4:0:0 not yet supported + bool bDenomCoded = false; + int numRefDirs = slice.m_sliceType == B_SLICE ? 
2 : 1; + uint32_t totalSignalledWeightFlags = 0; + + if ((slice.m_sliceType == P_SLICE && slice.m_pps->bUseWeightPred) || + (slice.m_sliceType == B_SLICE && slice.m_pps->bUseWeightedBiPred)) + { + for (int list = 0; list < numRefDirs; list++) + { + for (int ref = 0; ref < slice.m_numRefIdx[list]; ref++) + { + wp = slice.m_weightPredTable[list][ref]; + if (!bDenomCoded) + { + WRITE_UVLC(wp[0].log2WeightDenom, "luma_log2_weight_denom"); + + if (bChroma) + { + int deltaDenom = wp[1].log2WeightDenom - wp[0].log2WeightDenom; + WRITE_SVLC(deltaDenom, "delta_chroma_log2_weight_denom"); + } + bDenomCoded = true; + } + WRITE_FLAG(wp[0].bPresentFlag, "luma_weight_lX_flag"); + totalSignalledWeightFlags += wp[0].bPresentFlag; + } + + if (bChroma) + { + for (int ref = 0; ref < slice.m_numRefIdx[list]; ref++) + { + wp = slice.m_weightPredTable[list][ref]; + WRITE_FLAG(wp[1].bPresentFlag, "chroma_weight_lX_flag"); + totalSignalledWeightFlags += 2 * wp[1].bPresentFlag; + } + } + + for (int ref = 0; ref < slice.m_numRefIdx[list]; ref++) + { + wp = slice.m_weightPredTable[list][ref]; + if (wp[0].bPresentFlag) + { + int deltaWeight = (wp[0].inputWeight - (1 << wp[0].log2WeightDenom)); + WRITE_SVLC(deltaWeight, "delta_luma_weight_lX"); + WRITE_SVLC(wp[0].inputOffset, "luma_offset_lX"); + } + + if (bChroma) + { + if (wp[1].bPresentFlag) + { + for (int plane = 1; plane < 3; plane++) + { + int deltaWeight = (wp[plane].inputWeight - (1 << wp[1].log2WeightDenom)); + WRITE_SVLC(deltaWeight, "delta_chroma_weight_lX"); + + int pred = (128 - ((128 * wp[plane].inputWeight) >> (wp[plane].log2WeightDenom))); + int deltaChroma = (wp[plane].inputOffset - pred); + WRITE_SVLC(deltaChroma, "delta_chroma_offset_lX"); + } + } + } + } + } + + X265_CHECK(totalSignalledWeightFlags <= 24, "total weights must be <= 24\n"); + } +} + +void Entropy::writeUnaryMaxSymbol(uint32_t symbol, uint8_t* scmModel, int offset, uint32_t maxSymbol) +{ + X265_CHECK(maxSymbol > 0, "maxSymbol too small\n"); + + 
encodeBin(symbol ? 1 : 0, scmModel[0]); + + if (!symbol) + return; + + bool bCodeLast = (maxSymbol > symbol); + + while (--symbol) + encodeBin(1, scmModel[offset]); + + if (bCodeLast) + encodeBin(0, scmModel[offset]); +} + +void Entropy::writeEpExGolomb(uint32_t symbol, uint32_t count) +{ + uint32_t bins = 0; + int numBins = 0; + + while (symbol >= (uint32_t)(1 << count)) + { + bins = 2 * bins + 1; + numBins++; + symbol -= 1 << count; + count++; + } + + bins = 2 * bins + 0; + numBins++; + + bins = (bins << count) | symbol; + numBins += count; + + X265_CHECK(numBins <= 32, "numBins too large\n"); + encodeBinsEP(bins, numBins); +} + +/** Coding of coeff_abs_level_minus3 */ +void Entropy::writeCoefRemainExGolomb(uint32_t codeNumber, uint32_t absGoRice) +{ + uint32_t length; + const uint32_t codeRemain = codeNumber & ((1 << absGoRice) - 1); + + if ((codeNumber >> absGoRice) < COEF_REMAIN_BIN_REDUCTION) + { + length = codeNumber >> absGoRice; + + X265_CHECK(codeNumber - (length << absGoRice) == (codeNumber & ((1 << absGoRice) - 1)), "codeNumber failure\n"); + X265_CHECK(length + 1 + absGoRice < 32, "length failure\n"); + encodeBinsEP((((1 << (length + 1)) - 2) << absGoRice) + codeRemain, length + 1 + absGoRice); + } + else + { + length = 0; + codeNumber = (codeNumber >> absGoRice) - COEF_REMAIN_BIN_REDUCTION; + if (codeNumber != 0) + { + unsigned long idx; + CLZ32(idx, codeNumber + 1); + length = idx; + codeNumber -= (1 << idx) - 1; + } + codeNumber = (codeNumber << absGoRice) + codeRemain; + + encodeBinsEP((1 << (COEF_REMAIN_BIN_REDUCTION + length + 1)) - 2, COEF_REMAIN_BIN_REDUCTION + length + 1); + encodeBinsEP(codeNumber, length + absGoRice); + } +} + +// SBAC RD +void Entropy::loadIntraDirModeLuma(const Entropy& src) +{ + X265_CHECK(src.m_valid, "invalid copy source context\n"); + m_fracBits = src.m_fracBits; + m_contextState[OFF_ADI_CTX] = src.m_contextState[OFF_ADI_CTX]; +} + +void Entropy::copyFrom(const Entropy& src) +{ + X265_CHECK(src.m_valid, "invalid copy 
source context\n"); + + copyState(src); + + memcpy(m_contextState, src.m_contextState, MAX_OFF_CTX_MOD * sizeof(uint8_t)); + markValid(); +} + +void Entropy::codeMVPIdx(uint32_t symbol) +{ + encodeBin(symbol, m_contextState[OFF_MVP_IDX_CTX]); +} + +void Entropy::codePartSize(const CUData& cu, uint32_t absPartIdx, uint32_t depth) +{ + PartSize partSize = (PartSize)cu.m_partSize[absPartIdx]; + + if (cu.isIntra(absPartIdx)) + { + if (depth == g_maxCUDepth) + encodeBin(partSize == SIZE_2Nx2N ? 1 : 0, m_contextState[OFF_PART_SIZE_CTX]); + return; + } + + switch (partSize) + { + case SIZE_2Nx2N: + encodeBin(1, m_contextState[OFF_PART_SIZE_CTX]); + break; + + case SIZE_2NxN: + case SIZE_2NxnU: + case SIZE_2NxnD: + encodeBin(0, m_contextState[OFF_PART_SIZE_CTX + 0]); + encodeBin(1, m_contextState[OFF_PART_SIZE_CTX + 1]); + if (cu.m_slice->m_sps->maxAMPDepth > depth) + { + encodeBin((partSize == SIZE_2NxN) ? 1 : 0, m_contextState[OFF_PART_SIZE_CTX + 3]); + if (partSize != SIZE_2NxN) + encodeBinEP((partSize == SIZE_2NxnU ? 0 : 1)); + } + break; + + case SIZE_Nx2N: + case SIZE_nLx2N: + case SIZE_nRx2N: + encodeBin(0, m_contextState[OFF_PART_SIZE_CTX + 0]); + encodeBin(0, m_contextState[OFF_PART_SIZE_CTX + 1]); + if (depth == g_maxCUDepth && !(cu.m_log2CUSize[absPartIdx] == 3)) + encodeBin(1, m_contextState[OFF_PART_SIZE_CTX + 2]); + if (cu.m_slice->m_sps->maxAMPDepth > depth) + { + encodeBin((partSize == SIZE_Nx2N) ? 1 : 0, m_contextState[OFF_PART_SIZE_CTX + 3]); + if (partSize != SIZE_Nx2N) + encodeBinEP((partSize == SIZE_nLx2N ? 0 : 1)); + } + break; + default: + X265_CHECK(0, "invalid CU partition\n"); + break; + } +} + +void Entropy::codePredMode(int predMode) +{ + encodeBin(predMode == MODE_INTER ? 
0 : 1, m_contextState[OFF_PRED_MODE_CTX]); +} + +void Entropy::codeCUTransquantBypassFlag(uint32_t symbol) +{ + encodeBin(symbol, m_contextState[OFF_TQUANT_BYPASS_FLAG_CTX]); +} + +void Entropy::codeSkipFlag(const CUData& cu, uint32_t absPartIdx) +{ + // get context function is here + uint32_t symbol = cu.isSkipped(absPartIdx) ? 1 : 0; + uint32_t ctxSkip = cu.getCtxSkipFlag(absPartIdx); + + encodeBin(symbol, m_contextState[OFF_SKIP_FLAG_CTX + ctxSkip]); +} + +void Entropy::codeMergeFlag(const CUData& cu, uint32_t absPartIdx) +{ + const uint32_t symbol = cu.m_mergeFlag[absPartIdx] ? 1 : 0; + + encodeBin(symbol, m_contextState[OFF_MERGE_FLAG_EXT_CTX]); +} + +void Entropy::codeMergeIndex(const CUData& cu, uint32_t absPartIdx) +{ + uint32_t numCand = cu.m_slice->m_maxNumMergeCand; + + if (numCand > 1) + { + uint32_t unaryIdx = cu.m_mvpIdx[0][absPartIdx]; // merge candidate index was stored in L0 MVP idx + encodeBin((unaryIdx != 0), m_contextState[OFF_MERGE_IDX_EXT_CTX]); + + X265_CHECK(unaryIdx < numCand, "unaryIdx out of range\n"); + + if (unaryIdx != 0) + { + uint32_t mask = (1 << unaryIdx) - 2; + mask >>= (unaryIdx == numCand - 1) ? 1 : 0; + encodeBinsEP(mask, unaryIdx - (unaryIdx == numCand - 1)); + } + } +} + +void Entropy::codeSplitFlag(const CUData& cu, uint32_t absPartIdx, uint32_t depth) +{ + X265_CHECK(depth < g_maxCUDepth, "invalid depth\n"); + + uint32_t ctx = cu.getCtxSplitFlag(absPartIdx, depth); + uint32_t currSplitFlag = (cu.m_cuDepth[absPartIdx] > depth) ? 
1 : 0; + + X265_CHECK(ctx < 3, "ctx out of range\n"); + encodeBin(currSplitFlag, m_contextState[OFF_SPLIT_FLAG_CTX + ctx]); +} + +void Entropy::codeTransformSubdivFlag(uint32_t symbol, uint32_t ctx) +{ + encodeBin(symbol, m_contextState[OFF_TRANS_SUBDIV_FLAG_CTX + ctx]); +} + +uint32_t Entropy::bitsIntraModeNonMPM() const +{ + uint32_t mstate = m_contextState[OFF_ADI_CTX]; + uint32_t bits = ((uint32_t)(m_fracBits & 32767) + sbacGetEntropyBits(mstate, 0)) >> 15; + return bits + 5; /* fixed cost for encodeBinsEP() */ +} + +uint32_t Entropy::bitsIntraModeMPM(const uint32_t preds[3], uint32_t dir) const +{ + X265_CHECK(dir == preds[0] || dir == preds[1] || dir == preds[2], "dir must be a most probable mode\n"); + uint32_t mstate = m_contextState[OFF_ADI_CTX]; + uint32_t bits = ((uint32_t)(m_fracBits & 32767) + sbacGetEntropyBits(mstate, 1)) >> 15; + return bits + (dir == preds[0] ? 1 : 2); +} + +void Entropy::codeIntraDirLumaAng(const CUData& cu, uint32_t absPartIdx, bool isMultiple) +{ + uint32_t dir[4], j; + uint32_t preds[4][3]; + int predIdx[4]; + PartSize mode = (PartSize)cu.m_partSize[absPartIdx]; + uint32_t partNum = isMultiple ? (mode == SIZE_NxN ? 4 : 1) : 1; + uint32_t partOffset = (NUM_CU_PARTITIONS >> (cu.m_cuDepth[absPartIdx] << 1)) >> 2; + + for (j = 0; j < partNum; j++) + { + dir[j] = cu.m_lumaIntraDir[absPartIdx + partOffset * j]; + cu.getIntraDirLumaPredictor(absPartIdx + partOffset * j, preds[j]); + predIdx[j] = -1; + for (uint32_t i = 0; i < 3; i++) + if (dir[j] == preds[j][i]) + predIdx[j] = i; + + encodeBin((predIdx[j] != -1) ? 
1 : 0, m_contextState[OFF_ADI_CTX]); + } + + for (j = 0; j < partNum; j++) + { + if (predIdx[j] != -1) + { + X265_CHECK((predIdx[j] >= 0) && (predIdx[j] <= 2), "predIdx out of range\n"); + // NOTE: Mapping + // 0 = 0 + // 1 = 10 + // 2 = 11 + int nonzero = (!!predIdx[j]); + encodeBinsEP(predIdx[j] + nonzero, 1 + nonzero); + } + else + { + if (preds[j][0] > preds[j][1]) + std::swap(preds[j][0], preds[j][1]); + + if (preds[j][0] > preds[j][2]) + std::swap(preds[j][0], preds[j][2]); + + if (preds[j][1] > preds[j][2]) + std::swap(preds[j][1], preds[j][2]); + + dir[j] += (dir[j] > preds[j][2]) ? -1 : 0; + dir[j] += (dir[j] > preds[j][1]) ? -1 : 0; + dir[j] += (dir[j] > preds[j][0]) ? -1 : 0; + + encodeBinsEP(dir[j], 5); + } + } +} + +void Entropy::codeIntraDirChroma(const CUData& cu, uint32_t absPartIdx, uint32_t *chromaDirMode) +{ + uint32_t intraDirChroma = cu.m_chromaIntraDir[absPartIdx]; + + if (intraDirChroma == DM_CHROMA_IDX) + encodeBin(0, m_contextState[OFF_CHROMA_PRED_CTX]); + else + { + for (int i = 0; i < NUM_CHROMA_MODE - 1; i++) + { + if (intraDirChroma == chromaDirMode[i]) + { + intraDirChroma = i; + break; + } + } + + encodeBin(1, m_contextState[OFF_CHROMA_PRED_CTX]); + encodeBinsEP(intraDirChroma, 2); + } +} + +void Entropy::codeInterDir(const CUData& cu, uint32_t absPartIdx) +{ + const uint32_t interDir = cu.m_interDir[absPartIdx] - 1; + const uint32_t ctx = cu.m_cuDepth[absPartIdx]; // the context of the inter dir is the depth of the CU + + if (cu.m_partSize[absPartIdx] == SIZE_2Nx2N || cu.m_log2CUSize[absPartIdx] != 3) + encodeBin(interDir == 2 ? 
1 : 0, m_contextState[OFF_INTER_DIR_CTX + ctx]); + if (interDir < 2) + encodeBin(interDir, m_contextState[OFF_INTER_DIR_CTX + 4]); +} + +void Entropy::codeRefFrmIdx(const CUData& cu, uint32_t absPartIdx, int list) +{ + uint32_t refFrame = cu.m_refIdx[list][absPartIdx]; + + encodeBin(refFrame > 0, m_contextState[OFF_REF_NO_CTX]); + + if (refFrame > 0) + { + uint32_t refNum = cu.m_slice->m_numRefIdx[list] - 2; + if (refNum == 0) + return; + + refFrame--; + encodeBin(refFrame > 0, m_contextState[OFF_REF_NO_CTX + 1]); + if (refFrame > 0) + { + uint32_t mask = (1 << refFrame) - 2; + mask >>= (refFrame == refNum) ? 1 : 0; + encodeBinsEP(mask, refFrame - (refFrame == refNum)); + } + } +} + +void Entropy::codeMvd(const CUData& cu, uint32_t absPartIdx, int list) +{ + const MV& mvd = cu.m_mvd[list][absPartIdx]; + const int hor = mvd.x; + const int ver = mvd.y; + + encodeBin(hor != 0 ? 1 : 0, m_contextState[OFF_MV_RES_CTX]); + encodeBin(ver != 0 ? 1 : 0, m_contextState[OFF_MV_RES_CTX]); + + const bool bHorAbsGr0 = hor != 0; + const bool bVerAbsGr0 = ver != 0; + const uint32_t horAbs = 0 > hor ? -hor : hor; + const uint32_t verAbs = 0 > ver ? -ver : ver; + + if (bHorAbsGr0) + encodeBin(horAbs > 1 ? 1 : 0, m_contextState[OFF_MV_RES_CTX + 1]); + + if (bVerAbsGr0) + encodeBin(verAbs > 1 ? 1 : 0, m_contextState[OFF_MV_RES_CTX + 1]); + + if (bHorAbsGr0) + { + if (horAbs > 1) + writeEpExGolomb(horAbs - 2, 1); + + encodeBinEP(0 > hor ? 1 : 0); + } + + if (bVerAbsGr0) + { + if (verAbs > 1) + writeEpExGolomb(verAbs - 2, 1); + + encodeBinEP(0 > ver ? 1 : 0); + } +} + +void Entropy::codeDeltaQP(const CUData& cu, uint32_t absPartIdx) +{ + int dqp = cu.m_qp[absPartIdx] - cu.getRefQP(absPartIdx); + + int qpBdOffsetY = QP_BD_OFFSET; + + dqp = (dqp + 78 + qpBdOffsetY + (qpBdOffsetY / 2)) % (52 + qpBdOffsetY) - 26 - (qpBdOffsetY / 2); + + uint32_t absDQp = (uint32_t)((dqp > 0) ? 
dqp : (-dqp)); + uint32_t TUValue = X265_MIN((int)absDQp, CU_DQP_TU_CMAX); + writeUnaryMaxSymbol(TUValue, &m_contextState[OFF_DELTA_QP_CTX], 1, CU_DQP_TU_CMAX); + if (absDQp >= CU_DQP_TU_CMAX) + writeEpExGolomb(absDQp - CU_DQP_TU_CMAX, CU_DQP_EG_k); + + if (absDQp > 0) + { + uint32_t sign = (dqp > 0 ? 0 : 1); + encodeBinEP(sign); + } +} + +void Entropy::codeQtCbf(const CUData& cu, uint32_t absPartIdx, uint32_t absPartIdxStep, uint32_t width, uint32_t height, TextType ttype, uint32_t trDepth, bool lowestLevel) +{ + uint32_t ctx = ctxCbf[ttype][trDepth]; + + bool canQuadSplit = (width >= (MIN_TU_SIZE * 2)) && (height >= (MIN_TU_SIZE * 2)); + uint32_t lowestTUDepth = trDepth + ((!lowestLevel && !canQuadSplit) ? 1 : 0); // unsplittable TUs inherit their parent's CBF + + if ((width != height) && (lowestLevel || !canQuadSplit)) // if sub-TUs are present + { + uint32_t subTUDepth = lowestTUDepth + 1; // if this is the lowest level of the TU-tree, the sub-TUs are directly below. + // Otherwise, this must be the level above the lowest level (as specified above) + uint32_t partIdxesPerSubTU = absPartIdxStep >> 1; + + for (uint32_t subTU = 0; subTU < 2; subTU++) + { + uint32_t subTUAbsPartIdx = absPartIdx + (subTU * partIdxesPerSubTU); + uint32_t cbf = cu.getCbf(subTUAbsPartIdx, ttype, subTUDepth); + + encodeBin(cbf, m_contextState[OFF_QT_CBF_CTX + ctx]); + } + } + else + { + uint32_t cbf = cu.getCbf(absPartIdx, ttype, lowestTUDepth); + + encodeBin(cbf, m_contextState[OFF_QT_CBF_CTX + ctx]); + } +} + +void Entropy::codeQtCbf(const CUData& cu, uint32_t absPartIdx, TextType ttype, uint32_t trDepth) +{ + uint32_t ctx = ctxCbf[ttype][trDepth]; + uint32_t cbf = cu.getCbf(absPartIdx, ttype, trDepth); + encodeBin(cbf, m_contextState[OFF_QT_CBF_CTX + ctx]); +} + +void Entropy::codeQtCbf(uint32_t cbf, TextType ttype, uint32_t trDepth) +{ + uint32_t ctx = ctxCbf[ttype][trDepth]; + encodeBin(cbf, m_contextState[OFF_QT_CBF_CTX + ctx]); +} + +void Entropy::codeTransformSkipFlags(const 
CUData& cu, uint32_t absPartIdx, uint32_t trSize, TextType ttype) +{ + if (cu.m_tqBypass[absPartIdx]) + return; + if (trSize != 4) + return; + + uint32_t useTransformSkip = cu.m_transformSkip[ttype][absPartIdx]; + encodeBin(useTransformSkip, m_contextState[OFF_TRANSFORMSKIP_FLAG_CTX + (ttype ? NUM_TRANSFORMSKIP_FLAG_CTX : 0)]); +} + +void Entropy::codeQtRootCbf(uint32_t cbf) +{ + encodeBin(cbf, m_contextState[OFF_QT_ROOT_CBF_CTX]); +} + +void Entropy::codeQtCbfZero(TextType ttype, uint32_t trDepth) +{ + // this function is only used to estimate the bits when cbf is 0 + // and will never be called when writing the bitstream. + uint32_t ctx = ctxCbf[ttype][trDepth]; + encodeBin(0, m_contextState[OFF_QT_CBF_CTX + ctx]); +} + +void Entropy::codeQtRootCbfZero() +{ + // this function is only used to estimate the bits when cbf is 0 + // and will never be called when writing the bitstream. + encodeBin(0, m_contextState[OFF_QT_ROOT_CBF_CTX]); +} + +/** Encode (X,Y) position of the last significant coefficient + * \param posx X component of last coefficient + * \param posy Y component of last coefficient + * \param log2TrSize + * \param bIsLuma + * \param scanIdx scan type (zig-zag, hor, ver) + * This method encodes the X and Y component within a block of the last significant coefficient. + */ +void Entropy::codeLastSignificantXY(uint32_t posx, uint32_t posy, uint32_t log2TrSize, bool bIsLuma, uint32_t scanIdx) +{ + // swap + if (scanIdx == SCAN_VER) + std::swap(posx, posy); + + uint32_t ctxLast; + uint32_t groupIdxX = getGroupIdx(posx); + uint32_t groupIdxY = getGroupIdx(posy); + + int blkSizeOffset = bIsLuma ? ((log2TrSize - 2) * 3 + ((log2TrSize - 1) >> 2)) : NUM_CTX_LAST_FLAG_XY_LUMA; + int ctxShift = bIsLuma ? 
((log2TrSize + 1) >> 2) : log2TrSize - 2; + uint32_t maxGroupIdx = log2TrSize * 2 - 1; + + // posX + uint8_t *ctxX = &m_contextState[OFF_CTX_LAST_FLAG_X]; + for (ctxLast = 0; ctxLast < groupIdxX; ctxLast++) + encodeBin(1, *(ctxX + blkSizeOffset + (ctxLast >> ctxShift))); + + if (groupIdxX < maxGroupIdx) + encodeBin(0, *(ctxX + blkSizeOffset + (ctxLast >> ctxShift))); + + // posY + uint8_t *ctxY = &m_contextState[OFF_CTX_LAST_FLAG_Y]; + for (ctxLast = 0; ctxLast < groupIdxY; ctxLast++) + encodeBin(1, *(ctxY + blkSizeOffset + (ctxLast >> ctxShift))); + + if (groupIdxY < maxGroupIdx) + encodeBin(0, *(ctxY + blkSizeOffset + (ctxLast >> ctxShift))); + + if (groupIdxX > 3) + { + uint32_t count = (groupIdxX - 2) >> 1; + posx = posx - g_minInGroup[groupIdxX]; + encodeBinsEP(posx, count); + } + if (groupIdxY > 3) + { + uint32_t count = (groupIdxY - 2) >> 1; + posy = posy - g_minInGroup[groupIdxY]; + encodeBinsEP(posy, count); + } +} + +void Entropy::codeCoeffNxN(const CUData& cu, const coeff_t* coeff, uint32_t absPartIdx, uint32_t log2TrSize, TextType ttype) +{ + uint32_t trSize = 1 << log2TrSize; + + // compute number of significant coefficients + uint32_t numSig = primitives.count_nonzero(coeff, (1 << (log2TrSize << 1))); + + X265_CHECK(numSig > 0, "cbf check fail\n"); + + bool bHideFirstSign = cu.m_slice->m_pps->bSignHideEnabled && !cu.m_tqBypass[absPartIdx]; + + if (cu.m_slice->m_pps->bTransformSkipEnabled) + codeTransformSkipFlags(cu, absPartIdx, trSize, ttype); + + bool bIsLuma = ttype == TEXT_LUMA; + + // select scans + TUEntropyCodingParameters codingParameters; + cu.getTUEntropyCodingParameters(codingParameters, absPartIdx, log2TrSize, bIsLuma); + + //----- encode significance map ----- + + // Find position of last coefficient + int scanPosLast = 0; + uint32_t posLast; + uint64_t sigCoeffGroupFlag64 = 0; + const uint32_t maskPosXY = ((uint32_t)~0 >> (31 - log2TrSize + MLS_CG_LOG2_SIZE)) >> 1; + assert((uint32_t)((1 << (log2TrSize - MLS_CG_LOG2_SIZE)) - 1) == 
(((uint32_t)~0 >> (31 - log2TrSize + MLS_CG_LOG2_SIZE)) >> 1)); + do + { + posLast = codingParameters.scan[scanPosLast++]; + + const uint32_t isNZCoeff = (coeff[posLast] != 0); + // get L1 sig map + // NOTE: the new algorithm is complicated, so I keep reference code here + //uint32_t posy = posLast >> log2TrSize; + //uint32_t posx = posLast - (posy << log2TrSize); + //uint32_t blkIdx0 = ((posy >> MLS_CG_LOG2_SIZE) << codingParameters.log2TrSizeCG) + (posx >> MLS_CG_LOG2_SIZE); + const uint32_t blkIdx = ((posLast >> (2 * MLS_CG_LOG2_SIZE)) & ~maskPosXY) + ((posLast >> MLS_CG_LOG2_SIZE) & maskPosXY); + sigCoeffGroupFlag64 |= ((uint64_t)isNZCoeff << blkIdx); + numSig -= isNZCoeff; + } + while (numSig > 0); + scanPosLast--; + + // Code position of last coefficient + int posLastY = posLast >> log2TrSize; + int posLastX = posLast & (trSize - 1); + codeLastSignificantXY(posLastX, posLastY, log2TrSize, bIsLuma, codingParameters.scanType); + + //===== code significance flag ===== + uint8_t * const baseCoeffGroupCtx = &m_contextState[OFF_SIG_CG_FLAG_CTX + (bIsLuma ? 0 : NUM_SIG_CG_FLAG_CTX)]; + uint8_t * const baseCtx = bIsLuma ? 
&m_contextState[OFF_SIG_FLAG_CTX] : &m_contextState[OFF_SIG_FLAG_CTX + NUM_SIG_FLAG_CTX_LUMA]; + const int lastScanSet = scanPosLast >> MLS_CG_SIZE; + uint32_t c1 = 1; + uint32_t goRiceParam = 0; + int scanPosSig = scanPosLast; + + for (int subSet = lastScanSet; subSet >= 0; subSet--) + { + int numNonZero = 0; + int subPos = subSet << MLS_CG_SIZE; + goRiceParam = 0; + int absCoeff[1 << MLS_CG_SIZE]; + uint32_t coeffSigns = 0; + int lastNZPosInCG = -1; + int firstNZPosInCG = 1 << MLS_CG_SIZE; + if (scanPosSig == scanPosLast) + { + absCoeff[0] = int(abs(coeff[posLast])); + coeffSigns = (coeff[posLast] < 0); + numNonZero = 1; + lastNZPosInCG = scanPosSig; + firstNZPosInCG = scanPosSig; + scanPosSig--; + } + // encode significant_coeffgroup_flag + const int cgBlkPos = codingParameters.scanCG[subSet]; + const int cgPosY = cgBlkPos >> codingParameters.log2TrSizeCG; + const int cgPosX = cgBlkPos - (cgPosY << codingParameters.log2TrSizeCG); + const uint64_t cgBlkPosMask = ((uint64_t)1 << cgBlkPos); + + if (subSet == lastScanSet || !subSet) + sigCoeffGroupFlag64 |= cgBlkPosMask; + else + { + uint32_t sigCoeffGroup = ((sigCoeffGroupFlag64 & cgBlkPosMask) != 0); + uint32_t ctxSig = Quant::getSigCoeffGroupCtxInc(sigCoeffGroupFlag64, cgPosX, cgPosY, codingParameters.log2TrSizeCG); + encodeBin(sigCoeffGroup, baseCoeffGroupCtx[ctxSig]); + } + + // encode significant_coeff_flag + if (sigCoeffGroupFlag64 & cgBlkPosMask) + { + const int patternSigCtx = Quant::calcPatternSigCtx(sigCoeffGroupFlag64, cgPosX, cgPosY, codingParameters.log2TrSizeCG); + uint32_t blkPos, sig, ctxSig; + for (; scanPosSig >= subPos; scanPosSig--) + { + blkPos = codingParameters.scan[scanPosSig]; + sig = (coeff[blkPos] != 0); + if (scanPosSig > subPos || subSet == 0 || numNonZero) + { + ctxSig = Quant::getSigCtxInc(patternSigCtx, log2TrSize, trSize, blkPos, bIsLuma, codingParameters.firstSignificanceMapContext); + encodeBin(sig, baseCtx[ctxSig]); + } + if (sig) + { + absCoeff[numNonZero] = 
int(abs(coeff[blkPos])); + coeffSigns = 2 * coeffSigns + ((uint32_t)coeff[blkPos] >> 31); + numNonZero++; + if (lastNZPosInCG < 0) + lastNZPosInCG = scanPosSig; + firstNZPosInCG = scanPosSig; + } + } + } + else + scanPosSig = subPos - 1; + + if (numNonZero > 0) + { + bool signHidden = (lastNZPosInCG - firstNZPosInCG >= SBH_THRESHOLD); + uint32_t ctxSet = (subSet > 0 && bIsLuma) ? 2 : 0; + + if (c1 == 0) + ctxSet++; + + c1 = 1; + uint8_t *baseCtxMod = bIsLuma ? &m_contextState[OFF_ONE_FLAG_CTX + 4 * ctxSet] : &m_contextState[OFF_ONE_FLAG_CTX + NUM_ONE_FLAG_CTX_LUMA + 4 * ctxSet]; + + int numC1Flag = X265_MIN(numNonZero, C1FLAG_NUMBER); + int firstC2FlagIdx = -1; + for (int idx = 0; idx < numC1Flag; idx++) + { + uint32_t symbol = absCoeff[idx] > 1; + encodeBin(symbol, baseCtxMod[c1]); + if (symbol) + { + c1 = 0; + + if (firstC2FlagIdx == -1) + firstC2FlagIdx = idx; + } + else if ((c1 < 3) && (c1 > 0)) + c1++; + } + + if (!c1) + { + baseCtxMod = bIsLuma ? &m_contextState[OFF_ABS_FLAG_CTX + ctxSet] : &m_contextState[OFF_ABS_FLAG_CTX + NUM_ABS_FLAG_CTX_LUMA + ctxSet]; + if (firstC2FlagIdx != -1) + { + uint32_t symbol = absCoeff[firstC2FlagIdx] > 2; + encodeBin(symbol, baseCtxMod[0]); + } + } + + if (bHideFirstSign && signHidden) + encodeBinsEP((coeffSigns >> 1), numNonZero - 1); + else + encodeBinsEP(coeffSigns, numNonZero); + + int firstCoeff2 = 1; + if (!c1 || numNonZero > C1FLAG_NUMBER) + { + for (int idx = 0; idx < numNonZero; idx++) + { + int baseLevel = (idx < C1FLAG_NUMBER) ? 
(2 + firstCoeff2) : 1; + + if (absCoeff[idx] >= baseLevel) + { + writeCoefRemainExGolomb(absCoeff[idx] - baseLevel, goRiceParam); + if (absCoeff[idx] > 3 * (1 << goRiceParam)) + goRiceParam = std::min(goRiceParam + 1, 4); + } + if (absCoeff[idx] >= 2) + firstCoeff2 = 0; + } + } + } + } +} + +void Entropy::codeSaoMaxUvlc(uint32_t code, uint32_t maxSymbol) +{ + X265_CHECK(maxSymbol > 0, "maxSymbol too small\n"); + + uint32_t isCodeNonZero = !!code; + + encodeBinEP(isCodeNonZero); + if (isCodeNonZero) + { + uint32_t isCodeLast = (maxSymbol > code); + uint32_t mask = (1 << (code - 1)) - 1; + uint32_t len = code - 1 + isCodeLast; + mask <<= isCodeLast; + + encodeBinsEP(mask, len); + } +} + +/* estimate bit cost for CBP, significant map and significant coefficients */ +void Entropy::estBit(EstBitsSbac& estBitsSbac, uint32_t log2TrSize, bool bIsLuma) const +{ + estCBFBit(estBitsSbac); + + estSignificantCoeffGroupMapBit(estBitsSbac, bIsLuma); + + // encode significance map + estSignificantMapBit(estBitsSbac, log2TrSize, bIsLuma); + + // encode significant coefficients + estSignificantCoefficientsBit(estBitsSbac, bIsLuma); +} + +/* estimate bit cost for each CBP bit */ +void Entropy::estCBFBit(EstBitsSbac& estBitsSbac) const +{ + const uint8_t *ctx = &m_contextState[OFF_QT_CBF_CTX]; + + for (uint32_t ctxInc = 0; ctxInc < NUM_QT_CBF_CTX; ctxInc++) + { + estBitsSbac.blockCbpBits[ctxInc][0] = sbacGetEntropyBits(ctx[ctxInc], 0); + estBitsSbac.blockCbpBits[ctxInc][1] = sbacGetEntropyBits(ctx[ctxInc], 1); + } + + ctx = &m_contextState[OFF_QT_ROOT_CBF_CTX]; + + estBitsSbac.blockRootCbpBits[0] = sbacGetEntropyBits(ctx[0], 0); + estBitsSbac.blockRootCbpBits[1] = sbacGetEntropyBits(ctx[0], 1); +} + +/* estimate SAMBAC bit cost for significant coefficient group map */ +void Entropy::estSignificantCoeffGroupMapBit(EstBitsSbac& estBitsSbac, bool bIsLuma) const +{ + int firstCtx = 0, numCtx = NUM_SIG_CG_FLAG_CTX; + + for (int ctxIdx = firstCtx; ctxIdx < firstCtx + numCtx; ctxIdx++) + for 
(uint32_t bin = 0; bin < 2; bin++) + estBitsSbac.significantCoeffGroupBits[ctxIdx][bin] = sbacGetEntropyBits(m_contextState[OFF_SIG_CG_FLAG_CTX + ((bIsLuma ? 0 : NUM_SIG_CG_FLAG_CTX) + ctxIdx)], bin); +} + +/* estimate SAMBAC bit cost for significant coefficient map */ +void Entropy::estSignificantMapBit(EstBitsSbac& estBitsSbac, uint32_t log2TrSize, bool bIsLuma) const +{ + int firstCtx = 1, numCtx = 8; + + if (log2TrSize >= 4) + { + firstCtx = bIsLuma ? 21 : 12; + numCtx = bIsLuma ? 6 : 3; + } + else if (log2TrSize == 3) + { + firstCtx = 9; + numCtx = bIsLuma ? 12 : 3; + } + + if (bIsLuma) + { + for (uint32_t bin = 0; bin < 2; bin++) + estBitsSbac.significantBits[0][bin] = sbacGetEntropyBits(m_contextState[OFF_SIG_FLAG_CTX], bin); + + for (int ctxIdx = firstCtx; ctxIdx < firstCtx + numCtx; ctxIdx++) + for (uint32_t bin = 0; bin < 2; bin++) + estBitsSbac.significantBits[ctxIdx][bin] = sbacGetEntropyBits(m_contextState[OFF_SIG_FLAG_CTX + ctxIdx], bin); + } + else + { + for (uint32_t bin = 0; bin < 2; bin++) + estBitsSbac.significantBits[0][bin] = sbacGetEntropyBits(m_contextState[OFF_SIG_FLAG_CTX + (NUM_SIG_FLAG_CTX_LUMA + 0)], bin); + + for (int ctxIdx = firstCtx; ctxIdx < firstCtx + numCtx; ctxIdx++) + for (uint32_t bin = 0; bin < 2; bin++) + estBitsSbac.significantBits[ctxIdx][bin] = sbacGetEntropyBits(m_contextState[OFF_SIG_FLAG_CTX + (NUM_SIG_FLAG_CTX_LUMA + ctxIdx)], bin); + } + int bitsX = 0, bitsY = 0; + + int blkSizeOffset = bIsLuma ? ((log2TrSize - 2) * 3 + ((log2TrSize - 1) >> 2)) : NUM_CTX_LAST_FLAG_XY_LUMA; + int ctxShift = bIsLuma ? 
((log2TrSize + 1) >> 2) : log2TrSize - 2; + uint32_t maxGroupIdx = log2TrSize * 2 - 1; + + uint32_t ctx; + const uint8_t *ctxX = &m_contextState[OFF_CTX_LAST_FLAG_X]; + for (ctx = 0; ctx < maxGroupIdx; ctx++) + { + int ctxOffset = blkSizeOffset + (ctx >> ctxShift); + estBitsSbac.lastXBits[ctx] = bitsX + sbacGetEntropyBits(ctxX[ctxOffset], 0); + bitsX += sbacGetEntropyBits(ctxX[ctxOffset], 1); + } + + estBitsSbac.lastXBits[ctx] = bitsX; + + const uint8_t *ctxY = &m_contextState[OFF_CTX_LAST_FLAG_Y]; + for (ctx = 0; ctx < maxGroupIdx; ctx++) + { + int ctxOffset = blkSizeOffset + (ctx >> ctxShift); + estBitsSbac.lastYBits[ctx] = bitsY + sbacGetEntropyBits(ctxY[ctxOffset], 0); + bitsY += sbacGetEntropyBits(ctxY[ctxOffset], 1); + } + + estBitsSbac.lastYBits[ctx] = bitsY; +} + +/* estimate bit cost of significant coefficient */ +void Entropy::estSignificantCoefficientsBit(EstBitsSbac& estBitsSbac, bool bIsLuma) const +{ + if (bIsLuma) + { + const uint8_t *ctxOne = &m_contextState[OFF_ONE_FLAG_CTX]; + const uint8_t *ctxAbs = &m_contextState[OFF_ABS_FLAG_CTX]; + + for (int ctxIdx = 0; ctxIdx < NUM_ONE_FLAG_CTX_LUMA; ctxIdx++) + { + estBitsSbac.greaterOneBits[ctxIdx][0] = sbacGetEntropyBits(ctxOne[ctxIdx], 0); + estBitsSbac.greaterOneBits[ctxIdx][1] = sbacGetEntropyBits(ctxOne[ctxIdx], 1); + } + + for (int ctxIdx = 0; ctxIdx < NUM_ABS_FLAG_CTX_LUMA; ctxIdx++) + { + estBitsSbac.levelAbsBits[ctxIdx][0] = sbacGetEntropyBits(ctxAbs[ctxIdx], 0); + estBitsSbac.levelAbsBits[ctxIdx][1] = sbacGetEntropyBits(ctxAbs[ctxIdx], 1); + } + } + else + { + const uint8_t *ctxOne = &m_contextState[OFF_ONE_FLAG_CTX + NUM_ONE_FLAG_CTX_LUMA]; + const uint8_t *ctxAbs = &m_contextState[OFF_ABS_FLAG_CTX + NUM_ABS_FLAG_CTX_LUMA]; + + for (int ctxIdx = 0; ctxIdx < NUM_ONE_FLAG_CTX_CHROMA; ctxIdx++) + { + estBitsSbac.greaterOneBits[ctxIdx][0] = sbacGetEntropyBits(ctxOne[ctxIdx], 0); + estBitsSbac.greaterOneBits[ctxIdx][1] = sbacGetEntropyBits(ctxOne[ctxIdx], 1); + } + + for (int ctxIdx = 0; ctxIdx < 
NUM_ABS_FLAG_CTX_CHROMA; ctxIdx++) + { + estBitsSbac.levelAbsBits[ctxIdx][0] = sbacGetEntropyBits(ctxAbs[ctxIdx], 0); + estBitsSbac.levelAbsBits[ctxIdx][1] = sbacGetEntropyBits(ctxAbs[ctxIdx], 1); + } + } +} + +/* Initialize our context information from the nominated source */ +void Entropy::copyContextsFrom(const Entropy& src) +{ + X265_CHECK(src.m_valid, "invalid copy source context\n"); + + memcpy(m_contextState, src.m_contextState, MAX_OFF_CTX_MOD * sizeof(m_contextState[0])); + markValid(); +} + +void Entropy::start() +{ + m_low = 0; + m_range = 510; + m_bitsLeft = -12; + m_numBufferedBytes = 0; + m_bufferedByte = 0xff; +} + +void Entropy::finish() +{ + if (m_low >> (21 + m_bitsLeft)) + { + m_bitIf->writeByte(m_bufferedByte + 1); + while (m_numBufferedBytes > 1) + { + m_bitIf->writeByte(0x00); + m_numBufferedBytes--; + } + + m_low -= 1 << (21 + m_bitsLeft); + } + else + { + if (m_numBufferedBytes > 0) + m_bitIf->writeByte(m_bufferedByte); + + while (m_numBufferedBytes > 1) + { + m_bitIf->writeByte(0xff); + m_numBufferedBytes--; + } + } + m_bitIf->write(m_low >> 8, 13 + m_bitsLeft); +} + +void Entropy::copyState(const Entropy& other) +{ + m_low = other.m_low; + m_range = other.m_range; + m_bitsLeft = other.m_bitsLeft; + m_bufferedByte = other.m_bufferedByte; + m_numBufferedBytes = other.m_numBufferedBytes; + m_fracBits = other.m_fracBits; +} + +void Entropy::resetBits() +{ + m_low = 0; + m_bitsLeft = -12; + m_numBufferedBytes = 0; + m_bufferedByte = 0xff; + m_fracBits &= 32767; + if (m_bitIf) + m_bitIf->resetBits(); +} + +/** Encode bin */ +void Entropy::encodeBin(uint32_t binValue, uint8_t &ctxModel) +{ + uint32_t mstate = ctxModel; + + ctxModel = sbacNext(mstate, binValue); + + if (!m_bitIf) + { + m_fracBits += sbacGetEntropyBits(mstate, binValue); + return; + } + + uint32_t range = m_range; + uint32_t state = sbacGetState(mstate); + uint32_t lps = g_lpsTable[state][((uint8_t)range >> 6)]; + range -= lps; + + X265_CHECK(lps >= 2, "lps is too small\n"); + + 
int numBits = (uint32_t)(range - 256) >> 31; + uint32_t low = m_low; + + // NOTE: MPS must be LOWEST bit in mstate + X265_CHECK((uint32_t)((binValue ^ mstate) & 1) == (uint32_t)(binValue != sbacGetMps(mstate)), "binValue failure\n"); + if ((binValue ^ mstate) & 1) + { + // NOTE: lps is non-zero and the maximum of idx is 8 because lps less than 256 + //numBits = g_renormTable[lps >> 3]; + unsigned long idx; + CLZ32(idx, lps); + X265_CHECK(state != 63 || idx == 1, "state failure\n"); + + numBits = 8 - idx; + if (state >= 63) + numBits = 6; + X265_CHECK(numBits <= 6, "numBits failure\n"); + + low += range; + range = lps; + } + m_low = (low << numBits); + m_range = (range << numBits); + m_bitsLeft += numBits; + + if (m_bitsLeft >= 0) + writeOut(); +} + +/** Encode equiprobable bin */ +void Entropy::encodeBinEP(uint32_t binValue) +{ + if (!m_bitIf) + { + m_fracBits += 32768; + return; + } + m_low <<= 1; + if (binValue) + m_low += m_range; + m_bitsLeft++; + + if (m_bitsLeft >= 0) + writeOut(); +} + +/** Encode equiprobable bins */ +void Entropy::encodeBinsEP(uint32_t binValues, int numBins) +{ + if (!m_bitIf) + { + m_fracBits += 32768 * numBins; + return; + } + + while (numBins > 8) + { + numBins -= 8; + uint32_t pattern = binValues >> numBins; + m_low <<= 8; + m_low += m_range * pattern; + binValues -= pattern << numBins; + m_bitsLeft += 8; + + if (m_bitsLeft >= 0) + writeOut(); + } + + m_low <<= numBins; + m_low += m_range * binValues; + m_bitsLeft += numBins; + + if (m_bitsLeft >= 0) + writeOut(); +} + +/** Encode terminating bin */ +void Entropy::encodeBinTrm(uint32_t binValue) +{ + if (!m_bitIf) + { + m_fracBits += sbacGetEntropyBitsTrm(binValue); + return; + } + + m_range -= 2; + if (binValue) + { + m_low += m_range; + m_low <<= 7; + m_range = 2 << 7; + m_bitsLeft += 7; + } + else if (m_range >= 256) + return; + else + { + m_low <<= 1; + m_range <<= 1; + m_bitsLeft++; + } + + if (m_bitsLeft >= 0) + writeOut(); +} + +/** Move bits from register into bitstream */ 
+void Entropy::writeOut() +{ + uint32_t leadByte = m_low >> (13 + m_bitsLeft); + uint32_t low_mask = (uint32_t)(~0) >> (11 + 8 - m_bitsLeft); + + m_bitsLeft -= 8; + m_low &= low_mask; + + if (leadByte == 0xff) + m_numBufferedBytes++; + else + { + uint32_t numBufferedBytes = m_numBufferedBytes; + if (numBufferedBytes > 0) + { + uint32_t carry = leadByte >> 8; + uint32_t byteTowrite = m_bufferedByte + carry; + m_bitIf->writeByte(byteTowrite); + + byteTowrite = (0xff + carry) & 0xff; + while (numBufferedBytes > 1) + { + m_bitIf->writeByte(byteTowrite); + numBufferedBytes--; + } + } + m_numBufferedBytes = 1; + m_bufferedByte = (uint8_t)leadByte; + } +} + +const uint32_t g_entropyBits[128] = +{ + // Corrected table, most notably for last state + 0x07b23, 0x085f9, 0x074a0, 0x08cbc, 0x06ee4, 0x09354, 0x067f4, 0x09c1b, 0x060b0, 0x0a62a, 0x05a9c, 0x0af5b, 0x0548d, 0x0b955, 0x04f56, 0x0c2a9, + 0x04a87, 0x0cbf7, 0x045d6, 0x0d5c3, 0x04144, 0x0e01b, 0x03d88, 0x0e937, 0x039e0, 0x0f2cd, 0x03663, 0x0fc9e, 0x03347, 0x10600, 0x03050, 0x10f95, + 0x02d4d, 0x11a02, 0x02ad3, 0x12333, 0x0286e, 0x12cad, 0x02604, 0x136df, 0x02425, 0x13f48, 0x021f4, 0x149c4, 0x0203e, 0x1527b, 0x01e4d, 0x15d00, + 0x01c99, 0x166de, 0x01b18, 0x17017, 0x019a5, 0x17988, 0x01841, 0x18327, 0x016df, 0x18d50, 0x015d9, 0x19547, 0x0147c, 0x1a083, 0x0138e, 0x1a8a3, + 0x01251, 0x1b418, 0x01166, 0x1bd27, 0x01068, 0x1c77b, 0x00f7f, 0x1d18e, 0x00eda, 0x1d91a, 0x00e19, 0x1e254, 0x00d4f, 0x1ec9a, 0x00c90, 0x1f6e0, + 0x00c01, 0x1fef8, 0x00b5f, 0x208b1, 0x00ab6, 0x21362, 0x00a15, 0x21e46, 0x00988, 0x2285d, 0x00934, 0x22ea8, 0x008a8, 0x239b2, 0x0081d, 0x24577, + 0x007c9, 0x24ce6, 0x00763, 0x25663, 0x00710, 0x25e8f, 0x006a0, 0x26a26, 0x00672, 0x26f23, 0x005e8, 0x27ef8, 0x005ba, 0x284b5, 0x0055e, 0x29057, + 0x0050c, 0x29bab, 0x004c1, 0x2a674, 0x004a7, 0x2aa5e, 0x0046f, 0x2b32f, 0x0041f, 0x2c0ad, 0x003e7, 0x2ca8d, 0x003ba, 0x2d323, 0x0010c, 0x3bfbb +}; + +const uint8_t g_nextState[128][2] = +{ + { 2, 1 }, { 0, 3 }, { 4, 0 }, { 1, 
5 }, { 6, 2 }, { 3, 7 }, { 8, 4 }, { 5, 9 }, + { 10, 4 }, { 5, 11 }, { 12, 8 }, { 9, 13 }, { 14, 8 }, { 9, 15 }, { 16, 10 }, { 11, 17 }, + { 18, 12 }, { 13, 19 }, { 20, 14 }, { 15, 21 }, { 22, 16 }, { 17, 23 }, { 24, 18 }, { 19, 25 }, + { 26, 18 }, { 19, 27 }, { 28, 22 }, { 23, 29 }, { 30, 22 }, { 23, 31 }, { 32, 24 }, { 25, 33 }, + { 34, 26 }, { 27, 35 }, { 36, 26 }, { 27, 37 }, { 38, 30 }, { 31, 39 }, { 40, 30 }, { 31, 41 }, + { 42, 32 }, { 33, 43 }, { 44, 32 }, { 33, 45 }, { 46, 36 }, { 37, 47 }, { 48, 36 }, { 37, 49 }, + { 50, 38 }, { 39, 51 }, { 52, 38 }, { 39, 53 }, { 54, 42 }, { 43, 55 }, { 56, 42 }, { 43, 57 }, + { 58, 44 }, { 45, 59 }, { 60, 44 }, { 45, 61 }, { 62, 46 }, { 47, 63 }, { 64, 48 }, { 49, 65 }, + { 66, 48 }, { 49, 67 }, { 68, 50 }, { 51, 69 }, { 70, 52 }, { 53, 71 }, { 72, 52 }, { 53, 73 }, + { 74, 54 }, { 55, 75 }, { 76, 54 }, { 55, 77 }, { 78, 56 }, { 57, 79 }, { 80, 58 }, { 59, 81 }, + { 82, 58 }, { 59, 83 }, { 84, 60 }, { 61, 85 }, { 86, 60 }, { 61, 87 }, { 88, 60 }, { 61, 89 }, + { 90, 62 }, { 63, 91 }, { 92, 64 }, { 65, 93 }, { 94, 64 }, { 65, 95 }, { 96, 66 }, { 67, 97 }, + { 98, 66 }, { 67, 99 }, { 100, 66 }, { 67, 101 }, { 102, 68 }, { 69, 103 }, { 104, 68 }, { 69, 105 }, + { 106, 70 }, { 71, 107 }, { 108, 70 }, { 71, 109 }, { 110, 70 }, { 71, 111 }, { 112, 72 }, { 73, 113 }, + { 114, 72 }, { 73, 115 }, { 116, 72 }, { 73, 117 }, { 118, 74 }, { 75, 119 }, { 120, 74 }, { 75, 121 }, + { 122, 74 }, { 75, 123 }, { 124, 76 }, { 77, 125 }, { 124, 76 }, { 77, 125 }, { 126, 126 }, { 127, 127 } +}; + +} diff --git a/source/encoder/entropy.h b/source/encoder/entropy.h new file mode 100644 index 0000000..bed06cf --- /dev/null +++ b/source/encoder/entropy.h @@ -0,0 +1,246 @@ +/***************************************************************************** +* Copyright (C) 2013 x265 project +* +* Authors: Steve Borho +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of the GNU General Public License as 
published by +* the Free Software Foundation; either version 2 of the License, or +* (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software +* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +* +* This program is also available under a commercial proprietary license. +* For more information, contact us at license @ x265.com. +*****************************************************************************/ + +#ifndef X265_ENTROPY_H +#define X265_ENTROPY_H + +#include "common.h" +#include "bitstream.h" +#include "frame.h" +#include "contexts.h" +#include "slice.h" + +namespace x265 { +// private namespace + +struct SaoCtuParam; +struct EstBitsSbac; +class CUData; +struct CUGeom; +class ScalingList; + +enum SplitType +{ + DONT_SPLIT = 0, + VERTICAL_SPLIT = 1, + QUAD_SPLIT = 2, + NUMBER_OF_SPLIT_MODES = 3 +}; + +struct TURecurse +{ + uint32_t section; + uint32_t splitMode; + uint32_t absPartIdxTURelCU; + uint32_t absPartIdxStep; + + TURecurse(SplitType splitType, uint32_t _absPartIdxStep, uint32_t _absPartIdxTU) + { + static const uint32_t partIdxStepShift[NUMBER_OF_SPLIT_MODES] = { 0, 1, 2 }; + section = 0; + absPartIdxTURelCU = _absPartIdxTU; + splitMode = (uint32_t)splitType; + absPartIdxStep = _absPartIdxStep >> partIdxStepShift[splitMode]; + } + + bool isNextSection() + { + if (splitMode == DONT_SPLIT) + { + section++; + return false; + } + else + { + absPartIdxTURelCU += absPartIdxStep; + + section++; + return section < (uint32_t)(1 << splitMode); + } + } + + bool isLastSection() const + { + return (section + 1) >= (uint32_t)(1 << splitMode); + } +}; + +struct EstBitsSbac +{ + int 
significantCoeffGroupBits[NUM_SIG_CG_FLAG_CTX][2]; + int significantBits[NUM_SIG_FLAG_CTX][2]; + int lastXBits[10]; + int lastYBits[10]; + int greaterOneBits[NUM_ONE_FLAG_CTX][2]; + int levelAbsBits[NUM_ABS_FLAG_CTX][2]; + int blockCbpBits[NUM_QT_CBF_CTX][2]; + int blockRootCbpBits[2]; +}; + +class Entropy : public SyntaxElementWriter +{ +public: + + uint64_t m_pad; + uint8_t m_contextState[160]; // MAX_OFF_CTX_MOD + padding + + /* CABAC state */ + uint32_t m_low; + uint32_t m_range; + uint32_t m_bufferedByte; + int m_numBufferedBytes; + int m_bitsLeft; + uint64_t m_fracBits; + EstBitsSbac m_estBitsSbac; + + Entropy(); + + void setBitstream(Bitstream* p) { m_bitIf = p; } + + uint32_t getNumberOfWrittenBits() + { + X265_CHECK(!m_bitIf, "bit counting mode expected\n"); + return (uint32_t)(m_fracBits >> 15); + } + +#if CHECKED_BUILD || _DEBUG + bool m_valid; + void markInvalid() { m_valid = false; } + void markValid() { m_valid = true; } +#else + void markValid() { } +#endif + void zeroFract() { m_fracBits = 0; } + void resetBits(); + void resetEntropy(const Slice& slice); + + // SBAC RD + void load(const Entropy& src) { copyFrom(src); } + void store(Entropy& dest) const { dest.copyFrom(*this); } + void loadContexts(const Entropy& src) { copyContextsFrom(src); } + void loadIntraDirModeLuma(const Entropy& src); + void copyState(const Entropy& other); + + void codeVPS(const VPS& vps); + void codeSPS(const SPS& sps, const ScalingList& scalingList, const ProfileTierLevel& ptl); + void codePPS(const PPS& pps); + void codeVUI(const VUI& vui); + void codeAUD(const Slice& slice); + void codeHrdParameters(const HRDInfo& hrd); + + void codeSliceHeader(const Slice& slice, FrameData& encData); + void codeSliceHeaderWPPEntryPoints(const Slice& slice, const uint32_t *substreamSizes, uint32_t maxOffset); + void codeShortTermRefPicSet(const RPS& rps); + void finishSlice() { encodeBinTrm(1); finish(); dynamic_cast(m_bitIf)->writeByteAlignment(); } + + void encodeCTU(const CUData& cu, 
const CUGeom& cuGeom); + void codeSaoOffset(const SaoCtuParam& ctuParam, int plane); + void codeSaoMerge(uint32_t code) { encodeBin(code, m_contextState[OFF_SAO_MERGE_FLAG_CTX]); } + + void codeCUTransquantBypassFlag(uint32_t symbol); + void codeSkipFlag(const CUData& cu, uint32_t absPartIdx); + void codeMergeFlag(const CUData& cu, uint32_t absPartIdx); + void codeMergeIndex(const CUData& cu, uint32_t absPartIdx); + void codeSplitFlag(const CUData& cu, uint32_t absPartIdx, uint32_t depth); + void codeMVPIdx(uint32_t symbol); + void codeMvd(const CUData& cu, uint32_t absPartIdx, int list); + + void codePartSize(const CUData& cu, uint32_t absPartIdx, uint32_t depth); + void codePredMode(int predMode); + void codePredInfo(const CUData& cu, uint32_t absPartIdx); + void codeTransformSubdivFlag(uint32_t symbol, uint32_t ctx); + void codeQtCbf(const CUData& cu, uint32_t absPartIdx, uint32_t absPartIdxStep, uint32_t width, uint32_t height, TextType ttype, uint32_t trDepth, bool lowestLevel); + void codeQtCbf(const CUData& cu, uint32_t absPartIdx, TextType ttype, uint32_t trDepth); + void codeQtCbf(uint32_t cbf, TextType ttype, uint32_t trDepth); + void codeQtCbfZero(TextType ttype, uint32_t trDepth); + void codeQtRootCbfZero(); + void codeCoeff(const CUData& cu, uint32_t absPartIdx, uint32_t depth, bool& bCodeDQP, uint32_t depthRange[2]); + void codeCoeffNxN(const CUData& cu, const coeff_t* coef, uint32_t absPartIdx, uint32_t log2TrSize, TextType ttype); + + uint32_t bitsIntraModeNonMPM() const; + uint32_t bitsIntraModeMPM(const uint32_t preds[3], uint32_t dir) const; + void codeIntraDirLumaAng(const CUData& cu, uint32_t absPartIdx, bool isMultiple); + void codeIntraDirChroma(const CUData& cu, uint32_t absPartIdx, uint32_t *chromaDirMode); + + // RDO functions + void estBit(EstBitsSbac& estBitsSbac, uint32_t log2TrSize, bool bIsLuma) const; + void estCBFBit(EstBitsSbac& estBitsSbac) const; + void estSignificantCoeffGroupMapBit(EstBitsSbac& estBitsSbac, bool bIsLuma) const; 
+ void estSignificantMapBit(EstBitsSbac& estBitsSbac, uint32_t log2TrSize, bool bIsLuma) const; + void estSignificantCoefficientsBit(EstBitsSbac& estBitsSbac, bool bIsLuma) const; + +private: + + /* CABAC private methods */ + void start(); + void finish(); + + void encodeBin(uint32_t binValue, uint8_t& ctxModel); + void encodeBinEP(uint32_t binValue); + void encodeBinsEP(uint32_t binValues, int numBins); + void encodeBinTrm(uint32_t binValue); + + void encodeCU(const CUData& cu, const CUGeom &cuGeom, uint32_t absPartIdx, uint32_t depth, bool& bEncodeDQP); + void finishCU(const CUData& cu, uint32_t absPartIdx, uint32_t depth); + + void writeOut(); + + /* SBac private methods */ + void writeUnaryMaxSymbol(uint32_t symbol, uint8_t* scmModel, int offset, uint32_t maxSymbol); + void writeEpExGolomb(uint32_t symbol, uint32_t count); + void writeCoefRemainExGolomb(uint32_t symbol, const uint32_t absGoRice); + + void codeProfileTier(const ProfileTierLevel& ptl); + void codeScalingList(const ScalingList&); + void codeScalingList(const ScalingList& scalingList, uint32_t sizeId, uint32_t listId); + + void codePredWeightTable(const Slice& slice); + void codeInterDir(const CUData& cu, uint32_t absPartIdx); + void codePUWise(const CUData& cu, uint32_t absPartIdx); + void codeQtRootCbf(uint32_t cbf); + void codeRefFrmIdxPU(const CUData& cu, uint32_t absPartIdx, int list); + void codeRefFrmIdx(const CUData& cu, uint32_t absPartIdx, int list); + + void codeSaoMaxUvlc(uint32_t code, uint32_t maxSymbol); + + void codeDeltaQP(const CUData& cu, uint32_t absPartIdx); + void codeLastSignificantXY(uint32_t posx, uint32_t posy, uint32_t log2TrSize, bool bIsLuma, uint32_t scanIdx); + void codeTransformSkipFlags(const CUData& cu, uint32_t absPartIdx, uint32_t trSize, TextType ttype); + + struct CoeffCodeState + { + uint32_t bakAbsPartIdx; + uint32_t bakChromaOffset; + uint32_t bakAbsPartIdxCU; + }; + + void encodeTransform(const CUData& cu, CoeffCodeState& state, uint32_t offsetLumaOffset, 
uint32_t offsetChroma, + uint32_t absPartIdx, uint32_t absPartIdxStep, uint32_t depth, uint32_t log2TrSize, uint32_t trIdx, + bool& bCodeDQP, uint32_t depthRange[2]); + + void copyFrom(const Entropy& src); + void copyContextsFrom(const Entropy& src); +}; +} + +#endif // ifndef X265_ENTROPY_H diff --git a/source/encoder/frameencoder.cpp b/source/encoder/frameencoder.cpp new file mode 100644 index 0000000..c6e6915 --- /dev/null +++ b/source/encoder/frameencoder.cpp @@ -0,0 +1,1142 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Chung Shin Yee + * Min Chen + * Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. 
+ *****************************************************************************/ + +#include "common.h" +#include "frame.h" +#include "framedata.h" +#include "wavefront.h" +#include "param.h" + +#include "PPA/ppa.h" + +#include "encoder.h" +#include "frameencoder.h" +#include "common.h" +#include "slicetype.h" +#include "nal.h" + +namespace x265 { +void weightAnalyse(Slice& slice, Frame& frame, x265_param& param); + +FrameEncoder::FrameEncoder() + : WaveFront(NULL) + , m_threadActive(true) +{ + m_totalTime = 0; + m_frameEncoderID = 0; + m_bAllRowsStop = false; + m_vbvResetTriggerRow = -1; + m_outStreams = NULL; + m_substreamSizes = NULL; + m_nr = NULL; + m_tld = NULL; + m_rows = NULL; + m_top = NULL; + m_param = NULL; + m_frame = NULL; + m_cuGeoms = NULL; + m_ctuGeomMap = NULL; + memset(&m_frameStats, 0, sizeof(m_frameStats)); + memset(&m_rce, 0, sizeof(RateControlEntry)); +} + +void FrameEncoder::destroy() +{ + if (m_pool) + JobProvider::flush(); // ensure no worker threads are using this frame + + m_threadActive = false; + m_enable.trigger(); + + delete[] m_rows; + delete[] m_outStreams; + X265_FREE(m_cuGeoms); + X265_FREE(m_ctuGeomMap); + X265_FREE(m_substreamSizes); + X265_FREE(m_nr); + + m_frameFilter.destroy(); + + if (m_param->bEmitHRDSEI || !!m_param->interlaceMode) + { + delete m_rce.picTimingSEI; + delete m_rce.hrdTiming; + } + + // wait for worker thread to exit + stop(); +} + +bool FrameEncoder::init(Encoder *top, int numRows, int numCols, int id) +{ + m_top = top; + m_param = top->m_param; + m_numRows = numRows; + m_numCols = numCols; + m_filterRowDelay = (m_param->bEnableSAO && m_param->bSaoNonDeblocked) ? + 2 : (m_param->bEnableSAO || m_param->bEnableLoopFilter ? 
1 : 0); + m_filterRowDelayCus = m_filterRowDelay * numCols; + m_frameEncoderID = id; + m_rows = new CTURow[m_numRows]; + bool ok = !!m_numRows; + + int range = m_param->searchRange; /* fpel search */ + range += 1; /* diamond search range check lag */ + range += 2; /* subpel refine */ + range += NTAPS_LUMA / 2; /* subpel filter half-length */ + m_refLagRows = 1 + ((range + g_maxCUSize - 1) / g_maxCUSize); + + // NOTE: 2 times of numRows because both Encoder and Filter in same queue + if (!WaveFront::init(m_numRows * 2)) + { + x265_log(m_param, X265_LOG_ERROR, "unable to initialize wavefront queue\n"); + m_pool = NULL; + } + + m_frameFilter.init(top, this, numRows); + + // initialize HRD parameters of SPS + if (m_param->bEmitHRDSEI || !!m_param->interlaceMode) + { + m_rce.picTimingSEI = new SEIPictureTiming; + m_rce.hrdTiming = new HRDTiming; + + ok &= m_rce.picTimingSEI && m_rce.hrdTiming; + } + + if (m_param->noiseReduction) + m_nr = X265_MALLOC(NoiseReduction, 1); + if (m_nr) + memset(m_nr, 0, sizeof(NoiseReduction)); + else + m_param->noiseReduction = 0; + + start(); + return ok; +} + +/* Generate a complete list of unique geom sets for the current picture dimensions */ +bool FrameEncoder::initializeGeoms(const FrameData& encData) +{ + /* Geoms only vary between CTUs in the presence of picture edges */ + int heightRem = m_param->sourceHeight & (m_param->maxCUSize - 1); + int widthRem = m_param->sourceWidth & (m_param->maxCUSize - 1); + int allocGeoms = 1; // body + if (heightRem && widthRem) + allocGeoms = 4; // body, right, bottom, corner + else if (heightRem || widthRem) + allocGeoms = 2; // body, right or bottom + + m_ctuGeomMap = X265_MALLOC(uint32_t, m_numRows * m_numCols); + m_cuGeoms = X265_MALLOC(CUGeom, allocGeoms * CUGeom::MAX_GEOMS); + if (!m_cuGeoms || !m_ctuGeomMap) + return false; + + CUGeom cuLocalData[CUGeom::MAX_GEOMS]; + memset(cuLocalData, 0, sizeof(cuLocalData)); // temporal fix for memcmp + + int countGeoms = 0; + for (uint32_t ctuAddr = 0; 
ctuAddr < m_numRows * m_numCols; ctuAddr++) + { + /* TODO: detach this logic from TComDataCU */ + encData.m_picCTU[ctuAddr].initCTU(*m_frame, ctuAddr, 0); + encData.m_picCTU[ctuAddr].calcCTUGeoms(m_param->sourceWidth, m_param->sourceHeight, m_param->maxCUSize, cuLocalData); + + m_ctuGeomMap[ctuAddr] = MAX_INT; + for (int i = 0; i < countGeoms; i++) + { + if (!memcmp(cuLocalData, m_cuGeoms + i * CUGeom::MAX_GEOMS, sizeof(CUGeom) * CUGeom::MAX_GEOMS)) + { + m_ctuGeomMap[ctuAddr] = i * CUGeom::MAX_GEOMS; + break; + } + } + + if (m_ctuGeomMap[ctuAddr] == MAX_INT) + { + X265_CHECK(countGeoms < allocGeoms, "geometry match check failure\n"); + m_ctuGeomMap[ctuAddr] = countGeoms * CUGeom::MAX_GEOMS; + memcpy(m_cuGeoms + countGeoms * CUGeom::MAX_GEOMS, cuLocalData, sizeof(CUGeom) * CUGeom::MAX_GEOMS); + countGeoms++; + } + } + + return true; +} + +bool FrameEncoder::startCompressFrame(Frame* curFrame) +{ + m_frame = curFrame; + curFrame->m_encData->m_frameEncoderID = m_frameEncoderID; // Each Frame knows the ID of the FrameEncoder encoding it + curFrame->m_encData->m_slice->m_mref = m_mref; + if (!m_cuGeoms) + { + if (!initializeGeoms(*curFrame->m_encData)) + return false; + } + m_enable.trigger(); + return true; +} + +void FrameEncoder::threadMain() +{ + // worker thread routine for FrameEncoder + do + { + m_enable.wait(); // Encoder::encode() triggers this event + if (m_threadActive) + { + compressFrame(); + m_done.trigger(); // FrameEncoder::getEncodedPicture() blocks for this event + } + } + while (m_threadActive); +} + +void FrameEncoder::compressFrame() +{ + PPAScopeEvent(FrameEncoder_compressFrame); + int64_t startCompressTime = x265_mdate(); + Slice* slice = m_frame->m_encData->m_slice; + + /* Emit access unit delimiter unless this is the first frame and the user is + * not repeating headers (since AUD is supposed to be the first NAL in the access + * unit) */ + if (m_param->bEnableAccessUnitDelimiters && (m_frame->m_poc || m_param->bRepeatHeaders)) + { + 
m_bs.resetBits(); + m_entropyCoder.setBitstream(&m_bs); + m_entropyCoder.codeAUD(*slice); + m_bs.writeByteAlignment(); + m_nalList.serialize(NAL_UNIT_ACCESS_UNIT_DELIMITER, m_bs); + } + if (m_frame->m_lowres.bKeyframe && m_param->bRepeatHeaders) + m_top->getStreamHeaders(m_nalList, m_entropyCoder, m_bs); + + // Weighted Prediction parameters estimation. + bool bUseWeightP = slice->m_sliceType == P_SLICE && slice->m_pps->bUseWeightPred; + bool bUseWeightB = slice->m_sliceType == B_SLICE && slice->m_pps->bUseWeightedBiPred; + if (bUseWeightP || bUseWeightB) + weightAnalyse(*slice, *m_frame, *m_param); + else + slice->disableWeights(); + + // Generate motion references + int numPredDir = slice->isInterP() ? 1 : slice->isInterB() ? 2 : 0; + for (int l = 0; l < numPredDir; l++) + { + for (int ref = 0; ref < slice->m_numRefIdx[l]; ref++) + { + WeightParam *w = NULL; + if ((bUseWeightP || bUseWeightB) && slice->m_weightPredTable[l][ref][0].bPresentFlag) + w = slice->m_weightPredTable[l][ref]; + m_mref[l][ref].init(slice->m_refPicList[l][ref]->m_reconPicYuv, w); + } + } + + /* Get the QP for this frame from rate control. This call may block until + * frames ahead of it in encode order have called rateControlEnd() */ + int qp = m_top->m_rateControl->rateControlStart(m_frame, &m_rce, m_top); + m_rce.newQp = qp; + + /* Clip slice QP to 0-51 spec range before encoding */ + slice->m_sliceQp = Clip3(-QP_BD_OFFSET, QP_MAX_SPEC, qp); + + m_initSliceContext.resetEntropy(*slice); + + m_frameFilter.start(m_frame, m_initSliceContext, qp); + + // reset entropy coders + m_entropyCoder.load(m_initSliceContext); + for (int i = 0; i < m_numRows; i++) + m_rows[i].init(m_initSliceContext); + + uint32_t numSubstreams = m_param->bEnableWavefront ? 
slice->m_sps->numCuInHeight : 1; + if (!m_outStreams) + { + m_outStreams = new Bitstream[numSubstreams]; + m_substreamSizes = X265_MALLOC(uint32_t, numSubstreams); + if (!m_param->bEnableSAO) + for (uint32_t i = 0; i < numSubstreams; i++) + m_rows[i].rowGoOnCoder.setBitstream(&m_outStreams[i]); + } + else + for (uint32_t i = 0; i < numSubstreams; i++) + m_outStreams[i].resetBits(); + + if (m_frame->m_lowres.bKeyframe) + { + if (m_param->bEmitHRDSEI) + { + SEIBufferingPeriod* bpSei = &m_top->m_rateControl->m_bufPeriodSEI; + + // since the temporal layer HRD is not ready, we assumed it is fixed + bpSei->m_auCpbRemovalDelayDelta = 1; + bpSei->m_cpbDelayOffset = 0; + bpSei->m_dpbDelayOffset = 0; + + // hrdFullness() calculates the initial CPB removal delay and offset + m_top->m_rateControl->hrdFullness(bpSei); + + m_bs.resetBits(); + bpSei->write(m_bs, *slice->m_sps); + m_bs.writeByteAlignment(); + + m_nalList.serialize(NAL_UNIT_PREFIX_SEI, m_bs); + + m_top->m_lastBPSEI = m_rce.encodeOrder; + } + + // The recovery point SEI message assists a decoder in determining when the decoding + // process will produce acceptable pictures for display after the decoder initiates + // random access. The m_recoveryPocCnt is in units of POC(picture order count) which + // means pictures encoded after the CRA but precede it in display order(leading) are + // implicitly discarded after a random access seek regardless of the value of + // m_recoveryPocCnt. Our encoder does not use references prior to the most recent CRA, + // so all pictures following the CRA in POC order are guaranteed to be displayable, + // so m_recoveryPocCnt is always 0. 
+ SEIRecoveryPoint sei_recovery_point; + sei_recovery_point.m_recoveryPocCnt = 0; + sei_recovery_point.m_exactMatchingFlag = true; + sei_recovery_point.m_brokenLinkFlag = false; + + m_bs.resetBits(); + sei_recovery_point.write(m_bs, *slice->m_sps); + m_bs.writeByteAlignment(); + + m_nalList.serialize(NAL_UNIT_PREFIX_SEI, m_bs); + } + + if (m_param->bEmitHRDSEI || !!m_param->interlaceMode) + { + SEIPictureTiming *sei = m_rce.picTimingSEI; + const VUI *vui = &slice->m_sps->vuiParameters; + const HRDInfo *hrd = &vui->hrdParameters; + int poc = slice->m_poc; + + if (vui->frameFieldInfoPresentFlag) + { + if (m_param->interlaceMode == 2) + sei->m_picStruct = (poc & 1) ? 1 /* top */ : 2 /* bottom */; + else if (m_param->interlaceMode == 1) + sei->m_picStruct = (poc & 1) ? 2 /* bottom */ : 1 /* top */; + else + sei->m_picStruct = 0; + sei->m_sourceScanType = 0; + sei->m_duplicateFlag = false; + } + + if (vui->hrdParametersPresentFlag) + { + // The m_aucpbremoval delay specifies how many clock ticks the + // access unit associated with the picture timing SEI message has to + // wait after removal of the access unit with the most recent + // buffering period SEI message + sei->m_auCpbRemovalDelay = X265_MIN(X265_MAX(1, m_rce.encodeOrder - m_top->m_lastBPSEI), (1 << hrd->cpbRemovalDelayLength)); + sei->m_picDpbOutputDelay = slice->m_sps->numReorderPics + poc - m_rce.encodeOrder; + } + + m_bs.resetBits(); + sei->write(m_bs, *slice->m_sps); + m_bs.writeByteAlignment(); + m_nalList.serialize(NAL_UNIT_PREFIX_SEI, m_bs); + } + + // Analyze CTU rows, most of the hard work is done here + // frame is compressed in a wave-front pattern if WPP is enabled. 
Loop filter runs as a + // wave-front behind the CU compression and reconstruction + compressCTURows(); + + if (m_param->rc.bStatWrite) + { + int totalI = 0, totalP = 0, totalSkip = 0; + + // accumulate intra,inter,skip cu count per frame for 2 pass + for (int i = 0; i < m_numRows; i++) + { + m_frameStats.mvBits += m_rows[i].rowStats.mvBits; + m_frameStats.coeffBits += m_rows[i].rowStats.coeffBits; + m_frameStats.miscBits += m_rows[i].rowStats.miscBits; + totalI += m_rows[i].rowStats.iCuCnt; + totalP += m_rows[i].rowStats.pCuCnt; + totalSkip += m_rows[i].rowStats.skipCuCnt; + } + int totalCuCount = totalI + totalP + totalSkip; + m_frameStats.percentIntra = (double)totalI / totalCuCount; + m_frameStats.percentInter = (double)totalP / totalCuCount; + m_frameStats.percentSkip = (double)totalSkip / totalCuCount; + } + + m_bs.resetBits(); + m_entropyCoder.load(m_initSliceContext); + m_entropyCoder.setBitstream(&m_bs); + m_entropyCoder.codeSliceHeader(*slice, *m_frame->m_encData); + + // finish encode of each CTU row, only required when SAO is enabled + if (m_param->bEnableSAO) + encodeSlice(); + + // serialize each row, record final lengths in slice header + uint32_t maxStreamSize = m_nalList.serializeSubstreams(m_substreamSizes, numSubstreams, m_outStreams); + + // complete the slice header by writing WPP row-starts + m_entropyCoder.setBitstream(&m_bs); + if (slice->m_pps->bEntropyCodingSyncEnabled) + m_entropyCoder.codeSliceHeaderWPPEntryPoints(*slice, m_substreamSizes, maxStreamSize); + m_bs.writeByteAlignment(); + + m_nalList.serialize(slice->m_nalUnitType, m_bs); + + if (m_param->decodedPictureHashSEI) + { + if (m_param->decodedPictureHashSEI == 1) + { + m_seiReconPictureDigest.m_method = SEIDecodedPictureHash::MD5; + for (int i = 0; i < 3; i++) + MD5Final(&m_state[i], m_seiReconPictureDigest.m_digest[i]); + } + else if (m_param->decodedPictureHashSEI == 2) + { + m_seiReconPictureDigest.m_method = SEIDecodedPictureHash::CRC; + for (int i = 0; i < 3; i++) + 
crcFinish(m_crc[i], m_seiReconPictureDigest.m_digest[i]); + } + else if (m_param->decodedPictureHashSEI == 3) + { + m_seiReconPictureDigest.m_method = SEIDecodedPictureHash::CHECKSUM; + for (int i = 0; i < 3; i++) + checksumFinish(m_checksum[i], m_seiReconPictureDigest.m_digest[i]); + } + + m_bs.resetBits(); + m_seiReconPictureDigest.write(m_bs, *slice->m_sps); + m_bs.writeByteAlignment(); + + m_nalList.serialize(NAL_UNIT_SUFFIX_SEI, m_bs); + } + + uint64_t bytes = 0; + for (uint32_t i = 0; i < m_nalList.m_numNal; i++) + { + int type = m_nalList.m_nal[i].type; + + // exclude SEI + if (type != NAL_UNIT_PREFIX_SEI && type != NAL_UNIT_SUFFIX_SEI) + { + bytes += m_nalList.m_nal[i].sizeBytes; + // and exclude start code prefix + bytes -= (!i || type == NAL_UNIT_SPS || type == NAL_UNIT_PPS) ? 4 : 3; + } + } + m_accessUnitBits = bytes << 3; + + m_elapsedCompressTime = (double)(x265_mdate() - startCompressTime) / 1000000; + /* rateControlEnd may also block for earlier frames to call rateControlUpdateStats */ + if (m_top->m_rateControl->rateControlEnd(m_frame, m_accessUnitBits, &m_rce, &m_frameStats) < 0) + m_top->m_aborted = true; + + /* Accumulate NR statistics from all worker threads */ + if (m_nr) + { + for (int i = 0; i < m_top->m_numThreadLocalData; i++) + { + NoiseReduction* nr = &m_top->m_threadLocalData[i].analysis.m_quant.m_frameNr[m_frameEncoderID]; + for (int cat = 0; cat < MAX_NUM_TR_CATEGORIES; cat++) + { + for(int coeff = 0; coeff < MAX_NUM_TR_COEFFS; coeff++) + m_nr->residualSum[cat][coeff] += nr->residualSum[cat][coeff]; + + m_nr->count[cat] += nr->count[cat]; + } + } + } + + noiseReductionUpdate(); + + /* Copy updated NR coefficients back to all worker threads */ + if (m_nr) + { + for (int i = 0; i < m_top->m_numThreadLocalData; i++) + { + NoiseReduction* nr = &m_top->m_threadLocalData[i].analysis.m_quant.m_frameNr[m_frameEncoderID]; + memcpy(nr->offsetDenoise, m_nr->offsetDenoise, sizeof(uint32_t) * MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS); + 
memset(nr->count, 0, sizeof(uint32_t) * MAX_NUM_TR_CATEGORIES); + memset(nr->residualSum, 0, sizeof(uint32_t) * MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS); + } + } + + // Decrement referenced frame reference counts, allow them to be recycled + for (int l = 0; l < numPredDir; l++) + { + for (int ref = 0; ref < slice->m_numRefIdx[l]; ref++) + { + Frame *refpic = slice->m_refPicList[l][ref]; + ATOMIC_DEC(&refpic->m_countRefEncoders); + } + } +} + +void FrameEncoder::encodeSlice() +{ + Slice* slice = m_frame->m_encData->m_slice; + const uint32_t widthInLCUs = slice->m_sps->numCuInWidth; + const uint32_t lastCUAddr = (slice->m_endCUAddr + NUM_CU_PARTITIONS - 1) / NUM_CU_PARTITIONS; + const uint32_t numSubstreams = m_param->bEnableWavefront ? slice->m_sps->numCuInHeight : 1; + + SAOParam* saoParam = slice->m_sps->bUseSAO ? m_frame->m_encData->m_saoParam : NULL; + for (uint32_t cuAddr = 0; cuAddr < lastCUAddr; cuAddr++) + { + uint32_t col = cuAddr % widthInLCUs; + uint32_t lin = cuAddr / widthInLCUs; + uint32_t subStrm = lin % numSubstreams; + CUData* ctu = m_frame->m_encData->getPicCTU(cuAddr); + + m_entropyCoder.setBitstream(&m_outStreams[subStrm]); + + // Synchronize cabac probabilities with upper-right CTU if it's available and we're at the start of a line. 
+ if (m_param->bEnableWavefront && !col && lin) + { + m_entropyCoder.copyState(m_initSliceContext); + m_entropyCoder.loadContexts(m_rows[lin - 1].bufferedEntropy); + } + + if (saoParam) + { + if (saoParam->bSaoFlag[0] || saoParam->bSaoFlag[1]) + { + int mergeLeft = col && saoParam->ctuParam[0][cuAddr].mergeMode == SAO_MERGE_LEFT; + int mergeUp = lin && saoParam->ctuParam[0][cuAddr].mergeMode == SAO_MERGE_UP; + if (col) + m_entropyCoder.codeSaoMerge(mergeLeft); + if (lin && !mergeLeft) + m_entropyCoder.codeSaoMerge(mergeUp); + if (!mergeLeft && !mergeUp) + { + if (saoParam->bSaoFlag[0]) + m_entropyCoder.codeSaoOffset(saoParam->ctuParam[0][cuAddr], 0); + if (saoParam->bSaoFlag[1]) + { + m_entropyCoder.codeSaoOffset(saoParam->ctuParam[1][cuAddr], 1); + m_entropyCoder.codeSaoOffset(saoParam->ctuParam[2][cuAddr], 2); + } + } + } + else + { + for (int i = 0; i < 3; i++) + saoParam->ctuParam[i][cuAddr].reset(); + } + } + + // final coding (bitstream generation) for this CU + m_entropyCoder.encodeCTU(*ctu, m_cuGeoms[m_ctuGeomMap[cuAddr]]); + + if (m_param->bEnableWavefront) + { + if (col == 1) + // Store probabilities of second CTU in line into buffer + m_rows[lin].bufferedEntropy.loadContexts(m_entropyCoder); + + if (col == widthInLCUs - 1) + m_entropyCoder.finishSlice(); + } + } + if (!m_param->bEnableWavefront) + m_entropyCoder.finishSlice(); +} + +void FrameEncoder::compressCTURows() +{ + PPAScopeEvent(FrameEncoder_compressRows); + Slice* slice = m_frame->m_encData->m_slice; + + m_bAllRowsStop = false; + m_vbvResetTriggerRow = -1; + + m_SSDY = m_SSDU = m_SSDV = 0; + m_ssim = 0; + m_ssimCnt = 0; + memset(&m_frameStats, 0, sizeof(m_frameStats)); + + bool bUseWeightP = slice->m_pps->bUseWeightPred && slice->m_sliceType == P_SLICE; + bool bUseWeightB = slice->m_pps->bUseWeightedBiPred && slice->m_sliceType == B_SLICE; + int numPredDir = slice->isInterP() ? 1 : slice->isInterB() ? 
2 : 0; + + m_rows[0].active = true; + if (m_pool && m_param->bEnableWavefront) + { + WaveFront::clearEnabledRowMask(); + WaveFront::enqueue(); + + for (int row = 0; row < m_numRows; row++) + { + // block until all reference frames have reconstructed the rows we need + for (int l = 0; l < numPredDir; l++) + { + for (int ref = 0; ref < slice->m_numRefIdx[l]; ref++) + { + Frame *refpic = slice->m_refPicList[l][ref]; + + int reconRowCount = refpic->m_reconRowCount.get(); + while ((reconRowCount != m_numRows) && (reconRowCount < row + m_refLagRows)) + reconRowCount = refpic->m_reconRowCount.waitForChange(reconRowCount); + + if ((bUseWeightP || bUseWeightB) && m_mref[l][ref].isWeighted) + m_mref[l][ref].applyWeight(row + m_refLagRows, m_numRows); + } + } + + enableRowEncoder(row); + if (row == 0) + enqueueRowEncoder(0); + else + m_pool->pokeIdleThread(); + } + + m_completionEvent.wait(); + + WaveFront::dequeue(); + } + else + { + for (int i = 0; i < this->m_numRows + m_filterRowDelay; i++) + { + // Encode + if (i < m_numRows) + { + // block until all reference frames have reconstructed the rows we need + for (int l = 0; l < numPredDir; l++) + { + int list = l; + for (int ref = 0; ref < slice->m_numRefIdx[list]; ref++) + { + Frame *refpic = slice->m_refPicList[list][ref]; + + int reconRowCount = refpic->m_reconRowCount.get(); + while ((reconRowCount != m_numRows) && (reconRowCount < i + m_refLagRows)) + reconRowCount = refpic->m_reconRowCount.waitForChange(reconRowCount); + + if ((bUseWeightP || bUseWeightB) && m_mref[l][ref].isWeighted) + m_mref[list][ref].applyWeight(i + m_refLagRows, m_numRows); + } + } + + processRow(i * 2 + 0, -1); + } + + // Filter + if (i >= m_filterRowDelay) + processRow((i - m_filterRowDelay) * 2 + 1, -1); + } + } + m_frameTime = (double)m_totalTime / 1000000; + m_totalTime = 0; +} + +void FrameEncoder::processRow(int row, int threadId) +{ + const int realRow = row >> 1; + const int typeNum = row & 1; + + ThreadLocalData& tld = threadId >= 0 ? 
m_top->m_threadLocalData[threadId] : *m_tld; + + if (!typeNum) + processRowEncoder(realRow, tld); + else + { + processRowFilter(realRow); + + // NOTE: Active next row + if (realRow != m_numRows - 1) + enqueueRowFilter(realRow + 1); + else + m_completionEvent.trigger(); + } +} + +// Called by worker threads +void FrameEncoder::processRowEncoder(int row, ThreadLocalData& tld) +{ + PPAScopeEvent(Thread_ProcessRow); + + CTURow& curRow = m_rows[row]; + + { + ScopedLock self(curRow.lock); + if (!curRow.active) + /* VBV restart is in progress, exit out */ + return; + if (curRow.busy) + { + /* On multi-socket Windows servers, we have seen problems with + * ATOMIC_CAS which resulted in multiple worker threads processing + * the same CU row, which often resulted in bad pointer accesses. We + * believe the problem is fixed, but are leaving this check in place + * to prevent crashes in case it is not */ + x265_log(m_param, X265_LOG_WARNING, + "internal error - simultaneous row access detected. Please report HW to x265-devel@videolan.org\n"); + return; + } + curRow.busy = true; + } + + /* When WPP is enabled, every row has its own row coder instance. Otherwise + * they share row 0 */ + Entropy& rowCoder = m_param->bEnableWavefront ? 
m_rows[row].rowGoOnCoder : m_rows[0].rowGoOnCoder; + FrameData& curEncData = *m_frame->m_encData; + Slice *slice = curEncData.m_slice; + PicYuv* fencPic = m_frame->m_origPicYuv; + + tld.analysis.m_me.setSourcePlane(fencPic->m_picOrg[0], fencPic->m_stride); + + int64_t startTime = x265_mdate(); + const uint32_t numCols = m_numCols; + const uint32_t lineStartCUAddr = row * numCols; + bool bIsVbv = m_param->rc.vbvBufferSize > 0 && m_param->rc.vbvMaxBitrate > 0; + + while (curRow.completed < numCols) + { + int col = curRow.completed; + const uint32_t cuAddr = lineStartCUAddr + col; + CUData* ctu = curEncData.getPicCTU(cuAddr); + ctu->initCTU(*m_frame, cuAddr, slice->m_sliceQp); + + if (bIsVbv) + { + if (!row) + { + curEncData.m_rowStat[row].diagQp = curEncData.m_avgQpRc; + curEncData.m_rowStat[row].diagQpScale = x265_qp2qScale(curEncData.m_avgQpRc); + } + + if (row >= col && row && m_vbvResetTriggerRow != row) + curEncData.m_cuStat[cuAddr].baseQp = curEncData.m_cuStat[cuAddr - numCols + 1].baseQp; + else + curEncData.m_cuStat[cuAddr].baseQp = curEncData.m_rowStat[row].diagQp; + } + else + curEncData.m_cuStat[cuAddr].baseQp = curEncData.m_avgQpRc; + + if (m_param->rc.aqMode || bIsVbv) + { + int qp = calcQpForCu(cuAddr, curEncData.m_cuStat[cuAddr].baseQp); + tld.analysis.setQP(*slice, qp); + qp = Clip3(QP_MIN, QP_MAX_SPEC, qp); + ctu->setQPSubParts((char)qp, 0, 0); + curEncData.m_rowStat[row].sumQpAq += qp; + } + else + tld.analysis.setQP(*slice, slice->m_sliceQp); + + if (m_param->bEnableWavefront && !col && row) + { + // Load SBAC coder context from previous row and initialize row state. + rowCoder.copyState(m_initSliceContext); + rowCoder.loadContexts(m_rows[row - 1].bufferedEntropy); + } + + // Does all the CU analysis, returns best top level mode decision + Search::Mode& best = tld.analysis.compressCTU(*ctu, *m_frame, m_cuGeoms[m_ctuGeomMap[cuAddr]], rowCoder); + + /* advance top-level row coder to include the context of this CTU. 
+ * if SAO is disabled, rowCoder writes the final CTU bitstream */ + rowCoder.encodeCTU(*ctu, m_cuGeoms[m_ctuGeomMap[cuAddr]]); + + if (m_param->bEnableWavefront && col == 1) + // Save CABAC state for next row + curRow.bufferedEntropy.loadContexts(rowCoder); + + // Completed CU processing + curRow.completed++; + + if (m_param->bLogCuStats || m_param->rc.bStatWrite) + collectCTUStatistics(*ctu); + + // copy no. of intra, inter Cu cnt per row into frame stats for 2 pass + if (m_param->rc.bStatWrite) + { + curRow.rowStats.mvBits += best.mvBits; + curRow.rowStats.coeffBits += best.coeffBits; + curRow.rowStats.miscBits += best.totalBits - (best.mvBits + best.coeffBits); + StatisticLog* log = &m_sliceTypeLog[slice->m_sliceType]; + + for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++) + { + /* 1 << shift == number of 8x8 blocks at current depth */ + int shift = 2 * (g_maxCUDepth - depth); + curRow.rowStats.iCuCnt += log->qTreeIntraCnt[depth] << shift; + curRow.rowStats.pCuCnt += log->qTreeInterCnt[depth] << shift; + curRow.rowStats.skipCuCnt += log->qTreeSkipCnt[depth] << shift; + + // clear the row cu data from thread local object + log->qTreeIntraCnt[depth] = log->qTreeInterCnt[depth] = log->qTreeSkipCnt[depth] = 0; + } + } + + curEncData.m_cuStat[cuAddr].totalBits = best.totalBits; + x265_emms(); + + if (bIsVbv) + { + // Update encoded bits, satdCost, baseQP for each CU + curEncData.m_rowStat[row].diagSatd += curEncData.m_cuStat[cuAddr].vbvCost; + curEncData.m_rowStat[row].diagIntraSatd += curEncData.m_cuStat[cuAddr].intraVbvCost; + curEncData.m_rowStat[row].encodedBits += curEncData.m_cuStat[cuAddr].totalBits; + curEncData.m_rowStat[row].sumQpRc += curEncData.m_cuStat[cuAddr].baseQp; + curEncData.m_rowStat[row].numEncodedCUs = cuAddr; + + // If current block is at row diagonal checkpoint, call vbv ratecontrol. 
+ + if (row == col && row) + { + double qpBase = curEncData.m_cuStat[cuAddr].baseQp; + int reEncode = m_top->m_rateControl->rowDiagonalVbvRateControl(m_frame, row, &m_rce, qpBase); + qpBase = Clip3((double)QP_MIN, (double)QP_MAX_MAX, qpBase); + curEncData.m_rowStat[row].diagQp = qpBase; + curEncData.m_rowStat[row].diagQpScale = x265_qp2qScale(qpBase); + + if (reEncode < 0) + { + x265_log(m_param, X265_LOG_DEBUG, "POC %d row %d - encode restart required for VBV, to %.2f from %.2f\n", + m_frame->m_poc, row, qpBase, curEncData.m_cuStat[cuAddr].baseQp); + + // prevent the WaveFront::findJob() method from providing new jobs + m_vbvResetTriggerRow = row; + m_bAllRowsStop = true; + + for (int r = m_numRows - 1; r >= row; r--) + { + CTURow& stopRow = m_rows[r]; + + if (r != row) + { + /* if row was active (ready to be run) clear active bit and bitmap bit for this row */ + stopRow.lock.acquire(); + while (stopRow.active) + { + if (dequeueRow(r * 2)) + stopRow.active = false; + else + GIVE_UP_TIME(); + } + + stopRow.lock.release(); + + bool bRowBusy = true; + do + { + stopRow.lock.acquire(); + bRowBusy = stopRow.busy; + stopRow.lock.release(); + + if (bRowBusy) + { + GIVE_UP_TIME(); + } + } + while (bRowBusy); + } + + m_outStreams[r].resetBits(); + stopRow.completed = 0; + memset(&stopRow.rowStats, 0, sizeof(stopRow.rowStats)); + curEncData.m_rowStat[r].numEncodedCUs = 0; + curEncData.m_rowStat[r].encodedBits = 0; + curEncData.m_rowStat[r].diagSatd = 0; + curEncData.m_rowStat[r].diagIntraSatd = 0; + curEncData.m_rowStat[r].sumQpRc = 0; + curEncData.m_rowStat[r].sumQpAq = 0; + } + + m_bAllRowsStop = false; + } + } + } + + // NOTE: do CU level Filter + if (m_param->bEnableSAO && m_param->bSaoNonDeblocked) + // SAO parameter estimation using non-deblocked pixels for CTU bottom and right boundary areas + m_frameFilter.m_sao.calcSaoStatsCu_BeforeDblk(m_frame, col, row); + + // NOTE: active next row + if (curRow.completed >= 2 && row < m_numRows - 1) + { + ScopedLock 
below(m_rows[row + 1].lock); + if (m_rows[row + 1].active == false && + m_rows[row + 1].completed + 2 <= curRow.completed && + (!m_bAllRowsStop || row + 1 < m_vbvResetTriggerRow)) + { + m_rows[row + 1].active = true; + enqueueRowEncoder(row + 1); + } + } + + ScopedLock self(curRow.lock); + if ((m_bAllRowsStop && row > m_vbvResetTriggerRow) || + (row > 0 && curRow.completed < numCols - 1 && m_rows[row - 1].completed < m_rows[row].completed + 2)) + { + curRow.active = false; + curRow.busy = false; + m_totalTime += x265_mdate() - startTime; + return; + } + } + + /* *this row of CTUs has been encoded* */ + + /* flush row bitstream (if WPP and no SAO) or flush frame if no WPP and no SAO */ + if (!m_param->bEnableSAO && (m_param->bEnableWavefront || row == m_numRows - 1)) + rowCoder.finishSlice(); + + /* If encoding with ABR, update update bits and complexity in rate control + * after a number of rows so the next frame's rateControlStart has more + * accurate data for estimation. At the start of the encode we update stats + * after half the frame is encoded, but after this initial period we update + * after refLagRows (the number of rows reference frames must have completed + * before referencees may begin encoding) */ + int rowCount = 0; + if (m_param->rc.rateControlMode == X265_RC_ABR) + { + if ((uint32_t)m_rce.encodeOrder <= 2 * (m_param->fpsNum / m_param->fpsDenom)) + rowCount = X265_MIN((m_numRows + 1) / 2, m_numRows - 1); + else + rowCount = X265_MIN(m_refLagRows, m_numRows - 1); + } + if (row == rowCount) + { + m_rce.rowTotalBits = 0; + if (bIsVbv) + for (int i = 0; i < rowCount; i++) + m_rce.rowTotalBits += curEncData.m_rowStat[i].encodedBits; + else + for (uint32_t cuAddr = 0; cuAddr < rowCount * numCols; cuAddr++) + m_rce.rowTotalBits += curEncData.m_cuStat[cuAddr].totalBits; + + m_top->m_rateControl->rateControlUpdateStats(&m_rce); + } + + // trigger row-wise loop filters + if (row >= m_filterRowDelay) + { + enableRowFilter(row - m_filterRowDelay); + + // 
NOTE: Active Filter to first row (row 0) + if (row == m_filterRowDelay) + enqueueRowFilter(0); + } + if (row == m_numRows - 1) + { + for (int i = m_numRows - m_filterRowDelay; i < m_numRows; i++) + enableRowFilter(i); + } + + m_totalTime += x265_mdate() - startTime; + curRow.busy = false; +} + +void FrameEncoder::collectCTUStatistics(CUData& ctu) +{ + StatisticLog* log = &m_sliceTypeLog[ctu.m_slice->m_sliceType]; + + if (ctu.m_slice->m_sliceType == I_SLICE) + { + uint32_t depth = 0; + for (uint32_t absPartIdx = 0; absPartIdx < ctu.m_numPartitions; absPartIdx += ctu.m_numPartitions >> (depth * 2)) + { + depth = ctu.m_cuDepth[absPartIdx]; + + log->totalCu++; + log->cntIntra[depth]++; + log->qTreeIntraCnt[depth]++; + + if (ctu.m_partSize[absPartIdx] == SIZE_NONE) + { + log->totalCu--; + log->cntIntra[depth]--; + log->qTreeIntraCnt[depth]--; + } + else if (ctu.m_partSize[absPartIdx] == SIZE_NxN) + { + /* TODO: log intra modes at absPartIdx +0 to +3 */ + X265_CHECK(depth == g_maxCUDepth, "Intra NxN found at improbable depth\n"); + log->cntIntraNxN++; + log->cntIntra[depth]--; + } + else if (ctu.m_lumaIntraDir[absPartIdx] > 1) + log->cuIntraDistribution[depth][ANGULAR_MODE_ID]++; + else + log->cuIntraDistribution[depth][ctu.m_lumaIntraDir[absPartIdx]]++; + } + } + else + { + uint32_t depth = 0; + for (uint32_t absPartIdx = 0; absPartIdx < ctu.m_numPartitions; absPartIdx += ctu.m_numPartitions >> (depth * 2)) + { + depth = ctu.m_cuDepth[absPartIdx]; + + log->totalCu++; + log->cntTotalCu[depth]++; + + if (ctu.m_partSize[absPartIdx] == SIZE_NONE) + { + log->totalCu--; + log->cntTotalCu[depth]--; + } + else if (ctu.isSkipped(absPartIdx)) + { + log->totalCu--; + log->cntSkipCu[depth]++; + log->qTreeSkipCnt[depth]++; + } + else if (ctu.m_predMode[absPartIdx] == MODE_INTER) + { + log->cntInter[depth]++; + log->qTreeInterCnt[depth]++; + + if (ctu.m_partSize[absPartIdx] < AMP_ID) + log->cuInterDistribution[depth][ctu.m_partSize[absPartIdx]]++; + else + 
log->cuInterDistribution[depth][AMP_ID]++; + } + else if (ctu.m_predMode[absPartIdx] == MODE_INTRA) + { + log->cntIntra[depth]++; + log->qTreeIntraCnt[depth]++; + + if (ctu.m_partSize[absPartIdx] == SIZE_NxN) + { + X265_CHECK(depth == g_maxCUDepth, "Intra NxN found at improbable depth\n"); + log->cntIntraNxN++; + /* TODO: log intra modes at absPartIdx +0 to +3 */ + } + else if (ctu.m_lumaIntraDir[absPartIdx] > 1) + log->cuIntraDistribution[depth][ANGULAR_MODE_ID]++; + else + log->cuIntraDistribution[depth][ctu.m_lumaIntraDir[absPartIdx]]++; + } + } + } +} + +/* DCT-domain noise reduction / adaptive deadzone from libavcodec */ +void FrameEncoder::noiseReductionUpdate() +{ + if (!m_nr) + return; + + static const uint32_t maxBlocksPerTrSize[4] = {1 << 18, 1 << 16, 1 << 14, 1 << 12}; + + for (int cat = 0; cat < MAX_NUM_TR_CATEGORIES; cat++) + { + int trSize = cat & 3; + int coefCount = 1 << ((trSize + 2) * 2); + + if (m_nr->count[cat] > maxBlocksPerTrSize[trSize]) + { + for (int i = 0; i < coefCount; i++) + m_nr->residualSum[cat][i] >>= 1; + m_nr->count[cat] >>= 1; + } + + uint64_t scaledCount = (uint64_t)m_param->noiseReduction * m_nr->count[cat]; + + for (int i = 0; i < coefCount; i++) + { + uint64_t value = scaledCount + m_nr->residualSum[cat][i] / 2; + uint64_t denom = m_nr->residualSum[cat][i] + 1; + m_nr->offsetDenoise[cat][i] = (uint16_t)(value / denom); + } + + // Don't denoise DC coefficients + m_nr->offsetDenoise[cat][0] = 0; + } +} + +int FrameEncoder::calcQpForCu(uint32_t ctuAddr, double baseQp) +{ + x265_emms(); + double qp = baseQp; + + FrameData& curEncData = *m_frame->m_encData; + /* clear cuCostsForVbv from when vbv row reset was triggered */ + bool bIsVbv = m_param->rc.vbvBufferSize > 0 && m_param->rc.vbvMaxBitrate > 0; + if (bIsVbv) + { + curEncData.m_cuStat[ctuAddr].vbvCost = 0; + curEncData.m_cuStat[ctuAddr].intraVbvCost = 0; + } + + /* Derive qpOffet for each CU by averaging offsets for all 16x16 blocks in the cu. 
*/ + double qp_offset = 0; + uint32_t maxBlockCols = (m_frame->m_origPicYuv->m_picWidth + (16 - 1)) / 16; + uint32_t maxBlockRows = (m_frame->m_origPicYuv->m_picHeight + (16 - 1)) / 16; + uint32_t noOfBlocks = g_maxCUSize / 16; + uint32_t block_y = (ctuAddr / curEncData.m_slice->m_sps->numCuInWidth) * noOfBlocks; + uint32_t block_x = (ctuAddr * noOfBlocks) - block_y * curEncData.m_slice->m_sps->numCuInWidth; + + /* Use cuTree offsets if cuTree enabled and frame is referenced, else use AQ offsets */ + bool isReferenced = IS_REFERENCED(m_frame); + double *qpoffs = (isReferenced && m_param->rc.cuTree) ? m_frame->m_lowres.qpCuTreeOffset : m_frame->m_lowres.qpAqOffset; + + uint32_t cnt = 0, idx = 0; + for (uint32_t h = 0; h < noOfBlocks && block_y < maxBlockRows; h++, block_y++) + { + for (uint32_t w = 0; w < noOfBlocks && (block_x + w) < maxBlockCols; w++) + { + idx = block_x + w + (block_y * maxBlockCols); + if (m_param->rc.aqMode) + qp_offset += qpoffs[idx]; + if (bIsVbv) + { + curEncData.m_cuStat[ctuAddr].vbvCost += m_frame->m_lowres.lowresCostForRc[idx] & LOWRES_COST_MASK; + curEncData.m_cuStat[ctuAddr].intraVbvCost += m_frame->m_lowres.intraCost[idx]; + } + cnt++; + } + } + + qp_offset /= cnt; + qp += qp_offset; + + return Clip3(QP_MIN, QP_MAX_MAX, (int)(qp + 0.5)); +} + +Frame *FrameEncoder::getEncodedPicture(NALList& output) +{ + if (m_frame) + { + /* block here until worker thread completes */ + m_done.wait(); + + Frame *ret = m_frame; + m_frame = NULL; + output.takeContents(m_nalList); + return ret; + } + + return NULL; +} +} diff --git a/source/encoder/frameencoder.h b/source/encoder/frameencoder.h new file mode 100644 index 0000000..625c025 --- /dev/null +++ b/source/encoder/frameencoder.h @@ -0,0 +1,216 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Shin Yee + * Min Chen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms 
of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#ifndef X265_FRAMEENCODER_H +#define X265_FRAMEENCODER_H + +#include "common.h" +#include "wavefront.h" +#include "bitstream.h" +#include "frame.h" +#include "picyuv.h" +#include "md5.h" + +#include "analysis.h" +#include "sao.h" + +#include "entropy.h" +#include "framefilter.h" +#include "ratecontrol.h" +#include "reference.h" +#include "nal.h" + +namespace x265 { +// private x265 namespace + +class ThreadPool; +class Encoder; + +#define ANGULAR_MODE_ID 2 +#define AMP_ID 3 +#define INTER_MODES 4 +#define INTRA_MODES 3 + +struct StatisticLog +{ + uint64_t cntInter[4]; + uint64_t cntIntra[4]; + uint64_t cuInterDistribution[4][INTER_MODES]; + uint64_t cuIntraDistribution[4][INTRA_MODES]; + uint64_t cntIntraNxN; + uint64_t cntSkipCu[4]; + uint64_t cntTotalCu[4]; + uint64_t totalCu; + + /* These states store the count of inter,intra and skip ctus within quad tree structure of each CU */ + uint32_t qTreeInterCnt[4]; + uint32_t qTreeIntraCnt[4]; + uint32_t qTreeSkipCnt[4]; + + StatisticLog() + { + memset(this, 0, sizeof(StatisticLog)); + } +}; + +/* manages the state of encoding one row of CTU blocks. 
When + * WPP is active, several rows will be simultaneously encoded. */ +struct CTURow +{ + Entropy bufferedEntropy; /* store CTU2 context for next row CTU0 */ + Entropy rowGoOnCoder; /* store context between CTUs, code bitstream if !SAO */ + + FrameStats rowStats; + + /* Threading variables */ + + /* This lock must be acquired when reading or writing m_active or m_busy */ + Lock lock; + + /* row is ready to run, has no neighbor dependencies. The row may have + * external dependencies (reference frame pixels) that prevent it from being + * processed, so it may stay with m_active=true for some time before it is + * encoded by a worker thread. */ + volatile bool active; + + /* row is being processed by a worker thread. This flag is only true when a + * worker thread is within the context of FrameEncoder::processRow(). This + * flag is used to detect multiple possible wavefront problems. */ + volatile bool busy; + + /* count of completed CUs in this row */ + volatile uint32_t completed; + + /* called at the start of each frame to initialize state */ + void init(Entropy& initContext) + { + active = false; + busy = false; + completed = 0; + memset(&rowStats, 0, sizeof(rowStats)); + rowGoOnCoder.load(initContext); + } +}; + +// Manages the wave-front processing of a single encoding frame +class FrameEncoder : public WaveFront, public Thread +{ +public: + + FrameEncoder(); + + virtual ~FrameEncoder() {} + + bool init(Encoder *top, int numRows, int numCols, int id); + + void destroy(); + + /* triggers encode of a new frame by the worker thread */ + bool startCompressFrame(Frame* curFrame); + + /* blocks until worker thread is done, returns access unit */ + Frame *getEncodedPicture(NALList& list); + + Event m_enable; + Event m_done; + bool m_threadActive; + + int m_numRows; + uint32_t m_numCols; + int m_refLagRows; + CTURow* m_rows; + RateControlEntry m_rce; + SEIDecodedPictureHash m_seiReconPictureDigest; + + uint64_t m_SSDY; + uint64_t m_SSDU; + uint64_t m_SSDV; + double 
m_ssim; + uint32_t m_ssimCnt; + MD5Context m_state[3]; + uint32_t m_crc[3]; + uint32_t m_checksum[3]; + double m_elapsedCompressTime; // elapsed time spent in worker threads + double m_frameTime; // wall time from frame start to finish + StatisticLog m_sliceTypeLog[3]; // per-slice type CU statistics + FrameStats m_frameStats; // stats of current frame for multi-pass encodes + volatile bool m_bAllRowsStop; + volatile int m_vbvResetTriggerRow; + uint64_t m_accessUnitBits; + + Encoder* m_top; + x265_param* m_param; + Frame* m_frame; + NoiseReduction* m_nr; + ThreadLocalData* m_tld; /* for --no-wpp */ + Bitstream* m_outStreams; + uint32_t* m_substreamSizes; + + CUGeom* m_cuGeoms; + uint32_t* m_ctuGeomMap; + + Bitstream m_bs; + MotionReference m_mref[2][MAX_NUM_REF + 1]; + Entropy m_entropyCoder; + Entropy m_initSliceContext; + FrameFilter m_frameFilter; + NALList m_nalList; + + int m_filterRowDelay; + int m_filterRowDelayCus; + Event m_completionEvent; + int64_t m_totalTime; + int m_frameEncoderID; + +protected: + + bool initializeGeoms(const FrameData& encData); + + /* analyze / compress frame, can be run in parallel within reference constraints */ + void compressFrame(); + + /* called by compressFrame to perform wave-front compression analysis */ + void compressCTURows(); + + /* called by compressFrame to generate final per-row bitstreams */ + void encodeSlice(); + + void threadMain(); + int calcQpForCu(uint32_t cuAddr, double baseQp); + void collectCTUStatistics(CUData& ctu); + void noiseReductionUpdate(); + + /* Called by WaveFront::findJob() */ + void processRow(int row, int threadId); + void processRowEncoder(int row, ThreadLocalData& tld); + void processRowFilter(int row) { m_frameFilter.processRow(row); } + + void enqueueRowEncoder(int row) { WaveFront::enqueueRow(row * 2 + 0); } + void enqueueRowFilter(int row) { WaveFront::enqueueRow(row * 2 + 1); } + void enableRowEncoder(int row) { WaveFront::enableRow(row * 2 + 0); } + void enableRowFilter(int row) { 
WaveFront::enableRow(row * 2 + 1); } +}; +} + +#endif // ifndef X265_FRAMEENCODER_H diff --git a/source/encoder/framefilter.cpp b/source/encoder/framefilter.cpp new file mode 100644 index 0000000..aee75c6 --- /dev/null +++ b/source/encoder/framefilter.cpp @@ -0,0 +1,491 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Chung Shin Yee + * Min Chen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. 
+ *****************************************************************************/ + +#include "common.h" +#include "frame.h" +#include "framedata.h" +#include "encoder.h" +#include "framefilter.h" +#include "frameencoder.h" +#include "wavefront.h" +#include "PPA/ppa.h" + +using namespace x265; + +static uint64_t computeSSD(pixel *fenc, pixel *rec, intptr_t stride, uint32_t width, uint32_t height); +static float calculateSSIM(pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, uint32_t width, uint32_t height, void *buf, uint32_t& cnt); + +FrameFilter::FrameFilter() + : m_param(NULL) + , m_frame(NULL) + , m_frameEncoder(NULL) + , m_ssimBuf(NULL) +{ +} + +void FrameFilter::destroy() +{ + if (m_param->bEnableSAO) + m_sao.destroy(); + + X265_FREE(m_ssimBuf); +} + +void FrameFilter::init(Encoder *top, FrameEncoder *frame, int numRows) +{ + m_param = top->m_param; + m_frameEncoder = frame; + m_numRows = numRows; + m_hChromaShift = CHROMA_H_SHIFT(m_param->internalCsp); + m_vChromaShift = CHROMA_V_SHIFT(m_param->internalCsp); + m_pad[0] = top->m_sps.conformanceWindow.rightOffset; + m_pad[1] = top->m_sps.conformanceWindow.bottomOffset; + m_saoRowDelay = m_param->bEnableLoopFilter ? 1 : 0; + m_lastHeight = m_param->sourceHeight % g_maxCUSize ? 
m_param->sourceHeight % g_maxCUSize : g_maxCUSize; + + m_deblock.init(); + + if (m_param->bEnableSAO) + if (!m_sao.create(m_param)) + m_param->bEnableSAO = 0; + + if (m_param->bEnableSsim) + m_ssimBuf = X265_MALLOC(int, 8 * (m_param->sourceWidth / 4 + 3)); +} + +void FrameFilter::start(Frame *frame, Entropy& initState, int qp) +{ + m_frame = frame; + + if (m_param->bEnableSAO) + m_sao.startSlice(frame, initState, qp); +} + +void FrameFilter::processRow(int row) +{ + PPAScopeEvent(Thread_filterCU); + + if (!m_param->bEnableLoopFilter && !m_param->bEnableSAO) + { + processRowPost(row); + return; + } + FrameData& encData = *m_frame->m_encData; + const uint32_t numCols = encData.m_slice->m_sps->numCuInWidth; + const uint32_t lineStartCUAddr = row * numCols; + + if (m_param->bEnableLoopFilter) + { + for (uint32_t col = 0; col < numCols; col++) + { + uint32_t cuAddr = lineStartCUAddr + col; + CUData* cu = encData.getPicCTU(cuAddr); + + m_deblock.deblockCTU(cu, Deblock::EDGE_VER); + + if (col > 0) + { + CUData* cuPrev = encData.getPicCTU(cuAddr - 1); + m_deblock.deblockCTU(cuPrev, Deblock::EDGE_HOR); + } + } + + CUData* cuPrev = encData.getPicCTU(lineStartCUAddr + numCols - 1); + m_deblock.deblockCTU(cuPrev, Deblock::EDGE_HOR); + } + + // SAO + SAOParam* saoParam = encData.m_saoParam; + if (m_param->bEnableSAO) + { + m_sao.m_entropyCoder.load(m_frameEncoder->m_initSliceContext); + m_sao.m_rdContexts.next.load(m_frameEncoder->m_initSliceContext); + m_sao.m_rdContexts.cur.load(m_frameEncoder->m_initSliceContext); + + m_sao.rdoSaoUnitRow(saoParam, row); + + // NOTE: Delay a row because SAO decide need top row pixels at next row, is it HM's bug? 
+ if (row >= m_saoRowDelay) + processSao(row - m_saoRowDelay); + } + + // this row of CTUs has been encoded + + if (row > 0) + processRowPost(row - 1); + + if (row == m_numRows - 1) + { + if (m_param->bEnableSAO) + { + m_sao.rdoSaoUnitRowEnd(saoParam, encData.m_slice->m_sps->numCUsInFrame); + + for (int i = m_numRows - m_saoRowDelay; i < m_numRows; i++) + processSao(i); + } + + processRowPost(row); + } +} + +uint32_t FrameFilter::getCUHeight(int rowNum) const +{ + return rowNum == m_numRows - 1 ? m_lastHeight : g_maxCUSize; +} + +void FrameFilter::processRowPost(int row) +{ + PicYuv *reconPic = m_frame->m_reconPicYuv; + const uint32_t numCols = m_frame->m_encData->m_slice->m_sps->numCuInWidth; + const uint32_t lineStartCUAddr = row * numCols; + const int realH = getCUHeight(row); + + // Border extend Left and Right + primitives.extendRowBorder(reconPic->getLumaAddr(lineStartCUAddr), reconPic->m_stride, reconPic->m_picWidth, realH, reconPic->m_lumaMarginX); + primitives.extendRowBorder(reconPic->getCbAddr(lineStartCUAddr), reconPic->m_strideC, reconPic->m_picWidth >> m_hChromaShift, realH >> m_vChromaShift, reconPic->m_chromaMarginX); + primitives.extendRowBorder(reconPic->getCrAddr(lineStartCUAddr), reconPic->m_strideC, reconPic->m_picWidth >> m_hChromaShift, realH >> m_vChromaShift, reconPic->m_chromaMarginX); + + // Border extend Top + if (!row) + { + const intptr_t stride = reconPic->m_stride; + const intptr_t strideC = reconPic->m_strideC; + pixel *pixY = reconPic->getLumaAddr(lineStartCUAddr) - reconPic->m_lumaMarginX; + pixel *pixU = reconPic->getCbAddr(lineStartCUAddr) - reconPic->m_chromaMarginX; + pixel *pixV = reconPic->getCrAddr(lineStartCUAddr) - reconPic->m_chromaMarginX; + + for (uint32_t y = 0; y < reconPic->m_lumaMarginY; y++) + memcpy(pixY - (y + 1) * stride, pixY, stride * sizeof(pixel)); + + for (uint32_t y = 0; y < reconPic->m_chromaMarginY; y++) + { + memcpy(pixU - (y + 1) * strideC, pixU, strideC * sizeof(pixel)); + memcpy(pixV - (y + 1) * 
strideC, pixV, strideC * sizeof(pixel)); + } + } + + // Border extend Bottom + if (row == m_numRows - 1) + { + const intptr_t stride = reconPic->m_stride; + const intptr_t strideC = reconPic->m_strideC; + pixel *pixY = reconPic->getLumaAddr(lineStartCUAddr) - reconPic->m_lumaMarginX + (realH - 1) * stride; + pixel *pixU = reconPic->getCbAddr(lineStartCUAddr) - reconPic->m_chromaMarginX + ((realH >> m_vChromaShift) - 1) * strideC; + pixel *pixV = reconPic->getCrAddr(lineStartCUAddr) - reconPic->m_chromaMarginX + ((realH >> m_vChromaShift) - 1) * strideC; + for (uint32_t y = 0; y < reconPic->m_lumaMarginY; y++) + memcpy(pixY + (y + 1) * stride, pixY, stride * sizeof(pixel)); + + for (uint32_t y = 0; y < reconPic->m_chromaMarginY; y++) + { + memcpy(pixU + (y + 1) * strideC, pixU, strideC * sizeof(pixel)); + memcpy(pixV + (y + 1) * strideC, pixV, strideC * sizeof(pixel)); + } + } + + // Notify other FrameEncoders that this row of reconstructed pixels is available + m_frame->m_reconRowCount.incr(); + + uint32_t cuAddr = lineStartCUAddr; + if (m_param->bEnablePsnr) + { + PicYuv* origPic = m_frame->m_origPicYuv; + + intptr_t stride = reconPic->m_stride; + uint32_t width = reconPic->m_picWidth - m_pad[0]; + uint32_t height = getCUHeight(row); + + uint64_t ssdY = computeSSD(origPic->getLumaAddr(cuAddr), reconPic->getLumaAddr(cuAddr), stride, width, height); + height >>= m_vChromaShift; + width >>= m_hChromaShift; + stride = reconPic->m_strideC; + + uint64_t ssdU = computeSSD(origPic->getCbAddr(cuAddr), reconPic->getCbAddr(cuAddr), stride, width, height); + uint64_t ssdV = computeSSD(origPic->getCrAddr(cuAddr), reconPic->getCrAddr(cuAddr), stride, width, height); + + m_frameEncoder->m_SSDY += ssdY; + m_frameEncoder->m_SSDU += ssdU; + m_frameEncoder->m_SSDV += ssdV; + } + if (m_param->bEnableSsim && m_ssimBuf) + { + pixel *rec = m_frame->m_reconPicYuv->m_picOrg[0]; + pixel *org = m_frame->m_origPicYuv->m_picOrg[0]; + intptr_t stride1 = m_frame->m_origPicYuv->m_stride; + 
intptr_t stride2 = m_frame->m_reconPicYuv->m_stride; + uint32_t bEnd = ((row + 1) == (this->m_numRows - 1)); + uint32_t bStart = (row == 0); + uint32_t minPixY = row * g_maxCUSize - 4 * !bStart; + uint32_t maxPixY = (row + 1) * g_maxCUSize - 4 * !bEnd; + uint32_t ssim_cnt; + x265_emms(); + + /* SSIM is done for each row in blocks of 4x4 . The First blocks are offset by 2 pixels to the right + * to avoid alignment of ssim blocks with DCT blocks. */ + minPixY += bStart ? 2 : -6; + m_frameEncoder->m_ssim += calculateSSIM(rec + 2 + minPixY * stride1, stride1, org + 2 + minPixY * stride2, stride2, + m_param->sourceWidth - 2, maxPixY - minPixY, m_ssimBuf, ssim_cnt); + m_frameEncoder->m_ssimCnt += ssim_cnt; + } + if (m_param->decodedPictureHashSEI == 1) + { + uint32_t height = getCUHeight(row); + uint32_t width = reconPic->m_picWidth; + intptr_t stride = reconPic->m_stride; + + if (!row) + { + for (int i = 0; i < 3; i++) + MD5Init(&m_frameEncoder->m_state[i]); + } + + updateMD5Plane(m_frameEncoder->m_state[0], reconPic->getLumaAddr(cuAddr), width, height, stride); + width >>= m_hChromaShift; + height >>= m_vChromaShift; + stride = reconPic->m_strideC; + + updateMD5Plane(m_frameEncoder->m_state[1], reconPic->getCbAddr(cuAddr), width, height, stride); + updateMD5Plane(m_frameEncoder->m_state[2], reconPic->getCrAddr(cuAddr), width, height, stride); + } + else if (m_param->decodedPictureHashSEI == 2) + { + uint32_t height = getCUHeight(row); + uint32_t width = reconPic->m_picWidth; + intptr_t stride = reconPic->m_stride; + if (!row) + m_frameEncoder->m_crc[0] = m_frameEncoder->m_crc[1] = m_frameEncoder->m_crc[2] = 0xffff; + updateCRC(reconPic->getLumaAddr(cuAddr), m_frameEncoder->m_crc[0], height, width, stride); + width >>= m_hChromaShift; + height >>= m_vChromaShift; + stride = reconPic->m_strideC; + + updateCRC(reconPic->getCbAddr(cuAddr), m_frameEncoder->m_crc[1], height, width, stride); + updateCRC(reconPic->getCrAddr(cuAddr), m_frameEncoder->m_crc[2], height, width, 
stride); + } + else if (m_param->decodedPictureHashSEI == 3) + { + uint32_t width = reconPic->m_picWidth; + uint32_t height = getCUHeight(row); + intptr_t stride = reconPic->m_stride; + uint32_t cuHeight = g_maxCUSize; + if (!row) + m_frameEncoder->m_checksum[0] = m_frameEncoder->m_checksum[1] = m_frameEncoder->m_checksum[2] = 0; + updateChecksum(reconPic->m_picOrg[0], m_frameEncoder->m_checksum[0], height, width, stride, row, cuHeight); + width >>= m_hChromaShift; + height >>= m_vChromaShift; + stride = reconPic->m_strideC; + cuHeight >>= m_vChromaShift; + + updateChecksum(reconPic->m_picOrg[1], m_frameEncoder->m_checksum[1], height, width, stride, row, cuHeight); + updateChecksum(reconPic->m_picOrg[2], m_frameEncoder->m_checksum[2], height, width, stride, row, cuHeight); + } +} + +static uint64_t computeSSD(pixel *fenc, pixel *rec, intptr_t stride, uint32_t width, uint32_t height) +{ + uint64_t ssd = 0; + + if ((width | height) & 3) + { + /* Slow Path */ + for (uint32_t y = 0; y < height; y++) + { + for (uint32_t x = 0; x < width; x++) + { + int diff = (int)(fenc[x] - rec[x]); + ssd += diff * diff; + } + + fenc += stride; + rec += stride; + } + + return ssd; + } + + uint32_t y = 0; + /* Consume Y in chunks of 64 */ + for (; y + 64 <= height; y += 64) + { + uint32_t x = 0; + + if (!(stride & 31)) + for (; x + 64 <= width; x += 64) + ssd += primitives.sse_pp[LUMA_64x64](fenc + x, stride, rec + x, stride); + + if (!(stride & 15)) + for (; x + 16 <= width; x += 16) + ssd += primitives.sse_pp[LUMA_16x64](fenc + x, stride, rec + x, stride); + + for (; x + 4 <= width; x += 4) + { + ssd += primitives.sse_pp[LUMA_4x16](fenc + x, stride, rec + x, stride); + ssd += primitives.sse_pp[LUMA_4x16](fenc + x + 16 * stride, stride, rec + x + 16 * stride, stride); + ssd += primitives.sse_pp[LUMA_4x16](fenc + x + 32 * stride, stride, rec + x + 32 * stride, stride); + ssd += primitives.sse_pp[LUMA_4x16](fenc + x + 48 * stride, stride, rec + x + 48 * stride, stride); + } + + fenc += 
stride * 64; + rec += stride * 64; + } + + /* Consume Y in chunks of 16 */ + for (; y + 16 <= height; y += 16) + { + uint32_t x = 0; + + if (!(stride & 31)) + for (; x + 64 <= width; x += 64) + ssd += primitives.sse_pp[LUMA_64x16](fenc + x, stride, rec + x, stride); + + if (!(stride & 15)) + for (; x + 16 <= width; x += 16) + ssd += primitives.sse_pp[LUMA_16x16](fenc + x, stride, rec + x, stride); + + for (; x + 4 <= width; x += 4) + ssd += primitives.sse_pp[LUMA_4x16](fenc + x, stride, rec + x, stride); + + fenc += stride * 16; + rec += stride * 16; + } + + /* Consume Y in chunks of 4 */ + for (; y + 4 <= height; y += 4) + { + uint32_t x = 0; + + if (!(stride & 15)) + for (; x + 16 <= width; x += 16) + ssd += primitives.sse_pp[LUMA_16x4](fenc + x, stride, rec + x, stride); + + for (; x + 4 <= width; x += 4) + ssd += primitives.sse_pp[LUMA_4x4](fenc + x, stride, rec + x, stride); + + fenc += stride * 4; + rec += stride * 4; + } + + return ssd; +} + +/* Function to calculate SSIM for each row */ +static float calculateSSIM(pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, uint32_t width, uint32_t height, void *buf, uint32_t& cnt) +{ + uint32_t z = 0; + float ssim = 0.0; + + int(*sum0)[4] = (int(*)[4])buf; + int(*sum1)[4] = sum0 + (width >> 2) + 3; + width >>= 2; + height >>= 2; + + for (uint32_t y = 1; y < height; y++) + { + for (; z <= y; z++) + { + std::swap(sum0, sum1); + for (uint32_t x = 0; x < width; x += 2) + primitives.ssim_4x4x2_core(&pix1[(4 * x + (z * stride1))], stride1, &pix2[(4 * x + (z * stride2))], stride2, &sum0[x]); + } + + for (uint32_t x = 0; x < width - 1; x += 4) + ssim += primitives.ssim_end_4(sum0 + x, sum1 + x, X265_MIN(4, width - x - 1)); + } + + cnt = (height - 1) * (width - 1); + return ssim; +} + +/* restore original YUV samples to recon after SAO (if lossless) */ +static void restoreOrigLosslessYuv(const CUData* cu, Frame& frame, uint32_t absPartIdx, uint32_t depth) +{ + uint32_t size = g_maxCUSize >> depth; + int part = 
partitionFromSizes(size, size); + + PicYuv* reconPic = frame.m_reconPicYuv; + PicYuv* fencPic = frame.m_origPicYuv; + + pixel* dst = reconPic->getLumaAddr(cu->m_cuAddr, absPartIdx); + pixel* src = fencPic->getLumaAddr(cu->m_cuAddr, absPartIdx); + + primitives.luma_copy_pp[part](dst, reconPic->m_stride, src, fencPic->m_stride); + + pixel* dstCb = reconPic->getCbAddr(cu->m_cuAddr, absPartIdx); + pixel* srcCb = fencPic->getCbAddr(cu->m_cuAddr, absPartIdx); + + pixel* dstCr = reconPic->getCrAddr(cu->m_cuAddr, absPartIdx); + pixel* srcCr = fencPic->getCrAddr(cu->m_cuAddr, absPartIdx); + + int csp = fencPic->m_picCsp; + primitives.chroma[csp].copy_pp[part](dstCb, reconPic->m_strideC, srcCb, fencPic->m_strideC); + primitives.chroma[csp].copy_pp[part](dstCr, reconPic->m_strideC, srcCr, fencPic->m_strideC); +} + +/* Original YUV restoration for CU in lossless coding */ +static void origCUSampleRestoration(const CUData* cu, Frame& frame, uint32_t absPartIdx, uint32_t depth) +{ + if (cu->m_cuDepth[absPartIdx] > depth) + { + /* TODO: this could use cuGeom.numPartition and flags */ + uint32_t curNumParts = NUM_CU_PARTITIONS >> (depth << 1); + uint32_t qNumParts = curNumParts >> 2; + uint32_t xmax = cu->m_slice->m_sps->picWidthInLumaSamples - cu->m_cuPelX; + uint32_t ymax = cu->m_slice->m_sps->picHeightInLumaSamples - cu->m_cuPelY; + + /* process four split sub-cu at next depth */ + for (int subPartIdx = 0; subPartIdx < 4; subPartIdx++, absPartIdx += qNumParts) + { + if (g_zscanToPelX[absPartIdx] < xmax && g_zscanToPelY[absPartIdx] < ymax) + origCUSampleRestoration(cu, frame, absPartIdx, depth + 1); + } + + return; + } + + // restore original YUV samples + if (cu->m_tqBypass[absPartIdx]) + restoreOrigLosslessYuv(cu, frame, absPartIdx, depth); +} + +void FrameFilter::processSao(int row) +{ + SAOParam* saoParam = m_frame->m_encData->m_saoParam; + + if (saoParam->bSaoFlag[0]) + m_sao.processSaoUnitRow(saoParam->ctuParam[0], row, 0); + + if (saoParam->bSaoFlag[1]) + { + 
m_sao.processSaoUnitRow(saoParam->ctuParam[1], row, 1); + m_sao.processSaoUnitRow(saoParam->ctuParam[2], row, 2); + } + + if (m_frame->m_encData->m_slice->m_pps->bTransquantBypassEnabled) + { + uint32_t numCols = m_frame->m_encData->m_slice->m_sps->numCuInWidth; + uint32_t lineStartCUAddr = row * numCols; + + for (uint32_t col = 0; col < numCols; col++) + origCUSampleRestoration(m_frame->m_encData->getPicCTU(lineStartCUAddr + col), *m_frame, 0, 0); + } +} diff --git a/source/encoder/framefilter.h b/source/encoder/framefilter.h new file mode 100644 index 0000000..acdec98 --- /dev/null +++ b/source/encoder/framefilter.h @@ -0,0 +1,75 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Chung Shin Yee + * Min Chen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. 
+ *****************************************************************************/ + +#ifndef X265_FRAMEFILTER_H +#define X265_FRAMEFILTER_H + +#include "common.h" +#include "frame.h" +#include "deblock.h" +#include "sao.h" + +namespace x265 { +// private x265 namespace + +class Encoder; +class Entropy; +class FrameEncoder; +struct ThreadLocalData; + +// Manages the processing of a single frame loopfilter +class FrameFilter +{ +public: + + x265_param* m_param; + Frame* m_frame; + FrameEncoder* m_frameEncoder; + int m_hChromaShift; + int m_vChromaShift; + int m_pad[2]; + + Deblock m_deblock; + SAO m_sao; + int m_numRows; + int m_saoRowDelay; + int m_lastHeight; + + void* m_ssimBuf; /* Temp storage for ssim computation */ + + FrameFilter(); + + void init(Encoder *top, FrameEncoder *frame, int numRows); + void destroy(); + + void start(Frame *pic, Entropy& initState, int qp); + + void processRow(int row); + void processRowPost(int row); + void processSao(int row); + uint32_t getCUHeight(int rowNum) const; +}; +} + +#endif // ifndef X265_FRAMEFILTER_H diff --git a/source/encoder/level.cpp b/source/encoder/level.cpp new file mode 100644 index 0000000..f00f4ca --- /dev/null +++ b/source/encoder/level.cpp @@ -0,0 +1,397 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#include "common.h" +#include "slice.h" +#include "level.h" + +namespace x265 { +typedef struct +{ + uint32_t maxLumaSamples; + uint32_t maxLumaSamplesPerSecond; + uint32_t maxBitrateMain; + uint32_t maxBitrateHigh; + uint32_t maxCpbSizeMain; + uint32_t maxCpbSizeHigh; + uint32_t minCompressionRatio; + Level::Name levelEnum; + const char* name; + int levelIdc; +} LevelSpec; + +LevelSpec levels[] = +{ + { 36864, 552960, 128, MAX_UINT, 350, MAX_UINT, 2, Level::LEVEL1, "1", 10 }, + { 122880, 3686400, 1500, MAX_UINT, 1500, MAX_UINT, 2, Level::LEVEL2, "2", 20 }, + { 245760, 7372800, 3000, MAX_UINT, 3000, MAX_UINT, 2, Level::LEVEL2_1, "2.1", 21 }, + { 552960, 16588800, 6000, MAX_UINT, 6000, MAX_UINT, 2, Level::LEVEL3, "3", 30 }, + { 983040, 33177600, 10000, MAX_UINT, 10000, MAX_UINT, 2, Level::LEVEL3_1, "3.1", 31 }, + { 2228224, 66846720, 12000, 30000, 12000, 30000, 4, Level::LEVEL4, "4", 40 }, + { 2228224, 133693440, 20000, 50000, 20000, 50000, 4, Level::LEVEL4_1, "4.1", 41 }, + { 8912896, 267386880, 25000, 100000, 25000, 100000, 6, Level::LEVEL5, "5", 50 }, + { 8912896, 534773760, 40000, 160000, 40000, 160000, 8, Level::LEVEL5_1, "5.1", 51 }, + { 8912896, 1069547520, 60000, 240000, 60000, 240000, 8, Level::LEVEL5_2, "5.2", 52 }, + { 35651584, 1069547520, 60000, 240000, 60000, 240000, 8, Level::LEVEL6, "6", 60 }, + { 35651584, 2139095040, 120000, 480000, 120000, 480000, 8, Level::LEVEL6_1, "6.1", 61 }, + { 35651584, 4278190080U, 240000, 800000, 240000, 800000, 6, Level::LEVEL6_2, "6.2", 62 }, +}; + +/* determine minimum decoder level 
required to decode the described video */ +void determineLevel(const x265_param ¶m, VPS& vps) +{ + if (param.bLossless) + vps.ptl.profileIdc = Profile::NONE; + else if (param.internalCsp == X265_CSP_I420) + { + if (param.internalBitDepth == 8) + { + if (param.keyframeMax == 1 && param.maxNumReferences == 1) + vps.ptl.profileIdc = Profile::MAINSTILLPICTURE; + else + vps.ptl.profileIdc = Profile::MAIN; + } + else if (param.internalBitDepth == 10) + vps.ptl.profileIdc = Profile::MAIN10; + } + else + vps.ptl.profileIdc = Profile::MAINREXT; + + /* determine which profiles are compatible with this stream */ + + memset(vps.ptl.profileCompatibilityFlag, 0, sizeof(vps.ptl.profileCompatibilityFlag)); + vps.ptl.profileCompatibilityFlag[vps.ptl.profileIdc] = true; + if (vps.ptl.profileIdc == Profile::MAIN10 && param.internalBitDepth == 8) + vps.ptl.profileCompatibilityFlag[Profile::MAIN] = true; + else if (vps.ptl.profileIdc == Profile::MAIN) + vps.ptl.profileCompatibilityFlag[Profile::MAIN10] = true; + else if (vps.ptl.profileIdc == Profile::MAINSTILLPICTURE) + { + vps.ptl.profileCompatibilityFlag[Profile::MAIN] = true; + vps.ptl.profileCompatibilityFlag[Profile::MAIN10] = true; + } + else if (vps.ptl.profileIdc == Profile::MAINREXT) + vps.ptl.profileCompatibilityFlag[Profile::MAINREXT] = true; + + uint32_t lumaSamples = param.sourceWidth * param.sourceHeight; + uint32_t samplesPerSec = (uint32_t)(lumaSamples * ((double)param.fpsNum / param.fpsDenom)); + uint32_t bitrate = param.rc.vbvMaxBitrate ? 
param.rc.vbvMaxBitrate : param.rc.bitrate; + + const uint32_t MaxDpbPicBuf = 6; + vps.ptl.levelIdc = Level::NONE; + vps.ptl.tierFlag = Level::MAIN; + + const size_t NumLevels = sizeof(levels) / sizeof(levels[0]); + uint32_t i; + for (i = 0; i < NumLevels; i++) + { + if (lumaSamples > levels[i].maxLumaSamples) + continue; + else if (samplesPerSec > levels[i].maxLumaSamplesPerSecond) + continue; + else if (bitrate > levels[i].maxBitrateMain && levels[i].maxBitrateHigh == MAX_UINT) + continue; + else if (bitrate > levels[i].maxBitrateHigh) + continue; + else if (param.sourceWidth > sqrt(levels[i].maxLumaSamples * 8.0f)) + continue; + else if (param.sourceHeight > sqrt(levels[i].maxLumaSamples * 8.0f)) + continue; + + uint32_t maxDpbSize = MaxDpbPicBuf; + if (lumaSamples <= (levels[i].maxLumaSamples >> 2)) + maxDpbSize = X265_MIN(4 * MaxDpbPicBuf, 16); + else if (lumaSamples <= (levels[i].maxLumaSamples >> 1)) + maxDpbSize = X265_MIN(2 * MaxDpbPicBuf, 16); + else if (lumaSamples <= ((3 * levels[i].maxLumaSamples) >> 2)) + maxDpbSize = X265_MIN((4 * MaxDpbPicBuf) / 3, 16); + + /* The value of sps_max_dec_pic_buffering_minus1[ HighestTid ] + 1 shall be less than + * or equal to MaxDpbSize */ + if (vps.maxDecPicBuffering > maxDpbSize) + continue; + + /* For level 5 and higher levels, the value of CtbSizeY shall be equal to 32 or 64 */ + if (levels[i].levelEnum >= Level::LEVEL5 && param.maxCUSize < 32) + { + x265_log(¶m, X265_LOG_WARNING, "level %s detected, but CTU size 16 is non-compliant\n", levels[i].name); + vps.ptl.profileIdc = Profile::NONE; + vps.ptl.levelIdc = Level::NONE; + vps.ptl.tierFlag = Level::MAIN; + x265_log(¶m, X265_LOG_INFO, "NONE profile, Level-NONE (Main tier)\n"); + return; + } + + /* The value of NumPocTotalCurr shall be less than or equal to 8 */ + int numPocTotalCurr = param.maxNumReferences + vps.numReorderPics; + if (numPocTotalCurr > 8) + { + x265_log(¶m, X265_LOG_WARNING, "level %s detected, but NumPocTotalCurr (total references) is 
non-compliant\n", levels[i].name); + vps.ptl.profileIdc = Profile::NONE; + vps.ptl.levelIdc = Level::NONE; + vps.ptl.tierFlag = Level::MAIN; + x265_log(¶m, X265_LOG_INFO, "NONE profile, Level-NONE (Main tier)\n"); + return; + } + + vps.ptl.levelIdc = levels[i].levelEnum; + vps.ptl.minCrForLevel = levels[i].minCompressionRatio; + vps.ptl.maxLumaSrForLevel = levels[i].maxLumaSamplesPerSecond; + + if (bitrate > levels[i].maxBitrateMain && bitrate <= levels[i].maxBitrateHigh && + levels[i].maxBitrateHigh != MAX_UINT) + vps.ptl.tierFlag = Level::HIGH; + else + vps.ptl.tierFlag = Level::MAIN; + break; + } + + vps.ptl.intraConstraintFlag = false; + vps.ptl.lowerBitRateConstraintFlag = true; + vps.ptl.bitDepthConstraint = param.internalBitDepth; + vps.ptl.chromaFormatConstraint = param.internalCsp; + + static const char *profiles[] = { "None", "Main", "Main 10", "Main Still Picture", "RExt" }; + static const char *tiers[] = { "Main", "High" }; + + const char *profile = profiles[vps.ptl.profileIdc]; + if (vps.ptl.profileIdc == Profile::MAINREXT) + { + if (param.internalCsp == X265_CSP_I422) + profile = "Main 4:2:2 10"; + if (param.internalCsp == X265_CSP_I444) + { + if (vps.ptl.bitDepthConstraint <= 8) + profile = "Main 4:4:4 8"; + else if (vps.ptl.bitDepthConstraint <= 10) + profile = "Main 4:4:4 10"; + } + } + x265_log(¶m, X265_LOG_INFO, "%s profile, Level-%s (%s tier)\n", + profile, levels[i].name, tiers[vps.ptl.tierFlag]); +} + +/* enforce a maximum decoder level requirement, in other words assure that a + * decoder of the specified level may decode the video about to be created. + * Lower parameters where necessary to ensure the video will be decodable by a + * decoder meeting this level of requirement. Some parameters (resolution and + * frame rate) are non-negotiable and thus this function may fail. In those + * circumstances it will be quite noisy */ +bool enforceLevel(x265_param& param, VPS& vps) +{ + vps.numReorderPics = (param.bBPyramid && param.bframes > 1) ? 
2 : 1; + vps.maxDecPicBuffering = X265_MIN(MAX_NUM_REF, X265_MAX(vps.numReorderPics + 1, (uint32_t)param.maxNumReferences) + vps.numReorderPics); + + /* no level specified by user, just auto-detect from the configuration */ + if (param.levelIdc <= 0) + return true; + + uint32_t level = 0; + while (levels[level].levelIdc != param.levelIdc && level + 1 < sizeof(levels) / sizeof(levels[0])) + level++; + if (levels[level].levelIdc != param.levelIdc) + { + x265_log(¶m, X265_LOG_WARNING, "specified level %d does not exist\n", param.levelIdc); + return false; + } + + LevelSpec& l = levels[level]; + bool highTier = !!param.bHighTier; + if (highTier && l.maxBitrateHigh == MAX_UINT) + { + highTier = false; + x265_log(¶m, X265_LOG_WARNING, "Level %s has no High tier, using Main tier\n", l.name); + } + + uint32_t lumaSamples = param.sourceWidth * param.sourceHeight; + uint32_t samplesPerSec = (uint32_t)(lumaSamples * ((double)param.fpsNum / param.fpsDenom)); + bool ok = true; + if (lumaSamples > l.maxLumaSamples) + ok = false; + else if (param.sourceWidth > sqrt(l.maxLumaSamples * 8.0f)) + ok = false; + else if (param.sourceHeight > sqrt(l.maxLumaSamples * 8.0f)) + ok = false; + if (!ok) + { + x265_log(¶m, X265_LOG_WARNING, "picture dimensions are out of range for specified level\n"); + return false; + } + else if (samplesPerSec > l.maxLumaSamplesPerSecond) + { + x265_log(¶m, X265_LOG_WARNING, "frame rate is out of range for specified level\n"); + return false; + } + + if ((uint32_t)param.rc.vbvMaxBitrate > (highTier ? l.maxBitrateHigh : l.maxBitrateMain)) + { + param.rc.vbvMaxBitrate = highTier ? l.maxBitrateHigh : l.maxBitrateMain; + x265_log(¶m, X265_LOG_INFO, "lowering VBV max bitrate to %dKbps\n", param.rc.vbvMaxBitrate); + } + if ((uint32_t)param.rc.vbvBufferSize > (highTier ? l.maxCpbSizeHigh : l.maxCpbSizeMain)) + { + param.rc.vbvMaxBitrate = highTier ? 
l.maxCpbSizeHigh : l.maxCpbSizeMain; + x265_log(¶m, X265_LOG_INFO, "lowering VBV buffer size to %dKb\n", param.rc.vbvBufferSize); + } + + switch (param.rc.rateControlMode) + { + case X265_RC_ABR: + if ((uint32_t)param.rc.bitrate > (highTier ? l.maxBitrateHigh : l.maxBitrateMain)) + { + param.rc.bitrate = l.maxBitrateHigh; + x265_log(¶m, X265_LOG_INFO, "lowering target bitrate to High tier limit of %dKbps\n", param.rc.bitrate); + } + break; + + case X265_RC_CQP: + x265_log(¶m, X265_LOG_WARNING, "Constant QP is inconsistent with specifying a decoder level, no bitrate guarantee is possible.\n"); + return false; + + case X265_RC_CRF: + if (!param.rc.vbvBufferSize || !param.rc.vbvMaxBitrate) + { + if (!param.rc.vbvMaxBitrate) + param.rc.vbvMaxBitrate = highTier ? l.maxBitrateHigh : l.maxBitrateMain; + if (!param.rc.vbvBufferSize) + param.rc.vbvBufferSize = highTier ? l.maxCpbSizeHigh : l.maxCpbSizeMain; + x265_log(¶m, X265_LOG_WARNING, "Specifying a decoder level with constant rate factor rate-control requires\n"); + x265_log(¶m, X265_LOG_WARNING, "enabling VBV with vbv-bufsize=%dkb vbv-maxrate=%dkbps. 
VBV outputs are non-deterministic!\n", + param.rc.vbvBufferSize, param.rc.vbvMaxBitrate); + } + break; + + default: + x265_log(¶m, X265_LOG_ERROR, "Unknown rate control mode is inconsistent with specifying a decoder level\n"); + return false; + } + + /* The value of sps_max_dec_pic_buffering_minus1[ HighestTid ] + 1 shall be less than or equal to MaxDpbSize */ + const uint32_t MaxDpbPicBuf = 6; + uint32_t maxDpbSize = MaxDpbPicBuf; + if (lumaSamples <= (l.maxLumaSamples >> 2)) + maxDpbSize = X265_MIN(4 * MaxDpbPicBuf, 16); + else if (lumaSamples <= (l.maxLumaSamples >> 1)) + maxDpbSize = X265_MIN(2 * MaxDpbPicBuf, 16); + else if (lumaSamples <= ((3 * l.maxLumaSamples) >> 2)) + maxDpbSize = X265_MIN((4 * MaxDpbPicBuf) / 3, 16); + + int savedRefCount = param.maxNumReferences; + while (vps.maxDecPicBuffering > maxDpbSize && param.maxNumReferences > 1) + { + param.maxNumReferences--; + vps.maxDecPicBuffering = X265_MIN(MAX_NUM_REF, X265_MAX(vps.numReorderPics + 1, (uint32_t)param.maxNumReferences) + vps.numReorderPics); + } + if (param.maxNumReferences != savedRefCount) + x265_log(¶m, X265_LOG_INFO, "Lowering max references to %d to meet level requirement\n", param.maxNumReferences); + + /* For level 5 and higher levels, the value of CtbSizeY shall be equal to 32 or 64 */ + if (param.levelIdc >= 50 && param.maxCUSize < 32) + { + param.maxCUSize = 32; + x265_log(¶m, X265_LOG_INFO, "Levels 5.0 and above require a maximum CTU size of at least 32, using --ctu 32\n"); + } + + /* The value of NumPocTotalCurr shall be less than or equal to 8 */ + int numPocTotalCurr = param.maxNumReferences + !!param.bframes; + if (numPocTotalCurr > 8) + { + param.maxNumReferences = 8 - !!param.bframes; + x265_log(¶m, X265_LOG_INFO, "Lowering max references to %d to meet numPocTotalCurr requirement\n", param.maxNumReferences); + } + + return true; +} + +extern "C" +int x265_param_apply_profile(x265_param *param, const char *profile) +{ + if (!profile) + return 0; + if (!strcmp(profile, 
"main")) + { + /* SPSs shall have chroma_format_idc equal to 1 only */ + param->internalCsp = X265_CSP_I420; + +#if HIGH_BIT_DEPTH + /* SPSs shall have bit_depth_luma_minus8 equal to 0 only */ + x265_log(param, X265_LOG_ERROR, "Main profile not supported, compiled for Main10.\n"); + return -1; +#endif + } + else if (!strcmp(profile, "main10")) + { + /* SPSs shall have chroma_format_idc equal to 1 only */ + param->internalCsp = X265_CSP_I420; + + /* SPSs shall have bit_depth_luma_minus8 in the range of 0 to 2, inclusive + * this covers all builds of x265, currently */ + } + else if (!strcmp(profile, "mainstillpicture") || !strcmp(profile, "msp")) + { + /* SPSs shall have chroma_format_idc equal to 1 only */ + param->internalCsp = X265_CSP_I420; + + /* SPSs shall have sps_max_dec_pic_buffering_minus1[ sps_max_sub_layers_minus1 ] equal to 0 only */ + param->maxNumReferences = 1; + + /* The bitstream shall contain only one picture (we do not enforce this) */ + /* just in case the user gives us more than one picture: */ + param->keyframeMax = 1; + param->bOpenGOP = 0; + param->bRepeatHeaders = 1; + param->lookaheadDepth = 0; + param->bframes = 0; + param->scenecutThreshold = 0; + param->bFrameAdaptive = 0; + param->rc.cuTree = 0; + param->bEnableWeightedPred = 0; + param->bEnableWeightedBiPred = 0; + +#if HIGH_BIT_DEPTH + /* SPSs shall have bit_depth_luma_minus8 equal to 0 only */ + x265_log(param, X265_LOG_ERROR, "Mainstillpicture profile not supported, compiled for Main10.\n"); + return -1; +#endif + } + else if (!strcmp(profile, "main422-10")) + param->internalCsp = X265_CSP_I422; + else if (!strcmp(profile, "main444-8")) + { + param->internalCsp = X265_CSP_I444; +#if HIGH_BIT_DEPTH + x265_log(param, X265_LOG_ERROR, "Main 4:4:4 8 profile not supported, compiled for Main10.\n"); + return -1; +#endif + } + else if (!strcmp(profile, "main444-10")) + param->internalCsp = X265_CSP_I444; + else + { + x265_log(param, X265_LOG_ERROR, "unknown profile <%s>\n", profile); + 
return -1; + } + + return 0; +} +} diff --git a/source/encoder/level.h b/source/encoder/level.h new file mode 100644 index 0000000..03ca40d --- /dev/null +++ b/source/encoder/level.h @@ -0,0 +1,39 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. 
+ *****************************************************************************/ + +#ifndef X265_LEVEL_H +#define X265_LEVEL_H 1 + +#include "common.h" +#include "x265.h" + +namespace x265 { +// encoder private namespace + +struct VPS; +void determineLevel(const x265_param ¶m, VPS& vps); +bool enforceLevel(x265_param& param, VPS& vps); + +} + +#endif // ifndef X265_LEVEL_H diff --git a/source/encoder/motion.cpp b/source/encoder/motion.cpp new file mode 100644 index 0000000..f6129ff --- /dev/null +++ b/source/encoder/motion.cpp @@ -0,0 +1,1169 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. 
+ *****************************************************************************/ + +#include "common.h" +#include "primitives.h" +#include "lowres.h" +#include "motion.h" +#include "x265.h" + +#if _MSC_VER +#pragma warning(disable: 4127) // conditional expression is constant (macros use this construct) +#endif + +using namespace x265; + +namespace { +struct SubpelWorkload +{ + int hpel_iters; + int hpel_dirs; + int qpel_iters; + int qpel_dirs; + bool hpel_satd; +}; + +SubpelWorkload workload[X265_MAX_SUBPEL_LEVEL + 1] = +{ + { 1, 4, 0, 4, false }, // 4 SAD HPEL only + { 1, 4, 1, 4, false }, // 4 SAD HPEL + 4 SATD QPEL + { 1, 4, 1, 4, true }, // 4 SATD HPEL + 4 SATD QPEL + { 2, 4, 1, 4, true }, // 2x4 SATD HPEL + 4 SATD QPEL + { 2, 4, 2, 4, true }, // 2x4 SATD HPEL + 2x4 SATD QPEL + { 1, 8, 1, 8, true }, // 8 SATD HPEL + 8 SATD QPEL (default) + { 2, 8, 1, 8, true }, // 2x8 SATD HPEL + 8 SATD QPEL + { 2, 8, 2, 8, true }, // 2x8 SATD HPEL + 2x8 SATD QPEL +}; +} + +static int size_scale[NUM_LUMA_PARTITIONS]; +#define SAD_THRESH(v) (bcost < (((v >> 4) * size_scale[partEnum]))) + +static void init_scales(void) +{ +#define SETUP_SCALE(W, H) \ + size_scale[LUMA_ ## W ## x ## H] = (H * H) >> 4; + SETUP_SCALE(4, 4); + SETUP_SCALE(8, 8); + SETUP_SCALE(8, 4); + SETUP_SCALE(4, 8); + SETUP_SCALE(16, 16); + SETUP_SCALE(16, 8); + SETUP_SCALE(8, 16); + SETUP_SCALE(16, 12); + SETUP_SCALE(12, 16); + SETUP_SCALE(4, 16); + SETUP_SCALE(16, 4); + SETUP_SCALE(32, 32); + SETUP_SCALE(32, 16); + SETUP_SCALE(16, 32); + SETUP_SCALE(32, 24); + SETUP_SCALE(24, 32); + SETUP_SCALE(32, 8); + SETUP_SCALE(8, 32); + SETUP_SCALE(64, 64); + SETUP_SCALE(64, 32); + SETUP_SCALE(32, 64); + SETUP_SCALE(64, 48); + SETUP_SCALE(48, 64); + SETUP_SCALE(64, 16); + SETUP_SCALE(16, 64); +#undef SETUP_SCALE +} + +MotionEstimate::MotionEstimate() + : searchMethod(3) + , subpelRefine(5) +{ + if (size_scale[0] == 0) + init_scales(); + + fenc = X265_MALLOC(pixel, MAX_CU_SIZE * MAX_CU_SIZE); +} + 
+MotionEstimate::~MotionEstimate() +{ + X265_FREE(fenc); +} + +void MotionEstimate::setSourcePU(intptr_t offset, int width, int height) +{ + partEnum = partitionFromSizes(width, height); + X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n"); + sad = primitives.sad[partEnum]; + satd = primitives.satd[partEnum]; + sa8d = primitives.sa8d_inter[partEnum]; + sad_x3 = primitives.sad_x3[partEnum]; + sad_x4 = primitives.sad_x4[partEnum]; + + blockwidth = width; + blockheight = height; + blockOffset = offset; + + /* copy PU block into cache */ + primitives.luma_copy_pp[partEnum](fenc, FENC_STRIDE, fencplane + offset, fencLumaStride); +} + +/* radius 2 hexagon. repeated entries are to avoid having to compute mod6 every time. */ +static const MV hex2[8] = { MV(-1, -2), MV(-2, 0), MV(-1, 2), MV(1, 2), MV(2, 0), MV(1, -2), MV(-1, -2), MV(-2, 0) }; +static const uint8_t mod6m1[8] = { 5, 0, 1, 2, 3, 4, 5, 0 }; /* (x-1)%6 */ +static const MV square1[9] = { MV(0, 0), MV(0, -1), MV(0, 1), MV(-1, 0), MV(1, 0), MV(-1, -1), MV(-1, 1), MV(1, -1), MV(1, 1) }; +static const MV hex4[16] = +{ + MV(0, -4), MV(0, 4), MV(-2, -3), MV(2, -3), + MV(-4, -2), MV(4, -2), MV(-4, -1), MV(4, -1), + MV(-4, 0), MV(4, 0), MV(-4, 1), MV(4, 1), + MV(-4, 2), MV(4, 2), MV(-2, 3), MV(2, 3), +}; +static const MV offsets[] = +{ + MV(-1, 0), MV(0, -1), + MV(-1, -1), MV(1, -1), + MV(-1, 0), MV(1, 0), + MV(-1, 1), MV(-1, -1), + MV(1, -1), MV(1, 1), + MV(-1, 0), MV(0, 1), + MV(-1, 1), MV(1, 1), + MV(1, 0), MV(0, 1), +}; // offsets for Two Point Search + +/* sum of absolute differences between MV candidates */ +static inline int x265_predictor_difference(const MV *mvc, intptr_t numCandidates) +{ + int sum = 0; + + for (int i = 0; i < numCandidates - 1; i++) + { + sum += abs(mvc[i].x - mvc[i + 1].x) + + abs(mvc[i].y - mvc[i + 1].y); + } + + return sum; +} + +#define COST_MV_PT_DIST(mx, my, point, dist) \ + do \ + { \ + MV tmv(mx, my); \ + int cost = sad(fenc, FENC_STRIDE, fref + mx + my * stride, 
stride); \ + cost += mvcost(tmv << 2); \ + if (cost < bcost) { \ + bcost = cost; \ + bmv = tmv; \ + bPointNr = point; \ + bDistance = dist; \ + } \ + } while (0) + +#define COST_MV(mx, my) \ + do \ + { \ + int cost = sad(fenc, FENC_STRIDE, fref + (mx) + (my) * stride, stride); \ + cost += mvcost(MV(mx, my) << 2); \ + COPY2_IF_LT(bcost, cost, bmv, MV(mx, my)); \ + } while (0) + +#define COST_MV_X3_DIR(m0x, m0y, m1x, m1y, m2x, m2y, costs) \ + { \ + pixel *pix_base = fref + bmv.x + bmv.y * stride; \ + sad_x3(fenc, \ + pix_base + (m0x) + (m0y) * stride, \ + pix_base + (m1x) + (m1y) * stride, \ + pix_base + (m2x) + (m2y) * stride, \ + stride, costs); \ + (costs)[0] += mvcost((bmv + MV(m0x, m0y)) << 2); \ + (costs)[1] += mvcost((bmv + MV(m1x, m1y)) << 2); \ + (costs)[2] += mvcost((bmv + MV(m2x, m2y)) << 2); \ + } + +#define COST_MV_PT_DIST_X4(m0x, m0y, p0, d0, m1x, m1y, p1, d1, m2x, m2y, p2, d2, m3x, m3y, p3, d3) \ + { \ + sad_x4(fenc, \ + fref + (m0x) + (m0y) * stride, \ + fref + (m1x) + (m1y) * stride, \ + fref + (m2x) + (m2y) * stride, \ + fref + (m3x) + (m3y) * stride, \ + stride, costs); \ + costs[0] += mvcost(MV(m0x, m0y) << 2); \ + costs[1] += mvcost(MV(m1x, m1y) << 2); \ + costs[2] += mvcost(MV(m2x, m2y) << 2); \ + costs[3] += mvcost(MV(m3x, m3y) << 2); \ + COPY4_IF_LT(bcost, costs[0], bmv, MV(m0x, m0y), bPointNr, p0, bDistance, d0); \ + COPY4_IF_LT(bcost, costs[1], bmv, MV(m1x, m1y), bPointNr, p1, bDistance, d1); \ + COPY4_IF_LT(bcost, costs[2], bmv, MV(m2x, m2y), bPointNr, p2, bDistance, d2); \ + COPY4_IF_LT(bcost, costs[3], bmv, MV(m3x, m3y), bPointNr, p3, bDistance, d3); \ + } + +#define COST_MV_X4(m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y) \ + { \ + pixel *pix_base = fref + omv.x + omv.y * stride; \ + sad_x4(fenc, \ + pix_base + (m0x) + (m0y) * stride, \ + pix_base + (m1x) + (m1y) * stride, \ + pix_base + (m2x) + (m2y) * stride, \ + pix_base + (m3x) + (m3y) * stride, \ + stride, costs); \ + costs[0] += mvcost((omv + MV(m0x, m0y)) << 2); \ + costs[1] += 
mvcost((omv + MV(m1x, m1y)) << 2); \ + costs[2] += mvcost((omv + MV(m2x, m2y)) << 2); \ + costs[3] += mvcost((omv + MV(m3x, m3y)) << 2); \ + COPY2_IF_LT(bcost, costs[0], bmv, omv + MV(m0x, m0y)); \ + COPY2_IF_LT(bcost, costs[1], bmv, omv + MV(m1x, m1y)); \ + COPY2_IF_LT(bcost, costs[2], bmv, omv + MV(m2x, m2y)); \ + COPY2_IF_LT(bcost, costs[3], bmv, omv + MV(m3x, m3y)); \ + } + +#define COST_MV_X4_DIR(m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y, costs) \ + { \ + pixel *pix_base = fref + bmv.x + bmv.y * stride; \ + sad_x4(fenc, \ + pix_base + (m0x) + (m0y) * stride, \ + pix_base + (m1x) + (m1y) * stride, \ + pix_base + (m2x) + (m2y) * stride, \ + pix_base + (m3x) + (m3y) * stride, \ + stride, costs); \ + (costs)[0] += mvcost((bmv + MV(m0x, m0y)) << 2); \ + (costs)[1] += mvcost((bmv + MV(m1x, m1y)) << 2); \ + (costs)[2] += mvcost((bmv + MV(m2x, m2y)) << 2); \ + (costs)[3] += mvcost((bmv + MV(m3x, m3y)) << 2); \ + } + +#define DIA1_ITER(mx, my) \ + { \ + omv.x = mx; omv.y = my; \ + COST_MV_X4(0, -1, 0, 1, -1, 0, 1, 0); \ + } + +#define CROSS(start, x_max, y_max) \ + { \ + int16_t i = start; \ + if ((x_max) <= X265_MIN(mvmax.x - omv.x, omv.x - mvmin.x)) \ + for (; i < (x_max) - 2; i += 4) { \ + COST_MV_X4(i, 0, -i, 0, i + 2, 0, -i - 2, 0); } \ + for (; i < (x_max); i += 2) \ + { \ + if (omv.x + i <= mvmax.x) \ + COST_MV(omv.x + i, omv.y); \ + if (omv.x - i >= mvmin.x) \ + COST_MV(omv.x - i, omv.y); \ + } \ + i = start; \ + if ((y_max) <= X265_MIN(mvmax.y - omv.y, omv.y - mvmin.y)) \ + for (; i < (y_max) - 2; i += 4) { \ + COST_MV_X4(0, i, 0, -i, 0, i + 2, 0, -i - 2); } \ + for (; i < (y_max); i += 2) \ + { \ + if (omv.y + i <= mvmax.y) \ + COST_MV(omv.x, omv.y + i); \ + if (omv.y - i >= mvmin.y) \ + COST_MV(omv.x, omv.y - i); \ + } \ + } + +void MotionEstimate::StarPatternSearch(ReferencePlanes *ref, + const MV & mvmin, + const MV & mvmax, + MV & bmv, + int & bcost, + int & bPointNr, + int & bDistance, + int earlyExitIters, + int merange) +{ + ALIGN_VAR_16(int, costs[16]); 
+ pixel *fref = ref->fpelPlane + blockOffset; + size_t stride = ref->lumaStride; + + MV omv = bmv; + int saved = bcost; + int rounds = 0; + + { + int16_t dist = 1; + + /* bPointNr + 2 + 4 * 5 + 7 + */ + const int16_t top = omv.y - dist; + const int16_t bottom = omv.y + dist; + const int16_t left = omv.x - dist; + const int16_t right = omv.x + dist; + + if (top >= mvmin.y && left >= mvmin.x && right <= mvmax.x && bottom <= mvmax.y) + { + COST_MV_PT_DIST_X4(omv.x, top, 2, dist, + left, omv.y, 4, dist, + right, omv.y, 5, dist, + omv.x, bottom, 7, dist); + } + else + { + if (top >= mvmin.y) // check top + { + COST_MV_PT_DIST(omv.x, top, 2, dist); + } + if (left >= mvmin.x) // check middle left + { + COST_MV_PT_DIST(left, omv.y, 4, dist); + } + if (right <= mvmax.x) // check middle right + { + COST_MV_PT_DIST(right, omv.y, 5, dist); + } + if (bottom <= mvmax.y) // check bottom + { + COST_MV_PT_DIST(omv.x, bottom, 7, dist); + } + } + if (bcost < saved) + rounds = 0; + else if (++rounds >= earlyExitIters) + return; + } + + for (int16_t dist = 2; dist <= 8; dist <<= 1) + { + /* bPointNr + 2 + 1 3 + 4 * 5 + 6 8 + 7 + Points 2, 4, 5, 7 are dist + Points 1, 3, 6, 8 are dist>>1 + */ + const int16_t top = omv.y - dist; + const int16_t bottom = omv.y + dist; + const int16_t left = omv.x - dist; + const int16_t right = omv.x + dist; + const int16_t top2 = omv.y - (dist >> 1); + const int16_t bottom2 = omv.y + (dist >> 1); + const int16_t left2 = omv.x - (dist >> 1); + const int16_t right2 = omv.x + (dist >> 1); + saved = bcost; + + if (top >= mvmin.y && left >= mvmin.x && + right <= mvmax.x && bottom <= mvmax.y) // check border + { + COST_MV_PT_DIST_X4(omv.x, top, 2, dist, + left2, top2, 1, dist >> 1, + right2, top2, 3, dist >> 1, + left, omv.y, 4, dist); + COST_MV_PT_DIST_X4(right, omv.y, 5, dist, + left2, bottom2, 6, dist >> 1, + right2, bottom2, 8, dist >> 1, + omv.x, bottom, 7, dist); + } + else // check border for each mv + { + if (top >= mvmin.y) // check top + { + 
COST_MV_PT_DIST(omv.x, top, 2, dist); + } + if (top2 >= mvmin.y) // check half top + { + if (left2 >= mvmin.x) // check half left + { + COST_MV_PT_DIST(left2, top2, 1, (dist >> 1)); + } + if (right2 <= mvmax.x) // check half right + { + COST_MV_PT_DIST(right2, top2, 3, (dist >> 1)); + } + } + if (left >= mvmin.x) // check left + { + COST_MV_PT_DIST(left, omv.y, 4, dist); + } + if (right <= mvmax.x) // check right + { + COST_MV_PT_DIST(right, omv.y, 5, dist); + } + if (bottom2 <= mvmax.y) // check half bottom + { + if (left2 >= mvmin.x) // check half left + { + COST_MV_PT_DIST(left2, bottom2, 6, (dist >> 1)); + } + if (right2 <= mvmax.x) // check half right + { + COST_MV_PT_DIST(right2, bottom2, 8, (dist >> 1)); + } + } + if (bottom <= mvmax.y) // check bottom + { + COST_MV_PT_DIST(omv.x, bottom, 7, dist); + } + } + + if (bcost < saved) + rounds = 0; + else if (++rounds >= earlyExitIters) + return; + } + + for (int16_t dist = 16; dist <= (int16_t)merange; dist <<= 1) + { + const int16_t top = omv.y - dist; + const int16_t bottom = omv.y + dist; + const int16_t left = omv.x - dist; + const int16_t right = omv.x + dist; + + saved = bcost; + if (top >= mvmin.y && left >= mvmin.x && + right <= mvmax.x && bottom <= mvmax.y) // check border + { + /* index + 0 + 3 + 2 + 1 + 0 3 2 1 * 1 2 3 0 + 1 + 2 + 3 + 0 + */ + + COST_MV_PT_DIST_X4(omv.x, top, 0, dist, + left, omv.y, 0, dist, + right, omv.y, 0, dist, + omv.x, bottom, 0, dist); + + for (int16_t index = 1; index < 4; index++) + { + int16_t posYT = top + ((dist >> 2) * index); + int16_t posYB = bottom - ((dist >> 2) * index); + int16_t posXL = omv.x - ((dist >> 2) * index); + int16_t posXR = omv.x + ((dist >> 2) * index); + + COST_MV_PT_DIST_X4(posXL, posYT, 0, dist, + posXR, posYT, 0, dist, + posXL, posYB, 0, dist, + posXR, posYB, 0, dist); + } + } + else // check border for each mv + { + if (top >= mvmin.y) // check top + { + COST_MV_PT_DIST(omv.x, top, 0, dist); + } + if (left >= mvmin.x) // check left + { + 
COST_MV_PT_DIST(left, omv.y, 0, dist); + } + if (right <= mvmax.x) // check right + { + COST_MV_PT_DIST(right, omv.y, 0, dist); + } + if (bottom <= mvmax.y) // check bottom + { + COST_MV_PT_DIST(omv.x, bottom, 0, dist); + } + for (int16_t index = 1; index < 4; index++) + { + int16_t posYT = top + ((dist >> 2) * index); + int16_t posYB = bottom - ((dist >> 2) * index); + int16_t posXL = omv.x - ((dist >> 2) * index); + int16_t posXR = omv.x + ((dist >> 2) * index); + + if (posYT >= mvmin.y) // check top + { + if (posXL >= mvmin.x) // check left + { + COST_MV_PT_DIST(posXL, posYT, 0, dist); + } + if (posXR <= mvmax.x) // check right + { + COST_MV_PT_DIST(posXR, posYT, 0, dist); + } + } + if (posYB <= mvmax.y) // check bottom + { + if (posXL >= mvmin.x) // check left + { + COST_MV_PT_DIST(posXL, posYB, 0, dist); + } + if (posXR <= mvmax.x) // check right + { + COST_MV_PT_DIST(posXR, posYB, 0, dist); + } + } + } + } + + if (bcost < saved) + rounds = 0; + else if (++rounds >= earlyExitIters) + return; + } +} + +int MotionEstimate::motionEstimate(ReferencePlanes *ref, + const MV & mvmin, + const MV & mvmax, + const MV & qmvp, + int numCandidates, + const MV * mvc, + int merange, + MV & outQMv) +{ + ALIGN_VAR_16(int, costs[16]); + size_t stride = ref->lumaStride; + pixel *fref = ref->fpelPlane + blockOffset; + + setMVP(qmvp); + + MV qmvmin = mvmin.toQPel(); + MV qmvmax = mvmax.toQPel(); + + /* The term cost used here means satd/sad values for that particular search. + * The costs used in ME integer search only includes the SAD cost of motion + * residual and sqrtLambda times MVD bits. The subpel refine steps use SATD + * cost of residual and sqrtLambda * MVD bits. Mode decision will be based + * on video distortion cost (SSE/PSNR) plus lambda times all signaling bits + * (mode + MVD bits). 
*/ + + // measure SAD cost at clipped QPEL MVP + MV pmv = qmvp.clipped(qmvmin, qmvmax); + MV bestpre = pmv; + int bprecost; + + if (ref->isLowres) + bprecost = ref->lowresQPelCost(fenc, blockOffset, pmv, sad); + else + bprecost = subpelCompare(ref, pmv, sad); + + /* re-measure full pel rounded MVP with SAD as search start point */ + MV bmv = pmv.roundToFPel(); + int bcost = bprecost; + if (pmv.isSubpel()) + { + bcost = sad(fenc, FENC_STRIDE, fref + bmv.x + bmv.y * stride, stride) + mvcost(bmv << 2); + } + + // measure SAD cost at MV(0) if MVP is not zero + if (pmv.notZero()) + { + int cost = sad(fenc, FENC_STRIDE, fref, stride) + mvcost(MV(0, 0)); + if (cost < bcost) + { + bcost = cost; + bmv = 0; + } + } + + // measure SAD cost at each QPEL motion vector candidate + for (int i = 0; i < numCandidates; i++) + { + MV m = mvc[i].clipped(qmvmin, qmvmax); + if (m.notZero() && m != pmv && m != bestpre) // check already measured + { + int cost; + if (ref->isLowres) + cost = ref->lowresQPelCost(fenc, blockOffset, m, sad) + mvcost(m); + else + cost = subpelCompare(ref, m, sad) + mvcost(m); + + if (cost < bprecost) + { + bprecost = cost; + bestpre = m; + } + } + } + + pmv = pmv.roundToFPel(); + MV omv = bmv; // current search origin or starting point + + switch (searchMethod) + { + case X265_DIA_SEARCH: + { + /* diamond search, radius 1 */ + bcost <<= 4; + int i = merange; + do + { + COST_MV_X4_DIR(0, -1, 0, 1, -1, 0, 1, 0, costs); + COPY1_IF_LT(bcost, (costs[0] << 4) + 1); + COPY1_IF_LT(bcost, (costs[1] << 4) + 3); + COPY1_IF_LT(bcost, (costs[2] << 4) + 4); + COPY1_IF_LT(bcost, (costs[3] << 4) + 12); + if (!(bcost & 15)) + break; + bmv.x -= (bcost << 28) >> 30; + bmv.y -= (bcost << 30) >> 30; + bcost &= ~15; + } + while (--i && bmv.checkRange(mvmin, mvmax)); + bcost >>= 4; + break; + } + + case X265_HEX_SEARCH: + { +me_hex2: + /* hexagon search, radius 2 */ +#if 0 + for (int i = 0; i < merange / 2; i++) + { + omv = bmv; + COST_MV(omv.x - 2, omv.y); + COST_MV(omv.x - 1, 
omv.y + 2); + COST_MV(omv.x + 1, omv.y + 2); + COST_MV(omv.x + 2, omv.y); + COST_MV(omv.x + 1, omv.y - 2); + COST_MV(omv.x - 1, omv.y - 2); + if (omv == bmv) + break; + if (!bmv.checkRange(mvmin, mvmax)) + break; + } + +#else // if 0 + /* equivalent to the above, but eliminates duplicate candidates */ + COST_MV_X3_DIR(-2, 0, -1, 2, 1, 2, costs); + bcost <<= 3; + COPY1_IF_LT(bcost, (costs[0] << 3) + 2); + COPY1_IF_LT(bcost, (costs[1] << 3) + 3); + COPY1_IF_LT(bcost, (costs[2] << 3) + 4); + COST_MV_X3_DIR(2, 0, 1, -2, -1, -2, costs); + COPY1_IF_LT(bcost, (costs[0] << 3) + 5); + COPY1_IF_LT(bcost, (costs[1] << 3) + 6); + COPY1_IF_LT(bcost, (costs[2] << 3) + 7); + + if (bcost & 7) + { + int dir = (bcost & 7) - 2; + bmv += hex2[dir + 1]; + + /* half hexagon, not overlapping the previous iteration */ + for (int i = (merange >> 1) - 1; i > 0 && bmv.checkRange(mvmin, mvmax); i--) + { + COST_MV_X3_DIR(hex2[dir + 0].x, hex2[dir + 0].y, + hex2[dir + 1].x, hex2[dir + 1].y, + hex2[dir + 2].x, hex2[dir + 2].y, + costs); + bcost &= ~7; + COPY1_IF_LT(bcost, (costs[0] << 3) + 1); + COPY1_IF_LT(bcost, (costs[1] << 3) + 2); + COPY1_IF_LT(bcost, (costs[2] << 3) + 3); + if (!(bcost & 7)) + break; + dir += (bcost & 7) - 2; + dir = mod6m1[dir + 1]; + bmv += hex2[dir + 1]; + } + } + bcost >>= 3; +#endif // if 0 + + /* square refine */ + int dir = 0; + COST_MV_X4_DIR(0, -1, 0, 1, -1, 0, 1, 0, costs); + COPY2_IF_LT(bcost, costs[0], dir, 1); + COPY2_IF_LT(bcost, costs[1], dir, 2); + COPY2_IF_LT(bcost, costs[2], dir, 3); + COPY2_IF_LT(bcost, costs[3], dir, 4); + COST_MV_X4_DIR(-1, -1, -1, 1, 1, -1, 1, 1, costs); + COPY2_IF_LT(bcost, costs[0], dir, 5); + COPY2_IF_LT(bcost, costs[1], dir, 6); + COPY2_IF_LT(bcost, costs[2], dir, 7); + COPY2_IF_LT(bcost, costs[3], dir, 8); + bmv += square1[dir]; + break; + } + + case X265_UMH_SEARCH: + { + int ucost1, ucost2; + int16_t cross_start = 1; + + /* refine predictors */ + omv = bmv; + ucost1 = bcost; + DIA1_ITER(pmv.x, pmv.y); + if (pmv.notZero()) + 
DIA1_ITER(0, 0); + + ucost2 = bcost; + if (bmv.notZero() && bmv != pmv) + DIA1_ITER(bmv.x, bmv.y); + if (bcost == ucost2) + cross_start = 3; + + /* Early Termination */ + omv = bmv; + if (bcost == ucost2 && SAD_THRESH(2000)) + { + COST_MV_X4(0, -2, -1, -1, 1, -1, -2, 0); + COST_MV_X4(2, 0, -1, 1, 1, 1, 0, 2); + if (bcost == ucost1 && SAD_THRESH(500)) + break; + if (bcost == ucost2) + { + int16_t range = (int16_t)(merange >> 1) | 1; + CROSS(3, range, range); + COST_MV_X4(-1, -2, 1, -2, -2, -1, 2, -1); + COST_MV_X4(-2, 1, 2, 1, -1, 2, 1, 2); + if (bcost == ucost2) + break; + cross_start = range + 2; + } + } + + // TODO: Need to study x264's logic for building mvc list to understand why they + // have special cases here for 16x16, and whether they apply to HEVC CTU + + // adaptive search range based on mvc variability + if (numCandidates) + { + /* range multipliers based on casual inspection of some statistics of + * average distance between current predictor and final mv found by ESA. + * these have not been tuned much by actual encoding. */ + static const uint8_t range_mul[4][4] = + { + { 3, 3, 4, 4 }, + { 3, 4, 4, 4 }, + { 4, 4, 4, 5 }, + { 4, 4, 5, 6 }, + }; + + int mvd; + int sad_ctx, mvd_ctx; + int denom = 1; + + if (numCandidates == 1) + { + if (LUMA_64x64 == partEnum) + /* mvc is probably the same as mvp, so the difference isn't meaningful. + * but prediction usually isn't too bad, so just use medium range */ + mvd = 25; + else + mvd = abs(qmvp.x - mvc[0].x) + abs(qmvp.y - mvc[0].y); + } + else + { + /* calculate the degree of agreement between predictors. */ + + /* in 64x64, mvc includes all the neighbors used to make mvp, + * so don't count mvp separately. */ + + denom = numCandidates - 1; + mvd = 0; + if (partEnum != LUMA_64x64) + { + mvd = abs(qmvp.x - mvc[0].x) + abs(qmvp.y - mvc[0].y); + denom++; + } + mvd += x265_predictor_difference(mvc, numCandidates); + } + + sad_ctx = SAD_THRESH(1000) ? 0 + : SAD_THRESH(2000) ? 1 + : SAD_THRESH(4000) ? 
2 : 3; + mvd_ctx = mvd < 10 * denom ? 0 + : mvd < 20 * denom ? 1 + : mvd < 40 * denom ? 2 : 3; + + merange = (merange * range_mul[mvd_ctx][sad_ctx]) >> 2; + } + + /* FIXME if the above DIA2/OCT2/CROSS found a new mv, it has not updated omx/omy. + * we are still centered on the same place as the DIA2. is this desirable? */ + CROSS(cross_start, merange, merange >> 1); + COST_MV_X4(-2, -2, -2, 2, 2, -2, 2, 2); + + /* hexagon grid */ + omv = bmv; + const uint16_t *p_cost_omvx = m_cost_mvx + omv.x * 4; + const uint16_t *p_cost_omvy = m_cost_mvy + omv.y * 4; + uint16_t i = 1; + do + { + if (4 * i > X265_MIN4(mvmax.x - omv.x, omv.x - mvmin.x, + mvmax.y - omv.y, omv.y - mvmin.y)) + { + for (int j = 0; j < 16; j++) + { + MV mv = omv + (hex4[j] * i); + if (mv.checkRange(mvmin, mvmax)) + COST_MV(mv.x, mv.y); + } + } + else + { + int16_t dir = 0; + pixel *fref_base = fref + omv.x + (omv.y - 4 * i) * stride; + size_t dy = (size_t)i * stride; +#define SADS(k, x0, y0, x1, y1, x2, y2, x3, y3) \ + sad_x4(fenc, \ + fref_base x0 * i + (y0 - 2 * k + 4) * dy, \ + fref_base x1 * i + (y1 - 2 * k + 4) * dy, \ + fref_base x2 * i + (y2 - 2 * k + 4) * dy, \ + fref_base x3 * i + (y3 - 2 * k + 4) * dy, \ + stride, costs + 4 * k); \ + fref_base += 2 * dy; +#define ADD_MVCOST(k, x, y) costs[k] += p_cost_omvx[x * 4 * i] + p_cost_omvy[y * 4 * i] +#define MIN_MV(k, x, y) COPY2_IF_LT(bcost, costs[k], dir, x * 16 + (y & 15)) + + SADS(0, +0, -4, +0, +4, -2, -3, +2, -3); + SADS(1, -4, -2, +4, -2, -4, -1, +4, -1); + SADS(2, -4, +0, +4, +0, -4, +1, +4, +1); + SADS(3, -4, +2, +4, +2, -2, +3, +2, +3); + ADD_MVCOST(0, 0, -4); + ADD_MVCOST(1, 0, 4); + ADD_MVCOST(2, -2, -3); + ADD_MVCOST(3, 2, -3); + ADD_MVCOST(4, -4, -2); + ADD_MVCOST(5, 4, -2); + ADD_MVCOST(6, -4, -1); + ADD_MVCOST(7, 4, -1); + ADD_MVCOST(8, -4, 0); + ADD_MVCOST(9, 4, 0); + ADD_MVCOST(10, -4, 1); + ADD_MVCOST(11, 4, 1); + ADD_MVCOST(12, -4, 2); + ADD_MVCOST(13, 4, 2); + ADD_MVCOST(14, -2, 3); + ADD_MVCOST(15, 2, 3); + MIN_MV(0, 0, -4); + 
MIN_MV(1, 0, 4); + MIN_MV(2, -2, -3); + MIN_MV(3, 2, -3); + MIN_MV(4, -4, -2); + MIN_MV(5, 4, -2); + MIN_MV(6, -4, -1); + MIN_MV(7, 4, -1); + MIN_MV(8, -4, 0); + MIN_MV(9, 4, 0); + MIN_MV(10, -4, 1); + MIN_MV(11, 4, 1); + MIN_MV(12, -4, 2); + MIN_MV(13, 4, 2); + MIN_MV(14, -2, 3); + MIN_MV(15, 2, 3); +#undef SADS +#undef ADD_MVCOST +#undef MIN_MV + if (dir) + { + bmv.x = omv.x + i * (dir >> 4); + bmv.y = omv.y + i * ((dir << 28) >> 28); + } + } + } + while (++i <= merange >> 2); + if (bmv.checkRange(mvmin, mvmax)) + goto me_hex2; + break; + } + + case X265_STAR_SEARCH: // Adapted from HM ME + { + int bPointNr = 0; + int bDistance = 0; + + const int EarlyExitIters = 3; + StarPatternSearch(ref, mvmin, mvmax, bmv, bcost, bPointNr, bDistance, EarlyExitIters, merange); + if (bDistance == 1) + { + // if best distance was only 1, check two missing points. If no new point is found, stop + if (bPointNr) + { + /* For a given direction 1 to 8, check nearest two outer X pixels + X X + X 1 2 3 X + 4 * 5 + X 6 7 8 X + X X + */ + int saved = bcost; + const MV mv1 = bmv + offsets[(bPointNr - 1) * 2]; + const MV mv2 = bmv + offsets[(bPointNr - 1) * 2 + 1]; + if (mv1.checkRange(mvmin, mvmax)) + { + COST_MV(mv1.x, mv1.y); + } + if (mv2.checkRange(mvmin, mvmax)) + { + COST_MV(mv2.x, mv2.y); + } + if (bcost == saved) + break; + } + else + break; + } + + const int RasterDistance = 5; + if (bDistance > RasterDistance) + { + // raster search refinement if original search distance was too big + MV tmv; + for (tmv.y = mvmin.y; tmv.y <= mvmax.y; tmv.y += RasterDistance) + { + for (tmv.x = mvmin.x; tmv.x <= mvmax.x; tmv.x += RasterDistance) + { + if (tmv.x + (RasterDistance * 3) <= mvmax.x) + { + pixel *pix_base = fref + tmv.y * stride + tmv.x; + sad_x4(fenc, + pix_base, + pix_base + RasterDistance, + pix_base + RasterDistance * 2, + pix_base + RasterDistance * 3, + stride, costs); + costs[0] += mvcost(tmv << 2); + COPY2_IF_LT(bcost, costs[0], bmv, tmv); + tmv.x += RasterDistance; + costs[1] 
+= mvcost(tmv << 2); + COPY2_IF_LT(bcost, costs[1], bmv, tmv); + tmv.x += RasterDistance; + costs[2] += mvcost(tmv << 2); + COPY2_IF_LT(bcost, costs[2], bmv, tmv); + tmv.x += RasterDistance; + costs[3] += mvcost(tmv << 3); + COPY2_IF_LT(bcost, costs[3], bmv, tmv); + } + else + COST_MV(tmv.x, tmv.y); + } + } + } + + while (bDistance > 0) + { + // center a new search around current best + bDistance = 0; + bPointNr = 0; + const int MaxIters = 32; + StarPatternSearch(ref, mvmin, mvmax, bmv, bcost, bPointNr, bDistance, MaxIters, merange); + + if (bDistance == 1) + { + if (!bPointNr) + break; + + /* For a given direction 1 to 8, check nearest 2 outer X pixels + X X + X 1 2 3 X + 4 * 5 + X 6 7 8 X + X X + */ + const MV mv1 = bmv + offsets[(bPointNr - 1) * 2]; + const MV mv2 = bmv + offsets[(bPointNr - 1) * 2 + 1]; + if (mv1.checkRange(mvmin, mvmax)) + { + COST_MV(mv1.x, mv1.y); + } + if (mv2.checkRange(mvmin, mvmax)) + { + COST_MV(mv2.x, mv2.y); + } + break; + } + } + + break; + } + + case X265_FULL_SEARCH: + { + // dead slow exhaustive search, but at least it uses sad_x4() + MV tmv; + for (tmv.y = mvmin.y; tmv.y <= mvmax.y; tmv.y++) + { + for (tmv.x = mvmin.x; tmv.x <= mvmax.x; tmv.x++) + { + if (tmv.x + 3 <= mvmax.x) + { + pixel *pix_base = fref + tmv.y * stride + tmv.x; + sad_x4(fenc, + pix_base, + pix_base + 1, + pix_base + 2, + pix_base + 3, + stride, costs); + costs[0] += mvcost(tmv << 2); + COPY2_IF_LT(bcost, costs[0], bmv, tmv); + tmv.x++; + costs[1] += mvcost(tmv << 2); + COPY2_IF_LT(bcost, costs[1], bmv, tmv); + tmv.x++; + costs[2] += mvcost(tmv << 2); + COPY2_IF_LT(bcost, costs[2], bmv, tmv); + tmv.x++; + costs[3] += mvcost(tmv << 2); + COPY2_IF_LT(bcost, costs[3], bmv, tmv); + } + else + COST_MV(tmv.x, tmv.y); + } + } + + break; + } + + default: + X265_CHECK(0, "invalid motion estimate mode\n"); + break; + } + + if (bprecost < bcost) + { + bmv = bestpre; + bcost = bprecost; + } + else + bmv = bmv.toQPel(); // promote search bmv to qpel + + SubpelWorkload& wl = 
workload[this->subpelRefine]; + + if (!bcost) + { + /* if there was zero residual at the clipped MVP, we can skip subpel + * refine, but we do need to include the mvcost in the returned cost */ + bcost = mvcost(bmv); + } + else if (ref->isLowres) + { + int bdir = 0, cost; + for (int i = 1; i <= wl.hpel_dirs; i++) + { + MV qmv = bmv + square1[i] * 2; + cost = ref->lowresQPelCost(fenc, blockOffset, qmv, sad) + mvcost(qmv); + COPY2_IF_LT(bcost, cost, bdir, i); + } + + bmv += square1[bdir] * 2; + bcost = ref->lowresQPelCost(fenc, blockOffset, bmv, satd) + mvcost(bmv); + + bdir = 0; + for (int i = 1; i <= wl.qpel_dirs; i++) + { + MV qmv = bmv + square1[i]; + cost = ref->lowresQPelCost(fenc, blockOffset, qmv, satd) + mvcost(qmv); + COPY2_IF_LT(bcost, cost, bdir, i); + } + + bmv += square1[bdir]; + } + else + { + pixelcmp_t hpelcomp; + + if (wl.hpel_satd) + { + bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv); + hpelcomp = satd; + } + else + hpelcomp = sad; + + for (int iter = 0; iter < wl.hpel_iters; iter++) + { + int bdir = 0, cost; + for (int i = 1; i <= wl.hpel_dirs; i++) + { + MV qmv = bmv + square1[i] * 2; + cost = subpelCompare(ref, qmv, hpelcomp) + mvcost(qmv); + COPY2_IF_LT(bcost, cost, bdir, i); + } + + if (bdir) + bmv += square1[bdir] * 2; + else + break; + } + + /* if HPEL search used SAD, remeasure with SATD before QPEL */ + if (!wl.hpel_satd) + bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv); + + for (int iter = 0; iter < wl.qpel_iters; iter++) + { + int bdir = 0, cost; + for (int i = 1; i <= wl.qpel_dirs; i++) + { + MV qmv = bmv + square1[i]; + cost = subpelCompare(ref, qmv, satd) + mvcost(qmv); + COPY2_IF_LT(bcost, cost, bdir, i); + } + + if (bdir) + bmv += square1[bdir]; + else + break; + } + } + + x265_emms(); + outQMv = bmv; + return bcost; +} + +int MotionEstimate::subpelCompare(ReferencePlanes *ref, const MV& qmv, pixelcmp_t cmp) +{ + int xFrac = qmv.x & 0x3; + int yFrac = qmv.y & 0x3; + + if ((yFrac | xFrac) == 0) + { + pixel *fref = 
ref->fpelPlane + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * ref->lumaStride; + return cmp(fenc, FENC_STRIDE, fref, ref->lumaStride); + } + else + { + /* We are taking a short-cut here if the reference is weighted. To be + * accurate we should be interpolating unweighted pixels and weighting + * the final 16bit values prior to rounding and downshifting. Instead we + * are simply interpolating the weighted full-pel pixels. Not 100% + * accurate but good enough for fast qpel ME */ + ALIGN_VAR_32(pixel, subpelbuf[64 * 64]); + pixel *fref = ref->fpelPlane + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * ref->lumaStride; + if (yFrac == 0) + { + primitives.luma_hpp[partEnum](fref, ref->lumaStride, subpelbuf, FENC_STRIDE, xFrac); + } + else if (xFrac == 0) + { + primitives.luma_vpp[partEnum](fref, ref->lumaStride, subpelbuf, FENC_STRIDE, yFrac); + } + else + { + ALIGN_VAR_32(int16_t, immed[64 * (64 + 8)]); + + int filterSize = NTAPS_LUMA; + int halfFilterSize = filterSize >> 1; + primitives.luma_hps[partEnum](fref, ref->lumaStride, immed, blockwidth, xFrac, 1); + primitives.luma_vsp[partEnum](immed + (halfFilterSize - 1) * blockwidth, blockwidth, subpelbuf, FENC_STRIDE, yFrac); + } + return cmp(fenc, FENC_STRIDE, subpelbuf, FENC_STRIDE); + } +} diff --git a/source/encoder/motion.h b/source/encoder/motion.h new file mode 100644 index 0000000..51687f5 --- /dev/null +++ b/source/encoder/motion.h @@ -0,0 +1,111 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#ifndef X265_MOTIONESTIMATE_H +#define X265_MOTIONESTIMATE_H + +#include "primitives.h" +#include "reference.h" +#include "mv.h" +#include "bitcost.h" + +namespace x265 { +// private x265 namespace + +class MotionEstimate : public BitCost +{ +protected: + + /* Aligned copy of original pixels, extra room for manual alignment */ + pixel *fencplane; + intptr_t fencLumaStride; + + pixelcmp_t sad; + pixelcmp_t satd; + pixelcmp_t sa8d; + pixelcmp_x3_t sad_x3; + pixelcmp_x4_t sad_x4; + + intptr_t blockOffset; + int partEnum; + int searchMethod; + int subpelRefine; + + /* subpel generation buffers */ + int blockwidth; + int blockheight; + + MotionEstimate& operator =(const MotionEstimate&); + +public: + + static const int COST_MAX = 1 << 28; + + pixel *fenc; + + MotionEstimate(); + + ~MotionEstimate(); + + void setSearchMethod(int i) { searchMethod = i; } + + void setSubpelRefine(int i) { subpelRefine = i; } + + /* Methods called at slice setup */ + + void setSourcePlane(pixel *Y, intptr_t luma) + { + fencplane = Y; + fencLumaStride = luma; + } + + void setSourcePU(intptr_t offset, int pwidth, int pheight); + + /* buf*() and motionEstimate() methods all use cached fenc pixels and thus + * require setSourcePU() to be called prior. 
*/ + + inline int bufSAD(pixel *fref, intptr_t stride) { return sad(fenc, FENC_STRIDE, fref, stride); } + + inline int bufSA8D(pixel *fref, intptr_t stride) { return sa8d(fenc, FENC_STRIDE, fref, stride); } + + inline int bufSATD(pixel *fref, intptr_t stride) { return satd(fenc, FENC_STRIDE, fref, stride); } + + int motionEstimate(ReferencePlanes *ref, const MV & mvmin, const MV & mvmax, const MV & qmvp, int numCandidates, const MV * mvc, int merange, MV & outQMv); + + int subpelCompare(ReferencePlanes * ref, const MV &qmv, pixelcmp_t); + +protected: + + inline void StarPatternSearch(ReferencePlanes *ref, + const MV & mvmin, + const MV & mvmax, + MV & bmv, + int & bcost, + int & bPointNr, + int & bDistance, + int earlyExitIters, + int merange); +}; +} + +#endif // ifndef X265_MOTIONESTIMATE_H diff --git a/source/encoder/nal.cpp b/source/encoder/nal.cpp new file mode 100644 index 0000000..c38c651 --- /dev/null +++ b/source/encoder/nal.cpp @@ -0,0 +1,218 @@ +/***************************************************************************** +* Copyright (C) 2013 x265 project +* +* Authors: Steve Borho +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation; either version 2 of the License, or +* (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software +* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +* +* This program is also available under a commercial proprietary license. +* For more information, contact us at license @ x265.com. 
+*****************************************************************************/ + +#include "common.h" +#include "bitstream.h" +#include "nal.h" + +using namespace x265; + +NALList::NALList() + : m_numNal(0) + , m_buffer(NULL) + , m_occupancy(0) + , m_allocSize(0) + , m_extraBuffer(NULL) + , m_extraOccupancy(0) + , m_extraAllocSize(0) +{} + +void NALList::takeContents(NALList& other) +{ + /* take other NAL buffer, discard our old one */ + X265_FREE(m_buffer); + m_buffer = other.m_buffer; + m_allocSize = other.m_allocSize; + m_occupancy = other.m_occupancy; + + /* copy packet data */ + m_numNal = other.m_numNal; + memcpy(m_nal, other.m_nal, sizeof(x265_nal) * m_numNal); + + /* reset other list, re-allocate their buffer with same size */ + other.m_numNal = 0; + other.m_occupancy = 0; + other.m_buffer = X265_MALLOC(uint8_t, m_allocSize); +} + +void NALList::serialize(NalUnitType nalUnitType, const Bitstream& bs) +{ + static const char startCodePrefix[] = { 0, 0, 0, 1 }; + + uint32_t payloadSize = bs.getNumberOfWrittenBytes(); + const uint8_t* bpayload = bs.getFIFO(); + if (!bpayload) + return; + + uint32_t nextSize = m_occupancy + sizeof(startCodePrefix) + 2 + payloadSize + (payloadSize >> 1) + m_extraOccupancy; + if (nextSize > m_allocSize) + { + uint8_t *temp = X265_MALLOC(uint8_t, nextSize); + if (temp) + { + memcpy(temp, m_buffer, m_occupancy); + + /* fixup existing payload pointers */ + for (uint32_t i = 0; i < m_numNal; i++) + m_nal[i].payload = temp + (m_nal[i].payload - m_buffer); + + X265_FREE(m_buffer); + m_buffer = temp; + m_allocSize = nextSize; + } + else + { + x265_log(NULL, X265_LOG_ERROR, "Unable to realloc access unit buffer\n"); + return; + } + } + + uint8_t *out = m_buffer + m_occupancy; + uint32_t bytes = 0; + + if (!m_numNal || nalUnitType == NAL_UNIT_SPS || nalUnitType == NAL_UNIT_PPS) + { + memcpy(out, startCodePrefix, 4); + bytes += 4; + } + else + { + memcpy(out, startCodePrefix + 1, 3); + bytes += 3; + } + + /* 16 bit NAL header: + * 
forbidden_zero_bit 1-bit + * nal_unit_type 6-bits + * nuh_reserved_zero_6bits 6-bits + * nuh_temporal_id_plus1 3-bits */ + out[bytes++] = (uint8_t)nalUnitType << 1; + out[bytes++] = 1; + + /* 7.4.1 ... + * Within the NAL unit, the following three-byte sequences shall not occur at + * any byte-aligned position: + * - 0x000000 + * - 0x000001 + * - 0x000002 */ + for (uint32_t i = 0; i < payloadSize; i++) + { + if (i > 2 && !out[bytes - 2] && !out[bytes - 3] && out[bytes - 1] <= 0x03) + { + /* inject 0x03 to prevent emulating a start code */ + out[bytes] = out[bytes - 1]; + out[bytes - 1] = 0x03; + bytes++; + } + + out[bytes++] = bpayload[i]; + } + + X265_CHECK(bytes <= 4 + 2 + payloadSize + (payloadSize >> 1), "NAL buffer overflow\n"); + + if (m_extraOccupancy) + { + /* these bytes were escaped by serializeSubstreams */ + memcpy(out + bytes, m_extraBuffer, m_extraOccupancy); + bytes += m_extraOccupancy; + m_extraOccupancy = 0; + } + + /* 7.4.1.1 + * ... when the last byte of the RBSP data is equal to 0x00 (which can + * only occur when the RBSP ends in a cabac_zero_word), a final byte equal + * to 0x03 is appended to the end of the data. */ + if (!out[bytes - 1]) + out[bytes++] = 0x03; + m_occupancy += bytes; + + X265_CHECK(m_numNal < (uint32_t)MAX_NAL_UNITS, "NAL count overflow\n"); + + x265_nal& nal = m_nal[m_numNal++]; + nal.type = nalUnitType; + nal.sizeBytes = bytes; + nal.payload = out; +} + +/* concatenate and escape WPP sub-streams, return escaped row lengths. 
+ * These streams will be appended to the next serialized NAL */ +uint32_t NALList::serializeSubstreams(uint32_t* streamSizeBytes, uint32_t streamCount, const Bitstream* streams) +{ + uint32_t maxStreamSize = 0; + uint32_t estSize = 0; + for (uint32_t s = 0; s < streamCount; s++) + estSize += streams[s].getNumberOfWrittenBytes(); + estSize += estSize >> 1; + + if (estSize > m_extraAllocSize) + { + uint8_t *temp = X265_MALLOC(uint8_t, estSize); + if (temp) + { + X265_FREE(m_extraBuffer); + m_extraBuffer = temp; + m_extraAllocSize = estSize; + } + else + { + x265_log(NULL, X265_LOG_ERROR, "Unable to realloc WPP substream concatenation buffer\n"); + return 0; + } + } + + uint32_t bytes = 0; + uint8_t *out = m_extraBuffer; + for (uint32_t s = 0; s < streamCount; s++) + { + const Bitstream& stream = streams[s]; + uint32_t inSize = stream.getNumberOfWrittenBytes(); + const uint8_t *inBytes = stream.getFIFO(); + uint32_t prevBufSize = bytes; + + if (inBytes) + { + for (uint32_t i = 0; i < inSize; i++) + { + if (bytes > 2 && !out[bytes - 2] && !out[bytes - 3] && out[bytes - 1] <= 0x03) + { + /* inject 0x03 to prevent emulating a start code */ + out[bytes] = out[bytes - 1]; + out[bytes - 1] = 0x03; + bytes++; + } + + out[bytes++] = inBytes[i]; + } + } + + if (s < streamCount - 1) + { + streamSizeBytes[s] = bytes - prevBufSize; + if (streamSizeBytes[s] > maxStreamSize) + maxStreamSize = streamSizeBytes[s]; + } + } + + m_extraOccupancy = bytes; + return maxStreamSize; +} diff --git a/source/encoder/nal.h b/source/encoder/nal.h new file mode 100644 index 0000000..3b55dd1 --- /dev/null +++ b/source/encoder/nal.h @@ -0,0 +1,64 @@ +/***************************************************************************** +* Copyright (C) 2013 x265 project +* +* Authors: Steve Borho +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation; either version 2 of the License, 
or +* (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software +* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +* +* This program is also available under a commercial proprietary license. +* For more information, contact us at license @ x265.com. +*****************************************************************************/ + +#ifndef X265_NAL_H +#define X265_NAL_H + +#include "common.h" +#include "x265.h" + +namespace x265 { +// private namespace + +class Bitstream; + +class NALList +{ + static const int MAX_NAL_UNITS = 16; + +public: + + x265_nal m_nal[MAX_NAL_UNITS]; + uint32_t m_numNal; + + uint8_t* m_buffer; + uint32_t m_occupancy; + uint32_t m_allocSize; + + uint8_t* m_extraBuffer; + uint32_t m_extraOccupancy; + uint32_t m_extraAllocSize; + + NALList(); + ~NALList() { X265_FREE(m_buffer); X265_FREE(m_extraBuffer); } + + void takeContents(NALList& other); + + void serialize(NalUnitType nalUnitType, const Bitstream& bs); + + uint32_t serializeSubstreams(uint32_t* streamSizeBytes, uint32_t streamCount, const Bitstream* streams); +}; + +} + +#endif // ifndef X265_NAL_H diff --git a/source/encoder/ratecontrol.cpp b/source/encoder/ratecontrol.cpp new file mode 100644 index 0000000..f54b101 --- /dev/null +++ b/source/encoder/ratecontrol.cpp @@ -0,0 +1,2382 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Sumalatha Polureddy + * Aarthi Priya Thirumalai + * Xun Xu, PPLive Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU 
General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#include "common.h" +#include "param.h" +#include "frame.h" +#include "framedata.h" +#include "picyuv.h" + +#include "encoder.h" +#include "slicetype.h" +#include "ratecontrol.h" +#include "sei.h" + +#define BR_SHIFT 6 +#define CPB_SHIFT 4 + +using namespace x265; + +/* Amortize the partial cost of I frames over the next N frames */ +const double RateControl::s_amortizeFraction = 0.85; +const int RateControl::s_amortizeFrames = 75; +const int RateControl::s_slidingWindowFrames = 20; +const char *RateControl::s_defaultStatFileName = "x265_2pass.log"; + +namespace { +#define CMP_OPT_FIRST_PASS(opt, param_val)\ +{\ + bErr = 0;\ + p = strstr(opts, opt "=");\ + char* q = strstr(opts, "no-"opt);\ + if (p && sscanf(p, opt "=%d" , &i) && param_val != i)\ + bErr = 1;\ + else if (!param_val && !q && !p)\ + bErr = 1;\ + else if (param_val && (q || !strstr(opts, opt)))\ + bErr = 1;\ + if (bErr)\ + {\ + x265_log(m_param, X265_LOG_ERROR, "different " opt " setting than first pass (%d vs %d)\n", param_val, i);\ + return false;\ + }\ +} + +inline int calcScale(uint32_t x) +{ + static uint8_t lut[16] = {4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0}; + int 
y, z = (((x & 0xffff) - 1) >> 27) & 16; + x >>= z; + z += y = (((x & 0xff) - 1) >> 28) & 8; + x >>= y; + z += y = (((x & 0xf) - 1) >> 29) & 4; + x >>= y; + return z + lut[x&0xf]; +} + +inline int calcLength(uint32_t x) +{ + static uint8_t lut[16] = {4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0}; + int y, z = (((x >> 16) - 1) >> 27) & 16; + x >>= z ^ 16; + z += y = ((x - 0x100) >> 28) & 8; + x >>= y ^ 8; + z += y = ((x - 0x10) >> 29) & 4; + x >>= y ^ 4; + return z + lut[x]; +} + +inline void reduceFraction(int* n, int* d) +{ + int a = *n; + int b = *d; + int c; + if (!a || !b) + return; + c = a % b; + while (c) + { + a = b; + b = c; + c = a % b; + } + *n /= b; + *d /= b; +} + +inline char *strcatFilename(const char *input, const char *suffix) +{ + char *output = X265_MALLOC(char, strlen(input) + strlen(suffix) + 1); + if (!output) + { + x265_log(NULL, X265_LOG_ERROR, "unable to allocate memory for filename\n"); + return NULL; + } + strcpy(output, input); + strcat(output, suffix); + return output; +} + +inline double qScale2bits(RateControlEntry *rce, double qScale) +{ + if (qScale < 0.1) + qScale = 0.1; + return (rce->coeffBits + .1) * pow(rce->qScale / qScale, 1.1) + + rce->mvBits * pow(X265_MAX(rce->qScale, 1) / X265_MAX(qScale, 1), 0.5) + + rce->miscBits; +} + +inline void copyRceData(RateControlEntry* rce, RateControlEntry* rce2Pass) +{ + rce->coeffBits = rce2Pass->coeffBits; + rce->mvBits = rce2Pass->mvBits; + rce->miscBits = rce2Pass->miscBits; + rce->iCuCount = rce2Pass->iCuCount; + rce->pCuCount = rce2Pass->pCuCount; + rce->skipCuCount = rce2Pass->skipCuCount; + rce->keptAsRef = rce2Pass->keptAsRef; + rce->qScale = rce2Pass->qScale; + rce->newQScale = rce2Pass->newQScale; + rce->expectedBits = rce2Pass->expectedBits; + rce->expectedVbv = rce2Pass->expectedVbv; + rce->blurredComplexity = rce2Pass->blurredComplexity; + rce->sliceType = rce2Pass->sliceType; +} + +} // end anonymous namespace +/* Compute variance to derive AC energy of each block */ +static 
inline uint32_t acEnergyVar(Frame *curFrame, uint64_t sum_ssd, int shift, int i) +{ + uint32_t sum = (uint32_t)sum_ssd; + uint32_t ssd = (uint32_t)(sum_ssd >> 32); + + curFrame->m_lowres.wp_sum[i] += sum; + curFrame->m_lowres.wp_ssd[i] += ssd; + return ssd - ((uint64_t)sum * sum >> shift); +} + +/* Find the energy of each block in Y/Cb/Cr plane */ +static inline uint32_t acEnergyPlane(Frame *curFrame, pixel* src, intptr_t srcStride, int bChroma, int colorFormat) +{ + if ((colorFormat != X265_CSP_I444) && bChroma) + { + ALIGN_VAR_8(pixel, pix[8 * 8]); + primitives.luma_copy_pp[LUMA_8x8](pix, 8, src, srcStride); + return acEnergyVar(curFrame, primitives.var[BLOCK_8x8](pix, 8), 6, bChroma); + } + else + return acEnergyVar(curFrame, primitives.var[BLOCK_16x16](src, srcStride), 8, bChroma); +} + +/* Find the total AC energy of each block in all planes */ +uint32_t RateControl::acEnergyCu(Frame* curFrame, uint32_t block_x, uint32_t block_y) +{ + intptr_t stride = curFrame->m_origPicYuv->m_stride; + intptr_t cStride = curFrame->m_origPicYuv->m_strideC; + intptr_t blockOffsetLuma = block_x + (block_y * stride); + int colorFormat = m_param->internalCsp; + int hShift = CHROMA_H_SHIFT(colorFormat); + int vShift = CHROMA_V_SHIFT(colorFormat); + intptr_t blockOffsetChroma = (block_x >> hShift) + ((block_y >> vShift) * cStride); + + uint32_t var; + + var = acEnergyPlane(curFrame, curFrame->m_origPicYuv->m_picOrg[0] + blockOffsetLuma, stride, 0, colorFormat); + var += acEnergyPlane(curFrame, curFrame->m_origPicYuv->m_picOrg[1] + blockOffsetChroma, cStride, 1, colorFormat); + var += acEnergyPlane(curFrame, curFrame->m_origPicYuv->m_picOrg[2] + blockOffsetChroma, cStride, 2, colorFormat); + x265_emms(); + return var; +} + +void RateControl::calcAdaptiveQuantFrame(Frame *curFrame) +{ + /* Actual adaptive quantization */ + int maxCol = curFrame->m_origPicYuv->m_picWidth; + int maxRow = curFrame->m_origPicYuv->m_picHeight; + + for (int y = 0; y < 3; y++) + { + 
curFrame->m_lowres.wp_ssd[y] = 0; + curFrame->m_lowres.wp_sum[y] = 0; + } + + /* Calculate Qp offset for each 16x16 block in the frame */ + int block_xy = 0; + int block_x = 0, block_y = 0; + double strength = 0.f; + if (m_param->rc.aqMode == X265_AQ_NONE || m_param->rc.aqStrength == 0) + { + /* Need to init it anyways for CU tree */ + int cuWidth = ((maxCol / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS; + int cuHeight = ((maxRow / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS; + int cuCount = cuWidth * cuHeight; + + if (m_param->rc.aqMode && m_param->rc.aqStrength == 0) + { + memset(curFrame->m_lowres.qpCuTreeOffset, 0, cuCount * sizeof(double)); + memset(curFrame->m_lowres.qpAqOffset, 0, cuCount * sizeof(double)); + for (int cuxy = 0; cuxy < cuCount; cuxy++) + curFrame->m_lowres.invQscaleFactor[cuxy] = 256; + } + + /* Need variance data for weighted prediction */ + if (m_param->bEnableWeightedPred || m_param->bEnableWeightedBiPred) + { + for (block_y = 0; block_y < maxRow; block_y += 16) + for (block_x = 0; block_x < maxCol; block_x += 16) + acEnergyCu(curFrame, block_x, block_y); + } + } + else + { + block_xy = 0; + double avg_adj_pow2 = 0, avg_adj = 0, qp_adj = 0; + if (m_param->rc.aqMode == X265_AQ_AUTO_VARIANCE) + { + double bit_depth_correction = pow(1 << (X265_DEPTH - 8), 0.5); + for (block_y = 0; block_y < maxRow; block_y += 16) + { + for (block_x = 0; block_x < maxCol; block_x += 16) + { + uint32_t energy = acEnergyCu(curFrame, block_x, block_y); + qp_adj = pow(energy + 1, 0.1); + curFrame->m_lowres.qpCuTreeOffset[block_xy] = qp_adj; + avg_adj += qp_adj; + avg_adj_pow2 += qp_adj * qp_adj; + block_xy++; + } + } + + avg_adj /= m_ncu; + avg_adj_pow2 /= m_ncu; + strength = m_param->rc.aqStrength * avg_adj / bit_depth_correction; + avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - (11.f * bit_depth_correction)) / avg_adj; + } + else + strength = m_param->rc.aqStrength * 1.0397f; + + block_xy = 0; + for (block_y = 0; block_y < maxRow; block_y += 16) + 
{ + for (block_x = 0; block_x < maxCol; block_x += 16) + { + if (m_param->rc.aqMode == X265_AQ_AUTO_VARIANCE) + { + qp_adj = curFrame->m_lowres.qpCuTreeOffset[block_xy]; + qp_adj = strength * (qp_adj - avg_adj); + } + else + { + uint32_t energy = acEnergyCu(curFrame, block_x, block_y); + qp_adj = strength * (X265_LOG2(X265_MAX(energy, 1)) - (14.427f + 2 * (X265_DEPTH - 8))); + } + curFrame->m_lowres.qpAqOffset[block_xy] = qp_adj; + curFrame->m_lowres.qpCuTreeOffset[block_xy] = qp_adj; + curFrame->m_lowres.invQscaleFactor[block_xy] = x265_exp2fix8(qp_adj); + block_xy++; + } + } + } + + if (m_param->bEnableWeightedPred || m_param->bEnableWeightedBiPred) + { + int hShift = CHROMA_H_SHIFT(m_param->internalCsp); + int vShift = CHROMA_V_SHIFT(m_param->internalCsp); + maxCol = ((maxCol + 8) >> 4) << 4; + maxRow = ((maxRow + 8) >> 4) << 4; + int width[3] = { maxCol, maxCol >> hShift, maxCol >> hShift }; + int height[3] = { maxRow, maxRow >> vShift, maxRow >> vShift }; + + for (int i = 0; i < 3; i++) + { + uint64_t sum, ssd; + sum = curFrame->m_lowres.wp_sum[i]; + ssd = curFrame->m_lowres.wp_ssd[i]; + curFrame->m_lowres.wp_ssd[i] = ssd - (sum * sum + (width[i] * height[i]) / 2) / (width[i] * height[i]); + } + } +} + +RateControl::RateControl(x265_param *p) +{ + m_param = p; + int lowresCuWidth = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS; + int lowresCuHeight = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS; + m_ncu = lowresCuWidth * lowresCuHeight; + + if (m_param->rc.cuTree) + m_qCompress = 1; + else + m_qCompress = m_param->rc.qCompress; + + // validate for param->rc, maybe it is need to add a function like x265_parameters_valiate() + m_residualFrames = 0; + m_partialResidualFrames = 0; + m_residualCost = 0; + m_partialResidualCost = 0; + m_rateFactorMaxIncrement = 0; + m_rateFactorMaxDecrement = 0; + m_fps = m_param->fpsNum / m_param->fpsDenom; + m_startEndOrder.set(0); + m_bTerminated = false; + 
m_finalFrameCount = 0; + m_numEntries = 0; + if (m_param->rc.rateControlMode == X265_RC_CRF) + { + m_param->rc.qp = (int)m_param->rc.rfConstant; + m_param->rc.bitrate = 0; + + double baseCplx = m_ncu * (m_param->bframes ? 120 : 80); + double mbtree_offset = m_param->rc.cuTree ? (1.0 - m_param->rc.qCompress) * 13.5 : 0; + m_rateFactorConstant = pow(baseCplx, 1 - m_qCompress) / + x265_qp2qScale(m_param->rc.rfConstant + mbtree_offset); + if (m_param->rc.rfConstantMax) + { + m_rateFactorMaxIncrement = m_param->rc.rfConstantMax - m_param->rc.rfConstant; + if (m_rateFactorMaxIncrement <= 0) + { + x265_log(m_param, X265_LOG_WARNING, "CRF max must be greater than CRF\n"); + m_rateFactorMaxIncrement = 0; + } + } + if (m_param->rc.rfConstantMin) + m_rateFactorMaxDecrement = m_param->rc.rfConstant - m_param->rc.rfConstantMin; + } + m_isAbr = m_param->rc.rateControlMode != X265_RC_CQP && !m_param->rc.bStatRead; + m_2pass = m_param->rc.rateControlMode == X265_RC_ABR && m_param->rc.bStatRead; + m_bitrate = m_param->rc.bitrate * 1000; + m_frameDuration = (double)m_param->fpsDenom / m_param->fpsNum; + m_qp = m_param->rc.qp; + m_lastRceq = 1; /* handles the cmplxrsum when the previous frame cost is zero */ + m_shortTermCplxSum = 0; + m_shortTermCplxCount = 0; + m_lastNonBPictType = I_SLICE; + m_isAbrReset = false; + m_lastAbrResetPoc = -1; + m_statFileOut = NULL; + m_cutreeStatFileOut = m_cutreeStatFileIn = NULL; + m_rce2Pass = NULL; + + // vbv initialization + m_param->rc.vbvBufferSize = Clip3(0, 2000000, m_param->rc.vbvBufferSize); + m_param->rc.vbvMaxBitrate = Clip3(0, 2000000, m_param->rc.vbvMaxBitrate); + m_param->rc.vbvBufferInit = Clip3(0.0, 2000000.0, m_param->rc.vbvBufferInit); + m_singleFrameVbv = 0; + if (m_param->rc.vbvBufferSize) + { + if (m_param->rc.rateControlMode == X265_RC_CQP) + { + x265_log(m_param, X265_LOG_WARNING, "VBV is incompatible with constant QP, ignored.\n"); + m_param->rc.vbvBufferSize = 0; + m_param->rc.vbvMaxBitrate = 0; + } + else if 
(m_param->rc.vbvMaxBitrate == 0) + { + if (m_param->rc.rateControlMode == X265_RC_ABR) + { + x265_log(m_param, X265_LOG_WARNING, "VBV maxrate unspecified, assuming CBR\n"); + m_param->rc.vbvMaxBitrate = m_param->rc.bitrate; + } + else + { + x265_log(m_param, X265_LOG_WARNING, "VBV bufsize set but maxrate unspecified, ignored\n"); + m_param->rc.vbvBufferSize = 0; + } + } + else if (m_param->rc.vbvMaxBitrate < m_param->rc.bitrate && + m_param->rc.rateControlMode == X265_RC_ABR) + { + x265_log(m_param, X265_LOG_WARNING, "max bitrate less than average bitrate, assuming CBR\n"); + m_param->rc.bitrate = m_param->rc.vbvMaxBitrate; + } + } + else if (m_param->rc.vbvMaxBitrate) + { + x265_log(m_param, X265_LOG_WARNING, "VBV maxrate specified, but no bufsize, ignored\n"); + m_param->rc.vbvMaxBitrate = 0; + } + m_isVbv = m_param->rc.vbvMaxBitrate > 0 && m_param->rc.vbvBufferSize > 0; + if (m_param->bEmitHRDSEI && !m_isVbv) + { + x265_log(m_param, X265_LOG_WARNING, "NAL HRD parameters require VBV parameters, ignored\n"); + m_param->bEmitHRDSEI = 0; + } + + m_isCbr = m_param->rc.rateControlMode == X265_RC_ABR && m_isVbv && !m_2pass && m_param->rc.vbvMaxBitrate <= m_param->rc.bitrate; + m_leadingBframes = m_param->bframes; + m_bframeBits = 0; + m_leadingNoBSatd = 0; + m_ipOffset = 6.0 * X265_LOG2(m_param->rc.ipFactor); + m_pbOffset = 6.0 * X265_LOG2(m_param->rc.pbFactor); + + /* Adjust the first frame in order to stabilize the quality level compared to the rest */ +#define ABR_INIT_QP_MIN (24) +#define ABR_INIT_QP_MAX (40) +#define CRF_INIT_QP (int)m_param->rc.rfConstant + for (int i = 0; i < 3; i++) + m_lastQScaleFor[i] = x265_qp2qScale(m_param->rc.rateControlMode == X265_RC_CRF ? 
CRF_INIT_QP : ABR_INIT_QP_MIN); + + if (m_param->rc.rateControlMode == X265_RC_CQP) + { + if (m_qp && !m_param->bLossless) + { + m_qpConstant[P_SLICE] = m_qp; + m_qpConstant[I_SLICE] = Clip3(0, QP_MAX_MAX, (int)(m_qp - m_ipOffset + 0.5)); + m_qpConstant[B_SLICE] = Clip3(0, QP_MAX_MAX, (int)(m_qp + m_pbOffset + 0.5)); + } + else + { + m_qpConstant[P_SLICE] = m_qpConstant[I_SLICE] = m_qpConstant[B_SLICE] = m_qp; + } + } + + /* qstep - value set as encoder specific */ + m_lstep = pow(2, m_param->rc.qpStep / 6.0); + + for (int i = 0; i < 2; i++) + m_cuTreeStats.qpBuffer[i] = NULL; +} + +bool RateControl::init(const SPS *sps) +{ + if (m_isVbv) + { + /* We don't support changing the ABR bitrate right now, + * so if the stream starts as CBR, keep it CBR. */ + if (m_param->rc.vbvBufferSize < (int)(m_param->rc.vbvMaxBitrate / m_fps)) + { + m_param->rc.vbvBufferSize = (int)(m_param->rc.vbvMaxBitrate / m_fps); + x265_log(m_param, X265_LOG_WARNING, "VBV buffer size cannot be smaller than one frame, using %d kbit\n", + m_param->rc.vbvBufferSize); + } + int vbvBufferSize = m_param->rc.vbvBufferSize * 1000; + int vbvMaxBitrate = m_param->rc.vbvMaxBitrate * 1000; + + if (m_param->bEmitHRDSEI) + { + const HRDInfo* hrd = &sps->vuiParameters.hrdParameters; + vbvBufferSize = hrd->cpbSizeValue << (hrd->cpbSizeScale + CPB_SHIFT); + vbvMaxBitrate = hrd->bitRateValue << (hrd->bitRateScale + BR_SHIFT); + } + m_bufferRate = vbvMaxBitrate / m_fps; + m_vbvMaxRate = vbvMaxBitrate; + m_bufferSize = vbvBufferSize; + m_singleFrameVbv = m_bufferRate * 1.1 > m_bufferSize; + + if (m_param->rc.vbvBufferInit > 1.) 
+ m_param->rc.vbvBufferInit = Clip3(0.0, 1.0, m_param->rc.vbvBufferInit / m_param->rc.vbvBufferSize); + m_param->rc.vbvBufferInit = Clip3(0.0, 1.0, X265_MAX(m_param->rc.vbvBufferInit, m_bufferRate / m_bufferSize)); + m_bufferFillFinal = m_bufferSize * m_param->rc.vbvBufferInit; + } + + m_totalBits = 0; + m_framesDone = 0; + m_residualCost = 0; + m_partialResidualCost = 0; + for (int i = 0; i < s_slidingWindowFrames; i++) + { + m_satdCostWindow[i] = 0; + m_encodedBitsWindow[i] = 0; + } + m_sliderPos = 0; + + /* 720p videos seem to be a good cutoff for cplxrSum */ + double tuneCplxFactor = (m_param->rc.cuTree && m_ncu > 3600) ? 2.5 : 1; + + /* estimated ratio that produces a reasonable QP for the first I-frame */ + m_cplxrSum = .01 * pow(7.0e5, m_qCompress) * pow(m_ncu, 0.5) * tuneCplxFactor; + m_wantedBitsWindow = m_bitrate * m_frameDuration; + m_accumPNorm = .01; + m_accumPQp = (m_param->rc.rateControlMode == X265_RC_CRF ? CRF_INIT_QP : ABR_INIT_QP_MIN) * m_accumPNorm; + + /* Frame Predictors and Row predictors used in vbv */ + for (int i = 0; i < 5; i++) + { + m_pred[i].coeff = 2.0; + m_pred[i].count = 1.0; + m_pred[i].decay = 0.5; + m_pred[i].offset = 0.0; + } + m_predBfromP = m_pred[0]; + if (!m_statFileOut && (m_param->rc.bStatWrite || m_param->rc.bStatRead)) + { + /* If the user hasn't defined the stat filename, use the default value */ + const char *fileName = m_param->rc.statFileName; + if (!fileName) + fileName = s_defaultStatFileName; + /* Load stat file and init 2pass algo */ + if (m_param->rc.bStatRead) + { + m_expectedBitsSum = 0; + char *p, *statsIn, *statsBuf; + /* read 1st pass stats */ + statsIn = statsBuf = x265_slurp_file(fileName); + if (!statsBuf) + return false; + if (m_param->rc.cuTree) + { + char *tmpFile = strcatFilename(fileName, ".cutree"); + if (!tmpFile) + return false; + m_cutreeStatFileIn = fopen(tmpFile, "rb"); + X265_FREE(tmpFile); + if (!m_cutreeStatFileIn) + { + x265_log(m_param, X265_LOG_ERROR, "can't open stats file %s\n", 
tmpFile); + return false; + } + } + + /* check whether 1st pass options were compatible with current options */ + if (strncmp(statsBuf, "#options:", 9)) + { + x265_log(m_param, X265_LOG_ERROR,"options list in stats file not valid\n"); + return false; + } + { + int i, j; + uint32_t k , l; + bool bErr = false; + char *opts = statsBuf; + statsIn = strchr(statsBuf, '\n'); + if (!statsIn) + { + x265_log(m_param, X265_LOG_ERROR, "Malformed stats file\n"); + return false; + } + *statsIn = '\0'; + statsIn++; + if (sscanf(opts, "#options: %dx%d", &i, &j) != 2) + { + x265_log(m_param, X265_LOG_ERROR, "Resolution specified in stats file not valid\n"); + return false; + } + if ((p = strstr(opts, " fps=")) == 0 || sscanf(p, " fps=%u/%u", &k, &l) != 2) + { + x265_log(m_param, X265_LOG_ERROR, "fps specified in stats file not valid\n"); + return false; + } + if (k != m_param->fpsNum || l != m_param->fpsDenom) + { + x265_log(m_param, X265_LOG_ERROR, "fps mismatch with 1st pass (%u/%u vs %u/%u)\n", + m_param->fpsNum, m_param->fpsDenom, k, l); + return false; + } + CMP_OPT_FIRST_PASS("bitdepth", m_param->internalBitDepth); + CMP_OPT_FIRST_PASS("weightp", m_param->bEnableWeightedPred); + CMP_OPT_FIRST_PASS("bframes", m_param->bframes); + CMP_OPT_FIRST_PASS("b-pyramid", m_param->bBPyramid); + CMP_OPT_FIRST_PASS("open-gop", m_param->bOpenGOP); + CMP_OPT_FIRST_PASS("keyint", m_param->keyframeMax); + CMP_OPT_FIRST_PASS("scenecut", m_param->scenecutThreshold); + + if ((p = strstr(opts, "b-adapt=")) != 0 && sscanf(p, "b-adapt=%d", &i) && i >= X265_B_ADAPT_NONE && i <= X265_B_ADAPT_TRELLIS) + { + m_param->bFrameAdaptive = i; + } + else if (m_param->bframes) + { + x265_log(m_param, X265_LOG_ERROR, "b-adapt method specified in stats file not valid\n"); + return false; + } + + if ((p = strstr(opts, "rc-lookahead=")) != 0 && sscanf(p, "rc-lookahead=%d", &i)) + m_param->lookaheadDepth = i; + } + /* find number of pics */ + p = statsIn; + int numEntries; + for (numEntries = -1; p; numEntries++) + 
p = strchr(p + 1, ';'); + if (!numEntries) + { + x265_log(m_param, X265_LOG_ERROR, "empty stats file\n"); + return false; + } + m_numEntries = numEntries; + + if (m_param->totalFrames < m_numEntries && m_param->totalFrames > 0) + { + x265_log(m_param, X265_LOG_WARNING, "2nd pass has fewer frames than 1st pass (%d vs %d)\n", + m_param->totalFrames, m_numEntries); + } + if (m_param->totalFrames > m_numEntries) + { + x265_log(m_param, X265_LOG_ERROR, "2nd pass has more frames than 1st pass (%d vs %d)\n", + m_param->totalFrames, m_numEntries); + return false; + } + + m_rce2Pass = X265_MALLOC(RateControlEntry, m_numEntries); + if (!m_rce2Pass) + { + x265_log(m_param, X265_LOG_ERROR, "Rce Entries for 2 pass cannot be allocated\n"); + return false; + } + /* init all to skipped p frames */ + for (int i = 0; i < m_numEntries; i++) + { + RateControlEntry *rce = &m_rce2Pass[i]; + rce->sliceType = P_SLICE; + rce->qScale = rce->newQScale = x265_qp2qScale(20); + rce->miscBits = m_ncu + 10; + rce->newQp = 0; + } + /* read stats */ + p = statsIn; + double totalQpAq = 0; + for (int i = 0; i < m_numEntries; i++) + { + RateControlEntry *rce; + int frameNumber; + char picType; + int e; + char *next; + double qpRc, qpAq; + next = strstr(p, ";"); + if (next) + *next++ = 0; + e = sscanf(p, " in:%d ", &frameNumber); + if (frameNumber < 0 || frameNumber >= m_numEntries) + { + x265_log(m_param, X265_LOG_ERROR, "bad frame number (%d) at stats line %d\n", frameNumber, i); + return false; + } + rce = &m_rce2Pass[frameNumber]; + e += sscanf(p, " in:%*d out:%*d type:%c q:%lf q-aq:%lf tex:%d mv:%d misc:%d icu:%lf pcu:%lf scu:%lf", + &picType, &qpRc, &qpAq, &rce->coeffBits, + &rce->mvBits, &rce->miscBits, &rce->iCuCount, &rce->pCuCount, + &rce->skipCuCount); + rce->keptAsRef = true; + if (picType == 'b' || picType == 'p') + rce->keptAsRef = false; + if (picType == 'I' || picType == 'i') + rce->sliceType = I_SLICE; + else if (picType == 'P' || picType == 'p') + rce->sliceType = P_SLICE; + else if 
(picType == 'B' || picType == 'b') + rce->sliceType = B_SLICE; + else + e = -1; + if (e < 10) + { + x265_log(m_param, X265_LOG_ERROR, "statistics are damaged at line %d, parser out=%d\n", i, e); + return false; + } + rce->qScale = x265_qp2qScale(qpRc); + totalQpAq += qpAq; + p = next; + } + X265_FREE(statsBuf); + + if (m_param->rc.rateControlMode == X265_RC_ABR) + { + if (!initPass2()) + return false; + } /* else we're using constant quant, so no need to run the bitrate allocation */ + } + /* Open output file */ + /* If input and output files are the same, output to a temp file + * and move it to the real name only when it's complete */ + if (m_param->rc.bStatWrite) + { + char *p, *statFileTmpname; + statFileTmpname = strcatFilename(fileName, ".temp"); + if (!statFileTmpname) + return false; + m_statFileOut = fopen(statFileTmpname, "wb"); + X265_FREE(statFileTmpname); + if (!m_statFileOut) + { + x265_log(m_param, X265_LOG_ERROR, "can't open stats file %s\n", statFileTmpname); + return false; + } + p = x265_param2string(m_param); + if (p) + fprintf(m_statFileOut, "#options: %s\n", p); + X265_FREE(p); + if (m_param->rc.cuTree && !m_param->rc.bStatRead) + { + statFileTmpname = strcatFilename(fileName, ".cutree.temp"); + if (!statFileTmpname) + return false; + m_cutreeStatFileOut = fopen(statFileTmpname, "wb"); + X265_FREE(statFileTmpname); + if (!m_cutreeStatFileOut) + { + x265_log(m_param, X265_LOG_ERROR, "can't open mbtree stats file %s\n", statFileTmpname); + return false; + } + } + } + if (m_param->rc.cuTree) + { + m_cuTreeStats.qpBuffer[0] = X265_MALLOC(uint16_t, m_ncu * sizeof(uint16_t)); + if (m_param->bBPyramid && m_param->rc.bStatRead) + m_cuTreeStats.qpBuffer[1] = X265_MALLOC(uint16_t, m_ncu * sizeof(uint16_t)); + m_cuTreeStats.qpBufPos = -1; + } + } + return true; +} + +void RateControl::initHRD(SPS *sps) +{ + int vbvBufferSize = m_param->rc.vbvBufferSize * 1000; + int vbvMaxBitrate = m_param->rc.vbvMaxBitrate * 1000; + + // Init HRD + HRDInfo* hrd = 
&sps->vuiParameters.hrdParameters; + hrd->cbrFlag = m_isCbr; + + // normalize HRD size and rate to the value / scale notation + hrd->bitRateScale = Clip3(0, 15, calcScale(vbvMaxBitrate) - BR_SHIFT); + hrd->bitRateValue = (vbvMaxBitrate >> (hrd->bitRateScale + BR_SHIFT)); + + hrd->cpbSizeScale = Clip3(0, 15, calcScale(vbvBufferSize) - CPB_SHIFT); + hrd->cpbSizeValue = (vbvBufferSize >> (hrd->cpbSizeScale + CPB_SHIFT)); + int bitRateUnscale = hrd->bitRateValue << (hrd->bitRateScale + BR_SHIFT); + int cpbSizeUnscale = hrd->cpbSizeValue << (hrd->cpbSizeScale + CPB_SHIFT); + + // arbitrary + #define MAX_DURATION 0.5 + + TimingInfo *time = &sps->vuiParameters.timingInfo; + int maxCpbOutputDelay = (int)(X265_MIN(m_param->keyframeMax * MAX_DURATION * time->timeScale / time->numUnitsInTick, INT_MAX)); + int maxDpbOutputDelay = (int)(sps->maxDecPicBuffering * MAX_DURATION * time->timeScale / time->numUnitsInTick); + int maxDelay = (int)(90000.0 * cpbSizeUnscale / bitRateUnscale + 0.5); + + hrd->initialCpbRemovalDelayLength = 2 + Clip3(4, 22, 32 - calcLength(maxDelay)); + hrd->cpbRemovalDelayLength = Clip3(4, 31, 32 - calcLength(maxCpbOutputDelay)); + hrd->dpbOutputDelayLength = Clip3(4, 31, 32 - calcLength(maxDpbOutputDelay)); + + #undef MAX_DURATION +} + +bool RateControl::initPass2() +{ + uint64_t allConstBits = 0; + uint64_t allAvailableBits = uint64_t(m_param->rc.bitrate * 1000. * m_numEntries * m_frameDuration); + double rateFactor, stepMult; + double qBlur = m_param->rc.qblur; + double cplxBlur = m_param->rc.complexityBlur; + const int filterSize = (int)(qBlur * 4) | 1; + double expectedBits; + double *qScale, *blurredQscale; + double baseCplx = m_ncu * (m_param->bframes ? 
120 : 80); + double clippedDuration = CLIP_DURATION(m_frameDuration) / BASE_FRAME_DURATION; + + /* find total/average complexity & const_bits */ + for (int i = 0; i < m_numEntries; i++) + allConstBits += m_rce2Pass[i].miscBits; + + if (allAvailableBits < allConstBits) + { + x265_log(m_param, X265_LOG_ERROR, "requested bitrate is too low. estimated minimum is %d kbps\n", + (int)(allConstBits * m_fps / m_numEntries * 1000.)); + return false; + } + + /* Blur complexities, to reduce local fluctuation of QP. + * We don't blur the QPs directly, because then one very simple frame + * could drag down the QP of a nearby complex frame and give it more + * bits than intended. */ + for (int i = 0; i < m_numEntries; i++) + { + double weightSum = 0; + double cplxSum = 0; + double weight = 1.0; + double gaussianWeight; + /* weighted average of cplx of future frames */ + for (int j = 1; j < cplxBlur * 2 && j < m_numEntries - i; j++) + { + RateControlEntry *rcj = &m_rce2Pass[i + j]; + weight *= 1 - pow(rcj->iCuCount / m_ncu, 2); + if (weight < 0.0001) + break; + gaussianWeight = weight * exp(-j * j / 200.0); + weightSum += gaussianWeight; + cplxSum += gaussianWeight * (qScale2bits(rcj, 1) - rcj->miscBits) / clippedDuration; + } + /* weighted average of cplx of past frames */ + weight = 1.0; + for (int j = 0; j <= cplxBlur * 2 && j <= i; j++) + { + RateControlEntry *rcj = &m_rce2Pass[i - j]; + gaussianWeight = weight * exp(-j * j / 200.0); + weightSum += gaussianWeight; + cplxSum += gaussianWeight * (qScale2bits(rcj, 1) - rcj->miscBits) / clippedDuration; + weight *= 1 - pow(rcj->iCuCount / m_ncu, 2); + if (weight < .0001) + break; + } + m_rce2Pass[i].blurredComplexity = cplxSum / weightSum; + } + + CHECKED_MALLOC(qScale, double, m_numEntries); + if (filterSize > 1) + { + CHECKED_MALLOC(blurredQscale, double, m_numEntries); + } + else + blurredQscale = qScale; + + /* Search for a factor which, when multiplied by the RCEQ values from + * each frame, adds up to the desired total size. 
+ * There is no exact closed-form solution because of VBV constraints and + * because qscale2bits is not invertible, but we can start with the simple + * approximation of scaling the 1st pass by the ratio of bitrates. + * The search range is probably overkill, but speed doesn't matter here. */ + + expectedBits = 1; + for (int i = 0; i < m_numEntries; i++) + { + RateControlEntry* rce = &m_rce2Pass[i]; + double q = getQScale(rce, 1.0); + expectedBits += qScale2bits(rce, q); + m_lastQScaleFor[rce->sliceType] = q; + } + stepMult = allAvailableBits / expectedBits; + + rateFactor = 0; + for (double step = 1E4 * stepMult; step > 1E-7 * stepMult; step *= 0.5) + { + expectedBits = 0; + rateFactor += step; + + m_lastNonBPictType = -1; + m_lastAccumPNorm = 1; + m_accumPNorm = 0; + + m_lastQScaleFor[0] = m_lastQScaleFor[1] = + m_lastQScaleFor[2] = pow(baseCplx, 1 - m_qCompress) / rateFactor; + + /* find qscale */ + for (int i = 0; i < m_numEntries; i++) + { + RateControlEntry *rce = &m_rce2Pass[i]; + qScale[i] = getQScale(rce, rateFactor); + m_lastQScaleFor[rce->sliceType] = qScale[i]; + } + + /* fixed I/B qscale relative to P */ + for (int i = m_numEntries - 1; i >= 0; i--) + { + qScale[i] = getDiffLimitedQScale(&m_rce2Pass[i], qScale[i]); + X265_CHECK(qScale[i] >= 0, "qScale became negative\n"); + } + + /* smooth curve */ + if (filterSize > 1) + { + X265_CHECK(filterSize % 2 == 1, "filterSize not an odd number\n"); + for (int i = 0; i < m_numEntries; i++) + { + double q = 0.0, sum = 0.0; + + for (int j = 0; j < filterSize; j++) + { + int idx = i + j - filterSize / 2; + double d = idx - i; + double coeff = qBlur == 0 ? 
1.0 : exp(-d * d / (qBlur * qBlur)); + if (idx < 0 || idx >= m_numEntries) + continue; + if (m_rce2Pass[i].sliceType != m_rce2Pass[idx].sliceType) + continue; + q += qScale[idx] * coeff; + sum += coeff; + } + blurredQscale[i] = q / sum; + } + } + + /* find expected bits */ + for (int i = 0; i < m_numEntries; i++) + { + RateControlEntry *rce = &m_rce2Pass[i]; + rce->newQScale = clipQscale(NULL, rce, blurredQscale[i]); // check if needed + X265_CHECK(rce->newQScale >= 0, "new Qscale is negative\n"); + expectedBits += qScale2bits(rce, rce->newQScale); + } + + if (expectedBits > allAvailableBits) + rateFactor -= step; + } + + X265_FREE(qScale); + if (filterSize > 1) + X265_FREE(blurredQscale); + + if (m_isVbv) + if (!vbv2Pass(allAvailableBits)) + return false; + expectedBits = countExpectedBits(); + + if (fabs(expectedBits / allAvailableBits - 1.0) > 0.01) + { + double avgq = 0; + for (int i = 0; i < m_numEntries; i++) + avgq += m_rce2Pass[i].newQScale; + avgq = x265_qScale2qp(avgq / m_numEntries); + + if (expectedBits > allAvailableBits || !m_isVbv) + x265_log(m_param, X265_LOG_WARNING, "Error: 2pass curve failed to converge\n"); + x265_log(m_param, X265_LOG_WARNING, "target: %.2f kbit/s, expected: %.2f kbit/s, avg QP: %.4f\n", + (double)m_param->rc.bitrate, + expectedBits * m_fps / (m_numEntries * 1000.), + avgq); + if (expectedBits < allAvailableBits && avgq < QP_MIN + 2) + { + x265_log(m_param, X265_LOG_WARNING, "try reducing target bitrate\n"); + } + else if (expectedBits > allAvailableBits && avgq > QP_MAX_SPEC - 2) + { + x265_log(m_param, X265_LOG_WARNING, "try increasing target bitrate\n"); + } + else if (!(m_2pass && m_isVbv)) + x265_log(m_param, X265_LOG_WARNING, "internal error\n"); + } + + return true; + +fail: + x265_log(m_param, X265_LOG_WARNING, "two-pass ABR initialization failed\n"); + return false; +} + +bool RateControl::vbv2Pass(uint64_t allAvailableBits) +{ + /* for each interval of bufferFull .. 
underflow, uniformly increase the qp of all + * frames in the interval until either buffer is full at some intermediate frame or the + * last frame in the interval no longer underflows. Recompute intervals and repeat. + * Then do the converse to put bits back into overflow areas until target size is met */ + + double *fills; + double expectedBits = 0; + double adjustment; + double prevBits = 0; + int t0, t1; + int iterations = 0 , adjMin, adjMax; + CHECKED_MALLOC(fills, double, m_numEntries + 1); + fills++; + + /* adjust overall stream size */ + do + { + iterations++; + prevBits = expectedBits; + + if (expectedBits) + { /* not first iteration */ + adjustment = X265_MAX(X265_MIN(expectedBits / allAvailableBits, 0.999), 0.9); + fills[-1] = m_bufferSize * m_param->rc.vbvBufferInit; + t0 = 0; + /* fix overflows */ + adjMin = 1; + while (adjMin && findUnderflow(fills, &t0, &t1, 1)) + { + adjMin = fixUnderflow(t0, t1, adjustment, MIN_QPSCALE, MAX_MAX_QPSCALE); + t0 = t1; + } + } + + fills[-1] = m_bufferSize * (1. 
- m_param->rc.vbvBufferInit); + t0 = 0; + /* fix underflows -- should be done after overflow, as we'd better undersize target than underflowing VBV */ + adjMax = 1; + while (adjMax && findUnderflow(fills, &t0, &t1, 0)) + adjMax = fixUnderflow(t0, t1, 1.001, MIN_QPSCALE, MAX_MAX_QPSCALE ); + + expectedBits = countExpectedBits(); + } + while ((expectedBits < .995 * allAvailableBits) && ((int64_t)(expectedBits+.5) > (int64_t)(prevBits+.5))); + + if (!adjMax) + x265_log(m_param, X265_LOG_WARNING, "vbv-maxrate issue, qpmax or vbv-maxrate too low\n"); + + /* store expected vbv filling values for tracking when encoding */ + for (int i = 0; i < m_numEntries; i++) + m_rce2Pass[i].expectedVbv = m_bufferSize - fills[i]; + + X265_FREE(fills - 1); + return true; + +fail: + x265_log(m_param, X265_LOG_ERROR, "malloc failure in two-pass VBV init\n"); + return false; +} + +/* In 2pass, force the same frame types as in the 1st pass */ +int RateControl::rateControlSliceType(int frameNum) +{ + if (m_param->rc.bStatRead) + { + if (frameNum >= m_numEntries) + { + /* We could try to initialize everything required for ABR and + * adaptive B-frames, but that would be complicated. + * So just calculate the average QP used so far. */ + m_param->rc.qp = (m_accumPQp < 1) ? 
ABR_INIT_QP_MAX : (int)(m_accumPQp + 0.5); + m_qpConstant[P_SLICE] = Clip3(0, QP_MAX_MAX, m_param->rc.qp); + m_qpConstant[I_SLICE] = Clip3(0, QP_MAX_MAX, (int)(m_param->rc.qp - m_ipOffset + 0.5)); + m_qpConstant[B_SLICE] = Clip3(0, QP_MAX_MAX, (int)(m_param->rc.qp + m_pbOffset + 0.5)); + + x265_log(m_param, X265_LOG_ERROR, "2nd pass has more frames than 1st pass (%d)\n", m_numEntries); + x265_log(m_param, X265_LOG_ERROR, "continuing anyway, at constant QP=%d\n", m_param->rc.qp); + if (m_param->bFrameAdaptive) + x265_log(m_param, X265_LOG_ERROR, "disabling adaptive B-frames\n"); + + m_isAbr = 0; + m_2pass = 0; + m_param->rc.rateControlMode = X265_RC_CQP; + m_param->rc.bStatRead = 0; + m_param->bFrameAdaptive = 0; + m_param->scenecutThreshold = 0; + m_param->rc.cuTree = 0; + if (m_param->bframes > 1) + m_param->bframes = 1; + return X265_TYPE_AUTO; + } + int frameType = m_rce2Pass[frameNum].sliceType == I_SLICE ? (frameNum > 0 && m_param->bOpenGOP ? X265_TYPE_I : X265_TYPE_IDR) + : m_rce2Pass[frameNum].sliceType == P_SLICE ? X265_TYPE_P + : (m_rce2Pass[frameNum].sliceType == B_SLICE && m_rce2Pass[frameNum].keptAsRef? 
X265_TYPE_BREF : X265_TYPE_B); + return frameType; + } + else + return X265_TYPE_AUTO; +} + +int RateControl::rateControlStart(Frame* curFrame, RateControlEntry* rce, Encoder* enc) +{ + int orderValue = m_startEndOrder.get(); + int startOrdinal = rce->encodeOrder * 2; + + while (orderValue < startOrdinal && !m_bTerminated) + orderValue = m_startEndOrder.waitForChange(orderValue); + + if (!curFrame) + { + // faked rateControlStart calls when the encoder is flushing + m_startEndOrder.incr(); + return 0; + } + + FrameData& curEncData = *curFrame->m_encData; + m_curSlice = curEncData.m_slice; + m_sliceType = m_curSlice->m_sliceType; + rce->sliceType = m_sliceType; + rce->poc = m_curSlice->m_poc; + if (m_param->rc.bStatRead) + { + X265_CHECK(rce->poc >= 0 && rce->poc < m_numEntries, "bad encode ordinal\n"); + copyRceData(rce, &m_rce2Pass[rce->poc]); + } + rce->isActive = true; + if (m_sliceType == B_SLICE) + rce->bframes = m_leadingBframes; + else + m_leadingBframes = curFrame->m_lowres.leadingBframes; + + rce->bLastMiniGopBFrame = curFrame->m_lowres.bLastMiniGopBFrame; + rce->bufferRate = m_bufferRate; + rce->rowCplxrSum = 0.0; + rce->rowTotalBits = 0; + if (m_isVbv) + { + if (rce->rowPreds[0][0].count == 0) + { + for (int i = 0; i < 3; i++) + { + for (int j = 0; j < 2; j++) + { + rce->rowPreds[i][j].coeff = 0.25; + rce->rowPreds[i][j].count = 1.0; + rce->rowPreds[i][j].decay = 0.5; + rce->rowPreds[i][j].offset = 0.0; + } + } + } + rce->rowPred[0] = &rce->rowPreds[m_sliceType][0]; + rce->rowPred[1] = &rce->rowPreds[m_sliceType][1]; + m_predictedBits = m_totalBits; + updateVbvPlan(enc); + rce->bufferFill = m_bufferFill; + + int mincr = enc->m_vps.ptl.minCrForLevel; + /* Profiles above Main10 don't require maxAU size check, so just set the maximum to a large value. */ + if (enc->m_vps.ptl.profileIdc > Profile::MAIN10 || enc->m_vps.ptl.levelIdc == Level::NONE) + rce->frameSizeMaximum = 1e9; + else + { + /* The spec has a special case for the first frame. 
*/ + if (rce->encodeOrder == 0) + { + /* 1.5 * (Max( PicSizeInSamplesY, fR * MaxLumaSr) + MaxLumaSr * (AuCpbRemovalTime[ 0 ] -AuNominalRemovalTime[ 0 ])) ? MinCr */ + double fr = 1. / 300; + int picSizeInSamplesY = m_param->sourceWidth * m_param->sourceHeight; + rce->frameSizeMaximum = 8 * 1.5 * X265_MAX(picSizeInSamplesY, fr * enc->m_vps.ptl.maxLumaSrForLevel) / mincr; + } + else + { + /* 1.5 * MaxLumaSr * (AuCpbRemovalTime[ n ] - AyCpbRemovalTime[ n - 1 ]) ? MinCr */ + rce->frameSizeMaximum = 8 * 1.5 * enc->m_vps.ptl.maxLumaSrForLevel * m_frameDuration / mincr; + } + } + } + if (m_isAbr || m_2pass) // ABR,CRF + { + if (m_isAbr || m_isVbv) + { + m_currentSatd = curFrame->m_lowres.satdCost >> (X265_DEPTH - 8); + /* Update rce for use in rate control VBV later */ + rce->lastSatd = m_currentSatd; + } + double q = x265_qScale2qp(rateEstimateQscale(curFrame, rce)); + q = Clip3((double)QP_MIN, (double)QP_MAX_MAX, q); + m_qp = int(q + 0.5); + rce->qpaRc = curEncData.m_avgQpRc = curEncData.m_avgQpAq = q; + /* copy value of lastRceq into thread local rce struct *to be used in RateControlEnd() */ + rce->qRceq = m_lastRceq; + accumPQpUpdate(); + } + else // CQP + { + if (m_sliceType == B_SLICE && IS_REFERENCED(curFrame)) + m_qp = (m_qpConstant[B_SLICE] + m_qpConstant[P_SLICE]) / 2; + else + m_qp = m_qpConstant[m_sliceType]; + curEncData.m_avgQpAq = curEncData.m_avgQpRc = m_qp; + } + if (m_sliceType != B_SLICE) + { + m_lastNonBPictType = m_sliceType; + m_leadingNoBSatd = m_currentSatd; + } + rce->leadingNoBSatd = m_leadingNoBSatd; + if (curFrame->m_forceqp) + { + m_qp = int32_t(curFrame->m_forceqp + 0.5) - 1; + m_qp = Clip3(QP_MIN, QP_MAX_MAX, m_qp); + rce->qpaRc = curEncData.m_avgQpRc = curEncData.m_avgQpAq = m_qp; + } + // Do not increment m_startEndOrder here. 
Make rateControlEnd of previous thread + // to wait until rateControlUpdateStats of this frame is called + m_framesDone++; + return m_qp; +} + +void RateControl::accumPQpUpdate() +{ + m_accumPQp *= .95; + m_accumPNorm *= .95; + m_accumPNorm += 1; + if (m_sliceType == I_SLICE) + m_accumPQp += m_qp + m_ipOffset; + else + m_accumPQp += m_qp; +} + +double RateControl::getDiffLimitedQScale(RateControlEntry *rce, double q) +{ + // force I/B quants as a function of P quants + const double lastPqScale = m_lastQScaleFor[P_SLICE]; + const double lastNonBqScale = m_lastQScaleFor[m_lastNonBPictType]; + if (rce->sliceType == I_SLICE) + { + double iq = q; + double pq = x265_qp2qScale(m_accumPQp / m_accumPNorm); + double ipFactor = fabs(m_param->rc.ipFactor); + /* don't apply ipFactor if the following frame is also I */ + if (m_accumPNorm <= 0) + q = iq; + else if (m_param->rc.ipFactor < 0) + q = iq / ipFactor; + else if (m_accumPNorm >= 1) + q = pq / ipFactor; + else + q = m_accumPNorm * pq / ipFactor + (1 - m_accumPNorm) * iq; + } + else if (rce->sliceType == B_SLICE) + { + if (m_param->rc.pbFactor > 0) + q = lastNonBqScale; + if (!rce->keptAsRef) + q *= fabs(m_param->rc.pbFactor); + } + else if (rce->sliceType == P_SLICE + && m_lastNonBPictType == P_SLICE + && rce->coeffBits == 0) + { + q = lastPqScale; + } + + /* last qscale / qdiff stuff */ + if (m_lastNonBPictType == rce->sliceType && + (rce->sliceType != I_SLICE || m_lastAccumPNorm < 1)) + { + double maxQscale = m_lastQScaleFor[rce->sliceType] * m_lstep; + double minQscale = m_lastQScaleFor[rce->sliceType] / m_lstep; + q = Clip3(minQscale, maxQscale, q); + } + + m_lastQScaleFor[rce->sliceType] = q; + if (rce->sliceType != B_SLICE) + m_lastNonBPictType = rce->sliceType; + if (rce->sliceType == I_SLICE) + { + m_lastAccumPNorm = m_accumPNorm; + m_accumPNorm = 0; + m_accumPQp = 0; + } + if (rce->sliceType == P_SLICE) + { + double mask = 1 - pow(rce->iCuCount / m_ncu, 2); + m_accumPQp = mask * (x265_qScale2qp(q) + m_accumPQp); 
+ m_accumPNorm = mask * (1 + m_accumPNorm); + } + + return q; +} + +double RateControl::countExpectedBits() +{ + double expectedBits = 0; + for( int i = 0; i < m_numEntries; i++ ) + { + RateControlEntry *rce = &m_rce2Pass[i]; + rce->expectedBits = (uint64_t)expectedBits; + expectedBits += qScale2bits(rce, rce->newQScale); + } + return expectedBits; +} + +bool RateControl::findUnderflow(double *fills, int *t0, int *t1, int over) +{ + /* find an interval ending on an overflow or underflow (depending on whether + * we're adding or removing bits), and starting on the earliest frame that + * can influence the buffer fill of that end frame. */ + const double bufferMin = .1 * m_bufferSize; + const double bufferMax = .9 * m_bufferSize; + double fill = fills[*t0 - 1]; + double parity = over ? 1. : -1.; + int start = -1, end = -1; + for (int i = *t0; i < m_numEntries; i++) + { + fill += (m_frameDuration * m_vbvMaxRate - + qScale2bits(&m_rce2Pass[i], m_rce2Pass[i].newQScale)) * parity; + fill = Clip3(0.0, m_bufferSize, fill); + fills[i] = fill; + if (fill <= bufferMin || i == 0) + { + if (end >= 0) + break; + start = i; + } + else if (fill >= bufferMax && start >= 0) + end = i; + } + *t0 = start; + *t1 = end; + return start >= 0 && end >= 0; +} + +bool RateControl::fixUnderflow(int t0, int t1, double adjustment, double qscaleMin, double qscaleMax) +{ + double qscaleOrig, qscaleNew; + bool adjusted = false; + if (t0 > 0) + t0++; + for (int i = t0; i <= t1; i++) + { + qscaleOrig = m_rce2Pass[i].newQScale; + qscaleOrig = Clip3(qscaleMin, qscaleMax, qscaleOrig); + qscaleNew = qscaleOrig * adjustment; + qscaleNew = Clip3(qscaleMin, qscaleMax, qscaleNew); + m_rce2Pass[i].newQScale = qscaleNew; + adjusted = adjusted || (qscaleNew != qscaleOrig); + } + return adjusted; +} + +bool RateControl::cuTreeReadFor2Pass(Frame* frame) +{ + uint8_t sliceTypeActual = (uint8_t)m_rce2Pass[frame->m_poc].sliceType; + + if (m_rce2Pass[frame->m_poc].keptAsRef) + { + uint8_t type; + if 
(m_cuTreeStats.qpBufPos < 0) + { + do + { + m_cuTreeStats.qpBufPos++; + + if (!fread(&type, 1, 1, m_cutreeStatFileIn)) + goto fail; + if (fread(m_cuTreeStats.qpBuffer[m_cuTreeStats.qpBufPos], sizeof(uint16_t), m_ncu, m_cutreeStatFileIn) != (size_t)m_ncu) + goto fail; + + if (type != sliceTypeActual && m_cuTreeStats.qpBufPos == 1) + { + x265_log(m_param, X265_LOG_ERROR, "CU-tree frametype %d doesn't match actual frametype %d.\n", type, sliceTypeActual); + return false; + } + } + while(type != sliceTypeActual); + } + for (int i = 0; i < m_ncu; i++) + { + int16_t qpFix8 = m_cuTreeStats.qpBuffer[m_cuTreeStats.qpBufPos][i]; + frame->m_lowres.qpCuTreeOffset[i] = (double)(qpFix8) / 256.0; + frame->m_lowres.invQscaleFactor[i] = x265_exp2fix8(frame->m_lowres.qpCuTreeOffset[i]); + } + m_cuTreeStats.qpBufPos--; + } + else + calcAdaptiveQuantFrame(frame); + return true; + +fail: + x265_log(m_param, X265_LOG_ERROR, "Incomplete CU-tree stats file.\n"); + return false; +} + +double RateControl::rateEstimateQscale(Frame* curFrame, RateControlEntry *rce) +{ + double q; + + if (m_2pass) + { + if (m_sliceType != rce->sliceType) + { + x265_log(m_param, X265_LOG_ERROR, "slice=%c but 2pass stats say %c\n", + g_sliceTypeToChar[m_sliceType], g_sliceTypeToChar[rce->sliceType]); + } + } + else + { + if (m_isAbr) + { + double slidingWindowCplxSum = 0; + int start = m_sliderPos > s_slidingWindowFrames ? 
m_sliderPos : 0; + for (int cnt = 0; cnt < s_slidingWindowFrames; cnt++, start++) + { + int pos = start % s_slidingWindowFrames; + slidingWindowCplxSum *= 0.5; + if (!m_satdCostWindow[pos]) + break; + slidingWindowCplxSum += m_satdCostWindow[pos] / (CLIP_DURATION(m_frameDuration) / BASE_FRAME_DURATION); + } + rce->movingAvgSum = slidingWindowCplxSum; + m_satdCostWindow[m_sliderPos % s_slidingWindowFrames] = rce->lastSatd; + m_sliderPos++; + } + } + + if (m_sliceType == B_SLICE) + { + /* B-frames don't have independent rate control, but rather get the + * average QP of the two adjacent P-frames + an offset */ + Slice* prevRefSlice = m_curSlice->m_refPicList[0][0]->m_encData->m_slice; + Slice* nextRefSlice = m_curSlice->m_refPicList[1][0]->m_encData->m_slice; + double q0 = m_curSlice->m_refPicList[0][0]->m_encData->m_avgQpRc; + double q1 = m_curSlice->m_refPicList[1][0]->m_encData->m_avgQpRc; + bool i0 = prevRefSlice->m_sliceType == I_SLICE; + bool i1 = nextRefSlice->m_sliceType == I_SLICE; + int dt0 = abs(m_curSlice->m_poc - prevRefSlice->m_poc); + int dt1 = abs(m_curSlice->m_poc - nextRefSlice->m_poc); + + // Skip taking a reference frame before the Scenecut if ABR has been reset. 
+ if (m_lastAbrResetPoc >= 0) + { + if (prevRefSlice->m_sliceType == P_SLICE && prevRefSlice->m_poc < m_lastAbrResetPoc) + { + i0 = i1; + dt0 = dt1; + q0 = q1; + } + } + if (prevRefSlice->m_sliceType == B_SLICE && IS_REFERENCED(m_curSlice->m_refPicList[0][0])) + q0 -= m_pbOffset / 2; + if (nextRefSlice->m_sliceType == B_SLICE && IS_REFERENCED(m_curSlice->m_refPicList[1][0])) + q1 -= m_pbOffset / 2; + if (i0 && i1) + q = (q0 + q1) / 2 + m_ipOffset; + else if (i0) + q = q1; + else if (i1) + q = q0; + else + q = (q0 * dt1 + q1 * dt0) / (dt0 + dt1); + + if (IS_REFERENCED(curFrame)) + q += m_pbOffset / 2; + else + q += m_pbOffset; + rce->qpNoVbv = q; + double qScale = x265_qp2qScale(q); + + if (!m_2pass && m_isVbv) + { + if (m_leadingBframes > 5) + { + qScale = clipQscale(curFrame, rce, qScale); + m_lastQScaleFor[m_sliceType] = qScale; + } + rce->frameSizePlanned = predictSize(&m_predBfromP, qScale, (double)m_leadingNoBSatd); + } + else if (m_2pass && m_isVbv) + { + rce->frameSizePlanned = qScale2bits(rce, qScale); + } + /* Limit planned size by MinCR */ + if (m_isVbv) + rce->frameSizePlanned = X265_MIN(rce->frameSizePlanned, rce->frameSizeMaximum); + rce->frameSizeEstimated = rce->frameSizePlanned; + rce->newQScale = qScale; + return qScale; + } + else + { + double abrBuffer = 2 * m_param->rc.rateTolerance * m_bitrate; + if (m_2pass) + { + int64_t diff; + if (!m_isVbv) + { + m_predictedBits = m_totalBits; + if (rce->encodeOrder < m_param->frameNumThreads) + m_predictedBits += (int64_t)(rce->encodeOrder * m_bitrate / m_fps); + else + m_predictedBits += (int64_t)(m_param->frameNumThreads * m_bitrate / m_fps); + } + /* Adjust ABR buffer based on distance to the end of the video. 
*/ + if (m_numEntries > rce->encodeOrder) + { + uint64_t finalBits = m_rce2Pass[m_numEntries - 1].expectedBits; + double videoPos = (double)rce->expectedBits / finalBits; + double scaleFactor = sqrt((1 - videoPos) * m_numEntries); + abrBuffer *= 0.5 * X265_MAX(scaleFactor, 0.5); + } + diff = m_predictedBits - (int64_t)rce->expectedBits; + q = rce->newQScale; + q /= Clip3(0.5, 2.0, (double)(abrBuffer - diff) / abrBuffer); + if (m_expectedBitsSum > 0) + { + /* Adjust quant based on the difference between + * achieved and expected bitrate so far */ + double curTime = (double)rce->encodeOrder / m_numEntries; + double w = Clip3(0.0, 1.0, curTime * 100); + q *= pow((double)m_totalBits / m_expectedBitsSum, w); + } + rce->qpNoVbv = x265_qScale2qp(q); + if (m_isVbv) + { + /* Do not overflow vbv */ + double expectedSize = qScale2bits(rce, q); + double expectedVbv = m_bufferFill + m_bufferRate - expectedSize; + double expectedFullness = rce->expectedVbv / m_bufferSize; + double qmax = q * (2 - expectedFullness); + double sizeConstraint = 1 + expectedFullness; + qmax = X265_MAX(qmax, rce->newQScale); + if (expectedFullness < .05) + qmax = MAX_MAX_QPSCALE; + qmax = X265_MIN(qmax, MAX_MAX_QPSCALE); + while (((expectedVbv < rce->expectedVbv/sizeConstraint) && (q < qmax)) || + ((expectedVbv < 0) && (q < MAX_MAX_QPSCALE))) + { + q *= 1.05; + expectedSize = qScale2bits(rce, q); + expectedVbv = m_bufferFill + m_bufferRate - expectedSize; + } + } + q = Clip3(MIN_QPSCALE, MAX_MAX_QPSCALE, q); + } + else + { + /* 1pass ABR */ + + /* Calculate the quantizer which would have produced the desired + * average bitrate if it had been applied to all frames so far. + * Then modulate that quant based on the current frame's complexity + * relative to the average complexity so far (using the 2pass RCEQ). + * Then bias the quant up or down if total size so far was far from + * the target. 
+ * Result: Depending on the value of rate_tolerance, there is a + * tradeoff between quality and bitrate precision. But at large + * tolerances, the bit distribution approaches that of 2pass. */ + + double wantedBits, overflow = 1; + + m_shortTermCplxSum *= 0.5; + m_shortTermCplxCount *= 0.5; + m_shortTermCplxSum += m_currentSatd / (CLIP_DURATION(m_frameDuration) / BASE_FRAME_DURATION); + m_shortTermCplxCount++; + /* coeffBits to be used in 2-pass */ + rce->coeffBits = (int)m_currentSatd; + rce->blurredComplexity = m_shortTermCplxSum / m_shortTermCplxCount; + rce->mvBits = 0; + rce->sliceType = m_sliceType; + + if (m_param->rc.rateControlMode == X265_RC_CRF) + { + q = getQScale(rce, m_rateFactorConstant); + } + else + { + if (!m_param->rc.bStatRead) + checkAndResetABR(rce, false); + q = getQScale(rce, m_wantedBitsWindow / m_cplxrSum); + + /* ABR code can potentially be counterproductive in CBR, so just + * don't bother. Don't run it if the frame complexity is zero + * either. */ + if (!m_isCbr && m_currentSatd) + { + /* use framesDone instead of POC as poc count is not serial with bframes enabled */ + double timeDone = (double)(m_framesDone - m_param->frameNumThreads + 1) * m_frameDuration; + wantedBits = timeDone * m_bitrate; + if (wantedBits > 0 && m_totalBits > 0 && !m_partialResidualFrames) + { + abrBuffer *= X265_MAX(1, sqrt(timeDone)); + overflow = Clip3(.5, 2.0, 1.0 + (m_totalBits - wantedBits) / abrBuffer); + q *= overflow; + } + } + } + + if (m_sliceType == I_SLICE && m_param->keyframeMax > 1 + && m_lastNonBPictType != I_SLICE && !m_isAbrReset) + { + q = x265_qp2qScale(m_accumPQp / m_accumPNorm); + q /= fabs(m_param->rc.ipFactor); + } + else if (m_framesDone > 0) + { + if (m_param->rc.rateControlMode != X265_RC_CRF) + { + double lqmin = 0, lqmax = 0; + lqmin = m_lastQScaleFor[m_sliceType] / m_lstep; + lqmax = m_lastQScaleFor[m_sliceType] * m_lstep; + if (!m_partialResidualFrames) + { + if (overflow > 1.1 && m_framesDone > 3) + lqmax *= m_lstep; + else if 
(overflow < 0.9) + lqmin /= m_lstep; + } + q = Clip3(lqmin, lqmax, q); + } + } + else if (m_qCompress != 1 && m_param->rc.rateControlMode == X265_RC_CRF) + { + q = x265_qp2qScale(CRF_INIT_QP) / fabs(m_param->rc.ipFactor); + } + else if (m_framesDone == 0 && !m_isVbv) + { + /* for ABR alone, clip the first I frame qp */ + double lqmax = x265_qp2qScale(ABR_INIT_QP_MAX) * m_lstep; + q = X265_MIN(lqmax, q); + } + q = Clip3(MIN_QPSCALE, MAX_MAX_QPSCALE, q); + rce->qpNoVbv = x265_qScale2qp(q); + q = clipQscale(curFrame, rce, q); + } + m_lastQScaleFor[m_sliceType] = q; + if ((m_curSlice->m_poc == 0 || m_lastQScaleFor[P_SLICE] < q) && !(m_2pass && !m_isVbv)) + m_lastQScaleFor[P_SLICE] = q * fabs(m_param->rc.ipFactor); + + if (m_2pass && m_isVbv) + rce->frameSizePlanned = qScale2bits(rce, q); + else + rce->frameSizePlanned = predictSize(&m_pred[m_sliceType], q, (double)m_currentSatd); + + /* Always use up the whole VBV in this case. */ + if (m_singleFrameVbv) + rce->frameSizePlanned = m_bufferRate; + /* Limit planned size by MinCR */ + if (m_isVbv) + rce->frameSizePlanned = X265_MIN(rce->frameSizePlanned, rce->frameSizeMaximum); + rce->frameSizeEstimated = rce->frameSizePlanned; + rce->newQScale = q; + return q; + } +} + +void RateControl::rateControlUpdateStats(RateControlEntry* rce) +{ + if (!m_param->rc.bStatWrite && !m_param->rc.bStatRead) + { + if (rce->sliceType == I_SLICE) + { + /* previous I still had a residual; roll it into the new loan */ + if (m_partialResidualFrames) + rce->rowTotalBits += m_partialResidualCost * m_partialResidualFrames; + + m_partialResidualFrames = X265_MIN(s_amortizeFrames, m_param->keyframeMax); + m_partialResidualCost = (int)((rce->rowTotalBits * s_amortizeFraction) /m_partialResidualFrames); + rce->rowTotalBits -= m_partialResidualCost * m_partialResidualFrames; + } + else if (m_partialResidualFrames) + { + rce->rowTotalBits += m_partialResidualCost; + m_partialResidualFrames--; + } + } + if (rce->sliceType != B_SLICE) + rce->rowCplxrSum 
= rce->rowTotalBits * x265_qp2qScale(rce->qpaRc) / rce->qRceq; + else + rce->rowCplxrSum = rce->rowTotalBits * x265_qp2qScale(rce->qpaRc) / (rce->qRceq * fabs(m_param->rc.pbFactor)); + + m_cplxrSum += rce->rowCplxrSum; + m_totalBits += rce->rowTotalBits; + + /* do not allow the next frame to enter rateControlStart() until this + * frame has updated its mid-frame statistics */ + m_startEndOrder.incr(); + + if (rce->encodeOrder < m_param->frameNumThreads - 1) + m_startEndOrder.incr(); // faked rateControlEnd calls for negative frames +} + +void RateControl::checkAndResetABR(RateControlEntry* rce, bool isFrameDone) +{ + double abrBuffer = 2 * m_param->rc.rateTolerance * m_bitrate; + + // Check if current Slice is a scene cut that follows low detailed/blank frames + if (rce->lastSatd > 4 * rce->movingAvgSum) + { + if (!m_isAbrReset && rce->movingAvgSum > 0) + { + int64_t shrtTermWantedBits = (int64_t) (X265_MIN(m_sliderPos, s_slidingWindowFrames) * m_bitrate * m_frameDuration); + int64_t shrtTermTotalBitsSum = 0; + // Reset ABR if prev frames are blank to prevent further sudden overflows/ high bit rate spikes. + for (int i = 0; i < s_slidingWindowFrames ; i++) + shrtTermTotalBitsSum += m_encodedBitsWindow[i]; + double underflow = (shrtTermTotalBitsSum - shrtTermWantedBits) / abrBuffer; + const double epsilon = 0.0001f; + if (underflow < epsilon && !isFrameDone) + { + init(m_curSlice->m_sps); + m_shortTermCplxSum = rce->lastSatd / (CLIP_DURATION(m_frameDuration) / BASE_FRAME_DURATION); + m_shortTermCplxCount = 1; + m_isAbrReset = true; + m_lastAbrResetPoc = rce->poc; + } + } + else + { + // Clear flag to reset ABR and continue as usual. 
+ m_isAbrReset = false; + } + } +} + +void RateControl::hrdFullness(SEIBufferingPeriod *seiBP) +{ + const VUI* vui = &m_curSlice->m_sps->vuiParameters; + const HRDInfo* hrd = &vui->hrdParameters; + int num = 90000; + int denom = hrd->bitRateValue << (hrd->bitRateScale + BR_SHIFT); + reduceFraction(&num, &denom); + int64_t cpbState = (int64_t)m_bufferFillFinal; + int64_t cpbSize = (int64_t)hrd->cpbSizeValue << (hrd->cpbSizeScale + CPB_SHIFT); + + if (cpbState < 0 || cpbState > cpbSize) + { + x265_log(m_param, X265_LOG_WARNING, "CPB %s: %.0lf bits in a %.0lf-bit buffer\n", + cpbState < 0 ? "underflow" : "overflow", (float)cpbState/denom, (float)cpbSize/denom); + } + + seiBP->m_initialCpbRemovalDelay = (uint32_t)(num * cpbState + denom) / denom; + seiBP->m_initialCpbRemovalDelayOffset = (uint32_t)(num * cpbSize + denom) / denom - seiBP->m_initialCpbRemovalDelay; +} + +void RateControl::updateVbvPlan(Encoder* enc) +{ + m_bufferFill = m_bufferFillFinal; + enc->updateVbvPlan(this); +} + +double RateControl::predictSize(Predictor *p, double q, double var) +{ + return (p->coeff * var + p->offset) / (q * p->count); +} + +double RateControl::clipQscale(Frame* curFrame, RateControlEntry* rce, double q) +{ + // B-frames are not directly subject to VBV, + // since they are controlled by referenced P-frames' QPs. + double q0 = q; + if (m_isVbv && m_currentSatd > 0 && curFrame) + { + if (m_param->lookaheadDepth || m_param->rc.cuTree || + m_param->scenecutThreshold || + (m_param->bFrameAdaptive && m_param->bframes)) + { + /* Lookahead VBV: If lookahead is done, raise the quantizer as necessary + * such that no frames in the lookahead overflow and such that the buffer + * is in a reasonable state by the end of the lookahead. */ + int loopTerminate = 0; + /* Avoid an infinite loop. 
*/ + for (int iterations = 0; iterations < 1000 && loopTerminate != 3; iterations++) + { + double frameQ[3]; + double curBits; + if (m_sliceType == B_SLICE) + curBits = predictSize(&m_predBfromP, q, (double)m_currentSatd); + else + curBits = predictSize(&m_pred[m_sliceType], q, (double)m_currentSatd); + double bufferFillCur = m_bufferFill - curBits; + double targetFill; + double totalDuration = 0; + frameQ[P_SLICE] = m_sliceType == I_SLICE ? q * m_param->rc.ipFactor : (m_sliceType == B_SLICE ? q / m_param->rc.pbFactor : q); + frameQ[B_SLICE] = frameQ[P_SLICE] * m_param->rc.pbFactor; + frameQ[I_SLICE] = frameQ[P_SLICE] / m_param->rc.ipFactor; + /* Loop over the planned future frames. */ + for (int j = 0; bufferFillCur >= 0; j++) + { + int type = curFrame->m_lowres.plannedType[j]; + if (type == X265_TYPE_AUTO) + break; + totalDuration += m_frameDuration; + double wantedFrameSize = m_vbvMaxRate * m_frameDuration; + if (bufferFillCur + wantedFrameSize <= m_bufferSize) + bufferFillCur += wantedFrameSize; + int64_t satd = curFrame->m_lowres.plannedSatd[j] >> (X265_DEPTH - 8); + type = IS_X265_TYPE_I(type) ? I_SLICE : IS_X265_TYPE_B(type) ? B_SLICE : P_SLICE; + curBits = predictSize(&m_pred[type], frameQ[type], (double)satd); + bufferFillCur -= curBits; + } + + /* Try to get the buffer at least 50% filled, but don't set an impossible goal. */ + targetFill = X265_MIN(m_bufferFill + totalDuration * m_vbvMaxRate * 0.5, m_bufferSize * 0.5); + if (bufferFillCur < targetFill) + { + q *= 1.01; + loopTerminate |= 1; + continue; + } + /* Try to get the buffer no more than 80% filled, but don't set an impossible goal. */ + targetFill = Clip3(m_bufferSize * 0.8, m_bufferSize, m_bufferFill - totalDuration * m_vbvMaxRate * 0.5); + if (m_isCbr && bufferFillCur > targetFill) + { + q /= 1.01; + loopTerminate |= 2; + continue; + } + break; + } + q = X265_MAX(q0 / 2, q); + } + else + { + /* Fallback to old purely-reactive algorithm: no lookahead. 
*/ + if ((m_sliceType == P_SLICE || m_sliceType == B_SLICE || + (m_sliceType == I_SLICE && m_lastNonBPictType == I_SLICE)) && + m_bufferFill / m_bufferSize < 0.5) + { + q /= Clip3(0.5, 1.0, 2.0 * m_bufferFill / m_bufferSize); + } + // Now a hard threshold to make sure the frame fits in VBV. + // This one is mostly for I-frames. + double bits = predictSize(&m_pred[m_sliceType], q, (double)m_currentSatd); + + // For small VBVs, allow the frame to use up the entire VBV. + double maxFillFactor; + maxFillFactor = m_bufferSize >= 5 * m_bufferRate ? 2 : 1; + // For single-frame VBVs, request that the frame use up the entire VBV. + double minFillFactor = m_singleFrameVbv ? 1 : 2; + + for (int iterations = 0; iterations < 10; iterations++) + { + double qf = 1.0; + if (bits > m_bufferFill / maxFillFactor) + qf = Clip3(0.2, 1.0, m_bufferFill / (maxFillFactor * bits)); + q /= qf; + bits *= qf; + if (bits < m_bufferRate / minFillFactor) + q *= bits * minFillFactor / m_bufferRate; + bits = predictSize(&m_pred[m_sliceType], q, (double)m_currentSatd); + } + + q = X265_MAX(q0, q); + } + + /* Apply MinCR restrictions */ + double pbits = predictSize(&m_pred[m_sliceType], q, (double)m_currentSatd); + if (pbits > rce->frameSizeMaximum) + q *= pbits / rce->frameSizeMaximum; + + // Check B-frame complexity, and use up any bits that would + // overflow before the next P-frame. 
+ if (m_leadingBframes <= 5 && m_sliceType == P_SLICE && !m_singleFrameVbv) + { + int nb = m_leadingBframes; + double bits = predictSize(&m_pred[m_sliceType], q, (double)m_currentSatd); + double bbits = predictSize(&m_predBfromP, q * m_param->rc.pbFactor, (double)m_currentSatd); + double space; + if (bbits > m_bufferRate) + nb = 0; + double pbbits = nb * bbits; + + space = m_bufferFill + (1 + nb) * m_bufferRate - m_bufferSize; + if (pbbits < space) + q *= X265_MAX(pbbits / space, bits / (0.5 * m_bufferSize)); + + q = X265_MAX(q0 / 2, q); + } + + if (!m_isCbr || (m_isAbr && m_currentSatd >= rce->movingAvgSum && q <= q0 / 2)) + q = X265_MAX(q0, q); + + if (m_rateFactorMaxIncrement) + { + double qpNoVbv = x265_qScale2qp(q0); + double qmax = X265_MIN(MAX_MAX_QPSCALE,x265_qp2qScale(qpNoVbv + m_rateFactorMaxIncrement)); + return Clip3(MIN_QPSCALE, qmax, q); + } + } + if (m_2pass) + { + double min = log(MIN_QPSCALE); + double max = log(MAX_MAX_QPSCALE); + q = (log(q) - min) / (max - min) - 0.5; + q = 1.0 / (1.0 + exp(-4 * q)); + q = q*(max - min) + min; + return exp(q); + } + return Clip3(MIN_QPSCALE, MAX_MAX_QPSCALE, q); +} + +double RateControl::predictRowsSizeSum(Frame* curFrame, RateControlEntry* rce, double qpVbv, int32_t& encodedBitsSoFar) +{ + uint32_t rowSatdCostSoFar = 0, totalSatdBits = 0; + encodedBitsSoFar = 0; + + double qScale = x265_qp2qScale(qpVbv); + FrameData& curEncData = *curFrame->m_encData; + int picType = curEncData.m_slice->m_sliceType; + Frame* refFrame = curEncData.m_slice->m_refPicList[0][0]; + + uint32_t maxRows = curEncData.m_slice->m_sps->numCuInHeight; + uint32_t maxCols = curEncData.m_slice->m_sps->numCuInWidth; + + for (uint32_t row = 0; row < maxRows; row++) + { + encodedBitsSoFar += curEncData.m_rowStat[row].encodedBits; + rowSatdCostSoFar = curEncData.m_rowStat[row].diagSatd; + uint32_t satdCostForPendingCus = curEncData.m_rowStat[row].satdForVbv - rowSatdCostSoFar; + satdCostForPendingCus >>= X265_DEPTH - 8; + if (satdCostForPendingCus 
> 0) + { + double pred_s = predictSize(rce->rowPred[0], qScale, satdCostForPendingCus); + uint32_t refRowSatdCost = 0, refRowBits = 0, intraCost = 0; + double refQScale = 0; + + if (picType != I_SLICE) + { + FrameData& refEncData = *refFrame->m_encData; + uint32_t endCuAddr = maxCols * (row + 1); + for (uint32_t cuAddr = curEncData.m_rowStat[row].numEncodedCUs + 1; cuAddr < endCuAddr; cuAddr++) + { + refRowSatdCost += refEncData.m_cuStat[cuAddr].vbvCost; + refRowBits += refEncData.m_cuStat[cuAddr].totalBits; + intraCost += curEncData.m_cuStat[cuAddr].intraVbvCost; + } + + refRowSatdCost >>= X265_DEPTH - 8; + refQScale = refEncData.m_rowStat[row].diagQpScale; + } + + if (picType == I_SLICE || qScale >= refQScale) + { + if (picType == P_SLICE + && !refFrame + && refFrame->m_encData->m_slice->m_sliceType == picType + && refQScale > 0 + && refRowSatdCost > 0) + { + if (abs(int32_t(refRowSatdCost - satdCostForPendingCus)) < (int32_t)satdCostForPendingCus / 2) + { + double predTotal = refRowBits * satdCostForPendingCus / refRowSatdCost * refQScale / qScale; + totalSatdBits += int32_t((pred_s + predTotal) * 0.5); + continue; + } + } + totalSatdBits += int32_t(pred_s); + } + else + { + /* Our QP is lower than the reference! */ + double pred_intra = predictSize(rce->rowPred[1], qScale, intraCost); + /* Sum: better to overestimate than underestimate by using only one of the two predictors. 
*/ + totalSatdBits += int32_t(pred_intra + pred_s); + } + } + } + + return totalSatdBits + encodedBitsSoFar; +} + +int RateControl::rowDiagonalVbvRateControl(Frame* curFrame, uint32_t row, RateControlEntry* rce, double& qpVbv) +{ + FrameData& curEncData = *curFrame->m_encData; + double qScaleVbv = x265_qp2qScale(qpVbv); + uint64_t rowSatdCost = curEncData.m_rowStat[row].diagSatd; + double encodedBits = curEncData.m_rowStat[row].encodedBits; + + if (row == 1) + { + rowSatdCost += curEncData.m_rowStat[0].diagSatd; + encodedBits += curEncData.m_rowStat[0].encodedBits; + } + rowSatdCost >>= X265_DEPTH - 8; + updatePredictor(rce->rowPred[0], qScaleVbv, (double)rowSatdCost, encodedBits); + if (curEncData.m_slice->m_sliceType == P_SLICE) + { + Frame* refFrame = curEncData.m_slice->m_refPicList[0][0]; + if (qpVbv < refFrame->m_encData->m_rowStat[row].diagQp) + { + uint64_t intraRowSatdCost = curEncData.m_rowStat[row].diagIntraSatd; + if (row == 1) + intraRowSatdCost += curEncData.m_rowStat[0].diagIntraSatd; + + updatePredictor(rce->rowPred[1], qScaleVbv, (double)intraRowSatdCost, encodedBits); + } + } + + int canReencodeRow = 1; + /* tweak quality based on difference from predicted size */ + double prevRowQp = qpVbv; + double qpAbsoluteMax = QP_MAX_MAX; + double qpAbsoluteMin = QP_MIN; + if (m_rateFactorMaxIncrement) + qpAbsoluteMax = X265_MIN(qpAbsoluteMax, rce->qpNoVbv + m_rateFactorMaxIncrement); + + if (m_rateFactorMaxDecrement) + qpAbsoluteMin = X265_MAX(qpAbsoluteMin, rce->qpNoVbv - m_rateFactorMaxDecrement); + + double qpMax = X265_MIN(prevRowQp + m_param->rc.qpStep, qpAbsoluteMax); + double qpMin = X265_MAX(prevRowQp - m_param->rc.qpStep, qpAbsoluteMin); + double stepSize = 0.5; + double bufferLeftPlanned = rce->bufferFill - rce->frameSizePlanned; + + const SPS& sps = *curEncData.m_slice->m_sps; + double maxFrameError = X265_MAX(0.05, 1.0 / sps.numCuInHeight); + + if (row < sps.numCuInHeight - 1) + { + /* B-frames shouldn't use lower QP than their reference frames. 
*/ + if (rce->sliceType == B_SLICE) + { + Frame* refSlice1 = curEncData.m_slice->m_refPicList[0][0]; + Frame* refSlice2 = curEncData.m_slice->m_refPicList[1][0]; + qpMin = X265_MAX(qpMin, X265_MAX(refSlice1->m_encData->m_rowStat[row].diagQp, refSlice2->m_encData->m_rowStat[row].diagQp)); + qpVbv = X265_MAX(qpVbv, qpMin); + } + /* More threads means we have to be more cautious in letting ratecontrol use up extra bits. */ + double rcTol = bufferLeftPlanned / m_param->frameNumThreads * m_param->rc.rateTolerance; + int32_t encodedBitsSoFar = 0; + double accFrameBits = predictRowsSizeSum(curFrame, rce, qpVbv, encodedBitsSoFar); + + /* * Don't increase the row QPs until a sufficent amount of the bits of + * the frame have been processed, in case a flat area at the top of the + * frame was measured inaccurately. */ + if (encodedBitsSoFar < 0.05f * rce->frameSizePlanned) + qpMax = qpAbsoluteMax = prevRowQp; + + if (rce->sliceType != I_SLICE) + rcTol *= 0.5; + + if (!m_isCbr) + qpMin = X265_MAX(qpMin, rce->qpNoVbv); + + while (qpVbv < qpMax + && ((accFrameBits > rce->frameSizePlanned + rcTol) || + (rce->bufferFill - accFrameBits < bufferLeftPlanned * 0.5) || + (accFrameBits > rce->frameSizePlanned && qpVbv < rce->qpNoVbv))) + { + qpVbv += stepSize; + accFrameBits = predictRowsSizeSum(curFrame, rce, qpVbv, encodedBitsSoFar); + } + + while (qpVbv > qpMin + && (qpVbv > curEncData.m_rowStat[0].diagQp || m_singleFrameVbv) + && ((accFrameBits < rce->frameSizePlanned * 0.8f && qpVbv <= prevRowQp) + || accFrameBits < (rce->bufferFill - m_bufferSize + m_bufferRate) * 1.1)) + { + qpVbv -= stepSize; + accFrameBits = predictRowsSizeSum(curFrame, rce, qpVbv, encodedBitsSoFar); + } + + /* avoid VBV underflow or MinCr violation */ + while ((qpVbv < qpAbsoluteMax) + && ((rce->bufferFill - accFrameBits < m_bufferRate * maxFrameError) || + (rce->frameSizeMaximum - accFrameBits < rce->frameSizeMaximum * maxFrameError))) + { + qpVbv += stepSize; + accFrameBits = predictRowsSizeSum(curFrame, 
rce, qpVbv, encodedBitsSoFar); + } + + rce->frameSizeEstimated = accFrameBits; + + /* If the current row was large enough to cause a large QP jump, try re-encoding it. */ + if (qpVbv > qpMax && prevRowQp < qpMax && canReencodeRow) + { + /* Bump QP to halfway in between... close enough. */ + qpVbv = Clip3(prevRowQp + 1.0f, qpMax, (prevRowQp + qpVbv) * 0.5); + return -1; + } + + if (m_param->rc.rfConstantMin) + { + if (qpVbv < qpMin && prevRowQp > qpMin && canReencodeRow) + { + qpVbv = Clip3(qpMin, prevRowQp, (prevRowQp + qpVbv) * 0.5); + return -1; + } + } + } + else + { + int32_t encodedBitsSoFar = 0; + rce->frameSizeEstimated = predictRowsSizeSum(curFrame, rce, qpVbv, encodedBitsSoFar); + + /* Last-ditch attempt: if the last row of the frame underflowed the VBV, + * try again. */ + if ((rce->frameSizeEstimated > (rce->bufferFill - m_bufferRate * maxFrameError) && + qpVbv < qpMax && canReencodeRow)) + { + qpVbv = qpMax; + return -1; + } + } + return 0; +} + +/* modify the bitrate curve from pass1 for one frame */ +double RateControl::getQScale(RateControlEntry *rce, double rateFactor) +{ + double q; + + if (m_param->rc.cuTree) + { + // Scale and units are obtained from rateNum and rateDenom for videos with fixed frame rates. 
+ double timescale = (double)m_param->fpsDenom / (2 * m_param->fpsNum); + q = pow(BASE_FRAME_DURATION / CLIP_DURATION(2 * timescale), 1 - m_param->rc.qCompress); + } + else + q = pow(rce->blurredComplexity, 1 - m_param->rc.qCompress); + // avoid NaN's in the Rceq + if (rce->coeffBits + rce->mvBits == 0) + q = m_lastQScaleFor[rce->sliceType]; + else + { + m_lastRceq = q; + q /= rateFactor; + } + return q; +} + +void RateControl::updatePredictor(Predictor *p, double q, double var, double bits) +{ + if (var < 10) + return; + const double range = 1.5; + double old_coeff = p->coeff / p->count; + double new_coeff = bits * q / var; + double new_coeff_clipped = Clip3(old_coeff / range, old_coeff * range, new_coeff); + double new_offset = bits * q - new_coeff_clipped * var; + if (new_offset >= 0) + new_coeff = new_coeff_clipped; + else + new_offset = 0; + p->count *= p->decay; + p->coeff *= p->decay; + p->offset *= p->decay; + p->count++; + p->coeff += new_coeff; + p->offset += new_offset; +} + +void RateControl::updateVbv(int64_t bits, RateControlEntry* rce) +{ + if (rce->lastSatd >= m_ncu) + updatePredictor(&m_pred[rce->sliceType], x265_qp2qScale(rce->qpaRc), (double)rce->lastSatd, (double)bits); + if (!m_isVbv) + return; + + m_bufferFillFinal -= bits; + + if (m_bufferFillFinal < 0) + x265_log(m_param, X265_LOG_WARNING, "poc:%d, VBV underflow (%.0f bits)\n", rce->poc, m_bufferFillFinal); + + m_bufferFillFinal = X265_MAX(m_bufferFillFinal, 0); + m_bufferFillFinal += m_bufferRate; + m_bufferFillFinal = X265_MIN(m_bufferFillFinal, m_bufferSize); +} + +/* After encoding one frame, update rate control state */ +int RateControl::rateControlEnd(Frame* curFrame, int64_t bits, RateControlEntry* rce, FrameStats* stats) +{ + int orderValue = m_startEndOrder.get(); + int endOrdinal = (rce->encodeOrder + m_param->frameNumThreads) * 2 - 1; + while (orderValue < endOrdinal && !m_bTerminated) + { + /* no more frames are being encoded, so fake the start event if we would + * have blocked 
on it. Note that this does not enforce rateControlEnd() + * ordering during flush, but this has no impact on the outputs */ + if (m_finalFrameCount && orderValue >= 2 * m_finalFrameCount) + break; + orderValue = m_startEndOrder.waitForChange(orderValue); + } + + FrameData& curEncData = *curFrame->m_encData; + int64_t actualBits = bits; + Slice *slice = curEncData.m_slice; + if (m_isAbr) + { + if (m_param->rc.rateControlMode == X265_RC_ABR && !m_param->rc.bStatRead) + checkAndResetABR(rce, true); + + if (m_param->rc.rateControlMode == X265_RC_CRF) + { + if (int(curEncData.m_avgQpRc + 0.5) == slice->m_sliceQp) + curEncData.m_rateFactor = m_rateFactorConstant; + else + { + /* If vbv changed the frame QP recalculate the rate-factor */ + double baseCplx = m_ncu * (m_param->bframes ? 120 : 80); + double mbtree_offset = m_param->rc.cuTree ? (1.0 - m_param->rc.qCompress) * 13.5 : 0; + curEncData.m_rateFactor = pow(baseCplx, 1 - m_qCompress) / + x265_qp2qScale(int(curEncData.m_avgQpRc + 0.5) + mbtree_offset); + } + } + } + + if (m_param->rc.aqMode || m_isVbv) + { + if (m_isVbv) + { + for (uint32_t i = 0; i < slice->m_sps->numCuInHeight; i++) + curEncData.m_avgQpRc += curEncData.m_rowStat[i].sumQpRc; + + curEncData.m_avgQpRc /= slice->m_sps->numCUsInFrame; + rce->qpaRc = curEncData.m_avgQpRc; + + // copy avg RC qp to m_avgQpAq. To print out the correct qp when aq/cutree is disabled. + curEncData.m_avgQpAq = curEncData.m_avgQpRc; + } + + if (m_param->rc.aqMode) + { + for (uint32_t i = 0; i < slice->m_sps->numCuInHeight; i++) + curEncData.m_avgQpAq += curEncData.m_rowStat[i].sumQpAq; + + curEncData.m_avgQpAq /= slice->m_sps->numCUsInFrame; + } + } + + // Write frame stats into the stats file if 2 pass is enabled. + if (m_param->rc.bStatWrite) + { + char cType = rce->sliceType == I_SLICE ? (rce->poc > 0 && m_param->bOpenGOP ? 'i' : 'I') + : rce->sliceType == P_SLICE ? 'P' + : IS_REFERENCED(curFrame) ? 
'B' : 'b'; + if (fprintf(m_statFileOut, + "in:%d out:%d type:%c q:%.2f q-aq:%.2f tex:%d mv:%d misc:%d icu:%.2f pcu:%.2f scu:%.2f ;\n", + rce->poc, rce->encodeOrder, + cType, curEncData.m_avgQpRc, curEncData.m_avgQpAq, + stats->coeffBits, + stats->mvBits, + stats->miscBits, + stats->percentIntra * m_ncu, + stats->percentInter * m_ncu, + stats->percentSkip * m_ncu) < 0) + goto writeFailure; + /* Don't re-write the data in multi-pass mode. */ + if (m_param->rc.cuTree && IS_REFERENCED(curFrame) && !m_param->rc.bStatRead) + { + uint8_t sliceType = (uint8_t)rce->sliceType; + for (int i = 0; i < m_ncu; i++) + m_cuTreeStats.qpBuffer[0][i] = (uint16_t)(curFrame->m_lowres.qpCuTreeOffset[i] * 256.0); + if (fwrite(&sliceType, 1, 1, m_cutreeStatFileOut) < 1) + goto writeFailure; + if (fwrite(m_cuTreeStats.qpBuffer[0], sizeof(uint16_t), m_ncu, m_cutreeStatFileOut) < (size_t)m_ncu) + goto writeFailure; + } + } + if (m_isAbr && !m_isAbrReset) + { + /* amortize part of each I slice over the next several frames, up to + * keyint-max, to avoid over-compensating for the large I slice cost */ + if (!m_param->rc.bStatWrite && !m_param->rc.bStatRead) + { + if (rce->sliceType == I_SLICE) + { + /* previous I still had a residual; roll it into the new loan */ + if (m_residualFrames) + bits += m_residualCost * m_residualFrames; + m_residualFrames = X265_MIN(s_amortizeFrames, m_param->keyframeMax); + m_residualCost = (int)((bits * s_amortizeFraction) / m_residualFrames); + bits -= m_residualCost * m_residualFrames; + } + else if (m_residualFrames) + { + bits += m_residualCost; + m_residualFrames--; + } + } + if (rce->sliceType != B_SLICE) + { + /* The factor 1.5 is to tune up the actual bits, otherwise the cplxrSum is scaled too low + * to improve short term compensation for next frame. */ + m_cplxrSum += (bits * x265_qp2qScale(rce->qpaRc) / rce->qRceq) - (rce->rowCplxrSum); + } + else + { + /* Depends on the fact that B-frame's QP is an offset from the following P-frame's. 
+ * Not perfectly accurate with B-refs, but good enough. */ + m_cplxrSum += (bits * x265_qp2qScale(rce->qpaRc) / (rce->qRceq * fabs(m_param->rc.pbFactor))) - (rce->rowCplxrSum); + } + m_wantedBitsWindow += m_frameDuration * m_bitrate; + m_totalBits += bits - rce->rowTotalBits; + int pos = m_sliderPos - m_param->frameNumThreads; + if (pos >= 0) + m_encodedBitsWindow[pos % s_slidingWindowFrames] = actualBits; + } + + if (m_2pass) + { + m_expectedBitsSum += qScale2bits(rce, x265_qp2qScale(rce->newQp)); + m_totalBits += bits - rce->rowTotalBits; + } + + if (m_isVbv) + { + if (rce->sliceType == B_SLICE) + { + m_bframeBits += actualBits; + if (rce->bLastMiniGopBFrame) + { + if (rce->bframes != 0) + updatePredictor(&m_predBfromP, x265_qp2qScale(rce->qpaRc), (double)rce->leadingNoBSatd, (double)m_bframeBits / rce->bframes); + m_bframeBits = 0; + } + } + updateVbv(actualBits, rce); + + if (m_param->bEmitHRDSEI) + { + const VUI *vui = &curEncData.m_slice->m_sps->vuiParameters; + const HRDInfo *hrd = &vui->hrdParameters; + const TimingInfo *time = &vui->timingInfo; + if (!curFrame->m_poc) + { + // first access unit initializes the HRD + rce->hrdTiming->cpbInitialAT = 0; + rce->hrdTiming->cpbRemovalTime = m_nominalRemovalTime = (double)m_bufPeriodSEI.m_initialCpbRemovalDelay / 90000; + } + else + { + rce->hrdTiming->cpbRemovalTime = m_nominalRemovalTime + (double)rce->picTimingSEI->m_auCpbRemovalDelay * time->numUnitsInTick / time->timeScale; + double cpbEarliestAT = rce->hrdTiming->cpbRemovalTime - (double)m_bufPeriodSEI.m_initialCpbRemovalDelay / 90000; + if (!curFrame->m_lowres.bKeyframe) + cpbEarliestAT -= (double)m_bufPeriodSEI.m_initialCpbRemovalDelayOffset / 90000; + + rce->hrdTiming->cpbInitialAT = hrd->cbrFlag ? 
m_prevCpbFinalAT : X265_MAX(m_prevCpbFinalAT, cpbEarliestAT); + } + + uint32_t cpbsizeUnscale = hrd->cpbSizeValue << (hrd->cpbSizeScale + CPB_SHIFT); + rce->hrdTiming->cpbFinalAT = m_prevCpbFinalAT = rce->hrdTiming->cpbInitialAT + actualBits / cpbsizeUnscale; + rce->hrdTiming->dpbOutputTime = (double)rce->picTimingSEI->m_picDpbOutputDelay * time->numUnitsInTick / time->timeScale + rce->hrdTiming->cpbRemovalTime; + } + } + // Allow rateControlStart of next frame only when rateControlEnd of previous frame is over + m_startEndOrder.incr(); + rce->isActive = false; + return 0; + +writeFailure: + x265_log(m_param, X265_LOG_ERROR, "RatecontrolEnd: stats file write failure\n"); + return 1; +} + +#if defined(_MSC_VER) +#pragma warning(disable: 4996) // POSIX function names are just fine, thank you +#endif + +/* called when the encoder is flushing, and thus the final frame count is + * unambiguously known */ +void RateControl::setFinalFrameCount(int count) +{ + m_finalFrameCount = count; + /* unblock waiting threads */ + m_startEndOrder.set(m_startEndOrder.get()); +} + +/* called when the encoder is closing, and no more frames will be output. 
+ * all blocked functions must finish so the frame encoder threads can be + * closed */ +void RateControl::terminate() +{ + m_bTerminated = true; + /* unblock waiting threads */ + m_startEndOrder.set(m_startEndOrder.get()); +} + +void RateControl::destroy() +{ + const char *fileName = m_param->rc.statFileName; + if (!fileName) + fileName = s_defaultStatFileName; + + if (m_statFileOut) + { + fclose(m_statFileOut); + char *tmpFileName = strcatFilename(fileName, ".temp"); + int bError = 1; + if (tmpFileName) + { + unlink(fileName); + bError = rename(tmpFileName, fileName); + } + if (bError) + { + x265_log(m_param, X265_LOG_ERROR, "failed to rename output stats file to \"%s\"\n", + fileName); + } + X265_FREE(tmpFileName); + } + + if (m_cutreeStatFileOut) + { + fclose(m_cutreeStatFileOut); + char *tmpFileName = strcatFilename(fileName, ".cutree.temp"); + char *newFileName = strcatFilename(fileName, ".cutree"); + int bError = 1; + if (tmpFileName && newFileName) + { + unlink(newFileName); + bError = rename(tmpFileName, newFileName); + } + if (bError) + { + x265_log(m_param, X265_LOG_ERROR, "failed to rename cutree output stats file to \"%s\"\n", + newFileName); + } + X265_FREE(tmpFileName); + X265_FREE(newFileName); + } + + if (m_cutreeStatFileIn) + fclose(m_cutreeStatFileIn); + + X265_FREE(m_rce2Pass); + for (int i = 0; i < 2; i++) + X265_FREE(m_cuTreeStats.qpBuffer[i]); +} + diff --git a/source/encoder/ratecontrol.h b/source/encoder/ratecontrol.h new file mode 100644 index 0000000..5b86147 --- /dev/null +++ b/source/encoder/ratecontrol.h @@ -0,0 +1,269 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Sumalatha Polureddy + * Aarthi Priya Thirumalai + * Xun Xu, PPLive Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the 
License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#ifndef X265_RATECONTROL_H +#define X265_RATECONTROL_H + +#include "common.h" +#include "sei.h" + +namespace x265 { +// encoder namespace + +class Encoder; +class Frame; +struct SPS; +class SEIBufferingPeriod; +#define BASE_FRAME_DURATION 0.04 + +/* Arbitrary limitations as a sanity check. 
*/ +#define MAX_FRAME_DURATION 1.00 +#define MIN_FRAME_DURATION 0.01 + +#define CLIP_DURATION(f) Clip3(MIN_FRAME_DURATION, MAX_FRAME_DURATION, f) + +/* Current frame stats for 2 pass */ +struct FrameStats +{ + int mvBits; /* MV bits (MV+Ref+Block Type) */ + int coeffBits; /* Texture bits (DCT coefs) */ + int miscBits; + + int iCuCnt; + int pCuCnt; + int skipCuCnt; + + /* CU type counts stored as percentage */ + double percentIntra; + double percentInter; + double percentSkip; +}; + +struct Predictor +{ + double coeff; + double count; + double decay; + double offset; +}; + +struct HRDTiming +{ + double cpbInitialAT; + double cpbFinalAT; + double dpbOutputTime; + double cpbRemovalTime; +}; + +struct RateControlEntry +{ + int64_t lastSatd; /* Contains the picture cost of the previous frame, required for resetAbr and VBV */ + int sliceType; + int bframes; + int poc; + int encodeOrder; + int64_t leadingNoBSatd; + bool bLastMiniGopBFrame; + double blurredComplexity; + double qpaRc; + double qpAq; + double qRceq; + double frameSizePlanned; /* frame Size decided by RateCotrol before encoding the frame */ + double bufferRate; + double movingAvgSum; + double rowCplxrSum; + int64_t rowTotalBits; /* update cplxrsum and totalbits at the end of 2 rows */ + double qpNoVbv; + double bufferFill; + double frameDuration; + double clippedDuration; + Predictor rowPreds[3][2]; + Predictor* rowPred[2]; + double frameSizeEstimated; /* hold frameSize, updated from cu level vbv rc */ + double frameSizeMaximum; /* max frame Size according to minCR restrictions and level of the video */ + bool isActive; + SEIPictureTiming *picTimingSEI; + HRDTiming *hrdTiming; + /* Required in 2-pass rate control */ + double iCuCount; + double pCuCount; + double skipCuCount; + bool keptAsRef; + double expectedVbv; + double qScale; + double newQScale; + double newQp; + int mvBits; + int miscBits; + int coeffBits; + uint64_t expectedBits; /* total expected bits up to the current frame (current one excluded) */ 
+}; + +class RateControl +{ +public: + + x265_param* m_param; + Slice* m_curSlice; /* all info about the current frame */ + SliceType m_sliceType; /* Current frame type */ + int m_ncu; /* number of CUs in a frame */ + int m_qp; /* updated qp for current frame */ + + bool m_isAbr; + bool m_isVbv; + bool m_isCbr; + bool m_singleFrameVbv; + + bool m_isAbrReset; + int m_lastAbrResetPoc; + + double m_frameDuration; /* current frame duration in seconds */ + double m_bitrate; + double m_rateFactorConstant; + double m_bufferSize; + double m_bufferFillFinal; /* real buffer as of the last finished frame */ + double m_bufferFill; /* planned buffer, if all in-progress frames hit their bit budget */ + double m_bufferRate; /* # of bits added to buffer_fill after each frame */ + double m_vbvMaxRate; /* in kbps */ + double m_rateFactorMaxIncrement; /* Don't allow RF above (CRF + this value). */ + double m_rateFactorMaxDecrement; /* don't allow RF below (this value). */ + + Predictor m_pred[5]; + Predictor m_predBfromP; + + int m_leadingBframes; + int64_t m_bframeBits; + int64_t m_currentSatd; + int m_qpConstant[3]; + double m_ipOffset; + double m_pbOffset; + + int m_lastNonBPictType; + int64_t m_leadingNoBSatd; + + double m_cplxrSum; /* sum of bits*qscale/rceq */ + double m_wantedBitsWindow; /* target bitrate * window */ + double m_accumPQp; /* for determining I-frame quant */ + double m_accumPNorm; + double m_lastQScaleFor[3]; /* last qscale for a specific pict type, used for max_diff & ipb factor stuff */ + double m_lstep; + double m_shortTermCplxSum; + double m_shortTermCplxCount; + double m_lastRceq; + double m_qCompress; + int64_t m_totalBits; /* total bits used for already encoded frames */ + int m_framesDone; /* # of frames passed through RateCotrol already */ + double m_fps; + int64_t m_satdCostWindow[50]; + int m_sliderPos; + int64_t m_encodedBitsWindow[50]; + /* a common variable on which rateControlStart, rateControlEnd and rateControUpdateStats waits to + * sync the 
calls to these functions. For example + * -F2: + * rceStart 10 + * rceUpdate 10 + * rceEnd 9 + * rceStart 11 + * rceUpdate 11 + * rceEnd 10 + * rceStart 12 + * rceUpdate 12 + * rceEnd 11 */ + ThreadSafeInteger m_startEndOrder; + int m_finalFrameCount; /* set when encoder begins flushing */ + bool m_bTerminated; /* set true when encoder is closing */ + + /* hrd stuff */ + SEIBufferingPeriod m_bufPeriodSEI; + double m_nominalRemovalTime; + double m_prevCpbFinalAT; + + /* 2 pass */ + bool m_2pass; + FILE* m_statFileOut; + FILE* m_cutreeStatFileOut; + FILE* m_cutreeStatFileIn; + int m_numEntries; + RateControlEntry *m_rce2Pass; + double m_lastAccumPNorm; + int64_t m_predictedBits; + double m_expectedBitsSum; /* sum of qscale2bits after rceq, ratefactor, and overflow, only includes finished frames */ + struct + { + uint16_t *qpBuffer[2]; /* Global buffers for converting MB-tree quantizer data. */ + int qpBufPos; /* In order to handle pyramid reordering, QP buffer acts as a stack. + * This value is the current position (0 or 1). 
*/ + } m_cuTreeStats; + + RateControl(x265_param *p); + void setFinalFrameCount(int count); + void terminate(); /* un-block all waiting functions so encoder may close */ + void destroy(); + + // to be called for each curFrame to process RateControl and set QP + int rateControlStart(Frame* curFrame, RateControlEntry* rce, Encoder* enc); + void calcAdaptiveQuantFrame(Frame *curFrame); + void rateControlUpdateStats(RateControlEntry* rce); + int rateControlEnd(Frame* curFrame, int64_t bits, RateControlEntry* rce, FrameStats* stats); + int rowDiagonalVbvRateControl(Frame* curFrame, uint32_t row, RateControlEntry* rce, double& qpVbv); + void hrdFullness(SEIBufferingPeriod* sei); + bool init(const SPS* sps); + void initHRD(SPS* sps); + int rateControlSliceType(int frameNum); + bool cuTreeReadFor2Pass(Frame* curFrame); + +protected: + + static const double s_amortizeFraction; + static const int s_amortizeFrames; + static const int s_slidingWindowFrames; + static const char *s_defaultStatFileName; + + int m_residualFrames; + int m_partialResidualFrames; + int m_residualCost; + int m_partialResidualCost; + + double getQScale(RateControlEntry *rce, double rateFactor); + double rateEstimateQscale(Frame* pic, RateControlEntry *rce); // main logic for calculating QP based on ABR + void accumPQpUpdate(); + uint32_t acEnergyCu(Frame* pic, uint32_t block_x, uint32_t block_y); + + void updateVbv(int64_t bits, RateControlEntry* rce); + void updatePredictor(Predictor *p, double q, double var, double bits); + double clipQscale(Frame* pic, RateControlEntry* rce, double q); + void updateVbvPlan(Encoder* enc); + double predictSize(Predictor *p, double q, double var); + void checkAndResetABR(RateControlEntry* rce, bool isFrameDone); + double predictRowsSizeSum(Frame* pic, RateControlEntry* rce, double qpm, int32_t& encodedBits); + bool initPass2(); + double getDiffLimitedQScale(RateControlEntry *rce, double q); + double countExpectedBits(); + bool vbv2Pass(uint64_t allAvailableBits); + 
bool findUnderflow(double *fills, int *t0, int *t1, int over); + bool fixUnderflow(int t0, int t1, double adjustment, double qscaleMin, double qscaleMax); +}; +} +#endif // ifndef X265_RATECONTROL_H diff --git a/source/encoder/rdcost.h b/source/encoder/rdcost.h new file mode 100644 index 0000000..10bfff3 --- /dev/null +++ b/source/encoder/rdcost.h @@ -0,0 +1,125 @@ +/***************************************************************************** +* Copyright (C) 2013 x265 project +* +* Authors: Steve Borho +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation; either version 2 of the License, or +* (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software +* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +* +* This program is also available under a commercial proprietary license. +* For more information, contact us at license @ x265.com. 
+*****************************************************************************/ + +#ifndef X265_RDCOST_H +#define X265_RDCOST_H + +#include "common.h" +#include "slice.h" + +namespace x265 { +// private namespace + +class RDCost +{ +public: + + /* all weights and factors stored as FIX8 */ + uint64_t m_lambda2; + uint64_t m_lambda; + uint64_t m_cbDistortionWeight; + uint64_t m_crDistortionWeight; + uint32_t m_psyRd; + int m_qp; + + void setPsyRdScale(double scale) { m_psyRd = (uint32_t)floor(256.0 * scale * 0.33); } + void setCbDistortionWeight(uint16_t weightFix8) { m_cbDistortionWeight = weightFix8; } + void setCrDistortionWeight(uint16_t weightFix8) { m_crDistortionWeight = weightFix8; } + + void setQP(const Slice& slice, int qp) + { + m_qp = qp; + + setLambda(x265_lambda2_tab[qp], x265_lambda_tab[qp]); + + int qpCb = Clip3(QP_MIN, QP_MAX_MAX, qp + slice.m_pps->chromaCbQpOffset); + int chroma_offset_idx = X265_MIN(qp - qpCb + 12, MAX_CHROMA_LAMBDA_OFFSET); + uint16_t lambdaOffset = m_psyRd ? x265_chroma_lambda2_offset_tab[chroma_offset_idx] : 256; + setCbDistortionWeight(lambdaOffset); + + int qpCr = Clip3(QP_MIN, QP_MAX_MAX, qp + slice.m_pps->chromaCrQpOffset); + chroma_offset_idx = X265_MIN(qp - qpCr + 12, MAX_CHROMA_LAMBDA_OFFSET); + lambdaOffset = m_psyRd ? 
x265_chroma_lambda2_offset_tab[chroma_offset_idx] : 256; + setCrDistortionWeight(lambdaOffset); + } + + void setLambda(double lambda2, double lambda) + { + m_lambda2 = (uint64_t)floor(256.0 * lambda2); + m_lambda = (uint64_t)floor(256.0 * lambda); + } + + inline uint64_t calcRdCost(uint32_t distortion, uint32_t bits) const + { + X265_CHECK(bits <= (UINT64_MAX - 128) / m_lambda2, + "calcRdCost wrap detected dist: %d, bits %d, lambda: %d\n", distortion, bits, (int)m_lambda2); + return distortion + ((bits * m_lambda2 + 128) >> 8); + } + + /* return the difference in energy between the source block and the recon block */ + inline int psyCost(int size, pixel *source, intptr_t sstride, pixel *recon, intptr_t rstride) const + { + return primitives.psy_cost_pp[size](source, sstride, recon, rstride); + } + + /* return the difference in energy between the source block and the recon block */ + inline int psyCost(int size, int16_t *source, intptr_t sstride, int16_t *recon, intptr_t rstride) const + { + return primitives.psy_cost_ss[size](source, sstride, recon, rstride); + } + + /* return the RD cost of this prediction, including the effect of psy-rd */ + inline uint64_t calcPsyRdCost(uint32_t distortion, uint32_t bits, uint32_t psycost) const + { + return distortion + ((m_lambda * m_psyRd * psycost) >> 16) + ((bits * m_lambda2) >> 8); + } + + inline uint64_t calcRdSADCost(uint32_t sadCost, uint32_t bits) const + { + X265_CHECK(bits <= (UINT64_MAX - 128) / m_lambda, + "calcRdSADCost wrap detected dist: %d, bits %d, lambda: "X265_LL"\n", sadCost, bits, m_lambda); + return sadCost + ((bits * m_lambda + 128) >> 8); + } + + inline uint32_t scaleChromaDistCb(uint32_t dist) const + { + X265_CHECK(dist <= (UINT64_MAX - 128) / m_cbDistortionWeight, + "scaleChromaDistCb wrap detected dist: %d, lambda: "X265_LL"\n", dist, m_cbDistortionWeight); + return (uint32_t)(((dist * m_cbDistortionWeight) + 128) >> 8); + } + + inline uint32_t scaleChromaDistCr(uint32_t dist) const + { + 
X265_CHECK(dist <= (UINT64_MAX - 128) / m_crDistortionWeight, + "scaleChromaDistCr wrap detected dist: %d, lambda: "X265_LL"\n", dist, m_crDistortionWeight); + return (uint32_t)(((dist * m_crDistortionWeight) + 128) >> 8); + } + + inline uint32_t getCost(uint32_t bits) const + { + return (uint32_t)((bits * m_lambda + 128) >> 8); + } +}; +} + +#endif // ifndef X265_TCOMRDCOST_H diff --git a/source/encoder/reference.cpp b/source/encoder/reference.cpp new file mode 100644 index 0000000..958042b --- /dev/null +++ b/source/encoder/reference.cpp @@ -0,0 +1,118 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * Deepthi Devaki + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. 
+ *****************************************************************************/ + +#include "common.h" +#include "primitives.h" +#include "slice.h" +#include "picyuv.h" + +#include "reference.h" + +using namespace x265; + +MotionReference::MotionReference() +{ + m_weightBuffer = NULL; +} + +int MotionReference::init(PicYuv* recPic, WeightParam *w) +{ + m_reconPic = recPic; + lumaStride = recPic->m_stride; + intptr_t startpad = recPic->m_lumaMarginY * lumaStride + recPic->m_lumaMarginX; + + /* directly reference the pre-extended integer pel plane */ + fpelPlane = recPic->m_picBuf[0] + startpad; + isWeighted = false; + + if (w) + { + if (!m_weightBuffer) + { + uint32_t numCUinHeight = (recPic->m_picHeight + g_maxCUSize - 1) / g_maxCUSize; + size_t padheight = (numCUinHeight * g_maxCUSize) + recPic->m_lumaMarginY * 2; + m_weightBuffer = X265_MALLOC(pixel, lumaStride * padheight); + if (!m_weightBuffer) + return -1; + } + + isWeighted = true; + weight = w->inputWeight; + offset = w->inputOffset * (1 << (X265_DEPTH - 8)); + shift = w->log2WeightDenom; + round = shift ? 1 << (shift - 1) : 0; + m_numWeightedRows = 0; + + /* use our buffer which will have weighted pixels written to it */ + fpelPlane = m_weightBuffer + startpad; + } + + return 0; +} + +MotionReference::~MotionReference() +{ + X265_FREE(m_weightBuffer); +} + +void MotionReference::applyWeight(int rows, int numRows) +{ + rows = X265_MIN(rows, numRows); + if (m_numWeightedRows >= rows) + return; + int marginX = m_reconPic->m_lumaMarginX; + int marginY = m_reconPic->m_lumaMarginY; + pixel* src = (pixel*)m_reconPic->m_picOrg[0] + (m_numWeightedRows * (int)g_maxCUSize * lumaStride); + pixel* dst = fpelPlane + ((m_numWeightedRows * (int)g_maxCUSize) * lumaStride); + int width = m_reconPic->m_picWidth; + int height = ((rows - m_numWeightedRows) * g_maxCUSize); + if (rows == numRows) + height = ((m_reconPic->m_picHeight % g_maxCUSize) ? 
(m_reconPic->m_picHeight % g_maxCUSize) : g_maxCUSize); + + // Computing weighted CU rows + int correction = IF_INTERNAL_PREC - X265_DEPTH; // intermediate interpolation depth + int padwidth = (width + 15) & ~15; // weightp assembly needs even 16 byte widths + primitives.weight_pp(src, dst, lumaStride, padwidth, height, + weight, round << correction, shift + correction, offset); + + // Extending Left & Right + primitives.extendRowBorder(dst, lumaStride, width, height, marginX); + + // Extending Above + if (m_numWeightedRows == 0) + { + pixel *pixY = fpelPlane - marginX; + for (int y = 0; y < marginY; y++) + memcpy(pixY - (y + 1) * lumaStride, pixY, lumaStride * sizeof(pixel)); + } + + // Extending Bottom + if (rows == numRows) + { + pixel *pixY = fpelPlane - marginX + (m_reconPic->m_picHeight - 1) * lumaStride; + for (int y = 0; y < marginY; y++) + memcpy(pixY + (y + 1) * lumaStride, pixY, lumaStride * sizeof(pixel)); + } + m_numWeightedRows = rows; +} diff --git a/source/encoder/reference.h b/source/encoder/reference.h new file mode 100644 index 0000000..3fb9afd --- /dev/null +++ b/source/encoder/reference.h @@ -0,0 +1,56 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#ifndef X265_REFERENCE_H +#define X265_REFERENCE_H + +#include "primitives.h" +#include "lowres.h" +#include "mv.h" + +namespace x265 { +// private x265 namespace + +class PicYuv; +struct WeightParam; + +class MotionReference : public ReferencePlanes +{ +public: + + MotionReference(); + ~MotionReference(); + int init(PicYuv*, WeightParam* w = NULL); + void applyWeight(int rows, int numRows); + + PicYuv* m_reconPic; + pixel* m_weightBuffer; + int m_numWeightedRows; + +protected: + + MotionReference& operator =(const MotionReference&); +}; +} + +#endif // ifndef X265_REFERENCE_H diff --git a/source/encoder/sao.cpp b/source/encoder/sao.cpp new file mode 100644 index 0000000..1179fe0 --- /dev/null +++ b/source/encoder/sao.cpp @@ -0,0 +1,1498 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * Min Chen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#include "common.h" +#include "frame.h" +#include "framedata.h" +#include "picyuv.h" +#include "sao.h" + +namespace { + +inline int32_t roundIBDI(int32_t num, int32_t den) +{ + return num >= 0 ? ((num * 2 + den) / (den * 2)) : -((-num * 2 + den) / (den * 2)); +} + +/* get the sign of input variable (TODO: this is a dup, make common) */ +inline int signOf(int x) +{ + return (x >> 31) | ((int)((((uint32_t)-x)) >> 31)); +} + +inline int64_t estSaoDist(int32_t count, int offset, int32_t offsetOrg) +{ + return (count * offset - offsetOrg * 2) * offset; +} + +} // end anonymous namespace + + +namespace x265 { + +const uint32_t SAO::s_eoTable[NUM_EDGETYPE] = +{ + 1, // 0 + 2, // 1 + 0, // 2 + 3, // 3 + 4 // 4 +}; + +SAO::SAO() +{ + m_count = NULL; + m_offset = NULL; + m_offsetOrg = NULL; + m_countPreDblk = NULL; + m_offsetOrgPreDblk = NULL; + m_refDepth = 0; + m_lumaLambda = 0; + m_chromaLambda = 0; + m_param = NULL; + m_clipTable = NULL; + m_clipTableBase = NULL; + m_offsetBo = NULL; + m_tmpU1[0] = NULL; + m_tmpU1[1] = NULL; + m_tmpU1[2] = NULL; + m_tmpU2[0] = NULL; + m_tmpU2[1] = NULL; + m_tmpU2[2] = NULL; + m_tmpL1 = NULL; + m_tmpL2 = NULL; + + m_depthSaoRate[0][0] = 0; + m_depthSaoRate[0][1] = 0; + m_depthSaoRate[0][2] = 0; + m_depthSaoRate[0][3] = 0; + m_depthSaoRate[1][0] = 0; + m_depthSaoRate[1][1] = 0; + m_depthSaoRate[1][2] = 0; + m_depthSaoRate[1][3] = 0; +} + +bool SAO::create(x265_param* param) +{ + m_param = param; + m_hChromaShift = CHROMA_H_SHIFT(param->internalCsp); + m_vChromaShift = CHROMA_V_SHIFT(param->internalCsp); + + 
m_numCuInWidth = (m_param->sourceWidth + g_maxCUSize - 1) / g_maxCUSize; + m_numCuInHeight = (m_param->sourceHeight + g_maxCUSize - 1) / g_maxCUSize; + + const pixel maxY = (1 << X265_DEPTH) - 1; + const pixel rangeExt = maxY >> 1; + int numCtu = m_numCuInWidth * m_numCuInHeight; + + CHECKED_MALLOC(m_clipTableBase, pixel, maxY + 2 * rangeExt); + CHECKED_MALLOC(m_offsetBo, pixel, maxY + 2 * rangeExt); + + CHECKED_MALLOC(m_tmpL1, pixel, g_maxCUSize + 1); + CHECKED_MALLOC(m_tmpL2, pixel, g_maxCUSize + 1); + + for (int i = 0; i < 3; i++) + { + CHECKED_MALLOC(m_tmpU1[i], pixel, m_param->sourceWidth); + CHECKED_MALLOC(m_tmpU2[i], pixel, m_param->sourceWidth); + } + + CHECKED_MALLOC(m_count, PerClass, NUM_PLANE); + CHECKED_MALLOC(m_offset, PerClass, NUM_PLANE); + CHECKED_MALLOC(m_offsetOrg, PerClass, NUM_PLANE); + + CHECKED_MALLOC(m_countPreDblk, PerPlane, numCtu); + CHECKED_MALLOC(m_offsetOrgPreDblk, PerPlane, numCtu); + + m_clipTable = &(m_clipTableBase[rangeExt]); + + for (int i = 0; i < rangeExt; i++) + m_clipTableBase[i] = 0; + + for (int i = 0; i < maxY; i++) + m_clipTable[i] = (pixel)i; + + for (int i = maxY; i < maxY + rangeExt; i++) + m_clipTable[i] = maxY; + + return true; + +fail: + return false; +} + +void SAO::destroy() +{ + X265_FREE(m_clipTableBase); + X265_FREE(m_offsetBo); + + X265_FREE(m_tmpL1); + X265_FREE(m_tmpL2); + + for (int i = 0; i < 3; i++) + { + X265_FREE(m_tmpU1[i]); + X265_FREE(m_tmpU2[i]); + } + + X265_FREE(m_count); + X265_FREE(m_offset); + X265_FREE(m_offsetOrg); + X265_FREE(m_countPreDblk); + X265_FREE(m_offsetOrgPreDblk); +} + +/* allocate memory for SAO parameters */ +void SAO::allocSaoParam(SAOParam* saoParam) const +{ + saoParam->numCuInWidth = m_numCuInWidth; + + saoParam->ctuParam[0] = new SaoCtuParam[m_numCuInHeight * m_numCuInWidth]; + saoParam->ctuParam[1] = new SaoCtuParam[m_numCuInHeight * m_numCuInWidth]; + saoParam->ctuParam[2] = new SaoCtuParam[m_numCuInHeight * m_numCuInWidth]; +} + +void SAO::startSlice(Frame* frame, 
Entropy& initState, int qp) +{ + Slice* slice = frame->m_encData->m_slice; + + int qpCb = Clip3(0, QP_MAX_MAX, qp + slice->m_pps->chromaCbQpOffset); + m_lumaLambda = x265_lambda2_tab[qp]; + m_chromaLambda = x265_lambda2_tab[qpCb]; // Use Cb QP for SAO chroma + m_frame = frame; + + switch (slice->m_sliceType) + { + case I_SLICE: + m_refDepth = 0; + break; + case P_SLICE: + m_refDepth = 1; + break; + case B_SLICE: + m_refDepth = 2 + !IS_REFERENCED(frame); + break; + } + + resetStats(); + + m_entropyCoder.load(initState); + m_rdContexts.next.load(initState); + m_rdContexts.cur.load(initState); + + SAOParam* saoParam = frame->m_encData->m_saoParam; + if (!saoParam) + { + saoParam = new SAOParam; + allocSaoParam(saoParam); + frame->m_encData->m_saoParam = saoParam; + } + + rdoSaoUnitRowInit(saoParam); + + // NOTE: Disable SAO automatic turn-off when frame parallelism is + // enabled for output exact independent of frame thread count + if (m_param->frameNumThreads > 1) + { + saoParam->bSaoFlag[0] = true; + saoParam->bSaoFlag[1] = true; + } +} + +// CTU-based SAO process without slice granularity +void SAO::processSaoCu(int addr, int typeIdx, int plane) +{ + int x, y; + const CUData* cu = m_frame->m_encData->getPicCTU(addr); + pixel* rec = m_frame->m_reconPicYuv->getPlaneAddr(plane, addr); + intptr_t stride = plane ? 
m_frame->m_reconPicYuv->m_strideC : m_frame->m_reconPicYuv->m_stride; + uint32_t picWidth = m_param->sourceWidth; + uint32_t picHeight = m_param->sourceHeight; + int ctuWidth = g_maxCUSize; + int ctuHeight = g_maxCUSize; + uint32_t lpelx = cu->m_cuPelX; + uint32_t tpely = cu->m_cuPelY; + if (plane) + { + picWidth >>= m_hChromaShift; + picHeight >>= m_vChromaShift; + ctuWidth >>= m_hChromaShift; + ctuHeight >>= m_vChromaShift; + lpelx >>= m_hChromaShift; + tpely >>= m_vChromaShift; + } + uint32_t rpelx = x265_min(lpelx + ctuWidth, picWidth); + uint32_t bpely = x265_min(tpely + ctuHeight, picHeight); + ctuWidth = rpelx - lpelx; + ctuHeight = bpely - tpely; + + int startX; + int startY; + int endX; + int endY; + pixel* tmpL; + pixel* tmpU; + + int32_t _upBuff1[MAX_CU_SIZE + 2], *upBuff1 = _upBuff1 + 1; + int32_t _upBufft[MAX_CU_SIZE + 2], *upBufft = _upBufft + 1; + + { + const pixel* recR = &rec[ctuWidth - 1]; + for (int i = 0; i < ctuHeight + 1; i++) + { + m_tmpL2[i] = *recR; + recR += stride; + } + + tmpL = m_tmpL1; + tmpU = &(m_tmpU1[plane][lpelx]); + } + + switch (typeIdx) + { + case SAO_EO_0: // dir: - + { + pixel firstPxl = 0, lastPxl = 0; + startX = !lpelx; + endX = (rpelx == picWidth) ? 
ctuWidth - 1 : ctuWidth; + if (ctuWidth & 15) + { + for (y = 0; y < ctuHeight; y++) + { + int signLeft = signOf(rec[startX] - tmpL[y]); + for (x = startX; x < endX; x++) + { + int signRight = signOf(rec[x] - rec[x + 1]); + int edgeType = signRight + signLeft + 2; + signLeft = -signRight; + + rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]]; + } + + rec += stride; + } + } + else + { + for (y = 0; y < ctuHeight; y++) + { + int signLeft = signOf(rec[startX] - tmpL[y]); + + if (!lpelx) + firstPxl = rec[0]; + + if (rpelx == picWidth) + lastPxl = rec[ctuWidth - 1]; + + primitives.saoCuOrgE0(rec, m_offsetEo, ctuWidth, (int8_t)signLeft); + + if (!lpelx) + rec[0] = firstPxl; + + if (rpelx == picWidth) + rec[ctuWidth - 1] = lastPxl; + + rec += stride; + } + } + break; + } + case SAO_EO_1: // dir: | + { + startY = !tpely; + endY = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight; + if (!tpely) + rec += stride; + + for (x = 0; x < ctuWidth; x++) + upBuff1[x] = signOf(rec[x] - tmpU[x]); + + for (y = startY; y < endY; y++) + { + for (x = 0; x < ctuWidth; x++) + { + int signDown = signOf(rec[x] - rec[x + stride]); + int edgeType = signDown + upBuff1[x] + 2; + upBuff1[x] = -signDown; + + rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]]; + } + + rec += stride; + } + + break; + } + case SAO_EO_2: // dir: 135 + { + startX = !lpelx; + endX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth; + + startY = !tpely; + endY = (bpely == picHeight) ? 
ctuHeight - 1 : ctuHeight; + + if (!tpely) + rec += stride; + + for (x = startX; x < endX; x++) + upBuff1[x] = signOf(rec[x] - tmpU[x - 1]); + + for (y = startY; y < endY; y++) + { + upBufft[startX] = signOf(rec[stride + startX] - tmpL[y]); + for (x = startX; x < endX; x++) + { + int signDown = signOf(rec[x] - rec[x + stride + 1]); + int edgeType = signDown + upBuff1[x] + 2; + upBufft[x + 1] = -signDown; + rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]]; + } + + std::swap(upBuff1, upBufft); + + rec += stride; + } + + break; + } + case SAO_EO_3: // dir: 45 + { + startX = !lpelx; + endX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth; + + startY = !tpely; + endY = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight; + + if (!tpely) + rec += stride; + + for (x = startX - 1; x < endX; x++) + upBuff1[x] = signOf(rec[x] - tmpU[x + 1]); + + for (y = startY; y < endY; y++) + { + x = startX; + int signDown = signOf(rec[x] - tmpL[y + 1]); + int edgeType = signDown + upBuff1[x] + 2; + upBuff1[x - 1] = -signDown; + rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]]; + for (x = startX + 1; x < endX; x++) + { + signDown = signOf(rec[x] - rec[x + stride - 1]); + edgeType = signDown + upBuff1[x] + 2; + upBuff1[x - 1] = -signDown; + rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]]; + } + + upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]); + + rec += stride; + } + + break; + } + case SAO_BO: + { + const pixel* offsetBo = m_offsetBo; + + for (y = 0; y < ctuHeight; y++) + { + for (x = 0; x < ctuWidth; x++) + rec[x] = offsetBo[rec[x]]; + + rec += stride; + } + + break; + } + default: break; + } + +// if (iSaoType!=SAO_BO_0 || iSaoType!=SAO_BO_1) + std::swap(m_tmpL1, m_tmpL2); +} + +/* Process SAO all units */ +void SAO::processSaoUnitRow(SaoCtuParam* ctuParam, int idxY, int plane) +{ + intptr_t stride = plane ? 
m_frame->m_reconPicYuv->m_strideC : m_frame->m_reconPicYuv->m_stride; + uint32_t picWidth = m_param->sourceWidth; + int ctuWidth = g_maxCUSize; + int ctuHeight = g_maxCUSize; + if (plane) + { + picWidth >>= m_hChromaShift; + ctuWidth >>= m_hChromaShift; + ctuHeight >>= m_vChromaShift; + } + + if (!idxY) + { + pixel* rec = m_frame->m_reconPicYuv->m_picOrg[plane]; + memcpy(m_tmpU1[plane], rec, sizeof(pixel) * picWidth); + } + + int addr = idxY * m_numCuInWidth; + pixel* rec = plane ? m_frame->m_reconPicYuv->getChromaAddr(plane, addr) : m_frame->m_reconPicYuv->getLumaAddr(addr); + + for (int i = 0; i < ctuHeight + 1; i++) + { + m_tmpL1[i] = rec[0]; + rec += stride; + } + + rec -= (stride << 1); + + memcpy(m_tmpU2[plane], rec, sizeof(pixel) * picWidth); + + const int boShift = X265_DEPTH - SAO_BO_BITS; + + for (int idxX = 0; idxX < m_numCuInWidth; idxX++) + { + addr = idxY * m_numCuInWidth + idxX; + + bool mergeLeftFlag = ctuParam[addr].mergeMode == SAO_MERGE_LEFT; + int typeIdx = ctuParam[addr].typeIdx; + + if (typeIdx >= 0) + { + if (!mergeLeftFlag) + { + if (typeIdx == SAO_BO) + { + pixel* offsetBo = m_offsetBo; + int offset[SAO_NUM_BO_CLASSES]; + memset(offset, 0, sizeof(offset)); + + for (int i = 0; i < SAO_NUM_OFFSET; i++) + offset[((ctuParam[addr].bandPos + i) & (SAO_NUM_BO_CLASSES - 1))] = ctuParam[addr].offset[i] << SAO_BIT_INC; + + for (int i = 0; i < (1 << X265_DEPTH); i++) + offsetBo[i] = m_clipTable[i + offset[i >> boShift]]; + } + else // if (typeIdx == SAO_EO_0 || typeIdx == SAO_EO_1 || typeIdx == SAO_EO_2 || typeIdx == SAO_EO_3) + { + int offset[NUM_EDGETYPE]; + offset[0] = 0; + for (int i = 0; i < SAO_NUM_OFFSET; i++) + offset[i + 1] = ctuParam[addr].offset[i] << SAO_BIT_INC; + + for (int edgeType = 0; edgeType < NUM_EDGETYPE; edgeType++) + m_offsetEo[edgeType] = (int8_t)offset[s_eoTable[edgeType]]; + } + } + processSaoCu(addr, typeIdx, plane); + } + else if (idxX != (m_numCuInWidth - 1)) + { + rec = plane ? 
m_frame->m_reconPicYuv->getChromaAddr(plane, addr) : m_frame->m_reconPicYuv->getLumaAddr(addr); + + for (int i = 0; i < ctuHeight + 1; i++) + { + m_tmpL1[i] = rec[ctuWidth - 1]; + rec += stride; + } + } + } + + std::swap(m_tmpU1[plane], m_tmpU2[plane]); +} + +void SAO::resetSaoUnit(SaoCtuParam* saoUnit) +{ + saoUnit->mergeMode = SAO_MERGE_NONE; + saoUnit->typeIdx = -1; + saoUnit->bandPos = 0; + + for (int i = 0; i < SAO_NUM_OFFSET; i++) + saoUnit->offset[i] = 0; +} + +void SAO::copySaoUnit(SaoCtuParam* saoUnitDst, const SaoCtuParam* saoUnitSrc) +{ + saoUnitDst->mergeMode = saoUnitSrc->mergeMode; + saoUnitDst->typeIdx = saoUnitSrc->typeIdx; + saoUnitDst->bandPos = saoUnitSrc->bandPos; + + for (int i = 0; i < SAO_NUM_OFFSET; i++) + saoUnitDst->offset[i] = saoUnitSrc->offset[i]; +} + +/* Calculate SAO statistics for current CTU without non-crossing slice */ +void SAO::calcSaoStatsCu(int addr, int plane) +{ + int x, y; + CUData* cu = m_frame->m_encData->getPicCTU(addr); + const pixel* fenc0 = m_frame->m_origPicYuv->getPlaneAddr(plane, addr); + const pixel* rec0 = m_frame->m_reconPicYuv->getPlaneAddr(plane, addr); + const pixel* fenc; + const pixel* rec; + intptr_t stride = plane ? m_frame->m_reconPicYuv->m_strideC : m_frame->m_reconPicYuv->m_stride; + uint32_t picWidth = m_param->sourceWidth; + uint32_t picHeight = m_param->sourceHeight; + int ctuWidth = g_maxCUSize; + int ctuHeight = g_maxCUSize; + uint32_t lpelx = cu->m_cuPelX; + uint32_t tpely = cu->m_cuPelY; + if (plane) + { + picWidth >>= m_hChromaShift; + picHeight >>= m_vChromaShift; + ctuWidth >>= m_hChromaShift; + ctuHeight >>= m_vChromaShift; + lpelx >>= m_hChromaShift; + tpely >>= m_vChromaShift; + } + uint32_t rpelx = x265_min(lpelx + ctuWidth, picWidth); + uint32_t bpely = x265_min(tpely + ctuHeight, picHeight); + ctuWidth = rpelx - lpelx; + ctuHeight = bpely - tpely; + + int startX; + int startY; + int endX; + int endY; + int32_t* stats; + int32_t* count; + + int skipB = plane ? 
2 : 4; + int skipR = plane ? 3 : 5; + + int32_t _upBuff1[MAX_CU_SIZE + 2], *upBuff1 = _upBuff1 + 1; + int32_t _upBufft[MAX_CU_SIZE + 2], *upBufft = _upBufft + 1; + + // SAO_BO: + { + const int boShift = X265_DEPTH - SAO_BO_BITS; + + if (m_param->bSaoNonDeblocked) + { + skipB = plane ? 1 : 3; + skipR = plane ? 2 : 4; + } + stats = m_offsetOrg[plane][SAO_BO]; + count = m_count[plane][SAO_BO]; + + fenc = fenc0; + rec = rec0; + + endX = (rpelx == picWidth) ? ctuWidth : ctuWidth - skipR; + endY = (bpely == picHeight) ? ctuHeight : ctuHeight - skipB; + + for (y = 0; y < endY; y++) + { + for (x = 0; x < endX; x++) + { + int classIdx = 1 + (rec[x] >> boShift); + stats[classIdx] += (fenc[x] - rec[x]); + count[classIdx]++; + } + + fenc += stride; + rec += stride; + } + } + + { + // SAO_EO_0: // dir: - + { + if (m_param->bSaoNonDeblocked) + { + skipB = plane ? 1 : 3; + skipR = plane ? 3 : 5; + } + stats = m_offsetOrg[plane][SAO_EO_0]; + count = m_count[plane][SAO_EO_0]; + + fenc = fenc0; + rec = rec0; + + startX = !lpelx; + endX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR; + for (y = 0; y < ctuHeight - skipB; y++) + { + int signLeft = signOf(rec[startX] - rec[startX - 1]); + for (x = startX; x < endX; x++) + { + int signRight = signOf(rec[x] - rec[x + 1]); + int edgeType = signRight + signLeft + 2; + signLeft = -signRight; + + stats[s_eoTable[edgeType]] += (fenc[x] - rec[x]); + count[s_eoTable[edgeType]]++; + } + + fenc += stride; + rec += stride; + } + } + + // SAO_EO_1: // dir: | + { + if (m_param->bSaoNonDeblocked) + { + skipB = plane ? 2 : 4; + skipR = plane ? 2 : 4; + } + stats = m_offsetOrg[plane][SAO_EO_1]; + count = m_count[plane][SAO_EO_1]; + + fenc = fenc0; + rec = rec0; + + startY = !tpely; + endX = (rpelx == picWidth) ? ctuWidth : ctuWidth - skipR; + endY = (bpely == picHeight) ? 
ctuHeight - 1 : ctuHeight - skipB; + if (!tpely) + { + fenc += stride; + rec += stride; + } + + for (x = 0; x < ctuWidth; x++) + upBuff1[x] = signOf(rec[x] - rec[x - stride]); + + for (y = startY; y < endY; y++) + { + for (x = 0; x < endX; x++) + { + int signDown = signOf(rec[x] - rec[x + stride]); + int edgeType = signDown + upBuff1[x] + 2; + upBuff1[x] = -signDown; + + stats[s_eoTable[edgeType]] += (fenc[x] - rec[x]); + count[s_eoTable[edgeType]]++; + } + + fenc += stride; + rec += stride; + } + } + + // SAO_EO_2: // dir: 135 + { + if (m_param->bSaoNonDeblocked) + { + skipB = plane ? 2 : 4; + skipR = plane ? 3 : 5; + } + stats = m_offsetOrg[plane][SAO_EO_2]; + count = m_count[plane][SAO_EO_2]; + + fenc = fenc0; + rec = rec0; + + startX = !lpelx; + endX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR; + + startY = !tpely; + endY = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB; + if (!tpely) + { + fenc += stride; + rec += stride; + } + + for (x = startX; x < endX; x++) + upBuff1[x] = signOf(rec[x] - rec[x - stride - 1]); + + for (y = startY; y < endY; y++) + { + upBufft[startX] = signOf(rec[startX + stride] - rec[startX - 1]); + for (x = startX; x < endX; x++) + { + int signDown = signOf(rec[x] - rec[x + stride + 1]); + int edgeType = signDown + upBuff1[x] + 2; + upBufft[x + 1] = -signDown; + stats[s_eoTable[edgeType]] += (fenc[x] - rec[x]); + count[s_eoTable[edgeType]]++; + } + + std::swap(upBuff1, upBufft); + + rec += stride; + fenc += stride; + } + } + + // SAO_EO_3: // dir: 45 + { + if (m_param->bSaoNonDeblocked) + { + skipB = plane ? 2 : 4; + skipR = plane ? 3 : 5; + } + stats = m_offsetOrg[plane][SAO_EO_3]; + count = m_count[plane][SAO_EO_3]; + + fenc = fenc0; + rec = rec0; + + startX = !lpelx; + endX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR; + + startY = !tpely; + endY = (bpely == picHeight) ? 
ctuHeight - 1 : ctuHeight - skipB; + + if (!tpely) + { + fenc += stride; + rec += stride; + } + + for (x = startX - 1; x < endX; x++) + upBuff1[x] = signOf(rec[x] - rec[x - stride + 1]); + + for (y = startY; y < endY; y++) + { + for (x = startX; x < endX; x++) + { + int signDown = signOf(rec[x] - rec[x + stride - 1]); + int edgeType = signDown + upBuff1[x] + 2; + upBuff1[x - 1] = -signDown; + stats[s_eoTable[edgeType]] += (fenc[x] - rec[x]); + count[s_eoTable[edgeType]]++; + } + + upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]); + + rec += stride; + fenc += stride; + } + } + } +} + +void SAO::calcSaoStatsCu_BeforeDblk(Frame* frame, int idxX, int idxY) +{ + int addr = idxX + m_numCuInWidth * idxY; + + int x, y; + CUData* cu = frame->m_encData->getPicCTU(addr); + const pixel* fenc; + const pixel* rec; + intptr_t stride = m_frame->m_reconPicYuv->m_stride; + uint32_t picWidth = m_param->sourceWidth; + uint32_t picHeight = m_param->sourceHeight; + int ctuWidth = g_maxCUSize; + int ctuHeight = g_maxCUSize; + uint32_t lpelx = cu->m_cuPelX; + uint32_t tpely = cu->m_cuPelY; + uint32_t rpelx = x265_min(lpelx + ctuWidth, picWidth); + uint32_t bpely = x265_min(tpely + ctuHeight, picHeight); + ctuWidth = rpelx - lpelx; + ctuHeight = bpely - tpely; + + int startX; + int startY; + int endX; + int endY; + int firstX, firstY; + int32_t* stats; + int32_t* count; + + int skipB, skipR; + + int32_t _upBuff1[MAX_CU_SIZE + 2], *upBuff1 = _upBuff1 + 1; + int32_t _upBufft[MAX_CU_SIZE + 2], *upBufft = _upBufft + 1; + + const int boShift = X265_DEPTH - SAO_BO_BITS; + + memset(m_countPreDblk[addr], 0, sizeof(PerPlane)); + memset(m_offsetOrgPreDblk[addr], 0, sizeof(PerPlane)); + + for (int plane = 0; plane < NUM_PLANE; plane++) + { + if (plane == 1) + { + stride = frame->m_reconPicYuv->m_strideC; + picWidth >>= m_hChromaShift; + picHeight >>= m_vChromaShift; + ctuWidth >>= m_hChromaShift; + ctuHeight >>= m_vChromaShift; + lpelx >>= m_hChromaShift; + tpely >>= m_vChromaShift; + 
rpelx >>= m_hChromaShift; + bpely >>= m_vChromaShift; + } + + // SAO_BO: + + skipB = plane ? 1 : 3; + skipR = plane ? 2 : 4; + + stats = m_offsetOrgPreDblk[addr][plane][SAO_BO]; + count = m_countPreDblk[addr][plane][SAO_BO]; + + const pixel* fenc0 = m_frame->m_origPicYuv->getPlaneAddr(plane, addr); + const pixel* rec0 = m_frame->m_reconPicYuv->getPlaneAddr(plane, addr); + fenc = fenc0; + rec = rec0; + + startX = (rpelx == picWidth) ? ctuWidth : ctuWidth - skipR; + startY = (bpely == picHeight) ? ctuHeight : ctuHeight - skipB; + + for (y = 0; y < ctuHeight; y++) + { + for (x = (y < startY ? startX : 0); x < ctuWidth; x++) + { + int classIdx = 1 + (rec[x] >> boShift); + stats[classIdx] += (fenc[x] - rec[x]); + count[classIdx]++; + } + + fenc += stride; + rec += stride; + } + + // SAO_EO_0: // dir: - + { + skipB = plane ? 1 : 3; + skipR = plane ? 3 : 5; + + stats = m_offsetOrgPreDblk[addr][plane][SAO_EO_0]; + count = m_countPreDblk[addr][plane][SAO_EO_0]; + + fenc = fenc0; + rec = rec0; + + startX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR; + startY = (bpely == picHeight) ? ctuHeight : ctuHeight - skipB; + firstX = !lpelx; + // endX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth; + endX = ctuWidth - 1; // not refer right CTU + + for (y = 0; y < ctuHeight; y++) + { + x = (y < startY ? startX : firstX); + int signLeft = signOf(rec[x] - rec[x - 1]); + for (; x < endX; x++) + { + int signRight = signOf(rec[x] - rec[x + 1]); + int edgeType = signRight + signLeft + 2; + signLeft = -signRight; + + stats[s_eoTable[edgeType]] += (fenc[x] - rec[x]); + count[s_eoTable[edgeType]]++; + } + + fenc += stride; + rec += stride; + } + } + + // SAO_EO_1: // dir: | + { + skipB = plane ? 2 : 4; + skipR = plane ? 2 : 4; + + stats = m_offsetOrgPreDblk[addr][plane][SAO_EO_1]; + count = m_countPreDblk[addr][plane][SAO_EO_1]; + + fenc = fenc0; + rec = rec0; + + startX = (rpelx == picWidth) ? ctuWidth : ctuWidth - skipR; + startY = (bpely == picHeight) ? 
ctuHeight - 1 : ctuHeight - skipB; + firstY = !tpely; + // endY = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight; + endY = ctuHeight - 1; // not refer below CTU + if (!tpely) + { + fenc += stride; + rec += stride; + } + + for (x = startX; x < ctuWidth; x++) + upBuff1[x] = signOf(rec[x] - rec[x - stride]); + + for (y = firstY; y < endY; y++) + { + for (x = (y < startY - 1 ? startX : 0); x < ctuWidth; x++) + { + int signDown = signOf(rec[x] - rec[x + stride]); + int edgeType = signDown + upBuff1[x] + 2; + upBuff1[x] = -signDown; + + if (x < startX && y < startY) + continue; + + stats[s_eoTable[edgeType]] += (fenc[x] - rec[x]); + count[s_eoTable[edgeType]]++; + } + + fenc += stride; + rec += stride; + } + } + + // SAO_EO_2: // dir: 135 + { + skipB = plane ? 2 : 4; + skipR = plane ? 3 : 5; + + stats = m_offsetOrgPreDblk[addr][plane][SAO_EO_2]; + count = m_countPreDblk[addr][plane][SAO_EO_2]; + + fenc = fenc0; + rec = rec0; + + startX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR; + startY = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB; + firstX = !lpelx; + firstY = !tpely; + // endX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth; + // endY = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight; + endX = ctuWidth - 1; // not refer right CTU + endY = ctuHeight - 1; // not refer below CTU + if (!tpely) + { + fenc += stride; + rec += stride; + } + + for (x = startX; x < endX; x++) + upBuff1[x] = signOf(rec[x] - rec[x - stride - 1]); + + for (y = firstY; y < endY; y++) + { + x = (y < startY - 1 ? 
startX : firstX); + upBufft[x] = signOf(rec[x + stride] - rec[x - 1]); + for (; x < endX; x++) + { + int signDown = signOf(rec[x] - rec[x + stride + 1]); + int edgeType = signDown + upBuff1[x] + 2; + upBufft[x + 1] = -signDown; + + if (x < startX && y < startY) + continue; + + stats[s_eoTable[edgeType]] += (fenc[x] - rec[x]); + count[s_eoTable[edgeType]]++; + } + + std::swap(upBuff1, upBufft); + + rec += stride; + fenc += stride; + } + } + + // SAO_EO_3: // dir: 45 + { + skipB = plane ? 2 : 4; + skipR = plane ? 3 : 5; + + stats = m_offsetOrgPreDblk[addr][plane][SAO_EO_3]; + count = m_countPreDblk[addr][plane][SAO_EO_3]; + + fenc = fenc0; + rec = rec0; + + startX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR; + startY = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB; + firstX = !lpelx; + firstY = !tpely; + // endX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth; + // endY = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight; + endX = ctuWidth - 1; // not refer right CTU + endY = ctuHeight - 1; // not refer below CTU + if (!tpely) + { + fenc += stride; + rec += stride; + } + + for (x = startX - 1; x < endX; x++) + upBuff1[x] = signOf(rec[x] - rec[x - stride + 1]); + + for (y = firstY; y < endY; y++) + { + for (x = (y < startY - 1 ? 
startX : firstX); x < endX; x++) + { + int signDown = signOf(rec[x] - rec[x + stride - 1]); + int edgeType = signDown + upBuff1[x] + 2; + upBuff1[x - 1] = -signDown; + + if (x < startX && y < startY) + continue; + + stats[s_eoTable[edgeType]] += (fenc[x] - rec[x]); + count[s_eoTable[edgeType]]++; + } + + upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]); + + rec += stride; + fenc += stride; + } + } + } +} + +/* reset offset statistics */ +void SAO::resetStats() +{ + memset(m_count, 0, sizeof(PerClass) * NUM_PLANE); + memset(m_offset, 0, sizeof(PerClass) * NUM_PLANE); + memset(m_offsetOrg, 0, sizeof(PerClass) * NUM_PLANE); +} + +void SAO::rdoSaoUnitRowInit(SAOParam* saoParam) +{ + saoParam->bSaoFlag[0] = true; + saoParam->bSaoFlag[1] = true; + + m_numNoSao[0] = 0; // Luma + m_numNoSao[1] = 0; // Chroma + if (m_refDepth > 0 && m_depthSaoRate[0][m_refDepth - 1] > SAO_ENCODING_RATE) + saoParam->bSaoFlag[0] = false; + if (m_refDepth > 0 && m_depthSaoRate[1][m_refDepth - 1] > SAO_ENCODING_RATE_CHROMA) + saoParam->bSaoFlag[1] = false; +} + +void SAO::rdoSaoUnitRowEnd(const SAOParam* saoParam, int numctus) +{ + if (!saoParam->bSaoFlag[0]) + m_depthSaoRate[0][m_refDepth] = 1.0; + else + m_depthSaoRate[0][m_refDepth] = m_numNoSao[0] / ((double)numctus); + + if (!saoParam->bSaoFlag[1]) + m_depthSaoRate[1][m_refDepth] = 1.0; + else + m_depthSaoRate[1][m_refDepth] = m_numNoSao[1] / ((double)numctus); +} + +void SAO::rdoSaoUnitRow(SAOParam* saoParam, int idxY) +{ + SaoCtuParam mergeSaoParam[NUM_MERGE_MODE][2]; + double mergeDist[NUM_MERGE_MODE]; + bool allowMerge[2]; // left, up + allowMerge[1] = (idxY > 0); + + for (int idxX = 0; idxX < m_numCuInWidth; idxX++) + { + int addr = idxX + idxY * m_numCuInWidth; + int addrUp = idxY ? addr - m_numCuInWidth : -1; + int addrLeft = idxX ? 
addr - 1 : -1; + allowMerge[0] = (idxX > 0); + + m_entropyCoder.load(m_rdContexts.cur); + if (allowMerge[0]) + m_entropyCoder.codeSaoMerge(0); + if (allowMerge[1]) + m_entropyCoder.codeSaoMerge(0); + m_entropyCoder.store(m_rdContexts.temp); + // reset stats Y, Cb, Cr + for (int plane = 0; plane < 3; plane++) + { + for (int j = 0; j < MAX_NUM_SAO_TYPE; j++) + { + for (int k = 0; k < MAX_NUM_SAO_CLASS; k++) + { + m_offset[plane][j][k] = 0; + if (m_param->bSaoNonDeblocked) + { + m_count[plane][j][k] = m_countPreDblk[addr][plane][j][k]; + m_offsetOrg[plane][j][k] = m_offsetOrgPreDblk[addr][plane][j][k]; + } + else + { + m_count[plane][j][k] = 0; + m_offsetOrg[plane][j][k] = 0; + } + } + } + + saoParam->ctuParam[plane][addr].mergeMode = SAO_MERGE_NONE; + saoParam->ctuParam[plane][addr].typeIdx = -1; + saoParam->ctuParam[plane][addr].bandPos = 0; + if (saoParam->bSaoFlag[plane > 0]) + calcSaoStatsCu(addr, plane); + } + + saoComponentParamDist(saoParam, addr, addrUp, addrLeft, &mergeSaoParam[0][0], mergeDist); + + sao2ChromaParamDist(saoParam, addr, addrUp, addrLeft, mergeSaoParam, mergeDist); + + if (saoParam->bSaoFlag[0] || saoParam->bSaoFlag[1]) + { + // Cost of new SAO_params + m_entropyCoder.load(m_rdContexts.cur); + m_entropyCoder.resetBits(); + if (allowMerge[0]) + m_entropyCoder.codeSaoMerge(0); + if (allowMerge[1]) + m_entropyCoder.codeSaoMerge(0); + for (int plane = 0; plane < 3; plane++) + { + if (saoParam->bSaoFlag[plane > 0]) + m_entropyCoder.codeSaoOffset(saoParam->ctuParam[plane][addr], plane); + } + + uint32_t rate = m_entropyCoder.getNumberOfWrittenBits(); + double bestCost = mergeDist[0] + (double)rate; + m_entropyCoder.store(m_rdContexts.temp); + + // Cost of Merge + for (int mergeIdx = 0; mergeIdx < 2; ++mergeIdx) + { + if (!allowMerge[mergeIdx]) + continue; + + m_entropyCoder.load(m_rdContexts.cur); + m_entropyCoder.resetBits(); + if (allowMerge[0]) + m_entropyCoder.codeSaoMerge(1 - mergeIdx); + if (allowMerge[1] && (mergeIdx == 1)) + 
m_entropyCoder.codeSaoMerge(1); + + rate = m_entropyCoder.getNumberOfWrittenBits(); + double mergeCost = mergeDist[mergeIdx + 1] + (double)rate; + if (mergeCost < bestCost) + { + SaoMergeMode mergeMode = mergeIdx ? SAO_MERGE_UP : SAO_MERGE_LEFT; + bestCost = mergeCost; + m_entropyCoder.store(m_rdContexts.temp); + for (int plane = 0; plane < 3; plane++) + { + mergeSaoParam[plane][mergeIdx].mergeMode = mergeMode; + if (saoParam->bSaoFlag[plane > 0]) + copySaoUnit(&saoParam->ctuParam[plane][addr], &mergeSaoParam[plane][mergeIdx]); + } + } + } + + if (saoParam->ctuParam[0][addr].typeIdx < 0) + m_numNoSao[0]++; + if (saoParam->ctuParam[1][addr].typeIdx < 0) + m_numNoSao[1]++; + m_entropyCoder.load(m_rdContexts.temp); + m_entropyCoder.store(m_rdContexts.cur); + } + } +} + +/** rate distortion optimization of SAO unit */ +inline int64_t SAO::estSaoTypeDist(int plane, int typeIdx, double lambda, int32_t* currentDistortionTableBo, double* currentRdCostTableBo) +{ + int64_t estDist = 0; + + for (int classIdx = 1; classIdx < ((typeIdx < SAO_BO) ? 
SAO_EO_LEN + 1 : SAO_NUM_BO_CLASSES + 1); classIdx++) + { + int32_t count = m_count[plane][typeIdx][classIdx]; + int32_t& offsetOrg = m_offsetOrg[plane][typeIdx][classIdx]; + int32_t& offsetOut = m_offset[plane][typeIdx][classIdx]; + + if (typeIdx == SAO_BO) + { + currentDistortionTableBo[classIdx - 1] = 0; + currentRdCostTableBo[classIdx - 1] = lambda; + } + if (count) + { + int offset = roundIBDI(offsetOrg, count << SAO_BIT_INC); + offset = Clip3(-OFFSET_THRESH + 1, OFFSET_THRESH - 1, offset); + if (typeIdx < SAO_BO) + { + if (classIdx < 3) + offset = X265_MAX(offset, 0); + else + offset = X265_MIN(offset, 0); + } + offsetOut = estIterOffset(typeIdx, classIdx, lambda, offset, count, offsetOrg, currentDistortionTableBo, currentRdCostTableBo); + } + else + { + offsetOrg = 0; + offsetOut = 0; + } + if (typeIdx != SAO_BO) + estDist += estSaoDist(count, (int)offsetOut << SAO_BIT_INC, offsetOrg); + } + + return estDist; +} + +inline int SAO::estIterOffset(int typeIdx, int classIdx, double lambda, int offset, int32_t count, int32_t offsetOrg, int32_t* currentDistortionTableBo, double* currentRdCostTableBo) +{ + int offsetOut = 0; + + // Assuming sending quantized value 0 results in zero offset and sending the value zero needs 1 bit. entropy coder can be used to measure the exact rate here. + double tempMinCost = lambda; + while (offset != 0) + { + // Calculate the bits required for signalling the offset + int tempRate = (typeIdx == SAO_BO) ? 
(abs(offset) + 2) : (abs(offset) + 1); + if (abs(offset) == OFFSET_THRESH - 1) + tempRate--; + + // Do the dequntization before distorion calculation + int tempOffset = offset << SAO_BIT_INC; + int64_t tempDist = estSaoDist(count, tempOffset, offsetOrg); + double tempCost = ((double)tempDist + lambda * (double)tempRate); + if (tempCost < tempMinCost) + { + tempMinCost = tempCost; + offsetOut = offset; + if (typeIdx == SAO_BO) + { + currentDistortionTableBo[classIdx - 1] = (int)tempDist; + currentRdCostTableBo[classIdx - 1] = tempCost; + } + } + offset = (offset > 0) ? (offset - 1) : (offset + 1); + } + + return offsetOut; +} + +void SAO::saoComponentParamDist(SAOParam* saoParam, int addr, int addrUp, int addrLeft, SaoCtuParam* mergeSaoParam, double* mergeDist) +{ + int64_t bestDist = 0; + + SaoCtuParam* lclCtuParam = &saoParam->ctuParam[0][addr]; + + double bestRDCostTableBo = MAX_DOUBLE; + int bestClassTableBo = 0; + int currentDistortionTableBo[MAX_NUM_SAO_CLASS]; + double currentRdCostTableBo[MAX_NUM_SAO_CLASS]; + + resetSaoUnit(lclCtuParam); + m_entropyCoder.load(m_rdContexts.temp); + m_entropyCoder.resetBits(); + m_entropyCoder.codeSaoOffset(*lclCtuParam, 0); + double dCostPartBest = m_entropyCoder.getNumberOfWrittenBits() * m_lumaLambda; + + for (int typeIdx = 0; typeIdx < MAX_NUM_SAO_TYPE; typeIdx++) + { + int64_t estDist = estSaoTypeDist(0, typeIdx, m_lumaLambda, currentDistortionTableBo, currentRdCostTableBo); + + if (typeIdx == SAO_BO) + { + // Estimate Best Position + for (int i = 0; i < SAO_NUM_BO_CLASSES - SAO_BO_LEN + 1; i++) + { + double currentRDCost = 0.0; + for (int j = i; j < i + SAO_BO_LEN; j++) + currentRDCost += currentRdCostTableBo[j]; + + if (currentRDCost < bestRDCostTableBo) + { + bestRDCostTableBo = currentRDCost; + bestClassTableBo = i; + } + } + + // Re code all Offsets + // Code Center + estDist = 0; + for (int classIdx = bestClassTableBo; classIdx < bestClassTableBo + SAO_BO_LEN; classIdx++) + estDist += 
currentDistortionTableBo[classIdx]; + } + SaoCtuParam ctuParamRdo; + ctuParamRdo.mergeMode = SAO_MERGE_NONE; + ctuParamRdo.typeIdx = typeIdx; + ctuParamRdo.bandPos = (typeIdx == SAO_BO) ? bestClassTableBo : 0; + for (int classIdx = 0; classIdx < SAO_NUM_OFFSET; classIdx++) + ctuParamRdo.offset[classIdx] = (int)m_offset[0][typeIdx][classIdx + ctuParamRdo.bandPos + 1]; + + m_entropyCoder.load(m_rdContexts.temp); + m_entropyCoder.resetBits(); + m_entropyCoder.codeSaoOffset(ctuParamRdo, 0); + + uint32_t estRate = m_entropyCoder.getNumberOfWrittenBits(); + double cost = (double)estDist + m_lumaLambda * (double)estRate; + + if (cost < dCostPartBest) + { + dCostPartBest = cost; + copySaoUnit(lclCtuParam, &ctuParamRdo); + bestDist = estDist; + } + } + + mergeDist[0] = ((double)bestDist / m_lumaLambda); + m_entropyCoder.load(m_rdContexts.temp); + m_entropyCoder.codeSaoOffset(*lclCtuParam, 0); + m_entropyCoder.store(m_rdContexts.temp); + + // merge left or merge up + + for (int mergeIdx = 0; mergeIdx < 2; mergeIdx++) + { + SaoCtuParam* mergeSrcParam = NULL; + if (addrLeft >= 0 && mergeIdx == 0) + mergeSrcParam = &(saoParam->ctuParam[0][addrLeft]); + else if (addrUp >= 0 && mergeIdx == 1) + mergeSrcParam = &(saoParam->ctuParam[0][addrUp]); + if (mergeSrcParam) + { + int64_t estDist = 0; + int typeIdx = mergeSrcParam->typeIdx; + if (typeIdx >= 0) + { + int bandPos = (typeIdx == SAO_BO) ? mergeSrcParam->bandPos : 0; + for (int classIdx = 0; classIdx < SAO_NUM_OFFSET; classIdx++) + { + int mergeOffset = mergeSrcParam->offset[classIdx]; + estDist += estSaoDist(m_count[0][typeIdx][classIdx + bandPos + 1], mergeOffset, m_offsetOrg[0][typeIdx][classIdx + bandPos + 1]); + } + } + + copySaoUnit(&mergeSaoParam[mergeIdx], mergeSrcParam); + mergeSaoParam[mergeIdx].mergeMode = mergeIdx ? 
SAO_MERGE_UP : SAO_MERGE_LEFT; + + mergeDist[mergeIdx + 1] = ((double)estDist / m_lumaLambda); + } + else + resetSaoUnit(&mergeSaoParam[mergeIdx]); + } +} + +void SAO::sao2ChromaParamDist(SAOParam* saoParam, int addr, int addrUp, int addrLeft, SaoCtuParam mergeSaoParam[][2], double* mergeDist) +{ + int64_t bestDist = 0; + + SaoCtuParam* lclCtuParam[2] = { &saoParam->ctuParam[1][addr], &saoParam->ctuParam[2][addr] }; + + double currentRdCostTableBo[MAX_NUM_SAO_CLASS]; + int bestClassTableBo[2] = { 0, 0 }; + int currentDistortionTableBo[MAX_NUM_SAO_CLASS]; + + resetSaoUnit(lclCtuParam[0]); + resetSaoUnit(lclCtuParam[1]); + m_entropyCoder.load(m_rdContexts.temp); + m_entropyCoder.resetBits(); + m_entropyCoder.codeSaoOffset(*lclCtuParam[0], 1); + m_entropyCoder.codeSaoOffset(*lclCtuParam[1], 2); + + double costPartBest = m_entropyCoder.getNumberOfWrittenBits() * m_chromaLambda; + + for (int typeIdx = 0; typeIdx < MAX_NUM_SAO_TYPE; typeIdx++) + { + int64_t estDist[2]; + if (typeIdx == SAO_BO) + { + // Estimate Best Position + for (int compIdx = 0; compIdx < 2; compIdx++) + { + double bestRDCostTableBo = MAX_DOUBLE; + estDist[compIdx] = estSaoTypeDist(compIdx + 1, typeIdx, m_chromaLambda, currentDistortionTableBo, currentRdCostTableBo); + for (int i = 0; i < SAO_NUM_BO_CLASSES - SAO_BO_LEN + 1; i++) + { + double currentRDCost = 0.0; + for (int j = i; j < i + SAO_BO_LEN; j++) + currentRDCost += currentRdCostTableBo[j]; + + if (currentRDCost < bestRDCostTableBo) + { + bestRDCostTableBo = currentRDCost; + bestClassTableBo[compIdx] = i; + } + } + + // Re code all Offsets + // Code Center + estDist[compIdx] = 0; + for (int classIdx = bestClassTableBo[compIdx]; classIdx < bestClassTableBo[compIdx] + SAO_BO_LEN; classIdx++) + estDist[compIdx] += currentDistortionTableBo[classIdx]; + } + } + else + { + estDist[0] = estSaoTypeDist(1, typeIdx, m_chromaLambda, currentDistortionTableBo, currentRdCostTableBo); + estDist[1] = estSaoTypeDist(2, typeIdx, m_chromaLambda, 
currentDistortionTableBo, currentRdCostTableBo); + } + + m_entropyCoder.load(m_rdContexts.temp); + m_entropyCoder.resetBits(); + + SaoCtuParam ctuParamRdo[2]; + for (int compIdx = 0; compIdx < 2; compIdx++) + { + ctuParamRdo[compIdx].mergeMode = SAO_MERGE_NONE; + ctuParamRdo[compIdx].typeIdx = typeIdx; + ctuParamRdo[compIdx].bandPos = (typeIdx == SAO_BO) ? bestClassTableBo[compIdx] : 0; + for (int classIdx = 0; classIdx < SAO_NUM_OFFSET; classIdx++) + ctuParamRdo[compIdx].offset[classIdx] = (int)m_offset[compIdx + 1][typeIdx][classIdx + ctuParamRdo[compIdx].bandPos + 1]; + + m_entropyCoder.codeSaoOffset(ctuParamRdo[compIdx], compIdx + 1); + } + + uint32_t estRate = m_entropyCoder.getNumberOfWrittenBits(); + double cost = (double)(estDist[0] + estDist[1]) + m_chromaLambda * (double)estRate; + + if (cost < costPartBest) + { + costPartBest = cost; + copySaoUnit(lclCtuParam[0], &ctuParamRdo[0]); + copySaoUnit(lclCtuParam[1], &ctuParamRdo[1]); + bestDist = (estDist[0] + estDist[1]); + } + } + + mergeDist[0] += ((double)bestDist / m_chromaLambda); + m_entropyCoder.load(m_rdContexts.temp); + m_entropyCoder.codeSaoOffset(*lclCtuParam[0], 1); + m_entropyCoder.codeSaoOffset(*lclCtuParam[1], 2); + m_entropyCoder.store(m_rdContexts.temp); + + // merge left or merge up + + for (int mergeIdx = 0; mergeIdx < 2; mergeIdx++) + { + for (int compIdx = 0; compIdx < 2; compIdx++) + { + int plane = compIdx + 1; + SaoCtuParam* mergeSrcParam = NULL; + if (addrLeft >= 0 && mergeIdx == 0) + mergeSrcParam = &(saoParam->ctuParam[plane][addrLeft]); + else if (addrUp >= 0 && mergeIdx == 1) + mergeSrcParam = &(saoParam->ctuParam[plane][addrUp]); + if (mergeSrcParam) + { + int64_t estDist = 0; + int typeIdx = mergeSrcParam->typeIdx; + if (typeIdx >= 0) + { + int bandPos = (typeIdx == SAO_BO) ? 
mergeSrcParam->bandPos : 0; + for (int classIdx = 0; classIdx < SAO_NUM_OFFSET; classIdx++) + { + int mergeOffset = mergeSrcParam->offset[classIdx]; + estDist += estSaoDist(m_count[plane][typeIdx][classIdx + bandPos + 1], mergeOffset, m_offsetOrg[plane][typeIdx][classIdx + bandPos + 1]); + } + } + + copySaoUnit(&mergeSaoParam[plane][mergeIdx], mergeSrcParam); + mergeSaoParam[plane][mergeIdx].mergeMode = mergeIdx ? SAO_MERGE_UP : SAO_MERGE_LEFT; + mergeDist[mergeIdx + 1] += ((double)estDist / m_chromaLambda); + } + else + resetSaoUnit(&mergeSaoParam[plane][mergeIdx]); + } + } +} +} diff --git a/source/encoder/sao.h b/source/encoder/sao.h new file mode 100644 index 0000000..70df9da --- /dev/null +++ b/source/encoder/sao.h @@ -0,0 +1,151 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * Min Chen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. 
+ *****************************************************************************/ + +#ifndef X265_SAO_H +#define X265_SAO_H + +#include "common.h" +#include "frame.h" +#include "entropy.h" + +namespace x265 { +// private namespace + +enum SAOTypeLen +{ + SAO_EO_LEN = 4, + SAO_BO_LEN = 4, + SAO_NUM_BO_CLASSES = 32 +}; + +enum SAOType +{ + SAO_EO_0 = 0, + SAO_EO_1, + SAO_EO_2, + SAO_EO_3, + SAO_BO, + MAX_NUM_SAO_TYPE +}; + +class SAO +{ +protected: + + enum { SAO_MAX_DEPTH = 4 }; + enum { SAO_BO_BITS = 5 }; + enum { MAX_NUM_SAO_CLASS = 33 }; + enum { SAO_BIT_INC = X265_MAX(X265_DEPTH - 10, 0) }; + enum { OFFSET_THRESH = 1 << X265_MIN(X265_DEPTH - 5, 5) }; + enum { NUM_EDGETYPE = 5 }; + enum { NUM_PLANE = 3 }; + enum { NUM_MERGE_MODE = 3 }; + + static const uint32_t s_eoTable[NUM_EDGETYPE]; + + typedef int32_t (PerClass[MAX_NUM_SAO_TYPE][MAX_NUM_SAO_CLASS]); + typedef int32_t (PerPlane[NUM_PLANE][MAX_NUM_SAO_TYPE][MAX_NUM_SAO_CLASS]); + + /* allocated per part */ + PerClass* m_count; + PerClass* m_offset; + PerClass* m_offsetOrg; + + /* allocated per CTU */ + PerPlane* m_countPreDblk; + PerPlane* m_offsetOrgPreDblk; + + double m_depthSaoRate[2][4]; + pixel* m_offsetBo; + int8_t m_offsetEo[NUM_EDGETYPE]; + + int m_numCuInWidth; + int m_numCuInHeight; + int m_hChromaShift; + int m_vChromaShift; + + pixel* m_clipTable; + pixel* m_clipTableBase; + + pixel* m_tmpU1[3]; + pixel* m_tmpU2[3]; + pixel* m_tmpL1; + pixel* m_tmpL2; + +public: + + struct SAOContexts + { + Entropy cur; + Entropy next; + Entropy temp; + }; + + Frame* m_frame; + Entropy m_entropyCoder; + SAOContexts m_rdContexts; + + x265_param* m_param; + int m_refDepth; + int m_numNoSao[2]; + + double m_lumaLambda; + double m_chromaLambda; + /* TODO: No doubles for distortion */ + + SAO(); + + bool create(x265_param* param); + void destroy(); + + void allocSaoParam(SAOParam* saoParam) const; + + void startSlice(Frame* pic, Entropy& initState, int qp); + void resetStats(); + void resetSaoUnit(SaoCtuParam* saoUnit); + 
+ // CTU-based SAO process without slice granularity + void processSaoCu(int addr, int typeIdx, int plane); + void processSaoUnitRow(SaoCtuParam* ctuParam, int idxY, int plane); + + void copySaoUnit(SaoCtuParam* saoUnitDst, const SaoCtuParam* saoUnitSrc); + + void calcSaoStatsCu(int addr, int plane); + void calcSaoStatsCu_BeforeDblk(Frame* pic, int idxX, int idxY); + + void saoComponentParamDist(SAOParam* saoParam, int addr, int addrUp, int addrLeft, SaoCtuParam mergeSaoParam[2], double* mergeDist); + void sao2ChromaParamDist(SAOParam* saoParam, int addr, int addrUp, int addrLeft, SaoCtuParam mergeSaoParam[][2], double* mergeDist); + + inline int estIterOffset(int typeIdx, int classIdx, double lambda, int offset, int32_t count, int32_t offsetOrg, + int32_t* currentDistortionTableBo, double* currentRdCostTableBo); + inline int64_t estSaoTypeDist(int plane, int typeIdx, double lambda, int32_t* currentDistortionTableBo, double* currentRdCostTableBo); + + void rdoSaoUnitRowInit(SAOParam* saoParam); + void rdoSaoUnitRowEnd(const SAOParam* saoParam, int numctus); + void rdoSaoUnitRow(SAOParam* saoParam, int idxY); +}; + +} + +#endif // ifndef X265_SAO_H diff --git a/source/encoder/search.cpp b/source/encoder/search.cpp new file mode 100644 index 0000000..cd86318 --- /dev/null +++ b/source/encoder/search.cpp @@ -0,0 +1,3249 @@ +/***************************************************************************** +* Copyright (C) 2013 x265 project +* +* Authors: Steve Borho +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation; either version 2 of the License, or +* (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. 
+* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software +* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +* +* This program is also available under a commercial proprietary license. +* For more information, contact us at license @ x265.com. +*****************************************************************************/ + +#include "common.h" +#include "primitives.h" +#include "picyuv.h" +#include "cudata.h" + +#include "search.h" +#include "entropy.h" +#include "rdcost.h" + +using namespace x265; + +#if _MSC_VER +#pragma warning(disable: 4800) // 'uint8_t' : forcing value to bool 'true' or 'false' (performance warning) +#pragma warning(disable: 4244) // '=' : conversion from 'int' to 'uint8_t', possible loss of data) +#endif + +ALIGN_VAR_32(const pixel, Search::zeroPixel[MAX_CU_SIZE]) = { 0 }; +ALIGN_VAR_32(const int16_t, Search::zeroShort[MAX_CU_SIZE]) = { 0 }; + +Search::Search() : JobProvider(NULL) +{ + memset(m_rqt, 0, sizeof(m_rqt)); + + for (int i = 0; i < 3; i++) + { + m_qtTempTransformSkipFlag[i] = NULL; + m_qtTempCbf[i] = NULL; + } + + m_numLayers = 0; + m_param = NULL; + m_slice = NULL; + m_frame = NULL; + m_bJobsQueued = false; + m_totalNumME = m_numAcquiredME = m_numCompletedME = 0; +} + +bool Search::initSearch(const x265_param& param, ScalingList& scalingList) +{ + m_param = ¶m; + m_bEnableRDOQ = param.rdLevel >= 4; + m_bFrameParallel = param.frameNumThreads > 1; + m_numLayers = g_log2Size[param.maxCUSize] - 2; + + m_rdCost.setPsyRdScale(param.psyRd); + m_me.setSearchMethod(param.searchMethod); + m_me.setSubpelRefine(param.subpelRefine); + + bool ok = m_quant.init(m_bEnableRDOQ, param.psyRdoq, scalingList, m_entropyCoder); + if (m_param->noiseReduction) + ok &= m_quant.allocNoiseReduction(param); + + ok &= Predict::allocBuffers(param.internalCsp); /* sets m_hChromaShift & m_vChromaShift */ + + /* When frame parallelism is active, only 
'refLagPixels' of reference frames will be guaranteed + * available for motion reference. See refLagRows in FrameEncoder::compressCTURows() */ + m_refLagPixels = m_bFrameParallel ? param.searchRange : param.sourceHeight; + + uint32_t sizeL = 1 << (g_maxLog2CUSize * 2); + uint32_t sizeC = sizeL >> (m_hChromaShift + m_vChromaShift); + uint32_t numPartitions = NUM_CU_PARTITIONS; + + /* these are indexed by qtLayer (log2size - 2) so nominally 0=4x4, 1=8x8, 2=16x16, 3=32x32 + * the coeffRQT and reconQtYuv are allocated to the max CU size at every depth. The parts + * which are reconstructed at each depth are valid. At the end, the transform depth table + * is walked and the coeff and recon at the correct depths are collected */ + for (uint32_t i = 0; i <= m_numLayers; i++) + { + CHECKED_MALLOC(m_rqt[i].coeffRQT[0], coeff_t, sizeL + sizeC * 2); + m_rqt[i].coeffRQT[1] = m_rqt[i].coeffRQT[0] + sizeL; + m_rqt[i].coeffRQT[2] = m_rqt[i].coeffRQT[0] + sizeL + sizeC; + ok &= m_rqt[i].reconQtYuv.create(g_maxCUSize, param.internalCsp); + ok &= m_rqt[i].resiQtYuv.create(g_maxCUSize, param.internalCsp); + } + + /* the rest of these buffers are indexed per-depth */ + for (uint32_t i = 0; i <= g_maxCUDepth; i++) + { + int cuSize = g_maxCUSize >> i; + ok &= m_rqt[i].tmpResiYuv.create(cuSize, param.internalCsp); + ok &= m_rqt[i].tmpPredYuv.create(cuSize, param.internalCsp); + ok &= m_rqt[i].bidirPredYuv[0].create(cuSize, param.internalCsp); + ok &= m_rqt[i].bidirPredYuv[1].create(cuSize, param.internalCsp); + } + + CHECKED_MALLOC(m_qtTempCbf[0], uint8_t, numPartitions * 3); + m_qtTempCbf[1] = m_qtTempCbf[0] + numPartitions; + m_qtTempCbf[2] = m_qtTempCbf[0] + numPartitions * 2; + CHECKED_MALLOC(m_qtTempTransformSkipFlag[0], uint8_t, numPartitions * 3); + m_qtTempTransformSkipFlag[1] = m_qtTempTransformSkipFlag[0] + numPartitions; + m_qtTempTransformSkipFlag[2] = m_qtTempTransformSkipFlag[0] + numPartitions * 2; + + return ok; + +fail: + return false; +} + +Search::~Search() +{ + for 
(uint32_t i = 0; i <= m_numLayers; i++) + { + X265_FREE(m_rqt[i].coeffRQT[0]); + m_rqt[i].reconQtYuv.destroy(); + m_rqt[i].resiQtYuv.destroy(); + } + + for (uint32_t i = 0; i <= g_maxCUDepth; i++) + { + m_rqt[i].tmpResiYuv.destroy(); + m_rqt[i].tmpPredYuv.destroy(); + m_rqt[i].bidirPredYuv[0].destroy(); + m_rqt[i].bidirPredYuv[1].destroy(); + } + + X265_FREE(m_qtTempCbf[0]); + X265_FREE(m_qtTempTransformSkipFlag[0]); +} + +void Search::setQP(const Slice& slice, int qp) +{ + x265_emms(); /* TODO: if the lambda tables were ints, this would not be necessary */ + m_me.setQP(qp); + m_rdCost.setQP(slice, qp); +} + +#if CHECKED_BUILD || _DEBUG +void Search::invalidateContexts(int fromDepth) +{ + /* catch reads without previous writes */ + for (int d = fromDepth; d < NUM_FULL_DEPTH; d++) + { + m_rqt[d].cur.markInvalid(); + m_rqt[d].rqtTemp.markInvalid(); + m_rqt[d].rqtRoot.markInvalid(); + m_rqt[d].rqtTest.markInvalid(); + } +} +#else +void Search::invalidateContexts(int) {} +#endif + +void Search::codeSubdivCbfQTChroma(const CUData& cu, uint32_t trDepth, uint32_t absPartIdx, uint32_t absPartIdxStep, uint32_t width, uint32_t height) +{ + uint32_t fullDepth = cu.m_cuDepth[0] + trDepth; + uint32_t tuDepthL = cu.m_tuDepth[absPartIdx]; + uint32_t subdiv = tuDepthL > trDepth; + uint32_t log2TrSize = g_maxLog2CUSize - fullDepth; + + bool mCodeAll = true; + const uint32_t numPels = 1 << (log2TrSize * 2 - m_hChromaShift - m_vChromaShift); + if (numPels < (MIN_TU_SIZE * MIN_TU_SIZE)) + mCodeAll = false; + + if (mCodeAll) + { + if (!trDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_U, trDepth - 1)) + m_entropyCoder.codeQtCbf(cu, absPartIdx, absPartIdxStep, (width >> m_hChromaShift), (height >> m_vChromaShift), TEXT_CHROMA_U, trDepth, !subdiv); + + if (!trDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_V, trDepth - 1)) + m_entropyCoder.codeQtCbf(cu, absPartIdx, absPartIdxStep, (width >> m_hChromaShift), (height >> m_vChromaShift), TEXT_CHROMA_V, trDepth, !subdiv); + } + + if (subdiv) + { + 
absPartIdxStep >>= 2; + width >>= 1; + height >>= 1; + + uint32_t qtPartNum = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1); + for (uint32_t part = 0; part < 4; part++) + codeSubdivCbfQTChroma(cu, trDepth + 1, absPartIdx + part * qtPartNum, absPartIdxStep, width, height); + } +} + +void Search::codeCoeffQTChroma(const CUData& cu, uint32_t trDepth, uint32_t absPartIdx, TextType ttype) +{ + if (!cu.getCbf(absPartIdx, ttype, trDepth)) + return; + + uint32_t fullDepth = cu.m_cuDepth[0] + trDepth; + uint32_t tuDepthL = cu.m_tuDepth[absPartIdx]; + + if (tuDepthL > trDepth) + { + uint32_t qtPartNum = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1); + for (uint32_t part = 0; part < 4; part++) + codeCoeffQTChroma(cu, trDepth + 1, absPartIdx + part * qtPartNum, ttype); + + return; + } + + uint32_t log2TrSize = g_maxLog2CUSize - fullDepth; + + uint32_t trDepthC = trDepth; + uint32_t log2TrSizeC = log2TrSize - m_hChromaShift; + + if (log2TrSizeC == 1) + { + X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && trDepth, "transform size too small\n"); + trDepthC--; + log2TrSizeC++; + uint32_t qpdiv = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1); + bool bFirstQ = ((absPartIdx & (qpdiv - 1)) == 0); + if (!bFirstQ) + return; + } + + uint32_t qtLayer = log2TrSize - 2; + + if (m_csp != X265_CSP_I422) + { + uint32_t shift = (m_csp == X265_CSP_I420) ? 
2 : 0; + uint32_t coeffOffset = absPartIdx << (LOG2_UNIT_SIZE * 2 - shift); + coeff_t* coeff = m_rqt[qtLayer].coeffRQT[ttype] + coeffOffset; + m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSizeC, ttype); + } + else + { + uint32_t coeffOffset = absPartIdx << (LOG2_UNIT_SIZE * 2 - 1); + coeff_t* coeff = m_rqt[qtLayer].coeffRQT[ttype] + coeffOffset; + uint32_t subTUSize = 1 << (log2TrSizeC * 2); + uint32_t partIdxesPerSubTU = NUM_CU_PARTITIONS >> (((cu.m_cuDepth[absPartIdx] + trDepthC) << 1) + 1); + if (cu.getCbf(absPartIdx, ttype, trDepth + 1)) + m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSizeC, ttype); + if (cu.getCbf(absPartIdx + partIdxesPerSubTU, ttype, trDepth + 1)) + m_entropyCoder.codeCoeffNxN(cu, coeff + subTUSize, absPartIdx + partIdxesPerSubTU, log2TrSizeC, ttype); + } +} + +void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& outCost, uint32_t depthRange[2]) +{ + uint32_t fullDepth = mode.cu.m_cuDepth[0] + trDepth; + uint32_t log2TrSize = g_maxLog2CUSize - fullDepth; + uint32_t qtLayer = log2TrSize - 2; + uint32_t sizeIdx = log2TrSize - 2; + bool mightNotSplit = log2TrSize <= depthRange[1]; + bool mightSplit = (log2TrSize > depthRange[0]) && (bAllowSplit || !mightNotSplit); + + /* If maximum RD penalty, force spits at TU size 32x32 if SPS allows TUs of 16x16 */ + if (m_param->rdPenalty == 2 && m_slice->m_sliceType != I_SLICE && log2TrSize == 5 && depthRange[0] <= 4) + { + mightNotSplit = false; + mightSplit = true; + } + + CUData& cu = mode.cu; + + Cost fullCost; + uint32_t bCBF = 0; + + pixel* reconQt = m_rqt[qtLayer].reconQtYuv.getLumaAddr(absPartIdx); + uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_size; + + if (mightNotSplit) + { + if (mightSplit) + m_entropyCoder.store(m_rqt[fullDepth].rqtRoot); + + pixel* fenc = const_cast(mode.fencYuv->getLumaAddr(absPartIdx)); + pixel* pred = mode.predYuv.getLumaAddr(absPartIdx); + int16_t* residual = 
m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx); + uint32_t stride = mode.fencYuv->m_size; + + // init availability pattern + uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx]; + initAdiPattern(cu, cuGeom, absPartIdx, trDepth, lumaPredMode); + + // get prediction signal + predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize); + + cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth); + cu.setTUDepthSubParts(trDepth, absPartIdx, fullDepth); + + uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2); + coeff_t* coeffY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY; + + // store original entropy coding status + if (m_bEnableRDOQ) + m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true); + + primitives.calcresidual[sizeIdx](fenc, pred, residual, stride); + + uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false); + if (numSig) + { + m_quant.invtransformNxN(cu.m_tqBypass[0], residual, stride, coeffY, log2TrSize, TEXT_LUMA, true, false, numSig); + primitives.luma_add_ps[sizeIdx](reconQt, reconQtStride, pred, residual, stride, stride); + } + else + // no coded residual, recon = pred + primitives.square_copy_pp[sizeIdx](reconQt, reconQtStride, pred, stride); + + bCBF = !!numSig << trDepth; + cu.setCbfSubParts(bCBF, TEXT_LUMA, absPartIdx, fullDepth); + fullCost.distortion = primitives.sse_pp[sizeIdx](reconQt, reconQtStride, fenc, stride); + + m_entropyCoder.resetBits(); + if (!absPartIdx) + { + if (!cu.m_slice->isIntra()) + { + if (cu.m_slice->m_pps->bTransquantBypassEnabled) + m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]); + m_entropyCoder.codeSkipFlag(cu, 0); + m_entropyCoder.codePredMode(cu.m_predMode[0]); + } + + m_entropyCoder.codePartSize(cu, 0, cu.m_cuDepth[0]); + } + if (cu.m_partSize[0] == SIZE_2Nx2N) + { + if (!absPartIdx) + m_entropyCoder.codeIntraDirLumaAng(cu, 0, false); + } + else + { + uint32_t qtNumParts = cuGeom.numPartitions >> 2; + 
if (!trDepth) + { + for (uint32_t part = 0; part < 4; part++) + m_entropyCoder.codeIntraDirLumaAng(cu, part * qtNumParts, false); + } + else if (!(absPartIdx & (qtNumParts - 1))) + m_entropyCoder.codeIntraDirLumaAng(cu, absPartIdx, false); + } + if (log2TrSize != depthRange[0]) + m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize); + + m_entropyCoder.codeQtCbf(cu, absPartIdx, TEXT_LUMA, cu.m_tuDepth[absPartIdx]); + + if (cu.getCbf(absPartIdx, TEXT_LUMA, trDepth)) + m_entropyCoder.codeCoeffNxN(cu, coeffY, absPartIdx, log2TrSize, TEXT_LUMA); + + fullCost.bits = m_entropyCoder.getNumberOfWrittenBits(); + + if (m_param->rdPenalty && log2TrSize == 5 && m_slice->m_sliceType != I_SLICE) + fullCost.bits *= 4; + + if (m_rdCost.m_psyRd) + { + fullCost.energy = m_rdCost.psyCost(sizeIdx, fenc, mode.fencYuv->m_size, reconQt, reconQtStride); + fullCost.rdcost = m_rdCost.calcPsyRdCost(fullCost.distortion, fullCost.bits, fullCost.energy); + } + else + fullCost.rdcost = m_rdCost.calcRdCost(fullCost.distortion, fullCost.bits); + } + else + fullCost.rdcost = MAX_INT64; + + if (mightSplit) + { + if (mightNotSplit) + { + m_entropyCoder.store(m_rqt[fullDepth].rqtTest); // save state after full TU encode + m_entropyCoder.load(m_rqt[fullDepth].rqtRoot); // prep state of split encode + } + + // code split block + uint32_t qPartsDiv = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1); + uint32_t absPartIdxSub = absPartIdx; + + int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && (log2TrSize - 1) <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0]; + if (m_param->bEnableTSkipFast) + checkTransformSkip &= cu.m_partSize[absPartIdx] == SIZE_NxN; + + Cost splitCost; + uint32_t cbf = 0; + for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++, absPartIdxSub += qPartsDiv) + { + if (checkTransformSkip) + codeIntraLumaTSkip(mode, cuGeom, trDepth + 1, absPartIdxSub, splitCost); + else + codeIntraLumaQT(mode, cuGeom, trDepth + 1, absPartIdxSub, bAllowSplit, splitCost, depthRange); + + cbf |= 
cu.getCbf(absPartIdxSub, TEXT_LUMA, trDepth + 1); + } + for (uint32_t offs = 0; offs < 4 * qPartsDiv; offs++) + cu.m_cbf[0][absPartIdx + offs] |= (cbf << trDepth); + + if (mightNotSplit && log2TrSize != depthRange[0]) + { + /* If we could have coded this TU depth, include cost of subdiv flag */ + m_entropyCoder.resetBits(); + m_entropyCoder.codeTransformSubdivFlag(1, 5 - log2TrSize); + splitCost.bits += m_entropyCoder.getNumberOfWrittenBits(); + + if (m_rdCost.m_psyRd) + splitCost.rdcost = m_rdCost.calcPsyRdCost(splitCost.distortion, splitCost.bits, splitCost.energy); + else + splitCost.rdcost = m_rdCost.calcRdCost(splitCost.distortion, splitCost.bits); + } + + if (splitCost.rdcost < fullCost.rdcost) + { + outCost.rdcost += splitCost.rdcost; + outCost.distortion += splitCost.distortion; + outCost.bits += splitCost.bits; + outCost.energy += splitCost.energy; + return; + } + else + { + // recover entropy state of full-size TU encode + m_entropyCoder.load(m_rqt[fullDepth].rqtTest); + + // recover transform index and Cbf values + cu.setTUDepthSubParts(trDepth, absPartIdx, fullDepth); + cu.setCbfSubParts(bCBF, TEXT_LUMA, absPartIdx, fullDepth); + cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth); + } + } + + // set reconstruction for next intra prediction blocks if full TU prediction won + pixel* picReconY = m_frame->m_reconPicYuv->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx); + intptr_t picStride = m_frame->m_reconPicYuv->m_stride; + primitives.square_copy_pp[sizeIdx](picReconY, picStride, reconQt, reconQtStride); + + outCost.rdcost += fullCost.rdcost; + outCost.distortion += fullCost.distortion; + outCost.bits += fullCost.bits; + outCost.energy += fullCost.energy; +} + +void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, Cost& outCost) +{ + uint32_t fullDepth = mode.cu.m_cuDepth[0] + trDepth; + uint32_t log2TrSize = g_maxLog2CUSize - fullDepth; + uint32_t tuSize = 1 << log2TrSize; + + 
X265_CHECK(tuSize == MAX_TS_SIZE, "transform skip is only possible at 4x4 TUs\n"); + + CUData& cu = mode.cu; + Yuv* predYuv = &mode.predYuv; + const Yuv* fencYuv = mode.fencYuv; + + Cost fullCost; + fullCost.rdcost = MAX_INT64; + int bTSkip = 0; + uint32_t bCBF = 0; + + pixel* fenc = const_cast(fencYuv->getLumaAddr(absPartIdx)); + pixel* pred = predYuv->getLumaAddr(absPartIdx); + int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx); + uint32_t stride = fencYuv->m_size; + int sizeIdx = log2TrSize - 2; + + // init availability pattern + uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx]; + initAdiPattern(cu, cuGeom, absPartIdx, trDepth, lumaPredMode); + + // get prediction signal + predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize); + + cu.setTUDepthSubParts(trDepth, absPartIdx, fullDepth); + + uint32_t qtLayer = log2TrSize - 2; + uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2); + coeff_t* coeffY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY; + pixel* reconQt = m_rqt[qtLayer].reconQtYuv.getLumaAddr(absPartIdx); + uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_size; + + // store original entropy coding status + m_entropyCoder.store(m_rqt[fullDepth].rqtRoot); + + if (m_bEnableRDOQ) + m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true); + + ALIGN_VAR_32(coeff_t, tsCoeffY[MAX_TS_SIZE * MAX_TS_SIZE]); + ALIGN_VAR_32(pixel, tsReconY[MAX_TS_SIZE * MAX_TS_SIZE]); + + int checkTransformSkip = 1; + for (int useTSkip = 0; useTSkip <= checkTransformSkip; useTSkip++) + { + uint64_t tmpCost; + uint32_t tmpEnergy = 0; + + coeff_t* coeff = (useTSkip ? tsCoeffY : coeffY); + pixel* tmpRecon = (useTSkip ? tsReconY : reconQt); + uint32_t tmpReconStride = (useTSkip ? 
MAX_TS_SIZE : reconQtStride); + + primitives.calcresidual[sizeIdx](fenc, pred, residual, stride); + + uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSize, TEXT_LUMA, absPartIdx, useTSkip); + if (numSig) + { + m_quant.invtransformNxN(cu.m_tqBypass[0], residual, stride, coeff, log2TrSize, TEXT_LUMA, true, useTSkip, numSig); + primitives.luma_add_ps[sizeIdx](tmpRecon, tmpReconStride, pred, residual, stride, stride); + } + else if (useTSkip) + { + /* do not allow tskip if CBF=0, pretend we did not try tskip */ + checkTransformSkip = 0; + break; + } + else + // no residual coded, recon = pred + primitives.square_copy_pp[sizeIdx](tmpRecon, tmpReconStride, pred, stride); + + uint32_t tmpDist = primitives.sse_pp[sizeIdx](tmpRecon, tmpReconStride, fenc, stride); + + cu.setTransformSkipSubParts(useTSkip, TEXT_LUMA, absPartIdx, fullDepth); + cu.setCbfSubParts((!!numSig) << trDepth, TEXT_LUMA, absPartIdx, fullDepth); + + if (useTSkip) + m_entropyCoder.load(m_rqt[fullDepth].rqtRoot); + + m_entropyCoder.resetBits(); + if (!absPartIdx) + { + if (!cu.m_slice->isIntra()) + { + if (cu.m_slice->m_pps->bTransquantBypassEnabled) + m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]); + m_entropyCoder.codeSkipFlag(cu, 0); + m_entropyCoder.codePredMode(cu.m_predMode[0]); + } + + m_entropyCoder.codePartSize(cu, 0, cu.m_cuDepth[0]); + } + if (cu.m_partSize[0] == SIZE_2Nx2N) + { + if (!absPartIdx) + m_entropyCoder.codeIntraDirLumaAng(cu, 0, false); + } + else + { + uint32_t qtNumParts = cuGeom.numPartitions >> 2; + if (!trDepth) + { + for (uint32_t part = 0; part < 4; part++) + m_entropyCoder.codeIntraDirLumaAng(cu, part * qtNumParts, false); + } + else if (!(absPartIdx & (qtNumParts - 1))) + m_entropyCoder.codeIntraDirLumaAng(cu, absPartIdx, false); + } + m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize); + + m_entropyCoder.codeQtCbf(cu, absPartIdx, TEXT_LUMA, cu.m_tuDepth[absPartIdx]); + + if (cu.getCbf(absPartIdx, TEXT_LUMA, 
trDepth)) + m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSize, TEXT_LUMA); + + uint32_t tmpBits = m_entropyCoder.getNumberOfWrittenBits(); + + if (!useTSkip) + m_entropyCoder.store(m_rqt[fullDepth].rqtTemp); + + if (m_rdCost.m_psyRd) + { + tmpEnergy = m_rdCost.psyCost(sizeIdx, fenc, fencYuv->m_size, tmpRecon, tmpReconStride); + tmpCost = m_rdCost.calcPsyRdCost(tmpDist, tmpBits, tmpEnergy); + } + else + tmpCost = m_rdCost.calcRdCost(tmpDist, tmpBits); + + if (tmpCost < fullCost.rdcost) + { + bTSkip = useTSkip; + bCBF = !!numSig; + fullCost.rdcost = tmpCost; + fullCost.distortion = tmpDist; + fullCost.bits = tmpBits; + fullCost.energy = tmpEnergy; + } + } + + if (bTSkip) + { + memcpy(coeffY, tsCoeffY, sizeof(coeff_t) << (log2TrSize * 2)); + primitives.square_copy_pp[sizeIdx](reconQt, reconQtStride, tsReconY, tuSize); + } + else if (checkTransformSkip) + { + cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth); + cu.setCbfSubParts(bCBF << trDepth, TEXT_LUMA, absPartIdx, fullDepth); + m_entropyCoder.load(m_rqt[fullDepth].rqtTemp); + } + + // set reconstruction for next intra prediction blocks + pixel* picReconY = m_frame->m_reconPicYuv->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx); + intptr_t picStride = m_frame->m_reconPicYuv->m_stride; + primitives.square_copy_pp[sizeIdx](picReconY, picStride, reconQt, reconQtStride); + + outCost.rdcost += fullCost.rdcost; + outCost.distortion += fullCost.distortion; + outCost.bits += fullCost.bits; + outCost.energy += fullCost.energy; +} + +/* fast luma intra residual generation. 
Only perform the minimum number of TU splits required by the CU size */ +void Search::residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, uint32_t depthRange[2]) +{ + CUData& cu = mode.cu; + + uint32_t fullDepth = cu.m_cuDepth[0] + trDepth; + uint32_t log2TrSize = g_maxLog2CUSize - fullDepth; + bool bCheckFull = log2TrSize <= depthRange[1]; + + X265_CHECK(m_slice->m_sliceType != I_SLICE, "residualTransformQuantIntra not intended for I slices\n"); + + /* we still respect rdPenalty == 2, we can forbid 32x32 intra TU. rdPenalty = 1 is impossible + * since we are not measuring RD cost */ + if (m_param->rdPenalty == 2 && log2TrSize == 5 && depthRange[0] <= 4) + bCheckFull = false; + + if (bCheckFull) + { + pixel* fenc = const_cast(mode.fencYuv->getLumaAddr(absPartIdx)); + pixel* pred = mode.predYuv.getLumaAddr(absPartIdx); + int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx); + pixel* picReconY = m_frame->m_reconPicYuv->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx); + intptr_t picStride = m_frame->m_reconPicYuv->m_stride; + uint32_t stride = mode.fencYuv->m_size; + uint32_t sizeIdx = log2TrSize - 2; + uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx]; + uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2); + coeff_t* coeff = cu.m_trCoeff[TEXT_LUMA] + coeffOffsetY; + + initAdiPattern(cu, cuGeom, absPartIdx, trDepth, lumaPredMode); + predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize); + + X265_CHECK(!cu.m_transformSkip[TEXT_LUMA][absPartIdx], "unexpected tskip flag in residualTransformQuantIntra\n"); + cu.setTUDepthSubParts(trDepth, absPartIdx, fullDepth); + + primitives.calcresidual[sizeIdx](fenc, pred, residual, stride); + uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSize, TEXT_LUMA, absPartIdx, false); + if (numSig) + { + m_quant.invtransformNxN(cu.m_tqBypass[absPartIdx], residual, stride, coeff, log2TrSize, TEXT_LUMA, true, 
false, numSig); + primitives.luma_add_ps[sizeIdx](picReconY, picStride, pred, residual, stride, stride); + cu.setCbfSubParts(1 << trDepth, TEXT_LUMA, absPartIdx, fullDepth); + } + else + { + primitives.square_copy_pp[sizeIdx](picReconY, picStride, pred, stride); + cu.setCbfSubParts(0, TEXT_LUMA, absPartIdx, fullDepth); + } + } + else + { + X265_CHECK(log2TrSize > depthRange[0], "intra luma split state failure\n"); + + /* code split block */ + uint32_t qPartsDiv = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1); + uint32_t cbf = 0; + for (uint32_t subPartIdx = 0, absPartIdxSub = absPartIdx; subPartIdx < 4; subPartIdx++, absPartIdxSub += qPartsDiv) + { + residualTransformQuantIntra(mode, cuGeom, trDepth + 1, absPartIdxSub, depthRange); + cbf |= cu.getCbf(absPartIdxSub, TEXT_LUMA, trDepth + 1); + } + for (uint32_t offs = 0; offs < 4 * qPartsDiv; offs++) + cu.m_cbf[TEXT_LUMA][absPartIdx + offs] |= (cbf << trDepth); + } +} + +void Search::extractIntraResultQT(CUData& cu, Yuv& reconYuv, uint32_t trDepth, uint32_t absPartIdx) +{ + uint32_t fullDepth = cu.m_cuDepth[0] + trDepth; + uint32_t tuDepth = cu.m_tuDepth[absPartIdx]; + + if (tuDepth == trDepth) + { + uint32_t log2TrSize = g_maxLog2CUSize - fullDepth; + uint32_t qtLayer = log2TrSize - 2; + + // copy transform coefficients + uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2); + coeff_t* coeffSrcY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY; + coeff_t* coeffDestY = cu.m_trCoeff[0] + coeffOffsetY; + memcpy(coeffDestY, coeffSrcY, sizeof(coeff_t) << (log2TrSize * 2)); + + // copy reconstruction + m_rqt[qtLayer].reconQtYuv.copyPartToPartLuma(reconYuv, absPartIdx, log2TrSize); + } + else + { + uint32_t numQPart = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1); + for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++) + extractIntraResultQT(cu, reconYuv, trDepth + 1, absPartIdx + subPartIdx * numQPart); + } +} + +/* 4:2:2 post-TU split processing */ +void Search::offsetSubTUCBFs(CUData& cu, TextType ttype, uint32_t 
trDepth, uint32_t absPartIdx) +{ + uint32_t depth = cu.m_cuDepth[0]; + uint32_t fullDepth = depth + trDepth; + uint32_t log2TrSize = g_maxLog2CUSize - fullDepth; + + uint32_t trDepthC = trDepth; + if (log2TrSize == 2) + { + X265_CHECK(m_csp != X265_CSP_I444 && trDepthC, "trDepthC invalid\n"); + trDepthC--; + } + + uint32_t partIdxesPerSubTU = (NUM_CU_PARTITIONS >> ((depth + trDepthC) << 1)) >> 1; + + // move the CBFs down a level and set the parent CBF + uint8_t subTUCBF[2]; + uint8_t combinedSubTUCBF = 0; + + for (uint32_t subTU = 0; subTU < 2; subTU++) + { + const uint32_t subTUAbsPartIdx = absPartIdx + (subTU * partIdxesPerSubTU); + + subTUCBF[subTU] = cu.getCbf(subTUAbsPartIdx, ttype, trDepth); + combinedSubTUCBF |= subTUCBF[subTU]; + } + + for (uint32_t subTU = 0; subTU < 2; subTU++) + { + const uint32_t subTUAbsPartIdx = absPartIdx + (subTU * partIdxesPerSubTU); + const uint8_t compositeCBF = (subTUCBF[subTU] << 1) | combinedSubTUCBF; + + cu.setCbfPartRange((compositeCBF << trDepth), ttype, subTUAbsPartIdx, partIdxesPerSubTU); + } +} + +/* returns distortion */ +uint32_t Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, uint32_t& psyEnergy) +{ + CUData& cu = mode.cu; + uint32_t fullDepth = cu.m_cuDepth[0] + trDepth; + uint32_t tuDepthL = cu.m_tuDepth[absPartIdx]; + + if (tuDepthL > trDepth) + { + uint32_t qPartsDiv = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1); + uint32_t outDist = 0, splitCbfU = 0, splitCbfV = 0; + for (uint32_t subPartIdx = 0, absPartIdxSub = absPartIdx; subPartIdx < 4; subPartIdx++, absPartIdxSub += qPartsDiv) + { + outDist += codeIntraChromaQt(mode, cuGeom, trDepth + 1, absPartIdxSub, psyEnergy); + splitCbfU |= cu.getCbf(absPartIdxSub, TEXT_CHROMA_U, trDepth + 1); + splitCbfV |= cu.getCbf(absPartIdxSub, TEXT_CHROMA_V, trDepth + 1); + } + for (uint32_t offs = 0; offs < 4 * qPartsDiv; offs++) + { + cu.m_cbf[TEXT_CHROMA_U][absPartIdx + offs] |= (splitCbfU << trDepth); + 
cu.m_cbf[TEXT_CHROMA_V][absPartIdx + offs] |= (splitCbfV << trDepth); + } + + return outDist; + } + + uint32_t log2TrSize = g_maxLog2CUSize - fullDepth; + uint32_t log2TrSizeC = log2TrSize - m_hChromaShift; + + uint32_t trDepthC = trDepth; + if (log2TrSizeC == 1) + { + X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && trDepth, "invalid trDepth\n"); + trDepthC--; + log2TrSizeC++; + uint32_t qpdiv = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1); + bool bFirstQ = ((absPartIdx & (qpdiv - 1)) == 0); + if (!bFirstQ) + return 0; + } + + if (m_bEnableRDOQ) + m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false); + + bool checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && log2TrSizeC <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0]; + checkTransformSkip &= !m_param->bEnableTSkipFast || (log2TrSize <= MAX_LOG2_TS_SIZE && cu.m_transformSkip[TEXT_LUMA][absPartIdx]); + if (checkTransformSkip) + return codeIntraChromaTSkip(mode, cuGeom, trDepth, trDepthC, absPartIdx, psyEnergy); + + uint32_t qtLayer = log2TrSize - 2; + uint32_t tuSize = 1 << log2TrSizeC; + uint32_t outDist = 0; + + uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1); + const SplitType splitType = (m_csp == X265_CSP_I422) ? 
VERTICAL_SPLIT : DONT_SPLIT; + + for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++) + { + TextType ttype = (TextType)chromaId; + + TURecurse tuIterator(splitType, curPartNum, absPartIdx); + do + { + uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU; + + pixel* fenc = const_cast(mode.fencYuv)->getChromaAddr(chromaId, absPartIdxC); + pixel* pred = mode.predYuv.getChromaAddr(chromaId, absPartIdxC); + int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getChromaAddr(chromaId, absPartIdxC); + uint32_t stride = mode.fencYuv->m_csize; + uint32_t sizeIdxC = log2TrSizeC - 2; + + uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift)); + coeff_t* coeffC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC; + pixel* reconQt = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC); + uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_csize; + + pixel* picReconC = m_frame->m_reconPicYuv->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC); + intptr_t picStride = m_frame->m_reconPicYuv->m_strideC; + + // init availability pattern + initAdiPatternChroma(cu, cuGeom, absPartIdxC, trDepthC, chromaId); + pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize); + + uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC]; + if (chromaPredMode == DM_CHROMA_IDX) + chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? 
absPartIdxC : 0]; + if (m_csp == X265_CSP_I422) + chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode]; + + // get prediction signal + predIntraChromaAng(chromaPred, chromaPredMode, pred, stride, log2TrSizeC, m_csp); + + cu.setTransformSkipPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep); + + primitives.calcresidual[sizeIdxC](fenc, pred, residual, stride); + uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false); + uint32_t tmpDist; + if (numSig) + { + m_quant.invtransformNxN(cu.m_tqBypass[0], residual, stride, coeffC, log2TrSizeC, ttype, true, false, numSig); + primitives.luma_add_ps[sizeIdxC](reconQt, reconQtStride, pred, residual, stride, stride); + cu.setCbfPartRange(1 << trDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep); + } + else + { + // no coded residual, recon = pred + primitives.square_copy_pp[sizeIdxC](reconQt, reconQtStride, pred, stride); + cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep); + } + + tmpDist = primitives.sse_pp[sizeIdxC](reconQt, reconQtStride, fenc, stride); + outDist += (ttype == TEXT_CHROMA_U) ? 
m_rdCost.scaleChromaDistCb(tmpDist) : m_rdCost.scaleChromaDistCr(tmpDist); + + if (m_rdCost.m_psyRd) + psyEnergy += m_rdCost.psyCost(sizeIdxC, fenc, stride, picReconC, picStride); + + primitives.square_copy_pp[sizeIdxC](picReconC, picStride, reconQt, reconQtStride); + } + while (tuIterator.isNextSection()); + + if (splitType == VERTICAL_SPLIT) + offsetSubTUCBFs(cu, ttype, trDepth, absPartIdx); + } + + return outDist; +} + +/* returns distortion */ +uint32_t Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t trDepthC, uint32_t absPartIdx, uint32_t& psyEnergy) +{ + CUData& cu = mode.cu; + uint32_t fullDepth = cu.m_cuDepth[0] + trDepth; + uint32_t log2TrSize = g_maxLog2CUSize - fullDepth; + uint32_t log2TrSizeC = 2; + uint32_t tuSize = 4; + uint32_t qtLayer = log2TrSize - 2; + uint32_t outDist = 0; + + /* At the TU layers above this one, no RDO is performed, only distortion is being measured, + * so the entropy coder is not very accurate. The best we can do is return it in the same + * condition as it arrived, and to do all bit estimates from the same state. */ + m_entropyCoder.store(m_rqt[fullDepth].rqtRoot); + + ALIGN_VAR_32(coeff_t, tskipCoeffC[MAX_TS_SIZE * MAX_TS_SIZE]); + ALIGN_VAR_32(pixel, tskipReconC[MAX_TS_SIZE * MAX_TS_SIZE]); + + uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1); + const SplitType splitType = (m_csp == X265_CSP_I422) ? 
VERTICAL_SPLIT : DONT_SPLIT; + + for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++) + { + TextType ttype = (TextType)chromaId; + + TURecurse tuIterator(splitType, curPartNum, absPartIdx); + do + { + uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU; + + pixel* fenc = const_cast(mode.fencYuv)->getChromaAddr(chromaId, absPartIdxC); + pixel* pred = mode.predYuv.getChromaAddr(chromaId, absPartIdxC); + int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getChromaAddr(chromaId, absPartIdxC); + uint32_t stride = mode.fencYuv->m_csize; + uint32_t sizeIdxC = log2TrSizeC - 2; + + uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift)); + coeff_t* coeffC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC; + pixel* reconQt = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC); + uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_csize; + + // init availability pattern + initAdiPatternChroma(cu, cuGeom, absPartIdxC, trDepthC, chromaId); + pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize); + + uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC]; + if (chromaPredMode == DM_CHROMA_IDX) + chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0]; + if (m_csp == X265_CSP_I422) + chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode]; + + // get prediction signal + predIntraChromaAng(chromaPred, chromaPredMode, pred, stride, log2TrSizeC, m_csp); + + uint64_t bCost = MAX_INT64; + uint32_t bDist = 0; + uint32_t bCbf = 0; + uint32_t bEnergy = 0; + int bTSkip = 0; + + int checkTransformSkip = 1; + for (int useTSkip = 0; useTSkip <= checkTransformSkip; useTSkip++) + { + coeff_t* coeff = (useTSkip ? tskipCoeffC : coeffC); + pixel* recon = (useTSkip ? tskipReconC : reconQt); + uint32_t reconStride = (useTSkip ? 
MAX_TS_SIZE : reconQtStride); + + primitives.calcresidual[sizeIdxC](fenc, pred, residual, stride); + + uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSizeC, ttype, absPartIdxC, useTSkip); + if (numSig) + { + m_quant.invtransformNxN(cu.m_tqBypass[0], residual, stride, coeff, log2TrSizeC, ttype, true, useTSkip, numSig); + primitives.luma_add_ps[sizeIdxC](recon, reconStride, pred, residual, stride, stride); + cu.setCbfPartRange(1 << trDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep); + } + else if (useTSkip) + { + checkTransformSkip = 0; + break; + } + else + { + primitives.square_copy_pp[sizeIdxC](recon, reconStride, pred, stride); + cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep); + } + uint32_t tmpDist = primitives.sse_pp[sizeIdxC](recon, reconStride, fenc, stride); + tmpDist = (ttype == TEXT_CHROMA_U) ? m_rdCost.scaleChromaDistCb(tmpDist) : m_rdCost.scaleChromaDistCr(tmpDist); + + cu.setTransformSkipPartRange(useTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep); + + uint32_t tmpBits = 0, tmpEnergy = 0; + if (numSig) + { + m_entropyCoder.load(m_rqt[fullDepth].rqtRoot); + m_entropyCoder.resetBits(); + m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdxC, log2TrSizeC, (TextType)chromaId); + tmpBits = m_entropyCoder.getNumberOfWrittenBits(); + } + + uint64_t tmpCost; + if (m_rdCost.m_psyRd) + { + tmpEnergy = m_rdCost.psyCost(sizeIdxC, fenc, stride, reconQt, reconQtStride); + tmpCost = m_rdCost.calcPsyRdCost(tmpDist, tmpBits, tmpEnergy); + } + else + tmpCost = m_rdCost.calcRdCost(tmpDist, tmpBits); + + if (tmpCost < bCost) + { + bCost = tmpCost; + bDist = tmpDist; + bTSkip = useTSkip; + bCbf = !!numSig; + bEnergy = tmpEnergy; + } + } + + if (bTSkip) + { + memcpy(coeffC, tskipCoeffC, sizeof(coeff_t) << (log2TrSizeC * 2)); + primitives.square_copy_pp[sizeIdxC](reconQt, reconQtStride, tskipReconC, MAX_TS_SIZE); + } + + cu.setCbfPartRange(bCbf << trDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep); + 
cu.setTransformSkipPartRange(bTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep); + + pixel* reconPicC = m_frame->m_reconPicYuv->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC); + intptr_t picStride = m_frame->m_reconPicYuv->m_strideC; + primitives.square_copy_pp[sizeIdxC](reconPicC, picStride, reconQt, reconQtStride); + + outDist += bDist; + psyEnergy += bEnergy; + } + while (tuIterator.isNextSection()); + + if (splitType == VERTICAL_SPLIT) + offsetSubTUCBFs(cu, ttype, trDepth, absPartIdx); + } + + m_entropyCoder.load(m_rqt[fullDepth].rqtRoot); + return outDist; +} + +void Search::extractIntraResultChromaQT(CUData& cu, Yuv& reconYuv, uint32_t absPartIdx, uint32_t trDepth, bool tuQuad) +{ + uint32_t fullDepth = cu.m_cuDepth[0] + trDepth; + uint32_t tuDepthL = cu.m_tuDepth[absPartIdx]; + + if (tuDepthL == trDepth) + { + uint32_t log2TrSize = g_maxLog2CUSize - fullDepth; + uint32_t log2TrSizeC = log2TrSize - m_hChromaShift; + + if (tuQuad) + { + log2TrSizeC++; /* extract one 4x4 instead of 4 2x2 */ + trDepth--; /* also adjust the number of coeff read */ + } + + // copy transform coefficients + uint32_t numCoeffC = 1 << (log2TrSizeC * 2 + (m_csp == X265_CSP_I422)); + uint32_t coeffOffsetC = absPartIdx << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift)); + + uint32_t qtLayer = log2TrSize - 2; + coeff_t* coeffSrcU = m_rqt[qtLayer].coeffRQT[1] + coeffOffsetC; + coeff_t* coeffSrcV = m_rqt[qtLayer].coeffRQT[2] + coeffOffsetC; + coeff_t* coeffDstU = cu.m_trCoeff[1] + coeffOffsetC; + coeff_t* coeffDstV = cu.m_trCoeff[2] + coeffOffsetC; + memcpy(coeffDstU, coeffSrcU, sizeof(coeff_t) * numCoeffC); + memcpy(coeffDstV, coeffSrcV, sizeof(coeff_t) * numCoeffC); + + // copy reconstruction + m_rqt[qtLayer].reconQtYuv.copyPartToPartChroma(reconYuv, absPartIdx, log2TrSizeC + m_hChromaShift); + } + else + { + if (g_maxLog2CUSize - fullDepth - 1 == 2 && m_csp != X265_CSP_I444) + /* no such thing as chroma 2x2, so extract one 4x4 instead of 4 2x2 */ + 
extractIntraResultChromaQT(cu, reconYuv, absPartIdx, trDepth + 1, true); + else + { + uint32_t numQPart = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1); + for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++) + extractIntraResultChromaQT(cu, reconYuv, absPartIdx + subPartIdx * numQPart, trDepth + 1, false); + } + } +} + +void Search::residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx) +{ + CUData& cu = mode.cu; + uint32_t fullDepth = cu.m_cuDepth[0] + trDepth; + uint32_t tuDepthL = cu.m_tuDepth[absPartIdx]; + + if (tuDepthL == trDepth) + { + uint32_t log2TrSize = g_maxLog2CUSize - fullDepth; + uint32_t log2TrSizeC = log2TrSize - m_hChromaShift; + uint32_t trDepthC = trDepth; + if (log2TrSizeC == 1) + { + X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && trDepth > 0, "invalid trDepth\n"); + trDepthC--; + log2TrSizeC++; + uint32_t qpdiv = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1); + bool bFirstQ = ((absPartIdx & (qpdiv - 1)) == 0); + if (!bFirstQ) + return; + } + + ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv; + uint32_t tuSize = 1 << log2TrSizeC; + uint32_t stride = mode.fencYuv->m_csize; + const int sizeIdxC = log2TrSizeC - 2; + + uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1); + const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT; + + for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++) + { + TextType ttype = (TextType)chromaId; + + TURecurse tuIterator(splitType, curPartNum, absPartIdx); + do + { + uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU; + + pixel* fenc = const_cast(mode.fencYuv->getChromaAddr(chromaId, absPartIdxC)); + pixel* pred = mode.predYuv.getChromaAddr(chromaId, absPartIdxC); + int16_t* residual = resiYuv.getChromaAddr(chromaId, absPartIdxC); + pixel* recon = mode.reconYuv.getChromaAddr(chromaId, absPartIdxC); // TODO: needed? 
+ uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift)); + coeff_t* coeff = cu.m_trCoeff[ttype] + coeffOffsetC; + pixel* picReconC = m_frame->m_reconPicYuv->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC); + uint32_t picStride = m_frame->m_reconPicYuv->m_strideC; + + uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC]; + if (chromaPredMode == DM_CHROMA_IDX) + chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0]; + chromaPredMode = (m_csp == X265_CSP_I422) ? g_chroma422IntraAngleMappingTable[chromaPredMode] : chromaPredMode; + initAdiPatternChroma(cu, cuGeom, absPartIdxC, trDepthC, chromaId); + pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize); + + predIntraChromaAng(chromaPred, chromaPredMode, pred, stride, log2TrSizeC, m_csp); + + X265_CHECK(!cu.m_transformSkip[ttype][0], "transform skip not supported at low RD levels\n"); + + primitives.calcresidual[sizeIdxC](fenc, pred, residual, stride); + uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSizeC, ttype, absPartIdxC, false); + if (numSig) + { + m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], residual, stride, coeff, log2TrSizeC, ttype, true, false, numSig); + primitives.luma_add_ps[sizeIdxC](recon, stride, pred, residual, stride, stride); + primitives.square_copy_pp[sizeIdxC](picReconC, picStride, recon, stride); + cu.setCbfPartRange(1 << trDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep); + } + else + { + primitives.square_copy_pp[sizeIdxC](recon, stride, pred, stride); + primitives.square_copy_pp[sizeIdxC](picReconC, picStride, pred, stride); + cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep); + } + } + while (tuIterator.isNextSection()); + + if (splitType == VERTICAL_SPLIT) + offsetSubTUCBFs(cu, (TextType)chromaId, trDepth, absPartIdx); + } + } + else + { + uint32_t qPartsDiv = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1); + uint32_t splitCbfU = 
0, splitCbfV = 0; + for (uint32_t subPartIdx = 0, absPartIdxC = absPartIdx; subPartIdx < 4; subPartIdx++, absPartIdxC += qPartsDiv) + { + residualQTIntraChroma(mode, cuGeom, trDepth + 1, absPartIdxC); + splitCbfU |= cu.getCbf(absPartIdxC, TEXT_CHROMA_U, trDepth + 1); + splitCbfV |= cu.getCbf(absPartIdxC, TEXT_CHROMA_V, trDepth + 1); + } + for (uint32_t offs = 0; offs < 4 * qPartsDiv; offs++) + { + cu.m_cbf[1][absPartIdx + offs] |= (splitCbfU << trDepth); + cu.m_cbf[2][absPartIdx + offs] |= (splitCbfV << trDepth); + } + } +} + +void Search::checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSize, uint8_t* sharedModes) +{ + uint32_t depth = cuGeom.depth; + CUData& cu = intraMode.cu; + + cu.setPartSizeSubParts(partSize); + cu.setPredModeSubParts(MODE_INTRA); + + uint32_t tuDepthRange[2]; + cu.getIntraTUQtDepthRange(tuDepthRange, 0); + + intraMode.initCosts(); + intraMode.distortion += estIntraPredQT(intraMode, cuGeom, tuDepthRange, sharedModes); + intraMode.distortion += estIntraPredChromaQT(intraMode, cuGeom); + + m_entropyCoder.resetBits(); + if (m_slice->m_pps->bTransquantBypassEnabled) + m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]); + + if (!m_slice->isIntra()) + { + m_entropyCoder.codeSkipFlag(cu, 0); + m_entropyCoder.codePredMode(cu.m_predMode[0]); + } + + m_entropyCoder.codePartSize(cu, 0, depth); + m_entropyCoder.codePredInfo(cu, 0); + intraMode.mvBits = m_entropyCoder.getNumberOfWrittenBits(); + + bool bCodeDQP = m_slice->m_pps->bUseDQP; + m_entropyCoder.codeCoeff(cu, 0, depth, bCodeDQP, tuDepthRange); + m_entropyCoder.store(intraMode.contexts); + intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits(); + intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits; + if (m_rdCost.m_psyRd) + intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, intraMode.fencYuv->m_buf[0], intraMode.fencYuv->m_size, intraMode.reconYuv.m_buf[0], intraMode.reconYuv.m_size); + + updateModeCost(intraMode); +} + +uint32_t 
Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, uint32_t depthRange[2], uint8_t* sharedModes) +{ + CUData& cu = intraMode.cu; + Yuv* reconYuv = &intraMode.reconYuv; + Yuv* predYuv = &intraMode.predYuv; + const Yuv* fencYuv = intraMode.fencYuv; + + uint32_t depth = cu.m_cuDepth[0]; + uint32_t initTrDepth = cu.m_partSize[0] == SIZE_2Nx2N ? 0 : 1; + uint32_t numPU = 1 << (2 * initTrDepth); + uint32_t log2TrSize = cu.m_log2CUSize[0] - initTrDepth; + uint32_t tuSize = 1 << log2TrSize; + uint32_t qNumParts = cuGeom.numPartitions >> 2; + uint32_t sizeIdx = log2TrSize - 2; + uint32_t absPartIdx = 0; + uint32_t totalDistortion = 0; + + int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && !cu.m_tqBypass[0] && cu.m_partSize[absPartIdx] == SIZE_NxN; + + // loop over partitions + for (uint32_t pu = 0; pu < numPU; pu++, absPartIdx += qNumParts) + { + uint32_t bmode = 0; + + if (sharedModes) + bmode = sharedModes[pu]; + else + { + // Reference sample smoothing + initAdiPattern(cu, cuGeom, absPartIdx, initTrDepth, ALL_IDX); + + // determine set of modes to be tested (using prediction signal only) + pixel* fenc = const_cast(fencYuv->getLumaAddr(absPartIdx)); + uint32_t stride = predYuv->m_size; + + pixel *above = m_refAbove + tuSize - 1; + pixel *aboveFiltered = m_refAboveFlt + tuSize - 1; + pixel *left = m_refLeft + tuSize - 1; + pixel *leftFiltered = m_refLeftFlt + tuSize - 1; + + // 33 Angle modes once + ALIGN_VAR_32(pixel, buf_trans[32 * 32]); + ALIGN_VAR_32(pixel, tmp[33 * 32 * 32]); + ALIGN_VAR_32(pixel, bufScale[32 * 32]); + pixel _above[4 * 32 + 1]; + pixel _left[4 * 32 + 1]; + int scaleTuSize = tuSize; + int scaleStride = stride; + int costShift = 0; + + if (tuSize > 32) + { + pixel *aboveScale = _above + 2 * 32; + pixel *leftScale = _left + 2 * 32; + + // origin is 64x64, we scale to 32x32 and setup required parameters + primitives.scale2D_64to32(bufScale, fenc, stride); + fenc = bufScale; + + // reserve space in case primitives need to store 
data in above + // or left buffers + aboveScale[0] = leftScale[0] = above[0]; + primitives.scale1D_128to64(aboveScale + 1, above + 1, 0); + primitives.scale1D_128to64(leftScale + 1, left + 1, 0); + + scaleTuSize = 32; + scaleStride = 32; + costShift = 2; + sizeIdx = 5 - 2; // log2(scaleTuSize) - 2 + + // Filtered and Unfiltered refAbove and refLeft pointing to above and left. + above = aboveScale; + left = leftScale; + aboveFiltered = aboveScale; + leftFiltered = leftScale; + } + + m_entropyCoder.loadIntraDirModeLuma(m_rqt[depth].cur); + + /* there are three cost tiers for intra modes: + * pred[0] - mode probable, least cost + * pred[1], pred[2] - less probable, slightly more cost + * non-mpm modes - all cost the same (rbits) */ + uint64_t mpms; + uint32_t preds[3]; + uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, preds, mpms); + + pixelcmp_t sa8d = primitives.sa8d[sizeIdx]; + uint64_t modeCosts[35]; + uint64_t bcost; + + // DC + primitives.intra_pred[DC_IDX][sizeIdx](tmp, scaleStride, left, above, 0, (scaleTuSize <= 16)); + uint32_t bits = (mpms & ((uint64_t)1 << DC_IDX)) ? m_entropyCoder.bitsIntraModeMPM(preds, DC_IDX) : rbits; + uint32_t sad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift; + modeCosts[DC_IDX] = bcost = m_rdCost.calcRdSADCost(sad, bits); + + // PLANAR + pixel *abovePlanar = above; + pixel *leftPlanar = left; + if (tuSize >= 8 && tuSize <= 32) + { + abovePlanar = aboveFiltered; + leftPlanar = leftFiltered; + } + primitives.intra_pred[PLANAR_IDX][sizeIdx](tmp, scaleStride, leftPlanar, abovePlanar, 0, 0); + bits = (mpms & ((uint64_t)1 << PLANAR_IDX)) ? 
m_entropyCoder.bitsIntraModeMPM(preds, PLANAR_IDX) : rbits; + sad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift; + modeCosts[PLANAR_IDX] = m_rdCost.calcRdSADCost(sad, bits); + COPY1_IF_LT(bcost, modeCosts[PLANAR_IDX]); + + // angular predictions + primitives.intra_pred_allangs[sizeIdx](tmp, above, left, aboveFiltered, leftFiltered, (scaleTuSize <= 16)); + + primitives.transpose[sizeIdx](buf_trans, fenc, scaleStride); + for (int mode = 2; mode < 35; mode++) + { + bool modeHor = (mode < 18); + pixel *cmp = (modeHor ? buf_trans : fenc); + intptr_t srcStride = (modeHor ? scaleTuSize : scaleStride); + bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits; + sad = sa8d(cmp, srcStride, &tmp[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift; + modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits); + COPY1_IF_LT(bcost, modeCosts[mode]); + } + + /* Find the top maxCandCount candidate modes with cost within 25% of best + * or among the most probable modes. maxCandCount is derived from the + * rdLevel and depth. 
In general we want to try more modes at slower RD + * levels and at higher depths */ + uint64_t candCostList[MAX_RD_INTRA_MODES]; + uint32_t rdModeList[MAX_RD_INTRA_MODES]; + int maxCandCount = 2 + m_param->rdLevel + ((depth + initTrDepth) >> 1); + for (int i = 0; i < maxCandCount; i++) + candCostList[i] = MAX_INT64; + + uint64_t paddedBcost = bcost + (bcost >> 3); // 1.12% + for (int mode = 0; mode < 35; mode++) + if (modeCosts[mode] < paddedBcost || (mpms & ((uint64_t)1 << mode))) + updateCandList(mode, modeCosts[mode], maxCandCount, rdModeList, candCostList); + + /* measure best candidates using simple RDO (no TU splits) */ + bcost = MAX_INT64; + for (int i = 0; i < maxCandCount; i++) + { + if (candCostList[i] == MAX_INT64) + break; + m_entropyCoder.load(m_rqt[depth].cur); + cu.setLumaIntraDirSubParts(rdModeList[i], absPartIdx, depth + initTrDepth); + + Cost icosts; + if (checkTransformSkip) + codeIntraLumaTSkip(intraMode, cuGeom, initTrDepth, absPartIdx, icosts); + else + codeIntraLumaQT(intraMode, cuGeom, initTrDepth, absPartIdx, false, icosts, depthRange); + COPY2_IF_LT(bcost, icosts.rdcost, bmode, rdModeList[i]); + } + } + + /* remeasure best mode, allowing TU splits */ + cu.setLumaIntraDirSubParts(bmode, absPartIdx, depth + initTrDepth); + m_entropyCoder.load(m_rqt[depth].cur); + + Cost icosts; + if (checkTransformSkip) + codeIntraLumaTSkip(intraMode, cuGeom, initTrDepth, absPartIdx, icosts); + else + codeIntraLumaQT(intraMode, cuGeom, initTrDepth, absPartIdx, true, icosts, depthRange); + totalDistortion += icosts.distortion; + + extractIntraResultQT(cu, *reconYuv, initTrDepth, absPartIdx); + + // set reconstruction for next intra prediction blocks + if (pu != numPU - 1) + { + /* This has important implications for parallelism and RDO. It is writing intermediate results into the + * output recon picture, so it cannot proceed in parallel with anything else when doing INTRA_NXN. 
Also + * it is not updating m_rdContexts[depth].cur for the later PUs which I suspect is slightly wrong. I think + * that the contexts should be tracked through each PU */ + pixel* dst = m_frame->m_reconPicYuv->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx); + uint32_t dststride = m_frame->m_reconPicYuv->m_stride; + pixel* src = reconYuv->getLumaAddr(absPartIdx); + uint32_t srcstride = reconYuv->m_size; + primitives.square_copy_pp[log2TrSize - 2](dst, dststride, src, srcstride); + } + } + + if (numPU > 1) + { + uint32_t combCbfY = 0; + uint32_t partIdx = 0; + for (uint32_t part = 0; part < 4; part++, partIdx += qNumParts) + combCbfY |= cu.getCbf(partIdx, TEXT_LUMA, 1); + + for (uint32_t offs = 0; offs < 4 * qNumParts; offs++) + cu.m_cbf[0][offs] |= combCbfY; + } + + // TODO: remove this + m_entropyCoder.load(m_rqt[depth].cur); + + return totalDistortion; +} + +void Search::getBestIntraModeChroma(Mode& intraMode, const CUGeom& cuGeom) +{ + CUData& cu = intraMode.cu; + const Yuv* fencYuv = intraMode.fencYuv; + Yuv* predYuv = &intraMode.predYuv; + + uint32_t bestMode = 0; + uint64_t bestCost = MAX_INT64; + uint32_t modeList[NUM_CHROMA_MODE]; + + uint32_t log2TrSizeC = cu.m_log2CUSize[0] - m_hChromaShift; + uint32_t tuSize = 1 << log2TrSizeC; + int32_t scaleTuSize = tuSize; + int32_t costShift = 0; + + if (tuSize > 32) + { + scaleTuSize = 32; + costShift = 2; + log2TrSizeC = 5; + } + + Predict::initAdiPatternChroma(cu, cuGeom, 0, 0, 1); + Predict::initAdiPatternChroma(cu, cuGeom, 0, 0, 2); + cu.getAllowedChromaDir(0, modeList); + + // check chroma modes + for (uint32_t mode = 0; mode < NUM_CHROMA_MODE; mode++) + { + uint32_t chromaPredMode = modeList[mode]; + if (chromaPredMode == DM_CHROMA_IDX) + chromaPredMode = cu.m_lumaIntraDir[0]; + if (m_csp == X265_CSP_I422) + chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode]; + + uint64_t cost = 0; + for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++) + { + pixel* fenc = 
fencYuv->m_buf[chromaId]; + pixel* pred = predYuv->m_buf[chromaId]; + pixel* chromaPred = getAdiChromaBuf(chromaId, scaleTuSize); + + // get prediction signal + predIntraChromaAng(chromaPred, chromaPredMode, pred, fencYuv->m_csize, log2TrSizeC, m_csp); + cost += primitives.sa8d[log2TrSizeC - 2](fenc, predYuv->m_csize, pred, fencYuv->m_csize) << costShift; + } + + if (cost < bestCost) + { + bestCost = cost; + bestMode = modeList[mode]; + } + } + + cu.setChromIntraDirSubParts(bestMode, 0, cu.m_cuDepth[0]); +} + +uint32_t Search::estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom) +{ + CUData& cu = intraMode.cu; + Yuv& reconYuv = intraMode.reconYuv; + + uint32_t depth = cu.m_cuDepth[0]; + uint32_t initTrDepth = cu.m_partSize[0] == SIZE_NxN && m_csp == X265_CSP_I444; + uint32_t log2TrSize = cu.m_log2CUSize[0] - initTrDepth; + uint32_t absPartStep = (NUM_CU_PARTITIONS >> (depth << 1)); + uint32_t totalDistortion = 0; + + int part = partitionFromLog2Size(log2TrSize); + + TURecurse tuIterator((initTrDepth == 0) ? 
DONT_SPLIT : QUAD_SPLIT, absPartStep, 0); + + do + { + uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU; + int cuSize = 1 << cu.m_log2CUSize[absPartIdxC]; + + uint32_t bestMode = 0; + uint32_t bestDist = 0; + uint64_t bestCost = MAX_INT64; + + // init mode list + uint32_t minMode = 0; + uint32_t maxMode = NUM_CHROMA_MODE; + uint32_t modeList[NUM_CHROMA_MODE]; + + cu.getAllowedChromaDir(absPartIdxC, modeList); + + // check chroma modes + for (uint32_t mode = minMode; mode < maxMode; mode++) + { + // restore context models + m_entropyCoder.load(m_rqt[depth].cur); + + cu.setChromIntraDirSubParts(modeList[mode], absPartIdxC, depth + initTrDepth); + uint32_t psyEnergy = 0; + uint32_t dist = codeIntraChromaQt(intraMode, cuGeom, initTrDepth, absPartIdxC, psyEnergy); + + if (m_slice->m_pps->bTransformSkipEnabled) + m_entropyCoder.load(m_rqt[depth].cur); + + m_entropyCoder.resetBits(); + // chroma prediction mode + if (cu.m_partSize[0] == SIZE_2Nx2N || m_csp != X265_CSP_I444) + { + if (!absPartIdxC) + m_entropyCoder.codeIntraDirChroma(cu, absPartIdxC, modeList); + } + else + { + uint32_t qtNumParts = cuGeom.numPartitions >> 2; + if (!(absPartIdxC & (qtNumParts - 1))) + m_entropyCoder.codeIntraDirChroma(cu, absPartIdxC, modeList); + } + + codeSubdivCbfQTChroma(cu, initTrDepth, absPartIdxC, tuIterator.absPartIdxStep, cuSize, cuSize); + codeCoeffQTChroma(cu, initTrDepth, absPartIdxC, TEXT_CHROMA_U); + codeCoeffQTChroma(cu, initTrDepth, absPartIdxC, TEXT_CHROMA_V); + uint32_t bits = m_entropyCoder.getNumberOfWrittenBits(); + uint64_t cost = m_rdCost.m_psyRd ? 
m_rdCost.calcPsyRdCost(dist, bits, psyEnergy) : m_rdCost.calcRdCost(dist, bits); + + if (cost < bestCost) + { + bestCost = cost; + bestDist = dist; + bestMode = modeList[mode]; + extractIntraResultChromaQT(cu, reconYuv, absPartIdxC, initTrDepth, false); + memcpy(m_qtTempCbf[1], cu.m_cbf[1] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t)); + memcpy(m_qtTempCbf[2], cu.m_cbf[2] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t)); + memcpy(m_qtTempTransformSkipFlag[1], cu.m_transformSkip[1] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t)); + memcpy(m_qtTempTransformSkipFlag[2], cu.m_transformSkip[2] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t)); + } + } + + if (!tuIterator.isLastSection()) + { + uint32_t zorder = cuGeom.encodeIdx + absPartIdxC; + uint32_t dststride = m_frame->m_reconPicYuv->m_strideC; + pixel *src, *dst; + + dst = m_frame->m_reconPicYuv->getCbAddr(cu.m_cuAddr, zorder); + src = reconYuv.getCbAddr(absPartIdxC); + primitives.chroma[m_csp].copy_pp[part](dst, dststride, src, reconYuv.m_csize); + + dst = m_frame->m_reconPicYuv->getCrAddr(cu.m_cuAddr, zorder); + src = reconYuv.getCrAddr(absPartIdxC); + primitives.chroma[m_csp].copy_pp[part](dst, dststride, src, reconYuv.m_csize); + } + + memcpy(cu.m_cbf[1] + absPartIdxC, m_qtTempCbf[1], tuIterator.absPartIdxStep * sizeof(uint8_t)); + memcpy(cu.m_cbf[2] + absPartIdxC, m_qtTempCbf[2], tuIterator.absPartIdxStep * sizeof(uint8_t)); + memcpy(cu.m_transformSkip[1] + absPartIdxC, m_qtTempTransformSkipFlag[1], tuIterator.absPartIdxStep * sizeof(uint8_t)); + memcpy(cu.m_transformSkip[2] + absPartIdxC, m_qtTempTransformSkipFlag[2], tuIterator.absPartIdxStep * sizeof(uint8_t)); + cu.setChromIntraDirSubParts(bestMode, absPartIdxC, depth + initTrDepth); + totalDistortion += bestDist; + } + while (tuIterator.isNextSection()); + + if (initTrDepth != 0) + { + uint32_t combCbfU = 0; + uint32_t combCbfV = 0; + uint32_t partIdx = 0; + for (uint32_t p = 0; p < 4; p++, partIdx += 
tuIterator.absPartIdxStep) + { + combCbfU |= cu.getCbf(partIdx, TEXT_CHROMA_U, 1); + combCbfV |= cu.getCbf(partIdx, TEXT_CHROMA_V, 1); + } + + for (uint32_t offs = 0; offs < 4 * tuIterator.absPartIdxStep; offs++) + { + cu.m_cbf[1][offs] |= combCbfU; + cu.m_cbf[2][offs] |= combCbfV; + } + } + + /* TODO: remove this */ + m_entropyCoder.load(m_rqt[depth].cur); + return totalDistortion; +} + +/* estimation of best merge coding of an inter PU (not a merge CU) */ +uint32_t Search::mergeEstimation(CUData& cu, const CUGeom& cuGeom, int puIdx, MergeData& m) +{ + X265_CHECK(cu.m_partSize[0] != SIZE_2Nx2N, "merge tested on non-2Nx2N partition\n"); + + m.maxNumMergeCand = cu.getInterMergeCandidates(m.absPartIdx, puIdx, m.mvFieldNeighbours, m.interDirNeighbours); + + if (cu.isBipredRestriction()) + { + /* in 8x8 CUs do not allow bidir merge candidates if not 2Nx2N */ + for (uint32_t mergeCand = 0; mergeCand < m.maxNumMergeCand; ++mergeCand) + { + if (m.interDirNeighbours[mergeCand] == 3) + { + m.interDirNeighbours[mergeCand] = 1; + m.mvFieldNeighbours[mergeCand][1].refIdx = REF_NOT_VALID; + } + } + } + + Yuv& tempYuv = m_rqt[cuGeom.depth].tmpPredYuv; + + uint32_t outCost = MAX_UINT; + for (uint32_t mergeCand = 0; mergeCand < m.maxNumMergeCand; ++mergeCand) + { + /* Prevent TMVP candidates from using unavailable reference pixels */ + if (m_bFrameParallel && + (m.mvFieldNeighbours[mergeCand][0].mv.y >= (m_param->searchRange + 1) * 4 || + m.mvFieldNeighbours[mergeCand][1].mv.y >= (m_param->searchRange + 1) * 4)) + continue; + + cu.m_mv[0][m.absPartIdx] = m.mvFieldNeighbours[mergeCand][0].mv; + cu.m_refIdx[0][m.absPartIdx] = (char)m.mvFieldNeighbours[mergeCand][0].refIdx; + cu.m_mv[1][m.absPartIdx] = m.mvFieldNeighbours[mergeCand][1].mv; + cu.m_refIdx[1][m.absPartIdx] = (char)m.mvFieldNeighbours[mergeCand][1].refIdx; + + prepMotionCompensation(cu, cuGeom, puIdx); + motionCompensation(tempYuv, true, false); + uint32_t costCand = m_me.bufSATD(tempYuv.getLumaAddr(m.absPartIdx), 
tempYuv.m_size); + uint32_t bitsCand = getTUBits(mergeCand, m.maxNumMergeCand); + costCand = costCand + m_rdCost.getCost(bitsCand); + if (costCand < outCost) + { + outCost = costCand; + m.bits = bitsCand; + m.index = mergeCand; + } + } + + m.mvField[0] = m.mvFieldNeighbours[m.index][0]; + m.mvField[1] = m.mvFieldNeighbours[m.index][1]; + m.interDir = m.interDirNeighbours[m.index]; + + return outCost; +} + +/* this function assumes the caller has configured its MotionEstimation engine with the + * correct source plane and source PU, and has called prepMotionCompensation() to set + * m_puAbsPartIdx, m_puWidth, and m_puHeight */ +void Search::singleMotionEstimation(Search& master, const CUData& cu, const CUGeom& cuGeom, int part, int list, int ref) +{ + uint32_t bits = master.m_listSelBits[list] + MVP_IDX_BITS; + bits += getTUBits(ref, m_slice->m_numRefIdx[list]); + + MV amvpCand[AMVP_NUM_CANDS]; + MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 1]; + int numMvc = cu.fillMvpCand(part, m_puAbsPartIdx, list, ref, amvpCand, mvc); + + uint32_t bestCost = MAX_INT; + int mvpIdx = 0; + int merange = m_param->searchRange; + for (int i = 0; i < AMVP_NUM_CANDS; i++) + { + MV mvCand = amvpCand[i]; + + // NOTE: skip mvCand if Y is > merange and -FN>1 + if (m_bFrameParallel && (mvCand.y >= (merange + 1) * 4)) + continue; + + cu.clipMv(mvCand); + + Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv; + predInterLumaPixel(tmpPredYuv, *m_slice->m_refPicList[list][ref]->m_reconPicYuv, mvCand); + uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size); + + if (bestCost > cost) + { + bestCost = cost; + mvpIdx = i; + } + } + + MV mvmin, mvmax, outmv, mvp = amvpCand[mvpIdx]; + setSearchRange(cu, mvp, merange, mvmin, mvmax); + + int satdCost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, merange, outmv); + + /* Get total cost of partition, but only include MV bit cost once */ + bits += m_me.bitcost(outmv); + uint32_t cost = (satdCost - 
m_me.mvcost(outmv)) + m_rdCost.getCost(bits); + + /* Refine MVP selection, updates: mvp, mvpIdx, bits, cost */ + checkBestMVP(amvpCand, outmv, mvp, mvpIdx, bits, cost); + + /* tie goes to the smallest ref ID, just like --no-pme */ + ScopedLock _lock(master.m_outputLock); + if (cost < master.m_bestME[list].cost || + (cost == master.m_bestME[list].cost && ref < master.m_bestME[list].ref)) + { + master.m_bestME[list].mv = outmv; + master.m_bestME[list].mvp = mvp; + master.m_bestME[list].mvpIdx = mvpIdx; + master.m_bestME[list].ref = ref; + master.m_bestME[list].cost = cost; + master.m_bestME[list].bits = bits; + } +} + +/* search of the best candidate for inter prediction + * returns true if predYuv was filled with a motion compensated prediction */ +bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeOnly, bool bChroma) +{ + CUData& cu = interMode.cu; + Yuv* predYuv = &interMode.predYuv; + + MV amvpCand[2][MAX_NUM_REF][AMVP_NUM_CANDS]; + MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 1]; + + const Slice *slice = m_slice; + PicYuv* fencPic = m_frame->m_origPicYuv; + int numPart = cu.getNumPartInter(); + int numPredDir = slice->isInterP() ? 
1 : 2; + const int* numRefIdx = slice->m_numRefIdx; + uint32_t lastMode = 0; + int totalmebits = 0; + bool bDistributed = m_param->bDistributeMotionEstimation && (numRefIdx[0] + numRefIdx[1]) > 2; + MV mvzero(0, 0); + Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv; + + MergeData merge; + memset(&merge, 0, sizeof(merge)); + + for (int puIdx = 0; puIdx < numPart; puIdx++) + { + /* sets m_puAbsPartIdx, m_puWidth, m_puHeight */ + initMotionCompensation(cu, cuGeom, puIdx); + + pixel* pu = fencPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + m_puAbsPartIdx); + m_me.setSourcePU(pu - fencPic->m_picOrg[0], m_puWidth, m_puHeight); + + uint32_t mrgCost = MAX_UINT; + + /* find best cost merge candidate */ + if (cu.m_partSize[m_puAbsPartIdx] != SIZE_2Nx2N) + { + merge.absPartIdx = m_puAbsPartIdx; + merge.width = m_puWidth; + merge.height = m_puHeight; + mrgCost = mergeEstimation(cu, cuGeom, puIdx, merge); + + if (bMergeOnly && cu.m_log2CUSize[0] > 3) + { + if (mrgCost == MAX_UINT) + { + /* No valid merge modes were found, there is no possible way to + * perform a valid motion compensation prediction, so early-exit */ + return false; + } + // set merge result + cu.m_mergeFlag[m_puAbsPartIdx] = true; + cu.m_mvpIdx[0][m_puAbsPartIdx] = merge.index; // merge candidate ID is stored in L0 MVP idx + cu.setPUInterDir(merge.interDir, m_puAbsPartIdx, puIdx); + cu.setPUMv(0, merge.mvField[0].mv, m_puAbsPartIdx, puIdx); + cu.setPURefIdx(0, merge.mvField[0].refIdx, m_puAbsPartIdx, puIdx); + cu.setPUMv(1, merge.mvField[1].mv, m_puAbsPartIdx, puIdx); + cu.setPURefIdx(1, merge.mvField[1].refIdx, m_puAbsPartIdx, puIdx); + totalmebits += merge.bits; + + prepMotionCompensation(cu, cuGeom, puIdx); + motionCompensation(*predYuv, true, bChroma); + continue; + } + } + + MotionData bidir[2]; + uint32_t bidirCost = MAX_UINT; + int bidirBits = 0; + + m_bestME[0].cost = MAX_UINT; + m_bestME[1].cost = MAX_UINT; + + getBlkBits((PartSize)cu.m_partSize[0], slice->isInterP(), puIdx, lastMode, 
m_listSelBits); + + if (bDistributed) + { + m_curMECu = &cu; + m_curGeom = &cuGeom; + + /* this worker might already be enqueued for pmode, so other threads + * might be looking at the ME job counts at any time, do these sets + * in a safe order */ + m_curPart = puIdx; + m_totalNumME = 0; + m_numAcquiredME = 1; + m_numCompletedME = 0; + m_totalNumME = numRefIdx[0] + numRefIdx[1]; + + if (!m_bJobsQueued) + JobProvider::enqueue(); + + for (int i = 1; i < m_totalNumME; i++) + m_pool->pokeIdleThread(); + + while (m_totalNumME > m_numAcquiredME) + { + int id = ATOMIC_INC(&m_numAcquiredME); + if (m_totalNumME >= id) + { + id -= 1; + if (id < numRefIdx[0]) + singleMotionEstimation(*this, cu, cuGeom, puIdx, 0, id); + else + singleMotionEstimation(*this, cu, cuGeom, puIdx, 1, id - numRefIdx[0]); + + if (ATOMIC_INC(&m_numCompletedME) == m_totalNumME) + m_meCompletionEvent.trigger(); + } + } + if (!m_bJobsQueued) + JobProvider::dequeue(); + + /* we saved L0-0 for ourselves */ + singleMotionEstimation(*this, cu, cuGeom, puIdx, 0, 0); + if (ATOMIC_INC(&m_numCompletedME) == m_totalNumME) + m_meCompletionEvent.trigger(); + + m_meCompletionEvent.wait(); + } + else + { + // Uni-directional prediction + for (int l = 0; l < numPredDir; l++) + { + for (int ref = 0; ref < numRefIdx[l]; ref++) + { + uint32_t bits = m_listSelBits[l] + MVP_IDX_BITS; + bits += getTUBits(ref, numRefIdx[l]); + + int numMvc = cu.fillMvpCand(puIdx, m_puAbsPartIdx, l, ref, amvpCand[l][ref], mvc); + + // Pick the best possible MVP from AMVP candidates based on least residual + uint32_t bestCost = MAX_INT; + int mvpIdx = 0; + int merange = m_param->searchRange; + + for (int i = 0; i < AMVP_NUM_CANDS; i++) + { + MV mvCand = amvpCand[l][ref][i]; + + // NOTE: skip mvCand if Y is > merange and -FN>1 + if (m_bFrameParallel && (mvCand.y >= (merange + 1) * 4)) + continue; + + cu.clipMv(mvCand); + predInterLumaPixel(tmpPredYuv, *slice->m_refPicList[l][ref]->m_reconPicYuv, mvCand); + uint32_t cost = 
m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size); + + if (bestCost > cost) + { + bestCost = cost; + mvpIdx = i; + } + } + + MV mvmin, mvmax, outmv, mvp = amvpCand[l][ref][mvpIdx]; + + setSearchRange(cu, mvp, merange, mvmin, mvmax); + int satdCost = m_me.motionEstimate(&slice->m_mref[l][ref], mvmin, mvmax, mvp, numMvc, mvc, merange, outmv); + + /* Get total cost of partition, but only include MV bit cost once */ + bits += m_me.bitcost(outmv); + uint32_t cost = (satdCost - m_me.mvcost(outmv)) + m_rdCost.getCost(bits); + + /* Refine MVP selection, updates: mvp, mvpIdx, bits, cost */ + checkBestMVP(amvpCand[l][ref], outmv, mvp, mvpIdx, bits, cost); + + if (cost < m_bestME[l].cost) + { + m_bestME[l].mv = outmv; + m_bestME[l].mvp = mvp; + m_bestME[l].mvpIdx = mvpIdx; + m_bestME[l].ref = ref; + m_bestME[l].cost = cost; + m_bestME[l].bits = bits; + } + } + } + } + + /* Bi-directional prediction */ + if (slice->isInterB() && !cu.isBipredRestriction() && m_bestME[0].cost != MAX_UINT && m_bestME[1].cost != MAX_UINT) + { + bidir[0] = m_bestME[0]; + bidir[1] = m_bestME[1]; + + /* Generate reference subpels */ + PicYuv* refPic0 = slice->m_refPicList[0][m_bestME[0].ref]->m_reconPicYuv; + PicYuv* refPic1 = slice->m_refPicList[1][m_bestME[1].ref]->m_reconPicYuv; + Yuv* bidirYuv = m_rqt[cuGeom.depth].bidirPredYuv; + predInterLumaPixel(bidirYuv[0], *refPic0, m_bestME[0].mv); + predInterLumaPixel(bidirYuv[1], *refPic1, m_bestME[1].mv); + + pixel *pred0 = bidirYuv[0].getLumaAddr(m_puAbsPartIdx); + pixel *pred1 = bidirYuv[1].getLumaAddr(m_puAbsPartIdx); + + int partEnum = partitionFromSizes(m_puWidth, m_puHeight); + primitives.pixelavg_pp[partEnum](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, pred0, bidirYuv[0].m_size, pred1, bidirYuv[1].m_size, 32); + int satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size); + + bidirBits = m_bestME[0].bits + m_bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]); + bidirCost = satdCost + 
m_rdCost.getCost(bidirBits); + + bool bTryZero = m_bestME[0].mv.notZero() || m_bestME[1].mv.notZero(); + if (bTryZero) + { + /* Do not try zero MV if unidir motion predictors are beyond + * valid search area */ + MV mvmin, mvmax; + int merange = X265_MAX(m_param->sourceWidth, m_param->sourceHeight); + setSearchRange(cu, mvzero, merange, mvmin, mvmax); + mvmax.y += 2; // there is some pad for subpel refine + mvmin <<= 2; + mvmax <<= 2; + + bTryZero &= m_bestME[0].mvp.checkRange(mvmin, mvmax); + bTryZero &= m_bestME[1].mvp.checkRange(mvmin, mvmax); + } + if (bTryZero) + { + // coincident blocks of the two reference pictures + pixel *ref0 = slice->m_mref[0][m_bestME[0].ref].fpelPlane + (pu - fencPic->m_picOrg[0]); + pixel *ref1 = slice->m_mref[1][m_bestME[1].ref].fpelPlane + (pu - fencPic->m_picOrg[0]); + intptr_t refStride = slice->m_mref[0][0].lumaStride; + + primitives.pixelavg_pp[partEnum](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, ref0, refStride, ref1, refStride, 32); + satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size); + + MV mvp0 = m_bestME[0].mvp; + int mvpIdx0 = m_bestME[0].mvpIdx; + uint32_t bits0 = m_bestME[0].bits - m_me.bitcost(m_bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0); + + MV mvp1 = m_bestME[1].mvp; + int mvpIdx1 = m_bestME[1].mvpIdx; + uint32_t bits1 = m_bestME[1].bits - m_me.bitcost(m_bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1); + + uint32_t cost = satdCost + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1); + + if (bDistributed) + { + cu.fillMvpCand(puIdx, m_puAbsPartIdx, 0, m_bestME[0].ref, amvpCand[0][m_bestME[0].ref], mvc); + cu.fillMvpCand(puIdx, m_puAbsPartIdx, 1, m_bestME[1].ref, amvpCand[1][m_bestME[1].ref], mvc); + } + + /* refine MVP selection for zero mv, updates: mvp, mvpidx, bits, cost */ + checkBestMVP(amvpCand[0][m_bestME[0].ref], mvzero, mvp0, mvpIdx0, bits0, cost); + checkBestMVP(amvpCand[1][m_bestME[1].ref], mvzero, mvp1, mvpIdx1, bits1, cost); + + if (cost < bidirCost) + { + bidir[0].mv = mvzero; + 
bidir[1].mv = mvzero; + bidir[0].mvp = mvp0; + bidir[1].mvp = mvp1; + bidir[0].mvpIdx = mvpIdx0; + bidir[1].mvpIdx = mvpIdx1; + bidirCost = cost; + bidirBits = bits0 + bits1 + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]); + } + } + } + + /* select best option and store into CU */ + if (mrgCost < bidirCost && mrgCost < m_bestME[0].cost && mrgCost < m_bestME[1].cost) + { + cu.m_mergeFlag[m_puAbsPartIdx] = true; + cu.m_mvpIdx[0][m_puAbsPartIdx] = merge.index; // merge candidate ID is stored in L0 MVP idx + cu.setPUInterDir(merge.interDir, m_puAbsPartIdx, puIdx); + cu.setPUMv(0, merge.mvField[0].mv, m_puAbsPartIdx, puIdx); + cu.setPURefIdx(0, merge.mvField[0].refIdx, m_puAbsPartIdx, puIdx); + cu.setPUMv(1, merge.mvField[1].mv, m_puAbsPartIdx, puIdx); + cu.setPURefIdx(1, merge.mvField[1].refIdx, m_puAbsPartIdx, puIdx); + + totalmebits += merge.bits; + } + else if (bidirCost < m_bestME[0].cost && bidirCost < m_bestME[1].cost) + { + lastMode = 2; + + cu.m_mergeFlag[m_puAbsPartIdx] = false; + cu.setPUInterDir(3, m_puAbsPartIdx, puIdx); + cu.setPUMv(0, bidir[0].mv, m_puAbsPartIdx, puIdx); + cu.setPURefIdx(0, m_bestME[0].ref, m_puAbsPartIdx, puIdx); + cu.m_mvd[0][m_puAbsPartIdx] = bidir[0].mv - bidir[0].mvp; + cu.m_mvpIdx[0][m_puAbsPartIdx] = bidir[0].mvpIdx; + + cu.setPUMv(1, bidir[1].mv, m_puAbsPartIdx, puIdx); + cu.setPURefIdx(1, m_bestME[1].ref, m_puAbsPartIdx, puIdx); + cu.m_mvd[1][m_puAbsPartIdx] = bidir[1].mv - bidir[1].mvp; + cu.m_mvpIdx[1][m_puAbsPartIdx] = bidir[1].mvpIdx; + + totalmebits += bidirBits; + } + else if (m_bestME[0].cost <= m_bestME[1].cost) + { + lastMode = 0; + + cu.m_mergeFlag[m_puAbsPartIdx] = false; + cu.setPUInterDir(1, m_puAbsPartIdx, puIdx); + cu.setPUMv(0, m_bestME[0].mv, m_puAbsPartIdx, puIdx); + cu.setPURefIdx(0, m_bestME[0].ref, m_puAbsPartIdx, puIdx); + cu.m_mvd[0][m_puAbsPartIdx] = m_bestME[0].mv - m_bestME[0].mvp; + cu.m_mvpIdx[0][m_puAbsPartIdx] = m_bestME[0].mvpIdx; + + cu.setPURefIdx(1, REF_NOT_VALID, m_puAbsPartIdx, 
puIdx); + cu.setPUMv(1, mvzero, m_puAbsPartIdx, puIdx); + + totalmebits += m_bestME[0].bits; + } + else + { + lastMode = 1; + + cu.m_mergeFlag[m_puAbsPartIdx] = false; + cu.setPUInterDir(2, m_puAbsPartIdx, puIdx); + cu.setPUMv(1, m_bestME[1].mv, m_puAbsPartIdx, puIdx); + cu.setPURefIdx(1, m_bestME[1].ref, m_puAbsPartIdx, puIdx); + cu.m_mvd[1][m_puAbsPartIdx] = m_bestME[1].mv - m_bestME[1].mvp; + cu.m_mvpIdx[1][m_puAbsPartIdx] = m_bestME[1].mvpIdx; + + cu.setPURefIdx(0, REF_NOT_VALID, m_puAbsPartIdx, puIdx); + cu.setPUMv(0, mvzero, m_puAbsPartIdx, puIdx); + + totalmebits += m_bestME[1].bits; + } + + prepMotionCompensation(cu, cuGeom, puIdx); + motionCompensation(*predYuv, true, bChroma); + } + + interMode.sa8dBits += totalmebits; + return true; +} + +void Search::getBlkBits(PartSize cuMode, bool bPSlice, int partIdx, uint32_t lastMode, uint32_t blockBit[3]) +{ + if (cuMode == SIZE_2Nx2N) + { + blockBit[0] = (!bPSlice) ? 3 : 1; + blockBit[1] = 3; + blockBit[2] = 5; + } + else if (cuMode == SIZE_2NxN || cuMode == SIZE_2NxnU || cuMode == SIZE_2NxnD) + { + static const uint32_t listBits[2][3][3] = + { + { { 0, 0, 3 }, { 0, 0, 0 }, { 0, 0, 0 } }, + { { 5, 7, 7 }, { 7, 5, 7 }, { 9 - 3, 9 - 3, 9 - 3 } } + }; + if (bPSlice) + { + blockBit[0] = 3; + blockBit[1] = 0; + blockBit[2] = 0; + } + else + memcpy(blockBit, listBits[partIdx][lastMode], 3 * sizeof(uint32_t)); + } + else if (cuMode == SIZE_Nx2N || cuMode == SIZE_nLx2N || cuMode == SIZE_nRx2N) + { + static const uint32_t listBits[2][3][3] = + { + { { 0, 2, 3 }, { 0, 0, 0 }, { 0, 0, 0 } }, + { { 5, 7, 7 }, { 7 - 2, 7 - 2, 9 - 2 }, { 9 - 3, 9 - 3, 9 - 3 } } + }; + if (bPSlice) + { + blockBit[0] = 3; + blockBit[1] = 0; + blockBit[2] = 0; + } + else + memcpy(blockBit, listBits[partIdx][lastMode], 3 * sizeof(uint32_t)); + } + else if (cuMode == SIZE_NxN) + { + blockBit[0] = (!bPSlice) ? 
3 : 1; + blockBit[1] = 3; + blockBit[2] = 5; + } + else + { + X265_CHECK(0, "getBlkBits: unknown cuMode\n"); + } +} + +/* Check if using an alternative MVP would result in a smaller MVD + signal bits */ +void Search::checkBestMVP(MV* amvpCand, MV mv, MV& mvPred, int& outMvpIdx, uint32_t& outBits, uint32_t& outCost) const +{ + X265_CHECK(amvpCand[outMvpIdx] == mvPred, "checkBestMVP: unexpected mvPred\n"); + + int mvpIdx = !outMvpIdx; + MV mvp = amvpCand[mvpIdx]; + int diffBits = m_me.bitcost(mv, mvp) - m_me.bitcost(mv, mvPred); + if (diffBits < 0) + { + outMvpIdx = mvpIdx; + mvPred = mvp; + uint32_t origOutBits = outBits; + outBits = origOutBits + diffBits; + outCost = (outCost - m_rdCost.getCost(origOutBits)) + m_rdCost.getCost(outBits); + } +} + +void Search::setSearchRange(const CUData& cu, MV mvp, int merange, MV& mvmin, MV& mvmax) const +{ + cu.clipMv(mvp); + + MV dist((int16_t)merange << 2, (int16_t)merange << 2); + mvmin = mvp - dist; + mvmax = mvp + dist; + + cu.clipMv(mvmin); + cu.clipMv(mvmax); + + /* Clip search range to signaled maximum MV length. 
+ * We do not support this VUI field being changed from the default */ + const int maxMvLen = (1 << 15) - 1; + mvmin.x = X265_MAX(mvmin.x, -maxMvLen); + mvmin.y = X265_MAX(mvmin.y, -maxMvLen); + mvmax.x = X265_MIN(mvmax.x, maxMvLen); + mvmax.y = X265_MIN(mvmax.y, maxMvLen); + + mvmin >>= 2; + mvmax >>= 2; + + /* conditional clipping for frame parallelism */ + mvmin.y = X265_MIN(mvmin.y, (int16_t)m_refLagPixels); + mvmax.y = X265_MIN(mvmax.y, (int16_t)m_refLagPixels); +} + +/* Note: this function overwrites the RD cost variables of interMode, but leaves the sa8d cost unharmed */ +void Search::encodeResAndCalcRdSkipCU(Mode& interMode) +{ + CUData& cu = interMode.cu; + Yuv* reconYuv = &interMode.reconYuv; + const Yuv* fencYuv = interMode.fencYuv; + + X265_CHECK(!cu.isIntra(0), "intra CU not expected\n"); + + uint32_t cuSize = 1 << cu.m_log2CUSize[0]; + uint32_t depth = cu.m_cuDepth[0]; + + // No residual coding : SKIP mode + + cu.setSkipFlagSubParts(true); + cu.clearCbf(); + cu.setTUDepthSubParts(0, 0, depth); + + reconYuv->copyFromYuv(interMode.predYuv); + + // Luma + int part = partitionFromLog2Size(cu.m_log2CUSize[0]); + interMode.distortion = primitives.sse_pp[part](fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size); + // Chroma + part = partitionFromSizes(cuSize >> m_hChromaShift, cuSize >> m_vChromaShift); + interMode.distortion += m_rdCost.scaleChromaDistCb(primitives.sse_pp[part](fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize)); + interMode.distortion += m_rdCost.scaleChromaDistCr(primitives.sse_pp[part](fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize)); + + m_entropyCoder.load(m_rqt[depth].cur); + m_entropyCoder.resetBits(); + if (m_slice->m_pps->bTransquantBypassEnabled) + m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]); + m_entropyCoder.codeSkipFlag(cu, 0); + m_entropyCoder.codeMergeIndex(cu, 0); + + interMode.mvBits = m_entropyCoder.getNumberOfWrittenBits(); + 
interMode.coeffBits = 0; + interMode.totalBits = interMode.mvBits; + if (m_rdCost.m_psyRd) + interMode.psyEnergy = m_rdCost.psyCost(cu.m_log2CUSize[0] - 2, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size); + + updateModeCost(interMode); + m_entropyCoder.store(interMode.contexts); +} + +/* encode residual and calculate rate-distortion for a CU block. + * Note: this function overwrites the RD cost variables of interMode, but leaves the sa8d cost unharmed */ +void Search::encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom) +{ + CUData& cu = interMode.cu; + Yuv* reconYuv = &interMode.reconYuv; + Yuv* predYuv = &interMode.predYuv; + ShortYuv* resiYuv = &m_rqt[cuGeom.depth].tmpResiYuv; + const Yuv* fencYuv = interMode.fencYuv; + + X265_CHECK(!cu.isIntra(0), "intra CU not expected\n"); + + uint32_t log2CUSize = cu.m_log2CUSize[0]; + uint32_t cuSize = 1 << log2CUSize; + uint32_t depth = cu.m_cuDepth[0]; + + int part = partitionFromLog2Size(log2CUSize); + int cpart = partitionFromSizes(cuSize >> m_hChromaShift, cuSize >> m_vChromaShift); + + m_quant.setQPforQuant(interMode.cu); + + resiYuv->subtract(*fencYuv, *predYuv, log2CUSize); + + uint32_t tuDepthRange[2]; + cu.getInterTUQtDepthRange(tuDepthRange, 0); + + m_entropyCoder.load(m_rqt[depth].cur); + + Cost costs; + estimateResidualQT(interMode, cuGeom, 0, depth, *resiYuv, costs, tuDepthRange); + + if (!cu.m_tqBypass[0]) + { + uint32_t cbf0Dist = primitives.sse_pp[part](fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size); + cbf0Dist += m_rdCost.scaleChromaDistCb(primitives.sse_pp[cpart](fencYuv->m_buf[1], predYuv->m_csize, predYuv->m_buf[1], predYuv->m_csize)); + cbf0Dist += m_rdCost.scaleChromaDistCr(primitives.sse_pp[cpart](fencYuv->m_buf[2], predYuv->m_csize, predYuv->m_buf[2], predYuv->m_csize)); + + /* Consider the RD cost of not signaling any residual */ + m_entropyCoder.load(m_rqt[depth].cur); + m_entropyCoder.resetBits(); + m_entropyCoder.codeQtRootCbfZero(); 
+ uint32_t cbf0Bits = m_entropyCoder.getNumberOfWrittenBits(); + + uint64_t cbf0Cost; + uint32_t cbf0Energy; + if (m_rdCost.m_psyRd) + { + cbf0Energy = m_rdCost.psyCost(log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size); + cbf0Cost = m_rdCost.calcPsyRdCost(cbf0Dist, cbf0Bits, cbf0Energy); + } + else + cbf0Cost = m_rdCost.calcRdCost(cbf0Dist, cbf0Bits); + + if (cbf0Cost < costs.rdcost) + { + cu.clearCbf(); + cu.setTUDepthSubParts(0, 0, depth); + } + } + + if (cu.getQtRootCbf(0)) + saveResidualQTData(cu, *resiYuv, 0, depth); + + /* calculate signal bits for inter/merge/skip coded CU */ + m_entropyCoder.load(m_rqt[depth].cur); + + uint32_t coeffBits, bits; + if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N && !cu.getQtRootCbf(0)) + { + cu.setSkipFlagSubParts(true); + + /* Merge/Skip */ + m_entropyCoder.resetBits(); + if (m_slice->m_pps->bTransquantBypassEnabled) + m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]); + m_entropyCoder.codeSkipFlag(cu, 0); + m_entropyCoder.codeMergeIndex(cu, 0); + coeffBits = 0; + bits = m_entropyCoder.getNumberOfWrittenBits(); + } + else + { + m_entropyCoder.resetBits(); + if (m_slice->m_pps->bTransquantBypassEnabled) + m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]); + m_entropyCoder.codeSkipFlag(cu, 0); + m_entropyCoder.codePredMode(cu.m_predMode[0]); + m_entropyCoder.codePartSize(cu, 0, cu.m_cuDepth[0]); + m_entropyCoder.codePredInfo(cu, 0); + uint32_t mvBits = m_entropyCoder.getNumberOfWrittenBits(); + + bool bCodeDQP = m_slice->m_pps->bUseDQP; + m_entropyCoder.codeCoeff(cu, 0, cu.m_cuDepth[0], bCodeDQP, tuDepthRange); + bits = m_entropyCoder.getNumberOfWrittenBits(); + + coeffBits = bits - mvBits; + } + + m_entropyCoder.store(interMode.contexts); + + if (cu.getQtRootCbf(0)) + reconYuv->addClip(*predYuv, *resiYuv, log2CUSize); + else + reconYuv->copyFromYuv(*predYuv); + + // update with clipped distortion and cost (qp estimation loop uses unclipped values) + uint32_t 
bestDist = primitives.sse_pp[part](fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size); + bestDist += m_rdCost.scaleChromaDistCb(primitives.sse_pp[cpart](fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize)); + bestDist += m_rdCost.scaleChromaDistCr(primitives.sse_pp[cpart](fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize)); + if (m_rdCost.m_psyRd) + interMode.psyEnergy = m_rdCost.psyCost(log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size); + + interMode.totalBits = bits; + interMode.distortion = bestDist; + interMode.coeffBits = coeffBits; + interMode.mvBits = bits - coeffBits; + updateModeCost(interMode); +} + +void Search::generateCoeffRecon(Mode& mode, const CUGeom& cuGeom) +{ + CUData& cu = mode.cu; + + m_quant.setQPforQuant(mode.cu); + + if (cu.m_predMode[0] == MODE_INTER) + { + uint32_t tuDepthRange[2]; + cu.getInterTUQtDepthRange(tuDepthRange, 0); + + residualTransformQuantInter(mode, cuGeom, 0, cu.m_cuDepth[0], tuDepthRange); + if (cu.getQtRootCbf(0)) + mode.reconYuv.addClip(mode.predYuv, m_rqt[cuGeom.depth].tmpResiYuv, cu.m_log2CUSize[0]); + else + { + mode.reconYuv.copyFromYuv(mode.predYuv); + if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N) + cu.setSkipFlagSubParts(true); + } + } + else if (cu.m_predMode[0] == MODE_INTRA) + { + uint32_t tuDepthRange[2]; + cu.getIntraTUQtDepthRange(tuDepthRange, 0); + + uint32_t initTrDepth = cu.m_partSize[0] == SIZE_NxN; + residualTransformQuantIntra(mode, cuGeom, initTrDepth, 0, tuDepthRange); + getBestIntraModeChroma(mode, cuGeom); + residualQTIntraChroma(mode, cuGeom, 0, 0); + mode.reconYuv.copyFromPicYuv(*m_frame->m_reconPicYuv, cu.m_cuAddr, cuGeom.encodeIdx); // TODO: + } +} + +void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, uint32_t depthRange[2]) +{ + CUData& cu = mode.cu; + X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], 
"invalid depth\n"); + + uint32_t log2TrSize = g_maxLog2CUSize - depth; + uint32_t tuDepth = depth - cu.m_cuDepth[0]; + + bool bCheckFull = log2TrSize <= depthRange[1]; + if (cu.m_partSize[absPartIdx] != SIZE_2Nx2N && depth == cu.m_cuDepth[absPartIdx] && log2TrSize > depthRange[0]) + bCheckFull = false; + + if (bCheckFull) + { + // code full block + uint32_t log2TrSizeC = log2TrSize - m_hChromaShift; + bool bCodeChroma = true; + uint32_t tuDepthC = tuDepth; + if (log2TrSizeC == 1) + { + X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444, "tuQuad check failed\n"); + log2TrSizeC++; + tuDepthC--; + uint32_t qpdiv = NUM_CU_PARTITIONS >> ((depth - 1) << 1); + bCodeChroma = ((absPartIdx & (qpdiv - 1)) == 0); + } + + uint32_t absPartIdxStep = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1); + uint32_t setCbf = 1 << tuDepth; + + uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2); + coeff_t *coeffCurY = cu.m_trCoeff[0] + coeffOffsetY; + + uint32_t sizeIdx = log2TrSize - 2; + + cu.setTUDepthSubParts(depth - cu.m_cuDepth[0], absPartIdx, depth); + cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth); + + ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv; + const Yuv* fencYuv = mode.fencYuv; + + int16_t *curResiY = resiYuv.getLumaAddr(absPartIdx); + uint32_t strideResiY = resiYuv.m_size; + + pixel *fenc = const_cast(fencYuv->getLumaAddr(absPartIdx)); + uint32_t numSigY = m_quant.transformNxN(cu, fenc, fencYuv->m_size, curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, absPartIdx, false); + + if (numSigY) + { + m_quant.invtransformNxN(cu.m_tqBypass[absPartIdx], curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, false, false, numSigY); + cu.setCbfSubParts(setCbf, TEXT_LUMA, absPartIdx, depth); + } + else + { + primitives.blockfill_s[sizeIdx](curResiY, strideResiY, 0); + cu.setCbfSubParts(0, TEXT_LUMA, absPartIdx, depth); + } + + if (bCodeChroma) + { + uint32_t sizeIdxC = log2TrSizeC - 2; + uint32_t strideResiC = resiYuv.m_csize; + + uint32_t 
coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift); + coeff_t *coeffCurU = cu.m_trCoeff[1] + coeffOffsetC; + coeff_t *coeffCurV = cu.m_trCoeff[2] + coeffOffsetC; + bool splitIntoSubTUs = (m_csp == X265_CSP_I422); + + TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx); + do + { + uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU; + uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2); + + cu.setTransformSkipPartRange(0, TEXT_CHROMA_U, absPartIdxC, tuIterator.absPartIdxStep); + cu.setTransformSkipPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.absPartIdxStep); + + int16_t* curResiU = resiYuv.getCbAddr(absPartIdxC); + pixel* fencCb = const_cast(fencYuv->getCbAddr(absPartIdxC)); + uint32_t numSigU = m_quant.transformNxN(cu, fencCb, fencYuv->m_csize, curResiU, strideResiC, coeffCurU + subTUOffset, log2TrSizeC, TEXT_CHROMA_U, absPartIdxC, false); + if (numSigU) + { + m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], curResiU, strideResiC, coeffCurU + subTUOffset, log2TrSizeC, TEXT_CHROMA_U, false, false, numSigU); + cu.setCbfPartRange(setCbf, TEXT_CHROMA_U, absPartIdxC, tuIterator.absPartIdxStep); + } + else + { + primitives.blockfill_s[sizeIdxC](curResiU, strideResiC, 0); + cu.setCbfPartRange(0, TEXT_CHROMA_U, absPartIdxC, tuIterator.absPartIdxStep); + } + + int16_t* curResiV = resiYuv.getCrAddr(absPartIdxC); + pixel* fencCr = const_cast(fencYuv->getCrAddr(absPartIdxC)); + uint32_t numSigV = m_quant.transformNxN(cu, fencCr, fencYuv->m_csize, curResiV, strideResiC, coeffCurV + subTUOffset, log2TrSizeC, TEXT_CHROMA_V, absPartIdxC, false); + if (numSigV) + { + m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], curResiV, strideResiC, coeffCurV + subTUOffset, log2TrSizeC, TEXT_CHROMA_V, false, false, numSigV); + cu.setCbfPartRange(setCbf, TEXT_CHROMA_V, absPartIdxC, tuIterator.absPartIdxStep); + } + else + { + primitives.blockfill_s[sizeIdxC](curResiV, strideResiC, 0); + cu.setCbfPartRange(0, 
TEXT_CHROMA_V, absPartIdxC, tuIterator.absPartIdxStep); + } + } + while (tuIterator.isNextSection()); + + if (splitIntoSubTUs) + { + offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx); + offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx); + } + } + } + else + { + X265_CHECK(log2TrSize > depthRange[0], "residualTransformQuantInter recursion check failure\n"); + + const uint32_t qPartNumSubdiv = NUM_CU_PARTITIONS >> ((depth + 1) << 1); + uint32_t ycbf = 0, ucbf = 0, vcbf = 0; + for (uint32_t i = 0; i < 4; i++) + { + residualTransformQuantInter(mode, cuGeom, absPartIdx + i * qPartNumSubdiv, depth + 1, depthRange); + ycbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_LUMA, tuDepth + 1); + ucbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_CHROMA_U, tuDepth + 1); + vcbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_CHROMA_V, tuDepth + 1); + } + for (uint32_t i = 0; i < 4 * qPartNumSubdiv; i++) + { + cu.m_cbf[TEXT_LUMA][absPartIdx + i] |= ycbf << tuDepth; + cu.m_cbf[TEXT_CHROMA_U][absPartIdx + i] |= ucbf << tuDepth; + cu.m_cbf[TEXT_CHROMA_V][absPartIdx + i] |= vcbf << tuDepth; + } + } +} + +void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, ShortYuv& resiYuv, Cost& outCosts, uint32_t depthRange[2]) +{ + CUData& cu = mode.cu; + uint32_t log2TrSize = g_maxLog2CUSize - depth; + + bool bCheckSplit = log2TrSize > depthRange[0]; + bool bCheckFull = log2TrSize <= depthRange[1]; + + if (cu.m_partSize[absPartIdx] != SIZE_2Nx2N && depth == cu.m_cuDepth[absPartIdx] && bCheckSplit) + bCheckFull = false; + + X265_CHECK(bCheckFull || bCheckSplit, "check-full or check-split must be set\n"); + X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "depth not matching\n"); + + uint32_t tuDepth = depth - cu.m_cuDepth[0]; + uint32_t log2TrSizeC = log2TrSize - m_hChromaShift; + bool bCodeChroma = true; + uint32_t tuDepthC = tuDepth; + if ((log2TrSize == 2) && !(m_csp == X265_CSP_I444)) + { + log2TrSizeC++; + 
tuDepthC--; + uint32_t qpdiv = NUM_CU_PARTITIONS >> ((depth - 1) << 1); + bCodeChroma = ((absPartIdx & (qpdiv - 1)) == 0); + } + + // code full block + Cost fullCost; + fullCost.rdcost = MAX_INT64; + + uint8_t cbfFlag[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, {0, 0}, {0, 0} }; + uint32_t numSig[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, {0, 0}, {0, 0} }; + uint32_t singleBitsComp[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } }; + uint32_t singleDistComp[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } }; + uint32_t singlePsyEnergyComp[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } }; + uint32_t bestTransformMode[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } }; + uint64_t minCost[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { MAX_INT64, MAX_INT64 }, {MAX_INT64, MAX_INT64}, {MAX_INT64, MAX_INT64} }; + + m_entropyCoder.store(m_rqt[depth].rqtRoot); + + uint32_t trSize = 1 << log2TrSize; + const bool splitIntoSubTUs = (m_csp == X265_CSP_I422); + uint32_t absPartIdxStep = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1); + const Yuv* fencYuv = mode.fencYuv; + + // code full block + if (bCheckFull) + { + uint32_t trSizeC = 1 << log2TrSizeC; + int partSize = partitionFromLog2Size(log2TrSize); + int partSizeC = partitionFromLog2Size(log2TrSizeC); + const uint32_t qtLayer = log2TrSize - 2; + uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2); + coeff_t* coeffCurY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY; + + bool checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && !cu.m_tqBypass[0]; + bool 
checkTransformSkipY = checkTransformSkip && log2TrSize <= MAX_LOG2_TS_SIZE; + bool checkTransformSkipC = checkTransformSkip && log2TrSizeC <= MAX_LOG2_TS_SIZE; + + cu.setTUDepthSubParts(depth - cu.m_cuDepth[0], absPartIdx, depth); + cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth); + + if (m_bEnableRDOQ) + m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true); + + pixel *fenc = const_cast(fencYuv->getLumaAddr(absPartIdx)); + int16_t *resi = resiYuv.getLumaAddr(absPartIdx); + numSig[TEXT_LUMA][0] = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, coeffCurY, log2TrSize, TEXT_LUMA, absPartIdx, false); + cbfFlag[TEXT_LUMA][0] = !!numSig[TEXT_LUMA][0]; + + m_entropyCoder.resetBits(); + m_entropyCoder.codeQtCbf(cbfFlag[TEXT_LUMA][0], TEXT_LUMA, tuDepth); + if (cbfFlag[TEXT_LUMA][0]) + m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA); + singleBitsComp[TEXT_LUMA][0] = m_entropyCoder.getNumberOfWrittenBits(); + + uint32_t singleBitsPrev = singleBitsComp[TEXT_LUMA][0]; + + if (bCodeChroma) + { + uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift); + for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++) + { + coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC; + TURecurse tuIterator(splitIntoSubTUs ? 
VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx); + + do + { + uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU; + uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2); + + cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep); + + if (m_bEnableRDOQ && (chromaId != TEXT_CHROMA_V)) + m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false); + + fenc = const_cast(fencYuv->getChromaAddr(chromaId, absPartIdxC)); + resi = resiYuv.getChromaAddr(chromaId, absPartIdxC); + numSig[chromaId][tuIterator.section] = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, coeffCurC + subTUOffset, log2TrSizeC, (TextType)chromaId, absPartIdxC, false); + cbfFlag[chromaId][tuIterator.section] = !!numSig[chromaId][tuIterator.section]; + + m_entropyCoder.codeQtCbf(cbfFlag[chromaId][tuIterator.section], (TextType)chromaId, tuDepth); + if (cbfFlag[chromaId][tuIterator.section]) + m_entropyCoder.codeCoeffNxN(cu, coeffCurC + subTUOffset, absPartIdxC, log2TrSizeC, (TextType)chromaId); + + uint32_t newBits = m_entropyCoder.getNumberOfWrittenBits(); + singleBitsComp[chromaId][tuIterator.section] = newBits - singleBitsPrev; + + singleBitsPrev = newBits; + } + while (tuIterator.isNextSection()); + } + } + + const uint32_t numCoeffY = 1 << (log2TrSize * 2); + const uint32_t numCoeffC = 1 << (log2TrSizeC * 2); + + X265_CHECK(log2TrSize <= 5, "log2TrSize is too large\n"); + uint32_t distY = primitives.ssd_s[partSize](resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size); + uint32_t psyEnergyY = 0; + if (m_rdCost.m_psyRd) + psyEnergyY = m_rdCost.psyCost(partSize, resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, (int16_t*)zeroShort, 0); + + int16_t *curResiY = m_rqt[qtLayer].resiQtYuv.getLumaAddr(absPartIdx); + uint32_t strideResiY = m_rqt[qtLayer].resiQtYuv.m_size; + + if (cbfFlag[TEXT_LUMA][0]) + { + m_quant.invtransformNxN(cu.m_tqBypass[absPartIdx], curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, false, 
false, numSig[TEXT_LUMA][0]); //this is for inter mode only + + const uint32_t nonZeroDistY = primitives.sse_ss[partSize](resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, curResiY, strideResiY); + uint32_t nonZeroPsyEnergyY = 0; + if (m_rdCost.m_psyRd) + nonZeroPsyEnergyY = m_rdCost.psyCost(partSize, resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, curResiY, strideResiY); + + if (cu.m_tqBypass[0]) + { + distY = nonZeroDistY; + psyEnergyY = nonZeroPsyEnergyY; + } + else + { + uint64_t singleCostY = 0; + if (m_rdCost.m_psyRd) + singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, singleBitsComp[TEXT_LUMA][0], nonZeroPsyEnergyY); + else + singleCostY = m_rdCost.calcRdCost(nonZeroDistY, singleBitsComp[TEXT_LUMA][0]); + m_entropyCoder.resetBits(); + m_entropyCoder.codeQtCbfZero(TEXT_LUMA, tuDepth); + const uint32_t nullBitsY = m_entropyCoder.getNumberOfWrittenBits(); + uint64_t nullCostY = 0; + if (m_rdCost.m_psyRd) + nullCostY = m_rdCost.calcPsyRdCost(distY, nullBitsY, psyEnergyY); + else + nullCostY = m_rdCost.calcRdCost(distY, nullBitsY); + if (nullCostY < singleCostY) + { + cbfFlag[TEXT_LUMA][0] = 0; +#if CHECKED_BUILD || _DEBUG + memset(coeffCurY, 0, sizeof(coeff_t) * numCoeffY); +#endif + if (checkTransformSkipY) + minCost[TEXT_LUMA][0] = nullCostY; + } + else + { + distY = nonZeroDistY; + psyEnergyY = nonZeroPsyEnergyY; + if (checkTransformSkipY) + minCost[TEXT_LUMA][0] = singleCostY; + } + } + } + else if (checkTransformSkipY) + { + m_entropyCoder.resetBits(); + m_entropyCoder.codeQtCbfZero(TEXT_LUMA, tuDepth); + const uint32_t nullBitsY = m_entropyCoder.getNumberOfWrittenBits(); + if (m_rdCost.m_psyRd) + minCost[TEXT_LUMA][0] = m_rdCost.calcPsyRdCost(distY, nullBitsY, psyEnergyY); + else + minCost[TEXT_LUMA][0] = m_rdCost.calcRdCost(distY, nullBitsY); + } + + singleDistComp[TEXT_LUMA][0] = distY; + singlePsyEnergyComp[TEXT_LUMA][0] = psyEnergyY; + if (!cbfFlag[TEXT_LUMA][0]) + primitives.blockfill_s[partSize](curResiY, strideResiY, 0); + 
cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth); + + if (bCodeChroma) + { + uint32_t strideResiC = m_rqt[qtLayer].resiQtYuv.m_csize; + uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift); + for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++) + { + uint32_t distC = 0, psyEnergyC = 0; + coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC; + TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx); + + do + { + uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU; + uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2); + + int16_t *curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC); + + distC = m_rdCost.scaleChromaDistCb(primitives.ssd_s[log2TrSizeC - 2](resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize)); + + if (cbfFlag[chromaId][tuIterator.section]) + { + m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], curResiC, strideResiC, coeffCurC + subTUOffset, + log2TrSizeC, (TextType)chromaId, false, false, numSig[chromaId][tuIterator.section]); + uint32_t dist = primitives.sse_ss[partSizeC](resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, curResiC, strideResiC); + const uint32_t nonZeroDistC = m_rdCost.scaleChromaDistCb(dist); + uint32_t nonZeroPsyEnergyC = 0; + if (m_rdCost.m_psyRd) + nonZeroPsyEnergyC = m_rdCost.psyCost(partSizeC, resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, curResiC, strideResiC); + + if (cu.m_tqBypass[0]) + { + distC = nonZeroDistC; + psyEnergyC = nonZeroPsyEnergyC; + } + else + { + uint64_t singleCostC = 0; + if (m_rdCost.m_psyRd) + singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, singleBitsComp[chromaId][tuIterator.section], nonZeroPsyEnergyC); + else + singleCostC = m_rdCost.calcRdCost(nonZeroDistC, singleBitsComp[chromaId][tuIterator.section]); + m_entropyCoder.resetBits(); + m_entropyCoder.codeQtCbfZero((TextType)chromaId, 
tuDepth); + const uint32_t nullBitsC = m_entropyCoder.getNumberOfWrittenBits(); + uint64_t nullCostC = 0; + if (m_rdCost.m_psyRd) + nullCostC = m_rdCost.calcPsyRdCost(distC, nullBitsC, psyEnergyC); + else + nullCostC = m_rdCost.calcRdCost(distC, nullBitsC); + if (nullCostC < singleCostC) + { + cbfFlag[chromaId][tuIterator.section] = 0; +#if CHECKED_BUILD || _DEBUG + memset(coeffCurC + subTUOffset, 0, sizeof(coeff_t) * numCoeffC); +#endif + if (checkTransformSkipC) + minCost[chromaId][tuIterator.section] = nullCostC; + } + else + { + distC = nonZeroDistC; + psyEnergyC = nonZeroPsyEnergyC; + if (checkTransformSkipC) + minCost[chromaId][tuIterator.section] = singleCostC; + } + } + } + else if (checkTransformSkipC) + { + m_entropyCoder.resetBits(); + m_entropyCoder.codeQtCbfZero((TextType)chromaId, tuDepthC); + const uint32_t nullBitsC = m_entropyCoder.getNumberOfWrittenBits(); + if (m_rdCost.m_psyRd) + minCost[chromaId][tuIterator.section] = m_rdCost.calcPsyRdCost(distC, nullBitsC, psyEnergyC); + else + minCost[chromaId][tuIterator.section] = m_rdCost.calcRdCost(distC, nullBitsC); + } + + singleDistComp[chromaId][tuIterator.section] = distC; + singlePsyEnergyComp[chromaId][tuIterator.section] = psyEnergyC; + + if (!cbfFlag[chromaId][tuIterator.section]) + primitives.blockfill_s[partSizeC](curResiC, strideResiC, 0); + + cu.setCbfPartRange(cbfFlag[chromaId][tuIterator.section] << tuDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep); + } + while (tuIterator.isNextSection()); + } + } + + if (checkTransformSkipY) + { + uint32_t nonZeroDistY = 0; + uint32_t nonZeroPsyEnergyY = 0; + uint64_t singleCostY = MAX_INT64; + + ALIGN_VAR_32(coeff_t, tsCoeffY[MAX_TS_SIZE * MAX_TS_SIZE]); + ALIGN_VAR_32(int16_t, tsResiY[MAX_TS_SIZE * MAX_TS_SIZE]); + + m_entropyCoder.load(m_rqt[depth].rqtRoot); + + cu.setTransformSkipSubParts(1, TEXT_LUMA, absPartIdx, depth); + + if (m_bEnableRDOQ) + m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true); + + fenc = 
const_cast(fencYuv->getLumaAddr(absPartIdx)); + resi = resiYuv.getLumaAddr(absPartIdx); + uint32_t numSigTSkipY = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, tsCoeffY, log2TrSize, TEXT_LUMA, absPartIdx, true); + + if (numSigTSkipY) + { + m_entropyCoder.resetBits(); + m_entropyCoder.codeQtCbf(!!numSigTSkipY, TEXT_LUMA, tuDepth); + m_entropyCoder.codeCoeffNxN(cu, tsCoeffY, absPartIdx, log2TrSize, TEXT_LUMA); + const uint32_t skipSingleBitsY = m_entropyCoder.getNumberOfWrittenBits(); + + m_quant.invtransformNxN(cu.m_tqBypass[absPartIdx], tsResiY, trSize, tsCoeffY, log2TrSize, TEXT_LUMA, false, true, numSigTSkipY); + + nonZeroDistY = primitives.sse_ss[partSize](resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, tsResiY, trSize); + + if (m_rdCost.m_psyRd) + { + nonZeroPsyEnergyY = m_rdCost.psyCost(partSize, resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, tsResiY, trSize); + singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, skipSingleBitsY, nonZeroPsyEnergyY); + } + else + singleCostY = m_rdCost.calcRdCost(nonZeroDistY, skipSingleBitsY); + } + + if (!numSigTSkipY || minCost[TEXT_LUMA][0] < singleCostY) + cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth); + else + { + singleDistComp[TEXT_LUMA][0] = nonZeroDistY; + singlePsyEnergyComp[TEXT_LUMA][0] = nonZeroPsyEnergyY; + cbfFlag[TEXT_LUMA][0] = !!numSigTSkipY; + bestTransformMode[TEXT_LUMA][0] = 1; + memcpy(coeffCurY, tsCoeffY, sizeof(coeff_t) * numCoeffY); + primitives.square_copy_ss[partSize](curResiY, strideResiY, tsResiY, trSize); + } + + cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth); + } + + if (bCodeChroma && checkTransformSkipC) + { + uint32_t nonZeroDistC = 0, nonZeroPsyEnergyC = 0; + uint64_t singleCostC = MAX_INT64; + uint32_t strideResiC = m_rqt[qtLayer].resiQtYuv.m_csize; + uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift); + + m_entropyCoder.load(m_rqt[depth].rqtRoot); + + for (uint32_t chromaId = TEXT_CHROMA_U; 
chromaId <= TEXT_CHROMA_V; chromaId++) + { + coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC; + TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx); + + do + { + uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU; + uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2); + + int16_t *curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC); + + ALIGN_VAR_32(coeff_t, tsCoeffC[MAX_TS_SIZE * MAX_TS_SIZE]); + ALIGN_VAR_32(int16_t, tsResiC[MAX_TS_SIZE * MAX_TS_SIZE]); + + cu.setTransformSkipPartRange(1, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep); + + if (m_bEnableRDOQ && (chromaId != TEXT_CHROMA_V)) + m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false); + + fenc = const_cast(fencYuv->getChromaAddr(chromaId, absPartIdxC)); + resi = resiYuv.getChromaAddr(chromaId, absPartIdxC); + uint32_t numSigTSkipC = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, tsCoeffC, log2TrSizeC, (TextType)chromaId, absPartIdxC, true); + + m_entropyCoder.resetBits(); + singleBitsComp[chromaId][tuIterator.section] = 0; + + if (numSigTSkipC) + { + m_entropyCoder.codeQtCbf(!!numSigTSkipC, (TextType)chromaId, tuDepth); + m_entropyCoder.codeCoeffNxN(cu, tsCoeffC, absPartIdxC, log2TrSizeC, (TextType)chromaId); + singleBitsComp[chromaId][tuIterator.section] = m_entropyCoder.getNumberOfWrittenBits(); + + m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], tsResiC, trSizeC, tsCoeffC, + log2TrSizeC, (TextType)chromaId, false, true, numSigTSkipC); + uint32_t dist = primitives.sse_ss[partSizeC](resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, tsResiC, trSizeC); + nonZeroDistC = m_rdCost.scaleChromaDistCb(dist); + if (m_rdCost.m_psyRd) + { + nonZeroPsyEnergyC = m_rdCost.psyCost(partSizeC, resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, tsResiC, trSizeC); + singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, 
singleBitsComp[chromaId][tuIterator.section], nonZeroPsyEnergyC); + } + else + singleCostC = m_rdCost.calcRdCost(nonZeroDistC, singleBitsComp[chromaId][tuIterator.section]); + } + + if (!numSigTSkipC || minCost[chromaId][tuIterator.section] < singleCostC) + cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep); + else + { + singleDistComp[chromaId][tuIterator.section] = nonZeroDistC; + singlePsyEnergyComp[chromaId][tuIterator.section] = nonZeroPsyEnergyC; + cbfFlag[chromaId][tuIterator.section] = !!numSigTSkipC; + bestTransformMode[chromaId][tuIterator.section] = 1; + memcpy(coeffCurC + subTUOffset, tsCoeffC, sizeof(coeff_t) * numCoeffC); + primitives.square_copy_ss[partSizeC](curResiC, strideResiC, tsResiC, trSizeC); + } + + cu.setCbfPartRange(cbfFlag[chromaId][tuIterator.section] << tuDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep); + } + while (tuIterator.isNextSection()); + } + } + + m_entropyCoder.load(m_rqt[depth].rqtRoot); + + m_entropyCoder.resetBits(); + + if (log2TrSize > depthRange[0]) + m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize); + + if (bCodeChroma) + { + for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++) + { + if (!splitIntoSubTUs) + m_entropyCoder.codeQtCbf(cbfFlag[chromaId][0], (TextType)chromaId, tuDepth); + else + { + offsetSubTUCBFs(cu, (TextType)chromaId, tuDepth, absPartIdx); + for (uint32_t subTU = 0; subTU < 2; subTU++) + m_entropyCoder.codeQtCbf(cbfFlag[chromaId][subTU], (TextType)chromaId, tuDepth); + } + } + } + + m_entropyCoder.codeQtCbf(cbfFlag[TEXT_LUMA][0], TEXT_LUMA, tuDepth); + if (cbfFlag[TEXT_LUMA][0]) + m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA); + + if (bCodeChroma) + { + uint32_t subTUSize = 1 << (log2TrSizeC * 2); + uint32_t partIdxesPerSubTU = absPartIdxStep >> 1; + uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift); + + for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= 
TEXT_CHROMA_V; chromaId++) + { + coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC; + if (!splitIntoSubTUs) + { + if (cbfFlag[chromaId][0]) + m_entropyCoder.codeCoeffNxN(cu, coeffCurC, absPartIdx, log2TrSizeC, (TextType)chromaId); + } + else + { + for (uint32_t subTU = 0; subTU < 2; subTU++) + { + if (cbfFlag[chromaId][subTU]) + m_entropyCoder.codeCoeffNxN(cu, coeffCurC + subTU * subTUSize, absPartIdx + subTU * partIdxesPerSubTU, log2TrSizeC, (TextType)chromaId); + } + } + } + } + + fullCost.distortion += singleDistComp[TEXT_LUMA][0]; + fullCost.energy += singlePsyEnergyComp[TEXT_LUMA][0];// need to check we need to add chroma also + for (uint32_t subTUIndex = 0; subTUIndex < 2; subTUIndex++) + { + fullCost.distortion += singleDistComp[TEXT_CHROMA_U][subTUIndex]; + fullCost.distortion += singleDistComp[TEXT_CHROMA_V][subTUIndex]; + } + + fullCost.bits = m_entropyCoder.getNumberOfWrittenBits(); + if (m_rdCost.m_psyRd) + fullCost.rdcost = m_rdCost.calcPsyRdCost(fullCost.distortion, fullCost.bits, fullCost.energy); + else + fullCost.rdcost = m_rdCost.calcRdCost(fullCost.distortion, fullCost.bits); + } + + // code sub-blocks + if (bCheckSplit) + { + if (bCheckFull) + { + m_entropyCoder.store(m_rqt[depth].rqtTest); + m_entropyCoder.load(m_rqt[depth].rqtRoot); + } + + Cost splitCost; + const uint32_t qPartNumSubdiv = NUM_CU_PARTITIONS >> ((depth + 1) << 1); + uint32_t ycbf = 0, ucbf = 0, vcbf = 0; + for (uint32_t i = 0; i < 4; ++i) + { + estimateResidualQT(mode, cuGeom, absPartIdx + i * qPartNumSubdiv, depth + 1, resiYuv, splitCost, depthRange); + ycbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_LUMA, tuDepth + 1); + ucbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_CHROMA_U, tuDepth + 1); + vcbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_CHROMA_V, tuDepth + 1); + } + for (uint32_t i = 0; i < 4 * qPartNumSubdiv; ++i) + { + cu.m_cbf[0][absPartIdx + i] |= ycbf << tuDepth; + cu.m_cbf[1][absPartIdx + i] |= ucbf << tuDepth; + 
cu.m_cbf[2][absPartIdx + i] |= vcbf << tuDepth; + } + + m_entropyCoder.load(m_rqt[depth].rqtRoot); + m_entropyCoder.resetBits(); + + encodeResidualQT(cu, absPartIdx, depth, true, TEXT_LUMA, depthRange); + encodeResidualQT(cu, absPartIdx, depth, false, TEXT_LUMA, depthRange); + encodeResidualQT(cu, absPartIdx, depth, false, TEXT_CHROMA_U, depthRange); + encodeResidualQT(cu, absPartIdx, depth, false, TEXT_CHROMA_V, depthRange); + + splitCost.bits = m_entropyCoder.getNumberOfWrittenBits(); + + if (m_rdCost.m_psyRd) + splitCost.rdcost = m_rdCost.calcPsyRdCost(splitCost.distortion, splitCost.bits, splitCost.energy); + else + splitCost.rdcost = m_rdCost.calcRdCost(splitCost.distortion, splitCost.bits); + + if (ycbf || ucbf || vcbf || !bCheckFull) + { + if (splitCost.rdcost < fullCost.rdcost) + { + outCosts.distortion += splitCost.distortion; + outCosts.rdcost += splitCost.rdcost; + outCosts.bits += splitCost.bits; + outCosts.energy += splitCost.energy; + return; + } + else + outCosts.energy += splitCost.energy; + } + + cu.setTransformSkipSubParts(bestTransformMode[TEXT_LUMA][0], TEXT_LUMA, absPartIdx, depth); + if (bCodeChroma) + { + const uint32_t numberOfSections = splitIntoSubTUs ? 2 : 1; + + uint32_t partIdxesPerSubTU = absPartIdxStep >> (splitIntoSubTUs ? 
1 : 0); + for (uint32_t subTUIndex = 0; subTUIndex < numberOfSections; subTUIndex++) + { + const uint32_t subTUPartIdx = absPartIdx + (subTUIndex * partIdxesPerSubTU); + + cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_U][subTUIndex], TEXT_CHROMA_U, subTUPartIdx, partIdxesPerSubTU); + cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_V][subTUIndex], TEXT_CHROMA_V, subTUPartIdx, partIdxesPerSubTU); + } + } + X265_CHECK(bCheckFull, "check-full must be set\n"); + m_entropyCoder.load(m_rqt[depth].rqtTest); + } + + cu.setTUDepthSubParts(tuDepth, absPartIdx, depth); + cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth); + + if (bCodeChroma) + { + uint32_t numberOfSections = splitIntoSubTUs ? 2 : 1; + uint32_t partIdxesPerSubTU = absPartIdxStep >> (splitIntoSubTUs ? 1 : 0); + + for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++) + { + for (uint32_t subTUIndex = 0; subTUIndex < numberOfSections; subTUIndex++) + { + const uint32_t subTUPartIdx = absPartIdx + (subTUIndex * partIdxesPerSubTU); + + if (splitIntoSubTUs) + { + uint8_t combinedSubTUCBF = cbfFlag[chromaId][0] | cbfFlag[chromaId][1]; + cu.setCbfPartRange(((cbfFlag[chromaId][subTUIndex] << 1) | combinedSubTUCBF) << tuDepth, (TextType)chromaId, subTUPartIdx, partIdxesPerSubTU); + } + else + cu.setCbfPartRange(cbfFlag[chromaId][subTUIndex] << tuDepth, (TextType)chromaId, subTUPartIdx, partIdxesPerSubTU); + } + } + } + + outCosts.distortion += fullCost.distortion; + outCosts.rdcost += fullCost.rdcost; + outCosts.bits += fullCost.bits; + outCosts.energy += fullCost.energy; +} + +void Search::encodeResidualQT(CUData& cu, uint32_t absPartIdx, const uint32_t depth, bool bSubdivAndCbf, TextType ttype, uint32_t depthRange[2]) +{ + X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "depth not matching\n"); + X265_CHECK(cu.m_predMode[absPartIdx] != MODE_INTRA, "encodeResidualQT() with intra block\n"); + + const uint32_t curTuDepth = depth - 
cu.m_cuDepth[0]; + const uint32_t tuDepth = cu.m_tuDepth[absPartIdx]; + const bool bSubdiv = curTuDepth != tuDepth; + const uint32_t log2TrSize = g_maxLog2CUSize - depth; + + uint32_t log2TrSizeC = log2TrSize - m_hChromaShift; + + const bool splitIntoSubTUs = (m_csp == X265_CSP_I422); + + if (bSubdivAndCbf && log2TrSize <= depthRange[1] && log2TrSize > depthRange[0]) + m_entropyCoder.codeTransformSubdivFlag(bSubdiv, 5 - log2TrSize); + + bool mCodeAll = true; + uint32_t trWidthC = 1 << log2TrSizeC; + uint32_t trHeightC = splitIntoSubTUs ? (trWidthC << 1) : trWidthC; + + const uint32_t numPels = trWidthC * trHeightC; + if (numPels < (MIN_TU_SIZE * MIN_TU_SIZE)) + mCodeAll = false; + + if (bSubdivAndCbf) + { + const bool bFirstCbfOfCU = curTuDepth == 0; + if (bFirstCbfOfCU || mCodeAll) + { + uint32_t absPartIdxStep = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + curTuDepth) << 1); + if (bFirstCbfOfCU || cu.getCbf(absPartIdx, TEXT_CHROMA_U, curTuDepth - 1)) + m_entropyCoder.codeQtCbf(cu, absPartIdx, absPartIdxStep, trWidthC, trHeightC, TEXT_CHROMA_U, curTuDepth, !bSubdiv); + if (bFirstCbfOfCU || cu.getCbf(absPartIdx, TEXT_CHROMA_V, curTuDepth - 1)) + m_entropyCoder.codeQtCbf(cu, absPartIdx, absPartIdxStep, trWidthC, trHeightC, TEXT_CHROMA_V, curTuDepth, !bSubdiv); + } + else + { + X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_U, curTuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_U, curTuDepth - 1), "chroma CBF not matching\n"); + X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_V, curTuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_V, curTuDepth - 1), "chroma CBF not matching\n"); + } + } + + if (!bSubdiv) + { + // Luma + const uint32_t qtLayer = log2TrSize - 2; + uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2); + coeff_t* coeffCurY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY; + + // Chroma + bool bCodeChroma = true; + uint32_t tuDepthC = tuDepth; + if ((log2TrSize == 2) && !(m_csp == X265_CSP_I444)) + { + log2TrSizeC++; + tuDepthC--; + uint32_t qpdiv = NUM_CU_PARTITIONS 
>> ((depth - 1) << 1); + bCodeChroma = ((absPartIdx & (qpdiv - 1)) == 0); + } + + if (bSubdivAndCbf) + m_entropyCoder.codeQtCbf(cu, absPartIdx, TEXT_LUMA, tuDepth); + else + { + if (ttype == TEXT_LUMA && cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth)) + m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA); + + if (bCodeChroma) + { + uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift); + coeff_t* coeffCurU = m_rqt[qtLayer].coeffRQT[1] + coeffOffsetC; + coeff_t* coeffCurV = m_rqt[qtLayer].coeffRQT[2] + coeffOffsetC; + + if (!splitIntoSubTUs) + { + if (ttype == TEXT_CHROMA_U && cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth)) + m_entropyCoder.codeCoeffNxN(cu, coeffCurU, absPartIdx, log2TrSizeC, TEXT_CHROMA_U); + if (ttype == TEXT_CHROMA_V && cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth)) + m_entropyCoder.codeCoeffNxN(cu, coeffCurV, absPartIdx, log2TrSizeC, TEXT_CHROMA_V); + } + else + { + uint32_t partIdxesPerSubTU = NUM_CU_PARTITIONS >> (((cu.m_cuDepth[absPartIdx] + tuDepthC) << 1) + 1); + uint32_t subTUSize = 1 << (log2TrSizeC * 2); + if (ttype == TEXT_CHROMA_U && cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth)) + { + if (cu.getCbf(absPartIdx, ttype, tuDepth + 1)) + m_entropyCoder.codeCoeffNxN(cu, coeffCurU, absPartIdx, log2TrSizeC, TEXT_CHROMA_U); + if (cu.getCbf(absPartIdx + partIdxesPerSubTU, ttype, tuDepth + 1)) + m_entropyCoder.codeCoeffNxN(cu, coeffCurU + subTUSize, absPartIdx + partIdxesPerSubTU, log2TrSizeC, TEXT_CHROMA_U); + } + if (ttype == TEXT_CHROMA_V && cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth)) + { + if (cu.getCbf(absPartIdx, ttype, tuDepth + 1)) + m_entropyCoder.codeCoeffNxN(cu, coeffCurV, absPartIdx, log2TrSizeC, TEXT_CHROMA_V); + if (cu.getCbf(absPartIdx + partIdxesPerSubTU, ttype, tuDepth + 1)) + m_entropyCoder.codeCoeffNxN(cu, coeffCurV + subTUSize, absPartIdx + partIdxesPerSubTU, log2TrSizeC, TEXT_CHROMA_V); + } + } + } + } + } + else + { + if (bSubdivAndCbf || cu.getCbf(absPartIdx, ttype, curTuDepth)) + 
{ + const uint32_t qpartNumSubdiv = NUM_CU_PARTITIONS >> ((depth + 1) << 1); + for (uint32_t i = 0; i < 4; ++i) + encodeResidualQT(cu, absPartIdx + i * qpartNumSubdiv, depth + 1, bSubdivAndCbf, ttype, depthRange); + } + } +} + +void Search::saveResidualQTData(CUData& cu, ShortYuv& resiYuv, uint32_t absPartIdx, uint32_t depth) +{ + X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "depth not matching\n"); + const uint32_t curTrMode = depth - cu.m_cuDepth[0]; + const uint32_t tuDepth = cu.m_tuDepth[absPartIdx]; + + if (curTrMode < tuDepth) + { + uint32_t qPartNumSubdiv = NUM_CU_PARTITIONS >> ((depth + 1) << 1); + for (uint32_t i = 0; i < 4; i++, absPartIdx += qPartNumSubdiv) + saveResidualQTData(cu, resiYuv, absPartIdx, depth + 1); + return; + } + + const uint32_t log2TrSize = g_maxLog2CUSize - depth; + const uint32_t qtLayer = log2TrSize - 2; + + uint32_t log2TrSizeC = log2TrSize - m_hChromaShift; + bool bCodeChroma = true; + uint32_t tuDepthC = tuDepth; + if (log2TrSizeC == 1) + { + X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444, "tuQuad check failed\n"); + log2TrSizeC++; + tuDepthC--; + uint32_t qpdiv = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1); + bCodeChroma = ((absPartIdx & (qpdiv - 1)) == 0); + } + + m_rqt[qtLayer].resiQtYuv.copyPartToPartLuma(resiYuv, absPartIdx, log2TrSize); + + uint32_t numCoeffY = 1 << (log2TrSize * 2); + uint32_t coeffOffsetY = absPartIdx << LOG2_UNIT_SIZE * 2; + coeff_t* coeffSrcY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY; + coeff_t* coeffDstY = cu.m_trCoeff[0] + coeffOffsetY; + memcpy(coeffDstY, coeffSrcY, sizeof(coeff_t) * numCoeffY); + + if (bCodeChroma) + { + m_rqt[qtLayer].resiQtYuv.copyPartToPartChroma(resiYuv, absPartIdx, log2TrSizeC + m_hChromaShift); + + uint32_t numCoeffC = 1 << (log2TrSizeC * 2 + (m_csp == X265_CSP_I422)); + uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift); + + coeff_t* coeffSrcU = m_rqt[qtLayer].coeffRQT[1] + coeffOffsetC; + coeff_t* coeffSrcV = 
m_rqt[qtLayer].coeffRQT[2] + coeffOffsetC; + coeff_t* coeffDstU = cu.m_trCoeff[1] + coeffOffsetC; + coeff_t* coeffDstV = cu.m_trCoeff[2] + coeffOffsetC; + memcpy(coeffDstU, coeffSrcU, sizeof(coeff_t) * numCoeffC); + memcpy(coeffDstV, coeffSrcV, sizeof(coeff_t) * numCoeffC); + } +} + +/* returns the number of bits required to signal a non-most-probable mode. + * on return mpms contains bitmap of most probable modes */ +uint32_t Search::getIntraRemModeBits(CUData& cu, uint32_t absPartIdx, uint32_t preds[3], uint64_t& mpms) const +{ + cu.getIntraDirLumaPredictor(absPartIdx, preds); + + mpms = 0; + for (int i = 0; i < 3; ++i) + mpms |= ((uint64_t)1 << preds[i]); + + return m_entropyCoder.bitsIntraModeNonMPM(); +} + +/* swap the current mode/cost with the mode with the highest cost in the + * current candidate list, if its cost is better (maintain a top N list) */ +void Search::updateCandList(uint32_t mode, uint64_t cost, int maxCandCount, uint32_t* candModeList, uint64_t* candCostList) +{ + uint32_t maxIndex = 0; + uint64_t maxValue = 0; + + for (int i = 0; i < maxCandCount; i++) + { + if (maxValue < candCostList[i]) + { + maxValue = candCostList[i]; + maxIndex = i; + } + } + + if (cost < maxValue) + { + candCostList[maxIndex] = cost; + candModeList[maxIndex] = mode; + } +} diff --git a/source/encoder/search.h b/source/encoder/search.h new file mode 100644 index 0000000..79ed94a --- /dev/null +++ b/source/encoder/search.h @@ -0,0 +1,267 @@ +/***************************************************************************** +* Copyright (C) 2013 x265 project +* +* Authors: Steve Borho +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation; either version 2 of the License, or +* (at your option) any later version. 
+* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software +* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +* +* This program is also available under a commercial proprietary license. +* For more information, contact us at license @ x265.com. +*****************************************************************************/ + +#ifndef X265_SEARCH_H +#define X265_SEARCH_H + +#include "common.h" +#include "predict.h" +#include "quant.h" +#include "bitcost.h" +#include "yuv.h" +#include "threadpool.h" + +#include "rdcost.h" +#include "entropy.h" +#include "motion.h" + +#define MVP_IDX_BITS 1 +#define NUM_LAYERS 4 + +namespace x265 { +// private namespace + +class Entropy; +struct ThreadLocalData; + +/* All the CABAC contexts that Analysis needs to keep track of at each depth + * and temp buffers for residual, coeff, and recon for use during residual + * quad-tree depth recursion */ +struct RQTData +{ + Entropy cur; /* starting context for current CU */ + + /* these are indexed by qtLayer (log2size - 2) so nominally 0=4x4, 1=8x8, 2=16x16, 3=32x32 + * the coeffRQT and reconQtYuv are allocated to the max CU size at every depth. The parts + * which are reconstructed at each depth are valid. 
At the end, the transform depth table + * is walked and the coeff and recon at the final split depths are collected */ + Entropy rqtRoot; /* residual quad-tree start context */ + Entropy rqtTemp; /* residual quad-tree temp context */ + Entropy rqtTest; /* residual quad-tree test context */ + coeff_t* coeffRQT[3]; /* coeff storage for entire CTU for each RQT layer */ + Yuv reconQtYuv; /* recon storage for entire CTU for each RQT layer (intra) */ + ShortYuv resiQtYuv; /* residual storage for entire CTU for each RQT layer (inter) */ + + /* per-depth temp buffers for inter prediction */ + ShortYuv tmpResiYuv; + Yuv tmpPredYuv; + Yuv bidirPredYuv[2]; +}; + +inline int getTUBits(int idx, int numIdx) +{ + return idx + (idx < numIdx - 1); +} + +class Search : public JobProvider, public Predict +{ +public: + + static const pixel zeroPixel[MAX_CU_SIZE]; + static const int16_t zeroShort[MAX_CU_SIZE]; + + MotionEstimate m_me; + Quant m_quant; + RDCost m_rdCost; + const x265_param* m_param; + Frame* m_frame; + const Slice* m_slice; + + Entropy m_entropyCoder; + RQTData m_rqt[NUM_FULL_DEPTH]; + + uint8_t* m_qtTempCbf[3]; + uint8_t* m_qtTempTransformSkipFlag[3]; + + bool m_bFrameParallel; + bool m_bEnableRDOQ; + uint32_t m_numLayers; + uint32_t m_refLagPixels; + + struct Mode + { + CUData cu; + const Yuv* fencYuv; + Yuv predYuv; + Yuv reconYuv; + Entropy contexts; + + uint64_t rdCost; // sum of partition (psy) RD costs (sse(fenc, recon) + lambda2 * bits) + uint64_t sa8dCost; // sum of partition sa8d distortion costs (sa8d(fenc, pred) + lambda * bits) + uint32_t sa8dBits; // signal bits used in sa8dCost calculation + uint32_t psyEnergy; // sum of partition psycho-visual energy difference + uint32_t distortion; // sum of partition SSE distortion + uint32_t totalBits; // sum of partition bits (mv + coeff) + uint32_t mvBits; // Mv bits + Ref + block type (or intra mode) + uint32_t coeffBits; // Texture bits (DCT Coeffs) + + void initCosts() + { + rdCost = 0; + sa8dCost = 0; + 
sa8dBits = 0; + psyEnergy = 0; + distortion = 0; + totalBits = 0; + mvBits = 0; + coeffBits = 0; + } + + void addSubCosts(const Mode& subMode) + { + rdCost += subMode.rdCost; + sa8dCost += subMode.sa8dCost; + sa8dBits += subMode.sa8dBits; + psyEnergy += subMode.psyEnergy; + distortion += subMode.distortion; + totalBits += subMode.totalBits; + mvBits += subMode.mvBits; + coeffBits += subMode.coeffBits; + } + }; + + struct MotionData + { + MV mv; + MV mvp; + int mvpIdx; + int ref; + uint32_t cost; + int bits; + }; + + Search(); + ~Search(); + + bool initSearch(const x265_param& param, ScalingList& scalingList); + void setQP(const Slice& slice, int qp); + + // mark temp RD entropy contexts as uninitialized; useful for finding loads without stores + void invalidateContexts(int fromDepth); + + // full RD search of intra modes. if sharedModes is not NULL, it directly uses them + void checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSize, uint8_t* sharedModes); + + // estimation inter prediction (non-skip) + bool predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeOnly, bool bChroma); + + // encode residual and compute rd-cost for inter mode + void encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom); + void encodeResAndCalcRdSkipCU(Mode& interMode); + + void generateCoeffRecon(Mode& mode, const CUGeom& cuGeom); + void residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, uint32_t depthRange[2]); + + uint32_t getIntraRemModeBits(CUData & cu, uint32_t absPartIdx, uint32_t preds[3], uint64_t& mpms) const; + +protected: + + /* motion estimation distribution */ + ThreadLocalData* m_tld; + CUData* m_curMECu; + const CUGeom* m_curGeom; + int m_curPart; + MotionData m_bestME[2]; + uint32_t m_listSelBits[3]; + int m_totalNumME; + volatile int m_numAcquiredME; + volatile int m_numCompletedME; + Event m_meCompletionEvent; + Lock m_outputLock; + bool m_bJobsQueued; + void 
singleMotionEstimation(Search& master, const CUData& cu, const CUGeom& cuGeom, int part, int list, int ref); + + void saveResidualQTData(CUData& cu, ShortYuv& resiYuv, uint32_t absPartIdx, uint32_t depth); + + // RDO search of luma intra modes; result is fully encoded luma. luma distortion is returned + uint32_t estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, uint32_t depthRange[2], uint8_t* sharedModes); + + // RDO select best chroma mode from luma; result is fully encode chroma. chroma distortion is returned + uint32_t estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom); + + void codeSubdivCbfQTChroma(const CUData& cu, uint32_t trDepth, uint32_t absPartIdx, uint32_t absPartIdxStep, uint32_t width, uint32_t height); + void codeCoeffQTChroma(const CUData& cu, uint32_t trDepth, uint32_t absPartIdx, TextType ttype); + + struct Cost + { + uint64_t rdcost; + uint32_t bits; + uint32_t distortion; + uint32_t energy; + Cost() { rdcost = 0; bits = 0; distortion = 0; energy = 0; } + }; + + void estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, ShortYuv& resiYuv, Cost& costs, uint32_t depthRange[2]); + + void encodeResidualQT(CUData& cu, uint32_t absPartIdx, uint32_t depth, bool bSubdivAndCbf, TextType ttype, uint32_t depthRange[2]); + + // generate prediction, generate residual and recon. 
if bAllowSplit, find optimal RQT splits + void codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& costs, uint32_t depthRange[2]); + void codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, Cost& costs); + void extractIntraResultQT(CUData& cu, Yuv& reconYuv, uint32_t trDepth, uint32_t absPartIdx); + + // generate chroma prediction, generate residual and recon + uint32_t codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, uint32_t& psyEnergy); + uint32_t codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t trDepthC, uint32_t absPartIdx, uint32_t& psyEnergy); + void extractIntraResultChromaQT(CUData& cu, Yuv& reconYuv, uint32_t absPartIdx, uint32_t trDepth, bool tuQuad); + + void residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, uint32_t depthRange[2]); + void residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx); + + void offsetSubTUCBFs(CUData& cu, TextType ttype, uint32_t trDepth, uint32_t absPartIdx); + + struct MergeData + { + /* merge candidate data, cached between calls to mergeEstimation */ + MVField mvFieldNeighbours[MRG_MAX_NUM_CANDS][2]; + uint8_t interDirNeighbours[MRG_MAX_NUM_CANDS]; + uint32_t maxNumMergeCand; + + /* data updated for each partition */ + uint32_t absPartIdx; + int width; + int height; + + /* outputs */ + MVField mvField[2]; + uint32_t interDir; + uint32_t index; + uint32_t bits; + }; + + /* inter/ME helper functions */ + void checkBestMVP(MV* amvpCand, MV cMv, MV& mvPred, int& mvpIdx, uint32_t& outBits, uint32_t& outCost) const; + void setSearchRange(const CUData& cu, MV mvp, int merange, MV& mvmin, MV& mvmax) const; + uint32_t mergeEstimation(CUData& cu, const CUGeom& cuGeom, int partIdx, MergeData& m); + static void getBlkBits(PartSize cuMode, bool bPSlice, int partIdx, uint32_t 
lastMode, uint32_t blockBit[3]); + + /* intra helper functions */ + enum { MAX_RD_INTRA_MODES = 16 }; + static void updateCandList(uint32_t mode, uint64_t cost, int maxCandCount, uint32_t* candModeList, uint64_t* candCostList); + void getBestIntraModeChroma(Mode& intraMode, const CUGeom& cuGeom); + + void updateModeCost(Mode& m) const { m.rdCost = m_rdCost.m_psyRd ? m_rdCost.calcPsyRdCost(m.distortion, m.totalBits, m.psyEnergy) : m_rdCost.calcRdCost(m.distortion, m.totalBits); } +}; +} + +#endif // ifndef X265_SEARCH_H diff --git a/source/encoder/sei.cpp b/source/encoder/sei.cpp new file mode 100644 index 0000000..37e41de --- /dev/null +++ b/source/encoder/sei.cpp @@ -0,0 +1,74 @@ +/***************************************************************************** +* Copyright (C) 2013 x265 project +* +* Authors: Steve Borho +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation; either version 2 of the License, or +* (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software +* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +* +* This program is also available under a commercial proprietary license. +* For more information, contact us at license @ x265.com. 
+*****************************************************************************/ + +#include "common.h" +#include "bitstream.h" +#include "slice.h" +#include "sei.h" + +using namespace x265; + +/* x265's identifying GUID */ +const uint8_t SEIuserDataUnregistered::m_uuid_iso_iec_11578[16] = { + 0x2C, 0xA2, 0xDE, 0x09, 0xB5, 0x17, 0x47, 0xDB, + 0xBB, 0x55, 0xA4, 0xFE, 0x7F, 0xC2, 0xFC, 0x4E +}; + +/* marshal a single SEI message sei, storing the marshalled representation + * in bitstream bs */ +void SEI::write(Bitstream& bs, const SPS& sps) +{ + BitCounter count; + m_bitIf = &count; + + /* virtual writeSEI method, write to bit counter */ + writeSEI(sps); + + m_bitIf = &bs; + uint32_t type = payloadType(); + for (; type >= 0xff; type -= 0xff) + WRITE_CODE(0xff, 8, "payload_type"); + WRITE_CODE(type, 8, "payload_type"); + + X265_CHECK(0 == (count.getNumberOfWrittenBits() & 7), "payload unaligned\n"); + uint32_t payloadSize = count.getNumberOfWrittenBits() >> 3; + for (; payloadSize >= 0xff; payloadSize -= 0xff) + WRITE_CODE(0xff, 8, "payload_size"); + WRITE_CODE(payloadSize, 8, "payload_size"); + + /* virtual writeSEI method, write to bs */ + writeSEI(sps); +} + +void SEI::writeByteAlign() +{ + // TODO: expose bs.writeByteAlignment() as virtual function + if (m_bitIf->getNumberOfWrittenBits() % 8 != 0) + { + WRITE_FLAG(1, "bit_equal_to_one"); + while (m_bitIf->getNumberOfWrittenBits() % 8 != 0) + { + WRITE_FLAG(0, "bit_equal_to_zero"); + } + } +} diff --git a/source/encoder/sei.h b/source/encoder/sei.h new file mode 100644 index 0000000..9ba6f3c --- /dev/null +++ b/source/encoder/sei.h @@ -0,0 +1,281 @@ +/***************************************************************************** +* Copyright (C) 2013 x265 project +* +* Authors: Steve Borho +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation; either version 2 of the License, or +* (at your 
option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software +* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +* +* This program is also available under a commercial proprietary license. +* For more information, contact us at license @ x265.com. +*****************************************************************************/ + +#ifndef X265_SEI_H +#define X265_SEI_H + +#include "common.h" +#include "bitstream.h" +#include "slice.h" + +namespace x265 { +// private namespace + +class SEI : public SyntaxElementWriter +{ +public: + + /* SEI users call write() to marshal an SEI to a bitstream. SEI + * subclasses may implement write() or accept the default write() + * method which calls writeSEI() with a bitcounter to determine + * the size, then it encodes the header and calls writeSEI a + * second time for the real encode. 
*/ + virtual void write(Bitstream& bs, const SPS& sps); + + virtual ~SEI() {} + +protected: + + enum PayloadType + { + BUFFERING_PERIOD = 0, + PICTURE_TIMING = 1, + PAN_SCAN_RECT = 2, + FILLER_PAYLOAD = 3, + USER_DATA_REGISTERED_ITU_T_T35 = 4, + USER_DATA_UNREGISTERED = 5, + RECOVERY_POINT = 6, + SCENE_INFO = 9, + FULL_FRAME_SNAPSHOT = 15, + PROGRESSIVE_REFINEMENT_SEGMENT_START = 16, + PROGRESSIVE_REFINEMENT_SEGMENT_END = 17, + FILM_GRAIN_CHARACTERISTICS = 19, + POST_FILTER_HINT = 22, + TONE_MAPPING_INFO = 23, + FRAME_PACKING = 45, + DISPLAY_ORIENTATION = 47, + SOP_DESCRIPTION = 128, + ACTIVE_PARAMETER_SETS = 129, + DECODING_UNIT_INFO = 130, + TEMPORAL_LEVEL0_INDEX = 131, + DECODED_PICTURE_HASH = 132, + SCALABLE_NESTING = 133, + REGION_REFRESH_INFO = 134, + }; + + virtual PayloadType payloadType() const = 0; + + virtual void writeSEI(const SPS&) { X265_CHECK(0, "empty writeSEI method called\n"); } + + void writeByteAlign(); +}; + +class SEIuserDataUnregistered : public SEI +{ +public: + + PayloadType payloadType() const { return USER_DATA_UNREGISTERED; } + + SEIuserDataUnregistered() : m_userData(NULL) {} + + static const uint8_t m_uuid_iso_iec_11578[16]; + uint32_t m_userDataLength; + uint8_t *m_userData; + + void write(Bitstream& bs, const SPS&) + { + m_bitIf = &bs; + + WRITE_CODE(USER_DATA_UNREGISTERED, 8, "payload_type"); + + uint32_t payloadSize = 16 + m_userDataLength; + for (; payloadSize >= 0xff; payloadSize -= 0xff) + WRITE_CODE(0xff, 8, "payload_size"); + WRITE_CODE(payloadSize, 8, "payload_size"); + + for (uint32_t i = 0; i < 16; i++) + WRITE_CODE(m_uuid_iso_iec_11578[i], 8, "sei.uuid_iso_iec_11578[i]"); + + for (uint32_t i = 0; i < m_userDataLength; i++) + WRITE_CODE(m_userData[i], 8, "user_data"); + } +}; + +class SEIDecodedPictureHash : public SEI +{ +public: + + PayloadType payloadType() const { return DECODED_PICTURE_HASH; } + + enum Method + { + MD5, + CRC, + CHECKSUM, + } m_method; + + uint8_t m_digest[3][16]; + + void write(Bitstream& bs, const 
SPS&) + { + m_bitIf = &bs; + + WRITE_CODE(DECODED_PICTURE_HASH, 8, "payload_type"); + + switch (m_method) + { + case MD5: + WRITE_CODE(1 + 16 * 3, 8, "payload_size"); + WRITE_CODE(MD5, 8, "hash_type"); + break; + case CRC: + WRITE_CODE(1 + 2 * 3, 8, "payload_size"); + WRITE_CODE(CRC, 8, "hash_type"); + break; + case CHECKSUM: + WRITE_CODE(1 + 4 * 3, 8, "payload_size"); + WRITE_CODE(CHECKSUM, 8, "hash_type"); + break; + } + + for (int yuvIdx = 0; yuvIdx < 3; yuvIdx++) + { + if (m_method == MD5) + { + for (uint32_t i = 0; i < 16; i++) + WRITE_CODE(m_digest[yuvIdx][i], 8, "picture_md5"); + } + else if (m_method == CRC) + { + uint32_t val = (m_digest[yuvIdx][0] << 8) + m_digest[yuvIdx][1]; + WRITE_CODE(val, 16, "picture_crc"); + } + else if (m_method == CHECKSUM) + { + uint32_t val = (m_digest[yuvIdx][0] << 24) + (m_digest[yuvIdx][1] << 16) + (m_digest[yuvIdx][2] << 8) + m_digest[yuvIdx][3]; + WRITE_CODE(val, 32, "picture_checksum"); + } + } + } +}; + +class SEIActiveParameterSets : public SEI +{ +public: + + PayloadType payloadType() const { return ACTIVE_PARAMETER_SETS; } + + bool m_selfContainedCvsFlag; + bool m_noParamSetUpdateFlag; + + void writeSEI(const SPS&) + { + WRITE_CODE(0, 4, "active_vps_id"); + WRITE_FLAG(m_selfContainedCvsFlag, "self_contained_cvs_flag"); + WRITE_FLAG(m_noParamSetUpdateFlag, "no_param_set_update_flag"); + WRITE_UVLC(0, "num_sps_ids_minus1"); + WRITE_UVLC(0, "active_seq_param_set_id"); + writeByteAlign(); + } +}; + +class SEIBufferingPeriod : public SEI +{ +public: + + PayloadType payloadType() const { return BUFFERING_PERIOD; } + + SEIBufferingPeriod() + : m_cpbDelayOffset(0) + , m_dpbDelayOffset(0) + , m_auCpbRemovalDelayDelta(1) + { + } + + bool m_cpbDelayOffset; + bool m_dpbDelayOffset; + uint32_t m_initialCpbRemovalDelay; + uint32_t m_initialCpbRemovalDelayOffset; + uint32_t m_auCpbRemovalDelayDelta; + + void writeSEI(const SPS& sps) + { + const HRDInfo& hrd = sps.vuiParameters.hrdParameters; + + WRITE_UVLC(0, 
"bp_seq_parameter_set_id"); + WRITE_FLAG(0, "rap_cpb_params_present_flag"); + WRITE_FLAG(0, "concatenation_flag"); + WRITE_CODE(m_auCpbRemovalDelayDelta - 1, hrd.cpbRemovalDelayLength, "au_cpb_removal_delay_delta_minus1"); + WRITE_CODE(m_initialCpbRemovalDelay, hrd.initialCpbRemovalDelayLength, "initial_cpb_removal_delay"); + WRITE_CODE(m_initialCpbRemovalDelayOffset, hrd.initialCpbRemovalDelayLength, "initial_cpb_removal_delay_offset"); + + writeByteAlign(); + } +}; + +class SEIPictureTiming : public SEI +{ +public: + + PayloadType payloadType() const { return PICTURE_TIMING; } + + uint32_t m_picStruct; + uint32_t m_sourceScanType; + bool m_duplicateFlag; + + uint32_t m_auCpbRemovalDelay; + uint32_t m_picDpbOutputDelay; + + void writeSEI(const SPS& sps) + { + const VUI *vui = &sps.vuiParameters; + const HRDInfo *hrd = &vui->hrdParameters; + + if (vui->frameFieldInfoPresentFlag) + { + WRITE_CODE(m_picStruct, 4, "pic_struct"); + WRITE_CODE(m_sourceScanType, 2, "source_scan_type"); + WRITE_FLAG(m_duplicateFlag, "duplicate_flag"); + } + + if (vui->hrdParametersPresentFlag) + { + WRITE_CODE(m_auCpbRemovalDelay - 1, hrd->cpbRemovalDelayLength, "au_cpb_removal_delay_minus1"); + WRITE_CODE(m_picDpbOutputDelay, hrd->dpbOutputDelayLength, "pic_dpb_output_delay"); + /* Removed sub-pic signaling June 2014 */ + } + writeByteAlign(); + } +}; + +class SEIRecoveryPoint : public SEI +{ +public: + + PayloadType payloadType() const { return RECOVERY_POINT; } + + int m_recoveryPocCnt; + bool m_exactMatchingFlag; + bool m_brokenLinkFlag; + + void writeSEI(const SPS&) + { + WRITE_SVLC(m_recoveryPocCnt, "recovery_poc_cnt"); + WRITE_FLAG(m_exactMatchingFlag, "exact_matching_flag"); + WRITE_FLAG(m_brokenLinkFlag, "broken_link_flag"); + writeByteAlign(); + } +}; +} + +#endif // ifndef X265_SEI_H diff --git a/source/encoder/slicetype.cpp b/source/encoder/slicetype.cpp new file mode 100644 index 0000000..cc70c20 --- /dev/null +++ b/source/encoder/slicetype.cpp @@ -0,0 +1,1743 @@ 
+/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Gopu Govindaswamy + * Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#include "common.h" +#include "frame.h" +#include "framedata.h" +#include "picyuv.h" +#include "primitives.h" +#include "lowres.h" +#include "mv.h" + +#include "slicetype.h" +#include "motion.h" +#include "ratecontrol.h" + +#define NUM_CUS (m_widthInCU > 2 && m_heightInCU > 2 ? 
(m_widthInCU - 2) * (m_heightInCU - 2) : m_widthInCU * m_heightInCU) + +using namespace x265; + +static inline int16_t median(int16_t a, int16_t b, int16_t c) +{ + int16_t t = (a - b) & ((a - b) >> 31); + + a -= t; + b += t; + b -= (b - c) & ((b - c) >> 31); + b += (a - b) & ((a - b) >> 31); + return b; +} + +static inline void median_mv(MV &dst, MV a, MV b, MV c) +{ + dst.x = median(a.x, b.x, c.x); + dst.y = median(a.y, b.y, c.y); +} + +Lookahead::Lookahead(x265_param *param, ThreadPool* pool) + : JobProvider(pool) + , m_est(pool) +{ + m_bReady = 0; + m_param = param; + m_lastKeyframe = -m_param->keyframeMax; + m_lastNonB = NULL; + m_bFilling = true; + m_bFlushed = false; + m_widthInCU = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS; + m_heightInCU = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS; + m_scratch = (int*)x265_malloc(m_widthInCU * sizeof(int)); + memset(m_histogram, 0, sizeof(m_histogram)); +} + +Lookahead::~Lookahead() { } + +void Lookahead::init() +{ + if (m_pool && m_pool->getThreadCount() >= 4 && + ((m_param->bFrameAdaptive && m_param->bframes) || + m_param->rc.cuTree || m_param->scenecutThreshold || + (m_param->lookaheadDepth && m_param->rc.vbvBufferSize))) + m_pool = m_pool; /* allow use of worker thread */ + else + m_pool = NULL; /* disable use of worker thread */ +} + +void Lookahead::destroy() +{ + if (m_pool) + // flush will dequeue, if it is necessary + JobProvider::flush(); + + // these two queues will be empty unless the encode was aborted + while (!m_inputQueue.empty()) + { + Frame* curFrame = m_inputQueue.popFront(); + curFrame->destroy(); + delete curFrame; + } + + while (!m_outputQueue.empty()) + { + Frame* curFrame = m_outputQueue.popFront(); + curFrame->destroy(); + delete curFrame; + } + + x265_free(m_scratch); +} + +/* Called by API thread */ +void Lookahead::addPicture(Frame *curFrame, int sliceType) +{ + PicYuv *orig = curFrame->m_origPicYuv; + + 
curFrame->m_lowres.init(orig, curFrame->m_poc, sliceType); + + m_inputQueueLock.acquire(); + m_inputQueue.pushBack(*curFrame); + + if (m_inputQueue.size() >= m_param->lookaheadDepth) + { + /* when queue fills the first time, run slicetypeDecide synchronously, + * since the encoder will always be blocked here */ + if (m_pool && !m_bFilling) + { + m_inputQueueLock.release(); + m_bReady = 1; + m_pool->pokeIdleThread(); + } + else + slicetypeDecide(); + + if (m_bFilling && m_pool) + JobProvider::enqueue(); + m_bFilling = false; + } + else + m_inputQueueLock.release(); +} + +/* Called by API thread */ +void Lookahead::flush() +{ + /* just in case the input queue is never allowed to fill */ + m_bFilling = false; + + /* flush synchronously */ + m_inputQueueLock.acquire(); + if (!m_inputQueue.empty()) + { + slicetypeDecide(); + } + else + m_inputQueueLock.release(); + + m_inputQueueLock.acquire(); + + /* bFlushed indicates that an empty output queue actually means all frames + * have been decided (no more inputs for the encoder) */ + if (m_inputQueue.empty()) + m_bFlushed = true; + m_inputQueueLock.release(); +} + +/* Called by API thread. If the lookahead queue has not yet been filled the + * first time, it immediately returns NULL. Else the function blocks until + * outputs are available and then pops the first frame from the output queue. If + * flush() has been called and the output queue is empty, NULL is returned. 
*/ +Frame* Lookahead::getDecidedPicture() +{ + m_outputQueueLock.acquire(); + + if (m_bFilling) + { + m_outputQueueLock.release(); + return NULL; + } + + while (m_outputQueue.empty() && !m_bFlushed) + { + m_outputQueueLock.release(); + m_outputAvailable.wait(); + m_outputQueueLock.acquire(); + } + + Frame *fenc = m_outputQueue.popFront(); + m_outputQueueLock.release(); + return fenc; +} + +/* Called by pool worker threads */ +bool Lookahead::findJob(int) +{ + if (m_bReady && ATOMIC_CAS32(&m_bReady, 1, 0) == 1) + { + m_inputQueueLock.acquire(); + slicetypeDecide(); + return true; + } + else + return false; +} + +/* Called by rate-control to calculate the estimated SATD cost for a given + * picture. It assumes dpb->prepareEncode() has already been called for the + * picture and all the references are established */ +void Lookahead::getEstimatedPictureCost(Frame *curFrame) +{ + Lowres *frames[X265_LOOKAHEAD_MAX]; + + // POC distances to each reference + Slice *slice = curFrame->m_encData->m_slice; + int p0 = 0, p1, b; + int poc = slice->m_poc; + int l0poc = slice->m_refPOCList[0][0]; + int l1poc = slice->m_refPOCList[1][0]; + + switch (slice->m_sliceType) + { + case I_SLICE: + frames[p0] = &curFrame->m_lowres; + b = p1 = 0; + break; + + case P_SLICE: + b = p1 = poc - l0poc; + frames[p0] = &slice->m_refPicList[0][0]->m_lowres; + frames[b] = &curFrame->m_lowres; + break; + + case B_SLICE: + b = poc - l0poc; + p1 = b + l1poc - poc; + frames[p0] = &slice->m_refPicList[0][0]->m_lowres; + frames[b] = &curFrame->m_lowres; + frames[p1] = &slice->m_refPicList[1][0]->m_lowres; + break; + + default: + return; + } + + if (m_param->rc.cuTree && !m_param->rc.bStatRead) + /* update row satds based on cutree offsets */ + curFrame->m_lowres.satdCost = frameCostRecalculate(frames, p0, p1, b); + else if (m_param->rc.aqMode) + curFrame->m_lowres.satdCost = curFrame->m_lowres.costEstAq[b - p0][p1 - b]; + else + curFrame->m_lowres.satdCost = curFrame->m_lowres.costEst[b - p0][p1 - b]; + + 
if (m_param->rc.vbvBufferSize && m_param->rc.vbvMaxBitrate) + { + /* aggregate lowres row satds to CTU resolution */ + curFrame->m_lowres.lowresCostForRc = curFrame->m_lowres.lowresCosts[b - p0][p1 - b]; + uint32_t lowresRow = 0, lowresCol = 0, lowresCuIdx = 0, sum = 0; + uint32_t scale = m_param->maxCUSize / (2 * X265_LOWRES_CU_SIZE); + uint32_t numCuInHeight = (m_param->sourceHeight + g_maxCUSize - 1) / g_maxCUSize; + uint32_t widthInLowresCu = (uint32_t)m_widthInCU, heightInLowresCu = (uint32_t)m_heightInCU; + double *qp_offset = 0; + /* Factor in qpoffsets based on Aq/Cutree in CU costs */ + if (m_param->rc.aqMode) + qp_offset = (frames[b]->sliceType == X265_TYPE_B || !m_param->rc.cuTree) ? frames[b]->qpAqOffset : frames[b]->qpCuTreeOffset; + + for (uint32_t row = 0; row < numCuInHeight; row++) + { + lowresRow = row * scale; + for (uint32_t cnt = 0; cnt < scale && lowresRow < heightInLowresCu; lowresRow++, cnt++) + { + sum = 0; + lowresCuIdx = lowresRow * widthInLowresCu; + for (lowresCol = 0; lowresCol < widthInLowresCu; lowresCol++, lowresCuIdx++) + { + uint16_t lowresCuCost = curFrame->m_lowres.lowresCostForRc[lowresCuIdx] & LOWRES_COST_MASK; + if (qp_offset) + { + lowresCuCost = (uint16_t)((lowresCuCost * x265_exp2fix8(qp_offset[lowresCuIdx]) + 128) >> 8); + int32_t intraCuCost = curFrame->m_lowres.intraCost[lowresCuIdx]; + curFrame->m_lowres.intraCost[lowresCuIdx] = (intraCuCost * x265_exp2fix8(qp_offset[lowresCuIdx]) + 128) >> 8; + } + curFrame->m_lowres.lowresCostForRc[lowresCuIdx] = lowresCuCost; + sum += lowresCuCost; + } + curFrame->m_encData->m_rowStat[row].satdForVbv += sum; + } + } + } +} + +/* called by API thread or worker thread with inputQueueLock acquired */ +void Lookahead::slicetypeDecide() +{ + ScopedLock lock(m_decideLock); + + Lowres *frames[X265_LOOKAHEAD_MAX]; + Frame *list[X265_LOOKAHEAD_MAX]; + int maxSearch = X265_MIN(m_param->lookaheadDepth, X265_LOOKAHEAD_MAX); + + memset(frames, 0, sizeof(frames)); + memset(list, 0, sizeof(list)); 
+ { + Frame *curFrame = m_inputQueue.first(); + int j; + for (j = 0; j < m_param->bframes + 2; j++) + { + if (!curFrame) break; + list[j] = curFrame; + curFrame = curFrame->m_next; + } + + curFrame = m_inputQueue.first(); + frames[0] = m_lastNonB; + for (j = 0; j < maxSearch; j++) + { + if (!curFrame) break; + frames[j + 1] = &curFrame->m_lowres; + curFrame = curFrame->m_next; + } + + maxSearch = j; + } + + m_inputQueueLock.release(); + + if (!m_est.m_rows && list[0]) + m_est.init(m_param, list[0]); + + if (m_lastNonB && !m_param->rc.bStatRead && + ((m_param->bFrameAdaptive && m_param->bframes) || + m_param->rc.cuTree || m_param->scenecutThreshold || + (m_param->lookaheadDepth && m_param->rc.vbvBufferSize))) + { + slicetypeAnalyse(frames, false); + } + + int bframes, brefs; + for (bframes = 0, brefs = 0;; bframes++) + { + Lowres& frm = list[bframes]->m_lowres; + + if (frm.sliceType == X265_TYPE_BREF && !m_param->bBPyramid && brefs == m_param->bBPyramid) + { + frm.sliceType = X265_TYPE_B; + x265_log(m_param, X265_LOG_WARNING, "B-ref at frame %d incompatible with B-pyramid\n", + frm.frameNum); + } + + /* pyramid with multiple B-refs needs a big enough dpb that the preceding P-frame stays available. + smaller dpb could be supported by smart enough use of mmco, but it's easier just to forbid it.*/ + else if (frm.sliceType == X265_TYPE_BREF && m_param->bBPyramid && brefs && + m_param->maxNumReferences <= (brefs + 3)) + { + frm.sliceType = X265_TYPE_B; + x265_log(m_param, X265_LOG_WARNING, "B-ref at frame %d incompatible with B-pyramid and %d reference frames\n", + frm.sliceType, m_param->maxNumReferences); + } + + if ( /*(!param->intraRefresh || frm.frameNum == 0) && */ frm.frameNum - m_lastKeyframe >= m_param->keyframeMax) + { + if (frm.sliceType == X265_TYPE_AUTO || frm.sliceType == X265_TYPE_I) + frm.sliceType = m_param->bOpenGOP && m_lastKeyframe >= 0 ? 
X265_TYPE_I : X265_TYPE_IDR; + bool warn = frm.sliceType != X265_TYPE_IDR; + if (warn && m_param->bOpenGOP) + warn &= frm.sliceType != X265_TYPE_I; + if (warn) + { + x265_log(m_param, X265_LOG_WARNING, "specified frame type (%d) at %d is not compatible with keyframe interval\n", + frm.sliceType, frm.frameNum); + frm.sliceType = m_param->bOpenGOP && m_lastKeyframe >= 0 ? X265_TYPE_I : X265_TYPE_IDR; + } + } + if (frm.sliceType == X265_TYPE_I && frm.frameNum - m_lastKeyframe >= m_param->keyframeMin) + { + if (m_param->bOpenGOP) + { + m_lastKeyframe = frm.frameNum; + frm.bKeyframe = true; + } + else + frm.sliceType = X265_TYPE_IDR; + } + if (frm.sliceType == X265_TYPE_IDR) + { + /* Closed GOP */ + m_lastKeyframe = frm.frameNum; + frm.bKeyframe = true; + if (bframes > 0) + { + list[bframes - 1]->m_lowres.sliceType = X265_TYPE_P; + bframes--; + } + } + if (bframes == m_param->bframes || !list[bframes + 1]) + { + if (IS_X265_TYPE_B(frm.sliceType)) + x265_log(m_param, X265_LOG_WARNING, "specified frame type is not compatible with max B-frames\n"); + if (frm.sliceType == X265_TYPE_AUTO || IS_X265_TYPE_B(frm.sliceType)) + frm.sliceType = X265_TYPE_P; + } + if (frm.sliceType == X265_TYPE_BREF) + brefs++; + if (frm.sliceType == X265_TYPE_AUTO) + frm.sliceType = X265_TYPE_B; + else if (!IS_X265_TYPE_B(frm.sliceType)) + break; + } + + if (bframes) + list[bframes - 1]->m_lowres.bLastMiniGopBFrame = true; + list[bframes]->m_lowres.leadingBframes = bframes; + m_lastNonB = &list[bframes]->m_lowres; + m_histogram[bframes]++; + + /* insert a bref into the sequence */ + if (m_param->bBPyramid && bframes > 1 && !brefs) + { + list[bframes / 2]->m_lowres.sliceType = X265_TYPE_BREF; + brefs++; + } + + /* calculate the frame costs ahead of time for estimateFrameCost while we still have lowres */ + if (m_param->rc.rateControlMode != X265_RC_CQP) + { + int p0, p1, b; + /* For zero latency tuning, calculate frame cost to be used later in RC */ + if (!maxSearch) + { + for (int i = 0; i <= 
bframes; i++) + frames[i + 1] = &list[i]->m_lowres; + } + + /* estimate new non-B cost */ + p1 = b = bframes + 1; + p0 = (IS_X265_TYPE_I(frames[bframes + 1]->sliceType)) ? b : 0; + m_est.estimateFrameCost(frames, p0, p1, b, 0); + + if (bframes) + { + p0 = 0; // last nonb + for (b = 1; b <= bframes; b++) + { + if (frames[b]->sliceType == X265_TYPE_B) + for (p1 = b; frames[p1]->sliceType == X265_TYPE_B; p1++) + ; // find new nonb or bref + else + p1 = bframes + 1; + + m_est.estimateFrameCost(frames, p0, p1, b, 0); + + if (frames[b]->sliceType == X265_TYPE_BREF) + p0 = b; + } + } + } + + m_inputQueueLock.acquire(); + + /* dequeue all frames from inputQueue that are about to be enqueued + * in the output queue. The order is important because Frame can + * only be in one list at a time */ + int64_t pts[X265_BFRAME_MAX + 1]; + for (int i = 0; i <= bframes; i++) + { + Frame *curFrame; + curFrame = m_inputQueue.popFront(); + pts[i] = curFrame->m_pts; + maxSearch--; + } + + m_inputQueueLock.release(); + + m_outputQueueLock.acquire(); + /* add non-B to output queue */ + int idx = 0; + list[bframes]->m_reorderedPts = pts[idx++]; + m_outputQueue.pushBack(*list[bframes]); + + /* Add B-ref frame next to P frame in output queue, the B-ref encode before non B-ref frame */ + if (bframes > 1 && m_param->bBPyramid) + { + for (int i = 0; i < bframes; i++) + { + if (list[i]->m_lowres.sliceType == X265_TYPE_BREF) + { + list[i]->m_reorderedPts = pts[idx++]; + m_outputQueue.pushBack(*list[i]); + } + } + } + + /* add B frames to output queue */ + for (int i = 0; i < bframes; i++) + { + /* push all the B frames into output queue except B-ref, which already pushed into output queue*/ + if (list[i]->m_lowres.sliceType != X265_TYPE_BREF) + { + list[i]->m_reorderedPts = pts[idx++]; + m_outputQueue.pushBack(*list[i]); + } + } + + bool isKeyFrameAnalyse = (m_param->rc.cuTree || (m_param->rc.vbvBufferSize && m_param->lookaheadDepth)) && !m_param->rc.bStatRead; + if (isKeyFrameAnalyse && 
IS_X265_TYPE_I(m_lastNonB->sliceType)) + { + m_inputQueueLock.acquire(); + Frame *curFrame = m_inputQueue.first(); + frames[0] = m_lastNonB; + int j; + for (j = 0; j < maxSearch; j++) + { + frames[j + 1] = &curFrame->m_lowres; + curFrame = curFrame->m_next; + } + + frames[j + 1] = NULL; + m_inputQueueLock.release(); + slicetypeAnalyse(frames, true); + } + + m_outputQueueLock.release(); + m_outputAvailable.trigger(); +} + +void Lookahead::vbvLookahead(Lowres **frames, int numFrames, int keyframe) +{ + int prevNonB = 0, curNonB = 1, idx = 0; + bool isNextNonB = false; + + while (curNonB < numFrames && frames[curNonB]->sliceType == X265_TYPE_B) + curNonB++; + + int nextNonB = keyframe ? prevNonB : curNonB; + int nextB = keyframe ? prevNonB + 1 : curNonB + 1; + + while (curNonB < numFrames + !keyframe) + { + /* P/I cost: This shouldn't include the cost of nextNonB */ + if (nextNonB != curNonB) + { + int p0 = IS_X265_TYPE_I(frames[curNonB]->sliceType) ? curNonB : prevNonB; + frames[nextNonB]->plannedSatd[idx] = vbvFrameCost(frames, p0, curNonB, curNonB); + frames[nextNonB]->plannedType[idx] = frames[curNonB]->sliceType; + idx++; + } + /* Handle the B-frames: coded order */ + for (int i = prevNonB + 1; i < curNonB; i++, idx++) + { + frames[nextNonB]->plannedSatd[idx] = vbvFrameCost(frames, prevNonB, curNonB, i); + frames[nextNonB]->plannedType[idx] = X265_TYPE_B; + } + + for (int i = nextB; i <= curNonB; i++) + { + for (int j = frames[i]->indB + i + 1; j <= curNonB; j++, frames[i]->indB++) + { + if (j == curNonB) + { + if (isNextNonB) + { + int p0 = IS_X265_TYPE_I(frames[curNonB]->sliceType) ? 
curNonB : prevNonB; + frames[i]->plannedSatd[frames[i]->indB] = vbvFrameCost(frames, p0, curNonB, curNonB); + frames[i]->plannedType[frames[i]->indB] = frames[curNonB]->sliceType; + } + } + else + { + frames[i]->plannedSatd[frames[i]->indB] = vbvFrameCost(frames, prevNonB, curNonB, j); + frames[i]->plannedType[frames[i]->indB] = X265_TYPE_B; + } + } + if (i == curNonB && !isNextNonB) + isNextNonB = true; + } + + prevNonB = curNonB; + curNonB++; + while (curNonB <= numFrames && frames[curNonB]->sliceType == X265_TYPE_B) + curNonB++; + } + + frames[nextNonB]->plannedType[idx] = X265_TYPE_AUTO; +} + +int64_t Lookahead::vbvFrameCost(Lowres **frames, int p0, int p1, int b) +{ + int64_t cost = m_est.estimateFrameCost(frames, p0, p1, b, 0); + + if (m_param->rc.aqMode) + { + if (m_param->rc.cuTree) + return frameCostRecalculate(frames, p0, p1, b); + else + return frames[b]->costEstAq[b - p0][p1 - b]; + } + return cost; +} + +void Lookahead::slicetypeAnalyse(Lowres **frames, bool bKeyframe) +{ + int numFrames, origNumFrames, keyintLimit, framecnt; + int maxSearch = X265_MIN(m_param->lookaheadDepth, X265_LOOKAHEAD_MAX); + int cuCount = NUM_CUS; + int resetStart; + bool bIsVbvLookahead = m_param->rc.vbvBufferSize && m_param->lookaheadDepth; + + /* count undecided frames */ + for (framecnt = 0; framecnt < maxSearch; framecnt++) + { + Lowres *fenc = frames[framecnt + 1]; + if (!fenc || fenc->sliceType != X265_TYPE_AUTO) + break; + } + + if (!framecnt) + { + if (m_param->rc.cuTree) + cuTree(frames, 0, bKeyframe); + return; + } + + frames[framecnt + 1] = NULL; + + keyintLimit = m_param->keyframeMax - frames[0]->frameNum + m_lastKeyframe - 1; + origNumFrames = numFrames = X265_MIN(framecnt, keyintLimit); + + if (bIsVbvLookahead) + numFrames = framecnt; + else if (m_param->bOpenGOP && numFrames < framecnt) + numFrames++; + else if (numFrames == 0) + { + frames[1]->sliceType = X265_TYPE_I; + return; + } + + int numBFrames = 0; + int numAnalyzed = numFrames; + if 
(m_param->scenecutThreshold && scenecut(frames, 0, 1, true, origNumFrames, maxSearch)) + { + frames[1]->sliceType = X265_TYPE_I; + return; + } + + if (m_param->bframes) + { + if (m_param->bFrameAdaptive == X265_B_ADAPT_TRELLIS) + { + if (numFrames > 1) + { + char best_paths[X265_BFRAME_MAX + 1][X265_LOOKAHEAD_MAX + 1] = { "", "P" }; + int best_path_index = numFrames % (X265_BFRAME_MAX + 1); + + /* Perform the frametype analysis. */ + for (int j = 2; j <= numFrames; j++) + { + slicetypePath(frames, j, best_paths); + } + + numBFrames = (int)strspn(best_paths[best_path_index], "B"); + + /* Load the results of the analysis into the frame types. */ + for (int j = 1; j < numFrames; j++) + { + frames[j]->sliceType = best_paths[best_path_index][j - 1] == 'B' ? X265_TYPE_B : X265_TYPE_P; + } + } + frames[numFrames]->sliceType = X265_TYPE_P; + } + else if (m_param->bFrameAdaptive == X265_B_ADAPT_FAST) + { + int64_t cost1p0, cost2p0, cost1b1, cost2p1; + + for (int i = 0; i <= numFrames - 2; ) + { + cost2p1 = m_est.estimateFrameCost(frames, i + 0, i + 2, i + 2, 1); + if (frames[i + 2]->intraMbs[2] > cuCount / 2) + { + frames[i + 1]->sliceType = X265_TYPE_P; + frames[i + 2]->sliceType = X265_TYPE_P; + i += 2; + continue; + } + + cost1b1 = m_est.estimateFrameCost(frames, i + 0, i + 2, i + 1, 0); + cost1p0 = m_est.estimateFrameCost(frames, i + 0, i + 1, i + 1, 0); + cost2p0 = m_est.estimateFrameCost(frames, i + 1, i + 2, i + 2, 0); + + if (cost1p0 + cost2p0 < cost1b1 + cost2p1) + { + frames[i + 1]->sliceType = X265_TYPE_P; + i += 1; + continue; + } + +// arbitrary and untuned +#define INTER_THRESH 300 +#define P_SENS_BIAS (50 - m_param->bFrameBias) + frames[i + 1]->sliceType = X265_TYPE_B; + + int j; + for (j = i + 2; j <= X265_MIN(i + m_param->bframes, numFrames - 1); j++) + { + int64_t pthresh = X265_MAX(INTER_THRESH - P_SENS_BIAS * (j - i - 1), INTER_THRESH / 10); + int64_t pcost = m_est.estimateFrameCost(frames, i + 0, j + 1, j + 1, 1); + if (pcost > pthresh * cuCount || 
frames[j + 1]->intraMbs[j - i + 1] > cuCount / 3) + break; + frames[j]->sliceType = X265_TYPE_B; + } + + frames[j]->sliceType = X265_TYPE_P; + i = j; + } + frames[numFrames]->sliceType = X265_TYPE_P; + numBFrames = 0; + while (numBFrames < numFrames && frames[numBFrames + 1]->sliceType == X265_TYPE_B) + { + numBFrames++; + } + } + else + { + numBFrames = X265_MIN(numFrames - 1, m_param->bframes); + for (int j = 1; j < numFrames; j++) + { + frames[j]->sliceType = (j % (numBFrames + 1)) ? X265_TYPE_B : X265_TYPE_P; + } + + frames[numFrames]->sliceType = X265_TYPE_P; + } + /* Check scenecut on the first minigop. */ + for (int j = 1; j < numBFrames + 1; j++) + { + if (m_param->scenecutThreshold && scenecut(frames, j, j + 1, false, origNumFrames, maxSearch)) + { + frames[j]->sliceType = X265_TYPE_P; + numAnalyzed = j; + break; + } + } + + resetStart = bKeyframe ? 1 : X265_MIN(numBFrames + 2, numAnalyzed + 1); + } + else + { + for (int j = 1; j <= numFrames; j++) + { + frames[j]->sliceType = X265_TYPE_P; + } + + resetStart = bKeyframe ? 1 : 2; + } + + if (m_param->rc.cuTree) + cuTree(frames, X265_MIN(numFrames, m_param->keyframeMax), bKeyframe); + + // if (!param->bIntraRefresh) + for (int j = keyintLimit + 1; j <= numFrames; j += m_param->keyframeMax) + { + frames[j]->sliceType = X265_TYPE_I; + resetStart = X265_MIN(resetStart, j + 1); + } + + if (bIsVbvLookahead) + vbvLookahead(frames, numFrames, bKeyframe); + + /* Restore frametypes for all frames that haven't actually been decided yet. */ + for (int j = resetStart; j <= numFrames; j++) + { + frames[j]->sliceType = X265_TYPE_AUTO; + } +} + +bool Lookahead::scenecut(Lowres **frames, int p0, int p1, bool bRealScenecut, int numFrames, int maxSearch) +{ + /* Only do analysis during a normal scenecut check. */ + if (bRealScenecut && m_param->bframes) + { + int origmaxp1 = p0 + 1; + /* Look ahead to avoid coding short flashes as scenecuts. 
*/ + if (m_param->bFrameAdaptive == X265_B_ADAPT_TRELLIS) + /* Don't analyse any more frames than the trellis would have covered. */ + origmaxp1 += m_param->bframes; + else + origmaxp1++; + int maxp1 = X265_MIN(origmaxp1, numFrames); + + /* Where A and B are scenes: AAAAAABBBAAAAAA + * If BBB is shorter than (maxp1-p0), it is detected as a flash + * and not considered a scenecut. */ + for (int cp1 = p1; cp1 <= maxp1; cp1++) + { + if (!scenecutInternal(frames, p0, cp1, false)) + /* Any frame in between p0 and cur_p1 cannot be a real scenecut. */ + for (int i = cp1; i > p0; i--) + { + frames[i]->bScenecut = false; + } + } + + /* Where A-F are scenes: AAAAABBCCDDEEFFFFFF + * If each of BB ... EE are shorter than (maxp1-p0), they are + * detected as flashes and not considered scenecuts. + * Instead, the first F frame becomes a scenecut. + * If the video ends before F, no frame becomes a scenecut. */ + for (int cp0 = p0; cp0 <= maxp1; cp0++) + { + if (origmaxp1 > maxSearch || (cp0 < maxp1 && scenecutInternal(frames, cp0, maxp1, false))) + /* If cur_p0 is the p0 of a scenecut, it cannot be the p1 of a scenecut. */ + frames[cp0]->bScenecut = false; + } + } + + /* Ignore frames that are part of a flash, i.e. cannot be real scenecuts. 
*/ + if (!frames[p1]->bScenecut) + return false; + return scenecutInternal(frames, p0, p1, bRealScenecut); +} + +bool Lookahead::scenecutInternal(Lowres **frames, int p0, int p1, bool bRealScenecut) +{ + Lowres *frame = frames[p1]; + + m_est.estimateFrameCost(frames, p0, p1, p1, 0); + + int64_t icost = frame->costEst[0][0]; + int64_t pcost = frame->costEst[p1 - p0][0]; + int gopSize = frame->frameNum - m_lastKeyframe; + float threshMax = (float)(m_param->scenecutThreshold / 100.0); + + /* magic numbers pulled out of thin air */ + float threshMin = (float)(threshMax * 0.25); + float bias; + + if (m_param->keyframeMin == m_param->keyframeMax) + threshMin = threshMax; + if (gopSize <= m_param->keyframeMin / 4) + bias = threshMin / 4; + else if (gopSize <= m_param->keyframeMin) + bias = threshMin * gopSize / m_param->keyframeMin; + else + { + bias = threshMin + + (threshMax - threshMin) + * (gopSize - m_param->keyframeMin) + / (m_param->keyframeMax - m_param->keyframeMin); + } + + bool res = pcost >= (1.0 - bias) * icost; + if (res && bRealScenecut) + { + int imb = frame->intraMbs[p1 - p0]; + int pmb = NUM_CUS - imb; + x265_log(m_param, X265_LOG_DEBUG, "scene cut at %d Icost:%d Pcost:%d ratio:%.4f bias:%.4f gop:%d (imb:%d pmb:%d)\n", + frame->frameNum, icost, pcost, 1. 
- (double)pcost / icost, bias, gopSize, imb, pmb); + } + return res; +} + +void Lookahead::slicetypePath(Lowres **frames, int length, char(*best_paths)[X265_LOOKAHEAD_MAX + 1]) +{ + char paths[2][X265_LOOKAHEAD_MAX + 1]; + int num_paths = X265_MIN(m_param->bframes + 1, length); + int64_t best_cost = 1LL << 62; + int idx = 0; + + /* Iterate over all currently possible paths */ + for (int path = 0; path < num_paths; path++) + { + /* Add suffixes to the current path */ + int len = length - (path + 1); + memcpy(paths[idx], best_paths[len % (X265_BFRAME_MAX + 1)], len); + memset(paths[idx] + len, 'B', path); + strcpy(paths[idx] + len + path, "P"); + + /* Calculate the actual cost of the current path */ + int64_t cost = slicetypePathCost(frames, paths[idx], best_cost); + if (cost < best_cost) + { + best_cost = cost; + idx ^= 1; + } + } + + /* Store the best path. */ + memcpy(best_paths[length % (X265_BFRAME_MAX + 1)], paths[idx ^ 1], length); +} + +int64_t Lookahead::slicetypePathCost(Lowres **frames, char *path, int64_t threshold) +{ + int64_t cost = 0; + int loc = 1; + int cur_p = 0; + + path--; /* Since the 1st path element is really the second frame */ + while (path[loc]) + { + int next_p = loc; + /* Find the location of the next P-frame. 
*/ + while (path[next_p] != 'P') + { + next_p++; + } + + /* Add the cost of the P-frame found above */ + cost += m_est.estimateFrameCost(frames, cur_p, next_p, next_p, 0); + /* Early terminate if the cost we have found is larger than the best path cost so far */ + if (cost > threshold) + break; + + if (m_param->bBPyramid && next_p - cur_p > 2) + { + int middle = cur_p + (next_p - cur_p) / 2; + cost += m_est.estimateFrameCost(frames, cur_p, next_p, middle, 0); + for (int next_b = loc; next_b < middle && cost < threshold; next_b++) + { + cost += m_est.estimateFrameCost(frames, cur_p, middle, next_b, 0); + } + + for (int next_b = middle + 1; next_b < next_p && cost < threshold; next_b++) + { + cost += m_est.estimateFrameCost(frames, middle, next_p, next_b, 0); + } + } + else + { + for (int next_b = loc; next_b < next_p && cost < threshold; next_b++) + { + cost += m_est.estimateFrameCost(frames, cur_p, next_p, next_b, 0); + } + } + + loc = next_p + 1; + cur_p = next_p; + } + + return cost; +} + +void Lookahead::cuTree(Lowres **frames, int numframes, bool bIntra) +{ + int idx = !bIntra; + int lastnonb, curnonb = 1; + int bframes = 0; + + x265_emms(); + double totalDuration = 0.0; + for (int j = 0; j <= numframes; j++) + totalDuration += (double)m_param->fpsDenom / m_param->fpsNum; + + double averageDuration = totalDuration / (numframes + 1); + + int i = numframes; + int cuCount = m_widthInCU * m_heightInCU; + + if (bIntra) + m_est.estimateFrameCost(frames, 0, 0, 0, 0); + + while (i > 0 && frames[i]->sliceType == X265_TYPE_B) + i--; + + lastnonb = i; + + /* Lookaheadless MB-tree is not a theoretically distinct case; the same extrapolation could + * be applied to the end of a lookahead buffer of any size. However, it's most needed when + * lookahead=0, so that's what's currently implemented. 
*/ + if (!m_param->lookaheadDepth) + { + if (bIntra) + { + memset(frames[0]->propagateCost, 0, cuCount * sizeof(uint16_t)); + memcpy(frames[0]->qpCuTreeOffset, frames[0]->qpAqOffset, cuCount * sizeof(double)); + return; + } + std::swap(frames[lastnonb]->propagateCost, frames[0]->propagateCost); + memset(frames[0]->propagateCost, 0, cuCount * sizeof(uint16_t)); + } + else + { + if (lastnonb < idx) + return; + memset(frames[lastnonb]->propagateCost, 0, cuCount * sizeof(uint16_t)); + } + + while (i-- > idx) + { + curnonb = i; + while (frames[curnonb]->sliceType == X265_TYPE_B && curnonb > 0) + curnonb--; + + if (curnonb < idx) + break; + + m_est.estimateFrameCost(frames, curnonb, lastnonb, lastnonb, 0); + memset(frames[curnonb]->propagateCost, 0, cuCount * sizeof(uint16_t)); + bframes = lastnonb - curnonb - 1; + if (m_param->bBPyramid && bframes > 1) + { + int middle = (bframes + 1) / 2 + curnonb; + m_est.estimateFrameCost(frames, curnonb, lastnonb, middle, 0); + memset(frames[middle]->propagateCost, 0, cuCount * sizeof(uint16_t)); + while (i > curnonb) + { + int p0 = i > middle ? middle : curnonb; + int p1 = i < middle ? 
middle : lastnonb; + if (i != middle) + { + m_est.estimateFrameCost(frames, p0, p1, i, 0); + estimateCUPropagate(frames, averageDuration, p0, p1, i, 0); + } + i--; + } + + estimateCUPropagate(frames, averageDuration, curnonb, lastnonb, middle, 1); + } + else + { + while (i > curnonb) + { + m_est.estimateFrameCost(frames, curnonb, lastnonb, i, 0); + estimateCUPropagate(frames, averageDuration, curnonb, lastnonb, i, 0); + i--; + } + } + estimateCUPropagate(frames, averageDuration, curnonb, lastnonb, lastnonb, 1); + lastnonb = curnonb; + } + + if (!m_param->lookaheadDepth) + { + m_est.estimateFrameCost(frames, 0, lastnonb, lastnonb, 0); + estimateCUPropagate(frames, averageDuration, 0, lastnonb, lastnonb, 1); + std::swap(frames[lastnonb]->propagateCost, frames[0]->propagateCost); + } + + cuTreeFinish(frames[lastnonb], averageDuration, lastnonb); + if (m_param->bBPyramid && bframes > 1 && !m_param->rc.vbvBufferSize) + cuTreeFinish(frames[lastnonb + (bframes + 1) / 2], averageDuration, 0); +} + +void Lookahead::estimateCUPropagate(Lowres **frames, double averageDuration, int p0, int p1, int b, int referenced) +{ + uint16_t *refCosts[2] = { frames[p0]->propagateCost, frames[p1]->propagateCost }; + int32_t distScaleFactor = (((b - p0) << 8) + ((p1 - p0) >> 1)) / (p1 - p0); + int32_t bipredWeight = m_param->bEnableWeightedBiPred ? 64 - (distScaleFactor >> 2) : 32; + MV *mvs[2] = { frames[b]->lowresMvs[0][b - p0 - 1], frames[b]->lowresMvs[1][p1 - b - 1] }; + int32_t bipredWeights[2] = { bipredWeight, 64 - bipredWeight }; + + memset(m_scratch, 0, m_widthInCU * sizeof(int)); + + uint16_t *propagateCost = frames[b]->propagateCost; + + x265_emms(); + double fpsFactor = CLIP_DURATION((double)m_param->fpsDenom / m_param->fpsNum) / CLIP_DURATION(averageDuration); + + /* For non-referred frames the source costs are always zero, so just memset one row and re-use it. 
*/ + if (!referenced) + memset(frames[b]->propagateCost, 0, m_widthInCU * sizeof(uint16_t)); + + int32_t StrideInCU = m_widthInCU; + for (uint16_t blocky = 0; blocky < m_heightInCU; blocky++) + { + int cuIndex = blocky * StrideInCU; + primitives.propagateCost(m_scratch, propagateCost, + frames[b]->intraCost + cuIndex, frames[b]->lowresCosts[b - p0][p1 - b] + cuIndex, + frames[b]->invQscaleFactor + cuIndex, &fpsFactor, m_widthInCU); + + if (referenced) + propagateCost += m_widthInCU; + for (uint16_t blockx = 0; blockx < m_widthInCU; blockx++, cuIndex++) + { + int32_t propagate_amount = m_scratch[blockx]; + /* Don't propagate for an intra block. */ + if (propagate_amount > 0) + { + /* Access width-2 bitfield. */ + int32_t lists_used = frames[b]->lowresCosts[b - p0][p1 - b][cuIndex] >> LOWRES_COST_SHIFT; + /* Follow the MVs to the previous frame(s). */ + for (uint16_t list = 0; list < 2; list++) + { + if ((lists_used >> list) & 1) + { +#define CLIP_ADD(s, x) (s) = (uint16_t)X265_MIN((s) + (x), (1 << 16) - 1) + int32_t listamount = propagate_amount; + /* Apply bipred weighting. */ + if (lists_used == 3) + listamount = (listamount * bipredWeights[list] + 32) >> 6; + + /* Early termination for simple case of mv0. */ + if (!mvs[list][cuIndex].word) + { + CLIP_ADD(refCosts[list][cuIndex], listamount); + continue; + } + + int32_t x = mvs[list][cuIndex].x; + int32_t y = mvs[list][cuIndex].y; + int32_t cux = (x >> 5) + blockx; + int32_t cuy = (y >> 5) + blocky; + int32_t idx0 = cux + cuy * StrideInCU; + int32_t idx1 = idx0 + 1; + int32_t idx2 = idx0 + StrideInCU; + int32_t idx3 = idx0 + StrideInCU + 1; + x &= 31; + y &= 31; + int32_t idx0weight = (32 - y) * (32 - x); + int32_t idx1weight = (32 - y) * x; + int32_t idx2weight = y * (32 - x); + int32_t idx3weight = y * x; + + /* We could just clip the MVs, but pixels that lie outside the frame probably shouldn't + * be counted. 
*/ + if (cux < m_widthInCU - 1 && cuy < m_heightInCU - 1 && cux >= 0 && cuy >= 0) + { + CLIP_ADD(refCosts[list][idx0], (listamount * idx0weight + 512) >> 10); + CLIP_ADD(refCosts[list][idx1], (listamount * idx1weight + 512) >> 10); + CLIP_ADD(refCosts[list][idx2], (listamount * idx2weight + 512) >> 10); + CLIP_ADD(refCosts[list][idx3], (listamount * idx3weight + 512) >> 10); + } + else /* Check offsets individually */ + { + if (cux < m_widthInCU && cuy < m_heightInCU && cux >= 0 && cuy >= 0) + CLIP_ADD(refCosts[list][idx0], (listamount * idx0weight + 512) >> 10); + if (cux + 1 < m_widthInCU && cuy < m_heightInCU && cux + 1 >= 0 && cuy >= 0) + CLIP_ADD(refCosts[list][idx1], (listamount * idx1weight + 512) >> 10); + if (cux < m_widthInCU && cuy + 1 < m_heightInCU && cux >= 0 && cuy + 1 >= 0) + CLIP_ADD(refCosts[list][idx2], (listamount * idx2weight + 512) >> 10); + if (cux + 1 < m_widthInCU && cuy + 1 < m_heightInCU && cux + 1 >= 0 && cuy + 1 >= 0) + CLIP_ADD(refCosts[list][idx3], (listamount * idx3weight + 512) >> 10); + } + } + } + } + } + } + + if (m_param->rc.vbvBufferSize && m_param->lookaheadDepth && referenced) + cuTreeFinish(frames[b], averageDuration, b == p1 ? b - p0 : 0); +} + +void Lookahead::cuTreeFinish(Lowres *frame, double averageDuration, int ref0Distance) +{ + int fpsFactor = (int)(CLIP_DURATION(averageDuration) / CLIP_DURATION((double)m_param->fpsDenom / m_param->fpsNum) * 256); + double weightdelta = 0.0; + + if (ref0Distance && frame->weightedCostDelta[ref0Distance - 1] > 0) + weightdelta = (1.0 - frame->weightedCostDelta[ref0Distance - 1]); + + /* Allow the strength to be adjusted via qcompress, since the two + * concepts are very similar. 
*/ + + int cuCount = m_widthInCU * m_heightInCU; + double strength = 5.0 * (1.0 - m_param->rc.qCompress); + + for (int cuIndex = 0; cuIndex < cuCount; cuIndex++) + { + int intracost = (frame->intraCost[cuIndex] * frame->invQscaleFactor[cuIndex] + 128) >> 8; + if (intracost) + { + int propagateCost = (frame->propagateCost[cuIndex] * fpsFactor + 128) >> 8; + double log2_ratio = X265_LOG2(intracost + propagateCost) - X265_LOG2(intracost) + weightdelta; + frame->qpCuTreeOffset[cuIndex] = frame->qpAqOffset[cuIndex] - strength * log2_ratio; + } + } +} + +/* If MB-tree changes the quantizers, we need to recalculate the frame cost without + * re-running lookahead. */ +int64_t Lookahead::frameCostRecalculate(Lowres** frames, int p0, int p1, int b) +{ + int64_t score = 0; + int *rowSatd = frames[b]->rowSatds[b - p0][p1 - b]; + double *qp_offset = (frames[b]->sliceType == X265_TYPE_B) ? frames[b]->qpAqOffset : frames[b]->qpCuTreeOffset; + + x265_emms(); + for (int cuy = m_heightInCU - 1; cuy >= 0; cuy--) + { + rowSatd[cuy] = 0; + for (int cux = m_widthInCU - 1; cux >= 0; cux--) + { + int cuxy = cux + cuy * m_widthInCU; + int cuCost = frames[b]->lowresCosts[b - p0][p1 - b][cuxy] & LOWRES_COST_MASK; + double qp_adj = qp_offset[cuxy]; + cuCost = (cuCost * x265_exp2fix8(qp_adj) + 128) >> 8; + rowSatd[cuy] += cuCost; + if ((cuy > 0 && cuy < m_heightInCU - 1 && + cux > 0 && cux < m_widthInCU - 1) || + m_widthInCU <= 2 || m_heightInCU <= 2) + { + score += cuCost; + } + } + } + + return score; +} + +CostEstimate::CostEstimate(ThreadPool *p) + : WaveFront(p) +{ + m_param = NULL; + m_curframes = NULL; + m_wbuffer[0] = m_wbuffer[1] = m_wbuffer[2] = m_wbuffer[3] = 0; + m_rows = NULL; + m_paddedLines = m_widthInCU = m_heightInCU = 0; + m_bDoSearch[0] = m_bDoSearch[1] = false; + m_curb = m_curp0 = m_curp1 = 0; + m_bFrameCompleted = false; +} + +CostEstimate::~CostEstimate() +{ + for (int i = 0; i < 4; i++) + { + x265_free(m_wbuffer[i]); + } + + delete[] m_rows; +} + +void 
CostEstimate::init(x265_param *_param, Frame *curFrame) +{ + m_param = _param; + m_widthInCU = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS; + m_heightInCU = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS; + + m_rows = new EstimateRow[m_heightInCU]; + for (int i = 0; i < m_heightInCU; i++) + { + m_rows[i].m_widthInCU = m_widthInCU; + m_rows[i].m_heightInCU = m_heightInCU; + m_rows[i].m_param = m_param; + } + + if (WaveFront::init(m_heightInCU)) + WaveFront::enableAllRows(); + else + m_pool = NULL; + + if (m_param->bEnableWeightedPred) + { + PicYuv *orig = curFrame->m_origPicYuv; + m_paddedLines = curFrame->m_lowres.lines + 2 * orig->m_lumaMarginY; + intptr_t padoffset = curFrame->m_lowres.lumaStride * orig->m_lumaMarginY + orig->m_lumaMarginX; + + /* allocate weighted lowres buffers */ + for (int i = 0; i < 4; i++) + { + m_wbuffer[i] = (pixel*)x265_malloc(sizeof(pixel) * (curFrame->m_lowres.lumaStride * m_paddedLines)); + m_weightedRef.lowresPlane[i] = m_wbuffer[i] + padoffset; + } + + m_weightedRef.fpelPlane = m_weightedRef.lowresPlane[0]; + m_weightedRef.lumaStride = curFrame->m_lowres.lumaStride; + m_weightedRef.isLowres = true; + m_weightedRef.isWeighted = false; + } +} + +int64_t CostEstimate::estimateFrameCost(Lowres **frames, int p0, int p1, int b, bool bIntraPenalty) +{ + int64_t score = 0; + Lowres *fenc = frames[b]; + + if (fenc->costEst[b - p0][p1 - b] >= 0 && fenc->rowSatds[b - p0][p1 - b][0] != -1) + score = fenc->costEst[b - p0][p1 - b]; + else + { + m_weightedRef.isWeighted = false; + if (m_param->bEnableWeightedPred && b == p1 && b != p0 && fenc->lowresMvs[0][b - p0 - 1][0].x == 0x7FFF) + { + if (!fenc->bIntraCalculated) + estimateFrameCost(frames, b, b, b, 0); + weightsAnalyse(frames, b, p0); + } + + /* For each list, check to see whether we have lowres motion-searched this reference */ + m_bDoSearch[0] = b != p0 && fenc->lowresMvs[0][b - p0 - 1][0].x == 0x7FFF; + m_bDoSearch[1] = b 
!= p1 && fenc->lowresMvs[1][p1 - b - 1][0].x == 0x7FFF; + + if (m_bDoSearch[0]) fenc->lowresMvs[0][b - p0 - 1][0].x = 0; + if (m_bDoSearch[1]) fenc->lowresMvs[1][p1 - b - 1][0].x = 0; + + m_curb = b; + m_curp0 = p0; + m_curp1 = p1; + m_curframes = frames; + fenc->costEst[b - p0][p1 - b] = 0; + fenc->costEstAq[b - p0][p1 - b] = 0; + + for (int i = 0; i < m_heightInCU; i++) + { + m_rows[i].init(); + m_rows[i].m_me.setSourcePlane(fenc->lowresPlane[0], fenc->lumaStride); + if (!fenc->bIntraCalculated) + fenc->rowSatds[0][0][i] = 0; + fenc->rowSatds[b - p0][p1 - b][i] = 0; + } + + m_bFrameCompleted = false; + + if (m_pool) + { + WaveFront::enqueue(); + + // enableAllRows must be already called + enqueueRow(0); + while (!m_bFrameCompleted) + WaveFront::findJob(-1); + + WaveFront::dequeue(); + } + else + { + for (int row = 0; row < m_heightInCU; row++) + processRow(row, -1); + + x265_emms(); + } + + // Accumulate cost from each row + for (int row = 0; row < m_heightInCU; row++) + { + score += m_rows[row].m_costEst; + fenc->costEst[0][0] += m_rows[row].m_costIntra; + if (m_param->rc.aqMode) + { + fenc->costEstAq[0][0] += m_rows[row].m_costIntraAq; + fenc->costEstAq[b - p0][p1 - b] += m_rows[row].m_costEstAq; + } + fenc->intraMbs[b - p0] += m_rows[row].m_intraMbs; + } + + fenc->bIntraCalculated = true; + + if (b != p1) + score = (uint64_t)score * 100 / (130 + m_param->bFrameBias); + if (b != p0 || b != p1) //Not Intra cost + fenc->costEst[b - p0][p1 - b] = score; + } + + if (bIntraPenalty) + { + // arbitrary penalty for I-blocks after B-frames + int ncu = NUM_CUS; + score += (uint64_t)score * fenc->intraMbs[b - p0] / (ncu * 8); + } + return score; +} + +uint32_t CostEstimate::weightCostLuma(Lowres **frames, int b, int p0, WeightParam *wp) +{ + Lowres *fenc = frames[b]; + Lowres *ref = frames[p0]; + pixel *src = ref->fpelPlane; + intptr_t stride = fenc->lumaStride; + + if (wp) + { + int offset = wp->inputOffset << (X265_DEPTH - 8); + int scale = wp->inputWeight; + int denom 
= wp->log2WeightDenom; + int round = denom ? 1 << (denom - 1) : 0; + int correction = IF_INTERNAL_PREC - X265_DEPTH; // intermediate interpolation depth + int widthHeight = (int)stride; + + primitives.weight_pp(ref->buffer[0], m_wbuffer[0], stride, widthHeight, m_paddedLines, + scale, round << correction, denom + correction, offset); + src = m_weightedRef.fpelPlane; + } + + uint32_t cost = 0; + intptr_t pixoff = 0; + int mb = 0; + + for (int y = 0; y < fenc->lines; y += 8, pixoff = y * stride) + { + for (int x = 0; x < fenc->width; x += 8, mb++, pixoff += 8) + { + int satd = primitives.satd[LUMA_8x8](src + pixoff, stride, fenc->fpelPlane + pixoff, stride); + cost += X265_MIN(satd, fenc->intraCost[mb]); + } + } + + return cost; +} + +void CostEstimate::weightsAnalyse(Lowres **frames, int b, int p0) +{ + static const float epsilon = 1.f / 128.f; + Lowres *fenc, *ref; + + fenc = frames[b]; + ref = frames[p0]; + int deltaIndex = fenc->frameNum - ref->frameNum; + + /* epsilon is chosen to require at least a numerator of 127 (with denominator = 128) */ + float guessScale, fencMean, refMean; + x265_emms(); + if (fenc->wp_ssd[0] && ref->wp_ssd[0]) + guessScale = sqrtf((float)fenc->wp_ssd[0] / ref->wp_ssd[0]); + else + guessScale = 1.0f; + fencMean = (float)fenc->wp_sum[0] / (fenc->lines * fenc->width) / (1 << (X265_DEPTH - 8)); + refMean = (float)ref->wp_sum[0] / (fenc->lines * fenc->width) / (1 << (X265_DEPTH - 8)); + + /* Early termination */ + if (fabsf(refMean - fencMean) < 0.5f && fabsf(1.f - guessScale) < epsilon) + return; + + int minoff = 0, minscale, mindenom; + unsigned int minscore = 0, origscore = 1; + int found = 0; + + m_w.setFromWeightAndOffset((int)(guessScale * 128 + 0.5f), 0, 7, true); + mindenom = m_w.log2WeightDenom; + minscale = m_w.inputWeight; + + origscore = minscore = weightCostLuma(frames, b, p0, NULL); + + if (!minscore) + return; + + unsigned int s = 0; + int curScale = minscale; + int curOffset = (int)(fencMean - refMean * curScale / (1 << 
mindenom) + 0.5f); + if (curOffset < -128 || curOffset > 127) + { + /* Rescale considering the constraints on curOffset. We do it in this order + * because scale has a much wider range than offset (because of denom), so + * it should almost never need to be clamped. */ + curOffset = Clip3(-128, 127, curOffset); + curScale = (int)((1 << mindenom) * (fencMean - curOffset) / refMean + 0.5f); + curScale = Clip3(0, 127, curScale); + } + SET_WEIGHT(m_w, 1, curScale, mindenom, curOffset); + s = weightCostLuma(frames, b, p0, &m_w); + COPY4_IF_LT(minscore, s, minscale, curScale, minoff, curOffset, found, 1); + + /* Use a smaller denominator if possible */ + while (mindenom > 0 && !(minscale & 1)) + { + mindenom--; + minscale >>= 1; + } + + if (!found || (minscale == 1 << mindenom && minoff == 0) || (float)minscore / origscore > 0.998f) + return; + else + { + SET_WEIGHT(m_w, 1, minscale, mindenom, minoff); + // set weighted delta cost + fenc->weightedCostDelta[deltaIndex] = minscore / origscore; + + int offset = m_w.inputOffset << (X265_DEPTH - 8); + int scale = m_w.inputWeight; + int denom = m_w.log2WeightDenom; + int round = denom ? 1 << (denom - 1) : 0; + int correction = IF_INTERNAL_PREC - X265_DEPTH; // intermediate interpolation depth + intptr_t stride = ref->lumaStride; + int widthHeight = (int)stride; + + for (int i = 0; i < 4; i++) + primitives.weight_pp(ref->buffer[i], m_wbuffer[i], stride, widthHeight, m_paddedLines, + scale, round << correction, denom + correction, offset); + + m_weightedRef.isWeighted = true; + } +} + +void CostEstimate::processRow(int row, int /*threadId*/) +{ + int realrow = m_heightInCU - 1 - row; + Lowres **frames = m_curframes; + ReferencePlanes *wfref0 = m_weightedRef.isWeighted ? &m_weightedRef : frames[m_curp0]; + + /* Lowres lookahead goes backwards because the MVs are used as + * predictors in the main encode. This considerably improves MV + * prediction overall. 
*/ + for (int i = m_widthInCU - 1 - m_rows[row].m_completed; i >= 0; i--) + { + // TODO: use lowres MVs as motion candidates in full-res search + m_rows[row].estimateCUCost(frames, wfref0, i, realrow, m_curp0, m_curp1, m_curb, m_bDoSearch); + m_rows[row].m_completed++; + + if (m_rows[row].m_completed >= 2 && row < m_heightInCU - 1) + { + ScopedLock below(m_rows[row + 1].m_lock); + if (m_rows[row + 1].m_active == false && + m_rows[row + 1].m_completed + 2 <= m_rows[row].m_completed) + { + m_rows[row + 1].m_active = true; + enqueueRow(row + 1); + } + } + + ScopedLock self(m_rows[row].m_lock); + if (row > 0 && (int32_t)m_rows[row].m_completed < m_widthInCU - 1 && + m_rows[row - 1].m_completed < m_rows[row].m_completed + 2) + { + m_rows[row].m_active = false; + return; + } + } + + if (row == m_heightInCU - 1) + m_bFrameCompleted = true; +} + +void EstimateRow::init() +{ + m_costEst = 0; + m_costEstAq = 0; + m_costIntra = 0; + m_costIntraAq = 0; + m_intraMbs = 0; + m_active = false; + m_completed = 0; +} + +void EstimateRow::estimateCUCost(Lowres **frames, ReferencePlanes *wfref0, int cux, int cuy, int p0, int p1, int b, bool bDoSearch[2]) +{ + Lowres *fref1 = frames[p1]; + Lowres *fenc = frames[b]; + + const int bBidir = (b < p1); + const int cuXY = cux + cuy * m_widthInCU; + const int cuSize = X265_LOWRES_CU_SIZE; + const intptr_t pelOffset = cuSize * cux + cuSize * cuy * fenc->lumaStride; + + // should this CU's cost contribute to the frame cost? + const bool bFrameScoreCU = (cux > 0 && cux < m_widthInCU - 1 && + cuy > 0 && cuy < m_heightInCU - 1) || m_widthInCU <= 2 || m_heightInCU <= 2; + + m_me.setSourcePU(pelOffset, cuSize, cuSize); + + /* A small, arbitrary bias to avoid VBV problems caused by zero-residual lookahead blocks. 
*/ + int lowresPenalty = 4; + + MV(*fenc_mvs[2]) = { &fenc->lowresMvs[0][b - p0 - 1][cuXY], + &fenc->lowresMvs[1][p1 - b - 1][cuXY] }; + int(*fenc_costs[2]) = { &fenc->lowresMvCosts[0][b - p0 - 1][cuXY], + &fenc->lowresMvCosts[1][p1 - b - 1][cuXY] }; + + MV mvmin, mvmax; + int bcost = m_me.COST_MAX; + int listused = 0; + + // establish search bounds that don't cross extended frame boundaries + mvmin.x = (int16_t)(-cux * cuSize - 8); + mvmin.y = (int16_t)(-cuy * cuSize - 8); + mvmax.x = (int16_t)((m_widthInCU - cux - 1) * cuSize + 8); + mvmax.y = (int16_t)((m_heightInCU - cuy - 1) * cuSize + 8); + + if (p0 != p1) + { + for (int i = 0; i < 1 + bBidir; i++) + { + if (!bDoSearch[i]) + { + /* Use previously calculated cost */ + COPY2_IF_LT(bcost, *fenc_costs[i], listused, i + 1); + continue; + } + int numc = 0; + MV mvc[4], mvp; + MV *fenc_mv = fenc_mvs[i]; + + /* Reverse-order MV prediction. */ + mvc[0] = 0; + mvc[2] = 0; +#define MVC(mv) mvc[numc++] = mv; + if (cux < m_widthInCU - 1) + MVC(fenc_mv[1]); + if (cuy < m_heightInCU - 1) + { + MVC(fenc_mv[m_widthInCU]); + if (cux > 0) + MVC(fenc_mv[m_widthInCU - 1]); + if (cux < m_widthInCU - 1) + MVC(fenc_mv[m_widthInCU + 1]); + } +#undef MVC + if (numc <= 1) + mvp = mvc[0]; + else + { + median_mv(mvp, mvc[0], mvc[1], mvc[2]); + } + + *fenc_costs[i] = m_me.motionEstimate(i ? 
fref1 : wfref0, mvmin, mvmax, mvp, numc, mvc, m_merange, *fenc_mvs[i]); + COPY2_IF_LT(bcost, *fenc_costs[i], listused, i + 1); + } + if (bBidir) + { + pixel subpelbuf0[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE], subpelbuf1[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]; + intptr_t stride0 = X265_LOWRES_CU_SIZE, stride1 = X265_LOWRES_CU_SIZE; + pixel *src0 = wfref0->lowresMC(pelOffset, *fenc_mvs[0], subpelbuf0, stride0); + pixel *src1 = fref1->lowresMC(pelOffset, *fenc_mvs[1], subpelbuf1, stride1); + + pixel ref[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]; + primitives.pixelavg_pp[LUMA_8x8](ref, X265_LOWRES_CU_SIZE, src0, stride0, src1, stride1, 32); + int bicost = primitives.satd[LUMA_8x8](fenc->lowresPlane[0] + pelOffset, fenc->lumaStride, ref, X265_LOWRES_CU_SIZE); + COPY2_IF_LT(bcost, bicost, listused, 3); + + // Try 0,0 candidates + src0 = wfref0->lowresPlane[0] + pelOffset; + src1 = fref1->lowresPlane[0] + pelOffset; + primitives.pixelavg_pp[LUMA_8x8](ref, X265_LOWRES_CU_SIZE, src0, wfref0->lumaStride, src1, fref1->lumaStride, 32); + bicost = primitives.satd[LUMA_8x8](fenc->lowresPlane[0] + pelOffset, fenc->lumaStride, ref, X265_LOWRES_CU_SIZE); + COPY2_IF_LT(bcost, bicost, listused, 3); + } + } + if (!fenc->bIntraCalculated) + { + const int sizeIdx = X265_LOWRES_CU_BITS - 2; // partition size + + pixel _above0[X265_LOWRES_CU_SIZE * 4 + 1], *const above0 = _above0 + 2 * X265_LOWRES_CU_SIZE; + pixel _above1[X265_LOWRES_CU_SIZE * 4 + 1], *const above1 = _above1 + 2 * X265_LOWRES_CU_SIZE; + pixel _left0[X265_LOWRES_CU_SIZE * 4 + 1], *const left0 = _left0 + 2 * X265_LOWRES_CU_SIZE; + pixel _left1[X265_LOWRES_CU_SIZE * 4 + 1], *const left1 = _left1 + 2 * X265_LOWRES_CU_SIZE; + + pixel *pix_cur = fenc->lowresPlane[0] + pelOffset; + + // Copy Above + memcpy(above0, pix_cur - 1 - fenc->lumaStride, (cuSize + 1) * sizeof(pixel)); + + // Copy Left + for (int i = 0; i < cuSize + 1; i++) + { + left0[i] = pix_cur[-1 - fenc->lumaStride + i * fenc->lumaStride]; + } + + for (int i 
= 0; i < cuSize; i++) + { + above0[cuSize + i + 1] = above0[cuSize]; + left0[cuSize + i + 1] = left0[cuSize]; + } + + // filtering with [1 2 1] + // assume getUseStrongIntraSmoothing() is disabled + above1[0] = above0[0]; + above1[2 * cuSize] = above0[2 * cuSize]; + left1[0] = left0[0]; + left1[2 * cuSize] = left0[2 * cuSize]; + for (int i = 1; i < 2 * cuSize; i++) + { + above1[i] = (above0[i - 1] + 2 * above0[i] + above0[i + 1] + 2) >> 2; + left1[i] = (left0[i - 1] + 2 * left0[i] + left0[i + 1] + 2) >> 2; + } + + int predsize = cuSize * cuSize; + + // generate 35 intra predictions into m_predictions + pixelcmp_t satd = primitives.satd[partitionFromLog2Size(X265_LOWRES_CU_BITS)]; + int icost = m_me.COST_MAX, cost; + primitives.intra_pred[DC_IDX][sizeIdx](m_predictions, cuSize, left0, above0, 0, (cuSize <= 16)); + cost = satd(m_me.fenc, FENC_STRIDE, m_predictions, cuSize); + if (cost < icost) + icost = cost; + pixel *above = (cuSize >= 8) ? above1 : above0; + pixel *left = (cuSize >= 8) ? 
left1 : left0; + primitives.intra_pred[PLANAR_IDX][sizeIdx](m_predictions, cuSize, left, above, 0, 0); + cost = satd(m_me.fenc, FENC_STRIDE, m_predictions, cuSize); + if (cost < icost) + icost = cost; + primitives.intra_pred_allangs[sizeIdx](m_predictions + 2 * predsize, above0, left0, above1, left1, (cuSize <= 16)); + + // calculate satd costs, keep least cost + ALIGN_VAR_32(pixel, buf_trans[32 * 32]); + primitives.transpose[sizeIdx](buf_trans, m_me.fenc, FENC_STRIDE); + + int acost = m_me.COST_MAX; + uint32_t mode, lowmode = 4; + for (mode = 5; mode < 35; mode += 5) + { + if (mode < 18) + cost = satd(buf_trans, cuSize, &m_predictions[mode * predsize], cuSize); + else + cost = satd(m_me.fenc, FENC_STRIDE, &m_predictions[mode * predsize], cuSize); + COPY2_IF_LT(acost, cost, lowmode, mode); + } + for (uint32_t dist = 2; dist >= 1; dist--) + { + mode = lowmode - dist; + if (mode < 18) + cost = satd(buf_trans, cuSize, &m_predictions[mode * predsize], cuSize); + else + cost = satd(m_me.fenc, FENC_STRIDE, &m_predictions[mode * predsize], cuSize); + COPY2_IF_LT(acost, cost, lowmode, mode); + + mode = lowmode + dist; + if (mode < 18) + cost = satd(buf_trans, cuSize, &m_predictions[mode * predsize], cuSize); + else + cost = satd(m_me.fenc, FENC_STRIDE, &m_predictions[mode * predsize], cuSize); + COPY2_IF_LT(acost, cost, lowmode, mode); + } + if (acost < icost) + icost = acost; + + const int intraPenalty = 5 * m_lookAheadLambda; + icost += intraPenalty + lowresPenalty; /* estimate intra signal cost */ + fenc->intraCost[cuXY] = icost; + int icostAq = icost; + if (bFrameScoreCU) + { + m_costIntra += icost; + if (fenc->invQscaleFactor) + { + icostAq = (icost * fenc->invQscaleFactor[cuXY] + 128) >> 8; + m_costIntraAq += icostAq; + } + } + fenc->rowSatds[0][0][cuy] += icostAq; + } + bcost += lowresPenalty; + if (!bBidir) + { + if (fenc->intraCost[cuXY] < bcost) + { + if (bFrameScoreCU) m_intraMbs++; + bcost = fenc->intraCost[cuXY]; + listused = 0; + } + } + + /* For I frames 
these costs were accumulated earlier */ + if (p0 != p1) + { + int bcostAq = bcost; + if (bFrameScoreCU) + { + m_costEst += bcost; + if (fenc->invQscaleFactor) + { + bcostAq = (bcost * fenc->invQscaleFactor[cuXY] + 128) >> 8; + m_costEstAq += bcostAq; + } + } + fenc->rowSatds[b - p0][p1 - b][cuy] += bcostAq; + } + fenc->lowresCosts[b - p0][p1 - b][cuXY] = (uint16_t)(X265_MIN(bcost, LOWRES_COST_MASK) | (listused << LOWRES_COST_SHIFT)); +} diff --git a/source/encoder/slicetype.h b/source/encoder/slicetype.h new file mode 100644 index 0000000..8805e90 --- /dev/null +++ b/source/encoder/slicetype.h @@ -0,0 +1,189 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. 
+ *****************************************************************************/ + +#ifndef X265_SLICETYPE_H +#define X265_SLICETYPE_H + +#include "common.h" +#include "slice.h" +#include "motion.h" +#include "piclist.h" +#include "wavefront.h" + +namespace x265 { +// private namespace + +struct Lowres; +class Frame; + +#define LOWRES_COST_MASK ((1 << 14) - 1) +#define LOWRES_COST_SHIFT 14 + +#define SET_WEIGHT(w, b, s, d, o) \ + { \ + (w).inputWeight = (s); \ + (w).log2WeightDenom = (d); \ + (w).inputOffset = (o); \ + (w).bPresentFlag = b; \ + } + +class EstimateRow +{ +public: + x265_param* m_param; + MotionEstimate m_me; + Lock m_lock; + pixel* m_predictions; // buffer for 35 intra predictions + + volatile uint32_t m_completed; // Number of CUs in this row for which cost estimation is completed + volatile bool m_active; + + uint64_t m_costEst; // Estimated cost for all CUs in a row + uint64_t m_costEstAq; // Estimated weight Aq cost for all CUs in a row + uint64_t m_costIntraAq; // Estimated weighted Aq Intra cost for all CUs in a row + int m_intraMbs; // Number of Intra CUs + int m_costIntra; // Estimated Intra cost for all CUs in a row + + int m_merange; + int m_lookAheadLambda; + + int m_widthInCU; + int m_heightInCU; + + EstimateRow() + { + m_me.setQP(X265_LOOKAHEAD_QP); + m_me.setSearchMethod(X265_HEX_SEARCH); + m_me.setSubpelRefine(1); + m_predictions = X265_MALLOC(pixel, 35 * 8 * 8); + m_merange = 16; + m_lookAheadLambda = (int)x265_lambda_tab[X265_LOOKAHEAD_QP]; + } + + ~EstimateRow() + { + X265_FREE(m_predictions); + } + + void init(); + + void estimateCUCost(Lowres * *frames, ReferencePlanes * wfref0, int cux, int cuy, int p0, int p1, int b, bool bDoSearch[2]); +}; + +/* CostEstimate manages the cost estimation of a single frame, ie: + * estimateFrameCost() and everything below it in the call graph */ +class CostEstimate : public WaveFront +{ +public: + CostEstimate(ThreadPool *p); + ~CostEstimate(); + void init(x265_param *, Frame *); + + x265_param 
*m_param; + EstimateRow *m_rows; + pixel *m_wbuffer[4]; + Lowres **m_curframes; + + ReferencePlanes m_weightedRef; + WeightParam m_w; + + int m_paddedLines; // number of lines in padded frame + int m_widthInCU; // width of lowres frame in downscale CUs + int m_heightInCU; // height of lowres frame in downscale CUs + + bool m_bDoSearch[2]; + volatile bool m_bFrameCompleted; + int m_curb, m_curp0, m_curp1; + + void processRow(int row, int threadId); + int64_t estimateFrameCost(Lowres **frames, int p0, int p1, int b, bool bIntraPenalty); + +protected: + + void weightsAnalyse(Lowres **frames, int b, int p0); + uint32_t weightCostLuma(Lowres **frames, int b, int p0, WeightParam *w); +}; + +class Lookahead : public JobProvider +{ +public: + + Lookahead(x265_param *param, ThreadPool *pool); + ~Lookahead(); + void init(); + void destroy(); + + CostEstimate m_est; // Frame cost estimator + PicList m_inputQueue; // input pictures in order received + PicList m_outputQueue; // pictures to be encoded, in encode order + + x265_param *m_param; + Lowres *m_lastNonB; + int *m_scratch; // temp buffer + + int m_widthInCU; // width of lowres frame in downscale CUs + int m_heightInCU; // height of lowres frame in downscale CUs + int m_lastKeyframe; + int m_histogram[X265_BFRAME_MAX + 1]; + + void addPicture(Frame*, int sliceType); + void flush(); + Frame* getDecidedPicture(); + + void getEstimatedPictureCost(Frame *pic); + +protected: + + Lock m_inputQueueLock; + Lock m_outputQueueLock; + Lock m_decideLock; + Event m_outputAvailable; + volatile int m_bReady; + volatile bool m_bFilling; + volatile bool m_bFlushed; + bool findJob(int); + + /* called by addPicture() or flush() to trigger slice decisions */ + void slicetypeDecide(); + void slicetypeAnalyse(Lowres **frames, bool bKeyframe); + + /* called by slicetypeAnalyse() to make slice decisions */ + bool scenecut(Lowres **frames, int p0, int p1, bool bRealScenecut, int numFrames, int maxSearch); + bool scenecutInternal(Lowres **frames, 
int p0, int p1, bool bRealScenecut); + void slicetypePath(Lowres **frames, int length, char(*best_paths)[X265_LOOKAHEAD_MAX + 1]); + int64_t slicetypePathCost(Lowres **frames, char *path, int64_t threshold); + int64_t vbvFrameCost(Lowres **frames, int p0, int p1, int b); + void vbvLookahead(Lowres **frames, int numFrames, int keyframes); + + /* called by slicetypeAnalyse() to effect cuTree adjustments to adaptive + * quant offsets */ + void cuTree(Lowres **frames, int numframes, bool bintra); + void estimateCUPropagate(Lowres **frames, double average_duration, int p0, int p1, int b, int referenced); + void cuTreeFinish(Lowres *frame, double averageDuration, int ref0Distance); + + /* called by getEstimatedPictureCost() to finalize cuTree costs */ + int64_t frameCostRecalculate(Lowres **frames, int p0, int p1, int b); +}; +} + +#endif // ifndef X265_SLICETYPE_H diff --git a/source/encoder/weightPrediction.cpp b/source/encoder/weightPrediction.cpp new file mode 100644 index 0000000..3bf5a45 --- /dev/null +++ b/source/encoder/weightPrediction.cpp @@ -0,0 +1,534 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Author: Shazeb Nawaz Khan + * Steve Borho + * Kavitha Sampas + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. 
+ * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#include "common.h" +#include "frame.h" +#include "picyuv.h" +#include "lowres.h" +#include "mv.h" +#include "slicetype.h" +#include "bitstream.h" + +using namespace x265; +namespace { +struct Cache +{ + const int * intraCost; + int numPredDir; + int csp; + int hshift; + int vshift; + int lowresWidthInCU; + int lowresHeightInCU; +}; + +int sliceHeaderCost(WeightParam *w, int lambda, int bChroma) +{ + /* 4 times higher, because chroma is analyzed at full resolution. */ + if (bChroma) + lambda *= 4; + int denomCost = bs_size_ue(w[0].log2WeightDenom) * (2 - bChroma); + return lambda * (10 + denomCost + 2 * (bs_size_se(w[0].inputWeight) + bs_size_se(w[0].inputOffset))); +} + +/* make a motion compensated copy of lowres ref into mcout with the same stride. + * The borders of mcout are not extended */ +void mcLuma(pixel* mcout, Lowres& ref, const MV * mvs) +{ + intptr_t stride = ref.lumaStride; + const int cuSize = 8; + MV mvmin, mvmax; + + int cu = 0; + + for (int y = 0; y < ref.lines; y += cuSize) + { + intptr_t pixoff = y * stride; + mvmin.y = (int16_t)((-y - 8) << 2); + mvmax.y = (int16_t)((ref.lines - y - 1 + 8) << 2); + + for (int x = 0; x < ref.width; x += cuSize, pixoff += cuSize, cu++) + { + ALIGN_VAR_16(pixel, buf8x8[8 * 8]); + intptr_t bstride = 8; + mvmin.x = (int16_t)((-x - 8) << 2); + mvmax.x = (int16_t)((ref.width - x - 1 + 8) << 2); + + /* clip MV to available pixels */ + MV mv = mvs[cu]; + mv = mv.clipped(mvmin, mvmax); + pixel *tmp = ref.lowresMC(pixoff, mv, buf8x8, bstride); + primitives.luma_copy_pp[LUMA_8x8](mcout + pixoff, stride, tmp, bstride); + } + } +} + +/* use lowres MVs from lookahead to generate a motion compensated chroma plane. 
+ * if a block had cheaper lowres cost as intra, we treat it as MV 0 */ +void mcChroma(pixel * mcout, + pixel * src, + intptr_t stride, + const MV * mvs, + const Cache& cache, + int height, + int width) +{ + /* the motion vectors correspond to 8x8 lowres luma blocks, or 16x16 fullres + * luma blocks. We have to adapt block size to chroma csp */ + int csp = cache.csp; + int bw = 16 >> cache.hshift; + int bh = 16 >> cache.vshift; + MV mvmin, mvmax; + + for (int y = 0; y < height; y += bh) + { + /* note: lowres block count per row might be different from chroma block + * count per row because of rounding issues, so be very careful with indexing + * into the lowres structures */ + int cu = y * cache.lowresWidthInCU; + intptr_t pixoff = y * stride; + mvmin.y = (int16_t)((-y - 8) << 2); + mvmax.y = (int16_t)((height - y - 1 + 8) << 2); + + for (int x = 0; x < width; x += bw, cu++, pixoff += bw) + { + if (x < cache.lowresWidthInCU && y < cache.lowresHeightInCU) + { + MV mv = mvs[cu]; // lowres MV + mv <<= 1; // fullres MV + mv.x >>= cache.hshift; + mv.y >>= cache.vshift; + + /* clip MV to available pixels */ + mvmin.x = (int16_t)((-x - 8) << 2); + mvmax.x = (int16_t)((width - x - 1 + 8) << 2); + mv = mv.clipped(mvmin, mvmax); + + intptr_t fpeloffset = (mv.y >> 2) * stride + (mv.x >> 2); + pixel *temp = src + pixoff + fpeloffset; + + int xFrac = mv.x & 0x7; + int yFrac = mv.y & 0x7; + if ((yFrac | xFrac) == 0) + { + primitives.chroma[csp].copy_pp[LUMA_16x16](mcout + pixoff, stride, temp, stride); + } + else if (yFrac == 0) + { + primitives.chroma[csp].filter_hpp[LUMA_16x16](temp, stride, mcout + pixoff, stride, xFrac); + } + else if (xFrac == 0) + { + primitives.chroma[csp].filter_vpp[LUMA_16x16](temp, stride, mcout + pixoff, stride, yFrac); + } + else + { + ALIGN_VAR_16(int16_t, imm[16 * (16 + NTAPS_CHROMA)]); + primitives.chroma[csp].filter_hps[LUMA_16x16](temp, stride, imm, bw, xFrac, 1); + primitives.chroma[csp].filter_vsp[LUMA_16x16](imm + ((NTAPS_CHROMA >> 1) - 1) * 
bw, bw, mcout + pixoff, stride, yFrac); + } + } + else + { + primitives.chroma[csp].copy_pp[LUMA_16x16](mcout + pixoff, stride, src + pixoff, stride); + } + } + } +} + +/* Measure sum of 8x8 satd costs between source frame and reference + * frame (potentially weighted, potentially motion compensated). We + * always use source images for this analysis since reference recon + * pixels have unreliable availability */ +uint32_t weightCost(pixel * fenc, + pixel * ref, + pixel * weightTemp, + intptr_t stride, + const Cache & cache, + int width, + int height, + WeightParam * w, + bool bLuma) +{ + if (w) + { + /* make a weighted copy of the reference plane */ + int offset = w->inputOffset << (X265_DEPTH - 8); + int weight = w->inputWeight; + int denom = w->log2WeightDenom; + int round = denom ? 1 << (denom - 1) : 0; + int correction = IF_INTERNAL_PREC - X265_DEPTH; /* intermediate interpolation depth */ + int pwidth = ((width + 15) >> 4) << 4; + + primitives.weight_pp(ref, weightTemp, stride, pwidth, height, + weight, round << correction, denom + correction, offset); + ref = weightTemp; + } + + uint32_t cost = 0; + pixel *f = fenc, *r = ref; + + if (bLuma) + { + int cu = 0; + for (int y = 8; y < height; y += 8, r += 8 * stride, f += 8 * stride) + { + for (int x = 8; x < width; x += 8, cu++) + { + int cmp = primitives.satd[LUMA_8x8](r + x, stride, f + x, stride); + cost += X265_MIN(cmp, cache.intraCost[cu]); + } + } + } + else if (cache.csp == X265_CSP_I444) + for (int y = 16; y < height; y += 16, r += 16 * stride, f += 16 * stride) + for (int x = 16; x < width; x += 16) + cost += primitives.satd[LUMA_16x16](r + x, stride, f + x, stride); + else + for (int y = 8; y < height; y += 8, r += 8 * stride, f += 8 * stride) + for (int x = 8; x < width; x += 8) + cost += primitives.satd[LUMA_8x8](r + x, stride, f + x, stride); + + return cost; +} +} + +namespace x265 { +void weightAnalyse(Slice& slice, Frame& frame, x265_param& param) +{ + WeightParam wp[2][MAX_NUM_REF][3]; + PicYuv 
*fencPic = frame.m_origPicYuv; + Lowres& fenc = frame.m_lowres; + + Cache cache; + + memset(&cache, 0, sizeof(cache)); + cache.intraCost = fenc.intraCost; + cache.numPredDir = slice.isInterP() ? 1 : 2; + cache.lowresWidthInCU = fenc.width >> 3; + cache.lowresHeightInCU = fenc.lines >> 3; + cache.csp = fencPic->m_picCsp; + cache.hshift = CHROMA_H_SHIFT(cache.csp); + cache.vshift = CHROMA_V_SHIFT(cache.csp); + + /* Use single allocation for motion compensated ref and weight buffers */ + pixel *mcbuf = X265_MALLOC(pixel, 2 * fencPic->m_stride * fencPic->m_picHeight); + if (!mcbuf) + { + slice.disableWeights(); + return; + } + pixel *weightTemp = mcbuf + fencPic->m_stride * fencPic->m_picHeight; + + int lambda = (int)x265_lambda_tab[X265_LOOKAHEAD_QP]; + int curPoc = slice.m_poc; + const float epsilon = 1.f / 128.f; + + int chromaDenom, lumaDenom, denom; + chromaDenom = lumaDenom = 7; + int numpixels[3]; + int w16 = ((fencPic->m_picWidth + 15) >> 4) << 4; + int h16 = ((fencPic->m_picHeight + 15) >> 4) << 4; + numpixels[0] = w16 * h16; + numpixels[1] = numpixels[2] = numpixels[0] >> (cache.hshift + cache.vshift); + + for (int list = 0; list < cache.numPredDir; list++) + { + WeightParam *weights = wp[list][0]; + Frame *refFrame = slice.m_refPicList[list][0]; + Lowres& refLowres = refFrame->m_lowres; + int diffPoc = abs(curPoc - refFrame->m_poc); + + /* prepare estimates */ + float guessScale[3], fencMean[3], refMean[3]; + for (int plane = 0; plane < 3; plane++) + { + SET_WEIGHT(weights[plane], false, 1, 0, 0); + uint64_t fencVar = fenc.wp_ssd[plane] + !refLowres.wp_ssd[plane]; + uint64_t refVar = refLowres.wp_ssd[plane] + !refLowres.wp_ssd[plane]; + guessScale[plane] = sqrt((float)fencVar / refVar); + fencMean[plane] = (float)fenc.wp_sum[plane] / (numpixels[plane]) / (1 << (X265_DEPTH - 8)); + refMean[plane] = (float)refLowres.wp_sum[plane] / (numpixels[plane]) / (1 << (X265_DEPTH - 8)); + } + + /* make sure both our scale factors fit */ + while (!list && chromaDenom > 
0) + { + float thresh = 127.f / (1 << chromaDenom); + if (guessScale[1] < thresh && guessScale[2] < thresh) + break; + chromaDenom--; + } + + SET_WEIGHT(weights[1], false, 1 << chromaDenom, chromaDenom, 0); + SET_WEIGHT(weights[2], false, 1 << chromaDenom, chromaDenom, 0); + + MV *mvs = NULL; + + for (int plane = 0; plane < 3; plane++) + { + denom = plane ? chromaDenom : lumaDenom; + if (plane && !weights[0].bPresentFlag) + break; + + /* Early termination */ + x265_emms(); + if (fabsf(refMean[plane] - fencMean[plane]) < 0.5f && fabsf(1.f - guessScale[plane]) < epsilon) + { + SET_WEIGHT(weights[plane], 0, 1 << denom, denom, 0); + continue; + } + + if (plane) + { + int scale = Clip3(0, 255, (int)(guessScale[plane] * (1 << denom) + 0.5f)); + if (scale > 127) + continue; + weights[plane].inputWeight = scale; + } + else + { + weights[plane].setFromWeightAndOffset((int)(guessScale[plane] * (1 << denom) + 0.5f), 0, denom, !list); + } + + int mindenom = weights[plane].log2WeightDenom; + int minscale = weights[plane].inputWeight; + int minoff = 0; + + if (!plane && diffPoc <= param.bframes + 1) + { + mvs = fenc.lowresMvs[list][diffPoc - 1]; + + /* test whether this motion search was performed by lookahead */ + if (mvs[0].x != 0x7FFF) + { + /* reference chroma planes must be extended prior to being + * used as motion compensation sources */ + if (!refFrame->m_bChromaExtended) + { + refFrame->m_bChromaExtended = true; + PicYuv *refPic = refFrame->m_origPicYuv; + int width = refPic->m_picWidth >> cache.hshift; + int height = refPic->m_picHeight >> cache.vshift; + extendPicBorder(refPic->m_picOrg[1], refPic->m_strideC, width, height, refPic->m_chromaMarginX, refPic->m_chromaMarginY); + extendPicBorder(refPic->m_picOrg[2], refPic->m_strideC, width, height, refPic->m_chromaMarginX, refPic->m_chromaMarginY); + } + } + else + mvs = 0; + } + + /* prepare inputs to weight analysis */ + pixel *orig; + pixel *fref; + intptr_t stride; + int width, height; + switch (plane) + { + case 0: 
+ orig = fenc.lowresPlane[0]; + stride = fenc.lumaStride; + width = fenc.width; + height = fenc.lines; + fref = refLowres.lowresPlane[0]; + if (mvs) + { + mcLuma(mcbuf, refLowres, mvs); + fref = mcbuf; + } + break; + + case 1: + orig = fencPic->m_picOrg[1]; + stride = fencPic->m_strideC; + fref = refFrame->m_origPicYuv->m_picOrg[1]; + + /* Clamp the chroma dimensions to the nearest multiple of + * 8x8 blocks (or 16x16 for 4:4:4) since mcChroma uses lowres + * blocks and weightCost measures 8x8 blocks. This + * potentially ignores some edge pixels, but simplifies the + * logic and prevents reading uninitialized pixels. Lowres + * planes are border extended and require no clamping. */ + width = ((fencPic->m_picWidth >> 4) << 4) >> cache.hshift; + height = ((fencPic->m_picHeight >> 4) << 4) >> cache.vshift; + if (mvs) + { + mcChroma(mcbuf, fref, stride, mvs, cache, height, width); + fref = mcbuf; + } + break; + + case 2: + fref = refFrame->m_origPicYuv->m_picOrg[2]; + orig = fencPic->m_picOrg[2]; + stride = fencPic->m_strideC; + width = ((fencPic->m_picWidth >> 4) << 4) >> cache.hshift; + height = ((fencPic->m_picHeight >> 4) << 4) >> cache.vshift; + if (mvs) + { + mcChroma(mcbuf, fref, stride, mvs, cache, height, width); + fref = mcbuf; + } + break; + + default: + slice.disableWeights(); + X265_FREE(mcbuf); + return; + } + + uint32_t origscore = weightCost(orig, fref, weightTemp, stride, cache, width, height, NULL, !plane); + if (!origscore) + { + SET_WEIGHT(weights[plane], 0, 1 << denom, denom, 0); + continue; + } + + uint32_t minscore = origscore; + bool bFound = false; + + /* x264 uses a table lookup here, selecting search range based on preset */ + static const int scaleDist = 4; + static const int offsetDist = 2; + + int startScale = Clip3(0, 127, minscale - scaleDist); + int endScale = Clip3(0, 127, minscale + scaleDist); + for (int scale = startScale; scale <= endScale; scale++) + { + int deltaWeight = scale - (1 << mindenom); + if (deltaWeight > 127 || 
deltaWeight <= -128) + continue; + + x265_emms(); + int curScale = scale; + int curOffset = (int)(fencMean[plane] - refMean[plane] * curScale / (1 << mindenom) + 0.5f); + if (curOffset < -128 || curOffset > 127) + { + /* Rescale considering the constraints on curOffset. We do it in this order + * because scale has a much wider range than offset (because of denom), so + * it should almost never need to be clamped. */ + curOffset = Clip3(-128, 127, curOffset); + curScale = (int)((1 << mindenom) * (fencMean[plane] - curOffset) / refMean[plane] + 0.5f); + curScale = Clip3(0, 127, curScale); + } + + int startOffset = Clip3(-128, 127, curOffset - offsetDist); + int endOffset = Clip3(-128, 127, curOffset + offsetDist); + for (int off = startOffset; off <= endOffset; off++) + { + WeightParam wsp; + SET_WEIGHT(wsp, true, curScale, mindenom, off); + uint32_t s = weightCost(orig, fref, weightTemp, stride, cache, width, height, &wsp, !plane) + + sliceHeaderCost(&wsp, lambda, !!plane); + COPY4_IF_LT(minscore, s, minscale, curScale, minoff, off, bFound, true); + + /* Don't check any more offsets if the previous one had a lower cost than the current one */ + if (minoff == startOffset && off != startOffset) + break; + } + } + + /* Use a smaller luma denominator if possible */ + if (!(plane || list)) + { + while (mindenom > 0 && !(minscale & 1)) + { + mindenom--; + minscale >>= 1; + } + } + + if (!bFound || (minscale == (1 << mindenom) && minoff == 0) || (float)minscore / origscore > 0.998f) + { + SET_WEIGHT(weights[plane], false, 1 << denom, denom, 0); + } + else + { + SET_WEIGHT(weights[plane], true, minscale, mindenom, minoff); + } + } + + if (weights[0].bPresentFlag) + { + // Make sure both chroma channels match + if (weights[1].bPresentFlag != weights[2].bPresentFlag) + { + if (weights[1].bPresentFlag) + weights[2] = weights[1]; + else + weights[1] = weights[2]; + } + } + + lumaDenom = weights[0].log2WeightDenom; + chromaDenom = weights[1].log2WeightDenom; + + /* reset weight 
states */ + for (int ref = 1; ref < slice.m_numRefIdx[list]; ref++) + { + SET_WEIGHT(wp[list][ref][0], false, 1 << lumaDenom, lumaDenom, 0); + SET_WEIGHT(wp[list][ref][1], false, 1 << chromaDenom, chromaDenom, 0); + SET_WEIGHT(wp[list][ref][2], false, 1 << chromaDenom, chromaDenom, 0); + } + } + + X265_FREE(mcbuf); + + memcpy(slice.m_weightPredTable, wp, sizeof(WeightParam) * 2 * MAX_NUM_REF * 3); + + if (param.logLevel >= X265_LOG_FULL) + { + char buf[1024]; + int p = 0; + bool bWeighted = false; + + p = sprintf(buf, "poc: %d weights:", slice.m_poc); + int numPredDir = slice.isInterP() ? 1 : 2; + for (int list = 0; list < numPredDir; list++) + { + WeightParam* w = &wp[list][0][0]; + if (w[0].bPresentFlag || w[1].bPresentFlag || w[2].bPresentFlag) + { + bWeighted = true; + p += sprintf(buf + p, " [L%d:R0 ", list); + if (w[0].bPresentFlag) + p += sprintf(buf + p, "Y{%d/%d%+d}", w[0].inputWeight, 1 << w[0].log2WeightDenom, w[0].inputOffset); + if (w[1].bPresentFlag) + p += sprintf(buf + p, "U{%d/%d%+d}", w[1].inputWeight, 1 << w[1].log2WeightDenom, w[1].inputOffset); + if (w[2].bPresentFlag) + p += sprintf(buf + p, "V{%d/%d%+d}", w[2].inputWeight, 1 << w[2].log2WeightDenom, w[2].inputOffset); + p += sprintf(buf + p, "]"); + } + } + + if (bWeighted) + { + if (p < 80) // pad with spaces to ensure progress line overwritten + sprintf(buf + p, "%*s", 80 - p, " "); + x265_log(¶m, X265_LOG_FULL, "%s\n", buf); + } + } +} +} diff --git a/source/filters/filters.cpp b/source/filters/filters.cpp new file mode 100644 index 0000000..26a26ac --- /dev/null +++ b/source/filters/filters.cpp @@ -0,0 +1,79 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Selvakumar Nithiyaruban + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at 
your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#include "filters.h" +#include "common.h" + +/* The dithering algorithm is based on Sierra-2-4A error diffusion. */ +void ditherPlane(pixel *dst, int dstStride, uint16_t *src, int srcStride, + int width, int height, int16_t *errors, int bitDepth) +{ + const int lShift = 16 - bitDepth; + const int rShift = 16 - bitDepth + 2; + const int half = (1 << (16 - bitDepth + 1)); + const int pixelMax = (1 << bitDepth) - 1; + + memset(errors, 0, (width + 1) * sizeof(int16_t)); + int pitch = 1; + for (int y = 0; y < height; y++, src += srcStride, dst += dstStride) + { + int16_t err = 0; + for (int x = 0; x < width; x++) + { + err = err * 2 + errors[x] + errors[x + 1]; + dst[x * pitch] = (pixel)Clip3(0, pixelMax, ((src[x * 1] << 2) + err + half) >> rShift); + errors[x] = err = src[x * pitch] - (dst[x * pitch] << lShift); + } + } +} + +void ditherImage(x265_picture& picIn, int picWidth, int picHeight, int16_t *errorBuf, int bitDepth) +{ + /* This portion of code is from readFrame in x264. 
*/ + for (int i = 0; i < x265_cli_csps[picIn.colorSpace].planes; i++) + { + if ((picIn.bitDepth & 7) && (picIn.bitDepth != 16)) + { + /* upconvert non 16bit high depth planes to 16bit */ + uint16_t *plane = (uint16_t*)picIn.planes[i]; + uint32_t pixelCount = x265_picturePlaneSize(picIn.colorSpace, picWidth, picHeight, i); + int lShift = 16 - picIn.bitDepth; + + /* This loop assumes width is equal to stride which + happens to be true for file reader outputs */ + for (uint32_t j = 0; j < pixelCount; j++) + { + plane[j] = plane[j] << lShift; + } + } + } + + for (int i = 0; i < x265_cli_csps[picIn.colorSpace].planes; i++) + { + int height = (int)(picHeight >> x265_cli_csps[picIn.colorSpace].height[i]); + int width = (int)(picWidth >> x265_cli_csps[picIn.colorSpace].width[i]); + + ditherPlane(((pixel*)picIn.planes[i]), picIn.stride[i] / sizeof(pixel), ((uint16_t*)picIn.planes[i]), + picIn.stride[i] / 2, width, height, errorBuf, bitDepth); + } +} diff --git a/source/filters/filters.h b/source/filters/filters.h new file mode 100644 index 0000000..0f2ba3d --- /dev/null +++ b/source/filters/filters.h @@ -0,0 +1,31 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Selvakumar Nithiyaruban + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#ifndef X265_FILTERS_H +#define X265_FILTERS_H + +#include "x265.h" + +void ditherImage(x265_picture&, int picWidth, int picHeight, int16_t *errorBuf, int bitDepth); + +#endif //X265_FILTERS_H diff --git a/source/input/input.cpp b/source/input/input.cpp new file mode 100644 index 0000000..096638c --- /dev/null +++ b/source/input/input.cpp @@ -0,0 +1,38 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. 
+ *****************************************************************************/ + +#include "input.h" +#include "yuv.h" +#include "y4m.h" + +using namespace x265; + +Input* Input::open(InputFileInfo& info, bool bForceY4m) +{ + const char * s = strrchr(info.filename, '.'); + + if (bForceY4m || (s && !strcmp(s, ".y4m"))) + return new Y4MInput(info); + else + return new YUVInput(info); +} diff --git a/source/input/input.h b/source/input/input.h new file mode 100644 index 0000000..6cb287e --- /dev/null +++ b/source/input/input.h @@ -0,0 +1,83 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. 
+ *****************************************************************************/ + +#ifndef X265_INPUT_H +#define X265_INPUT_H + +#define MIN_FRAME_WIDTH 64 +#define MAX_FRAME_WIDTH 8192 +#define MIN_FRAME_HEIGHT 64 +#define MAX_FRAME_HEIGHT 4320 +#define MIN_FRAME_RATE 1 +#define MAX_FRAME_RATE 300 + +#include "x265.h" + +namespace x265 { +// private x265 namespace + +struct InputFileInfo +{ + /* possibly user-supplied, possibly read from file header */ + int width; + int height; + int csp; + int depth; + int fpsNum; + int fpsDenom; + int sarWidth; + int sarHeight; + int frameCount; + + /* user supplied */ + int skipFrames; + const char *filename; +}; + +class Input +{ +protected: + + virtual ~Input() {} + +public: + + Input() {} + + static Input* open(InputFileInfo& info, bool bForceY4m); + + virtual void startReader() = 0; + + virtual void release() = 0; + + virtual bool readPicture(x265_picture& pic) = 0; + + virtual bool isEof() const = 0; + + virtual bool isFail() = 0; + + virtual const char *getName() const = 0; +}; +} + +#endif // ifndef X265_INPUT_H diff --git a/source/input/y4m.cpp b/source/input/y4m.cpp new file mode 100644 index 0000000..e026eeb --- /dev/null +++ b/source/input/y4m.cpp @@ -0,0 +1,466 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#include "y4m.h" +#include "common.h" + +#include + +#define ENABLE_THREADING 1 + +#if _WIN32 +#include +#include +#if defined(_MSC_VER) +#pragma warning(disable: 4996) // POSIX setmode and fileno deprecated +#endif +#endif + +using namespace x265; +using namespace std; + +static const char header[] = "FRAME"; + +Y4MInput::Y4MInput(InputFileInfo& info) +{ + for (int i = 0; i < QUEUE_SIZE; i++) + buf[i] = NULL; + + readCount.set(0); + writeCount.set(0); + + threadActive = false; + colorSpace = info.csp; + sarWidth = info.sarWidth; + sarHeight = info.sarHeight; + width = info.width; + height = info.height; + rateNum = info.fpsNum; + rateDenom = info.fpsDenom; + depth = info.depth; + framesize = 0; + + ifs = NULL; + if (!strcmp(info.filename, "-")) + { + ifs = &cin; +#if _WIN32 + setmode(fileno(stdin), O_BINARY); +#endif + } + else + ifs = new ifstream(info.filename, ios::binary | ios::in); + + if (ifs && ifs->good() && parseHeader()) + { + int pixelbytes = depth > 8 ? 
2 : 1; + for (int i = 0; i < x265_cli_csps[colorSpace].planes; i++) + { + int stride = (width >> x265_cli_csps[colorSpace].width[i]) * pixelbytes; + framesize += (stride * (height >> x265_cli_csps[colorSpace].height[i])); + } + + threadActive = true; + for (int q = 0; q < QUEUE_SIZE; q++) + { + buf[q] = X265_MALLOC(char, framesize); + if (!buf[q]) + { + x265_log(NULL, X265_LOG_ERROR, "y4m: buffer allocation failure, aborting"); + threadActive = false; + break; + } + } + } + if (!threadActive) + { + if (ifs && ifs != &cin) + delete ifs; + ifs = NULL; + return; + } + + info.width = width; + info.height = height; + info.sarHeight = sarHeight; + info.sarWidth = sarWidth; + info.fpsNum = rateNum; + info.fpsDenom = rateDenom; + info.csp = colorSpace; + info.depth = depth; + info.frameCount = -1; + + size_t estFrameSize = framesize + strlen(header) + 1; /* assume basic FRAME\n headers */ + + /* try to estimate frame count, if this is not stdin */ + if (ifs != &cin) + { + istream::pos_type cur = ifs->tellg(); + +#if defined(_MSC_VER) && _MSC_VER < 1700 + /* Older MSVC versions cannot handle 64bit file sizes properly, so go native */ + HANDLE hFile = CreateFileA(info.filename, GENERIC_READ, + FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, OPEN_EXISTING, + FILE_ATTRIBUTE_NORMAL, NULL); + if (hFile != INVALID_HANDLE_VALUE) + { + LARGE_INTEGER size; + if (GetFileSizeEx(hFile, &size)) + info.frameCount = (int)((size.QuadPart - (int64_t)cur) / estFrameSize); + CloseHandle(hFile); + } +#else // if defined(_MSC_VER) && _MSC_VER < 1700 + if (cur >= 0) + { + ifs->seekg(0, ios::end); + istream::pos_type size = ifs->tellg(); + ifs->seekg(cur, ios::beg); + if (size > 0) + info.frameCount = (int)((size - cur) / estFrameSize); + } +#endif // if defined(_MSC_VER) && _MSC_VER < 1700 + } + + if (info.skipFrames) + { +#if X86_64 + ifs->seekg((uint64_t)estFrameSize * info.skipFrames, ios::cur); +#else + for (int i = 0; i < info.skipFrames; i++) + ifs->ignore(estFrameSize); +#endif + } +} + 
+Y4MInput::~Y4MInput() +{ + if (ifs && ifs != &cin) + delete ifs; + + for (int i = 0; i < QUEUE_SIZE; i++) + X265_FREE(buf[i]); +} + +void Y4MInput::release() +{ + threadActive = false; + readCount.set(readCount.get()); // unblock file reader + stop(); + delete this; +} + +bool Y4MInput::parseHeader() +{ + if (!ifs) + return false; + + int csp = 0; + int d = 0; + + while (!ifs->eof()) + { + // Skip Y4MPEG string + int c = ifs->get(); + while (!ifs->eof() && (c != ' ') && (c != '\n')) + { + c = ifs->get(); + } + + while (c == ' ' && !ifs->eof()) + { + // read parameter identifier + switch (ifs->get()) + { + case 'W': + width = 0; + while (!ifs->eof()) + { + c = ifs->get(); + + if (c == ' ' || c == '\n') + { + break; + } + else + { + width = width * 10 + (c - '0'); + } + } + + break; + + case 'H': + height = 0; + while (!ifs->eof()) + { + c = ifs->get(); + if (c == ' ' || c == '\n') + { + break; + } + else + { + height = height * 10 + (c - '0'); + } + } + + break; + + case 'F': + rateNum = 0; + rateDenom = 0; + while (!ifs->eof()) + { + c = ifs->get(); + if (c == '.') + { + rateDenom = 1; + while (!ifs->eof()) + { + c = ifs->get(); + if (c == ' ' || c == '\n') + { + break; + } + else + { + rateNum = rateNum * 10 + (c - '0'); + rateDenom = rateDenom * 10; + } + } + + break; + } + else if (c == ':') + { + while (!ifs->eof()) + { + c = ifs->get(); + if (c == ' ' || c == '\n') + { + break; + } + else + rateDenom = rateDenom * 10 + (c - '0'); + } + + break; + } + else + { + rateNum = rateNum * 10 + (c - '0'); + } + } + + break; + + case 'A': + sarWidth = 0; + sarHeight = 0; + while (!ifs->eof()) + { + c = ifs->get(); + if (c == ':') + { + while (!ifs->eof()) + { + c = ifs->get(); + if (c == ' ' || c == '\n') + { + break; + } + else + sarHeight = sarHeight * 10 + (c - '0'); + } + + break; + } + else + { + sarWidth = sarWidth * 10 + (c - '0'); + } + } + + break; + + case 'C': + csp = 0; + d = 0; + while (!ifs->eof()) + { + c = ifs->get(); + + if (c <= '9' && c >= '0') + { + 
csp = csp * 10 + (c - '0'); + } + else if (c == 'p') + { + // example: C420p16 + while (!ifs->eof()) + { + c = ifs->get(); + + if (c <= '9' && c >= '0') + d = d * 10 + (c - '0'); + else + break; + } + break; + } + else + break; + } + + if (d >= 8 && d <= 16) + depth = d; + colorSpace = (csp == 444) ? X265_CSP_I444 : (csp == 422) ? X265_CSP_I422 : X265_CSP_I420; + break; + + default: + while (!ifs->eof()) + { + // consume this unsupported configuration word + c = ifs->get(); + if (c == ' ' || c == '\n') + break; + } + + break; + } + } + + if (c == '\n') + { + break; + } + } + + if (width < MIN_FRAME_WIDTH || width > MAX_FRAME_WIDTH || + height < MIN_FRAME_HEIGHT || height > MAX_FRAME_HEIGHT || + (rateNum / rateDenom) < 1 || (rateNum / rateDenom) > MAX_FRAME_RATE || + colorSpace <= X265_CSP_I400 || colorSpace >= X265_CSP_COUNT) + return false; + + return true; +} + +void Y4MInput::startReader() +{ +#if ENABLE_THREADING + if (threadActive) + start(); +#endif +} + +void Y4MInput::threadMain() +{ + do + { + if (!populateFrameQueue()) + break; + } + while (threadActive); + + threadActive = false; + writeCount.set(writeCount.get()); // unblock readPicture +} + +bool Y4MInput::populateFrameQueue() +{ + if (!ifs || ifs->fail()) + return false; + + /* strip off the FRAME header */ + char hbuf[sizeof(header)]; + + ifs->read(hbuf, strlen(header)); + if (ifs->eof()) + return false; + + if (!ifs->good() || memcmp(hbuf, header, strlen(header))) + { + x265_log(NULL, X265_LOG_ERROR, "y4m: frame header missing\n"); + return false; + } + + /* consume bytes up to line feed */ + int c = ifs->get(); + while (c != '\n' && ifs->good()) + c = ifs->get(); + + /* wait for room in the ring buffer */ + int written = writeCount.get(); + int read = readCount.get(); + while (written - read > QUEUE_SIZE - 2) + { + read = readCount.waitForChange(read); + if (!threadActive) + return false; + } + + ifs->read(buf[written % QUEUE_SIZE], framesize); + if (ifs->good()) + { + writeCount.incr(); + return 
true; + } + else + return false; +} + +bool Y4MInput::readPicture(x265_picture& pic) +{ + int read = readCount.get(); + int written = writeCount.get(); + +#if ENABLE_THREADING + + /* only wait if the read thread is still active */ + while (threadActive && read == written) + written = writeCount.waitForChange(written); + +#else + + populateFrameQueue(); + +#endif // if ENABLE_THREADING + + if (read < written) + { + int pixelbytes = depth > 8 ? 2 : 1; + pic.bitDepth = depth; + pic.colorSpace = colorSpace; + pic.stride[0] = width * pixelbytes; + pic.stride[1] = pic.stride[0] >> x265_cli_csps[colorSpace].width[1]; + pic.stride[2] = pic.stride[0] >> x265_cli_csps[colorSpace].width[2]; + pic.planes[0] = buf[read % QUEUE_SIZE]; + pic.planes[1] = (char*)pic.planes[0] + pic.stride[0] * height; + pic.planes[2] = (char*)pic.planes[1] + pic.stride[1] * (height >> x265_cli_csps[colorSpace].height[1]); + readCount.incr(); + return true; + } + else + return false; +} + diff --git a/source/input/y4m.h b/source/input/y4m.h new file mode 100644 index 0000000..786ecde --- /dev/null +++ b/source/input/y4m.h @@ -0,0 +1,94 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. 
+ * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#ifndef X265_Y4M_H +#define X265_Y4M_H + +#include "input.h" +#include "threading.h" +#include + +#define QUEUE_SIZE 5 + +namespace x265 { +// x265 private namespace + +class Y4MInput : public Input, public Thread +{ +protected: + + uint32_t rateNum; + + uint32_t rateDenom; + + uint32_t sarWidth; + + uint32_t sarHeight; + + size_t framesize; + + int depth; + + int width; + + int height; + + int colorSpace; + + bool threadActive; + + ThreadSafeInteger readCount; + + ThreadSafeInteger writeCount; + + char* buf[QUEUE_SIZE]; + + std::istream *ifs; + + bool parseHeader(); + + void threadMain(); + + bool populateFrameQueue(); + +public: + + Y4MInput(InputFileInfo& info); + + virtual ~Y4MInput(); + + void release(); + + bool isEof() const { return ifs && ifs->eof(); } + + bool isFail() { return !(ifs && !ifs->fail() && threadActive); } + + void startReader(); + + bool readPicture(x265_picture&); + + const char *getName() const { return "y4m"; } +}; +} + +#endif // ifndef X265_Y4M_H diff --git a/source/input/yuv.cpp b/source/input/yuv.cpp new file mode 100644 index 0000000..c13f471 --- /dev/null +++ b/source/input/yuv.cpp @@ -0,0 +1,239 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#include "yuv.h" +#include "common.h" + +#include + +#define ENABLE_THREADING 1 + +#if _WIN32 +#include +#include +#if defined(_MSC_VER) +#pragma warning(disable: 4996) // POSIX setmode and fileno deprecated +#endif +#endif + +using namespace x265; +using namespace std; + +YUVInput::YUVInput(InputFileInfo& info) +{ + for (int i = 0; i < QUEUE_SIZE; i++) + buf[i] = NULL; + + readCount.set(0); + writeCount.set(0); + depth = info.depth; + width = info.width; + height = info.height; + colorSpace = info.csp; + threadActive = false; + ifs = NULL; + + uint32_t pixelbytes = depth > 8 ? 
2 : 1; + framesize = 0; + for (int i = 0; i < x265_cli_csps[colorSpace].planes; i++) + { + uint32_t w = width >> x265_cli_csps[colorSpace].width[i]; + uint32_t h = height >> x265_cli_csps[colorSpace].height[i]; + framesize += w * h * pixelbytes; + } + + if (width == 0 || height == 0 || info.fpsNum == 0 || info.fpsDenom == 0) + { + x265_log(NULL, X265_LOG_ERROR, "yuv: width, height, and FPS must be specified\n"); + return; + } + + if (!strcmp(info.filename, "-")) + { + ifs = &cin; +#if _WIN32 + setmode(fileno(stdin), O_BINARY); +#endif + } + else + ifs = new ifstream(info.filename, ios::binary | ios::in); + + if (ifs && ifs->good()) + threadActive = true; + else + { + if (ifs && ifs != &cin) + delete ifs; + ifs = NULL; + return; + } + + for (uint32_t i = 0; i < QUEUE_SIZE; i++) + { + buf[i] = X265_MALLOC(char, framesize); + if (buf[i] == NULL) + { + x265_log(NULL, X265_LOG_ERROR, "yuv: buffer allocation failure, aborting\n"); + threadActive = false; + return; + } + } + + info.frameCount = -1; + + /* try to estimate frame count, if this is not stdin */ + if (ifs != &cin) + { + istream::pos_type cur = ifs->tellg(); + +#if defined(_MSC_VER) && _MSC_VER < 1700 + /* Older MSVC versions cannot handle 64bit file sizes properly, so go native */ + HANDLE hFile = CreateFileA(info.filename, GENERIC_READ, + FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, OPEN_EXISTING, + FILE_ATTRIBUTE_NORMAL, NULL); + if (hFile != INVALID_HANDLE_VALUE) + { + LARGE_INTEGER size; + if (GetFileSizeEx(hFile, &size)) + info.frameCount = (int)((size.QuadPart - (int64_t)cur) / framesize); + CloseHandle(hFile); + } +#else // if defined(_MSC_VER) && _MSC_VER < 1700 + if (cur >= 0) + { + ifs->seekg(0, ios::end); + istream::pos_type size = ifs->tellg(); + ifs->seekg(cur, ios::beg); + if (size > 0) + info.frameCount = (int)((size - cur) / framesize); + } +#endif // if defined(_MSC_VER) && _MSC_VER < 1700 + } + + if (info.skipFrames) + { +#if X86_64 + ifs->seekg((uint64_t)framesize * info.skipFrames, ios::cur); 
+#else + for (int i = 0; i < info.skipFrames; i++) + ifs->ignore(framesize); +#endif + } +} + +YUVInput::~YUVInput() +{ + if (ifs && ifs != &cin) + delete ifs; + for (int i = 0; i < QUEUE_SIZE; i++) + X265_FREE(buf[i]); +} + +void YUVInput::release() +{ + threadActive = false; + readCount.set(readCount.get()); // unblock read thread + stop(); + delete this; +} + +void YUVInput::startReader() +{ +#if ENABLE_THREADING + if (threadActive) + start(); +#endif +} + +void YUVInput::threadMain() +{ + while (threadActive) + { + if (!populateFrameQueue()) + break; + } + + threadActive = false; + writeCount.set(writeCount.get()); // unblock readPicture +} + +bool YUVInput::populateFrameQueue() +{ + if (!ifs || ifs->fail()) + return false; + + /* wait for room in the ring buffer */ + int written = writeCount.get(); + int read = readCount.get(); + while (written - read > QUEUE_SIZE - 2) + { + read = readCount.waitForChange(read); + if (!threadActive) + // release() has been called + return false; + } + + ifs->read(buf[written % QUEUE_SIZE], framesize); + if (ifs->good()) + { + writeCount.incr(); + return true; + } + else + return false; +} + +bool YUVInput::readPicture(x265_picture& pic) +{ + int read = readCount.get(); + int written = writeCount.get(); + +#if ENABLE_THREADING + + /* only wait if the read thread is still active */ + while (threadActive && read == written) + written = writeCount.waitForChange(written); + +#else + + populateFrameQueue(); + +#endif // if ENABLE_THREADING + + if (read < written) + { + uint32_t pixelbytes = depth > 8 ? 
2 : 1; + pic.colorSpace = colorSpace; + pic.bitDepth = depth; + pic.stride[0] = width * pixelbytes; + pic.stride[1] = pic.stride[0] >> x265_cli_csps[colorSpace].width[1]; + pic.stride[2] = pic.stride[0] >> x265_cli_csps[colorSpace].width[2]; + pic.planes[0] = buf[read % QUEUE_SIZE]; + pic.planes[1] = (char*)pic.planes[0] + pic.stride[0] * height; + pic.planes[2] = (char*)pic.planes[1] + pic.stride[1] * (height >> x265_cli_csps[colorSpace].height[1]); + readCount.incr(); + return true; + } + else + return false; +} diff --git a/source/input/yuv.h b/source/input/yuv.h new file mode 100644 index 0000000..faa5161 --- /dev/null +++ b/source/input/yuv.h @@ -0,0 +1,86 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. 
+ *****************************************************************************/ + +#ifndef X265_YUV_H +#define X265_YUV_H + +#include "input.h" +#include "threading.h" +#include + +#define QUEUE_SIZE 5 + +namespace x265 { +// private x265 namespace + +class YUVInput : public Input, public Thread +{ +protected: + + int width; + + int height; + + int colorSpace; //< source Color Space Parameter + + uint32_t depth; + + uint32_t framesize; + + bool threadActive; + + ThreadSafeInteger readCount; + + ThreadSafeInteger writeCount; + + char* buf[QUEUE_SIZE]; + + std::istream *ifs; + + int guessFrameCount(); + + void threadMain(); + + bool populateFrameQueue(); + +public: + + YUVInput(InputFileInfo& info); + + virtual ~YUVInput(); + + void release(); + + bool isEof() const { return ifs && ifs->eof(); } + + bool isFail() { return !(ifs && !ifs->fail() && threadActive); } + + void startReader(); + + bool readPicture(x265_picture&); + + const char *getName() const { return "yuv"; } +}; +} + +#endif // ifndef X265_YUV_H diff --git a/source/output/output.cpp b/source/output/output.cpp new file mode 100644 index 0000000..c481d7b --- /dev/null +++ b/source/output/output.cpp @@ -0,0 +1,38 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#include "output.h" +#include "yuv.h" +#include "y4m.h" + +using namespace x265; + +Output* Output::open(const char *fname, int width, int height, uint32_t bitdepth, uint32_t fpsNum, uint32_t fpsDenom, int csp) +{ + const char * s = strrchr(fname, '.'); + + if (s && !strcmp(s, ".y4m")) + return new Y4MOutput(fname, width, height, fpsNum, fpsDenom, csp); + else + return new YUVOutput(fname, width, height, bitdepth, csp); +} diff --git a/source/output/output.h b/source/output/output.h new file mode 100644 index 0000000..a754846 --- /dev/null +++ b/source/output/output.h @@ -0,0 +1,55 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. 
+ * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#ifndef X265_OUTPUT_H +#define X265_OUTPUT_H + +#include "x265.h" + +namespace x265 { +// private x265 namespace + +class Output +{ +protected: + + virtual ~Output() {} + +public: + + Output() {} + + static Output* open(const char *fname, int width, int height, uint32_t bitdepth, + uint32_t fpsNum, uint32_t fpsDenom, int csp); + + virtual bool isFail() const = 0; + + virtual void release() = 0; + + virtual bool writePicture(const x265_picture& pic) = 0; + + virtual const char *getName() const = 0; +}; +} + +#endif // ifndef X265_OUTPUT_H diff --git a/source/output/y4m.cpp b/source/output/y4m.cpp new file mode 100644 index 0000000..8339928 --- /dev/null +++ b/source/output/y4m.cpp @@ -0,0 +1,117 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. 
+ *****************************************************************************/ + +#include "common.h" +#include "output.h" +#include "y4m.h" + +using namespace x265; +using namespace std; + +Y4MOutput::Y4MOutput(const char *filename, int w, int h, uint32_t fpsNum, uint32_t fpsDenom, int csp) + : width(w) + , height(h) + , colorSpace(csp) + , frameSize(0) +{ + ofs.open(filename, ios::binary | ios::out); + buf = new char[width]; + + const char *cf = (csp >= X265_CSP_I444) ? "444" : (csp >= X265_CSP_I422) ? "422" : "420"; + + if (ofs) + { + ofs << "YUV4MPEG2 W" << width << " H" << height << " F" << fpsNum << ":" << fpsDenom << " Ip" << " C" << cf << "\n"; + header = ofs.tellp(); + } + + for (int i = 0; i < x265_cli_csps[colorSpace].planes; i++) + { + frameSize += (uint32_t)((width >> x265_cli_csps[colorSpace].width[i]) * (height >> x265_cli_csps[colorSpace].height[i])); + } +} + +Y4MOutput::~Y4MOutput() +{ + ofs.close(); + delete [] buf; +} + +bool Y4MOutput::writePicture(const x265_picture& pic) +{ + std::ofstream::pos_type outPicPos = header; + outPicPos += (uint64_t)pic.poc * (6 + frameSize); + ofs.seekp(outPicPos); + ofs << "FRAME\n"; + +#if HIGH_BIT_DEPTH + if (pic.bitDepth > 8 && pic.poc == 0) + { + x265_log(NULL, X265_LOG_WARNING, "y4m: down-shifting reconstructed pixels to 8 bits\n"); + } +#else + if (pic.bitDepth > 8 && pic.poc == 0) + { + x265_log(NULL, X265_LOG_WARNING, "y4m: forcing reconstructed pixels to 8 bits\n"); + } +#endif + + X265_CHECK(pic.colorSpace == colorSpace, "invalid color space\n"); + +#if HIGH_BIT_DEPTH + + // encoder gave us short pixels, downshift, then write + X265_CHECK(pic.bitDepth > 8, "invalid bit depth\n"); + int shift = pic.bitDepth - 8; + for (int i = 0; i < x265_cli_csps[colorSpace].planes; i++) + { + uint16_t *src = (uint16_t*)pic.planes[i]; + for (int h = 0; h < height >> x265_cli_csps[colorSpace].height[i]; h++) + { + for (int w = 0; w < width >> x265_cli_csps[colorSpace].width[i]; w++) + { + buf[w] = (char)(src[w] >> 
shift); + } + + ofs.write(buf, width >> x265_cli_csps[colorSpace].width[i]); + src += pic.stride[i] / sizeof(*src); + } + } + +#else // if HIGH_BIT_DEPTH + + X265_CHECK(pic.bitDepth == 8, "invalid bit depth\n"); + for (int i = 0; i < x265_cli_csps[colorSpace].planes; i++) + { + char *src = (char*)pic.planes[i]; + for (int h = 0; h < height >> x265_cli_csps[colorSpace].height[i]; h++) + { + ofs.write(src, width >> x265_cli_csps[colorSpace].width[i]); + src += pic.stride[i] / sizeof(*src); + } + } + +#endif // if HIGH_BIT_DEPTH + + return true; +} diff --git a/source/output/y4m.h b/source/output/y4m.h new file mode 100644 index 0000000..0792057 --- /dev/null +++ b/source/output/y4m.h @@ -0,0 +1,69 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. 
+ *****************************************************************************/ + +#ifndef X265_Y4M_H +#define X265_Y4M_H + +#include "output.h" +#include + +namespace x265 { +// private x265 namespace + +class Y4MOutput : public Output +{ +protected: + + int width; + + int height; + + int colorSpace; + + uint32_t frameSize; + + std::ofstream ofs; + + std::ofstream::pos_type header; + + char *buf; + + void writeHeader(); + +public: + + Y4MOutput(const char *filename, int width, int height, uint32_t fpsNum, uint32_t fpsDenom, int csp); + + virtual ~Y4MOutput(); + + const char *getName() const { return "y4m"; } + + bool isFail() const { return ofs.fail(); } + + void release() { delete this; } + + bool writePicture(const x265_picture& pic); +}; +} + +#endif // ifndef X265_Y4M_H diff --git a/source/output/yuv.cpp b/source/output/yuv.cpp new file mode 100644 index 0000000..279cd03 --- /dev/null +++ b/source/output/yuv.cpp @@ -0,0 +1,109 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. 
+ *****************************************************************************/ + +#include "common.h" +#include "output.h" +#include "yuv.h" + +using namespace x265; +using namespace std; + +YUVOutput::YUVOutput(const char *filename, int w, int h, uint32_t d, int csp) + : width(w) + , height(h) + , depth(d) + , colorSpace(csp) + , frameSize(0) +{ + ofs.open(filename, ios::binary | ios::out); + buf = new char[width]; + + for (int i = 0; i < x265_cli_csps[colorSpace].planes; i++) + { + frameSize += (uint32_t)((width >> x265_cli_csps[colorSpace].width[i]) * (height >> x265_cli_csps[colorSpace].height[i])); + } +} + +YUVOutput::~YUVOutput() +{ + ofs.close(); + delete [] buf; +} + +bool YUVOutput::writePicture(const x265_picture& pic) +{ + uint64_t fileOffset = pic.poc; + fileOffset *= frameSize; + + X265_CHECK(pic.colorSpace == colorSpace, "invalid color space\n"); + X265_CHECK(pic.bitDepth == (int)depth, "invalid bit depth\n"); + +#if HIGH_BIT_DEPTH + if (depth == 8) + { + int shift = pic.bitDepth - 8; + ofs.seekp((std::streamoff)fileOffset); + for (int i = 0; i < x265_cli_csps[colorSpace].planes; i++) + { + uint16_t *src = (uint16_t*)pic.planes[i]; + for (int h = 0; h < height >> x265_cli_csps[colorSpace].height[i]; h++) + { + for (int w = 0; w < width >> x265_cli_csps[colorSpace].width[i]; w++) + { + buf[w] = (char)(src[w] >> shift); + } + + ofs.write(buf, width >> x265_cli_csps[colorSpace].width[i]); + src += pic.stride[i] / sizeof(*src); + } + } + } + else + { + ofs.seekp((std::streamoff)(fileOffset * 2)); + for (int i = 0; i < x265_cli_csps[colorSpace].planes; i++) + { + uint16_t *src = (uint16_t*)pic.planes[i]; + for (int h = 0; h < height >> x265_cli_csps[colorSpace].height[i]; h++) + { + ofs.write((const char*)src, (width * 2) >> x265_cli_csps[colorSpace].width[i]); + src += pic.stride[i] / sizeof(*src); + } + } + } +#else // if HIGH_BIT_DEPTH + ofs.seekp((std::streamoff)fileOffset); + for (int i = 0; i < x265_cli_csps[colorSpace].planes; i++) + { + char 
*src = (char*)pic.planes[i]; + for (int h = 0; h < height >> x265_cli_csps[colorSpace].height[i]; h++) + { + ofs.write(src, width >> x265_cli_csps[colorSpace].width[i]); + src += pic.stride[i] / sizeof(*src); + } + } + +#endif // if HIGH_BIT_DEPTH + + return true; +} diff --git a/source/output/yuv.h b/source/output/yuv.h new file mode 100644 index 0000000..43e8157 --- /dev/null +++ b/source/output/yuv.h @@ -0,0 +1,69 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. 
+ *****************************************************************************/ + +#ifndef X265_YUV_H +#define X265_YUV_H + +#include "output.h" +#include "common.h" + +#include + +namespace x265 { +// private x265 namespace + +class YUVOutput : public Output +{ +protected: + + int width; + + int height; + + uint32_t depth; + + int colorSpace; + + uint32_t frameSize; + + char *buf; + + std::ofstream ofs; + +public: + + YUVOutput(const char *filename, int width, int height, uint32_t bitdepth, int csp); + + virtual ~YUVOutput(); + + const char *getName() const { return "yuv"; } + + bool isFail() const { return ofs.fail(); } + + void release() { delete this; } + + bool writePicture(const x265_picture& pic); +}; +} + +#endif // ifndef X265_YUV_H diff --git a/source/test/CMakeLists.txt b/source/test/CMakeLists.txt new file mode 100644 index 0000000..ff3312f --- /dev/null +++ b/source/test/CMakeLists.txt @@ -0,0 +1,28 @@ +enable_language(ASM_YASM) + +if(MSVC_IDE) + set(YASM_SRC checkasm-a.obj) + add_custom_command( + OUTPUT checkasm-a.obj + COMMAND ${YASM_EXECUTABLE} + ARGS ${YASM_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/checkasm-a.asm -o checkasm-a.obj + DEPENDS checkasm-a.asm) +else() + set(YASM_SRC checkasm-a.asm) +endif() + +check_symbol_exists(__rdtsc "intrin.h" HAVE_RDTSC) +if(HAVE_RDTSC) + add_definitions(-DHAVE_RDTSC=1) +endif() + +add_executable(TestBench ${YASM_SRC} + testbench.cpp testharness.h + pixelharness.cpp pixelharness.h + mbdstharness.cpp mbdstharness.h + ipfilterharness.cpp ipfilterharness.h + intrapredharness.cpp intrapredharness.h) +target_link_libraries(TestBench x265-static ${PLATFORM_LIBS}) + +add_executable(PoolTest testpool.cpp) +target_link_libraries(PoolTest x265-static ${PLATFORM_LIBS}) diff --git a/source/test/checkasm-a.asm b/source/test/checkasm-a.asm new file mode 100644 index 0000000..f7b9837 --- /dev/null +++ b/source/test/checkasm-a.asm @@ -0,0 +1,221 @@ +;***************************************************************************** +;* 
checkasm-a.asm: assembly check tool +;***************************************************************************** +;* Copyright (C) 2008-2014 x264 project +;* +;* Authors: Loren Merritt +;* Henrik Gramner +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +;* +;* This program is also available under a commercial proprietary license. +;* For more information, contact us at license @ x265.com. 
+;***************************************************************************** + +%include "../common/x86/x86inc.asm" + +SECTION_RODATA + +error_message: db "failed to preserve register", 0 + +%if ARCH_X86_64 +; just random numbers to reduce the chance of incidental match +ALIGN 16 +x6: ddq 0x79445c159ce790641a1b2550a612b48c +x7: ddq 0x86b2536fcd8cf6362eed899d5a28ddcd +x8: ddq 0x3f2bf84fc0fcca4eb0856806085e7943 +x9: ddq 0xd229e1f5b281303facbd382dcf5b8de2 +x10: ddq 0xab63e2e11fa38ed971aeaff20b095fd9 +x11: ddq 0x77d410d5c42c882d89b0c0765892729a +x12: ddq 0x24b3c1d2a024048bc45ea11a955d8dd5 +x13: ddq 0xdd7b8919edd427862e8ec680de14b47c +x14: ddq 0x11e53e2b2ac655ef135ce6888fa02cbf +x15: ddq 0x6de8f4c914c334d5011ff554472a7a10 +n7: dq 0x21f86d66c8ca00ce +n8: dq 0x75b6ba21077c48ad +n9: dq 0xed56bb2dcb3c7736 +n10: dq 0x8bda43d3fd1a7e06 +n11: dq 0xb64a9c9e5d318408 +n12: dq 0xdf9a54b303f1d3a3 +n13: dq 0x4a75479abd64e097 +n14: dq 0x249214109d5d1c88 +%endif + +SECTION .text + +cextern_naked puts + +; max number of args used by any x265 asm function. +; (max_args % 4) must equal 3 for stack alignment +%define max_args 15 + +%if ARCH_X86_64 + +;----------------------------------------------------------------------------- +; void x265_checkasm_stack_clobber( uint64_t clobber, ... ) +;----------------------------------------------------------------------------- +cglobal checkasm_stack_clobber, 1,2 + ; Clobber the stack with junk below the stack pointer + %define size (max_args+6)*8 + SUB rsp, size + mov r1, size-8 +.loop: + mov [rsp+r1], r0 + sub r1, 8 + jge .loop + ADD rsp, size + RET + +%if WIN64 + %assign free_regs 7 +%else + %assign free_regs 9 +%endif + +;----------------------------------------------------------------------------- +; intptr_t x265_checkasm_call( intptr_t (*func)(), int *ok, ... 
) +;----------------------------------------------------------------------------- +cglobal checkasm_call_float +INIT_XMM +cglobal checkasm_call, 2,15,16,max_args*8+8 + mov r6, r0 + mov [rsp+max_args*8], r1 + + ; All arguments have been pushed on the stack instead of registers in order to + ; test for incorrect assumptions that 32-bit ints are zero-extended to 64-bit. + mov r0, r6mp + mov r1, r7mp + mov r2, r8mp + mov r3, r9mp +%if UNIX64 + mov r4, r10mp + mov r5, r11mp + %assign i 6 + %rep max_args-6 + mov r9, [rsp+stack_offset+(i+1)*8] + mov [rsp+(i-6)*8], r9 + %assign i i+1 + %endrep +%else + %assign i 4 + %rep max_args-4 + mov r9, [rsp+stack_offset+(i+7)*8] + mov [rsp+i*8], r9 + %assign i i+1 + %endrep +%endif + +%if WIN64 + %assign i 6 + %rep 16-6 + mova m %+ i, [x %+ i] + %assign i i+1 + %endrep +%endif + +%assign i 14 +%rep 15-free_regs + mov r %+ i, [n %+ i] + %assign i i-1 +%endrep + call r6 +%assign i 14 +%rep 15-free_regs + xor r %+ i, [n %+ i] + or r14, r %+ i + %assign i i-1 +%endrep + +%if WIN64 + %assign i 6 + %rep 16-6 + pxor m %+ i, [x %+ i] + por m6, m %+ i + %assign i i+1 + %endrep + packsswb m6, m6 + movq r5, m6 + or r14, r5 +%endif + + jz .ok + mov r9, rax + lea r0, [error_message] + call puts + mov r1, [rsp+max_args*8] + mov dword [r1], 0 + mov rax, r9 +.ok: + RET + +%else + +; just random numbers to reduce the chance of incidental match +%define n3 dword 0x6549315c +%define n4 dword 0xe02f3e23 +%define n5 dword 0xb78d0d1d +%define n6 dword 0x33627ba7 + +;----------------------------------------------------------------------------- +; intptr_t x265_checkasm_call( intptr_t (*func)(), int *ok, ... 
) +;----------------------------------------------------------------------------- +cglobal checkasm_call_float +cglobal checkasm_call, 1,7 + mov r3, n3 + mov r4, n4 + mov r5, n5 + mov r6, n6 +%rep max_args + push dword [esp+24+max_args*4] +%endrep + call r0 + add esp, max_args*4 + xor r3, n3 + xor r4, n4 + xor r5, n5 + xor r6, n6 + or r3, r4 + or r5, r6 + or r3, r5 + jz .ok + mov r3, eax + lea r1, [error_message] + push r1 + call puts + add esp, 4 + mov r1, r1m + mov dword [r1], 0 + mov eax, r3 +.ok: + REP_RET + +%endif ; ARCH_X86_64 + +;----------------------------------------------------------------------------- +; int x265_stack_pagealign( int (*func)(), int align ) +;----------------------------------------------------------------------------- +cglobal stack_pagealign, 2,2 + movsxdifnidn r1, r1d + push rbp + mov rbp, rsp +%if WIN64 + sub rsp, 32 ; shadow space +%endif + and rsp, ~0xfff + sub rsp, r1 + call r0 + leave + RET + diff --git a/source/test/intrapredharness.cpp b/source/test/intrapredharness.cpp new file mode 100644 index 0000000..97eff94 --- /dev/null +++ b/source/test/intrapredharness.cpp @@ -0,0 +1,304 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Min Chen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#include "common.h" +#include "predict.h" +#include "intrapredharness.h" + +using namespace x265; + +IntraPredHarness::IntraPredHarness() +{ + for (int i = 0; i < INPUT_SIZE; i++) + pixel_buff[i] = rand() % PIXEL_MAX; + + initROM(); +} + +bool IntraPredHarness::check_dc_primitive(intra_pred_t ref, intra_pred_t opt, int width) +{ + int j = Predict::ADI_BUF_STRIDE; + intptr_t stride = FENC_STRIDE; + +#if _DEBUG + memset(pixel_out_vec, 0xCD, OUTPUT_SIZE); + memset(pixel_out_c, 0xCD, OUTPUT_SIZE); +#endif + + for (int i = 0; i <= 100; i++) + { + int rand_filter = rand() & 1; + if (width > 16) + rand_filter = 0; + + pixel left[MAX_CU_SIZE * 2 + 1]; + for (int k = 0; k < width * 2 + 1; k++) + { + left[k] = pixel_buff[j - 1 + k * Predict::ADI_BUF_STRIDE]; + } + + ref(pixel_out_c, stride, pixel_buff + j - Predict::ADI_BUF_STRIDE, left + 1, 0, rand_filter); + checked(opt, pixel_out_vec, stride, pixel_buff + j - Predict::ADI_BUF_STRIDE, left + 1, 0, rand_filter); + + for (int k = 0; k < width; k++) + { + if (memcmp(pixel_out_vec + k * FENC_STRIDE, pixel_out_c + k * FENC_STRIDE, width * sizeof(pixel))) + return false; + } + + reportfail(); + j += FENC_STRIDE; + } + + return true; +} + +bool IntraPredHarness::check_planar_primitive(intra_pred_t ref, intra_pred_t opt, int width) +{ + int j = Predict::ADI_BUF_STRIDE; + intptr_t stride = FENC_STRIDE; + +#if _DEBUG + memset(pixel_out_vec, 0xCD, OUTPUT_SIZE); + memset(pixel_out_c, 0xCD, OUTPUT_SIZE); +#endif + + for (int i = 0; i <= 100; i++) + { + pixel left[MAX_CU_SIZE * 2 + 1]; + for (int k = 0; k 
< width * 2 + 1; k++) + { + left[k] = pixel_buff[j - 1 + k * Predict::ADI_BUF_STRIDE]; + } + + ref(pixel_out_c, stride, pixel_buff + j - Predict::ADI_BUF_STRIDE, left + 1, 0, 0); + checked(opt, pixel_out_vec, stride, pixel_buff + j - Predict::ADI_BUF_STRIDE, left + 1, 0, 0); + + for (int k = 0; k < width; k++) + { + if (memcmp(pixel_out_vec + k * FENC_STRIDE, pixel_out_c + k * FENC_STRIDE, width * sizeof(pixel))) + return false; + } + + reportfail(); + j += FENC_STRIDE; + } + + return true; +} + +bool IntraPredHarness::check_angular_primitive(const intra_pred_t ref[][NUM_TR_SIZE], const intra_pred_t opt[][NUM_TR_SIZE]) +{ + int j = Predict::ADI_BUF_STRIDE; + intptr_t stride = FENC_STRIDE; + +#if _DEBUG + memset(pixel_out_vec, 0xCD, OUTPUT_SIZE); + memset(pixel_out_c, 0xCD, OUTPUT_SIZE); +#endif + + for (int size = 2; size <= 5; size++) + { + int width = (1 << size); + for (int i = 0; i <= 100; i++) + { + int bFilter = (width <= 16) && (rand() % 2); + for (int pmode = 2; pmode <= 34; pmode++) + { + if (!opt[pmode][size - 2]) + continue; + + pixel * refAbove = pixel_buff + j; + pixel * refLeft = refAbove + 3 * width; + refLeft[0] = refAbove[0]; + + checked(opt[pmode][size - 2], pixel_out_vec, stride, refLeft, refAbove, pmode, bFilter); + ref[pmode][size - 2](pixel_out_c, stride, refLeft, refAbove, pmode, bFilter); + + for (int k = 0; k < width; k++) + { + if (memcmp(pixel_out_vec + k * FENC_STRIDE, pixel_out_c + k * FENC_STRIDE, width * sizeof(pixel))) + return false; + } + + reportfail(); + } + + j += FENC_STRIDE; + } + } + + return true; +} + +bool IntraPredHarness::check_allangs_primitive(const intra_allangs_t ref[], const intra_allangs_t opt[]) +{ + int j = Predict::ADI_BUF_STRIDE; + int isLuma; + +#if _DEBUG + memset(pixel_out_33_vec, 0xCD, OUTPUT_SIZE_33); + memset(pixel_out_33_c, 0xCD, OUTPUT_SIZE_33); +#endif + + for (int size = 2; size <= 5; size++) + { + if (opt[size - 2] == NULL) continue; + + const int width = (1 << size); + + for (int i = 0; i <= 100; 
i++) + { + isLuma = (width <= 16) ? true : false; // bFilter is true for 4x4, 8x8, 16x16 and false for 32x32 + + pixel * refAbove0 = pixel_buff + j; + pixel * refLeft0 = refAbove0 + 3 * width; + + pixel * refAbove1 = pixel_buff + j + 3 * FENC_STRIDE; // keep this offset, since vector code may broken input buffer range [-(width-1), 0] + pixel * refLeft1 = refAbove1 + 3 * width + FENC_STRIDE; + refLeft0[0] = refAbove0[0] = refLeft1[0] = refAbove1[0]; + + ref[size - 2](pixel_out_33_c, refAbove0, refLeft0, refAbove1, refLeft1, isLuma); + checked(opt[size - 2], pixel_out_33_vec, refAbove0, refLeft0, refAbove1, refLeft1, isLuma); + + for (int p = 2 - 2; p <= 34 - 2; p++) + { + for (int k = 0; k < width; k++) + { + if (memcmp(pixel_out_33_c + p * (width * width) + k * width, pixel_out_33_vec + p * (width * width) + k * width, width * sizeof(pixel))) + { + printf("\nFailed: (%dx%d) Mode(%2d), Line[%2d], bfilter=%d\n", width, width, p + 2, k, isLuma); + opt[size - 2](pixel_out_33_vec, refAbove0, refLeft0, refAbove1, refLeft1, isLuma); + return false; + } + } + } + + reportfail(); + j += FENC_STRIDE; + } + } + + return true; +} + +bool IntraPredHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPrimitives& opt) +{ + for (int i = BLOCK_4x4; i <= BLOCK_32x32; i++) + { + if (opt.intra_pred[1][i]) + { + const int size = (1 << (i + 2)); + if (!check_dc_primitive(ref.intra_pred[1][i], opt.intra_pred[1][i], size)) + { + printf("intra_dc %dx%d failed\n", size, size); + return false; + } + } + if (opt.intra_pred[0][i]) + { + const int size = (1 << (i + 2)); + if (!check_planar_primitive(ref.intra_pred[0][i], opt.intra_pred[0][i], size)) + { + printf("intra_planar %dx%d failed\n", size, size); + return false; + } + } + } + + // NOTE: always call since this function have check pointer in loop + if (!check_angular_primitive(ref.intra_pred, opt.intra_pred)) + { + printf("intra_angular failed\n"); + return false; + } + + if (opt.intra_pred_allangs[0]) + { + if 
(!check_allangs_primitive(ref.intra_pred_allangs, opt.intra_pred_allangs)) + { + printf("intra_allangs failed\n"); + return false; + } + } + + return true; +} + +void IntraPredHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderPrimitives& opt) +{ + int width = 64; + uint16_t srcStride = 96; + + for (int i = BLOCK_4x4; i <= BLOCK_32x32; i++) + { + const int size = (1 << (i + 2)); + if (opt.intra_pred[1][i]) + { + printf("intra_dc_%dx%d[f=0]", size, size); + REPORT_SPEEDUP(opt.intra_pred[1][i], ref.intra_pred[1][i], + pixel_out_vec, FENC_STRIDE, pixel_buff + srcStride, pixel_buff, 0, 0); + if (size <= 16) + { + printf("intra_dc_%dx%d[f=1]", size, size); + REPORT_SPEEDUP(opt.intra_pred[1][i], ref.intra_pred[1][i], + pixel_out_vec, FENC_STRIDE, pixel_buff + srcStride, pixel_buff, 0, 1); + } + } + if (opt.intra_pred[0][i]) + { + printf("intra_planar %2dx%d", size, size); + REPORT_SPEEDUP(opt.intra_pred[0][i], ref.intra_pred[0][i], + pixel_out_vec, FENC_STRIDE, pixel_buff + srcStride, pixel_buff, 0, 0); + } + if (opt.intra_pred_allangs[i]) + { + bool bFilter = (size <= 16); + pixel * refAbove = pixel_buff + srcStride; + pixel * refLeft = refAbove + 3 * size; + refLeft[0] = refAbove[0]; + printf("intra_allangs%dx%d", size, size); + REPORT_SPEEDUP(opt.intra_pred_allangs[i], ref.intra_pred_allangs[i], + pixel_out_33_vec, refAbove, refLeft, refAbove, refLeft, bFilter); + } + } + + for (int ii = 2; ii <= 5; ii++) + { + for (int p = 2; p <= 34; p += 1) + { + int pmode = p; //(rand()%33)+2; + if (opt.intra_pred[pmode][ii - 2]) + { + width = (1 << ii); + bool bFilter = (width <= 16); + pixel * refAbove = pixel_buff + srcStride; + pixel * refLeft = refAbove + 3 * width; + refLeft[0] = refAbove[0]; + printf("intra_ang%dx%d[%2d]", width, width, pmode); + REPORT_SPEEDUP(opt.intra_pred[pmode][ii - 2], ref.intra_pred[pmode][ii - 2], + pixel_out_vec, FENC_STRIDE, refAbove, refLeft, pmode, bFilter); + } + } + } +} diff --git a/source/test/intrapredharness.h 
b/source/test/intrapredharness.h new file mode 100644 index 0000000..622880d --- /dev/null +++ b/source/test/intrapredharness.h @@ -0,0 +1,60 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Min Chen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. 
+ *****************************************************************************/ + +#ifndef _INTRAPREDHARNESS_H_1 +#define _INTRAPREDHARNESS_H_1 1 + +#include "testharness.h" +#include "primitives.h" + +class IntraPredHarness : public TestHarness +{ +protected: + + enum { INPUT_SIZE = 4 * 65 * 65 * 100 }; + enum { OUTPUT_SIZE = 64 * FENC_STRIDE }; + enum { OUTPUT_SIZE_33 = 33 * OUTPUT_SIZE }; + + ALIGN_VAR_16(pixel, pixel_buff[INPUT_SIZE]); + pixel pixel_out_c[OUTPUT_SIZE]; + pixel pixel_out_vec[OUTPUT_SIZE]; + pixel pixel_out_33_c[OUTPUT_SIZE_33]; + pixel pixel_out_33_vec[OUTPUT_SIZE_33]; + + bool check_dc_primitive(intra_pred_t ref, intra_pred_t opt, int width); + bool check_planar_primitive(intra_pred_t ref, intra_pred_t opt, int width); + bool check_angular_primitive(const intra_pred_t ref[][NUM_TR_SIZE], const intra_pred_t opt[][NUM_TR_SIZE]); + bool check_allangs_primitive(const intra_allangs_t ref[], const intra_allangs_t opt[]); + +public: + + IntraPredHarness(); + + const char *getName() const { return "intrapred"; } + + bool testCorrectness(const EncoderPrimitives& ref, const EncoderPrimitives& opt); + + void measureSpeed(const EncoderPrimitives& ref, const EncoderPrimitives& opt); +}; + +#endif // ifndef _INTRAPREDHARNESS_H_1 diff --git a/source/test/ipfilterharness.cpp b/source/test/ipfilterharness.cpp new file mode 100644 index 0000000..f23e84b --- /dev/null +++ b/source/test/ipfilterharness.cpp @@ -0,0 +1,778 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Deepthi Devaki , + * Rajesh Paulraj + * Praveen Kumar Tiwari + * Min Chen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#include "common.h" +#include "ipfilterharness.h" + +using namespace x265; + +IPFilterHarness::IPFilterHarness() +{ + /* [0] --- Random values + * [1] --- Minimum + * [2] --- Maximum */ + for (int i = 0; i < TEST_BUF_SIZE; i++) + { + pixel_test_buff[0][i] = rand() & PIXEL_MAX; + short_test_buff[0][i] = (rand() % (2 * SMAX)) - SMAX; + + pixel_test_buff[1][i] = PIXEL_MIN; + short_test_buff[1][i] = SMIN; + + pixel_test_buff[2][i] = PIXEL_MAX; + short_test_buff[2][i] = SMAX; + } + + memset(IPF_C_output_p, 0xCD, TEST_BUF_SIZE * sizeof(pixel)); + memset(IPF_vec_output_p, 0xCD, TEST_BUF_SIZE * sizeof(pixel)); + memset(IPF_C_output_s, 0xCD, TEST_BUF_SIZE * sizeof(int16_t)); + memset(IPF_vec_output_s, 0xCD, TEST_BUF_SIZE * sizeof(int16_t)); + + int pixelMax = (1 << X265_DEPTH) - 1; + int shortMax = (1 << 15) - 1; + for (int i = 0; i < TEST_BUF_SIZE; i++) + { + pixel_buff[i] = (pixel)(rand() & pixelMax); + int isPositive = (rand() & 1) ? 1 : -1; + short_buff[i] = (int16_t)(isPositive * (rand() & shortMax)); + } +} + +bool IPFilterHarness::check_IPFilter_primitive(filter_p2s_t ref, filter_p2s_t opt, int isChroma, int csp) +{ + intptr_t rand_srcStride; + int min_size = isChroma ? 2 : 4; + int max_size = isChroma ? 
(MAX_CU_SIZE >> 1) : MAX_CU_SIZE; + + if (isChroma && (csp == X265_CSP_I444)) + { + min_size = 4; + max_size = MAX_CU_SIZE; + } + + for (int i = 0; i < ITERS; i++) + { + int index = i % TEST_CASES; + int rand_height = (int16_t)rand() % 100; + int rand_width = (int16_t)rand() % 100; + + rand_srcStride = rand_width + rand() % 100; + if (rand_srcStride < rand_width) + rand_srcStride = rand_width; + + rand_width &= ~(min_size - 1); + rand_width = Clip3(min_size, max_size, rand_width); + + rand_height &= ~(min_size - 1); + rand_height = Clip3(min_size, max_size, rand_height); + + ref(pixel_test_buff[index], + rand_srcStride, + IPF_C_output_s, + rand_width, + rand_height); + + checked(opt, pixel_test_buff[index], + rand_srcStride, + IPF_vec_output_s, + rand_width, + rand_height); + + if (memcmp(IPF_vec_output_s, IPF_C_output_s, TEST_BUF_SIZE * sizeof(int16_t))) + return false; + + reportfail(); + } + + return true; +} + +bool IPFilterHarness::check_IPFilterChroma_primitive(filter_pp_t ref, filter_pp_t opt) +{ + intptr_t rand_srcStride, rand_dstStride; + + for (int i = 0; i < ITERS; i++) + { + int index = i % TEST_CASES; + + for (int coeffIdx = 0; coeffIdx < 8; coeffIdx++) + { + rand_srcStride = rand() % 100 + 2; + rand_dstStride = rand() % 100 + 64; + + checked(opt, pixel_test_buff[index] + 3 * rand_srcStride, + rand_srcStride, + IPF_vec_output_p, + rand_dstStride, + coeffIdx); + + ref(pixel_test_buff[index] + 3 * rand_srcStride, + rand_srcStride, + IPF_C_output_p, + rand_dstStride, + coeffIdx); + + if (memcmp(IPF_vec_output_p, IPF_C_output_p, TEST_BUF_SIZE * sizeof(pixel))) + return false; + + reportfail(); + } + } + + return true; +} + +bool IPFilterHarness::check_IPFilterChroma_ps_primitive(filter_ps_t ref, filter_ps_t opt) +{ + intptr_t rand_srcStride, rand_dstStride; + + for (int i = 0; i < ITERS; i++) + { + int index = i % TEST_CASES; + + for (int coeffIdx = 0; coeffIdx < 8; coeffIdx++) + { + rand_srcStride = rand() % 100; + rand_dstStride = rand() % 100 + 64; + + 
ref(pixel_test_buff[index] + 3 * rand_srcStride, + rand_srcStride, + IPF_C_output_s, + rand_dstStride, + coeffIdx); + + checked(opt, pixel_test_buff[index] + 3 * rand_srcStride, + rand_srcStride, + IPF_vec_output_s, + rand_dstStride, + coeffIdx); + + if (memcmp(IPF_vec_output_s, IPF_C_output_s, TEST_BUF_SIZE * sizeof(int16_t))) + return false; + + reportfail(); + } + } + + return true; +} + +bool IPFilterHarness::check_IPFilterChroma_hps_primitive(filter_hps_t ref, filter_hps_t opt) +{ + intptr_t rand_srcStride, rand_dstStride; + + for (int i = 0; i < ITERS; i++) + { + int index = i % TEST_CASES; + + for (int coeffIdx = 0; coeffIdx < 8; coeffIdx++) + { + // 0 : Interpolate W x H, 1 : Interpolate W x (H + 7) + for (int isRowExt = 0; isRowExt < 2; isRowExt++) + { + rand_srcStride = rand() % 100 + 2; + rand_dstStride = rand() % 100 + 64; + + ref(pixel_test_buff[index] + 3 * rand_srcStride, + rand_srcStride, + IPF_C_output_s, + rand_dstStride, + coeffIdx, + isRowExt); + + checked(opt, pixel_test_buff[index] + 3 * rand_srcStride, + rand_srcStride, + IPF_vec_output_s, + rand_dstStride, + coeffIdx, + isRowExt); + + if (memcmp(IPF_vec_output_s, IPF_C_output_s, TEST_BUF_SIZE * sizeof(int16_t))) + return false; + + reportfail(); + } + } + } + + return true; +} + +bool IPFilterHarness::check_IPFilterChroma_sp_primitive(filter_sp_t ref, filter_sp_t opt) +{ + intptr_t rand_srcStride, rand_dstStride; + + for (int i = 0; i < ITERS; i++) + { + int index = i % TEST_CASES; + + for (int coeffIdx = 0; coeffIdx < 8; coeffIdx++) + { + rand_srcStride = rand() % 100; + rand_dstStride = rand() % 100 + 64; + + ref(short_test_buff[index] + 3 * rand_srcStride, + rand_srcStride, + IPF_C_output_p, + rand_dstStride, + coeffIdx); + + checked(opt, short_test_buff[index] + 3 * rand_srcStride, + rand_srcStride, + IPF_vec_output_p, + rand_dstStride, + coeffIdx); + + if (memcmp(IPF_vec_output_p, IPF_C_output_p, TEST_BUF_SIZE * sizeof(pixel))) + return false; + + reportfail(); + } + } + + return true; 
+} + +bool IPFilterHarness::check_IPFilterChroma_ss_primitive(filter_ss_t ref, filter_ss_t opt) +{ + intptr_t rand_srcStride, rand_dstStride; + + for (int i = 0; i < ITERS; i++) + { + int index = i % TEST_CASES; + + for (int coeffIdx = 0; coeffIdx < 8; coeffIdx++) + { + rand_srcStride = rand() % 100; + rand_dstStride = rand() % 100 + 64; + + ref(short_test_buff[index] + 3 * rand_srcStride, + rand_srcStride, + IPF_C_output_s, + rand_dstStride, + coeffIdx); + + checked(opt, short_test_buff[index] + 3 * rand_srcStride, + rand_srcStride, + IPF_vec_output_s, + rand_dstStride, + coeffIdx); + + if (memcmp(IPF_C_output_s, IPF_vec_output_s, TEST_BUF_SIZE * sizeof(int16_t))) + return false; + + reportfail(); + } + } + + return true; +} + +bool IPFilterHarness::check_IPFilterLuma_primitive(filter_pp_t ref, filter_pp_t opt) +{ + intptr_t rand_srcStride, rand_dstStride; + + for (int i = 0; i < ITERS; i++) + { + int index = i % TEST_CASES; + + for (int coeffIdx = 0; coeffIdx < 4; coeffIdx++) + { + rand_srcStride = rand() % 100; + rand_dstStride = rand() % 100 + 64; + + checked(opt, pixel_test_buff[index] + 3 * rand_srcStride + 6, + rand_srcStride, + IPF_vec_output_p, + rand_dstStride, + coeffIdx); + + ref(pixel_test_buff[index] + 3 * rand_srcStride + 6, + rand_srcStride, + IPF_C_output_p, + rand_dstStride, + coeffIdx); + + if (memcmp(IPF_vec_output_p, IPF_C_output_p, TEST_BUF_SIZE)) + return false; + + reportfail(); + } + } + + return true; +} + +bool IPFilterHarness::check_IPFilterLuma_ps_primitive(filter_ps_t ref, filter_ps_t opt) +{ + intptr_t rand_srcStride, rand_dstStride; + + for (int i = 0; i < ITERS; i++) + { + int index = i % TEST_CASES; + + for (int coeffIdx = 0; coeffIdx < 4; coeffIdx++) + { + rand_srcStride = rand() % 100; + rand_dstStride = rand() % 100 + 64; + + ref(pixel_test_buff[index] + 3 * rand_srcStride, + rand_srcStride, + IPF_C_output_s, + rand_dstStride, + coeffIdx); + + checked(opt, pixel_test_buff[index] + 3 * rand_srcStride, + rand_srcStride, + 
IPF_vec_output_s, + rand_dstStride, + coeffIdx); + + if (memcmp(IPF_vec_output_s, IPF_C_output_s, TEST_BUF_SIZE * sizeof(int16_t))) + return false; + + reportfail(); + } + } + + return true; +} + +bool IPFilterHarness::check_IPFilterLuma_hps_primitive(filter_hps_t ref, filter_hps_t opt) +{ + intptr_t rand_srcStride, rand_dstStride; + + for (int i = 0; i < ITERS; i++) + { + int index = i % TEST_CASES; + + for (int coeffIdx = 0; coeffIdx < 4; coeffIdx++) + { + // 0 : Interpolate W x H, 1 : Interpolate W x (H + 7) + for (int isRowExt = 0; isRowExt < 2; isRowExt++) + { + rand_srcStride = rand() % 100; + rand_dstStride = rand() % 100 + 64; + + ref(pixel_test_buff[index] + 3 * rand_srcStride + 6, + rand_srcStride, + IPF_C_output_s, + rand_dstStride, + coeffIdx, + isRowExt); + + checked(opt, pixel_test_buff[index] + 3 * rand_srcStride + 6, + rand_srcStride, + IPF_vec_output_s, + rand_dstStride, + coeffIdx, + isRowExt); + + if (memcmp(IPF_vec_output_s, IPF_C_output_s, TEST_BUF_SIZE * sizeof(int16_t))) + return false; + + reportfail(); + } + } + } + + return true; +} + +bool IPFilterHarness::check_IPFilterLuma_sp_primitive(filter_sp_t ref, filter_sp_t opt) +{ + intptr_t rand_srcStride, rand_dstStride; + + for (int i = 0; i < ITERS; i++) + { + int index = i % TEST_CASES; + + for (int coeffIdx = 0; coeffIdx < 4; coeffIdx++) + { + rand_srcStride = rand() % 100; + rand_dstStride = rand() % 100 + 64; + + ref(short_test_buff[index] + 3 * rand_srcStride, + rand_srcStride, + IPF_C_output_p, + rand_dstStride, + coeffIdx); + + checked(opt, short_test_buff[index] + 3 * rand_srcStride, + rand_srcStride, + IPF_vec_output_p, + rand_dstStride, + coeffIdx); + + if (memcmp(IPF_vec_output_p, IPF_C_output_p, TEST_BUF_SIZE * sizeof(pixel))) + return false; + + reportfail(); + } + } + + return true; +} + +bool IPFilterHarness::check_IPFilterLuma_ss_primitive(filter_ss_t ref, filter_ss_t opt) +{ + intptr_t rand_srcStride, rand_dstStride; + + for (int i = 0; i < ITERS; i++) + { + int index = i % 
TEST_CASES; + + for (int coeffIdx = 0; coeffIdx < 4; coeffIdx++) + { + rand_srcStride = rand() % 100; + rand_dstStride = rand() % 100 + 64; + + ref(short_test_buff[index] + 3 * rand_srcStride, + rand_srcStride, + IPF_C_output_s, + rand_dstStride, + coeffIdx); + + checked(opt, short_test_buff[index] + 3 * rand_srcStride, + rand_srcStride, + IPF_vec_output_s, + rand_dstStride, + coeffIdx); + + if (memcmp(IPF_C_output_s, IPF_vec_output_s, TEST_BUF_SIZE * sizeof(int16_t))) + return false; + + reportfail(); + } + } + + return true; +} + +bool IPFilterHarness::check_IPFilterLumaHV_primitive(filter_hv_pp_t ref, filter_hv_pp_t opt) +{ + intptr_t rand_srcStride, rand_dstStride; + + for (int i = 0; i < ITERS; i++) + { + int index = i % TEST_CASES; + + for (int coeffIdxX = 0; coeffIdxX < 4; coeffIdxX++) + { + for (int coeffIdxY = 0; coeffIdxY < 4; coeffIdxY++) + { + rand_srcStride = rand() % 100; + rand_dstStride = rand() % 100 + 64; + + ref(pixel_test_buff[index] + 3 * rand_srcStride, + rand_srcStride, + IPF_C_output_p, + rand_dstStride, + coeffIdxX, + coeffIdxY); + + checked(opt, pixel_test_buff[index] + 3 * rand_srcStride, + rand_srcStride, + IPF_vec_output_p, + rand_dstStride, + coeffIdxX, + coeffIdxY); + + if (memcmp(IPF_vec_output_p, IPF_C_output_p, TEST_BUF_SIZE * sizeof(pixel))) + return false; + + reportfail(); + } + } + } + + return true; +} + +bool IPFilterHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPrimitives& opt) +{ + if (opt.luma_p2s) + { + // last parameter does not matter in case of luma + if (!check_IPFilter_primitive(ref.luma_p2s, opt.luma_p2s, 0, 1)) + { + printf("luma_p2s failed\n"); + return false; + } + } + + for (int value = 0; value < NUM_LUMA_PARTITIONS; value++) + { + if (opt.luma_hpp[value]) + { + if (!check_IPFilterLuma_primitive(ref.luma_hpp[value], opt.luma_hpp[value])) + { + printf("luma_hpp[%s]", lumaPartStr[value]); + return false; + } + } + if (opt.luma_hps[value]) + { + if 
(!check_IPFilterLuma_hps_primitive(ref.luma_hps[value], opt.luma_hps[value])) + { + printf("luma_hps[%s]", lumaPartStr[value]); + return false; + } + } + if (opt.luma_vpp[value]) + { + if (!check_IPFilterLuma_primitive(ref.luma_vpp[value], opt.luma_vpp[value])) + { + printf("luma_vpp[%s]", lumaPartStr[value]); + return false; + } + } + if (opt.luma_vps[value]) + { + if (!check_IPFilterLuma_ps_primitive(ref.luma_vps[value], opt.luma_vps[value])) + { + printf("luma_vps[%s]", lumaPartStr[value]); + return false; + } + } + if (opt.luma_vsp[value]) + { + if (!check_IPFilterLuma_sp_primitive(ref.luma_vsp[value], opt.luma_vsp[value])) + { + printf("luma_vsp[%s]", lumaPartStr[value]); + return false; + } + } + if (opt.luma_vss[value]) + { + if (!check_IPFilterLuma_ss_primitive(ref.luma_vss[value], opt.luma_vss[value])) + { + printf("luma_vss[%s]", lumaPartStr[value]); + return false; + } + } + if (opt.luma_hvpp[value]) + { + if (!check_IPFilterLumaHV_primitive(ref.luma_hvpp[value], opt.luma_hvpp[value])) + { + printf("luma_hvpp[%s]", lumaPartStr[value]); + return false; + } + } + } + + for (int csp = X265_CSP_I420; csp < X265_CSP_COUNT; csp++) + { + if (opt.chroma_p2s[csp]) + { + if (!check_IPFilter_primitive(ref.chroma_p2s[csp], opt.chroma_p2s[csp], 1, csp)) + { + printf("chroma_p2s[%s]", x265_source_csp_names[csp]); + return false; + } + } + for (int value = 0; value < NUM_CHROMA_PARTITIONS; value++) + { + if (opt.chroma[csp].filter_hpp[value]) + { + if (!check_IPFilterChroma_primitive(ref.chroma[csp].filter_hpp[value], opt.chroma[csp].filter_hpp[value])) + { + printf("chroma_hpp[%s]", chromaPartStr[csp][value]); + return false; + } + } + if (opt.chroma[csp].filter_hps[value]) + { + if (!check_IPFilterChroma_hps_primitive(ref.chroma[csp].filter_hps[value], opt.chroma[csp].filter_hps[value])) + { + printf("chroma_hps[%s]", chromaPartStr[csp][value]); + return false; + } + } + if (opt.chroma[csp].filter_vpp[value]) + { + if 
(!check_IPFilterChroma_primitive(ref.chroma[csp].filter_vpp[value], opt.chroma[csp].filter_vpp[value])) + { + printf("chroma_vpp[%s]", chromaPartStr[csp][value]); + return false; + } + } + if (opt.chroma[csp].filter_vps[value]) + { + if (!check_IPFilterChroma_ps_primitive(ref.chroma[csp].filter_vps[value], opt.chroma[csp].filter_vps[value])) + { + printf("chroma_vps[%s]", chromaPartStr[csp][value]); + return false; + } + } + if (opt.chroma[csp].filter_vsp[value]) + { + if (!check_IPFilterChroma_sp_primitive(ref.chroma[csp].filter_vsp[value], opt.chroma[csp].filter_vsp[value])) + { + printf("chroma_vsp[%s]", chromaPartStr[csp][value]); + return false; + } + } + if (opt.chroma[csp].filter_vss[value]) + { + if (!check_IPFilterChroma_ss_primitive(ref.chroma[csp].filter_vss[value], opt.chroma[csp].filter_vss[value])) + { + printf("chroma_vss[%s]", chromaPartStr[csp][value]); + return false; + } + } + } + } + + return true; +} + +void IPFilterHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderPrimitives& opt) +{ + int height = 64; + int width = 64; + int16_t srcStride = 96; + int16_t dstStride = 96; + int maxVerticalfilterHalfDistance = 3; + + if (opt.luma_p2s) + { + printf("luma_p2s\t"); + REPORT_SPEEDUP(opt.luma_p2s, ref.luma_p2s, + pixel_buff, srcStride, IPF_vec_output_s, width, height); + } + + for (int value = 0; value < NUM_LUMA_PARTITIONS; value++) + { + if (opt.luma_hpp[value]) + { + printf("luma_hpp[%s]\t", lumaPartStr[value]); + REPORT_SPEEDUP(opt.luma_hpp[value], ref.luma_hpp[value], + pixel_buff + srcStride, srcStride, IPF_vec_output_p, dstStride, 1); + } + + if (opt.luma_hps[value]) + { + printf("luma_hps[%s]\t", lumaPartStr[value]); + REPORT_SPEEDUP(opt.luma_hps[value], ref.luma_hps[value], + pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride, + IPF_vec_output_s, dstStride, 1, 1); + } + + if (opt.luma_vpp[value]) + { + printf("luma_vpp[%s]\t", lumaPartStr[value]); + REPORT_SPEEDUP(opt.luma_vpp[value], ref.luma_vpp[value], + 
pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride, + IPF_vec_output_p, dstStride, 1); + } + + if (opt.luma_vps[value]) + { + printf("luma_vps[%s]\t", lumaPartStr[value]); + REPORT_SPEEDUP(opt.luma_vps[value], ref.luma_vps[value], + pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride, + IPF_vec_output_s, dstStride, 1); + } + + if (opt.luma_vsp[value]) + { + printf("luma_vsp[%s]\t", lumaPartStr[value]); + REPORT_SPEEDUP(opt.luma_vsp[value], ref.luma_vsp[value], + short_buff + maxVerticalfilterHalfDistance * srcStride, srcStride, + IPF_vec_output_p, dstStride, 1); + } + + if (opt.luma_vss[value]) + { + printf("luma_vss[%s]\t", lumaPartStr[value]); + REPORT_SPEEDUP(opt.luma_vss[value], ref.luma_vss[value], + short_buff + maxVerticalfilterHalfDistance * srcStride, srcStride, + IPF_vec_output_s, dstStride, 1); + } + + if (opt.luma_hvpp[value]) + { + printf("luma_hv [%s]\t", lumaPartStr[value]); + REPORT_SPEEDUP(opt.luma_hvpp[value], ref.luma_hvpp[value], + pixel_buff + 3 * srcStride, srcStride, IPF_vec_output_p, srcStride, 1, 3); + } + } + + for (int csp = X265_CSP_I420; csp < X265_CSP_COUNT; csp++) + { + printf("= Color Space %s =\n", x265_source_csp_names[csp]); + if (opt.chroma_p2s[csp]) + { + printf("chroma_p2s\t"); + REPORT_SPEEDUP(opt.chroma_p2s[csp], ref.chroma_p2s[csp], + pixel_buff, srcStride, IPF_vec_output_s, width, height); + } + for (int value = 0; value < NUM_CHROMA_PARTITIONS; value++) + { + if (opt.chroma[csp].filter_hpp[value]) + { + printf("chroma_hpp[%s]", chromaPartStr[csp][value]); + REPORT_SPEEDUP(opt.chroma[csp].filter_hpp[value], ref.chroma[csp].filter_hpp[value], + pixel_buff + srcStride, srcStride, IPF_vec_output_p, dstStride, 1); + } + if (opt.chroma[csp].filter_hps[value]) + { + printf("chroma_hps[%s]", chromaPartStr[csp][value]); + REPORT_SPEEDUP(opt.chroma[csp].filter_hps[value], ref.chroma[csp].filter_hps[value], + pixel_buff + srcStride, srcStride, IPF_vec_output_s, dstStride, 1, 1); + } + if 
(opt.chroma[csp].filter_vpp[value]) + { + printf("chroma_vpp[%s]", chromaPartStr[csp][value]); + REPORT_SPEEDUP(opt.chroma[csp].filter_vpp[value], ref.chroma[csp].filter_vpp[value], + pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride, + IPF_vec_output_p, dstStride, 1); + } + if (opt.chroma[csp].filter_vps[value]) + { + printf("chroma_vps[%s]", chromaPartStr[csp][value]); + REPORT_SPEEDUP(opt.chroma[csp].filter_vps[value], ref.chroma[csp].filter_vps[value], + pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride, + IPF_vec_output_s, dstStride, 1); + } + if (opt.chroma[csp].filter_vsp[value]) + { + printf("chroma_vsp[%s]", chromaPartStr[csp][value]); + REPORT_SPEEDUP(opt.chroma[csp].filter_vsp[value], ref.chroma[csp].filter_vsp[value], + short_buff + maxVerticalfilterHalfDistance * srcStride, srcStride, + IPF_vec_output_p, dstStride, 1); + } + if (opt.chroma[csp].filter_vss[value]) + { + printf("chroma_vss[%s]", chromaPartStr[csp][value]); + REPORT_SPEEDUP(opt.chroma[csp].filter_vss[value], ref.chroma[csp].filter_vss[value], + short_buff + maxVerticalfilterHalfDistance * srcStride, srcStride, + IPF_vec_output_s, dstStride, 1); + } + } + } +} diff --git a/source/test/ipfilterharness.h b/source/test/ipfilterharness.h new file mode 100644 index 0000000..580ea18 --- /dev/null +++ b/source/test/ipfilterharness.h @@ -0,0 +1,77 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Deepthi Devaki , + * Rajesh Paulraj + * Praveen Kumar Tiwari + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#ifndef _IPFILTERHARNESS_H_1 +#define _IPFILTERHARNESS_H_1 1 + +#include "testharness.h" +#include "primitives.h" + +class IPFilterHarness : public TestHarness +{ +protected: + + // Assuming max_height = max_width = max_srcStride = max_dstStride = 100 + enum { TEST_BUF_SIZE = 200 * 200 }; + enum { ITERS = 100 }; + enum { TEST_CASES = 3 }; + enum { SMAX = 1 << 12 }; + enum { SMIN = -1 << 12 }; + + ALIGN_VAR_32(pixel, pixel_buff[TEST_BUF_SIZE]); + int16_t short_buff[TEST_BUF_SIZE]; + int16_t IPF_vec_output_s[TEST_BUF_SIZE]; + int16_t IPF_C_output_s[TEST_BUF_SIZE]; + pixel IPF_vec_output_p[TEST_BUF_SIZE]; + pixel IPF_C_output_p[TEST_BUF_SIZE]; + + pixel pixel_test_buff[TEST_CASES][TEST_BUF_SIZE]; + int16_t short_test_buff[TEST_CASES][TEST_BUF_SIZE]; + + bool check_IPFilter_primitive(filter_p2s_t ref, filter_p2s_t opt, int isChroma, int csp); + bool check_IPFilterChroma_primitive(filter_pp_t ref, filter_pp_t opt); + bool check_IPFilterChroma_ps_primitive(filter_ps_t ref, filter_ps_t opt); + bool check_IPFilterChroma_hps_primitive(filter_hps_t ref, filter_hps_t opt); + bool check_IPFilterChroma_sp_primitive(filter_sp_t ref, filter_sp_t opt); + bool check_IPFilterChroma_ss_primitive(filter_ss_t ref, filter_ss_t opt); + bool check_IPFilterLuma_primitive(filter_pp_t ref, filter_pp_t opt); + 
bool check_IPFilterLuma_ps_primitive(filter_ps_t ref, filter_ps_t opt); + bool check_IPFilterLuma_hps_primitive(filter_hps_t ref, filter_hps_t opt); + bool check_IPFilterLuma_sp_primitive(filter_sp_t ref, filter_sp_t opt); + bool check_IPFilterLuma_ss_primitive(filter_ss_t ref, filter_ss_t opt); + bool check_IPFilterLumaHV_primitive(filter_hv_pp_t ref, filter_hv_pp_t opt); + +public: + + IPFilterHarness(); + + const char *getName() const { return "interp"; } + + bool testCorrectness(const EncoderPrimitives& ref, const EncoderPrimitives& opt); + + void measureSpeed(const EncoderPrimitives& ref, const EncoderPrimitives& opt); +}; + +#endif // ifndef _FILTERHARNESS_H_1 diff --git a/source/test/mbdstharness.cpp b/source/test/mbdstharness.cpp new file mode 100644 index 0000000..88e4676 --- /dev/null +++ b/source/test/mbdstharness.cpp @@ -0,0 +1,509 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * Min Chen + * Praveen Kumar Tiwari + * Nabajit Deka + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. 
+ *****************************************************************************/ + +#include "common.h" +#include "mbdstharness.h" + +using namespace x265; + +struct DctConf +{ + const char *name; + int width; +}; + +const DctConf dctInfo[] = +{ + { "dst4x4\t", 4 }, + { "dct4x4\t", 4 }, + { "dct8x8\t", 8 }, + { "dct16x16", 16 }, + { "dct32x32", 32 }, +}; + +const DctConf idctInfo[] = +{ + { "idst4x4\t", 4 }, + { "idct4x4\t", 4 }, + { "idct8x8\t", 8 }, + { "idct16x16", 16 }, + { "idct32x32", 32 }, +}; + +MBDstHarness::MBDstHarness() +{ + const int idct_max = (1 << (BIT_DEPTH + 4)) - 1; + + /* [0] --- Random values + * [1] --- Minimum + * [2] --- Maximum */ + for (int i = 0; i < TEST_BUF_SIZE; i++) + { + short_test_buff[0][i] = (rand() & PIXEL_MAX) - (rand() & PIXEL_MAX); + int_test_buff[0][i] = rand() % PIXEL_MAX; + int_idct_test_buff[0][i] = (rand() % (SHORT_MAX - SHORT_MIN)) - SHORT_MAX; + int_denoise_test_buff1[0][i] = int_denoise_test_buff2[0][i] = (rand() & UNSIGNED_SHORT_MAX) - (rand() & UNSIGNED_SHORT_MAX); + + short_test_buff[1][i] = -PIXEL_MAX; + int_test_buff[1][i] = -PIXEL_MAX; + int_idct_test_buff[1][i] = SHORT_MIN; + int_denoise_test_buff1[1][i] = int_denoise_test_buff2[1][i] = -UNSIGNED_SHORT_MAX; + + short_test_buff[2][i] = PIXEL_MAX; + int_test_buff[2][i] = PIXEL_MAX; + int_idct_test_buff[2][i] = SHORT_MAX; + int_denoise_test_buff1[2][i] = int_denoise_test_buff2[2][i] = UNSIGNED_SHORT_MAX; + + mbuf1[i] = rand() & PIXEL_MAX; + mbufdct[i] = (rand() & PIXEL_MAX) - (rand() & PIXEL_MAX); + mbufidct[i] = (rand() & idct_max); + } + +#if _DEBUG + memset(mshortbuf2, 0, MAX_TU_SIZE * sizeof(int16_t)); + memset(mshortbuf3, 0, MAX_TU_SIZE * sizeof(int16_t)); + + memset(mintbuf1, 0, MAX_TU_SIZE * sizeof(int)); + memset(mintbuf2, 0, MAX_TU_SIZE * sizeof(int)); + memset(mintbuf3, 0, MAX_TU_SIZE * sizeof(int)); + memset(mintbuf4, 0, MAX_TU_SIZE * sizeof(int)); +#endif // if _DEBUG +} + +bool MBDstHarness::check_dct_primitive(dct_t ref, dct_t opt, intptr_t width) +{ 
+ int j = 0; + intptr_t cmp_size = sizeof(int) * width * width; + + for (int i = 0; i < ITERS; i++) + { + int index = rand() % TEST_CASES; + + ref(short_test_buff[index] + j, mintbuf3, width); + checked(opt, short_test_buff[index] + j, mintbuf4, width); + + if (memcmp(mintbuf3, mintbuf4, cmp_size)) + return false; + + reportfail(); + j += INCR; + } + + return true; +} + +bool MBDstHarness::check_idct_primitive(idct_t ref, idct_t opt, intptr_t width) +{ + int j = 0; + intptr_t cmp_size = sizeof(int16_t) * width * width; + + for (int i = 0; i < ITERS; i++) + { + int index = rand() % TEST_CASES; + + ref(int_idct_test_buff[index] + j, mshortbuf2, width); + checked(opt, int_idct_test_buff[index] + j, mshortbuf3, width); + + if (memcmp(mshortbuf2, mshortbuf3, cmp_size)) + return false; + + reportfail(); + j += INCR; + } + + return true; +} + +bool MBDstHarness::check_dequant_primitive(dequant_normal_t ref, dequant_normal_t opt) +{ + int j = 0; + + for (int i = 0; i < ITERS; i++) + { + int index = rand() % TEST_CASES; + int log2TrSize = (rand() % 4) + 2; + + int width = (1 << log2TrSize); + int height = width; + int qp = rand() % (QP_MAX_SPEC + QP_BD_OFFSET + 1); + int per = qp / 6; + int rem = qp % 6; + static const int invQuantScales[6] = { 40, 45, 51, 57, 64, 72 }; + int scale = invQuantScales[rem] << per; + int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; + int shift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift; + + ref(short_test_buff[index] + j, mintbuf3, width * height, scale, shift); + checked(opt, short_test_buff[index] + j, mintbuf4, width * height, scale, shift); + + if (memcmp(mintbuf3, mintbuf4, sizeof(int) * height * width)) + return false; + + reportfail(); + j += INCR; + } + + return true; +} + +bool MBDstHarness::check_dequant_primitive(dequant_scaling_t ref, dequant_scaling_t opt) +{ + int j = 0; + + for (int i = 0; i < ITERS; i++) + { + int log2TrSize = (rand() % 4) + 2; + + int width = (1 << log2TrSize); + int height = width; 
+ + int qp = rand() % (QP_MAX_SPEC + QP_BD_OFFSET + 1); + int per = qp / 6; + int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; + int shift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift; + + int cmp_size = sizeof(int) * height * width; + int index1 = rand() % TEST_CASES; + + ref(short_test_buff[index1] + j, mintbuf3, mintbuf1, width * height, per, shift); + checked(opt, short_test_buff[index1] + j, mintbuf4, mintbuf2, width * height, per, shift); + + if (memcmp(mintbuf1, mintbuf2, cmp_size)) + return false; + + reportfail(); + j += INCR; + } + + return true; +} + +bool MBDstHarness::check_quant_primitive(quant_t ref, quant_t opt) +{ + int j = 0; + + for (int i = 0; i < ITERS; i++) + { + int width = (rand() % 4 + 1) * 4; + int height = width; + + uint32_t optReturnValue = 0; + uint32_t refReturnValue = 0; + + int bits = (rand() % 24) + 8; + int valueToAdd = rand() % (1 << bits); + int cmp_size = sizeof(int) * height * width; + int cmp_size1 = sizeof(short) * height * width; + int numCoeff = height * width; + + int index1 = rand() % TEST_CASES; + int index2 = rand() % TEST_CASES; + + refReturnValue = ref(int_test_buff[index1] + j, int_test_buff[index2] + j, mintbuf1, mshortbuf2, bits, valueToAdd, numCoeff); + optReturnValue = (uint32_t)checked(opt, int_test_buff[index1] + j, int_test_buff[index2] + j, mintbuf3, mshortbuf3, bits, valueToAdd, numCoeff); + + if (memcmp(mintbuf1, mintbuf3, cmp_size)) + return false; + + if (memcmp(mshortbuf2, mshortbuf3, cmp_size1)) + return false; + + if (optReturnValue != refReturnValue) + return false; + + reportfail(); + j += INCR; + } + + return true; +} + +bool MBDstHarness::check_nquant_primitive(nquant_t ref, nquant_t opt) +{ + int j = 0; + + for (int i = 0; i < ITERS; i++) + { + int width = (rand() % 4 + 1) * 4; + int height = width; + + uint32_t optReturnValue = 0; + uint32_t refReturnValue = 0; + + int bits = rand() % 32; + int valueToAdd = rand() % (1 << bits); + int cmp_size = sizeof(short) * height 
* width; + int numCoeff = height * width; + + int index1 = rand() % TEST_CASES; + int index2 = rand() % TEST_CASES; + + refReturnValue = ref(int_test_buff[index1] + j, int_test_buff[index2] + j, mshortbuf2, bits, valueToAdd, numCoeff); + optReturnValue = (uint32_t)checked(opt, int_test_buff[index1] + j, int_test_buff[index2] + j, mshortbuf3, bits, valueToAdd, numCoeff); + + if (memcmp(mshortbuf2, mshortbuf3, cmp_size)) + return false; + + if (optReturnValue != refReturnValue) + return false; + + reportfail(); + j += INCR; + } + + return true; +} + +bool MBDstHarness::check_count_nonzero_primitive(count_nonzero_t ref, count_nonzero_t opt) +{ + ALIGN_VAR_32(int16_t, qcoeff[32 * 32]); + + for (int i = 0; i < 4; i++) + { + int log2TrSize = i + 2; + int num = 1 << (log2TrSize * 2); + int mask = num - 1; + + for (int n = 0; n <= num; n++) + { + memset(qcoeff, 0, num * sizeof(int16_t)); + + for (int j = 0; j < n; j++) + { + int k = rand() & mask; + while (qcoeff[k]) + { + k = (k + 11) & mask; + } + + qcoeff[k] = (int16_t)rand() - RAND_MAX / 2; + } + + int refval = ref(qcoeff, num); + int optval = (int)checked(opt, qcoeff, num); + + if (refval != optval) + return false; + + reportfail(); + } + } + + return true; +} + +bool MBDstHarness::check_denoise_dct_primitive(denoiseDct_t ref, denoiseDct_t opt) +{ + int j = 0; + + for (int s = 0; s < 4; s++) + { + int log2TrSize = s + 2; + int num = 1 << (log2TrSize * 2); + int cmp_size = sizeof(int) * num; + + for (int i = 0; i < ITERS; i++) + { + memset(mubuf1, 0, num * sizeof(uint32_t)); + memset(mubuf2, 0, num * sizeof(uint32_t)); + memset(mushortbuf1, 0, num * sizeof(uint16_t)); + + for (int k = 0; k < num; k++) + mushortbuf1[k] = rand() % UNSIGNED_SHORT_MAX; + + int index = rand() % TEST_CASES; + + ref(int_denoise_test_buff1[index] + j, mubuf1, mushortbuf1, num); + checked(opt, int_denoise_test_buff2[index] + j, mubuf2, mushortbuf1, num); + + if (memcmp(int_denoise_test_buff1[index] + j, int_denoise_test_buff2[index] + j, 
cmp_size)) + return false; + + if (memcmp(mubuf1, mubuf2, cmp_size)) + return false; + + reportfail(); + j += INCR; + } + j = 0; + } + + return true; +} + + +bool MBDstHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPrimitives& opt) +{ + for (int i = 0; i < NUM_DCTS; i++) + { + if (opt.dct[i]) + { + if (!check_dct_primitive(ref.dct[i], opt.dct[i], dctInfo[i].width)) + { + printf("\n%s failed\n", dctInfo[i].name); + return false; + } + } + } + + for (int i = 0; i < NUM_IDCTS; i++) + { + if (opt.idct[i]) + { + if (!check_idct_primitive(ref.idct[i], opt.idct[i], idctInfo[i].width)) + { + printf("%s failed\n", idctInfo[i].name); + return false; + } + } + } + + if (opt.dequant_normal) + { + if (!check_dequant_primitive(ref.dequant_normal, opt.dequant_normal)) + { + printf("dequant: Failed!\n"); + return false; + } + } + + if (opt.dequant_scaling) + { + if (!check_dequant_primitive(ref.dequant_scaling, opt.dequant_scaling)) + { + printf("dequant_scaling: Failed!\n"); + return false; + } + } + + if (opt.quant) + { + if (!check_quant_primitive(ref.quant, opt.quant)) + { + printf("quant: Failed!\n"); + return false; + } + } + + if (opt.nquant) + { + if (!check_nquant_primitive(ref.nquant, opt.nquant)) + { + printf("nquant: Failed!\n"); + return false; + } + } + + if (opt.count_nonzero) + { + if (!check_count_nonzero_primitive(ref.count_nonzero, opt.count_nonzero)) + { + printf("count_nonzero: Failed!\n"); + return false; + } + } + + if (opt.dequant_scaling) + { + if (!check_dequant_primitive(ref.dequant_scaling, opt.dequant_scaling)) + { + printf("dequant_scaling: Failed!\n"); + return false; + } + } + + if (opt.denoiseDct) + { + if (!check_denoise_dct_primitive(ref.denoiseDct, opt.denoiseDct)) + { + printf("denoiseDct: Failed!\n"); + return false; + } + } + + return true; +} + +void MBDstHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderPrimitives& opt) +{ + for (int value = 0; value < NUM_DCTS; value++) + { + if (opt.dct[value]) + { + 
printf("%s\t", dctInfo[value].name); + REPORT_SPEEDUP(opt.dct[value], ref.dct[value], mbuf1, mintbuf3, dctInfo[value].width); + } + } + + for (int value = 0; value < NUM_IDCTS; value++) + { + if (opt.idct[value]) + { + printf("%s\t", idctInfo[value].name); + REPORT_SPEEDUP(opt.idct[value], ref.idct[value], mbufidct, mshortbuf2, idctInfo[value].width); + } + } + + if (opt.dequant_normal) + { + printf("dequant_normal\t"); + REPORT_SPEEDUP(opt.dequant_normal, ref.dequant_normal, short_test_buff[0], mintbuf3, 32 * 32, 70, 1); + } + + if (opt.dequant_scaling) + { + printf("dequant_scaling\t"); + REPORT_SPEEDUP(opt.dequant_scaling, ref.dequant_scaling, short_test_buff[0], mintbuf3, mintbuf4, 32 * 32, 5, 1); + } + + if (opt.quant) + { + printf("quant\t\t"); + REPORT_SPEEDUP(opt.quant, ref.quant, int_test_buff[0], int_test_buff[1], mintbuf3, mshortbuf2, 23, 23785, 32 * 32); + } + + if (opt.nquant) + { + printf("nquant\t\t"); + REPORT_SPEEDUP(opt.nquant, ref.nquant, int_test_buff[0], int_test_buff[1], mshortbuf2, 23, 23785, 32 * 32); + } + + if (opt.count_nonzero) + { + for (int i = 4; i <= 32; i <<= 1) + { + printf("count_nonzero[%dx%d]", i, i); + REPORT_SPEEDUP(opt.count_nonzero, ref.count_nonzero, mbuf1, i * i) + } + } + + if (opt.denoiseDct) + { + printf("denoiseDct\t"); + REPORT_SPEEDUP(opt.denoiseDct, ref.denoiseDct, int_denoise_test_buff1[0], mubuf1, mushortbuf1, 32 * 32); + } + +} diff --git a/source/test/mbdstharness.h b/source/test/mbdstharness.h new file mode 100644 index 0000000..a8b4de2 --- /dev/null +++ b/source/test/mbdstharness.h @@ -0,0 +1,86 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * Min Chen + * Praveen Kumar Tiwari + * Nabajit Deka + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or 
+ * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#ifndef _MBDSTHARNESS_H_1 +#define _MBDSTHARNESS_H_1 1 + +#include "testharness.h" +#include "primitives.h" + +class MBDstHarness : public TestHarness +{ +protected: + + enum { ITERS = 128 }; + enum { INCR = 16 }; + enum { MAX_TU_SIZE = 32 * 32 }; + enum { TEST_BUF_SIZE = MAX_TU_SIZE + ITERS * INCR }; + enum { TEST_CASES = 3 }; + + ALIGN_VAR_32(int16_t, mbuf1[TEST_BUF_SIZE]); + int16_t mbufdct[TEST_BUF_SIZE]; + int mbufidct[TEST_BUF_SIZE]; + + int16_t mshortbuf2[MAX_TU_SIZE]; + int16_t mshortbuf3[MAX_TU_SIZE]; + + int mintbuf1[MAX_TU_SIZE]; + int mintbuf2[MAX_TU_SIZE]; + int mintbuf3[MAX_TU_SIZE]; + int mintbuf4[MAX_TU_SIZE]; + + int16_t short_test_buff[TEST_CASES][TEST_BUF_SIZE]; + int int_test_buff[TEST_CASES][TEST_BUF_SIZE]; + int int_idct_test_buff[TEST_CASES][TEST_BUF_SIZE]; + + uint32_t mubuf1[MAX_TU_SIZE]; + uint32_t mubuf2[MAX_TU_SIZE]; + uint16_t mushortbuf1[MAX_TU_SIZE]; + + int int_denoise_test_buff1[TEST_CASES][TEST_BUF_SIZE]; + int int_denoise_test_buff2[TEST_CASES][TEST_BUF_SIZE]; + + bool check_dequant_primitive(dequant_scaling_t ref, dequant_scaling_t opt); + bool check_dequant_primitive(dequant_normal_t ref, dequant_normal_t opt); + bool check_quant_primitive(quant_t ref, quant_t opt); + bool check_nquant_primitive(nquant_t ref, 
nquant_t opt); + bool check_dct_primitive(dct_t ref, dct_t opt, intptr_t width); + bool check_idct_primitive(idct_t ref, idct_t opt, intptr_t width); + bool check_count_nonzero_primitive(count_nonzero_t ref, count_nonzero_t opt); + bool check_denoise_dct_primitive(denoiseDct_t ref, denoiseDct_t opt); + +public: + + MBDstHarness(); + + const char *getName() const { return "transforms"; } + + bool testCorrectness(const EncoderPrimitives& ref, const EncoderPrimitives& opt); + + void measureSpeed(const EncoderPrimitives& ref, const EncoderPrimitives& opt); +}; + +#endif // ifndef _MBDSTHARNESS_H_1 diff --git a/source/test/pixelharness.cpp b/source/test/pixelharness.cpp new file mode 100644 index 0000000..bb6e0e6 --- /dev/null +++ b/source/test/pixelharness.cpp @@ -0,0 +1,1781 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. 
+ *****************************************************************************/ + +#include "pixelharness.h" +#include "primitives.h" + +using namespace x265; + +PixelHarness::PixelHarness() +{ + /* [0] --- Random values + * [1] --- Minimum + * [2] --- Maximum */ + for (int i = 0; i < BUFFSIZE; i++) + { + pixel_test_buff[0][i] = rand() % PIXEL_MAX; + short_test_buff[0][i] = (rand() % (2 * SMAX + 1)) - SMAX - 1; // max(SHORT_MIN, min(rand(), SMAX)); + short_test_buff1[0][i] = rand() & PIXEL_MAX; // For block copy only + short_test_buff2[0][i] = rand() % 16383; // for addAvg + int_test_buff[0][i] = rand() % SHORT_MAX; + ushort_test_buff[0][i] = rand() % ((1 << 16) - 1); + uchar_test_buff[0][i] = rand() % ((1 << 8) - 1); + + pixel_test_buff[1][i] = PIXEL_MIN; + short_test_buff[1][i] = SMIN; + short_test_buff1[1][i] = PIXEL_MIN; + short_test_buff2[1][i] = -16384; + int_test_buff[1][i] = SHORT_MIN; + ushort_test_buff[1][i] = PIXEL_MIN; + uchar_test_buff[1][i] = PIXEL_MIN; + + pixel_test_buff[2][i] = PIXEL_MAX; + short_test_buff[2][i] = SMAX; + short_test_buff1[2][i] = PIXEL_MAX; + short_test_buff2[2][i] = 16383; + int_test_buff[2][i] = SHORT_MAX; + ushort_test_buff[2][i] = ((1 << 16) - 1); + uchar_test_buff[2][i] = 255; + + pbuf1[i] = rand() & PIXEL_MAX; + pbuf2[i] = rand() & PIXEL_MAX; + pbuf3[i] = rand() & PIXEL_MAX; + pbuf4[i] = rand() & PIXEL_MAX; + + sbuf1[i] = (rand() % (2 * SMAX + 1)) - SMAX - 1; //max(SHORT_MIN, min(rand(), SMAX)); + sbuf2[i] = (rand() % (2 * SMAX + 1)) - SMAX - 1; //max(SHORT_MIN, min(rand(), SMAX)); + ibuf1[i] = (rand() % (2 * SMAX + 1)) - SMAX - 1; + psbuf1[i] = (rand() % 65) - 32; // range is between -32 to 32 + sbuf3[i] = rand() % PIXEL_MAX; // for blockcopy only + } +} + +bool PixelHarness::check_pixelcmp(pixelcmp_t ref, pixelcmp_t opt) +{ + int j = 0; + intptr_t stride = STRIDE; + + for (int i = 0; i < ITERS; i++) + { + int index1 = rand() % TEST_CASES; + int index2 = rand() % TEST_CASES; + int vres = (int)checked(opt, 
pixel_test_buff[index1], stride, pixel_test_buff[index2] + j, stride); + int cres = ref(pixel_test_buff[index1], stride, pixel_test_buff[index2] + j, stride); + if (vres != cres) + return false; + + reportfail(); + j += INCR; + } + + return true; +} + +bool PixelHarness::check_pixelcmp_sp(pixelcmp_sp_t ref, pixelcmp_sp_t opt) +{ + int j = 0; + intptr_t stride = STRIDE; + + for (int i = 0; i < ITERS; i++) + { + int index1 = rand() % TEST_CASES; + int index2 = rand() % TEST_CASES; + int vres = (int)checked(opt, short_test_buff[index1], stride, pixel_test_buff[index2] + j, stride); + int cres = ref(short_test_buff[index1], stride, pixel_test_buff[index2] + j, stride); + if (vres != cres) + return false; + + reportfail(); + j += INCR; + } + + return true; +} + +bool PixelHarness::check_pixelcmp_ss(pixelcmp_ss_t ref, pixelcmp_ss_t opt) +{ + int j = 0; + intptr_t stride = STRIDE; + + for (int i = 0; i < ITERS; i++) + { + int index1 = rand() % TEST_CASES; + int index2 = rand() % TEST_CASES; + int vres = (int)checked(opt, short_test_buff[index1], stride, short_test_buff[index2] + j, stride); + int cres = ref(short_test_buff[index1], stride, short_test_buff[index2] + j, stride); + if (vres != cres) + return false; + + reportfail(); + j += INCR; + } + + return true; +} + +bool PixelHarness::check_pixelcmp_x3(pixelcmp_x3_t ref, pixelcmp_x3_t opt) +{ + ALIGN_VAR_16(int, cres[16]); + ALIGN_VAR_16(int, vres[16]); + int j = 0; + intptr_t stride = FENC_STRIDE - 5; + for (int i = 0; i < ITERS; i++) + { + int index1 = rand() % TEST_CASES; + int index2 = rand() % TEST_CASES; + checked(opt, pixel_test_buff[index1], + pixel_test_buff[index2] + j, + pixel_test_buff[index2] + j + 1, + pixel_test_buff[index2] + j + 2, stride, &vres[0]); + ref(pixel_test_buff[index1], + pixel_test_buff[index2] + j, + pixel_test_buff[index2] + j + 1, + pixel_test_buff[index2] + j + 2, stride, &cres[0]); + if ((vres[0] != cres[0]) || ((vres[1] != cres[1])) || ((vres[2] != cres[2]))) + return false; + + 
reportfail(); + j += INCR; + } + + return true; +} + +bool PixelHarness::check_pixelcmp_x4(pixelcmp_x4_t ref, pixelcmp_x4_t opt) +{ + ALIGN_VAR_16(int, cres[16]); + ALIGN_VAR_16(int, vres[16]); + int j = 0; + intptr_t stride = FENC_STRIDE - 5; + for (int i = 0; i < ITERS; i++) + { + int index1 = rand() % TEST_CASES; + int index2 = rand() % TEST_CASES; + checked(opt, pixel_test_buff[index1], + pixel_test_buff[index2] + j, + pixel_test_buff[index2] + j + 1, + pixel_test_buff[index2] + j + 2, + pixel_test_buff[index2] + j + 3, stride, &vres[0]); + ref(pixel_test_buff[index1], + pixel_test_buff[index2] + j, + pixel_test_buff[index2] + j + 1, + pixel_test_buff[index2] + j + 2, + pixel_test_buff[index2] + j + 3, stride, &cres[0]); + + if ((vres[0] != cres[0]) || ((vres[1] != cres[1])) || ((vres[2] != cres[2])) || ((vres[3] != cres[3]))) + return false; + + reportfail(); + j += INCR; + } + + return true; +} + +bool PixelHarness::check_calresidual(calcresidual_t ref, calcresidual_t opt) +{ + ALIGN_VAR_16(int16_t, ref_dest[64 * 64]); + ALIGN_VAR_16(int16_t, opt_dest[64 * 64]); + memset(ref_dest, 0, 64 * 64 * sizeof(int16_t)); + memset(opt_dest, 0, 64 * 64 * sizeof(int16_t)); + + int j = 0; + intptr_t stride = STRIDE; + for (int i = 0; i < ITERS; i++) + { + int index = i % TEST_CASES; + checked(opt, pbuf1 + j, pixel_test_buff[index] + j, opt_dest, stride); + ref(pbuf1 + j, pixel_test_buff[index] + j, ref_dest, stride); + + if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t))) + return false; + + reportfail(); + j += INCR; + } + + return true; +} + +bool PixelHarness::check_ssd_s(pixel_ssd_s_t ref, pixel_ssd_s_t opt) +{ + int j = 0; + for (int i = 0; i < ITERS; i++) + { + // NOTE: stride must be multiple of 16, because minimum block is 4x4 + int stride = (STRIDE + (rand() % STRIDE)) & ~15; + int cres = ref(sbuf1 + j, stride); + int vres = (int)checked(opt, sbuf1 + j, (intptr_t)stride); + + if (cres != vres) + { + return false; + } + + reportfail(); + j += INCR; + } + + 
return true; +} + +bool PixelHarness::check_weightp(weightp_sp_t ref, weightp_sp_t opt) +{ + ALIGN_VAR_16(pixel, ref_dest[64 * 64]); + ALIGN_VAR_16(pixel, opt_dest[64 * 64]); + + memset(ref_dest, 0, 64 * 64 * sizeof(pixel)); + memset(opt_dest, 0, 64 * 64 * sizeof(pixel)); + int j = 0; + int width = 2 * (rand() % 32 + 1); + int height = 8; + int w0 = rand() % 128; + int shift = rand() % 15; + int round = shift ? (1 << (shift - 1)) : 0; + int offset = (rand() % 256) - 128; + intptr_t stride = 64; + for (int i = 0; i < ITERS; i++) + { + int index = i % TEST_CASES; + checked(opt, short_test_buff[index] + j, opt_dest, stride, stride, width, height, w0, round, shift, offset); + ref(short_test_buff[index] + j, ref_dest, stride, stride, width, height, w0, round, shift, offset); + + if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel))) + return false; + + reportfail(); + j += INCR; + } + + return true; +} + +bool PixelHarness::check_weightp(weightp_pp_t ref, weightp_pp_t opt) +{ + ALIGN_VAR_16(pixel, ref_dest[64 * 64]); + ALIGN_VAR_16(pixel, opt_dest[64 * 64]); + + memset(ref_dest, 0, 64 * 64 * sizeof(pixel)); + memset(opt_dest, 0, 64 * 64 * sizeof(pixel)); + int j = 0; + int width = 16 * (rand() % 4 + 1); + int height = 8; + int w0 = rand() % 128; + int shift = rand() % 15; + int round = shift ? 
(1 << (shift - 1)) : 0; + int offset = (rand() % 256) - 128; + intptr_t stride = 64; + for (int i = 0; i < ITERS; i++) + { + int index = i % TEST_CASES; + checked(opt, pixel_test_buff[index] + j, opt_dest, stride, width, height, w0, round, shift, offset); + ref(pixel_test_buff[index] + j, ref_dest, stride, width, height, w0, round, shift, offset); + + if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel))) + return false; + + reportfail(); + j += INCR; + } + + return true; +} + +bool PixelHarness::check_downscale_t(downscale_t ref, downscale_t opt) +{ + ALIGN_VAR_16(pixel, ref_destf[32 * 32]); + ALIGN_VAR_16(pixel, opt_destf[32 * 32]); + + ALIGN_VAR_16(pixel, ref_desth[32 * 32]); + ALIGN_VAR_16(pixel, opt_desth[32 * 32]); + + ALIGN_VAR_16(pixel, ref_destv[32 * 32]); + ALIGN_VAR_16(pixel, opt_destv[32 * 32]); + + ALIGN_VAR_16(pixel, ref_destc[32 * 32]); + ALIGN_VAR_16(pixel, opt_destc[32 * 32]); + + intptr_t src_stride = 64; + intptr_t dst_stride = 32; + int bx = 32; + int by = 32; + int j = 0; + for (int i = 0; i < ITERS; i++) + { + int index = i % TEST_CASES; + ref(pixel_test_buff[index] + j, ref_destf, ref_desth, ref_destv, + ref_destc, src_stride, dst_stride, bx, by); + checked(opt, pixel_test_buff[index] + j, opt_destf, opt_desth, opt_destv, + opt_destc, src_stride, dst_stride, bx, by); + + if (memcmp(ref_destf, opt_destf, 32 * 32 * sizeof(pixel))) + return false; + if (memcmp(ref_desth, opt_desth, 32 * 32 * sizeof(pixel))) + return false; + if (memcmp(ref_destv, opt_destv, 32 * 32 * sizeof(pixel))) + return false; + if (memcmp(ref_destc, opt_destc, 32 * 32 * sizeof(pixel))) + return false; + + reportfail(); + j += INCR; + } + + return true; +} + +bool PixelHarness::check_cvt32to16_shr_t(cvt32to16_shr_t ref, cvt32to16_shr_t opt) +{ + ALIGN_VAR_16(int16_t, ref_dest[64 * 64]); + ALIGN_VAR_16(int16_t, opt_dest[64 * 64]); + + memset(ref_dest, 0xCD, sizeof(ref_dest)); + memset(opt_dest, 0xCD, sizeof(opt_dest)); + + int j = 0; + intptr_t stride = STRIDE; + for (int 
i = 0; i < ITERS; i++) + { + int shift = (rand() % 7 + 1); + + int index = i % TEST_CASES; + checked(opt, opt_dest, int_test_buff[index] + j, stride, shift, (int)STRIDE); + ref(ref_dest, int_test_buff[index] + j, stride, shift, (int)STRIDE); + + if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t))) + return false; + + reportfail(); + j += INCR; + } + + return true; +} + +bool PixelHarness::check_cvt16to32_shl_t(cvt16to32_shl_t ref, cvt16to32_shl_t opt) +{ + ALIGN_VAR_16(int32_t, ref_dest[64 * 64]); + ALIGN_VAR_16(int32_t, opt_dest[64 * 64]); + + int j = 0; + intptr_t stride = STRIDE; + for (int i = 0; i < ITERS; i++) + { + int shift = (rand() % 7 + 1); + + int index = i % TEST_CASES; + checked(opt, opt_dest, short_test_buff[index] + j, stride, shift, (int)stride); + ref(ref_dest, short_test_buff[index] + j, stride, shift, (int)stride); + + if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int32_t))) + return false; + + reportfail(); + j += INCR; + } + + return true; +} + +bool PixelHarness::check_cvt16to32_shr_t(cvt16to32_shr_t ref, cvt16to32_shr_t opt) +{ + ALIGN_VAR_16(int32_t, ref_dest[64 * 64]); + ALIGN_VAR_16(int32_t, opt_dest[64 * 64]); + + memset(ref_dest, 0xCD, sizeof(ref_dest)); + memset(opt_dest, 0xCD, sizeof(opt_dest)); + + int j = 0; + intptr_t stride = STRIDE; + for (int i = 0; i < ITERS; i++) + { + int shift = (rand() % 7 + 1); + + int index = i % TEST_CASES; + checked(opt, opt_dest, short_test_buff[index] + j, stride, shift, (int)stride); + ref(ref_dest, short_test_buff[index] + j, stride, shift, (int)stride); + + if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int32_t))) + return false; + + reportfail(); + j += INCR; + } + + return true; +} + +bool PixelHarness::check_cvt32to16_shl_t(cvt32to16_shl_t ref, cvt32to16_shl_t opt) +{ + ALIGN_VAR_16(int16_t, ref_dest[64 * 64]); + ALIGN_VAR_16(int16_t, opt_dest[64 * 64]); + + memset(ref_dest, 0xCD, sizeof(ref_dest)); + memset(opt_dest, 0xCD, sizeof(opt_dest)); + + int j = 0; + intptr_t stride = STRIDE; + 
for (int i = 0; i < ITERS; i++) + { + int shift = (rand() % 7 + 1); + + int index = i % TEST_CASES; + checked(opt, opt_dest, int_test_buff[index] + j, stride, shift); + ref(ref_dest, int_test_buff[index] + j, stride, shift); + + if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t))) + return false; + + reportfail(); + j += INCR; + } + + return true; +} + +bool PixelHarness::check_copy_cnt_t(copy_cnt_t ref, copy_cnt_t opt) +{ + ALIGN_VAR_16(int16_t, ref_dest[64 * 64]); + ALIGN_VAR_16(int16_t, opt_dest[64 * 64]); + + memset(ref_dest, 0xCD, sizeof(ref_dest)); + memset(opt_dest, 0xCD, sizeof(opt_dest)); + + int j = 0; + intptr_t stride = STRIDE; + for (int i = 0; i < ITERS; i++) + { + int index = i % TEST_CASES; + int opt_cnt = (int)checked(opt, opt_dest, short_test_buff1[index] + j, stride); + int ref_cnt = ref(ref_dest, short_test_buff1[index] + j, stride); + + if ((ref_cnt != opt_cnt) || memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t))) + return false; + + reportfail(); + j += INCR; + } + + return true; +} + +bool PixelHarness::check_copy_shr_t(copy_shr_t ref, copy_shr_t opt) +{ + ALIGN_VAR_16(int16_t, ref_dest[64 * 64]); + ALIGN_VAR_16(int16_t, opt_dest[64 * 64]); + + memset(ref_dest, 0xCD, sizeof(ref_dest)); + memset(opt_dest, 0xCD, sizeof(opt_dest)); + + int j = 0; + intptr_t stride = STRIDE; + for (int i = 0; i < ITERS; i++) + { + int shift = (rand() % 7 + 1); + + int index = i % TEST_CASES; + checked(opt, opt_dest, short_test_buff[index] + j, stride, shift, (int)STRIDE); + ref(ref_dest, short_test_buff[index] + j, stride, shift, (int)STRIDE); + + if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t))) + return false; + + reportfail(); + j += INCR; + } + + return true; +} + +bool PixelHarness::check_copy_shl_t(copy_shl_t ref, copy_shl_t opt) +{ + ALIGN_VAR_16(int16_t, ref_dest[64 * 64]); + ALIGN_VAR_16(int16_t, opt_dest[64 * 64]); + + memset(ref_dest, 0xCD, sizeof(ref_dest)); + memset(opt_dest, 0xCD, sizeof(opt_dest)); + + int j = 0; + intptr_t 
stride = STRIDE; + for (int i = 0; i < ITERS; i++) + { + int shift = (rand() % 7 + 1); + + int index = i % TEST_CASES; + checked(opt, opt_dest, short_test_buff[index] + j, stride, shift); + ref(ref_dest, short_test_buff[index] + j, stride, shift); + + if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t))) + return false; + + reportfail(); + j += INCR; + } + + return true; +} + +bool PixelHarness::check_pixelavg_pp(pixelavg_pp_t ref, pixelavg_pp_t opt) +{ + ALIGN_VAR_16(pixel, ref_dest[64 * 64]); + ALIGN_VAR_16(pixel, opt_dest[64 * 64]); + + int j = 0; + + memset(ref_dest, 0xCD, sizeof(ref_dest)); + memset(opt_dest, 0xCD, sizeof(opt_dest)); + + intptr_t stride = STRIDE; + for (int i = 0; i < ITERS; i++) + { + int index1 = rand() % TEST_CASES; + int index2 = rand() % TEST_CASES; + checked(ref, ref_dest, stride, pixel_test_buff[index1] + j, + stride, pixel_test_buff[index2] + j, stride, 32); + opt(opt_dest, stride, pixel_test_buff[index1] + j, + stride, pixel_test_buff[index2] + j, stride, 32); + + if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel))) + return false; + + reportfail(); + j += INCR; + } + + return true; +} + +bool PixelHarness::check_copy_pp(copy_pp_t ref, copy_pp_t opt) +{ + ALIGN_VAR_16(pixel, ref_dest[64 * 64]); + ALIGN_VAR_16(pixel, opt_dest[64 * 64]); + + // we don't know the partition size so we are checking the entire output buffer so + // we must initialize the buffers + memset(ref_dest, 0, sizeof(ref_dest)); + memset(opt_dest, 0, sizeof(opt_dest)); + + int j = 0; + intptr_t stride = STRIDE; + for (int i = 0; i < ITERS; i++) + { + int index = i % TEST_CASES; + checked(opt, opt_dest, stride, pixel_test_buff[index] + j, stride); + ref(ref_dest, stride, pixel_test_buff[index] + j, stride); + + if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel))) + return false; + + reportfail(); + j += INCR; + } + + return true; +} + +bool PixelHarness::check_copy_sp(copy_sp_t ref, copy_sp_t opt) +{ + ALIGN_VAR_16(pixel, ref_dest[64 * 64]); + 
ALIGN_VAR_16(pixel, opt_dest[64 * 64]); + + // we don't know the partition size so we are checking the entire output buffer so + // we must initialize the buffers + memset(ref_dest, 0xCD, sizeof(ref_dest)); + memset(opt_dest, 0xCD, sizeof(opt_dest)); + + int j = 0; + intptr_t stride1 = 64, stride2 = STRIDE; + for (int i = 0; i < ITERS; i++) + { + int index = i % TEST_CASES; + checked(opt, opt_dest, stride1, short_test_buff1[index] + j, stride2); + ref(ref_dest, stride1, short_test_buff1[index] + j, stride2); + + if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel))) + return false; + + reportfail(); + j += INCR; + } + + return true; +} + +bool PixelHarness::check_copy_ps(copy_ps_t ref, copy_ps_t opt) +{ + ALIGN_VAR_16(int16_t, ref_dest[64 * 64]); + ALIGN_VAR_16(int16_t, opt_dest[64 * 64]); + + // we don't know the partition size so we are checking the entire output buffer so + // we must initialize the buffers + memset(ref_dest, 0xCD, sizeof(ref_dest)); + memset(opt_dest, 0xCD, sizeof(opt_dest)); + + int j = 0; + intptr_t stride = STRIDE; + for (int i = 0; i < ITERS; i++) + { + int index = i % TEST_CASES; + checked(opt, opt_dest, stride, pixel_test_buff[index] + j, stride); + ref(ref_dest, stride, pixel_test_buff[index] + j, stride); + + if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t))) + return false; + + reportfail(); + j += INCR; + } + + return true; +} + +bool PixelHarness::check_copy_ss(copy_ss_t ref, copy_ss_t opt) +{ + ALIGN_VAR_16(int16_t, ref_dest[64 * 64]); + ALIGN_VAR_16(int16_t, opt_dest[64 * 64]); + + // we don't know the partition size so we are checking the entire output buffer so + // we must initialize the buffers + memset(ref_dest, 0xCD, sizeof(ref_dest)); + memset(opt_dest, 0xCD, sizeof(opt_dest)); + + int j = 0; + intptr_t stride = STRIDE; + for (int i = 0; i < ITERS; i++) + { + int index = i % TEST_CASES; + checked(opt, opt_dest, stride, short_test_buff1[index] + j, stride); + ref(ref_dest, stride, short_test_buff1[index] + j, 
stride); + + if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t))) + return false; + + reportfail(); + j += INCR; + } + + return true; +} + +bool PixelHarness::check_blockfill_s(blockfill_s_t ref, blockfill_s_t opt) +{ + ALIGN_VAR_16(int16_t, ref_dest[64 * 64]); + ALIGN_VAR_16(int16_t, opt_dest[64 * 64]); + + memset(ref_dest, 0xCD, sizeof(ref_dest)); + memset(opt_dest, 0xCD, sizeof(opt_dest)); + + intptr_t stride = 64; + for (int i = 0; i < ITERS; i++) + { + int16_t value = (rand() % SHORT_MAX) + 1; + + checked(opt, opt_dest, stride, value); + ref(ref_dest, stride, value); + + if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t))) + return false; + + reportfail(); + } + + return true; +} + +bool PixelHarness::check_pixel_sub_ps(pixel_sub_ps_t ref, pixel_sub_ps_t opt) +{ + ALIGN_VAR_16(int16_t, ref_dest[64 * 64]); + ALIGN_VAR_16(int16_t, opt_dest[64 * 64]); + + memset(ref_dest, 0xCD, sizeof(ref_dest)); + memset(opt_dest, 0xCD, sizeof(opt_dest)); + + int j = 0; + intptr_t stride2 = 64, stride = STRIDE; + for (int i = 0; i < 1; i++) + { + int index1 = rand() % TEST_CASES; + int index2 = rand() % TEST_CASES; + checked(opt, opt_dest, stride2, pixel_test_buff[index1] + j, + pixel_test_buff[index2] + j, stride, stride); + ref(ref_dest, stride2, pixel_test_buff[index1] + j, + pixel_test_buff[index2] + j, stride, stride); + + if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t))) + return false; + + reportfail(); + j += INCR; + } + + return true; +} + +bool PixelHarness::check_scale_pp(scale_t ref, scale_t opt) +{ + ALIGN_VAR_16(pixel, ref_dest[64 * 64]); + ALIGN_VAR_16(pixel, opt_dest[64 * 64]); + + memset(ref_dest, 0, sizeof(ref_dest)); + memset(opt_dest, 0, sizeof(opt_dest)); + + int j = 0; + intptr_t stride = STRIDE; + for (int i = 0; i < ITERS; i++) + { + int index = i % TEST_CASES; + checked(opt, opt_dest, pixel_test_buff[index] + j, stride); + ref(ref_dest, pixel_test_buff[index] + j, stride); + + if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel))) 
+ return false; + + reportfail(); + j += INCR; + } + + return true; +} + +bool PixelHarness::check_transpose(transpose_t ref, transpose_t opt) +{ + ALIGN_VAR_16(pixel, ref_dest[64 * 64]); + ALIGN_VAR_16(pixel, opt_dest[64 * 64]); + + memset(ref_dest, 0, sizeof(ref_dest)); + memset(opt_dest, 0, sizeof(opt_dest)); + + int j = 0; + intptr_t stride = STRIDE; + for (int i = 0; i < ITERS; i++) + { + int index = i % TEST_CASES; + checked(opt, opt_dest, pixel_test_buff[index] + j, stride); + ref(ref_dest, pixel_test_buff[index] + j, stride); + + if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel))) + return false; + + reportfail(); + j += INCR; + } + + return true; +} + +bool PixelHarness::check_pixel_add_ps(pixel_add_ps_t ref, pixel_add_ps_t opt) +{ + ALIGN_VAR_16(pixel, ref_dest[64 * 64]); + ALIGN_VAR_16(pixel, opt_dest[64 * 64]); + + memset(ref_dest, 0xCD, sizeof(ref_dest)); + memset(opt_dest, 0xCD, sizeof(opt_dest)); + + int j = 0; + intptr_t stride2 = 64, stride = STRIDE; + for (int i = 0; i < ITERS; i++) + { + int index1 = rand() % TEST_CASES; + int index2 = rand() % TEST_CASES; + checked(opt, opt_dest, stride2, pixel_test_buff[index1] + j, short_test_buff[index2] + j, stride, stride); + ref(ref_dest, stride2, pixel_test_buff[index1] + j, short_test_buff[index2] + j, stride, stride); + + if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel))) + return false; + + reportfail(); + j += INCR; + } + + return true; +} + +bool PixelHarness::check_pixel_var(var_t ref, var_t opt) +{ + int j = 0; + + intptr_t stride = STRIDE; + + for (int i = 0; i < ITERS; i++) + { + int index = i % TEST_CASES; + uint64_t vres = checked(opt, pixel_test_buff[index], stride); + uint64_t cres = ref(pixel_test_buff[index], stride); + if (vres != cres) + return false; + + reportfail(); + j += INCR; + } + + return true; +} + +bool PixelHarness::check_ssim_4x4x2_core(ssim_4x4x2_core_t ref, ssim_4x4x2_core_t opt) +{ + ALIGN_VAR_32(int, sum0[2][4]); + ALIGN_VAR_32(int, sum1[2][4]); + + for (int i = 
0; i < ITERS; i++) + { + intptr_t stride = rand() % 64; + int index1 = rand() % TEST_CASES; + int index2 = rand() % TEST_CASES; + ref(pixel_test_buff[index1] + i, stride, pixel_test_buff[index2] + i, stride, sum0); + checked(opt, pixel_test_buff[index1] + i, stride, pixel_test_buff[index2] + i, stride, sum1); + + if (memcmp(sum0, sum1, sizeof(sum0))) + return false; + + reportfail(); + } + + return true; +} + +/* TODO: This function causes crashes when checked. Is this a real bug? */ +bool PixelHarness::check_ssim_end(ssim_end4_t ref, ssim_end4_t opt) +{ + ALIGN_VAR_32(int, sum0[5][4]); + ALIGN_VAR_32(int, sum1[5][4]); + + for (int i = 0; i < ITERS; i++) + { + for (int j = 0; j < 5; j++) + { + for (int k = 0; k < 4; k++) + { + sum0[j][k] = rand() % (1 << 12); + sum1[j][k] = rand() % (1 << 12); + } + } + + int width = (rand() % 4) + 1; // range[1-4] + float cres = ref(sum0, sum1, width); + float vres = checked_float(opt, sum0, sum1, width); + if (fabs(vres - cres) > 0.00001) + return false; + + reportfail(); + } + + return true; +} + +bool PixelHarness::check_addAvg(addAvg_t ref, addAvg_t opt) +{ + ALIGN_VAR_16(pixel, ref_dest[64 * 64]); + ALIGN_VAR_16(pixel, opt_dest[64 * 64]); + + int j = 0; + + memset(ref_dest, 0xCD, sizeof(ref_dest)); + memset(opt_dest, 0xCD, sizeof(opt_dest)); + intptr_t stride = STRIDE; + + for (int i = 0; i < ITERS; i++) + { + int index1 = rand() % TEST_CASES; + int index2 = rand() % TEST_CASES; + ref(short_test_buff2[index1] + j, short_test_buff2[index2] + j, ref_dest, stride, stride, stride); + checked(opt, short_test_buff2[index1] + j, short_test_buff2[index2] + j, opt_dest, stride, stride, stride); + if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel))) + return false; + + reportfail(); + j += INCR; + } + + return true; +} + +bool PixelHarness::check_saoCuOrgE0_t(saoCuOrgE0_t ref, saoCuOrgE0_t opt) +{ + ALIGN_VAR_16(pixel, ref_dest[64 * 64]); + ALIGN_VAR_16(pixel, opt_dest[64 * 64]); + + memset(ref_dest, 0xCD, sizeof(ref_dest)); + 
memset(opt_dest, 0xCD, sizeof(opt_dest)); + + int j = 0; + + for (int i = 0; i < ITERS; i++) + { + int width = 16 * (rand() % 4 + 1); + int8_t sign = rand() % 3; + if (sign == 2) + { + sign = -1; + } + + ref(ref_dest, psbuf1 + j, width, sign); + checked(opt, opt_dest, psbuf1 + j, width, sign); + + if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel))) + return false; + + reportfail(); + j += INCR; + } + + return true; +} + +bool PixelHarness::check_planecopy_sp(planecopy_sp_t ref, planecopy_sp_t opt) +{ + ALIGN_VAR_16(pixel, ref_dest[64 * 64]); + ALIGN_VAR_16(pixel, opt_dest[64 * 64]); + + memset(ref_dest, 0xCD, sizeof(ref_dest)); + memset(opt_dest, 0xCD, sizeof(opt_dest)); + + int width = 16 + rand() % 48; + int height = 16 + rand() % 48; + intptr_t srcStride = 64; + intptr_t dstStride = width; + int j = 0; + + for (int i = 0; i < ITERS; i++) + { + int index = i % TEST_CASES; + checked(opt, ushort_test_buff[index] + j, srcStride, opt_dest, dstStride, width, height, (int)8, (uint16_t)255); + ref(ushort_test_buff[index] + j, srcStride, ref_dest, dstStride, width, height, (int)8, (uint16_t)255); + + if (memcmp(ref_dest, opt_dest, width * height * sizeof(pixel))) + return false; + + reportfail(); + j += INCR; + } + + return true; +} + +bool PixelHarness::check_planecopy_cp(planecopy_cp_t ref, planecopy_cp_t opt) +{ + ALIGN_VAR_16(pixel, ref_dest[64 * 64]); + ALIGN_VAR_16(pixel, opt_dest[64 * 64]); + + memset(ref_dest, 0xCD, sizeof(ref_dest)); + memset(opt_dest, 0xCD, sizeof(opt_dest)); + + int width = 16 + rand() % 48; + int height = 16 + rand() % 48; + intptr_t srcStride = 64; + intptr_t dstStride = width; + int j = 0; + + for (int i = 0; i < ITERS; i++) + { + int index = i % TEST_CASES; + checked(opt, uchar_test_buff[index] + j, srcStride, opt_dest, dstStride, width, height, (int)2); + ref(uchar_test_buff[index] + j, srcStride, ref_dest, dstStride, width, height, (int)2); + + if (memcmp(ref_dest, opt_dest, width * height * sizeof(pixel))) + return false; + + 
reportfail(); + j += INCR; + } + + return true; +} + +bool PixelHarness::testPartition(int part, const EncoderPrimitives& ref, const EncoderPrimitives& opt) +{ + if (opt.satd[part]) + { + if (!check_pixelcmp(ref.satd[part], opt.satd[part])) + { + printf("satd[%s]: failed!\n", lumaPartStr[part]); + return false; + } + } + + if (opt.sa8d_inter[part]) + { + if (!check_pixelcmp(ref.sa8d_inter[part], opt.sa8d_inter[part])) + { + printf("sa8d_inter[%s]: failed!\n", lumaPartStr[part]); + return false; + } + } + + if (opt.sad[part]) + { + if (!check_pixelcmp(ref.sad[part], opt.sad[part])) + { + printf("sad[%s]: failed!\n", lumaPartStr[part]); + return false; + } + } + + if (opt.sse_pp[part]) + { + if (!check_pixelcmp(ref.sse_pp[part], opt.sse_pp[part])) + { + printf("sse_pp[%s]: failed!\n", lumaPartStr[part]); + return false; + } + } + + if (opt.sse_sp[part]) + { + if (!check_pixelcmp_sp(ref.sse_sp[part], opt.sse_sp[part])) + { + printf("sse_sp[%s]: failed!\n", lumaPartStr[part]); + return false; + } + } + + if (opt.sse_ss[part]) + { + if (!check_pixelcmp_ss(ref.sse_ss[part], opt.sse_ss[part])) + { + printf("sse_ss[%s]: failed!\n", lumaPartStr[part]); + return false; + } + } + + if (opt.sad_x3[part]) + { + if (!check_pixelcmp_x3(ref.sad_x3[part], opt.sad_x3[part])) + { + printf("sad_x3[%s]: failed!\n", lumaPartStr[part]); + return false; + } + } + + if (opt.sad_x4[part]) + { + if (!check_pixelcmp_x4(ref.sad_x4[part], opt.sad_x4[part])) + { + printf("sad_x4[%s]: failed!\n", lumaPartStr[part]); + return false; + } + } + + if (opt.pixelavg_pp[part]) + { + if (!check_pixelavg_pp(ref.pixelavg_pp[part], opt.pixelavg_pp[part])) + { + printf("pixelavg_pp[%s]: failed!\n", lumaPartStr[part]); + return false; + } + } + + if (opt.luma_copy_pp[part]) + { + if (!check_copy_pp(ref.luma_copy_pp[part], opt.luma_copy_pp[part])) + { + printf("luma_copy_pp[%s] failed\n", lumaPartStr[part]); + return false; + } + } + + if (opt.luma_copy_sp[part]) + { + if 
(!check_copy_sp(ref.luma_copy_sp[part], opt.luma_copy_sp[part])) + { + printf("luma_copy_sp[%s] failed\n", lumaPartStr[part]); + return false; + } + } + + if (opt.luma_copy_ps[part]) + { + if (!check_copy_ps(ref.luma_copy_ps[part], opt.luma_copy_ps[part])) + { + printf("luma_copy_ps[%s] failed\n", lumaPartStr[part]); + return false; + } + } + + if (opt.luma_copy_ss[part]) + { + if (!check_copy_ss(ref.luma_copy_ss[part], opt.luma_copy_ss[part])) + { + printf("luma_copy_ss[%s] failed\n", lumaPartStr[part]); + return false; + } + } + + if (opt.luma_addAvg[part]) + { + if (!check_addAvg(ref.luma_addAvg[part], opt.luma_addAvg[part])) + { + printf("luma_addAvg[%s] failed\n", lumaPartStr[part]); + return false; + } + } + + if (part < NUM_SQUARE_BLOCKS) + { + if (opt.luma_sub_ps[part]) + { + if (!check_pixel_sub_ps(ref.luma_sub_ps[part], opt.luma_sub_ps[part])) + { + printf("luma_sub_ps[%s] failed\n", lumaPartStr[part]); + return false; + } + } + + if (opt.luma_add_ps[part]) + { + if (!check_pixel_add_ps(ref.luma_add_ps[part], opt.luma_add_ps[part])) + { + printf("luma_add_ps[%s] failed\n", lumaPartStr[part]); + return false; + } + } + } + + for (int i = 0; i < X265_CSP_COUNT; i++) + { + if (opt.chroma[i].copy_pp[part]) + { + if (!check_copy_pp(ref.chroma[i].copy_pp[part], opt.chroma[i].copy_pp[part])) + { + printf("chroma_copy_pp[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[i][part]); + return false; + } + } + if (opt.chroma[i].copy_sp[part]) + { + if (!check_copy_sp(ref.chroma[i].copy_sp[part], opt.chroma[i].copy_sp[part])) + { + printf("chroma_copy_sp[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[i][part]); + return false; + } + } + if (opt.chroma[i].copy_ps[part]) + { + if (!check_copy_ps(ref.chroma[i].copy_ps[part], opt.chroma[i].copy_ps[part])) + { + printf("chroma_copy_ps[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[i][part]); + return false; + } + } + if (opt.chroma[i].copy_ss[part]) + { + if 
(!check_copy_ss(ref.chroma[i].copy_ss[part], opt.chroma[i].copy_ss[part])) + { + printf("chroma_copy_ss[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[i][part]); + return false; + } + } + if (opt.chroma[i].addAvg[part]) + { + if (!check_addAvg(ref.chroma[i].addAvg[part], opt.chroma[i].addAvg[part])) + { + printf("chroma_addAvg[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[i][part]); + return false; + } + } + if (part < NUM_SQUARE_BLOCKS) + { + if (opt.chroma[i].sub_ps[part]) + { + if (!check_pixel_sub_ps(ref.chroma[i].sub_ps[part], opt.chroma[i].sub_ps[part])) + { + printf("chroma_sub_ps[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[i][part]); + return false; + } + } + if (opt.chroma[i].add_ps[part]) + { + if (!check_pixel_add_ps(ref.chroma[i].add_ps[part], opt.chroma[i].add_ps[part])) + { + printf("chroma_add_ps[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[i][part]); + return false; + } + } + } + } + + return true; +} + +bool PixelHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPrimitives& opt) +{ + for (int size = 4; size <= 64; size *= 2) + { + int part = partitionFromSizes(size, size); // 2Nx2N + if (!testPartition(part, ref, opt)) return false; + + if (size > 4) + { + part = partitionFromSizes(size, size >> 1); // 2NxN + if (!testPartition(part, ref, opt)) return false; + part = partitionFromSizes(size >> 1, size); // Nx2N + if (!testPartition(part, ref, opt)) return false; + } + if (size > 8) + { + // 4 AMP modes + part = partitionFromSizes(size, size >> 2); + if (!testPartition(part, ref, opt)) return false; + part = partitionFromSizes(size, 3 * (size >> 2)); + if (!testPartition(part, ref, opt)) return false; + + part = partitionFromSizes(size >> 2, size); + if (!testPartition(part, ref, opt)) return false; + part = partitionFromSizes(3 * (size >> 2), size); + if (!testPartition(part, ref, opt)) return false; + } + } + + for (int i = 0; i < NUM_SQUARE_BLOCKS; i++) + { + if 
(opt.calcresidual[i]) + { + if (!check_calresidual(ref.calcresidual[i], opt.calcresidual[i])) + { + printf("calcresidual width: %d failed!\n", 4 << i); + return false; + } + } + if (opt.sa8d[i]) + { + if (!check_pixelcmp(ref.sa8d[i], opt.sa8d[i])) + { + printf("sa8d[%dx%d]: failed!\n", 4 << i, 4 << i); + return false; + } + } + + if ((i <= BLOCK_32x32) && opt.ssd_s[i]) + { + if (!check_ssd_s(ref.ssd_s[i], opt.ssd_s[i])) + { + printf("ssd_s[%dx%d]: failed!\n", 4 << i, 4 << i); + return false; + } + } + + if (opt.blockfill_s[i]) + { + if (!check_blockfill_s(ref.blockfill_s[i], opt.blockfill_s[i])) + { + printf("blockfill_s[%dx%d]: failed!\n", 4 << i, 4 << i); + return false; + } + } + if (opt.transpose[i]) + { + if (!check_transpose(ref.transpose[i], opt.transpose[i])) + { + printf("transpose[%dx%d] failed\n", 4 << i, 4 << i); + return false; + } + } + + if (opt.var[i]) + { + if (!check_pixel_var(ref.var[i], opt.var[i])) + { + printf("var[%dx%d] failed\n", 4 << i, 4 << i); + return false; + } + } + + if ((i < BLOCK_64x64) && opt.copy_cnt[i]) + { + if (!check_copy_cnt_t(ref.copy_cnt[i], opt.copy_cnt[i])) + { + printf("copy_cnt[%dx%d] failed!\n", 4 << i, 4 << i); + return false; + } + } + + if ((i < BLOCK_64x64) && opt.cvt16to32_shr[i]) + { + if (!check_cvt16to32_shr_t(ref.cvt16to32_shr[i], opt.cvt16to32_shr[i])) + { + printf("cvt16to32_shr failed!\n"); + return false; + } + } + + if ((i < BLOCK_64x64) && opt.cvt32to16_shl[i]) + { + if (!check_cvt32to16_shl_t(ref.cvt32to16_shl[i], opt.cvt32to16_shl[i])) + { + printf("cvt32to16_shl failed!\n"); + return false; + } + } + + if ((i < BLOCK_64x64) && opt.copy_shl[i]) + { + if (!check_copy_shl_t(ref.copy_shl[i], opt.copy_shl[i])) + { + printf("copy_shl[%dx%d] failed!\n", 4 << i, 4 << i); + return false; + } + } + + } + + if (opt.cvt32to16_shr) + { + if (!check_cvt32to16_shr_t(ref.cvt32to16_shr, opt.cvt32to16_shr)) + { + printf("cvt32to16 failed!\n"); + return false; + } + } + + if (opt.cvt16to32_shl) + { + if 
(!check_cvt16to32_shl_t(ref.cvt16to32_shl, opt.cvt16to32_shl)) + { + printf("cvt16to32_shl failed!\n"); + return false; + } + } + + if (opt.weight_pp) + { + if (!check_weightp(ref.weight_pp, opt.weight_pp)) + { + printf("Weighted Prediction (pixel) failed!\n"); + return false; + } + } + + if (opt.weight_sp) + { + if (!check_weightp(ref.weight_sp, opt.weight_sp)) + { + printf("Weighted Prediction (short) failed!\n"); + return false; + } + } + + if (opt.frame_init_lowres_core) + { + if (!check_downscale_t(ref.frame_init_lowres_core, opt.frame_init_lowres_core)) + { + printf("downscale failed!\n"); + return false; + } + } + + if (opt.scale1D_128to64) + { + if (!check_scale_pp(ref.scale1D_128to64, opt.scale1D_128to64)) + { + printf("scale1D_128to64 failed!\n"); + return false; + } + } + + if (opt.scale2D_64to32) + { + if (!check_scale_pp(ref.scale2D_64to32, opt.scale2D_64to32)) + { + printf("scale2D_64to32 failed!\n"); + return false; + } + } + + if (opt.ssim_4x4x2_core) + { + if (!check_ssim_4x4x2_core(ref.ssim_4x4x2_core, opt.ssim_4x4x2_core)) + { + printf("ssim_end_4 failed!\n"); + return false; + } + } + + if (opt.ssim_end_4) + { + if (!check_ssim_end(ref.ssim_end_4, opt.ssim_end_4)) + { + printf("ssim_end_4 failed!\n"); + return false; + } + } + + if (opt.saoCuOrgE0) + { + if (!check_saoCuOrgE0_t(ref.saoCuOrgE0, opt.saoCuOrgE0)) + { + printf("SAO_EO_0 failed\n"); + return false; + } + } + + if (opt.planecopy_sp) + { + if (!check_planecopy_sp(ref.planecopy_sp, opt.planecopy_sp)) + { + printf("planecopy_sp failed\n"); + return false; + } + } + + if (opt.planecopy_cp) + { + if (!check_planecopy_cp(ref.planecopy_cp, opt.planecopy_cp)) + { + printf("planecopy_cp failed\n"); + return false; + } + } + + if (opt.copy_shr) + { + if (!check_copy_shr_t(ref.copy_shr, opt.copy_shr)) + { + printf("copy_shr failed!\n"); + return false; + } + } + + return true; +} + +void PixelHarness::measurePartition(int part, const EncoderPrimitives& ref, const EncoderPrimitives& opt) +{ + 
ALIGN_VAR_16(int, cres[16]); + pixel *fref = pbuf2 + 2 * INCR; + char header[128]; +#define HEADER(str, ...) sprintf(header, str, __VA_ARGS__); printf("%22s", header); + + if (opt.satd[part]) + { + HEADER("satd[%s]", lumaPartStr[part]); + REPORT_SPEEDUP(opt.satd[part], ref.satd[part], pbuf1, STRIDE, fref, STRIDE); + } + + if (opt.pixelavg_pp[part]) + { + HEADER("avg_pp[%s]", lumaPartStr[part]); + REPORT_SPEEDUP(opt.pixelavg_pp[part], ref.pixelavg_pp[part], pbuf1, STRIDE, pbuf2, STRIDE, pbuf3, STRIDE, 32); + } + + if (opt.sa8d_inter[part]) + { + HEADER("sa8d[%s]", lumaPartStr[part]); + REPORT_SPEEDUP(opt.sa8d_inter[part], ref.sa8d_inter[part], pbuf1, STRIDE, fref, STRIDE); + } + + if (opt.sad[part]) + { + HEADER("sad[%s]", lumaPartStr[part]); + REPORT_SPEEDUP(opt.sad[part], ref.sad[part], pbuf1, STRIDE, fref, STRIDE); + } + + if (opt.sad_x3[part]) + { + HEADER("sad_x3[%s]", lumaPartStr[part]); + REPORT_SPEEDUP(opt.sad_x3[part], ref.sad_x3[part], pbuf1, fref, fref + 1, fref - 1, FENC_STRIDE + 5, &cres[0]); + } + + if (opt.sad_x4[part]) + { + HEADER("sad_x4[%s]", lumaPartStr[part]); + REPORT_SPEEDUP(opt.sad_x4[part], ref.sad_x4[part], pbuf1, fref, fref + 1, fref - 1, fref - INCR, FENC_STRIDE + 5, &cres[0]); + } + + if (opt.sse_pp[part]) + { + HEADER("sse_pp[%s]", lumaPartStr[part]); + REPORT_SPEEDUP(opt.sse_pp[part], ref.sse_pp[part], pbuf1, STRIDE, fref, STRIDE); + } + + if (opt.sse_sp[part]) + { + HEADER("sse_sp[%s]", lumaPartStr[part]); + REPORT_SPEEDUP(opt.sse_sp[part], ref.sse_sp[part], (int16_t*)pbuf1, STRIDE, fref, STRIDE); + } + + if (opt.sse_ss[part]) + { + HEADER("sse_ss[%s]", lumaPartStr[part]); + REPORT_SPEEDUP(opt.sse_ss[part], ref.sse_ss[part], (int16_t*)pbuf1, STRIDE, (int16_t*)fref, STRIDE); + } + + if (opt.luma_copy_pp[part]) + { + HEADER("luma_copy_pp[%s]", lumaPartStr[part]); + REPORT_SPEEDUP(opt.luma_copy_pp[part], ref.luma_copy_pp[part], pbuf1, 64, pbuf2, 128); + } + + if (opt.luma_copy_sp[part]) + { + HEADER("luma_copy_sp[%s]", 
lumaPartStr[part]); + REPORT_SPEEDUP(opt.luma_copy_sp[part], ref.luma_copy_sp[part], pbuf1, 64, sbuf3, 128); + } + + if (opt.luma_copy_ps[part]) + { + HEADER("luma_copy_ps[%s]", lumaPartStr[part]); + REPORT_SPEEDUP(opt.luma_copy_ps[part], ref.luma_copy_ps[part], sbuf1, 64, pbuf1, 128); + } + if (opt.luma_copy_ss[part]) + { + HEADER("luma_copy_ss[%s]", lumaPartStr[part]); + REPORT_SPEEDUP(opt.luma_copy_ss[part], ref.luma_copy_ss[part], sbuf1, 64, sbuf2, 128); + } + if (opt.luma_addAvg[part]) + { + HEADER("luma_addAvg[%s]", lumaPartStr[part]); + REPORT_SPEEDUP(opt.luma_addAvg[part], ref.luma_addAvg[part], sbuf1, sbuf2, pbuf1, STRIDE, STRIDE, STRIDE); + } + if (part < NUM_SQUARE_BLOCKS) + { + if (opt.luma_sub_ps[part]) + { + HEADER("luma_sub_ps[%s]", lumaPartStr[part]); + REPORT_SPEEDUP(opt.luma_sub_ps[part], ref.luma_sub_ps[part], (int16_t*)pbuf1, FENC_STRIDE, pbuf2, pbuf1, STRIDE, STRIDE); + } + if (opt.luma_add_ps[part]) + { + HEADER("luma_add_ps[%s]", lumaPartStr[part]); + REPORT_SPEEDUP(opt.luma_add_ps[part], ref.luma_add_ps[part], pbuf1, FENC_STRIDE, pbuf2, sbuf1, STRIDE, STRIDE); + } + } + + for (int i = 0; i < X265_CSP_COUNT; i++) + { + if (opt.chroma[i].copy_pp[part]) + { + HEADER("[%s] copy_pp[%s]", x265_source_csp_names[i], chromaPartStr[i][part]); + REPORT_SPEEDUP(opt.chroma[i].copy_pp[part], ref.chroma[i].copy_pp[part], pbuf1, 64, pbuf2, 128); + } + if (opt.chroma[i].copy_sp[part]) + { + HEADER("[%s] copy_sp[%s]", x265_source_csp_names[i], chromaPartStr[i][part]); + REPORT_SPEEDUP(opt.chroma[i].copy_sp[part], ref.chroma[i].copy_sp[part], pbuf1, 64, sbuf3, 128); + } + if (opt.chroma[i].copy_ps[part]) + { + HEADER("[%s] copy_ps[%s]", x265_source_csp_names[i], chromaPartStr[i][part]); + REPORT_SPEEDUP(opt.chroma[i].copy_ps[part], ref.chroma[i].copy_ps[part], sbuf1, 64, pbuf1, 128); + } + if (opt.chroma[i].copy_ss[part]) + { + HEADER("[%s] copy_ss[%s]", x265_source_csp_names[i], chromaPartStr[i][part]); + REPORT_SPEEDUP(opt.chroma[i].copy_ss[part], 
ref.chroma[i].copy_ss[part], sbuf1, 64, sbuf2, 128); + } + if (opt.chroma[i].addAvg[part]) + { + HEADER("[%s] addAvg[%s]", x265_source_csp_names[i], chromaPartStr[i][part]); + REPORT_SPEEDUP(opt.chroma[i].addAvg[part], ref.chroma[i].addAvg[part], sbuf1, sbuf2, pbuf1, STRIDE, STRIDE, STRIDE); + } + if (part < NUM_SQUARE_BLOCKS) + { + if (opt.chroma[i].sub_ps[part]) + { + HEADER("[%s] sub_ps[%s]", x265_source_csp_names[i], chromaPartStr[i][part]); + REPORT_SPEEDUP(opt.chroma[i].sub_ps[part], ref.chroma[i].sub_ps[part], (int16_t*)pbuf1, FENC_STRIDE, pbuf2, pbuf1, STRIDE, STRIDE); + } + if (opt.chroma[i].add_ps[part]) + { + HEADER("[%s] add_ps[%s]", x265_source_csp_names[i], chromaPartStr[i][part]); + REPORT_SPEEDUP(opt.chroma[i].add_ps[part], ref.chroma[i].add_ps[part], pbuf1, FENC_STRIDE, pbuf2, sbuf1, STRIDE, STRIDE); + } + } + } + +#undef HEADER +} + +void PixelHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderPrimitives& opt) +{ + char header[128]; + +#define HEADER(str, ...) 
sprintf(header, str, __VA_ARGS__); printf("%22s", header); +#define HEADER0(str) printf("%22s", str); + + for (int size = 4; size <= 64; size *= 2) + { + int part = partitionFromSizes(size, size); // 2Nx2N + measurePartition(part, ref, opt); + + if (size > 4) + { + part = partitionFromSizes(size, size >> 1); // 2NxN + measurePartition(part, ref, opt); + part = partitionFromSizes(size >> 1, size); // Nx2N + measurePartition(part, ref, opt); + } + if (size > 8) + { + // 4 AMP modes + part = partitionFromSizes(size, size >> 2); + measurePartition(part, ref, opt); + part = partitionFromSizes(size, 3 * (size >> 2)); + measurePartition(part, ref, opt); + + part = partitionFromSizes(size >> 2, size); + measurePartition(part, ref, opt); + part = partitionFromSizes(3 * (size >> 2), size); + measurePartition(part, ref, opt); + } + } + + for (int i = 0; i < NUM_SQUARE_BLOCKS; i++) + { + if ((i <= BLOCK_32x32) && opt.ssd_s[i]) + { + HEADER("ssd_s[%dx%d]", 4 << i, 4 << i); + REPORT_SPEEDUP(opt.ssd_s[i], ref.ssd_s[i], sbuf1, STRIDE); + } + if (opt.sa8d[i]) + { + HEADER("sa8d[%dx%d]", 4 << i, 4 << i); + REPORT_SPEEDUP(opt.sa8d[i], ref.sa8d[i], pbuf1, STRIDE, pbuf2, STRIDE); + } + if (opt.calcresidual[i]) + { + HEADER("residual[%dx%d]", 4 << i, 4 << i); + REPORT_SPEEDUP(opt.calcresidual[i], ref.calcresidual[i], pbuf1, pbuf2, sbuf1, 64); + } + + if (opt.blockfill_s[i]) + { + HEADER("blkfill[%dx%d]", 4 << i, 4 << i); + REPORT_SPEEDUP(opt.blockfill_s[i], ref.blockfill_s[i], sbuf1, 64, SHORT_MAX); + } + + if (opt.transpose[i]) + { + HEADER("transpose[%dx%d]", 4 << i, 4 << i); + REPORT_SPEEDUP(opt.transpose[i], ref.transpose[i], pbuf1, pbuf2, STRIDE); + } + + if (opt.var[i]) + { + HEADER("var[%dx%d]", 4 << i, 4 << i); + REPORT_SPEEDUP(opt.var[i], ref.var[i], pbuf1, STRIDE); + } + + if ((i < BLOCK_64x64) && opt.cvt16to32_shr[i]) + { + HEADER("cvt16to32_shr[%dx%d]", 4 << i, 4 << i); + REPORT_SPEEDUP(opt.cvt16to32_shr[i], ref.cvt16to32_shr[i], ibuf1, sbuf2, STRIDE, 3, 4); + } + + if ((i < 
BLOCK_64x64) && opt.cvt32to16_shl[i]) + { + HEADER("cvt32to16_shl[%dx%d]", 4 << i, 4 << i); + REPORT_SPEEDUP(opt.cvt32to16_shl[i], ref.cvt32to16_shl[i], sbuf2, ibuf1, STRIDE, 3); + } + + if ((i < BLOCK_64x64) && opt.copy_cnt[i]) + { + HEADER("copy_cnt[%dx%d]", 4 << i, 4 << i); + REPORT_SPEEDUP(opt.copy_cnt[i], ref.copy_cnt[i], sbuf1, sbuf2, STRIDE); + } + + if ((i < BLOCK_64x64) && opt.copy_shl[i]) + { + HEADER("copy_shl[%dx%d]", 4 << i, 4 << i); + REPORT_SPEEDUP(opt.copy_shl[i], ref.copy_shl[i], sbuf1, sbuf2, STRIDE, 64); + } + + } + + if (opt.cvt32to16_shr) + { + HEADER0("cvt32to16_shr"); + REPORT_SPEEDUP(opt.cvt32to16_shr, ref.cvt32to16_shr, sbuf1, ibuf1, 64, 5, 64); + } + + if (opt.cvt16to32_shl) + { + HEADER0("cvt16to32_shl"); + REPORT_SPEEDUP(opt.cvt16to32_shl, ref.cvt16to32_shl, ibuf1, sbuf1, 64, 5, 64); + } + + if (opt.weight_pp) + { + HEADER0("weight_pp"); + REPORT_SPEEDUP(opt.weight_pp, ref.weight_pp, pbuf1, pbuf2, 64, 32, 32, 128, 1 << 9, 10, 100); + } + + if (opt.weight_sp) + { + HEADER0("weight_sp"); + REPORT_SPEEDUP(opt.weight_sp, ref.weight_sp, (int16_t*)sbuf1, pbuf1, 64, 64, 32, 32, 128, 1 << 9, 10, 100); + } + + if (opt.frame_init_lowres_core) + { + HEADER0("downscale"); + REPORT_SPEEDUP(opt.frame_init_lowres_core, ref.frame_init_lowres_core, pbuf2, pbuf1, pbuf2, pbuf3, pbuf4, 64, 64, 64, 64); + } + + if (opt.scale1D_128to64) + { + HEADER0("scale1D_128to64"); + REPORT_SPEEDUP(opt.scale1D_128to64, ref.scale1D_128to64, pbuf2, pbuf1, 64); + } + + if (opt.scale2D_64to32) + { + HEADER0("scale2D_64to32"); + REPORT_SPEEDUP(opt.scale2D_64to32, ref.scale2D_64to32, pbuf2, pbuf1, 64); + } + + if (opt.ssim_4x4x2_core) + { + HEADER0("ssim_4x4x2_core"); + REPORT_SPEEDUP(opt.ssim_4x4x2_core, ref.ssim_4x4x2_core, pbuf1, 64, pbuf2, 64, (int(*)[4])sbuf1); + } + + if (opt.ssim_end_4) + { + HEADER0("ssim_end_4"); + REPORT_SPEEDUP(opt.ssim_end_4, ref.ssim_end_4, (int(*)[4])pbuf2, (int(*)[4])pbuf1, 4); + } + + if (opt.saoCuOrgE0) + { + HEADER0("SAO_EO_0"); + 
REPORT_SPEEDUP(opt.saoCuOrgE0, ref.saoCuOrgE0, pbuf1, psbuf1, 64, 1); + } + + if (opt.planecopy_sp) + { + HEADER0("planecopy_sp"); + REPORT_SPEEDUP(opt.planecopy_sp, ref.planecopy_sp, ushort_test_buff[0], 64, pbuf1, 64, 64, 64, 8, 255); + } + + if (opt.planecopy_cp) + { + HEADER0("planecopy_cp"); + REPORT_SPEEDUP(opt.planecopy_cp, ref.planecopy_cp, uchar_test_buff[0], 64, pbuf1, 64, 64, 64, 2); + } + + if (opt.copy_shr) + { + HEADER0("copy_shr"); + REPORT_SPEEDUP(opt.copy_shr, ref.copy_shr, sbuf1, sbuf2, 64, 5, 64); + } + +} diff --git a/source/test/pixelharness.h b/source/test/pixelharness.h new file mode 100644 index 0000000..1255d99 --- /dev/null +++ b/source/test/pixelharness.h @@ -0,0 +1,111 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. 
+ *****************************************************************************/ + +#ifndef _PIXELHARNESS_H_1 +#define _PIXELHARNESS_H_1 1 + +#include "testharness.h" +#include "primitives.h" + +class PixelHarness : public TestHarness +{ +protected: + + enum { INCR = 32 }; + enum { STRIDE = 64 }; + enum { ITERS = 100 }; + enum { MAX_HEIGHT = 64 }; + enum { PAD_ROWS = 64 }; + enum { BUFFSIZE = STRIDE * (MAX_HEIGHT + PAD_ROWS) + INCR * ITERS }; + enum { TEST_CASES = 3 }; + enum { SMAX = 1 << 12 }; + enum { SMIN = -1 << 12 }; + + ALIGN_VAR_32(pixel, pbuf1[BUFFSIZE]); + pixel pbuf2[BUFFSIZE]; + pixel pbuf3[BUFFSIZE]; + pixel pbuf4[BUFFSIZE]; + int ibuf1[BUFFSIZE]; + int8_t psbuf1[BUFFSIZE]; + + int16_t sbuf1[BUFFSIZE]; + int16_t sbuf2[BUFFSIZE]; + int16_t sbuf3[BUFFSIZE]; + + pixel pixel_test_buff[TEST_CASES][BUFFSIZE]; + int16_t short_test_buff[TEST_CASES][BUFFSIZE]; + int16_t short_test_buff1[TEST_CASES][BUFFSIZE]; + int16_t short_test_buff2[TEST_CASES][BUFFSIZE]; + int int_test_buff[TEST_CASES][BUFFSIZE]; + uint16_t ushort_test_buff[TEST_CASES][BUFFSIZE]; + uint8_t uchar_test_buff[TEST_CASES][BUFFSIZE]; + + bool check_pixelcmp(pixelcmp_t ref, pixelcmp_t opt); + bool check_pixelcmp_sp(pixelcmp_sp_t ref, pixelcmp_sp_t opt); + bool check_pixelcmp_ss(pixelcmp_ss_t ref, pixelcmp_ss_t opt); + bool check_pixelcmp_x3(pixelcmp_x3_t ref, pixelcmp_x3_t opt); + bool check_pixelcmp_x4(pixelcmp_x4_t ref, pixelcmp_x4_t opt); + bool check_copy_pp(copy_pp_t ref, copy_pp_t opt); + bool check_copy_sp(copy_sp_t ref, copy_sp_t opt); + bool check_copy_ps(copy_ps_t ref, copy_ps_t opt); + bool check_copy_ss(copy_ss_t ref, copy_ss_t opt); + bool check_pixelavg_pp(pixelavg_pp_t ref, pixelavg_pp_t opt); + bool check_pixel_sub_ps(pixel_sub_ps_t ref, pixel_sub_ps_t opt); + bool check_pixel_add_ps(pixel_add_ps_t ref, pixel_add_ps_t opt); + bool check_scale_pp(scale_t ref, scale_t opt); + bool check_ssd_s(pixel_ssd_s_t ref, pixel_ssd_s_t opt); + bool check_blockfill_s(blockfill_s_t ref, 
blockfill_s_t opt); + bool check_calresidual(calcresidual_t ref, calcresidual_t opt); + bool check_transpose(transpose_t ref, transpose_t opt); + bool check_weightp(weightp_pp_t ref, weightp_pp_t opt); + bool check_weightp(weightp_sp_t ref, weightp_sp_t opt); + bool check_downscale_t(downscale_t ref, downscale_t opt); + bool check_cvt32to16_shr_t(cvt32to16_shr_t ref, cvt32to16_shr_t opt); + bool check_cvt16to32_shl_t(cvt16to32_shl_t ref, cvt16to32_shl_t opt); + bool check_cvt16to32_shr_t(cvt16to32_shr_t ref, cvt16to32_shr_t opt); + bool check_cvt32to16_shl_t(cvt32to16_shl_t ref, cvt32to16_shl_t opt); + bool check_copy_cnt_t(copy_cnt_t ref, copy_cnt_t opt); + bool check_copy_shr_t(copy_shr_t ref, copy_shr_t opt); + bool check_copy_shl_t(copy_shl_t ref, copy_shl_t opt); + bool check_pixel_var(var_t ref, var_t opt); + bool check_ssim_4x4x2_core(ssim_4x4x2_core_t ref, ssim_4x4x2_core_t opt); + bool check_ssim_end(ssim_end4_t ref, ssim_end4_t opt); + bool check_addAvg(addAvg_t, addAvg_t); + bool check_saoCuOrgE0_t(saoCuOrgE0_t ref, saoCuOrgE0_t opt); + bool check_planecopy_sp(planecopy_sp_t ref, planecopy_sp_t opt); + bool check_planecopy_cp(planecopy_cp_t ref, planecopy_cp_t opt); + +public: + + PixelHarness(); + + const char *getName() const { return "pixel"; } + + bool testCorrectness(const EncoderPrimitives& ref, const EncoderPrimitives& opt); + bool testPartition(int part, const EncoderPrimitives& ref, const EncoderPrimitives& opt); + + void measureSpeed(const EncoderPrimitives& ref, const EncoderPrimitives& opt); + void measurePartition(int part, const EncoderPrimitives& ref, const EncoderPrimitives& opt); +}; + +#endif // ifndef _PIXELHARNESS_H_1 diff --git a/source/test/testbench.cpp b/source/test/testbench.cpp new file mode 100644 index 0000000..ef2d9a1 --- /dev/null +++ b/source/test/testbench.cpp @@ -0,0 +1,236 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Gopu 
Govindaswamy + * Mandar Gurav + * Mahesh Pittala + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#include "common.h" +#include "primitives.h" +#include "pixelharness.h" +#include "mbdstharness.h" +#include "ipfilterharness.h" +#include "intrapredharness.h" +#include "param.h" +#include "cpu.h" + +using namespace x265; + +const char* lumaPartStr[NUM_LUMA_PARTITIONS] = +{ + " 4x4", " 8x8", "16x16", "32x32", "64x64", + " 8x4", " 4x8", + " 16x8", " 8x16", + "32x16", "16x32", + "64x32", "32x64", + "16x12", "12x16", " 16x4", " 4x16", + "32x24", "24x32", " 32x8", " 8x32", + "64x48", "48x64", "64x16", "16x64", +}; + +const char* chromaPartStr420[NUM_CHROMA_PARTITIONS] = +{ + " 2x2", " 4x4", " 8x8", "16x16", "32x32", + " 4x2", " 2x4", + " 8x4", " 4x8", + " 16x8", " 8x16", + "32x16", "16x32", + " 8x6", " 6x8", " 8x2", " 2x8", + "16x12", "12x16", " 16x4", " 4x16", + "32x24", "24x32", " 32x8", " 8x32", +}; + +const char* chromaPartStr422[NUM_CHROMA_PARTITIONS] = +{ + " 2x4", " 4x8", " 8x16", "16x32", "32x64", + " 4x4", " 2x8", + " 8x8", " 4x16", + "16x16", " 8x32", + "32x32", "16x64", + " 
8x12", " 6x16", " 8x4", " 2x16", + "16x24", "12x32", " 16x8", " 4x32", + "32x48", "24x64", "32x16", " 8x64", +}; + +const char* const* chromaPartStr[X265_CSP_COUNT] = +{ + lumaPartStr, + chromaPartStr420, + chromaPartStr422, + lumaPartStr +}; + +void do_help() +{ + printf("x265 optimized primitive testbench\n\n"); + printf("usage: TestBench [--cpuid CPU] [--testbench BENCH] [--help]\n\n"); + printf(" CPU is comma separated SIMD arch list, example: SSE4,AVX\n"); + printf(" BENCH is one of (pixel,transforms,interp,intrapred)\n\n"); + printf("By default, the test bench will test all benches on detected CPU architectures\n"); + printf("Options and testbench name may be truncated.\n"); +} + +PixelHarness HPixel; +MBDstHarness HMBDist; +IPFilterHarness HIPFilter; +IntraPredHarness HIPred; + +int main(int argc, char *argv[]) +{ + int cpuid = x265::cpu_detect(); + const char *testname = 0; + + if (!(argc & 1)) + { + do_help(); + return 0; + } + for (int i = 1; i < argc - 1; i += 2) + { + if (strncmp(argv[i], "--", 2)) + { + printf("** invalid long argument: %s\n\n", argv[i]); + do_help(); + return 1; + } + const char *name = argv[i] + 2; + const char *value = argv[i + 1]; + if (!strncmp(name, "cpuid", strlen(name))) + { + bool bError = false; + cpuid = parseCpuName(value, bError); + if (bError) + { + printf("Invalid CPU name: %s\n", value); + return 1; + } + } + else if (!strncmp(name, "testbench", strlen(name))) + { + testname = value; + printf("Testing only harnesses that match name <%s>\n", testname); + } + else + { + printf("** invalid long argument: %s\n\n", name); + do_help(); + return 1; + } + } + + int seed = (int)time(NULL); + const char *bpp[] = { "8bpp", "16bpp" }; + printf("Using random seed %X %s\n", seed, bpp[HIGH_BIT_DEPTH]); + srand(seed); + + // To disable classes of tests, simply comment them out in this list + TestHarness *harness[] = + { + &HPixel, + &HMBDist, + &HIPFilter, + &HIPred + }; + + EncoderPrimitives cprim; + memset(&cprim, 0, 
sizeof(EncoderPrimitives)); + Setup_C_Primitives(cprim); + Setup_Alias_Primitives(cprim); + + struct test_arch_t + { + char name[12]; + int flag; + } test_arch[] = + { + { "SSE2", X265_CPU_SSE2 }, + { "SSE3", X265_CPU_SSE3 }, + { "SSSE3", X265_CPU_SSSE3 }, + { "SSE4", X265_CPU_SSE4 }, + { "AVX", X265_CPU_AVX }, + { "XOP", X265_CPU_XOP }, + { "AVX2", X265_CPU_AVX2 }, + { "", 0 }, + }; + + for (int i = 0; test_arch[i].flag; i++) + { + if (test_arch[i].flag & cpuid) + printf("Testing primitives: %s\n", test_arch[i].name); + else + continue; + + EncoderPrimitives vecprim; + memset(&vecprim, 0, sizeof(vecprim)); + Setup_Instrinsic_Primitives(vecprim, test_arch[i].flag); + for (size_t h = 0; h < sizeof(harness) / sizeof(TestHarness*); h++) + { + if (testname && strncmp(testname, harness[h]->getName(), strlen(testname))) + continue; + if (!harness[h]->testCorrectness(cprim, vecprim)) + { + fprintf(stderr, "\nx265: intrinsic primitive has failed. Go and fix that Right Now!\n"); + return -1; + } + } + + EncoderPrimitives asmprim; + memset(&asmprim, 0, sizeof(asmprim)); + Setup_Assembly_Primitives(asmprim, test_arch[i].flag); + memcpy(&primitives, &asmprim, sizeof(EncoderPrimitives)); + for (size_t h = 0; h < sizeof(harness) / sizeof(TestHarness*); h++) + { + if (testname && strncmp(testname, harness[h]->getName(), strlen(testname))) + continue; + if (!harness[h]->testCorrectness(cprim, asmprim)) + { + fprintf(stderr, "\nx265: asm primitive has failed. Go and fix that Right Now!\n"); + return -1; + } + } + } + + /******************* Cycle count for all primitives **********************/ + + EncoderPrimitives optprim; + memset(&optprim, 0, sizeof(optprim)); + Setup_Instrinsic_Primitives(optprim, cpuid); + Setup_Assembly_Primitives(optprim, cpuid); + Setup_Alias_Primitives(optprim); + + /* some hybrid primitives may rely on other primitives in the + * global primitive table, so set up those pointers. 
This is a + * bit ugly, but I don't see a better solution */ + memcpy(&primitives, &optprim, sizeof(EncoderPrimitives)); + + printf("\nTest performance improvement with full optimizations\n"); + + for (size_t h = 0; h < sizeof(harness) / sizeof(TestHarness*); h++) + { + if (testname && strncmp(testname, harness[h]->getName(), strlen(testname))) + continue; + printf("== %s primitives ==\n", harness[h]->getName()); + harness[h]->measureSpeed(cprim, optprim); + } + + printf("\n"); + return 0; +} diff --git a/source/test/testharness.h b/source/test/testharness.h new file mode 100644 index 0000000..1704f3e --- /dev/null +++ b/source/test/testharness.h @@ -0,0 +1,173 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. 
+ *****************************************************************************/ + +#ifndef _TESTHARNESS_H_ +#define _TESTHARNESS_H_ 1 + +#include "common.h" +#include "primitives.h" + +#if _MSC_VER +#pragma warning(disable: 4324) // structure was padded due to __declspec(align()) +#endif + +#if HIGH_BIT_DEPTH +#define BIT_DEPTH 10 +#else +#define BIT_DEPTH 8 +#endif +#define PIXEL_MAX ((1 << BIT_DEPTH) - 1) +#define PIXEL_MIN 0 +#define SHORT_MAX 32767 +#define SHORT_MIN -32767 +#define UNSIGNED_SHORT_MAX 65535 + +using namespace x265; + +extern const char* lumaPartStr[NUM_LUMA_PARTITIONS]; +extern const char* const* chromaPartStr[X265_CSP_COUNT]; + +class TestHarness +{ +public: + + TestHarness() {} + + virtual ~TestHarness() {} + + virtual bool testCorrectness(const EncoderPrimitives& ref, const EncoderPrimitives& opt) = 0; + + virtual void measureSpeed(const EncoderPrimitives& ref, const EncoderPrimitives& opt) = 0; + + virtual const char *getName() const = 0; + +protected: + + /* Temporary variables for stack checks */ + int m_ok; + + uint64_t m_rand; +}; + +#ifdef _MSC_VER +#include +#elif HAVE_RDTSC +#include +#elif defined(__GNUC__) +/* fallback for older GCC/MinGW */ +static inline uint32_t __rdtsc(void) +{ + uint32_t a = 0; + + asm volatile("rdtsc" : "=a" (a) ::"edx"); + return a; +} + +#endif // ifdef _MSC_VER + +#define BENCH_RUNS 1000 + +// Adapted from checkasm.c, runs each optimized primitive four times, measures rdtsc +// and discards invalid times. Repeats 1000 times to get a good average. Then measures +// the C reference with fewer runs and reports X factor and average cycles. +#define REPORT_SPEEDUP(RUNOPT, RUNREF, ...) 
\ + { \ + uint32_t cycles = 0; int runs = 0; \ + RUNOPT(__VA_ARGS__); \ + for (int ti = 0; ti < BENCH_RUNS; ti++) { \ + uint32_t t0 = (uint32_t)__rdtsc(); \ + RUNOPT(__VA_ARGS__); \ + RUNOPT(__VA_ARGS__); \ + RUNOPT(__VA_ARGS__); \ + RUNOPT(__VA_ARGS__); \ + uint32_t t1 = (uint32_t)__rdtsc() - t0; \ + if (t1 * runs <= cycles * 4 && ti > 0) { cycles += t1; runs++; } \ + } \ + uint32_t refcycles = 0; int refruns = 0; \ + RUNREF(__VA_ARGS__); \ + for (int ti = 0; ti < BENCH_RUNS / 4; ti++) { \ + uint32_t t0 = (uint32_t)__rdtsc(); \ + RUNREF(__VA_ARGS__); \ + RUNREF(__VA_ARGS__); \ + RUNREF(__VA_ARGS__); \ + RUNREF(__VA_ARGS__); \ + uint32_t t1 = (uint32_t)__rdtsc() - t0; \ + if (t1 * refruns <= refcycles * 4 && ti > 0) { refcycles += t1; refruns++; } \ + } \ + x265_emms(); \ + float optperf = (10.0f * cycles / runs) / 4; \ + float refperf = (10.0f * refcycles / refruns) / 4; \ + printf("\t%3.2fx ", refperf / optperf); \ + printf("\t %-8.2lf \t %-8.2lf\n", optperf, refperf); \ + } + +extern "C" { +#if X265_ARCH_X86 +int x265_stack_pagealign(int (*func)(), int align); + +/* detect when callee-saved regs aren't saved + * needs an explicit asm check because it only sometimes crashes in normal use. */ +intptr_t x265_checkasm_call(intptr_t (*func)(), int *ok, ...); +float x265_checkasm_call_float(float (*func)(), int *ok, ...); +#else +#define x265_stack_pagealign(func, align) func() +#endif + +#if X86_64 + +/* Evil hack: detect incorrect assumptions that 32-bit ints are zero-extended to 64-bit. + * This is done by clobbering the stack with junk around the stack pointer and calling the + * assembly function through x265_checkasm_call with added dummy arguments which forces all + * real arguments to be passed on the stack and not in registers. For 32-bit argument the + * upper half of the 64-bit register location on the stack will now contain junk. Note that + * this is dependent on compiler behavior and that interrupts etc. 
at the wrong time may + * overwrite the junk written to the stack so there's no guarantee that it will always + * detect all functions that assumes zero-extension. + */ +void x265_checkasm_stack_clobber(uint64_t clobber, ...); +#define checked(func, ...) ( \ + m_ok = 1, m_rand = (rand() & 0xffff) * 0x0001000100010001ULL, \ + x265_checkasm_stack_clobber(m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, \ + m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, \ + m_rand, m_rand, m_rand, m_rand, m_rand), /* max_args+6 */ \ + x265_checkasm_call((intptr_t(*)())func, &m_ok, 0, 0, 0, 0, __VA_ARGS__)) + +#define checked_float(func, ...) ( \ + m_ok = 1, m_rand = (rand() & 0xffff) * 0x0001000100010001ULL, \ + x265_checkasm_stack_clobber(m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, \ + m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, \ + m_rand, m_rand, m_rand, m_rand, m_rand), /* max_args+6 */ \ + x265_checkasm_call_float((float(*)())func, &m_ok, 0, 0, 0, 0, __VA_ARGS__)) +#define reportfail() if (!m_ok) { fprintf(stderr, "stack clobber check failed at %s:%d", __FILE__, __LINE__); abort(); } +#elif ARCH_X86 +#define checked(func, ...) x265_checkasm_call((intptr_t(*)())func, &m_ok, __VA_ARGS__); +#define checked_float(func, ...) x265_checkasm_call_float((float(*)())func, &m_ok, __VA_ARGS__); + +#else // if X86_64 +#define checked(func, ...) func(__VA_ARGS__) +#define checked_float(func, ...) 
func(__VA_ARGS__) +#define reportfail() +#endif // if X86_64 +} + +#endif // ifndef _TESTHARNESS_H_ diff --git a/source/test/testpool.cpp b/source/test/testpool.cpp new file mode 100644 index 0000000..01f037b --- /dev/null +++ b/source/test/testpool.cpp @@ -0,0 +1,238 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com + *****************************************************************************/ + +#include "common.h" +#include "threadpool.h" +#include "wavefront.h" +#include "threading.h" +#include "md5.h" +#include "PPA/ppa.h" + +#include +#include + +using namespace x265; + +struct CUData +{ + CUData() + { + memset(digest, 0, sizeof(digest)); + } + + unsigned char digest[16]; +}; + +struct RowData +{ + RowData() : active(false), curCol(0) {} + + Lock lock; + volatile bool active; + volatile int curCol; +}; + +// Create a fake frame class with manufactured data in each CU block. 
We +// need to create an MD5 hash such that each CU's hash includes the hashes +// of the blocks that would have HEVC data dependencies (left, top-left, +// top, top-right). This will give us one deterministic output hash. We +// then generate the same hash using the thread pool and wave-front parallelism +// to verify the thread-pool behavior and the wave-front schedule data +// structures. +class MD5Frame : public WaveFront +{ +private: + + CUData *cu; + RowData *row; + int numrows; + int numcols; + Event complete; + +public: + + MD5Frame(ThreadPool *pool) : WaveFront(pool), cu(0), row(0) {} + + virtual ~MD5Frame() + { + // ensure no threads are lingering on FindJob() before allowing + // this object's vtable to be destroyed + JobProvider::flush(); + + delete[] this->cu; + delete[] this->row; + } + + void initialize(int cols, int rows); + + void encode(); + + void processRow(int row, int threadid); +}; + +void MD5Frame::initialize(int cols, int rows) +{ + this->cu = new CUData[rows * cols]; + this->row = new RowData[rows]; + this->numrows = rows; + this->numcols = cols; + + if (!this->WaveFront::init(rows)) + { + assert(!"Unable to initialize job queue"); + } +} + +void MD5Frame::encode() +{ + this->JobProvider::enqueue(); + + this->WaveFront::enqueueRow(0); + + // NOTE: When EnableRow after enqueueRow at first row, we'd better call pokeIdleThread, it will release a thread to do job + this->WaveFront::enableRow(0); + this->m_pool->pokeIdleThread(); + + this->complete.wait(); + + this->JobProvider::dequeue(); + + unsigned int *outdigest = (unsigned int*)this->cu[this->numrows * this->numcols - 1].digest; + + std::stringstream ss; + + for (int i = 0; i < 4; i++) + { + ss << std::hex << outdigest[i]; + } + + if (ss.str().compare("da667b741a7a9d0ee862158da2dd1882")) + std::cout << "Bad hash: " << ss.str() << std::endl; +} + +void MD5Frame::processRow(int rownum, int) +{ + // Called by worker thread + RowData &curRow = this->row[rownum]; + + assert(rownum < 
this->numrows && rownum >= 0); + assert(curRow.curCol < this->numcols); + + while (curRow.curCol < this->numcols) + { + int id = rownum * this->numcols + curRow.curCol; + CUData &curCTU = this->cu[id]; + MD5 hash; + + // * Fake CTU processing * + PPAStartCpuEventFunc(encode_block); + memset(curCTU.digest, id, sizeof(curCTU.digest)); + hash.update(curCTU.digest, sizeof(curCTU.digest)); + if (curRow.curCol > 0) + hash.update(this->cu[id - 1].digest, sizeof(curCTU.digest)); + + if (rownum > 0) + { + if (curRow.curCol > 0) + hash.update(this->cu[id - this->numcols - 1].digest, sizeof(curCTU.digest)); + + hash.update(this->cu[id - this->numcols].digest, sizeof(curCTU.digest)); + if (curRow.curCol < this->numcols - 1) + hash.update(this->cu[id - this->numcols + 1].digest, sizeof(curCTU.digest)); + } + + hash.finalize(curCTU.digest); + PPAStopCpuEventFunc(encode_block); + + curRow.curCol++; + + if (curRow.curCol >= 2 && rownum < this->numrows - 1) + { + ScopedLock below(this->row[rownum + 1].lock); + + if (this->row[rownum + 1].active == false && + this->row[rownum + 1].curCol + 2 <= curRow.curCol) + { + // set active indicator so row is only enqueued once + // row stays marked active until blocked or done + this->row[rownum + 1].active = true; + this->WaveFront::enqueueRow(rownum + 1); + this->WaveFront::enableRow(rownum + 1); + } + } + + ScopedLock self(curRow.lock); + + if (rownum > 0 && + curRow.curCol < this->numcols - 1 && + this->row[rownum - 1].curCol < curRow.curCol + 2) + { + // row is blocked, quit job + curRow.active = false; + return; + } + } + + // * Row completed * + + if (rownum == this->numrows - 1) + this->complete.trigger(); +} + +int main(int, char **) +{ + ThreadPool *pool; + + PPA_INIT(); + + pool = ThreadPool::allocThreadPool(1); + { + MD5Frame frame(pool); + frame.initialize(60, 40); + frame.encode(); + } + pool->release(); + pool = ThreadPool::allocThreadPool(2); + { + MD5Frame frame(pool); + frame.initialize(60, 40); + frame.encode(); + } + 
pool->release(); + pool = ThreadPool::allocThreadPool(4); + { + MD5Frame frame(pool); + frame.initialize(60, 40); + frame.encode(); + } + pool->release(); + pool = ThreadPool::allocThreadPool(8); + { + MD5Frame frame(pool); + frame.initialize(60, 40); + frame.encode(); + } + pool->release(); + + return 0; +} diff --git a/source/x265.cpp b/source/x265.cpp new file mode 100644 index 0000000..474cea9 --- /dev/null +++ b/source/x265.cpp @@ -0,0 +1,1172 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. 
+ *****************************************************************************/ + +#if _MSC_VER +#pragma warning(disable: 4127) // conditional expression is constant, yes I know +#endif + +#include "input/input.h" +#include "output/output.h" +#include "filters/filters.h" +#include "common.h" +#include "param.h" +#include "cpu.h" +#include "x265.h" + +#if HAVE_VLD +/* Visual Leak Detector */ +#include +#endif +#include "PPA/ppa.h" + +#include +#include +#include +#include + +#include +#include +#include + +#ifdef _WIN32 +#include +#else +#define GetConsoleTitle(t, n) +#define SetConsoleTitle(t) +#endif + +using namespace x265; + +static const char short_options[] = "o:p:f:F:r:I:i:b:s:t:q:m:hwV?"; +static const struct option long_options[] = +{ + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, 'V' }, + { "asm", required_argument, NULL, 0 }, + { "no-asm", no_argument, NULL, 0 }, + { "threads", required_argument, NULL, 0 }, + { "preset", required_argument, NULL, 'p' }, + { "tune", required_argument, NULL, 't' }, + { "frame-threads", required_argument, NULL, 'F' }, + { "no-pmode", no_argument, NULL, 0 }, + { "pmode", no_argument, NULL, 0 }, + { "no-pme", no_argument, NULL, 0 }, + { "pme", no_argument, NULL, 0 }, + { "log-level", required_argument, NULL, 0 }, + { "profile", required_argument, NULL, 0 }, + { "level-idc", required_argument, NULL, 0 }, + { "high-tier", no_argument, NULL, 0 }, + { "no-high-tier", no_argument, NULL, 0 }, + { "csv", required_argument, NULL, 0 }, + { "no-cu-stats", no_argument, NULL, 0 }, + { "cu-stats", no_argument, NULL, 0 }, + { "y4m", no_argument, NULL, 0 }, + { "no-progress", no_argument, NULL, 0 }, + { "output", required_argument, NULL, 'o' }, + { "input", required_argument, NULL, 0 }, + { "input-depth", required_argument, NULL, 0 }, + { "input-res", required_argument, NULL, 0 }, + { "input-csp", required_argument, NULL, 0 }, + { "interlace", required_argument, NULL, 0 }, + { "no-interlace", no_argument, NULL, 0 }, 
+ { "fps", required_argument, NULL, 0 }, + { "seek", required_argument, NULL, 0 }, + { "frame-skip", required_argument, NULL, 0 }, + { "frames", required_argument, NULL, 'f' }, + { "recon", required_argument, NULL, 'r' }, + { "recon-depth", required_argument, NULL, 0 }, + { "no-wpp", no_argument, NULL, 0 }, + { "wpp", no_argument, NULL, 0 }, + { "ctu", required_argument, NULL, 's' }, + { "tu-intra-depth", required_argument, NULL, 0 }, + { "tu-inter-depth", required_argument, NULL, 0 }, + { "me", required_argument, NULL, 0 }, + { "subme", required_argument, NULL, 'm' }, + { "merange", required_argument, NULL, 0 }, + { "max-merge", required_argument, NULL, 0 }, + { "no-temporal-mvp", no_argument, NULL, 0 }, + { "temporal-mvp", no_argument, NULL, 0 }, + { "rdpenalty", required_argument, NULL, 0 }, + { "no-rect", no_argument, NULL, 0 }, + { "rect", no_argument, NULL, 0 }, + { "no-amp", no_argument, NULL, 0 }, + { "amp", no_argument, NULL, 0 }, + { "no-early-skip", no_argument, NULL, 0 }, + { "early-skip", no_argument, NULL, 0 }, + { "no-fast-cbf", no_argument, NULL, 0 }, + { "fast-cbf", no_argument, NULL, 0 }, + { "no-tskip", no_argument, NULL, 0 }, + { "tskip", no_argument, NULL, 0 }, + { "no-tskip-fast", no_argument, NULL, 0 }, + { "tskip-fast", no_argument, NULL, 0 }, + { "cu-lossless", no_argument, NULL, 0 }, + { "no-cu-lossless", no_argument, NULL, 0 }, + { "no-constrained-intra", no_argument, NULL, 0 }, + { "constrained-intra", no_argument, NULL, 0 }, + { "fast-intra", no_argument, NULL, 0 }, + { "no-fast-intra", no_argument, NULL, 0 }, + { "no-open-gop", no_argument, NULL, 0 }, + { "open-gop", no_argument, NULL, 0 }, + { "keyint", required_argument, NULL, 'I' }, + { "min-keyint", required_argument, NULL, 'i' }, + { "scenecut", required_argument, NULL, 0 }, + { "no-scenecut", no_argument, NULL, 0 }, + { "rc-lookahead", required_argument, NULL, 0 }, + { "bframes", required_argument, NULL, 'b' }, + { "bframe-bias", required_argument, NULL, 0 }, + { "b-adapt", 
required_argument, NULL, 0 }, + { "no-b-adapt", no_argument, NULL, 0 }, + { "no-b-pyramid", no_argument, NULL, 0 }, + { "b-pyramid", no_argument, NULL, 0 }, + { "ref", required_argument, NULL, 0 }, + { "no-weightp", no_argument, NULL, 0 }, + { "weightp", no_argument, NULL, 'w' }, + { "no-weightb", no_argument, NULL, 0 }, + { "weightb", no_argument, NULL, 0 }, + { "crf", required_argument, NULL, 0 }, + { "crf-max", required_argument, NULL, 0 }, + { "crf-min", required_argument, NULL, 0 }, + { "vbv-maxrate", required_argument, NULL, 0 }, + { "vbv-bufsize", required_argument, NULL, 0 }, + { "vbv-init", required_argument, NULL, 0 }, + { "bitrate", required_argument, NULL, 0 }, + { "qp", required_argument, NULL, 'q' }, + { "aq-mode", required_argument, NULL, 0 }, + { "aq-strength", required_argument, NULL, 0 }, + { "ipratio", required_argument, NULL, 0 }, + { "pbratio", required_argument, NULL, 0 }, + { "cbqpoffs", required_argument, NULL, 0 }, + { "crqpoffs", required_argument, NULL, 0 }, + { "rd", required_argument, NULL, 0 }, + { "psy-rd", required_argument, NULL, 0 }, + { "psy-rdoq", required_argument, NULL, 0 }, + { "scaling-list", required_argument, NULL, 0 }, + { "lossless", no_argument, NULL, 0 }, + { "no-lossless", no_argument, NULL, 0 }, + { "no-signhide", no_argument, NULL, 0 }, + { "signhide", no_argument, NULL, 0 }, + { "no-lft", no_argument, NULL, 0 }, + { "lft", no_argument, NULL, 0 }, + { "no-sao", no_argument, NULL, 0 }, + { "sao", no_argument, NULL, 0 }, + { "no-sao-non-deblock", no_argument, NULL, 0 }, + { "sao-non-deblock", no_argument, NULL, 0 }, + { "no-ssim", no_argument, NULL, 0 }, + { "ssim", no_argument, NULL, 0 }, + { "no-psnr", no_argument, NULL, 0 }, + { "psnr", no_argument, NULL, 0 }, + { "hash", required_argument, NULL, 0 }, + { "no-strong-intra-smoothing", no_argument, NULL, 0 }, + { "strong-intra-smoothing", no_argument, NULL, 0 }, + { "no-cutree", no_argument, NULL, 0 }, + { "cutree", no_argument, NULL, 0 }, + { "no-hrd", no_argument, 
NULL, 0 }, + { "hrd", no_argument, NULL, 0 }, + { "sar", required_argument, NULL, 0 }, + { "overscan", required_argument, NULL, 0 }, + { "videoformat", required_argument, NULL, 0 }, + { "range", required_argument, NULL, 0 }, + { "colorprim", required_argument, NULL, 0 }, + { "transfer", required_argument, NULL, 0 }, + { "colormatrix", required_argument, NULL, 0 }, + { "chromaloc", required_argument, NULL, 0 }, + { "crop-rect", required_argument, NULL, 0 }, + { "no-dither", no_argument, NULL, 0 }, + { "dither", no_argument, NULL, 0 }, + { "no-repeat-headers", no_argument, NULL, 0 }, + { "repeat-headers", no_argument, NULL, 0 }, + { "aud", no_argument, NULL, 0 }, + { "no-aud", no_argument, NULL, 0 }, + { "info", no_argument, NULL, 0 }, + { "no-info", no_argument, NULL, 0 }, + { "qpfile", required_argument, NULL, 0 }, + { "lambda-file", required_argument, NULL, 0 }, + { "b-intra", no_argument, NULL, 0 }, + { "no-b-intra", no_argument, NULL, 0 }, + { "nr", required_argument, NULL, 0 }, + { "stats", required_argument, NULL, 0 }, + { "pass", required_argument, NULL, 0 }, + { "slow-firstpass", no_argument, NULL, 0 }, + { "no-slow-firstpass", no_argument, NULL, 0 }, + { "analysis-mode", required_argument, NULL, 0 }, + { "analysis-file", required_argument, NULL, 0 }, + { 0, 0, 0, 0 } +}; + +/* Ctrl-C handler */ +static volatile sig_atomic_t b_ctrl_c /* = 0 */; +static void sigint_handler(int) +{ + b_ctrl_c = 1; +} + +struct CLIOptions +{ + Input* input; + Output* recon; + std::fstream bitstreamFile; + bool bProgress; + bool bForceY4m; + bool bDither; + + uint32_t seek; // number of frames to skip from the beginning + uint32_t framesToBeEncoded; // number of frames to encode + uint64_t totalbytes; + size_t analysisRecordSize; // number of bytes read from or dumped into file + int analysisHeaderSize; + + int64_t startTime; + int64_t prevUpdateTime; + float frameRate; + FILE* qpfile; + FILE* analysisFile; + + /* in microseconds */ + static const int UPDATE_INTERVAL = 250000; + 
+ CLIOptions() + { + input = NULL; + recon = NULL; + framesToBeEncoded = seek = 0; + totalbytes = 0; + bProgress = true; + bForceY4m = false; + startTime = x265_mdate(); + prevUpdateTime = 0; + bDither = false; + qpfile = NULL; + analysisFile = NULL; + analysisRecordSize = 0; + analysisHeaderSize = 0; + } + + void destroy(); + void writeNALs(const x265_nal* nal, uint32_t nalcount); + void printStatus(uint32_t frameNum, x265_param *param); + void printVersion(x265_param *param); + void showHelp(x265_param *param); + bool parse(int argc, char **argv, x265_param* param); + bool parseQPFile(x265_picture &pic_org); + void readAnalysisFile(x265_picture* pic, x265_param*); + void writeAnalysisFile(x265_picture* pic, x265_param*); + bool validateFanout(x265_param*); +}; + +void CLIOptions::destroy() +{ + if (input) + input->release(); + input = NULL; + if (recon) + recon->release(); + recon = NULL; + if (qpfile) + fclose(qpfile); + qpfile = NULL; + if (analysisFile) + fclose(analysisFile); + analysisFile = NULL; +} + +void CLIOptions::writeNALs(const x265_nal* nal, uint32_t nalcount) +{ + PPAScopeEvent(bitstream_write); + for (uint32_t i = 0; i < nalcount; i++) + { + bitstreamFile.write((const char*)nal->payload, nal->sizeBytes); + totalbytes += nal->sizeBytes; + nal++; + } +} + +void CLIOptions::printStatus(uint32_t frameNum, x265_param *param) +{ + char buf[200]; + int64_t time = x265_mdate(); + + if (!bProgress || !frameNum || (prevUpdateTime && time - prevUpdateTime < UPDATE_INTERVAL)) + return; + int64_t elapsed = time - startTime; + double fps = elapsed > 0 ? frameNum * 1000000. / elapsed : 0; + float bitrate = 0.008f * totalbytes * (param->fpsNum / param->fpsDenom) / ((float)frameNum); + if (framesToBeEncoded) + { + int eta = (int)(elapsed * (framesToBeEncoded - frameNum) / ((int64_t)frameNum * 1000000)); + sprintf(buf, "x265 [%.1f%%] %d/%d frames, %.2f fps, %.2f kb/s, eta %d:%02d:%02d", + 100. 
* frameNum / framesToBeEncoded, frameNum, framesToBeEncoded, fps, bitrate, + eta / 3600, (eta / 60) % 60, eta % 60); + } + else + { + sprintf(buf, "x265 %d frames: %.2f fps, %.2f kb/s", frameNum, fps, bitrate); + } + fprintf(stderr, "%s \r", buf + 5); + SetConsoleTitle(buf); + fflush(stderr); // needed in windows + prevUpdateTime = time; +} + +void CLIOptions::printVersion(x265_param *param) +{ + x265_log(param, X265_LOG_INFO, "HEVC encoder version %s\n", x265_version_str); + x265_log(param, X265_LOG_INFO, "build info %s\n", x265_build_info_str); +} + +void CLIOptions::showHelp(x265_param *param) +{ + x265_param_default(param); + printVersion(param); + +#define H0 printf +#define OPT(value) (value ? "enabled" : "disabled") + H0("\nSyntax: x265 [options] infile [-o] outfile\n"); + H0(" infile can be YUV or Y4M\n"); + H0(" outfile is raw HEVC bitstream\n"); + H0("\nExecutable Options:\n"); + H0("-h/--help Show this help text and exit\n"); + H0("-V/--version Show version info and exit\n"); + H0("\nOutput Options:\n"); + H0("-o/--output Bitstream output file name\n"); + H0(" --log-level Logging level: none error warning info debug full. Default %s\n", logLevelNames[param->logLevel + 1]); + H0(" --no-progress Disable CLI progress reports\n"); + H0(" --[no-]cu-stats Enable logging stats about distribution of cu across all modes. Default %s\n",OPT(param->bLogCuStats)); + H0(" --csv Comma separated log file, log level >= 3 frame log, else one line per run\n"); + H0("\nInput Options:\n"); + H0(" --input Raw YUV or Y4M input file name. `-` for stdin\n"); + H0(" --y4m Force parsing of input stream as YUV4MPEG2 regardless of file extension\n"); + H0(" --fps Source frame rate (float or num/denom), auto-detected if Y4M\n"); + H0(" --input-res WxH Source picture size [w x h], auto-detected if Y4M\n"); + H0(" --input-depth Bit-depth of input file. Default 8\n"); + H0(" --input-csp Source color space: i420, i444 or i422, auto-detected if Y4M. 
Default: i420\n"); + H0("-f/--frames Maximum number of frames to encode. Default all\n"); + H0(" --seek First frame to encode\n"); + H0(" --[no-]interlace Indicate input pictures are interlace fields in temporal order. Default progressive\n"); + H0(" --dither Enable dither if downscaling to 8 bit pixels. Default disabled\n"); + H0("\nQuality reporting metrics:\n"); + H0(" --[no-]ssim Enable reporting SSIM metric scores. Default %s\n", OPT(param->bEnableSsim)); + H0(" --[no-]psnr Enable reporting PSNR metric scores. Default %s\n", OPT(param->bEnablePsnr)); + H0("\nProfile, Level, Tier:\n"); + H0(" --profile Enforce an encode profile: main, main10, mainstillpicture\n"); + H0(" --level-idc Force a minumum required decoder level (as '5.0' or '50')\n"); + H0(" --[no-]high-tier If a decoder level is specified, this modifier selects High tier of that level\n"); + H0("\nThreading, performance:\n"); + H0(" --threads Number of threads for thread pool (0: detect CPU core count, default)\n"); + H0("-F/--frame-threads Number of concurrently encoded frames. 0: auto-determined by core count\n"); + H0(" --[no-]wpp Enable Wavefront Parallel Processing. Default %s\n", OPT(param->bEnableWavefront)); + H0(" --[no-]pmode Parallel mode analysis. Default %s\n", OPT(param->bDistributeModeAnalysis)); + H0(" --[no-]pme Parallel motion estimation. Default %s\n", OPT(param->bDistributeMotionEstimation)); + H0(" --[no-]asm Override CPU detection. Default: auto\n"); + H0("\nPresets:\n"); + H0("-p/--preset Trade off performance for compression efficiency. Default medium\n"); + H0(" ultrafast, superfast, veryfast, faster, fast, medium, slow, slower, veryslow, or placebo\n"); + H0("-t/--tune Tune the settings for a particular type of source or situation:\n"); + H0(" psnr, ssim, zerolatency, or fastdecode\n"); + H0("\nQuad-Tree size and depth:\n"); + H0("-s/--ctu <64|32|16> Maximum CU size (default: 64x64). 
Default %d\n", param->maxCUSize); + H0(" --tu-intra-depth Max TU recursive depth for intra CUs. Default %d\n", param->tuQTMaxIntraDepth); + H0(" --tu-inter-depth Max TU recursive depth for inter CUs. Default %d\n", param->tuQTMaxInterDepth); + H0(" --[no-]rect Enable rectangular motion partitions Nx2N and 2NxN. Default %s\n", OPT(param->bEnableRectInter)); + H0(" --[no-]amp Enable asymmetric motion partitions, requires --rect. Default %s\n", OPT(param->bEnableAMP)); + H0("\nAnalysis:\n"); + H0(" --rd <0..6> Level of RD in mode decision 0:least....6:full RDO. Default %d\n", param->rdLevel); + H0(" --psy-rd <0..2.0> Strength of psycho-visual rate distortion optimization, 0 to disable. Default %f\n", param->psyRd); + H0(" --psy-rdoq <0..50.0> Strength of psycho-visual optimization in quantization, 0 to disable. Default %f\n", param->psyRdoq); + H0(" --nr An integer value in range of 100 to 1000, which denotes strength of noise reduction. Default disabled\n"); + H0(" --[no-]tskip-fast Enable fast intra transform skipping. Default %s\n", OPT(param->bEnableTSkipFast)); + H0(" --[no-]early-skip Enable early SKIP detection. Default %s\n", OPT(param->bEnableEarlySkip)); + H0(" --[no-]fast-cbf Enable early outs based on whether residual is coded. Default %s\n", OPT(param->bEnableCbfFastMode)); + H0("\nCoding tools:\n"); + H0("-w/--[no-]weightp Enable weighted prediction in P slices. Default %s\n", OPT(param->bEnableWeightedPred)); + H0(" --[no-]weightb Enable weighted prediction in B slices. Default %s\n", OPT(param->bEnableWeightedBiPred)); + H0(" --[no-]cu-lossless Consider lossless mode in CU RDO decisions. Default %s\n", OPT(param->bCULossless)); + H0(" --[no-]signhide Hide sign bit of one coeff per TU (rdo). Default %s\n", OPT(param->bEnableSignHiding)); + H0(" --[no-]tskip Enable intra 4x4 transform skipping. Default %s\n", OPT(param->bEnableTransformSkip)); + H0("\nTemporal / motion search options:\n"); + H0(" --me Motion search method dia hex umh star full. 
Default %d\n", param->searchMethod); + H0("-m/--subme Amount of subpel refinement to perform (0:least .. 7:most). Default %d \n", param->subpelRefine); + H0(" --merange Motion search range. Default %d\n", param->searchRange); + H0(" --max-merge <1..5> Maximum number of merge candidates. Default %d\n", param->maxNumMergeCand); + H0(" --[no-]temporal-mvp Enable temporal MV predictors. Default %s\n", OPT(param->bEnableTemporalMvp)); + H0("\nSpatial / intra options:\n"); + H0(" --[no-]strong-intra-smoothing Enable strong intra smoothing for 32x32 blocks. Default %s\n", OPT(param->bEnableStrongIntraSmoothing)); + H0(" --[no-]constrained-intra Constrained intra prediction (use only intra coded reference pixels) Default %s\n", OPT(param->bEnableConstrainedIntra)); + H0(" --[no-]b-intra Enable intra in B frames in veryslow presets. Default %s\n", OPT(param->bIntraInBFrames)); + H0(" --[no-]fast-intra Enable faster search method for angular intra predictions. Default %s\n", OPT(param->bEnableFastIntra)); + H0(" --rdpenalty <0..2> penalty for 32x32 intra TU in non-I slices. 0:disabled 1:RD-penalty 2:maximum. Default %d\n", param->rdPenalty); + H0("\nSlice decision options:\n"); + H0(" --[no-]open-gop Enable open-GOP, allows I slices to be non-IDR. Default %s\n", OPT(param->bOpenGOP)); + H0("-I/--keyint Max IDR period in frames. -1 for infinite-gop. Default %d\n", param->keyframeMax); + H0("-i/--min-keyint Scenecuts closer together than this are coded as I, not IDR. Default: auto\n"); + H0(" --no-scenecut Disable adaptive I-frame decision\n"); + H0(" --scenecut How aggressively to insert extra I-frames. Default %d\n", param->scenecutThreshold); + H0(" --rc-lookahead Number of frames for frame-type lookahead (determines encoder latency) Default %d\n", param->lookaheadDepth); + H0(" --bframes Maximum number of consecutive b-frames (now it only enables B GOP structure) Default %d\n", param->bframes); + H0(" --bframe-bias Bias towards B frame decisions. 
Default %d\n", param->bFrameBias); + H0(" --b-adapt <0..2> 0 - none, 1 - fast, 2 - full (trellis) adaptive B frame scheduling. Default %d\n", param->bFrameAdaptive); + H0(" --[no-]b-pyramid Use B-frames as references. Default %s\n", OPT(param->bBPyramid)); + H0(" --ref max number of L0 references to be allowed (1 .. 16) Default %d\n", param->maxNumReferences); + H0(" --qpfile Force frametypes and QPs for some or all frames\n"); + H0(" Format of each line: framenumber frametype QP\n"); + H0(" QP is optional (none lets x265 choose). Frametypes: I,i,P,B,b.\n"); + H0(" QPs are restricted by qpmin/qpmax.\n"); + H0("\nRate control, Quantization:\n"); + H0(" --bitrate Target bitrate (kbps) for ABR (implied). Default %d\n", param->rc.bitrate); + H0("-q/--qp QP for P slices in CQP mode (implied). --ipratio and --pbration determine other slice QPs\n"); + H0(" --crf Quality-based VBR (0-51). Default %f\n", param->rc.rfConstant); + H0(" --[no-]lossless Enable lossless: bypass transform, quant and loop filters globally. Default %s\n", OPT(param->bLossless)); + H0(" --crf-max With CRF+VBV, limit RF to this value. Default %f\n", param->rc.rfConstantMax); + H0(" May cause VBV underflows!\n"); + H0(" --crf-min With CRF+VBV, limit RF to this value. Default %f\n", param->rc.rfConstantMin); + H0(" this specifies a minimum rate factor value for encode!\n"); + H0(" --vbv-maxrate Max local bitrate (kbit/s). Default %d\n", param->rc.vbvMaxBitrate); + H0(" --vbv-bufsize Set size of the VBV buffer (kbit). Default %d\n", param->rc.vbvBufferSize); + H0(" --vbv-init Initial VBV buffer occupancy (fraction of bufsize or in kbits). Default %f\n", param->rc.vbvBufferInit); + H0(" --aq-mode Mode for Adaptive Quantization - 0:none 1:uniform AQ 2:auto variance. Default %d\n", param->rc.aqMode); + H0(" --aq-strength Reduces blocking and blurring in flat and textured areas.(0 to 3.0). Default %f\n", param->rc.aqStrength); + H0(" --[no-]cutree Enable cutree for Adaptive Quantization. 
Default %s\n", OPT(param->rc.cuTree)); + H0(" --ipratio QP factor between I and P. Default %f\n", param->rc.ipFactor); + H0(" --pbratio QP factor between P and B. Default %f\n", param->rc.pbFactor); + H0(" --cbqpoffs Chroma Cb QP Offset. Default %d\n", param->cbQpOffset); + H0(" --crqpoffs Chroma Cr QP Offset. Default %d\n", param->crQpOffset); + H0(" --stats Filename for stats file in multipass pass rate control. Default x265_2pass.log\n"); + H0(" --pass Multi pass rate control.\n" + " - 1 : First pass, creates stats file\n" + " - 2 : Last pass, does not overwrite stats file\n" + " - 3 : Nth pass, overwrites stats file\n"); + H0(" --[no-]slow-firstpass Enable a slow first pass in a multipass rate control mode. Default %s\n", OPT(param->rc.bEnableSlowFirstPass)); + H0(" --analysis-mode save - Dump analysis info into file, load - Load analysis buffers from the file. Default %d\n", param->analysisMode); + H0(" --analysis-file Specify file name used for either dumping or reading analysis data.\n"); + H0(" --scaling-list Specify a file containing HM style quant scaling lists or 'default' or 'off'. Default: off\n"); + H0(" --lambda-file Specify a file containing replacement values for the lambda tables\n"); + H0(" MAX_MAX_QP+1 floats for lambda table, then again for lambda2 table\n"); + H0(" Blank lines and lines starting with hash(#) are ignored\n"); + H0(" Comma is considered to be white-space\n"); + H0("\nLoop filters (deblock and SAO):\n"); + H0(" --[no-]lft Enable Deblocking Loop Filter. Default %s\n", OPT(param->bEnableLoopFilter)); + H0(" --[no-]sao Enable Sample Adaptive Offset. Default %s\n", OPT(param->bEnableSAO)); + H0(" --[no-]sao-non-deblock Use non-deblocked pixels, else right/bottom boundary areas skipped. 
Default %s\n", OPT(param->bSaoNonDeblocked)); + H0("\nVUI options:\n"); + H0(" --sar Sample Aspect Ratio, the ratio of width to height of an individual pixel.\n"); + H0(" Choose from 0=undef, 1=1:1(\"square\"), 2=12:11, 3=10:11, 4=16:11,\n"); + H0(" 5=40:33, 6=24:11, 7=20:11, 8=32:11, 9=80:33, 10=18:11, 11=15:11,\n"); + H0(" 12=64:33, 13=160:99, 14=4:3, 15=3:2, 16=2:1 or custom ratio of . Default %d\n", param->vui.aspectRatioIdc); + H0(" --crop-rect Add 'left,top,right,bottom' to the bitstream-level cropping rectangle\n"); + H0(" --overscan Specify whether it is appropriate for decoder to show cropped region: undef, show or crop. Default undef\n"); + H0(" --videoformat Specify video format from undef, component, pal, ntsc, secam, mac. Default undef\n"); + H0(" --range Specify black level and range of luma and chroma signals as full or limited Default limited\n"); + H0(" --colorprim Specify color primaries from undef, bt709, bt470m, bt470bg, smpte170m,\n"); + H0(" smpte240m, film, bt2020. Default undef\n"); + H0(" --transfer Specify transfer characteristics from undef, bt709, bt470m, bt470bg, smpte170m,\n"); + H0(" smpte240m, linear, log100, log316, iec61966-2-4, bt1361e, iec61966-2-1,\n"); + H0(" bt2020-10, bt2020-12. Default undef\n"); + H0(" --colormatrix Specify color matrix setting from undef, bt709, fcc, bt470bg, smpte170m,\n"); + H0(" smpte240m, GBR, YCgCo, bt2020nc, bt2020c. Default undef\n"); + H0(" --chromaloc Specify chroma sample location (0 to 5). Default of %d\n", param->vui.chromaSampleLocTypeTopField); + H0("\nBitstream options:\n"); + H0(" --[no-]info Emit SEI identifying encoder and parameters. Default %s\n", OPT(param->bEmitInfoSEI)); + H0(" --[no-]aud Emit access unit delimiters at the start of each access unit. Default %s\n", OPT(param->bEnableAccessUnitDelimiters)); + H0(" --[no-]hrd Enable HRD parameters signalling. Default %s\n", OPT(param->bEmitHRDSEI)); + H0(" --[no-]repeat-headers Emit SPS and PPS headers at each keyframe. 
Default %s\n", OPT(param->bRepeatHeaders)); + H0(" --hash Decoded Picture Hash SEI 0: disabled, 1: MD5, 2: CRC, 3: Checksum. Default %d\n", param->decodedPictureHashSEI); + H0("\nReconstructed video options (debugging):\n"); + H0("-r/--recon Reconstructed raw image YUV or Y4M output file name\n"); + H0(" --recon-depth Bit-depth of reconstructed raw image file. Defaults to input bit depth, or 8 if Y4M\n"); +#undef OPT +#undef H0 + printf("\n\nFull documentation may be found at http://x265.readthedocs.org/en/default/cli.html\n"); + exit(0); +} + +bool CLIOptions::parse(int argc, char **argv, x265_param* param) +{ + bool bError = 0; + int help = 0; + int inputBitDepth = 8; + int reconFileBitDepth = 0; + const char *inputfn = NULL; + const char *reconfn = NULL; + const char *bitstreamfn = NULL; + const char *preset = NULL; + const char *tune = NULL; + const char *profile = NULL; + const char *analysisfn = "x265_analysis.dat"; + + if (argc <= 1) + { + x265_log(NULL, X265_LOG_ERROR, "No input file. Run x265 --help for a list of options.\n"); + return true; + } + + /* Presets are applied before all other options. 
*/ + for (optind = 0;; ) + { + int c = getopt_long(argc, argv, short_options, long_options, NULL); + if (c == -1) + break; + if (c == 'p') + preset = optarg; + if (c == 't') + tune = optarg; + else if (c == '?') + showHelp(param); + } + + if (x265_param_default_preset(param, preset, tune) < 0) + { + x265_log(NULL, X265_LOG_ERROR, "preset or tune unrecognized\n"); + return true; + } + + for (optind = 0;; ) + { + int long_options_index = -1; + int c = getopt_long(argc, argv, short_options, long_options, &long_options_index); + if (c == -1) + { + break; + } + + switch (c) + { + case 'h': + showHelp(param); + break; + + case 'V': + printVersion(param); + x265_setup_primitives(param, -1); + exit(0); + + default: + if (long_options_index < 0 && c > 0) + { + for (size_t i = 0; i < sizeof(long_options) / sizeof(long_options[0]); i++) + { + if (long_options[i].val == c) + { + long_options_index = (int)i; + break; + } + } + + if (long_options_index < 0) + { + /* getopt_long might have already printed an error message */ + if (c != 63) + x265_log(NULL, X265_LOG_WARNING, "internal error: short option '%c' has no long option\n", c); + return true; + } + } + if (long_options_index < 0) + { + x265_log(NULL, X265_LOG_WARNING, "short option '%c' unrecognized\n", c); + return true; + } +#define OPT(longname) \ + else if (!strcmp(long_options[long_options_index].name, longname)) +#define OPT2(name1, name2) \ + else if (!strcmp(long_options[long_options_index].name, name1) || \ + !strcmp(long_options[long_options_index].name, name2)) + + if (0) ; + OPT2("frame-skip", "seek") this->seek = (uint32_t)x265_atoi(optarg, bError); + OPT("frames") this->framesToBeEncoded = (uint32_t)x265_atoi(optarg, bError); + OPT("no-progress") this->bProgress = false; + OPT("output") bitstreamfn = optarg; + OPT("input") inputfn = optarg; + OPT("recon") reconfn = optarg; + OPT("input-depth") inputBitDepth = (uint32_t)x265_atoi(optarg, bError); + OPT("dither") this->bDither = true; + OPT("recon-depth") 
reconFileBitDepth = (uint32_t)x265_atoi(optarg, bError); + OPT("y4m") this->bForceY4m = true; + OPT("profile") profile = optarg; /* handled last */ + OPT("preset") /* handled above */; + OPT("tune") /* handled above */; + OPT("analysis-file") analysisfn = optarg; + OPT("qpfile") + { + this->qpfile = fopen(optarg, "rb"); + if (!this->qpfile) + { + x265_log(param, X265_LOG_ERROR, "%s qpfile not found or error in opening qp file \n", optarg); + return false; + } + } + else + bError |= !!x265_param_parse(param, long_options[long_options_index].name, optarg); + + if (bError) + { + const char *name = long_options_index > 0 ? long_options[long_options_index].name : argv[optind - 2]; + x265_log(NULL, X265_LOG_ERROR, "invalid argument: %s = %s\n", name, optarg); + return true; + } +#undef OPT + } + } + + if (optind < argc && !inputfn) + inputfn = argv[optind++]; + if (optind < argc && !bitstreamfn) + bitstreamfn = argv[optind++]; + if (optind < argc) + { + x265_log(param, X265_LOG_WARNING, "extra unused command arguments given <%s>\n", argv[optind]); + return true; + } + + if (argc <= 1 || help) + showHelp(param); + + if (inputfn == NULL || bitstreamfn == NULL) + { + x265_log(param, X265_LOG_ERROR, "input or output file not specified, try -V for help\n"); + return true; + } + +#if HIGH_BIT_DEPTH + if (param->internalBitDepth != 10) + { + x265_log(param, X265_LOG_ERROR, "Only bit depths of 10 are supported in this build\n"); + return true; + } +#else + if (param->internalBitDepth != 8) + { + x265_log(param, X265_LOG_ERROR, "Only bit depths of 8 are supported in this build\n"); + return true; + } +#endif // if HIGH_BIT_DEPTH + + InputFileInfo info; + info.filename = inputfn; + info.depth = inputBitDepth; + info.csp = param->internalCsp; + info.width = param->sourceWidth; + info.height = param->sourceHeight; + info.fpsNum = param->fpsNum; + info.fpsDenom = param->fpsDenom; + info.sarWidth = param->vui.sarWidth; + info.sarHeight = param->vui.sarHeight; + info.skipFrames = seek; 
+ info.frameCount = 0; + getParamAspectRatio(param, info.sarWidth, info.sarHeight); + + this->input = Input::open(info, this->bForceY4m); + if (!this->input || this->input->isFail()) + { + x265_log(param, X265_LOG_ERROR, "unable to open input file <%s>\n", inputfn); + return true; + } + + if (info.depth < 8 || info.depth > 16) + { + x265_log(param, X265_LOG_ERROR, "Input bit depth (%d) must be between 8 and 16\n", inputBitDepth); + return true; + } + + /* Unconditionally accept height/width/csp from file info */ + param->sourceWidth = info.width; + param->sourceHeight = info.height; + param->internalCsp = info.csp; + + /* Accept fps and sar from file info if not specified by user */ + if (param->fpsDenom == 0 || param->fpsNum == 0) + { + param->fpsDenom = info.fpsDenom; + param->fpsNum = info.fpsNum; + } + if (!param->vui.aspectRatioIdc && info.sarWidth && info.sarHeight) + setParamAspectRatio(param, info.sarWidth, info.sarHeight); + if (this->framesToBeEncoded == 0 && info.frameCount > (int)seek) + this->framesToBeEncoded = info.frameCount - seek; + param->totalFrames = this->framesToBeEncoded; + + if (x265_param_apply_profile(param, profile)) + return true; + + if (param->logLevel >= X265_LOG_INFO) + { + char buf[128]; + int p = sprintf(buf, "%dx%d fps %d/%d %sp%d", param->sourceWidth, param->sourceHeight, + param->fpsNum, param->fpsDenom, x265_source_csp_names[param->internalCsp], info.depth); + + int width, height; + getParamAspectRatio(param, width, height); + if (width && height) + p += sprintf(buf + p, " sar %d:%d", width, height); + + if (framesToBeEncoded <= 0 || info.frameCount <= 0) + strcpy(buf + p, " unknown frame count"); + else + sprintf(buf + p, " frames %u - %d of %d", this->seek, this->seek + this->framesToBeEncoded - 1, info.frameCount); + + fprintf(stderr, "%s [info]: %s\n", input->getName(), buf); + } + + this->input->startReader(); + + if (reconfn) + { + if (reconFileBitDepth == 0) + reconFileBitDepth = param->internalBitDepth; + this->recon = 
Output::open(reconfn, param->sourceWidth, param->sourceHeight, reconFileBitDepth, + param->fpsNum, param->fpsDenom, param->internalCsp); + if (this->recon->isFail()) + { + x265_log(param, X265_LOG_WARNING, "unable to write reconstruction file\n"); + this->recon->release(); + this->recon = 0; + } + else + fprintf(stderr, "%s [info]: reconstructed images %dx%d fps %d/%d %s\n", this->recon->getName(), + param->sourceWidth, param->sourceHeight, param->fpsNum, param->fpsDenom, + x265_source_csp_names[param->internalCsp]); + } + + this->bitstreamFile.open(bitstreamfn, std::fstream::binary | std::fstream::out); + if (!this->bitstreamFile) + { + x265_log(NULL, X265_LOG_ERROR, "failed to open bitstream file <%s> for writing\n", bitstreamfn); + return true; + } + + if (param->analysisMode) + { + const char *mode = param->analysisMode == X265_ANALYSIS_SAVE ? "wb" : "rb"; + this->analysisFile = fopen(analysisfn, mode); + if (!this->analysisFile) + { + x265_log(NULL, X265_LOG_ERROR, "failed to open analysis file %s\n", analysisfn); + return true; + } + } + + return false; +} + +bool CLIOptions::validateFanout(x265_param *param) +{ +#define CMP_OPT_FANOUT(opt, param_val)\ + {\ + bErr = 0;\ + p = strstr(paramBuf, opt "=");\ + char* q = strstr(paramBuf, "no-"opt);\ + if (p && sscanf(p, opt "=%d" , &i) && param_val != i)\ + bErr = 1;\ + else if (!param_val && !q)\ + bErr = 1;\ + else if (param_val && (q || !strstr(paramBuf, opt)))\ + bErr = 1;\ + if (bErr)\ + {\ + x265_log(param, X265_LOG_ERROR, "different " opt " setting than given in analysis file (%d vs %d)\n", param_val, i);\ + X265_FREE(paramBuf);\ + return false;\ + }\ + } + + char *p = NULL, *paramBuf; + int i, j; + uint32_t k , l; + bool bErr = false; + + paramBuf = X265_MALLOC(char, MAXPARAMSIZE); + if (!paramBuf) + return false; + + fread(paramBuf, 1, MAXPARAMSIZE, this->analysisFile); + + /* check whether fanout options are compatible */ + if (strncmp(paramBuf, "#options:", 9)) + { + x265_log(param, X265_LOG_ERROR, 
"options list in analysis file is not valid\n"); + X265_FREE(paramBuf); + return false; + } + + char* buf = strchr(paramBuf, '\n'); + if (!buf) + { + x265_log(param, X265_LOG_ERROR, "Malformed analysis file\n"); + X265_FREE(paramBuf); + return false; + } + *buf = '\0'; + fseek(this->analysisFile, (int)strlen(paramBuf) + 1, SEEK_SET); + + if (sscanf(paramBuf, "#options: %dx%d", &i, &j) != 2) + { + x265_log(param, X265_LOG_ERROR, "Resolution specified in analysis file is not valid\n"); + X265_FREE(paramBuf); + return false; + } + if ((p = strstr(paramBuf, " fps=")) == 0 || sscanf(p, " fps=%u/%u", &k, &l) != 2) + { + x265_log(param, X265_LOG_ERROR, "fps specified in analysis file is not valid\n"); + X265_FREE(paramBuf); + return false; + } + if (k != param->fpsNum || l != param->fpsDenom) + { + x265_log(param, X265_LOG_ERROR, "fps mismatch than given in analysis file (%u/%u vs %u/%u)\n", + param->fpsNum, param->fpsDenom, k, l); + X265_FREE(paramBuf); + return false; + } + + CMP_OPT_FANOUT("bitdepth", param->internalBitDepth); + CMP_OPT_FANOUT("weightp", param->bEnableWeightedPred); + CMP_OPT_FANOUT("bframes", param->bframes); + CMP_OPT_FANOUT("b-pyramid", param->bBPyramid); + CMP_OPT_FANOUT("b-adapt", param->bFrameAdaptive); + CMP_OPT_FANOUT("open-gop", param->bOpenGOP); + CMP_OPT_FANOUT("keyint", param->keyframeMax); + CMP_OPT_FANOUT("min-keyint", param->keyframeMin); + CMP_OPT_FANOUT("scenecut", param->scenecutThreshold); + CMP_OPT_FANOUT("ctu", (int)param->maxCUSize); + CMP_OPT_FANOUT("ref", param->maxNumReferences); + CMP_OPT_FANOUT("rc-lookahead", param->lookaheadDepth); + +#undef CMP_OPT_FANOUT + + X265_FREE(paramBuf); + return true; +} + +void CLIOptions::readAnalysisFile(x265_picture* pic, x265_param* p) +{ + int poc, width, height; + uint32_t numPart, numCU; + fread(&width, sizeof(int), 1, this->analysisFile); + fread(&height, sizeof(int), 1, this->analysisFile); + fread(&poc, sizeof(int), 1, this->analysisFile); + fread(&pic->sliceType, sizeof(int), 1, 
this->analysisFile); + fread(&numCU, sizeof(int), 1, this->analysisFile); + fread(&numPart, sizeof(int), 1, this->analysisFile); + + if (poc != pic->poc || width != p->sourceWidth || height != p->sourceHeight) + { + x265_log(NULL, X265_LOG_WARNING, "Error in reading intra-inter data.\n"); + x265_free_analysis_data(pic); + return; + } + + fread(pic->analysisData.intraData->depth, + sizeof(uint8_t), pic->analysisData.numPartitions * pic->analysisData.numCUsInFrame, this->analysisFile); + fread(pic->analysisData.intraData->modes, + sizeof(uint8_t), pic->analysisData.numPartitions * pic->analysisData.numCUsInFrame, this->analysisFile); + fread(pic->analysisData.intraData->partSizes, + sizeof(char), pic->analysisData.numPartitions * pic->analysisData.numCUsInFrame, this->analysisFile); + fread(pic->analysisData.intraData->poc, + sizeof(int), pic->analysisData.numCUsInFrame, this->analysisFile); + fread(pic->analysisData.intraData->cuAddr, + sizeof(uint32_t), pic->analysisData.numCUsInFrame, this->analysisFile); + fread(pic->analysisData.interData, sizeof(x265_inter_data), pic->analysisData.numCUsInFrame * 85, this->analysisFile); +} + +void CLIOptions::writeAnalysisFile(x265_picture* pic, x265_param *p) +{ + uint64_t seekTo = pic->poc * this->analysisRecordSize + this->analysisHeaderSize; + fseeko(this->analysisFile, seekTo, SEEK_SET); + fwrite(&p->sourceWidth, sizeof(int), 1, this->analysisFile); + fwrite(&p->sourceHeight, sizeof(int), 1, this->analysisFile); + fwrite(&pic->poc, sizeof(int), 1, this->analysisFile); + fwrite(&pic->sliceType, sizeof(int), 1, this->analysisFile); + fwrite(&pic->analysisData.numCUsInFrame, sizeof(int), 1, this->analysisFile); + fwrite(&pic->analysisData.numPartitions, sizeof(int), 1, this->analysisFile); + + fwrite(pic->analysisData.intraData->depth, + sizeof(uint8_t), pic->analysisData.numPartitions * pic->analysisData.numCUsInFrame, this->analysisFile); + fwrite(pic->analysisData.intraData->modes, + sizeof(uint8_t), 
pic->analysisData.numPartitions * pic->analysisData.numCUsInFrame, this->analysisFile); + fwrite(pic->analysisData.intraData->partSizes, + sizeof(char), pic->analysisData.numPartitions * pic->analysisData.numCUsInFrame, this->analysisFile); + fwrite(pic->analysisData.intraData->poc, sizeof(int), pic->analysisData.numCUsInFrame, this->analysisFile); + fwrite(pic->analysisData.intraData->cuAddr, sizeof(uint32_t), pic->analysisData.numCUsInFrame, this->analysisFile); + fwrite(pic->analysisData.interData, sizeof(x265_inter_data), pic->analysisData.numCUsInFrame * 85, this->analysisFile); +} + +bool CLIOptions::parseQPFile(x265_picture &pic_org) +{ + int32_t num = -1, qp, ret; + char type; + uint32_t filePos; + pic_org.forceqp = 0; + pic_org.sliceType = X265_TYPE_AUTO; + while (num < pic_org.poc) + { + filePos = ftell(qpfile); + qp = -1; + ret = fscanf(qpfile, "%d %c%*[ \t]%d\n", &num, &type, &qp); + + if (num > pic_org.poc || ret == EOF) + { + fseek(qpfile, filePos, SEEK_SET); + break; + } + if (num < pic_org.poc && ret >= 2) + continue; + if (ret == 3 && qp >= 0) + pic_org.forceqp = qp + 1; + if (type == 'I') pic_org.sliceType = X265_TYPE_IDR; + else if (type == 'i') pic_org.sliceType = X265_TYPE_I; + else if (type == 'P') pic_org.sliceType = X265_TYPE_P; + else if (type == 'B') pic_org.sliceType = X265_TYPE_BREF; + else if (type == 'b') pic_org.sliceType = X265_TYPE_B; + else ret = 0; + if (ret < 2 || qp < -1 || qp > 51) + return 0; + } + return 1; +} + +int main(int argc, char **argv) +{ +#if HAVE_VLD + // This uses Microsoft's proprietary WCHAR type, but this only builds on Windows to start with + VLDSetReportOptions(VLD_OPT_REPORT_TO_DEBUGGER | VLD_OPT_REPORT_TO_FILE, L"x265_leaks.txt"); +#endif + PPA_INIT(); + + x265_param *param = x265_param_alloc(); + CLIOptions cliopt; + + if (cliopt.parse(argc, argv, param)) + { + cliopt.destroy(); + x265_param_free(param); + exit(1); + } + + x265_encoder *encoder = x265_encoder_open(param); + if (!encoder) + { + 
x265_log(param, X265_LOG_ERROR, "failed to open encoder\n"); + cliopt.destroy(); + x265_param_free(param); + x265_cleanup(); + exit(1); + } + + /* get the encoder parameters post-initialization */ + x265_encoder_parameters(encoder, param); + + /* Control-C handler */ + if (signal(SIGINT, sigint_handler) == SIG_ERR) + x265_log(param, X265_LOG_ERROR, "Unable to register CTRL+C handler: %s\n", strerror(errno)); + + x265_picture pic_orig, pic_out; + x265_picture *pic_in = &pic_orig; + x265_picture *pic_recon = cliopt.recon ? &pic_out : NULL; + uint32_t inFrameCount = 0; + uint32_t outFrameCount = 0; + x265_nal *p_nal; + x265_stats stats; + uint32_t nal; + int16_t *errorBuf = NULL; + + if (!param->bRepeatHeaders) + { + if (x265_encoder_headers(encoder, &p_nal, &nal) < 0) + { + x265_log(param, X265_LOG_ERROR, "Failure generating stream headers\n"); + goto fail; + } + else + cliopt.writeNALs(p_nal, nal); + } + + x265_picture_init(param, pic_in); + + if (param->analysisMode && !pic_recon) + { + x265_log(NULL, X265_LOG_ERROR, "Must specify recon with analysis-mode option.\n"); + goto fail; + } + if (param->analysisMode) + { + if (param->analysisMode == X265_ANALYSIS_SAVE) + { + char *p = x265_param2string(param); + if (!p) + { + x265_log(NULL, X265_LOG_ERROR, "analysis: buffer allocation failure, aborting"); + goto fail; + } + uint32_t numCU = pic_in->analysisData.numCUsInFrame; + uint32_t numPart = pic_in->analysisData.numPartitions; + + cliopt.analysisRecordSize = ((sizeof(int) * 4 + sizeof(uint32_t) * 2) + sizeof(x265_inter_data) * numCU * 85 + + sizeof(uint8_t) * 2 * numPart * numCU + sizeof(char) * numPart * numCU + sizeof(int) * numCU + sizeof(uint32_t) * numCU); + + fprintf(cliopt.analysisFile, "#options: %s\n", p); + cliopt.analysisHeaderSize = ftell(cliopt.analysisFile); + X265_FREE(p); + } + else + { + if (!cliopt.validateFanout(param)) + goto fail; + } + } + + if (cliopt.bDither) + { + errorBuf = X265_MALLOC(int16_t, param->sourceWidth + 1); + if (errorBuf) + 
memset(errorBuf, 0, (param->sourceWidth + 1) * sizeof(int16_t)); + else + cliopt.bDither = false; + } + + // main encoder loop + while (pic_in && !b_ctrl_c) + { + pic_orig.poc = inFrameCount; + if (cliopt.qpfile && !param->rc.bStatRead) + { + if (!cliopt.parseQPFile(pic_orig)) + { + x265_log(NULL, X265_LOG_ERROR, "can't parse qpfile for frame %d\n", pic_in->poc); + fclose(cliopt.qpfile); + cliopt.qpfile = NULL; + } + } + + if (cliopt.framesToBeEncoded && inFrameCount >= cliopt.framesToBeEncoded) + pic_in = NULL; + else if (cliopt.input->readPicture(pic_orig)) + inFrameCount++; + else + pic_in = NULL; + + if (pic_in) + { + if (pic_in->bitDepth > X265_DEPTH && cliopt.bDither) + { + ditherImage(*pic_in, param->sourceWidth, param->sourceHeight, errorBuf, X265_DEPTH); + pic_in->bitDepth = X265_DEPTH; + } + if (param->analysisMode) + { + x265_alloc_analysis_data(pic_in); + + if (param->analysisMode == X265_ANALYSIS_LOAD) + cliopt.readAnalysisFile(pic_in, param); + } + } + + int numEncoded = x265_encoder_encode(encoder, &p_nal, &nal, pic_in, pic_recon); + if (numEncoded < 0) + { + b_ctrl_c = 1; + break; + } + outFrameCount += numEncoded; + if (numEncoded && pic_recon) + { + cliopt.recon->writePicture(pic_out); + if (param->analysisMode == X265_ANALYSIS_SAVE) + cliopt.writeAnalysisFile(pic_recon, param); + if (param->analysisMode) + x265_free_analysis_data(pic_recon); + } + + if (nal) + cliopt.writeNALs(p_nal, nal); + + // Because x265_encoder_encode() lazily encodes entire GOPs, updates are per-GOP + cliopt.printStatus(outFrameCount, param); + } + + /* Flush the encoder */ + while (!b_ctrl_c) + { + uint32_t numEncoded = x265_encoder_encode(encoder, &p_nal, &nal, NULL, pic_recon); + outFrameCount += numEncoded; + if (numEncoded && pic_recon) + { + cliopt.recon->writePicture(pic_out); + if (param->analysisMode == X265_ANALYSIS_SAVE) + cliopt.writeAnalysisFile(pic_recon, param); + if (param->analysisMode) + x265_free_analysis_data(pic_recon); + } + + if (nal) + 
cliopt.writeNALs(p_nal, nal); + + cliopt.printStatus(outFrameCount, param); + + if (!numEncoded) + break; + } + + /* clear progress report */ + if (cliopt.bProgress) + fprintf(stderr, "%*s\r", 80, " "); + +fail: + x265_encoder_get_stats(encoder, &stats, sizeof(stats)); + if (param->csvfn && !b_ctrl_c) + x265_encoder_log(encoder, argc, argv); + x265_encoder_close(encoder); + cliopt.bitstreamFile.close(); + + if (b_ctrl_c) + fprintf(stderr, "aborted at input frame %d, output frame %d\n", + cliopt.seek + inFrameCount, stats.encodedPictureCount); + + if (stats.encodedPictureCount) + { + printf("\nencoded %d frames in %.2fs (%.2f fps), %.2f kb/s", stats.encodedPictureCount, + stats.elapsedEncodeTime, stats.encodedPictureCount / stats.elapsedEncodeTime, stats.bitrate); + + if (param->bEnablePsnr) + printf(", Global PSNR: %.3f", stats.globalPsnr); + + if (param->bEnableSsim) + printf(", SSIM Mean Y: %.7f (%6.3f dB)", stats.globalSsim, x265_ssim2dB(stats.globalSsim)); + + printf("\n"); + } + else + { + printf("\nencoded 0 frames\n"); + } + + x265_cleanup(); /* Free library singletons */ + + cliopt.destroy(); + + x265_param_free(param); + + X265_FREE(errorBuf); + +#if HAVE_VLD + assert(VLDReportLeaks() == 0); +#endif + return 0; +} diff --git a/source/x265.def.in b/source/x265.def.in new file mode 100644 index 0000000..e78bfc1 --- /dev/null +++ b/source/x265.def.in @@ -0,0 +1,24 @@ +EXPORTS +x265_encoder_open_${X265_BUILD} +x265_setup_primitives +x265_param_default +x265_param_default_preset +x265_param_parse +x265_param_alloc +x265_param_free +x265_picture_init +x265_picture_alloc +x265_picture_free +x265_alloc_analysis_data +x265_free_analysis_data +x265_param_apply_profile +x265_max_bit_depth +x265_version_str +x265_build_info_str +x265_encoder_headers +x265_encoder_parameters +x265_encoder_encode +x265_encoder_get_stats +x265_encoder_log +x265_encoder_close +x265_cleanup diff --git a/source/x265.h b/source/x265.h new file mode 100644 index 0000000..e5474b7 --- /dev/null 
+++ b/source/x265.h @@ -0,0 +1,1151 @@ +/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#ifndef X265_H +#define X265_H + +#include +#include "x265_config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* x265_encoder: + * opaque handler for encoder */ +typedef struct x265_encoder x265_encoder; + +/* Application developers planning to link against a shared library version of + * libx265 from a Microsoft Visual Studio or similar development environment + * will need to define X265_API_IMPORTS before including this header. + * This clause does not apply to MinGW, similar development environments, or non + * Windows platforms. 
*/ +#ifdef X265_API_IMPORTS +#define X265_API __declspec(dllimport) +#else +#define X265_API +#endif + +typedef enum +{ + NAL_UNIT_CODED_SLICE_TRAIL_N = 0, + NAL_UNIT_CODED_SLICE_TRAIL_R, + NAL_UNIT_CODED_SLICE_TSA_N, + NAL_UNIT_CODED_SLICE_TLA_R, + NAL_UNIT_CODED_SLICE_STSA_N, + NAL_UNIT_CODED_SLICE_STSA_R, + NAL_UNIT_CODED_SLICE_RADL_N, + NAL_UNIT_CODED_SLICE_RADL_R, + NAL_UNIT_CODED_SLICE_RASL_N, + NAL_UNIT_CODED_SLICE_RASL_R, + NAL_UNIT_CODED_SLICE_BLA_W_LP = 16, + NAL_UNIT_CODED_SLICE_BLA_W_RADL, + NAL_UNIT_CODED_SLICE_BLA_N_LP, + NAL_UNIT_CODED_SLICE_IDR_W_RADL, + NAL_UNIT_CODED_SLICE_IDR_N_LP, + NAL_UNIT_CODED_SLICE_CRA, + NAL_UNIT_VPS = 32, + NAL_UNIT_SPS, + NAL_UNIT_PPS, + NAL_UNIT_ACCESS_UNIT_DELIMITER, + NAL_UNIT_EOS, + NAL_UNIT_EOB, + NAL_UNIT_FILLER_DATA, + NAL_UNIT_PREFIX_SEI, + NAL_UNIT_SUFFIX_SEI, + NAL_UNIT_INVALID = 64, +} NalUnitType; + +/* The data within the payload is already NAL-encapsulated; the type is merely + * in the struct for easy access by the calling application. All data returned + * in an x265_nal, including the data in payload, is no longer valid after the + * next call to x265_encoder_encode. Thus it must be used or copied before + * calling x265_encoder_encode again. 
*/ +typedef struct x265_nal +{ + uint32_t type; /* NalUnitType */ + uint32_t sizeBytes; /* size in bytes */ + uint8_t* payload; +} x265_nal; + +/* Stores inter (motion estimation) analysis data for a single frame */ +typedef struct x265_inter_data +{ + uint32_t zOrder; + int ref[2]; + int costZero[2]; + int16_t mvx[2]; + int16_t mvy[2]; + uint32_t depth; + int poc; + uint32_t cuAddr; +} x265_inter_data; + +/* Stores intra (motion estimation) analysis data for a single frame */ +typedef struct x265_intra_data +{ + uint8_t* depth; + uint8_t* modes; + char* partSizes; + int* poc; + uint32_t* cuAddr; +} x265_intra_data; + +/* Stores all analysis data for a single frame */ +typedef struct x265_analysis_data +{ + x265_inter_data* interData; + x265_intra_data* intraData; + uint32_t numCUsInFrame; + uint32_t numPartitions; +} x265_analysis_data; + +/* Used to pass pictures into the encoder, and to get picture data back out of + * the encoder. The input and output semantics are different */ +typedef struct x265_picture +{ + /* Must be specified on input pictures, the number of planes is determined + * by the colorSpace value */ + void* planes[3]; + + /* Stride is the number of bytes between row starts */ + int stride[3]; + + /* Must be specified on input pictures. x265_picture_init() will set it to + * the encoder's internal bit depth, but this field must describe the depth + * of the input pictures. Must be between 8 and 16. Values larger than 8 + * imply 16bits per input sample. If input bit depth is larger than the + * internal bit depth, the encoder will down-shift pixels. Input samples + * larger than 8bits will be masked to internal bit depth. On output the + * bitDepth will be the internal encoder bit depth */ + int bitDepth; + + /* Must be specified on input pictures: X265_TYPE_AUTO or other. 
+ * x265_picture_init() sets this to auto, returned on output */ + int sliceType; + + /* Ignored on input, set to picture count, returned on output */ + int poc; + + /* Must be specified on input pictures: X265_CSP_I420 or other. It must + * match the internal color space of the encoder. x265_picture_init() will + * initialize this value to the internal color space */ + int colorSpace; + + /* presentation time stamp: user-specified, returned on output */ + int64_t pts; + + /* display time stamp: ignored on input, copied from reordered pts. Returned + * on output */ + int64_t dts; + + /* The value provided on input is returned with the same picture (POC) on + * output */ + void* userData; + + /* force quantizer for != X265_QP_AUTO */ + int forceqp; + + /* If param.analysisMode is X265_ANALYSIS_OFF this field is ignored on input + * and output. Else the user must call x265_alloc_analysis_data() to + * allocate analysis buffers for every picture passed to the encoder. + * + * On input when param.analysisMode is X265_ANALYSIS_LOAD and analysisData + * member pointers are valid, the encoder will use the data stored here to + * reduce encoder work. 
+ * + * On output when param.analysisMode is X265_ANALYSIS_SAVE and analysisData + * member pointers are valid, the encoder will write output analysis into + * this data structure */ + x265_analysis_data analysisData; + + /* new data members to this structure must be added to the end so that + * users of x265_picture_alloc/free() can be assured of future safety */ +} x265_picture; + +typedef enum +{ + X265_DIA_SEARCH, + X265_HEX_SEARCH, + X265_UMH_SEARCH, + X265_STAR_SEARCH, + X265_FULL_SEARCH +} X265_ME_METHODS; + +/* CPU flags */ + +/* x86 */ +#define X265_CPU_CMOV 0x0000001 +#define X265_CPU_MMX 0x0000002 +#define X265_CPU_MMX2 0x0000004 /* MMX2 aka MMXEXT aka ISSE */ +#define X265_CPU_MMXEXT X265_CPU_MMX2 +#define X265_CPU_SSE 0x0000008 +#define X265_CPU_SSE2 0x0000010 +#define X265_CPU_SSE3 0x0000020 +#define X265_CPU_SSSE3 0x0000040 +#define X265_CPU_SSE4 0x0000080 /* SSE4.1 */ +#define X265_CPU_SSE42 0x0000100 /* SSE4.2 */ +#define X265_CPU_LZCNT 0x0000200 /* Phenom support for "leading zero count" instruction. */ +#define X265_CPU_AVX 0x0000400 /* AVX support: requires OS support even if YMM registers aren't used. 
*/ +#define X265_CPU_XOP 0x0000800 /* AMD XOP */ +#define X265_CPU_FMA4 0x0001000 /* AMD FMA4 */ +#define X265_CPU_AVX2 0x0002000 /* AVX2 */ +#define X265_CPU_FMA3 0x0004000 /* Intel FMA3 */ +#define X265_CPU_BMI1 0x0008000 /* BMI1 */ +#define X265_CPU_BMI2 0x0010000 /* BMI2 */ +/* x86 modifiers */ +#define X265_CPU_CACHELINE_32 0x0020000 /* avoid memory loads that span the border between two cachelines */ +#define X265_CPU_CACHELINE_64 0x0040000 /* 32/64 is the size of a cacheline in bytes */ +#define X265_CPU_SSE2_IS_SLOW 0x0080000 /* avoid most SSE2 functions on Athlon64 */ +#define X265_CPU_SSE2_IS_FAST 0x0100000 /* a few functions are only faster on Core2 and Phenom */ +#define X265_CPU_SLOW_SHUFFLE 0x0200000 /* The Conroe has a slow shuffle unit (relative to overall SSE performance) */ +#define X265_CPU_STACK_MOD4 0x0400000 /* if stack is only mod4 and not mod16 */ +#define X265_CPU_SLOW_CTZ 0x0800000 /* BSR/BSF x86 instructions are really slow on some CPUs */ +#define X265_CPU_SLOW_ATOM 0x1000000 /* The Atom is terrible: slow SSE unaligned loads, slow + * SIMD multiplies, slow SIMD variable shifts, slow pshufb, + * cacheline split penalties -- gather everything here that + * isn't shared by other CPUs to avoid making half a dozen + * new SLOW flags. 
*/ +#define X265_CPU_SLOW_PSHUFB 0x2000000 /* such as on the Intel Atom */ +#define X265_CPU_SLOW_PALIGNR 0x4000000 /* such as on the AMD Bobcat */ + +/* ARM */ +#define X265_CPU_ARMV6 0x0000001 +#define X265_CPU_NEON 0x0000002 /* ARM NEON */ +#define X265_CPU_FAST_NEON_MRC 0x0000004 /* Transfer from NEON to ARM register is fast (Cortex-A9) */ + +#define X265_MAX_SUBPEL_LEVEL 7 + +/* Log level */ +#define X265_LOG_NONE (-1) +#define X265_LOG_ERROR 0 +#define X265_LOG_WARNING 1 +#define X265_LOG_INFO 2 +#define X265_LOG_DEBUG 3 +#define X265_LOG_FULL 4 + +#define X265_B_ADAPT_NONE 0 +#define X265_B_ADAPT_FAST 1 +#define X265_B_ADAPT_TRELLIS 2 + +#define X265_BFRAME_MAX 16 + +#define X265_TYPE_AUTO 0x0000 /* Let x265 choose the right type */ +#define X265_TYPE_IDR 0x0001 +#define X265_TYPE_I 0x0002 +#define X265_TYPE_P 0x0003 +#define X265_TYPE_BREF 0x0004 /* Non-disposable B-frame */ +#define X265_TYPE_B 0x0005 +#define X265_QP_AUTO 0 + +#define X265_AQ_NONE 0 +#define X265_AQ_VARIANCE 1 +#define X265_AQ_AUTO_VARIANCE 2 +#define IS_X265_TYPE_I(x) ((x) == X265_TYPE_I || (x) == X265_TYPE_IDR) +#define IS_X265_TYPE_B(x) ((x) == X265_TYPE_B || (x) == X265_TYPE_BREF) + +/* NOTE! For this release only X265_CSP_I420 and X265_CSP_I444 are supported */ + +/* Supported internal color space types (according to semantics of chroma_format_idc) */ +#define X265_CSP_I400 0 /* yuv 4:0:0 planar */ +#define X265_CSP_I420 1 /* yuv 4:2:0 planar */ +#define X265_CSP_I422 2 /* yuv 4:2:2 planar */ +#define X265_CSP_I444 3 /* yuv 4:4:4 planar */ +#define X265_CSP_COUNT 4 /* Number of supported internal color spaces */ + +/* These color spaces will eventually be supported as input pictures. 
The pictures will + * be converted to the appropriate planar color spaces at ingest */ +#define X265_CSP_NV12 4 /* yuv 4:2:0, with one y plane and one packed u+v */ +#define X265_CSP_NV16 5 /* yuv 4:2:2, with one y plane and one packed u+v */ + +/* Interleaved color-spaces may eventually be supported as input pictures */ +#define X265_CSP_BGR 6 /* packed bgr 24bits */ +#define X265_CSP_BGRA 7 /* packed bgr 32bits */ +#define X265_CSP_RGB 8 /* packed rgb 24bits */ +#define X265_CSP_MAX 9 /* end of list */ + +#define X265_EXTENDED_SAR 255 /* aspect ratio explicitly specified as width:height */ + +/* Analysis options */ +#define X265_ANALYSIS_OFF 0 +#define X265_ANALYSIS_SAVE 1 +#define X265_ANALYSIS_LOAD 2 + +typedef struct +{ + int planes; + int width[3]; + int height[3]; +} x265_cli_csp; + +static const x265_cli_csp x265_cli_csps[] = +{ + { 1, { 0, 0, 0 }, { 0, 0, 0 } }, /* i400 */ + { 3, { 0, 1, 1 }, { 0, 1, 1 } }, /* i420 */ + { 3, { 0, 1, 1 }, { 0, 0, 0 } }, /* i422 */ + { 3, { 0, 0, 0 }, { 0, 0, 0 } }, /* i444 */ + { 2, { 0, 0 }, { 0, 1 } }, /* nv12 */ + { 2, { 0, 0 }, { 0, 0 } }, /* nv16 */ +}; + +/* rate tolerance method */ +typedef enum +{ + X265_RC_ABR, + X265_RC_CQP, + X265_RC_CRF +} X265_RC_METHODS; + +/* Output statistics from encoder */ +typedef struct x265_stats +{ + double globalPsnrY; + double globalPsnrU; + double globalPsnrV; + double globalPsnr; + double globalSsim; + double elapsedEncodeTime; /* wall time since encoder was opened */ + double elapsedVideoTime; /* encoded picture count / frame rate */ + double bitrate; /* accBits / elapsed video time */ + uint32_t encodedPictureCount; /* number of output pictures thus far */ + uint32_t totalWPFrames; /* number of uni-directional weighted frames used */ + uint64_t accBits; /* total bits output thus far */ + + /* new statistic member variables must be added below this line */ +} x265_stats; + +/* String values accepted by x265_param_parse() (and CLI) for various parameters */ +static const char * 
const x265_motion_est_names[] = { "dia", "hex", "umh", "star", "full", 0 }; +static const char * const x265_source_csp_names[] = { "i400", "i420", "i422", "i444", "nv12", "nv16", 0 }; +static const char * const x265_video_format_names[] = { "component", "pal", "ntsc", "secam", "mac", "undef", 0 }; +static const char * const x265_fullrange_names[] = { "limited", "full", 0 }; +static const char * const x265_colorprim_names[] = { "", "bt709", "undef", "", "bt470m", "bt470bg", "smpte170m", "smpte240m", "film", "bt2020", 0 }; +static const char * const x265_transfer_names[] = { "", "bt709", "undef", "", "bt470m", "bt470bg", "smpte170m", "smpte240m", "linear", "log100", + "log316", "iec61966-2-4", "bt1361e", "iec61966-2-1", "bt2020-10", "bt2020-12", 0 }; +static const char * const x265_colmatrix_names[] = { "GBR", "bt709", "undef", "", "fcc", "bt470bg", "smpte170m", "smpte240m", + "YCgCo", "bt2020nc", "bt2020c", 0 }; +static const char * const x265_sar_names[] = { "undef", "1:1", "12:11", "10:11", "16:11", "40:33", "24:11", "20:11", + "32:11", "80:33", "18:11", "15:11", "64:33", "160:99", "4:3", "3:2", "2:1", 0 }; +static const char * const x265_interlace_names[] = { "prog", "tff", "bff", 0 }; +static const char * const x265_analysis_names[] = { "off", "save", "load", 0 }; + +/* x265 input parameters + * + * For version safety you may use x265_param_alloc/free() to manage the + * allocation of x265_param instances, and x265_param_parse() to assign values + * by name. By never dereferencing param fields in your own code you can treat + * x265_param as an opaque data structure */ +typedef struct x265_param +{ + /*== Encoder Environment ==*/ + + /* x265_param_default() will auto-detect this cpu capability bitmap. it is + * recommended to not change this value unless you know the cpu detection is + * somehow flawed on your target hardware. 
The asm function tables are + * process global, the first encoder configures them for all encoders */ + int cpuid; + + /* Enable wavefront parallel processing, greatly increases parallelism for + * less than 1% compression efficiency loss */ + int bEnableWavefront; + + /* Number of threads to allocate for the process global thread pool, if no + * thread pool has yet been created. 0 implies auto-detection. By default + * x265 will try to allocate one worker thread per CPU core */ + int poolNumThreads; + + /* Number of concurrently encoded frames, 0 implies auto-detection. By + * default x265 will use a number of frame threads emperically determined to + * be optimal for your CPU core count, between 2 and 6. Using more than one + * frame thread causes motion search in the down direction to be clamped but + * otherwise encode behavior is unaffected. With CQP rate control the output + * bitstream is deterministic for all values of frameNumThreads greater than + * 1. All other forms of rate-control can be negatively impacted by + * increases to the number of frame threads because the extra concurrency + * adds uncertainty to the bitrate estimations. There is no limit to the + * number of frame threads you use for each encoder, but frame parallelism + * is generally limited by the the number of CU rows */ + int frameNumThreads; + + /* Use multiple threads to measure CU mode costs. Recommended for many core + * CPUs. On RD levels less than 5, it may not offload enough work to warrant + * the overhead. It is useful with the slow preset since it has the + * rectangular predictions enabled. At RD level 5 and 6 (preset slower and + * below), this feature should be an unambiguous win if you have CPU + * cores available for work. Default disabled */ + int bDistributeModeAnalysis; + + /* Use multiple threads to perform motion estimation to (ME to one reference + * per thread). Recommended for many core CPUs. 
The more references the more + * motion searches there will be to distribute. This option is often not a + * win, particularly in video sequences with low motion. Default disabled */ + int bDistributeMotionEstimation; + + /* The level of logging detail emitted by the encoder. X265_LOG_NONE to + * X265_LOG_FULL, default is X265_LOG_INFO */ + int logLevel; + + /* Enable analysis and logging distribution of Cus encoded across various + * modes during mode decision. Default disabled */ + int bLogCuStats; + + /* Enable the measurement and reporting of PSNR. Default is enabled */ + int bEnablePsnr; + + /* Enable the measurement and reporting of SSIM. Default is disabled */ + int bEnableSsim; + + /* filename of CSV log. If logLevel is X265_LOG_DEBUG, the encoder will emit + * per-slice statistics to this log file in encode order. Otherwise the + * encoder will emit per-stream statistics into the log file when + * x265_encoder_log is called (presumably at the end of the encode) */ + const char *csvfn; + + /* Enable the generation of SEI messages for each encoded frame containing + * the hashes of the three reconstructed picture planes. Most decoders will + * validate those hashes against the reconstructed images it generates and + * report any mismatches. This is essentially a debugging feature. Hash + * types are MD5(1), CRC(2), Checksum(3). Default is 0, none */ + int decodedPictureHashSEI; + + /*== Internal Picture Specification ==*/ + + /* Internal encoder bit depth. If x265 was compiled to use 8bit pixels + * (HIGH_BIT_DEPTH=0), this field must be 8, else this field must be 10. + * Future builds may support 12bit pixels. */ + int internalBitDepth; + + /* Color space of internal pictures. Only X265_CSP_I420 and X265_CSP_I444 + * are supported. 
Eventually, i422 will also be supported as an internal + * color space and other packed formats will be supported in + * x265_picture.colorSpace */ + int internalCsp; + + /* Numerator and denominator of frame rate */ + uint32_t fpsNum; + uint32_t fpsDenom; + + /* Width (in pixels) of the source pictures. If this width is not an even + * multiple of 4, the encoder will pad the pictures internally to meet this + * minimum requirement. All valid HEVC widths are supported */ + int sourceWidth; + + /* Height (in pixels) of the source pictures. If this height is not an even + * multiple of 4, the encoder will pad the pictures internally to meet this + * minimum requirement. All valid HEVC heights are supported */ + int sourceHeight; + + /* Minimum decoder requirement level. Defaults to 0, which implies auto- + * detection by the encoder. If specified, the encoder will attempt to bring + * the encode specifications within that specified level. If the encoder is + * unable to reach the level it issues a warning and emits the actual + * decoder requirement. If the requested requirement level is higher than + * the actual level, the actual requirement level is signaled. The value is + * an specified as an integer with the level times 10, for example level + * "5.1" is specified as 51, and level "5.0" is specified as 50. */ + int levelIdc; + + /* if levelIdc is specified (non-zero) this flag will differentiate between + * Main (0) and High (1) tier. Default is Main tier (0) */ + int bHighTier; + + /* Interlace type of source pictures. 0 - progressive pictures (default). + * 1 - top field first, 2 - bottom field first. HEVC encodes interlaced + * content as fields, they must be provided to the encoder in the correct + * temporal order. EXPERIMENTAL */ + int interlaceMode; + + /* Flag indicating whether VPS, SPS and PPS headers should be output with + * each keyframe. 
Default false */ + int bRepeatHeaders; + + /* Flag indicating whether the encoder should emit an Access Unit Delimiter + * NAL at the start of every access unit. Default false */ + int bEnableAccessUnitDelimiters; + + /* Enables the buffering period SEI and picture timing SEI to signal the HRD + * parameteres. Default is disabled */ + int bEmitHRDSEI; + + /* Enables the emission of a user data SEI with the stream headers which + * describes the encoder version, build info, and parameters. This is + * very helpful for debugging, but may interfere with regression tests. + * Default enabled */ + int bEmitInfoSEI; + + /*== Coding Unit (CU) definitions ==*/ + + /* Maxiumum CU width and height in pixels. The size must be 64, 32, or 16. + * The higher the size, the more efficiently x265 can encode areas of low + * complexity, greatly improving compression efficiency at large + * resolutions. The smaller the size, the more effective wavefront and + * frame parallelism will become because of the increase in rows. default 64 */ + uint32_t maxCUSize; + + /* The additional depth the residual quadtree is allowed to recurse beyond + * the coding quadtree, for inter coded blocks. This must be between 1 and + * 3. The higher the value the more efficiently the residual can be + * compressed by the DCT transforms, at the expense of much more compute */ + uint32_t tuQTMaxInterDepth; + + /* The additional depth the residual quadtree is allowed to recurse beyond + * the coding quadtree, for intra coded blocks. This must be between 1 and + * 3. The higher the value the more efficiently the residual can be + * compressed by the DCT transforms, at the expense of much more compute */ + uint32_t tuQTMaxIntraDepth; + + /*== GOP Structure and Lokoahead ==*/ + + /* Enable open GOP - meaning I slices are not necessariy IDR and thus frames + * encoded after an I slice may reference frames encoded prior to the I + * frame which have remained in the decoded picture buffer. 
Open GOP + * generally has better compression efficiency and negligable encoder + * performance impact, but the use case may preclude it. Default true */ + int bOpenGOP; + + /* Scenecuts closer together than this are coded as I, not IDR. */ + int keyframeMin; + + /* Maximum keyframe distance or intra period in number of frames. If 0 or 1, + * all frames are I frames. A negative value is casted to MAX_INT internally + * which effectively makes frame 0 the only I frame. Default is 250 */ + int keyframeMax; + + /* The maximum number of L0 references a P or B slice may use. This + * influences the size of the decoded picture buffer. The higher this + * number, the more reference frames there will be available for motion + * search, improving compression efficiency of most video at a cost of + * performance. Value must be between 1 and 16, default is 3 */ + int maxNumReferences; + + /* Sets the operating mode of the lookahead. With b-adapt 0, the GOP + * structure is fixed based on the values of keyframeMax and bframes. + * With b-adapt 1 a light lookahead is used to chose B frame placement. + * With b-adapt 2 (trellis) a viterbi B path selection is performed */ + int bFrameAdaptive; + + /* Maximum consecutive B frames that can be emitted by the lookehead. When + * b-adapt is 0 and keyframMax is greater than bframes, the lookahead emits + * a fixed pattern of `bframes` B frames between each P. With b-adapt 1 the + * lookahead ignores the value of bframes for the most part. With b-adapt 2 + * the value of bframes determines the search (POC) distance performeed in + * both directions, quadradically increasing the compute load of the + * lookahead. The higher the value, the more B frames the lookahead may + * possibly use consecutively, usually improving compression. Default is 3, + * maximum is 16 */ + int bframes; + + /* Total Number of frames to be encoded, caclulated from the user input + * (--frames) and (--seek). 
In case, the input is read from a pipe, this can + * remain as 0. It is later used in 2 pass RateControl, hence storing the + * value in param */ + int totalFrames; + + /* When enabled, the encoder will use the B frame in the middle of each + * mini-GOP larger than 2 B frames as a motion reference for the surrounding + * B frames. This improves compression efficiency for a small performance + * penalty. Referenced B frames are treated somewhere between a B and a P + * frame by rate control. Default is enabled. */ + int bBPyramid; + + /* The number of frames that must be queued in the lookahead before it may + * make slice decisions. Increasing this value directly increases the encode + * latency. The longer the queue the more optimally the lookahead may make + * slice decisions, particularly with b-adapt 2. When mb-tree is enabled, + * the length of the queue linearly increases the effectiveness of the + * mb-tree analysis. Default is 40 frames, maximum is 250 */ + int lookaheadDepth; + + /* A value which is added to the cost estimate of B frames in the lookahead. + * It may be a positive value (making B frames appear more expensive, which + * causes the lookahead to chose more P frames) or negative, which makes the + * lookahead chose more B frames. Default is 0, there are no limits */ + int bFrameBias; + + /* An arbitrary threshold which determines how agressively the lookahead + * should detect scene cuts. The default (40) is recommended. */ + int scenecutThreshold; + + /*== Intra Coding Tools ==*/ + + /* Enable constrained intra prediction. This causes intra prediction to + * input samples that were inter predicted. For some use cases this is + * believed to me more robust to stream errors, but it has a compression + * penalty on P and (particularly) B slices. Defaults to diabled */ + int bEnableConstrainedIntra; + + /* Enable strong intra smoothing for 32x32 blocks where the reference + * samples are flat. 
It may or may not improve compression efficiency, + * depending on your source material. Defaults to disabled */ + int bEnableStrongIntraSmoothing; + + /* Use a faster search method to find the best intra mode. Default is 0 */ + int bEnableFastIntra; + + /*== Inter Coding Tools ==*/ + + /* ME search method (DIA, HEX, UMH, STAR, FULL). The search patterns + * (methods) are sorted in increasing complexity, with diamond being the + * simplest and fastest and full being the slowest. DIA, HEX, and UMH were + * adapted from x264 directly. STAR is an adaption of the HEVC reference + * encoder's three step search, while full is a naive exhaustive search. The + * default is the star search, it has a good balance of performance and + * compression efficiecy */ + int searchMethod; + + /* A value between 0 and X265_MAX_SUBPEL_LEVEL which adjusts the amount of + * effort performed during subpel refine. Default is 5 */ + int subpelRefine; + + /* The maximum distance from the motion prediction that the full pel motion + * search is allowed to progress before terminating. This value can have an + * effect on frame parallelism, as referenced frames must be at least this + * many rows of reconstructed pixels ahead of the referencee at all times. + * (When considering reference lag, the motion prediction must be ignored + * because it cannot be known ahead of time). Default is 60, which is the + * default max CU size (64) minus the luma HPEL half-filter length (4). If a + * smaller CU size is used, the search range should be similarly reduced */ + int searchRange; + + /* The maximum number of merge candidates that are considered during inter + * analysis. This number (between 1 and 5) is signaled in the stream + * headers and determines the number of bits required to signal a merge so + * it can have significant trade-offs. The smaller this number the higher + * the performance but the less compression efficiency. 
Default is 3 */ + uint32_t maxNumMergeCand; + + /* Disable availability of temporal motion vector for AMVP */ + int bEnableTemporalMvp; + + /* Enable weighted prediction in P slices. This enables weighting analysis + * in the lookahead, which influences slice decisions, and enables weighting + * analysis in the main encoder which allows P reference samples to have a + * weight function applied to them prior to using them for motion + * compensation. In video which has lighting changes, it can give a large + * improvement in compression efficiency. Default is enabled */ + int bEnableWeightedPred; + + /* Enable weighted prediction in B slices. Default is disabled */ + int bEnableWeightedBiPred; + + /*== Analysis tools ==*/ + + /* Enable asymmetrical motion predictions. At CU depths 64, 32, and 16, it + * is possible to use 25%/75% split partitions in the up, down, right, left + * directions. For some material this can improve compression efficiency at + * the cost of extra analysis. bEnableRectInter must be enabled for this + * feature to be used. Default enabled */ + int bEnableAMP; + + /* Enable rectangular motion prediction partitions (vertical and + * horizontal), available at all CU depths from 64x64 to 8x8. Default is + * enabled */ + int bEnableRectInter; + + /* Enable the use of `coded block flags` (flags set to true when a residual + * has been coded for a given block) to avoid intra analysis in likely skip + * blocks. Default is disabled */ + int bEnableCbfFastMode; + + /* Enable early skip decisions to avoid intra and inter analysis in likely + * skip blocks. Default is disabled */ + int bEnableEarlySkip; + + /* Apply an optional penalty to the estimated cost of 32x32 intra blocks in + * non-intra slices. 0 is disabled, 1 enables a small penalty, and 2 enables + * a full penalty. This favors inter-coding and its low bitrate over + * potential increases in distortion, but usually improves performance. 
+ * Default is 0 */ + int rdPenalty; + + /* A value between X265_NO_RDO_NO_RDOQ and X265_RDO_LEVEL which determines + * the level of rate distortion optimizations to perform during mode + * decisions and quantization. The more RDO the better the compression + * efficiency at a major cost of performance. Default is no RDO (0) */ + int rdLevel; + + /* Psycho-visual rate-distortion strength. Only has an effect in presets + * which use RDO. It makes mode decision favor options which preserve the + * energy of the source, at the cost of lost compression. The value must + * be between 0 and 2.0, 1.0 is typical. Default 0.0 */ + double psyRd; + + /* Quantization scaling lists. HEVC supports 6 quantization scaling lists to + * be defined; one each for Y, Cb, Cr for intra prediction and one each for + * inter prediction. + * + * - NULL and "off" will disable quant scaling (default) + * - "default" will enable the HEVC default scaling lists, which + * do not need to be signaled since they are specified + * - all other strings indicate a filename containing custom scaling lists + * in the HM format. The encode will fail if the file is not parsed + * correctly. Custom lists must be signaled in the SPS. */ + const char *scalingLists; + + /* Strength of psycho-visual optimizations in quantization. Only has an + * effect in presets which use RDOQ (rd-levels 4 and 5). The value must be + * between 0 and 50, 1.0 is typical. Default 0.0 */ + double psyRdoq; + + /* If X265_ANALYSIS_SAVE, write per-frame analysis information into analysis + * buffers. if X265_ANALYSIS_LOAD, read analysis information into analysis + * buffer and use this analysis information to reduce the amount of work + * the encoder must perform. Default X265_ANALYSIS_OFF */ + int analysisMode; + + /*== Coding tools ==*/ + + /* Enable the implicit signaling of the sign bit of the last coefficient of + * each transform unit. 
This saves one bit per TU at the expense of figuring + * out which coefficient can be toggled with the least distortion. + * Default is enabled */ + int bEnableSignHiding; + + /* Allow intra coded blocks to be encoded directly as residual without the + * DCT transform, when this improves efficiency. Checking whether the block + * will benefit from this option incurs a performance penalty. Default is + * enabled */ + int bEnableTransformSkip; + + /* Enable a faster determination of whether skipping the DCT transform will + * be beneficial. Slight performance gain for some compression loss. Default + * is enabled */ + int bEnableTSkipFast; + + /* Enable the deblocking loop filter, which improves visual quality by + * reducing blocking effects at block edges, particularly at lower bitrates + * or higher QP. When enabled it adds another CU row of reference lag, + * reducing frame parallelism effectiveness. Default is enabled */ + int bEnableLoopFilter; + + /* Enable the Sample Adaptive Offset loop filter, which reduces distortion + * effects by adjusting reconstructed sample values based on histogram + * analysis to better approximate the original samples. When enabled it adds + * a CU row of reference lag, reducing frame parallelism effectiveness. + * Default is enabled */ + int bEnableSAO; + + /* Note: when deblocking and SAO are both enabled, the loop filter CU lag is + * only one row, as they operate in series on the same row. */ + + /* Select the method in which SAO deals with deblocking boundary pixels. If + * disabled the right and bottom boundary areas are skipped. If enabled, + * non-deblocked pixels are used entirely. Default is disabled */ + int bSaoNonDeblocked; + + /* Generally a small signed integer which offsets the QP used to quantize + * the Cb chroma residual (delta from luma QP specified by rate-control). 
+ * Default is 0, which is recommended */ + int cbQpOffset; + + /* Generally a small signed integer which offsets the QP used to quantize + * the Cr chroma residual (delta from luma QP specified by rate-control). + * Default is 0, which is recommended */ + int crQpOffset; + + /* Specify whether to attempt to encode intra modes in B frames. By default + * enabled, but only applicable for the presets which use rdLevel 5 or 6 + * (veryslow and placebo). All other presets will not try intra in B frames + * regardless of this setting. */ + int bIntraInBFrames; + + /* An integer value in range of 100 to 1000, which denotes strength of noise + * reduction */ + int noiseReduction; + + /* The lossless flag enables true lossless coding, by bypassing scaling, + * transform, quantization and in-loop filter processes. This is used for + * ultra-high bitrates with zero loss of quality. */ + int bLossless; + + /* The CU Lossless flag, when enabled, compares the rate-distortion costs + * for normal and lossless encoding, and chooses the best mode for each CU. + * If lossless mode is chosen, the cu-transquant-bypass flag is set for that + * CU. */ + int bCULossless; + + /*== Rate Control ==*/ + + struct + { + /* Explicit mode of rate-control, necessary for API users. It must + * be one of the X265_RC_METHODS enum values. */ + int rateControlMode; + + /* Base QP to use for Constant QP rate control. Adaptive QP may alter + * the QP used for each block. If a QP is specified on the command line + * CQP rate control is implied. Default: 32 */ + int qp; + + /* target bitrate for Average BitRate (ABR) rate control. If a non-zero + * bitrate is specified on the command line, ABR is implied. Default 0 */ + int bitrate; + + /* The degree of rate fluctuation that x265 tolerates. Rate tolerance is used + * along with overflow (difference between actual and target bitrate), to adjust + * qp. Default is 1.0 */ + double rateTolerance; + + /* qComp sets the quantizer curve compression factor. 
It weights the frame + * quantizer based on the complexity of residual (measured by lookahead). + * Default value is 0.6. Increasing it to 1 will effectively generate CQP */ + double qCompress; + + /* QP offset between I/P and P/B frames. Default ipfactor: 1.4 + * Default pbFactor: 1.3 */ + double ipFactor; + double pbFactor; + + /* Max QP difference between frames. Default: 4 */ + int qpStep; + + /* Ratefactor constant: targets a certain constant "quality". + * Acceptable values between 0 and 51. Default value: 28 */ + double rfConstant; + + /* Enable adaptive quantization. This mode distributes available bits between all + * macroblocks of a frame, assigning more bits to low complexity areas. Turning + * this ON will usually affect PSNR negatively, however SSIM and visual quality + * generally improves. Default: X265_AQ_AUTO_VARIANCE */ + int aqMode; + + /* Sets the strength of AQ bias towards low detail macroblocks. Valid only if + * AQ is enabled. Default value: 1.0. Acceptable values between 0.0 and 3.0 */ + double aqStrength; + + /* Sets the maximum rate the VBV buffer should be assumed to refill at + * Default is zero */ + int vbvMaxBitrate; + + /* Sets the size of the VBV buffer in kilobits. Default is zero */ + int vbvBufferSize; + + /* Sets how full the VBV buffer must be before playback starts. If it is less than + * 1, then the initial fill is vbv-init * vbvBufferSize. Otherwise, it is + * interpreted as the initial fill in kbits. Default is 0.9 */ + double vbvBufferInit; + + /* Enable CUTree ratecontrol. This keeps track of the CUs that propagate temporally + * across frames and assigns more bits to these CUs. Improves encode efficiency. + * Default: enabled */ + int cuTree; + + /* In CRF mode, maximum CRF as caused by VBV. 
0 implies no limit */ + double rfConstantMax; + + /* In CRF mode, minimum CRF as caused by VBV */ + double rfConstantMin; + + /* Two pass (INCOMPLETE) */ + /* Enable writing the stats in a multipass encode to the stat output file */ + int bStatWrite; + + /* Enable loading data from the stat input file in a multi pass encode */ + int bStatRead; + + /* Filename of the 2pass output/input stats file */ + char* statFileName; + + /* temporally blur quants */ + double qblur; + + /* temporally blur complexity */ + double complexityBlur; + + /* Enable slow and a more detailed first pass encode in multi pass rate control */ + int bEnableSlowFirstPass; + + /* specify a text file which contains MAX_MAX_QP + 1 floating point + * values to be copied into x265_lambda_tab and a second set of + * MAX_MAX_QP + 1 floating point values for x265_lambda2_tab. All values + * are separated by comma, space or newline. Text after a hash (#) is + * ignored. The lambda tables are process-global, so these new lambda + * values will affect all encoders in the same process */ + const char* lambdaFileName; + } rc; + + /*== Video Usability Information ==*/ + struct + { + /* Aspect ratio idc to be added to the VUI. The default is 0 indicating + * the aspect ratio is unspecified. If set to X265_EXTENDED_SAR then + * sarWidth and sarHeight must also be set */ + int aspectRatioIdc; + + /* Sample Aspect Ratio width in arbitrary units to be added to the VUI + * only if aspectRatioIdc is set to X265_EXTENDED_SAR. This is the width + * of an individual pixel. If this is set then sarHeight must also be set */ + int sarWidth; + + /* Sample Aspect Ratio height in arbitrary units to be added to the VUI. + * only if aspectRatioIdc is set to X265_EXTENDED_SAR. This is the height + * of an individual pixel. If this is set then sarWidth must also be set */ + int sarHeight; + + /* Enable overscan info present flag in the VUI. If this is set then + * bEnabledOverscanAppropriateFlag will be added to the VUI. 
The default + * is false */ + int bEnableOverscanInfoPresentFlag; + + /* Enable overscan appropriate flag. The status of this flag is added + * to the VUI only if bEnableOverscanInfoPresentFlag is set. If this + * flag is set then cropped decoded pictures may be output for display. + * The default is false */ + int bEnableOverscanAppropriateFlag; + + /* Video signal type present flag of the VUI. If this is set then + * videoFormat, bEnableVideoFullRangeFlag and + * bEnableColorDescriptionPresentFlag will be added to the VUI. The + * default is false */ + int bEnableVideoSignalTypePresentFlag; + + /* Video format of the source video. 0 = component, 1 = PAL, 2 = NTSC, + * 3 = SECAM, 4 = MAC, 5 = unspecified video format is the default */ + int videoFormat; + + /* Video full range flag indicates the black level and range of the luma + * and chroma signals as derived from E′Y, E′PB, and E′PR or E′R, E′G, + * and E′B real-valued component signals. The default is false */ + int bEnableVideoFullRangeFlag; + + /* Color description present flag in the VUI. If this is set then + * color_primaries, transfer_characteristics and matrix_coeffs are to be + * added to the VUI. The default is false */ + int bEnableColorDescriptionPresentFlag; + + /* Color primaries holds the chromaticity coordinates of the source + * primaries. The default is 2 */ + int colorPrimaries; + + /* Transfer characteristics indicates the opto-electronic transfer + * characteristic of the source picture. The default is 2 */ + int transferCharacteristics; + + /* Matrix coefficients used to derive the luma and chroma signals from + * the red, blue and green primaries. The default is 2 */ + int matrixCoeffs; + + /* Chroma location info present flag adds chroma_sample_loc_type_top_field and + * chroma_sample_loc_type_bottom_field to the VUI. The default is false */ + int bEnableChromaLocInfoPresentFlag; + + /* Chroma sample location type top field holds the chroma location in + * the top field. 
The default is 0 */ + int chromaSampleLocTypeTopField; + + /* Chroma sample location type bottom field holds the chroma location in + * the bottom field. The default is 0 */ + int chromaSampleLocTypeBottomField; + + /* Default display window flag adds def_disp_win_left_offset, + * def_disp_win_right_offset, def_disp_win_top_offset and + * def_disp_win_bottom_offset to the VUI. The default is false */ + int bEnableDefaultDisplayWindowFlag; + + /* Default display window left offset holds the left offset with the + * conformance cropping window to further crop the displayed window */ + int defDispWinLeftOffset; + + /* Default display window right offset holds the right offset with the + * conformance cropping window to further crop the displayed window */ + int defDispWinRightOffset; + + /* Default display window top offset holds the top offset with the + * conformance cropping window to further crop the displayed window */ + int defDispWinTopOffset; + + /* Default display window bottom offset holds the bottom offset with the + * conformance cropping window to further crop the displayed window */ + int defDispWinBottomOffset; + } vui; +} x265_param; + +/*** + * If not called, first encoder allocated will auto-detect the CPU and + * initialize performance primitives, which are process global. + * DEPRECATED: use x265_param.cpuid to specify CPU */ +void x265_setup_primitives(x265_param *param, int cpu); + +/* x265_param_alloc: + * Allocates an x265_param instance. 
The returned param structure is not + * special in any way, but using this method together with x265_param_free() + * and x265_param_parse() to set values by name allows the application to treat + * x265_param as an opaque data struct for version safety */ +x265_param *x265_param_alloc(); + +/* x265_param_free: + * Use x265_param_free() to release storage for an x265_param instance + * allocated by x265_param_alloc() */ +void x265_param_free(x265_param *); + +/*** + * Initialize an x265_param structure to default values + */ +void x265_param_default(x265_param *param); + +/* x265_param_parse: + * set one parameter by name. + * returns 0 on success, or returns one of the following errors. + * note: BAD_VALUE occurs only if it can't even parse the value, + * numerical range is not checked until x265_encoder_open(). + * value=NULL means "true" for boolean options, but is a BAD_VALUE for non-booleans. */ +#define X265_PARAM_BAD_NAME (-1) +#define X265_PARAM_BAD_VALUE (-2) +int x265_param_parse(x265_param *p, const char *name, const char *value); + +/* x265_param_apply_profile: + * Applies the restrictions of the given profile. (one of below) */ +static const char * const x265_profile_names[] = { "main", "main10", "mainstillpicture", 0 }; + +/* (can be NULL, in which case the function will do nothing) + * returns 0 on success, negative on failure (e.g. invalid profile name). */ +int x265_param_apply_profile(x265_param *, const char *profile); + +/* x265_param_default_preset: + * The same as x265_param_default, but also use the passed preset and tune + * to modify the default settings. 
+ * (either can be NULL, which implies no preset or no tune, respectively) + * + * Currently available presets are, ordered from fastest to slowest: */ +static const char * const x265_preset_names[] = { "ultrafast", "superfast", "veryfast", "faster", "fast", "medium", "slow", "slower", "veryslow", "placebo", 0 }; + +/* The presets can also be indexed numerically, as in: + * x265_param_default_preset( &param, "3", ... ) + * with ultrafast mapping to "0" and placebo mapping to "9". This mapping may + * of course change if new presets are added in between, but will always be + * ordered from fastest to slowest. + * + * Warning: the speed of these presets scales dramatically. Ultrafast is a full + * 100 times faster than placebo! + * + * Currently available tunings are: */ +static const char * const x265_tune_names[] = { "psnr", "ssim", "zerolatency", "fastdecode", 0 }; + +/* returns 0 on success, negative on failure (e.g. invalid preset/tune name). */ +int x265_param_default_preset(x265_param *, const char *preset, const char *tune); + +/* x265_picture_alloc: + * Allocates an x265_picture instance. The returned picture structure is not + * special in any way, but using this method together with x265_picture_free() + * and x265_picture_init() allows some version safety. New picture fields will + * always be added to the end of x265_picture */ +x265_picture *x265_picture_alloc(); + +/* x265_picture_free: + * Use x265_picture_free() to release storage for an x265_picture instance + * allocated by x265_picture_alloc() */ +void x265_picture_free(x265_picture *); + +/* x265_alloc_analysis_data: + * Allocate memory to hold analysis data, returns 0 on success else negative */ +int x265_alloc_analysis_data(x265_picture*); + +/* x265_free_analysis_data: + * Use x265_free_analysis_data to release storage of members allocated by + * x265_alloc_analysis_data */ +void x265_free_analysis_data(x265_picture*); + +/*** + * Initialize an x265_picture structure to default values. 
It sets the pixel + * depth and color space to the encoder's internal values and sets the slice + * type to auto - so the lookahead will determine slice type. + */ +void x265_picture_init(x265_param *param, x265_picture *pic); + +/* x265_max_bit_depth: + * Specifies the maximum number of bits per pixel that x265 can input. This + * is also the max bit depth that x265 encodes in. When x265_max_bit_depth + * is 8, the internal and input bit depths must be 8. When + * x265_max_bit_depth is 12, the internal and input bit depths can be + * either 8, 10, or 12. Note that the internal bit depth must be the same + * for all encoders allocated in the same process. */ +X265_API extern const int x265_max_bit_depth; + +/* x265_version_str: + * A static string containing the version of this compiled x265 library */ +X265_API extern const char *x265_version_str; + +/* x265_build_info: + * A static string describing the compiler and target architecture */ +X265_API extern const char *x265_build_info_str; + +/* Force a link error in the case of linking against an incompatible API version. + * Glue #defines exist to force correct macro expansion; the final output of the macro + * is x265_encoder_open_##X265_BUILD (for purposes of dlopen). */ +#define x265_encoder_glue1(x, y) x ## y +#define x265_encoder_glue2(x, y) x265_encoder_glue1(x, y) +#define x265_encoder_open x265_encoder_glue2(x265_encoder_open_, X265_BUILD) + +/* x265_encoder_open: + * create a new encoder handler, all parameters from x265_param are copied */ +x265_encoder* x265_encoder_open(x265_param *); + +/* x265_encoder_parameters: + * copies the current internal set of parameters to the pointer provided + * by the caller. useful when the calling application needs to know + * how x265_encoder_open has changed the parameters. + * note that the data accessible through pointers in the returned param struct + * (e.g. filenames) should not be modified by the calling application. 
*/ +void x265_encoder_parameters(x265_encoder *, x265_param *); + +/* x265_encoder_headers: + * return the SPS and PPS that will be used for the whole stream. + * *pi_nal is the number of NAL units outputted in pp_nal. + * returns negative on error, total byte size of payload data on success + * the payloads of all output NALs are guaranteed to be sequential in memory. */ +int x265_encoder_headers(x265_encoder *, x265_nal **pp_nal, uint32_t *pi_nal); + +/* x265_encoder_encode: + * encode one picture. + * *pi_nal is the number of NAL units outputted in pp_nal. + * returns negative on error, 1 if a picture and access unit were output, + * or zero if the encoder pipeline is still filling or is empty after flushing. + * the payloads of all output NALs are guaranteed to be sequential in memory. + * To flush the encoder and retrieve delayed output pictures, pass pic_in as NULL. + * Once flushing has begun, all subsequent calls must pass pic_in as NULL. */ +int x265_encoder_encode(x265_encoder *encoder, x265_nal **pp_nal, uint32_t *pi_nal, x265_picture *pic_in, x265_picture *pic_out); + +/* x265_encoder_get_stats: + * returns encoder statistics */ +void x265_encoder_get_stats(x265_encoder *encoder, x265_stats *, uint32_t statsSizeBytes); + +/* x265_encoder_log: + * write a line to the configured CSV file. If a CSV filename was not + * configured, or file open failed, or the log level indicated frame level + * logging, this function will perform no write. 
*/ +void x265_encoder_log(x265_encoder *encoder, int argc, char **argv); + +/* x265_encoder_close: + * close an encoder handler */ +void x265_encoder_close(x265_encoder *); + +/*** + * Release library static allocations + */ +void x265_cleanup(void); + +#ifdef __cplusplus +} +#endif + +#endif // X265_H diff --git a/source/x265.pc.in b/source/x265.pc.in new file mode 100644 index 0000000..0bf99e9 --- /dev/null +++ b/source/x265.pc.in @@ -0,0 +1,11 @@ +prefix=@CMAKE_INSTALL_PREFIX@ +exec_prefix=${prefix} +libdir=${exec_prefix}/@LIB_INSTALL_DIR@ +includedir=${prefix}/include + +Name: @CMAKE_PROJECT_NAME@ +Description: H.265/HEVC video encoder +Version: @X265_LATEST_TAG@ +Libs: -L${libdir} -lx265 +Libs.private: @PRIVATE_LIBS@ +Cflags: -I${includedir} diff --git a/source/x265.rc.in b/source/x265.rc.in new file mode 100644 index 0000000..c737d94 --- /dev/null +++ b/source/x265.rc.in @@ -0,0 +1,32 @@ +#include + +VS_VERSION_INFO VERSIONINFO + FILEVERSION @X265_VERSION_MAJOR@,@X265_VERSION_MINOR@,@X265_BRANCH_ID@,@X265_TAG_DISTANCE@ + PRODUCTVERSION @X265_VERSION_MAJOR@,@X265_VERSION_MINOR@,@X265_BRANCH_ID@,@X265_TAG_DISTANCE@ + FILEFLAGSMASK VS_FFI_FILEFLAGSMASK + FILEOS VOS_NT_WINDOWS32 +#ifdef OPENOBEX_EXPORTS + FILETYPE VFT_DLL +#else + FILETYPE VFT_STATIC_LIB +#endif + FILESUBTYPE VFT2_UNKNOWN + BEGIN + BLOCK "StringFileInfo" + BEGIN + BLOCK "04090000" + BEGIN + VALUE "FileDescription", "HEVC video encoder" + VALUE "FileVersion", "@X265_VERSION@" + VALUE "InternalName", "x265" + VALUE "LegalCopyright", "Multicoreware: GPLv2 or commercial" + VALUE "OriginalFilename", "libx265.dll" + VALUE "ProductName", "x265" + VALUE "ProductVersion", "@X265_VERSION@" + END + END + BLOCK "VarFileInfo" + BEGIN + VALUE "Translation", 0x409, 1200 + END +END diff --git a/source/x265_config.h.in b/source/x265_config.h.in new file mode 100644 index 0000000..eacdc58 --- /dev/null +++ b/source/x265_config.h.in @@ -0,0 +1,34 @@ 
+/***************************************************************************** + * Copyright (C) 2013 x265 project + * + * Authors: Steve Borho + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#ifndef X265_CONFIG_H +#define X265_CONFIG_H + +/* Defines generated at build time */ + +/* Incremented each time public API is changed, X265_BUILD is used as + * the shared library SONAME on platforms which support it. It also + * prevents linking against a different version of the static lib */ +#define X265_BUILD ${X265_BUILD} + +#endif -- 2.34.1