Imported Upstream version 1.4
[deb_x265.git] / source / common / x86 / asm-primitives.cpp
CommitLineData
72b9787e
JB
/*****************************************************************************
 * Copyright (C) 2013 x265 project
 *
 * Authors: Steve Borho <steve@borho.org>
 *          Praveen Kumar Tiwari <praveen@multicorewareinc.com>
 *          Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at license @ x265.com.
 *****************************************************************************/
25
26#include "common.h"
27#include "primitives.h"
28#include "x265.h"
29#include "cpu.h"
30
31extern "C" {
32#include "pixel.h"
33#include "pixel-util.h"
34#include "mc.h"
35#include "ipfilter8.h"
36#include "loopfilter.h"
37#include "blockcopy8.h"
38#include "intrapred.h"
39#include "dct8.h"
40}
41
/* INITn_NAME(name1, name2, cpu): fill n entries of the table p.name1[] with
 * the symbols x265_pixel_<name2>_<W>x<H><cpu>.  Each level extends the
 * previous one by the sizes shown:
 *   2: 16x16, 16x8   4: +8x16, +8x8   5: +8x4   6: +4x8   7: +4x4   8: +4x16
 * `cpu` is pasted directly after the size, so it must carry its own leading
 * underscore (for example _sse2).  Every macro expands to several complete
 * statements, each already terminated by ';', and needs an object named `p`
 * in scope; do not use one as the unbraced body of an if/else. */
#define INIT2_NAME(name1, name2, cpu) \
    p.name1[LUMA_16x16] = x265_pixel_ ## name2 ## _16x16 ## cpu; \
    p.name1[LUMA_16x8] = x265_pixel_ ## name2 ## _16x8 ## cpu;
#define INIT4_NAME(name1, name2, cpu) \
    INIT2_NAME(name1, name2, cpu) \
    p.name1[LUMA_8x16] = x265_pixel_ ## name2 ## _8x16 ## cpu; \
    p.name1[LUMA_8x8] = x265_pixel_ ## name2 ## _8x8 ## cpu;
#define INIT5_NAME(name1, name2, cpu) \
    INIT4_NAME(name1, name2, cpu) \
    p.name1[LUMA_8x4] = x265_pixel_ ## name2 ## _8x4 ## cpu;
#define INIT6_NAME(name1, name2, cpu) \
    INIT5_NAME(name1, name2, cpu) \
    p.name1[LUMA_4x8] = x265_pixel_ ## name2 ## _4x8 ## cpu;
#define INIT7_NAME(name1, name2, cpu) \
    INIT6_NAME(name1, name2, cpu) \
    p.name1[LUMA_4x4] = x265_pixel_ ## name2 ## _4x4 ## cpu;
#define INIT8_NAME(name1, name2, cpu) \
    INIT7_NAME(name1, name2, cpu) \
    p.name1[LUMA_4x16] = x265_pixel_ ## name2 ## _4x16 ## cpu;

/* INITn(name, cpu): shorthand for the common case where the table member and
 * the function-name stem are the same identifier. */
#define INIT2(name, cpu) INIT2_NAME(name, name, cpu)
#define INIT4(name, cpu) INIT4_NAME(name, name, cpu)
#define INIT5(name, cpu) INIT5_NAME(name, name, cpu)
#define INIT6(name, cpu) INIT6_NAME(name, name, cpu)
#define INIT7(name, cpu) INIT7_NAME(name, name, cpu)
#define INIT8(name, cpu) INIT8_NAME(name, name, cpu)
67
/* HEVC_SATD(cpu): point p.satd[] at x265_pixel_satd_<W>x<H>_<cpu> for the 24
 * luma block sizes listed below.  `cpu` is pasted after a trailing
 * underscore, so pass it without one (for example sse2).  Expands to
 * multiple statements, the last already terminated by ';'; needs `p` in
 * scope. */
#define HEVC_SATD(cpu) \
    p.satd[LUMA_4x8] = x265_pixel_satd_4x8_ ## cpu; \
    p.satd[LUMA_4x16] = x265_pixel_satd_4x16_ ## cpu; \
    p.satd[LUMA_8x4] = x265_pixel_satd_8x4_ ## cpu; \
    p.satd[LUMA_8x8] = x265_pixel_satd_8x8_ ## cpu; \
    p.satd[LUMA_8x16] = x265_pixel_satd_8x16_ ## cpu; \
    p.satd[LUMA_8x32] = x265_pixel_satd_8x32_ ## cpu; \
    p.satd[LUMA_12x16] = x265_pixel_satd_12x16_ ## cpu; \
    p.satd[LUMA_16x4] = x265_pixel_satd_16x4_ ## cpu; \
    p.satd[LUMA_16x8] = x265_pixel_satd_16x8_ ## cpu; \
    p.satd[LUMA_16x12] = x265_pixel_satd_16x12_ ## cpu; \
    p.satd[LUMA_16x16] = x265_pixel_satd_16x16_ ## cpu; \
    p.satd[LUMA_16x32] = x265_pixel_satd_16x32_ ## cpu; \
    p.satd[LUMA_16x64] = x265_pixel_satd_16x64_ ## cpu; \
    p.satd[LUMA_24x32] = x265_pixel_satd_24x32_ ## cpu; \
    p.satd[LUMA_32x8] = x265_pixel_satd_32x8_ ## cpu; \
    p.satd[LUMA_32x16] = x265_pixel_satd_32x16_ ## cpu; \
    p.satd[LUMA_32x24] = x265_pixel_satd_32x24_ ## cpu; \
    p.satd[LUMA_32x32] = x265_pixel_satd_32x32_ ## cpu; \
    p.satd[LUMA_32x64] = x265_pixel_satd_32x64_ ## cpu; \
    p.satd[LUMA_48x64] = x265_pixel_satd_48x64_ ## cpu; \
    p.satd[LUMA_64x16] = x265_pixel_satd_64x16_ ## cpu; \
    p.satd[LUMA_64x32] = x265_pixel_satd_64x32_ ## cpu; \
    p.satd[LUMA_64x48] = x265_pixel_satd_64x48_ ## cpu; \
    p.satd[LUMA_64x64] = x265_pixel_satd_64x64_ ## cpu;
93
/* SAD_X3(cpu): point p.sad_x3[] at x265_pixel_sad_x3_<W>x<H>_<cpu> for the
 * 16 block sizes listed below (widths 16 and larger).  `cpu` is pasted after
 * a trailing underscore, so pass it without one.  The expansion has no final
 * ';' -- invoke it as a statement: SAD_X3(cpu);  Needs `p` in scope. */
#define SAD_X3(cpu) \
    p.sad_x3[LUMA_16x8] = x265_pixel_sad_x3_16x8_ ## cpu; \
    p.sad_x3[LUMA_16x12] = x265_pixel_sad_x3_16x12_ ## cpu; \
    p.sad_x3[LUMA_16x16] = x265_pixel_sad_x3_16x16_ ## cpu; \
    p.sad_x3[LUMA_16x32] = x265_pixel_sad_x3_16x32_ ## cpu; \
    p.sad_x3[LUMA_16x64] = x265_pixel_sad_x3_16x64_ ## cpu; \
    p.sad_x3[LUMA_32x8] = x265_pixel_sad_x3_32x8_ ## cpu; \
    p.sad_x3[LUMA_32x16] = x265_pixel_sad_x3_32x16_ ## cpu; \
    p.sad_x3[LUMA_32x24] = x265_pixel_sad_x3_32x24_ ## cpu; \
    p.sad_x3[LUMA_32x32] = x265_pixel_sad_x3_32x32_ ## cpu; \
    p.sad_x3[LUMA_32x64] = x265_pixel_sad_x3_32x64_ ## cpu; \
    p.sad_x3[LUMA_24x32] = x265_pixel_sad_x3_24x32_ ## cpu; \
    p.sad_x3[LUMA_48x64] = x265_pixel_sad_x3_48x64_ ## cpu; \
    p.sad_x3[LUMA_64x16] = x265_pixel_sad_x3_64x16_ ## cpu; \
    p.sad_x3[LUMA_64x32] = x265_pixel_sad_x3_64x32_ ## cpu; \
    p.sad_x3[LUMA_64x48] = x265_pixel_sad_x3_64x48_ ## cpu; \
    p.sad_x3[LUMA_64x64] = x265_pixel_sad_x3_64x64_ ## cpu
111
/* SAD_X4(cpu): point p.sad_x4[] at x265_pixel_sad_x4_<W>x<H>_<cpu> for the
 * 16 block sizes listed below (widths 16 and larger).  `cpu` is pasted after
 * a trailing underscore, so pass it without one.  The expansion has no final
 * ';' -- invoke it as a statement: SAD_X4(cpu);  Needs `p` in scope. */
#define SAD_X4(cpu) \
    p.sad_x4[LUMA_16x8] = x265_pixel_sad_x4_16x8_ ## cpu; \
    p.sad_x4[LUMA_16x12] = x265_pixel_sad_x4_16x12_ ## cpu; \
    p.sad_x4[LUMA_16x16] = x265_pixel_sad_x4_16x16_ ## cpu; \
    p.sad_x4[LUMA_16x32] = x265_pixel_sad_x4_16x32_ ## cpu; \
    p.sad_x4[LUMA_16x64] = x265_pixel_sad_x4_16x64_ ## cpu; \
    p.sad_x4[LUMA_32x8] = x265_pixel_sad_x4_32x8_ ## cpu; \
    p.sad_x4[LUMA_32x16] = x265_pixel_sad_x4_32x16_ ## cpu; \
    p.sad_x4[LUMA_32x24] = x265_pixel_sad_x4_32x24_ ## cpu; \
    p.sad_x4[LUMA_32x32] = x265_pixel_sad_x4_32x32_ ## cpu; \
    p.sad_x4[LUMA_32x64] = x265_pixel_sad_x4_32x64_ ## cpu; \
    p.sad_x4[LUMA_24x32] = x265_pixel_sad_x4_24x32_ ## cpu; \
    p.sad_x4[LUMA_48x64] = x265_pixel_sad_x4_48x64_ ## cpu; \
    p.sad_x4[LUMA_64x16] = x265_pixel_sad_x4_64x16_ ## cpu; \
    p.sad_x4[LUMA_64x32] = x265_pixel_sad_x4_64x32_ ## cpu; \
    p.sad_x4[LUMA_64x48] = x265_pixel_sad_x4_64x48_ ## cpu; \
    p.sad_x4[LUMA_64x64] = x265_pixel_sad_x4_64x64_ ## cpu
129
/* SAD(cpu): point p.sad[] at x265_pixel_sad_<W>x<H>_<cpu> for the 17 block
 * sizes listed below.  `cpu` is pasted after a trailing underscore, so pass
 * it without one.  The expansion has no final ';' -- invoke it as a
 * statement: SAD(cpu);  Needs `p` in scope. */
#define SAD(cpu) \
    p.sad[LUMA_8x32] = x265_pixel_sad_8x32_ ## cpu; \
    p.sad[LUMA_16x4] = x265_pixel_sad_16x4_ ## cpu; \
    p.sad[LUMA_16x12] = x265_pixel_sad_16x12_ ## cpu; \
    p.sad[LUMA_16x32] = x265_pixel_sad_16x32_ ## cpu; \
    p.sad[LUMA_16x64] = x265_pixel_sad_16x64_ ## cpu; \
    p.sad[LUMA_32x8] = x265_pixel_sad_32x8_ ## cpu; \
    p.sad[LUMA_32x16] = x265_pixel_sad_32x16_ ## cpu; \
    p.sad[LUMA_32x24] = x265_pixel_sad_32x24_ ## cpu; \
    p.sad[LUMA_32x32] = x265_pixel_sad_32x32_ ## cpu; \
    p.sad[LUMA_32x64] = x265_pixel_sad_32x64_ ## cpu; \
    p.sad[LUMA_64x16] = x265_pixel_sad_64x16_ ## cpu; \
    p.sad[LUMA_64x32] = x265_pixel_sad_64x32_ ## cpu; \
    p.sad[LUMA_64x48] = x265_pixel_sad_64x48_ ## cpu; \
    p.sad[LUMA_64x64] = x265_pixel_sad_64x64_ ## cpu; \
    p.sad[LUMA_48x64] = x265_pixel_sad_48x64_ ## cpu; \
    p.sad[LUMA_24x32] = x265_pixel_sad_24x32_ ## cpu; \
    p.sad[LUMA_12x16] = x265_pixel_sad_12x16_ ## cpu
148
/* ASSGN_SSE(cpu): point p.sse_pp[] at x265_pixel_ssd_<W>x<H>_<cpu> for the 15
 * block sizes listed below.  Note the table member is sse_pp while the
 * function stem is ssd.  `cpu` is pasted after a trailing underscore, so
 * pass it without one.  The expansion has no final ';' -- invoke it as a
 * statement: ASSGN_SSE(cpu);  Needs `p` in scope. */
#define ASSGN_SSE(cpu) \
    p.sse_pp[LUMA_8x8] = x265_pixel_ssd_8x8_ ## cpu; \
    p.sse_pp[LUMA_8x4] = x265_pixel_ssd_8x4_ ## cpu; \
    p.sse_pp[LUMA_16x16] = x265_pixel_ssd_16x16_ ## cpu; \
    p.sse_pp[LUMA_16x4] = x265_pixel_ssd_16x4_ ## cpu; \
    p.sse_pp[LUMA_16x8] = x265_pixel_ssd_16x8_ ## cpu; \
    p.sse_pp[LUMA_8x16] = x265_pixel_ssd_8x16_ ## cpu; \
    p.sse_pp[LUMA_16x12] = x265_pixel_ssd_16x12_ ## cpu; \
    p.sse_pp[LUMA_32x32] = x265_pixel_ssd_32x32_ ## cpu; \
    p.sse_pp[LUMA_32x16] = x265_pixel_ssd_32x16_ ## cpu; \
    p.sse_pp[LUMA_16x32] = x265_pixel_ssd_16x32_ ## cpu; \
    p.sse_pp[LUMA_8x32] = x265_pixel_ssd_8x32_ ## cpu; \
    p.sse_pp[LUMA_32x8] = x265_pixel_ssd_32x8_ ## cpu; \
    p.sse_pp[LUMA_32x24] = x265_pixel_ssd_32x24_ ## cpu; \
    p.sse_pp[LUMA_32x64] = x265_pixel_ssd_32x64_ ## cpu; \
    p.sse_pp[LUMA_16x64] = x265_pixel_ssd_16x64_ ## cpu
165
/* ASSGN_SSE_SS(cpu): point p.sse_ss[] at x265_pixel_ssd_ss_<W>x<H>_<cpu> for
 * the 25 luma block sizes listed below (4x4 through 64x64).  `cpu` is pasted
 * after a trailing underscore, so pass it without one.  Expands to multiple
 * statements, the last already terminated by ';'; needs `p` in scope. */
#define ASSGN_SSE_SS(cpu) \
    p.sse_ss[LUMA_4x4] = x265_pixel_ssd_ss_4x4_ ## cpu; \
    p.sse_ss[LUMA_4x8] = x265_pixel_ssd_ss_4x8_ ## cpu; \
    p.sse_ss[LUMA_4x16] = x265_pixel_ssd_ss_4x16_ ## cpu; \
    p.sse_ss[LUMA_8x4] = x265_pixel_ssd_ss_8x4_ ## cpu; \
    p.sse_ss[LUMA_8x8] = x265_pixel_ssd_ss_8x8_ ## cpu; \
    p.sse_ss[LUMA_8x16] = x265_pixel_ssd_ss_8x16_ ## cpu; \
    p.sse_ss[LUMA_8x32] = x265_pixel_ssd_ss_8x32_ ## cpu; \
    p.sse_ss[LUMA_12x16] = x265_pixel_ssd_ss_12x16_ ## cpu; \
    p.sse_ss[LUMA_16x4] = x265_pixel_ssd_ss_16x4_ ## cpu; \
    p.sse_ss[LUMA_16x8] = x265_pixel_ssd_ss_16x8_ ## cpu; \
    p.sse_ss[LUMA_16x12] = x265_pixel_ssd_ss_16x12_ ## cpu; \
    p.sse_ss[LUMA_16x16] = x265_pixel_ssd_ss_16x16_ ## cpu; \
    p.sse_ss[LUMA_16x32] = x265_pixel_ssd_ss_16x32_ ## cpu; \
    p.sse_ss[LUMA_16x64] = x265_pixel_ssd_ss_16x64_ ## cpu; \
    p.sse_ss[LUMA_24x32] = x265_pixel_ssd_ss_24x32_ ## cpu; \
    p.sse_ss[LUMA_32x8] = x265_pixel_ssd_ss_32x8_ ## cpu; \
    p.sse_ss[LUMA_32x16] = x265_pixel_ssd_ss_32x16_ ## cpu; \
    p.sse_ss[LUMA_32x24] = x265_pixel_ssd_ss_32x24_ ## cpu; \
    p.sse_ss[LUMA_32x32] = x265_pixel_ssd_ss_32x32_ ## cpu; \
    p.sse_ss[LUMA_32x64] = x265_pixel_ssd_ss_32x64_ ## cpu; \
    p.sse_ss[LUMA_48x64] = x265_pixel_ssd_ss_48x64_ ## cpu; \
    p.sse_ss[LUMA_64x16] = x265_pixel_ssd_ss_64x16_ ## cpu; \
    p.sse_ss[LUMA_64x32] = x265_pixel_ssd_ss_64x32_ ## cpu; \
    p.sse_ss[LUMA_64x48] = x265_pixel_ssd_ss_64x48_ ## cpu; \
    p.sse_ss[LUMA_64x64] = x265_pixel_ssd_ss_64x64_ ## cpu;
192
/* SA8D_INTER_FROM_BLOCK(cpu): fill p.sa8d_inter[] for 24 luma block sizes.
 * Six sizes (4x8, 8x4, 4x16, 16x4, 12x16, 16x12) are wired to the
 * x265_pixel_satd_<W>x<H>_<cpu> symbols; the other 18 use
 * x265_pixel_sa8d_<W>x<H>_<cpu>.  `cpu` is pasted after a trailing
 * underscore, so pass it without one.  Expands to multiple statements, the
 * last already terminated by ';'; needs `p` in scope. */
#define SA8D_INTER_FROM_BLOCK(cpu) \
    p.sa8d_inter[LUMA_4x8] = x265_pixel_satd_4x8_ ## cpu; \
    p.sa8d_inter[LUMA_8x4] = x265_pixel_satd_8x4_ ## cpu; \
    p.sa8d_inter[LUMA_4x16] = x265_pixel_satd_4x16_ ## cpu; \
    p.sa8d_inter[LUMA_16x4] = x265_pixel_satd_16x4_ ## cpu; \
    p.sa8d_inter[LUMA_12x16] = x265_pixel_satd_12x16_ ## cpu; \
    p.sa8d_inter[LUMA_8x8] = x265_pixel_sa8d_8x8_ ## cpu; \
    p.sa8d_inter[LUMA_16x16] = x265_pixel_sa8d_16x16_ ## cpu; \
    p.sa8d_inter[LUMA_16x12] = x265_pixel_satd_16x12_ ## cpu; \
    p.sa8d_inter[LUMA_16x8] = x265_pixel_sa8d_16x8_ ## cpu; \
    p.sa8d_inter[LUMA_8x16] = x265_pixel_sa8d_8x16_ ## cpu; \
    p.sa8d_inter[LUMA_32x24] = x265_pixel_sa8d_32x24_ ## cpu; \
    p.sa8d_inter[LUMA_24x32] = x265_pixel_sa8d_24x32_ ## cpu; \
    p.sa8d_inter[LUMA_32x8] = x265_pixel_sa8d_32x8_ ## cpu; \
    p.sa8d_inter[LUMA_8x32] = x265_pixel_sa8d_8x32_ ## cpu; \
    p.sa8d_inter[LUMA_32x32] = x265_pixel_sa8d_32x32_ ## cpu; \
    p.sa8d_inter[LUMA_32x16] = x265_pixel_sa8d_32x16_ ## cpu; \
    p.sa8d_inter[LUMA_16x32] = x265_pixel_sa8d_16x32_ ## cpu; \
    p.sa8d_inter[LUMA_64x64] = x265_pixel_sa8d_64x64_ ## cpu; \
    p.sa8d_inter[LUMA_64x32] = x265_pixel_sa8d_64x32_ ## cpu; \
    p.sa8d_inter[LUMA_32x64] = x265_pixel_sa8d_32x64_ ## cpu; \
    p.sa8d_inter[LUMA_64x48] = x265_pixel_sa8d_64x48_ ## cpu; \
    p.sa8d_inter[LUMA_48x64] = x265_pixel_sa8d_48x64_ ## cpu; \
    p.sa8d_inter[LUMA_64x16] = x265_pixel_sa8d_64x16_ ## cpu; \
    p.sa8d_inter[LUMA_16x64] = x265_pixel_sa8d_16x64_ ## cpu;
218
/* PIXEL_AVG(cpu): point p.pixelavg_pp[] at x265_pixel_avg_<W>x<H>_<cpu> for
 * the 22 block sizes listed below (widths 8 through 64; the width-4 sizes
 * are not set here).  `cpu` is pasted after a trailing underscore, so pass
 * it without one.  Expands to multiple statements, the last already
 * terminated by ';'; needs `p` in scope. */
#define PIXEL_AVG(cpu) \
    p.pixelavg_pp[LUMA_64x64] = x265_pixel_avg_64x64_ ## cpu; \
    p.pixelavg_pp[LUMA_64x48] = x265_pixel_avg_64x48_ ## cpu; \
    p.pixelavg_pp[LUMA_64x32] = x265_pixel_avg_64x32_ ## cpu; \
    p.pixelavg_pp[LUMA_64x16] = x265_pixel_avg_64x16_ ## cpu; \
    p.pixelavg_pp[LUMA_48x64] = x265_pixel_avg_48x64_ ## cpu; \
    p.pixelavg_pp[LUMA_32x64] = x265_pixel_avg_32x64_ ## cpu; \
    p.pixelavg_pp[LUMA_32x32] = x265_pixel_avg_32x32_ ## cpu; \
    p.pixelavg_pp[LUMA_32x24] = x265_pixel_avg_32x24_ ## cpu; \
    p.pixelavg_pp[LUMA_32x16] = x265_pixel_avg_32x16_ ## cpu; \
    p.pixelavg_pp[LUMA_32x8] = x265_pixel_avg_32x8_ ## cpu; \
    p.pixelavg_pp[LUMA_24x32] = x265_pixel_avg_24x32_ ## cpu; \
    p.pixelavg_pp[LUMA_16x64] = x265_pixel_avg_16x64_ ## cpu; \
    p.pixelavg_pp[LUMA_16x32] = x265_pixel_avg_16x32_ ## cpu; \
    p.pixelavg_pp[LUMA_16x16] = x265_pixel_avg_16x16_ ## cpu; \
    p.pixelavg_pp[LUMA_16x12] = x265_pixel_avg_16x12_ ## cpu; \
    p.pixelavg_pp[LUMA_16x8] = x265_pixel_avg_16x8_ ## cpu; \
    p.pixelavg_pp[LUMA_16x4] = x265_pixel_avg_16x4_ ## cpu; \
    p.pixelavg_pp[LUMA_12x16] = x265_pixel_avg_12x16_ ## cpu; \
    p.pixelavg_pp[LUMA_8x32] = x265_pixel_avg_8x32_ ## cpu; \
    p.pixelavg_pp[LUMA_8x16] = x265_pixel_avg_8x16_ ## cpu; \
    p.pixelavg_pp[LUMA_8x8] = x265_pixel_avg_8x8_ ## cpu; \
    p.pixelavg_pp[LUMA_8x4] = x265_pixel_avg_8x4_ ## cpu;
242
/* PIXEL_AVG_W4(cpu): point p.pixelavg_pp[] at x265_pixel_avg_4x<H>_<cpu> for
 * the three width-4 sizes (4x4, 4x8, 4x16).  `cpu` is pasted after a
 * trailing underscore, so pass it without one.  Expands to three statements,
 * the last already terminated by ';'; needs `p` in scope. */
#define PIXEL_AVG_W4(cpu) \
    p.pixelavg_pp[LUMA_4x4] = x265_pixel_avg_4x4_ ## cpu; \
    p.pixelavg_pp[LUMA_4x8] = x265_pixel_avg_4x8_ ## cpu; \
    p.pixelavg_pp[LUMA_4x16] = x265_pixel_avg_4x16_ ## cpu;
247
/* SETUP_CHROMA_FUNC_DEF_<csp>(W, H, cpu): for one WxH block size, set the
 * four 4-tap chroma interpolation entries (filter_hpp, filter_hps,
 * filter_vpp, filter_vps) of p.chroma[X265_CSP_I<csp>] to
 * x265_interp_4tap_{horiz,vert}_{pp,ps}_<W>x<H><cpu>.
 * The three variants differ only in the colour-space slot and in the index
 * enum prefix: CHROMA_ (420), CHROMA422_ (422), LUMA_ (444).
 * `cpu` is pasted directly after the size, so it must carry its own leading
 * underscore.  Each macro expands to four statements, all terminated by ';';
 * needs `p` in scope. */
#define SETUP_CHROMA_FUNC_DEF_420(W, H, cpu) \
    p.chroma[X265_CSP_I420].filter_hpp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \
    p.chroma[X265_CSP_I420].filter_hps[CHROMA_ ## W ## x ## H] = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu; \
    p.chroma[X265_CSP_I420].filter_vpp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \
    p.chroma[X265_CSP_I420].filter_vps[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu;

#define SETUP_CHROMA_FUNC_DEF_422(W, H, cpu) \
    p.chroma[X265_CSP_I422].filter_hpp[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \
    p.chroma[X265_CSP_I422].filter_hps[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu; \
    p.chroma[X265_CSP_I422].filter_vpp[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \
    p.chroma[X265_CSP_I422].filter_vps[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu;

#define SETUP_CHROMA_FUNC_DEF_444(W, H, cpu) \
    p.chroma[X265_CSP_I444].filter_hpp[LUMA_ ## W ## x ## H] = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \
    p.chroma[X265_CSP_I444].filter_hps[LUMA_ ## W ## x ## H] = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu; \
    p.chroma[X265_CSP_I444].filter_vpp[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \
    p.chroma[X265_CSP_I444].filter_vps[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu;
265
/* SETUP_CHROMA_SP_FUNC_DEF_<csp>(W, H, cpu): for one WxH block size, set
 * p.chroma[X265_CSP_I<csp>].filter_vsp to
 * x265_interp_4tap_vert_sp_<W>x<H><cpu>.  The index enum prefix is CHROMA_
 * (420), CHROMA422_ (422) or LUMA_ (444).  `cpu` is pasted directly after
 * the size, so it must carry its own leading underscore.  Each expansion is
 * one statement already terminated by ';'; needs `p` in scope. */
#define SETUP_CHROMA_SP_FUNC_DEF_420(W, H, cpu) \
    p.chroma[X265_CSP_I420].filter_vsp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;

#define SETUP_CHROMA_SP_FUNC_DEF_422(W, H, cpu) \
    p.chroma[X265_CSP_I422].filter_vsp[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;

#define SETUP_CHROMA_SP_FUNC_DEF_444(W, H, cpu) \
    p.chroma[X265_CSP_I444].filter_vsp[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
274
/* SETUP_CHROMA_SS_FUNC_DEF_<csp>(W, H, cpu): for one WxH block size, set
 * p.chroma[X265_CSP_I<csp>].filter_vss to
 * x265_interp_4tap_vert_ss_<W>x<H><cpu>.  The index enum prefix is CHROMA_
 * (420), CHROMA422_ (422) or LUMA_ (444).  `cpu` is pasted directly after
 * the size, so it must carry its own leading underscore.  Each expansion is
 * one statement already terminated by ';'; needs `p` in scope. */
#define SETUP_CHROMA_SS_FUNC_DEF_420(W, H, cpu) \
    p.chroma[X265_CSP_I420].filter_vss[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu;

#define SETUP_CHROMA_SS_FUNC_DEF_422(W, H, cpu) \
    p.chroma[X265_CSP_I422].filter_vss[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu;

#define SETUP_CHROMA_SS_FUNC_DEF_444(W, H, cpu) \
    p.chroma[X265_CSP_I444].filter_vss[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu;
283
/* CHROMA_FILTERS_420(cpu): invoke SETUP_CHROMA_FUNC_DEF_420 once for each of
 * the 24 4:2:0 chroma block sizes listed below (2x4 up to 32x32), forwarding
 * `cpu` unchanged.  SETUP_CHROMA_FUNC_DEF_420 must be defined at the point
 * of use.  Expands to multiple statements, the last terminated by ';'. */
#define CHROMA_FILTERS_420(cpu) \
    SETUP_CHROMA_FUNC_DEF_420(4, 4, cpu); \
    SETUP_CHROMA_FUNC_DEF_420(4, 2, cpu); \
    SETUP_CHROMA_FUNC_DEF_420(2, 4, cpu); \
    SETUP_CHROMA_FUNC_DEF_420(8, 8, cpu); \
    SETUP_CHROMA_FUNC_DEF_420(8, 4, cpu); \
    SETUP_CHROMA_FUNC_DEF_420(4, 8, cpu); \
    SETUP_CHROMA_FUNC_DEF_420(8, 6, cpu); \
    SETUP_CHROMA_FUNC_DEF_420(6, 8, cpu); \
    SETUP_CHROMA_FUNC_DEF_420(8, 2, cpu); \
    SETUP_CHROMA_FUNC_DEF_420(2, 8, cpu); \
    SETUP_CHROMA_FUNC_DEF_420(16, 16, cpu); \
    SETUP_CHROMA_FUNC_DEF_420(16, 8, cpu); \
    SETUP_CHROMA_FUNC_DEF_420(8, 16, cpu); \
    SETUP_CHROMA_FUNC_DEF_420(16, 12, cpu); \
    SETUP_CHROMA_FUNC_DEF_420(12, 16, cpu); \
    SETUP_CHROMA_FUNC_DEF_420(16, 4, cpu); \
    SETUP_CHROMA_FUNC_DEF_420(4, 16, cpu); \
    SETUP_CHROMA_FUNC_DEF_420(32, 32, cpu); \
    SETUP_CHROMA_FUNC_DEF_420(32, 16, cpu); \
    SETUP_CHROMA_FUNC_DEF_420(16, 32, cpu); \
    SETUP_CHROMA_FUNC_DEF_420(32, 24, cpu); \
    SETUP_CHROMA_FUNC_DEF_420(24, 32, cpu); \
    SETUP_CHROMA_FUNC_DEF_420(32, 8, cpu); \
    SETUP_CHROMA_FUNC_DEF_420(8, 32, cpu);
309
/* CHROMA_FILTERS_422(cpu): invoke SETUP_CHROMA_FUNC_DEF_422 once for each of
 * the 24 4:2:2 chroma block sizes listed below (2x8 up to 32x64), forwarding
 * `cpu` unchanged.  SETUP_CHROMA_FUNC_DEF_422 must be defined at the point
 * of use.  Expands to multiple statements, the last terminated by ';'. */
#define CHROMA_FILTERS_422(cpu) \
    SETUP_CHROMA_FUNC_DEF_422(4, 8, cpu); \
    SETUP_CHROMA_FUNC_DEF_422(4, 4, cpu); \
    SETUP_CHROMA_FUNC_DEF_422(2, 8, cpu); \
    SETUP_CHROMA_FUNC_DEF_422(8, 16, cpu); \
    SETUP_CHROMA_FUNC_DEF_422(8, 8, cpu); \
    SETUP_CHROMA_FUNC_DEF_422(4, 16, cpu); \
    SETUP_CHROMA_FUNC_DEF_422(8, 12, cpu); \
    SETUP_CHROMA_FUNC_DEF_422(6, 16, cpu); \
    SETUP_CHROMA_FUNC_DEF_422(8, 4, cpu); \
    SETUP_CHROMA_FUNC_DEF_422(2, 16, cpu); \
    SETUP_CHROMA_FUNC_DEF_422(16, 32, cpu); \
    SETUP_CHROMA_FUNC_DEF_422(16, 16, cpu); \
    SETUP_CHROMA_FUNC_DEF_422(8, 32, cpu); \
    SETUP_CHROMA_FUNC_DEF_422(16, 24, cpu); \
    SETUP_CHROMA_FUNC_DEF_422(12, 32, cpu); \
    SETUP_CHROMA_FUNC_DEF_422(16, 8, cpu); \
    SETUP_CHROMA_FUNC_DEF_422(4, 32, cpu); \
    SETUP_CHROMA_FUNC_DEF_422(32, 64, cpu); \
    SETUP_CHROMA_FUNC_DEF_422(32, 32, cpu); \
    SETUP_CHROMA_FUNC_DEF_422(16, 64, cpu); \
    SETUP_CHROMA_FUNC_DEF_422(32, 48, cpu); \
    SETUP_CHROMA_FUNC_DEF_422(24, 64, cpu); \
    SETUP_CHROMA_FUNC_DEF_422(32, 16, cpu); \
    SETUP_CHROMA_FUNC_DEF_422(8, 64, cpu);
335
/* CHROMA_FILTERS_444(cpu): invoke SETUP_CHROMA_FUNC_DEF_444 once for each of
 * the 24 block sizes listed below (4x8 up to 64x64; 4x4 is not in the list),
 * forwarding `cpu` unchanged.  SETUP_CHROMA_FUNC_DEF_444 must be defined at
 * the point of use.  Expands to multiple statements, the last terminated by
 * ';'. */
#define CHROMA_FILTERS_444(cpu) \
    SETUP_CHROMA_FUNC_DEF_444(8, 8, cpu); \
    SETUP_CHROMA_FUNC_DEF_444(8, 4, cpu); \
    SETUP_CHROMA_FUNC_DEF_444(4, 8, cpu); \
    SETUP_CHROMA_FUNC_DEF_444(16, 16, cpu); \
    SETUP_CHROMA_FUNC_DEF_444(16, 8, cpu); \
    SETUP_CHROMA_FUNC_DEF_444(8, 16, cpu); \
    SETUP_CHROMA_FUNC_DEF_444(16, 12, cpu); \
    SETUP_CHROMA_FUNC_DEF_444(12, 16, cpu); \
    SETUP_CHROMA_FUNC_DEF_444(16, 4, cpu); \
    SETUP_CHROMA_FUNC_DEF_444(4, 16, cpu); \
    SETUP_CHROMA_FUNC_DEF_444(32, 32, cpu); \
    SETUP_CHROMA_FUNC_DEF_444(32, 16, cpu); \
    SETUP_CHROMA_FUNC_DEF_444(16, 32, cpu); \
    SETUP_CHROMA_FUNC_DEF_444(32, 24, cpu); \
    SETUP_CHROMA_FUNC_DEF_444(24, 32, cpu); \
    SETUP_CHROMA_FUNC_DEF_444(32, 8, cpu); \
    SETUP_CHROMA_FUNC_DEF_444(8, 32, cpu); \
    SETUP_CHROMA_FUNC_DEF_444(64, 64, cpu); \
    SETUP_CHROMA_FUNC_DEF_444(64, 32, cpu); \
    SETUP_CHROMA_FUNC_DEF_444(32, 64, cpu); \
    SETUP_CHROMA_FUNC_DEF_444(64, 48, cpu); \
    SETUP_CHROMA_FUNC_DEF_444(48, 64, cpu); \
    SETUP_CHROMA_FUNC_DEF_444(64, 16, cpu); \
    SETUP_CHROMA_FUNC_DEF_444(16, 64, cpu);
361
/* CHROMA_SP_FILTERS_SSE4_420(cpu): invoke SETUP_CHROMA_SP_FUNC_DEF_420 for
 * the 18 4:2:0 chroma sizes listed below, i.e. every width except 8 (the
 * width-8 sizes are listed in CHROMA_SP_FILTERS_420).  `cpu` is forwarded
 * unchanged.  SETUP_CHROMA_SP_FUNC_DEF_420 must be defined at the point of
 * use.  Expands to multiple statements, the last terminated by ';'. */
#define CHROMA_SP_FILTERS_SSE4_420(cpu) \
    SETUP_CHROMA_SP_FUNC_DEF_420(4, 4, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_420(4, 2, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_420(2, 4, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_420(4, 8, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_420(6, 8, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_420(2, 8, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_420(16, 16, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_420(16, 8, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_420(16, 12, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_420(12, 16, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_420(16, 4, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_420(4, 16, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_420(32, 32, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_420(32, 16, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_420(16, 32, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_420(32, 24, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_420(24, 32, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_420(32, 8, cpu);
381
/* CHROMA_SP_FILTERS_420(cpu): invoke SETUP_CHROMA_SP_FUNC_DEF_420 for the six
 * width-8 4:2:0 chroma sizes (8x2, 8x4, 8x6, 8x8, 8x16, 8x32), forwarding
 * `cpu` unchanged.  SETUP_CHROMA_SP_FUNC_DEF_420 must be defined at the
 * point of use.  Expands to multiple statements, the last terminated by
 * ';'. */
#define CHROMA_SP_FILTERS_420(cpu) \
    SETUP_CHROMA_SP_FUNC_DEF_420(8, 2, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_420(8, 4, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_420(8, 6, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_420(8, 8, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_420(8, 16, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_420(8, 32, cpu);
389
/* CHROMA_SP_FILTERS_SSE4_422(cpu): invoke SETUP_CHROMA_SP_FUNC_DEF_422 for
 * the 18 4:2:2 chroma sizes listed below, i.e. every width except 8 (the
 * width-8 sizes are listed in CHROMA_SP_FILTERS_422).  `cpu` is forwarded
 * unchanged.  SETUP_CHROMA_SP_FUNC_DEF_422 must be defined at the point of
 * use.  Expands to multiple statements, the last terminated by ';'. */
#define CHROMA_SP_FILTERS_SSE4_422(cpu) \
    SETUP_CHROMA_SP_FUNC_DEF_422(4, 8, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_422(4, 4, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_422(2, 8, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_422(4, 16, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_422(6, 16, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_422(2, 16, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_422(16, 32, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_422(16, 16, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_422(16, 24, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_422(12, 32, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_422(16, 8, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_422(4, 32, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_422(32, 64, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_422(32, 32, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_422(16, 64, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_422(32, 48, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_422(24, 64, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_422(32, 16, cpu);
409
/* CHROMA_SP_FILTERS_422(cpu): invoke SETUP_CHROMA_SP_FUNC_DEF_422 for the six
 * width-8 4:2:2 chroma sizes (8x4, 8x8, 8x12, 8x16, 8x32, 8x64), forwarding
 * `cpu` unchanged.  SETUP_CHROMA_SP_FUNC_DEF_422 must be defined at the
 * point of use.  Expands to multiple statements, the last terminated by
 * ';'. */
#define CHROMA_SP_FILTERS_422(cpu) \
    SETUP_CHROMA_SP_FUNC_DEF_422(8, 4, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_422(8, 8, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_422(8, 12, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_422(8, 16, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_422(8, 32, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_422(8, 64, cpu);
417
/* CHROMA_SP_FILTERS_SSE4_444(cpu): invoke SETUP_CHROMA_SP_FUNC_DEF_444 for
 * the 20 sizes listed below, i.e. every width except 8 (the width-8 sizes
 * are listed in CHROMA_SP_FILTERS_444).  `cpu` is forwarded unchanged.
 * SETUP_CHROMA_SP_FUNC_DEF_444 must be defined at the point of use.
 * Expands to multiple statements, the last terminated by ';'. */
#define CHROMA_SP_FILTERS_SSE4_444(cpu) \
    SETUP_CHROMA_SP_FUNC_DEF_444(4, 8, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_444(16, 16, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_444(16, 8, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_444(16, 12, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_444(12, 16, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_444(16, 4, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_444(4, 16, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_444(32, 32, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_444(32, 16, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_444(16, 32, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_444(32, 24, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_444(24, 32, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_444(32, 8, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_444(64, 64, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_444(64, 32, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_444(32, 64, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_444(64, 48, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_444(48, 64, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_444(64, 16, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_444(16, 64, cpu);
439
/* CHROMA_SP_FILTERS_444(cpu): invoke SETUP_CHROMA_SP_FUNC_DEF_444 for the
 * four width-8 sizes (8x8, 8x4, 8x16, 8x32), forwarding `cpu` unchanged.
 * SETUP_CHROMA_SP_FUNC_DEF_444 must be defined at the point of use.
 * Expands to multiple statements, the last terminated by ';'. */
#define CHROMA_SP_FILTERS_444(cpu) \
    SETUP_CHROMA_SP_FUNC_DEF_444(8, 8, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_444(8, 4, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_444(8, 16, cpu); \
    SETUP_CHROMA_SP_FUNC_DEF_444(8, 32, cpu);
445
/* CHROMA_SS_FILTERS_420(cpu): invoke SETUP_CHROMA_SS_FUNC_DEF_420 for the 21
 * 4:2:0 chroma sizes listed below.  Sizes 2x4, 2x8 and 6x8 are not in this
 * list; they appear in CHROMA_SS_FILTERS_SSE4_420.  `cpu` is forwarded
 * unchanged.  SETUP_CHROMA_SS_FUNC_DEF_420 must be defined at the point of
 * use.  Expands to multiple statements, the last terminated by ';'. */
#define CHROMA_SS_FILTERS_420(cpu) \
    SETUP_CHROMA_SS_FUNC_DEF_420(4, 4, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_420(4, 2, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_420(8, 8, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_420(8, 4, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_420(4, 8, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_420(8, 6, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_420(8, 2, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_420(16, 16, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_420(16, 8, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_420(8, 16, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_420(16, 12, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_420(12, 16, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_420(16, 4, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_420(4, 16, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_420(32, 32, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_420(32, 16, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_420(16, 32, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_420(32, 24, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_420(24, 32, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_420(32, 8, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_420(8, 32, cpu);
468
/* CHROMA_SS_FILTERS_SSE4_420(cpu): invoke SETUP_CHROMA_SS_FUNC_DEF_420 for
 * the three 4:2:0 chroma sizes 2x4, 2x8 and 6x8, forwarding `cpu` unchanged.
 * SETUP_CHROMA_SS_FUNC_DEF_420 must be defined at the point of use.
 * Expands to multiple statements, the last terminated by ';'. */
#define CHROMA_SS_FILTERS_SSE4_420(cpu) \
    SETUP_CHROMA_SS_FUNC_DEF_420(2, 4, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_420(2, 8, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_420(6, 8, cpu);
473
/* CHROMA_SS_FILTERS_422(cpu): invoke SETUP_CHROMA_SS_FUNC_DEF_422 for the 21
 * 4:2:2 chroma sizes listed below.  Sizes 2x8, 2x16 and 6x16 are not in this
 * list; they appear in CHROMA_SS_FILTERS_SSE4_422.  `cpu` is forwarded
 * unchanged.  SETUP_CHROMA_SS_FUNC_DEF_422 must be defined at the point of
 * use.  Expands to multiple statements, the last terminated by ';'. */
#define CHROMA_SS_FILTERS_422(cpu) \
    SETUP_CHROMA_SS_FUNC_DEF_422(4, 8, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_422(4, 4, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_422(8, 16, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_422(8, 8, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_422(4, 16, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_422(8, 12, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_422(8, 4, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_422(16, 32, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_422(16, 16, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_422(8, 32, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_422(16, 24, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_422(12, 32, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_422(16, 8, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_422(4, 32, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_422(32, 64, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_422(32, 32, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_422(16, 64, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_422(32, 48, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_422(24, 64, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_422(32, 16, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_422(8, 64, cpu);
496
/* CHROMA_SS_FILTERS_SSE4_422(cpu): invoke SETUP_CHROMA_SS_FUNC_DEF_422 for
 * the three 4:2:2 chroma sizes 2x8, 2x16 and 6x16, forwarding `cpu`
 * unchanged.  SETUP_CHROMA_SS_FUNC_DEF_422 must be defined at the point of
 * use.  Expands to multiple statements, the last terminated by ';'. */
#define CHROMA_SS_FILTERS_SSE4_422(cpu) \
    SETUP_CHROMA_SS_FUNC_DEF_422(2, 8, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_422(2, 16, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_422(6, 16, cpu);
501
/* CHROMA_SS_FILTERS_444(cpu): invoke SETUP_CHROMA_SS_FUNC_DEF_444 once for
 * each of the 24 block sizes listed below (4x8 up to 64x64; 4x4 is not in
 * the list), forwarding `cpu` unchanged.  SETUP_CHROMA_SS_FUNC_DEF_444 must
 * be defined at the point of use.  Expands to multiple statements, the last
 * terminated by ';'. */
#define CHROMA_SS_FILTERS_444(cpu) \
    SETUP_CHROMA_SS_FUNC_DEF_444(8, 8, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_444(8, 4, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_444(4, 8, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_444(16, 16, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_444(16, 8, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_444(8, 16, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_444(16, 12, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_444(12, 16, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_444(16, 4, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_444(4, 16, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_444(32, 32, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_444(32, 16, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_444(16, 32, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_444(32, 24, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_444(24, 32, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_444(32, 8, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_444(8, 32, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_444(64, 64, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_444(64, 32, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_444(32, 64, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_444(64, 48, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_444(48, 64, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_444(64, 16, cpu); \
    SETUP_CHROMA_SS_FUNC_DEF_444(16, 64, cpu);
527
/* SETUP_LUMA_FUNC_DEF(W, H, cpu): for one WxH luma block size, set the 8-tap
 * interpolation entries p.luma_hpp, p.luma_hps, p.luma_vpp and p.luma_vps to
 * x265_interp_8tap_{horiz,vert}_{pp,ps}_<W>x<H><cpu>.  When HIGH_BIT_DEPTH is
 * non-zero the macro additionally sets p.luma_vsp to
 * x265_interp_8tap_vert_sp_<W>x<H><cpu>; that is the only difference between
 * the two definitions.  `cpu` is pasted directly after the size, so it must
 * carry its own leading underscore.  Expands to multiple statements, all
 * terminated by ';'; needs `p` in scope. */
#if HIGH_BIT_DEPTH // temporary, until all 10bit functions are completed
#define SETUP_LUMA_FUNC_DEF(W, H, cpu) \
    p.luma_hpp[LUMA_ ## W ## x ## H] = x265_interp_8tap_horiz_pp_ ## W ## x ## H ## cpu; \
    p.luma_hps[LUMA_ ## W ## x ## H] = x265_interp_8tap_horiz_ps_ ## W ## x ## H ## cpu; \
    p.luma_vpp[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_pp_ ## W ## x ## H ## cpu; \
    p.luma_vps[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_ps_ ## W ## x ## H ## cpu; \
    p.luma_vsp[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_sp_ ## W ## x ## H ## cpu;
#else
#define SETUP_LUMA_FUNC_DEF(W, H, cpu) \
    p.luma_hpp[LUMA_ ## W ## x ## H] = x265_interp_8tap_horiz_pp_ ## W ## x ## H ## cpu; \
    p.luma_hps[LUMA_ ## W ## x ## H] = x265_interp_8tap_horiz_ps_ ## W ## x ## H ## cpu; \
    p.luma_vpp[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_pp_ ## W ## x ## H ## cpu; \
    p.luma_vps[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_ps_ ## W ## x ## H ## cpu;
#endif // if HIGH_BIT_DEPTH
542
/* Single-size luma setters.  In each, `cpu` is pasted directly after the
 * size, so it must carry its own leading underscore; each expansion is
 * already terminated by ';' and needs `p` in scope. */

/* Sets p.luma_sub_ps and p.luma_add_ps for WxH to
 * x265_pixel_{sub,add}_ps_<W>x<H><cpu>. */
#define SETUP_LUMA_SUB_FUNC_DEF(W, H, cpu) \
    p.luma_sub_ps[LUMA_ ## W ## x ## H] = x265_pixel_sub_ps_ ## W ## x ## H ## cpu; \
    p.luma_add_ps[LUMA_ ## W ## x ## H] = x265_pixel_add_ps_ ## W ## x ## H ## cpu;

/* Sets p.luma_vsp for WxH to x265_interp_8tap_vert_sp_<W>x<H><cpu>. */
#define SETUP_LUMA_SP_FUNC_DEF(W, H, cpu) \
    p.luma_vsp[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_sp_ ## W ## x ## H ## cpu;

/* Sets p.luma_vss for WxH to x265_interp_8tap_vert_ss_<W>x<H><cpu>. */
#define SETUP_LUMA_SS_FUNC_DEF(W, H, cpu) \
    p.luma_vss[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_ss_ ## W ## x ## H ## cpu;
552
/* Single-size block-copy setters.  `type` selects both the table member
 * (copy_<type> / luma_copy_<type>) and the function stem
 * x265_blockcopy_<type>_<W>x<H><cpu>.  `cpu` is pasted directly after the
 * size, so it must carry its own leading underscore.  Each expansion is one
 * statement already terminated by ';' and needs `p` in scope. */

/* Luma: writes p.luma_copy_<type>[LUMA_<W>x<H>]. */
#define SETUP_LUMA_BLOCKCOPY(type, W, H, cpu) \
    p.luma_copy_ ## type[LUMA_ ## W ## x ## H] = x265_blockcopy_ ## type ## _ ## W ## x ## H ## cpu;

/* 4:2:0 chroma: writes p.chroma[X265_CSP_I420].copy_<type>[CHROMA_<W>x<H>]. */
#define SETUP_CHROMA_BLOCKCOPY(type, W, H, cpu) \
    p.chroma[X265_CSP_I420].copy_ ## type[CHROMA_ ## W ## x ## H] = x265_blockcopy_ ## type ## _ ## W ## x ## H ## cpu;
558
/* CHROMA_BLOCKCOPY(type, cpu): invoke SETUP_CHROMA_BLOCKCOPY once for each of
 * the 24 4:2:0 chroma block sizes listed below (2x4 up to 32x32), forwarding
 * `type` and `cpu` unchanged.  SETUP_CHROMA_BLOCKCOPY must be defined at the
 * point of use.  Expands to multiple statements, the last terminated by
 * ';'. */
#define CHROMA_BLOCKCOPY(type, cpu) \
    SETUP_CHROMA_BLOCKCOPY(type, 2, 4, cpu); \
    SETUP_CHROMA_BLOCKCOPY(type, 2, 8, cpu); \
    SETUP_CHROMA_BLOCKCOPY(type, 4, 2, cpu); \
    SETUP_CHROMA_BLOCKCOPY(type, 4, 4, cpu); \
    SETUP_CHROMA_BLOCKCOPY(type, 4, 8, cpu); \
    SETUP_CHROMA_BLOCKCOPY(type, 4, 16, cpu); \
    SETUP_CHROMA_BLOCKCOPY(type, 6, 8, cpu); \
    SETUP_CHROMA_BLOCKCOPY(type, 8, 2, cpu); \
    SETUP_CHROMA_BLOCKCOPY(type, 8, 4, cpu); \
    SETUP_CHROMA_BLOCKCOPY(type, 8, 6, cpu); \
    SETUP_CHROMA_BLOCKCOPY(type, 8, 8, cpu); \
    SETUP_CHROMA_BLOCKCOPY(type, 8, 16, cpu); \
    SETUP_CHROMA_BLOCKCOPY(type, 8, 32, cpu); \
    SETUP_CHROMA_BLOCKCOPY(type, 12, 16, cpu); \
    SETUP_CHROMA_BLOCKCOPY(type, 16, 4, cpu); \
    SETUP_CHROMA_BLOCKCOPY(type, 16, 8, cpu); \
    SETUP_CHROMA_BLOCKCOPY(type, 16, 12, cpu); \
    SETUP_CHROMA_BLOCKCOPY(type, 16, 16, cpu); \
    SETUP_CHROMA_BLOCKCOPY(type, 16, 32, cpu); \
    SETUP_CHROMA_BLOCKCOPY(type, 24, 32, cpu); \
    SETUP_CHROMA_BLOCKCOPY(type, 32, 8, cpu); \
    SETUP_CHROMA_BLOCKCOPY(type, 32, 16, cpu); \
    SETUP_CHROMA_BLOCKCOPY(type, 32, 24, cpu); \
    SETUP_CHROMA_BLOCKCOPY(type, 32, 32, cpu);
584
/* SETUP_CHROMA_BLOCKCOPY_422(type, W, H, cpu): set
 * p.chroma[X265_CSP_I422].copy_<type>[CHROMA422_<W>x<H>] to
 * x265_blockcopy_<type>_<W>x<H><cpu>.  `cpu` is pasted directly after the
 * size, so it must carry its own leading underscore.  The expansion is one
 * statement already terminated by ';'; needs `p` in scope. */
#define SETUP_CHROMA_BLOCKCOPY_422(type, W, H, cpu) \
    p.chroma[X265_CSP_I422].copy_ ## type[CHROMA422_ ## W ## x ## H] = x265_blockcopy_ ## type ## _ ## W ## x ## H ## cpu;
587
/* CHROMA_BLOCKCOPY_422(type, cpu): invoke SETUP_CHROMA_BLOCKCOPY_422 once for
 * each of the 24 4:2:2 chroma block sizes listed below (2x8 up to 32x64),
 * forwarding `type` and `cpu` unchanged.  SETUP_CHROMA_BLOCKCOPY_422 must be
 * defined at the point of use.  Expands to multiple statements, the last
 * terminated by ';'. */
#define CHROMA_BLOCKCOPY_422(type, cpu) \
    SETUP_CHROMA_BLOCKCOPY_422(type, 2, 8, cpu); \
    SETUP_CHROMA_BLOCKCOPY_422(type, 2, 16, cpu); \
    SETUP_CHROMA_BLOCKCOPY_422(type, 4, 4, cpu); \
    SETUP_CHROMA_BLOCKCOPY_422(type, 4, 8, cpu); \
    SETUP_CHROMA_BLOCKCOPY_422(type, 4, 16, cpu); \
    SETUP_CHROMA_BLOCKCOPY_422(type, 4, 32, cpu); \
    SETUP_CHROMA_BLOCKCOPY_422(type, 6, 16, cpu); \
    SETUP_CHROMA_BLOCKCOPY_422(type, 8, 4, cpu); \
    SETUP_CHROMA_BLOCKCOPY_422(type, 8, 8, cpu); \
    SETUP_CHROMA_BLOCKCOPY_422(type, 8, 12, cpu); \
    SETUP_CHROMA_BLOCKCOPY_422(type, 8, 16, cpu); \
    SETUP_CHROMA_BLOCKCOPY_422(type, 8, 32, cpu); \
    SETUP_CHROMA_BLOCKCOPY_422(type, 8, 64, cpu); \
    SETUP_CHROMA_BLOCKCOPY_422(type, 12, 32, cpu); \
    SETUP_CHROMA_BLOCKCOPY_422(type, 16, 8, cpu); \
    SETUP_CHROMA_BLOCKCOPY_422(type, 16, 16, cpu); \
    SETUP_CHROMA_BLOCKCOPY_422(type, 16, 24, cpu); \
    SETUP_CHROMA_BLOCKCOPY_422(type, 16, 32, cpu); \
    SETUP_CHROMA_BLOCKCOPY_422(type, 16, 64, cpu); \
    SETUP_CHROMA_BLOCKCOPY_422(type, 24, 64, cpu); \
    SETUP_CHROMA_BLOCKCOPY_422(type, 32, 16, cpu); \
    SETUP_CHROMA_BLOCKCOPY_422(type, 32, 32, cpu); \
    SETUP_CHROMA_BLOCKCOPY_422(type, 32, 48, cpu); \
    SETUP_CHROMA_BLOCKCOPY_422(type, 32, 64, cpu);
613
/* LUMA_BLOCKCOPY(type, cpu): invoke SETUP_LUMA_BLOCKCOPY once for each of the
 * 25 luma block sizes listed below (4x4 up to 64x64), forwarding `type` and
 * `cpu` unchanged.  SETUP_LUMA_BLOCKCOPY must be defined at the point of
 * use.  Expands to multiple statements, the last terminated by ';'. */
#define LUMA_BLOCKCOPY(type, cpu) \
    SETUP_LUMA_BLOCKCOPY(type, 4, 4, cpu); \
    SETUP_LUMA_BLOCKCOPY(type, 8, 8, cpu); \
    SETUP_LUMA_BLOCKCOPY(type, 8, 4, cpu); \
    SETUP_LUMA_BLOCKCOPY(type, 4, 8, cpu); \
    SETUP_LUMA_BLOCKCOPY(type, 16, 16, cpu); \
    SETUP_LUMA_BLOCKCOPY(type, 16, 8, cpu); \
    SETUP_LUMA_BLOCKCOPY(type, 8, 16, cpu); \
    SETUP_LUMA_BLOCKCOPY(type, 16, 12, cpu); \
    SETUP_LUMA_BLOCKCOPY(type, 12, 16, cpu); \
    SETUP_LUMA_BLOCKCOPY(type, 16, 4, cpu); \
    SETUP_LUMA_BLOCKCOPY(type, 4, 16, cpu); \
    SETUP_LUMA_BLOCKCOPY(type, 32, 32, cpu); \
    SETUP_LUMA_BLOCKCOPY(type, 32, 16, cpu); \
    SETUP_LUMA_BLOCKCOPY(type, 16, 32, cpu); \
    SETUP_LUMA_BLOCKCOPY(type, 32, 24, cpu); \
    SETUP_LUMA_BLOCKCOPY(type, 24, 32, cpu); \
    SETUP_LUMA_BLOCKCOPY(type, 32, 8, cpu); \
    SETUP_LUMA_BLOCKCOPY(type, 8, 32, cpu); \
    SETUP_LUMA_BLOCKCOPY(type, 64, 64, cpu); \
    SETUP_LUMA_BLOCKCOPY(type, 64, 32, cpu); \
    SETUP_LUMA_BLOCKCOPY(type, 32, 64, cpu); \
    SETUP_LUMA_BLOCKCOPY(type, 64, 48, cpu); \
    SETUP_LUMA_BLOCKCOPY(type, 48, 64, cpu); \
    SETUP_LUMA_BLOCKCOPY(type, 64, 16, cpu); \
    SETUP_LUMA_BLOCKCOPY(type, 16, 64, cpu);
640
/* Binds the 4:2:0 chroma copy_sp slot for one WxH block to the routine
 * x265_blockcopy_sp_<W>x<H><cpu>. Token pasting builds both the CHROMA_WxH
 * index and the function name, so `cpu` must carry its leading underscore.
 * Requires a primitives table named `p` at the expansion site. */
641#define SETUP_CHROMA_BLOCKCOPY_SP(W, H, cpu) \
642 p.chroma[X265_CSP_I420].copy_sp[CHROMA_ ## W ## x ## H] = x265_blockcopy_sp_ ## W ## x ## H ## cpu;
643
/* Applies SETUP_CHROMA_BLOCKCOPY_SP to every 4:2:0 chroma block size listed
 * below (2x4 through 32x32), filling p.chroma[X265_CSP_I420].copy_sp. */
644#define CHROMA_BLOCKCOPY_SP(cpu) \
645 SETUP_CHROMA_BLOCKCOPY_SP(2, 4, cpu); \
646 SETUP_CHROMA_BLOCKCOPY_SP(2, 8, cpu); \
647 SETUP_CHROMA_BLOCKCOPY_SP(4, 2, cpu); \
648 SETUP_CHROMA_BLOCKCOPY_SP(4, 4, cpu); \
649 SETUP_CHROMA_BLOCKCOPY_SP(4, 8, cpu); \
650 SETUP_CHROMA_BLOCKCOPY_SP(4, 16, cpu); \
651 SETUP_CHROMA_BLOCKCOPY_SP(6, 8, cpu); \
652 SETUP_CHROMA_BLOCKCOPY_SP(8, 2, cpu); \
653 SETUP_CHROMA_BLOCKCOPY_SP(8, 4, cpu); \
654 SETUP_CHROMA_BLOCKCOPY_SP(8, 6, cpu); \
655 SETUP_CHROMA_BLOCKCOPY_SP(8, 8, cpu); \
656 SETUP_CHROMA_BLOCKCOPY_SP(8, 16, cpu); \
657 SETUP_CHROMA_BLOCKCOPY_SP(8, 32, cpu); \
658 SETUP_CHROMA_BLOCKCOPY_SP(12, 16, cpu); \
659 SETUP_CHROMA_BLOCKCOPY_SP(16, 4, cpu); \
660 SETUP_CHROMA_BLOCKCOPY_SP(16, 8, cpu); \
661 SETUP_CHROMA_BLOCKCOPY_SP(16, 12, cpu); \
662 SETUP_CHROMA_BLOCKCOPY_SP(16, 16, cpu); \
663 SETUP_CHROMA_BLOCKCOPY_SP(16, 32, cpu); \
664 SETUP_CHROMA_BLOCKCOPY_SP(24, 32, cpu); \
665 SETUP_CHROMA_BLOCKCOPY_SP(32, 8, cpu); \
666 SETUP_CHROMA_BLOCKCOPY_SP(32, 16, cpu); \
667 SETUP_CHROMA_BLOCKCOPY_SP(32, 24, cpu); \
668 SETUP_CHROMA_BLOCKCOPY_SP(32, 32, cpu);
669
/* 4:2:2 counterpart of SETUP_CHROMA_BLOCKCOPY_SP: stores the same
 * x265_blockcopy_sp_<W>x<H><cpu> routine into the X265_CSP_I422 copy_sp table,
 * indexed by CHROMA422_WxH. Requires `p` in scope. */
670#define SETUP_CHROMA_BLOCKCOPY_SP_422(W, H, cpu) \
671 p.chroma[X265_CSP_I422].copy_sp[CHROMA422_ ## W ## x ## H] = x265_blockcopy_sp_ ## W ## x ## H ## cpu;
672
/* Applies SETUP_CHROMA_BLOCKCOPY_SP_422 to every 4:2:2 chroma block size
 * listed below (2x8 through 32x64), filling p.chroma[X265_CSP_I422].copy_sp. */
673#define CHROMA_BLOCKCOPY_SP_422(cpu) \
674 SETUP_CHROMA_BLOCKCOPY_SP_422(2, 8, cpu); \
675 SETUP_CHROMA_BLOCKCOPY_SP_422(2, 16, cpu); \
676 SETUP_CHROMA_BLOCKCOPY_SP_422(4, 4, cpu); \
677 SETUP_CHROMA_BLOCKCOPY_SP_422(4, 8, cpu); \
678 SETUP_CHROMA_BLOCKCOPY_SP_422(4, 16, cpu); \
679 SETUP_CHROMA_BLOCKCOPY_SP_422(4, 32, cpu); \
680 SETUP_CHROMA_BLOCKCOPY_SP_422(6, 16, cpu); \
681 SETUP_CHROMA_BLOCKCOPY_SP_422(8, 4, cpu); \
682 SETUP_CHROMA_BLOCKCOPY_SP_422(8, 8, cpu); \
683 SETUP_CHROMA_BLOCKCOPY_SP_422(8, 12, cpu); \
684 SETUP_CHROMA_BLOCKCOPY_SP_422(8, 16, cpu); \
685 SETUP_CHROMA_BLOCKCOPY_SP_422(8, 32, cpu); \
686 SETUP_CHROMA_BLOCKCOPY_SP_422(8, 64, cpu); \
687 SETUP_CHROMA_BLOCKCOPY_SP_422(12, 32, cpu); \
688 SETUP_CHROMA_BLOCKCOPY_SP_422(16, 8, cpu); \
689 SETUP_CHROMA_BLOCKCOPY_SP_422(16, 16, cpu); \
690 SETUP_CHROMA_BLOCKCOPY_SP_422(16, 24, cpu); \
691 SETUP_CHROMA_BLOCKCOPY_SP_422(16, 32, cpu); \
692 SETUP_CHROMA_BLOCKCOPY_SP_422(16, 64, cpu); \
693 SETUP_CHROMA_BLOCKCOPY_SP_422(24, 64, cpu); \
694 SETUP_CHROMA_BLOCKCOPY_SP_422(32, 16, cpu); \
695 SETUP_CHROMA_BLOCKCOPY_SP_422(32, 32, cpu); \
696 SETUP_CHROMA_BLOCKCOPY_SP_422(32, 48, cpu); \
697 SETUP_CHROMA_BLOCKCOPY_SP_422(32, 64, cpu);
698
/* Binds both the sub_ps and add_ps slots of the 4:2:0 chroma table for one
 * WxH block to x265_pixel_sub_ps_<W>x<H><cpu> and x265_pixel_add_ps_<W>x<H><cpu>.
 * Despite the macro name, it registers the add routine as well as the sub one.
 * Requires `p` in scope. */
699#define SETUP_CHROMA_PIXELSUB(W, H, cpu) \
700 p.chroma[X265_CSP_I420].sub_ps[CHROMA_ ## W ## x ## H] = x265_pixel_sub_ps_ ## W ## x ## H ## cpu; \
701 p.chroma[X265_CSP_I420].add_ps[CHROMA_ ## W ## x ## H] = x265_pixel_add_ps_ ## W ## x ## H ## cpu;
702
/* Applies SETUP_CHROMA_PIXELSUB to the four square 4:2:0 chroma sizes
 * 4x4, 8x8, 16x16 and 32x32; no non-square sizes are registered here. */
703#define CHROMA_PIXELSUB_PS(cpu) \
704 SETUP_CHROMA_PIXELSUB(4, 4, cpu); \
705 SETUP_CHROMA_PIXELSUB(8, 8, cpu); \
706 SETUP_CHROMA_PIXELSUB(16, 16, cpu); \
707 SETUP_CHROMA_PIXELSUB(32, 32, cpu);
708
/* 4:2:2 counterpart of SETUP_CHROMA_PIXELSUB: fills the sub_ps and add_ps
 * slots of the X265_CSP_I422 table (indexed by CHROMA422_WxH) with the
 * x265_pixel_sub_ps / x265_pixel_add_ps routines of the same WxH and cpu suffix.
 * Requires `p` in scope. */
709#define SETUP_CHROMA_PIXELSUB_422(W, H, cpu) \
710 p.chroma[X265_CSP_I422].sub_ps[CHROMA422_ ## W ## x ## H] = x265_pixel_sub_ps_ ## W ## x ## H ## cpu; \
711 p.chroma[X265_CSP_I422].add_ps[CHROMA422_ ## W ## x ## H] = x265_pixel_add_ps_ ## W ## x ## H ## cpu;
712
/* Applies SETUP_CHROMA_PIXELSUB_422 to the four 4:2:2 chroma sizes
 * 4x8, 8x16, 16x32 and 32x64 (each twice as tall as it is wide). */
713#define CHROMA_PIXELSUB_PS_422(cpu) \
714 SETUP_CHROMA_PIXELSUB_422(4, 8, cpu); \
715 SETUP_CHROMA_PIXELSUB_422(8, 16, cpu); \
716 SETUP_CHROMA_PIXELSUB_422(16, 32, cpu); \
717 SETUP_CHROMA_PIXELSUB_422(32, 64, cpu);
718
/* Expands SETUP_LUMA_FUNC_DEF(W, H, cpu) once for every luma partition size
 * listed below (4x4 through 64x64). `cpu` is forwarded unchanged; this file
 * invokes it as LUMA_FILTERS(_sse4). */
719#define LUMA_FILTERS(cpu) \
720 SETUP_LUMA_FUNC_DEF(4, 4, cpu); \
721 SETUP_LUMA_FUNC_DEF(8, 8, cpu); \
722 SETUP_LUMA_FUNC_DEF(8, 4, cpu); \
723 SETUP_LUMA_FUNC_DEF(4, 8, cpu); \
724 SETUP_LUMA_FUNC_DEF(16, 16, cpu); \
725 SETUP_LUMA_FUNC_DEF(16, 8, cpu); \
726 SETUP_LUMA_FUNC_DEF(8, 16, cpu); \
727 SETUP_LUMA_FUNC_DEF(16, 12, cpu); \
728 SETUP_LUMA_FUNC_DEF(12, 16, cpu); \
729 SETUP_LUMA_FUNC_DEF(16, 4, cpu); \
730 SETUP_LUMA_FUNC_DEF(4, 16, cpu); \
731 SETUP_LUMA_FUNC_DEF(32, 32, cpu); \
732 SETUP_LUMA_FUNC_DEF(32, 16, cpu); \
733 SETUP_LUMA_FUNC_DEF(16, 32, cpu); \
734 SETUP_LUMA_FUNC_DEF(32, 24, cpu); \
735 SETUP_LUMA_FUNC_DEF(24, 32, cpu); \
736 SETUP_LUMA_FUNC_DEF(32, 8, cpu); \
737 SETUP_LUMA_FUNC_DEF(8, 32, cpu); \
738 SETUP_LUMA_FUNC_DEF(64, 64, cpu); \
739 SETUP_LUMA_FUNC_DEF(64, 32, cpu); \
740 SETUP_LUMA_FUNC_DEF(32, 64, cpu); \
741 SETUP_LUMA_FUNC_DEF(64, 48, cpu); \
742 SETUP_LUMA_FUNC_DEF(48, 64, cpu); \
743 SETUP_LUMA_FUNC_DEF(64, 16, cpu); \
744 SETUP_LUMA_FUNC_DEF(16, 64, cpu);
745
/* Expands SETUP_LUMA_SUB_FUNC_DEF(W, H, cpu) for the five square luma sizes
 * 4x4, 8x8, 16x16, 32x32 and 64x64 only. */
746#define LUMA_PIXELSUB(cpu) \
747 SETUP_LUMA_SUB_FUNC_DEF(4, 4, cpu); \
748 SETUP_LUMA_SUB_FUNC_DEF(8, 8, cpu); \
749 SETUP_LUMA_SUB_FUNC_DEF(16, 16, cpu); \
750 SETUP_LUMA_SUB_FUNC_DEF(32, 32, cpu); \
751 SETUP_LUMA_SUB_FUNC_DEF(64, 64, cpu);
752
/* Expands SETUP_LUMA_SP_FUNC_DEF(W, H, cpu) once for every luma partition
 * size listed below (4x4 through 64x64). `cpu` is forwarded unchanged. */
753#define LUMA_SP_FILTERS(cpu) \
754 SETUP_LUMA_SP_FUNC_DEF(4, 4, cpu); \
755 SETUP_LUMA_SP_FUNC_DEF(8, 8, cpu); \
756 SETUP_LUMA_SP_FUNC_DEF(8, 4, cpu); \
757 SETUP_LUMA_SP_FUNC_DEF(4, 8, cpu); \
758 SETUP_LUMA_SP_FUNC_DEF(16, 16, cpu); \
759 SETUP_LUMA_SP_FUNC_DEF(16, 8, cpu); \
760 SETUP_LUMA_SP_FUNC_DEF(8, 16, cpu); \
761 SETUP_LUMA_SP_FUNC_DEF(16, 12, cpu); \
762 SETUP_LUMA_SP_FUNC_DEF(12, 16, cpu); \
763 SETUP_LUMA_SP_FUNC_DEF(16, 4, cpu); \
764 SETUP_LUMA_SP_FUNC_DEF(4, 16, cpu); \
765 SETUP_LUMA_SP_FUNC_DEF(32, 32, cpu); \
766 SETUP_LUMA_SP_FUNC_DEF(32, 16, cpu); \
767 SETUP_LUMA_SP_FUNC_DEF(16, 32, cpu); \
768 SETUP_LUMA_SP_FUNC_DEF(32, 24, cpu); \
769 SETUP_LUMA_SP_FUNC_DEF(24, 32, cpu); \
770 SETUP_LUMA_SP_FUNC_DEF(32, 8, cpu); \
771 SETUP_LUMA_SP_FUNC_DEF(8, 32, cpu); \
772 SETUP_LUMA_SP_FUNC_DEF(64, 64, cpu); \
773 SETUP_LUMA_SP_FUNC_DEF(64, 32, cpu); \
774 SETUP_LUMA_SP_FUNC_DEF(32, 64, cpu); \
775 SETUP_LUMA_SP_FUNC_DEF(64, 48, cpu); \
776 SETUP_LUMA_SP_FUNC_DEF(48, 64, cpu); \
777 SETUP_LUMA_SP_FUNC_DEF(64, 16, cpu); \
778 SETUP_LUMA_SP_FUNC_DEF(16, 64, cpu);
779
/* Expands SETUP_LUMA_SS_FUNC_DEF(W, H, cpu) once for every luma partition
 * size listed below (4x4 through 64x64). `cpu` is forwarded unchanged; this
 * file invokes it as LUMA_SS_FILTERS(_sse2). */
780#define LUMA_SS_FILTERS(cpu) \
781 SETUP_LUMA_SS_FUNC_DEF(4, 4, cpu); \
782 SETUP_LUMA_SS_FUNC_DEF(8, 8, cpu); \
783 SETUP_LUMA_SS_FUNC_DEF(8, 4, cpu); \
784 SETUP_LUMA_SS_FUNC_DEF(4, 8, cpu); \
785 SETUP_LUMA_SS_FUNC_DEF(16, 16, cpu); \
786 SETUP_LUMA_SS_FUNC_DEF(16, 8, cpu); \
787 SETUP_LUMA_SS_FUNC_DEF(8, 16, cpu); \
788 SETUP_LUMA_SS_FUNC_DEF(16, 12, cpu); \
789 SETUP_LUMA_SS_FUNC_DEF(12, 16, cpu); \
790 SETUP_LUMA_SS_FUNC_DEF(16, 4, cpu); \
791 SETUP_LUMA_SS_FUNC_DEF(4, 16, cpu); \
792 SETUP_LUMA_SS_FUNC_DEF(32, 32, cpu); \
793 SETUP_LUMA_SS_FUNC_DEF(32, 16, cpu); \
794 SETUP_LUMA_SS_FUNC_DEF(16, 32, cpu); \
795 SETUP_LUMA_SS_FUNC_DEF(32, 24, cpu); \
796 SETUP_LUMA_SS_FUNC_DEF(24, 32, cpu); \
797 SETUP_LUMA_SS_FUNC_DEF(32, 8, cpu); \
798 SETUP_LUMA_SS_FUNC_DEF(8, 32, cpu); \
799 SETUP_LUMA_SS_FUNC_DEF(64, 64, cpu); \
800 SETUP_LUMA_SS_FUNC_DEF(64, 32, cpu); \
801 SETUP_LUMA_SS_FUNC_DEF(32, 64, cpu); \
802 SETUP_LUMA_SS_FUNC_DEF(64, 48, cpu); \
803 SETUP_LUMA_SS_FUNC_DEF(48, 64, cpu); \
804 SETUP_LUMA_SS_FUNC_DEF(64, 16, cpu); \
805 SETUP_LUMA_SS_FUNC_DEF(16, 64, cpu);
806
/* Binds p.var[BLOCK_WxH] to x265_pixel_var_<W>x<H><cpu>. The table is indexed
 * by the BLOCK_ enumerators rather than LUMA_. Requires `p` in scope. */
807#define SETUP_PIXEL_VAR_DEF(W, H, cpu) \
808 p.var[BLOCK_ ## W ## x ## H] = x265_pixel_var_ ## W ## x ## H ## cpu;
809
/* Applies SETUP_PIXEL_VAR_DEF to the square sizes 8x8, 16x16, 32x32 and 64x64.
 * No 4x4 entry is registered here. Invoked in this file as LUMA_VAR(_sse2). */
810#define LUMA_VAR(cpu) \
811 SETUP_PIXEL_VAR_DEF(8, 8, cpu); \
812 SETUP_PIXEL_VAR_DEF(16, 16, cpu); \
813 SETUP_PIXEL_VAR_DEF(32, 32, cpu); \
814 SETUP_PIXEL_VAR_DEF(64, 64, cpu);
815
/* Binds p.sse_sp[LUMA_WxH] to x265_pixel_ssd_sp_<W>x<H><cpu>. Note the naming
 * mismatch: the table field is "sse_sp" while the routines are named "ssd_sp".
 * Requires `p` in scope. */
816#define SETUP_PIXEL_SSE_SP_DEF(W, H, cpu) \
817 p.sse_sp[LUMA_ ## W ## x ## H] = x265_pixel_ssd_sp_ ## W ## x ## H ## cpu;
818
/* Applies SETUP_PIXEL_SSE_SP_DEF to every luma partition size listed below
 * (4x4 through 64x64), filling p.sse_sp. */
819#define LUMA_SSE_SP(cpu) \
820 SETUP_PIXEL_SSE_SP_DEF(4, 4, cpu); \
821 SETUP_PIXEL_SSE_SP_DEF(8, 8, cpu); \
822 SETUP_PIXEL_SSE_SP_DEF(8, 4, cpu); \
823 SETUP_PIXEL_SSE_SP_DEF(4, 8, cpu); \
824 SETUP_PIXEL_SSE_SP_DEF(16, 16, cpu); \
825 SETUP_PIXEL_SSE_SP_DEF(16, 8, cpu); \
826 SETUP_PIXEL_SSE_SP_DEF(8, 16, cpu); \
827 SETUP_PIXEL_SSE_SP_DEF(16, 12, cpu); \
828 SETUP_PIXEL_SSE_SP_DEF(12, 16, cpu); \
829 SETUP_PIXEL_SSE_SP_DEF(16, 4, cpu); \
830 SETUP_PIXEL_SSE_SP_DEF(4, 16, cpu); \
831 SETUP_PIXEL_SSE_SP_DEF(32, 32, cpu); \
832 SETUP_PIXEL_SSE_SP_DEF(32, 16, cpu); \
833 SETUP_PIXEL_SSE_SP_DEF(16, 32, cpu); \
834 SETUP_PIXEL_SSE_SP_DEF(32, 24, cpu); \
835 SETUP_PIXEL_SSE_SP_DEF(24, 32, cpu); \
836 SETUP_PIXEL_SSE_SP_DEF(32, 8, cpu); \
837 SETUP_PIXEL_SSE_SP_DEF(8, 32, cpu); \
838 SETUP_PIXEL_SSE_SP_DEF(64, 64, cpu); \
839 SETUP_PIXEL_SSE_SP_DEF(64, 32, cpu); \
840 SETUP_PIXEL_SSE_SP_DEF(32, 64, cpu); \
841 SETUP_PIXEL_SSE_SP_DEF(64, 48, cpu); \
842 SETUP_PIXEL_SSE_SP_DEF(48, 64, cpu); \
843 SETUP_PIXEL_SSE_SP_DEF(64, 16, cpu); \
844 SETUP_PIXEL_SSE_SP_DEF(16, 64, cpu);
845
/* Binds p.luma_addAvg[LUMA_WxH] to x265_addAvg_<W>x<H><cpu>.
 * Requires a primitives table named `p` at the expansion site. */
846#define SETUP_LUMA_ADDAVG_FUNC_DEF(W, H, cpu) \
847 p.luma_addAvg[LUMA_ ## W ## x ## H] = x265_addAvg_ ## W ## x ## H ## cpu;
848
/* Applies SETUP_LUMA_ADDAVG_FUNC_DEF to every luma partition size listed below
 * (4x4 through 64x64), filling p.luma_addAvg with the x265_addAvg_WxH<cpu>
 * routines. Invoked in this file as LUMA_ADDAVG(_sse4).
 *
 * The final entry deliberately has no trailing backslash: a continuation there
 * would splice whatever line follows the macro into its replacement list. */
#define LUMA_ADDAVG(cpu) \
    SETUP_LUMA_ADDAVG_FUNC_DEF(4,  4,  cpu); \
    SETUP_LUMA_ADDAVG_FUNC_DEF(4,  8,  cpu); \
    SETUP_LUMA_ADDAVG_FUNC_DEF(4,  16, cpu); \
    SETUP_LUMA_ADDAVG_FUNC_DEF(8,  4,  cpu); \
    SETUP_LUMA_ADDAVG_FUNC_DEF(8,  8,  cpu); \
    SETUP_LUMA_ADDAVG_FUNC_DEF(8,  16, cpu); \
    SETUP_LUMA_ADDAVG_FUNC_DEF(8,  32, cpu); \
    SETUP_LUMA_ADDAVG_FUNC_DEF(12, 16, cpu); \
    SETUP_LUMA_ADDAVG_FUNC_DEF(16, 4,  cpu); \
    SETUP_LUMA_ADDAVG_FUNC_DEF(16, 8,  cpu); \
    SETUP_LUMA_ADDAVG_FUNC_DEF(16, 12, cpu); \
    SETUP_LUMA_ADDAVG_FUNC_DEF(16, 16, cpu); \
    SETUP_LUMA_ADDAVG_FUNC_DEF(16, 32, cpu); \
    SETUP_LUMA_ADDAVG_FUNC_DEF(24, 32, cpu); \
    SETUP_LUMA_ADDAVG_FUNC_DEF(16, 64, cpu); \
    SETUP_LUMA_ADDAVG_FUNC_DEF(32, 8,  cpu); \
    SETUP_LUMA_ADDAVG_FUNC_DEF(32, 16, cpu); \
    SETUP_LUMA_ADDAVG_FUNC_DEF(32, 24, cpu); \
    SETUP_LUMA_ADDAVG_FUNC_DEF(32, 32, cpu); \
    SETUP_LUMA_ADDAVG_FUNC_DEF(32, 64, cpu); \
    SETUP_LUMA_ADDAVG_FUNC_DEF(48, 64, cpu); \
    SETUP_LUMA_ADDAVG_FUNC_DEF(64, 16, cpu); \
    SETUP_LUMA_ADDAVG_FUNC_DEF(64, 32, cpu); \
    SETUP_LUMA_ADDAVG_FUNC_DEF(64, 48, cpu); \
    SETUP_LUMA_ADDAVG_FUNC_DEF(64, 64, cpu);

/* Binds the 4:2:0 chroma addAvg slot for one WxH block to
 * x265_addAvg_<W>x<H><cpu>, indexed by CHROMA_WxH. Requires `p` in scope. */
876#define SETUP_CHROMA_ADDAVG_FUNC_DEF(W, H, cpu) \
877 p.chroma[X265_CSP_I420].addAvg[CHROMA_ ## W ## x ## H] = x265_addAvg_ ## W ## x ## H ## cpu;
878
/* Applies SETUP_CHROMA_ADDAVG_FUNC_DEF to every 4:2:0 chroma block size listed
 * below (2x4 through 32x32). Invoked in this file as CHROMA_ADDAVG(_sse4). */
879#define CHROMA_ADDAVG(cpu) \
880 SETUP_CHROMA_ADDAVG_FUNC_DEF(2, 4, cpu); \
881 SETUP_CHROMA_ADDAVG_FUNC_DEF(2, 8, cpu); \
882 SETUP_CHROMA_ADDAVG_FUNC_DEF(4, 2, cpu); \
883 SETUP_CHROMA_ADDAVG_FUNC_DEF(4, 4, cpu); \
884 SETUP_CHROMA_ADDAVG_FUNC_DEF(4, 8, cpu); \
885 SETUP_CHROMA_ADDAVG_FUNC_DEF(4, 16, cpu); \
886 SETUP_CHROMA_ADDAVG_FUNC_DEF(6, 8, cpu); \
887 SETUP_CHROMA_ADDAVG_FUNC_DEF(8, 2, cpu); \
888 SETUP_CHROMA_ADDAVG_FUNC_DEF(8, 4, cpu); \
889 SETUP_CHROMA_ADDAVG_FUNC_DEF(8, 6, cpu); \
890 SETUP_CHROMA_ADDAVG_FUNC_DEF(8, 8, cpu); \
891 SETUP_CHROMA_ADDAVG_FUNC_DEF(8, 16, cpu); \
892 SETUP_CHROMA_ADDAVG_FUNC_DEF(8, 32, cpu); \
893 SETUP_CHROMA_ADDAVG_FUNC_DEF(12, 16, cpu); \
894 SETUP_CHROMA_ADDAVG_FUNC_DEF(16, 4, cpu); \
895 SETUP_CHROMA_ADDAVG_FUNC_DEF(16, 8, cpu); \
896 SETUP_CHROMA_ADDAVG_FUNC_DEF(16, 12, cpu); \
897 SETUP_CHROMA_ADDAVG_FUNC_DEF(16, 16, cpu); \
898 SETUP_CHROMA_ADDAVG_FUNC_DEF(16, 32, cpu); \
899 SETUP_CHROMA_ADDAVG_FUNC_DEF(24, 32, cpu); \
900 SETUP_CHROMA_ADDAVG_FUNC_DEF(32, 8, cpu); \
901 SETUP_CHROMA_ADDAVG_FUNC_DEF(32, 16, cpu); \
902 SETUP_CHROMA_ADDAVG_FUNC_DEF(32, 24, cpu); \
903 SETUP_CHROMA_ADDAVG_FUNC_DEF(32, 32, cpu);
904
/* 4:2:2 counterpart of SETUP_CHROMA_ADDAVG_FUNC_DEF: stores
 * x265_addAvg_<W>x<H><cpu> into the X265_CSP_I422 addAvg table, indexed by
 * CHROMA422_WxH. Requires `p` in scope. */
905#define SETUP_CHROMA_ADDAVG_FUNC_DEF_422(W, H, cpu) \
906 p.chroma[X265_CSP_I422].addAvg[CHROMA422_ ## W ## x ## H] = x265_addAvg_ ## W ## x ## H ## cpu;
907
/* Applies SETUP_CHROMA_ADDAVG_FUNC_DEF_422 to every 4:2:2 chroma block size
 * listed below (2x8 through 32x64). Invoked in this file as
 * CHROMA_ADDAVG_422(_sse4). */
908#define CHROMA_ADDAVG_422(cpu) \
909 SETUP_CHROMA_ADDAVG_FUNC_DEF_422(2, 8, cpu); \
910 SETUP_CHROMA_ADDAVG_FUNC_DEF_422(2, 16, cpu); \
911 SETUP_CHROMA_ADDAVG_FUNC_DEF_422(4, 4, cpu); \
912 SETUP_CHROMA_ADDAVG_FUNC_DEF_422(4, 8, cpu); \
913 SETUP_CHROMA_ADDAVG_FUNC_DEF_422(4, 16, cpu); \
914 SETUP_CHROMA_ADDAVG_FUNC_DEF_422(4, 32, cpu); \
915 SETUP_CHROMA_ADDAVG_FUNC_DEF_422(6, 16, cpu); \
916 SETUP_CHROMA_ADDAVG_FUNC_DEF_422(8, 4, cpu); \
917 SETUP_CHROMA_ADDAVG_FUNC_DEF_422(8, 8, cpu); \
918 SETUP_CHROMA_ADDAVG_FUNC_DEF_422(8, 12, cpu); \
919 SETUP_CHROMA_ADDAVG_FUNC_DEF_422(8, 16, cpu); \
920 SETUP_CHROMA_ADDAVG_FUNC_DEF_422(8, 32, cpu); \
921 SETUP_CHROMA_ADDAVG_FUNC_DEF_422(8, 64, cpu); \
922 SETUP_CHROMA_ADDAVG_FUNC_DEF_422(12, 32, cpu); \
923 SETUP_CHROMA_ADDAVG_FUNC_DEF_422(16, 8, cpu); \
924 SETUP_CHROMA_ADDAVG_FUNC_DEF_422(16, 16, cpu); \
925 SETUP_CHROMA_ADDAVG_FUNC_DEF_422(16, 24, cpu); \
926 SETUP_CHROMA_ADDAVG_FUNC_DEF_422(16, 32, cpu); \
927 SETUP_CHROMA_ADDAVG_FUNC_DEF_422(16, 64, cpu); \
928 SETUP_CHROMA_ADDAVG_FUNC_DEF_422(24, 64, cpu); \
929 SETUP_CHROMA_ADDAVG_FUNC_DEF_422(32, 16, cpu); \
930 SETUP_CHROMA_ADDAVG_FUNC_DEF_422(32, 32, cpu); \
931 SETUP_CHROMA_ADDAVG_FUNC_DEF_422(32, 48, cpu); \
932 SETUP_CHROMA_ADDAVG_FUNC_DEF_422(32, 64, cpu);
933
/* Binds intra prediction mode `mode` at all four block sizes (4x4, 8x8, 16x16,
 * 32x32) to x265_intra_pred_ang{4,8,16,32}_<fno>_<cpu>.
 * `fno` is the routine number, which need not equal `mode`.
 * Unlike most macros in this file, the underscore before `cpu` is pasted here,
 * so callers pass the bare name (e.g. ssse3, not _ssse3).
 * Requires `p` in scope. */
934#define SETUP_INTRA_ANG_COMMON(mode, fno, cpu) \
935 p.intra_pred[mode][BLOCK_4x4] = x265_intra_pred_ang4_ ## fno ## _ ## cpu; \
936 p.intra_pred[mode][BLOCK_8x8] = x265_intra_pred_ang8_ ## fno ## _ ## cpu; \
937 p.intra_pred[mode][BLOCK_16x16] = x265_intra_pred_ang16_ ## fno ## _ ## cpu; \
938 p.intra_pred[mode][BLOCK_32x32] = x265_intra_pred_ang32_ ## fno ## _ ## cpu;
939
/* Like SETUP_INTRA_ANG_COMMON but without the 4x4 entry: binds `mode` for
 * 8x8, 16x16 and 32x32 to x265_intra_pred_ang{8,16,32}_<fno>_<cpu>.
 * `cpu` is passed without a leading underscore. Requires `p` in scope. */
940#define SETUP_INTRA_ANG(mode, fno, cpu) \
941 p.intra_pred[mode][BLOCK_8x8] = x265_intra_pred_ang8_ ## fno ## _ ## cpu; \
942 p.intra_pred[mode][BLOCK_16x16] = x265_intra_pred_ang16_ ## fno ## _ ## cpu; \
943 p.intra_pred[mode][BLOCK_32x32] = x265_intra_pred_ang32_ ## fno ## _ ## cpu;
944
/* Binds only the 4x4 entry of `mode` to x265_intra_pred_ang4_<fno>_<cpu>.
 * `cpu` is passed without a leading underscore. Requires `p` in scope. */
945#define SETUP_INTRA_ANG4(mode, fno, cpu) \
946 p.intra_pred[mode][BLOCK_4x4] = x265_intra_pred_ang4_ ## fno ## _ ## cpu;
947
/* Binds the 16x16 and 32x32 entries of `mode` to
 * x265_intra_pred_ang{16,32}_<fno>_<cpu>.
 * `cpu` is passed without a leading underscore. Requires `p` in scope. */
948#define SETUP_INTRA_ANG16_32(mode, fno, cpu) \
949 p.intra_pred[mode][BLOCK_16x16] = x265_intra_pred_ang16_ ## fno ## _ ## cpu; \
950 p.intra_pred[mode][BLOCK_32x32] = x265_intra_pred_ang32_ ## fno ## _ ## cpu;
951
/* Binds the 4x4 and 8x8 entries of `mode` to
 * x265_intra_pred_ang{4,8}_<fno>_<cpu>.
 * `cpu` is passed without a leading underscore. Requires `p` in scope. */
952#define SETUP_INTRA_ANG4_8(mode, fno, cpu) \
953 p.intra_pred[mode][BLOCK_4x4] = x265_intra_pred_ang4_ ## fno ## _ ## cpu; \
954 p.intra_pred[mode][BLOCK_8x8] = x265_intra_pred_ang8_ ## fno ## _ ## cpu;
955
/* Registers modes 2 and 34 at all four block sizes. Both modes are bound to
 * the same routines (function number 2). Invoked in this file as
 * INTRA_ANG_SSSE3(ssse3), i.e. with no leading underscore on the cpu name. */
956#define INTRA_ANG_SSSE3(cpu) \
957 SETUP_INTRA_ANG_COMMON(2, 2, cpu); \
958 SETUP_INTRA_ANG_COMMON(34, 2, cpu);
959
/* Registers modes 3 through 18 at all four block sizes, each mode bound to the
 * routine with the same number (fno == mode). */
960#define INTRA_ANG_SSE4_COMMON(cpu) \
961 SETUP_INTRA_ANG_COMMON(3, 3, cpu); \
962 SETUP_INTRA_ANG_COMMON(4, 4, cpu); \
963 SETUP_INTRA_ANG_COMMON(5, 5, cpu); \
964 SETUP_INTRA_ANG_COMMON(6, 6, cpu); \
965 SETUP_INTRA_ANG_COMMON(7, 7, cpu); \
966 SETUP_INTRA_ANG_COMMON(8, 8, cpu); \
967 SETUP_INTRA_ANG_COMMON(9, 9, cpu); \
968 SETUP_INTRA_ANG_COMMON(10, 10, cpu); \
969 SETUP_INTRA_ANG_COMMON(11, 11, cpu); \
970 SETUP_INTRA_ANG_COMMON(12, 12, cpu); \
971 SETUP_INTRA_ANG_COMMON(13, 13, cpu); \
972 SETUP_INTRA_ANG_COMMON(14, 14, cpu); \
973 SETUP_INTRA_ANG_COMMON(15, 15, cpu); \
974 SETUP_INTRA_ANG_COMMON(16, 16, cpu); \
975 SETUP_INTRA_ANG_COMMON(17, 17, cpu); \
976 SETUP_INTRA_ANG_COMMON(18, 18, cpu);
977
/* Registers modes 19 through 33 in two passes:
 *  - 8x8, 16x16 and 32x32 use the routine numbered like the mode itself;
 *  - 4x4 reuses the routine numbered (36 - mode), i.e. 19->17 ... 33->3,
 *    except mode 26, which has its own ang4_26 routine.
 * The two passes write disjoint table entries. */
978#define INTRA_ANG_SSE4_HIGH(cpu) \
979 SETUP_INTRA_ANG(19, 19, cpu); \
980 SETUP_INTRA_ANG(20, 20, cpu); \
981 SETUP_INTRA_ANG(21, 21, cpu); \
982 SETUP_INTRA_ANG(22, 22, cpu); \
983 SETUP_INTRA_ANG(23, 23, cpu); \
984 SETUP_INTRA_ANG(24, 24, cpu); \
985 SETUP_INTRA_ANG(25, 25, cpu); \
986 SETUP_INTRA_ANG(26, 26, cpu); \
987 SETUP_INTRA_ANG(27, 27, cpu); \
988 SETUP_INTRA_ANG(28, 28, cpu); \
989 SETUP_INTRA_ANG(29, 29, cpu); \
990 SETUP_INTRA_ANG(30, 30, cpu); \
991 SETUP_INTRA_ANG(31, 31, cpu); \
992 SETUP_INTRA_ANG(32, 32, cpu); \
993 SETUP_INTRA_ANG(33, 33, cpu); \
994 SETUP_INTRA_ANG4(19, 17, cpu); \
995 SETUP_INTRA_ANG4(20, 16, cpu); \
996 SETUP_INTRA_ANG4(21, 15, cpu); \
997 SETUP_INTRA_ANG4(22, 14, cpu); \
998 SETUP_INTRA_ANG4(23, 13, cpu); \
999 SETUP_INTRA_ANG4(24, 12, cpu); \
1000 SETUP_INTRA_ANG4(25, 11, cpu); \
1001 SETUP_INTRA_ANG4(26, 26, cpu); \
1002 SETUP_INTRA_ANG4(27, 9, cpu); \
1003 SETUP_INTRA_ANG4(28, 8, cpu); \
1004 SETUP_INTRA_ANG4(29, 7, cpu); \
1005 SETUP_INTRA_ANG4(30, 6, cpu); \
1006 SETUP_INTRA_ANG4(31, 5, cpu); \
1007 SETUP_INTRA_ANG4(32, 4, cpu); \
1008 SETUP_INTRA_ANG4(33, 3, cpu);
1009
/* Registers modes 19 through 33 in two passes:
 *  - 4x4 and 8x8 reuse the routine numbered (36 - mode), i.e. 19->17 ... 33->3,
 *    except mode 26, which uses its own ang4_26 / ang8_26 routines;
 *  - 16x16 and 32x32 use the routine numbered like the mode itself.
 * Differs from INTRA_ANG_SSE4_HIGH in that 8x8 follows the 4x4 mapping here. */
1010#define INTRA_ANG_SSE4(cpu) \
1011 SETUP_INTRA_ANG4_8(19, 17, cpu); \
1012 SETUP_INTRA_ANG4_8(20, 16, cpu); \
1013 SETUP_INTRA_ANG4_8(21, 15, cpu); \
1014 SETUP_INTRA_ANG4_8(22, 14, cpu); \
1015 SETUP_INTRA_ANG4_8(23, 13, cpu); \
1016 SETUP_INTRA_ANG4_8(24, 12, cpu); \
1017 SETUP_INTRA_ANG4_8(25, 11, cpu); \
1018 SETUP_INTRA_ANG4_8(26, 26, cpu); \
1019 SETUP_INTRA_ANG4_8(27, 9, cpu); \
1020 SETUP_INTRA_ANG4_8(28, 8, cpu); \
1021 SETUP_INTRA_ANG4_8(29, 7, cpu); \
1022 SETUP_INTRA_ANG4_8(30, 6, cpu); \
1023 SETUP_INTRA_ANG4_8(31, 5, cpu); \
1024 SETUP_INTRA_ANG4_8(32, 4, cpu); \
1025 SETUP_INTRA_ANG4_8(33, 3, cpu); \
1026 SETUP_INTRA_ANG16_32(19, 19, cpu); \
1027 SETUP_INTRA_ANG16_32(20, 20, cpu); \
1028 SETUP_INTRA_ANG16_32(21, 21, cpu); \
1029 SETUP_INTRA_ANG16_32(22, 22, cpu); \
1030 SETUP_INTRA_ANG16_32(23, 23, cpu); \
1031 SETUP_INTRA_ANG16_32(24, 24, cpu); \
1032 SETUP_INTRA_ANG16_32(25, 25, cpu); \
1033 SETUP_INTRA_ANG16_32(26, 26, cpu); \
1034 SETUP_INTRA_ANG16_32(27, 27, cpu); \
1035 SETUP_INTRA_ANG16_32(28, 28, cpu); \
1036 SETUP_INTRA_ANG16_32(29, 29, cpu); \
1037 SETUP_INTRA_ANG16_32(30, 30, cpu); \
1038 SETUP_INTRA_ANG16_32(31, 31, cpu); \
1039 SETUP_INTRA_ANG16_32(32, 32, cpu); \
1040 SETUP_INTRA_ANG16_32(33, 33, cpu);
1041
/* Binds the four vertical 4-tap filter slots (filter_vss, filter_vpp,
 * filter_vps, filter_vsp) of the 4:2:0 chroma table for one WxH block to the
 * matching x265_interp_4tap_vert_{ss,pp,ps,sp}_<W>x<H><cpu> routines.
 * Requires `p` in scope. */
1042#define SETUP_CHROMA_VERT_FUNC_DEF(W, H, cpu) \
1043 p.chroma[X265_CSP_I420].filter_vss[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu; \
1044 p.chroma[X265_CSP_I420].filter_vpp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \
1045 p.chroma[X265_CSP_I420].filter_vps[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu; \
1046 p.chroma[X265_CSP_I420].filter_vsp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
1047
/* Applies SETUP_CHROMA_VERT_FUNC_DEF to the 4:2:0 chroma sizes listed below.
 * The sizes 2x4, 2x8, 4x2 and 6x8 are not in this list; they are registered
 * separately by CHROMA_VERT_FILTERS_SSE4. Invoked in this file as
 * CHROMA_VERT_FILTERS(_sse2). */
1048#define CHROMA_VERT_FILTERS(cpu) \
1049 SETUP_CHROMA_VERT_FUNC_DEF(4, 4, cpu); \
1050 SETUP_CHROMA_VERT_FUNC_DEF(8, 8, cpu); \
1051 SETUP_CHROMA_VERT_FUNC_DEF(8, 4, cpu); \
1052 SETUP_CHROMA_VERT_FUNC_DEF(4, 8, cpu); \
1053 SETUP_CHROMA_VERT_FUNC_DEF(8, 6, cpu); \
1054 SETUP_CHROMA_VERT_FUNC_DEF(8, 2, cpu); \
1055 SETUP_CHROMA_VERT_FUNC_DEF(16, 16, cpu); \
1056 SETUP_CHROMA_VERT_FUNC_DEF(16, 8, cpu); \
1057 SETUP_CHROMA_VERT_FUNC_DEF(8, 16, cpu); \
1058 SETUP_CHROMA_VERT_FUNC_DEF(16, 12, cpu); \
1059 SETUP_CHROMA_VERT_FUNC_DEF(12, 16, cpu); \
1060 SETUP_CHROMA_VERT_FUNC_DEF(16, 4, cpu); \
1061 SETUP_CHROMA_VERT_FUNC_DEF(4, 16, cpu); \
1062 SETUP_CHROMA_VERT_FUNC_DEF(32, 32, cpu); \
1063 SETUP_CHROMA_VERT_FUNC_DEF(32, 16, cpu); \
1064 SETUP_CHROMA_VERT_FUNC_DEF(16, 32, cpu); \
1065 SETUP_CHROMA_VERT_FUNC_DEF(32, 24, cpu); \
1066 SETUP_CHROMA_VERT_FUNC_DEF(24, 32, cpu); \
1067 SETUP_CHROMA_VERT_FUNC_DEF(32, 8, cpu); \
1068 SETUP_CHROMA_VERT_FUNC_DEF(8, 32, cpu);
1069
/* Registers the vertical 4:2:0 chroma filters for the four narrow sizes that
 * CHROMA_VERT_FILTERS does not list: 2x4, 2x8, 4x2 and 6x8. Invoked in this
 * file as CHROMA_VERT_FILTERS_SSE4(_sse4). */
1070#define CHROMA_VERT_FILTERS_SSE4(cpu) \
1071 SETUP_CHROMA_VERT_FUNC_DEF(2, 4, cpu); \
1072 SETUP_CHROMA_VERT_FUNC_DEF(2, 8, cpu); \
1073 SETUP_CHROMA_VERT_FUNC_DEF(4, 2, cpu); \
1074 SETUP_CHROMA_VERT_FUNC_DEF(6, 8, cpu);
1075
/* 4:2:2 counterpart of SETUP_CHROMA_VERT_FUNC_DEF: fills filter_vss,
 * filter_vpp, filter_vps and filter_vsp of the X265_CSP_I422 table (indexed by
 * CHROMA422_WxH) with x265_interp_4tap_vert_{ss,pp,ps,sp}_<W>x<H><cpu>.
 * Requires `p` in scope. */
1076#define SETUP_CHROMA_VERT_FUNC_DEF_422(W, H, cpu) \
1077 p.chroma[X265_CSP_I422].filter_vss[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu; \
1078 p.chroma[X265_CSP_I422].filter_vpp[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \
1079 p.chroma[X265_CSP_I422].filter_vps[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu; \
1080 p.chroma[X265_CSP_I422].filter_vsp[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
1081
/* Applies SETUP_CHROMA_VERT_FUNC_DEF_422 to the 4:2:2 chroma sizes listed
 * below. The sizes 2x8, 2x16, 4x4 and 6x16 are not in this list; they are
 * registered separately by CHROMA_VERT_FILTERS_SSE4_422. Invoked in this file
 * as CHROMA_VERT_FILTERS_422(_sse2). */
1082#define CHROMA_VERT_FILTERS_422(cpu) \
1083 SETUP_CHROMA_VERT_FUNC_DEF_422(4, 8, cpu); \
1084 SETUP_CHROMA_VERT_FUNC_DEF_422(8, 16, cpu); \
1085 SETUP_CHROMA_VERT_FUNC_DEF_422(8, 8, cpu); \
1086 SETUP_CHROMA_VERT_FUNC_DEF_422(4, 16, cpu); \
1087 SETUP_CHROMA_VERT_FUNC_DEF_422(8, 12, cpu); \
1088 SETUP_CHROMA_VERT_FUNC_DEF_422(8, 4, cpu); \
1089 SETUP_CHROMA_VERT_FUNC_DEF_422(16, 32, cpu); \
1090 SETUP_CHROMA_VERT_FUNC_DEF_422(16, 16, cpu); \
1091 SETUP_CHROMA_VERT_FUNC_DEF_422(8, 32, cpu); \
1092 SETUP_CHROMA_VERT_FUNC_DEF_422(16, 24, cpu); \
1093 SETUP_CHROMA_VERT_FUNC_DEF_422(12, 32, cpu); \
1094 SETUP_CHROMA_VERT_FUNC_DEF_422(16, 8, cpu); \
1095 SETUP_CHROMA_VERT_FUNC_DEF_422(4, 32, cpu); \
1096 SETUP_CHROMA_VERT_FUNC_DEF_422(32, 64, cpu); \
1097 SETUP_CHROMA_VERT_FUNC_DEF_422(32, 32, cpu); \
1098 SETUP_CHROMA_VERT_FUNC_DEF_422(16, 64, cpu); \
1099 SETUP_CHROMA_VERT_FUNC_DEF_422(32, 48, cpu); \
1100 SETUP_CHROMA_VERT_FUNC_DEF_422(24, 64, cpu); \
1101 SETUP_CHROMA_VERT_FUNC_DEF_422(32, 16, cpu); \
1102 SETUP_CHROMA_VERT_FUNC_DEF_422(8, 64, cpu);
1103
/* Registers the vertical 4:2:2 chroma filters for the four narrow sizes that
 * CHROMA_VERT_FILTERS_422 does not list: 2x8, 2x16, 4x4 and 6x16. */
1104#define CHROMA_VERT_FILTERS_SSE4_422(cpu) \
1105 SETUP_CHROMA_VERT_FUNC_DEF_422(2, 8, cpu); \
1106 SETUP_CHROMA_VERT_FUNC_DEF_422(2, 16, cpu); \
1107 SETUP_CHROMA_VERT_FUNC_DEF_422(4, 4, cpu); \
1108 SETUP_CHROMA_VERT_FUNC_DEF_422(6, 16, cpu);
1109
/* 4:4:4 counterpart of SETUP_CHROMA_VERT_FUNC_DEF: fills filter_vss,
 * filter_vpp, filter_vps and filter_vsp of the X265_CSP_I444 table with
 * x265_interp_4tap_vert_{ss,pp,ps,sp}_<W>x<H><cpu>. The 4:4:4 table is indexed
 * by the LUMA_WxH enumerators, not a CHROMA_ enum. Requires `p` in scope. */
1110#define SETUP_CHROMA_VERT_FUNC_DEF_444(W, H, cpu) \
1111 p.chroma[X265_CSP_I444].filter_vss[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu; \
1112 p.chroma[X265_CSP_I444].filter_vpp[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \
1113 p.chroma[X265_CSP_I444].filter_vps[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu; \
1114 p.chroma[X265_CSP_I444].filter_vsp[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
1115
/* Applies SETUP_CHROMA_VERT_FUNC_DEF_444 to the sizes listed below (8x8
 * through 64x64). Note there is no 4x4 entry in this list. Invoked in this
 * file as CHROMA_VERT_FILTERS_444(_sse2). */
1116#define CHROMA_VERT_FILTERS_444(cpu) \
1117 SETUP_CHROMA_VERT_FUNC_DEF_444(8, 8, cpu); \
1118 SETUP_CHROMA_VERT_FUNC_DEF_444(8, 4, cpu); \
1119 SETUP_CHROMA_VERT_FUNC_DEF_444(4, 8, cpu); \
1120 SETUP_CHROMA_VERT_FUNC_DEF_444(16, 16, cpu); \
1121 SETUP_CHROMA_VERT_FUNC_DEF_444(16, 8, cpu); \
1122 SETUP_CHROMA_VERT_FUNC_DEF_444(8, 16, cpu); \
1123 SETUP_CHROMA_VERT_FUNC_DEF_444(16, 12, cpu); \
1124 SETUP_CHROMA_VERT_FUNC_DEF_444(12, 16, cpu); \
1125 SETUP_CHROMA_VERT_FUNC_DEF_444(16, 4, cpu); \
1126 SETUP_CHROMA_VERT_FUNC_DEF_444(4, 16, cpu); \
1127 SETUP_CHROMA_VERT_FUNC_DEF_444(32, 32, cpu); \
1128 SETUP_CHROMA_VERT_FUNC_DEF_444(32, 16, cpu); \
1129 SETUP_CHROMA_VERT_FUNC_DEF_444(16, 32, cpu); \
1130 SETUP_CHROMA_VERT_FUNC_DEF_444(32, 24, cpu); \
1131 SETUP_CHROMA_VERT_FUNC_DEF_444(24, 32, cpu); \
1132 SETUP_CHROMA_VERT_FUNC_DEF_444(32, 8, cpu); \
1133 SETUP_CHROMA_VERT_FUNC_DEF_444(8, 32, cpu); \
1134 SETUP_CHROMA_VERT_FUNC_DEF_444(64, 64, cpu); \
1135 SETUP_CHROMA_VERT_FUNC_DEF_444(64, 32, cpu); \
1136 SETUP_CHROMA_VERT_FUNC_DEF_444(32, 64, cpu); \
1137 SETUP_CHROMA_VERT_FUNC_DEF_444(64, 48, cpu); \
1138 SETUP_CHROMA_VERT_FUNC_DEF_444(48, 64, cpu); \
1139 SETUP_CHROMA_VERT_FUNC_DEF_444(64, 16, cpu); \
1140 SETUP_CHROMA_VERT_FUNC_DEF_444(16, 64, cpu);
1141
/* Binds the two horizontal 4-tap filter slots (filter_hpp, filter_hps) of the
 * 4:2:0 chroma table for one WxH block to
 * x265_interp_4tap_horiz_{pp,ps}_<W>x<H><cpu>. Requires `p` in scope. */
1142#define SETUP_CHROMA_HORIZ_FUNC_DEF(W, H, cpu) \
1143 p.chroma[X265_CSP_I420].filter_hpp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \
1144 p.chroma[X265_CSP_I420].filter_hps[CHROMA_ ## W ## x ## H] = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu;
1145
/* Applies SETUP_CHROMA_HORIZ_FUNC_DEF to every 4:2:0 chroma size listed below.
 * Unlike CHROMA_VERT_FILTERS, this list includes the narrow 2x4, 2x8, 4x2 and
 * 6x8 sizes. Invoked in this file as CHROMA_HORIZ_FILTERS(_sse4). */
1146#define CHROMA_HORIZ_FILTERS(cpu) \
1147 SETUP_CHROMA_HORIZ_FUNC_DEF(4, 4, cpu); \
1148 SETUP_CHROMA_HORIZ_FUNC_DEF(4, 2, cpu); \
1149 SETUP_CHROMA_HORIZ_FUNC_DEF(2, 4, cpu); \
1150 SETUP_CHROMA_HORIZ_FUNC_DEF(8, 8, cpu); \
1151 SETUP_CHROMA_HORIZ_FUNC_DEF(8, 4, cpu); \
1152 SETUP_CHROMA_HORIZ_FUNC_DEF(4, 8, cpu); \
1153 SETUP_CHROMA_HORIZ_FUNC_DEF(8, 6, cpu); \
1154 SETUP_CHROMA_HORIZ_FUNC_DEF(6, 8, cpu); \
1155 SETUP_CHROMA_HORIZ_FUNC_DEF(8, 2, cpu); \
1156 SETUP_CHROMA_HORIZ_FUNC_DEF(2, 8, cpu); \
1157 SETUP_CHROMA_HORIZ_FUNC_DEF(16, 16, cpu); \
1158 SETUP_CHROMA_HORIZ_FUNC_DEF(16, 8, cpu); \
1159 SETUP_CHROMA_HORIZ_FUNC_DEF(8, 16, cpu); \
1160 SETUP_CHROMA_HORIZ_FUNC_DEF(16, 12, cpu); \
1161 SETUP_CHROMA_HORIZ_FUNC_DEF(12, 16, cpu); \
1162 SETUP_CHROMA_HORIZ_FUNC_DEF(16, 4, cpu); \
1163 SETUP_CHROMA_HORIZ_FUNC_DEF(4, 16, cpu); \
1164 SETUP_CHROMA_HORIZ_FUNC_DEF(32, 32, cpu); \
1165 SETUP_CHROMA_HORIZ_FUNC_DEF(32, 16, cpu); \
1166 SETUP_CHROMA_HORIZ_FUNC_DEF(16, 32, cpu); \
1167 SETUP_CHROMA_HORIZ_FUNC_DEF(32, 24, cpu); \
1168 SETUP_CHROMA_HORIZ_FUNC_DEF(24, 32, cpu); \
1169 SETUP_CHROMA_HORIZ_FUNC_DEF(32, 8, cpu); \
1170 SETUP_CHROMA_HORIZ_FUNC_DEF(8, 32, cpu);
1171
/* 4:2:2 counterpart of SETUP_CHROMA_HORIZ_FUNC_DEF: fills filter_hpp and
 * filter_hps of the X265_CSP_I422 table (indexed by CHROMA422_WxH) with
 * x265_interp_4tap_horiz_{pp,ps}_<W>x<H><cpu>. Requires `p` in scope. */
1172#define SETUP_CHROMA_HORIZ_FUNC_DEF_422(W, H, cpu) \
1173 p.chroma[X265_CSP_I422].filter_hpp[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \
1174 p.chroma[X265_CSP_I422].filter_hps[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu;
1175
/* Applies SETUP_CHROMA_HORIZ_FUNC_DEF_422 to every 4:2:2 chroma size listed
 * below, including the narrow 2x8, 2x16, 4x4 and 6x16 sizes. */
1176#define CHROMA_HORIZ_FILTERS_422(cpu) \
1177 SETUP_CHROMA_HORIZ_FUNC_DEF_422(4, 8, cpu); \
1178 SETUP_CHROMA_HORIZ_FUNC_DEF_422(4, 4, cpu); \
1179 SETUP_CHROMA_HORIZ_FUNC_DEF_422(2, 8, cpu); \
1180 SETUP_CHROMA_HORIZ_FUNC_DEF_422(8, 16, cpu); \
1181 SETUP_CHROMA_HORIZ_FUNC_DEF_422(8, 8, cpu); \
1182 SETUP_CHROMA_HORIZ_FUNC_DEF_422(4, 16, cpu); \
1183 SETUP_CHROMA_HORIZ_FUNC_DEF_422(8, 12, cpu); \
1184 SETUP_CHROMA_HORIZ_FUNC_DEF_422(6, 16, cpu); \
1185 SETUP_CHROMA_HORIZ_FUNC_DEF_422(8, 4, cpu); \
1186 SETUP_CHROMA_HORIZ_FUNC_DEF_422(2, 16, cpu); \
1187 SETUP_CHROMA_HORIZ_FUNC_DEF_422(16, 32, cpu); \
1188 SETUP_CHROMA_HORIZ_FUNC_DEF_422(16, 16, cpu); \
1189 SETUP_CHROMA_HORIZ_FUNC_DEF_422(8, 32, cpu); \
1190 SETUP_CHROMA_HORIZ_FUNC_DEF_422(16, 24, cpu); \
1191 SETUP_CHROMA_HORIZ_FUNC_DEF_422(12, 32, cpu); \
1192 SETUP_CHROMA_HORIZ_FUNC_DEF_422(16, 8, cpu); \
1193 SETUP_CHROMA_HORIZ_FUNC_DEF_422(4, 32, cpu); \
1194 SETUP_CHROMA_HORIZ_FUNC_DEF_422(32, 64, cpu); \
1195 SETUP_CHROMA_HORIZ_FUNC_DEF_422(32, 32, cpu); \
1196 SETUP_CHROMA_HORIZ_FUNC_DEF_422(16, 64, cpu); \
1197 SETUP_CHROMA_HORIZ_FUNC_DEF_422(32, 48, cpu); \
1198 SETUP_CHROMA_HORIZ_FUNC_DEF_422(24, 64, cpu); \
1199 SETUP_CHROMA_HORIZ_FUNC_DEF_422(32, 16, cpu); \
1200 SETUP_CHROMA_HORIZ_FUNC_DEF_422(8, 64, cpu);
1201
/* 4:4:4 counterpart of SETUP_CHROMA_HORIZ_FUNC_DEF: fills filter_hpp and
 * filter_hps of the X265_CSP_I444 table with
 * x265_interp_4tap_horiz_{pp,ps}_<W>x<H><cpu>. The 4:4:4 table is indexed by
 * the LUMA_WxH enumerators. Requires `p` in scope. */
1202#define SETUP_CHROMA_HORIZ_FUNC_DEF_444(W, H, cpu) \
1203 p.chroma[X265_CSP_I444].filter_hpp[LUMA_ ## W ## x ## H] = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \
1204 p.chroma[X265_CSP_I444].filter_hps[LUMA_ ## W ## x ## H] = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu;
1205
/* Applies SETUP_CHROMA_HORIZ_FUNC_DEF_444 to the sizes listed below (8x8
 * through 64x64). Note there is no 4x4 entry in this list. */
1206#define CHROMA_HORIZ_FILTERS_444(cpu) \
1207 SETUP_CHROMA_HORIZ_FUNC_DEF_444(8, 8, cpu); \
1208 SETUP_CHROMA_HORIZ_FUNC_DEF_444(8, 4, cpu); \
1209 SETUP_CHROMA_HORIZ_FUNC_DEF_444(4, 8, cpu); \
1210 SETUP_CHROMA_HORIZ_FUNC_DEF_444(16, 16, cpu); \
1211 SETUP_CHROMA_HORIZ_FUNC_DEF_444(16, 8, cpu); \
1212 SETUP_CHROMA_HORIZ_FUNC_DEF_444(8, 16, cpu); \
1213 SETUP_CHROMA_HORIZ_FUNC_DEF_444(16, 12, cpu); \
1214 SETUP_CHROMA_HORIZ_FUNC_DEF_444(12, 16, cpu); \
1215 SETUP_CHROMA_HORIZ_FUNC_DEF_444(16, 4, cpu); \
1216 SETUP_CHROMA_HORIZ_FUNC_DEF_444(4, 16, cpu); \
1217 SETUP_CHROMA_HORIZ_FUNC_DEF_444(32, 32, cpu); \
1218 SETUP_CHROMA_HORIZ_FUNC_DEF_444(32, 16, cpu); \
1219 SETUP_CHROMA_HORIZ_FUNC_DEF_444(16, 32, cpu); \
1220 SETUP_CHROMA_HORIZ_FUNC_DEF_444(32, 24, cpu); \
1221 SETUP_CHROMA_HORIZ_FUNC_DEF_444(24, 32, cpu); \
1222 SETUP_CHROMA_HORIZ_FUNC_DEF_444(32, 8, cpu); \
1223 SETUP_CHROMA_HORIZ_FUNC_DEF_444(8, 32, cpu); \
1224 SETUP_CHROMA_HORIZ_FUNC_DEF_444(64, 64, cpu); \
1225 SETUP_CHROMA_HORIZ_FUNC_DEF_444(64, 32, cpu); \
1226 SETUP_CHROMA_HORIZ_FUNC_DEF_444(32, 64, cpu); \
1227 SETUP_CHROMA_HORIZ_FUNC_DEF_444(64, 48, cpu); \
1228 SETUP_CHROMA_HORIZ_FUNC_DEF_444(48, 64, cpu); \
1229 SETUP_CHROMA_HORIZ_FUNC_DEF_444(64, 16, cpu); \
1230 SETUP_CHROMA_HORIZ_FUNC_DEF_444(16, 64, cpu);
1231
1232namespace x265 {
1233// private x265 namespace
1234
1235#if HIGH_BIT_DEPTH
1236/* Very similar to CRef in intrapred.cpp, except it uses optimized primitives */
1237template<int log2Size>
1238void intra_allangs(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, int bLuma)
1239{
1240 const int size = 1 << log2Size;
1241 const int sizeIdx = log2Size - 2;
1242 ALIGN_VAR_32(pixel, buffer[32 * 32]);
1243
1244 for (int mode = 2; mode <= 34; mode++)
1245 {
1246 pixel *left = (g_intraFilterFlags[mode] & size ? left1 : left0);
1247 pixel *above = (g_intraFilterFlags[mode] & size ? above1 : above0);
1248 pixel *out = dest + ((mode - 2) << (log2Size * 2));
1249
1250 if (mode < 18)
1251 {
1252 primitives.intra_pred[mode][sizeIdx](buffer, size, left, above, mode, bLuma);
1253 primitives.transpose[sizeIdx](out, buffer, size);
1254 }
1255 else
1256 primitives.intra_pred[mode][sizeIdx](out, size, left, above, mode, bLuma);
1257 }
1258}
1259#endif
1260
1261void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask)
1262{
1263#if HIGH_BIT_DEPTH
1264 if (cpuMask & X265_CPU_SSE2)
1265 {
1266 INIT8(sad, _mmx2);
1267 INIT2(sad, _sse2);
1268 SAD(sse2);
1269
1270 INIT6(satd, _sse2);
1271 HEVC_SATD(sse2);
1272 p.satd[LUMA_4x4] = x265_pixel_satd_4x4_mmx2;
1273
1274 p.sa8d_inter[LUMA_4x4] = x265_pixel_satd_4x4_mmx2;
1275 SA8D_INTER_FROM_BLOCK(sse2);
1276 p.sa8d_inter[LUMA_8x8] = x265_pixel_sa8d_8x8_sse2;
1277 p.sa8d_inter[LUMA_16x16] = x265_pixel_sa8d_16x16_sse2;
1278
1279 p.sse_ss[LUMA_4x4] = x265_pixel_ssd_ss_4x4_mmx2;
1280 p.sse_ss[LUMA_4x8] = x265_pixel_ssd_ss_4x8_mmx2;
1281 p.sse_ss[LUMA_4x16] = x265_pixel_ssd_ss_4x16_mmx2;
1282 p.sse_ss[LUMA_8x4] = x265_pixel_ssd_ss_8x4_sse2;
1283 p.sse_ss[LUMA_8x8] = x265_pixel_ssd_ss_8x8_sse2;
1284 p.sse_ss[LUMA_8x16] = x265_pixel_ssd_ss_8x16_sse2;
1285 p.sse_ss[LUMA_8x32] = x265_pixel_ssd_ss_8x32_sse2;
1286 p.sse_ss[LUMA_12x16] = x265_pixel_ssd_ss_12x16_sse2;
1287 p.sse_ss[LUMA_16x4] = x265_pixel_ssd_ss_16x4_sse2;
1288 p.sse_ss[LUMA_16x8] = x265_pixel_ssd_ss_16x8_sse2;
1289 p.sse_ss[LUMA_16x12] = x265_pixel_ssd_ss_16x12_sse2;
1290 p.sse_ss[LUMA_16x16] = x265_pixel_ssd_ss_16x16_sse2;
1291 p.sse_ss[LUMA_16x32] = x265_pixel_ssd_ss_16x32_sse2;
1292 p.sse_ss[LUMA_16x64] = x265_pixel_ssd_ss_16x64_sse2;
1293 p.sse_ss[LUMA_24x32] = x265_pixel_ssd_ss_24x32_sse2;
1294 p.sse_ss[LUMA_32x8] = x265_pixel_ssd_ss_32x8_sse2;
1295 p.sse_ss[LUMA_32x16] = x265_pixel_ssd_ss_32x16_sse2;
1296 p.sse_ss[LUMA_32x24] = x265_pixel_ssd_ss_32x24_sse2;
1297 p.sse_ss[LUMA_32x32] = x265_pixel_ssd_ss_32x32_sse2;
1298 p.sse_ss[LUMA_32x64] = x265_pixel_ssd_ss_32x64_sse2;
1299 p.sse_ss[LUMA_48x64] = x265_pixel_ssd_ss_48x64_sse2;
1300 p.sse_ss[LUMA_64x16] = x265_pixel_ssd_ss_64x16_sse2;
1301 p.sse_ss[LUMA_64x32] = x265_pixel_ssd_ss_64x32_sse2;
1302 p.sse_ss[LUMA_64x48] = x265_pixel_ssd_ss_64x48_sse2;
1303 p.sse_ss[LUMA_64x64] = x265_pixel_ssd_ss_64x64_sse2;
1304
1305 p.transpose[BLOCK_4x4] = x265_transpose4_sse2;
1306 p.transpose[BLOCK_8x8] = x265_transpose8_sse2;
1307 p.transpose[BLOCK_16x16] = x265_transpose16_sse2;
1308 p.transpose[BLOCK_32x32] = x265_transpose32_sse2;
1309 p.transpose[BLOCK_64x64] = x265_transpose64_sse2;
1310
1311 p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_sse2;
1312 p.ssim_end_4 = x265_pixel_ssim_end4_sse2;
1313 PIXEL_AVG(sse2);
1314 PIXEL_AVG_W4(mmx2);
1315 LUMA_VAR(_sse2);
1316
1317 SAD_X3(sse2);
1318 p.sad_x3[LUMA_4x4] = x265_pixel_sad_x3_4x4_mmx2;
1319 p.sad_x3[LUMA_4x8] = x265_pixel_sad_x3_4x8_mmx2;
1320 p.sad_x3[LUMA_4x16] = x265_pixel_sad_x3_4x16_mmx2;
1321 p.sad_x3[LUMA_8x4] = x265_pixel_sad_x3_8x4_sse2;
1322 p.sad_x3[LUMA_8x8] = x265_pixel_sad_x3_8x8_sse2;
1323 p.sad_x3[LUMA_8x16] = x265_pixel_sad_x3_8x16_sse2;
1324 p.sad_x3[LUMA_8x32] = x265_pixel_sad_x3_8x32_sse2;
1325 p.sad_x3[LUMA_16x4] = x265_pixel_sad_x3_16x4_sse2;
1326 p.sad_x3[LUMA_12x16] = x265_pixel_sad_x3_12x16_mmx2;
1327
1328 SAD_X4(sse2);
1329 p.sad_x4[LUMA_4x4] = x265_pixel_sad_x4_4x4_mmx2;
1330 p.sad_x4[LUMA_4x8] = x265_pixel_sad_x4_4x8_mmx2;
1331 p.sad_x4[LUMA_4x16] = x265_pixel_sad_x4_4x16_mmx2;
1332 p.sad_x4[LUMA_8x4] = x265_pixel_sad_x4_8x4_sse2;
1333 p.sad_x4[LUMA_8x8] = x265_pixel_sad_x4_8x8_sse2;
1334 p.sad_x4[LUMA_8x16] = x265_pixel_sad_x4_8x16_sse2;
1335 p.sad_x4[LUMA_8x32] = x265_pixel_sad_x4_8x32_sse2;
1336 p.sad_x4[LUMA_16x4] = x265_pixel_sad_x4_16x4_sse2;
1337 p.sad_x4[LUMA_12x16] = x265_pixel_sad_x4_12x16_mmx2;
1338
1339 p.cvt32to16_shr = x265_cvt32to16_shr_sse2;
1340 p.cvt32to16_shl[BLOCK_4x4] = x265_cvt32to16_shl_4_sse2;
1341 p.cvt32to16_shl[BLOCK_8x8] = x265_cvt32to16_shl_8_sse2;
1342 p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_sse2;
1343 p.cvt32to16_shl[BLOCK_32x32] = x265_cvt32to16_shl_32_sse2;
1344
1345 CHROMA_PIXELSUB_PS(_sse2);
1346 CHROMA_PIXELSUB_PS_422(_sse2);
1347 LUMA_PIXELSUB(_sse2);
1348
1349 CHROMA_BLOCKCOPY(ss, _sse2);
1350 CHROMA_BLOCKCOPY_422(ss, _sse2);
1351 LUMA_BLOCKCOPY(ss, _sse2);
1352
1353 CHROMA_VERT_FILTERS(_sse2);
1354 CHROMA_VERT_FILTERS_422(_sse2);
1355 CHROMA_VERT_FILTERS_444(_sse2);
1356 p.luma_p2s = x265_luma_p2s_sse2;
1357 p.chroma_p2s[X265_CSP_I420] = x265_chroma_p2s_sse2;
1358 p.chroma_p2s[X265_CSP_I422] = x265_chroma_p2s_sse2;
1359 p.chroma_p2s[X265_CSP_I444] = x265_luma_p2s_sse2; // for i444 , chroma_p2s can be replaced by luma_p2s
1360
1361 p.blockfill_s[BLOCK_4x4] = x265_blockfill_s_4x4_sse2;
1362 p.blockfill_s[BLOCK_8x8] = x265_blockfill_s_8x8_sse2;
1363 p.blockfill_s[BLOCK_16x16] = x265_blockfill_s_16x16_sse2;
1364 p.blockfill_s[BLOCK_32x32] = x265_blockfill_s_32x32_sse2;
1365
1366 // TODO: overflow on 12-bits mode!
1367 p.ssd_s[BLOCK_4x4] = x265_pixel_ssd_s_4_sse2;
1368 p.ssd_s[BLOCK_8x8] = x265_pixel_ssd_s_8_sse2;
1369 p.ssd_s[BLOCK_16x16] = x265_pixel_ssd_s_16_sse2;
1370 p.ssd_s[BLOCK_32x32] = x265_pixel_ssd_s_32_sse2;
1371
1372 p.calcresidual[BLOCK_4x4] = x265_getResidual4_sse2;
1373 p.calcresidual[BLOCK_8x8] = x265_getResidual8_sse2;
1374 p.calcresidual[BLOCK_16x16] = x265_getResidual16_sse2;
1375 p.calcresidual[BLOCK_32x32] = x265_getResidual32_sse2;
1376
1377 p.dct[DCT_4x4] = x265_dct4_sse2;
1378 p.idct[IDCT_4x4] = x265_idct4_sse2;
1379 p.idct[IDST_4x4] = x265_idst4_sse2;
1380
1381 LUMA_SS_FILTERS(_sse2);
1382 }
1383 if (cpuMask & X265_CPU_SSSE3)
1384 {
1385 p.scale1D_128to64 = x265_scale1D_128to64_ssse3;
1386 p.scale2D_64to32 = x265_scale2D_64to32_ssse3;
1387
1388 INTRA_ANG_SSSE3(ssse3);
1389
1390 p.dct[DST_4x4] = x265_dst4_ssse3;
1391 p.idct[IDCT_8x8] = x265_idct8_ssse3;
1392 p.count_nonzero = x265_count_nonzero_ssse3;
1393 }
1394 if (cpuMask & X265_CPU_SSE4)
1395 {
1396 LUMA_ADDAVG(_sse4);
1397 CHROMA_ADDAVG(_sse4);
1398 CHROMA_ADDAVG_422(_sse4);
1399 LUMA_FILTERS(_sse4);
1400 CHROMA_HORIZ_FILTERS(_sse4);
1401 CHROMA_VERT_FILTERS_SSE4(_sse4);
1402 CHROMA_HORIZ_FILTERS_422(_sse4);
1403 CHROMA_VERT_FILTERS_SSE4_422(_sse4);
1404 CHROMA_HORIZ_FILTERS_444(_sse4);
1405
1406 p.dct[DCT_8x8] = x265_dct8_sse4;
1407 p.quant = x265_quant_sse4;
1408 p.nquant = x265_nquant_sse4;
1409 p.dequant_normal = x265_dequant_normal_sse4;
1410 p.cvt16to32_shl = x265_cvt16to32_shl_sse4;
1411 p.cvt16to32_shr[BLOCK_4x4] = x265_cvt16to32_shr_4_sse4;
1412 p.cvt16to32_shr[BLOCK_8x8] = x265_cvt16to32_shr_8_sse4;
1413 p.cvt16to32_shr[BLOCK_16x16] = x265_cvt16to32_shr_16_sse4;
1414 p.cvt16to32_shr[BLOCK_32x32] = x265_cvt16to32_shr_32_sse4;
1415 p.intra_pred[0][BLOCK_4x4] = x265_intra_pred_planar4_sse4;
1416 p.intra_pred[0][BLOCK_8x8] = x265_intra_pred_planar8_sse4;
1417 p.intra_pred[0][BLOCK_16x16] = x265_intra_pred_planar16_sse4;
1418 p.intra_pred[0][BLOCK_32x32] = x265_intra_pred_planar32_sse4;
1419
1420 p.intra_pred[1][BLOCK_4x4] = x265_intra_pred_dc4_sse4;
1421 p.intra_pred[1][BLOCK_8x8] = x265_intra_pred_dc8_sse4;
1422 p.intra_pred[1][BLOCK_16x16] = x265_intra_pred_dc16_sse4;
1423 p.intra_pred[1][BLOCK_32x32] = x265_intra_pred_dc32_sse4;
1424 p.planecopy_cp = x265_upShift_8_sse4;
1425
1426 INTRA_ANG_SSE4_COMMON(sse4);
1427 INTRA_ANG_SSE4_HIGH(sse4);
1428 }
1429 if (cpuMask & X265_CPU_XOP)
1430 {
1431 p.frame_init_lowres_core = x265_frame_init_lowres_core_xop;
1432 SA8D_INTER_FROM_BLOCK(xop);
1433 INIT7(satd, _xop);
1434 HEVC_SATD(xop);
1435 }
1436 if (cpuMask & X265_CPU_AVX2)
1437 {
1438 p.dct[DCT_4x4] = x265_dct4_avx2;
1439 p.quant = x265_quant_avx2;
1440 p.nquant = x265_nquant_avx2;
1441 p.dequant_normal = x265_dequant_normal_avx2;
1442 p.scale1D_128to64 = x265_scale1D_128to64_avx2;
1443#if X86_64
1444 p.dct[DCT_8x8] = x265_dct8_avx2;
1445 p.dct[DCT_16x16] = x265_dct16_avx2;
1446 p.dct[DCT_32x32] = x265_dct32_avx2;
1447 p.idct[IDCT_4x4] = x265_idct4_avx2;
1448 p.idct[IDCT_8x8] = x265_idct8_avx2;
1449 p.idct[IDCT_16x16] = x265_idct16_avx2;
1450 p.idct[IDCT_32x32] = x265_idct32_avx2;
1451
1452 p.transpose[BLOCK_8x8] = x265_transpose8_avx2;
1453 p.transpose[BLOCK_16x16] = x265_transpose16_avx2;
1454 p.transpose[BLOCK_32x32] = x265_transpose32_avx2;
1455 p.transpose[BLOCK_64x64] = x265_transpose64_avx2;
1456#endif
1457 }
1458 /* at HIGH_BIT_DEPTH, pixel == short so we can reuse a number of primitives */
1459 for (int i = 0; i < NUM_LUMA_PARTITIONS; i++)
1460 {
1461 p.sse_pp[i] = (pixelcmp_t)p.sse_ss[i];
1462 p.sse_sp[i] = (pixelcmp_sp_t)p.sse_ss[i];
1463 }
1464
1465 for (int i = 0; i < NUM_LUMA_PARTITIONS; i++)
1466 {
1467 p.luma_copy_ps[i] = (copy_ps_t)p.luma_copy_ss[i];
1468 p.luma_copy_sp[i] = (copy_sp_t)p.luma_copy_ss[i];
1469 p.luma_copy_pp[i] = (copy_pp_t)p.luma_copy_ss[i];
1470 }
1471
1472 for (int i = 0; i < NUM_CHROMA_PARTITIONS; i++)
1473 {
1474 p.chroma[X265_CSP_I420].copy_ps[i] = (copy_ps_t)p.chroma[X265_CSP_I420].copy_ss[i];
1475 p.chroma[X265_CSP_I420].copy_sp[i] = (copy_sp_t)p.chroma[X265_CSP_I420].copy_ss[i];
1476 p.chroma[X265_CSP_I420].copy_pp[i] = (copy_pp_t)p.chroma[X265_CSP_I420].copy_ss[i];
1477 }
1478
1479 for (int i = 0; i < NUM_CHROMA_PARTITIONS; i++)
1480 {
1481 p.chroma[X265_CSP_I422].copy_ps[i] = (copy_ps_t)p.chroma[X265_CSP_I422].copy_ss[i];
1482 p.chroma[X265_CSP_I422].copy_sp[i] = (copy_sp_t)p.chroma[X265_CSP_I422].copy_ss[i];
1483 p.chroma[X265_CSP_I422].copy_pp[i] = (copy_pp_t)p.chroma[X265_CSP_I422].copy_ss[i];
1484 }
1485
1486 if (p.intra_pred[0][0] && p.transpose[0])
1487 {
1488 p.intra_pred_allangs[BLOCK_4x4] = intra_allangs<2>;
1489 p.intra_pred_allangs[BLOCK_8x8] = intra_allangs<3>;
1490 p.intra_pred_allangs[BLOCK_16x16] = intra_allangs<4>;
1491 p.intra_pred_allangs[BLOCK_32x32] = intra_allangs<5>;
1492 }
1493
1494#else // if HIGH_BIT_DEPTH
1495 if (cpuMask & X265_CPU_SSE2)
1496 {
1497 INIT8_NAME(sse_pp, ssd, _mmx);
1498 INIT8(sad, _mmx2);
1499 INIT8(sad_x3, _mmx2);
1500 INIT8(sad_x4, _mmx2);
1501 p.satd[LUMA_4x4] = x265_pixel_satd_4x4_mmx2;
1502 p.sa8d_inter[LUMA_4x4] = x265_pixel_satd_4x4_mmx2;
1503 p.frame_init_lowres_core = x265_frame_init_lowres_core_mmx2;
1504
1505 PIXEL_AVG(sse2);
1506 PIXEL_AVG_W4(mmx2);
1507
1508 LUMA_VAR(_sse2);
1509
1510 ASSGN_SSE(sse2);
1511 ASSGN_SSE_SS(sse2);
1512 INIT2(sad, _sse2);
1513 SAD(sse2);
1514 INIT2(sad_x3, _sse2);
1515 INIT2(sad_x4, _sse2);
1516 HEVC_SATD(sse2);
1517
1518 CHROMA_BLOCKCOPY(ss, _sse2);
1519 CHROMA_BLOCKCOPY(pp, _sse2);
1520 CHROMA_BLOCKCOPY_422(ss, _sse2);
1521 CHROMA_BLOCKCOPY_422(pp, _sse2);
1522 LUMA_BLOCKCOPY(ss, _sse2);
1523 LUMA_BLOCKCOPY(pp, _sse2);
1524 LUMA_BLOCKCOPY(sp, _sse2);
1525 CHROMA_BLOCKCOPY_SP(_sse2);
1526 CHROMA_BLOCKCOPY_SP_422(_sse2);
1527
1528 CHROMA_SS_FILTERS_420(_sse2);
1529 CHROMA_SS_FILTERS_422(_sse2);
1530 CHROMA_SS_FILTERS_444(_sse2);
1531 CHROMA_SP_FILTERS_420(_sse2);
1532 CHROMA_SP_FILTERS_422(_sse2);
1533 CHROMA_SP_FILTERS_444(_sse2);
1534 LUMA_SS_FILTERS(_sse2);
1535
1536 // This function pointer initialization is temporary and will be removed
1537 // later with macro definitions. It is used to avoid linker errors until
1538 // all partitions are coded, and to allow committing smaller patches that
1539 // are easier to review.
1540
1541 p.blockfill_s[BLOCK_4x4] = x265_blockfill_s_4x4_sse2;
1542 p.blockfill_s[BLOCK_8x8] = x265_blockfill_s_8x8_sse2;
1543 p.blockfill_s[BLOCK_16x16] = x265_blockfill_s_16x16_sse2;
1544 p.blockfill_s[BLOCK_32x32] = x265_blockfill_s_32x32_sse2;
1545
1546 p.ssd_s[BLOCK_4x4] = x265_pixel_ssd_s_4_sse2;
1547 p.ssd_s[BLOCK_8x8] = x265_pixel_ssd_s_8_sse2;
1548 p.ssd_s[BLOCK_16x16] = x265_pixel_ssd_s_16_sse2;
1549 p.ssd_s[BLOCK_32x32] = x265_pixel_ssd_s_32_sse2;
1550
1551 p.frame_init_lowres_core = x265_frame_init_lowres_core_sse2;
1552 SA8D_INTER_FROM_BLOCK(sse2);
1553
1554 p.cvt32to16_shr = x265_cvt32to16_shr_sse2;
1555 p.cvt32to16_shl[BLOCK_4x4] = x265_cvt32to16_shl_4_sse2;
1556 p.cvt32to16_shl[BLOCK_8x8] = x265_cvt32to16_shl_8_sse2;
1557 p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_sse2;
1558 p.cvt32to16_shl[BLOCK_32x32] = x265_cvt32to16_shl_32_sse2;
1559 p.calcresidual[BLOCK_4x4] = x265_getResidual4_sse2;
1560 p.calcresidual[BLOCK_8x8] = x265_getResidual8_sse2;
1561 p.transpose[BLOCK_4x4] = x265_transpose4_sse2;
1562 p.transpose[BLOCK_8x8] = x265_transpose8_sse2;
1563 p.transpose[BLOCK_16x16] = x265_transpose16_sse2;
1564 p.transpose[BLOCK_32x32] = x265_transpose32_sse2;
1565 p.transpose[BLOCK_64x64] = x265_transpose64_sse2;
1566 p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_sse2;
1567 p.ssim_end_4 = x265_pixel_ssim_end4_sse2;
1568 p.dct[DCT_4x4] = x265_dct4_sse2;
1569 p.idct[IDCT_4x4] = x265_idct4_sse2;
1570 p.idct[IDST_4x4] = x265_idst4_sse2;
1571 p.planecopy_sp = x265_downShift_16_sse2;
1572 p.copy_shl[BLOCK_4x4] = x265_copy_shl_4_sse2;
1573 p.copy_shl[BLOCK_8x8] = x265_copy_shl_8_sse2;
1574 p.copy_shl[BLOCK_16x16] = x265_copy_shl_16_sse2;
1575 p.copy_shl[BLOCK_32x32] = x265_copy_shl_32_sse2;
1576 }
1577 if (cpuMask & X265_CPU_SSSE3)
1578 {
1579 p.frame_init_lowres_core = x265_frame_init_lowres_core_ssse3;
1580 SA8D_INTER_FROM_BLOCK(ssse3);
1581 p.sse_pp[LUMA_4x4] = x265_pixel_ssd_4x4_ssse3;
1582 ASSGN_SSE(ssse3);
1583 PIXEL_AVG(ssse3);
1584 PIXEL_AVG_W4(ssse3);
1585
1586 INTRA_ANG_SSSE3(ssse3);
1587
1588 p.scale1D_128to64 = x265_scale1D_128to64_ssse3;
1589 p.scale2D_64to32 = x265_scale2D_64to32_ssse3;
1590 SAD_X3(ssse3);
1591 SAD_X4(ssse3);
1592 p.sad_x4[LUMA_8x4] = x265_pixel_sad_x4_8x4_ssse3;
1593 p.sad_x4[LUMA_8x8] = x265_pixel_sad_x4_8x8_ssse3;
1594 p.sad_x3[LUMA_8x16] = x265_pixel_sad_x3_8x16_ssse3;
1595 p.sad_x4[LUMA_8x16] = x265_pixel_sad_x4_8x16_ssse3;
1596 p.sad_x3[LUMA_8x32] = x265_pixel_sad_x3_8x32_ssse3;
1597 p.sad_x4[LUMA_8x32] = x265_pixel_sad_x4_8x32_ssse3;
1598
1599 p.sad_x3[LUMA_12x16] = x265_pixel_sad_x3_12x16_ssse3;
1600 p.sad_x4[LUMA_12x16] = x265_pixel_sad_x4_12x16_ssse3;
1601
1602 p.luma_hvpp[LUMA_8x8] = x265_interp_8tap_hv_pp_8x8_ssse3;
1603 p.luma_p2s = x265_luma_p2s_ssse3;
1604 p.chroma_p2s[X265_CSP_I420] = x265_chroma_p2s_ssse3;
1605 p.chroma_p2s[X265_CSP_I422] = x265_chroma_p2s_ssse3;
1606 p.chroma_p2s[X265_CSP_I444] = x265_luma_p2s_ssse3; // for i444 , chroma_p2s can be replaced by luma_p2s
1607
1608 p.dct[DST_4x4] = x265_dst4_ssse3;
1609 p.idct[IDCT_8x8] = x265_idct8_ssse3;
1610 p.count_nonzero = x265_count_nonzero_ssse3;
1611 }
1612 if (cpuMask & X265_CPU_SSE4)
1613 {
1614 p.saoCuOrgE0 = x265_saoCuOrgE0_sse4;
1615
1616 LUMA_ADDAVG(_sse4);
1617 CHROMA_ADDAVG(_sse4);
1618 CHROMA_ADDAVG_422(_sse4);
1619 p.cvt16to32_shl = x265_cvt16to32_shl_sse4;
1620 p.cvt16to32_shr[BLOCK_4x4] = x265_cvt16to32_shr_4_sse4;
1621 p.cvt16to32_shr[BLOCK_8x8] = x265_cvt16to32_shr_8_sse4;
1622 p.cvt16to32_shr[BLOCK_16x16] = x265_cvt16to32_shr_16_sse4;
1623 p.cvt16to32_shr[BLOCK_32x32] = x265_cvt16to32_shr_32_sse4;
1624
1625 // TODO: check POPCNT flag!
1626 p.copy_cnt[BLOCK_4x4] = x265_copy_cnt_4_sse4;
1627 p.copy_cnt[BLOCK_8x8] = x265_copy_cnt_8_sse4;
1628 p.copy_cnt[BLOCK_16x16] = x265_copy_cnt_16_sse4;
1629 p.copy_cnt[BLOCK_32x32] = x265_copy_cnt_32_sse4;
1630
1631 HEVC_SATD(sse4);
1632 SA8D_INTER_FROM_BLOCK(sse4);
1633
1634 p.sse_pp[LUMA_12x16] = x265_pixel_ssd_12x16_sse4;
1635 p.sse_pp[LUMA_24x32] = x265_pixel_ssd_24x32_sse4;
1636 p.sse_pp[LUMA_48x64] = x265_pixel_ssd_48x64_sse4;
1637 p.sse_pp[LUMA_64x16] = x265_pixel_ssd_64x16_sse4;
1638 p.sse_pp[LUMA_64x32] = x265_pixel_ssd_64x32_sse4;
1639 p.sse_pp[LUMA_64x48] = x265_pixel_ssd_64x48_sse4;
1640 p.sse_pp[LUMA_64x64] = x265_pixel_ssd_64x64_sse4;
1641
1642 LUMA_SSE_SP(_sse4);
1643
1644 CHROMA_PIXELSUB_PS(_sse4);
1645 CHROMA_PIXELSUB_PS_422(_sse4);
1646 LUMA_PIXELSUB(_sse4);
1647
1648 CHROMA_FILTERS_420(_sse4);
1649 CHROMA_FILTERS_422(_sse4);
1650 CHROMA_FILTERS_444(_sse4);
1651 CHROMA_SS_FILTERS_SSE4_420(_sse4);
1652 CHROMA_SS_FILTERS_SSE4_422(_sse4);
1653 CHROMA_SP_FILTERS_SSE4_420(_sse4);
1654 CHROMA_SP_FILTERS_SSE4_422(_sse4);
1655 CHROMA_SP_FILTERS_SSE4_444(_sse4);
1656 LUMA_SP_FILTERS(_sse4);
1657 LUMA_FILTERS(_sse4);
1658 ASSGN_SSE_SS(sse4);
1659
1660 p.chroma[X265_CSP_I420].copy_sp[CHROMA_2x4] = x265_blockcopy_sp_2x4_sse4;
1661 p.chroma[X265_CSP_I420].copy_sp[CHROMA_2x8] = x265_blockcopy_sp_2x8_sse4;
1662 p.chroma[X265_CSP_I420].copy_sp[CHROMA_6x8] = x265_blockcopy_sp_6x8_sse4;
1663 CHROMA_BLOCKCOPY(ps, _sse4);
1664 CHROMA_BLOCKCOPY_422(ps, _sse4);
1665 LUMA_BLOCKCOPY(ps, _sse4);
1666
1667 p.calcresidual[BLOCK_16x16] = x265_getResidual16_sse4;
1668 p.calcresidual[BLOCK_32x32] = x265_getResidual32_sse4;
1669 p.quant = x265_quant_sse4;
1670 p.nquant = x265_nquant_sse4;
1671 p.dequant_normal = x265_dequant_normal_sse4;
1672 p.weight_pp = x265_weight_pp_sse4;
1673 p.weight_sp = x265_weight_sp_sse4;
1674 p.intra_pred[0][BLOCK_4x4] = x265_intra_pred_planar4_sse4;
1675 p.intra_pred[0][BLOCK_8x8] = x265_intra_pred_planar8_sse4;
1676 p.intra_pred[0][BLOCK_16x16] = x265_intra_pred_planar16_sse4;
1677 p.intra_pred[0][BLOCK_32x32] = x265_intra_pred_planar32_sse4;
1678
1679 p.intra_pred_allangs[BLOCK_4x4] = x265_all_angs_pred_4x4_sse4;
1680 p.intra_pred_allangs[BLOCK_8x8] = x265_all_angs_pred_8x8_sse4;
1681 p.intra_pred_allangs[BLOCK_16x16] = x265_all_angs_pred_16x16_sse4;
1682 p.intra_pred_allangs[BLOCK_32x32] = x265_all_angs_pred_32x32_sse4;
1683
1684 p.intra_pred[1][BLOCK_4x4] = x265_intra_pred_dc4_sse4;
1685 p.intra_pred[1][BLOCK_8x8] = x265_intra_pred_dc8_sse4;
1686 p.intra_pred[1][BLOCK_16x16] = x265_intra_pred_dc16_sse4;
1687 p.intra_pred[1][BLOCK_32x32] = x265_intra_pred_dc32_sse4;
1688
1689 INTRA_ANG_SSE4_COMMON(sse4);
1690 INTRA_ANG_SSE4(sse4);
1691
1692 p.dct[DCT_8x8] = x265_dct8_sse4;
1693 p.copy_shr = x265_copy_shr_sse4;
1694 p.denoiseDct = x265_denoise_dct_sse4;
1695 }
1696 if (cpuMask & X265_CPU_AVX)
1697 {
1698 p.frame_init_lowres_core = x265_frame_init_lowres_core_avx;
1699 HEVC_SATD(avx);
1700 SA8D_INTER_FROM_BLOCK(avx);
1701 ASSGN_SSE(avx);
1702
1703 ASSGN_SSE_SS(avx);
1704 SAD_X3(avx);
1705 SAD_X4(avx);
1706 p.sad_x3[LUMA_12x16] = x265_pixel_sad_x3_12x16_avx;
1707 p.sad_x4[LUMA_12x16] = x265_pixel_sad_x4_12x16_avx;
1708 p.sad_x3[LUMA_16x4] = x265_pixel_sad_x3_16x4_avx;
1709 p.sad_x4[LUMA_16x4] = x265_pixel_sad_x4_16x4_avx;
1710
1711 p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_avx;
1712 p.ssim_end_4 = x265_pixel_ssim_end4_avx;
1713 p.luma_copy_ss[LUMA_64x16] = x265_blockcopy_ss_64x16_avx;
1714 p.luma_copy_ss[LUMA_64x32] = x265_blockcopy_ss_64x32_avx;
1715 p.luma_copy_ss[LUMA_64x48] = x265_blockcopy_ss_64x48_avx;
1716 p.luma_copy_ss[LUMA_64x64] = x265_blockcopy_ss_64x64_avx;
1717
1718 p.chroma[X265_CSP_I420].copy_pp[CHROMA_32x8] = x265_blockcopy_pp_32x8_avx;
1719 p.luma_copy_pp[LUMA_32x8] = x265_blockcopy_pp_32x8_avx;
1720
1721 p.chroma[X265_CSP_I420].copy_pp[CHROMA_32x16] = x265_blockcopy_pp_32x16_avx;
1722 p.chroma[X265_CSP_I422].copy_pp[CHROMA422_32x16] = x265_blockcopy_pp_32x16_avx;
1723 p.luma_copy_pp[LUMA_32x16] = x265_blockcopy_pp_32x16_avx;
1724
1725 p.chroma[X265_CSP_I420].copy_pp[CHROMA_32x24] = x265_blockcopy_pp_32x24_avx;
1726 p.luma_copy_pp[LUMA_32x24] = x265_blockcopy_pp_32x24_avx;
1727
1728 p.chroma[X265_CSP_I420].copy_pp[CHROMA_32x32] = x265_blockcopy_pp_32x32_avx;
1729 p.chroma[X265_CSP_I422].copy_pp[CHROMA422_32x32] = x265_blockcopy_pp_32x32_avx;
1730 p.luma_copy_pp[LUMA_32x32] = x265_blockcopy_pp_32x32_avx;
1731
1732 p.chroma[X265_CSP_I422].copy_pp[CHROMA422_32x48] = x265_blockcopy_pp_32x48_avx;
1733
1734 p.chroma[X265_CSP_I422].copy_pp[CHROMA422_32x64] = x265_blockcopy_pp_32x64_avx;
1735 p.luma_copy_pp[LUMA_32x64] = x265_blockcopy_pp_32x64_avx;
1736 }
1737 if (cpuMask & X265_CPU_XOP)
1738 {
1739 p.frame_init_lowres_core = x265_frame_init_lowres_core_xop;
1740 SA8D_INTER_FROM_BLOCK(xop);
1741 INIT7(satd, _xop);
1742 INIT5_NAME(sse_pp, ssd, _xop);
1743 HEVC_SATD(xop);
1744 }
1745 if (cpuMask & X265_CPU_AVX2)
1746 {
1747 INIT2(sad_x4, _avx2);
1748 INIT4(satd, _avx2);
1749 INIT2_NAME(sse_pp, ssd, _avx2);
1750 p.sad_x4[LUMA_16x12] = x265_pixel_sad_x4_16x12_avx2;
1751 p.sad_x4[LUMA_16x32] = x265_pixel_sad_x4_16x32_avx2;
1752 p.ssd_s[BLOCK_32x32] = x265_pixel_ssd_s_32_avx2;
1753
1754 /* Need to update assembly code as per changed interface of the copy_cnt primitive, once
1755 * code is updated, avx2 version will be enabled */
1756
1757 p.copy_cnt[BLOCK_8x8] = x265_copy_cnt_8_avx2;
1758 p.copy_cnt[BLOCK_16x16] = x265_copy_cnt_16_avx2;
1759 p.copy_cnt[BLOCK_32x32] = x265_copy_cnt_32_avx2;
1760
1761 p.blockfill_s[BLOCK_16x16] = x265_blockfill_s_16x16_avx2;
1762 p.blockfill_s[BLOCK_32x32] = x265_blockfill_s_32x32_avx2;
1763
1764 p.cvt32to16_shl[BLOCK_4x4] = x265_cvt32to16_shl_4_avx2;
1765 p.cvt32to16_shl[BLOCK_8x8] = x265_cvt32to16_shl_8_avx2;
1766 p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_avx2;
1767 p.cvt32to16_shl[BLOCK_32x32] = x265_cvt32to16_shl_32_avx2;
1768 p.denoiseDct = x265_denoise_dct_avx2;
1769 p.dct[DCT_4x4] = x265_dct4_avx2;
1770 p.quant = x265_quant_avx2;
1771 p.nquant = x265_nquant_avx2;
1772 p.dequant_normal = x265_dequant_normal_avx2;
1773 p.chroma[X265_CSP_I420].copy_ss[CHROMA_16x4] = x265_blockcopy_ss_16x4_avx;
1774 p.chroma[X265_CSP_I420].copy_ss[CHROMA_16x12] = x265_blockcopy_ss_16x12_avx;
1775 p.chroma[X265_CSP_I420].copy_ss[CHROMA_16x8] = x265_blockcopy_ss_16x8_avx;
1776 p.chroma[X265_CSP_I420].copy_ss[CHROMA_16x16] = x265_blockcopy_ss_16x16_avx;
1777 p.chroma[X265_CSP_I420].copy_ss[CHROMA_16x32] = x265_blockcopy_ss_16x32_avx;
1778 p.chroma[X265_CSP_I422].copy_ss[CHROMA422_16x8] = x265_blockcopy_ss_16x8_avx;
1779 p.chroma[X265_CSP_I422].copy_ss[CHROMA422_16x16] = x265_blockcopy_ss_16x16_avx;
1780 p.chroma[X265_CSP_I422].copy_ss[CHROMA422_16x24] = x265_blockcopy_ss_16x24_avx;
1781 p.chroma[X265_CSP_I422].copy_ss[CHROMA422_16x32] = x265_blockcopy_ss_16x32_avx;
1782 p.chroma[X265_CSP_I422].copy_ss[CHROMA422_16x64] = x265_blockcopy_ss_16x64_avx;
1783 p.scale1D_128to64 = x265_scale1D_128to64_avx2;
1784
1785 p.weight_pp = x265_weight_pp_avx2;
1786
1787#if X86_64
1788 p.dct[DCT_8x8] = x265_dct8_avx2;
1789 p.dct[DCT_16x16] = x265_dct16_avx2;
1790 p.dct[DCT_32x32] = x265_dct32_avx2;
1791 p.idct[IDCT_4x4] = x265_idct4_avx2;
1792 p.idct[IDCT_8x8] = x265_idct8_avx2;
1793 p.idct[IDCT_16x16] = x265_idct16_avx2;
1794 p.idct[IDCT_32x32] = x265_idct32_avx2;
1795
1796 p.transpose[BLOCK_8x8] = x265_transpose8_avx2;
1797 p.transpose[BLOCK_16x16] = x265_transpose16_avx2;
1798 p.transpose[BLOCK_32x32] = x265_transpose32_avx2;
1799 p.transpose[BLOCK_64x64] = x265_transpose64_avx2;
1800#endif
1801 p.luma_hpp[LUMA_4x4] = x265_interp_8tap_horiz_pp_4x4_avx2;
1802 }
1803#endif // if HIGH_BIT_DEPTH
1804}
1805}
1806
1807extern "C" {
1808#ifdef __INTEL_COMPILER
1809
1810/* Agner's patch to Intel's CPU dispatcher from pages 131-132 of
1811 * http://agner.org/optimize/optimizing_cpp.pdf (2011-01-30)
1812 * adapted to x265's cpu schema. */
1813
1814// Global variable indicating cpu
1815int __intel_cpu_indicator = 0;
1816// CPU dispatcher function
1817void x265_intel_cpu_indicator_init(void)
1818{
1819 uint32_t cpu = x265::cpu_detect();
1820
1821 if (cpu & X265_CPU_AVX)
1822 __intel_cpu_indicator = 0x20000;
1823 else if (cpu & X265_CPU_SSE42)
1824 __intel_cpu_indicator = 0x8000;
1825 else if (cpu & X265_CPU_SSE4)
1826 __intel_cpu_indicator = 0x2000;
1827 else if (cpu & X265_CPU_SSSE3)
1828 __intel_cpu_indicator = 0x1000;
1829 else if (cpu & X265_CPU_SSE3)
1830 __intel_cpu_indicator = 0x800;
1831 else if (cpu & X265_CPU_SSE2 && !(cpu & X265_CPU_SSE2_IS_SLOW))
1832 __intel_cpu_indicator = 0x200;
1833 else if (cpu & X265_CPU_SSE)
1834 __intel_cpu_indicator = 0x80;
1835 else if (cpu & X265_CPU_MMX2)
1836 __intel_cpu_indicator = 8;
1837 else
1838 __intel_cpu_indicator = 1;
1839}
1840
1841/* __intel_cpu_indicator_init appears to have a non-standard calling convention that
1842 * assumes certain registers aren't preserved, so we'll route it through a function
1843 * that backs up all the registers. */
1844 void __intel_cpu_indicator_init(void)
1845 {
/* No logic of its own: the only statement forwards to
 * x265_safe_intel_cpu_indicator_init(), which is defined outside this
 * part of the file. */
1846 x265_safe_intel_cpu_indicator_init();
1847 }
1848
1849#else // ifdef __INTEL_COMPILER
/* Build without __INTEL_COMPILER defined: the function is still defined, but
 * with an empty body, so calling it does nothing. */
1850 void x265_intel_cpu_indicator_init(void) {}
1851
1852#endif // ifdef __INTEL_COMPILER
1853}