Imported Upstream version 1.4+222+hg5f9f7194267b
[deb_x265.git] / source / common / x86 / asm-primitives.cpp
1 /*****************************************************************************
2 * Copyright (C) 2013 x265 project
3 *
4 * Authors: Steve Borho <steve@borho.org>
5 * Praveen Kumar Tiwari <praveen@multicorewareinc.com>
6 * Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
21 *
22 * This program is also available under a commercial proprietary license.
23 * For more information, contact us at license @ x265.com.
24 *****************************************************************************/
25
26 #include "common.h"
27 #include "primitives.h"
28 #include "x265.h"
29 #include "cpu.h"
30
31 extern "C" {
32 #include "pixel.h"
33 #include "pixel-util.h"
34 #include "mc.h"
35 #include "ipfilter8.h"
36 #include "loopfilter.h"
37 #include "blockcopy8.h"
38 #include "intrapred.h"
39 #include "dct8.h"
40 }
41
// INIT2_NAME(name1, name2, cpu): fill the LUMA_16x16 and LUMA_16x8 slots of
// the table p.name1 with the symbols x265_pixel_<name2>_<size><cpu>.
// `cpu` is pasted directly after the size, so the caller must supply any
// separator (the macro adds no underscore before it). An object named `p`
// must be in scope where the macro is expanded. The expansion already ends
// in a semicolon.
42 #define INIT2_NAME(name1, name2, cpu) \
43 p.name1[LUMA_16x16] = x265_pixel_ ## name2 ## _16x16 ## cpu; \
44 p.name1[LUMA_16x8] = x265_pixel_ ## name2 ## _16x8 ## cpu;
// INIT4_NAME: everything INIT2_NAME assigns, plus the LUMA_8x16 and LUMA_8x8
// slots of p.name1. The nested INIT2_NAME call has no separator after it and
// relies on that macro's own trailing semicolon.
45 #define INIT4_NAME(name1, name2, cpu) \
46 INIT2_NAME(name1, name2, cpu) \
47 p.name1[LUMA_8x16] = x265_pixel_ ## name2 ## _8x16 ## cpu; \
48 p.name1[LUMA_8x8] = x265_pixel_ ## name2 ## _8x8 ## cpu;
// INIT5_NAME: everything INIT4_NAME assigns, plus the LUMA_8x4 slot.
49 #define INIT5_NAME(name1, name2, cpu) \
50 INIT4_NAME(name1, name2, cpu) \
51 p.name1[LUMA_8x4] = x265_pixel_ ## name2 ## _8x4 ## cpu;
// INIT6_NAME: everything INIT5_NAME assigns, plus the LUMA_4x8 slot.
52 #define INIT6_NAME(name1, name2, cpu) \
53 INIT5_NAME(name1, name2, cpu) \
54 p.name1[LUMA_4x8] = x265_pixel_ ## name2 ## _4x8 ## cpu;
// INIT7_NAME: everything INIT6_NAME assigns, plus the LUMA_4x4 slot.
55 #define INIT7_NAME(name1, name2, cpu) \
56 INIT6_NAME(name1, name2, cpu) \
57 p.name1[LUMA_4x4] = x265_pixel_ ## name2 ## _4x4 ## cpu;
// INIT8_NAME: everything INIT7_NAME assigns, plus the LUMA_4x16 slot. In
// total this covers 16x16, 16x8, 8x16, 8x8, 8x4, 4x8, 4x4 and 4x16.
58 #define INIT8_NAME(name1, name2, cpu) \
59 INIT7_NAME(name1, name2, cpu) \
60 p.name1[LUMA_4x16] = x265_pixel_ ## name2 ## _4x16 ## cpu;
// Convenience forms of the INITn_NAME macros for the case where the table
// member in `p` and the middle part of the x265_pixel_ symbol share one name.
61 #define INIT2(name, cpu) INIT2_NAME(name, name, cpu)
62 #define INIT4(name, cpu) INIT4_NAME(name, name, cpu)
63 #define INIT5(name, cpu) INIT5_NAME(name, name, cpu)
64 #define INIT6(name, cpu) INIT6_NAME(name, name, cpu)
65 #define INIT7(name, cpu) INIT7_NAME(name, name, cpu)
66 #define INIT8(name, cpu) INIT8_NAME(name, name, cpu)
67
// HEVC_SATD(cpu): point p.satd[] at x265_pixel_satd_<size>_<cpu> for every
// block size listed below (4x8 up to 64x64; LUMA_4x4 is not assigned here).
// Unlike the INITn macros, the underscore before `cpu` is supplied by the
// macro, so the argument is the bare suffix. The expansion ends in a
// semicolon.
68 #define HEVC_SATD(cpu) \
69 p.satd[LUMA_4x8] = x265_pixel_satd_4x8_ ## cpu; \
70 p.satd[LUMA_4x16] = x265_pixel_satd_4x16_ ## cpu; \
71 p.satd[LUMA_8x4] = x265_pixel_satd_8x4_ ## cpu; \
72 p.satd[LUMA_8x8] = x265_pixel_satd_8x8_ ## cpu; \
73 p.satd[LUMA_8x16] = x265_pixel_satd_8x16_ ## cpu; \
74 p.satd[LUMA_8x32] = x265_pixel_satd_8x32_ ## cpu; \
75 p.satd[LUMA_12x16] = x265_pixel_satd_12x16_ ## cpu; \
76 p.satd[LUMA_16x4] = x265_pixel_satd_16x4_ ## cpu; \
77 p.satd[LUMA_16x8] = x265_pixel_satd_16x8_ ## cpu; \
78 p.satd[LUMA_16x12] = x265_pixel_satd_16x12_ ## cpu; \
79 p.satd[LUMA_16x16] = x265_pixel_satd_16x16_ ## cpu; \
80 p.satd[LUMA_16x32] = x265_pixel_satd_16x32_ ## cpu; \
81 p.satd[LUMA_16x64] = x265_pixel_satd_16x64_ ## cpu; \
82 p.satd[LUMA_24x32] = x265_pixel_satd_24x32_ ## cpu; \
83 p.satd[LUMA_32x8] = x265_pixel_satd_32x8_ ## cpu; \
84 p.satd[LUMA_32x16] = x265_pixel_satd_32x16_ ## cpu; \
85 p.satd[LUMA_32x24] = x265_pixel_satd_32x24_ ## cpu; \
86 p.satd[LUMA_32x32] = x265_pixel_satd_32x32_ ## cpu; \
87 p.satd[LUMA_32x64] = x265_pixel_satd_32x64_ ## cpu; \
88 p.satd[LUMA_48x64] = x265_pixel_satd_48x64_ ## cpu; \
89 p.satd[LUMA_64x16] = x265_pixel_satd_64x16_ ## cpu; \
90 p.satd[LUMA_64x32] = x265_pixel_satd_64x32_ ## cpu; \
91 p.satd[LUMA_64x48] = x265_pixel_satd_64x48_ ## cpu; \
92 p.satd[LUMA_64x64] = x265_pixel_satd_64x64_ ## cpu;
93
// SAD_X3(cpu): point p.sad_x3[] at x265_pixel_sad_x3_<size>_<cpu> for the
// listed sizes (all of width 16 or more). The underscore before `cpu` is
// supplied by the macro. NOTE: the last assignment has no trailing
// semicolon, so the invocation must be followed by one.
94 #define SAD_X3(cpu) \
95 p.sad_x3[LUMA_16x8] = x265_pixel_sad_x3_16x8_ ## cpu; \
96 p.sad_x3[LUMA_16x12] = x265_pixel_sad_x3_16x12_ ## cpu; \
97 p.sad_x3[LUMA_16x16] = x265_pixel_sad_x3_16x16_ ## cpu; \
98 p.sad_x3[LUMA_16x32] = x265_pixel_sad_x3_16x32_ ## cpu; \
99 p.sad_x3[LUMA_16x64] = x265_pixel_sad_x3_16x64_ ## cpu; \
100 p.sad_x3[LUMA_32x8] = x265_pixel_sad_x3_32x8_ ## cpu; \
101 p.sad_x3[LUMA_32x16] = x265_pixel_sad_x3_32x16_ ## cpu; \
102 p.sad_x3[LUMA_32x24] = x265_pixel_sad_x3_32x24_ ## cpu; \
103 p.sad_x3[LUMA_32x32] = x265_pixel_sad_x3_32x32_ ## cpu; \
104 p.sad_x3[LUMA_32x64] = x265_pixel_sad_x3_32x64_ ## cpu; \
105 p.sad_x3[LUMA_24x32] = x265_pixel_sad_x3_24x32_ ## cpu; \
106 p.sad_x3[LUMA_48x64] = x265_pixel_sad_x3_48x64_ ## cpu; \
107 p.sad_x3[LUMA_64x16] = x265_pixel_sad_x3_64x16_ ## cpu; \
108 p.sad_x3[LUMA_64x32] = x265_pixel_sad_x3_64x32_ ## cpu; \
109 p.sad_x3[LUMA_64x48] = x265_pixel_sad_x3_64x48_ ## cpu; \
110 p.sad_x3[LUMA_64x64] = x265_pixel_sad_x3_64x64_ ## cpu
111
// SAD_X4(cpu): point p.sad_x4[] at x265_pixel_sad_x4_<size>_<cpu> for the
// same size list as SAD_X3. The underscore before `cpu` is supplied by the
// macro. NOTE: the last assignment has no trailing semicolon, so the
// invocation must be followed by one.
112 #define SAD_X4(cpu) \
113 p.sad_x4[LUMA_16x8] = x265_pixel_sad_x4_16x8_ ## cpu; \
114 p.sad_x4[LUMA_16x12] = x265_pixel_sad_x4_16x12_ ## cpu; \
115 p.sad_x4[LUMA_16x16] = x265_pixel_sad_x4_16x16_ ## cpu; \
116 p.sad_x4[LUMA_16x32] = x265_pixel_sad_x4_16x32_ ## cpu; \
117 p.sad_x4[LUMA_16x64] = x265_pixel_sad_x4_16x64_ ## cpu; \
118 p.sad_x4[LUMA_32x8] = x265_pixel_sad_x4_32x8_ ## cpu; \
119 p.sad_x4[LUMA_32x16] = x265_pixel_sad_x4_32x16_ ## cpu; \
120 p.sad_x4[LUMA_32x24] = x265_pixel_sad_x4_32x24_ ## cpu; \
121 p.sad_x4[LUMA_32x32] = x265_pixel_sad_x4_32x32_ ## cpu; \
122 p.sad_x4[LUMA_32x64] = x265_pixel_sad_x4_32x64_ ## cpu; \
123 p.sad_x4[LUMA_24x32] = x265_pixel_sad_x4_24x32_ ## cpu; \
124 p.sad_x4[LUMA_48x64] = x265_pixel_sad_x4_48x64_ ## cpu; \
125 p.sad_x4[LUMA_64x16] = x265_pixel_sad_x4_64x16_ ## cpu; \
126 p.sad_x4[LUMA_64x32] = x265_pixel_sad_x4_64x32_ ## cpu; \
127 p.sad_x4[LUMA_64x48] = x265_pixel_sad_x4_64x48_ ## cpu; \
128 p.sad_x4[LUMA_64x64] = x265_pixel_sad_x4_64x64_ ## cpu
129
// SAD(cpu): point p.sad[] at x265_pixel_sad_<size>_<cpu> for the sizes
// listed below. The eight sizes handled by INIT8 (16x16, 16x8, 8x16, 8x8,
// 8x4, 4x8, 4x4, 4x16) are not in this list. The underscore before `cpu` is
// supplied by the macro. NOTE: the last assignment has no trailing
// semicolon, so the invocation must be followed by one.
130 #define SAD(cpu) \
131 p.sad[LUMA_8x32] = x265_pixel_sad_8x32_ ## cpu; \
132 p.sad[LUMA_16x4] = x265_pixel_sad_16x4_ ## cpu; \
133 p.sad[LUMA_16x12] = x265_pixel_sad_16x12_ ## cpu; \
134 p.sad[LUMA_16x32] = x265_pixel_sad_16x32_ ## cpu; \
135 p.sad[LUMA_16x64] = x265_pixel_sad_16x64_ ## cpu; \
136 p.sad[LUMA_32x8] = x265_pixel_sad_32x8_ ## cpu; \
137 p.sad[LUMA_32x16] = x265_pixel_sad_32x16_ ## cpu; \
138 p.sad[LUMA_32x24] = x265_pixel_sad_32x24_ ## cpu; \
139 p.sad[LUMA_32x32] = x265_pixel_sad_32x32_ ## cpu; \
140 p.sad[LUMA_32x64] = x265_pixel_sad_32x64_ ## cpu; \
141 p.sad[LUMA_64x16] = x265_pixel_sad_64x16_ ## cpu; \
142 p.sad[LUMA_64x32] = x265_pixel_sad_64x32_ ## cpu; \
143 p.sad[LUMA_64x48] = x265_pixel_sad_64x48_ ## cpu; \
144 p.sad[LUMA_64x64] = x265_pixel_sad_64x64_ ## cpu; \
145 p.sad[LUMA_48x64] = x265_pixel_sad_48x64_ ## cpu; \
146 p.sad[LUMA_24x32] = x265_pixel_sad_24x32_ ## cpu; \
147 p.sad[LUMA_12x16] = x265_pixel_sad_12x16_ ## cpu
148
// ASSGN_SSE(cpu): point p.sse_pp[] at the x265_pixel_ssd_<size>_<cpu>
// symbols (note the table is named "sse" while the symbols are named "ssd")
// for the sizes listed below; widths 8, 16 and 32 only. The underscore
// before `cpu` is supplied by the macro. NOTE: the last assignment has no
// trailing semicolon, so the invocation must be followed by one.
149 #define ASSGN_SSE(cpu) \
150 p.sse_pp[LUMA_8x8] = x265_pixel_ssd_8x8_ ## cpu; \
151 p.sse_pp[LUMA_8x4] = x265_pixel_ssd_8x4_ ## cpu; \
152 p.sse_pp[LUMA_16x16] = x265_pixel_ssd_16x16_ ## cpu; \
153 p.sse_pp[LUMA_16x4] = x265_pixel_ssd_16x4_ ## cpu; \
154 p.sse_pp[LUMA_16x8] = x265_pixel_ssd_16x8_ ## cpu; \
155 p.sse_pp[LUMA_8x16] = x265_pixel_ssd_8x16_ ## cpu; \
156 p.sse_pp[LUMA_16x12] = x265_pixel_ssd_16x12_ ## cpu; \
157 p.sse_pp[LUMA_32x32] = x265_pixel_ssd_32x32_ ## cpu; \
158 p.sse_pp[LUMA_32x16] = x265_pixel_ssd_32x16_ ## cpu; \
159 p.sse_pp[LUMA_16x32] = x265_pixel_ssd_16x32_ ## cpu; \
160 p.sse_pp[LUMA_8x32] = x265_pixel_ssd_8x32_ ## cpu; \
161 p.sse_pp[LUMA_32x8] = x265_pixel_ssd_32x8_ ## cpu; \
162 p.sse_pp[LUMA_32x24] = x265_pixel_ssd_32x24_ ## cpu; \
163 p.sse_pp[LUMA_32x64] = x265_pixel_ssd_32x64_ ## cpu; \
164 p.sse_pp[LUMA_16x64] = x265_pixel_ssd_16x64_ ## cpu
165
// ASSGN_SSE_SS(cpu): point p.sse_ss[] at x265_pixel_ssd_ss_<size>_<cpu> for
// every size from 4x4 up to 64x64 listed below. The underscore before `cpu`
// is supplied by the macro. Unlike ASSGN_SSE, this expansion ends in a
// semicolon.
166 #define ASSGN_SSE_SS(cpu) \
167 p.sse_ss[LUMA_4x4] = x265_pixel_ssd_ss_4x4_ ## cpu; \
168 p.sse_ss[LUMA_4x8] = x265_pixel_ssd_ss_4x8_ ## cpu; \
169 p.sse_ss[LUMA_4x16] = x265_pixel_ssd_ss_4x16_ ## cpu; \
170 p.sse_ss[LUMA_8x4] = x265_pixel_ssd_ss_8x4_ ## cpu; \
171 p.sse_ss[LUMA_8x8] = x265_pixel_ssd_ss_8x8_ ## cpu; \
172 p.sse_ss[LUMA_8x16] = x265_pixel_ssd_ss_8x16_ ## cpu; \
173 p.sse_ss[LUMA_8x32] = x265_pixel_ssd_ss_8x32_ ## cpu; \
174 p.sse_ss[LUMA_12x16] = x265_pixel_ssd_ss_12x16_ ## cpu; \
175 p.sse_ss[LUMA_16x4] = x265_pixel_ssd_ss_16x4_ ## cpu; \
176 p.sse_ss[LUMA_16x8] = x265_pixel_ssd_ss_16x8_ ## cpu; \
177 p.sse_ss[LUMA_16x12] = x265_pixel_ssd_ss_16x12_ ## cpu; \
178 p.sse_ss[LUMA_16x16] = x265_pixel_ssd_ss_16x16_ ## cpu; \
179 p.sse_ss[LUMA_16x32] = x265_pixel_ssd_ss_16x32_ ## cpu; \
180 p.sse_ss[LUMA_16x64] = x265_pixel_ssd_ss_16x64_ ## cpu; \
181 p.sse_ss[LUMA_24x32] = x265_pixel_ssd_ss_24x32_ ## cpu; \
182 p.sse_ss[LUMA_32x8] = x265_pixel_ssd_ss_32x8_ ## cpu; \
183 p.sse_ss[LUMA_32x16] = x265_pixel_ssd_ss_32x16_ ## cpu; \
184 p.sse_ss[LUMA_32x24] = x265_pixel_ssd_ss_32x24_ ## cpu; \
185 p.sse_ss[LUMA_32x32] = x265_pixel_ssd_ss_32x32_ ## cpu; \
186 p.sse_ss[LUMA_32x64] = x265_pixel_ssd_ss_32x64_ ## cpu; \
187 p.sse_ss[LUMA_48x64] = x265_pixel_ssd_ss_48x64_ ## cpu; \
188 p.sse_ss[LUMA_64x16] = x265_pixel_ssd_ss_64x16_ ## cpu; \
189 p.sse_ss[LUMA_64x32] = x265_pixel_ssd_ss_64x32_ ## cpu; \
190 p.sse_ss[LUMA_64x48] = x265_pixel_ssd_ss_64x48_ ## cpu; \
191 p.sse_ss[LUMA_64x64] = x265_pixel_ssd_ss_64x64_ ## cpu;
192
// SA8D_INTER_FROM_BLOCK(cpu): fill p.sa8d_inter[] for the sizes listed
// below. Most entries use x265_pixel_sa8d_<size>_<cpu>, but 4x8, 8x4, 4x16,
// 16x4, 12x16 and 16x12 are pointed at the x265_pixel_satd_ symbol of the
// same size instead. The underscore before `cpu` is supplied by the macro.
// The expansion ends in a semicolon.
193 #define SA8D_INTER_FROM_BLOCK(cpu) \
194 p.sa8d_inter[LUMA_4x8] = x265_pixel_satd_4x8_ ## cpu; \
195 p.sa8d_inter[LUMA_8x4] = x265_pixel_satd_8x4_ ## cpu; \
196 p.sa8d_inter[LUMA_4x16] = x265_pixel_satd_4x16_ ## cpu; \
197 p.sa8d_inter[LUMA_16x4] = x265_pixel_satd_16x4_ ## cpu; \
198 p.sa8d_inter[LUMA_12x16] = x265_pixel_satd_12x16_ ## cpu; \
199 p.sa8d_inter[LUMA_8x8] = x265_pixel_sa8d_8x8_ ## cpu; \
200 p.sa8d_inter[LUMA_16x16] = x265_pixel_sa8d_16x16_ ## cpu; \
201 p.sa8d_inter[LUMA_16x12] = x265_pixel_satd_16x12_ ## cpu; \
202 p.sa8d_inter[LUMA_16x8] = x265_pixel_sa8d_16x8_ ## cpu; \
203 p.sa8d_inter[LUMA_8x16] = x265_pixel_sa8d_8x16_ ## cpu; \
204 p.sa8d_inter[LUMA_32x24] = x265_pixel_sa8d_32x24_ ## cpu; \
205 p.sa8d_inter[LUMA_24x32] = x265_pixel_sa8d_24x32_ ## cpu; \
206 p.sa8d_inter[LUMA_32x8] = x265_pixel_sa8d_32x8_ ## cpu; \
207 p.sa8d_inter[LUMA_8x32] = x265_pixel_sa8d_8x32_ ## cpu; \
208 p.sa8d_inter[LUMA_32x32] = x265_pixel_sa8d_32x32_ ## cpu; \
209 p.sa8d_inter[LUMA_32x16] = x265_pixel_sa8d_32x16_ ## cpu; \
210 p.sa8d_inter[LUMA_16x32] = x265_pixel_sa8d_16x32_ ## cpu; \
211 p.sa8d_inter[LUMA_64x64] = x265_pixel_sa8d_64x64_ ## cpu; \
212 p.sa8d_inter[LUMA_64x32] = x265_pixel_sa8d_64x32_ ## cpu; \
213 p.sa8d_inter[LUMA_32x64] = x265_pixel_sa8d_32x64_ ## cpu; \
214 p.sa8d_inter[LUMA_64x48] = x265_pixel_sa8d_64x48_ ## cpu; \
215 p.sa8d_inter[LUMA_48x64] = x265_pixel_sa8d_48x64_ ## cpu; \
216 p.sa8d_inter[LUMA_64x16] = x265_pixel_sa8d_64x16_ ## cpu; \
217 p.sa8d_inter[LUMA_16x64] = x265_pixel_sa8d_16x64_ ## cpu;
218
// PIXEL_AVG(cpu): point p.pixelavg_pp[] at x265_pixel_avg_<size>_<cpu> for
// every listed size of width 8 or more (width-4 sizes are handled by
// PIXEL_AVG_W4). The underscore before `cpu` is supplied by the macro. The
// expansion ends in a semicolon.
219 #define PIXEL_AVG(cpu) \
220 p.pixelavg_pp[LUMA_64x64] = x265_pixel_avg_64x64_ ## cpu; \
221 p.pixelavg_pp[LUMA_64x48] = x265_pixel_avg_64x48_ ## cpu; \
222 p.pixelavg_pp[LUMA_64x32] = x265_pixel_avg_64x32_ ## cpu; \
223 p.pixelavg_pp[LUMA_64x16] = x265_pixel_avg_64x16_ ## cpu; \
224 p.pixelavg_pp[LUMA_48x64] = x265_pixel_avg_48x64_ ## cpu; \
225 p.pixelavg_pp[LUMA_32x64] = x265_pixel_avg_32x64_ ## cpu; \
226 p.pixelavg_pp[LUMA_32x32] = x265_pixel_avg_32x32_ ## cpu; \
227 p.pixelavg_pp[LUMA_32x24] = x265_pixel_avg_32x24_ ## cpu; \
228 p.pixelavg_pp[LUMA_32x16] = x265_pixel_avg_32x16_ ## cpu; \
229 p.pixelavg_pp[LUMA_32x8] = x265_pixel_avg_32x8_ ## cpu; \
230 p.pixelavg_pp[LUMA_24x32] = x265_pixel_avg_24x32_ ## cpu; \
231 p.pixelavg_pp[LUMA_16x64] = x265_pixel_avg_16x64_ ## cpu; \
232 p.pixelavg_pp[LUMA_16x32] = x265_pixel_avg_16x32_ ## cpu; \
233 p.pixelavg_pp[LUMA_16x16] = x265_pixel_avg_16x16_ ## cpu; \
234 p.pixelavg_pp[LUMA_16x12] = x265_pixel_avg_16x12_ ## cpu; \
235 p.pixelavg_pp[LUMA_16x8] = x265_pixel_avg_16x8_ ## cpu; \
236 p.pixelavg_pp[LUMA_16x4] = x265_pixel_avg_16x4_ ## cpu; \
237 p.pixelavg_pp[LUMA_12x16] = x265_pixel_avg_12x16_ ## cpu; \
238 p.pixelavg_pp[LUMA_8x32] = x265_pixel_avg_8x32_ ## cpu; \
239 p.pixelavg_pp[LUMA_8x16] = x265_pixel_avg_8x16_ ## cpu; \
240 p.pixelavg_pp[LUMA_8x8] = x265_pixel_avg_8x8_ ## cpu; \
241 p.pixelavg_pp[LUMA_8x4] = x265_pixel_avg_8x4_ ## cpu;
242
// PIXEL_AVG_W4(cpu): the width-4 counterpart of PIXEL_AVG; assigns the 4x4,
// 4x8 and 4x16 slots of p.pixelavg_pp[]. The underscore before `cpu` is
// supplied by the macro. The expansion ends in a semicolon.
243 #define PIXEL_AVG_W4(cpu) \
244 p.pixelavg_pp[LUMA_4x4] = x265_pixel_avg_4x4_ ## cpu; \
245 p.pixelavg_pp[LUMA_4x8] = x265_pixel_avg_4x8_ ## cpu; \
246 p.pixelavg_pp[LUMA_4x16] = x265_pixel_avg_4x16_ ## cpu;
247
// SETUP_CHROMA_FUNC_DEF_420(W, H, cpu): for one block size, set the
// filter_hpp, filter_hps, filter_vpp and filter_vps entries of
// p.chroma[X265_CSP_I420] at index CHROMA_<W>x<H> to the matching
// x265_interp_4tap_{horiz,vert}_{pp,ps}_<W>x<H><cpu> symbol. `cpu` is pasted
// directly after the size, so it must carry its own separator. The
// expansion ends in a semicolon.
248 #define SETUP_CHROMA_FUNC_DEF_420(W, H, cpu) \
249 p.chroma[X265_CSP_I420].filter_hpp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \
250 p.chroma[X265_CSP_I420].filter_hps[CHROMA_ ## W ## x ## H] = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu; \
251 p.chroma[X265_CSP_I420].filter_vpp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \
252 p.chroma[X265_CSP_I420].filter_vps[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu;
253
// SETUP_CHROMA_FUNC_DEF_422(W, H, cpu): same four assignments as the 420
// variant, but into p.chroma[X265_CSP_I422] and indexed by CHROMA422_<W>x<H>.
// The x265_interp_4tap_* symbols referenced are the same family. `cpu` is
// pasted directly after the size.
254 #define SETUP_CHROMA_FUNC_DEF_422(W, H, cpu) \
255 p.chroma[X265_CSP_I422].filter_hpp[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \
256 p.chroma[X265_CSP_I422].filter_hps[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu; \
257 p.chroma[X265_CSP_I422].filter_vpp[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \
258 p.chroma[X265_CSP_I422].filter_vps[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu;
259
// SETUP_CHROMA_FUNC_DEF_444(W, H, cpu): same four assignments as the 420
// variant, but into p.chroma[X265_CSP_I444]. The 4:4:4 tables are indexed by
// the LUMA_<W>x<H> enumerators rather than a chroma-specific set. `cpu` is
// pasted directly after the size.
260 #define SETUP_CHROMA_FUNC_DEF_444(W, H, cpu) \
261 p.chroma[X265_CSP_I444].filter_hpp[LUMA_ ## W ## x ## H] = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \
262 p.chroma[X265_CSP_I444].filter_hps[LUMA_ ## W ## x ## H] = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu; \
263 p.chroma[X265_CSP_I444].filter_vpp[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \
264 p.chroma[X265_CSP_I444].filter_vps[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu;
265
// SETUP_CHROMA_SP_FUNC_DEF_420(W, H, cpu): set the filter_vsp entry of
// p.chroma[X265_CSP_I420] at CHROMA_<W>x<H> to
// x265_interp_4tap_vert_sp_<W>x<H><cpu>. `cpu` is pasted directly after the
// size.
266 #define SETUP_CHROMA_SP_FUNC_DEF_420(W, H, cpu) \
267 p.chroma[X265_CSP_I420].filter_vsp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
268
// SETUP_CHROMA_SP_FUNC_DEF_422(W, H, cpu): set the filter_vsp entry of
// p.chroma[X265_CSP_I422] at CHROMA422_<W>x<H> to
// x265_interp_4tap_vert_sp_<W>x<H><cpu>.
269 #define SETUP_CHROMA_SP_FUNC_DEF_422(W, H, cpu) \
270 p.chroma[X265_CSP_I422].filter_vsp[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
271
// SETUP_CHROMA_SP_FUNC_DEF_444(W, H, cpu): set the filter_vsp entry of
// p.chroma[X265_CSP_I444] at LUMA_<W>x<H> to
// x265_interp_4tap_vert_sp_<W>x<H><cpu>.
272 #define SETUP_CHROMA_SP_FUNC_DEF_444(W, H, cpu) \
273 p.chroma[X265_CSP_I444].filter_vsp[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
274
// SETUP_CHROMA_SS_FUNC_DEF_420(W, H, cpu): set the filter_vss entry of
// p.chroma[X265_CSP_I420] at CHROMA_<W>x<H> to
// x265_interp_4tap_vert_ss_<W>x<H><cpu>. `cpu` is pasted directly after the
// size.
275 #define SETUP_CHROMA_SS_FUNC_DEF_420(W, H, cpu) \
276 p.chroma[X265_CSP_I420].filter_vss[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu;
277
// SETUP_CHROMA_SS_FUNC_DEF_422(W, H, cpu): set the filter_vss entry of
// p.chroma[X265_CSP_I422] at CHROMA422_<W>x<H> to
// x265_interp_4tap_vert_ss_<W>x<H><cpu>.
278 #define SETUP_CHROMA_SS_FUNC_DEF_422(W, H, cpu) \
279 p.chroma[X265_CSP_I422].filter_vss[CHROMA422_ ## W ## x ## H] = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu;
280
// SETUP_CHROMA_SS_FUNC_DEF_444(W, H, cpu): set the filter_vss entry of
// p.chroma[X265_CSP_I444] at LUMA_<W>x<H> to
// x265_interp_4tap_vert_ss_<W>x<H><cpu>.
281 #define SETUP_CHROMA_SS_FUNC_DEF_444(W, H, cpu) \
282 p.chroma[X265_CSP_I444].filter_vss[LUMA_ ## W ## x ## H] = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu;
283
// CHROMA_FILTERS_420(cpu): apply SETUP_CHROMA_FUNC_DEF_420 to every 4:2:0
// chroma block size listed below (2x4 up to 32x32). `cpu` is forwarded
// unchanged and ends up pasted directly onto the symbol name. Each inner
// macro already ends in ';', so the extra ';' here yields harmless empty
// statements.
284 #define CHROMA_FILTERS_420(cpu) \
285 SETUP_CHROMA_FUNC_DEF_420(4, 4, cpu); \
286 SETUP_CHROMA_FUNC_DEF_420(4, 2, cpu); \
287 SETUP_CHROMA_FUNC_DEF_420(2, 4, cpu); \
288 SETUP_CHROMA_FUNC_DEF_420(8, 8, cpu); \
289 SETUP_CHROMA_FUNC_DEF_420(8, 4, cpu); \
290 SETUP_CHROMA_FUNC_DEF_420(4, 8, cpu); \
291 SETUP_CHROMA_FUNC_DEF_420(8, 6, cpu); \
292 SETUP_CHROMA_FUNC_DEF_420(6, 8, cpu); \
293 SETUP_CHROMA_FUNC_DEF_420(8, 2, cpu); \
294 SETUP_CHROMA_FUNC_DEF_420(2, 8, cpu); \
295 SETUP_CHROMA_FUNC_DEF_420(16, 16, cpu); \
296 SETUP_CHROMA_FUNC_DEF_420(16, 8, cpu); \
297 SETUP_CHROMA_FUNC_DEF_420(8, 16, cpu); \
298 SETUP_CHROMA_FUNC_DEF_420(16, 12, cpu); \
299 SETUP_CHROMA_FUNC_DEF_420(12, 16, cpu); \
300 SETUP_CHROMA_FUNC_DEF_420(16, 4, cpu); \
301 SETUP_CHROMA_FUNC_DEF_420(4, 16, cpu); \
302 SETUP_CHROMA_FUNC_DEF_420(32, 32, cpu); \
303 SETUP_CHROMA_FUNC_DEF_420(32, 16, cpu); \
304 SETUP_CHROMA_FUNC_DEF_420(16, 32, cpu); \
305 SETUP_CHROMA_FUNC_DEF_420(32, 24, cpu); \
306 SETUP_CHROMA_FUNC_DEF_420(24, 32, cpu); \
307 SETUP_CHROMA_FUNC_DEF_420(32, 8, cpu); \
308 SETUP_CHROMA_FUNC_DEF_420(8, 32, cpu);
309
// CHROMA_FILTERS_422(cpu): apply SETUP_CHROMA_FUNC_DEF_422 to every 4:2:2
// chroma block size listed below (2x8 up to 32x64). `cpu` is forwarded
// unchanged. Each inner macro already ends in ';', so the extra ';' here
// yields harmless empty statements.
310 #define CHROMA_FILTERS_422(cpu) \
311 SETUP_CHROMA_FUNC_DEF_422(4, 8, cpu); \
312 SETUP_CHROMA_FUNC_DEF_422(4, 4, cpu); \
313 SETUP_CHROMA_FUNC_DEF_422(2, 8, cpu); \
314 SETUP_CHROMA_FUNC_DEF_422(8, 16, cpu); \
315 SETUP_CHROMA_FUNC_DEF_422(8, 8, cpu); \
316 SETUP_CHROMA_FUNC_DEF_422(4, 16, cpu); \
317 SETUP_CHROMA_FUNC_DEF_422(8, 12, cpu); \
318 SETUP_CHROMA_FUNC_DEF_422(6, 16, cpu); \
319 SETUP_CHROMA_FUNC_DEF_422(8, 4, cpu); \
320 SETUP_CHROMA_FUNC_DEF_422(2, 16, cpu); \
321 SETUP_CHROMA_FUNC_DEF_422(16, 32, cpu); \
322 SETUP_CHROMA_FUNC_DEF_422(16, 16, cpu); \
323 SETUP_CHROMA_FUNC_DEF_422(8, 32, cpu); \
324 SETUP_CHROMA_FUNC_DEF_422(16, 24, cpu); \
325 SETUP_CHROMA_FUNC_DEF_422(12, 32, cpu); \
326 SETUP_CHROMA_FUNC_DEF_422(16, 8, cpu); \
327 SETUP_CHROMA_FUNC_DEF_422(4, 32, cpu); \
328 SETUP_CHROMA_FUNC_DEF_422(32, 64, cpu); \
329 SETUP_CHROMA_FUNC_DEF_422(32, 32, cpu); \
330 SETUP_CHROMA_FUNC_DEF_422(16, 64, cpu); \
331 SETUP_CHROMA_FUNC_DEF_422(32, 48, cpu); \
332 SETUP_CHROMA_FUNC_DEF_422(24, 64, cpu); \
333 SETUP_CHROMA_FUNC_DEF_422(32, 16, cpu); \
334 SETUP_CHROMA_FUNC_DEF_422(8, 64, cpu);
335
// CHROMA_FILTERS_444(cpu): apply SETUP_CHROMA_FUNC_DEF_444 to every size
// listed below (4x8 up to 64x64; 4x4 is not in this list). `cpu` is
// forwarded unchanged. Each inner macro already ends in ';', so the extra
// ';' here yields harmless empty statements.
336 #define CHROMA_FILTERS_444(cpu) \
337 SETUP_CHROMA_FUNC_DEF_444(8, 8, cpu); \
338 SETUP_CHROMA_FUNC_DEF_444(8, 4, cpu); \
339 SETUP_CHROMA_FUNC_DEF_444(4, 8, cpu); \
340 SETUP_CHROMA_FUNC_DEF_444(16, 16, cpu); \
341 SETUP_CHROMA_FUNC_DEF_444(16, 8, cpu); \
342 SETUP_CHROMA_FUNC_DEF_444(8, 16, cpu); \
343 SETUP_CHROMA_FUNC_DEF_444(16, 12, cpu); \
344 SETUP_CHROMA_FUNC_DEF_444(12, 16, cpu); \
345 SETUP_CHROMA_FUNC_DEF_444(16, 4, cpu); \
346 SETUP_CHROMA_FUNC_DEF_444(4, 16, cpu); \
347 SETUP_CHROMA_FUNC_DEF_444(32, 32, cpu); \
348 SETUP_CHROMA_FUNC_DEF_444(32, 16, cpu); \
349 SETUP_CHROMA_FUNC_DEF_444(16, 32, cpu); \
350 SETUP_CHROMA_FUNC_DEF_444(32, 24, cpu); \
351 SETUP_CHROMA_FUNC_DEF_444(24, 32, cpu); \
352 SETUP_CHROMA_FUNC_DEF_444(32, 8, cpu); \
353 SETUP_CHROMA_FUNC_DEF_444(8, 32, cpu); \
354 SETUP_CHROMA_FUNC_DEF_444(64, 64, cpu); \
355 SETUP_CHROMA_FUNC_DEF_444(64, 32, cpu); \
356 SETUP_CHROMA_FUNC_DEF_444(32, 64, cpu); \
357 SETUP_CHROMA_FUNC_DEF_444(64, 48, cpu); \
358 SETUP_CHROMA_FUNC_DEF_444(48, 64, cpu); \
359 SETUP_CHROMA_FUNC_DEF_444(64, 16, cpu); \
360 SETUP_CHROMA_FUNC_DEF_444(16, 64, cpu);
361
// CHROMA_SP_FILTERS_SSE4_420(cpu): apply SETUP_CHROMA_SP_FUNC_DEF_420
// (filter_vsp) to the 4:2:0 sizes listed below. No width-8 size appears
// here; those are listed in CHROMA_SP_FILTERS_420.
// NOTE(review): the SSE4 in the name presumably marks sizes whose vert_sp
// kernels need SSE4 -- confirm against the call sites.
362 #define CHROMA_SP_FILTERS_SSE4_420(cpu) \
363 SETUP_CHROMA_SP_FUNC_DEF_420(4, 4, cpu); \
364 SETUP_CHROMA_SP_FUNC_DEF_420(4, 2, cpu); \
365 SETUP_CHROMA_SP_FUNC_DEF_420(2, 4, cpu); \
366 SETUP_CHROMA_SP_FUNC_DEF_420(4, 8, cpu); \
367 SETUP_CHROMA_SP_FUNC_DEF_420(6, 8, cpu); \
368 SETUP_CHROMA_SP_FUNC_DEF_420(2, 8, cpu); \
369 SETUP_CHROMA_SP_FUNC_DEF_420(16, 16, cpu); \
370 SETUP_CHROMA_SP_FUNC_DEF_420(16, 8, cpu); \
371 SETUP_CHROMA_SP_FUNC_DEF_420(16, 12, cpu); \
372 SETUP_CHROMA_SP_FUNC_DEF_420(12, 16, cpu); \
373 SETUP_CHROMA_SP_FUNC_DEF_420(16, 4, cpu); \
374 SETUP_CHROMA_SP_FUNC_DEF_420(4, 16, cpu); \
375 SETUP_CHROMA_SP_FUNC_DEF_420(32, 32, cpu); \
376 SETUP_CHROMA_SP_FUNC_DEF_420(32, 16, cpu); \
377 SETUP_CHROMA_SP_FUNC_DEF_420(16, 32, cpu); \
378 SETUP_CHROMA_SP_FUNC_DEF_420(32, 24, cpu); \
379 SETUP_CHROMA_SP_FUNC_DEF_420(24, 32, cpu); \
380 SETUP_CHROMA_SP_FUNC_DEF_420(32, 8, cpu);
381
// CHROMA_SP_FILTERS_420(cpu): apply SETUP_CHROMA_SP_FUNC_DEF_420
// (filter_vsp) to the width-8 4:2:0 sizes only: 8x2, 8x4, 8x6, 8x8, 8x16
// and 8x32.
382 #define CHROMA_SP_FILTERS_420(cpu) \
383 SETUP_CHROMA_SP_FUNC_DEF_420(8, 2, cpu); \
384 SETUP_CHROMA_SP_FUNC_DEF_420(8, 4, cpu); \
385 SETUP_CHROMA_SP_FUNC_DEF_420(8, 6, cpu); \
386 SETUP_CHROMA_SP_FUNC_DEF_420(8, 8, cpu); \
387 SETUP_CHROMA_SP_FUNC_DEF_420(8, 16, cpu); \
388 SETUP_CHROMA_SP_FUNC_DEF_420(8, 32, cpu);
389
// CHROMA_SP_FILTERS_SSE4_422(cpu): apply SETUP_CHROMA_SP_FUNC_DEF_422
// (filter_vsp) to the 4:2:2 sizes listed below. No width-8 size appears
// here; those are listed in CHROMA_SP_FILTERS_422.
390 #define CHROMA_SP_FILTERS_SSE4_422(cpu) \
391 SETUP_CHROMA_SP_FUNC_DEF_422(4, 8, cpu); \
392 SETUP_CHROMA_SP_FUNC_DEF_422(4, 4, cpu); \
393 SETUP_CHROMA_SP_FUNC_DEF_422(2, 8, cpu); \
394 SETUP_CHROMA_SP_FUNC_DEF_422(4, 16, cpu); \
395 SETUP_CHROMA_SP_FUNC_DEF_422(6, 16, cpu); \
396 SETUP_CHROMA_SP_FUNC_DEF_422(2, 16, cpu); \
397 SETUP_CHROMA_SP_FUNC_DEF_422(16, 32, cpu); \
398 SETUP_CHROMA_SP_FUNC_DEF_422(16, 16, cpu); \
399 SETUP_CHROMA_SP_FUNC_DEF_422(16, 24, cpu); \
400 SETUP_CHROMA_SP_FUNC_DEF_422(12, 32, cpu); \
401 SETUP_CHROMA_SP_FUNC_DEF_422(16, 8, cpu); \
402 SETUP_CHROMA_SP_FUNC_DEF_422(4, 32, cpu); \
403 SETUP_CHROMA_SP_FUNC_DEF_422(32, 64, cpu); \
404 SETUP_CHROMA_SP_FUNC_DEF_422(32, 32, cpu); \
405 SETUP_CHROMA_SP_FUNC_DEF_422(16, 64, cpu); \
406 SETUP_CHROMA_SP_FUNC_DEF_422(32, 48, cpu); \
407 SETUP_CHROMA_SP_FUNC_DEF_422(24, 64, cpu); \
408 SETUP_CHROMA_SP_FUNC_DEF_422(32, 16, cpu);
409
// CHROMA_SP_FILTERS_422(cpu): apply SETUP_CHROMA_SP_FUNC_DEF_422
// (filter_vsp) to the width-8 4:2:2 sizes only: 8x4, 8x8, 8x12, 8x16, 8x32
// and 8x64.
410 #define CHROMA_SP_FILTERS_422(cpu) \
411 SETUP_CHROMA_SP_FUNC_DEF_422(8, 4, cpu); \
412 SETUP_CHROMA_SP_FUNC_DEF_422(8, 8, cpu); \
413 SETUP_CHROMA_SP_FUNC_DEF_422(8, 12, cpu); \
414 SETUP_CHROMA_SP_FUNC_DEF_422(8, 16, cpu); \
415 SETUP_CHROMA_SP_FUNC_DEF_422(8, 32, cpu); \
416 SETUP_CHROMA_SP_FUNC_DEF_422(8, 64, cpu);
417
// CHROMA_SP_FILTERS_SSE4_444(cpu): apply SETUP_CHROMA_SP_FUNC_DEF_444
// (filter_vsp) to the 4:4:4 sizes listed below. No width-8 size appears
// here; those are listed in CHROMA_SP_FILTERS_444. 4x4 is not assigned by
// either macro.
418 #define CHROMA_SP_FILTERS_SSE4_444(cpu) \
419 SETUP_CHROMA_SP_FUNC_DEF_444(4, 8, cpu); \
420 SETUP_CHROMA_SP_FUNC_DEF_444(16, 16, cpu); \
421 SETUP_CHROMA_SP_FUNC_DEF_444(16, 8, cpu); \
422 SETUP_CHROMA_SP_FUNC_DEF_444(16, 12, cpu); \
423 SETUP_CHROMA_SP_FUNC_DEF_444(12, 16, cpu); \
424 SETUP_CHROMA_SP_FUNC_DEF_444(16, 4, cpu); \
425 SETUP_CHROMA_SP_FUNC_DEF_444(4, 16, cpu); \
426 SETUP_CHROMA_SP_FUNC_DEF_444(32, 32, cpu); \
427 SETUP_CHROMA_SP_FUNC_DEF_444(32, 16, cpu); \
428 SETUP_CHROMA_SP_FUNC_DEF_444(16, 32, cpu); \
429 SETUP_CHROMA_SP_FUNC_DEF_444(32, 24, cpu); \
430 SETUP_CHROMA_SP_FUNC_DEF_444(24, 32, cpu); \
431 SETUP_CHROMA_SP_FUNC_DEF_444(32, 8, cpu); \
432 SETUP_CHROMA_SP_FUNC_DEF_444(64, 64, cpu); \
433 SETUP_CHROMA_SP_FUNC_DEF_444(64, 32, cpu); \
434 SETUP_CHROMA_SP_FUNC_DEF_444(32, 64, cpu); \
435 SETUP_CHROMA_SP_FUNC_DEF_444(64, 48, cpu); \
436 SETUP_CHROMA_SP_FUNC_DEF_444(48, 64, cpu); \
437 SETUP_CHROMA_SP_FUNC_DEF_444(64, 16, cpu); \
438 SETUP_CHROMA_SP_FUNC_DEF_444(16, 64, cpu);
439
// CHROMA_SP_FILTERS_444(cpu): apply SETUP_CHROMA_SP_FUNC_DEF_444
// (filter_vsp) to the width-8 4:4:4 sizes only: 8x8, 8x4, 8x16 and 8x32.
440 #define CHROMA_SP_FILTERS_444(cpu) \
441 SETUP_CHROMA_SP_FUNC_DEF_444(8, 8, cpu); \
442 SETUP_CHROMA_SP_FUNC_DEF_444(8, 4, cpu); \
443 SETUP_CHROMA_SP_FUNC_DEF_444(8, 16, cpu); \
444 SETUP_CHROMA_SP_FUNC_DEF_444(8, 32, cpu);
445
// CHROMA_SS_FILTERS_420(cpu): apply SETUP_CHROMA_SS_FUNC_DEF_420
// (filter_vss) to the 4:2:0 sizes listed below. The 2x4, 2x8 and 6x8 sizes
// are not here; they are listed in CHROMA_SS_FILTERS_SSE4_420.
446 #define CHROMA_SS_FILTERS_420(cpu) \
447 SETUP_CHROMA_SS_FUNC_DEF_420(4, 4, cpu); \
448 SETUP_CHROMA_SS_FUNC_DEF_420(4, 2, cpu); \
449 SETUP_CHROMA_SS_FUNC_DEF_420(8, 8, cpu); \
450 SETUP_CHROMA_SS_FUNC_DEF_420(8, 4, cpu); \
451 SETUP_CHROMA_SS_FUNC_DEF_420(4, 8, cpu); \
452 SETUP_CHROMA_SS_FUNC_DEF_420(8, 6, cpu); \
453 SETUP_CHROMA_SS_FUNC_DEF_420(8, 2, cpu); \
454 SETUP_CHROMA_SS_FUNC_DEF_420(16, 16, cpu); \
455 SETUP_CHROMA_SS_FUNC_DEF_420(16, 8, cpu); \
456 SETUP_CHROMA_SS_FUNC_DEF_420(8, 16, cpu); \
457 SETUP_CHROMA_SS_FUNC_DEF_420(16, 12, cpu); \
458 SETUP_CHROMA_SS_FUNC_DEF_420(12, 16, cpu); \
459 SETUP_CHROMA_SS_FUNC_DEF_420(16, 4, cpu); \
460 SETUP_CHROMA_SS_FUNC_DEF_420(4, 16, cpu); \
461 SETUP_CHROMA_SS_FUNC_DEF_420(32, 32, cpu); \
462 SETUP_CHROMA_SS_FUNC_DEF_420(32, 16, cpu); \
463 SETUP_CHROMA_SS_FUNC_DEF_420(16, 32, cpu); \
464 SETUP_CHROMA_SS_FUNC_DEF_420(32, 24, cpu); \
465 SETUP_CHROMA_SS_FUNC_DEF_420(24, 32, cpu); \
466 SETUP_CHROMA_SS_FUNC_DEF_420(32, 8, cpu); \
467 SETUP_CHROMA_SS_FUNC_DEF_420(8, 32, cpu);
468
// CHROMA_SS_FILTERS_SSE4_420(cpu): apply SETUP_CHROMA_SS_FUNC_DEF_420
// (filter_vss) to the three 4:2:0 sizes that CHROMA_SS_FILTERS_420 leaves
// out: 2x4, 2x8 and 6x8.
469 #define CHROMA_SS_FILTERS_SSE4_420(cpu) \
470 SETUP_CHROMA_SS_FUNC_DEF_420(2, 4, cpu); \
471 SETUP_CHROMA_SS_FUNC_DEF_420(2, 8, cpu); \
472 SETUP_CHROMA_SS_FUNC_DEF_420(6, 8, cpu);
473
// CHROMA_SS_FILTERS_422(cpu): apply SETUP_CHROMA_SS_FUNC_DEF_422
// (filter_vss) to the 4:2:2 sizes listed below. The 2x8, 2x16 and 6x16
// sizes are not here; they are listed in CHROMA_SS_FILTERS_SSE4_422.
474 #define CHROMA_SS_FILTERS_422(cpu) \
475 SETUP_CHROMA_SS_FUNC_DEF_422(4, 8, cpu); \
476 SETUP_CHROMA_SS_FUNC_DEF_422(4, 4, cpu); \
477 SETUP_CHROMA_SS_FUNC_DEF_422(8, 16, cpu); \
478 SETUP_CHROMA_SS_FUNC_DEF_422(8, 8, cpu); \
479 SETUP_CHROMA_SS_FUNC_DEF_422(4, 16, cpu); \
480 SETUP_CHROMA_SS_FUNC_DEF_422(8, 12, cpu); \
481 SETUP_CHROMA_SS_FUNC_DEF_422(8, 4, cpu); \
482 SETUP_CHROMA_SS_FUNC_DEF_422(16, 32, cpu); \
483 SETUP_CHROMA_SS_FUNC_DEF_422(16, 16, cpu); \
484 SETUP_CHROMA_SS_FUNC_DEF_422(8, 32, cpu); \
485 SETUP_CHROMA_SS_FUNC_DEF_422(16, 24, cpu); \
486 SETUP_CHROMA_SS_FUNC_DEF_422(12, 32, cpu); \
487 SETUP_CHROMA_SS_FUNC_DEF_422(16, 8, cpu); \
488 SETUP_CHROMA_SS_FUNC_DEF_422(4, 32, cpu); \
489 SETUP_CHROMA_SS_FUNC_DEF_422(32, 64, cpu); \
490 SETUP_CHROMA_SS_FUNC_DEF_422(32, 32, cpu); \
491 SETUP_CHROMA_SS_FUNC_DEF_422(16, 64, cpu); \
492 SETUP_CHROMA_SS_FUNC_DEF_422(32, 48, cpu); \
493 SETUP_CHROMA_SS_FUNC_DEF_422(24, 64, cpu); \
494 SETUP_CHROMA_SS_FUNC_DEF_422(32, 16, cpu); \
495 SETUP_CHROMA_SS_FUNC_DEF_422(8, 64, cpu);
496
// CHROMA_SS_FILTERS_SSE4_422(cpu): apply SETUP_CHROMA_SS_FUNC_DEF_422
// (filter_vss) to the three 4:2:2 sizes that CHROMA_SS_FILTERS_422 leaves
// out: 2x8, 2x16 and 6x16.
497 #define CHROMA_SS_FILTERS_SSE4_422(cpu) \
498 SETUP_CHROMA_SS_FUNC_DEF_422(2, 8, cpu); \
499 SETUP_CHROMA_SS_FUNC_DEF_422(2, 16, cpu); \
500 SETUP_CHROMA_SS_FUNC_DEF_422(6, 16, cpu);
501
// CHROMA_SS_FILTERS_444(cpu): apply SETUP_CHROMA_SS_FUNC_DEF_444
// (filter_vss) to every 4:4:4 size listed below (4x8 up to 64x64; 4x4 is
// not in this list).
502 #define CHROMA_SS_FILTERS_444(cpu) \
503 SETUP_CHROMA_SS_FUNC_DEF_444(8, 8, cpu); \
504 SETUP_CHROMA_SS_FUNC_DEF_444(8, 4, cpu); \
505 SETUP_CHROMA_SS_FUNC_DEF_444(4, 8, cpu); \
506 SETUP_CHROMA_SS_FUNC_DEF_444(16, 16, cpu); \
507 SETUP_CHROMA_SS_FUNC_DEF_444(16, 8, cpu); \
508 SETUP_CHROMA_SS_FUNC_DEF_444(8, 16, cpu); \
509 SETUP_CHROMA_SS_FUNC_DEF_444(16, 12, cpu); \
510 SETUP_CHROMA_SS_FUNC_DEF_444(12, 16, cpu); \
511 SETUP_CHROMA_SS_FUNC_DEF_444(16, 4, cpu); \
512 SETUP_CHROMA_SS_FUNC_DEF_444(4, 16, cpu); \
513 SETUP_CHROMA_SS_FUNC_DEF_444(32, 32, cpu); \
514 SETUP_CHROMA_SS_FUNC_DEF_444(32, 16, cpu); \
515 SETUP_CHROMA_SS_FUNC_DEF_444(16, 32, cpu); \
516 SETUP_CHROMA_SS_FUNC_DEF_444(32, 24, cpu); \
517 SETUP_CHROMA_SS_FUNC_DEF_444(24, 32, cpu); \
518 SETUP_CHROMA_SS_FUNC_DEF_444(32, 8, cpu); \
519 SETUP_CHROMA_SS_FUNC_DEF_444(8, 32, cpu); \
520 SETUP_CHROMA_SS_FUNC_DEF_444(64, 64, cpu); \
521 SETUP_CHROMA_SS_FUNC_DEF_444(64, 32, cpu); \
522 SETUP_CHROMA_SS_FUNC_DEF_444(32, 64, cpu); \
523 SETUP_CHROMA_SS_FUNC_DEF_444(64, 48, cpu); \
524 SETUP_CHROMA_SS_FUNC_DEF_444(48, 64, cpu); \
525 SETUP_CHROMA_SS_FUNC_DEF_444(64, 16, cpu); \
526 SETUP_CHROMA_SS_FUNC_DEF_444(16, 64, cpu);
527
// SETUP_LUMA_FUNC_DEF(W, H, cpu): for one luma block size, point the
// luma_hpp, luma_hps, luma_vpp and luma_vps tables at the matching
// x265_interp_8tap_{horiz,vert}_{pp,ps}_<W>x<H><cpu> symbol. When
// HIGH_BIT_DEPTH is set the macro additionally assigns luma_vsp from
// x265_interp_8tap_vert_sp_; the other build leaves luma_vsp to
// SETUP_LUMA_SP_FUNC_DEF. `cpu` is pasted directly after the size, so it
// must carry its own separator.
528 #if HIGH_BIT_DEPTH // temporary, until all 10bit functions are completed
529 #define SETUP_LUMA_FUNC_DEF(W, H, cpu) \
530 p.luma_hpp[LUMA_ ## W ## x ## H] = x265_interp_8tap_horiz_pp_ ## W ## x ## H ## cpu; \
531 p.luma_hps[LUMA_ ## W ## x ## H] = x265_interp_8tap_horiz_ps_ ## W ## x ## H ## cpu; \
532 p.luma_vpp[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_pp_ ## W ## x ## H ## cpu; \
533 p.luma_vps[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_ps_ ## W ## x ## H ## cpu; \
534 p.luma_vsp[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_sp_ ## W ## x ## H ## cpu;
535 #else
536 #define SETUP_LUMA_FUNC_DEF(W, H, cpu) \
537 p.luma_hpp[LUMA_ ## W ## x ## H] = x265_interp_8tap_horiz_pp_ ## W ## x ## H ## cpu; \
538 p.luma_hps[LUMA_ ## W ## x ## H] = x265_interp_8tap_horiz_ps_ ## W ## x ## H ## cpu; \
539 p.luma_vpp[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_pp_ ## W ## x ## H ## cpu; \
540 p.luma_vps[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_ps_ ## W ## x ## H ## cpu;
541 #endif // if HIGH_BIT_DEPTH
542
// SETUP_LUMA_SUB_FUNC_DEF(W, H, cpu): despite the "SUB" in the name, this
// assigns both p.luma_sub_ps and p.luma_add_ps at LUMA_<W>x<H>, from
// x265_pixel_sub_ps_<W>x<H><cpu> and x265_pixel_add_ps_<W>x<H><cpu>.
543 #define SETUP_LUMA_SUB_FUNC_DEF(W, H, cpu) \
544 p.luma_sub_ps[LUMA_ ## W ## x ## H] = x265_pixel_sub_ps_ ## W ## x ## H ## cpu; \
545 p.luma_add_ps[LUMA_ ## W ## x ## H] = x265_pixel_add_ps_ ## W ## x ## H ## cpu;
546
// SETUP_LUMA_SP_FUNC_DEF(W, H, cpu): set p.luma_vsp[LUMA_<W>x<H>] to
// x265_interp_8tap_vert_sp_<W>x<H><cpu>.
547 #define SETUP_LUMA_SP_FUNC_DEF(W, H, cpu) \
548 p.luma_vsp[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_sp_ ## W ## x ## H ## cpu;
549
// SETUP_LUMA_SS_FUNC_DEF(W, H, cpu): set p.luma_vss[LUMA_<W>x<H>] to
// x265_interp_8tap_vert_ss_<W>x<H><cpu>.
550 #define SETUP_LUMA_SS_FUNC_DEF(W, H, cpu) \
551 p.luma_vss[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_ss_ ## W ## x ## H ## cpu;
552
// SETUP_LUMA_BLOCKCOPY(type, W, H, cpu): set p.luma_copy_<type>[LUMA_<W>x<H>]
// to x265_blockcopy_<type>_<W>x<H><cpu>. `type` selects both the table
// member and the symbol family; `cpu` is pasted directly after the size.
553 #define SETUP_LUMA_BLOCKCOPY(type, W, H, cpu) \
554 p.luma_copy_ ## type[LUMA_ ## W ## x ## H] = x265_blockcopy_ ## type ## _ ## W ## x ## H ## cpu;
555
// SETUP_CHROMA_BLOCKCOPY(type, W, H, cpu): set the copy_<type> entry of
// p.chroma[X265_CSP_I420] at CHROMA_<W>x<H> to
// x265_blockcopy_<type>_<W>x<H><cpu>.
556 #define SETUP_CHROMA_BLOCKCOPY(type, W, H, cpu) \
557 p.chroma[X265_CSP_I420].copy_ ## type[CHROMA_ ## W ## x ## H] = x265_blockcopy_ ## type ## _ ## W ## x ## H ## cpu;
558
// CHROMA_BLOCKCOPY(type, cpu): apply SETUP_CHROMA_BLOCKCOPY to every 4:2:0
// chroma block size listed below (2x4 up to 32x32), filling the
// copy_<type> table of p.chroma[X265_CSP_I420].
559 #define CHROMA_BLOCKCOPY(type, cpu) \
560 SETUP_CHROMA_BLOCKCOPY(type, 2, 4, cpu); \
561 SETUP_CHROMA_BLOCKCOPY(type, 2, 8, cpu); \
562 SETUP_CHROMA_BLOCKCOPY(type, 4, 2, cpu); \
563 SETUP_CHROMA_BLOCKCOPY(type, 4, 4, cpu); \
564 SETUP_CHROMA_BLOCKCOPY(type, 4, 8, cpu); \
565 SETUP_CHROMA_BLOCKCOPY(type, 4, 16, cpu); \
566 SETUP_CHROMA_BLOCKCOPY(type, 6, 8, cpu); \
567 SETUP_CHROMA_BLOCKCOPY(type, 8, 2, cpu); \
568 SETUP_CHROMA_BLOCKCOPY(type, 8, 4, cpu); \
569 SETUP_CHROMA_BLOCKCOPY(type, 8, 6, cpu); \
570 SETUP_CHROMA_BLOCKCOPY(type, 8, 8, cpu); \
571 SETUP_CHROMA_BLOCKCOPY(type, 8, 16, cpu); \
572 SETUP_CHROMA_BLOCKCOPY(type, 8, 32, cpu); \
573 SETUP_CHROMA_BLOCKCOPY(type, 12, 16, cpu); \
574 SETUP_CHROMA_BLOCKCOPY(type, 16, 4, cpu); \
575 SETUP_CHROMA_BLOCKCOPY(type, 16, 8, cpu); \
576 SETUP_CHROMA_BLOCKCOPY(type, 16, 12, cpu); \
577 SETUP_CHROMA_BLOCKCOPY(type, 16, 16, cpu); \
578 SETUP_CHROMA_BLOCKCOPY(type, 16, 32, cpu); \
579 SETUP_CHROMA_BLOCKCOPY(type, 24, 32, cpu); \
580 SETUP_CHROMA_BLOCKCOPY(type, 32, 8, cpu); \
581 SETUP_CHROMA_BLOCKCOPY(type, 32, 16, cpu); \
582 SETUP_CHROMA_BLOCKCOPY(type, 32, 24, cpu); \
583 SETUP_CHROMA_BLOCKCOPY(type, 32, 32, cpu);
584
// SETUP_CHROMA_BLOCKCOPY_422(type, W, H, cpu): set the copy_<type> entry of
// p.chroma[X265_CSP_I422] at CHROMA422_<W>x<H> to
// x265_blockcopy_<type>_<W>x<H><cpu>.
585 #define SETUP_CHROMA_BLOCKCOPY_422(type, W, H, cpu) \
586 p.chroma[X265_CSP_I422].copy_ ## type[CHROMA422_ ## W ## x ## H] = x265_blockcopy_ ## type ## _ ## W ## x ## H ## cpu;
587
// CHROMA_BLOCKCOPY_422(type, cpu): apply SETUP_CHROMA_BLOCKCOPY_422 to
// every 4:2:2 chroma block size listed below (2x8 up to 32x64), filling the
// copy_<type> table of p.chroma[X265_CSP_I422].
588 #define CHROMA_BLOCKCOPY_422(type, cpu) \
589 SETUP_CHROMA_BLOCKCOPY_422(type, 2, 8, cpu); \
590 SETUP_CHROMA_BLOCKCOPY_422(type, 2, 16, cpu); \
591 SETUP_CHROMA_BLOCKCOPY_422(type, 4, 4, cpu); \
592 SETUP_CHROMA_BLOCKCOPY_422(type, 4, 8, cpu); \
593 SETUP_CHROMA_BLOCKCOPY_422(type, 4, 16, cpu); \
594 SETUP_CHROMA_BLOCKCOPY_422(type, 4, 32, cpu); \
595 SETUP_CHROMA_BLOCKCOPY_422(type, 6, 16, cpu); \
596 SETUP_CHROMA_BLOCKCOPY_422(type, 8, 4, cpu); \
597 SETUP_CHROMA_BLOCKCOPY_422(type, 8, 8, cpu); \
598 SETUP_CHROMA_BLOCKCOPY_422(type, 8, 12, cpu); \
599 SETUP_CHROMA_BLOCKCOPY_422(type, 8, 16, cpu); \
600 SETUP_CHROMA_BLOCKCOPY_422(type, 8, 32, cpu); \
601 SETUP_CHROMA_BLOCKCOPY_422(type, 8, 64, cpu); \
602 SETUP_CHROMA_BLOCKCOPY_422(type, 12, 32, cpu); \
603 SETUP_CHROMA_BLOCKCOPY_422(type, 16, 8, cpu); \
604 SETUP_CHROMA_BLOCKCOPY_422(type, 16, 16, cpu); \
605 SETUP_CHROMA_BLOCKCOPY_422(type, 16, 24, cpu); \
606 SETUP_CHROMA_BLOCKCOPY_422(type, 16, 32, cpu); \
607 SETUP_CHROMA_BLOCKCOPY_422(type, 16, 64, cpu); \
608 SETUP_CHROMA_BLOCKCOPY_422(type, 24, 64, cpu); \
609 SETUP_CHROMA_BLOCKCOPY_422(type, 32, 16, cpu); \
610 SETUP_CHROMA_BLOCKCOPY_422(type, 32, 32, cpu); \
611 SETUP_CHROMA_BLOCKCOPY_422(type, 32, 48, cpu); \
612 SETUP_CHROMA_BLOCKCOPY_422(type, 32, 64, cpu);
613
// LUMA_BLOCKCOPY(type, cpu): apply SETUP_LUMA_BLOCKCOPY to every luma block
// size listed below (4x4 up to 64x64), filling p.luma_copy_<type>[].
614 #define LUMA_BLOCKCOPY(type, cpu) \
615 SETUP_LUMA_BLOCKCOPY(type, 4, 4, cpu); \
616 SETUP_LUMA_BLOCKCOPY(type, 8, 8, cpu); \
617 SETUP_LUMA_BLOCKCOPY(type, 8, 4, cpu); \
618 SETUP_LUMA_BLOCKCOPY(type, 4, 8, cpu); \
619 SETUP_LUMA_BLOCKCOPY(type, 16, 16, cpu); \
620 SETUP_LUMA_BLOCKCOPY(type, 16, 8, cpu); \
621 SETUP_LUMA_BLOCKCOPY(type, 8, 16, cpu); \
622 SETUP_LUMA_BLOCKCOPY(type, 16, 12, cpu); \
623 SETUP_LUMA_BLOCKCOPY(type, 12, 16, cpu); \
624 SETUP_LUMA_BLOCKCOPY(type, 16, 4, cpu); \
625 SETUP_LUMA_BLOCKCOPY(type, 4, 16, cpu); \
626 SETUP_LUMA_BLOCKCOPY(type, 32, 32, cpu); \
627 SETUP_LUMA_BLOCKCOPY(type, 32, 16, cpu); \
628 SETUP_LUMA_BLOCKCOPY(type, 16, 32, cpu); \
629 SETUP_LUMA_BLOCKCOPY(type, 32, 24, cpu); \
630 SETUP_LUMA_BLOCKCOPY(type, 24, 32, cpu); \
631 SETUP_LUMA_BLOCKCOPY(type, 32, 8, cpu); \
632 SETUP_LUMA_BLOCKCOPY(type, 8, 32, cpu); \
633 SETUP_LUMA_BLOCKCOPY(type, 64, 64, cpu); \
634 SETUP_LUMA_BLOCKCOPY(type, 64, 32, cpu); \
635 SETUP_LUMA_BLOCKCOPY(type, 32, 64, cpu); \
636 SETUP_LUMA_BLOCKCOPY(type, 64, 48, cpu); \
637 SETUP_LUMA_BLOCKCOPY(type, 48, 64, cpu); \
638 SETUP_LUMA_BLOCKCOPY(type, 64, 16, cpu); \
639 SETUP_LUMA_BLOCKCOPY(type, 16, 64, cpu);
640
/* Stores x265_blockcopy_sp_<W>x<H><suffix> into the copy_sp table of
 * p.chroma[X265_CSP_I420], at index CHROMA_<W>x<H>. */
#define SETUP_CHROMA_BLOCKCOPY_SP(width, height, suffix) \
    p.chroma[X265_CSP_I420].copy_sp[CHROMA_ ## width ## x ## height] = \
        x265_blockcopy_sp_ ## width ## x ## height ## suffix;
643
/* Applies SETUP_CHROMA_BLOCKCOPY_SP to each of the 24 chroma block sizes
 * listed below, using the given symbol suffix. */
#define CHROMA_BLOCKCOPY_SP(cpu) \
    SETUP_CHROMA_BLOCKCOPY_SP( 2,  4, cpu); \
    SETUP_CHROMA_BLOCKCOPY_SP( 2,  8, cpu); \
    SETUP_CHROMA_BLOCKCOPY_SP( 4,  2, cpu); \
    SETUP_CHROMA_BLOCKCOPY_SP( 4,  4, cpu); \
    SETUP_CHROMA_BLOCKCOPY_SP( 4,  8, cpu); \
    SETUP_CHROMA_BLOCKCOPY_SP( 4, 16, cpu); \
    SETUP_CHROMA_BLOCKCOPY_SP( 6,  8, cpu); \
    SETUP_CHROMA_BLOCKCOPY_SP( 8,  2, cpu); \
    SETUP_CHROMA_BLOCKCOPY_SP( 8,  4, cpu); \
    SETUP_CHROMA_BLOCKCOPY_SP( 8,  6, cpu); \
    SETUP_CHROMA_BLOCKCOPY_SP( 8,  8, cpu); \
    SETUP_CHROMA_BLOCKCOPY_SP( 8, 16, cpu); \
    SETUP_CHROMA_BLOCKCOPY_SP( 8, 32, cpu); \
    SETUP_CHROMA_BLOCKCOPY_SP(12, 16, cpu); \
    SETUP_CHROMA_BLOCKCOPY_SP(16,  4, cpu); \
    SETUP_CHROMA_BLOCKCOPY_SP(16,  8, cpu); \
    SETUP_CHROMA_BLOCKCOPY_SP(16, 12, cpu); \
    SETUP_CHROMA_BLOCKCOPY_SP(16, 16, cpu); \
    SETUP_CHROMA_BLOCKCOPY_SP(16, 32, cpu); \
    SETUP_CHROMA_BLOCKCOPY_SP(24, 32, cpu); \
    SETUP_CHROMA_BLOCKCOPY_SP(32,  8, cpu); \
    SETUP_CHROMA_BLOCKCOPY_SP(32, 16, cpu); \
    SETUP_CHROMA_BLOCKCOPY_SP(32, 24, cpu); \
    SETUP_CHROMA_BLOCKCOPY_SP(32, 32, cpu);
669
/* Stores x265_blockcopy_sp_<W>x<H><suffix> into the copy_sp table of
 * p.chroma[X265_CSP_I422], at index CHROMA422_<W>x<H>. */
#define SETUP_CHROMA_BLOCKCOPY_SP_422(width, height, suffix) \
    p.chroma[X265_CSP_I422].copy_sp[CHROMA422_ ## width ## x ## height] = \
        x265_blockcopy_sp_ ## width ## x ## height ## suffix;
672
/* Applies SETUP_CHROMA_BLOCKCOPY_SP_422 to each of the 24 chroma block sizes
 * listed below, using the given symbol suffix. */
#define CHROMA_BLOCKCOPY_SP_422(cpu) \
    SETUP_CHROMA_BLOCKCOPY_SP_422( 2,  8, cpu); \
    SETUP_CHROMA_BLOCKCOPY_SP_422( 2, 16, cpu); \
    SETUP_CHROMA_BLOCKCOPY_SP_422( 4,  4, cpu); \
    SETUP_CHROMA_BLOCKCOPY_SP_422( 4,  8, cpu); \
    SETUP_CHROMA_BLOCKCOPY_SP_422( 4, 16, cpu); \
    SETUP_CHROMA_BLOCKCOPY_SP_422( 4, 32, cpu); \
    SETUP_CHROMA_BLOCKCOPY_SP_422( 6, 16, cpu); \
    SETUP_CHROMA_BLOCKCOPY_SP_422( 8,  4, cpu); \
    SETUP_CHROMA_BLOCKCOPY_SP_422( 8,  8, cpu); \
    SETUP_CHROMA_BLOCKCOPY_SP_422( 8, 12, cpu); \
    SETUP_CHROMA_BLOCKCOPY_SP_422( 8, 16, cpu); \
    SETUP_CHROMA_BLOCKCOPY_SP_422( 8, 32, cpu); \
    SETUP_CHROMA_BLOCKCOPY_SP_422( 8, 64, cpu); \
    SETUP_CHROMA_BLOCKCOPY_SP_422(12, 32, cpu); \
    SETUP_CHROMA_BLOCKCOPY_SP_422(16,  8, cpu); \
    SETUP_CHROMA_BLOCKCOPY_SP_422(16, 16, cpu); \
    SETUP_CHROMA_BLOCKCOPY_SP_422(16, 24, cpu); \
    SETUP_CHROMA_BLOCKCOPY_SP_422(16, 32, cpu); \
    SETUP_CHROMA_BLOCKCOPY_SP_422(16, 64, cpu); \
    SETUP_CHROMA_BLOCKCOPY_SP_422(24, 64, cpu); \
    SETUP_CHROMA_BLOCKCOPY_SP_422(32, 16, cpu); \
    SETUP_CHROMA_BLOCKCOPY_SP_422(32, 32, cpu); \
    SETUP_CHROMA_BLOCKCOPY_SP_422(32, 48, cpu); \
    SETUP_CHROMA_BLOCKCOPY_SP_422(32, 64, cpu);
698
/* For index CHROMA_<W>x<H> of p.chroma[X265_CSP_I420], stores
 * x265_pixel_sub_ps_<W>x<H><suffix> in sub_ps and
 * x265_pixel_add_ps_<W>x<H><suffix> in add_ps. */
#define SETUP_CHROMA_PIXELSUB(width, height, suffix) \
    p.chroma[X265_CSP_I420].sub_ps[CHROMA_ ## width ## x ## height] = \
        x265_pixel_sub_ps_ ## width ## x ## height ## suffix; \
    p.chroma[X265_CSP_I420].add_ps[CHROMA_ ## width ## x ## height] = \
        x265_pixel_add_ps_ ## width ## x ## height ## suffix;
702
/* Applies SETUP_CHROMA_PIXELSUB to the four square sizes 4x4 .. 32x32. */
#define CHROMA_PIXELSUB_PS(cpu) \
    SETUP_CHROMA_PIXELSUB( 4,  4, cpu); \
    SETUP_CHROMA_PIXELSUB( 8,  8, cpu); \
    SETUP_CHROMA_PIXELSUB(16, 16, cpu); \
    SETUP_CHROMA_PIXELSUB(32, 32, cpu);
708
/* For index CHROMA422_<W>x<H> of p.chroma[X265_CSP_I422], stores
 * x265_pixel_sub_ps_<W>x<H><suffix> in sub_ps and
 * x265_pixel_add_ps_<W>x<H><suffix> in add_ps. */
#define SETUP_CHROMA_PIXELSUB_422(width, height, suffix) \
    p.chroma[X265_CSP_I422].sub_ps[CHROMA422_ ## width ## x ## height] = \
        x265_pixel_sub_ps_ ## width ## x ## height ## suffix; \
    p.chroma[X265_CSP_I422].add_ps[CHROMA422_ ## width ## x ## height] = \
        x265_pixel_add_ps_ ## width ## x ## height ## suffix;
712
/* Applies SETUP_CHROMA_PIXELSUB_422 to the four sizes 4x8, 8x16, 16x32 and
 * 32x64. */
#define CHROMA_PIXELSUB_PS_422(cpu) \
    SETUP_CHROMA_PIXELSUB_422( 4,  8, cpu); \
    SETUP_CHROMA_PIXELSUB_422( 8, 16, cpu); \
    SETUP_CHROMA_PIXELSUB_422(16, 32, cpu); \
    SETUP_CHROMA_PIXELSUB_422(32, 64, cpu);
718
/* Invokes SETUP_LUMA_FUNC_DEF(W, H, cpu) once for each of the 25 luma block
 * sizes listed below.  The primitives installed per size are determined by
 * SETUP_LUMA_FUNC_DEF, which is defined elsewhere in this file. */
#define LUMA_FILTERS(cpu) \
    SETUP_LUMA_FUNC_DEF( 4,  4, cpu); \
    SETUP_LUMA_FUNC_DEF( 8,  8, cpu); \
    SETUP_LUMA_FUNC_DEF( 8,  4, cpu); \
    SETUP_LUMA_FUNC_DEF( 4,  8, cpu); \
    SETUP_LUMA_FUNC_DEF(16, 16, cpu); \
    SETUP_LUMA_FUNC_DEF(16,  8, cpu); \
    SETUP_LUMA_FUNC_DEF( 8, 16, cpu); \
    SETUP_LUMA_FUNC_DEF(16, 12, cpu); \
    SETUP_LUMA_FUNC_DEF(12, 16, cpu); \
    SETUP_LUMA_FUNC_DEF(16,  4, cpu); \
    SETUP_LUMA_FUNC_DEF( 4, 16, cpu); \
    SETUP_LUMA_FUNC_DEF(32, 32, cpu); \
    SETUP_LUMA_FUNC_DEF(32, 16, cpu); \
    SETUP_LUMA_FUNC_DEF(16, 32, cpu); \
    SETUP_LUMA_FUNC_DEF(32, 24, cpu); \
    SETUP_LUMA_FUNC_DEF(24, 32, cpu); \
    SETUP_LUMA_FUNC_DEF(32,  8, cpu); \
    SETUP_LUMA_FUNC_DEF( 8, 32, cpu); \
    SETUP_LUMA_FUNC_DEF(64, 64, cpu); \
    SETUP_LUMA_FUNC_DEF(64, 32, cpu); \
    SETUP_LUMA_FUNC_DEF(32, 64, cpu); \
    SETUP_LUMA_FUNC_DEF(64, 48, cpu); \
    SETUP_LUMA_FUNC_DEF(48, 64, cpu); \
    SETUP_LUMA_FUNC_DEF(64, 16, cpu); \
    SETUP_LUMA_FUNC_DEF(16, 64, cpu);
745
/* Invokes SETUP_LUMA_SUB_FUNC_DEF (defined elsewhere in this file) for the
 * five square sizes 4x4 .. 64x64. */
#define LUMA_PIXELSUB(cpu) \
    SETUP_LUMA_SUB_FUNC_DEF( 4,  4, cpu); \
    SETUP_LUMA_SUB_FUNC_DEF( 8,  8, cpu); \
    SETUP_LUMA_SUB_FUNC_DEF(16, 16, cpu); \
    SETUP_LUMA_SUB_FUNC_DEF(32, 32, cpu); \
    SETUP_LUMA_SUB_FUNC_DEF(64, 64, cpu);
752
/* Invokes SETUP_LUMA_SP_FUNC_DEF(W, H, cpu) once for each of the 25 luma
 * block sizes listed below.  The primitives installed per size are
 * determined by SETUP_LUMA_SP_FUNC_DEF, defined elsewhere in this file. */
#define LUMA_SP_FILTERS(cpu) \
    SETUP_LUMA_SP_FUNC_DEF( 4,  4, cpu); \
    SETUP_LUMA_SP_FUNC_DEF( 8,  8, cpu); \
    SETUP_LUMA_SP_FUNC_DEF( 8,  4, cpu); \
    SETUP_LUMA_SP_FUNC_DEF( 4,  8, cpu); \
    SETUP_LUMA_SP_FUNC_DEF(16, 16, cpu); \
    SETUP_LUMA_SP_FUNC_DEF(16,  8, cpu); \
    SETUP_LUMA_SP_FUNC_DEF( 8, 16, cpu); \
    SETUP_LUMA_SP_FUNC_DEF(16, 12, cpu); \
    SETUP_LUMA_SP_FUNC_DEF(12, 16, cpu); \
    SETUP_LUMA_SP_FUNC_DEF(16,  4, cpu); \
    SETUP_LUMA_SP_FUNC_DEF( 4, 16, cpu); \
    SETUP_LUMA_SP_FUNC_DEF(32, 32, cpu); \
    SETUP_LUMA_SP_FUNC_DEF(32, 16, cpu); \
    SETUP_LUMA_SP_FUNC_DEF(16, 32, cpu); \
    SETUP_LUMA_SP_FUNC_DEF(32, 24, cpu); \
    SETUP_LUMA_SP_FUNC_DEF(24, 32, cpu); \
    SETUP_LUMA_SP_FUNC_DEF(32,  8, cpu); \
    SETUP_LUMA_SP_FUNC_DEF( 8, 32, cpu); \
    SETUP_LUMA_SP_FUNC_DEF(64, 64, cpu); \
    SETUP_LUMA_SP_FUNC_DEF(64, 32, cpu); \
    SETUP_LUMA_SP_FUNC_DEF(32, 64, cpu); \
    SETUP_LUMA_SP_FUNC_DEF(64, 48, cpu); \
    SETUP_LUMA_SP_FUNC_DEF(48, 64, cpu); \
    SETUP_LUMA_SP_FUNC_DEF(64, 16, cpu); \
    SETUP_LUMA_SP_FUNC_DEF(16, 64, cpu);
779
/* Invokes SETUP_LUMA_SS_FUNC_DEF(W, H, cpu) once for each of the 25 luma
 * block sizes listed below.  The primitives installed per size are
 * determined by SETUP_LUMA_SS_FUNC_DEF, defined elsewhere in this file. */
#define LUMA_SS_FILTERS(cpu) \
    SETUP_LUMA_SS_FUNC_DEF( 4,  4, cpu); \
    SETUP_LUMA_SS_FUNC_DEF( 8,  8, cpu); \
    SETUP_LUMA_SS_FUNC_DEF( 8,  4, cpu); \
    SETUP_LUMA_SS_FUNC_DEF( 4,  8, cpu); \
    SETUP_LUMA_SS_FUNC_DEF(16, 16, cpu); \
    SETUP_LUMA_SS_FUNC_DEF(16,  8, cpu); \
    SETUP_LUMA_SS_FUNC_DEF( 8, 16, cpu); \
    SETUP_LUMA_SS_FUNC_DEF(16, 12, cpu); \
    SETUP_LUMA_SS_FUNC_DEF(12, 16, cpu); \
    SETUP_LUMA_SS_FUNC_DEF(16,  4, cpu); \
    SETUP_LUMA_SS_FUNC_DEF( 4, 16, cpu); \
    SETUP_LUMA_SS_FUNC_DEF(32, 32, cpu); \
    SETUP_LUMA_SS_FUNC_DEF(32, 16, cpu); \
    SETUP_LUMA_SS_FUNC_DEF(16, 32, cpu); \
    SETUP_LUMA_SS_FUNC_DEF(32, 24, cpu); \
    SETUP_LUMA_SS_FUNC_DEF(24, 32, cpu); \
    SETUP_LUMA_SS_FUNC_DEF(32,  8, cpu); \
    SETUP_LUMA_SS_FUNC_DEF( 8, 32, cpu); \
    SETUP_LUMA_SS_FUNC_DEF(64, 64, cpu); \
    SETUP_LUMA_SS_FUNC_DEF(64, 32, cpu); \
    SETUP_LUMA_SS_FUNC_DEF(32, 64, cpu); \
    SETUP_LUMA_SS_FUNC_DEF(64, 48, cpu); \
    SETUP_LUMA_SS_FUNC_DEF(48, 64, cpu); \
    SETUP_LUMA_SS_FUNC_DEF(64, 16, cpu); \
    SETUP_LUMA_SS_FUNC_DEF(16, 64, cpu);
806
/* Stores x265_pixel_var_<W>x<H><suffix> into p.var at index BLOCK_<W>x<H>. */
#define SETUP_PIXEL_VAR_DEF(width, height, suffix) \
    p.var[BLOCK_ ## width ## x ## height] = \
        x265_pixel_var_ ## width ## x ## height ## suffix;
809
/* Applies SETUP_PIXEL_VAR_DEF to the four square sizes 8x8 .. 64x64. */
#define LUMA_VAR(cpu) \
    SETUP_PIXEL_VAR_DEF( 8,  8, cpu); \
    SETUP_PIXEL_VAR_DEF(16, 16, cpu); \
    SETUP_PIXEL_VAR_DEF(32, 32, cpu); \
    SETUP_PIXEL_VAR_DEF(64, 64, cpu);
815
/* Stores x265_pixel_ssd_sp_<W>x<H><suffix> into p.sse_sp at index
 * LUMA_<W>x<H>. */
#define SETUP_PIXEL_SSE_SP_DEF(width, height, suffix) \
    p.sse_sp[LUMA_ ## width ## x ## height] = \
        x265_pixel_ssd_sp_ ## width ## x ## height ## suffix;
818
/* Applies SETUP_PIXEL_SSE_SP_DEF to each of the 25 luma block sizes listed
 * below, using the given symbol suffix. */
#define LUMA_SSE_SP(cpu) \
    SETUP_PIXEL_SSE_SP_DEF( 4,  4, cpu); \
    SETUP_PIXEL_SSE_SP_DEF( 8,  8, cpu); \
    SETUP_PIXEL_SSE_SP_DEF( 8,  4, cpu); \
    SETUP_PIXEL_SSE_SP_DEF( 4,  8, cpu); \
    SETUP_PIXEL_SSE_SP_DEF(16, 16, cpu); \
    SETUP_PIXEL_SSE_SP_DEF(16,  8, cpu); \
    SETUP_PIXEL_SSE_SP_DEF( 8, 16, cpu); \
    SETUP_PIXEL_SSE_SP_DEF(16, 12, cpu); \
    SETUP_PIXEL_SSE_SP_DEF(12, 16, cpu); \
    SETUP_PIXEL_SSE_SP_DEF(16,  4, cpu); \
    SETUP_PIXEL_SSE_SP_DEF( 4, 16, cpu); \
    SETUP_PIXEL_SSE_SP_DEF(32, 32, cpu); \
    SETUP_PIXEL_SSE_SP_DEF(32, 16, cpu); \
    SETUP_PIXEL_SSE_SP_DEF(16, 32, cpu); \
    SETUP_PIXEL_SSE_SP_DEF(32, 24, cpu); \
    SETUP_PIXEL_SSE_SP_DEF(24, 32, cpu); \
    SETUP_PIXEL_SSE_SP_DEF(32,  8, cpu); \
    SETUP_PIXEL_SSE_SP_DEF( 8, 32, cpu); \
    SETUP_PIXEL_SSE_SP_DEF(64, 64, cpu); \
    SETUP_PIXEL_SSE_SP_DEF(64, 32, cpu); \
    SETUP_PIXEL_SSE_SP_DEF(32, 64, cpu); \
    SETUP_PIXEL_SSE_SP_DEF(64, 48, cpu); \
    SETUP_PIXEL_SSE_SP_DEF(48, 64, cpu); \
    SETUP_PIXEL_SSE_SP_DEF(64, 16, cpu); \
    SETUP_PIXEL_SSE_SP_DEF(16, 64, cpu);
845
/* Stores x265_addAvg_<W>x<H><suffix> into p.luma_addAvg at index
 * LUMA_<W>x<H>. */
#define SETUP_LUMA_ADDAVG_FUNC_DEF(width, height, suffix) \
    p.luma_addAvg[LUMA_ ## width ## x ## height] = \
        x265_addAvg_ ## width ## x ## height ## suffix;
848
/* Applies SETUP_LUMA_ADDAVG_FUNC_DEF to each of the 25 luma block sizes
 * listed below, using the given symbol suffix.
 *
 * The final entry deliberately has no trailing backslash: a continuation
 * there would splice whatever line follows the table into this macro. */
#define LUMA_ADDAVG(cpu) \
    SETUP_LUMA_ADDAVG_FUNC_DEF( 4,  4, cpu); \
    SETUP_LUMA_ADDAVG_FUNC_DEF( 4,  8, cpu); \
    SETUP_LUMA_ADDAVG_FUNC_DEF( 4, 16, cpu); \
    SETUP_LUMA_ADDAVG_FUNC_DEF( 8,  4, cpu); \
    SETUP_LUMA_ADDAVG_FUNC_DEF( 8,  8, cpu); \
    SETUP_LUMA_ADDAVG_FUNC_DEF( 8, 16, cpu); \
    SETUP_LUMA_ADDAVG_FUNC_DEF( 8, 32, cpu); \
    SETUP_LUMA_ADDAVG_FUNC_DEF(12, 16, cpu); \
    SETUP_LUMA_ADDAVG_FUNC_DEF(16,  4, cpu); \
    SETUP_LUMA_ADDAVG_FUNC_DEF(16,  8, cpu); \
    SETUP_LUMA_ADDAVG_FUNC_DEF(16, 12, cpu); \
    SETUP_LUMA_ADDAVG_FUNC_DEF(16, 16, cpu); \
    SETUP_LUMA_ADDAVG_FUNC_DEF(16, 32, cpu); \
    SETUP_LUMA_ADDAVG_FUNC_DEF(24, 32, cpu); \
    SETUP_LUMA_ADDAVG_FUNC_DEF(16, 64, cpu); \
    SETUP_LUMA_ADDAVG_FUNC_DEF(32,  8, cpu); \
    SETUP_LUMA_ADDAVG_FUNC_DEF(32, 16, cpu); \
    SETUP_LUMA_ADDAVG_FUNC_DEF(32, 24, cpu); \
    SETUP_LUMA_ADDAVG_FUNC_DEF(32, 32, cpu); \
    SETUP_LUMA_ADDAVG_FUNC_DEF(32, 64, cpu); \
    SETUP_LUMA_ADDAVG_FUNC_DEF(48, 64, cpu); \
    SETUP_LUMA_ADDAVG_FUNC_DEF(64, 16, cpu); \
    SETUP_LUMA_ADDAVG_FUNC_DEF(64, 32, cpu); \
    SETUP_LUMA_ADDAVG_FUNC_DEF(64, 48, cpu); \
    SETUP_LUMA_ADDAVG_FUNC_DEF(64, 64, cpu);
875
/* Stores x265_addAvg_<W>x<H><suffix> into the addAvg table of
 * p.chroma[X265_CSP_I420], at index CHROMA_<W>x<H>. */
#define SETUP_CHROMA_ADDAVG_FUNC_DEF(width, height, suffix) \
    p.chroma[X265_CSP_I420].addAvg[CHROMA_ ## width ## x ## height] = \
        x265_addAvg_ ## width ## x ## height ## suffix;
878
/* Applies SETUP_CHROMA_ADDAVG_FUNC_DEF to each of the 24 chroma block sizes
 * listed below, using the given symbol suffix. */
#define CHROMA_ADDAVG(cpu) \
    SETUP_CHROMA_ADDAVG_FUNC_DEF( 2,  4, cpu); \
    SETUP_CHROMA_ADDAVG_FUNC_DEF( 2,  8, cpu); \
    SETUP_CHROMA_ADDAVG_FUNC_DEF( 4,  2, cpu); \
    SETUP_CHROMA_ADDAVG_FUNC_DEF( 4,  4, cpu); \
    SETUP_CHROMA_ADDAVG_FUNC_DEF( 4,  8, cpu); \
    SETUP_CHROMA_ADDAVG_FUNC_DEF( 4, 16, cpu); \
    SETUP_CHROMA_ADDAVG_FUNC_DEF( 6,  8, cpu); \
    SETUP_CHROMA_ADDAVG_FUNC_DEF( 8,  2, cpu); \
    SETUP_CHROMA_ADDAVG_FUNC_DEF( 8,  4, cpu); \
    SETUP_CHROMA_ADDAVG_FUNC_DEF( 8,  6, cpu); \
    SETUP_CHROMA_ADDAVG_FUNC_DEF( 8,  8, cpu); \
    SETUP_CHROMA_ADDAVG_FUNC_DEF( 8, 16, cpu); \
    SETUP_CHROMA_ADDAVG_FUNC_DEF( 8, 32, cpu); \
    SETUP_CHROMA_ADDAVG_FUNC_DEF(12, 16, cpu); \
    SETUP_CHROMA_ADDAVG_FUNC_DEF(16,  4, cpu); \
    SETUP_CHROMA_ADDAVG_FUNC_DEF(16,  8, cpu); \
    SETUP_CHROMA_ADDAVG_FUNC_DEF(16, 12, cpu); \
    SETUP_CHROMA_ADDAVG_FUNC_DEF(16, 16, cpu); \
    SETUP_CHROMA_ADDAVG_FUNC_DEF(16, 32, cpu); \
    SETUP_CHROMA_ADDAVG_FUNC_DEF(24, 32, cpu); \
    SETUP_CHROMA_ADDAVG_FUNC_DEF(32,  8, cpu); \
    SETUP_CHROMA_ADDAVG_FUNC_DEF(32, 16, cpu); \
    SETUP_CHROMA_ADDAVG_FUNC_DEF(32, 24, cpu); \
    SETUP_CHROMA_ADDAVG_FUNC_DEF(32, 32, cpu);
904
/* Stores x265_addAvg_<W>x<H><suffix> into the addAvg table of
 * p.chroma[X265_CSP_I422], at index CHROMA422_<W>x<H>. */
#define SETUP_CHROMA_ADDAVG_FUNC_DEF_422(width, height, suffix) \
    p.chroma[X265_CSP_I422].addAvg[CHROMA422_ ## width ## x ## height] = \
        x265_addAvg_ ## width ## x ## height ## suffix;
907
/* Applies SETUP_CHROMA_ADDAVG_FUNC_DEF_422 to each of the 24 chroma block
 * sizes listed below, using the given symbol suffix. */
#define CHROMA_ADDAVG_422(cpu) \
    SETUP_CHROMA_ADDAVG_FUNC_DEF_422( 2,  8, cpu); \
    SETUP_CHROMA_ADDAVG_FUNC_DEF_422( 2, 16, cpu); \
    SETUP_CHROMA_ADDAVG_FUNC_DEF_422( 4,  4, cpu); \
    SETUP_CHROMA_ADDAVG_FUNC_DEF_422( 4,  8, cpu); \
    SETUP_CHROMA_ADDAVG_FUNC_DEF_422( 4, 16, cpu); \
    SETUP_CHROMA_ADDAVG_FUNC_DEF_422( 4, 32, cpu); \
    SETUP_CHROMA_ADDAVG_FUNC_DEF_422( 6, 16, cpu); \
    SETUP_CHROMA_ADDAVG_FUNC_DEF_422( 8,  4, cpu); \
    SETUP_CHROMA_ADDAVG_FUNC_DEF_422( 8,  8, cpu); \
    SETUP_CHROMA_ADDAVG_FUNC_DEF_422( 8, 12, cpu); \
    SETUP_CHROMA_ADDAVG_FUNC_DEF_422( 8, 16, cpu); \
    SETUP_CHROMA_ADDAVG_FUNC_DEF_422( 8, 32, cpu); \
    SETUP_CHROMA_ADDAVG_FUNC_DEF_422( 8, 64, cpu); \
    SETUP_CHROMA_ADDAVG_FUNC_DEF_422(12, 32, cpu); \
    SETUP_CHROMA_ADDAVG_FUNC_DEF_422(16,  8, cpu); \
    SETUP_CHROMA_ADDAVG_FUNC_DEF_422(16, 16, cpu); \
    SETUP_CHROMA_ADDAVG_FUNC_DEF_422(16, 24, cpu); \
    SETUP_CHROMA_ADDAVG_FUNC_DEF_422(16, 32, cpu); \
    SETUP_CHROMA_ADDAVG_FUNC_DEF_422(16, 64, cpu); \
    SETUP_CHROMA_ADDAVG_FUNC_DEF_422(24, 64, cpu); \
    SETUP_CHROMA_ADDAVG_FUNC_DEF_422(32, 16, cpu); \
    SETUP_CHROMA_ADDAVG_FUNC_DEF_422(32, 32, cpu); \
    SETUP_CHROMA_ADDAVG_FUNC_DEF_422(32, 48, cpu); \
    SETUP_CHROMA_ADDAVG_FUNC_DEF_422(32, 64, cpu);
933
/* Fills p.intra_pred[predMode] for all four block sizes (4x4, 8x8, 16x16,
 * 32x32) with x265_intra_pred_ang<N>_<funcNo>_<isa>, where N is 4, 8, 16
 * or 32 respectively. */
#define SETUP_INTRA_ANG_COMMON(predMode, funcNo, isa) \
    p.intra_pred[predMode][BLOCK_4x4] = \
        x265_intra_pred_ang4_ ## funcNo ## _ ## isa; \
    p.intra_pred[predMode][BLOCK_8x8] = \
        x265_intra_pred_ang8_ ## funcNo ## _ ## isa; \
    p.intra_pred[predMode][BLOCK_16x16] = \
        x265_intra_pred_ang16_ ## funcNo ## _ ## isa; \
    p.intra_pred[predMode][BLOCK_32x32] = \
        x265_intra_pred_ang32_ ## funcNo ## _ ## isa;
939
/* Fills p.intra_pred[predMode] for the 8x8, 16x16 and 32x32 block sizes with
 * x265_intra_pred_ang<N>_<funcNo>_<isa> (N = 8, 16, 32).  The 4x4 slot is
 * left untouched. */
#define SETUP_INTRA_ANG(predMode, funcNo, isa) \
    p.intra_pred[predMode][BLOCK_8x8] = \
        x265_intra_pred_ang8_ ## funcNo ## _ ## isa; \
    p.intra_pred[predMode][BLOCK_16x16] = \
        x265_intra_pred_ang16_ ## funcNo ## _ ## isa; \
    p.intra_pred[predMode][BLOCK_32x32] = \
        x265_intra_pred_ang32_ ## funcNo ## _ ## isa;
944
/* Sets only the 4x4 slot of p.intra_pred[predMode] to
 * x265_intra_pred_ang4_<funcNo>_<isa>. */
#define SETUP_INTRA_ANG4(predMode, funcNo, isa) \
    p.intra_pred[predMode][BLOCK_4x4] = \
        x265_intra_pred_ang4_ ## funcNo ## _ ## isa;
947
/* Sets the 16x16 and 32x32 slots of p.intra_pred[predMode] to
 * x265_intra_pred_ang16_<funcNo>_<isa> and
 * x265_intra_pred_ang32_<funcNo>_<isa>. */
#define SETUP_INTRA_ANG16_32(predMode, funcNo, isa) \
    p.intra_pred[predMode][BLOCK_16x16] = \
        x265_intra_pred_ang16_ ## funcNo ## _ ## isa; \
    p.intra_pred[predMode][BLOCK_32x32] = \
        x265_intra_pred_ang32_ ## funcNo ## _ ## isa;
951
/* Sets the 4x4 and 8x8 slots of p.intra_pred[predMode] to
 * x265_intra_pred_ang4_<funcNo>_<isa> and
 * x265_intra_pred_ang8_<funcNo>_<isa>. */
#define SETUP_INTRA_ANG4_8(predMode, funcNo, isa) \
    p.intra_pred[predMode][BLOCK_4x4] = \
        x265_intra_pred_ang4_ ## funcNo ## _ ## isa; \
    p.intra_pred[predMode][BLOCK_8x8] = \
        x265_intra_pred_ang8_ ## funcNo ## _ ## isa;
955
/* Installs the "_2_" angular functions for all four block sizes under both
 * mode 2 and mode 34 (the two modes share one function number). */
#define INTRA_ANG_SSSE3(cpu) \
    SETUP_INTRA_ANG_COMMON( 2, 2, cpu); \
    SETUP_INTRA_ANG_COMMON(34, 2, cpu);
959
/* For modes 3 through 18, installs the same-numbered angular function for
 * all four block sizes via SETUP_INTRA_ANG_COMMON. */
#define INTRA_ANG_SSE4_COMMON(cpu) \
    SETUP_INTRA_ANG_COMMON( 3,  3, cpu); \
    SETUP_INTRA_ANG_COMMON( 4,  4, cpu); \
    SETUP_INTRA_ANG_COMMON( 5,  5, cpu); \
    SETUP_INTRA_ANG_COMMON( 6,  6, cpu); \
    SETUP_INTRA_ANG_COMMON( 7,  7, cpu); \
    SETUP_INTRA_ANG_COMMON( 8,  8, cpu); \
    SETUP_INTRA_ANG_COMMON( 9,  9, cpu); \
    SETUP_INTRA_ANG_COMMON(10, 10, cpu); \
    SETUP_INTRA_ANG_COMMON(11, 11, cpu); \
    SETUP_INTRA_ANG_COMMON(12, 12, cpu); \
    SETUP_INTRA_ANG_COMMON(13, 13, cpu); \
    SETUP_INTRA_ANG_COMMON(14, 14, cpu); \
    SETUP_INTRA_ANG_COMMON(15, 15, cpu); \
    SETUP_INTRA_ANG_COMMON(16, 16, cpu); \
    SETUP_INTRA_ANG_COMMON(17, 17, cpu); \
    SETUP_INTRA_ANG_COMMON(18, 18, cpu);
977
/* Covers modes 19 through 33 in two passes:
 *  - 8x8, 16x16 and 32x32 slots get the same-numbered function
 *    (SETUP_INTRA_ANG);
 *  - the 4x4 slot gets function number (36 - mode), except mode 26 which
 *    uses function 26 (SETUP_INTRA_ANG4). */
#define INTRA_ANG_SSE4_HIGH(cpu) \
    SETUP_INTRA_ANG(19, 19, cpu); \
    SETUP_INTRA_ANG(20, 20, cpu); \
    SETUP_INTRA_ANG(21, 21, cpu); \
    SETUP_INTRA_ANG(22, 22, cpu); \
    SETUP_INTRA_ANG(23, 23, cpu); \
    SETUP_INTRA_ANG(24, 24, cpu); \
    SETUP_INTRA_ANG(25, 25, cpu); \
    SETUP_INTRA_ANG(26, 26, cpu); \
    SETUP_INTRA_ANG(27, 27, cpu); \
    SETUP_INTRA_ANG(28, 28, cpu); \
    SETUP_INTRA_ANG(29, 29, cpu); \
    SETUP_INTRA_ANG(30, 30, cpu); \
    SETUP_INTRA_ANG(31, 31, cpu); \
    SETUP_INTRA_ANG(32, 32, cpu); \
    SETUP_INTRA_ANG(33, 33, cpu); \
    SETUP_INTRA_ANG4(19, 17, cpu); \
    SETUP_INTRA_ANG4(20, 16, cpu); \
    SETUP_INTRA_ANG4(21, 15, cpu); \
    SETUP_INTRA_ANG4(22, 14, cpu); \
    SETUP_INTRA_ANG4(23, 13, cpu); \
    SETUP_INTRA_ANG4(24, 12, cpu); \
    SETUP_INTRA_ANG4(25, 11, cpu); \
    SETUP_INTRA_ANG4(26, 26, cpu); \
    SETUP_INTRA_ANG4(27,  9, cpu); \
    SETUP_INTRA_ANG4(28,  8, cpu); \
    SETUP_INTRA_ANG4(29,  7, cpu); \
    SETUP_INTRA_ANG4(30,  6, cpu); \
    SETUP_INTRA_ANG4(31,  5, cpu); \
    SETUP_INTRA_ANG4(32,  4, cpu); \
    SETUP_INTRA_ANG4(33,  3, cpu);
1009
/* Covers modes 19 through 33 in two passes:
 *  - 4x4 and 8x8 slots get function number (36 - mode), except mode 26
 *    which uses function 26 (SETUP_INTRA_ANG4_8);
 *  - 16x16 and 32x32 slots get the same-numbered function
 *    (SETUP_INTRA_ANG16_32). */
#define INTRA_ANG_SSE4(cpu) \
    SETUP_INTRA_ANG4_8(19, 17, cpu); \
    SETUP_INTRA_ANG4_8(20, 16, cpu); \
    SETUP_INTRA_ANG4_8(21, 15, cpu); \
    SETUP_INTRA_ANG4_8(22, 14, cpu); \
    SETUP_INTRA_ANG4_8(23, 13, cpu); \
    SETUP_INTRA_ANG4_8(24, 12, cpu); \
    SETUP_INTRA_ANG4_8(25, 11, cpu); \
    SETUP_INTRA_ANG4_8(26, 26, cpu); \
    SETUP_INTRA_ANG4_8(27,  9, cpu); \
    SETUP_INTRA_ANG4_8(28,  8, cpu); \
    SETUP_INTRA_ANG4_8(29,  7, cpu); \
    SETUP_INTRA_ANG4_8(30,  6, cpu); \
    SETUP_INTRA_ANG4_8(31,  5, cpu); \
    SETUP_INTRA_ANG4_8(32,  4, cpu); \
    SETUP_INTRA_ANG4_8(33,  3, cpu); \
    SETUP_INTRA_ANG16_32(19, 19, cpu); \
    SETUP_INTRA_ANG16_32(20, 20, cpu); \
    SETUP_INTRA_ANG16_32(21, 21, cpu); \
    SETUP_INTRA_ANG16_32(22, 22, cpu); \
    SETUP_INTRA_ANG16_32(23, 23, cpu); \
    SETUP_INTRA_ANG16_32(24, 24, cpu); \
    SETUP_INTRA_ANG16_32(25, 25, cpu); \
    SETUP_INTRA_ANG16_32(26, 26, cpu); \
    SETUP_INTRA_ANG16_32(27, 27, cpu); \
    SETUP_INTRA_ANG16_32(28, 28, cpu); \
    SETUP_INTRA_ANG16_32(29, 29, cpu); \
    SETUP_INTRA_ANG16_32(30, 30, cpu); \
    SETUP_INTRA_ANG16_32(31, 31, cpu); \
    SETUP_INTRA_ANG16_32(32, 32, cpu); \
    SETUP_INTRA_ANG16_32(33, 33, cpu);
1041
/* For index CHROMA_<W>x<H> of p.chroma[X265_CSP_I420], installs the four
 * x265_interp_4tap_vert_{ss,pp,ps,sp}_<W>x<H><suffix> functions into
 * filter_vss, filter_vpp, filter_vps and filter_vsp respectively. */
#define SETUP_CHROMA_VERT_FUNC_DEF(width, height, suffix) \
    p.chroma[X265_CSP_I420].filter_vss[CHROMA_ ## width ## x ## height] = \
        x265_interp_4tap_vert_ss_ ## width ## x ## height ## suffix; \
    p.chroma[X265_CSP_I420].filter_vpp[CHROMA_ ## width ## x ## height] = \
        x265_interp_4tap_vert_pp_ ## width ## x ## height ## suffix; \
    p.chroma[X265_CSP_I420].filter_vps[CHROMA_ ## width ## x ## height] = \
        x265_interp_4tap_vert_ps_ ## width ## x ## height ## suffix; \
    p.chroma[X265_CSP_I420].filter_vsp[CHROMA_ ## width ## x ## height] = \
        x265_interp_4tap_vert_sp_ ## width ## x ## height ## suffix;
1047
/* Applies SETUP_CHROMA_VERT_FUNC_DEF to the 20 chroma block sizes listed
 * below.  The sizes 2x4, 2x8, 4x2 and 6x8 are not in this table; they are
 * listed in CHROMA_VERT_FILTERS_SSE4. */
#define CHROMA_VERT_FILTERS(cpu) \
    SETUP_CHROMA_VERT_FUNC_DEF( 4,  4, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF( 8,  8, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF( 8,  4, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF( 4,  8, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF( 8,  6, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF( 8,  2, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF(16, 16, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF(16,  8, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF( 8, 16, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF(16, 12, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF(12, 16, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF(16,  4, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF( 4, 16, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF(32, 32, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF(32, 16, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF(16, 32, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF(32, 24, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF(24, 32, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF(32,  8, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF( 8, 32, cpu);
1069
/* Applies SETUP_CHROMA_VERT_FUNC_DEF to the four sizes 2x4, 2x8, 4x2 and
 * 6x8. */
#define CHROMA_VERT_FILTERS_SSE4(cpu) \
    SETUP_CHROMA_VERT_FUNC_DEF(2, 4, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF(2, 8, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF(4, 2, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF(6, 8, cpu);
1075
/* For index CHROMA422_<W>x<H> of p.chroma[X265_CSP_I422], installs the four
 * x265_interp_4tap_vert_{ss,pp,ps,sp}_<W>x<H><suffix> functions into
 * filter_vss, filter_vpp, filter_vps and filter_vsp respectively. */
#define SETUP_CHROMA_VERT_FUNC_DEF_422(width, height, suffix) \
    p.chroma[X265_CSP_I422].filter_vss[CHROMA422_ ## width ## x ## height] = \
        x265_interp_4tap_vert_ss_ ## width ## x ## height ## suffix; \
    p.chroma[X265_CSP_I422].filter_vpp[CHROMA422_ ## width ## x ## height] = \
        x265_interp_4tap_vert_pp_ ## width ## x ## height ## suffix; \
    p.chroma[X265_CSP_I422].filter_vps[CHROMA422_ ## width ## x ## height] = \
        x265_interp_4tap_vert_ps_ ## width ## x ## height ## suffix; \
    p.chroma[X265_CSP_I422].filter_vsp[CHROMA422_ ## width ## x ## height] = \
        x265_interp_4tap_vert_sp_ ## width ## x ## height ## suffix;
1081
/* Applies SETUP_CHROMA_VERT_FUNC_DEF_422 to the 20 chroma block sizes listed
 * below.  The sizes 2x8, 2x16, 4x4 and 6x16 are not in this table; they are
 * listed in CHROMA_VERT_FILTERS_SSE4_422. */
#define CHROMA_VERT_FILTERS_422(cpu) \
    SETUP_CHROMA_VERT_FUNC_DEF_422( 4,  8, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF_422( 8, 16, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF_422( 8,  8, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF_422( 4, 16, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF_422( 8, 12, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF_422( 8,  4, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF_422(16, 32, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF_422(16, 16, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF_422( 8, 32, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF_422(16, 24, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF_422(12, 32, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF_422(16,  8, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF_422( 4, 32, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF_422(32, 64, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF_422(32, 32, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF_422(16, 64, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF_422(32, 48, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF_422(24, 64, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF_422(32, 16, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF_422( 8, 64, cpu);
1103
/* Applies SETUP_CHROMA_VERT_FUNC_DEF_422 to the four sizes 2x8, 2x16, 4x4
 * and 6x16. */
#define CHROMA_VERT_FILTERS_SSE4_422(cpu) \
    SETUP_CHROMA_VERT_FUNC_DEF_422(2,  8, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF_422(2, 16, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF_422(4,  4, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF_422(6, 16, cpu);
1109
/* For p.chroma[X265_CSP_I444], installs the four
 * x265_interp_4tap_vert_{ss,pp,ps,sp}_<W>x<H><suffix> functions into
 * filter_vss, filter_vpp, filter_vps and filter_vsp.  Unlike the I420/I422
 * variants, the tables are indexed with the LUMA_<W>x<H> constant. */
#define SETUP_CHROMA_VERT_FUNC_DEF_444(width, height, suffix) \
    p.chroma[X265_CSP_I444].filter_vss[LUMA_ ## width ## x ## height] = \
        x265_interp_4tap_vert_ss_ ## width ## x ## height ## suffix; \
    p.chroma[X265_CSP_I444].filter_vpp[LUMA_ ## width ## x ## height] = \
        x265_interp_4tap_vert_pp_ ## width ## x ## height ## suffix; \
    p.chroma[X265_CSP_I444].filter_vps[LUMA_ ## width ## x ## height] = \
        x265_interp_4tap_vert_ps_ ## width ## x ## height ## suffix; \
    p.chroma[X265_CSP_I444].filter_vsp[LUMA_ ## width ## x ## height] = \
        x265_interp_4tap_vert_sp_ ## width ## x ## height ## suffix;
1115
/* Applies SETUP_CHROMA_VERT_FUNC_DEF_444 to the 24 block sizes listed below.
 * Note that 4x4 is not part of this table. */
#define CHROMA_VERT_FILTERS_444(cpu) \
    SETUP_CHROMA_VERT_FUNC_DEF_444( 8,  8, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF_444( 8,  4, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF_444( 4,  8, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF_444(16, 16, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF_444(16,  8, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF_444( 8, 16, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF_444(16, 12, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF_444(12, 16, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF_444(16,  4, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF_444( 4, 16, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF_444(32, 32, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF_444(32, 16, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF_444(16, 32, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF_444(32, 24, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF_444(24, 32, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF_444(32,  8, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF_444( 8, 32, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF_444(64, 64, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF_444(64, 32, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF_444(32, 64, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF_444(64, 48, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF_444(48, 64, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF_444(64, 16, cpu); \
    SETUP_CHROMA_VERT_FUNC_DEF_444(16, 64, cpu);
1141
/* For index CHROMA_<W>x<H> of p.chroma[X265_CSP_I420], installs
 * x265_interp_4tap_horiz_pp_<W>x<H><suffix> into filter_hpp and
 * x265_interp_4tap_horiz_ps_<W>x<H><suffix> into filter_hps. */
#define SETUP_CHROMA_HORIZ_FUNC_DEF(width, height, suffix) \
    p.chroma[X265_CSP_I420].filter_hpp[CHROMA_ ## width ## x ## height] = \
        x265_interp_4tap_horiz_pp_ ## width ## x ## height ## suffix; \
    p.chroma[X265_CSP_I420].filter_hps[CHROMA_ ## width ## x ## height] = \
        x265_interp_4tap_horiz_ps_ ## width ## x ## height ## suffix;
1145
/* Applies SETUP_CHROMA_HORIZ_FUNC_DEF to each of the 24 chroma block sizes
 * listed below, using the given symbol suffix. */
#define CHROMA_HORIZ_FILTERS(cpu) \
    SETUP_CHROMA_HORIZ_FUNC_DEF( 4,  4, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF( 4,  2, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF( 2,  4, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF( 8,  8, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF( 8,  4, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF( 4,  8, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF( 8,  6, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF( 6,  8, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF( 8,  2, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF( 2,  8, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF(16, 16, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF(16,  8, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF( 8, 16, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF(16, 12, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF(12, 16, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF(16,  4, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF( 4, 16, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF(32, 32, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF(32, 16, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF(16, 32, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF(32, 24, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF(24, 32, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF(32,  8, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF( 8, 32, cpu);
1171
/* For index CHROMA422_<W>x<H> of p.chroma[X265_CSP_I422], installs
 * x265_interp_4tap_horiz_pp_<W>x<H><suffix> into filter_hpp and
 * x265_interp_4tap_horiz_ps_<W>x<H><suffix> into filter_hps. */
#define SETUP_CHROMA_HORIZ_FUNC_DEF_422(width, height, suffix) \
    p.chroma[X265_CSP_I422].filter_hpp[CHROMA422_ ## width ## x ## height] = \
        x265_interp_4tap_horiz_pp_ ## width ## x ## height ## suffix; \
    p.chroma[X265_CSP_I422].filter_hps[CHROMA422_ ## width ## x ## height] = \
        x265_interp_4tap_horiz_ps_ ## width ## x ## height ## suffix;
1175
/* Applies SETUP_CHROMA_HORIZ_FUNC_DEF_422 to each of the 24 chroma block
 * sizes listed below, using the given symbol suffix. */
#define CHROMA_HORIZ_FILTERS_422(cpu) \
    SETUP_CHROMA_HORIZ_FUNC_DEF_422( 4,  8, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF_422( 4,  4, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF_422( 2,  8, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF_422( 8, 16, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF_422( 8,  8, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF_422( 4, 16, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF_422( 8, 12, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF_422( 6, 16, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF_422( 8,  4, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF_422( 2, 16, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF_422(16, 32, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF_422(16, 16, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF_422( 8, 32, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF_422(16, 24, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF_422(12, 32, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF_422(16,  8, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF_422( 4, 32, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF_422(32, 64, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF_422(32, 32, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF_422(16, 64, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF_422(32, 48, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF_422(24, 64, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF_422(32, 16, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF_422( 8, 64, cpu);
1201
/* For p.chroma[X265_CSP_I444], installs
 * x265_interp_4tap_horiz_pp_<W>x<H><suffix> into filter_hpp and
 * x265_interp_4tap_horiz_ps_<W>x<H><suffix> into filter_hps.  Unlike the
 * I420/I422 variants, the tables are indexed with the LUMA_<W>x<H>
 * constant. */
#define SETUP_CHROMA_HORIZ_FUNC_DEF_444(width, height, suffix) \
    p.chroma[X265_CSP_I444].filter_hpp[LUMA_ ## width ## x ## height] = \
        x265_interp_4tap_horiz_pp_ ## width ## x ## height ## suffix; \
    p.chroma[X265_CSP_I444].filter_hps[LUMA_ ## width ## x ## height] = \
        x265_interp_4tap_horiz_ps_ ## width ## x ## height ## suffix;
1205
/* Applies SETUP_CHROMA_HORIZ_FUNC_DEF_444 to the 24 block sizes listed
 * below.  Note that 4x4 is not part of this table. */
#define CHROMA_HORIZ_FILTERS_444(cpu) \
    SETUP_CHROMA_HORIZ_FUNC_DEF_444( 8,  8, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF_444( 8,  4, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF_444( 4,  8, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF_444(16, 16, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF_444(16,  8, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF_444( 8, 16, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF_444(16, 12, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF_444(12, 16, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF_444(16,  4, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF_444( 4, 16, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF_444(32, 32, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF_444(32, 16, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF_444(16, 32, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF_444(32, 24, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF_444(24, 32, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF_444(32,  8, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF_444( 8, 32, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF_444(64, 64, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF_444(64, 32, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF_444(32, 64, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF_444(64, 48, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF_444(48, 64, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF_444(64, 16, cpu); \
    SETUP_CHROMA_HORIZ_FUNC_DEF_444(16, 64, cpu);
1231
1232 namespace x265 {
1233 // private x265 namespace
1234
1235 #if HIGH_BIT_DEPTH
1236 /* Very similar to CRef in intrapred.cpp, except it uses optimized primitives */
1237 template<int log2Size>
1238 void intra_allangs(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, int bLuma)
1239 {
1240 const int size = 1 << log2Size;
1241 const int sizeIdx = log2Size - 2;
1242 ALIGN_VAR_32(pixel, buffer[32 * 32]);
1243
1244 for (int mode = 2; mode <= 34; mode++)
1245 {
1246 pixel *left = (g_intraFilterFlags[mode] & size ? left1 : left0);
1247 pixel *above = (g_intraFilterFlags[mode] & size ? above1 : above0);
1248 pixel *out = dest + ((mode - 2) << (log2Size * 2));
1249
1250 if (mode < 18)
1251 {
1252 primitives.intra_pred[mode][sizeIdx](buffer, size, left, above, mode, bLuma);
1253 primitives.transpose[sizeIdx](out, buffer, size);
1254 }
1255 else
1256 primitives.intra_pred[mode][sizeIdx](out, size, left, above, mode, bLuma);
1257 }
1258 }
1259 #endif
1260
1261 void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask)
1262 {
1263 #if HIGH_BIT_DEPTH
1264 if (cpuMask & X265_CPU_SSE2)
1265 {
1266 INIT8(sad, _mmx2);
1267 INIT2(sad, _sse2);
1268 SAD(sse2);
1269
1270 INIT6(satd, _sse2);
1271 HEVC_SATD(sse2);
1272 p.satd[LUMA_4x4] = x265_pixel_satd_4x4_mmx2;
1273
1274 p.sa8d_inter[LUMA_4x4] = x265_pixel_satd_4x4_mmx2;
1275 SA8D_INTER_FROM_BLOCK(sse2);
1276 p.sa8d_inter[LUMA_8x8] = x265_pixel_sa8d_8x8_sse2;
1277 p.sa8d_inter[LUMA_16x16] = x265_pixel_sa8d_16x16_sse2;
1278
1279 p.sse_ss[LUMA_4x4] = x265_pixel_ssd_ss_4x4_mmx2;
1280 p.sse_ss[LUMA_4x8] = x265_pixel_ssd_ss_4x8_mmx2;
1281 p.sse_ss[LUMA_4x16] = x265_pixel_ssd_ss_4x16_mmx2;
1282 p.sse_ss[LUMA_8x4] = x265_pixel_ssd_ss_8x4_sse2;
1283 p.sse_ss[LUMA_8x8] = x265_pixel_ssd_ss_8x8_sse2;
1284 p.sse_ss[LUMA_8x16] = x265_pixel_ssd_ss_8x16_sse2;
1285 p.sse_ss[LUMA_8x32] = x265_pixel_ssd_ss_8x32_sse2;
1286 p.sse_ss[LUMA_12x16] = x265_pixel_ssd_ss_12x16_sse2;
1287 p.sse_ss[LUMA_16x4] = x265_pixel_ssd_ss_16x4_sse2;
1288 p.sse_ss[LUMA_16x8] = x265_pixel_ssd_ss_16x8_sse2;
1289 p.sse_ss[LUMA_16x12] = x265_pixel_ssd_ss_16x12_sse2;
1290 p.sse_ss[LUMA_16x16] = x265_pixel_ssd_ss_16x16_sse2;
1291 p.sse_ss[LUMA_16x32] = x265_pixel_ssd_ss_16x32_sse2;
1292 p.sse_ss[LUMA_16x64] = x265_pixel_ssd_ss_16x64_sse2;
1293 p.sse_ss[LUMA_24x32] = x265_pixel_ssd_ss_24x32_sse2;
1294 p.sse_ss[LUMA_32x8] = x265_pixel_ssd_ss_32x8_sse2;
1295 p.sse_ss[LUMA_32x16] = x265_pixel_ssd_ss_32x16_sse2;
1296 p.sse_ss[LUMA_32x24] = x265_pixel_ssd_ss_32x24_sse2;
1297 p.sse_ss[LUMA_32x32] = x265_pixel_ssd_ss_32x32_sse2;
1298 p.sse_ss[LUMA_32x64] = x265_pixel_ssd_ss_32x64_sse2;
1299 p.sse_ss[LUMA_48x64] = x265_pixel_ssd_ss_48x64_sse2;
1300 p.sse_ss[LUMA_64x16] = x265_pixel_ssd_ss_64x16_sse2;
1301 p.sse_ss[LUMA_64x32] = x265_pixel_ssd_ss_64x32_sse2;
1302 p.sse_ss[LUMA_64x48] = x265_pixel_ssd_ss_64x48_sse2;
1303 p.sse_ss[LUMA_64x64] = x265_pixel_ssd_ss_64x64_sse2;
1304
1305 p.transpose[BLOCK_4x4] = x265_transpose4_sse2;
1306 p.transpose[BLOCK_8x8] = x265_transpose8_sse2;
1307 p.transpose[BLOCK_16x16] = x265_transpose16_sse2;
1308 p.transpose[BLOCK_32x32] = x265_transpose32_sse2;
1309 p.transpose[BLOCK_64x64] = x265_transpose64_sse2;
1310
1311 p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_sse2;
1312 p.ssim_end_4 = x265_pixel_ssim_end4_sse2;
1313 PIXEL_AVG(sse2);
1314 PIXEL_AVG_W4(mmx2);
1315 LUMA_VAR(_sse2);
1316
1317 SAD_X3(sse2);
1318 p.sad_x3[LUMA_4x4] = x265_pixel_sad_x3_4x4_mmx2;
1319 p.sad_x3[LUMA_4x8] = x265_pixel_sad_x3_4x8_mmx2;
1320 p.sad_x3[LUMA_4x16] = x265_pixel_sad_x3_4x16_mmx2;
1321 p.sad_x3[LUMA_8x4] = x265_pixel_sad_x3_8x4_sse2;
1322 p.sad_x3[LUMA_8x8] = x265_pixel_sad_x3_8x8_sse2;
1323 p.sad_x3[LUMA_8x16] = x265_pixel_sad_x3_8x16_sse2;
1324 p.sad_x3[LUMA_8x32] = x265_pixel_sad_x3_8x32_sse2;
1325 p.sad_x3[LUMA_16x4] = x265_pixel_sad_x3_16x4_sse2;
1326 p.sad_x3[LUMA_12x16] = x265_pixel_sad_x3_12x16_mmx2;
1327
1328 SAD_X4(sse2);
1329 p.sad_x4[LUMA_4x4] = x265_pixel_sad_x4_4x4_mmx2;
1330 p.sad_x4[LUMA_4x8] = x265_pixel_sad_x4_4x8_mmx2;
1331 p.sad_x4[LUMA_4x16] = x265_pixel_sad_x4_4x16_mmx2;
1332 p.sad_x4[LUMA_8x4] = x265_pixel_sad_x4_8x4_sse2;
1333 p.sad_x4[LUMA_8x8] = x265_pixel_sad_x4_8x8_sse2;
1334 p.sad_x4[LUMA_8x16] = x265_pixel_sad_x4_8x16_sse2;
1335 p.sad_x4[LUMA_8x32] = x265_pixel_sad_x4_8x32_sse2;
1336 p.sad_x4[LUMA_16x4] = x265_pixel_sad_x4_16x4_sse2;
1337 p.sad_x4[LUMA_12x16] = x265_pixel_sad_x4_12x16_mmx2;
1338
1339 p.cpy2Dto1D_shl[BLOCK_4x4] = x265_cpy2Dto1D_shl_4_sse2;
1340 p.cpy2Dto1D_shl[BLOCK_8x8] = x265_cpy2Dto1D_shl_8_sse2;
1341 p.cpy2Dto1D_shl[BLOCK_16x16] = x265_cpy2Dto1D_shl_16_sse2;
1342 p.cpy2Dto1D_shl[BLOCK_32x32] = x265_cpy2Dto1D_shl_32_sse2;
1343 p.cpy2Dto1D_shr[BLOCK_4x4] = x265_cpy2Dto1D_shr_4_sse2;
1344 p.cpy2Dto1D_shr[BLOCK_8x8] = x265_cpy2Dto1D_shr_8_sse2;
1345 p.cpy2Dto1D_shr[BLOCK_16x16] = x265_cpy2Dto1D_shr_16_sse2;
1346 p.cpy2Dto1D_shr[BLOCK_32x32] = x265_cpy2Dto1D_shr_32_sse2;
1347 p.cpy1Dto2D_shl[BLOCK_4x4] = x265_cpy1Dto2D_shl_4_sse2;
1348 p.cpy1Dto2D_shl[BLOCK_8x8] = x265_cpy1Dto2D_shl_8_sse2;
1349 p.cpy1Dto2D_shl[BLOCK_16x16] = x265_cpy1Dto2D_shl_16_sse2;
1350 p.cpy1Dto2D_shl[BLOCK_32x32] = x265_cpy1Dto2D_shl_32_sse2;
1351 p.cpy1Dto2D_shr[BLOCK_4x4] = x265_cpy1Dto2D_shr_4_sse2;
1352 p.cpy1Dto2D_shr[BLOCK_8x8] = x265_cpy1Dto2D_shr_8_sse2;
1353 p.cpy1Dto2D_shr[BLOCK_16x16] = x265_cpy1Dto2D_shr_16_sse2;
1354 p.cpy1Dto2D_shr[BLOCK_32x32] = x265_cpy1Dto2D_shr_32_sse2;
1355
1356 CHROMA_PIXELSUB_PS(_sse2);
1357 CHROMA_PIXELSUB_PS_422(_sse2);
1358 LUMA_PIXELSUB(_sse2);
1359
1360 CHROMA_BLOCKCOPY(ss, _sse2);
1361 CHROMA_BLOCKCOPY_422(ss, _sse2);
1362 LUMA_BLOCKCOPY(ss, _sse2);
1363
1364 CHROMA_VERT_FILTERS(_sse2);
1365 CHROMA_VERT_FILTERS_422(_sse2);
1366 CHROMA_VERT_FILTERS_444(_sse2);
1367 p.luma_p2s = x265_luma_p2s_sse2;
1368 p.chroma[X265_CSP_I420].p2s = x265_chroma_p2s_sse2;
1369 p.chroma[X265_CSP_I422].p2s = x265_chroma_p2s_sse2;
1370 p.chroma[X265_CSP_I444].p2s = x265_luma_p2s_sse2; // for i444 , chroma_p2s can be replaced by luma_p2s
1371
1372 p.blockfill_s[BLOCK_4x4] = x265_blockfill_s_4x4_sse2;
1373 p.blockfill_s[BLOCK_8x8] = x265_blockfill_s_8x8_sse2;
1374 p.blockfill_s[BLOCK_16x16] = x265_blockfill_s_16x16_sse2;
1375 p.blockfill_s[BLOCK_32x32] = x265_blockfill_s_32x32_sse2;
1376
1377 // TODO: overflow on 12-bits mode!
1378 p.ssd_s[BLOCK_4x4] = x265_pixel_ssd_s_4_sse2;
1379 p.ssd_s[BLOCK_8x8] = x265_pixel_ssd_s_8_sse2;
1380 p.ssd_s[BLOCK_16x16] = x265_pixel_ssd_s_16_sse2;
1381 p.ssd_s[BLOCK_32x32] = x265_pixel_ssd_s_32_sse2;
1382
1383 p.calcresidual[BLOCK_4x4] = x265_getResidual4_sse2;
1384 p.calcresidual[BLOCK_8x8] = x265_getResidual8_sse2;
1385 p.calcresidual[BLOCK_16x16] = x265_getResidual16_sse2;
1386 p.calcresidual[BLOCK_32x32] = x265_getResidual32_sse2;
1387
1388 p.dct[DCT_4x4] = x265_dct4_sse2;
1389 p.idct[IDCT_4x4] = x265_idct4_sse2;
1390 #if X86_64
1391 p.idct[IDCT_8x8] = x265_idct8_sse2;
1392 #endif
1393 p.idct[IDST_4x4] = x265_idst4_sse2;
1394
1395 LUMA_SS_FILTERS(_sse2);
1396 }
1397 if (cpuMask & X265_CPU_SSSE3)
1398 {
1399 p.scale1D_128to64 = x265_scale1D_128to64_ssse3;
1400 p.scale2D_64to32 = x265_scale2D_64to32_ssse3;
1401
1402 INTRA_ANG_SSSE3(ssse3);
1403
1404 p.dct[DST_4x4] = x265_dst4_ssse3;
1405 p.idct[IDCT_8x8] = x265_idct8_ssse3;
1406 p.count_nonzero = x265_count_nonzero_ssse3;
1407 }
1408 if (cpuMask & X265_CPU_SSE4)
1409 {
1410 LUMA_ADDAVG(_sse4);
1411 CHROMA_ADDAVG(_sse4);
1412 CHROMA_ADDAVG_422(_sse4);
1413 LUMA_FILTERS(_sse4);
1414 CHROMA_HORIZ_FILTERS(_sse4);
1415 CHROMA_VERT_FILTERS_SSE4(_sse4);
1416 CHROMA_HORIZ_FILTERS_422(_sse4);
1417 CHROMA_VERT_FILTERS_SSE4_422(_sse4);
1418 CHROMA_HORIZ_FILTERS_444(_sse4);
1419
1420 p.dct[DCT_8x8] = x265_dct8_sse4;
1421 p.quant = x265_quant_sse4;
1422 p.nquant = x265_nquant_sse4;
1423 p.dequant_normal = x265_dequant_normal_sse4;
1424 p.intra_pred[0][BLOCK_4x4] = x265_intra_pred_planar4_sse4;
1425 p.intra_pred[0][BLOCK_8x8] = x265_intra_pred_planar8_sse4;
1426 p.intra_pred[0][BLOCK_16x16] = x265_intra_pred_planar16_sse4;
1427 p.intra_pred[0][BLOCK_32x32] = x265_intra_pred_planar32_sse4;
1428
1429 p.intra_pred[1][BLOCK_4x4] = x265_intra_pred_dc4_sse4;
1430 p.intra_pred[1][BLOCK_8x8] = x265_intra_pred_dc8_sse4;
1431 p.intra_pred[1][BLOCK_16x16] = x265_intra_pred_dc16_sse4;
1432 p.intra_pred[1][BLOCK_32x32] = x265_intra_pred_dc32_sse4;
1433 p.planecopy_cp = x265_upShift_8_sse4;
1434
1435 INTRA_ANG_SSE4_COMMON(sse4);
1436 INTRA_ANG_SSE4_HIGH(sse4);
1437 }
1438 if (cpuMask & X265_CPU_XOP)
1439 {
1440 p.frameInitLowres = x265_frame_init_lowres_core_xop;
1441 SA8D_INTER_FROM_BLOCK(xop);
1442 INIT7(satd, _xop);
1443 HEVC_SATD(xop);
1444 }
1445 if (cpuMask & X265_CPU_AVX2)
1446 {
1447 p.dct[DCT_4x4] = x265_dct4_avx2;
1448 p.quant = x265_quant_avx2;
1449 p.nquant = x265_nquant_avx2;
1450 p.dequant_normal = x265_dequant_normal_avx2;
1451 p.scale1D_128to64 = x265_scale1D_128to64_avx2;
1452 p.cpy1Dto2D_shl[BLOCK_4x4] = x265_cpy1Dto2D_shl_4_avx2;
1453 p.cpy1Dto2D_shl[BLOCK_8x8] = x265_cpy1Dto2D_shl_8_avx2;
1454 p.cpy1Dto2D_shl[BLOCK_16x16] = x265_cpy1Dto2D_shl_16_avx2;
1455 p.cpy1Dto2D_shl[BLOCK_32x32] = x265_cpy1Dto2D_shl_32_avx2;
1456 p.cpy1Dto2D_shr[BLOCK_4x4] = x265_cpy1Dto2D_shr_4_avx2;
1457 p.cpy1Dto2D_shr[BLOCK_8x8] = x265_cpy1Dto2D_shr_8_avx2;
1458 p.cpy1Dto2D_shr[BLOCK_16x16] = x265_cpy1Dto2D_shr_16_avx2;
1459 p.cpy1Dto2D_shr[BLOCK_32x32] = x265_cpy1Dto2D_shr_32_avx2;
1460 #if X86_64
1461 p.dct[DCT_8x8] = x265_dct8_avx2;
1462 p.dct[DCT_16x16] = x265_dct16_avx2;
1463 p.dct[DCT_32x32] = x265_dct32_avx2;
1464 p.idct[IDCT_4x4] = x265_idct4_avx2;
1465 p.idct[IDCT_8x8] = x265_idct8_avx2;
1466 p.idct[IDCT_16x16] = x265_idct16_avx2;
1467 p.idct[IDCT_32x32] = x265_idct32_avx2;
1468 p.transpose[BLOCK_8x8] = x265_transpose8_avx2;
1469 p.transpose[BLOCK_16x16] = x265_transpose16_avx2;
1470 p.transpose[BLOCK_32x32] = x265_transpose32_avx2;
1471 p.transpose[BLOCK_64x64] = x265_transpose64_avx2;
1472 #endif
1473 }
1474 /* at HIGH_BIT_DEPTH, pixel == short so we can reuse a number of primitives */
1475 for (int i = 0; i < NUM_LUMA_PARTITIONS; i++)
1476 {
1477 p.sse_pp[i] = (pixelcmp_t)p.sse_ss[i];
1478 p.sse_sp[i] = (pixelcmp_sp_t)p.sse_ss[i];
1479 }
1480
1481 for (int i = 0; i < NUM_LUMA_PARTITIONS; i++)
1482 {
1483 p.luma_copy_ps[i] = (copy_ps_t)p.luma_copy_ss[i];
1484 p.luma_copy_sp[i] = (copy_sp_t)p.luma_copy_ss[i];
1485 p.luma_copy_pp[i] = (copy_pp_t)p.luma_copy_ss[i];
1486 }
1487
1488 for (int i = 0; i < NUM_CHROMA_PARTITIONS; i++)
1489 {
1490 p.chroma[X265_CSP_I420].copy_ps[i] = (copy_ps_t)p.chroma[X265_CSP_I420].copy_ss[i];
1491 p.chroma[X265_CSP_I420].copy_sp[i] = (copy_sp_t)p.chroma[X265_CSP_I420].copy_ss[i];
1492 p.chroma[X265_CSP_I420].copy_pp[i] = (copy_pp_t)p.chroma[X265_CSP_I420].copy_ss[i];
1493 }
1494
1495 for (int i = 0; i < NUM_CHROMA_PARTITIONS; i++)
1496 {
1497 p.chroma[X265_CSP_I422].copy_ps[i] = (copy_ps_t)p.chroma[X265_CSP_I422].copy_ss[i];
1498 p.chroma[X265_CSP_I422].copy_sp[i] = (copy_sp_t)p.chroma[X265_CSP_I422].copy_ss[i];
1499 p.chroma[X265_CSP_I422].copy_pp[i] = (copy_pp_t)p.chroma[X265_CSP_I422].copy_ss[i];
1500 }
1501
1502 if (p.intra_pred[0][0] && p.transpose[0])
1503 {
1504 p.intra_pred_allangs[BLOCK_4x4] = intra_allangs<2>;
1505 p.intra_pred_allangs[BLOCK_8x8] = intra_allangs<3>;
1506 p.intra_pred_allangs[BLOCK_16x16] = intra_allangs<4>;
1507 p.intra_pred_allangs[BLOCK_32x32] = intra_allangs<5>;
1508 }
1509
1510 #else // if HIGH_BIT_DEPTH
1511 if (cpuMask & X265_CPU_SSE2)
1512 {
1513 INIT8_NAME(sse_pp, ssd, _mmx);
1514 INIT8(sad, _mmx2);
1515 INIT8(sad_x3, _mmx2);
1516 INIT8(sad_x4, _mmx2);
1517 p.satd[LUMA_4x4] = x265_pixel_satd_4x4_mmx2;
1518 p.sa8d_inter[LUMA_4x4] = x265_pixel_satd_4x4_mmx2;
1519 p.frameInitLowres = x265_frame_init_lowres_core_mmx2;
1520
1521 PIXEL_AVG(sse2);
1522 PIXEL_AVG_W4(mmx2);
1523
1524 LUMA_VAR(_sse2);
1525
1526 ASSGN_SSE(sse2);
1527 ASSGN_SSE_SS(sse2);
1528 INIT2(sad, _sse2);
1529 SAD(sse2);
1530 INIT2(sad_x3, _sse2);
1531 INIT2(sad_x4, _sse2);
1532 HEVC_SATD(sse2);
1533
1534 CHROMA_BLOCKCOPY(ss, _sse2);
1535 CHROMA_BLOCKCOPY(pp, _sse2);
1536 CHROMA_BLOCKCOPY_422(ss, _sse2);
1537 CHROMA_BLOCKCOPY_422(pp, _sse2);
1538 LUMA_BLOCKCOPY(ss, _sse2);
1539 LUMA_BLOCKCOPY(pp, _sse2);
1540 LUMA_BLOCKCOPY(sp, _sse2);
1541 CHROMA_BLOCKCOPY_SP(_sse2);
1542 CHROMA_BLOCKCOPY_SP_422(_sse2);
1543
1544 CHROMA_SS_FILTERS_420(_sse2);
1545 CHROMA_SS_FILTERS_422(_sse2);
1546 CHROMA_SS_FILTERS_444(_sse2);
1547 CHROMA_SP_FILTERS_420(_sse2);
1548 CHROMA_SP_FILTERS_422(_sse2);
1549 CHROMA_SP_FILTERS_444(_sse2);
1550 LUMA_SS_FILTERS(_sse2);
1551
1552 // This function pointer initialization is temporary and will be removed
1553 // later with macro definitions. It is used to avoid linker errors until
1554 // all partitions are coded, and it allows committing smaller patches that
1555 // are easier to review.
1556
1557 p.blockfill_s[BLOCK_4x4] = x265_blockfill_s_4x4_sse2;
1558 p.blockfill_s[BLOCK_8x8] = x265_blockfill_s_8x8_sse2;
1559 p.blockfill_s[BLOCK_16x16] = x265_blockfill_s_16x16_sse2;
1560 p.blockfill_s[BLOCK_32x32] = x265_blockfill_s_32x32_sse2;
1561
1562 p.ssd_s[BLOCK_4x4] = x265_pixel_ssd_s_4_sse2;
1563 p.ssd_s[BLOCK_8x8] = x265_pixel_ssd_s_8_sse2;
1564 p.ssd_s[BLOCK_16x16] = x265_pixel_ssd_s_16_sse2;
1565 p.ssd_s[BLOCK_32x32] = x265_pixel_ssd_s_32_sse2;
1566
1567 p.frameInitLowres = x265_frame_init_lowres_core_sse2;
1568 SA8D_INTER_FROM_BLOCK(sse2);
1569
1570 p.cpy2Dto1D_shl[BLOCK_4x4] = x265_cpy2Dto1D_shl_4_sse2;
1571 p.cpy2Dto1D_shl[BLOCK_8x8] = x265_cpy2Dto1D_shl_8_sse2;
1572 p.cpy2Dto1D_shl[BLOCK_16x16] = x265_cpy2Dto1D_shl_16_sse2;
1573 p.cpy2Dto1D_shl[BLOCK_32x32] = x265_cpy2Dto1D_shl_32_sse2;
1574 p.cpy2Dto1D_shr[BLOCK_4x4] = x265_cpy2Dto1D_shr_4_sse2;
1575 p.cpy2Dto1D_shr[BLOCK_8x8] = x265_cpy2Dto1D_shr_8_sse2;
1576 p.cpy2Dto1D_shr[BLOCK_16x16] = x265_cpy2Dto1D_shr_16_sse2;
1577 p.cpy2Dto1D_shr[BLOCK_32x32] = x265_cpy2Dto1D_shr_32_sse2;
1578 p.cpy1Dto2D_shl[BLOCK_4x4] = x265_cpy1Dto2D_shl_4_sse2;
1579 p.cpy1Dto2D_shl[BLOCK_8x8] = x265_cpy1Dto2D_shl_8_sse2;
1580 p.cpy1Dto2D_shl[BLOCK_16x16] = x265_cpy1Dto2D_shl_16_sse2;
1581 p.cpy1Dto2D_shl[BLOCK_32x32] = x265_cpy1Dto2D_shl_32_sse2;
1582 p.cpy1Dto2D_shr[BLOCK_4x4] = x265_cpy1Dto2D_shr_4_sse2;
1583 p.cpy1Dto2D_shr[BLOCK_8x8] = x265_cpy1Dto2D_shr_8_sse2;
1584 p.cpy1Dto2D_shr[BLOCK_16x16] = x265_cpy1Dto2D_shr_16_sse2;
1585 p.cpy1Dto2D_shr[BLOCK_32x32] = x265_cpy1Dto2D_shr_32_sse2;
1586
1587 p.calcresidual[BLOCK_4x4] = x265_getResidual4_sse2;
1588 p.calcresidual[BLOCK_8x8] = x265_getResidual8_sse2;
1589 p.transpose[BLOCK_4x4] = x265_transpose4_sse2;
1590 p.transpose[BLOCK_8x8] = x265_transpose8_sse2;
1591 p.transpose[BLOCK_16x16] = x265_transpose16_sse2;
1592 p.transpose[BLOCK_32x32] = x265_transpose32_sse2;
1593 p.transpose[BLOCK_64x64] = x265_transpose64_sse2;
1594 p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_sse2;
1595 p.ssim_end_4 = x265_pixel_ssim_end4_sse2;
1596
1597 p.dct[DCT_4x4] = x265_dct4_sse2;
1598 p.idct[IDCT_4x4] = x265_idct4_sse2;
1599 #if X86_64
1600 p.idct[IDCT_8x8] = x265_idct8_sse2;
1601 #endif
1602 p.idct[IDST_4x4] = x265_idst4_sse2;
1603
1604 p.planecopy_sp = x265_downShift_16_sse2;
1605 }
1606 if (cpuMask & X265_CPU_SSSE3)
1607 {
1608 p.frameInitLowres = x265_frame_init_lowres_core_ssse3;
1609 SA8D_INTER_FROM_BLOCK(ssse3);
1610 p.sse_pp[LUMA_4x4] = x265_pixel_ssd_4x4_ssse3;
1611 ASSGN_SSE(ssse3);
1612 PIXEL_AVG(ssse3);
1613 PIXEL_AVG_W4(ssse3);
1614
1615 INTRA_ANG_SSSE3(ssse3);
1616
1617 p.scale1D_128to64 = x265_scale1D_128to64_ssse3;
1618 p.scale2D_64to32 = x265_scale2D_64to32_ssse3;
1619 SAD_X3(ssse3);
1620 SAD_X4(ssse3);
1621 p.sad_x4[LUMA_8x4] = x265_pixel_sad_x4_8x4_ssse3;
1622 p.sad_x4[LUMA_8x8] = x265_pixel_sad_x4_8x8_ssse3;
1623 p.sad_x3[LUMA_8x16] = x265_pixel_sad_x3_8x16_ssse3;
1624 p.sad_x4[LUMA_8x16] = x265_pixel_sad_x4_8x16_ssse3;
1625 p.sad_x3[LUMA_8x32] = x265_pixel_sad_x3_8x32_ssse3;
1626 p.sad_x4[LUMA_8x32] = x265_pixel_sad_x4_8x32_ssse3;
1627
1628 p.sad_x3[LUMA_12x16] = x265_pixel_sad_x3_12x16_ssse3;
1629 p.sad_x4[LUMA_12x16] = x265_pixel_sad_x4_12x16_ssse3;
1630
1631 p.luma_hvpp[LUMA_8x8] = x265_interp_8tap_hv_pp_8x8_ssse3;
1632 p.luma_p2s = x265_luma_p2s_ssse3;
1633 p.chroma[X265_CSP_I420].p2s = x265_chroma_p2s_ssse3;
1634 p.chroma[X265_CSP_I422].p2s = x265_chroma_p2s_ssse3;
1635 p.chroma[X265_CSP_I444].p2s = x265_luma_p2s_ssse3; // for i444, chroma_p2s can use luma_p2s
1636
1637 p.dct[DST_4x4] = x265_dst4_ssse3;
1638 p.idct[IDCT_8x8] = x265_idct8_ssse3;
1639 p.count_nonzero = x265_count_nonzero_ssse3;
1640 }
1641 if (cpuMask & X265_CPU_SSE4)
1642 {
1643 p.saoCuOrgE0 = x265_saoCuOrgE0_sse4;
1644
1645 LUMA_ADDAVG(_sse4);
1646 CHROMA_ADDAVG(_sse4);
1647 CHROMA_ADDAVG_422(_sse4);
1648
1649 // TODO: check POPCNT flag!
1650 p.copy_cnt[BLOCK_4x4] = x265_copy_cnt_4_sse4;
1651 p.copy_cnt[BLOCK_8x8] = x265_copy_cnt_8_sse4;
1652 p.copy_cnt[BLOCK_16x16] = x265_copy_cnt_16_sse4;
1653 p.copy_cnt[BLOCK_32x32] = x265_copy_cnt_32_sse4;
1654
1655 HEVC_SATD(sse4);
1656 SA8D_INTER_FROM_BLOCK(sse4);
1657
1658 p.sse_pp[LUMA_12x16] = x265_pixel_ssd_12x16_sse4;
1659 p.sse_pp[LUMA_24x32] = x265_pixel_ssd_24x32_sse4;
1660 p.sse_pp[LUMA_48x64] = x265_pixel_ssd_48x64_sse4;
1661 p.sse_pp[LUMA_64x16] = x265_pixel_ssd_64x16_sse4;
1662 p.sse_pp[LUMA_64x32] = x265_pixel_ssd_64x32_sse4;
1663 p.sse_pp[LUMA_64x48] = x265_pixel_ssd_64x48_sse4;
1664 p.sse_pp[LUMA_64x64] = x265_pixel_ssd_64x64_sse4;
1665
1666 LUMA_SSE_SP(_sse4);
1667
1668 CHROMA_PIXELSUB_PS(_sse4);
1669 CHROMA_PIXELSUB_PS_422(_sse4);
1670 LUMA_PIXELSUB(_sse4);
1671
1672 CHROMA_FILTERS_420(_sse4);
1673 CHROMA_FILTERS_422(_sse4);
1674 CHROMA_FILTERS_444(_sse4);
1675 CHROMA_SS_FILTERS_SSE4_420(_sse4);
1676 CHROMA_SS_FILTERS_SSE4_422(_sse4);
1677 CHROMA_SP_FILTERS_SSE4_420(_sse4);
1678 CHROMA_SP_FILTERS_SSE4_422(_sse4);
1679 CHROMA_SP_FILTERS_SSE4_444(_sse4);
1680 LUMA_SP_FILTERS(_sse4);
1681 LUMA_FILTERS(_sse4);
1682 ASSGN_SSE_SS(sse4);
1683
1684 p.chroma[X265_CSP_I420].copy_sp[CHROMA_2x4] = x265_blockcopy_sp_2x4_sse4;
1685 p.chroma[X265_CSP_I420].copy_sp[CHROMA_2x8] = x265_blockcopy_sp_2x8_sse4;
1686 p.chroma[X265_CSP_I420].copy_sp[CHROMA_6x8] = x265_blockcopy_sp_6x8_sse4;
1687 CHROMA_BLOCKCOPY(ps, _sse4);
1688 CHROMA_BLOCKCOPY_422(ps, _sse4);
1689 LUMA_BLOCKCOPY(ps, _sse4);
1690
1691 p.calcresidual[BLOCK_16x16] = x265_getResidual16_sse4;
1692 p.calcresidual[BLOCK_32x32] = x265_getResidual32_sse4;
1693 p.quant = x265_quant_sse4;
1694 p.nquant = x265_nquant_sse4;
1695 p.dequant_normal = x265_dequant_normal_sse4;
1696 p.weight_pp = x265_weight_pp_sse4;
1697 p.weight_sp = x265_weight_sp_sse4;
1698 p.intra_pred[0][BLOCK_4x4] = x265_intra_pred_planar4_sse4;
1699 p.intra_pred[0][BLOCK_8x8] = x265_intra_pred_planar8_sse4;
1700 p.intra_pred[0][BLOCK_16x16] = x265_intra_pred_planar16_sse4;
1701 p.intra_pred[0][BLOCK_32x32] = x265_intra_pred_planar32_sse4;
1702
1703 p.intra_pred_allangs[BLOCK_4x4] = x265_all_angs_pred_4x4_sse4;
1704 p.intra_pred_allangs[BLOCK_8x8] = x265_all_angs_pred_8x8_sse4;
1705 p.intra_pred_allangs[BLOCK_16x16] = x265_all_angs_pred_16x16_sse4;
1706 p.intra_pred_allangs[BLOCK_32x32] = x265_all_angs_pred_32x32_sse4;
1707
1708 p.intra_pred[1][BLOCK_4x4] = x265_intra_pred_dc4_sse4;
1709 p.intra_pred[1][BLOCK_8x8] = x265_intra_pred_dc8_sse4;
1710 p.intra_pred[1][BLOCK_16x16] = x265_intra_pred_dc16_sse4;
1711 p.intra_pred[1][BLOCK_32x32] = x265_intra_pred_dc32_sse4;
1712
1713 INTRA_ANG_SSE4_COMMON(sse4);
1714 INTRA_ANG_SSE4(sse4);
1715
1716 p.dct[DCT_8x8] = x265_dct8_sse4;
1717 // p.denoiseDct = x265_denoise_dct_sse4;
1718 }
1719 if (cpuMask & X265_CPU_AVX)
1720 {
1721 p.frameInitLowres = x265_frame_init_lowres_core_avx;
1722 HEVC_SATD(avx);
1723 SA8D_INTER_FROM_BLOCK(avx);
1724 ASSGN_SSE(avx);
1725
1726 ASSGN_SSE_SS(avx);
1727 SAD_X3(avx);
1728 SAD_X4(avx);
1729 p.sad_x3[LUMA_12x16] = x265_pixel_sad_x3_12x16_avx;
1730 p.sad_x4[LUMA_12x16] = x265_pixel_sad_x4_12x16_avx;
1731 p.sad_x3[LUMA_16x4] = x265_pixel_sad_x3_16x4_avx;
1732 p.sad_x4[LUMA_16x4] = x265_pixel_sad_x4_16x4_avx;
1733
1734 p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_avx;
1735 p.ssim_end_4 = x265_pixel_ssim_end4_avx;
1736 p.luma_copy_ss[LUMA_64x16] = x265_blockcopy_ss_64x16_avx;
1737 p.luma_copy_ss[LUMA_64x32] = x265_blockcopy_ss_64x32_avx;
1738 p.luma_copy_ss[LUMA_64x48] = x265_blockcopy_ss_64x48_avx;
1739 p.luma_copy_ss[LUMA_64x64] = x265_blockcopy_ss_64x64_avx;
1740
1741 p.chroma[X265_CSP_I420].copy_pp[CHROMA_32x8] = x265_blockcopy_pp_32x8_avx;
1742 p.luma_copy_pp[LUMA_32x8] = x265_blockcopy_pp_32x8_avx;
1743
1744 p.chroma[X265_CSP_I420].copy_pp[CHROMA_32x16] = x265_blockcopy_pp_32x16_avx;
1745 p.chroma[X265_CSP_I422].copy_pp[CHROMA422_32x16] = x265_blockcopy_pp_32x16_avx;
1746 p.luma_copy_pp[LUMA_32x16] = x265_blockcopy_pp_32x16_avx;
1747
1748 p.chroma[X265_CSP_I420].copy_pp[CHROMA_32x24] = x265_blockcopy_pp_32x24_avx;
1749 p.luma_copy_pp[LUMA_32x24] = x265_blockcopy_pp_32x24_avx;
1750
1751 p.chroma[X265_CSP_I420].copy_pp[CHROMA_32x32] = x265_blockcopy_pp_32x32_avx;
1752 p.chroma[X265_CSP_I422].copy_pp[CHROMA422_32x32] = x265_blockcopy_pp_32x32_avx;
1753 p.luma_copy_pp[LUMA_32x32] = x265_blockcopy_pp_32x32_avx;
1754
1755 p.chroma[X265_CSP_I422].copy_pp[CHROMA422_32x48] = x265_blockcopy_pp_32x48_avx;
1756
1757 p.chroma[X265_CSP_I422].copy_pp[CHROMA422_32x64] = x265_blockcopy_pp_32x64_avx;
1758 p.luma_copy_pp[LUMA_32x64] = x265_blockcopy_pp_32x64_avx;
1759 }
1760 if (cpuMask & X265_CPU_XOP)
1761 {
1762 p.frameInitLowres = x265_frame_init_lowres_core_xop;
1763 SA8D_INTER_FROM_BLOCK(xop);
1764 INIT7(satd, _xop);
1765 INIT5_NAME(sse_pp, ssd, _xop);
1766 HEVC_SATD(xop);
1767 }
1768 if (cpuMask & X265_CPU_AVX2)
1769 {
1770 INIT2(sad_x4, _avx2);
1771 INIT4(satd, _avx2);
1772 INIT2_NAME(sse_pp, ssd, _avx2);
1773 p.sad_x4[LUMA_16x12] = x265_pixel_sad_x4_16x12_avx2;
1774 p.sad_x4[LUMA_16x32] = x265_pixel_sad_x4_16x32_avx2;
1775 p.ssd_s[BLOCK_32x32] = x265_pixel_ssd_s_32_avx2;
1776
1777 /* Need to update assembly code as per changed interface of the copy_cnt primitive, once
1778 * code is updated, avx2 version will be enabled */
1779
1780 p.copy_cnt[BLOCK_8x8] = x265_copy_cnt_8_avx2;
1781 p.copy_cnt[BLOCK_16x16] = x265_copy_cnt_16_avx2;
1782 p.copy_cnt[BLOCK_32x32] = x265_copy_cnt_32_avx2;
1783
1784 p.blockfill_s[BLOCK_16x16] = x265_blockfill_s_16x16_avx2;
1785 p.blockfill_s[BLOCK_32x32] = x265_blockfill_s_32x32_avx2;
1786
1787 p.cpy1Dto2D_shl[BLOCK_4x4] = x265_cpy1Dto2D_shl_4_avx2;
1788 p.cpy1Dto2D_shl[BLOCK_8x8] = x265_cpy1Dto2D_shl_8_avx2;
1789 p.cpy1Dto2D_shl[BLOCK_16x16] = x265_cpy1Dto2D_shl_16_avx2;
1790 p.cpy1Dto2D_shl[BLOCK_32x32] = x265_cpy1Dto2D_shl_32_avx2;
1791 p.cpy1Dto2D_shr[BLOCK_4x4] = x265_cpy1Dto2D_shr_4_avx2;
1792 p.cpy1Dto2D_shr[BLOCK_8x8] = x265_cpy1Dto2D_shr_8_avx2;
1793 p.cpy1Dto2D_shr[BLOCK_16x16] = x265_cpy1Dto2D_shr_16_avx2;
1794 p.cpy1Dto2D_shr[BLOCK_32x32] = x265_cpy1Dto2D_shr_32_avx2;
1795
1796 // p.denoiseDct = x265_denoise_dct_avx2;
1797 p.dct[DCT_4x4] = x265_dct4_avx2;
1798 p.quant = x265_quant_avx2;
1799 p.nquant = x265_nquant_avx2;
1800 p.dequant_normal = x265_dequant_normal_avx2;
1801
1802 p.chroma[X265_CSP_I420].copy_ss[CHROMA_16x4] = x265_blockcopy_ss_16x4_avx;
1803 p.chroma[X265_CSP_I420].copy_ss[CHROMA_16x12] = x265_blockcopy_ss_16x12_avx;
1804 p.chroma[X265_CSP_I420].copy_ss[CHROMA_16x8] = x265_blockcopy_ss_16x8_avx;
1805 p.chroma[X265_CSP_I420].copy_ss[CHROMA_16x16] = x265_blockcopy_ss_16x16_avx;
1806 p.chroma[X265_CSP_I420].copy_ss[CHROMA_16x32] = x265_blockcopy_ss_16x32_avx;
1807 p.chroma[X265_CSP_I422].copy_ss[CHROMA422_16x8] = x265_blockcopy_ss_16x8_avx;
1808 p.chroma[X265_CSP_I422].copy_ss[CHROMA422_16x16] = x265_blockcopy_ss_16x16_avx;
1809 p.chroma[X265_CSP_I422].copy_ss[CHROMA422_16x24] = x265_blockcopy_ss_16x24_avx;
1810 p.chroma[X265_CSP_I422].copy_ss[CHROMA422_16x32] = x265_blockcopy_ss_16x32_avx;
1811 p.chroma[X265_CSP_I422].copy_ss[CHROMA422_16x64] = x265_blockcopy_ss_16x64_avx;
1812 p.scale1D_128to64 = x265_scale1D_128to64_avx2;
1813
1814 p.weight_pp = x265_weight_pp_avx2;
1815
1816 #if X86_64
1817
1818 p.dct[DCT_8x8] = x265_dct8_avx2;
1819 p.dct[DCT_16x16] = x265_dct16_avx2;
1820 p.dct[DCT_32x32] = x265_dct32_avx2;
1821 p.idct[IDCT_4x4] = x265_idct4_avx2;
1822 p.idct[IDCT_8x8] = x265_idct8_avx2;
1823 p.idct[IDCT_16x16] = x265_idct16_avx2;
1824 p.idct[IDCT_32x32] = x265_idct32_avx2;
1825
1826 p.transpose[BLOCK_8x8] = x265_transpose8_avx2;
1827 p.transpose[BLOCK_16x16] = x265_transpose16_avx2;
1828 p.transpose[BLOCK_32x32] = x265_transpose32_avx2;
1829 p.transpose[BLOCK_64x64] = x265_transpose64_avx2;
1830
1831 p.luma_vpp[LUMA_12x16] = x265_interp_8tap_vert_pp_12x16_avx2;
1832
1833 p.luma_vpp[LUMA_16x4] = x265_interp_8tap_vert_pp_16x4_avx2;
1834 p.luma_vpp[LUMA_16x8] = x265_interp_8tap_vert_pp_16x8_avx2;
1835 p.luma_vpp[LUMA_16x12] = x265_interp_8tap_vert_pp_16x12_avx2;
1836 p.luma_vpp[LUMA_16x16] = x265_interp_8tap_vert_pp_16x16_avx2;
1837 p.luma_vpp[LUMA_16x32] = x265_interp_8tap_vert_pp_16x32_avx2;
1838 p.luma_vpp[LUMA_16x64] = x265_interp_8tap_vert_pp_16x64_avx2;
1839
1840 p.luma_vpp[LUMA_24x32] = x265_interp_8tap_vert_pp_24x32_avx2;
1841
1842 p.luma_vpp[LUMA_32x8] = x265_interp_8tap_vert_pp_32x8_avx2;
1843 p.luma_vpp[LUMA_32x16] = x265_interp_8tap_vert_pp_32x16_avx2;
1844 p.luma_vpp[LUMA_32x24] = x265_interp_8tap_vert_pp_32x24_avx2;
1845 p.luma_vpp[LUMA_32x32] = x265_interp_8tap_vert_pp_32x32_avx2;
1846 p.luma_vpp[LUMA_32x64] = x265_interp_8tap_vert_pp_32x64_avx2;
1847
1848 p.luma_vpp[LUMA_48x64] = x265_interp_8tap_vert_pp_48x64_avx2;
1849
1850 p.luma_vpp[LUMA_64x16] = x265_interp_8tap_vert_pp_64x16_avx2;
1851 p.luma_vpp[LUMA_64x32] = x265_interp_8tap_vert_pp_64x32_avx2;
1852 p.luma_vpp[LUMA_64x48] = x265_interp_8tap_vert_pp_64x48_avx2;
1853 p.luma_vpp[LUMA_64x64] = x265_interp_8tap_vert_pp_64x64_avx2;
1854 #endif
1855 p.luma_hpp[LUMA_4x4] = x265_interp_8tap_horiz_pp_4x4_avx2;
1856
1857 p.luma_hpp[LUMA_8x4] = x265_interp_8tap_horiz_pp_8x4_avx2;
1858 p.luma_hpp[LUMA_8x8] = x265_interp_8tap_horiz_pp_8x8_avx2;
1859 p.luma_hpp[LUMA_8x16] = x265_interp_8tap_horiz_pp_8x16_avx2;
1860 p.luma_hpp[LUMA_8x32] = x265_interp_8tap_horiz_pp_8x32_avx2;
1861
1862 p.luma_hpp[LUMA_16x4] = x265_interp_8tap_horiz_pp_16x4_avx2;
1863 p.luma_hpp[LUMA_16x8] = x265_interp_8tap_horiz_pp_16x8_avx2;
1864 p.luma_hpp[LUMA_16x12] = x265_interp_8tap_horiz_pp_16x12_avx2;
1865 p.luma_hpp[LUMA_16x16] = x265_interp_8tap_horiz_pp_16x16_avx2;
1866 p.luma_hpp[LUMA_16x32] = x265_interp_8tap_horiz_pp_16x32_avx2;
1867 p.luma_hpp[LUMA_16x64] = x265_interp_8tap_horiz_pp_16x64_avx2;
1868
1869 p.luma_hpp[LUMA_32x8] = x265_interp_8tap_horiz_pp_32x8_avx2;
1870 p.luma_hpp[LUMA_32x16] = x265_interp_8tap_horiz_pp_32x16_avx2;
1871 p.luma_hpp[LUMA_32x24] = x265_interp_8tap_horiz_pp_32x24_avx2;
1872 p.luma_hpp[LUMA_32x32] = x265_interp_8tap_horiz_pp_32x32_avx2;
1873 p.luma_hpp[LUMA_32x64] = x265_interp_8tap_horiz_pp_32x64_avx2;
1874
1875 p.luma_hpp[LUMA_64x64] = x265_interp_8tap_horiz_pp_64x64_avx2;
1876 p.luma_hpp[LUMA_64x48] = x265_interp_8tap_horiz_pp_64x48_avx2;
1877 p.luma_hpp[LUMA_64x32] = x265_interp_8tap_horiz_pp_64x32_avx2;
1878 p.luma_hpp[LUMA_64x16] = x265_interp_8tap_horiz_pp_64x16_avx2;
1879
1880 p.luma_hpp[LUMA_48x64] = x265_interp_8tap_horiz_pp_48x64_avx2;
1881
1882 p.chroma[X265_CSP_I420].filter_hpp[CHROMA_8x8] = x265_interp_4tap_horiz_pp_8x8_avx2;
1883 p.chroma[X265_CSP_I420].filter_hpp[CHROMA_4x4] = x265_interp_4tap_horiz_pp_4x4_avx2;
1884 p.chroma[X265_CSP_I420].filter_hpp[CHROMA_32x32] = x265_interp_4tap_horiz_pp_32x32_avx2;
1885 p.chroma[X265_CSP_I420].filter_hpp[CHROMA_16x16] = x265_interp_4tap_horiz_pp_16x16_avx2;
1886
1887 p.luma_vpp[LUMA_4x4] = x265_interp_8tap_vert_pp_4x4_avx2;
1888
1889 p.luma_vpp[LUMA_8x4] = x265_interp_8tap_vert_pp_8x4_avx2;
1890 p.luma_vpp[LUMA_8x8] = x265_interp_8tap_vert_pp_8x8_avx2;
1891 p.luma_vpp[LUMA_8x16] = x265_interp_8tap_vert_pp_8x16_avx2;
1892 p.luma_vpp[LUMA_8x32] = x265_interp_8tap_vert_pp_8x32_avx2;
1893
1894 // color space i420
1895 p.chroma[X265_CSP_I420].filter_vpp[CHROMA_4x4] = x265_interp_4tap_vert_pp_4x4_avx2;
1896 p.chroma[X265_CSP_I420].filter_vpp[CHROMA_8x8] = x265_interp_4tap_vert_pp_8x8_avx2;
1897
1898 // color space i422
1899 p.chroma[X265_CSP_I422].filter_vpp[CHROMA422_4x4] = x265_interp_4tap_vert_pp_4x4_avx2;
1900
1901 p.luma_vps[LUMA_4x4] = x265_interp_8tap_vert_ps_4x4_avx2;
1902
1903 #if X86_64
1904 p.chroma[X265_CSP_I420].filter_vpp[CHROMA_16x16] = x265_interp_4tap_vert_pp_16x16_avx2;
1905 p.chroma[X265_CSP_I420].filter_vpp[CHROMA_32x32] = x265_interp_4tap_vert_pp_32x32_avx2;
1906 #endif
1907 }
1908 #endif // if HIGH_BIT_DEPTH
1909 }
1910 }
1911
extern "C" {
#ifdef __INTEL_COMPILER

/* Agner's patch to Intel's CPU dispatcher, taken from pages 131-132 of
 * http://agner.org/optimize/optimizing_cpp.pdf (2011-01-30) and
 * adapted to x265's cpu schema. */

// Global variable indicating cpu
int __intel_cpu_indicator = 0;

// CPU dispatcher function: translates the flags reported by
// x265::cpu_detect() into a single __intel_cpu_indicator value. The flags
// are examined from AVX downwards and the first one found set decides the
// value; 1 is stored when none of them is set.
void x265_intel_cpu_indicator_init(void)
{
    const uint32_t caps = x265::cpu_detect();

    // SSE2 is only taken into account when it is not also flagged as slow
    const bool usableSSE2 = (caps & X265_CPU_SSE2) && !(caps & X265_CPU_SSE2_IS_SLOW);

    __intel_cpu_indicator =
        (caps & X265_CPU_AVX)   ? 0x20000 :
        (caps & X265_CPU_SSE42) ? 0x8000 :
        (caps & X265_CPU_SSE4)  ? 0x2000 :
        (caps & X265_CPU_SSSE3) ? 0x1000 :
        (caps & X265_CPU_SSE3)  ? 0x800 :
        usableSSE2              ? 0x200 :
        (caps & X265_CPU_SSE)   ? 0x80 :
        (caps & X265_CPU_MMX2)  ? 8 :
                                  1;
}

/* The calling convention of __intel_cpu_indicator_init appears to be
 * non-standard: it seems to assume that certain registers are not preserved.
 * The call is therefore forwarded to a function that backs up all the
 * registers. */
void __intel_cpu_indicator_init(void)
{
    x265_safe_intel_cpu_indicator_init();
}

#else // ifdef __INTEL_COMPILER

/* Nothing to initialize when not building with the Intel compiler. */
void x265_intel_cpu_indicator_init(void)
{
}

#endif // ifdef __INTEL_COMPILER
}