Imported Upstream version 1.4+222+hg5f9f7194267b
[deb_x265.git] / source / common / ipfilter.cpp
1 /*****************************************************************************
2 * Copyright (C) 2013 x265 project
3 *
4 * Authors: Deepthi Devaki <deepthidevaki@multicorewareinc.com>,
5 * Rajesh Paulraj <rajesh@multicorewareinc.com>
6 * Praveen Kumar Tiwari <praveen@multicorewareinc.com>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
21 *
22 * This program is also available under a commercial proprietary license.
23 * For more information, contact us at license @ x265.com.
24 *****************************************************************************/
25
26 #include "common.h"
27 #include "primitives.h"
28 #include "x265.h"
29
30 using namespace x265;
31
32 #if _MSC_VER
33 #pragma warning(disable: 4127) // conditional expression is constant, typical for templated functions
34 #endif
35
36 namespace {
37 template<int dstStride>
38 void filterConvertPelToShort_c(const pixel* src, intptr_t srcStride, int16_t* dst, int width, int height)
39 {
40 int shift = IF_INTERNAL_PREC - X265_DEPTH;
41 int row, col;
42
43 for (row = 0; row < height; row++)
44 {
45 for (col = 0; col < width; col++)
46 {
47 int16_t val = src[col] << shift;
48 dst[col] = val - (int16_t)IF_INTERNAL_OFFS;
49 }
50
51 src += srcStride;
52 dst += dstStride;
53 }
54 }
55
56 void extendCURowColBorder(pixel* txt, intptr_t stride, int width, int height, int marginX)
57 {
58 for (int y = 0; y < height; y++)
59 {
60 #if HIGH_BIT_DEPTH
61 for (int x = 0; x < marginX; x++)
62 {
63 txt[-marginX + x] = txt[0];
64 txt[width + x] = txt[width - 1];
65 }
66
67 #else
68 ::memset(txt - marginX, txt[0], marginX);
69 ::memset(txt + width, txt[width - 1], marginX);
70 #endif
71
72 txt += stride;
73 }
74 }
75
76 template<int N, int width, int height>
77 void interp_horiz_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
78 {
79 const int16_t* coeff = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
80 int headRoom = IF_FILTER_PREC;
81 int offset = (1 << (headRoom - 1));
82 uint16_t maxVal = (1 << X265_DEPTH) - 1;
83 int cStride = 1;
84
85 src -= (N / 2 - 1) * cStride;
86
87 int row, col;
88 for (row = 0; row < height; row++)
89 {
90 for (col = 0; col < width; col++)
91 {
92 int sum;
93
94 sum = src[col + 0 * cStride] * coeff[0];
95 sum += src[col + 1 * cStride] * coeff[1];
96 sum += src[col + 2 * cStride] * coeff[2];
97 sum += src[col + 3 * cStride] * coeff[3];
98 if (N == 8)
99 {
100 sum += src[col + 4 * cStride] * coeff[4];
101 sum += src[col + 5 * cStride] * coeff[5];
102 sum += src[col + 6 * cStride] * coeff[6];
103 sum += src[col + 7 * cStride] * coeff[7];
104 }
105 int16_t val = (int16_t)((sum + offset) >> headRoom);
106
107 if (val < 0) val = 0;
108 if (val > maxVal) val = maxVal;
109 dst[col] = (pixel)val;
110 }
111
112 src += srcStride;
113 dst += dstStride;
114 }
115 }
116
117 template<int N, int width, int height>
118 void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
119 {
120 const int16_t* coeff = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
121 int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
122 int shift = IF_FILTER_PREC - headRoom;
123 int offset = -IF_INTERNAL_OFFS << shift;
124 int blkheight = height;
125
126 src -= N / 2 - 1;
127
128 if (isRowExt)
129 {
130 src -= (N / 2 - 1) * srcStride;
131 blkheight += N - 1;
132 }
133
134 int row, col;
135 for (row = 0; row < blkheight; row++)
136 {
137 for (col = 0; col < width; col++)
138 {
139 int sum;
140
141 sum = src[col + 0] * coeff[0];
142 sum += src[col + 1] * coeff[1];
143 sum += src[col + 2] * coeff[2];
144 sum += src[col + 3] * coeff[3];
145 if (N == 8)
146 {
147 sum += src[col + 4] * coeff[4];
148 sum += src[col + 5] * coeff[5];
149 sum += src[col + 6] * coeff[6];
150 sum += src[col + 7] * coeff[7];
151 }
152
153 int16_t val = (int16_t)((sum + offset) >> shift);
154 dst[col] = val;
155 }
156
157 src += srcStride;
158 dst += dstStride;
159 }
160 }
161
162 template<int N, int width, int height>
163 void interp_vert_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
164 {
165 const int16_t* c = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
166 int shift = IF_FILTER_PREC;
167 int offset = 1 << (shift - 1);
168 uint16_t maxVal = (1 << X265_DEPTH) - 1;
169
170 src -= (N / 2 - 1) * srcStride;
171
172 int row, col;
173 for (row = 0; row < height; row++)
174 {
175 for (col = 0; col < width; col++)
176 {
177 int sum;
178
179 sum = src[col + 0 * srcStride] * c[0];
180 sum += src[col + 1 * srcStride] * c[1];
181 sum += src[col + 2 * srcStride] * c[2];
182 sum += src[col + 3 * srcStride] * c[3];
183 if (N == 8)
184 {
185 sum += src[col + 4 * srcStride] * c[4];
186 sum += src[col + 5 * srcStride] * c[5];
187 sum += src[col + 6 * srcStride] * c[6];
188 sum += src[col + 7 * srcStride] * c[7];
189 }
190
191 int16_t val = (int16_t)((sum + offset) >> shift);
192 val = (val < 0) ? 0 : val;
193 val = (val > maxVal) ? maxVal : val;
194
195 dst[col] = (pixel)val;
196 }
197
198 src += srcStride;
199 dst += dstStride;
200 }
201 }
202
203 template<int N, int width, int height>
204 void interp_vert_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
205 {
206 const int16_t* c = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
207 int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
208 int shift = IF_FILTER_PREC - headRoom;
209 int offset = -IF_INTERNAL_OFFS << shift;
210
211 src -= (N / 2 - 1) * srcStride;
212
213 int row, col;
214 for (row = 0; row < height; row++)
215 {
216 for (col = 0; col < width; col++)
217 {
218 int sum;
219
220 sum = src[col + 0 * srcStride] * c[0];
221 sum += src[col + 1 * srcStride] * c[1];
222 sum += src[col + 2 * srcStride] * c[2];
223 sum += src[col + 3 * srcStride] * c[3];
224 if (N == 8)
225 {
226 sum += src[col + 4 * srcStride] * c[4];
227 sum += src[col + 5 * srcStride] * c[5];
228 sum += src[col + 6 * srcStride] * c[6];
229 sum += src[col + 7 * srcStride] * c[7];
230 }
231
232 int16_t val = (int16_t)((sum + offset) >> shift);
233 dst[col] = val;
234 }
235
236 src += srcStride;
237 dst += dstStride;
238 }
239 }
240
241 template<int N, int width, int height>
242 void interp_vert_sp_c(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
243 {
244 int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
245 int shift = IF_FILTER_PREC + headRoom;
246 int offset = (1 << (shift - 1)) + (IF_INTERNAL_OFFS << IF_FILTER_PREC);
247 uint16_t maxVal = (1 << X265_DEPTH) - 1;
248 const int16_t* coeff = (N == 8 ? g_lumaFilter[coeffIdx] : g_chromaFilter[coeffIdx]);
249
250 src -= (N / 2 - 1) * srcStride;
251
252 int row, col;
253 for (row = 0; row < height; row++)
254 {
255 for (col = 0; col < width; col++)
256 {
257 int sum;
258
259 sum = src[col + 0 * srcStride] * coeff[0];
260 sum += src[col + 1 * srcStride] * coeff[1];
261 sum += src[col + 2 * srcStride] * coeff[2];
262 sum += src[col + 3 * srcStride] * coeff[3];
263 if (N == 8)
264 {
265 sum += src[col + 4 * srcStride] * coeff[4];
266 sum += src[col + 5 * srcStride] * coeff[5];
267 sum += src[col + 6 * srcStride] * coeff[6];
268 sum += src[col + 7 * srcStride] * coeff[7];
269 }
270
271 int16_t val = (int16_t)((sum + offset) >> shift);
272
273 val = (val < 0) ? 0 : val;
274 val = (val > maxVal) ? maxVal : val;
275
276 dst[col] = (pixel)val;
277 }
278
279 src += srcStride;
280 dst += dstStride;
281 }
282 }
283
284 template<int N, int width, int height>
285 void interp_vert_ss_c(const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
286 {
287 const int16_t* c = (N == 8 ? g_lumaFilter[coeffIdx] : g_chromaFilter[coeffIdx]);
288 int shift = IF_FILTER_PREC;
289 int row, col;
290
291 src -= (N / 2 - 1) * srcStride;
292 for (row = 0; row < height; row++)
293 {
294 for (col = 0; col < width; col++)
295 {
296 int sum;
297
298 sum = src[col + 0 * srcStride] * c[0];
299 sum += src[col + 1 * srcStride] * c[1];
300 sum += src[col + 2 * srcStride] * c[2];
301 sum += src[col + 3 * srcStride] * c[3];
302 if (N == 8)
303 {
304 sum += src[col + 4 * srcStride] * c[4];
305 sum += src[col + 5 * srcStride] * c[5];
306 sum += src[col + 6 * srcStride] * c[6];
307 sum += src[col + 7 * srcStride] * c[7];
308 }
309
310 int16_t val = (int16_t)((sum) >> shift);
311 dst[col] = val;
312 }
313
314 src += srcStride;
315 dst += dstStride;
316 }
317 }
318
319 template<int N>
320 void filterVertical_sp_c(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int coeffIdx)
321 {
322 int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
323 int shift = IF_FILTER_PREC + headRoom;
324 int offset = (1 << (shift - 1)) + (IF_INTERNAL_OFFS << IF_FILTER_PREC);
325 uint16_t maxVal = (1 << X265_DEPTH) - 1;
326 const int16_t* coeff = (N == 8 ? g_lumaFilter[coeffIdx] : g_chromaFilter[coeffIdx]);
327
328 src -= (N / 2 - 1) * srcStride;
329
330 int row, col;
331 for (row = 0; row < height; row++)
332 {
333 for (col = 0; col < width; col++)
334 {
335 int sum;
336
337 sum = src[col + 0 * srcStride] * coeff[0];
338 sum += src[col + 1 * srcStride] * coeff[1];
339 sum += src[col + 2 * srcStride] * coeff[2];
340 sum += src[col + 3 * srcStride] * coeff[3];
341 if (N == 8)
342 {
343 sum += src[col + 4 * srcStride] * coeff[4];
344 sum += src[col + 5 * srcStride] * coeff[5];
345 sum += src[col + 6 * srcStride] * coeff[6];
346 sum += src[col + 7 * srcStride] * coeff[7];
347 }
348
349 int16_t val = (int16_t)((sum + offset) >> shift);
350
351 val = (val < 0) ? 0 : val;
352 val = (val > maxVal) ? maxVal : val;
353
354 dst[col] = (pixel)val;
355 }
356
357 src += srcStride;
358 dst += dstStride;
359 }
360 }
361
362 template<int N, int width, int height>
363 void interp_hv_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY)
364 {
365 short immedVals[(64 + 8) * (64 + 8)];
366
367 interp_horiz_ps_c<N, width, height>(src, srcStride, immedVals, width, idxX, 1);
368 filterVertical_sp_c<N>(immedVals + 3 * width, width, dst, dstStride, width, height, idxY);
369 }
370 }
371
372 namespace x265 {
373 // x265 private namespace
374
// Installs the six C 4-tap chroma interpolation routines for one W x H
// partition. CSP selects the p.chroma[] entry; PREFIX is pasted together with
// W, 'x' and H to form the partition enumerator used as the table index.
// Expects an EncoderPrimitives named 'p' to be in scope at the point of use.
#define SETUP_CHROMA_FILTERS_C(CSP, PREFIX, W, H) \
    p.chroma[CSP].filter_hpp[PREFIX ## W ## x ## H] = interp_horiz_pp_c<4, W, H>; \
    p.chroma[CSP].filter_hps[PREFIX ## W ## x ## H] = interp_horiz_ps_c<4, W, H>; \
    p.chroma[CSP].filter_vpp[PREFIX ## W ## x ## H] = interp_vert_pp_c<4, W, H>;  \
    p.chroma[CSP].filter_vps[PREFIX ## W ## x ## H] = interp_vert_ps_c<4, W, H>;  \
    p.chroma[CSP].filter_vsp[PREFIX ## W ## x ## H] = interp_vert_sp_c<4, W, H>;  \
    p.chroma[CSP].filter_vss[PREFIX ## W ## x ## H] = interp_vert_ss_c<4, W, H>;

// 4:2:0 chroma partitions are indexed by CHROMA_<W>x<H>.
#define CHROMA_420(W, H) SETUP_CHROMA_FILTERS_C(X265_CSP_I420, CHROMA_, W, H)

// 4:2:2 chroma partitions are indexed by CHROMA422_<W>x<H>.
#define CHROMA_422(W, H) SETUP_CHROMA_FILTERS_C(X265_CSP_I422, CHROMA422_, W, H)

// 4:4:4 chroma partitions reuse the LUMA_<W>x<H> enumerators as their index.
#define CHROMA_444(W, H) SETUP_CHROMA_FILTERS_C(X265_CSP_I444, LUMA_, W, H)

// Installs the six C 8-tap luma interpolation routines plus the combined
// horizontal+vertical routine for one W x H partition, indexed by
// LUMA_<W>x<H>. Expects an EncoderPrimitives named 'p' to be in scope.
#define LUMA(W, H) \
    p.luma_hpp[LUMA_ ## W ## x ## H]  = interp_horiz_pp_c<8, W, H>; \
    p.luma_hps[LUMA_ ## W ## x ## H]  = interp_horiz_ps_c<8, W, H>; \
    p.luma_vpp[LUMA_ ## W ## x ## H]  = interp_vert_pp_c<8, W, H>;  \
    p.luma_vps[LUMA_ ## W ## x ## H]  = interp_vert_ps_c<8, W, H>;  \
    p.luma_vsp[LUMA_ ## W ## x ## H]  = interp_vert_sp_c<8, W, H>;  \
    p.luma_vss[LUMA_ ## W ## x ## H]  = interp_vert_ss_c<8, W, H>;  \
    p.luma_hvpp[LUMA_ ## W ## x ## H] = interp_hv_pp_c<8, W, H>;
407
408 void Setup_C_IPFilterPrimitives(EncoderPrimitives& p)
409 {
410 LUMA(4, 4);
411 LUMA(8, 8);
412 CHROMA_420(4, 4);
413 LUMA(4, 8);
414 CHROMA_420(2, 4);
415 LUMA(8, 4);
416 CHROMA_420(4, 2);
417 LUMA(16, 16);
418 CHROMA_420(8, 8);
419 LUMA(16, 8);
420 CHROMA_420(8, 4);
421 LUMA(8, 16);
422 CHROMA_420(4, 8);
423 LUMA(16, 12);
424 CHROMA_420(8, 6);
425 LUMA(12, 16);
426 CHROMA_420(6, 8);
427 LUMA(16, 4);
428 CHROMA_420(8, 2);
429 LUMA(4, 16);
430 CHROMA_420(2, 8);
431 LUMA(32, 32);
432 CHROMA_420(16, 16);
433 LUMA(32, 16);
434 CHROMA_420(16, 8);
435 LUMA(16, 32);
436 CHROMA_420(8, 16);
437 LUMA(32, 24);
438 CHROMA_420(16, 12);
439 LUMA(24, 32);
440 CHROMA_420(12, 16);
441 LUMA(32, 8);
442 CHROMA_420(16, 4);
443 LUMA(8, 32);
444 CHROMA_420(4, 16);
445 LUMA(64, 64);
446 CHROMA_420(32, 32);
447 LUMA(64, 32);
448 CHROMA_420(32, 16);
449 LUMA(32, 64);
450 CHROMA_420(16, 32);
451 LUMA(64, 48);
452 CHROMA_420(32, 24);
453 LUMA(48, 64);
454 CHROMA_420(24, 32);
455 LUMA(64, 16);
456 CHROMA_420(32, 8);
457 LUMA(16, 64);
458 CHROMA_420(8, 32);
459
460 CHROMA_422(4, 8);
461 CHROMA_422(4, 4);
462 CHROMA_422(2, 8);
463 CHROMA_422(8, 16);
464 CHROMA_422(8, 8);
465 CHROMA_422(4, 16);
466 CHROMA_422(8, 12);
467 CHROMA_422(6, 16);
468 CHROMA_422(8, 4);
469 CHROMA_422(2, 16);
470 CHROMA_422(16, 32);
471 CHROMA_422(16, 16);
472 CHROMA_422(8, 32);
473 CHROMA_422(16, 24);
474 CHROMA_422(12, 32);
475 CHROMA_422(16, 8);
476 CHROMA_422(4, 32);
477 CHROMA_422(32, 64);
478 CHROMA_422(32, 32);
479 CHROMA_422(16, 64);
480 CHROMA_422(32, 48);
481 CHROMA_422(24, 64);
482 CHROMA_422(32, 16);
483 CHROMA_422(8, 64);
484
485 CHROMA_444(4, 4);
486 CHROMA_444(8, 8);
487 CHROMA_444(4, 8);
488 CHROMA_444(8, 4);
489 CHROMA_444(16, 16);
490 CHROMA_444(16, 8);
491 CHROMA_444(8, 16);
492 CHROMA_444(16, 12);
493 CHROMA_444(12, 16);
494 CHROMA_444(16, 4);
495 CHROMA_444(4, 16);
496 CHROMA_444(32, 32);
497 CHROMA_444(32, 16);
498 CHROMA_444(16, 32);
499 CHROMA_444(32, 24);
500 CHROMA_444(24, 32);
501 CHROMA_444(32, 8);
502 CHROMA_444(8, 32);
503 CHROMA_444(64, 64);
504 CHROMA_444(64, 32);
505 CHROMA_444(32, 64);
506 CHROMA_444(64, 48);
507 CHROMA_444(48, 64);
508 CHROMA_444(64, 16);
509 CHROMA_444(16, 64);
510 p.luma_p2s = filterConvertPelToShort_c<MAX_CU_SIZE>;
511
512 p.chroma[X265_CSP_I444].p2s = filterConvertPelToShort_c<MAX_CU_SIZE>;
513 p.chroma[X265_CSP_I420].p2s = filterConvertPelToShort_c<MAX_CU_SIZE / 2>;
514 p.chroma[X265_CSP_I422].p2s = filterConvertPelToShort_c<MAX_CU_SIZE / 2>;
515
516 p.extendRowBorder = extendCURowColBorder;
517 }
518 }