/*****************************************************************************
 * Copyright (C) 2013 x265 project
 *
 * Authors: Mandar Gurav <mandar@multicorewareinc.com>
 *          Deepthi Devaki Akkoorath <deepthidevaki@multicorewareinc.com>
 *          Mahesh Pittala <mahesh@multicorewareinc.com>
 *          Rajesh Paulraj <rajesh@multicorewareinc.com>
 *          Min Chen <min.chen@multicorewareinc.com>
 *          Praveen Kumar Tiwari <praveen@multicorewareinc.com>
 *          Nabajit Deka <nabajit@multicorewareinc.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at license @ x265.com.
 *****************************************************************************/
#include "common.h"
#include "primitives.h"

using namespace x265;

#if _MSC_VER
#pragma warning(disable: 4127) // conditional expression is constant, typical for templated functions
#endif

namespace {
// anonymous file-static namespace
// Fast DST Algorithm. Full matrix multiplication for DST and Fast DST algorithm
// give identical results
void fastForwardDst(int16_t *block, int16_t *coeff, int shift)  // input block, output coeff
{
    int c[4];
    int rnd_factor = 1 << (shift - 1);

    for (int i = 0; i < 4; i++)
    {
        // Intermediate Variables
        c[0] = block[4 * i + 0] + block[4 * i + 3];
        c[1] = block[4 * i + 1] + block[4 * i + 3];
        c[2] = block[4 * i + 0] - block[4 * i + 1];
        c[3] = 74 * block[4 * i + 2];

        coeff[i] = (int16_t)((29 * c[0] + 55 * c[1] + c[3] + rnd_factor) >> shift);
        coeff[4 + i] = (int16_t)((74 * (block[4 * i + 0] + block[4 * i + 1] - block[4 * i + 3]) + rnd_factor) >> shift);
        coeff[8 + i] = (int16_t)((29 * c[2] + 55 * c[0] - c[3] + rnd_factor) >> shift);
        coeff[12 + i] = (int16_t)((55 * c[2] - 29 * c[1] + c[3] + rnd_factor) >> shift);
    }
}
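
/* Illustrative sketch (not part of the original file): the "identical results"
 * claim above can be checked against a plain matrix multiply. The table below
 * is the 4x4 HEVC DST-7 basis; note its first row {29, 55, 74, 84} with
 * 84 = 29 + 55, which is exactly the factorization fastForwardDst() exploits.
 * Both g_dstBasis and referenceDst are hypothetical names, for verification
 * only. */
static const int16_t g_dstBasis[4][4] =
{
    { 29,  55,  74,  84 },
    { 74,  74,   0, -74 },
    { 84, -29, -74,  55 },
    { 55, -84,  74, -29 }
};

static void referenceDst(const int16_t *block, int16_t *coeff, int shift)
{
    int rnd_factor = 1 << (shift - 1);

    for (int row = 0; row < 4; row++)     // frequency (basis) row
    {
        for (int i = 0; i < 4; i++)       // input row of the 4x4 block
        {
            int sum = rnd_factor;
            for (int k = 0; k < 4; k++)
                sum += g_dstBasis[row][k] * block[4 * i + k];

            coeff[4 * row + i] = (int16_t)(sum >> shift);
        }
    }
}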
void inversedst(int16_t *tmp, int16_t *block, int shift)  // input tmp, output block
{
    int i, c[4];
    int rnd_factor = 1 << (shift - 1);

    for (i = 0; i < 4; i++)
    {
        // Intermediate Variables
        c[0] = tmp[i] + tmp[8 + i];
        c[1] = tmp[8 + i] + tmp[12 + i];
        c[2] = tmp[i] - tmp[12 + i];
        c[3] = 74 * tmp[4 + i];

        block[4 * i + 0] = (int16_t)Clip3(-32768, 32767, (29 * c[0] + 55 * c[1] + c[3] + rnd_factor) >> shift);
        block[4 * i + 1] = (int16_t)Clip3(-32768, 32767, (55 * c[2] - 29 * c[1] + c[3] + rnd_factor) >> shift);
        block[4 * i + 2] = (int16_t)Clip3(-32768, 32767, (74 * (tmp[i] - tmp[8 + i] + tmp[12 + i]) + rnd_factor) >> shift);
        block[4 * i + 3] = (int16_t)Clip3(-32768, 32767, (55 * c[0] + 29 * c[2] - c[3] + rnd_factor) >> shift);
    }
}
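
/* Note: inversedst() applies the transpose of the forward DST basis (i.e. the
 * columns of the matrix used by fastForwardDst()), and clips each output to
 * the int16_t range since the shifted sums can exceed 16 bits. */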
void partialButterfly16(int16_t *src, int16_t *dst, int shift, int line)
{
    int j, k;
    int E[8], O[8];
    int EE[4], EO[4];
    int EEE[2], EEO[2];
    int add = 1 << (shift - 1);

    for (j = 0; j < line; j++)
    {
        /* E and O */
        for (k = 0; k < 8; k++)
        {
            E[k] = src[k] + src[15 - k];
            O[k] = src[k] - src[15 - k];
        }

        /* EE and EO */
        for (k = 0; k < 4; k++)
        {
            EE[k] = E[k] + E[7 - k];
            EO[k] = E[k] - E[7 - k];
        }

        /* EEE and EEO */
        EEE[0] = EE[0] + EE[3];
        EEO[0] = EE[0] - EE[3];
        EEE[1] = EE[1] + EE[2];
        EEO[1] = EE[1] - EE[2];

        dst[0] = (int16_t)((g_t16[0][0] * EEE[0] + g_t16[0][1] * EEE[1] + add) >> shift);
        dst[8 * line] = (int16_t)((g_t16[8][0] * EEE[0] + g_t16[8][1] * EEE[1] + add) >> shift);
        dst[4 * line] = (int16_t)((g_t16[4][0] * EEO[0] + g_t16[4][1] * EEO[1] + add) >> shift);
        dst[12 * line] = (int16_t)((g_t16[12][0] * EEO[0] + g_t16[12][1] * EEO[1] + add) >> shift);

        for (k = 2; k < 16; k += 4)
        {
            dst[k * line] = (int16_t)((g_t16[k][0] * EO[0] + g_t16[k][1] * EO[1] + g_t16[k][2] * EO[2] +
                                       g_t16[k][3] * EO[3] + add) >> shift);
        }

        for (k = 1; k < 16; k += 2)
        {
            dst[k * line] = (int16_t)((g_t16[k][0] * O[0] + g_t16[k][1] * O[1] + g_t16[k][2] * O[2] + g_t16[k][3] * O[3] +
                                       g_t16[k][4] * O[4] + g_t16[k][5] * O[5] + g_t16[k][6] * O[6] + g_t16[k][7] * O[7] +
                                       add) >> shift);
        }

        src += 16;
        dst++;
    }
}
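
/* The partial-butterfly forward transforms in this file all follow the same
 * recursive even/odd split: outputs with even frequency index depend only on
 * sums of mirrored inputs (E), odd ones only on differences (O), and the even
 * half splits again (EE/EO, EEE/EEO, ...). Each level roughly halves the
 * multiply count versus a full matrix product; see the referenceButterfly4()
 * sketch further down for the unfactored form. */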
void partialButterfly32(int16_t *src, int16_t *dst, int shift, int line)
{
    int j, k;
    int E[16], O[16];
    int EE[8], EO[8];
    int EEE[4], EEO[4];
    int EEEE[2], EEEO[2];
    int add = 1 << (shift - 1);

    for (j = 0; j < line; j++)
    {
        /* E and O */
        for (k = 0; k < 16; k++)
        {
            E[k] = src[k] + src[31 - k];
            O[k] = src[k] - src[31 - k];
        }

        /* EE and EO */
        for (k = 0; k < 8; k++)
        {
            EE[k] = E[k] + E[15 - k];
            EO[k] = E[k] - E[15 - k];
        }

        /* EEE and EEO */
        for (k = 0; k < 4; k++)
        {
            EEE[k] = EE[k] + EE[7 - k];
            EEO[k] = EE[k] - EE[7 - k];
        }

        /* EEEE and EEEO */
        EEEE[0] = EEE[0] + EEE[3];
        EEEO[0] = EEE[0] - EEE[3];
        EEEE[1] = EEE[1] + EEE[2];
        EEEO[1] = EEE[1] - EEE[2];

        dst[0] = (int16_t)((g_t32[0][0] * EEEE[0] + g_t32[0][1] * EEEE[1] + add) >> shift);
        dst[16 * line] = (int16_t)((g_t32[16][0] * EEEE[0] + g_t32[16][1] * EEEE[1] + add) >> shift);
        dst[8 * line] = (int16_t)((g_t32[8][0] * EEEO[0] + g_t32[8][1] * EEEO[1] + add) >> shift);
        dst[24 * line] = (int16_t)((g_t32[24][0] * EEEO[0] + g_t32[24][1] * EEEO[1] + add) >> shift);
        for (k = 4; k < 32; k += 8)
        {
            dst[k * line] = (int16_t)((g_t32[k][0] * EEO[0] + g_t32[k][1] * EEO[1] + g_t32[k][2] * EEO[2] +
                                       g_t32[k][3] * EEO[3] + add) >> shift);
        }

        for (k = 2; k < 32; k += 4)
        {
            dst[k * line] = (int16_t)((g_t32[k][0] * EO[0] + g_t32[k][1] * EO[1] + g_t32[k][2] * EO[2] +
                                       g_t32[k][3] * EO[3] + g_t32[k][4] * EO[4] + g_t32[k][5] * EO[5] +
                                       g_t32[k][6] * EO[6] + g_t32[k][7] * EO[7] + add) >> shift);
        }

        for (k = 1; k < 32; k += 2)
        {
            dst[k * line] = (int16_t)((g_t32[k][0] * O[0] + g_t32[k][1] * O[1] + g_t32[k][2] * O[2] + g_t32[k][3] * O[3] +
                                       g_t32[k][4] * O[4] + g_t32[k][5] * O[5] + g_t32[k][6] * O[6] + g_t32[k][7] * O[7] +
                                       g_t32[k][8] * O[8] + g_t32[k][9] * O[9] + g_t32[k][10] * O[10] + g_t32[k][11] * O[11] +
                                       g_t32[k][12] * O[12] + g_t32[k][13] * O[13] + g_t32[k][14] * O[14] +
                                       g_t32[k][15] * O[15] + add) >> shift);
        }

        src += 32;
        dst++;
    }
}
void partialButterfly8(int16_t *src, int16_t *dst, int shift, int line)
{
    int j, k;
    int E[4], O[4];
    int EE[2], EO[2];
    int add = 1 << (shift - 1);

    for (j = 0; j < line; j++)
    {
        /* E and O */
        for (k = 0; k < 4; k++)
        {
            E[k] = src[k] + src[7 - k];
            O[k] = src[k] - src[7 - k];
        }

        /* EE and EO */
        EE[0] = E[0] + E[3];
        EO[0] = E[0] - E[3];
        EE[1] = E[1] + E[2];
        EO[1] = E[1] - E[2];

        dst[0] = (int16_t)((g_t8[0][0] * EE[0] + g_t8[0][1] * EE[1] + add) >> shift);
        dst[4 * line] = (int16_t)((g_t8[4][0] * EE[0] + g_t8[4][1] * EE[1] + add) >> shift);
        dst[2 * line] = (int16_t)((g_t8[2][0] * EO[0] + g_t8[2][1] * EO[1] + add) >> shift);
        dst[6 * line] = (int16_t)((g_t8[6][0] * EO[0] + g_t8[6][1] * EO[1] + add) >> shift);

        dst[line] = (int16_t)((g_t8[1][0] * O[0] + g_t8[1][1] * O[1] + g_t8[1][2] * O[2] + g_t8[1][3] * O[3] + add) >> shift);
        dst[3 * line] = (int16_t)((g_t8[3][0] * O[0] + g_t8[3][1] * O[1] + g_t8[3][2] * O[2] + g_t8[3][3] * O[3] + add) >> shift);
        dst[5 * line] = (int16_t)((g_t8[5][0] * O[0] + g_t8[5][1] * O[1] + g_t8[5][2] * O[2] + g_t8[5][3] * O[3] + add) >> shift);
        dst[7 * line] = (int16_t)((g_t8[7][0] * O[0] + g_t8[7][1] * O[1] + g_t8[7][2] * O[2] + g_t8[7][3] * O[3] + add) >> shift);

        src += 8;
        dst++;
    }
}
void partialButterflyInverse4(int16_t *src, int16_t *dst, int shift, int line)
{
    int j;
    int E[2], O[2];
    int add = 1 << (shift - 1);

    for (j = 0; j < line; j++)
    {
        /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
        O[0] = g_t4[1][0] * src[line] + g_t4[3][0] * src[3 * line];
        O[1] = g_t4[1][1] * src[line] + g_t4[3][1] * src[3 * line];
        E[0] = g_t4[0][0] * src[0] + g_t4[2][0] * src[2 * line];
        E[1] = g_t4[0][1] * src[0] + g_t4[2][1] * src[2 * line];

        /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
        dst[0] = (int16_t)(Clip3(-32768, 32767, (E[0] + O[0] + add) >> shift));
        dst[1] = (int16_t)(Clip3(-32768, 32767, (E[1] + O[1] + add) >> shift));
        dst[2] = (int16_t)(Clip3(-32768, 32767, (E[1] - O[1] + add) >> shift));
        dst[3] = (int16_t)(Clip3(-32768, 32767, (E[0] - O[0] + add) >> shift));

        src++;
        dst += 4;
    }
}
void partialButterflyInverse8(int16_t *src, int16_t *dst, int shift, int line)
{
    int j, k;
    int E[4], O[4];
    int EE[2], EO[2];
    int add = 1 << (shift - 1);

    for (j = 0; j < line; j++)
    {
        /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
        for (k = 0; k < 4; k++)
        {
            O[k] = g_t8[1][k] * src[line] + g_t8[3][k] * src[3 * line] + g_t8[5][k] * src[5 * line] + g_t8[7][k] * src[7 * line];
        }

        EO[0] = g_t8[2][0] * src[2 * line] + g_t8[6][0] * src[6 * line];
        EO[1] = g_t8[2][1] * src[2 * line] + g_t8[6][1] * src[6 * line];
        EE[0] = g_t8[0][0] * src[0] + g_t8[4][0] * src[4 * line];
        EE[1] = g_t8[0][1] * src[0] + g_t8[4][1] * src[4 * line];

        /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
        E[0] = EE[0] + EO[0];
        E[3] = EE[0] - EO[0];
        E[1] = EE[1] + EO[1];
        E[2] = EE[1] - EO[1];
        for (k = 0; k < 4; k++)
        {
            dst[k] = (int16_t)Clip3(-32768, 32767, (E[k] + O[k] + add) >> shift);
            dst[k + 4] = (int16_t)Clip3(-32768, 32767, (E[3 - k] - O[3 - k] + add) >> shift);
        }

        src++;
        dst += 8;
    }
}
void partialButterflyInverse16(int16_t *src, int16_t *dst, int shift, int line)
{
    int j, k;
    int E[8], O[8];
    int EE[4], EO[4];
    int EEE[2], EEO[2];
    int add = 1 << (shift - 1);

    for (j = 0; j < line; j++)
    {
        /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
        for (k = 0; k < 8; k++)
        {
            O[k] = g_t16[1][k] * src[line] + g_t16[3][k] * src[3 * line] + g_t16[5][k] * src[5 * line] + g_t16[7][k] * src[7 * line] +
                g_t16[9][k] * src[9 * line] + g_t16[11][k] * src[11 * line] + g_t16[13][k] * src[13 * line] + g_t16[15][k] * src[15 * line];
        }

        for (k = 0; k < 4; k++)
        {
            EO[k] = g_t16[2][k] * src[2 * line] + g_t16[6][k] * src[6 * line] + g_t16[10][k] * src[10 * line] + g_t16[14][k] * src[14 * line];
        }

        EEO[0] = g_t16[4][0] * src[4 * line] + g_t16[12][0] * src[12 * line];
        EEE[0] = g_t16[0][0] * src[0] + g_t16[8][0] * src[8 * line];
        EEO[1] = g_t16[4][1] * src[4 * line] + g_t16[12][1] * src[12 * line];
        EEE[1] = g_t16[0][1] * src[0] + g_t16[8][1] * src[8 * line];

        /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
        for (k = 0; k < 2; k++)
        {
            EE[k] = EEE[k] + EEO[k];
            EE[k + 2] = EEE[1 - k] - EEO[1 - k];
        }

        for (k = 0; k < 4; k++)
        {
            E[k] = EE[k] + EO[k];
            E[k + 4] = EE[3 - k] - EO[3 - k];
        }

        for (k = 0; k < 8; k++)
        {
            dst[k] = (int16_t)Clip3(-32768, 32767, (E[k] + O[k] + add) >> shift);
            dst[k + 8] = (int16_t)Clip3(-32768, 32767, (E[7 - k] - O[7 - k] + add) >> shift);
        }

        src++;
        dst += 16;
    }
}
void partialButterflyInverse32(int16_t *src, int16_t *dst, int shift, int line)
{
    int j, k;
    int E[16], O[16];
    int EE[8], EO[8];
    int EEE[4], EEO[4];
    int EEEE[2], EEEO[2];
    int add = 1 << (shift - 1);

    for (j = 0; j < line; j++)
    {
        /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
        for (k = 0; k < 16; k++)
        {
            O[k] = g_t32[1][k] * src[line] + g_t32[3][k] * src[3 * line] + g_t32[5][k] * src[5 * line] + g_t32[7][k] * src[7 * line] +
                g_t32[9][k] * src[9 * line] + g_t32[11][k] * src[11 * line] + g_t32[13][k] * src[13 * line] + g_t32[15][k] * src[15 * line] +
                g_t32[17][k] * src[17 * line] + g_t32[19][k] * src[19 * line] + g_t32[21][k] * src[21 * line] + g_t32[23][k] * src[23 * line] +
                g_t32[25][k] * src[25 * line] + g_t32[27][k] * src[27 * line] + g_t32[29][k] * src[29 * line] + g_t32[31][k] * src[31 * line];
        }

        for (k = 0; k < 8; k++)
        {
            EO[k] = g_t32[2][k] * src[2 * line] + g_t32[6][k] * src[6 * line] + g_t32[10][k] * src[10 * line] + g_t32[14][k] * src[14 * line] +
                g_t32[18][k] * src[18 * line] + g_t32[22][k] * src[22 * line] + g_t32[26][k] * src[26 * line] + g_t32[30][k] * src[30 * line];
        }

        for (k = 0; k < 4; k++)
        {
            EEO[k] = g_t32[4][k] * src[4 * line] + g_t32[12][k] * src[12 * line] + g_t32[20][k] * src[20 * line] + g_t32[28][k] * src[28 * line];
        }

        EEEO[0] = g_t32[8][0] * src[8 * line] + g_t32[24][0] * src[24 * line];
        EEEO[1] = g_t32[8][1] * src[8 * line] + g_t32[24][1] * src[24 * line];
        EEEE[0] = g_t32[0][0] * src[0] + g_t32[16][0] * src[16 * line];
        EEEE[1] = g_t32[0][1] * src[0] + g_t32[16][1] * src[16 * line];

        /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
        EEE[0] = EEEE[0] + EEEO[0];
        EEE[3] = EEEE[0] - EEEO[0];
        EEE[1] = EEEE[1] + EEEO[1];
        EEE[2] = EEEE[1] - EEEO[1];
        for (k = 0; k < 4; k++)
        {
            EE[k] = EEE[k] + EEO[k];
            EE[k + 4] = EEE[3 - k] - EEO[3 - k];
        }

        for (k = 0; k < 8; k++)
        {
            E[k] = EE[k] + EO[k];
            E[k + 8] = EE[7 - k] - EO[7 - k];
        }

        for (k = 0; k < 16; k++)
        {
            dst[k] = (int16_t)Clip3(-32768, 32767, (E[k] + O[k] + add) >> shift);
            dst[k + 16] = (int16_t)Clip3(-32768, 32767, (E[15 - k] - O[15 - k] + add) >> shift);
        }

        src++;
        dst += 32;
    }
}
void partialButterfly4(int16_t *src, int16_t *dst, int shift, int line)
{
    int j;
    int E[2], O[2];
    int add = 1 << (shift - 1);

    for (j = 0; j < line; j++)
    {
        /* E and O */
        E[0] = src[0] + src[3];
        O[0] = src[0] - src[3];
        E[1] = src[1] + src[2];
        O[1] = src[1] - src[2];

        dst[0] = (int16_t)((g_t4[0][0] * E[0] + g_t4[0][1] * E[1] + add) >> shift);
        dst[2 * line] = (int16_t)((g_t4[2][0] * E[0] + g_t4[2][1] * E[1] + add) >> shift);
        dst[line] = (int16_t)((g_t4[1][0] * O[0] + g_t4[1][1] * O[1] + add) >> shift);
        dst[3 * line] = (int16_t)((g_t4[3][0] * O[0] + g_t4[3][1] * O[1] + add) >> shift);

        src += 4;
        dst++;
    }
}
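
/* Illustrative sketch (not in the original source): for each 4-sample input
 * column, partialButterfly4() computes dst[r * line] =
 * (g_t4[r][0]*src[0] + ... + g_t4[r][3]*src[3] + add) >> shift. The butterfly
 * merely exploits the even/odd symmetry of the g_t4 rows; this hypothetical
 * helper is the unfactored equivalent, useful for cross-checking. */
static void referenceButterfly4(const int16_t *src, int16_t *dst, int shift, int line)
{
    int add = 1 << (shift - 1);

    for (int j = 0; j < line; j++)
    {
        for (int r = 0; r < 4; r++)
        {
            int sum = add;
            for (int k = 0; k < 4; k++)
                sum += g_t4[r][k] * src[k];

            dst[r * line] = (int16_t)(sum >> shift);
        }

        src += 4;
        dst++;
    }
}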
void dst4_c(int16_t *src, int32_t *dst, intptr_t stride)
{
    const int shift_1st = 1 + X265_DEPTH - 8;
    const int shift_2nd = 8;

    ALIGN_VAR_32(int16_t, coef[4 * 4]);
    ALIGN_VAR_32(int16_t, block[4 * 4]);

    for (int i = 0; i < 4; i++)
    {
        memcpy(&block[i * 4], &src[i * stride], 4 * sizeof(int16_t));
    }

    fastForwardDst(block, coef, shift_1st);
    fastForwardDst(coef, block, shift_2nd);

#define N (4)
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            dst[i * N + j] = block[i * N + j];
        }
    }

#undef N
}
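
/* Shift bookkeeping for the forward transforms: per the HEVC spec, an NxN
 * forward transform shifts by log2(N) + bitDepth - 9 after the first pass and
 * by log2(N) + 6 after the second, which is where the shift_1st/shift_2nd
 * constants in dst4_c() above and the dctN_c() functions below come from. */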
void dct4_c(int16_t *src, int32_t *dst, intptr_t stride)
{
    const int shift_1st = 1 + X265_DEPTH - 8;
    const int shift_2nd = 8;

    ALIGN_VAR_32(int16_t, coef[4 * 4]);
    ALIGN_VAR_32(int16_t, block[4 * 4]);

    for (int i = 0; i < 4; i++)
    {
        memcpy(&block[i * 4], &src[i * stride], 4 * sizeof(int16_t));
    }

    partialButterfly4(block, coef, shift_1st, 4);
    partialButterfly4(coef, block, shift_2nd, 4);

#define N (4)
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            dst[i * N + j] = block[i * N + j];
        }
    }

#undef N
}
void dct8_c(int16_t *src, int32_t *dst, intptr_t stride)
{
    const int shift_1st = 2 + X265_DEPTH - 8;
    const int shift_2nd = 9;

    ALIGN_VAR_32(int16_t, coef[8 * 8]);
    ALIGN_VAR_32(int16_t, block[8 * 8]);

    for (int i = 0; i < 8; i++)
    {
        memcpy(&block[i * 8], &src[i * stride], 8 * sizeof(int16_t));
    }

    partialButterfly8(block, coef, shift_1st, 8);
    partialButterfly8(coef, block, shift_2nd, 8);

#define N (8)
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            dst[i * N + j] = block[i * N + j];
        }
    }

#undef N
}
void dct16_c(int16_t *src, int32_t *dst, intptr_t stride)
{
    const int shift_1st = 3 + X265_DEPTH - 8;
    const int shift_2nd = 10;

    ALIGN_VAR_32(int16_t, coef[16 * 16]);
    ALIGN_VAR_32(int16_t, block[16 * 16]);

    for (int i = 0; i < 16; i++)
    {
        memcpy(&block[i * 16], &src[i * stride], 16 * sizeof(int16_t));
    }

    partialButterfly16(block, coef, shift_1st, 16);
    partialButterfly16(coef, block, shift_2nd, 16);

#define N (16)
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            dst[i * N + j] = block[i * N + j];
        }
    }

#undef N
}
void dct32_c(int16_t *src, int32_t *dst, intptr_t stride)
{
    const int shift_1st = 4 + X265_DEPTH - 8;
    const int shift_2nd = 11;

    ALIGN_VAR_32(int16_t, coef[32 * 32]);
    ALIGN_VAR_32(int16_t, block[32 * 32]);

    for (int i = 0; i < 32; i++)
    {
        memcpy(&block[i * 32], &src[i * stride], 32 * sizeof(int16_t));
    }

    partialButterfly32(block, coef, shift_1st, 32);
    partialButterfly32(coef, block, shift_2nd, 32);

#define N (32)
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            dst[i * N + j] = block[i * N + j];
        }
    }

#undef N
}
void idst4_c(int32_t *src, int16_t *dst, intptr_t stride)
{
    const int shift_1st = 7;
    const int shift_2nd = 12 - (X265_DEPTH - 8);

    ALIGN_VAR_32(int16_t, coef[4 * 4]);
    ALIGN_VAR_32(int16_t, block[4 * 4]);

#define N (4)
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            block[i * N + j] = (int16_t)src[i * N + j];
        }
    }

#undef N

    inversedst(block, coef, shift_1st); // inverse DST by fast algorithm, block input, coef output
    inversedst(coef, block, shift_2nd); // inverse DST by fast algorithm, coef input, block output

    for (int i = 0; i < 4; i++)
    {
        memcpy(&dst[i * stride], &block[i * 4], 4 * sizeof(int16_t));
    }
}
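
/* Shift bookkeeping for the inverse transforms: per the HEVC spec the first
 * inverse pass always shifts by 7 and the second by 20 - bitDepth, i.e.
 * 12 - (X265_DEPTH - 8), independent of the transform size. */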
void idct4_c(int32_t *src, int16_t *dst, intptr_t stride)
{
    const int shift_1st = 7;
    const int shift_2nd = 12 - (X265_DEPTH - 8);

    ALIGN_VAR_32(int16_t, coef[4 * 4]);
    ALIGN_VAR_32(int16_t, block[4 * 4]);

#define N (4)
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            block[i * N + j] = (int16_t)src[i * N + j];
        }
    }

#undef N

    partialButterflyInverse4(block, coef, shift_1st, 4); // inverse DCT by partial butterfly, block input, coef output
    partialButterflyInverse4(coef, block, shift_2nd, 4); // inverse DCT by partial butterfly, coef input, block output

    for (int i = 0; i < 4; i++)
    {
        memcpy(&dst[i * stride], &block[i * 4], 4 * sizeof(int16_t));
    }
}
void idct8_c(int32_t *src, int16_t *dst, intptr_t stride)
{
    const int shift_1st = 7;
    const int shift_2nd = 12 - (X265_DEPTH - 8);

    ALIGN_VAR_32(int16_t, coef[8 * 8]);
    ALIGN_VAR_32(int16_t, block[8 * 8]);

#define N (8)
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            block[i * N + j] = (int16_t)src[i * N + j];
        }
    }

#undef N

    partialButterflyInverse8(block, coef, shift_1st, 8);
    partialButterflyInverse8(coef, block, shift_2nd, 8);
    for (int i = 0; i < 8; i++)
    {
        memcpy(&dst[i * stride], &block[i * 8], 8 * sizeof(int16_t));
    }
}
void idct16_c(int32_t *src, int16_t *dst, intptr_t stride)
{
    const int shift_1st = 7;
    const int shift_2nd = 12 - (X265_DEPTH - 8);

    ALIGN_VAR_32(int16_t, coef[16 * 16]);
    ALIGN_VAR_32(int16_t, block[16 * 16]);

#define N (16)
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            block[i * N + j] = (int16_t)src[i * N + j];
        }
    }

#undef N

    partialButterflyInverse16(block, coef, shift_1st, 16);
    partialButterflyInverse16(coef, block, shift_2nd, 16);
    for (int i = 0; i < 16; i++)
    {
        memcpy(&dst[i * stride], &block[i * 16], 16 * sizeof(int16_t));
    }
}
void idct32_c(int32_t *src, int16_t *dst, intptr_t stride)
{
    const int shift_1st = 7;
    const int shift_2nd = 12 - (X265_DEPTH - 8);

    ALIGN_VAR_32(int16_t, coef[32 * 32]);
    ALIGN_VAR_32(int16_t, block[32 * 32]);

#define N (32)
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            block[i * N + j] = (int16_t)src[i * N + j];
        }
    }

#undef N

    partialButterflyInverse32(block, coef, shift_1st, 32);
    partialButterflyInverse32(coef, block, shift_2nd, 32);

    for (int i = 0; i < 32; i++)
    {
        memcpy(&dst[i * stride], &block[i * 32], 32 * sizeof(int16_t));
    }
}
void dequant_normal_c(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift)
{
#if HIGH_BIT_DEPTH
    X265_CHECK(scale < 32768 || ((scale & 3) == 0 && shift > 2), "dequant invalid scale %d\n", scale);
#else
    // NOTE: maximum of scale is (72 * 256)
    X265_CHECK(scale < 32768, "dequant invalid scale %d\n", scale);
#endif
    X265_CHECK(num <= 32 * 32, "dequant num %d too large\n", num);
    X265_CHECK((num % 8) == 0, "dequant num %d not multiple of 8\n", num);
    X265_CHECK(shift <= 10, "shift too large %d\n", shift);
    X265_CHECK(((intptr_t)coef & 31) == 0, "dequant coef buffer not aligned\n");

    int add, coeffQ;

    add = 1 << (shift - 1);

    for (int n = 0; n < num; n++)
    {
        coeffQ = (quantCoef[n] * scale + add) >> shift;
        coef[n] = Clip3(-32768, 32767, coeffQ);
    }
}
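
/* Overflow note: the scale < 32768 guarantee keeps quantCoef[n] * scale within
 * the int32_t range (|quantCoef| <= 32767, so the product stays below 2^30),
 * letting the multiply-add-shift above run in plain int arithmetic before the
 * final clip to the int16_t coefficient range. */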
void dequant_scaling_c(const int16_t* quantCoef, const int32_t *deQuantCoef, int32_t* coef, int num, int per, int shift)
{
    X265_CHECK(num <= 32 * 32, "dequant num %d too large\n", num);

    int add, coeffQ;

    shift += 4;

    if (shift > per)
    {
        add = 1 << (shift - per - 1);

        for (int n = 0; n < num; n++)
        {
            coeffQ = ((quantCoef[n] * deQuantCoef[n]) + add) >> (shift - per);
            coef[n] = Clip3(-32768, 32767, coeffQ);
        }
    }
    else
    {
        for (int n = 0; n < num; n++)
        {
            coeffQ = Clip3(-32768, 32767, quantCoef[n] * deQuantCoef[n]);
            coef[n] = Clip3(-32768, 32767, coeffQ << (per - shift));
        }
    }
}
uint32_t quant_c(int32_t* coef, int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)
{
    X265_CHECK(qBits >= 8, "qBits less than 8\n");
    X265_CHECK((numCoeff % 16) == 0, "numCoeff must be multiple of 16\n");
    int qBits8 = qBits - 8;
    uint32_t numSig = 0;

    for (int blockpos = 0; blockpos < numCoeff; blockpos++)
    {
        int level = coef[blockpos];
        int sign = (level < 0 ? -1 : 1);

        int tmplevel = abs(level) * quantCoeff[blockpos];
        level = ((tmplevel + add) >> qBits);
        deltaU[blockpos] = ((tmplevel - (level << qBits)) >> qBits8);

        if (level)
            ++numSig;

        level *= sign;
        qCoef[blockpos] = (int16_t)Clip3(-32768, 32767, level);
    }

    return numSig;
}
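
/* Note: deltaU[] records each coefficient's rounding residue (the part of
 * tmplevel discarded by the >> qBits), rescaled by qBits8; the caller can use
 * it to pick which coefficient to adjust when sign-bit hiding is enabled. */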
uint32_t nquant_c(int32_t* coef, int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff)
{
    X265_CHECK((numCoeff % 16) == 0, "number of quant coeff is not multiple of 4x4\n");
    X265_CHECK((uint32_t)add < ((uint32_t)1 << qBits), "2 ^ qBits less than add\n");
    X265_CHECK(((intptr_t)quantCoeff & 31) == 0, "quantCoeff buffer not aligned\n");

    uint32_t numSig = 0;

    for (int blockpos = 0; blockpos < numCoeff; blockpos++)
    {
        int level = coef[blockpos];
        int sign = (level < 0 ? -1 : 1);

        int tmplevel = abs(level) * quantCoeff[blockpos];
        level = ((tmplevel + add) >> qBits);

        if (level)
            ++numSig;

        level *= sign;
        qCoef[blockpos] = (int16_t)Clip3(-32768, 32767, level);
    }

    return numSig;
}
int count_nonzero_c(const int16_t *quantCoeff, int numCoeff)
{
    X265_CHECK(((intptr_t)quantCoeff & 15) == 0, "quant buffer not aligned\n");
    X265_CHECK(numCoeff > 0 && (numCoeff & 15) == 0, "numCoeff invalid %d\n", numCoeff);

    int count = 0;

    for (int i = 0; i < numCoeff; i++)
    {
        count += quantCoeff[i] != 0;
    }

    return count;
}
template<int trSize>
uint32_t copy_count(int16_t* coeff, int16_t* residual, intptr_t stride)
{
    uint32_t numSig = 0;

    for (int k = 0; k < trSize; k++)
    {
        for (int j = 0; j < trSize; j++)
        {
            coeff[k * trSize + j] = residual[k * stride + j];
            numSig += (residual[k * stride + j] != 0);
        }
    }

    return numSig;
}
void denoiseDct_c(int32_t* dctCoef, uint32_t* resSum, uint16_t* offset, int numCoeff)
{
    for (int i = 0; i < numCoeff; i++)
    {
        int level = dctCoef[i];
        int sign = level >> 31;            // 0 if level >= 0, -1 otherwise (arithmetic shift)
        level = (level + sign) ^ sign;     // branchless abs(level)
        resSum[i] += level;
        level -= offset[i];
        dctCoef[i] = level < 0 ? 0 : (level ^ sign) - sign; // clamp at zero, restore the sign
    }
}
}  // closing - anonymous file-static namespace

namespace x265 {
// x265 private namespace

void Setup_C_DCTPrimitives(EncoderPrimitives& p)
{
    p.dequant_scaling = dequant_scaling_c;
    p.dequant_normal = dequant_normal_c;
    p.quant = quant_c;
    p.nquant = nquant_c;
    p.dct[DST_4x4] = dst4_c;
    p.dct[DCT_4x4] = dct4_c;
    p.dct[DCT_8x8] = dct8_c;
    p.dct[DCT_16x16] = dct16_c;
    p.dct[DCT_32x32] = dct32_c;
    p.idct[IDST_4x4] = idst4_c;
    p.idct[IDCT_4x4] = idct4_c;
    p.idct[IDCT_8x8] = idct8_c;
    p.idct[IDCT_16x16] = idct16_c;
    p.idct[IDCT_32x32] = idct32_c;
    p.count_nonzero = count_nonzero_c;
    p.denoiseDct = denoiseDct_c;

    p.copy_cnt[BLOCK_4x4] = copy_count<4>;
    p.copy_cnt[BLOCK_8x8] = copy_count<8>;
    p.copy_cnt[BLOCK_16x16] = copy_count<16>;
    p.copy_cnt[BLOCK_32x32] = copy_count<32>;
}
}
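
/* Setup_C_DCTPrimitives() installs the portable C implementations; where
 * platform-specific assembly versions are available, the encoder's primitive
 * setup is expected to overwrite the corresponding function pointers after
 * this runs. */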