Commit | Line | Data |
---|---|---|
2ba45a60 DM |
1 | /* |
2 | * RV40 decoder motion compensation functions | |
3 | * Copyright (c) 2008 Konstantin Shishkov | |
4 | * | |
5 | * This file is part of FFmpeg. | |
6 | * | |
7 | * FFmpeg is free software; you can redistribute it and/or | |
8 | * modify it under the terms of the GNU Lesser General Public | |
9 | * License as published by the Free Software Foundation; either | |
10 | * version 2.1 of the License, or (at your option) any later version. | |
11 | * | |
12 | * FFmpeg is distributed in the hope that it will be useful, | |
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 | * Lesser General Public License for more details. | |
16 | * | |
17 | * You should have received a copy of the GNU Lesser General Public | |
18 | * License along with FFmpeg; if not, write to the Free Software | |
19 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
20 | */ | |
21 | ||
22 | /** | |
23 | * @file | |
24 | * RV40 decoder motion compensation functions | |
25 | */ | |
26 | ||
27 | #include "libavutil/common.h" | |
28 | #include "libavutil/intreadwrite.h" | |
29 | #include "avcodec.h" | |
30 | #include "h264qpel.h" | |
31 | #include "mathops.h" | |
32 | #include "pixels.h" | |
33 | #include "rnd_avg.h" | |
34 | #include "rv34dsp.h" | |
35 | #include "libavutil/avassert.h" | |
36 | ||
/**
 * Instantiate the RV40 six-tap interpolation filters for one store operator.
 *
 * Every output sample is computed as
 *   (s[-2] + s[3] - 5 * (s[-1] + s[2]) + C1 * s[0] + C2 * s[1]
 *    + (1 << (SHIFT - 1))) >> SHIFT
 * and handed to OP, which stores it through the local crop table "cm".
 *
 * OPNAME is the prefix of the generated function names; OP is a
 * two-argument store macro (destination lvalue, unclipped value).
 */
#define RV40_LOWPASS(OPNAME, OP) \
/* Horizontal pass: h rows, 8 outputs per row, taps along the row. */\
static void OPNAME ## rv40_qpel8_h_lowpass(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride,\
                                           const int h, const int C1, const int C2, const int SHIFT){\
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;\
    int row, x;\
    for (row = 0; row < h; row++, dst += dstStride, src += srcStride) {\
        for (x = 0; x < 8; x++) {\
            OP(dst[x], (src[x - 2] + src[x + 3] - 5*(src[x - 1] + src[x + 2]) + src[x]*C1 + src[x + 1]*C2 + (1<<(SHIFT-1))) >> SHIFT);\
        }\
    }\
}\
\
/* Vertical pass: w columns, 8 outputs per column, taps down the column. */\
static void OPNAME ## rv40_qpel8_v_lowpass(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride,\
                                           const int w, const int C1, const int C2, const int SHIFT){\
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;\
    int col, k;\
    for (col = 0; col < w; col++, dst++, src++) {\
        /* tap[k] is the sample (k - 2) rows from the first output row; */\
        /* all 13 are fetched before any output of this column is stored. */\
        int tap[13];\
        for (k = 0; k < 13; k++)\
            tap[k] = src[(k - 2) * srcStride];\
        for (k = 0; k < 8; k++) {\
            OP(dst[k * dstStride], (tap[k] + tap[k + 5] - 5*(tap[k + 1] + tap[k + 4]) + tap[k + 2]*C1 + tap[k + 3]*C2 + (1<<(SHIFT-1))) >> SHIFT);\
        }\
    }\
}\
\
/* 16x16 vertical pass built from four 8-wide calls (w - 8 columns in the lower half). */\
static void OPNAME ## rv40_qpel16_v_lowpass(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride,\
                                            const int w, const int C1, const int C2, const int SHIFT){\
    uint8_t       *dst_low = dst + 8*dstStride;\
    const uint8_t *src_low = src + 8*srcStride;\
    OPNAME ## rv40_qpel8_v_lowpass(dst        , src        , dstStride, srcStride, 8,   C1, C2, SHIFT);\
    OPNAME ## rv40_qpel8_v_lowpass(dst     + 8, src     + 8, dstStride, srcStride, 8,   C1, C2, SHIFT);\
    OPNAME ## rv40_qpel8_v_lowpass(dst_low    , src_low    , dstStride, srcStride, w-8, C1, C2, SHIFT);\
    OPNAME ## rv40_qpel8_v_lowpass(dst_low + 8, src_low + 8, dstStride, srcStride, w-8, C1, C2, SHIFT);\
}\
\
/* 16-wide horizontal pass: 8 rows on top, h - 8 rows below, left and right halves each. */\
static void OPNAME ## rv40_qpel16_h_lowpass(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride,\
                                            const int h, const int C1, const int C2, const int SHIFT){\
    uint8_t       *dst_low = dst + 8*dstStride;\
    const uint8_t *src_low = src + 8*srcStride;\
    OPNAME ## rv40_qpel8_h_lowpass(dst        , src        , dstStride, srcStride, 8,   C1, C2, SHIFT);\
    OPNAME ## rv40_qpel8_h_lowpass(dst     + 8, src     + 8, dstStride, srcStride, 8,   C1, C2, SHIFT);\
    OPNAME ## rv40_qpel8_h_lowpass(dst_low    , src_low    , dstStride, srcStride, h-8, C1, C2, SHIFT);\
    OPNAME ## rv40_qpel8_h_lowpass(dst_low + 8, src_low + 8, dstStride, srcStride, h-8, C1, C2, SHIFT);\
}
109 | ||
/* One-pass horizontal position: filter straight from src into dst. */
#define RV40_MC_H(OPNAME, SIZE, XY, C1, C2, SHIFT) \
static void OPNAME ## rv40_qpel ## SIZE ## _mc ## XY ## _c(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    OPNAME ## rv40_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride, SIZE, C1, C2, SHIFT);\
}

/* One-pass vertical position: filter straight from src into dst. */
#define RV40_MC_V(OPNAME, SIZE, XY, C1, C2, SHIFT) \
static void OPNAME ## rv40_qpel ## SIZE ## _mc ## XY ## _c(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    OPNAME ## rv40_qpel ## SIZE ## _v_lowpass(dst, src, stride, stride, SIZE, C1, C2, SHIFT);\
}

/*
 * Two-pass position: a "put" horizontal pass writes SIZE + 5 rows (starting
 * two rows above the block) into a stack buffer of row length SIZE, then the
 * vertical pass reads from the third buffer row and stores into dst.
 */
#define RV40_MC_HV(OPNAME, SIZE, XY, HC1, HC2, HSHIFT, VC1, VC2, VSHIFT) \
static void OPNAME ## rv40_qpel ## SIZE ## _mc ## XY ## _c(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    uint8_t tmp[SIZE*(SIZE+5)];\
    uint8_t * const tmp_mid = tmp + SIZE*2;\
    put_rv40_qpel ## SIZE ## _h_lowpass(tmp, src - 2*stride, SIZE, stride, SIZE+5, HC1, HC2, HSHIFT);\
    OPNAME ## rv40_qpel ## SIZE ## _v_lowpass(dst, tmp_mid, stride, SIZE, SIZE, VC1, VC2, VSHIFT);\
}

/**
 * Generate the OPNAME##rv40_qpel##SIZE##_mcXY_c functions for the sub-pel
 * positions XY = 10, 30, 01, 11, 21, 31, 12, 22, 32, 03, 13 and 23.
 * The coefficient triples are (C1, C2, SHIFT) for the six-tap filter:
 * (52, 20, 6), (20, 20, 5) or (20, 52, 6) depending on the position.
 */
#define RV40_MC(OPNAME, SIZE) \
RV40_MC_H (OPNAME, SIZE, 10, 52, 20, 6)\
RV40_MC_H (OPNAME, SIZE, 30, 20, 52, 6)\
RV40_MC_V (OPNAME, SIZE, 01, 52, 20, 6)\
RV40_MC_HV(OPNAME, SIZE, 11, 52, 20, 6, 52, 20, 6)\
RV40_MC_HV(OPNAME, SIZE, 21, 20, 20, 5, 52, 20, 6)\
RV40_MC_HV(OPNAME, SIZE, 31, 20, 52, 6, 52, 20, 6)\
RV40_MC_HV(OPNAME, SIZE, 12, 52, 20, 6, 20, 20, 5)\
RV40_MC_HV(OPNAME, SIZE, 22, 20, 20, 5, 20, 20, 5)\
RV40_MC_HV(OPNAME, SIZE, 32, 20, 52, 6, 20, 20, 5)\
RV40_MC_V (OPNAME, SIZE, 03, 20, 52, 6)\
RV40_MC_HV(OPNAME, SIZE, 13, 52, 20, 6, 20, 52, 6)\
RV40_MC_HV(OPNAME, SIZE, 23, 20, 20, 5, 20, 52, 6)
195 | ||
196 | #define op_avg(a, b) a = (((a)+cm[b]+1)>>1) | |
197 | #define op_put(a, b) a = cm[b] | |
198 | ||
199 | RV40_LOWPASS(put_ , op_put) | |
200 | RV40_LOWPASS(avg_ , op_avg) | |
201 | ||
202 | #undef op_avg | |
203 | #undef op_put | |
204 | ||
205 | RV40_MC(put_, 8) | |
206 | RV40_MC(put_, 16) | |
207 | RV40_MC(avg_, 8) | |
208 | RV40_MC(avg_, 16) | |
209 | ||
210 | #define PIXOP2(OPNAME, OP) \ | |
211 | static inline void OPNAME ## _pixels8_xy2_8_c(uint8_t *block, \ | |
212 | const uint8_t *pixels, \ | |
213 | ptrdiff_t line_size, \ | |
214 | int h) \ | |
215 | { \ | |
216 | /* FIXME HIGH BIT DEPTH */ \ | |
217 | int j; \ | |
218 | \ | |
219 | for (j = 0; j < 2; j++) { \ | |
220 | int i; \ | |
221 | const uint32_t a = AV_RN32(pixels); \ | |
222 | const uint32_t b = AV_RN32(pixels + 1); \ | |
223 | uint32_t l0 = (a & 0x03030303UL) + \ | |
224 | (b & 0x03030303UL) + \ | |
225 | 0x02020202UL; \ | |
226 | uint32_t h0 = ((a & 0xFCFCFCFCUL) >> 2) + \ | |
227 | ((b & 0xFCFCFCFCUL) >> 2); \ | |
228 | uint32_t l1, h1; \ | |
229 | \ | |
230 | pixels += line_size; \ | |
231 | for (i = 0; i < h; i += 2) { \ | |
232 | uint32_t a = AV_RN32(pixels); \ | |
233 | uint32_t b = AV_RN32(pixels + 1); \ | |
234 | l1 = (a & 0x03030303UL) + \ | |
235 | (b & 0x03030303UL); \ | |
236 | h1 = ((a & 0xFCFCFCFCUL) >> 2) + \ | |
237 | ((b & 0xFCFCFCFCUL) >> 2); \ | |
238 | OP(*((uint32_t *) block), \ | |
239 | h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL)); \ | |
240 | pixels += line_size; \ | |
241 | block += line_size; \ | |
242 | a = AV_RN32(pixels); \ | |
243 | b = AV_RN32(pixels + 1); \ | |
244 | l0 = (a & 0x03030303UL) + \ | |
245 | (b & 0x03030303UL) + \ | |
246 | 0x02020202UL; \ | |
247 | h0 = ((a & 0xFCFCFCFCUL) >> 2) + \ | |
248 | ((b & 0xFCFCFCFCUL) >> 2); \ | |
249 | OP(*((uint32_t *) block), \ | |
250 | h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL)); \ | |
251 | pixels += line_size; \ | |
252 | block += line_size; \ | |
253 | } \ | |
254 | pixels += 4 - line_size * (h + 1); \ | |
255 | block += 4 - line_size * h; \ | |
256 | } \ | |
257 | } \ | |
258 | \ | |
259 | CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_8_c, \ | |
260 | OPNAME ## _pixels8_xy2_8_c, \ | |
261 | 8) \ | |
262 | ||
263 | #define op_avg(a, b) a = rnd_avg32(a, b) | |
264 | #define op_put(a, b) a = b | |
265 | PIXOP2(avg, op_avg) | |
266 | PIXOP2(put, op_put) | |
267 | #undef op_avg | |
268 | #undef op_put | |
269 | ||
/*
 * The mc33 position does not use the six-tap filter: each entry point simply
 * forwards to the matching <op>_pixels<SIZE>_xy2_8_c helper with h = SIZE.
 */
#define RV40_MC33(OP, SIZE)                                                   \
static void OP ## _rv40_qpel ## SIZE ## _mc33_c(uint8_t *dst,                 \
                                                const uint8_t *src,           \
                                                ptrdiff_t stride)             \
{                                                                             \
    OP ## _pixels ## SIZE ## _xy2_8_c(dst, src, stride, SIZE);                \
}

RV40_MC33(put, 16)
RV40_MC33(avg, 16)
RV40_MC33(put, 8)
RV40_MC33(avg, 8)

#undef RV40_MC33
286 | ||
/**
 * Rounding bias for chroma motion compensation, looked up as
 * rv40_bias[y >> 1][x >> 1] and added to the weighted sum before the
 * store operator shifts it right by 6.
 */
static const int rv40_bias[4][4] = {
    [0] = {  0, 16, 32, 16 },
    [1] = { 32, 28, 32, 28 },
    [2] = {  0, 32, 16, 32 },
    [3] = { 32, 28, 32, 28 },
};
293 | ||
294 | #define RV40_CHROMA_MC(OPNAME, OP)\ | |
295 | static void OPNAME ## rv40_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\ | |
296 | const int A = (8-x) * (8-y);\ | |
297 | const int B = ( x) * (8-y);\ | |
298 | const int C = (8-x) * ( y);\ | |
299 | const int D = ( x) * ( y);\ | |
300 | int i;\ | |
301 | int bias = rv40_bias[y>>1][x>>1];\ | |
302 | \ | |
303 | av_assert2(x<8 && y<8 && x>=0 && y>=0);\ | |
304 | \ | |
305 | if(D){\ | |
306 | for(i = 0; i < h; i++){\ | |
307 | OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + bias));\ | |
308 | OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + bias));\ | |
309 | OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + bias));\ | |
310 | OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + bias));\ | |
311 | dst += stride;\ | |
312 | src += stride;\ | |
313 | }\ | |
314 | }else{\ | |
315 | const int E = B + C;\ | |
316 | const int step = C ? stride : 1;\ | |
317 | for(i = 0; i < h; i++){\ | |
318 | OP(dst[0], (A*src[0] + E*src[step+0] + bias));\ | |
319 | OP(dst[1], (A*src[1] + E*src[step+1] + bias));\ | |
320 | OP(dst[2], (A*src[2] + E*src[step+2] + bias));\ | |
321 | OP(dst[3], (A*src[3] + E*src[step+3] + bias));\ | |
322 | dst += stride;\ | |
323 | src += stride;\ | |
324 | }\ | |
325 | }\ | |
326 | }\ | |
327 | \ | |
328 | static void OPNAME ## rv40_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\ | |
329 | const int A = (8-x) * (8-y);\ | |
330 | const int B = ( x) * (8-y);\ | |
331 | const int C = (8-x) * ( y);\ | |
332 | const int D = ( x) * ( y);\ | |
333 | int i;\ | |
334 | int bias = rv40_bias[y>>1][x>>1];\ | |
335 | \ | |
336 | av_assert2(x<8 && y<8 && x>=0 && y>=0);\ | |
337 | \ | |
338 | if(D){\ | |
339 | for(i = 0; i < h; i++){\ | |
340 | OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + bias));\ | |
341 | OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + bias));\ | |
342 | OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + bias));\ | |
343 | OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + bias));\ | |
344 | OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + bias));\ | |
345 | OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + bias));\ | |
346 | OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + bias));\ | |
347 | OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + bias));\ | |
348 | dst += stride;\ | |
349 | src += stride;\ | |
350 | }\ | |
351 | }else{\ | |
352 | const int E = B + C;\ | |
353 | const int step = C ? stride : 1;\ | |
354 | for(i = 0; i < h; i++){\ | |
355 | OP(dst[0], (A*src[0] + E*src[step+0] + bias));\ | |
356 | OP(dst[1], (A*src[1] + E*src[step+1] + bias));\ | |
357 | OP(dst[2], (A*src[2] + E*src[step+2] + bias));\ | |
358 | OP(dst[3], (A*src[3] + E*src[step+3] + bias));\ | |
359 | OP(dst[4], (A*src[4] + E*src[step+4] + bias));\ | |
360 | OP(dst[5], (A*src[5] + E*src[step+5] + bias));\ | |
361 | OP(dst[6], (A*src[6] + E*src[step+6] + bias));\ | |
362 | OP(dst[7], (A*src[7] + E*src[step+7] + bias));\ | |
363 | dst += stride;\ | |
364 | src += stride;\ | |
365 | }\ | |
366 | }\ | |
367 | } | |
368 | ||
369 | #define op_avg(a, b) a = (((a)+((b)>>6)+1)>>1) | |
370 | #define op_put(a, b) a = ((b)>>6) | |
371 | ||
372 | RV40_CHROMA_MC(put_, op_put) | |
373 | RV40_CHROMA_MC(avg_, op_avg) | |
374 | ||
/**
 * Generate the two weighted-prediction functions for a size x size block.
 *
 * Both blend src1 (scaled by w2) with src2 (scaled by w1), add 0x10 and
 * shift right by 5. The "rnd" variant first shifts each product right by 9;
 * the "nornd" variant sums the raw products.
 * dst, src1 and src2 all advance by stride per row.
 */
#define RV40_WEIGHT_FUNC(size) \
static void rv40_weight_func_rnd_ ## size (uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, ptrdiff_t stride)\
{\
    int row, col;\
\
    for (row = 0; row < size; row++, src1 += stride, src2 += stride, dst += stride) {\
        for (col = 0; col < size; col++) {\
            const int part1 = (w2 * src1[col]) >> 9;\
            const int part2 = (w1 * src2[col]) >> 9;\
            dst[col] = (part1 + part2 + 0x10) >> 5;\
        }\
    }\
}\
static void rv40_weight_func_nornd_ ## size (uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, ptrdiff_t stride)\
{\
    int row, col;\
\
    for (row = 0; row < size; row++, src1 += stride, src2 += stride, dst += stride) {\
        for (col = 0; col < size; col++) {\
            const int blend = w2 * src1[col] + w1 * src2[col];\
            dst[col] = (blend + 0x10) >> 5;\
        }\
    }\
}

RV40_WEIGHT_FUNC(16)
RV40_WEIGHT_FUNC(8)
403 | ||
/**
 * Dither values for the deblocking filter, left/top side.
 * The strong filter reads entry [dmode + i] for line i of an edge.
 */
static const uint8_t rv40_dither_l[16] = {
    /*  0.. 3 */ 0x40, 0x50, 0x20, 0x60,
    /*  4.. 7 */ 0x30, 0x50, 0x40, 0x30,
    /*  8..11 */ 0x50, 0x40, 0x50, 0x30,
    /* 12..15 */ 0x60, 0x20, 0x50, 0x40,
};
411 | ||
/**
 * Dither values for the deblocking filter, right/bottom side.
 * The strong filter reads entry [dmode + i] for line i of an edge.
 */
static const uint8_t rv40_dither_r[16] = {
    /*  0.. 3 */ 0x40, 0x30, 0x60, 0x20,
    /*  4.. 7 */ 0x50, 0x30, 0x30, 0x40,
    /*  8..11 */ 0x40, 0x40, 0x50, 0x30,
    /* 12..15 */ 0x20, 0x60, 0x30, 0x40,
};
419 | ||
420 | #define CLIP_SYMM(a, b) av_clip(a, -(b), b) | |
421 | /** | |
422 | * weaker deblocking very similar to the one described in 4.4.2 of JVT-A003r1 | |
423 | */ | |
424 | static av_always_inline void rv40_weak_loop_filter(uint8_t *src, | |
425 | const int step, | |
426 | const ptrdiff_t stride, | |
427 | const int filter_p1, | |
428 | const int filter_q1, | |
429 | const int alpha, | |
430 | const int beta, | |
431 | const int lim_p0q0, | |
432 | const int lim_q1, | |
433 | const int lim_p1) | |
434 | { | |
435 | const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP; | |
436 | int i, t, u, diff; | |
437 | ||
438 | for (i = 0; i < 4; i++, src += stride) { | |
439 | int diff_p1p0 = src[-2*step] - src[-1*step]; | |
440 | int diff_q1q0 = src[ 1*step] - src[ 0*step]; | |
441 | int diff_p1p2 = src[-2*step] - src[-3*step]; | |
442 | int diff_q1q2 = src[ 1*step] - src[ 2*step]; | |
443 | ||
444 | t = src[0*step] - src[-1*step]; | |
445 | if (!t) | |
446 | continue; | |
447 | ||
448 | u = (alpha * FFABS(t)) >> 7; | |
449 | if (u > 3 - (filter_p1 && filter_q1)) | |
450 | continue; | |
451 | ||
452 | t <<= 2; | |
453 | if (filter_p1 && filter_q1) | |
454 | t += src[-2*step] - src[1*step]; | |
455 | ||
456 | diff = CLIP_SYMM((t + 4) >> 3, lim_p0q0); | |
457 | src[-1*step] = cm[src[-1*step] + diff]; | |
458 | src[ 0*step] = cm[src[ 0*step] - diff]; | |
459 | ||
460 | if (filter_p1 && FFABS(diff_p1p2) <= beta) { | |
461 | t = (diff_p1p0 + diff_p1p2 - diff) >> 1; | |
462 | src[-2*step] = cm[src[-2*step] - CLIP_SYMM(t, lim_p1)]; | |
463 | } | |
464 | ||
465 | if (filter_q1 && FFABS(diff_q1q2) <= beta) { | |
466 | t = (diff_q1q0 + diff_q1q2 + diff) >> 1; | |
467 | src[ 1*step] = cm[src[ 1*step] - CLIP_SYMM(t, lim_q1)]; | |
468 | } | |
469 | } | |
470 | } | |
471 | ||
/**
 * Weak loop filter entry point that uses stride as the distance between the
 * samples across the edge and 1 as the distance between filtered lines.
 */
static void rv40_h_weak_loop_filter(uint8_t *src, const ptrdiff_t stride,
                                    const int filter_p1, const int filter_q1,
                                    const int alpha, const int beta,
                                    const int lim_p0q0, const int lim_q1,
                                    const int lim_p1)
{
    const ptrdiff_t across_edge = stride;
    const ptrdiff_t along_edge  = 1;

    rv40_weak_loop_filter(src, across_edge, along_edge,
                          filter_p1, filter_q1, alpha, beta,
                          lim_p0q0, lim_q1, lim_p1);
}
481 | ||
/**
 * Weak loop filter entry point that uses 1 as the distance between the
 * samples across the edge and stride as the distance between filtered lines.
 */
static void rv40_v_weak_loop_filter(uint8_t *src, const ptrdiff_t stride,
                                    const int filter_p1, const int filter_q1,
                                    const int alpha, const int beta,
                                    const int lim_p0q0, const int lim_q1,
                                    const int lim_p1)
{
    const ptrdiff_t across_edge = 1;
    const ptrdiff_t along_edge  = stride;

    rv40_weak_loop_filter(src, across_edge, along_edge,
                          filter_p1, filter_q1, alpha, beta,
                          lim_p0q0, lim_q1, lim_p1);
}
491 | ||
492 | static av_always_inline void rv40_strong_loop_filter(uint8_t *src, | |
493 | const int step, | |
494 | const ptrdiff_t stride, | |
495 | const int alpha, | |
496 | const int lims, | |
497 | const int dmode, | |
498 | const int chroma) | |
499 | { | |
500 | int i; | |
501 | ||
502 | for(i = 0; i < 4; i++, src += stride){ | |
503 | int sflag, p0, q0, p1, q1; | |
504 | int t = src[0*step] - src[-1*step]; | |
505 | ||
506 | if (!t) | |
507 | continue; | |
508 | ||
509 | sflag = (alpha * FFABS(t)) >> 7; | |
510 | if (sflag > 1) | |
511 | continue; | |
512 | ||
513 | p0 = (25*src[-3*step] + 26*src[-2*step] + 26*src[-1*step] + | |
514 | 26*src[ 0*step] + 25*src[ 1*step] + | |
515 | rv40_dither_l[dmode + i]) >> 7; | |
516 | ||
517 | q0 = (25*src[-2*step] + 26*src[-1*step] + 26*src[ 0*step] + | |
518 | 26*src[ 1*step] + 25*src[ 2*step] + | |
519 | rv40_dither_r[dmode + i]) >> 7; | |
520 | ||
521 | if (sflag) { | |
522 | p0 = av_clip(p0, src[-1*step] - lims, src[-1*step] + lims); | |
523 | q0 = av_clip(q0, src[ 0*step] - lims, src[ 0*step] + lims); | |
524 | } | |
525 | ||
526 | p1 = (25*src[-4*step] + 26*src[-3*step] + 26*src[-2*step] + 26*p0 + | |
527 | 25*src[ 0*step] + rv40_dither_l[dmode + i]) >> 7; | |
528 | q1 = (25*src[-1*step] + 26*q0 + 26*src[ 1*step] + 26*src[ 2*step] + | |
529 | 25*src[ 3*step] + rv40_dither_r[dmode + i]) >> 7; | |
530 | ||
531 | if (sflag) { | |
532 | p1 = av_clip(p1, src[-2*step] - lims, src[-2*step] + lims); | |
533 | q1 = av_clip(q1, src[ 1*step] - lims, src[ 1*step] + lims); | |
534 | } | |
535 | ||
536 | src[-2*step] = p1; | |
537 | src[-1*step] = p0; | |
538 | src[ 0*step] = q0; | |
539 | src[ 1*step] = q1; | |
540 | ||
541 | if(!chroma){ | |
542 | src[-3*step] = (25*src[-1*step] + 26*src[-2*step] + | |
543 | 51*src[-3*step] + 26*src[-4*step] + 64) >> 7; | |
544 | src[ 2*step] = (25*src[ 0*step] + 26*src[ 1*step] + | |
545 | 51*src[ 2*step] + 26*src[ 3*step] + 64) >> 7; | |
546 | } | |
547 | } | |
548 | } | |
549 | ||
/**
 * Strong loop filter entry point that uses stride as the distance between
 * the samples across the edge and 1 as the distance between filtered lines.
 */
static void rv40_h_strong_loop_filter(uint8_t *src, const ptrdiff_t stride,
                                      const int alpha, const int lims,
                                      const int dmode, const int chroma)
{
    const ptrdiff_t across_edge = stride;
    const ptrdiff_t along_edge  = 1;

    rv40_strong_loop_filter(src, across_edge, along_edge,
                            alpha, lims, dmode, chroma);
}
556 | ||
/**
 * Strong loop filter entry point that uses 1 as the distance between the
 * samples across the edge and stride as the distance between filtered lines.
 */
static void rv40_v_strong_loop_filter(uint8_t *src, const ptrdiff_t stride,
                                      const int alpha, const int lims,
                                      const int dmode, const int chroma)
{
    const ptrdiff_t across_edge = 1;
    const ptrdiff_t along_edge  = stride;

    rv40_strong_loop_filter(src, across_edge, along_edge,
                            alpha, lims, dmode, chroma);
}
563 | ||
564 | static av_always_inline int rv40_loop_filter_strength(uint8_t *src, | |
565 | int step, ptrdiff_t stride, | |
566 | int beta, int beta2, | |
567 | int edge, | |
568 | int *p1, int *q1) | |
569 | { | |
570 | int sum_p1p0 = 0, sum_q1q0 = 0, sum_p1p2 = 0, sum_q1q2 = 0; | |
571 | int strong0 = 0, strong1 = 0; | |
572 | uint8_t *ptr; | |
573 | int i; | |
574 | ||
575 | for (i = 0, ptr = src; i < 4; i++, ptr += stride) { | |
576 | sum_p1p0 += ptr[-2*step] - ptr[-1*step]; | |
577 | sum_q1q0 += ptr[ 1*step] - ptr[ 0*step]; | |
578 | } | |
579 | ||
580 | *p1 = FFABS(sum_p1p0) < (beta << 2); | |
581 | *q1 = FFABS(sum_q1q0) < (beta << 2); | |
582 | ||
583 | if(!*p1 && !*q1) | |
584 | return 0; | |
585 | ||
586 | if (!edge) | |
587 | return 0; | |
588 | ||
589 | for (i = 0, ptr = src; i < 4; i++, ptr += stride) { | |
590 | sum_p1p2 += ptr[-2*step] - ptr[-3*step]; | |
591 | sum_q1q2 += ptr[ 1*step] - ptr[ 2*step]; | |
592 | } | |
593 | ||
594 | strong0 = *p1 && (FFABS(sum_p1p2) < beta2); | |
595 | strong1 = *q1 && (FFABS(sum_q1q2) < beta2); | |
596 | ||
597 | return strong0 && strong1; | |
598 | } | |
599 | ||
/**
 * Edge strength entry point that uses stride as the distance between the
 * samples across the edge and 1 as the distance between examined lines.
 */
static int rv40_h_loop_filter_strength(uint8_t *src, ptrdiff_t stride,
                                       int beta, int beta2, int edge,
                                       int *p1, int *q1)
{
    const ptrdiff_t across_edge = stride;
    const ptrdiff_t along_edge  = 1;

    return rv40_loop_filter_strength(src, across_edge, along_edge,
                                     beta, beta2, edge, p1, q1);
}
606 | ||
/**
 * Edge strength entry point that uses 1 as the distance between the samples
 * across the edge and stride as the distance between examined lines.
 */
static int rv40_v_loop_filter_strength(uint8_t *src, ptrdiff_t stride,
                                       int beta, int beta2, int edge,
                                       int *p1, int *q1)
{
    const ptrdiff_t across_edge = 1;
    const ptrdiff_t along_edge  = stride;

    return rv40_loop_filter_strength(src, across_edge, along_edge,
                                     beta, beta2, edge, p1, q1);
}
613 | ||
614 | av_cold void ff_rv40dsp_init(RV34DSPContext *c) | |
615 | { | |
616 | H264QpelContext qpel; | |
617 | ||
618 | ff_rv34dsp_init(c); | |
619 | ff_h264qpel_init(&qpel, 8); | |
620 | ||
621 | c->put_pixels_tab[0][ 0] = qpel.put_h264_qpel_pixels_tab[0][0]; | |
622 | c->put_pixels_tab[0][ 1] = put_rv40_qpel16_mc10_c; | |
623 | c->put_pixels_tab[0][ 2] = qpel.put_h264_qpel_pixels_tab[0][2]; | |
624 | c->put_pixels_tab[0][ 3] = put_rv40_qpel16_mc30_c; | |
625 | c->put_pixels_tab[0][ 4] = put_rv40_qpel16_mc01_c; | |
626 | c->put_pixels_tab[0][ 5] = put_rv40_qpel16_mc11_c; | |
627 | c->put_pixels_tab[0][ 6] = put_rv40_qpel16_mc21_c; | |
628 | c->put_pixels_tab[0][ 7] = put_rv40_qpel16_mc31_c; | |
629 | c->put_pixels_tab[0][ 8] = qpel.put_h264_qpel_pixels_tab[0][8]; | |
630 | c->put_pixels_tab[0][ 9] = put_rv40_qpel16_mc12_c; | |
631 | c->put_pixels_tab[0][10] = put_rv40_qpel16_mc22_c; | |
632 | c->put_pixels_tab[0][11] = put_rv40_qpel16_mc32_c; | |
633 | c->put_pixels_tab[0][12] = put_rv40_qpel16_mc03_c; | |
634 | c->put_pixels_tab[0][13] = put_rv40_qpel16_mc13_c; | |
635 | c->put_pixels_tab[0][14] = put_rv40_qpel16_mc23_c; | |
636 | c->put_pixels_tab[0][15] = put_rv40_qpel16_mc33_c; | |
637 | c->avg_pixels_tab[0][ 0] = qpel.avg_h264_qpel_pixels_tab[0][0]; | |
638 | c->avg_pixels_tab[0][ 1] = avg_rv40_qpel16_mc10_c; | |
639 | c->avg_pixels_tab[0][ 2] = qpel.avg_h264_qpel_pixels_tab[0][2]; | |
640 | c->avg_pixels_tab[0][ 3] = avg_rv40_qpel16_mc30_c; | |
641 | c->avg_pixels_tab[0][ 4] = avg_rv40_qpel16_mc01_c; | |
642 | c->avg_pixels_tab[0][ 5] = avg_rv40_qpel16_mc11_c; | |
643 | c->avg_pixels_tab[0][ 6] = avg_rv40_qpel16_mc21_c; | |
644 | c->avg_pixels_tab[0][ 7] = avg_rv40_qpel16_mc31_c; | |
645 | c->avg_pixels_tab[0][ 8] = qpel.avg_h264_qpel_pixels_tab[0][8]; | |
646 | c->avg_pixels_tab[0][ 9] = avg_rv40_qpel16_mc12_c; | |
647 | c->avg_pixels_tab[0][10] = avg_rv40_qpel16_mc22_c; | |
648 | c->avg_pixels_tab[0][11] = avg_rv40_qpel16_mc32_c; | |
649 | c->avg_pixels_tab[0][12] = avg_rv40_qpel16_mc03_c; | |
650 | c->avg_pixels_tab[0][13] = avg_rv40_qpel16_mc13_c; | |
651 | c->avg_pixels_tab[0][14] = avg_rv40_qpel16_mc23_c; | |
652 | c->avg_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c; | |
653 | c->put_pixels_tab[1][ 0] = qpel.put_h264_qpel_pixels_tab[1][0]; | |
654 | c->put_pixels_tab[1][ 1] = put_rv40_qpel8_mc10_c; | |
655 | c->put_pixels_tab[1][ 2] = qpel.put_h264_qpel_pixels_tab[1][2]; | |
656 | c->put_pixels_tab[1][ 3] = put_rv40_qpel8_mc30_c; | |
657 | c->put_pixels_tab[1][ 4] = put_rv40_qpel8_mc01_c; | |
658 | c->put_pixels_tab[1][ 5] = put_rv40_qpel8_mc11_c; | |
659 | c->put_pixels_tab[1][ 6] = put_rv40_qpel8_mc21_c; | |
660 | c->put_pixels_tab[1][ 7] = put_rv40_qpel8_mc31_c; | |
661 | c->put_pixels_tab[1][ 8] = qpel.put_h264_qpel_pixels_tab[1][8]; | |
662 | c->put_pixels_tab[1][ 9] = put_rv40_qpel8_mc12_c; | |
663 | c->put_pixels_tab[1][10] = put_rv40_qpel8_mc22_c; | |
664 | c->put_pixels_tab[1][11] = put_rv40_qpel8_mc32_c; | |
665 | c->put_pixels_tab[1][12] = put_rv40_qpel8_mc03_c; | |
666 | c->put_pixels_tab[1][13] = put_rv40_qpel8_mc13_c; | |
667 | c->put_pixels_tab[1][14] = put_rv40_qpel8_mc23_c; | |
668 | c->put_pixels_tab[1][15] = put_rv40_qpel8_mc33_c; | |
669 | c->avg_pixels_tab[1][ 0] = qpel.avg_h264_qpel_pixels_tab[1][0]; | |
670 | c->avg_pixels_tab[1][ 1] = avg_rv40_qpel8_mc10_c; | |
671 | c->avg_pixels_tab[1][ 2] = qpel.avg_h264_qpel_pixels_tab[1][2]; | |
672 | c->avg_pixels_tab[1][ 3] = avg_rv40_qpel8_mc30_c; | |
673 | c->avg_pixels_tab[1][ 4] = avg_rv40_qpel8_mc01_c; | |
674 | c->avg_pixels_tab[1][ 5] = avg_rv40_qpel8_mc11_c; | |
675 | c->avg_pixels_tab[1][ 6] = avg_rv40_qpel8_mc21_c; | |
676 | c->avg_pixels_tab[1][ 7] = avg_rv40_qpel8_mc31_c; | |
677 | c->avg_pixels_tab[1][ 8] = qpel.avg_h264_qpel_pixels_tab[1][8]; | |
678 | c->avg_pixels_tab[1][ 9] = avg_rv40_qpel8_mc12_c; | |
679 | c->avg_pixels_tab[1][10] = avg_rv40_qpel8_mc22_c; | |
680 | c->avg_pixels_tab[1][11] = avg_rv40_qpel8_mc32_c; | |
681 | c->avg_pixels_tab[1][12] = avg_rv40_qpel8_mc03_c; | |
682 | c->avg_pixels_tab[1][13] = avg_rv40_qpel8_mc13_c; | |
683 | c->avg_pixels_tab[1][14] = avg_rv40_qpel8_mc23_c; | |
684 | c->avg_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c; | |
685 | ||
686 | c->put_chroma_pixels_tab[0] = put_rv40_chroma_mc8_c; | |
687 | c->put_chroma_pixels_tab[1] = put_rv40_chroma_mc4_c; | |
688 | c->avg_chroma_pixels_tab[0] = avg_rv40_chroma_mc8_c; | |
689 | c->avg_chroma_pixels_tab[1] = avg_rv40_chroma_mc4_c; | |
690 | ||
691 | c->rv40_weight_pixels_tab[0][0] = rv40_weight_func_rnd_16; | |
692 | c->rv40_weight_pixels_tab[0][1] = rv40_weight_func_rnd_8; | |
693 | c->rv40_weight_pixels_tab[1][0] = rv40_weight_func_nornd_16; | |
694 | c->rv40_weight_pixels_tab[1][1] = rv40_weight_func_nornd_8; | |
695 | ||
696 | c->rv40_weak_loop_filter[0] = rv40_h_weak_loop_filter; | |
697 | c->rv40_weak_loop_filter[1] = rv40_v_weak_loop_filter; | |
698 | c->rv40_strong_loop_filter[0] = rv40_h_strong_loop_filter; | |
699 | c->rv40_strong_loop_filter[1] = rv40_v_strong_loop_filter; | |
700 | c->rv40_loop_filter_strength[0] = rv40_h_loop_filter_strength; | |
701 | c->rv40_loop_filter_strength[1] = rv40_v_loop_filter_strength; | |
702 | ||
703 | if (ARCH_AARCH64) | |
704 | ff_rv40dsp_init_aarch64(c); | |
705 | if (ARCH_ARM) | |
706 | ff_rv40dsp_init_arm(c); | |
707 | if (ARCH_X86) | |
708 | ff_rv40dsp_init_x86(c); | |
709 | } |