2 * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
3 * Copyright (c) 2003-2011 Michael Niedermayer <michaelni@gmx.at>
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 * H.264 / AVC / MPEG4 part10 prediction functions.
25 * @author Michael Niedermayer <michaelni@gmx.at>
28 #include "libavutil/intreadwrite.h"
32 #include "bit_depth_template.c"
34 static void FUNCC(pred4x4_vertical
)(uint8_t *_src
, const uint8_t *topright
,
37 pixel
*src
= (pixel
*)_src
;
38 int stride
= _stride
>>(sizeof(pixel
)-1);
39 const pixel4 a
= AV_RN4PA(src
-stride
);
41 AV_WN4PA(src
+0*stride
, a
);
42 AV_WN4PA(src
+1*stride
, a
);
43 AV_WN4PA(src
+2*stride
, a
);
44 AV_WN4PA(src
+3*stride
, a
);
47 static void FUNCC(pred4x4_horizontal
)(uint8_t *_src
, const uint8_t *topright
,
50 pixel
*src
= (pixel
*)_src
;
51 int stride
= _stride
>>(sizeof(pixel
)-1);
52 AV_WN4PA(src
+0*stride
, PIXEL_SPLAT_X4(src
[-1+0*stride
]));
53 AV_WN4PA(src
+1*stride
, PIXEL_SPLAT_X4(src
[-1+1*stride
]));
54 AV_WN4PA(src
+2*stride
, PIXEL_SPLAT_X4(src
[-1+2*stride
]));
55 AV_WN4PA(src
+3*stride
, PIXEL_SPLAT_X4(src
[-1+3*stride
]));
58 static void FUNCC(pred4x4_dc
)(uint8_t *_src
, const uint8_t *topright
,
61 pixel
*src
= (pixel
*)_src
;
62 int stride
= _stride
>>(sizeof(pixel
)-1);
63 const int dc
= ( src
[-stride
] + src
[1-stride
] + src
[2-stride
] + src
[3-stride
]
64 + src
[-1+0*stride
] + src
[-1+1*stride
] + src
[-1+2*stride
] + src
[-1+3*stride
] + 4) >>3;
65 const pixel4 a
= PIXEL_SPLAT_X4(dc
);
67 AV_WN4PA(src
+0*stride
, a
);
68 AV_WN4PA(src
+1*stride
, a
);
69 AV_WN4PA(src
+2*stride
, a
);
70 AV_WN4PA(src
+3*stride
, a
);
73 static void FUNCC(pred4x4_left_dc
)(uint8_t *_src
, const uint8_t *topright
,
76 pixel
*src
= (pixel
*)_src
;
77 int stride
= _stride
>>(sizeof(pixel
)-1);
78 const int dc
= ( src
[-1+0*stride
] + src
[-1+1*stride
] + src
[-1+2*stride
] + src
[-1+3*stride
] + 2) >>2;
79 const pixel4 a
= PIXEL_SPLAT_X4(dc
);
81 AV_WN4PA(src
+0*stride
, a
);
82 AV_WN4PA(src
+1*stride
, a
);
83 AV_WN4PA(src
+2*stride
, a
);
84 AV_WN4PA(src
+3*stride
, a
);
87 static void FUNCC(pred4x4_top_dc
)(uint8_t *_src
, const uint8_t *topright
,
90 pixel
*src
= (pixel
*)_src
;
91 int stride
= _stride
>>(sizeof(pixel
)-1);
92 const int dc
= ( src
[-stride
] + src
[1-stride
] + src
[2-stride
] + src
[3-stride
] + 2) >>2;
93 const pixel4 a
= PIXEL_SPLAT_X4(dc
);
95 AV_WN4PA(src
+0*stride
, a
);
96 AV_WN4PA(src
+1*stride
, a
);
97 AV_WN4PA(src
+2*stride
, a
);
98 AV_WN4PA(src
+3*stride
, a
);
101 static void FUNCC(pred4x4_128_dc
)(uint8_t *_src
, const uint8_t *topright
,
104 pixel
*src
= (pixel
*)_src
;
105 int stride
= _stride
>>(sizeof(pixel
)-1);
106 const pixel4 a
= PIXEL_SPLAT_X4(1<<(BIT_DEPTH
-1));
108 AV_WN4PA(src
+0*stride
, a
);
109 AV_WN4PA(src
+1*stride
, a
);
110 AV_WN4PA(src
+2*stride
, a
);
111 AV_WN4PA(src
+3*stride
, a
);
114 static void FUNCC(pred4x4_127_dc
)(uint8_t *_src
, const uint8_t *topright
,
117 pixel
*src
= (pixel
*)_src
;
118 int stride
= _stride
>>(sizeof(pixel
)-1);
119 const pixel4 a
= PIXEL_SPLAT_X4((1<<(BIT_DEPTH
-1))-1);
121 AV_WN4PA(src
+0*stride
, a
);
122 AV_WN4PA(src
+1*stride
, a
);
123 AV_WN4PA(src
+2*stride
, a
);
124 AV_WN4PA(src
+3*stride
, a
);
127 static void FUNCC(pred4x4_129_dc
)(uint8_t *_src
, const uint8_t *topright
,
130 pixel
*src
= (pixel
*)_src
;
131 int stride
= _stride
>>(sizeof(pixel
)-1);
132 const pixel4 a
= PIXEL_SPLAT_X4((1<<(BIT_DEPTH
-1))+1);
134 AV_WN4PA(src
+0*stride
, a
);
135 AV_WN4PA(src
+1*stride
, a
);
136 AV_WN4PA(src
+2*stride
, a
);
137 AV_WN4PA(src
+3*stride
, a
);
/* Load the neighbouring edge pixels of the current 4x4 block into local
 * `unsigned` variables; av_unused because not every predictor needs all. */
#define LOAD_TOP_RIGHT_EDGE\
    const unsigned av_unused t4 = topright[0];\
    const unsigned av_unused t5 = topright[1];\
    const unsigned av_unused t6 = topright[2];\
    const unsigned av_unused t7 = topright[3];\

#define LOAD_DOWN_LEFT_EDGE\
    const unsigned av_unused l4 = src[-1+4*stride];\
    const unsigned av_unused l5 = src[-1+5*stride];\
    const unsigned av_unused l6 = src[-1+6*stride];\
    const unsigned av_unused l7 = src[-1+7*stride];\

#define LOAD_LEFT_EDGE\
    const unsigned av_unused l0 = src[-1+0*stride];\
    const unsigned av_unused l1 = src[-1+1*stride];\
    const unsigned av_unused l2 = src[-1+2*stride];\
    const unsigned av_unused l3 = src[-1+3*stride];\

#define LOAD_TOP_EDGE\
    const unsigned av_unused t0 = src[ 0-1*stride];\
    const unsigned av_unused t1 = src[ 1-1*stride];\
    const unsigned av_unused t2 = src[ 2-1*stride];\
    const unsigned av_unused t3 = src[ 3-1*stride];\

165 static void FUNCC(pred4x4_down_right)(uint8_t *_src, const uint8_t *topright,
168 pixel
*src
= (pixel
*)_src
;
169 int stride
= _stride
>>(sizeof(pixel
)-1);
170 const int lt
= src
[-1-1*stride
];
174 src
[0+3*stride
]=(l3
+ 2*l2
+ l1
+ 2)>>2;
176 src
[1+3*stride
]=(l2
+ 2*l1
+ l0
+ 2)>>2;
179 src
[2+3*stride
]=(l1
+ 2*l0
+ lt
+ 2)>>2;
183 src
[3+3*stride
]=(l0
+ 2*lt
+ t0
+ 2)>>2;
186 src
[3+2*stride
]=(lt
+ 2*t0
+ t1
+ 2)>>2;
188 src
[3+1*stride
]=(t0
+ 2*t1
+ t2
+ 2)>>2;
189 src
[3+0*stride
]=(t1
+ 2*t2
+ t3
+ 2)>>2;
192 static void FUNCC(pred4x4_down_left
)(uint8_t *_src
, const uint8_t *_topright
,
195 pixel
*src
= (pixel
*)_src
;
196 const pixel
*topright
= (const pixel
*)_topright
;
197 int stride
= _stride
>>(sizeof(pixel
)-1);
202 src
[0+0*stride
]=(t0
+ t2
+ 2*t1
+ 2)>>2;
204 src
[0+1*stride
]=(t1
+ t3
+ 2*t2
+ 2)>>2;
207 src
[0+2*stride
]=(t2
+ t4
+ 2*t3
+ 2)>>2;
211 src
[0+3*stride
]=(t3
+ t5
+ 2*t4
+ 2)>>2;
214 src
[1+3*stride
]=(t4
+ t6
+ 2*t5
+ 2)>>2;
216 src
[2+3*stride
]=(t5
+ t7
+ 2*t6
+ 2)>>2;
217 src
[3+3*stride
]=(t6
+ 3*t7
+ 2)>>2;
220 static void FUNCC(pred4x4_vertical_right
)(uint8_t *_src
,
221 const uint8_t *topright
,
224 pixel
*src
= (pixel
*)_src
;
225 int stride
= _stride
>>(sizeof(pixel
)-1);
226 const int lt
= src
[-1-1*stride
];
231 src
[1+2*stride
]=(lt
+ t0
+ 1)>>1;
233 src
[2+2*stride
]=(t0
+ t1
+ 1)>>1;
235 src
[3+2*stride
]=(t1
+ t2
+ 1)>>1;
236 src
[3+0*stride
]=(t2
+ t3
+ 1)>>1;
238 src
[1+3*stride
]=(l0
+ 2*lt
+ t0
+ 2)>>2;
240 src
[2+3*stride
]=(lt
+ 2*t0
+ t1
+ 2)>>2;
242 src
[3+3*stride
]=(t0
+ 2*t1
+ t2
+ 2)>>2;
243 src
[3+1*stride
]=(t1
+ 2*t2
+ t3
+ 2)>>2;
244 src
[0+2*stride
]=(lt
+ 2*l0
+ l1
+ 2)>>2;
245 src
[0+3*stride
]=(l0
+ 2*l1
+ l2
+ 2)>>2;
248 static void FUNCC(pred4x4_vertical_left
)(uint8_t *_src
,
249 const uint8_t *_topright
,
252 pixel
*src
= (pixel
*)_src
;
253 const pixel
*topright
= (const pixel
*)_topright
;
254 int stride
= _stride
>>(sizeof(pixel
)-1);
258 src
[0+0*stride
]=(t0
+ t1
+ 1)>>1;
260 src
[0+2*stride
]=(t1
+ t2
+ 1)>>1;
262 src
[1+2*stride
]=(t2
+ t3
+ 1)>>1;
264 src
[2+2*stride
]=(t3
+ t4
+ 1)>>1;
265 src
[3+2*stride
]=(t4
+ t5
+ 1)>>1;
266 src
[0+1*stride
]=(t0
+ 2*t1
+ t2
+ 2)>>2;
268 src
[0+3*stride
]=(t1
+ 2*t2
+ t3
+ 2)>>2;
270 src
[1+3*stride
]=(t2
+ 2*t3
+ t4
+ 2)>>2;
272 src
[2+3*stride
]=(t3
+ 2*t4
+ t5
+ 2)>>2;
273 src
[3+3*stride
]=(t4
+ 2*t5
+ t6
+ 2)>>2;
276 static void FUNCC(pred4x4_horizontal_up
)(uint8_t *_src
, const uint8_t *topright
,
279 pixel
*src
= (pixel
*)_src
;
280 int stride
= _stride
>>(sizeof(pixel
)-1);
283 src
[0+0*stride
]=(l0
+ l1
+ 1)>>1;
284 src
[1+0*stride
]=(l0
+ 2*l1
+ l2
+ 2)>>2;
286 src
[0+1*stride
]=(l1
+ l2
+ 1)>>1;
288 src
[1+1*stride
]=(l1
+ 2*l2
+ l3
+ 2)>>2;
290 src
[0+2*stride
]=(l2
+ l3
+ 1)>>1;
292 src
[1+2*stride
]=(l2
+ 2*l3
+ l3
+ 2)>>2;
301 static void FUNCC(pred4x4_horizontal_down
)(uint8_t *_src
,
302 const uint8_t *topright
,
305 pixel
*src
= (pixel
*)_src
;
306 int stride
= _stride
>>(sizeof(pixel
)-1);
307 const int lt
= src
[-1-1*stride
];
312 src
[2+1*stride
]=(lt
+ l0
+ 1)>>1;
314 src
[3+1*stride
]=(l0
+ 2*lt
+ t0
+ 2)>>2;
315 src
[2+0*stride
]=(lt
+ 2*t0
+ t1
+ 2)>>2;
316 src
[3+0*stride
]=(t0
+ 2*t1
+ t2
+ 2)>>2;
318 src
[2+2*stride
]=(l0
+ l1
+ 1)>>1;
320 src
[3+2*stride
]=(lt
+ 2*l0
+ l1
+ 2)>>2;
322 src
[2+3*stride
]=(l1
+ l2
+ 1)>>1;
324 src
[3+3*stride
]=(l0
+ 2*l1
+ l2
+ 2)>>2;
325 src
[0+3*stride
]=(l2
+ l3
+ 1)>>1;
326 src
[1+3*stride
]=(l1
+ 2*l2
+ l3
+ 2)>>2;
329 static void FUNCC(pred16x16_vertical
)(uint8_t *_src
, ptrdiff_t _stride
)
332 pixel
*src
= (pixel
*)_src
;
333 int stride
= _stride
>>(sizeof(pixel
)-1);
334 const pixel4 a
= AV_RN4PA(((pixel4
*)(src
-stride
))+0);
335 const pixel4 b
= AV_RN4PA(((pixel4
*)(src
-stride
))+1);
336 const pixel4 c
= AV_RN4PA(((pixel4
*)(src
-stride
))+2);
337 const pixel4 d
= AV_RN4PA(((pixel4
*)(src
-stride
))+3);
340 AV_WN4PA(((pixel4
*)(src
+i
*stride
))+0, a
);
341 AV_WN4PA(((pixel4
*)(src
+i
*stride
))+1, b
);
342 AV_WN4PA(((pixel4
*)(src
+i
*stride
))+2, c
);
343 AV_WN4PA(((pixel4
*)(src
+i
*stride
))+3, d
);
347 static void FUNCC(pred16x16_horizontal
)(uint8_t *_src
, ptrdiff_t stride
)
350 pixel
*src
= (pixel
*)_src
;
351 stride
>>= sizeof(pixel
)-1;
354 const pixel4 a
= PIXEL_SPLAT_X4(src
[-1+i
*stride
]);
356 AV_WN4PA(((pixel4
*)(src
+i
*stride
))+0, a
);
357 AV_WN4PA(((pixel4
*)(src
+i
*stride
))+1, a
);
358 AV_WN4PA(((pixel4
*)(src
+i
*stride
))+2, a
);
359 AV_WN4PA(((pixel4
*)(src
+i
*stride
))+3, a
);
/* Fill all 16 rows of a 16x16 block with the splatted DC value `v`;
 * expects `i`, `src` and `stride` in scope and advances `src`. */
#define PREDICT_16x16_DC(v)\
    for(i=0; i<16; i++){\
        AV_WN4PA(src+ 0, v);\
        AV_WN4PA(src+ 4, v);\
        AV_WN4PA(src+ 8, v);\
        AV_WN4PA(src+12, v);\
        src += stride;\
    }
372 static void FUNCC(pred16x16_dc
)(uint8_t *_src
, ptrdiff_t stride
)
375 pixel
*src
= (pixel
*)_src
;
377 stride
>>= sizeof(pixel
)-1;
380 dc
+= src
[-1+i
*stride
];
387 dcsplat
= PIXEL_SPLAT_X4((dc
+16)>>5);
388 PREDICT_16x16_DC(dcsplat
);
391 static void FUNCC(pred16x16_left_dc
)(uint8_t *_src
, ptrdiff_t stride
)
394 pixel
*src
= (pixel
*)_src
;
396 stride
>>= sizeof(pixel
)-1;
399 dc
+= src
[-1+i
*stride
];
402 dcsplat
= PIXEL_SPLAT_X4((dc
+8)>>4);
403 PREDICT_16x16_DC(dcsplat
);
406 static void FUNCC(pred16x16_top_dc
)(uint8_t *_src
, ptrdiff_t stride
)
409 pixel
*src
= (pixel
*)_src
;
411 stride
>>= sizeof(pixel
)-1;
417 dcsplat
= PIXEL_SPLAT_X4((dc
+8)>>4);
418 PREDICT_16x16_DC(dcsplat
);
421 #define PRED16x16_X(n, v) \
422 static void FUNCC(pred16x16_##n##_dc)(uint8_t *_src, ptrdiff_t stride)\
425 pixel *src = (pixel*)_src;\
426 stride >>= sizeof(pixel)-1;\
427 PREDICT_16x16_DC(PIXEL_SPLAT_X4(v));\
430 PRED16x16_X(127, (1<<(BIT_DEPTH
-1))-1)
431 PRED16x16_X(128, (1<<(BIT_DEPTH
-1))+0)
432 PRED16x16_X(129, (1<<(BIT_DEPTH
-1))+1)
434 static inline void FUNCC(pred16x16_plane_compat
)(uint8_t *_src
,
442 pixel
*src
= (pixel
*)_src
;
443 int stride
= _stride
>>(sizeof(pixel
)-1);
444 const pixel
* const src0
= src
+7-stride
;
445 const pixel
* src1
= src
+8*stride
-1;
446 const pixel
* src2
= src1
-2*stride
; // == src+6*stride-1;
447 int H
= src0
[1] - src0
[-1];
448 int V
= src1
[0] - src2
[ 0];
449 for(k
=2; k
<=8; ++k
) {
450 src1
+= stride
; src2
-= stride
;
451 H
+= k
*(src0
[k
] - src0
[-k
]);
452 V
+= k
*(src1
[0] - src2
[ 0]);
455 H
= ( 5*(H
/4) ) / 16;
456 V
= ( 5*(V
/4) ) / 16;
458 /* required for 100% accuracy */
461 H
= ( H
+ (H
>>2) ) >> 4;
462 V
= ( V
+ (V
>>2) ) >> 4;
468 a
= 16*(src1
[0] + src2
[16] + 1) - 7*(V
+H
);
469 for(j
=16; j
>0; --j
) {
472 for(i
=-16; i
<0; i
+=4) {
473 src
[16+i
] = CLIP((b
) >> 5);
474 src
[17+i
] = CLIP((b
+ H
) >> 5);
475 src
[18+i
] = CLIP((b
+2*H
) >> 5);
476 src
[19+i
] = CLIP((b
+3*H
) >> 5);
483 static void FUNCC(pred16x16_plane
)(uint8_t *src
, ptrdiff_t stride
)
485 FUNCC(pred16x16_plane_compat
)(src
, stride
, 0, 0);
488 static void FUNCC(pred8x8_vertical
)(uint8_t *_src
, ptrdiff_t _stride
)
491 pixel
*src
= (pixel
*)_src
;
492 int stride
= _stride
>>(sizeof(pixel
)-1);
493 const pixel4 a
= AV_RN4PA(((pixel4
*)(src
-stride
))+0);
494 const pixel4 b
= AV_RN4PA(((pixel4
*)(src
-stride
))+1);
497 AV_WN4PA(((pixel4
*)(src
+i
*stride
))+0, a
);
498 AV_WN4PA(((pixel4
*)(src
+i
*stride
))+1, b
);
502 static void FUNCC(pred8x16_vertical
)(uint8_t *_src
, ptrdiff_t _stride
)
505 pixel
*src
= (pixel
*)_src
;
506 int stride
= _stride
>>(sizeof(pixel
)-1);
507 const pixel4 a
= AV_RN4PA(((pixel4
*)(src
-stride
))+0);
508 const pixel4 b
= AV_RN4PA(((pixel4
*)(src
-stride
))+1);
511 AV_WN4PA(((pixel4
*)(src
+i
*stride
))+0, a
);
512 AV_WN4PA(((pixel4
*)(src
+i
*stride
))+1, b
);
516 static void FUNCC(pred8x8_horizontal
)(uint8_t *_src
, ptrdiff_t stride
)
519 pixel
*src
= (pixel
*)_src
;
520 stride
>>= sizeof(pixel
)-1;
523 const pixel4 a
= PIXEL_SPLAT_X4(src
[-1+i
*stride
]);
524 AV_WN4PA(((pixel4
*)(src
+i
*stride
))+0, a
);
525 AV_WN4PA(((pixel4
*)(src
+i
*stride
))+1, a
);
529 static void FUNCC(pred8x16_horizontal
)(uint8_t *_src
, ptrdiff_t stride
)
532 pixel
*src
= (pixel
*)_src
;
533 stride
>>= sizeof(pixel
)-1;
535 const pixel4 a
= PIXEL_SPLAT_X4(src
[-1+i
*stride
]);
536 AV_WN4PA(((pixel4
*)(src
+i
*stride
))+0, a
);
537 AV_WN4PA(((pixel4
*)(src
+i
*stride
))+1, a
);
541 #define PRED8x8_X(n, v)\
542 static void FUNCC(pred8x8_##n##_dc)(uint8_t *_src, ptrdiff_t stride)\
545 const pixel4 a = PIXEL_SPLAT_X4(v);\
546 pixel *src = (pixel*)_src;\
547 stride >>= sizeof(pixel)-1;\
549 AV_WN4PA(((pixel4*)(src+i*stride))+0, a);\
550 AV_WN4PA(((pixel4*)(src+i*stride))+1, a);\
554 PRED8x8_X(127, (1<<(BIT_DEPTH
-1))-1)
555 PRED8x8_X(128, (1<<(BIT_DEPTH
-1))+0)
556 PRED8x8_X(129, (1<<(BIT_DEPTH
-1))+1)
558 static void FUNCC(pred8x16_128_dc
)(uint8_t *_src
, ptrdiff_t stride
)
560 FUNCC(pred8x8_128_dc
)(_src
, stride
);
561 FUNCC(pred8x8_128_dc
)(_src
+8*stride
, stride
);
564 static void FUNCC(pred8x8_left_dc
)(uint8_t *_src
, ptrdiff_t stride
)
568 pixel4 dc0splat
, dc2splat
;
569 pixel
*src
= (pixel
*)_src
;
570 stride
>>= sizeof(pixel
)-1;
574 dc0
+= src
[-1+i
*stride
];
575 dc2
+= src
[-1+(i
+4)*stride
];
577 dc0splat
= PIXEL_SPLAT_X4((dc0
+ 2)>>2);
578 dc2splat
= PIXEL_SPLAT_X4((dc2
+ 2)>>2);
581 AV_WN4PA(((pixel4
*)(src
+i
*stride
))+0, dc0splat
);
582 AV_WN4PA(((pixel4
*)(src
+i
*stride
))+1, dc0splat
);
585 AV_WN4PA(((pixel4
*)(src
+i
*stride
))+0, dc2splat
);
586 AV_WN4PA(((pixel4
*)(src
+i
*stride
))+1, dc2splat
);
590 static void FUNCC(pred8x16_left_dc
)(uint8_t *_src
, ptrdiff_t stride
)
592 FUNCC(pred8x8_left_dc
)(_src
, stride
);
593 FUNCC(pred8x8_left_dc
)(_src
+8*stride
, stride
);
596 static void FUNCC(pred8x8_top_dc
)(uint8_t *_src
, ptrdiff_t stride
)
600 pixel4 dc0splat
, dc1splat
;
601 pixel
*src
= (pixel
*)_src
;
602 stride
>>= sizeof(pixel
)-1;
607 dc1
+= src
[4+i
-stride
];
609 dc0splat
= PIXEL_SPLAT_X4((dc0
+ 2)>>2);
610 dc1splat
= PIXEL_SPLAT_X4((dc1
+ 2)>>2);
613 AV_WN4PA(((pixel4
*)(src
+i
*stride
))+0, dc0splat
);
614 AV_WN4PA(((pixel4
*)(src
+i
*stride
))+1, dc1splat
);
617 AV_WN4PA(((pixel4
*)(src
+i
*stride
))+0, dc0splat
);
618 AV_WN4PA(((pixel4
*)(src
+i
*stride
))+1, dc1splat
);
622 static void FUNCC(pred8x16_top_dc
)(uint8_t *_src
, ptrdiff_t stride
)
626 pixel4 dc0splat
, dc1splat
;
627 pixel
*src
= (pixel
*)_src
;
628 stride
>>= sizeof(pixel
)-1;
633 dc1
+= src
[4+i
-stride
];
635 dc0splat
= PIXEL_SPLAT_X4((dc0
+ 2)>>2);
636 dc1splat
= PIXEL_SPLAT_X4((dc1
+ 2)>>2);
639 AV_WN4PA(((pixel4
*)(src
+i
*stride
))+0, dc0splat
);
640 AV_WN4PA(((pixel4
*)(src
+i
*stride
))+1, dc1splat
);
644 static void FUNCC(pred8x8_dc
)(uint8_t *_src
, ptrdiff_t stride
)
648 pixel4 dc0splat
, dc1splat
, dc2splat
, dc3splat
;
649 pixel
*src
= (pixel
*)_src
;
650 stride
>>= sizeof(pixel
)-1;
654 dc0
+= src
[-1+i
*stride
] + src
[i
-stride
];
655 dc1
+= src
[4+i
-stride
];
656 dc2
+= src
[-1+(i
+4)*stride
];
658 dc0splat
= PIXEL_SPLAT_X4((dc0
+ 4)>>3);
659 dc1splat
= PIXEL_SPLAT_X4((dc1
+ 2)>>2);
660 dc2splat
= PIXEL_SPLAT_X4((dc2
+ 2)>>2);
661 dc3splat
= PIXEL_SPLAT_X4((dc1
+ dc2
+ 4)>>3);
664 AV_WN4PA(((pixel4
*)(src
+i
*stride
))+0, dc0splat
);
665 AV_WN4PA(((pixel4
*)(src
+i
*stride
))+1, dc1splat
);
668 AV_WN4PA(((pixel4
*)(src
+i
*stride
))+0, dc2splat
);
669 AV_WN4PA(((pixel4
*)(src
+i
*stride
))+1, dc3splat
);
673 static void FUNCC(pred8x16_dc
)(uint8_t *_src
, ptrdiff_t stride
)
676 int dc0
, dc1
, dc2
, dc3
, dc4
;
677 pixel4 dc0splat
, dc1splat
, dc2splat
, dc3splat
, dc4splat
, dc5splat
, dc6splat
, dc7splat
;
678 pixel
*src
= (pixel
*)_src
;
679 stride
>>= sizeof(pixel
)-1;
681 dc0
=dc1
=dc2
=dc3
=dc4
=0;
683 dc0
+= src
[-1+i
*stride
] + src
[i
-stride
];
684 dc1
+= src
[4+i
-stride
];
685 dc2
+= src
[-1+(i
+4)*stride
];
686 dc3
+= src
[-1+(i
+8)*stride
];
687 dc4
+= src
[-1+(i
+12)*stride
];
689 dc0splat
= PIXEL_SPLAT_X4((dc0
+ 4)>>3);
690 dc1splat
= PIXEL_SPLAT_X4((dc1
+ 2)>>2);
691 dc2splat
= PIXEL_SPLAT_X4((dc2
+ 2)>>2);
692 dc3splat
= PIXEL_SPLAT_X4((dc1
+ dc2
+ 4)>>3);
693 dc4splat
= PIXEL_SPLAT_X4((dc3
+ 2)>>2);
694 dc5splat
= PIXEL_SPLAT_X4((dc1
+ dc3
+ 4)>>3);
695 dc6splat
= PIXEL_SPLAT_X4((dc4
+ 2)>>2);
696 dc7splat
= PIXEL_SPLAT_X4((dc1
+ dc4
+ 4)>>3);
699 AV_WN4PA(((pixel4
*)(src
+i
*stride
))+0, dc0splat
);
700 AV_WN4PA(((pixel4
*)(src
+i
*stride
))+1, dc1splat
);
703 AV_WN4PA(((pixel4
*)(src
+i
*stride
))+0, dc2splat
);
704 AV_WN4PA(((pixel4
*)(src
+i
*stride
))+1, dc3splat
);
707 AV_WN4PA(((pixel4
*)(src
+i
*stride
))+0, dc4splat
);
708 AV_WN4PA(((pixel4
*)(src
+i
*stride
))+1, dc5splat
);
710 for(i
=12; i
<16; i
++){
711 AV_WN4PA(((pixel4
*)(src
+i
*stride
))+0, dc6splat
);
712 AV_WN4PA(((pixel4
*)(src
+i
*stride
))+1, dc7splat
);
//the following four functions should not be optimized!
717 static void FUNC(pred8x8_mad_cow_dc_l0t
)(uint8_t *src
, ptrdiff_t stride
)
719 FUNCC(pred8x8_top_dc
)(src
, stride
);
720 FUNCC(pred4x4_dc
)(src
, NULL
, stride
);
723 static void FUNC(pred8x16_mad_cow_dc_l0t
)(uint8_t *src
, ptrdiff_t stride
)
725 FUNCC(pred8x16_top_dc
)(src
, stride
);
726 FUNCC(pred4x4_dc
)(src
, NULL
, stride
);
729 static void FUNC(pred8x8_mad_cow_dc_0lt
)(uint8_t *src
, ptrdiff_t stride
)
731 FUNCC(pred8x8_dc
)(src
, stride
);
732 FUNCC(pred4x4_top_dc
)(src
, NULL
, stride
);
735 static void FUNC(pred8x16_mad_cow_dc_0lt
)(uint8_t *src
, ptrdiff_t stride
)
737 FUNCC(pred8x16_dc
)(src
, stride
);
738 FUNCC(pred4x4_top_dc
)(src
, NULL
, stride
);
741 static void FUNC(pred8x8_mad_cow_dc_l00
)(uint8_t *src
, ptrdiff_t stride
)
743 FUNCC(pred8x8_left_dc
)(src
, stride
);
744 FUNCC(pred4x4_128_dc
)(src
+ 4*stride
, NULL
, stride
);
745 FUNCC(pred4x4_128_dc
)(src
+ 4*stride
+ 4*sizeof(pixel
), NULL
, stride
);
748 static void FUNC(pred8x16_mad_cow_dc_l00
)(uint8_t *src
, ptrdiff_t stride
)
750 FUNCC(pred8x16_left_dc
)(src
, stride
);
751 FUNCC(pred4x4_128_dc
)(src
+ 4*stride
, NULL
, stride
);
752 FUNCC(pred4x4_128_dc
)(src
+ 4*stride
+ 4*sizeof(pixel
), NULL
, stride
);
755 static void FUNC(pred8x8_mad_cow_dc_0l0
)(uint8_t *src
, ptrdiff_t stride
)
757 FUNCC(pred8x8_left_dc
)(src
, stride
);
758 FUNCC(pred4x4_128_dc
)(src
, NULL
, stride
);
759 FUNCC(pred4x4_128_dc
)(src
+ 4*sizeof(pixel
), NULL
, stride
);
762 static void FUNC(pred8x16_mad_cow_dc_0l0
)(uint8_t *src
, ptrdiff_t stride
)
764 FUNCC(pred8x16_left_dc
)(src
, stride
);
765 FUNCC(pred4x4_128_dc
)(src
, NULL
, stride
);
766 FUNCC(pred4x4_128_dc
)(src
+ 4*sizeof(pixel
), NULL
, stride
);
769 static void FUNCC(pred8x8_plane
)(uint8_t *_src
, ptrdiff_t _stride
)
774 pixel
*src
= (pixel
*)_src
;
775 int stride
= _stride
>>(sizeof(pixel
)-1);
776 const pixel
* const src0
= src
+3-stride
;
777 const pixel
* src1
= src
+4*stride
-1;
778 const pixel
* src2
= src1
-2*stride
; // == src+2*stride-1;
779 int H
= src0
[1] - src0
[-1];
780 int V
= src1
[0] - src2
[ 0];
781 for(k
=2; k
<=4; ++k
) {
782 src1
+= stride
; src2
-= stride
;
783 H
+= k
*(src0
[k
] - src0
[-k
]);
784 V
+= k
*(src1
[0] - src2
[ 0]);
786 H
= ( 17*H
+16 ) >> 5;
787 V
= ( 17*V
+16 ) >> 5;
789 a
= 16*(src1
[0] + src2
[8]+1) - 3*(V
+H
);
793 src
[0] = CLIP((b
) >> 5);
794 src
[1] = CLIP((b
+ H
) >> 5);
795 src
[2] = CLIP((b
+2*H
) >> 5);
796 src
[3] = CLIP((b
+3*H
) >> 5);
797 src
[4] = CLIP((b
+4*H
) >> 5);
798 src
[5] = CLIP((b
+5*H
) >> 5);
799 src
[6] = CLIP((b
+6*H
) >> 5);
800 src
[7] = CLIP((b
+7*H
) >> 5);
805 static void FUNCC(pred8x16_plane
)(uint8_t *_src
, ptrdiff_t _stride
)
810 pixel
*src
= (pixel
*)_src
;
811 int stride
= _stride
>>(sizeof(pixel
)-1);
812 const pixel
* const src0
= src
+3-stride
;
813 const pixel
* src1
= src
+8*stride
-1;
814 const pixel
* src2
= src1
-2*stride
; // == src+6*stride-1;
815 int H
= src0
[1] - src0
[-1];
816 int V
= src1
[0] - src2
[ 0];
818 for (k
= 2; k
<= 4; ++k
) {
819 src1
+= stride
; src2
-= stride
;
820 H
+= k
*(src0
[k
] - src0
[-k
]);
821 V
+= k
*(src1
[0] - src2
[ 0]);
823 for (; k
<= 8; ++k
) {
824 src1
+= stride
; src2
-= stride
;
825 V
+= k
*(src1
[0] - src2
[0]);
831 a
= 16*(src1
[0] + src2
[8] + 1) - 7*V
- 3*H
;
832 for(j
=16; j
>0; --j
) {
835 src
[0] = CLIP((b
) >> 5);
836 src
[1] = CLIP((b
+ H
) >> 5);
837 src
[2] = CLIP((b
+2*H
) >> 5);
838 src
[3] = CLIP((b
+3*H
) >> 5);
839 src
[4] = CLIP((b
+4*H
) >> 5);
840 src
[5] = CLIP((b
+5*H
) >> 5);
841 src
[6] = CLIP((b
+6*H
) >> 5);
842 src
[7] = CLIP((b
+7*H
) >> 5);
/* Neighbour loading for the 8x8 luma predictors. The edge pixels are
 * low-pass filtered ([1 2 1]) as required by H.264 8x8 intra prediction;
 * the has_topleft / has_topright flags select the padding behaviour. */
#define SRC(x,y) src[(x)+(y)*stride]

#define PL(y) \
    const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2;
#define PREDICT_8x8_LOAD_LEFT \
    const int l0 = ((has_topleft ? SRC(-1,-1) : SRC(-1,0)) \
                     + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \
    PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) \
    const int l7 av_unused = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2

#define PT(x) \
    const int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
#define PREDICT_8x8_LOAD_TOP \
    const int t0 = ((has_topleft ? SRC(-1,-1) : SRC(0,-1)) \
                     + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \
    PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) \
    const int t7 av_unused = ((has_topright ? SRC(8,-1) : SRC(7,-1)) \
                     + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2

#define PTR(x) \
    t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
#define PREDICT_8x8_LOAD_TOPRIGHT \
    int t8, t9, t10, t11, t12, t13, t14, t15; \
    if (has_topright) { \
        PTR(8) PTR(9) PTR(10) PTR(11) PTR(12) PTR(13) PTR(14) \
        t15 = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2; \
    } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1);

#define PREDICT_8x8_LOAD_TOPLEFT \
    const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2

#define PREDICT_8x8_DC(v) \
    int y; \
    for( y = 0; y < 8; y++ ) { \
        AV_WN4PA(((pixel4*)src)+0, v); \
        AV_WN4PA(((pixel4*)src)+1, v); \
        src += stride; \
    }
885 static void FUNCC(pred8x8l_128_dc
)(uint8_t *_src
, int has_topleft
,
886 int has_topright
, ptrdiff_t _stride
)
888 pixel
*src
= (pixel
*)_src
;
889 int stride
= _stride
>>(sizeof(pixel
)-1);
891 PREDICT_8x8_DC(PIXEL_SPLAT_X4(1<<(BIT_DEPTH
-1)));
893 static void FUNCC(pred8x8l_left_dc
)(uint8_t *_src
, int has_topleft
,
894 int has_topright
, ptrdiff_t _stride
)
896 pixel
*src
= (pixel
*)_src
;
897 int stride
= _stride
>>(sizeof(pixel
)-1);
899 PREDICT_8x8_LOAD_LEFT
;
900 const pixel4 dc
= PIXEL_SPLAT_X4((l0
+l1
+l2
+l3
+l4
+l5
+l6
+l7
+4) >> 3);
903 static void FUNCC(pred8x8l_top_dc
)(uint8_t *_src
, int has_topleft
,
904 int has_topright
, ptrdiff_t _stride
)
906 pixel
*src
= (pixel
*)_src
;
907 int stride
= _stride
>>(sizeof(pixel
)-1);
909 PREDICT_8x8_LOAD_TOP
;
910 const pixel4 dc
= PIXEL_SPLAT_X4((t0
+t1
+t2
+t3
+t4
+t5
+t6
+t7
+4) >> 3);
913 static void FUNCC(pred8x8l_dc
)(uint8_t *_src
, int has_topleft
,
914 int has_topright
, ptrdiff_t _stride
)
916 pixel
*src
= (pixel
*)_src
;
917 int stride
= _stride
>>(sizeof(pixel
)-1);
919 PREDICT_8x8_LOAD_LEFT
;
920 PREDICT_8x8_LOAD_TOP
;
921 const pixel4 dc
= PIXEL_SPLAT_X4((l0
+l1
+l2
+l3
+l4
+l5
+l6
+l7
922 +t0
+t1
+t2
+t3
+t4
+t5
+t6
+t7
+8) >> 4);
925 static void FUNCC(pred8x8l_horizontal
)(uint8_t *_src
, int has_topleft
,
926 int has_topright
, ptrdiff_t _stride
)
928 pixel
*src
= (pixel
*)_src
;
929 int stride
= _stride
>>(sizeof(pixel
)-1);
932 PREDICT_8x8_LOAD_LEFT
;
933 #define ROW(y) a = PIXEL_SPLAT_X4(l##y); \
934 AV_WN4PA(src+y*stride, a); \
935 AV_WN4PA(src+y*stride+4, a);
936 ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7);
939 static void FUNCC(pred8x8l_vertical
)(uint8_t *_src
, int has_topleft
,
940 int has_topright
, ptrdiff_t _stride
)
943 pixel
*src
= (pixel
*)_src
;
944 int stride
= _stride
>>(sizeof(pixel
)-1);
947 PREDICT_8x8_LOAD_TOP
;
956 a
= AV_RN4PA(((pixel4
*)src
)+0);
957 b
= AV_RN4PA(((pixel4
*)src
)+1);
958 for( y
= 1; y
< 8; y
++ ) {
959 AV_WN4PA(((pixel4
*)(src
+y
*stride
))+0, a
);
960 AV_WN4PA(((pixel4
*)(src
+y
*stride
))+1, b
);
963 static void FUNCC(pred8x8l_down_left
)(uint8_t *_src
, int has_topleft
,
964 int has_topright
, ptrdiff_t _stride
)
966 pixel
*src
= (pixel
*)_src
;
967 int stride
= _stride
>>(sizeof(pixel
)-1);
968 PREDICT_8x8_LOAD_TOP
;
969 PREDICT_8x8_LOAD_TOPRIGHT
;
970 SRC(0,0)= (t0
+ 2*t1
+ t2
+ 2) >> 2;
971 SRC(0,1)=SRC(1,0)= (t1
+ 2*t2
+ t3
+ 2) >> 2;
972 SRC(0,2)=SRC(1,1)=SRC(2,0)= (t2
+ 2*t3
+ t4
+ 2) >> 2;
973 SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= (t3
+ 2*t4
+ t5
+ 2) >> 2;
974 SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= (t4
+ 2*t5
+ t6
+ 2) >> 2;
975 SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= (t5
+ 2*t6
+ t7
+ 2) >> 2;
976 SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= (t6
+ 2*t7
+ t8
+ 2) >> 2;
977 SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= (t7
+ 2*t8
+ t9
+ 2) >> 2;
978 SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= (t8
+ 2*t9
+ t10
+ 2) >> 2;
979 SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= (t9
+ 2*t10
+ t11
+ 2) >> 2;
980 SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= (t10
+ 2*t11
+ t12
+ 2) >> 2;
981 SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= (t11
+ 2*t12
+ t13
+ 2) >> 2;
982 SRC(5,7)=SRC(6,6)=SRC(7,5)= (t12
+ 2*t13
+ t14
+ 2) >> 2;
983 SRC(6,7)=SRC(7,6)= (t13
+ 2*t14
+ t15
+ 2) >> 2;
984 SRC(7,7)= (t14
+ 3*t15
+ 2) >> 2;
986 static void FUNCC(pred8x8l_down_right
)(uint8_t *_src
, int has_topleft
,
987 int has_topright
, ptrdiff_t _stride
)
989 pixel
*src
= (pixel
*)_src
;
990 int stride
= _stride
>>(sizeof(pixel
)-1);
991 PREDICT_8x8_LOAD_TOP
;
992 PREDICT_8x8_LOAD_LEFT
;
993 PREDICT_8x8_LOAD_TOPLEFT
;
994 SRC(0,7)= (l7
+ 2*l6
+ l5
+ 2) >> 2;
995 SRC(0,6)=SRC(1,7)= (l6
+ 2*l5
+ l4
+ 2) >> 2;
996 SRC(0,5)=SRC(1,6)=SRC(2,7)= (l5
+ 2*l4
+ l3
+ 2) >> 2;
997 SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= (l4
+ 2*l3
+ l2
+ 2) >> 2;
998 SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= (l3
+ 2*l2
+ l1
+ 2) >> 2;
999 SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= (l2
+ 2*l1
+ l0
+ 2) >> 2;
1000 SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= (l1
+ 2*l0
+ lt
+ 2) >> 2;
1001 SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= (l0
+ 2*lt
+ t0
+ 2) >> 2;
1002 SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= (lt
+ 2*t0
+ t1
+ 2) >> 2;
1003 SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= (t0
+ 2*t1
+ t2
+ 2) >> 2;
1004 SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= (t1
+ 2*t2
+ t3
+ 2) >> 2;
1005 SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= (t2
+ 2*t3
+ t4
+ 2) >> 2;
1006 SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3
+ 2*t4
+ t5
+ 2) >> 2;
1007 SRC(6,0)=SRC(7,1)= (t4
+ 2*t5
+ t6
+ 2) >> 2;
1008 SRC(7,0)= (t5
+ 2*t6
+ t7
+ 2) >> 2;
1010 static void FUNCC(pred8x8l_vertical_right
)(uint8_t *_src
, int has_topleft
,
1011 int has_topright
, ptrdiff_t _stride
)
1013 pixel
*src
= (pixel
*)_src
;
1014 int stride
= _stride
>>(sizeof(pixel
)-1);
1015 PREDICT_8x8_LOAD_TOP
;
1016 PREDICT_8x8_LOAD_LEFT
;
1017 PREDICT_8x8_LOAD_TOPLEFT
;
1018 SRC(0,6)= (l5
+ 2*l4
+ l3
+ 2) >> 2;
1019 SRC(0,7)= (l6
+ 2*l5
+ l4
+ 2) >> 2;
1020 SRC(0,4)=SRC(1,6)= (l3
+ 2*l2
+ l1
+ 2) >> 2;
1021 SRC(0,5)=SRC(1,7)= (l4
+ 2*l3
+ l2
+ 2) >> 2;
1022 SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1
+ 2*l0
+ lt
+ 2) >> 2;
1023 SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2
+ 2*l1
+ l0
+ 2) >> 2;
1024 SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= (l0
+ 2*lt
+ t0
+ 2) >> 2;
1025 SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= (lt
+ t0
+ 1) >> 1;
1026 SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= (lt
+ 2*t0
+ t1
+ 2) >> 2;
1027 SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= (t0
+ t1
+ 1) >> 1;
1028 SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= (t0
+ 2*t1
+ t2
+ 2) >> 2;
1029 SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= (t1
+ t2
+ 1) >> 1;
1030 SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= (t1
+ 2*t2
+ t3
+ 2) >> 2;
1031 SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= (t2
+ t3
+ 1) >> 1;
1032 SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= (t2
+ 2*t3
+ t4
+ 2) >> 2;
1033 SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= (t3
+ t4
+ 1) >> 1;
1034 SRC(5,1)=SRC(6,3)=SRC(7,5)= (t3
+ 2*t4
+ t5
+ 2) >> 2;
1035 SRC(5,0)=SRC(6,2)=SRC(7,4)= (t4
+ t5
+ 1) >> 1;
1036 SRC(6,1)=SRC(7,3)= (t4
+ 2*t5
+ t6
+ 2) >> 2;
1037 SRC(6,0)=SRC(7,2)= (t5
+ t6
+ 1) >> 1;
1038 SRC(7,1)= (t5
+ 2*t6
+ t7
+ 2) >> 2;
1039 SRC(7,0)= (t6
+ t7
+ 1) >> 1;
1041 static void FUNCC(pred8x8l_horizontal_down
)(uint8_t *_src
, int has_topleft
,
1042 int has_topright
, ptrdiff_t _stride
)
1044 pixel
*src
= (pixel
*)_src
;
1045 int stride
= _stride
>>(sizeof(pixel
)-1);
1046 PREDICT_8x8_LOAD_TOP
;
1047 PREDICT_8x8_LOAD_LEFT
;
1048 PREDICT_8x8_LOAD_TOPLEFT
;
1049 SRC(0,7)= (l6
+ l7
+ 1) >> 1;
1050 SRC(1,7)= (l5
+ 2*l6
+ l7
+ 2) >> 2;
1051 SRC(0,6)=SRC(2,7)= (l5
+ l6
+ 1) >> 1;
1052 SRC(1,6)=SRC(3,7)= (l4
+ 2*l5
+ l6
+ 2) >> 2;
1053 SRC(0,5)=SRC(2,6)=SRC(4,7)= (l4
+ l5
+ 1) >> 1;
1054 SRC(1,5)=SRC(3,6)=SRC(5,7)= (l3
+ 2*l4
+ l5
+ 2) >> 2;
1055 SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (l3
+ l4
+ 1) >> 1;
1056 SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (l2
+ 2*l3
+ l4
+ 2) >> 2;
1057 SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (l2
+ l3
+ 1) >> 1;
1058 SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (l1
+ 2*l2
+ l3
+ 2) >> 2;
1059 SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (l1
+ l2
+ 1) >> 1;
1060 SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (l0
+ 2*l1
+ l2
+ 2) >> 2;
1061 SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (l0
+ l1
+ 1) >> 1;
1062 SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (lt
+ 2*l0
+ l1
+ 2) >> 2;
1063 SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (lt
+ l0
+ 1) >> 1;
1064 SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (l0
+ 2*lt
+ t0
+ 2) >> 2;
1065 SRC(2,0)=SRC(4,1)=SRC(6,2)= (t1
+ 2*t0
+ lt
+ 2) >> 2;
1066 SRC(3,0)=SRC(5,1)=SRC(7,2)= (t2
+ 2*t1
+ t0
+ 2) >> 2;
1067 SRC(4,0)=SRC(6,1)= (t3
+ 2*t2
+ t1
+ 2) >> 2;
1068 SRC(5,0)=SRC(7,1)= (t4
+ 2*t3
+ t2
+ 2) >> 2;
1069 SRC(6,0)= (t5
+ 2*t4
+ t3
+ 2) >> 2;
1070 SRC(7,0)= (t6
+ 2*t5
+ t4
+ 2) >> 2;
1072 static void FUNCC(pred8x8l_vertical_left
)(uint8_t *_src
, int has_topleft
,
1073 int has_topright
, ptrdiff_t _stride
)
1075 pixel
*src
= (pixel
*)_src
;
1076 int stride
= _stride
>>(sizeof(pixel
)-1);
1077 PREDICT_8x8_LOAD_TOP
;
1078 PREDICT_8x8_LOAD_TOPRIGHT
;
1079 SRC(0,0)= (t0
+ t1
+ 1) >> 1;
1080 SRC(0,1)= (t0
+ 2*t1
+ t2
+ 2) >> 2;
1081 SRC(0,2)=SRC(1,0)= (t1
+ t2
+ 1) >> 1;
1082 SRC(0,3)=SRC(1,1)= (t1
+ 2*t2
+ t3
+ 2) >> 2;
1083 SRC(0,4)=SRC(1,2)=SRC(2,0)= (t2
+ t3
+ 1) >> 1;
1084 SRC(0,5)=SRC(1,3)=SRC(2,1)= (t2
+ 2*t3
+ t4
+ 2) >> 2;
1085 SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= (t3
+ t4
+ 1) >> 1;
1086 SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= (t3
+ 2*t4
+ t5
+ 2) >> 2;
1087 SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= (t4
+ t5
+ 1) >> 1;
1088 SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= (t4
+ 2*t5
+ t6
+ 2) >> 2;
1089 SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= (t5
+ t6
+ 1) >> 1;
1090 SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= (t5
+ 2*t6
+ t7
+ 2) >> 2;
1091 SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= (t6
+ t7
+ 1) >> 1;
1092 SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= (t6
+ 2*t7
+ t8
+ 2) >> 2;
1093 SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= (t7
+ t8
+ 1) >> 1;
1094 SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= (t7
+ 2*t8
+ t9
+ 2) >> 2;
1095 SRC(5,6)=SRC(6,4)=SRC(7,2)= (t8
+ t9
+ 1) >> 1;
1096 SRC(5,7)=SRC(6,5)=SRC(7,3)= (t8
+ 2*t9
+ t10
+ 2) >> 2;
1097 SRC(6,6)=SRC(7,4)= (t9
+ t10
+ 1) >> 1;
1098 SRC(6,7)=SRC(7,5)= (t9
+ 2*t10
+ t11
+ 2) >> 2;
1099 SRC(7,6)= (t10
+ t11
+ 1) >> 1;
1100 SRC(7,7)= (t10
+ 2*t11
+ t12
+ 2) >> 2;
1102 static void FUNCC(pred8x8l_horizontal_up
)(uint8_t *_src
, int has_topleft
,
1103 int has_topright
, ptrdiff_t _stride
)
1105 pixel
*src
= (pixel
*)_src
;
1106 int stride
= _stride
>>(sizeof(pixel
)-1);
1107 PREDICT_8x8_LOAD_LEFT
;
1108 SRC(0,0)= (l0
+ l1
+ 1) >> 1;
1109 SRC(1,0)= (l0
+ 2*l1
+ l2
+ 2) >> 2;
1110 SRC(0,1)=SRC(2,0)= (l1
+ l2
+ 1) >> 1;
1111 SRC(1,1)=SRC(3,0)= (l1
+ 2*l2
+ l3
+ 2) >> 2;
1112 SRC(0,2)=SRC(2,1)=SRC(4,0)= (l2
+ l3
+ 1) >> 1;
1113 SRC(1,2)=SRC(3,1)=SRC(5,0)= (l2
+ 2*l3
+ l4
+ 2) >> 2;
1114 SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (l3
+ l4
+ 1) >> 1;
1115 SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (l3
+ 2*l4
+ l5
+ 2) >> 2;
1116 SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (l4
+ l5
+ 1) >> 1;
1117 SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (l4
+ 2*l5
+ l6
+ 2) >> 2;
1118 SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= (l5
+ l6
+ 1) >> 1;
1119 SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (l5
+ 2*l6
+ l7
+ 2) >> 2;
1120 SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (l6
+ l7
+ 1) >> 1;
1121 SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (l6
+ 3*l7
+ 2) >> 2;
1122 SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)=
1123 SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)=
1124 SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)=
1125 SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7
;
1128 static void FUNCC(pred8x8l_vertical_filter_add
)(uint8_t *_src
, int16_t *_block
, int has_topleft
,
1129 int has_topright
, ptrdiff_t _stride
)
1132 pixel
*src
= (pixel
*)_src
;
1133 const dctcoef
*block
= (const dctcoef
*)_block
;
1135 int stride
= _stride
>>(sizeof(pixel
)-1);
1136 PREDICT_8x8_LOAD_TOP
;
1149 src
[0*stride
]= v
+= block
[0];
1150 src
[1*stride
]= v
+= block
[8];
1151 src
[2*stride
]= v
+= block
[16];
1152 src
[3*stride
]= v
+= block
[24];
1153 src
[4*stride
]= v
+= block
[32];
1154 src
[5*stride
]= v
+= block
[40];
1155 src
[6*stride
]= v
+= block
[48];
1156 src
[7*stride
]= v
+ block
[56];
1161 memset(_block
, 0, sizeof(dctcoef
) * 64);
1164 static void FUNCC(pred8x8l_horizontal_filter_add
)(uint8_t *_src
, int16_t *_block
, int has_topleft
,
1165 int has_topright
, ptrdiff_t _stride
)
1168 pixel
*src
= (pixel
*)_src
;
1169 const dctcoef
*block
= (const dctcoef
*)_block
;
1171 int stride
= _stride
>>(sizeof(pixel
)-1);
1172 PREDICT_8x8_LOAD_LEFT
;
1185 src
[0]= v
+= block
[0];
1186 src
[1]= v
+= block
[1];
1187 src
[2]= v
+= block
[2];
1188 src
[3]= v
+= block
[3];
1189 src
[4]= v
+= block
[4];
1190 src
[5]= v
+= block
[5];
1191 src
[6]= v
+= block
[6];
1192 src
[7]= v
+ block
[7];
1197 memset(_block
, 0, sizeof(dctcoef
) * 64);
1200 #undef PREDICT_8x8_LOAD_LEFT
1201 #undef PREDICT_8x8_LOAD_TOP
1202 #undef PREDICT_8x8_LOAD_TOPLEFT
1203 #undef PREDICT_8x8_LOAD_TOPRIGHT
1204 #undef PREDICT_8x8_DC
1210 static void FUNCC(pred4x4_vertical_add
)(uint8_t *_pix
, int16_t *_block
,
1214 pixel
*pix
= (pixel
*)_pix
;
1215 const dctcoef
*block
= (const dctcoef
*)_block
;
1216 stride
>>= sizeof(pixel
)-1;
1220 pix
[1*stride
]= v
+= block
[0];
1221 pix
[2*stride
]= v
+= block
[4];
1222 pix
[3*stride
]= v
+= block
[8];
1223 pix
[4*stride
]= v
+ block
[12];
1228 memset(_block
, 0, sizeof(dctcoef
) * 16);
1231 static void FUNCC(pred4x4_horizontal_add
)(uint8_t *_pix
, int16_t *_block
,
1235 pixel
*pix
= (pixel
*)_pix
;
1236 const dctcoef
*block
= (const dctcoef
*)_block
;
1237 stride
>>= sizeof(pixel
)-1;
1240 pix
[0]= v
+= block
[0];
1241 pix
[1]= v
+= block
[1];
1242 pix
[2]= v
+= block
[2];
1243 pix
[3]= v
+ block
[3];
1248 memset(_block
, 0, sizeof(dctcoef
) * 16);
1251 static void FUNCC(pred8x8l_vertical_add
)(uint8_t *_pix
, int16_t *_block
,
1255 pixel
*pix
= (pixel
*)_pix
;
1256 const dctcoef
*block
= (const dctcoef
*)_block
;
1257 stride
>>= sizeof(pixel
)-1;
1261 pix
[1*stride
]= v
+= block
[0];
1262 pix
[2*stride
]= v
+= block
[8];
1263 pix
[3*stride
]= v
+= block
[16];
1264 pix
[4*stride
]= v
+= block
[24];
1265 pix
[5*stride
]= v
+= block
[32];
1266 pix
[6*stride
]= v
+= block
[40];
1267 pix
[7*stride
]= v
+= block
[48];
1268 pix
[8*stride
]= v
+ block
[56];
1273 memset(_block
, 0, sizeof(dctcoef
) * 64);
1276 static void FUNCC(pred8x8l_horizontal_add
)(uint8_t *_pix
, int16_t *_block
,
1280 pixel
*pix
= (pixel
*)_pix
;
1281 const dctcoef
*block
= (const dctcoef
*)_block
;
1282 stride
>>= sizeof(pixel
)-1;
1285 pix
[0]= v
+= block
[0];
1286 pix
[1]= v
+= block
[1];
1287 pix
[2]= v
+= block
[2];
1288 pix
[3]= v
+= block
[3];
1289 pix
[4]= v
+= block
[4];
1290 pix
[5]= v
+= block
[5];
1291 pix
[6]= v
+= block
[6];
1292 pix
[7]= v
+ block
[7];
1297 memset(_block
, 0, sizeof(dctcoef
) * 64);
1300 static void FUNCC(pred16x16_vertical_add
)(uint8_t *pix
, const int *block_offset
,
1306 FUNCC(pred4x4_vertical_add
)(pix
+ block_offset
[i
], block
+ i
*16*sizeof(pixel
), stride
);
1309 static void FUNCC(pred16x16_horizontal_add
)(uint8_t *pix
,
1310 const int *block_offset
,
1316 FUNCC(pred4x4_horizontal_add
)(pix
+ block_offset
[i
], block
+ i
*16*sizeof(pixel
), stride
);
1319 static void FUNCC(pred8x8_vertical_add
)(uint8_t *pix
, const int *block_offset
,
1320 int16_t *block
, ptrdiff_t stride
)
1324 FUNCC(pred4x4_vertical_add
)(pix
+ block_offset
[i
], block
+ i
*16*sizeof(pixel
), stride
);
1327 static void FUNCC(pred8x16_vertical_add
)(uint8_t *pix
, const int *block_offset
,
1328 int16_t *block
, ptrdiff_t stride
)
1332 FUNCC(pred4x4_vertical_add
)(pix
+ block_offset
[i
], block
+ i
*16*sizeof(pixel
), stride
);
1334 FUNCC(pred4x4_vertical_add
)(pix
+ block_offset
[i
+4], block
+ i
*16*sizeof(pixel
), stride
);
1337 static void FUNCC(pred8x8_horizontal_add
)(uint8_t *pix
, const int *block_offset
,
1343 FUNCC(pred4x4_horizontal_add
)(pix
+ block_offset
[i
], block
+ i
*16*sizeof(pixel
), stride
);
1346 static void FUNCC(pred8x16_horizontal_add
)(uint8_t *pix
,
1347 const int *block_offset
,
1348 int16_t *block
, ptrdiff_t stride
)
1352 FUNCC(pred4x4_horizontal_add
)(pix
+ block_offset
[i
], block
+ i
*16*sizeof(pixel
), stride
);
1354 FUNCC(pred4x4_horizontal_add
)(pix
+ block_offset
[i
+4], block
+ i
*16*sizeof(pixel
), stride
);