; Imported Debian version 2.4.3~trusty1
; [deb_ffmpeg.git] / ffmpeg / libavcodec / x86 / h264_intrapred_10bit.asm
; (git web-viewer blame artifacts preserved as comments: CommitLineData, commit 2ba45a60, DM)
;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 intra prediction code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

25%include "libavutil/x86/x86util.asm"
26
27SECTION_RODATA
28
29cextern pw_512
30cextern pw_16
31cextern pw_8
32cextern pw_4
33cextern pw_2
34cextern pw_1
35
36pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4
37pw_m3: times 8 dw -3
38pw_pixel_max: times 8 dw ((1 << 10)-1)
39pd_17: times 4 dd 17
40pd_16: times 4 dd 16
41
42SECTION .text
43
44; dest, left, right, src
45; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
46%macro PRED4x4_LOWPASS 4
47 paddw %2, %3
48 psrlw %2, 1
49 pavgw %1, %4, %2
50%endmacro
51
52;-----------------------------------------------------------------------------
53; void ff_pred4x4_down_right(pixel *src, const pixel *topright, int stride)
54;-----------------------------------------------------------------------------
55%macro PRED4x4_DR 0
56cglobal pred4x4_down_right_10, 3, 3
57 sub r0, r2
58 lea r1, [r0+r2*2]
59 movhps m1, [r1-8]
60 movhps m2, [r0+r2*1-8]
61 movhps m4, [r0-8]
62 punpckhwd m2, m4
63 movq m3, [r0]
64 punpckhdq m1, m2
65 PALIGNR m3, m1, 10, m1
66 movhps m4, [r1+r2*1-8]
67 PALIGNR m0, m3, m4, 14, m4
68 movhps m4, [r1+r2*2-8]
69 PALIGNR m2, m0, m4, 14, m4
70 PRED4x4_LOWPASS m0, m2, m3, m0
71 movq [r1+r2*2], m0
72 psrldq m0, 2
73 movq [r1+r2*1], m0
74 psrldq m0, 2
75 movq [r0+r2*2], m0
76 psrldq m0, 2
77 movq [r0+r2*1], m0
78 RET
79%endmacro
80
81INIT_XMM sse2
82PRED4x4_DR
83INIT_XMM ssse3
84PRED4x4_DR
85%if HAVE_AVX_EXTERNAL
86INIT_XMM avx
87PRED4x4_DR
88%endif
89
90;------------------------------------------------------------------------------
91; void ff_pred4x4_vertical_right(pixel *src, const pixel *topright, int stride)
92;------------------------------------------------------------------------------
93%macro PRED4x4_VR 0
94cglobal pred4x4_vertical_right_10, 3, 3, 6
95 sub r0, r2
96 lea r1, [r0+r2*2]
97 movq m5, [r0] ; ........t3t2t1t0
98 movhps m1, [r0-8]
99 PALIGNR m0, m5, m1, 14, m1 ; ......t3t2t1t0lt
100 pavgw m5, m0
101 movhps m1, [r0+r2*1-8]
102 PALIGNR m0, m1, 14, m1 ; ....t3t2t1t0ltl0
103 movhps m2, [r0+r2*2-8]
104 PALIGNR m1, m0, m2, 14, m2 ; ..t3t2t1t0ltl0l1
105 movhps m3, [r1+r2*1-8]
106 PALIGNR m2, m1, m3, 14, m3 ; t3t2t1t0ltl0l1l2
107 PRED4x4_LOWPASS m1, m0, m2, m1
108 pslldq m0, m1, 12
109 psrldq m1, 4
110 movq [r0+r2*1], m5
111 movq [r0+r2*2], m1
112 PALIGNR m5, m0, 14, m2
113 pslldq m0, 2
114 movq [r1+r2*1], m5
115 PALIGNR m1, m0, 14, m0
116 movq [r1+r2*2], m1
117 RET
118%endmacro
119
120INIT_XMM sse2
121PRED4x4_VR
122INIT_XMM ssse3
123PRED4x4_VR
124%if HAVE_AVX_EXTERNAL
125INIT_XMM avx
126PRED4x4_VR
127%endif
128
129;-------------------------------------------------------------------------------
130; void ff_pred4x4_horizontal_down(pixel *src, const pixel *topright, int stride)
131;-------------------------------------------------------------------------------
132%macro PRED4x4_HD 0
133cglobal pred4x4_horizontal_down_10, 3, 3
134 sub r0, r2
135 lea r1, [r0+r2*2]
136 movq m0, [r0-8] ; lt ..
137 movhps m0, [r0]
138 pslldq m0, 2 ; t2 t1 t0 lt .. .. .. ..
139 movq m1, [r1+r2*2-8] ; l3
140 movq m3, [r1+r2*1-8]
141 punpcklwd m1, m3 ; l2 l3
142 movq m2, [r0+r2*2-8] ; l1
143 movq m3, [r0+r2*1-8]
144 punpcklwd m2, m3 ; l0 l1
145 punpckhdq m1, m2 ; l0 l1 l2 l3
146 punpckhqdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
147 psrldq m0, m1, 4 ; .. .. t2 t1 t0 lt l0 l1
148 psrldq m3, m1, 2 ; .. t2 t1 t0 lt l0 l1 l2
149 pavgw m5, m1, m3
150 PRED4x4_LOWPASS m3, m1, m0, m3
151 punpcklwd m5, m3
152 psrldq m3, 8
153 PALIGNR m3, m5, 12, m4
154 movq [r1+r2*2], m5
155 movhps [r0+r2*2], m5
156 psrldq m5, 4
157 movq [r1+r2*1], m5
158 movq [r0+r2*1], m3
159 RET
160%endmacro
161
162INIT_XMM sse2
163PRED4x4_HD
164INIT_XMM ssse3
165PRED4x4_HD
166%if HAVE_AVX_EXTERNAL
167INIT_XMM avx
168PRED4x4_HD
169%endif
170
171;-----------------------------------------------------------------------------
172; void ff_pred4x4_dc(pixel *src, const pixel *topright, int stride)
173;-----------------------------------------------------------------------------
174
175INIT_MMX mmxext
176cglobal pred4x4_dc_10, 3, 3
177 sub r0, r2
178 lea r1, [r0+r2*2]
179 movq m2, [r0+r2*1-8]
180 paddw m2, [r0+r2*2-8]
181 paddw m2, [r1+r2*1-8]
182 paddw m2, [r1+r2*2-8]
183 psrlq m2, 48
184 movq m0, [r0]
185 HADDW m0, m1
186 paddw m0, [pw_4]
187 paddw m0, m2
188 psrlw m0, 3
189 SPLATW m0, m0, 0
190 movq [r0+r2*1], m0
191 movq [r0+r2*2], m0
192 movq [r1+r2*1], m0
193 movq [r1+r2*2], m0
194 RET
195
196;-----------------------------------------------------------------------------
197; void ff_pred4x4_down_left(pixel *src, const pixel *topright, int stride)
198;-----------------------------------------------------------------------------
199%macro PRED4x4_DL 0
200cglobal pred4x4_down_left_10, 3, 3
201 sub r0, r2
202 movq m0, [r0]
203 movhps m0, [r1]
204 psrldq m2, m0, 2
205 pslldq m3, m0, 2
206 pshufhw m2, m2, 10100100b
207 PRED4x4_LOWPASS m0, m3, m2, m0
208 lea r1, [r0+r2*2]
209 movhps [r1+r2*2], m0
210 psrldq m0, 2
211 movq [r0+r2*1], m0
212 psrldq m0, 2
213 movq [r0+r2*2], m0
214 psrldq m0, 2
215 movq [r1+r2*1], m0
216 RET
217%endmacro
218
219INIT_XMM sse2
220PRED4x4_DL
221%if HAVE_AVX_EXTERNAL
222INIT_XMM avx
223PRED4x4_DL
224%endif
225
226;-----------------------------------------------------------------------------
227; void ff_pred4x4_vertical_left(pixel *src, const pixel *topright, int stride)
228;-----------------------------------------------------------------------------
229%macro PRED4x4_VL 0
230cglobal pred4x4_vertical_left_10, 3, 3
231 sub r0, r2
232 movu m1, [r0]
233 movhps m1, [r1]
234 psrldq m0, m1, 2
235 psrldq m2, m1, 4
236 pavgw m4, m0, m1
237 PRED4x4_LOWPASS m0, m1, m2, m0
238 lea r1, [r0+r2*2]
239 movq [r0+r2*1], m4
240 movq [r0+r2*2], m0
241 psrldq m4, 2
242 psrldq m0, 2
243 movq [r1+r2*1], m4
244 movq [r1+r2*2], m0
245 RET
246%endmacro
247
248INIT_XMM sse2
249PRED4x4_VL
250%if HAVE_AVX_EXTERNAL
251INIT_XMM avx
252PRED4x4_VL
253%endif
254
255;-----------------------------------------------------------------------------
256; void ff_pred4x4_horizontal_up(pixel *src, const pixel *topright, int stride)
257;-----------------------------------------------------------------------------
258INIT_MMX mmxext
259cglobal pred4x4_horizontal_up_10, 3, 3
260 sub r0, r2
261 lea r1, [r0+r2*2]
262 movq m0, [r0+r2*1-8]
263 punpckhwd m0, [r0+r2*2-8]
264 movq m1, [r1+r2*1-8]
265 punpckhwd m1, [r1+r2*2-8]
266 punpckhdq m0, m1
267 pshufw m1, m1, 0xFF
268 movq [r1+r2*2], m1
269 movd [r1+r2*1+4], m1
270 pshufw m2, m0, 11111001b
271 movq m1, m2
272 pavgw m2, m0
273
274 pshufw m5, m0, 11111110b
275 PRED4x4_LOWPASS m1, m0, m5, m1
276 movq m6, m2
277 punpcklwd m6, m1
278 movq [r0+r2*1], m6
279 psrlq m2, 16
280 psrlq m1, 16
281 punpcklwd m2, m1
282 movq [r0+r2*2], m2
283 psrlq m2, 32
284 movd [r1+r2*1], m2
285 RET
286
287
288
289;-----------------------------------------------------------------------------
290; void ff_pred8x8_vertical(pixel *src, int stride)
291;-----------------------------------------------------------------------------
292INIT_XMM sse2
293cglobal pred8x8_vertical_10, 2, 2
294 sub r0, r1
295 mova m0, [r0]
296%rep 3
297 mova [r0+r1*1], m0
298 mova [r0+r1*2], m0
299 lea r0, [r0+r1*2]
300%endrep
301 mova [r0+r1*1], m0
302 mova [r0+r1*2], m0
303 RET
304
305;-----------------------------------------------------------------------------
306; void ff_pred8x8_horizontal(pixel *src, int stride)
307;-----------------------------------------------------------------------------
308INIT_XMM sse2
309cglobal pred8x8_horizontal_10, 2, 3
310 mov r2d, 4
311.loop:
312 movq m0, [r0+r1*0-8]
313 movq m1, [r0+r1*1-8]
314 pshuflw m0, m0, 0xff
315 pshuflw m1, m1, 0xff
316 punpcklqdq m0, m0
317 punpcklqdq m1, m1
318 mova [r0+r1*0], m0
319 mova [r0+r1*1], m1
320 lea r0, [r0+r1*2]
321 dec r2d
322 jg .loop
323 REP_RET
324
325;-----------------------------------------------------------------------------
326; void ff_predict_8x8_dc(pixel *src, int stride)
327;-----------------------------------------------------------------------------
328%macro MOV8 2-3
329; sort of a hack, but it works
330%if mmsize==8
331 movq [%1+0], %2
332 movq [%1+8], %3
333%else
334 movdqa [%1], %2
335%endif
336%endmacro
337
338%macro PRED8x8_DC 1
339cglobal pred8x8_dc_10, 2, 6
340 sub r0, r1
341 pxor m4, m4
342 movq m0, [r0+0]
343 movq m1, [r0+8]
344%if mmsize==16
345 punpcklwd m0, m1
346 movhlps m1, m0
347 paddw m0, m1
348%else
349 pshufw m2, m0, 00001110b
350 pshufw m3, m1, 00001110b
351 paddw m0, m2
352 paddw m1, m3
353 punpcklwd m0, m1
354%endif
355 %1 m2, m0, 00001110b
356 paddw m0, m2
357
358 lea r5, [r1*3]
359 lea r4, [r0+r1*4]
360 movzx r2d, word [r0+r1*1-2]
361 movzx r3d, word [r0+r1*2-2]
362 add r2d, r3d
363 movzx r3d, word [r0+r5*1-2]
364 add r2d, r3d
365 movzx r3d, word [r4-2]
366 add r2d, r3d
367 movd m2, r2d ; s2
368
369 movzx r2d, word [r4+r1*1-2]
370 movzx r3d, word [r4+r1*2-2]
371 add r2d, r3d
372 movzx r3d, word [r4+r5*1-2]
373 add r2d, r3d
374 movzx r3d, word [r4+r1*4-2]
375 add r2d, r3d
376 movd m3, r2d ; s3
377
378 punpcklwd m2, m3
379 punpckldq m0, m2 ; s0, s1, s2, s3
380 %1 m3, m0, 11110110b ; s2, s1, s3, s3
381 %1 m0, m0, 01110100b ; s0, s1, s3, s1
382 paddw m0, m3
383 psrlw m0, 2
384 pavgw m0, m4 ; s0+s2, s1, s3, s1+s3
385%if mmsize==16
386 punpcklwd m0, m0
387 pshufd m3, m0, 11111010b
388 punpckldq m0, m0
389 SWAP 0,1
390%else
391 pshufw m1, m0, 0x00
392 pshufw m2, m0, 0x55
393 pshufw m3, m0, 0xaa
394 pshufw m4, m0, 0xff
395%endif
396 MOV8 r0+r1*1, m1, m2
397 MOV8 r0+r1*2, m1, m2
398 MOV8 r0+r5*1, m1, m2
399 MOV8 r0+r1*4, m1, m2
400 MOV8 r4+r1*1, m3, m4
401 MOV8 r4+r1*2, m3, m4
402 MOV8 r4+r5*1, m3, m4
403 MOV8 r4+r1*4, m3, m4
404 RET
405%endmacro
406
407INIT_MMX mmxext
408PRED8x8_DC pshufw
409INIT_XMM sse2
410PRED8x8_DC pshuflw
411
412;-----------------------------------------------------------------------------
413; void ff_pred8x8_top_dc(pixel *src, int stride)
414;-----------------------------------------------------------------------------
415INIT_XMM sse2
416cglobal pred8x8_top_dc_10, 2, 4
417 sub r0, r1
418 mova m0, [r0]
419 pshuflw m1, m0, 0x4e
420 pshufhw m1, m1, 0x4e
421 paddw m0, m1
422 pshuflw m1, m0, 0xb1
423 pshufhw m1, m1, 0xb1
424 paddw m0, m1
425 lea r2, [r1*3]
426 lea r3, [r0+r1*4]
427 paddw m0, [pw_2]
428 psrlw m0, 2
429 mova [r0+r1*1], m0
430 mova [r0+r1*2], m0
431 mova [r0+r2*1], m0
432 mova [r0+r1*4], m0
433 mova [r3+r1*1], m0
434 mova [r3+r1*2], m0
435 mova [r3+r2*1], m0
436 mova [r3+r1*4], m0
437 RET
438
439;-----------------------------------------------------------------------------
440; void ff_pred8x8_plane(pixel *src, int stride)
441;-----------------------------------------------------------------------------
442INIT_XMM sse2
443cglobal pred8x8_plane_10, 2, 7, 7
444 sub r0, r1
445 lea r2, [r1*3]
446 lea r3, [r0+r1*4]
447 mova m2, [r0]
448 pmaddwd m2, [pw_m32101234]
449 HADDD m2, m1
450 movd m0, [r0-4]
451 psrld m0, 14
452 psubw m2, m0 ; H
453 movd m0, [r3+r1*4-4]
454 movd m1, [r0+12]
455 paddw m0, m1
456 psllw m0, 4 ; 16*(src[7*stride-1] + src[-stride+7])
457 movzx r4d, word [r3+r1*1-2] ; src[4*stride-1]
458 movzx r5d, word [r0+r2*1-2] ; src[2*stride-1]
459 sub r4d, r5d
460 movzx r6d, word [r3+r1*2-2] ; src[5*stride-1]
461 movzx r5d, word [r0+r1*2-2] ; src[1*stride-1]
462 sub r6d, r5d
463 lea r4d, [r4+r6*2]
464 movzx r5d, word [r3+r2*1-2] ; src[6*stride-1]
465 movzx r6d, word [r0+r1*1-2] ; src[0*stride-1]
466 sub r5d, r6d
467 lea r5d, [r5*3]
468 add r4d, r5d
469 movzx r6d, word [r3+r1*4-2] ; src[7*stride-1]
470 movzx r5d, word [r0+r1*0-2] ; src[ -stride-1]
471 sub r6d, r5d
472 lea r4d, [r4+r6*4]
473 movd m3, r4d ; V
474 punpckldq m2, m3
475 pmaddwd m2, [pd_17]
476 paddd m2, [pd_16]
477 psrad m2, 5 ; b, c
478
479 mova m3, [pw_pixel_max]
480 pxor m1, m1
481 SPLATW m0, m0, 1
482 SPLATW m4, m2, 2
483 SPLATW m2, m2, 0
484 pmullw m2, [pw_m32101234] ; b
485 pmullw m5, m4, [pw_m3] ; c
486 paddw m5, [pw_16]
487 mov r2d, 8
488 add r0, r1
489.loop:
490 paddsw m6, m2, m5
491 paddsw m6, m0
492 psraw m6, 5
493 CLIPW m6, m1, m3
494 mova [r0], m6
495 paddw m5, m4
496 add r0, r1
497 dec r2d
498 jg .loop
499 REP_RET
500
501
502;-----------------------------------------------------------------------------
503; void ff_pred8x8l_128_dc(pixel *src, int has_topleft, int has_topright,
504; int stride)
505;-----------------------------------------------------------------------------
506%macro PRED8x8L_128_DC 0
507cglobal pred8x8l_128_dc_10, 4, 4
508 mova m0, [pw_512] ; (1<<(BIT_DEPTH-1))
509 lea r1, [r3*3]
510 lea r2, [r0+r3*4]
511 MOV8 r0+r3*0, m0, m0
512 MOV8 r0+r3*1, m0, m0
513 MOV8 r0+r3*2, m0, m0
514 MOV8 r0+r1*1, m0, m0
515 MOV8 r2+r3*0, m0, m0
516 MOV8 r2+r3*1, m0, m0
517 MOV8 r2+r3*2, m0, m0
518 MOV8 r2+r1*1, m0, m0
519 RET
520%endmacro
521
522INIT_MMX mmxext
523PRED8x8L_128_DC
524INIT_XMM sse2
525PRED8x8L_128_DC
526
527;-----------------------------------------------------------------------------
528; void ff_pred8x8l_top_dc(pixel *src, int has_topleft, int has_topright,
529; int stride)
530;-----------------------------------------------------------------------------
531%macro PRED8x8L_TOP_DC 0
532cglobal pred8x8l_top_dc_10, 4, 4, 6
533 sub r0, r3
534 mova m0, [r0]
535 shr r1d, 14
536 shr r2d, 13
537 neg r1
538 pslldq m1, m0, 2
539 psrldq m2, m0, 2
540 pinsrw m1, [r0+r1], 0
541 pinsrw m2, [r0+r2+14], 7
542 lea r1, [r3*3]
543 lea r2, [r0+r3*4]
544 PRED4x4_LOWPASS m0, m2, m1, m0
545 HADDW m0, m1
546 paddw m0, [pw_4]
547 psrlw m0, 3
548 SPLATW m0, m0, 0
549 mova [r0+r3*1], m0
550 mova [r0+r3*2], m0
551 mova [r0+r1*1], m0
552 mova [r0+r3*4], m0
553 mova [r2+r3*1], m0
554 mova [r2+r3*2], m0
555 mova [r2+r1*1], m0
556 mova [r2+r3*4], m0
557 RET
558%endmacro
559
560INIT_XMM sse2
561PRED8x8L_TOP_DC
562%if HAVE_AVX_EXTERNAL
563INIT_XMM avx
564PRED8x8L_TOP_DC
565%endif
566
567;-------------------------------------------------------------------------------
568; void ff_pred8x8l_dc(pixel *src, int has_topleft, int has_topright, int stride)
569;-------------------------------------------------------------------------------
570;TODO: see if scalar is faster
571%macro PRED8x8L_DC 0
572cglobal pred8x8l_dc_10, 4, 6, 6
573 sub r0, r3
574 lea r4, [r0+r3*4]
575 lea r5, [r3*3]
576 mova m0, [r0+r3*2-16]
577 punpckhwd m0, [r0+r3*1-16]
578 mova m1, [r4+r3*0-16]
579 punpckhwd m1, [r0+r5*1-16]
580 punpckhdq m1, m0
581 mova m2, [r4+r3*2-16]
582 punpckhwd m2, [r4+r3*1-16]
583 mova m3, [r4+r3*4-16]
584 punpckhwd m3, [r4+r5*1-16]
585 punpckhdq m3, m2
586 punpckhqdq m3, m1
587 mova m0, [r0]
588 shr r1d, 14
589 shr r2d, 13
590 neg r1
591 pslldq m1, m0, 2
592 psrldq m2, m0, 2
593 pinsrw m1, [r0+r1], 0
594 pinsrw m2, [r0+r2+14], 7
595 not r1
596 and r1, r3
597 pslldq m4, m3, 2
598 psrldq m5, m3, 2
599 pshuflw m4, m4, 11100101b
600 pinsrw m5, [r0+r1-2], 7
601 PRED4x4_LOWPASS m3, m4, m5, m3
602 PRED4x4_LOWPASS m0, m2, m1, m0
603 paddw m0, m3
604 HADDW m0, m1
605 paddw m0, [pw_8]
606 psrlw m0, 4
607 SPLATW m0, m0
608 mova [r0+r3*1], m0
609 mova [r0+r3*2], m0
610 mova [r0+r5*1], m0
611 mova [r0+r3*4], m0
612 mova [r4+r3*1], m0
613 mova [r4+r3*2], m0
614 mova [r4+r5*1], m0
615 mova [r4+r3*4], m0
616 RET
617%endmacro
618
619INIT_XMM sse2
620PRED8x8L_DC
621%if HAVE_AVX_EXTERNAL
622INIT_XMM avx
623PRED8x8L_DC
624%endif
625
626;-----------------------------------------------------------------------------
627; void ff_pred8x8l_vertical(pixel *src, int has_topleft, int has_topright,
628; int stride)
629;-----------------------------------------------------------------------------
630%macro PRED8x8L_VERTICAL 0
631cglobal pred8x8l_vertical_10, 4, 4, 6
632 sub r0, r3
633 mova m0, [r0]
634 shr r1d, 14
635 shr r2d, 13
636 neg r1
637 pslldq m1, m0, 2
638 psrldq m2, m0, 2
639 pinsrw m1, [r0+r1], 0
640 pinsrw m2, [r0+r2+14], 7
641 lea r1, [r3*3]
642 lea r2, [r0+r3*4]
643 PRED4x4_LOWPASS m0, m2, m1, m0
644 mova [r0+r3*1], m0
645 mova [r0+r3*2], m0
646 mova [r0+r1*1], m0
647 mova [r0+r3*4], m0
648 mova [r2+r3*1], m0
649 mova [r2+r3*2], m0
650 mova [r2+r1*1], m0
651 mova [r2+r3*4], m0
652 RET
653%endmacro
654
655INIT_XMM sse2
656PRED8x8L_VERTICAL
657%if HAVE_AVX_EXTERNAL
658INIT_XMM avx
659PRED8x8L_VERTICAL
660%endif
661
662;-----------------------------------------------------------------------------
663; void ff_pred8x8l_horizontal(uint8_t *src, int has_topleft, int has_topright,
664; int stride)
665;-----------------------------------------------------------------------------
666%macro PRED8x8L_HORIZONTAL 0
667cglobal pred8x8l_horizontal_10, 4, 4, 5
668 mova m0, [r0-16]
669 shr r1d, 14
670 dec r1
671 and r1, r3
672 sub r1, r3
673 punpckhwd m0, [r0+r1-16]
674 mova m1, [r0+r3*2-16]
675 punpckhwd m1, [r0+r3*1-16]
676 lea r2, [r0+r3*4]
677 lea r1, [r3*3]
678 punpckhdq m1, m0
679 mova m2, [r2+r3*0-16]
680 punpckhwd m2, [r0+r1-16]
681 mova m3, [r2+r3*2-16]
682 punpckhwd m3, [r2+r3*1-16]
683 punpckhdq m3, m2
684 punpckhqdq m3, m1
685 PALIGNR m4, m3, [r2+r1-16], 14, m0
686 pslldq m0, m4, 2
687 pshuflw m0, m0, 11100101b
688 PRED4x4_LOWPASS m4, m3, m0, m4
689 punpckhwd m3, m4, m4
690 punpcklwd m4, m4
691 pshufd m0, m3, 0xff
692 pshufd m1, m3, 0xaa
693 pshufd m2, m3, 0x55
694 pshufd m3, m3, 0x00
695 mova [r0+r3*0], m0
696 mova [r0+r3*1], m1
697 mova [r0+r3*2], m2
698 mova [r0+r1*1], m3
699 pshufd m0, m4, 0xff
700 pshufd m1, m4, 0xaa
701 pshufd m2, m4, 0x55
702 pshufd m3, m4, 0x00
703 mova [r2+r3*0], m0
704 mova [r2+r3*1], m1
705 mova [r2+r3*2], m2
706 mova [r2+r1*1], m3
707 RET
708%endmacro
709
710INIT_XMM sse2
711PRED8x8L_HORIZONTAL
712INIT_XMM ssse3
713PRED8x8L_HORIZONTAL
714%if HAVE_AVX_EXTERNAL
715INIT_XMM avx
716PRED8x8L_HORIZONTAL
717%endif
718
719;-----------------------------------------------------------------------------
720; void ff_pred8x8l_down_left(pixel *src, int has_topleft, int has_topright,
721; int stride)
722;-----------------------------------------------------------------------------
723%macro PRED8x8L_DOWN_LEFT 0
724cglobal pred8x8l_down_left_10, 4, 4, 7
725 sub r0, r3
726 mova m3, [r0]
727 shr r1d, 14
728 neg r1
729 shr r2d, 13
730 pslldq m1, m3, 2
731 psrldq m2, m3, 2
732 pinsrw m1, [r0+r1], 0
733 pinsrw m2, [r0+r2+14], 7
734 PRED4x4_LOWPASS m6, m2, m1, m3
735 jz .fix_tr ; flags from shr r2d
736 mova m1, [r0+16]
737 psrldq m5, m1, 2
738 PALIGNR m2, m1, m3, 14, m3
739 pshufhw m5, m5, 10100100b
740 PRED4x4_LOWPASS m1, m2, m5, m1
741.do_topright:
742 lea r1, [r3*3]
743 psrldq m5, m1, 14
744 lea r2, [r0+r3*4]
745 PALIGNR m2, m1, m6, 2, m0
746 PALIGNR m3, m1, m6, 14, m0
747 PALIGNR m5, m1, 2, m0
748 pslldq m4, m6, 2
749 PRED4x4_LOWPASS m6, m4, m2, m6
750 PRED4x4_LOWPASS m1, m3, m5, m1
751 mova [r2+r3*4], m1
752 PALIGNR m1, m6, 14, m2
753 pslldq m6, 2
754 mova [r2+r1*1], m1
755 PALIGNR m1, m6, 14, m2
756 pslldq m6, 2
757 mova [r2+r3*2], m1
758 PALIGNR m1, m6, 14, m2
759 pslldq m6, 2
760 mova [r2+r3*1], m1
761 PALIGNR m1, m6, 14, m2
762 pslldq m6, 2
763 mova [r0+r3*4], m1
764 PALIGNR m1, m6, 14, m2
765 pslldq m6, 2
766 mova [r0+r1*1], m1
767 PALIGNR m1, m6, 14, m2
768 pslldq m6, 2
769 mova [r0+r3*2], m1
770 PALIGNR m1, m6, 14, m6
771 mova [r0+r3*1], m1
772 RET
773.fix_tr:
774 punpckhwd m3, m3
775 pshufd m1, m3, 0xFF
776 jmp .do_topright
777%endmacro
778
779INIT_XMM sse2
780PRED8x8L_DOWN_LEFT
781INIT_XMM ssse3
782PRED8x8L_DOWN_LEFT
783%if HAVE_AVX_EXTERNAL
784INIT_XMM avx
785PRED8x8L_DOWN_LEFT
786%endif
787
788;-----------------------------------------------------------------------------
789; void ff_pred8x8l_down_right(pixel *src, int has_topleft, int has_topright,
790; int stride)
791;-----------------------------------------------------------------------------
792%macro PRED8x8L_DOWN_RIGHT 0
793; standard forbids this when has_topleft is false
794; no need to check
795cglobal pred8x8l_down_right_10, 4, 5, 8
796 sub r0, r3
797 lea r4, [r0+r3*4]
798 lea r1, [r3*3]
799 mova m0, [r0+r3*1-16]
800 punpckhwd m0, [r0+r3*0-16]
801 mova m1, [r0+r1*1-16]
802 punpckhwd m1, [r0+r3*2-16]
803 punpckhdq m1, m0
804 mova m2, [r4+r3*1-16]
805 punpckhwd m2, [r4+r3*0-16]
806 mova m3, [r4+r1*1-16]
807 punpckhwd m3, [r4+r3*2-16]
808 punpckhdq m3, m2
809 punpckhqdq m3, m1
810 mova m0, [r4+r3*4-16]
811 mova m1, [r0]
812 PALIGNR m4, m3, m0, 14, m0
813 PALIGNR m1, m3, 2, m2
814 pslldq m0, m4, 2
815 pshuflw m0, m0, 11100101b
816 PRED4x4_LOWPASS m6, m1, m4, m3
817 PRED4x4_LOWPASS m4, m3, m0, m4
818 mova m3, [r0]
819 shr r2d, 13
820 pslldq m1, m3, 2
821 psrldq m2, m3, 2
822 pinsrw m1, [r0-2], 0
823 pinsrw m2, [r0+r2+14], 7
824 PRED4x4_LOWPASS m3, m2, m1, m3
825 PALIGNR m2, m3, m6, 2, m0
826 PALIGNR m5, m3, m6, 14, m0
827 psrldq m7, m3, 2
828 PRED4x4_LOWPASS m6, m4, m2, m6
829 PRED4x4_LOWPASS m3, m5, m7, m3
830 mova [r4+r3*4], m6
831 PALIGNR m3, m6, 14, m2
832 pslldq m6, 2
833 mova [r0+r3*1], m3
834 PALIGNR m3, m6, 14, m2
835 pslldq m6, 2
836 mova [r0+r3*2], m3
837 PALIGNR m3, m6, 14, m2
838 pslldq m6, 2
839 mova [r0+r1*1], m3
840 PALIGNR m3, m6, 14, m2
841 pslldq m6, 2
842 mova [r0+r3*4], m3
843 PALIGNR m3, m6, 14, m2
844 pslldq m6, 2
845 mova [r4+r3*1], m3
846 PALIGNR m3, m6, 14, m2
847 pslldq m6, 2
848 mova [r4+r3*2], m3
849 PALIGNR m3, m6, 14, m6
850 mova [r4+r1*1], m3
851 RET
852%endmacro
853
854INIT_XMM sse2
855PRED8x8L_DOWN_RIGHT
856INIT_XMM ssse3
857PRED8x8L_DOWN_RIGHT
858%if HAVE_AVX_EXTERNAL
859INIT_XMM avx
860PRED8x8L_DOWN_RIGHT
861%endif
862
863;-----------------------------------------------------------------------------
864; void ff_pred8x8l_vertical_right(pixel *src, int has_topleft,
865; int has_topright, int stride)
866;-----------------------------------------------------------------------------
867%macro PRED8x8L_VERTICAL_RIGHT 0
868; likewise with 8x8l_down_right
869cglobal pred8x8l_vertical_right_10, 4, 5, 7
870 sub r0, r3
871 lea r4, [r0+r3*4]
872 lea r1, [r3*3]
873 mova m0, [r0+r3*1-16]
874 punpckhwd m0, [r0+r3*0-16]
875 mova m1, [r0+r1*1-16]
876 punpckhwd m1, [r0+r3*2-16]
877 punpckhdq m1, m0
878 mova m2, [r4+r3*1-16]
879 punpckhwd m2, [r4+r3*0-16]
880 mova m3, [r4+r1*1-16]
881 punpckhwd m3, [r4+r3*2-16]
882 punpckhdq m3, m2
883 punpckhqdq m3, m1
884 mova m0, [r4+r3*4-16]
885 mova m1, [r0]
886 PALIGNR m4, m3, m0, 14, m0
887 PALIGNR m1, m3, 2, m2
888 PRED4x4_LOWPASS m3, m1, m4, m3
889 mova m2, [r0]
890 shr r2d, 13
891 pslldq m1, m2, 2
892 psrldq m5, m2, 2
893 pinsrw m1, [r0-2], 0
894 pinsrw m5, [r0+r2+14], 7
895 PRED4x4_LOWPASS m2, m5, m1, m2
896 PALIGNR m6, m2, m3, 12, m1
897 PALIGNR m5, m2, m3, 14, m0
898 PRED4x4_LOWPASS m0, m6, m2, m5
899 pavgw m2, m5
900 mova [r0+r3*2], m0
901 mova [r0+r3*1], m2
902 pslldq m6, m3, 4
903 pslldq m1, m3, 2
904 PRED4x4_LOWPASS m1, m3, m6, m1
905 PALIGNR m2, m1, 14, m4
906 mova [r0+r1*1], m2
907 pslldq m1, 2
908 PALIGNR m0, m1, 14, m3
909 mova [r0+r3*4], m0
910 pslldq m1, 2
911 PALIGNR m2, m1, 14, m4
912 mova [r4+r3*1], m2
913 pslldq m1, 2
914 PALIGNR m0, m1, 14, m3
915 mova [r4+r3*2], m0
916 pslldq m1, 2
917 PALIGNR m2, m1, 14, m4
918 mova [r4+r1*1], m2
919 pslldq m1, 2
920 PALIGNR m0, m1, 14, m1
921 mova [r4+r3*4], m0
922 RET
923%endmacro
924
925INIT_XMM sse2
926PRED8x8L_VERTICAL_RIGHT
927INIT_XMM ssse3
928PRED8x8L_VERTICAL_RIGHT
929%if HAVE_AVX_EXTERNAL
930INIT_XMM avx
931PRED8x8L_VERTICAL_RIGHT
932%endif
933
934;-----------------------------------------------------------------------------
935; void ff_pred8x8l_horizontal_up(pixel *src, int has_topleft,
936; int has_topright, int stride)
937;-----------------------------------------------------------------------------
938%macro PRED8x8L_HORIZONTAL_UP 0
939cglobal pred8x8l_horizontal_up_10, 4, 4, 6
940 mova m0, [r0+r3*0-16]
941 punpckhwd m0, [r0+r3*1-16]
942 shr r1d, 14
943 dec r1
944 and r1, r3
945 sub r1, r3
946 mova m4, [r0+r1*1-16]
947 lea r1, [r3*3]
948 lea r2, [r0+r3*4]
949 mova m1, [r0+r3*2-16]
950 punpckhwd m1, [r0+r1*1-16]
951 punpckhdq m0, m1
952 mova m2, [r2+r3*0-16]
953 punpckhwd m2, [r2+r3*1-16]
954 mova m3, [r2+r3*2-16]
955 punpckhwd m3, [r2+r1*1-16]
956 punpckhdq m2, m3
957 punpckhqdq m0, m2
958 PALIGNR m1, m0, m4, 14, m4
959 psrldq m2, m0, 2
960 pshufhw m2, m2, 10100100b
961 PRED4x4_LOWPASS m0, m1, m2, m0
962 psrldq m1, m0, 2
963 psrldq m2, m0, 4
964 pshufhw m1, m1, 10100100b
965 pshufhw m2, m2, 01010100b
966 pavgw m4, m0, m1
967 PRED4x4_LOWPASS m1, m2, m0, m1
968 punpckhwd m5, m4, m1
969 punpcklwd m4, m1
970 mova [r2+r3*0], m5
971 mova [r0+r3*0], m4
972 pshufd m0, m5, 11111001b
973 pshufd m1, m5, 11111110b
974 pshufd m2, m5, 11111111b
975 mova [r2+r3*1], m0
976 mova [r2+r3*2], m1
977 mova [r2+r1*1], m2
978 PALIGNR m2, m5, m4, 4, m0
979 PALIGNR m3, m5, m4, 8, m1
980 PALIGNR m5, m5, m4, 12, m4
981 mova [r0+r3*1], m2
982 mova [r0+r3*2], m3
983 mova [r0+r1*1], m5
984 RET
985%endmacro
986
987INIT_XMM sse2
988PRED8x8L_HORIZONTAL_UP
989INIT_XMM ssse3
990PRED8x8L_HORIZONTAL_UP
991%if HAVE_AVX_EXTERNAL
992INIT_XMM avx
993PRED8x8L_HORIZONTAL_UP
994%endif
995
996
997;-----------------------------------------------------------------------------
998; void ff_pred16x16_vertical(pixel *src, int stride)
999;-----------------------------------------------------------------------------
1000%macro MOV16 3-5
1001 mova [%1+ 0], %2
1002 mova [%1+mmsize], %3
1003%if mmsize==8
1004 mova [%1+ 16], %4
1005 mova [%1+ 24], %5
1006%endif
1007%endmacro
1008
1009%macro PRED16x16_VERTICAL 0
1010cglobal pred16x16_vertical_10, 2, 3
1011 sub r0, r1
1012 mov r2d, 8
1013 mova m0, [r0+ 0]
1014 mova m1, [r0+mmsize]
1015%if mmsize==8
1016 mova m2, [r0+16]
1017 mova m3, [r0+24]
1018%endif
1019.loop:
1020 MOV16 r0+r1*1, m0, m1, m2, m3
1021 MOV16 r0+r1*2, m0, m1, m2, m3
1022 lea r0, [r0+r1*2]
1023 dec r2d
1024 jg .loop
1025 REP_RET
1026%endmacro
1027
1028INIT_MMX mmxext
1029PRED16x16_VERTICAL
1030INIT_XMM sse2
1031PRED16x16_VERTICAL
1032
1033;-----------------------------------------------------------------------------
1034; void ff_pred16x16_horizontal(pixel *src, int stride)
1035;-----------------------------------------------------------------------------
1036%macro PRED16x16_HORIZONTAL 0
1037cglobal pred16x16_horizontal_10, 2, 3
1038 mov r2d, 8
1039.vloop:
1040 movd m0, [r0+r1*0-4]
1041 movd m1, [r0+r1*1-4]
1042 SPLATW m0, m0, 1
1043 SPLATW m1, m1, 1
1044 MOV16 r0+r1*0, m0, m0, m0, m0
1045 MOV16 r0+r1*1, m1, m1, m1, m1
1046 lea r0, [r0+r1*2]
1047 dec r2d
1048 jg .vloop
1049 REP_RET
1050%endmacro
1051
1052INIT_MMX mmxext
1053PRED16x16_HORIZONTAL
1054INIT_XMM sse2
1055PRED16x16_HORIZONTAL
1056
1057;-----------------------------------------------------------------------------
1058; void ff_pred16x16_dc(pixel *src, int stride)
1059;-----------------------------------------------------------------------------
1060%macro PRED16x16_DC 0
1061cglobal pred16x16_dc_10, 2, 6
1062 mov r5, r0
1063 sub r0, r1
1064 mova m0, [r0+0]
1065 paddw m0, [r0+mmsize]
1066%if mmsize==8
1067 paddw m0, [r0+16]
1068 paddw m0, [r0+24]
1069%endif
1070 HADDW m0, m2
1071
1072 lea r0, [r0+r1-2]
1073 movzx r3d, word [r0]
1074 movzx r4d, word [r0+r1]
1075%rep 7
1076 lea r0, [r0+r1*2]
1077 movzx r2d, word [r0]
1078 add r3d, r2d
1079 movzx r2d, word [r0+r1]
1080 add r4d, r2d
1081%endrep
1082 lea r3d, [r3+r4+16]
1083
1084 movd m1, r3d
1085 paddw m0, m1
1086 psrlw m0, 5
1087 SPLATW m0, m0
1088 mov r3d, 8
1089.loop:
1090 MOV16 r5+r1*0, m0, m0, m0, m0
1091 MOV16 r5+r1*1, m0, m0, m0, m0
1092 lea r5, [r5+r1*2]
1093 dec r3d
1094 jg .loop
1095 REP_RET
1096%endmacro
1097
1098INIT_MMX mmxext
1099PRED16x16_DC
1100INIT_XMM sse2
1101PRED16x16_DC
1102
1103;-----------------------------------------------------------------------------
1104; void ff_pred16x16_top_dc(pixel *src, int stride)
1105;-----------------------------------------------------------------------------
1106%macro PRED16x16_TOP_DC 0
1107cglobal pred16x16_top_dc_10, 2, 3
1108 sub r0, r1
1109 mova m0, [r0+0]
1110 paddw m0, [r0+mmsize]
1111%if mmsize==8
1112 paddw m0, [r0+16]
1113 paddw m0, [r0+24]
1114%endif
1115 HADDW m0, m2
1116
1117 SPLATW m0, m0
1118 paddw m0, [pw_8]
1119 psrlw m0, 4
1120 mov r2d, 8
1121.loop:
1122 MOV16 r0+r1*1, m0, m0, m0, m0
1123 MOV16 r0+r1*2, m0, m0, m0, m0
1124 lea r0, [r0+r1*2]
1125 dec r2d
1126 jg .loop
1127 REP_RET
1128%endmacro
1129
1130INIT_MMX mmxext
1131PRED16x16_TOP_DC
1132INIT_XMM sse2
1133PRED16x16_TOP_DC
1134
1135;-----------------------------------------------------------------------------
1136; void ff_pred16x16_left_dc(pixel *src, int stride)
1137;-----------------------------------------------------------------------------
1138%macro PRED16x16_LEFT_DC 0
1139cglobal pred16x16_left_dc_10, 2, 6
1140 mov r5, r0
1141
1142 sub r0, 2
1143 movzx r3d, word [r0]
1144 movzx r4d, word [r0+r1]
1145%rep 7
1146 lea r0, [r0+r1*2]
1147 movzx r2d, word [r0]
1148 add r3d, r2d
1149 movzx r2d, word [r0+r1]
1150 add r4d, r2d
1151%endrep
1152 lea r3d, [r3+r4+8]
1153 shr r3d, 4
1154
1155 movd m0, r3d
1156 SPLATW m0, m0
1157 mov r3d, 8
1158.loop:
1159 MOV16 r5+r1*0, m0, m0, m0, m0
1160 MOV16 r5+r1*1, m0, m0, m0, m0
1161 lea r5, [r5+r1*2]
1162 dec r3d
1163 jg .loop
1164 REP_RET
1165%endmacro
1166
1167INIT_MMX mmxext
1168PRED16x16_LEFT_DC
1169INIT_XMM sse2
1170PRED16x16_LEFT_DC
1171
1172;-----------------------------------------------------------------------------
1173; void ff_pred16x16_128_dc(pixel *src, int stride)
1174;-----------------------------------------------------------------------------
1175%macro PRED16x16_128_DC 0
1176cglobal pred16x16_128_dc_10, 2,3
1177 mova m0, [pw_512]
1178 mov r2d, 8
1179.loop:
1180 MOV16 r0+r1*0, m0, m0, m0, m0
1181 MOV16 r0+r1*1, m0, m0, m0, m0
1182 lea r0, [r0+r1*2]
1183 dec r2d
1184 jg .loop
1185 REP_RET
1186%endmacro
1187
1188INIT_MMX mmxext
1189PRED16x16_128_DC
1190INIT_XMM sse2
1191PRED16x16_128_DC