1;******************************************************************************
2;* H.264 intra prediction asm optimizations
3;* Copyright (c) 2010 Fiona Glaser
4;* Copyright (c) 2010 Holger Lubitz
5;* Copyright (c) 2010 Loren Merritt
6;* Copyright (c) 2010 Ronald S. Bultje
7;*
8;* This file is part of FFmpeg.
9;*
10;* FFmpeg is free software; you can redistribute it and/or
11;* modify it under the terms of the GNU Lesser General Public
12;* License as published by the Free Software Foundation; either
13;* version 2.1 of the License, or (at your option) any later version.
14;*
15;* FFmpeg is distributed in the hope that it will be useful,
16;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18;* Lesser General Public License for more details.
19;*
20;* You should have received a copy of the GNU Lesser General Public
21;* License along with FFmpeg; if not, write to the Free Software
22;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23;******************************************************************************
24
25%include "libavutil/x86/x86util.asm"
26
27SECTION_RODATA
28
29tm_shuf: times 8 db 0x03, 0x80
30pw_ff00: times 8 dw 0xff00
31plane_shuf: db -8, -7, -6, -5, -4, -3, -2, -1
32 db 1, 2, 3, 4, 5, 6, 7, 8
33plane8_shuf: db -4, -3, -2, -1, 0, 0, 0, 0
34 db 1, 2, 3, 4, 0, 0, 0, 0
35pw_0to7: dw 0, 1, 2, 3, 4, 5, 6, 7
36pw_1to8: dw 1, 2, 3, 4, 5, 6, 7, 8
37pw_m8tom1: dw -8, -7, -6, -5, -4, -3, -2, -1
38pw_m4to4: dw -4, -3, -2, -1, 1, 2, 3, 4
39
40SECTION .text
41
42cextern pb_1
43cextern pb_3
44cextern pw_4
45cextern pw_5
46cextern pw_8
47cextern pw_16
48cextern pw_17
49cextern pw_32
50
51;-----------------------------------------------------------------------------
52; void ff_pred16x16_vertical_8(uint8_t *src, int stride)
53;-----------------------------------------------------------------------------
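; For reference, src points at the top-left sample of the block and the row
; above it starts at src-stride; every predictor below follows this layout.
; A rough C sketch of what vertical prediction computes (illustrative only,
; not part of this file):
;
;   static void pred16x16_vertical_c(uint8_t *src, int stride)
;   {
;       const uint8_t *top = src - stride;      // row above the block
;       for (int y = 0; y < 16; y++)
;           for (int x = 0; x < 16; x++)
;               src[y * stride + x] = top[x];   // copy the top row downwards
;   }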
54
55INIT_MMX mmx
56cglobal pred16x16_vertical_8, 2,3
57 sub r0, r1
58 mov r2, 8
59 movq mm0, [r0+0]
60 movq mm1, [r0+8]
61.loop:
62 movq [r0+r1*1+0], mm0
63 movq [r0+r1*1+8], mm1
64 movq [r0+r1*2+0], mm0
65 movq [r0+r1*2+8], mm1
66 lea r0, [r0+r1*2]
67 dec r2
68 jg .loop
69 REP_RET
70
71INIT_XMM sse
72cglobal pred16x16_vertical_8, 2,3
73 sub r0, r1
74 mov r2, 4
75 movaps xmm0, [r0]
76.loop:
77 movaps [r0+r1*1], xmm0
78 movaps [r0+r1*2], xmm0
79 lea r0, [r0+r1*2]
80 movaps [r0+r1*1], xmm0
81 movaps [r0+r1*2], xmm0
82 lea r0, [r0+r1*2]
83 dec r2
84 jg .loop
85 REP_RET
86
87;-----------------------------------------------------------------------------
88; void ff_pred16x16_horizontal_8(uint8_t *src, int stride)
89;-----------------------------------------------------------------------------
90
91%macro PRED16x16_H 0
92cglobal pred16x16_horizontal_8, 2,3
93 mov r2, 8
94%if cpuflag(ssse3)
95 mova m2, [pb_3]
96%endif
97.loop:
98 movd m0, [r0+r1*0-4]
99 movd m1, [r0+r1*1-4]
100
101%if cpuflag(ssse3)
102 pshufb m0, m2
103 pshufb m1, m2
104%else
105 punpcklbw m0, m0
106 punpcklbw m1, m1
107 SPLATW m0, m0, 3
108 SPLATW m1, m1, 3
109 mova [r0+r1*0+8], m0
110 mova [r0+r1*1+8], m1
111%endif
112
113 mova [r0+r1*0], m0
114 mova [r0+r1*1], m1
115 lea r0, [r0+r1*2]
116 dec r2
117 jg .loop
118 REP_RET
119%endmacro
120
121INIT_MMX mmx
122PRED16x16_H
123INIT_MMX mmxext
124PRED16x16_H
125INIT_XMM ssse3
126PRED16x16_H
127
128;-----------------------------------------------------------------------------
129; void ff_pred16x16_dc_8(uint8_t *src, int stride)
130;-----------------------------------------------------------------------------
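; DC prediction fills the block with the average of the 16 samples above and
; the 16 samples to the left (both assumed available here). A rough C sketch
; of the value the psadbw/scalar loop below computes:
;
;   int dc = 16;                                 // rounding term
;   for (int i = 0; i < 16; i++)
;       dc += src[-stride + i] + src[i * stride - 1];
;   dc >>= 5;                                    // (sum_top + sum_left + 16) >> 5
;   /* every pixel of the 16x16 block is then set to dc */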
131
132%macro PRED16x16_DC 0
133cglobal pred16x16_dc_8, 2,7
134 mov r4, r0
135 sub r0, r1
136 pxor mm0, mm0
137 pxor mm1, mm1
138 psadbw mm0, [r0+0]
139 psadbw mm1, [r0+8]
140 dec r0
141 movzx r5d, byte [r0+r1*1]
142 paddw mm0, mm1
143 movd r6d, mm0
144 lea r0, [r0+r1*2]
145%rep 7
146 movzx r2d, byte [r0+r1*0]
147 movzx r3d, byte [r0+r1*1]
148 add r5d, r2d
149 add r6d, r3d
150 lea r0, [r0+r1*2]
151%endrep
152 movzx r2d, byte [r0+r1*0]
153 add r5d, r6d
154 lea r2d, [r2+r5+16]
155 shr r2d, 5
156%if cpuflag(ssse3)
157 pxor m1, m1
158%endif
159 SPLATB_REG m0, r2, m1
160
161%if mmsize==8
162 mov r3d, 8
163.loop:
164 mova [r4+r1*0+0], m0
165 mova [r4+r1*0+8], m0
166 mova [r4+r1*1+0], m0
167 mova [r4+r1*1+8], m0
168%else
169 mov r3d, 4
170.loop:
171 mova [r4+r1*0], m0
172 mova [r4+r1*1], m0
173 lea r4, [r4+r1*2]
174 mova [r4+r1*0], m0
175 mova [r4+r1*1], m0
176%endif
177 lea r4, [r4+r1*2]
178 dec r3d
179 jg .loop
180 REP_RET
181%endmacro
182
183INIT_MMX mmxext
184PRED16x16_DC
185INIT_XMM sse2
186PRED16x16_DC
187INIT_XMM ssse3
188PRED16x16_DC
189
190;-----------------------------------------------------------------------------
191; void ff_pred16x16_tm_vp8_8(uint8_t *src, int stride)
192;-----------------------------------------------------------------------------
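; VP8 TrueMotion prediction: every pixel is top[x] + left[y] - topleft, with
; the final packuswb providing the clamp to 0..255. A rough C equivalent of
; the loop below (illustrative only):
;
;   const uint8_t *top = src - stride;
;   int tl = top[-1];
;   for (int y = 0; y < 16; y++)
;       for (int x = 0; x < 16; x++)
;           src[y * stride + x] = av_clip_uint8(top[x] + src[y * stride - 1] - tl);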
193
194%macro PRED16x16_TM 0
195cglobal pred16x16_tm_vp8_8, 2,5
196 sub r0, r1
197 pxor mm7, mm7
198 movq mm0, [r0+0]
199 movq mm2, [r0+8]
200 movq mm1, mm0
201 movq mm3, mm2
202 punpcklbw mm0, mm7
203 punpckhbw mm1, mm7
204 punpcklbw mm2, mm7
205 punpckhbw mm3, mm7
206 movzx r3d, byte [r0-1]
207 mov r4d, 16
208.loop:
209 movzx r2d, byte [r0+r1-1]
210 sub r2d, r3d
211 movd mm4, r2d
212 SPLATW mm4, mm4, 0
213 movq mm5, mm4
214 movq mm6, mm4
215 movq mm7, mm4
216 paddw mm4, mm0
217 paddw mm5, mm1
218 paddw mm6, mm2
219 paddw mm7, mm3
220 packuswb mm4, mm5
221 packuswb mm6, mm7
222 movq [r0+r1+0], mm4
223 movq [r0+r1+8], mm6
224 add r0, r1
225 dec r4d
226 jg .loop
227 REP_RET
228%endmacro
229
230INIT_MMX mmx
231PRED16x16_TM
232INIT_MMX mmxext
233PRED16x16_TM
234
235INIT_XMM sse2
236cglobal pred16x16_tm_vp8_8, 2,6,6
237 sub r0, r1
238 pxor xmm2, xmm2
239 movdqa xmm0, [r0]
240 movdqa xmm1, xmm0
241 punpcklbw xmm0, xmm2
242 punpckhbw xmm1, xmm2
243 movzx r4d, byte [r0-1]
244 mov r5d, 8
245.loop:
246 movzx r2d, byte [r0+r1*1-1]
247 movzx r3d, byte [r0+r1*2-1]
248 sub r2d, r4d
249 sub r3d, r4d
250 movd xmm2, r2d
251 movd xmm4, r3d
252 pshuflw xmm2, xmm2, 0
253 pshuflw xmm4, xmm4, 0
254 punpcklqdq xmm2, xmm2
255 punpcklqdq xmm4, xmm4
256 movdqa xmm3, xmm2
257 movdqa xmm5, xmm4
258 paddw xmm2, xmm0
259 paddw xmm3, xmm1
260 paddw xmm4, xmm0
261 paddw xmm5, xmm1
262 packuswb xmm2, xmm3
263 packuswb xmm4, xmm5
264 movdqa [r0+r1*1], xmm2
265 movdqa [r0+r1*2], xmm4
266 lea r0, [r0+r1*2]
267 dec r5d
268 jg .loop
269 REP_RET
270
271;-----------------------------------------------------------------------------
272; void ff_pred16x16_plane_*_8(uint8_t *src, int stride)
273;-----------------------------------------------------------------------------
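; All three variants compute the plane (gradient) predictor; they differ only
; in how the H/V sums are scaled (h264: (5*x+32)>>6, rv40: (5*x)>>6, svq3
; additionally swaps H and V and rounds toward zero in two steps). A rough C
; sketch of the h264 case, i.e. the spec formula the asm below implements with
; its constants folded together (illustrative only):
;
;   const uint8_t *top = src - stride;
;   int H = 0, V = 0;
;   for (int i = 1; i <= 8; i++) {
;       H += i * (top[7 + i] - top[7 - i]);
;       V += i * (src[(7 + i) * stride - 1] - src[(7 - i) * stride - 1]);
;   }
;   int a = 16 * (top[15] + src[15 * stride - 1]);
;   int b = (5 * H + 32) >> 6;
;   int c = (5 * V + 32) >> 6;
;   for (int y = 0; y < 16; y++)
;       for (int x = 0; x < 16; x++)
;           src[y * stride + x] =
;               av_clip_uint8((a + b * (x - 7) + c * (y - 7) + 16) >> 5);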
274
275%macro H264_PRED16x16_PLANE 1
276cglobal pred16x16_plane_%1_8, 2,9,7
277 mov r2, r1 ; +stride
278 neg r1 ; -stride
279
280 movh m0, [r0+r1 -1]
281%if mmsize == 8
282 pxor m4, m4
283 movh m1, [r0+r1 +3 ]
284 movh m2, [r0+r1 +8 ]
285 movh m3, [r0+r1 +12]
286 punpcklbw m0, m4
287 punpcklbw m1, m4
288 punpcklbw m2, m4
289 punpcklbw m3, m4
290 pmullw m0, [pw_m8tom1 ]
291 pmullw m1, [pw_m8tom1+8]
292 pmullw m2, [pw_1to8 ]
293 pmullw m3, [pw_1to8 +8]
294 paddw m0, m2
295 paddw m1, m3
296%else ; mmsize == 16
297%if cpuflag(ssse3)
298 movhps m0, [r0+r1 +8]
299 pmaddubsw m0, [plane_shuf] ; H coefficients
300%else ; sse2
301 pxor m2, m2
302 movh m1, [r0+r1 +8]
303 punpcklbw m0, m2
304 punpcklbw m1, m2
305 pmullw m0, [pw_m8tom1]
306 pmullw m1, [pw_1to8]
307 paddw m0, m1
308%endif
309 movhlps m1, m0
310%endif
311 paddw m0, m1
312%if cpuflag(mmxext)
313 PSHUFLW m1, m0, 0xE
314%elif cpuflag(mmx)
315 mova m1, m0
316 psrlq m1, 32
317%endif
318 paddw m0, m1
319%if cpuflag(mmxext)
320 PSHUFLW m1, m0, 0x1
321%elif cpuflag(mmx)
322 mova m1, m0
323 psrlq m1, 16
324%endif
325 paddw m0, m1 ; sum of H coefficients
326
327 lea r4, [r0+r2*8-1]
328 lea r3, [r0+r2*4-1]
329 add r4, r2
330
331%if ARCH_X86_64
332%define e_reg r8
333%else
334%define e_reg r0
335%endif
336
337 movzx e_reg, byte [r3+r2*2 ]
338 movzx r5, byte [r4+r1 ]
339 sub r5, e_reg
340
341 movzx e_reg, byte [r3+r2 ]
342 movzx r6, byte [r4 ]
343 sub r6, e_reg
344 lea r5, [r5+r6*2]
345
346 movzx e_reg, byte [r3+r1 ]
347 movzx r6, byte [r4+r2*2 ]
348 sub r6, e_reg
349 lea r5, [r5+r6*4]
350
351 movzx e_reg, byte [r3 ]
352%if ARCH_X86_64
353 movzx r7, byte [r4+r2 ]
354 sub r7, e_reg
355%else
356 movzx r6, byte [r4+r2 ]
357 sub r6, e_reg
358 lea r5, [r5+r6*4]
359 sub r5, r6
360%endif
361
362 lea e_reg, [r3+r1*4]
363 lea r3, [r4+r2*4]
364
365 movzx r4, byte [e_reg+r2 ]
366 movzx r6, byte [r3 ]
367 sub r6, r4
368%if ARCH_X86_64
369 lea r6, [r7+r6*2]
370 lea r5, [r5+r6*2]
371 add r5, r6
372%else
373 lea r5, [r5+r6*4]
374 lea r5, [r5+r6*2]
375%endif
376
377 movzx r4, byte [e_reg ]
378%if ARCH_X86_64
379 movzx r7, byte [r3 +r2 ]
380 sub r7, r4
381 sub r5, r7
382%else
383 movzx r6, byte [r3 +r2 ]
384 sub r6, r4
385 lea r5, [r5+r6*8]
386 sub r5, r6
387%endif
388
389 movzx r4, byte [e_reg+r1 ]
390 movzx r6, byte [r3 +r2*2]
391 sub r6, r4
392%if ARCH_X86_64
393 add r6, r7
394%endif
395 lea r5, [r5+r6*8]
396
397 movzx r4, byte [e_reg+r2*2]
398 movzx r6, byte [r3 +r1 ]
399 sub r6, r4
400 lea r5, [r5+r6*4]
401 add r5, r6 ; sum of V coefficients
402
403%if ARCH_X86_64 == 0
404 mov r0, r0m
405%endif
406
407%ifidn %1, h264
408 lea r5, [r5*5+32]
409 sar r5, 6
410%elifidn %1, rv40
411 lea r5, [r5*5]
412 sar r5, 6
413%elifidn %1, svq3
414 test r5, r5
415 lea r6, [r5+3]
416 cmovs r5, r6
417 sar r5, 2 ; V/4
418 lea r5, [r5*5] ; 5*(V/4)
419 test r5, r5
420 lea r6, [r5+15]
421 cmovs r5, r6
422 sar r5, 4 ; (5*(V/4))/16
423%endif
424
425 movzx r4, byte [r0+r1 +15]
426 movzx r3, byte [r3+r2*2 ]
427 lea r3, [r3+r4+1]
428 shl r3, 4
429
430 movd r1d, m0
431 movsx r1d, r1w
432%ifnidn %1, svq3
433%ifidn %1, h264
434 lea r1d, [r1d*5+32]
435%else ; rv40
436 lea r1d, [r1d*5]
437%endif
438 sar r1d, 6
439%else ; svq3
440 test r1d, r1d
441 lea r4d, [r1d+3]
442 cmovs r1d, r4d
443 sar r1d, 2 ; H/4
444 lea r1d, [r1d*5] ; 5*(H/4)
445 test r1d, r1d
446 lea r4d, [r1d+15]
447 cmovs r1d, r4d
448 sar r1d, 4 ; (5*(H/4))/16
449%endif
450 movd m0, r1d
451
452 add r1d, r5d
453 add r3d, r1d
454 shl r1d, 3
455 sub r3d, r1d ; a
456
457 movd m1, r5d
458 movd m3, r3d
459 SPLATW m0, m0, 0 ; H
460 SPLATW m1, m1, 0 ; V
461 SPLATW m3, m3, 0 ; a
462%ifidn %1, svq3
463 SWAP 0, 1
464%endif
465 mova m2, m0
466%if mmsize == 8
467 mova m5, m0
468%endif
469 pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
470%if mmsize == 16
471 psllw m2, 3
472%else
473 psllw m5, 3
474 psllw m2, 2
475 mova m6, m5
476 paddw m6, m2
477%endif
478 paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
479 paddw m2, m0 ; a + {8,9,10,11,12,13,14,15}*H
480%if mmsize == 8
481 paddw m5, m0 ; a + {8,9,10,11}*H
482 paddw m6, m0 ; a + {12,13,14,15}*H
483%endif
484
485 mov r4, 8
486.loop:
487 mova m3, m0 ; b[0..7]
488 mova m4, m2 ; b[8..15]
489 psraw m3, 5
490 psraw m4, 5
491 packuswb m3, m4
492 mova [r0], m3
493%if mmsize == 8
494 mova m3, m5 ; b[8..11]
495 mova m4, m6 ; b[12..15]
496 psraw m3, 5
497 psraw m4, 5
498 packuswb m3, m4
499 mova [r0+8], m3
500%endif
501 paddw m0, m1
502 paddw m2, m1
503%if mmsize == 8
504 paddw m5, m1
505 paddw m6, m1
506%endif
507
508 mova m3, m0 ; b[0..7]
509 mova m4, m2 ; b[8..15]
510 psraw m3, 5
511 psraw m4, 5
512 packuswb m3, m4
513 mova [r0+r2], m3
514%if mmsize == 8
515 mova m3, m5 ; b[8..11]
516 mova m4, m6 ; b[12..15]
517 psraw m3, 5
518 psraw m4, 5
519 packuswb m3, m4
520 mova [r0+r2+8], m3
521%endif
522 paddw m0, m1
523 paddw m2, m1
524%if mmsize == 8
525 paddw m5, m1
526 paddw m6, m1
527%endif
528
529 lea r0, [r0+r2*2]
530 dec r4
531 jg .loop
532 REP_RET
533%endmacro
534
535INIT_MMX mmx
536H264_PRED16x16_PLANE h264
537H264_PRED16x16_PLANE rv40
538H264_PRED16x16_PLANE svq3
539INIT_MMX mmxext
540H264_PRED16x16_PLANE h264
541H264_PRED16x16_PLANE rv40
542H264_PRED16x16_PLANE svq3
543INIT_XMM sse2
544H264_PRED16x16_PLANE h264
545H264_PRED16x16_PLANE rv40
546H264_PRED16x16_PLANE svq3
547INIT_XMM ssse3
548H264_PRED16x16_PLANE h264
549H264_PRED16x16_PLANE rv40
550H264_PRED16x16_PLANE svq3
551
552;-----------------------------------------------------------------------------
553; void ff_pred8x8_plane_8(uint8_t *src, int stride)
554;-----------------------------------------------------------------------------
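; Same idea as the 16x16 plane predictor above, but for the 8x8 chroma block:
; H and V are built from 4 sample pairs each (weights 1..4, see pw_m4to4),
; scaled as (17*H + 16) >> 5 and (17*V + 16) >> 5, and each pixel is
; (a + b*(x-3) + c*(y-3) + 16) >> 5 clipped to 8 bits, with
; a = 16*(top[7] + left[7]).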
555
556%macro H264_PRED8x8_PLANE 0
557cglobal pred8x8_plane_8, 2,9,7
558 mov r2, r1 ; +stride
559 neg r1 ; -stride
560
561 movd m0, [r0+r1 -1]
562%if mmsize == 8
563 pxor m2, m2
564 movh m1, [r0+r1 +4 ]
565 punpcklbw m0, m2
566 punpcklbw m1, m2
567 pmullw m0, [pw_m4to4]
568 pmullw m1, [pw_m4to4+8]
569%else ; mmsize == 16
570%if cpuflag(ssse3)
571 movhps m0, [r0+r1 +4] ; this reads 4 bytes more than necessary
572 pmaddubsw m0, [plane8_shuf] ; H coefficients
573%else ; sse2
574 pxor m2, m2
575 movd m1, [r0+r1 +4]
576 punpckldq m0, m1
577 punpcklbw m0, m2
578 pmullw m0, [pw_m4to4]
579%endif
580 movhlps m1, m0
581%endif
582 paddw m0, m1
583
584%if notcpuflag(ssse3)
585%if cpuflag(mmxext)
586 PSHUFLW m1, m0, 0xE
587%elif cpuflag(mmx)
588 mova m1, m0
589 psrlq m1, 32
590%endif
591 paddw m0, m1
592%endif ; !ssse3
593
594%if cpuflag(mmxext)
595 PSHUFLW m1, m0, 0x1
596%elif cpuflag(mmx)
597 mova m1, m0
598 psrlq m1, 16
599%endif
600 paddw m0, m1 ; sum of H coefficients
601
602 lea r4, [r0+r2*4-1]
603 lea r3, [r0 -1]
604 add r4, r2
605
606%if ARCH_X86_64
607%define e_reg r8
608%else
609%define e_reg r0
610%endif
611
612 movzx e_reg, byte [r3+r2*2 ]
613 movzx r5, byte [r4+r1 ]
614 sub r5, e_reg
615
616 movzx e_reg, byte [r3 ]
617%if ARCH_X86_64
618 movzx r7, byte [r4+r2 ]
619 sub r7, e_reg
620 sub r5, r7
621%else
622 movzx r6, byte [r4+r2 ]
623 sub r6, e_reg
624 lea r5, [r5+r6*4]
625 sub r5, r6
626%endif
627
628 movzx e_reg, byte [r3+r1 ]
629 movzx r6, byte [r4+r2*2 ]
630 sub r6, e_reg
631%if ARCH_X86_64
632 add r6, r7
633%endif
634 lea r5, [r5+r6*4]
635
636 movzx e_reg, byte [r3+r2 ]
637 movzx r6, byte [r4 ]
638 sub r6, e_reg
639 lea r6, [r5+r6*2]
640
641 lea r5, [r6*9+16]
642 lea r5, [r5+r6*8]
643 sar r5, 5
644
645%if ARCH_X86_64 == 0
646 mov r0, r0m
647%endif
648
649 movzx r3, byte [r4+r2*2 ]
650 movzx r4, byte [r0+r1 +7]
651 lea r3, [r3+r4+1]
652 shl r3, 4
653 movd r1d, m0
654 movsx r1d, r1w
655 imul r1d, 17
656 add r1d, 16
657 sar r1d, 5
658 movd m0, r1d
659 add r1d, r5d
660 sub r3d, r1d
661 add r1d, r1d
662 sub r3d, r1d ; a
663
664 movd m1, r5d
665 movd m3, r3d
666 SPLATW m0, m0, 0 ; H
667 SPLATW m1, m1, 0 ; V
668 SPLATW m3, m3, 0 ; a
669%if mmsize == 8
670 mova m2, m0
671%endif
672 pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
673 paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
674%if mmsize == 8
675 psllw m2, 2
676 paddw m2, m0 ; a + {4,5,6,7}*H
677%endif
678
679 mov r4, 4
680ALIGN 16
681.loop:
682%if mmsize == 16
683 mova m3, m0 ; b[0..7]
684 paddw m0, m1
685 psraw m3, 5
686 mova m4, m0 ; V+b[0..7]
687 paddw m0, m1
688 psraw m4, 5
689 packuswb m3, m4
690 movh [r0], m3
691 movhps [r0+r2], m3
692%else ; mmsize == 8
693 mova m3, m0 ; b[0..3]
694 mova m4, m2 ; b[4..7]
695 paddw m0, m1
696 paddw m2, m1
697 psraw m3, 5
698 psraw m4, 5
699 mova m5, m0 ; V+b[0..3]
700 mova m6, m2 ; V+b[4..7]
701 paddw m0, m1
702 paddw m2, m1
703 psraw m5, 5
704 psraw m6, 5
705 packuswb m3, m4
706 packuswb m5, m6
707 mova [r0], m3
708 mova [r0+r2], m5
709%endif
710
711 lea r0, [r0+r2*2]
712 dec r4
713 jg .loop
714 REP_RET
715%endmacro
716
717INIT_MMX mmx
718H264_PRED8x8_PLANE
719INIT_MMX mmxext
720H264_PRED8x8_PLANE
721INIT_XMM sse2
722H264_PRED8x8_PLANE
723INIT_XMM ssse3
724H264_PRED8x8_PLANE
725
726;-----------------------------------------------------------------------------
727; void ff_pred8x8_vertical_8(uint8_t *src, int stride)
728;-----------------------------------------------------------------------------
729
730INIT_MMX mmx
731cglobal pred8x8_vertical_8, 2,2
732 sub r0, r1
733 movq mm0, [r0]
734%rep 3
735 movq [r0+r1*1], mm0
736 movq [r0+r1*2], mm0
737 lea r0, [r0+r1*2]
738%endrep
739 movq [r0+r1*1], mm0
740 movq [r0+r1*2], mm0
741 RET
742
743;-----------------------------------------------------------------------------
744; void ff_pred8x8_horizontal_8(uint8_t *src, int stride)
745;-----------------------------------------------------------------------------
746
747%macro PRED8x8_H 0
748cglobal pred8x8_horizontal_8, 2,3
749 mov r2, 4
750%if cpuflag(ssse3)
751 mova m2, [pb_3]
752%endif
753.loop:
754 SPLATB_LOAD m0, r0+r1*0-1, m2
755 SPLATB_LOAD m1, r0+r1*1-1, m2
756 mova [r0+r1*0], m0
757 mova [r0+r1*1], m1
758 lea r0, [r0+r1*2]
759 dec r2
760 jg .loop
761 REP_RET
762%endmacro
763
764INIT_MMX mmx
765PRED8x8_H
766INIT_MMX mmxext
767PRED8x8_H
768INIT_MMX ssse3
769PRED8x8_H
770
771;-----------------------------------------------------------------------------
772; void ff_pred8x8_top_dc_8_mmxext(uint8_t *src, int stride)
773;-----------------------------------------------------------------------------
774INIT_MMX mmxext
775cglobal pred8x8_top_dc_8, 2,5
776 sub r0, r1
777 movq mm0, [r0]
778 pxor mm1, mm1
779 pxor mm2, mm2
780 lea r2, [r0+r1*2]
781 punpckhbw mm1, mm0
782 punpcklbw mm0, mm2
783 psadbw mm1, mm2 ; s1
784 lea r3, [r2+r1*2]
785 psadbw mm0, mm2 ; s0
786 psrlw mm1, 1
787 psrlw mm0, 1
788 pavgw mm1, mm2
789 lea r4, [r3+r1*2]
790 pavgw mm0, mm2
791 pshufw mm1, mm1, 0
792 pshufw mm0, mm0, 0 ; dc0 (w)
793 packuswb mm0, mm1 ; dc0,dc1 (b)
794 movq [r0+r1*1], mm0
795 movq [r0+r1*2], mm0
796 lea r0, [r3+r1*2]
797 movq [r2+r1*1], mm0
798 movq [r2+r1*2], mm0
799 movq [r3+r1*1], mm0
800 movq [r3+r1*2], mm0
801 movq [r0+r1*1], mm0
802 movq [r0+r1*2], mm0
803 RET
804
805;-----------------------------------------------------------------------------
806; void ff_pred8x8_dc_8_mmxext(uint8_t *src, int stride)
807;-----------------------------------------------------------------------------
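; H.264 chroma DC works per 4x4 quadrant: with s0/s1 the sums of the left and
; right halves of the top row, and s2/s3 the sums of the upper and lower
; halves of the left column, the four quadrants are filled with
;
;   dc(top-left)     = (s0 + s2 + 4) >> 3
;   dc(top-right)    = (s1 + 2) >> 2
;   dc(bottom-left)  = (s3 + 2) >> 2
;   dc(bottom-right) = (s1 + s3 + 4) >> 3
;
; which is what the pshufw/paddw/psrlw/pavgw sequence below assembles.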
808
809INIT_MMX mmxext
810cglobal pred8x8_dc_8, 2,5
811 sub r0, r1
812 pxor m7, m7
813 movd m0, [r0+0]
814 movd m1, [r0+4]
815 psadbw m0, m7 ; s0
816 mov r4, r0
817 psadbw m1, m7 ; s1
818
819 movzx r2d, byte [r0+r1*1-1]
820 movzx r3d, byte [r0+r1*2-1]
821 lea r0, [r0+r1*2]
822 add r2d, r3d
823 movzx r3d, byte [r0+r1*1-1]
824 add r2d, r3d
825 movzx r3d, byte [r0+r1*2-1]
826 add r2d, r3d
827 lea r0, [r0+r1*2]
828 movd m2, r2d ; s2
829 movzx r2d, byte [r0+r1*1-1]
830 movzx r3d, byte [r0+r1*2-1]
831 lea r0, [r0+r1*2]
832 add r2d, r3d
833 movzx r3d, byte [r0+r1*1-1]
834 add r2d, r3d
835 movzx r3d, byte [r0+r1*2-1]
836 add r2d, r3d
837 movd m3, r2d ; s3
838
839 punpcklwd m0, m1
840 mov r0, r4
841 punpcklwd m2, m3
842 punpckldq m0, m2 ; s0, s1, s2, s3
843 pshufw m3, m0, 11110110b ; s2, s1, s3, s3
844 lea r2, [r0+r1*2]
845 pshufw m0, m0, 01110100b ; s0, s1, s3, s1
846 paddw m0, m3
847 lea r3, [r2+r1*2]
848 psrlw m0, 2
849 pavgw m0, m7 ; s0+s2, s1, s3, s1+s3
850 lea r4, [r3+r1*2]
851 packuswb m0, m0
852 punpcklbw m0, m0
853 movq m1, m0
854 punpcklbw m0, m0
855 punpckhbw m1, m1
856 movq [r0+r1*1], m0
857 movq [r0+r1*2], m0
858 movq [r2+r1*1], m0
859 movq [r2+r1*2], m0
860 movq [r3+r1*1], m1
861 movq [r3+r1*2], m1
862 movq [r4+r1*1], m1
863 movq [r4+r1*2], m1
864 RET
865
866;-----------------------------------------------------------------------------
867; void ff_pred8x8_dc_rv40_8(uint8_t *src, int stride)
868;-----------------------------------------------------------------------------
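; RV40 uses a single DC for the whole 8x8 block instead of the per-quadrant
; values above: dc = (sum of 8 top + sum of 8 left neighbours + 8) >> 4.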
869
870INIT_MMX mmxext
871cglobal pred8x8_dc_rv40_8, 2,7
872 mov r4, r0
873 sub r0, r1
874 pxor mm0, mm0
875 psadbw mm0, [r0]
876 dec r0
877 movzx r5d, byte [r0+r1*1]
878 movd r6d, mm0
879 lea r0, [r0+r1*2]
880%rep 3
881 movzx r2d, byte [r0+r1*0]
882 movzx r3d, byte [r0+r1*1]
883 add r5d, r2d
884 add r6d, r3d
885 lea r0, [r0+r1*2]
886%endrep
887 movzx r2d, byte [r0+r1*0]
888 add r5d, r6d
889 lea r2d, [r2+r5+8]
890 shr r2d, 4
891 movd mm0, r2d
892 punpcklbw mm0, mm0
893 pshufw mm0, mm0, 0
894 mov r3d, 4
895.loop:
896 movq [r4+r1*0], mm0
897 movq [r4+r1*1], mm0
898 lea r4, [r4+r1*2]
899 dec r3d
900 jg .loop
901 REP_RET
902
903;-----------------------------------------------------------------------------
904; void ff_pred8x8_tm_vp8_8(uint8_t *src, int stride)
905;-----------------------------------------------------------------------------
906
907%macro PRED8x8_TM 0
908cglobal pred8x8_tm_vp8_8, 2,6
909 sub r0, r1
910 pxor mm7, mm7
911 movq mm0, [r0]
912 movq mm1, mm0
913 punpcklbw mm0, mm7
914 punpckhbw mm1, mm7
915 movzx r4d, byte [r0-1]
916 mov r5d, 4
917.loop:
918 movzx r2d, byte [r0+r1*1-1]
919 movzx r3d, byte [r0+r1*2-1]
920 sub r2d, r4d
921 sub r3d, r4d
922 movd mm2, r2d
923 movd mm4, r3d
924 SPLATW mm2, mm2, 0
925 SPLATW mm4, mm4, 0
926 movq mm3, mm2
927 movq mm5, mm4
928 paddw mm2, mm0
929 paddw mm3, mm1
930 paddw mm4, mm0
931 paddw mm5, mm1
932 packuswb mm2, mm3
933 packuswb mm4, mm5
934 movq [r0+r1*1], mm2
935 movq [r0+r1*2], mm4
936 lea r0, [r0+r1*2]
937 dec r5d
938 jg .loop
939 REP_RET
940%endmacro
941
942INIT_MMX mmx
943PRED8x8_TM
944INIT_MMX mmxext
945PRED8x8_TM
946
947INIT_XMM sse2
948cglobal pred8x8_tm_vp8_8, 2,6,4
949 sub r0, r1
950 pxor xmm1, xmm1
951 movq xmm0, [r0]
952 punpcklbw xmm0, xmm1
953 movzx r4d, byte [r0-1]
954 mov r5d, 4
955.loop:
956 movzx r2d, byte [r0+r1*1-1]
957 movzx r3d, byte [r0+r1*2-1]
958 sub r2d, r4d
959 sub r3d, r4d
960 movd xmm2, r2d
961 movd xmm3, r3d
962 pshuflw xmm2, xmm2, 0
963 pshuflw xmm3, xmm3, 0
964 punpcklqdq xmm2, xmm2
965 punpcklqdq xmm3, xmm3
966 paddw xmm2, xmm0
967 paddw xmm3, xmm0
968 packuswb xmm2, xmm3
969 movq [r0+r1*1], xmm2
970 movhps [r0+r1*2], xmm2
971 lea r0, [r0+r1*2]
972 dec r5d
973 jg .loop
974 REP_RET
975
976INIT_XMM ssse3
977cglobal pred8x8_tm_vp8_8, 2,3,6
978 sub r0, r1
979 movdqa xmm4, [tm_shuf]
980 pxor xmm1, xmm1
981 movq xmm0, [r0]
982 punpcklbw xmm0, xmm1
983 movd xmm5, [r0-4]
984 pshufb xmm5, xmm4
985 mov r2d, 4
986.loop:
987 movd xmm2, [r0+r1*1-4]
988 movd xmm3, [r0+r1*2-4]
989 pshufb xmm2, xmm4
990 pshufb xmm3, xmm4
991 psubw xmm2, xmm5
992 psubw xmm3, xmm5
993 paddw xmm2, xmm0
994 paddw xmm3, xmm0
995 packuswb xmm2, xmm3
996 movq [r0+r1*1], xmm2
997 movhps [r0+r1*2], xmm2
998 lea r0, [r0+r1*2]
999 dec r2d
1000 jg .loop
1001 REP_RET
1002
1003; dest, left, right, src, tmp
1004; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
1005%macro PRED4x4_LOWPASS 5
1006 mova %5, %2
1007 pavgb %2, %3
1008 pxor %3, %5
1009 mova %1, %4
1010 pand %3, [pb_1]
1011 psubusb %2, %3
1012 pavgb %1, %2
1013%endmacro
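; The sequence above stays in the byte domain by combining two pavgb tricks
; (writing a = %2, b = %4, c = %3, so the output is (a + 2*b + c + 2) >> 2):
;
;   pavgb(a, c)                   = (a + c + 1) >> 1   ; rounds up
;   pavgb(a, c) - ((a ^ c) & 1)   = (a + c) >> 1       ; rounding bias removed
;   pavgb(b, (a + c) >> 1)        = (a + 2*b + c + 2) >> 2
;
; so the 3-tap filter never needs 16-bit intermediates.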
1014
1015;-----------------------------------------------------------------------------
1016; void ff_pred8x8l_top_dc_8(uint8_t *src, int has_topleft, int has_topright,
1017; int stride)
1018;-----------------------------------------------------------------------------
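; The pred8x8l_* (8x8 luma) predictors differ from the plain 8x8 ones in that
; the neighbouring samples are first smoothed with PRED4x4_LOWPASS;
; has_topleft/has_topright say whether the corner samples exist, and the
; .fix_* paths below substitute a copy of the nearest available top sample
; when they do not. For top_dc the result is then simply, with ft[0..7] the
; filtered top row (rough sketch):
;
;   int dc = 4;
;   for (int i = 0; i < 8; i++)
;       dc += ft[i];
;   dc >>= 3;        // the whole 8x8 block is filled with dc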
1019%macro PRED8x8L_TOP_DC 0
1020cglobal pred8x8l_top_dc_8, 4,4
1021 sub r0, r3
1022 pxor mm7, mm7
1023 movq mm0, [r0-8]
1024 movq mm3, [r0]
1025 movq mm1, [r0+8]
1026 movq mm2, mm3
1027 movq mm4, mm3
1028 PALIGNR mm2, mm0, 7, mm0
1029 PALIGNR mm1, mm4, 1, mm4
1030 test r1, r1 ; top_left
1031 jz .fix_lt_2
1032 test r2, r2 ; top_right
1033 jz .fix_tr_1
1034 jmp .body
1035.fix_lt_2:
1036 movq mm5, mm3
1037 pxor mm5, mm2
1038 psllq mm5, 56
1039 psrlq mm5, 56
1040 pxor mm2, mm5
1041 test r2, r2 ; top_right
1042 jnz .body
1043.fix_tr_1:
1044 movq mm5, mm3
1045 pxor mm5, mm1
1046 psrlq mm5, 56
1047 psllq mm5, 56
1048 pxor mm1, mm5
1049.body:
1050 PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
1051 psadbw mm7, mm0
1052 paddw mm7, [pw_4]
1053 psrlw mm7, 3
1054 pshufw mm7, mm7, 0
1055 packuswb mm7, mm7
1056%rep 3
1057 movq [r0+r3*1], mm7
1058 movq [r0+r3*2], mm7
1059 lea r0, [r0+r3*2]
1060%endrep
1061 movq [r0+r3*1], mm7
1062 movq [r0+r3*2], mm7
1063 RET
1064%endmacro
1065
1066INIT_MMX mmxext
1067PRED8x8L_TOP_DC
1068INIT_MMX ssse3
1069PRED8x8L_TOP_DC
1070
1071;-----------------------------------------------------------------------------
1072; void ff_pred8x8l_dc_8(uint8_t *src, int has_topleft, int has_topright,
1073; int stride)
1074;-----------------------------------------------------------------------------
1075
1076%macro PRED8x8L_DC 0
1077cglobal pred8x8l_dc_8, 4,5
1078 sub r0, r3
1079 lea r4, [r0+r3*2]
1080 movq mm0, [r0+r3*1-8]
1081 punpckhbw mm0, [r0+r3*0-8]
1082 movq mm1, [r4+r3*1-8]
1083 punpckhbw mm1, [r0+r3*2-8]
1084 mov r4, r0
1085 punpckhwd mm1, mm0
1086 lea r0, [r0+r3*4]
1087 movq mm2, [r0+r3*1-8]
1088 punpckhbw mm2, [r0+r3*0-8]
1089 lea r0, [r0+r3*2]
1090 movq mm3, [r0+r3*1-8]
1091 punpckhbw mm3, [r0+r3*0-8]
1092 punpckhwd mm3, mm2
1093 punpckhdq mm3, mm1
1094 lea r0, [r0+r3*2]
1095 movq mm0, [r0+r3*0-8]
1096 movq mm1, [r4]
1097 mov r0, r4
1098 movq mm4, mm3
1099 movq mm2, mm3
1100 PALIGNR mm4, mm0, 7, mm0
1101 PALIGNR mm1, mm2, 1, mm2
1102 test r1, r1
1103 jnz .do_left
1104.fix_lt_1:
1105 movq mm5, mm3
1106 pxor mm5, mm4
1107 psrlq mm5, 56
1108 psllq mm5, 48
1109 pxor mm1, mm5
1110 jmp .do_left
1111.fix_lt_2:
1112 movq mm5, mm3
1113 pxor mm5, mm2
1114 psllq mm5, 56
1115 psrlq mm5, 56
1116 pxor mm2, mm5
1117 test r2, r2
1118 jnz .body
1119.fix_tr_1:
1120 movq mm5, mm3
1121 pxor mm5, mm1
1122 psrlq mm5, 56
1123 psllq mm5, 56
1124 pxor mm1, mm5
1125 jmp .body
1126.do_left:
1127 movq mm0, mm4
1128 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1129 movq mm4, mm0
1130 movq mm7, mm2
1131 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1132 psllq mm1, 56
1133 PALIGNR mm7, mm1, 7, mm3
1134 movq mm0, [r0-8]
1135 movq mm3, [r0]
1136 movq mm1, [r0+8]
1137 movq mm2, mm3
1138 movq mm4, mm3
1139 PALIGNR mm2, mm0, 7, mm0
1140 PALIGNR mm1, mm4, 1, mm4
1141 test r1, r1
1142 jz .fix_lt_2
1143 test r2, r2
1144 jz .fix_tr_1
1145.body:
1146 lea r1, [r0+r3*2]
1147 PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
1148 pxor mm0, mm0
1149 pxor mm1, mm1
1150 lea r2, [r1+r3*2]
1151 psadbw mm0, mm7
1152 psadbw mm1, mm6
1153 paddw mm0, [pw_8]
1154 paddw mm0, mm1
1155 lea r4, [r2+r3*2]
1156 psrlw mm0, 4
1157 pshufw mm0, mm0, 0
1158 packuswb mm0, mm0
1159 movq [r0+r3*1], mm0
1160 movq [r0+r3*2], mm0
1161 movq [r1+r3*1], mm0
1162 movq [r1+r3*2], mm0
1163 movq [r2+r3*1], mm0
1164 movq [r2+r3*2], mm0
1165 movq [r4+r3*1], mm0
1166 movq [r4+r3*2], mm0
1167 RET
1168%endmacro
1169
1170INIT_MMX mmxext
1171PRED8x8L_DC
1172INIT_MMX ssse3
1173PRED8x8L_DC
1174
1175;-----------------------------------------------------------------------------
1176; void ff_pred8x8l_horizontal_8(uint8_t *src, int has_topleft,
1177; int has_topright, int stride)
1178;-----------------------------------------------------------------------------
1179
1180%macro PRED8x8L_HORIZONTAL 0
1181cglobal pred8x8l_horizontal_8, 4,4
1182 sub r0, r3
1183 lea r2, [r0+r3*2]
1184 movq mm0, [r0+r3*1-8]
1185 test r1, r1
1186 lea r1, [r0+r3]
1187 cmovnz r1, r0
1188 punpckhbw mm0, [r1+r3*0-8]
1189 movq mm1, [r2+r3*1-8]
1190 punpckhbw mm1, [r0+r3*2-8]
1191 mov r2, r0
1192 punpckhwd mm1, mm0
1193 lea r0, [r0+r3*4]
1194 movq mm2, [r0+r3*1-8]
1195 punpckhbw mm2, [r0+r3*0-8]
1196 lea r0, [r0+r3*2]
1197 movq mm3, [r0+r3*1-8]
1198 punpckhbw mm3, [r0+r3*0-8]
1199 punpckhwd mm3, mm2
1200 punpckhdq mm3, mm1
1201 lea r0, [r0+r3*2]
1202 movq mm0, [r0+r3*0-8]
1203 movq mm1, [r1+r3*0-8]
1204 mov r0, r2
1205 movq mm4, mm3
1206 movq mm2, mm3
1207 PALIGNR mm4, mm0, 7, mm0
1208 PALIGNR mm1, mm2, 1, mm2
1209 movq mm0, mm4
1210 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1211 movq mm4, mm0
1212 movq mm7, mm2
1213 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1214 psllq mm1, 56
1215 PALIGNR mm7, mm1, 7, mm3
1216 movq mm3, mm7
1217 lea r1, [r0+r3*2]
1218 movq mm7, mm3
1219 punpckhbw mm3, mm3
1220 punpcklbw mm7, mm7
1221 pshufw mm0, mm3, 0xff
1222 pshufw mm1, mm3, 0xaa
1223 lea r2, [r1+r3*2]
1224 pshufw mm2, mm3, 0x55
1225 pshufw mm3, mm3, 0x00
1226 pshufw mm4, mm7, 0xff
1227 pshufw mm5, mm7, 0xaa
1228 pshufw mm6, mm7, 0x55
1229 pshufw mm7, mm7, 0x00
1230 movq [r0+r3*1], mm0
1231 movq [r0+r3*2], mm1
1232 movq [r1+r3*1], mm2
1233 movq [r1+r3*2], mm3
1234 movq [r2+r3*1], mm4
1235 movq [r2+r3*2], mm5
1236 lea r0, [r2+r3*2]
1237 movq [r0+r3*1], mm6
1238 movq [r0+r3*2], mm7
1239 RET
1240%endmacro
1241
1242INIT_MMX mmxext
1243PRED8x8L_HORIZONTAL
1244INIT_MMX ssse3
1245PRED8x8L_HORIZONTAL
1246
1247;-----------------------------------------------------------------------------
1248; void ff_pred8x8l_vertical_8(uint8_t *src, int has_topleft, int has_topright,
1249; int stride)
1250;-----------------------------------------------------------------------------
1251
1252%macro PRED8x8L_VERTICAL 0
1253cglobal pred8x8l_vertical_8, 4,4
1254 sub r0, r3
1255 movq mm0, [r0-8]
1256 movq mm3, [r0]
1257 movq mm1, [r0+8]
1258 movq mm2, mm3
1259 movq mm4, mm3
1260 PALIGNR mm2, mm0, 7, mm0
1261 PALIGNR mm1, mm4, 1, mm4
1262 test r1, r1 ; top_left
1263 jz .fix_lt_2
1264 test r2, r2 ; top_right
1265 jz .fix_tr_1
1266 jmp .body
1267.fix_lt_2:
1268 movq mm5, mm3
1269 pxor mm5, mm2
1270 psllq mm5, 56
1271 psrlq mm5, 56
1272 pxor mm2, mm5
1273 test r2, r2 ; top_right
1274 jnz .body
1275.fix_tr_1:
1276 movq mm5, mm3
1277 pxor mm5, mm1
1278 psrlq mm5, 56
1279 psllq mm5, 56
1280 pxor mm1, mm5
1281.body:
1282 PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
1283%rep 3
1284 movq [r0+r3*1], mm0
1285 movq [r0+r3*2], mm0
1286 lea r0, [r0+r3*2]
1287%endrep
1288 movq [r0+r3*1], mm0
1289 movq [r0+r3*2], mm0
1290 RET
1291%endmacro
1292
1293INIT_MMX mmxext
1294PRED8x8L_VERTICAL
1295INIT_MMX ssse3
1296PRED8x8L_VERTICAL
1297
1298;-----------------------------------------------------------------------------
1299; void ff_pred8x8l_down_left_8(uint8_t *src, int has_topleft,
1300; int has_topright, int stride)
1301;-----------------------------------------------------------------------------
1302
1303INIT_MMX mmxext
1304cglobal pred8x8l_down_left_8, 4,5
1305 sub r0, r3
1306 movq mm0, [r0-8]
1307 movq mm3, [r0]
1308 movq mm1, [r0+8]
1309 movq mm2, mm3
1310 movq mm4, mm3
1311 PALIGNR mm2, mm0, 7, mm0
1312 PALIGNR mm1, mm4, 1, mm4
1313 test r1, r1
1314 jz .fix_lt_2
1315 test r2, r2
1316 jz .fix_tr_1
1317 jmp .do_top
1318.fix_lt_2:
1319 movq mm5, mm3
1320 pxor mm5, mm2
1321 psllq mm5, 56
1322 psrlq mm5, 56
1323 pxor mm2, mm5
1324 test r2, r2
1325 jnz .do_top
1326.fix_tr_1:
1327 movq mm5, mm3
1328 pxor mm5, mm1
1329 psrlq mm5, 56
1330 psllq mm5, 56
1331 pxor mm1, mm5
1332 jmp .do_top
1333.fix_tr_2:
1334 punpckhbw mm3, mm3
1335 pshufw mm1, mm3, 0xFF
1336 jmp .do_topright
1337.do_top:
1338 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1339 movq mm7, mm4
1340 test r2, r2
1341 jz .fix_tr_2
1342 movq mm0, [r0+8]
1343 movq mm5, mm0
1344 movq mm2, mm0
1345 movq mm4, mm0
1346 psrlq mm5, 56
1347 PALIGNR mm2, mm3, 7, mm3
1348 PALIGNR mm5, mm4, 1, mm4
1349 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
1350.do_topright:
1351 lea r1, [r0+r3*2]
1352 movq mm6, mm1
1353 psrlq mm1, 56
1354 movq mm4, mm1
1355 lea r2, [r1+r3*2]
1356 movq mm2, mm6
1357 PALIGNR mm2, mm7, 1, mm0
1358 movq mm3, mm6
1359 PALIGNR mm3, mm7, 7, mm0
1360 PALIGNR mm4, mm6, 1, mm0
1361 movq mm5, mm7
1362 movq mm1, mm7
1363 movq mm7, mm6
1364 lea r4, [r2+r3*2]
1365 psllq mm1, 8
1366 PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
1367 PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
1368 movq [r4+r3*2], mm1
1369 movq mm2, mm0
1370 psllq mm1, 8
1371 psrlq mm2, 56
1372 psllq mm0, 8
1373 por mm1, mm2
1374 movq [r4+r3*1], mm1
1375 movq mm2, mm0
1376 psllq mm1, 8
1377 psrlq mm2, 56
1378 psllq mm0, 8
1379 por mm1, mm2
1380 movq [r2+r3*2], mm1
1381 movq mm2, mm0
1382 psllq mm1, 8
1383 psrlq mm2, 56
1384 psllq mm0, 8
1385 por mm1, mm2
1386 movq [r2+r3*1], mm1
1387 movq mm2, mm0
1388 psllq mm1, 8
1389 psrlq mm2, 56
1390 psllq mm0, 8
1391 por mm1, mm2
1392 movq [r1+r3*2], mm1
1393 movq mm2, mm0
1394 psllq mm1, 8
1395 psrlq mm2, 56
1396 psllq mm0, 8
1397 por mm1, mm2
1398 movq [r1+r3*1], mm1
1399 movq mm2, mm0
1400 psllq mm1, 8
1401 psrlq mm2, 56
1402 psllq mm0, 8
1403 por mm1, mm2
1404 movq [r0+r3*2], mm1
1405 psllq mm1, 8
1406 psrlq mm0, 56
1407 por mm1, mm0
1408 movq [r0+r3*1], mm1
1409 RET
1410
1411%macro PRED8x8L_DOWN_LEFT 0
1412cglobal pred8x8l_down_left_8, 4,4
1413 sub r0, r3
1414 movq mm0, [r0-8]
1415 movq mm3, [r0]
1416 movq mm1, [r0+8]
1417 movq mm2, mm3
1418 movq mm4, mm3
1419 PALIGNR mm2, mm0, 7, mm0
1420 PALIGNR mm1, mm4, 1, mm4
1421 test r1, r1 ; top_left
1422 jz .fix_lt_2
1423 test r2, r2 ; top_right
1424 jz .fix_tr_1
1425 jmp .do_top
1426.fix_lt_2:
1427 movq mm5, mm3
1428 pxor mm5, mm2
1429 psllq mm5, 56
1430 psrlq mm5, 56
1431 pxor mm2, mm5
1432 test r2, r2 ; top_right
1433 jnz .do_top
1434.fix_tr_1:
1435 movq mm5, mm3
1436 pxor mm5, mm1
1437 psrlq mm5, 56
1438 psllq mm5, 56
1439 pxor mm1, mm5
1440 jmp .do_top
1441.fix_tr_2:
1442 punpckhbw mm3, mm3
1443 pshufw mm1, mm3, 0xFF
1444 jmp .do_topright
1445.do_top:
1446 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1447 movq2dq xmm3, mm4
1448 test r2, r2 ; top_right
1449 jz .fix_tr_2
1450 movq mm0, [r0+8]
1451 movq mm5, mm0
1452 movq mm2, mm0
1453 movq mm4, mm0
1454 psrlq mm5, 56
1455 PALIGNR mm2, mm3, 7, mm3
1456 PALIGNR mm5, mm4, 1, mm4
1457 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
1458.do_topright:
1459 movq2dq xmm4, mm1
1460 psrlq mm1, 56
1461 movq2dq xmm5, mm1
1462 lea r1, [r0+r3*2]
1463 pslldq xmm4, 8
1464 por xmm3, xmm4
1465 movdqa xmm2, xmm3
1466 psrldq xmm2, 1
1467 pslldq xmm5, 15
1468 por xmm2, xmm5
1469 lea r2, [r1+r3*2]
1470 movdqa xmm1, xmm3
1471 pslldq xmm1, 1
1472INIT_XMM cpuname
1473 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
1474 psrldq xmm0, 1
1475 movq [r0+r3*1], xmm0
1476 psrldq xmm0, 1
1477 movq [r0+r3*2], xmm0
1478 psrldq xmm0, 1
1479 lea r0, [r2+r3*2]
1480 movq [r1+r3*1], xmm0
1481 psrldq xmm0, 1
1482 movq [r1+r3*2], xmm0
1483 psrldq xmm0, 1
1484 movq [r2+r3*1], xmm0
1485 psrldq xmm0, 1
1486 movq [r2+r3*2], xmm0
1487 psrldq xmm0, 1
1488 movq [r0+r3*1], xmm0
1489 psrldq xmm0, 1
1490 movq [r0+r3*2], xmm0
1491 RET
1492%endmacro
1493
1494INIT_MMX sse2
1495PRED8x8L_DOWN_LEFT
1496INIT_MMX ssse3
1497PRED8x8L_DOWN_LEFT
1498
1499;-----------------------------------------------------------------------------
1500; void ff_pred8x8l_down_right_8_mmxext(uint8_t *src, int has_topleft,
1501; int has_topright, int stride)
1502;-----------------------------------------------------------------------------
1503
1504INIT_MMX mmxext
1505cglobal pred8x8l_down_right_8, 4,5
1506 sub r0, r3
1507 lea r4, [r0+r3*2]
1508 movq mm0, [r0+r3*1-8]
1509 punpckhbw mm0, [r0+r3*0-8]
1510 movq mm1, [r4+r3*1-8]
1511 punpckhbw mm1, [r0+r3*2-8]
1512 mov r4, r0
1513 punpckhwd mm1, mm0
1514 lea r0, [r0+r3*4]
1515 movq mm2, [r0+r3*1-8]
1516 punpckhbw mm2, [r0+r3*0-8]
1517 lea r0, [r0+r3*2]
1518 movq mm3, [r0+r3*1-8]
1519 punpckhbw mm3, [r0+r3*0-8]
1520 punpckhwd mm3, mm2
1521 punpckhdq mm3, mm1
1522 lea r0, [r0+r3*2]
1523 movq mm0, [r0+r3*0-8]
1524 movq mm1, [r4]
1525 mov r0, r4
1526 movq mm4, mm3
1527 movq mm2, mm3
1528 PALIGNR mm4, mm0, 7, mm0
1529 PALIGNR mm1, mm2, 1, mm2
1530 test r1, r1 ; top_left
1531 jz .fix_lt_1
1532.do_left:
1533 movq mm0, mm4
1534 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1535 movq mm4, mm0
1536 movq mm7, mm2
1537 movq mm6, mm2
1538 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1539 psllq mm1, 56
1540 PALIGNR mm7, mm1, 7, mm3
1541 movq mm0, [r0-8]
1542 movq mm3, [r0]
1543 movq mm1, [r0+8]
1544 movq mm2, mm3
1545 movq mm4, mm3
1546 PALIGNR mm2, mm0, 7, mm0
1547 PALIGNR mm1, mm4, 1, mm4
1548 test r1, r1 ; top_left
1549 jz .fix_lt_2
1550 test r2, r2 ; top_right
1551 jz .fix_tr_1
1552.do_top:
1553 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1554 movq mm5, mm4
1555 jmp .body
1556.fix_lt_1:
1557 movq mm5, mm3
1558 pxor mm5, mm4
1559 psrlq mm5, 56
1560 psllq mm5, 48
1561 pxor mm1, mm5
1562 jmp .do_left
1563.fix_lt_2:
1564 movq mm5, mm3
1565 pxor mm5, mm2
1566 psllq mm5, 56
1567 psrlq mm5, 56
1568 pxor mm2, mm5
1569 test r2, r2 ; top_right
1570 jnz .do_top
1571.fix_tr_1:
1572 movq mm5, mm3
1573 pxor mm5, mm1
1574 psrlq mm5, 56
1575 psllq mm5, 56
1576 pxor mm1, mm5
1577 jmp .do_top
1578.body:
1579 lea r1, [r0+r3*2]
1580 movq mm1, mm7
1581 movq mm7, mm5
1582 movq mm5, mm6
1583 movq mm2, mm7
1584 lea r2, [r1+r3*2]
1585 PALIGNR mm2, mm6, 1, mm0
1586 movq mm3, mm7
1587 PALIGNR mm3, mm6, 7, mm0
1588 movq mm4, mm7
1589 lea r4, [r2+r3*2]
1590 psrlq mm4, 8
1591 PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
1592 PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
1593 movq [r4+r3*2], mm0
1594 movq mm2, mm1
1595 psrlq mm0, 8
1596 psllq mm2, 56
1597 psrlq mm1, 8
1598 por mm0, mm2
1599 movq [r4+r3*1], mm0
1600 movq mm2, mm1
1601 psrlq mm0, 8
1602 psllq mm2, 56
1603 psrlq mm1, 8
1604 por mm0, mm2
1605 movq [r2+r3*2], mm0
1606 movq mm2, mm1
1607 psrlq mm0, 8
1608 psllq mm2, 56
1609 psrlq mm1, 8
1610 por mm0, mm2
1611 movq [r2+r3*1], mm0
1612 movq mm2, mm1
1613 psrlq mm0, 8
1614 psllq mm2, 56
1615 psrlq mm1, 8
1616 por mm0, mm2
1617 movq [r1+r3*2], mm0
1618 movq mm2, mm1
1619 psrlq mm0, 8
1620 psllq mm2, 56
1621 psrlq mm1, 8
1622 por mm0, mm2
1623 movq [r1+r3*1], mm0
1624 movq mm2, mm1
1625 psrlq mm0, 8
1626 psllq mm2, 56
1627 psrlq mm1, 8
1628 por mm0, mm2
1629 movq [r0+r3*2], mm0
1630 psrlq mm0, 8
1631 psllq mm1, 56
1632 por mm0, mm1
1633 movq [r0+r3*1], mm0
1634 RET
1635
1636%macro PRED8x8L_DOWN_RIGHT 0
1637cglobal pred8x8l_down_right_8, 4,5
1638 sub r0, r3
1639 lea r4, [r0+r3*2]
1640 movq mm0, [r0+r3*1-8]
1641 punpckhbw mm0, [r0+r3*0-8]
1642 movq mm1, [r4+r3*1-8]
1643 punpckhbw mm1, [r0+r3*2-8]
1644 mov r4, r0
1645 punpckhwd mm1, mm0
1646 lea r0, [r0+r3*4]
1647 movq mm2, [r0+r3*1-8]
1648 punpckhbw mm2, [r0+r3*0-8]
1649 lea r0, [r0+r3*2]
1650 movq mm3, [r0+r3*1-8]
1651 punpckhbw mm3, [r0+r3*0-8]
1652 punpckhwd mm3, mm2
1653 punpckhdq mm3, mm1
1654 lea r0, [r0+r3*2]
1655 movq mm0, [r0+r3*0-8]
1656 movq mm1, [r4]
1657 mov r0, r4
1658 movq mm4, mm3
1659 movq mm2, mm3
1660 PALIGNR mm4, mm0, 7, mm0
1661 PALIGNR mm1, mm2, 1, mm2
1662 test r1, r1
1663 jz .fix_lt_1
1664 jmp .do_left
1665.fix_lt_1:
1666 movq mm5, mm3
1667 pxor mm5, mm4
1668 psrlq mm5, 56
1669 psllq mm5, 48
1670 pxor mm1, mm5
1671 jmp .do_left
1672.fix_lt_2:
1673 movq mm5, mm3
1674 pxor mm5, mm2
1675 psllq mm5, 56
1676 psrlq mm5, 56
1677 pxor mm2, mm5
1678 test r2, r2
1679 jnz .do_top
1680.fix_tr_1:
1681 movq mm5, mm3
1682 pxor mm5, mm1
1683 psrlq mm5, 56
1684 psllq mm5, 56
1685 pxor mm1, mm5
1686 jmp .do_top
1687.do_left:
1688 movq mm0, mm4
1689 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1690 movq mm4, mm0
1691 movq mm7, mm2
1692 movq2dq xmm3, mm2
1693 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1694 psllq mm1, 56
1695 PALIGNR mm7, mm1, 7, mm3
1696 movq2dq xmm1, mm7
1697 movq mm0, [r0-8]
1698 movq mm3, [r0]
1699 movq mm1, [r0+8]
1700 movq mm2, mm3
1701 movq mm4, mm3
1702 PALIGNR mm2, mm0, 7, mm0
1703 PALIGNR mm1, mm4, 1, mm4
1704 test r1, r1
1705 jz .fix_lt_2
1706 test r2, r2
1707 jz .fix_tr_1
1708.do_top:
1709 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1710 movq2dq xmm4, mm4
1711 lea r1, [r0+r3*2]
1712 movdqa xmm0, xmm3
1713 pslldq xmm4, 8
1714 por xmm3, xmm4
1715 lea r2, [r1+r3*2]
1716 pslldq xmm4, 1
1717 por xmm1, xmm4
1718 psrldq xmm0, 7
1719 pslldq xmm0, 15
1720 psrldq xmm0, 7
1721 por xmm1, xmm0
1722 lea r0, [r2+r3*2]
1723 movdqa xmm2, xmm3
1724 psrldq xmm2, 1
1725INIT_XMM cpuname
1726 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
1727 movdqa xmm1, xmm0
1728 psrldq xmm1, 1
1729 movq [r0+r3*2], xmm0
1730 movq [r0+r3*1], xmm1
1731 psrldq xmm0, 2
1732 psrldq xmm1, 2
1733 movq [r2+r3*2], xmm0
1734 movq [r2+r3*1], xmm1
1735 psrldq xmm0, 2
1736 psrldq xmm1, 2
1737 movq [r1+r3*2], xmm0
1738 movq [r1+r3*1], xmm1
1739 psrldq xmm0, 2
1740 psrldq xmm1, 2
1741 movq [r4+r3*2], xmm0
1742 movq [r4+r3*1], xmm1
1743 RET
1744%endmacro
1745
1746INIT_MMX sse2
1747PRED8x8L_DOWN_RIGHT
1748INIT_MMX ssse3
1749PRED8x8L_DOWN_RIGHT
1750
1751;-----------------------------------------------------------------------------
1752; void ff_pred8x8l_vertical_right_8(uint8_t *src, int has_topleft,
1753; int has_topright, int stride)
1754;-----------------------------------------------------------------------------
1755
1756INIT_MMX mmxext
1757cglobal pred8x8l_vertical_right_8, 4,5
1758 sub r0, r3
1759 lea r4, [r0+r3*2]
1760 movq mm0, [r0+r3*1-8]
1761 punpckhbw mm0, [r0+r3*0-8]
1762 movq mm1, [r4+r3*1-8]
1763 punpckhbw mm1, [r0+r3*2-8]
1764 mov r4, r0
1765 punpckhwd mm1, mm0
1766 lea r0, [r0+r3*4]
1767 movq mm2, [r0+r3*1-8]
1768 punpckhbw mm2, [r0+r3*0-8]
1769 lea r0, [r0+r3*2]
1770 movq mm3, [r0+r3*1-8]
1771 punpckhbw mm3, [r0+r3*0-8]
1772 punpckhwd mm3, mm2
1773 punpckhdq mm3, mm1
1774 lea r0, [r0+r3*2]
1775 movq mm0, [r0+r3*0-8]
1776 movq mm1, [r4]
1777 mov r0, r4
1778 movq mm4, mm3
1779 movq mm2, mm3
1780 PALIGNR mm4, mm0, 7, mm0
1781 PALIGNR mm1, mm2, 1, mm2
1782 test r1, r1
1783 jz .fix_lt_1
1784 jmp .do_left
1785.fix_lt_1:
1786 movq mm5, mm3
1787 pxor mm5, mm4
1788 psrlq mm5, 56
1789 psllq mm5, 48
1790 pxor mm1, mm5
1791 jmp .do_left
1792.fix_lt_2:
1793 movq mm5, mm3
1794 pxor mm5, mm2
1795 psllq mm5, 56
1796 psrlq mm5, 56
1797 pxor mm2, mm5
1798 test r2, r2
1799 jnz .do_top
1800.fix_tr_1:
1801 movq mm5, mm3
1802 pxor mm5, mm1
1803 psrlq mm5, 56
1804 psllq mm5, 56
1805 pxor mm1, mm5
1806 jmp .do_top
1807.do_left:
1808 movq mm0, mm4
1809 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1810 movq mm7, mm2
1811 movq mm0, [r0-8]
1812 movq mm3, [r0]
1813 movq mm1, [r0+8]
1814 movq mm2, mm3
1815 movq mm4, mm3
1816 PALIGNR mm2, mm0, 7, mm0
1817 PALIGNR mm1, mm4, 1, mm4
1818 test r1, r1
1819 jz .fix_lt_2
1820 test r2, r2
1821 jz .fix_tr_1
1822.do_top:
1823 PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
1824 lea r1, [r0+r3*2]
1825 movq mm2, mm6
1826 movq mm3, mm6
1827 PALIGNR mm3, mm7, 7, mm0
1828 PALIGNR mm6, mm7, 6, mm1
1829 movq mm4, mm3
1830 pavgb mm3, mm2
1831 lea r2, [r1+r3*2]
1832 PRED4x4_LOWPASS mm0, mm6, mm2, mm4, mm5
1833 movq [r0+r3*1], mm3
1834 movq [r0+r3*2], mm0
1835 movq mm5, mm0
1836 movq mm6, mm3
1837 movq mm1, mm7
1838 movq mm2, mm1
1839 psllq mm2, 8
1840 movq mm3, mm1
1841 psllq mm3, 16
1842 lea r4, [r2+r3*2]
1843 PRED4x4_LOWPASS mm0, mm1, mm3, mm2, mm4
1844 PALIGNR mm6, mm0, 7, mm2
1845 movq [r1+r3*1], mm6
1846 psllq mm0, 8
1847 PALIGNR mm5, mm0, 7, mm1
1848 movq [r1+r3*2], mm5
1849 psllq mm0, 8
1850 PALIGNR mm6, mm0, 7, mm2
1851 movq [r2+r3*1], mm6
1852 psllq mm0, 8
1853 PALIGNR mm5, mm0, 7, mm1
1854 movq [r2+r3*2], mm5
1855 psllq mm0, 8
1856 PALIGNR mm6, mm0, 7, mm2
1857 movq [r4+r3*1], mm6
1858 psllq mm0, 8
1859 PALIGNR mm5, mm0, 7, mm1
1860 movq [r4+r3*2], mm5
1861 RET
1862
1863%macro PRED8x8L_VERTICAL_RIGHT 0
1864cglobal pred8x8l_vertical_right_8, 4,5,7
1865 ; manually spill XMM registers for Win64 because
1866 ; the code here is initialized with INIT_MMX
1867 WIN64_SPILL_XMM 7
1868 sub r0, r3
1869 lea r4, [r0+r3*2]
1870 movq mm0, [r0+r3*1-8]
1871 punpckhbw mm0, [r0+r3*0-8]
1872 movq mm1, [r4+r3*1-8]
1873 punpckhbw mm1, [r0+r3*2-8]
1874 mov r4, r0
1875 punpckhwd mm1, mm0
1876 lea r0, [r0+r3*4]
1877 movq mm2, [r0+r3*1-8]
1878 punpckhbw mm2, [r0+r3*0-8]
1879 lea r0, [r0+r3*2]
1880 movq mm3, [r0+r3*1-8]
1881 punpckhbw mm3, [r0+r3*0-8]
1882 punpckhwd mm3, mm2
1883 punpckhdq mm3, mm1
1884 lea r0, [r0+r3*2]
1885 movq mm0, [r0+r3*0-8]
1886 movq mm1, [r4]
1887 mov r0, r4
1888 movq mm4, mm3
1889 movq mm2, mm3
1890 PALIGNR mm4, mm0, 7, mm0
1891 PALIGNR mm1, mm2, 1, mm2
1892 test r1, r1
1893 jnz .do_left
1894.fix_lt_1:
1895 movq mm5, mm3
1896 pxor mm5, mm4
1897 psrlq mm5, 56
1898 psllq mm5, 48
1899 pxor mm1, mm5
1900 jmp .do_left
1901.fix_lt_2:
1902 movq mm5, mm3
1903 pxor mm5, mm2
1904 psllq mm5, 56
1905 psrlq mm5, 56
1906 pxor mm2, mm5
1907 test r2, r2
1908 jnz .do_top
1909.fix_tr_1:
1910 movq mm5, mm3
1911 pxor mm5, mm1
1912 psrlq mm5, 56
1913 psllq mm5, 56
1914 pxor mm1, mm5
1915 jmp .do_top
1916.do_left:
1917 movq mm0, mm4
1918 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1919 movq2dq xmm0, mm2
1920 movq mm0, [r0-8]
1921 movq mm3, [r0]
1922 movq mm1, [r0+8]
1923 movq mm2, mm3
1924 movq mm4, mm3
1925 PALIGNR mm2, mm0, 7, mm0
1926 PALIGNR mm1, mm4, 1, mm4
1927 test r1, r1
1928 jz .fix_lt_2
1929 test r2, r2
1930 jz .fix_tr_1
1931.do_top:
1932 PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
1933 lea r1, [r0+r3*2]
1934 movq2dq xmm4, mm6
1935 pslldq xmm4, 8
1936 por xmm0, xmm4
1937 movdqa xmm6, [pw_ff00]
1938 movdqa xmm1, xmm0
1939 lea r2, [r1+r3*2]
1940 movdqa xmm2, xmm0
1941 movdqa xmm3, xmm0
1942 pslldq xmm0, 1
1943 pslldq xmm1, 2
1944 pavgb xmm2, xmm0
1945INIT_XMM cpuname
1946 PRED4x4_LOWPASS xmm4, xmm3, xmm1, xmm0, xmm5
1947 pandn xmm6, xmm4
1948 movdqa xmm5, xmm4
1949 psrlw xmm4, 8
1950 packuswb xmm6, xmm4
1951 movhlps xmm4, xmm6
1952 movhps [r0+r3*2], xmm5
1953 movhps [r0+r3*1], xmm2
1954 psrldq xmm5, 4
1955 movss xmm5, xmm6
1956 psrldq xmm2, 4
1957 movss xmm2, xmm4
1958 lea r0, [r2+r3*2]
1959 psrldq xmm5, 1
1960 psrldq xmm2, 1
1961 movq [r0+r3*2], xmm5
1962 movq [r0+r3*1], xmm2
1963 psrldq xmm5, 1
1964 psrldq xmm2, 1
1965 movq [r2+r3*2], xmm5
1966 movq [r2+r3*1], xmm2
1967 psrldq xmm5, 1
1968 psrldq xmm2, 1
1969 movq [r1+r3*2], xmm5
1970 movq [r1+r3*1], xmm2
1971 RET
1972%endmacro
1973
1974INIT_MMX sse2
1975PRED8x8L_VERTICAL_RIGHT
1976INIT_MMX ssse3
1977PRED8x8L_VERTICAL_RIGHT
1978
1979;-----------------------------------------------------------------------------
1980; void ff_pred8x8l_vertical_left_8(uint8_t *src, int has_topleft,
1981; int has_topright, int stride)
1982;-----------------------------------------------------------------------------
1983
1984%macro PRED8x8L_VERTICAL_LEFT 0
1985cglobal pred8x8l_vertical_left_8, 4,4
1986 sub r0, r3
1987 movq mm0, [r0-8]
1988 movq mm3, [r0]
1989 movq mm1, [r0+8]
1990 movq mm2, mm3
1991 movq mm4, mm3
1992 PALIGNR mm2, mm0, 7, mm0
1993 PALIGNR mm1, mm4, 1, mm4
1994 test r1, r1
1995 jz .fix_lt_2
1996 test r2, r2
1997 jz .fix_tr_1
1998 jmp .do_top
1999.fix_lt_2:
2000 movq mm5, mm3
2001 pxor mm5, mm2
2002 psllq mm5, 56
2003 psrlq mm5, 56
2004 pxor mm2, mm5
2005 test r2, r2
2006 jnz .do_top
2007.fix_tr_1:
2008 movq mm5, mm3
2009 pxor mm5, mm1
2010 psrlq mm5, 56
2011 psllq mm5, 56
2012 pxor mm1, mm5
2013 jmp .do_top
2014.fix_tr_2:
2015 punpckhbw mm3, mm3
2016 pshufw mm1, mm3, 0xFF
2017 jmp .do_topright
2018.do_top:
2019 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
2020 movq2dq xmm4, mm4
2021 test r2, r2
2022 jz .fix_tr_2
2023 movq mm0, [r0+8]
2024 movq mm5, mm0
2025 movq mm2, mm0
2026 movq mm4, mm0
2027 psrlq mm5, 56
2028 PALIGNR mm2, mm3, 7, mm3
2029 PALIGNR mm5, mm4, 1, mm4
2030 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
2031.do_topright:
2032 movq2dq xmm3, mm1
2033 lea r1, [r0+r3*2]
2034 pslldq xmm3, 8
2035 por xmm4, xmm3
2036 movdqa xmm2, xmm4
2037 movdqa xmm1, xmm4
2038 movdqa xmm3, xmm4
2039 psrldq xmm2, 1
2040 pslldq xmm1, 1
2041 pavgb xmm3, xmm2
2042 lea r2, [r1+r3*2]
2043INIT_XMM cpuname
2044 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm4, xmm5
2045 psrldq xmm0, 1
2046 movq [r0+r3*1], xmm3
2047 movq [r0+r3*2], xmm0
2048 lea r0, [r2+r3*2]
2049 psrldq xmm3, 1
2050 psrldq xmm0, 1
2051 movq [r1+r3*1], xmm3
2052 movq [r1+r3*2], xmm0
2053 psrldq xmm3, 1
2054 psrldq xmm0, 1
2055 movq [r2+r3*1], xmm3
2056 movq [r2+r3*2], xmm0
2057 psrldq xmm3, 1
2058 psrldq xmm0, 1
2059 movq [r0+r3*1], xmm3
2060 movq [r0+r3*2], xmm0
2061 RET
2062%endmacro
2063
2064INIT_MMX sse2
2065PRED8x8L_VERTICAL_LEFT
2066INIT_MMX ssse3
2067PRED8x8L_VERTICAL_LEFT
2068
2069;-----------------------------------------------------------------------------
2070; void ff_pred8x8l_horizontal_up_8(uint8_t *src, int has_topleft,
2071; int has_topright, int stride)
2072;-----------------------------------------------------------------------------
2073
2074%macro PRED8x8L_HORIZONTAL_UP 0
2075cglobal pred8x8l_horizontal_up_8, 4,4
2076 sub r0, r3
2077 lea r2, [r0+r3*2]
2078 movq mm0, [r0+r3*1-8]
2079 test r1, r1
2080 lea r1, [r0+r3]
2081 cmovnz r1, r0
2082 punpckhbw mm0, [r1+r3*0-8]
2083 movq mm1, [r2+r3*1-8]
2084 punpckhbw mm1, [r0+r3*2-8]
2085 mov r2, r0
2086 punpckhwd mm1, mm0
2087 lea r0, [r0+r3*4]
2088 movq mm2, [r0+r3*1-8]
2089 punpckhbw mm2, [r0+r3*0-8]
2090 lea r0, [r0+r3*2]
2091 movq mm3, [r0+r3*1-8]
2092 punpckhbw mm3, [r0+r3*0-8]
2093 punpckhwd mm3, mm2
2094 punpckhdq mm3, mm1
2095 lea r0, [r0+r3*2]
2096 movq mm0, [r0+r3*0-8]
2097 movq mm1, [r1+r3*0-8]
2098 mov r0, r2
2099 movq mm4, mm3
2100 movq mm2, mm3
2101 PALIGNR mm4, mm0, 7, mm0
2102 PALIGNR mm1, mm2, 1, mm2
2103 movq mm0, mm4
2104 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
2105 movq mm4, mm0
2106 movq mm7, mm2
2107 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
2108 psllq mm1, 56
2109 PALIGNR mm7, mm1, 7, mm3
2110 lea r1, [r0+r3*2]
2111 pshufw mm0, mm7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
2112 psllq mm7, 56 ; l7 .. .. .. .. .. .. ..
2113 movq mm2, mm0
2114 psllw mm0, 8
2115 psrlw mm2, 8
2116 por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0
2117 movq mm3, mm2
2118 movq mm4, mm2
2119 movq mm5, mm2
2120 psrlq mm2, 8
2121 psrlq mm3, 16
2122 lea r2, [r1+r3*2]
2123 por mm2, mm7 ; l7 l7 l6 l5 l4 l3 l2 l1
2124 punpckhbw mm7, mm7
2125 por mm3, mm7 ; l7 l7 l7 l6 l5 l4 l3 l2
2126 pavgb mm4, mm2
2127 PRED4x4_LOWPASS mm1, mm3, mm5, mm2, mm6
2128 movq mm5, mm4
2129 punpcklbw mm4, mm1 ; p4 p3 p2 p1
2130 punpckhbw mm5, mm1 ; p8 p7 p6 p5
2131 movq mm6, mm5
2132 movq mm7, mm5
2133 movq mm0, mm5
2134 PALIGNR mm5, mm4, 2, mm1
2135 pshufw mm1, mm6, 11111001b
2136 PALIGNR mm6, mm4, 4, mm2
2137 pshufw mm2, mm7, 11111110b
2138 PALIGNR mm7, mm4, 6, mm3
2139 pshufw mm3, mm0, 11111111b
2140 movq [r0+r3*1], mm4
2141 movq [r0+r3*2], mm5
2142 lea r0, [r2+r3*2]
2143 movq [r1+r3*1], mm6
2144 movq [r1+r3*2], mm7
2145 movq [r2+r3*1], mm0
2146 movq [r2+r3*2], mm1
2147 movq [r0+r3*1], mm2
2148 movq [r0+r3*2], mm3
2149 RET
2150%endmacro
2151
2152INIT_MMX mmxext
2153PRED8x8L_HORIZONTAL_UP
2154INIT_MMX ssse3
2155PRED8x8L_HORIZONTAL_UP
2156
2157;-----------------------------------------------------------------------------
2158; void ff_pred8x8l_horizontal_down_8(uint8_t *src, int has_topleft,
2159; int has_topright, int stride)
2160;-----------------------------------------------------------------------------
2161
2162INIT_MMX mmxext
2163cglobal pred8x8l_horizontal_down_8, 4,5
2164 sub r0, r3
2165 lea r4, [r0+r3*2]
2166 movq mm0, [r0+r3*1-8]
2167 punpckhbw mm0, [r0+r3*0-8]
2168 movq mm1, [r4+r3*1-8]
2169 punpckhbw mm1, [r0+r3*2-8]
2170 mov r4, r0
2171 punpckhwd mm1, mm0
2172 lea r0, [r0+r3*4]
2173 movq mm2, [r0+r3*1-8]
2174 punpckhbw mm2, [r0+r3*0-8]
2175 lea r0, [r0+r3*2]
2176 movq mm3, [r0+r3*1-8]
2177 punpckhbw mm3, [r0+r3*0-8]
2178 punpckhwd mm3, mm2
2179 punpckhdq mm3, mm1
2180 lea r0, [r0+r3*2]
2181 movq mm0, [r0+r3*0-8]
2182 movq mm1, [r4]
2183 mov r0, r4
2184 movq mm4, mm3
2185 movq mm2, mm3
2186 PALIGNR mm4, mm0, 7, mm0
2187 PALIGNR mm1, mm2, 1, mm2
2188 test r1, r1
2189 jnz .do_left
2190.fix_lt_1:
2191 movq mm5, mm3
2192 pxor mm5, mm4
2193 psrlq mm5, 56
2194 psllq mm5, 48
2195 pxor mm1, mm5
2196 jmp .do_left
2197.fix_lt_2:
2198 movq mm5, mm3
2199 pxor mm5, mm2
2200 psllq mm5, 56
2201 psrlq mm5, 56
2202 pxor mm2, mm5
2203 test r2, r2
2204 jnz .do_top
2205.fix_tr_1:
2206 movq mm5, mm3
2207 pxor mm5, mm1
2208 psrlq mm5, 56
2209 psllq mm5, 56
2210 pxor mm1, mm5
2211 jmp .do_top
2212.do_left:
2213 movq mm0, mm4
2214 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
2215 movq mm4, mm0
2216 movq mm7, mm2
2217 movq mm6, mm2
2218 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
2219 psllq mm1, 56
2220 PALIGNR mm7, mm1, 7, mm3
2221 movq mm0, [r0-8]
2222 movq mm3, [r0]
2223 movq mm1, [r0+8]
2224 movq mm2, mm3
2225 movq mm4, mm3
2226 PALIGNR mm2, mm0, 7, mm0
2227 PALIGNR mm1, mm4, 1, mm4
2228 test r1, r1
2229 jz .fix_lt_2
2230 test r2, r2
2231 jz .fix_tr_1
2232.do_top:
2233 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
2234 movq mm5, mm4
2235 lea r1, [r0+r3*2]
2236 psllq mm7, 56
2237 movq mm2, mm5
2238 movq mm3, mm6
2239 movq mm4, mm2
2240 PALIGNR mm2, mm6, 7, mm5
2241 PALIGNR mm6, mm7, 7, mm0
2242 lea r2, [r1+r3*2]
2243 PALIGNR mm4, mm3, 1, mm7
2244 movq mm5, mm3
2245 pavgb mm3, mm6
2246 PRED4x4_LOWPASS mm0, mm4, mm6, mm5, mm7
2247 movq mm4, mm2
2248 movq mm1, mm2
2249 lea r4, [r2+r3*2]
2250 psrlq mm4, 16
2251 psrlq mm1, 8
2252 PRED4x4_LOWPASS mm6, mm4, mm2, mm1, mm5
2253 movq mm7, mm3
2254 punpcklbw mm3, mm0
2255 punpckhbw mm7, mm0
2256 movq mm1, mm7
2257 movq mm0, mm7
2258 movq mm4, mm7
2259 movq [r4+r3*2], mm3
2260 PALIGNR mm7, mm3, 2, mm5
2261 movq [r4+r3*1], mm7
2262 PALIGNR mm1, mm3, 4, mm5
2263 movq [r2+r3*2], mm1
2264 PALIGNR mm0, mm3, 6, mm3
2265 movq [r2+r3*1], mm0
2266 movq mm2, mm6
2267 movq mm3, mm6
2268 movq [r1+r3*2], mm4
2269 PALIGNR mm6, mm4, 2, mm5
2270 movq [r1+r3*1], mm6
2271 PALIGNR mm2, mm4, 4, mm5
2272 movq [r0+r3*2], mm2
2273 PALIGNR mm3, mm4, 6, mm4
2274 movq [r0+r3*1], mm3
2275 RET
2276
2277%macro PRED8x8L_HORIZONTAL_DOWN 0
2278cglobal pred8x8l_horizontal_down_8, 4,5
2279 sub r0, r3
2280 lea r4, [r0+r3*2]
2281 movq mm0, [r0+r3*1-8]
2282 punpckhbw mm0, [r0+r3*0-8]
2283 movq mm1, [r4+r3*1-8]
2284 punpckhbw mm1, [r0+r3*2-8]
2285 mov r4, r0
2286 punpckhwd mm1, mm0
2287 lea r0, [r0+r3*4]
2288 movq mm2, [r0+r3*1-8]
2289 punpckhbw mm2, [r0+r3*0-8]
2290 lea r0, [r0+r3*2]
2291 movq mm3, [r0+r3*1-8]
2292 punpckhbw mm3, [r0+r3*0-8]
2293 punpckhwd mm3, mm2
2294 punpckhdq mm3, mm1
2295 lea r0, [r0+r3*2]
2296 movq mm0, [r0+r3*0-8]
2297 movq mm1, [r4]
2298 mov r0, r4
2299 movq mm4, mm3
2300 movq mm2, mm3
2301 PALIGNR mm4, mm0, 7, mm0
2302 PALIGNR mm1, mm2, 1, mm2
2303 test r1, r1
2304 jnz .do_left
2305.fix_lt_1:
2306 movq mm5, mm3
2307 pxor mm5, mm4
2308 psrlq mm5, 56
2309 psllq mm5, 48
2310 pxor mm1, mm5
2311 jmp .do_left
2312.fix_lt_2:
2313 movq mm5, mm3
2314 pxor mm5, mm2
2315 psllq mm5, 56
2316 psrlq mm5, 56
2317 pxor mm2, mm5
2318 test r2, r2
2319 jnz .do_top
2320.fix_tr_1:
2321 movq mm5, mm3
2322 pxor mm5, mm1
2323 psrlq mm5, 56
2324 psllq mm5, 56
2325 pxor mm1, mm5
2326 jmp .do_top
2327.fix_tr_2:
2328 punpckhbw mm3, mm3
2329 pshufw mm1, mm3, 0xFF
2330 jmp .do_topright
2331.do_left:
2332 movq mm0, mm4
2333 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
2334 movq2dq xmm0, mm2
2335 pslldq xmm0, 8
2336 movq mm4, mm0
2337 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
2338 movq2dq xmm2, mm1
2339 pslldq xmm2, 15
2340 psrldq xmm2, 8
2341 por xmm0, xmm2
2342 movq mm0, [r0-8]
2343 movq mm3, [r0]
2344 movq mm1, [r0+8]
2345 movq mm2, mm3
2346 movq mm4, mm3
2347 PALIGNR mm2, mm0, 7, mm0
2348 PALIGNR mm1, mm4, 1, mm4
2349 test r1, r1
2350 jz .fix_lt_2
2351 test r2, r2
2352 jz .fix_tr_1
2353.do_top:
2354 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
2355 movq2dq xmm1, mm4
2356 test r2, r2
2357 jz .fix_tr_2
2358 movq mm0, [r0+8]
2359 movq mm5, mm0
2360 movq mm2, mm0
2361 movq mm4, mm0
2362 psrlq mm5, 56
2363 PALIGNR mm2, mm3, 7, mm3
2364 PALIGNR mm5, mm4, 1, mm4
2365 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
2366.do_topright:
2367 movq2dq xmm5, mm1
2368 pslldq xmm5, 8
2369 por xmm1, xmm5
2370INIT_XMM cpuname
2371 lea r2, [r4+r3*2]
2372 movdqa xmm2, xmm1
2373 movdqa xmm3, xmm1
2374 PALIGNR xmm1, xmm0, 7, xmm4
2375 PALIGNR xmm2, xmm0, 9, xmm5
2376 lea r1, [r2+r3*2]
2377 PALIGNR xmm3, xmm0, 8, xmm0
2378 movdqa xmm4, xmm1
2379 pavgb xmm4, xmm3
2380 lea r0, [r1+r3*2]
2381 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm5
2382 punpcklbw xmm4, xmm0
2383 movhlps xmm0, xmm4
2384 movq [r0+r3*2], xmm4
2385 movq [r2+r3*2], xmm0
2386 psrldq xmm4, 2
2387 psrldq xmm0, 2
2388 movq [r0+r3*1], xmm4
2389 movq [r2+r3*1], xmm0
2390 psrldq xmm4, 2
2391 psrldq xmm0, 2
2392 movq [r1+r3*2], xmm4
2393 movq [r4+r3*2], xmm0
2394 psrldq xmm4, 2
2395 psrldq xmm0, 2
2396 movq [r1+r3*1], xmm4
2397 movq [r4+r3*1], xmm0
2398 RET
2399%endmacro
2400
2401INIT_MMX sse2
2402PRED8x8L_HORIZONTAL_DOWN
2403INIT_MMX ssse3
2404PRED8x8L_HORIZONTAL_DOWN
2405
2406;-------------------------------------------------------------------------------
2407; void ff_pred4x4_dc_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2408;-------------------------------------------------------------------------------
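; 4x4 DC: the average of the 4 samples above and the 4 to the left, with
; rounding. A rough C sketch (the imul by 0x01010101 below broadcasts the
; resulting byte across each 4-byte row):
;
;   int dc = 4;
;   for (int i = 0; i < 4; i++)
;       dc += src[-stride + i] + src[i * stride - 1];
;   uint32_t row = (dc >> 3) * 0x01010101u;   // four copies of the DC byte
;   /* each of the 4 rows is written as this 32-bit pattern */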
2409
2410INIT_MMX mmxext
2411cglobal pred4x4_dc_8, 3,5
2412 pxor mm7, mm7
2413 mov r4, r0
2414 sub r0, r2
2415 movd mm0, [r0]
2416 psadbw mm0, mm7
2417 movzx r1d, byte [r0+r2*1-1]
2418 movd r3d, mm0
2419 add r3d, r1d
2420 movzx r1d, byte [r0+r2*2-1]
2421 lea r0, [r0+r2*2]
2422 add r3d, r1d
2423 movzx r1d, byte [r0+r2*1-1]
2424 add r3d, r1d
2425 movzx r1d, byte [r0+r2*2-1]
2426 add r3d, r1d
2427 add r3d, 4
2428 shr r3d, 3
2429 imul r3d, 0x01010101
2430 mov [r4+r2*0], r3d
2431 mov [r0+r2*0], r3d
2432 mov [r0+r2*1], r3d
2433 mov [r0+r2*2], r3d
2434 RET
2435
2436;-----------------------------------------------------------------------------
2437; void ff_pred4x4_tm_vp8_8_mmxext(uint8_t *src, const uint8_t *topright,
2438; int stride)
2439;-----------------------------------------------------------------------------
2440
2441%macro PRED4x4_TM 0
2442cglobal pred4x4_tm_vp8_8, 3,6
2443 sub r0, r2
2444 pxor mm7, mm7
2445 movd mm0, [r0]
2446 punpcklbw mm0, mm7
2447 movzx r4d, byte [r0-1]
2448 mov r5d, 2
2449.loop:
2450 movzx r1d, byte [r0+r2*1-1]
2451 movzx r3d, byte [r0+r2*2-1]
2452 sub r1d, r4d
2453 sub r3d, r4d
2454 movd mm2, r1d
2455 movd mm4, r3d
2456%if cpuflag(mmxext)
2457 pshufw mm2, mm2, 0
2458 pshufw mm4, mm4, 0
2459%else
2460 punpcklwd mm2, mm2
2461 punpcklwd mm4, mm4
2462 punpckldq mm2, mm2
2463 punpckldq mm4, mm4
2464%endif
2465 paddw mm2, mm0
2466 paddw mm4, mm0
2467 packuswb mm2, mm2
2468 packuswb mm4, mm4
2469 movd [r0+r2*1], mm2
2470 movd [r0+r2*2], mm4
2471 lea r0, [r0+r2*2]
2472 dec r5d
2473 jg .loop
2474 REP_RET
2475%endmacro
2476
2477INIT_MMX mmx
2478PRED4x4_TM
2479INIT_MMX mmxext
2480PRED4x4_TM
2481
2482INIT_XMM ssse3
2483cglobal pred4x4_tm_vp8_8, 3,3
2484 sub r0, r2
2485 movq mm6, [tm_shuf]
2486 pxor mm1, mm1
2487 movd mm0, [r0]
2488 punpcklbw mm0, mm1
2489 movd mm7, [r0-4]
2490 pshufb mm7, mm6
2491 lea r1, [r0+r2*2]
2492 movd mm2, [r0+r2*1-4]
2493 movd mm3, [r0+r2*2-4]
2494 movd mm4, [r1+r2*1-4]
2495 movd mm5, [r1+r2*2-4]
2496 pshufb mm2, mm6
2497 pshufb mm3, mm6
2498 pshufb mm4, mm6
2499 pshufb mm5, mm6
2500 psubw mm0, mm7
2501 paddw mm2, mm0
2502 paddw mm3, mm0
2503 paddw mm4, mm0
2504 paddw mm5, mm0
2505 packuswb mm2, mm2
2506 packuswb mm3, mm3
2507 packuswb mm4, mm4
2508 packuswb mm5, mm5
2509 movd [r0+r2*1], mm2
2510 movd [r0+r2*2], mm3
2511 movd [r1+r2*1], mm4
2512 movd [r1+r2*2], mm5
2513 RET
2514
2515;-----------------------------------------------------------------------------
2516; void ff_pred4x4_vertical_vp8_8_mmxext(uint8_t *src, const uint8_t *topright,
2517; int stride)
2518;-----------------------------------------------------------------------------
2519
2520INIT_MMX mmxext
2521cglobal pred4x4_vertical_vp8_8, 3,3
2522 sub r0, r2
2523 movd m1, [r0-1]
2524 movd m0, [r0]
2525 mova m2, m0 ;t0 t1 t2 t3
2526 punpckldq m0, [r1] ;t0 t1 t2 t3 t4 t5 t6 t7
2527 lea r1, [r0+r2*2]
2528 psrlq m0, 8 ;t1 t2 t3 t4
2529 PRED4x4_LOWPASS m3, m1, m0, m2, m4
2530 movd [r0+r2*1], m3
2531 movd [r0+r2*2], m3
2532 movd [r1+r2*1], m3
2533 movd [r1+r2*2], m3
2534 RET
2535
2536;-----------------------------------------------------------------------------
2537; void ff_pred4x4_down_left_8_mmxext(uint8_t *src, const uint8_t *topright,
2538; int stride)
2539;-----------------------------------------------------------------------------
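; Down-left prediction: with t[0..7] holding the top and top-right neighbours
; (t[7] reused for indices past the end), each pixel is the 3-tap lowpass
; along its anti-diagonal:
;
;   pred[y][x] = (t[x+y] + 2*t[x+y+1] + t[min(x+y+2, 7)] + 2) >> 2
;
; the shift/xor shuffling below builds the "shifted by one, last sample
; repeated" operand before handing everything to PRED4x4_LOWPASS.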
2540INIT_MMX mmxext
2541cglobal pred4x4_down_left_8, 3,3
2542 sub r0, r2
2543 movq m1, [r0]
2544 punpckldq m1, [r1]
2545 movq m2, m1
2546 movq m3, m1
2547 psllq m1, 8
2548 pxor m2, m1
2549 psrlq m2, 8
2550 pxor m2, m3
2551 PRED4x4_LOWPASS m0, m1, m2, m3, m4
2552 lea r1, [r0+r2*2]
2553 psrlq m0, 8
2554 movd [r0+r2*1], m0
2555 psrlq m0, 8
2556 movd [r0+r2*2], m0
2557 psrlq m0, 8
2558 movd [r1+r2*1], m0
2559 psrlq m0, 8
2560 movd [r1+r2*2], m0
2561 RET
2562
2563;------------------------------------------------------------------------------
2564; void ff_pred4x4_vertical_left_8_mmxext(uint8_t *src, const uint8_t *topright,
2565; int stride)
2566;------------------------------------------------------------------------------
2567
2568INIT_MMX mmxext
2569cglobal pred4x4_vertical_left_8, 3,3
2570 sub r0, r2
2571 movq m1, [r0]
2572 punpckldq m1, [r1]
2573 movq m3, m1
2574 movq m2, m1
2575 psrlq m3, 8
2576 psrlq m2, 16
2577 movq m4, m3
2578 pavgb m4, m1
2579 PRED4x4_LOWPASS m0, m1, m2, m3, m5
2580 lea r1, [r0+r2*2]
2581 movh [r0+r2*1], m4
2582 movh [r0+r2*2], m0
2583 psrlq m4, 8
2584 psrlq m0, 8
2585 movh [r1+r2*1], m4
2586 movh [r1+r2*2], m0
2587 RET
2588
2589;------------------------------------------------------------------------------
2590; void ff_pred4x4_horizontal_up_8_mmxext(uint8_t *src, const uint8_t *topright,
2591; int stride)
2592;------------------------------------------------------------------------------
2593
2594INIT_MMX mmxext
2595cglobal pred4x4_horizontal_up_8, 3,3
2596 sub r0, r2
2597 lea r1, [r0+r2*2]
2598 movd m0, [r0+r2*1-4]
2599 punpcklbw m0, [r0+r2*2-4]
2600 movd m1, [r1+r2*1-4]
2601 punpcklbw m1, [r1+r2*2-4]
2602 punpckhwd m0, m1
2603 movq m1, m0
2604 punpckhbw m1, m1
2605 pshufw m1, m1, 0xFF
2606 punpckhdq m0, m1
2607 movq m2, m0
2608 movq m3, m0
2609 movq m7, m0
2610 psrlq m2, 16
2611 psrlq m3, 8
2612 pavgb m7, m3
2613 PRED4x4_LOWPASS m4, m0, m2, m3, m5
2614 punpcklbw m7, m4
2615 movd [r0+r2*1], m7
2616 psrlq m7, 16
2617 movd [r0+r2*2], m7
2618 psrlq m7, 16
2619 movd [r1+r2*1], m7
2620 movd [r1+r2*2], m1
2621 RET
2622
2623;------------------------------------------------------------------------------
2624; void ff_pred4x4_horizontal_down_8_mmxext(uint8_t *src,
2625; const uint8_t *topright, int stride)
2626;------------------------------------------------------------------------------
2627
2628INIT_MMX mmxext
2629cglobal pred4x4_horizontal_down_8, 3,3
2630 sub r0, r2
2631 lea r1, [r0+r2*2]
2632 movh m0, [r0-4] ; lt ..
2633 punpckldq m0, [r0] ; t3 t2 t1 t0 lt .. .. ..
2634 psllq m0, 8 ; t2 t1 t0 lt .. .. .. ..
2635 movd m1, [r1+r2*2-4] ; l3
2636 punpcklbw m1, [r1+r2*1-4] ; l2 l3
2637 movd m2, [r0+r2*2-4] ; l1
2638 punpcklbw m2, [r0+r2*1-4] ; l0 l1
2639 punpckhwd m1, m2 ; l0 l1 l2 l3
2640 punpckhdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
2641 movq m0, m1
2642 movq m2, m1
2643 movq m5, m1
2644 psrlq m0, 16 ; .. .. t2 t1 t0 lt l0 l1
2645 psrlq m2, 8 ; .. t2 t1 t0 lt l0 l1 l2
2646 pavgb m5, m2
2647 PRED4x4_LOWPASS m3, m1, m0, m2, m4
2648 punpcklbw m5, m3
2649 psrlq m3, 32
2650 PALIGNR m3, m5, 6, m4
2651 movh [r1+r2*2], m5
2652 psrlq m5, 16
2653 movh [r1+r2*1], m5
2654 psrlq m5, 16
2655 movh [r0+r2*2], m5
2656 movh [r0+r2*1], m3
2657 RET
2658
2659;-----------------------------------------------------------------------------
2660; void ff_pred4x4_vertical_right_8_mmxext(uint8_t *src,
2661; const uint8_t *topright, int stride)
2662;-----------------------------------------------------------------------------
2663
2664INIT_MMX mmxext
2665cglobal pred4x4_vertical_right_8, 3,3
2666 sub r0, r2
2667 lea r1, [r0+r2*2]
2668 movh m0, [r0] ; ........t3t2t1t0
2669 movq m5, m0
2670 PALIGNR m0, [r0-8], 7, m1 ; ......t3t2t1t0lt
2671 pavgb m5, m0
2672 PALIGNR m0, [r0+r2*1-8], 7, m1 ; ....t3t2t1t0ltl0
2673 movq m1, m0
2674 PALIGNR m0, [r0+r2*2-8], 7, m2 ; ..t3t2t1t0ltl0l1
2675 movq m2, m0
2676 PALIGNR m0, [r1+r2*1-8], 7, m3 ; t3t2t1t0ltl0l1l2
2677 PRED4x4_LOWPASS m3, m1, m0, m2, m4
2678 movq m1, m3
2679 psrlq m3, 16
2680 psllq m1, 48
2681 movh [r0+r2*1], m5
2682 movh [r0+r2*2], m3
2683 PALIGNR m5, m1, 7, m2
2684 psllq m1, 8
2685 movh [r1+r2*1], m5
2686 PALIGNR m3, m1, 7, m1
2687 movh [r1+r2*2], m3
2688 RET
2689
2690;-----------------------------------------------------------------------------
2691; void ff_pred4x4_down_right_8_mmxext(uint8_t *src, const uint8_t *topright,
2692; int stride)
2693;-----------------------------------------------------------------------------
2694
2695INIT_MMX mmxext
2696cglobal pred4x4_down_right_8, 3,3
2697 sub r0, r2
2698 lea r1, [r0+r2*2]
2699 movq m1, [r1-8]
2700 movq m2, [r0+r2*1-8]
2701 punpckhbw m2, [r0-8]
2702 movh m3, [r0]
2703 punpckhwd m1, m2
2704 PALIGNR m3, m1, 5, m1
2705 movq m1, m3
2706 PALIGNR m3, [r1+r2*1-8], 7, m4
2707 movq m2, m3
2708 PALIGNR m3, [r1+r2*2-8], 7, m4
2709 PRED4x4_LOWPASS m0, m3, m1, m2, m4
2710 movh [r1+r2*2], m0
2711 psrlq m0, 8
2712 movh [r1+r2*1], m0
2713 psrlq m0, 8
2714 movh [r0+r2*2], m0
2715 psrlq m0, 8
2716 movh [r0+r2*1], m0
2717 RET