; Imported Debian version 2.4.3~trusty1
; deb_ffmpeg.git / ffmpeg / libavcodec / x86 / vp9intrapred.asm
;******************************************************************************
;* VP9 Intra prediction SIMD optimizations
;*
;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
;*
;* Parts based on:
;* H.264 intra prediction asm optimizations
;* Copyright (c) 2010 Fiona Glaser
;* Copyright (c) 2010 Holger Lubitz
;* Copyright (c) 2010 Loren Merritt
;* Copyright (c) 2010 Ronald S. Bultje
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

; Rounding multipliers for pmulhrsw-based DC averaging:
; pmulhrsw x, (1 << (15 - n)) computes round(x / 2^n).
pw_m256:              times 16 dw -256
pw_m255:              times 16 dw -255
pw_4096:              times  8 dw 4096

; pshufb index tables; names encode the byte pattern they produce
; (hex digits for indices >= 10, "m1" = -1 i.e. zero the output byte).
pb_4x3_4x2_4x1_4x0:   times 4 db 3
                      times 4 db 2
                      times 4 db 1
                      times 4 db 0
pb_8x1_8x0:           times 8 db 1
                      times 8 db 0
pb_8x3_8x2:           times 8 db 3
                      times 8 db 2
pb_0to5_2x7:          db 0, 1, 2, 3, 4, 5, 7, 7
                      times 8 db -1
pb_0to6_9x7:          db 0, 1, 2, 3, 4, 5, 6
                      times 9 db 7
pb_1to6_10x7:         db 1, 2, 3, 4, 5, 6
                      times 10 db 7
pb_2to6_3x7:                            ; alias: shares storage with the label below
pb_2to6_11x7:         db 2, 3, 4, 5, 6
                      times 11 db 7
pb_1toE_2xF:          db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15
pb_2toE_3xF:          db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15
pb_13456_3xm1:        db 1, 3, 4, 5, 6
                      times 3 db -1
pb_6012_4xm1:         db 6, 0, 1, 2
                      times 4 db -1
pb_6xm1_246_8toE:     times 6 db -1
                      db 2, 4, 6, 8, 9, 10, 11, 12, 13, 14
pb_6xm1_BDF_0to6:     times 6 db -1
                      db 11, 13, 15, 0, 1, 2, 3, 4, 5, 6
pb_02468ACE_13579BDF: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
pb_7to1_9x0:          db 7, 6, 5, 4    ; falls through into the next label's bytes
pb_3to1_5x0:          db 3, 2, 1
                      times 9 db 0
pb_Fto0:              db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

pb_2:                 times 32 db 2
pb_15:                times 16 db 15

; shared constants provided by libavcodec/libavutil
cextern pb_1
cextern pb_3
cextern pw_512
cextern pw_1024
cextern pw_2048
cextern pw_8192

SECTION .text
82
; dc_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a)
;
; DC prediction: dst[] is filled with round((sum(left) + sum(above)) / 2N).
; psadbw against zero horizontally sums 8 bytes per lane; pmulhrsw by
; (1 << (15 - log2(2N))) performs the rounded division; pshufb with an
; all-zero index register splats the result byte.

INIT_MMX ssse3
cglobal vp9_ipred_dc_4x4, 4, 4, 0, dst, stride, l, a
    movd                    m0, [lq]
    punpckldq               m0, [aq]          ; m0 = l[0..3] | a[0..3]
    pxor                    m1, m1
    psadbw                  m0, m1            ; sum of the 8 edge pixels
    pmulhrsw                m0, [pw_4096]     ; round(sum / 8)
    pshufb                  m0, m1            ; splat low byte
    movd      [dstq+strideq*0], m0
    movd      [dstq+strideq*1], m0
    lea                   dstq, [dstq+strideq*2]
    movd      [dstq+strideq*0], m0
    movd      [dstq+strideq*1], m0
    RET

INIT_MMX ssse3
cglobal vp9_ipred_dc_8x8, 4, 4, 0, dst, stride, l, a
    movq                    m0, [lq]
    movq                    m1, [aq]
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    pxor                    m2, m2
    psadbw                  m0, m2
    psadbw                  m1, m2
    paddw                   m0, m1            ; sum of 16 edge pixels
    pmulhrsw                m0, [pw_2048]     ; round(sum / 16)
    pshufb                  m0, m2
    movq      [dstq+strideq*0], m0
    movq      [dstq+strideq*1], m0
    movq      [dstq+strideq*2], m0
    movq      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    movq      [dstq+strideq*0], m0
    movq      [dstq+strideq*1], m0
    movq      [dstq+strideq*2], m0
    movq      [dstq+stride3q ], m0
    RET

INIT_XMM ssse3
cglobal vp9_ipred_dc_16x16, 4, 4, 3, dst, stride, l, a
    mova                    m0, [lq]
    mova                    m1, [aq]
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
    pxor                    m2, m2
    psadbw                  m0, m2
    psadbw                  m1, m2
    paddw                   m0, m1
    movhlps                 m1, m0            ; fold the two qword partial sums
    paddw                   m0, m1
    pmulhrsw                m0, [pw_1024]     ; round(sum / 32)
    pshufb                  m0, m2
    mov                   cntd, 4
.loop:
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jg .loop
    RET

INIT_XMM ssse3
cglobal vp9_ipred_dc_32x32, 4, 4, 5, dst, stride, l, a
    mova                    m0, [lq]
    mova                    m1, [lq+16]
    mova                    m2, [aq]
    mova                    m3, [aq+16]
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
    pxor                    m4, m4
    psadbw                  m0, m4
    psadbw                  m1, m4
    psadbw                  m2, m4
    psadbw                  m3, m4
    paddw                   m0, m1
    paddw                   m2, m3
    paddw                   m0, m2
    movhlps                 m1, m0
    paddw                   m0, m1
    pmulhrsw                m0, [pw_512]      ; round(sum / 64)
    pshufb                  m0, m4
    mov                   cntd, 8
.loop:
    mova [dstq+strideq*0+ 0], m0
    mova [dstq+strideq*0+16], m0
    mova [dstq+strideq*1+ 0], m0
    mova [dstq+strideq*1+16], m0
    mova [dstq+strideq*2+ 0], m0
    mova [dstq+strideq*2+16], m0
    mova [dstq+stride3q + 0], m0
    mova [dstq+stride3q +16], m0
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jg .loop
    RET

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
cglobal vp9_ipred_dc_32x32, 4, 4, 3, dst, stride, l, a
    mova                    m0, [lq]
    mova                    m1, [aq]
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
    pxor                    m2, m2
    psadbw                  m0, m2
    psadbw                  m1, m2
    paddw                   m0, m1
    vextracti128           xm1, m0, 1         ; fold upper lane into lower
    paddw                  xm0, xm1
    movhlps                xm1, xm0
    paddw                  xm0, xm1
    pmulhrsw               xm0, [pw_512]
    vpbroadcastb            m0, xm0           ; splat across the full ymm
    mov                   cntd, 4
.loop:
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jg .loop
    RET
%endif
216
; dc_top/left_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a)
;
; One-sided DC prediction: only the top row (or only the left column) is
; averaged, so the rounding constant divides by N instead of 2N.
; The macro is instantiated once per direction; %2 selects which pointer
; argument (a or l) supplies the edge pixels.

%macro DC_1D_FUNCS 2 ; dir (top or left), arg (a or l)
INIT_MMX ssse3
cglobal vp9_ipred_dc_%1_4x4, 4, 4, 0, dst, stride, l, a
    movd                    m0, [%2q]
    pxor                    m1, m1
    psadbw                  m0, m1
    pmulhrsw                m0, [pw_8192]     ; round(sum / 4)
    pshufb                  m0, m1
    movd      [dstq+strideq*0], m0
    movd      [dstq+strideq*1], m0
    lea                   dstq, [dstq+strideq*2]
    movd      [dstq+strideq*0], m0
    movd      [dstq+strideq*1], m0
    RET

INIT_MMX ssse3
cglobal vp9_ipred_dc_%1_8x8, 4, 4, 0, dst, stride, l, a
    movq                    m0, [%2q]
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    pxor                    m1, m1
    psadbw                  m0, m1
    pmulhrsw                m0, [pw_4096]     ; round(sum / 8)
    pshufb                  m0, m1
    movq      [dstq+strideq*0], m0
    movq      [dstq+strideq*1], m0
    movq      [dstq+strideq*2], m0
    movq      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    movq      [dstq+strideq*0], m0
    movq      [dstq+strideq*1], m0
    movq      [dstq+strideq*2], m0
    movq      [dstq+stride3q ], m0
    RET

INIT_XMM ssse3
cglobal vp9_ipred_dc_%1_16x16, 4, 4, 3, dst, stride, l, a
    mova                    m0, [%2q]
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
    pxor                    m2, m2
    psadbw                  m0, m2
    movhlps                 m1, m0
    paddw                   m0, m1
    pmulhrsw                m0, [pw_2048]     ; round(sum / 16)
    pshufb                  m0, m2
    mov                   cntd, 4
.loop:
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jg .loop
    RET

INIT_XMM ssse3
cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a
    mova                    m0, [%2q]
    mova                    m1, [%2q+16]
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
    pxor                    m2, m2
    psadbw                  m0, m2
    psadbw                  m1, m2
    paddw                   m0, m1
    movhlps                 m1, m0
    paddw                   m0, m1
    pmulhrsw                m0, [pw_1024]     ; round(sum / 32)
    pshufb                  m0, m2
    mov                   cntd, 8
.loop:
    mova [dstq+strideq*0+ 0], m0
    mova [dstq+strideq*0+16], m0
    mova [dstq+strideq*1+ 0], m0
    mova [dstq+strideq*1+16], m0
    mova [dstq+strideq*2+ 0], m0
    mova [dstq+strideq*2+16], m0
    mova [dstq+stride3q + 0], m0
    mova [dstq+stride3q +16], m0
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jg .loop
    RET

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a
    mova                    m0, [%2q]
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
    pxor                    m2, m2
    psadbw                  m0, m2
    vextracti128           xm1, m0, 1
    paddw                  xm0, xm1
    movhlps                xm1, xm0
    paddw                  xm0, xm1
    pmulhrsw               xm0, [pw_1024]
    vpbroadcastb            m0, xm0
    mov                   cntd, 4
.loop:
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jg .loop
    RET
%endif
%endmacro

DC_1D_FUNCS top, a
DC_1D_FUNCS left, l
339
; v
;
; Vertical prediction: every output row is a copy of the above-edge row a[].
; Pure load/store, so the plain MMX/SSE2 instruction sets suffice.

INIT_MMX mmx
cglobal vp9_ipred_v_8x8, 4, 4, 0, dst, stride, l, a
    movq                    m0, [aq]
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    movq      [dstq+strideq*0], m0
    movq      [dstq+strideq*1], m0
    movq      [dstq+strideq*2], m0
    movq      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    movq      [dstq+strideq*0], m0
    movq      [dstq+strideq*1], m0
    movq      [dstq+strideq*2], m0
    movq      [dstq+stride3q ], m0
    RET

INIT_XMM sse2
cglobal vp9_ipred_v_16x16, 4, 4, 1, dst, stride, l, a
    mova                    m0, [aq]
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
    mov                   cntd, 4
.loop:
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jg .loop
    RET

INIT_XMM sse2
cglobal vp9_ipred_v_32x32, 4, 4, 2, dst, stride, l, a
    mova                    m0, [aq]
    mova                    m1, [aq+16]
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
    mov                   cntd, 8
.loop:
    mova [dstq+strideq*0+ 0], m0
    mova [dstq+strideq*0+16], m1
    mova [dstq+strideq*1+ 0], m0
    mova [dstq+strideq*1+16], m1
    mova [dstq+strideq*2+ 0], m0
    mova [dstq+strideq*2+16], m1
    mova [dstq+stride3q + 0], m0
    mova [dstq+stride3q +16], m1
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jg .loop
    RET

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
cglobal vp9_ipred_v_32x32, 4, 4, 1, dst, stride, l, a
    mova                    m0, [aq]
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
    mov                   cntd, 4
.loop:
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jg .loop
    RET
%endif
417
; h
;
; Horizontal prediction: row y is filled with left-edge pixel l[y].
; pshufb broadcast tables (pb_* constants / pb_0..pb_3) splat one byte of
; a 4-byte left-edge load across each output row.  Rows are produced
; bottom-up (cnt counts down over 4-pixel groups of l[]).

INIT_XMM ssse3
cglobal vp9_ipred_h_4x4, 3, 4, 1, dst, stride, l, stride3
    movd                    m0, [lq]
    pshufb                  m0, [pb_4x3_4x2_4x1_4x0]  ; rows 3,2,1,0 as dwords
    lea               stride3q, [strideq*3]
    movd      [dstq+strideq*0], m0
    psrldq                  m0, 4
    movd      [dstq+strideq*1], m0
    psrldq                  m0, 4
    movd      [dstq+strideq*2], m0
    psrldq                  m0, 4
    movd      [dstq+stride3q ], m0
    RET

%macro H_XMM_FUNCS 1
INIT_XMM %1
cglobal vp9_ipred_h_8x8, 3, 5, 4, dst, stride, l, stride3, cnt
    mova                    m2, [pb_8x1_8x0]
    mova                    m3, [pb_8x3_8x2]
    lea               stride3q, [strideq*3]
    mov                   cntq, 1
.loop:
    movd                    m0, [lq+cntq*4]   ; 4 left pixels per iteration
    pshufb                  m1, m0, m3        ; rows +3 / +2
    pshufb                  m0, m2            ; rows +1 / +0
    movq      [dstq+strideq*0], m1
    movhps    [dstq+strideq*1], m1
    movq      [dstq+strideq*2], m0
    movhps    [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    dec                   cntq
    jge .loop
    RET

INIT_XMM %1
cglobal vp9_ipred_h_16x16, 3, 5, 8, dst, stride, l, stride3, cnt
    mova                    m5, [pb_1]
    mova                    m6, [pb_2]
    mova                    m7, [pb_3]
    pxor                    m4, m4
    lea               stride3q, [strideq*3]
    mov                   cntq, 3
.loop:
    movd                    m3, [lq+cntq*4]
    pshufb                  m0, m3, m7        ; splat byte 3
    pshufb                  m1, m3, m6        ; splat byte 2
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m1
    pshufb                  m2, m3, m5        ; splat byte 1
    pshufb                  m3, m4            ; splat byte 0
    mova      [dstq+strideq*2], m2
    mova      [dstq+stride3q ], m3
    lea                   dstq, [dstq+strideq*4]
    dec                   cntq
    jge .loop
    RET

INIT_XMM %1
cglobal vp9_ipred_h_32x32, 3, 5, 8, dst, stride, l, stride3, cnt
    mova                    m5, [pb_1]
    mova                    m6, [pb_2]
    mova                    m7, [pb_3]
    pxor                    m4, m4
    lea               stride3q, [strideq*3]
    mov                   cntq, 7
.loop:
    movd                    m3, [lq+cntq*4]
    pshufb                  m0, m3, m7
    pshufb                  m1, m3, m6
    mova [dstq+strideq*0+ 0], m0
    mova [dstq+strideq*0+16], m0
    mova [dstq+strideq*1+ 0], m1
    mova [dstq+strideq*1+16], m1
    pshufb                  m2, m3, m5
    pshufb                  m3, m4
    mova [dstq+strideq*2+ 0], m2
    mova [dstq+strideq*2+16], m2
    mova [dstq+stride3q + 0], m3
    mova [dstq+stride3q +16], m3
    lea                   dstq, [dstq+strideq*4]
    dec                   cntq
    jge .loop
    RET
%endmacro

H_XMM_FUNCS ssse3
H_XMM_FUNCS avx

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
cglobal vp9_ipred_h_32x32, 3, 5, 8, dst, stride, l, stride3, cnt
    mova                    m5, [pb_1]
    mova                    m6, [pb_2]
    mova                    m7, [pb_3]
    pxor                    m4, m4
    lea               stride3q, [strideq*3]
    mov                   cntq, 7
.loop:
    movd                   xm3, [lq+cntq*4]
    vinserti128             m3, m3, xm3, 1    ; replicate into both lanes
    pshufb                  m0, m3, m7
    pshufb                  m1, m3, m6
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m1
    pshufb                  m2, m3, m5
    pshufb                  m3, m4
    mova      [dstq+strideq*2], m2
    mova      [dstq+stride3q ], m3
    lea                   dstq, [dstq+strideq*4]
    dec                   cntq
    jge .loop
    RET
%endif
533
; tm
;
; TrueMotion prediction: dst[y][x] = clip(l[y] + a[x] - tl), where tl is the
; top-left corner pixel a[-1].  The pw_m256/pw_m255 pshufb masks expand the
; 16-bit word holding l[y] into "-l[y]" style words for the two rows handled
; per iteration; packuswb provides the final unsigned-byte clip.

INIT_MMX ssse3
cglobal vp9_ipred_tm_4x4, 4, 4, 0, dst, stride, l, a
    pxor                    m1, m1
    pinsrw                  m2, [aq-1], 0     ; tl in low word
    movd                    m0, [aq]
    DEFINE_ARGS dst, stride, l, cnt
    mova                    m3, [pw_m256]
    mova                    m4, [pw_m255]
    pshufb                  m2, m3
    punpcklbw               m0, m1            ; a[] as words
    psubw                   m0, m2            ; a[x] - tl
    mov                   cntq, 1
.loop:
    pinsrw                  m2, [lq+cntq*2], 0 ; two left pixels per iteration
    pshufb                  m1, m2, m4
    pshufb                  m2, m3
    paddw                   m1, m0
    paddw                   m2, m0
    packuswb                m1, m1            ; clip to [0,255]
    packuswb                m2, m2
    movd      [dstq+strideq*0], m1
    movd      [dstq+strideq*1], m2
    lea                   dstq, [dstq+strideq*2]
    dec                   cntq
    jge .loop
    RET

%macro TM_XMM_FUNCS 1
INIT_XMM %1
cglobal vp9_ipred_tm_8x8, 4, 4, 5, dst, stride, l, a
    pxor                    m1, m1
    pinsrw                  m2, [aq-1], 0
    movh                    m0, [aq]
    DEFINE_ARGS dst, stride, l, cnt
    mova                    m3, [pw_m256]
    mova                    m4, [pw_m255]
    pshufb                  m2, m3
    punpcklbw               m0, m1
    psubw                   m0, m2
    mov                   cntq, 3
.loop:
    pinsrw                  m2, [lq+cntq*2], 0
    pshufb                  m1, m2, m4
    pshufb                  m2, m3
    paddw                   m1, m0
    paddw                   m2, m0
    packuswb                m1, m2            ; both rows in one register
    movh      [dstq+strideq*0], m1
    movhps    [dstq+strideq*1], m1
    lea                   dstq, [dstq+strideq*2]
    dec                   cntq
    jge .loop
    RET

INIT_XMM %1
cglobal vp9_ipred_tm_16x16, 4, 4, 8, dst, stride, l, a
    pxor                    m3, m3
    pinsrw                  m2, [aq-1], 0
    mova                    m0, [aq]
    DEFINE_ARGS dst, stride, l, cnt
    mova                    m4, [pw_m256]
    mova                    m5, [pw_m255]
    pshufb                  m2, m4
    punpckhbw               m1, m0, m3        ; a[8..15] as words
    punpcklbw               m0, m3            ; a[0..7]  as words
    psubw                   m1, m2
    psubw                   m0, m2
    mov                   cntq, 7
.loop:
    pinsrw                  m7, [lq+cntq*2], 0
    pshufb                  m3, m7, m5
    pshufb                  m7, m4
    paddw                   m2, m3, m0
    paddw                   m3, m1
    paddw                   m6, m7, m0
    paddw                   m7, m1
    packuswb                m2, m3
    packuswb                m6, m7
    mova      [dstq+strideq*0], m2
    mova      [dstq+strideq*1], m6
    lea                   dstq, [dstq+strideq*2]
    dec                   cntq
    jge .loop
    RET

%if ARCH_X86_64
INIT_XMM %1
cglobal vp9_ipred_tm_32x32, 4, 4, 14, dst, stride, l, a
    ; needs 14 xmm registers, hence x86-64 only
    pxor                    m5, m5
    pinsrw                  m4, [aq-1], 0
    mova                    m0, [aq]
    mova                    m2, [aq+16]
    DEFINE_ARGS dst, stride, l, cnt
    mova                    m8, [pw_m256]
    mova                    m9, [pw_m255]
    pshufb                  m4, m8
    punpckhbw               m1, m0, m5
    punpckhbw               m3, m2, m5
    punpcklbw               m0, m5
    punpcklbw               m2, m5
    psubw                   m1, m4
    psubw                   m0, m4
    psubw                   m3, m4
    psubw                   m2, m4
    mov                   cntq, 15
.loop:
    pinsrw                 m13, [lq+cntq*2], 0
    pshufb                  m7, m13, m9
    pshufb                 m13, m8
    paddw                   m4, m7, m0
    paddw                   m5, m7, m1
    paddw                   m6, m7, m2
    paddw                   m7, m3
    paddw                  m10, m13, m0
    paddw                  m11, m13, m1
    paddw                  m12, m13, m2
    paddw                  m13, m3
    packuswb                m4, m5
    packuswb                m6, m7
    packuswb               m10, m11
    packuswb               m12, m13
    mova [dstq+strideq*0+ 0], m4
    mova [dstq+strideq*0+16], m6
    mova [dstq+strideq*1+ 0], m10
    mova [dstq+strideq*1+16], m12
    lea                   dstq, [dstq+strideq*2]
    dec                   cntq
    jge .loop
    RET
%endif
%endmacro

TM_XMM_FUNCS ssse3
TM_XMM_FUNCS avx

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
cglobal vp9_ipred_tm_32x32, 4, 4, 8, dst, stride, l, a
    pxor                    m3, m3
    pinsrw                 xm2, [aq-1], 0
    vinserti128             m2, m2, xm2, 1
    mova                    m0, [aq]
    DEFINE_ARGS dst, stride, l, cnt
    mova                    m4, [pw_m256]
    mova                    m5, [pw_m255]
    pshufb                  m2, m4
    punpckhbw               m1, m0, m3
    punpcklbw               m0, m3
    psubw                   m1, m2
    psubw                   m0, m2
    mov                   cntq, 15
.loop:
    pinsrw                 xm7, [lq+cntq*2], 0
    vinserti128             m7, m7, xm7, 1
    pshufb                  m3, m7, m5
    pshufb                  m7, m4
    paddw                   m2, m3, m0
    paddw                   m3, m1
    paddw                   m6, m7, m0
    paddw                   m7, m1
    packuswb                m2, m3
    packuswb                m6, m7
    mova      [dstq+strideq*0], m2
    mova      [dstq+strideq*1], m6
    lea                   dstq, [dstq+strideq*2]
    dec                   cntq
    jge .loop
    RET
%endif
705
; dl
;
; LOWPASS computes the VP9 3-tap smoothing filter
;   (left + 2*center + right + 2) >> 2
; in bytes without widening: pavgb(left, right) would round up any odd
; sum, so the pxor/pand/psubusb sequence subtracts the stray LSB first.
%macro LOWPASS 4 ; left [dst], center, right, tmp
    pxor               m%4, m%1, m%3
    pand               m%4, [pb_1]            ; 1 where left+right is odd
    pavgb              m%1, m%3
    psubusb            m%1, m%4               ; undo pavgb's round-up
    pavgb              m%1, m%2
%endmacro

; Down-left prediction: diagonals running from top-right to bottom-left,
; built by filtering a[] and shifting one byte per output row.

INIT_MMX ssse3
cglobal vp9_ipred_dl_4x4, 4, 4, 0, dst, stride, l, a
    movq                    m1, [aq]
    pshufb                  m0, m1, [pb_0to5_2x7]
    pshufb                  m2, m1, [pb_2to6_3x7]
    psrlq                   m1, 8
    LOWPASS                  0, 1, 2, 3
    pshufw                  m1, m0, q3321     ; rows 2/3 = rows 0/1 shifted
    movd      [dstq+strideq*0], m0
    movd      [dstq+strideq*2], m1
    psrlq                   m0, 8
    psrlq                   m1, 8
    add                   dstq, strideq
    movd      [dstq+strideq*0], m0
    movd      [dstq+strideq*2], m1
    RET

%macro DL_XMM_FUNCS 1
INIT_XMM %1
cglobal vp9_ipred_dl_8x8, 4, 4, 4, dst, stride, stride5, a
    movq                    m0, [aq]
    lea               stride5q, [strideq*5]
    pshufb                  m1, m0, [pb_1to6_10x7]
    psrldq                  m2, m1, 1
    shufps                  m0, m1, q3210
    LOWPASS                  0, 1, 2, 3
    pshufd                  m1, m0, q3321
    movq      [dstq+strideq*0], m0
    movq      [dstq+strideq*4], m1
    psrldq                  m0, 1
    psrldq                  m1, 1
    movq      [dstq+strideq*1], m0
    movq      [dstq+stride5q ], m1
    lea                   dstq, [dstq+strideq*2]
    psrldq                  m0, 1
    psrldq                  m1, 1
    movq      [dstq+strideq*0], m0
    movq      [dstq+strideq*4], m1
    psrldq                  m0, 1
    psrldq                  m1, 1
    movq      [dstq+strideq*1], m0
    movq      [dstq+stride5q ], m1
    RET

INIT_XMM %1
cglobal vp9_ipred_dl_16x16, 4, 4, 6, dst, stride, l, a
    mova                    m5, [pb_1toE_2xF] ; shift-left-by-1 w/ last-pixel clamp
    mova                    m0, [aq]
    pshufb                  m1, m0, m5
    pshufb                  m2, m1, m5
    pshufb                  m4, m0, [pb_15]
    LOWPASS                  0, 1, 2, 3
    DEFINE_ARGS dst, stride, cnt, stride9
    lea               stride9q, [strideq*3]
    mov                   cntd, 4
    lea               stride9q, [stride9q*3]  ; stride*9
.loop:
    movhlps                 m4, m0
    mova      [dstq+strideq*0], m0
    pshufb                  m0, m5
    mova      [dstq+strideq*8], m4
    movhlps                 m4, m0
    mova      [dstq+strideq*1], m0
    pshufb                  m0, m5
    mova      [dstq+stride9q ], m4
    lea                   dstq, [dstq+strideq*2]
    dec                   cntd
    jg .loop
    RET

INIT_XMM %1
cglobal vp9_ipred_dl_32x32, 4, 5, 8, dst, stride, cnt, a, dst16
    mova                    m5, [pb_1toE_2xF]
    mova                    m0, [aq]
    mova                    m1, [aq+16]
    palignr                 m2, m1, m0, 1
    palignr                 m3, m1, m0, 2
    LOWPASS                  0, 2, 3, 4
    pshufb                  m2, m1, m5
    pshufb                  m3, m2, m5
    pshufb                  m6, m1, [pb_15]   ; bottom-right fill pixel
    LOWPASS                  1, 2, 3, 4
    mova                    m7, m6
    lea                 dst16q, [dstq  +strideq*8]
    mov                   cntd, 8
    lea                 dst16q, [dst16q+strideq*8] ; dst + 16 rows
.loop:
    movhlps                 m7, m1
    mova [dstq  +strideq*0+ 0], m0
    mova [dstq  +strideq*0+16], m1
    movhps [dstq+strideq*8+ 0], m0
    movq [dstq  +strideq*8+ 8], m1
    mova [dstq  +strideq*8+16], m7
    mova [dst16q+strideq*0+ 0], m1
    mova [dst16q+strideq*0+16], m6
    mova [dst16q+strideq*8+ 0], m7
    mova [dst16q+strideq*8+16], m6
%if cpuflag(avx)
    vpalignr                m0, m1, m0, 1
    pshufb                  m1, m5
%else
    palignr                 m2, m1, m0, 1
    pshufb                  m1, m5
    mova                    m0, m2
%endif
    add                   dstq, strideq
    add                 dst16q, strideq
    dec                   cntd
    jg .loop
    RET
%endmacro

DL_XMM_FUNCS ssse3
DL_XMM_FUNCS avx
833
; dr
;
; Down-right prediction: diagonals from top-left to bottom-right, built by
; low-pass filtering the combined left+top-left+above edge and shifting
; one byte per output row (pslldq shifts the diagonal "down").

INIT_MMX ssse3
cglobal vp9_ipred_dr_4x4, 4, 4, 0, dst, stride, l, a
    movd                    m0, [lq]
    punpckldq               m0, [aq-1]        ; l[0..3] | tl,a[0..2]
    movd                    m1, [aq+3]
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    palignr                 m1, m0, 1
    psrlq                   m2, m1, 8
    LOWPASS                  0, 1, 2, 3
    movd      [dstq+stride3q ], m0
    psrlq                   m0, 8
    movd      [dstq+strideq*2], m0
    psrlq                   m0, 8
    movd      [dstq+strideq*1], m0
    psrlq                   m0, 8
    movd      [dstq+strideq*0], m0
    RET

%macro DR_XMM_FUNCS 1
INIT_XMM %1
cglobal vp9_ipred_dr_8x8, 4, 4, 4, dst, stride, l, a
    movq                    m1, [lq]
    movhps                  m1, [aq-1]
    movd                    m2, [aq+7]
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    pslldq                  m0, m1, 1
    palignr                 m2, m1, 1
    LOWPASS                  0, 1, 2, 3
    movhps    [dstq+strideq*0], m0
    pslldq                  m0, 1
    movhps    [dstq+strideq*1], m0
    pslldq                  m0, 1
    movhps    [dstq+strideq*2], m0
    pslldq                  m0, 1
    movhps    [dstq+stride3q ], m0
    pslldq                  m0, 1
    lea                   dstq, [dstq+strideq*4]
    movhps    [dstq+strideq*0], m0
    pslldq                  m0, 1
    movhps    [dstq+strideq*1], m0
    pslldq                  m0, 1
    movhps    [dstq+strideq*2], m0
    pslldq                  m0, 1
    movhps    [dstq+stride3q ], m0
    RET

INIT_XMM %1
cglobal vp9_ipred_dr_16x16, 4, 4, 6, dst, stride, l, a
    mova                    m1, [lq]
    movu                    m2, [aq-1]
    movd                    m4, [aq+15]
    DEFINE_ARGS dst, stride, stride9, cnt
    lea               stride9q, [strideq *3]
    mov                   cntd, 4
    lea               stride9q, [stride9q*3]  ; stride*9
    palignr                 m4, m2, 1
    palignr                 m3, m2, m1, 15
    LOWPASS                  3, 2, 4, 5       ; filtered top half
    pslldq                  m0, m1, 1
    palignr                 m2, m1, 1
    LOWPASS                  0, 1, 2, 4       ; filtered left half
.loop:
    mova   [dstq+strideq*0  ], m3
    movhps [dstq+strideq*8+0], m0
    movq   [dstq+strideq*8+8], m3
    palignr                 m3, m0, 15
    pslldq                  m0, 1
    mova   [dstq+strideq*1  ], m3
    movhps [dstq+stride9q +0], m0
    movq   [dstq+stride9q +8], m3
    palignr                 m3, m0, 15
    pslldq                  m0, 1
    lea                   dstq, [dstq+strideq*2]
    dec                   cntd
    jg .loop
    RET

INIT_XMM %1
cglobal vp9_ipred_dr_32x32, 4, 4, 8, dst, stride, l, a
    mova                    m1, [lq]
    mova                    m2, [lq+16]
    movu                    m3, [aq-1]
    movu                    m4, [aq+15]
    movd                    m5, [aq+31]
    DEFINE_ARGS dst, stride, stride8, cnt
    lea               stride8q, [strideq*8]
    palignr                 m5, m4, 1
    palignr                 m6, m4, m3, 15
    LOWPASS                  5, 4, 6, 7
    palignr                 m4, m3, 1
    palignr                 m6, m3, m2, 15
    LOWPASS                  4, 3, 6, 7
    palignr                 m3, m2, 1
    palignr                 m6, m2, m1, 15
    LOWPASS                  3, 2, 6, 7
    palignr                 m2, m1, 1
    pslldq                  m0, m1, 1
    LOWPASS                  2, 1, 0, 6
    mov                   cntd, 16

    ; out=m2/m3/m4/m5 (filtered edge, lowest to highest byte)
.loop:
    mova [dstq+stride8q*0+ 0], m4
    mova [dstq+stride8q*0+16], m5
    mova [dstq+stride8q*2+ 0], m3
    mova [dstq+stride8q*2+16], m4
    palignr                 m5, m4, 15        ; rotate the whole 64-byte edge
    palignr                 m4, m3, 15
    palignr                 m3, m2, 15
    pslldq                  m2, 1
    add                   dstq, strideq
    dec                   cntd
    jg .loop
    RET
%endmacro

DR_XMM_FUNCS ssse3
DR_XMM_FUNCS avx
959
; vl
;
; Vertical-left prediction: even rows are the 2-tap average of a[],
; odd rows are the 3-tap LOWPASS of a[]; each row pair shifts left by one.

INIT_MMX ssse3
cglobal vp9_ipred_vl_4x4, 4, 4, 0, dst, stride, l, a
    movq                    m0, [aq]
    psrlq                   m1, m0, 8
    psrlq                   m2, m1, 8
    LOWPASS                  2, 1, 0, 3       ; 3-tap rows
    pavgb                   m1, m0            ; 2-tap rows
    movd      [dstq+strideq*0], m1
    movd      [dstq+strideq*1], m2
    lea                   dstq, [dstq+strideq*2]
    psrlq                   m1, 8
    psrlq                   m2, 8
    movd      [dstq+strideq*0], m1
    movd      [dstq+strideq*1], m2
    RET

%macro VL_XMM_FUNCS 1
INIT_XMM %1
cglobal vp9_ipred_vl_8x8, 4, 4, 4, dst, stride, l, a
    movq                    m0, [aq]
    pshufb                  m0, [pb_0to6_9x7] ; clamp past-the-end to a[7]
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    psrldq                  m1, m0, 1
    psrldq                  m2, m0, 2
    LOWPASS                  2, 1, 0, 3
    pavgb                   m1, m0
    movq      [dstq+strideq*0], m1
    movq      [dstq+strideq*1], m2
    psrldq                  m1, 1
    psrldq                  m2, 1
    movq      [dstq+strideq*2], m1
    movq      [dstq+stride3q ], m2
    lea                   dstq, [dstq+strideq*4]
    psrldq                  m1, 1
    psrldq                  m2, 1
    movq      [dstq+strideq*0], m1
    movq      [dstq+strideq*1], m2
    psrldq                  m1, 1
    psrldq                  m2, 1
    movq      [dstq+strideq*2], m1
    movq      [dstq+stride3q ], m2
    RET

INIT_XMM %1
cglobal vp9_ipred_vl_16x16, 4, 4, 5, dst, stride, l, a
    mova                    m0, [aq]
    mova                    m4, [pb_1toE_2xF] ; shift-left-by-1 with clamp
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
    pshufb                  m1, m0, m4
    pshufb                  m2, m1, m4
    LOWPASS                  2, 1, 0, 3
    pavgb                   m1, m0
    mov                   cntd, 4
.loop:
    mova      [dstq+strideq*0], m1
    mova      [dstq+strideq*1], m2
    pshufb                  m1, m4
    pshufb                  m2, m4
    mova      [dstq+strideq*2], m1
    mova      [dstq+stride3q ], m2
    pshufb                  m1, m4
    pshufb                  m2, m4
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jg .loop
    RET

INIT_XMM %1
cglobal vp9_ipred_vl_32x32, 4, 4, 7, dst, stride, l, a
    mova                    m0, [aq]
    mova                    m5, [aq+16]
    mova                    m4, [pb_1toE_2xF]
    DEFINE_ARGS dst, stride, dst16, cnt
    palignr                 m2, m5, m0, 1
    palignr                 m3, m5, m0, 2
    lea                 dst16q, [dstq  +strideq*8]
    LOWPASS                  3, 2, 0, 6
    pavgb                   m2, m0            ; low half: m2 (avg) / m3 (lowpass)
    pshufb                  m0, m5, m4
    pshufb                  m1, m0, m4
    lea                 dst16q, [dst16q+strideq*8] ; dst + 16 rows
    LOWPASS                  1, 0, 5, 6
    pavgb                   m0, m5            ; high half: m0 (avg) / m1 (lowpass)
    pshufb                  m5, [pb_15]       ; replicated last pixel
    mov                   cntd, 8
.loop:
; store one (avg, lowpass) row pair for rows y and y+16, then shift left
%macro %%write 3
    mova   [dstq+stride%1+ 0], %2
    mova   [dstq+stride%1+16], %3
    movhps [dst16q+stride%1  ], %2
    movu   [dst16q+stride%1+ 8], %3
    movq   [dst16q+stride%1+24], m5
%if cpuflag(avx)
    palignr                 %2, %3, %2, 1
    pshufb                  %3, m4
%else
    palignr                 m6, %3, %2, 1
    pshufb                  %3, m4
    mova                    %2, m6
%endif
%endmacro

    %%write q*0, m2, m0
    %%write q*1, m3, m1
    lea                   dstq, [dstq  +strideq*2]
    lea                 dst16q, [dst16q+strideq*2]
    dec                   cntd
    jg .loop
    RET
%endmacro

VL_XMM_FUNCS ssse3
VL_XMM_FUNCS avx
1079
; vr
;
; Vertical-right prediction: even rows are 2-tap averages of (tl, a[]),
; odd rows the 3-tap LOWPASS; left pixels feed in from the left edge one
; per row pair, interleaved via the pb_02468ACE_13579BDF shuffle.

INIT_MMX ssse3
cglobal vp9_ipred_vr_4x4, 4, 4, 0, dst, stride, l, a
    movq                    m1, [aq-1]
    punpckldq               m2, [lq]
    movd                    m0, [aq]
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    pavgb                   m0, m1
    palignr                 m1, m2, 5
    psrlq                   m2, m1, 8
    psllq                   m3, m1, 8
    LOWPASS                  2, 1, 3, 4

    ; ABCD <- for the following predictor:
    ; EFGH
    ; IABC  | m0 contains ABCDxxxx
    ; JEFG  | m2 contains xJIEFGHx

    punpckldq               m0, m2
    pshufb                  m2, [pb_13456_3xm1]
    movd      [dstq+strideq*0], m0
    pshufb                  m0, [pb_6012_4xm1]
    movd      [dstq+stride3q ], m2
    psrlq                   m2, 8
    movd      [dstq+strideq*2], m0
    movd      [dstq+strideq*1], m2
    RET

%macro VR_XMM_FUNCS 1
INIT_XMM %1
cglobal vp9_ipred_vr_8x8, 4, 4, 5, dst, stride, l, a
    movu                    m1, [aq-1]
    movhps                  m2, [lq]
    movq                    m0, [aq]
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    pavgb                   m0, m1
    palignr                 m1, m2, 9
    pslldq                  m2, m1, 1
    pslldq                  m3, m1, 2
    LOWPASS                  1, 2, 3, 4

    ; ABCDEFGH <- for the following predictor:
    ; IJKLMNOP
    ; QABCDEFG  | m0 contains ABCDEFGHxxxxxxxx
    ; RIJKLMNO  | m1 contains xxVUTSRQIJKLMNOP
    ; SQABCDEF
    ; TRIJKLMN
    ; USQABCDE
    ; VTRIJKLM

    punpcklqdq              m0, m1            ; ABCDEFGHxxVUTSRQ
    movq      [dstq+strideq*0], m0
    pshufb                  m0, [pb_6xm1_BDF_0to6] ; xxxxxxUSQABCDEFG
    movhps    [dstq+strideq*1], m1
    pshufb                  m1, [pb_6xm1_246_8toE] ; xxxxxxVTRIJKLMNO
    movhps    [dstq+strideq*2], m0
    pslldq                  m0, 1
    movhps    [dstq+stride3q ], m1
    lea                   dstq, [dstq+strideq*4]
    pslldq                  m1, 1
    movhps    [dstq+strideq*0], m0
    pslldq                  m0, 1
    movhps    [dstq+strideq*1], m1
    pslldq                  m1, 1
    movhps    [dstq+strideq*2], m0
    movhps    [dstq+stride3q ], m1
    RET

INIT_XMM %1
cglobal vp9_ipred_vr_16x16, 4, 4, 6, dst, stride, l, a
    mova                    m0, [aq]
    movu                    m1, [aq-1]
    mova                    m2, [lq]
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
    palignr                 m3, m1, m2, 15
    LOWPASS                  3, 1, 0, 4       ; odd rows
    pavgb                   m0, m1            ; even rows
    palignr                 m1, m2, 1
    pslldq                  m4, m2, 1
    LOWPASS                  1, 2, 4, 5
    pshufb                  m1, [pb_02468ACE_13579BDF] ; split even/odd feed pixels
    mov                   cntd, 4
.loop:
    movlhps                 m2, m1
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m3
    palignr                 m4, m0, m1, 15
    palignr                 m5, m3, m2, 15
    mova      [dstq+strideq*2], m4
    mova      [dstq+stride3q ], m5
    lea                   dstq, [dstq+strideq*4]
    palignr                 m0, m1, 14
    palignr                 m3, m2, 14
    pslldq                  m1, 2
    dec                   cntd
    jg .loop
    RET

%if ARCH_X86_64
INIT_XMM %1
cglobal vp9_ipred_vr_32x32, 4, 4, 9, dst, stride, l, a
    mova                    m0, [aq]
    mova                    m2, [aq+16]
    movu                    m1, [aq-1]
    palignr                 m3, m2, m0, 15
    palignr                 m4, m2, m0, 14
    LOWPASS                  4, 3, 2, 5
    pavgb                   m3, m2
    mova                    m2, [lq+16]
    palignr                 m5, m1, m2, 15
    LOWPASS                  5, 1, 0, 6
    pavgb                   m0, m1
    mova                    m6, [lq]
    palignr                 m1, m2, 1
    palignr                 m7, m2, m6, 15
    LOWPASS                  1, 2, 7, 8
    palignr                 m2, m6, 1
    pslldq                  m7, m6, 1
    LOWPASS                  2, 6, 7, 8
    pshufb                  m1, [pb_02468ACE_13579BDF]
    pshufb                  m2, [pb_02468ACE_13579BDF]
    DEFINE_ARGS dst, stride, dst16, cnt
    lea                 dst16q, [dstq  +strideq*8]
    lea                 dst16q, [dst16q+strideq*8]
    SBUTTERFLY qdq, 2, 1, 6
    mov                   cntd, 8
.loop:
    ; even lines (0, 2, 4, ...): m1 | m0, m3
    ; odd  lines (1, 3, 5, ...): m2 | m5, m4
%macro %%write 4
    mova   [dstq+stride%1+ 0], %3
    mova   [dstq+stride%1+16], %4
    movhps [dst16q+stride%1  ], %2
    movu   [dst16q+stride%1+ 8], %3
    movq   [dst16q+stride%1+24], %4
    palignr                 %4, %3, 15
    palignr                 %3, %2, 15
    pslldq                  %2, 1
%endmacro

    %%write q*0, m1, m0, m3
    %%write q*1, m2, m5, m4
    lea                   dstq, [dstq  +strideq*2]
    lea                 dst16q, [dst16q+strideq*2]
    dec                   cntd
    jg .loop
    RET
%endif
%endmacro

VR_XMM_FUNCS ssse3
VR_XMM_FUNCS avx
1238
1239; hd
1240
1241INIT_MMX ssse3
1242cglobal vp9_ipred_hd_4x4, 4, 4, 0, dst, stride, l, a
1243 movd m0, [lq]
1244 punpckldq m0, [aq-1]
1245 DEFINE_ARGS dst, stride, stride3
1246 lea stride3q, [strideq*3]
1247 psrlq m1, m0, 8
1248 psrlq m2, m1, 8
1249 LOWPASS 2, 1, 0, 3
1250 pavgb m1, m0
1251
1252 ; DHIJ <- for the following predictor:
1253 ; CGDH
1254 ; BFCG | m1 contains ABCDxxxx
1255 ; AEBF | m2 contains EFGHIJxx
1256
1257 punpcklbw m1, m2
1258 punpckhdq m0, m1, m2
1259
1260 ; m1 contains AEBFCGDH
1261 ; m0 contains CGDHIJxx
1262
1263 movd [dstq+stride3q ], m1
1264 movd [dstq+strideq*1], m0
1265 psrlq m1, 16
1266 psrlq m0, 16
1267 movd [dstq+strideq*2], m1
1268 movd [dstq+strideq*0], m0
1269 RET
1270
;-----------------------------------------------------------------------------
; VP9 horizontal-down ("hd") intra predictors, 8x8 / 16x16 / 32x32.
; Instantiated once per SIMD flavour via "HD_XMM_FUNCS ssse3|avx"
; (%1 is the cpuflag suffix handed to INIT_XMM).
;
; x264asm prototype: void vp9_ipred_hd_NxN_<opt>(uint8_t *dst,
;     ptrdiff_t stride, const uint8_t *l, const uint8_t *a)
; l = left-edge pixels, a = above-edge pixels; [aq-1] additionally pulls
; in the top-left corner pixel.
;
; Scheme common to all three sizes: build one vector of half-pel averages
; (pavgb) and one of 3-tap filtered pixels (LOWPASS helper macro defined
; earlier in this file -- presumably the (a + 2b + c + 2) >> 2 smoothing
; filter, see its definition), interleave the two byte-wise
; (punpcklbw/SBUTTERFLY), then derive every further output row from the
; previous one by a 2-byte shift (palignr).
;-----------------------------------------------------------------------------
1271%macro HD_XMM_FUNCS 1
1272INIT_XMM %1
1273cglobal vp9_ipred_hd_8x8, 4, 4, 4, dst, stride, l, a
1274 movq m0, [lq]
1275 movhps m0, [aq-1]
1276 DEFINE_ARGS dst, stride, stride3, dst4
1277 lea stride3q, [strideq*3]
1278 lea dst4q, [dstq+strideq*4]
1279 psrldq m1, m0, 1
1280 psrldq m2, m1, 1
1281 LOWPASS 2, 1, 0, 3
1282 pavgb m1, m0
1283
1284 ; HPQRSTUV <- for the following predictor
1285 ; GOHPQRST
1286 ; FNGOHPQR | m1 contains ABCDEFGHxxxxxxxx
1287 ; EMFNGOHP | m2 contains IJKLMNOPQRSTUVxx
1288 ; DLEMFNGO
1289 ; CKDLEMFN
1290 ; BJCKDLEM
1291 ; AIBJCKDL
1292
1293 punpcklbw m1, m2
1294 movhlps m2, m2
1295
1296 ; m1 contains AIBJCKDLEMFNGOHP
1297 ; m2 contains QRSTUVxxxxxxxxxx
1298
; Rows go out bottom-up in two 4-row groups (dstq / dst4q); each step
; shifts the interleaved row left by 2 bytes via palignr.
1299 movhps [dstq +stride3q ], m1
1300 movq [dst4q+stride3q ], m1
1301 palignr m3, m2, m1, 2
1302 movhps [dstq +strideq*2], m3
1303 movq [dst4q+strideq*2], m3
1304 palignr m3, m2, m1, 4
1305 movhps [dstq +strideq*1], m3
1306 movq [dst4q+strideq*1], m3
1307 palignr m2, m1, 6
1308 movhps [dstq +strideq*0], m2
1309 movq [dst4q+strideq*0], m2
1310 RET
1311
1312INIT_XMM %1
1313cglobal vp9_ipred_hd_16x16, 4, 6, 7, dst, stride, l, a
1314 mova m0, [lq]
1315 movu m3, [aq-1]
1316 DEFINE_ARGS dst, stride, stride4, dst4, dst8, dst12
1317 lea stride4q, [strideq*4]
1318 lea dst4q, [dstq +stride4q]
1319 lea dst8q, [dst4q+stride4q]
1320 lea dst12q, [dst8q+stride4q]
; Lowpass the top edge (m5) and the left/top-left junction (m2),
; average the left pixels (m1), then interleave averages with taps.
1321 psrldq m4, m3, 1
1322 psrldq m5, m3, 2
1323 LOWPASS 5, 4, 3, 6
1324 palignr m1, m3, m0, 1
1325 palignr m2, m3, m0, 2
1326 LOWPASS 2, 1, 0, 6
1327 pavgb m1, m0
1328 SBUTTERFLY bw, 1, 2, 6
1329
1330 ; NOTE(review): leftover author debug note -- "I probably inverted
; L0 and L16 here"; row ordering not re-verified against the C reference.
1331 ; live across the loop: m1/m2 = interleaved rows, m5 = filtered top edge
1332.loop:
; stride4q counts down from 4*stride to stride, so each 4-row group
; (dstq/dst4q/dst8q/dst12q) is filled bottom-up.
1333 sub stride4q, strideq
1334 movhps [dstq +stride4q +0], m2
1335 movq [dstq +stride4q +8], m5
1336 mova [dst4q+stride4q ], m2
1337 movhps [dst8q+stride4q +0], m1
1338 movq [dst8q+stride4q +8], m2
1339 mova [dst12q+stride4q ], m1
1340%if cpuflag(avx)
1341 palignr m1, m2, m1, 2
1342 palignr m2, m5, m2, 2
1343%else
; SSSE3 palignr is destructive (2-operand): shift into scratch regs
; first, then copy back, to keep the m1/m2 window consistent.
1344 palignr m3, m2, m1, 2
1345 palignr m0, m5, m2, 2
1346 mova m1, m3
1347 mova m2, m0
1348%endif
1349 psrldq m5, 2
; jg consumes the flags of the "sub" above -- the intervening SIMD ops
; do not touch EFLAGS.
1350 jg .loop
1351 RET
1352
1353INIT_XMM %1
1354cglobal vp9_ipred_hd_32x32, 4, 6, 8, dst, stride, l, a
1355 mova m0, [lq]
1356 mova m1, [lq+16]
1357 movu m2, [aq-1]
1358 movu m3, [aq+15]
1359 DEFINE_ARGS dst, stride, stride8, dst8, dst16, dst24
1360 lea stride8q, [strideq*8]
1361 lea dst8q, [dstq +stride8q]
1362 lea dst16q, [dst8q +stride8q]
1363 lea dst24q, [dst16q+stride8q]
; Lowpass the four 16-byte edge segments (two above, two left, with the
; junctions stitched via palignr), average the left halves, interleave.
1364 psrldq m4, m3, 1
1365 psrldq m5, m3, 2
1366 LOWPASS 5, 4, 3, 6
1367 palignr m4, m3, m2, 2
1368 palignr m3, m2, 1
1369 LOWPASS 4, 3, 2, 6
1370 palignr m3, m2, m1, 2
1371 palignr m2, m1, 1
1372 LOWPASS 3, 2, 1, 6
1373 pavgb m2, m1
1374 palignr m6, m1, m0, 1
1375 palignr m1, m0, 2
1376 LOWPASS 1, 6, 0, 7
1377 pavgb m0, m6
1378 SBUTTERFLY bw, 2, 3, 6
1379 SBUTTERFLY bw, 0, 1, 6
1380
1381 ; live across the loop: m0..m5 -- a 96-byte sliding window of row data
1382.loop:
; stride8q counts down from 8*stride to stride: four 8-row groups are
; written bottom-up, each row shifted 2 bytes versus the row below it.
1383 sub stride8q, strideq
1384 mova [dstq +stride8q+ 0], m3
1385 mova [dstq +stride8q+16], m4
1386 mova [dst8q +stride8q+ 0], m2
1387 mova [dst8q +stride8q+16], m3
1388 mova [dst16q+stride8q+ 0], m1
1389 mova [dst16q+stride8q+16], m2
1390 mova [dst24q+stride8q+ 0], m0
1391 mova [dst24q+stride8q+16], m1
1392%if cpuflag(avx)
1393 palignr m0, m1, m0, 2
1394 palignr m1, m2, m1, 2
1395 palignr m2, m3, m2, 2
1396 palignr m3, m4, m3, 2
1397 palignr m4, m5, m4, 2
1398 psrldq m5, 2
1399%else
; Destructive SSSE3 palignr: each reg ends up holding its neighbour's
; new value, so a chain of movs rotates the window back into place.
1400 psrldq m6, m5, 2
1401 palignr m5, m4, 2
1402 palignr m4, m3, 2
1403 palignr m3, m2, 2
1404 palignr m2, m1, 2
1405 palignr m1, m0, 2
1406 mova m0, m1
1407 mova m1, m2
1408 mova m2, m3
1409 mova m3, m4
1410 mova m4, m5
1411 mova m5, m6
1412%endif
; flags still valid from the "sub" at the loop head (SIMD ops leave them)
1413 jg .loop
1414 RET
1415%endmacro
1416
; Instantiate the horizontal-down predictors once per instruction set.
1417HD_XMM_FUNCS ssse3
1418HD_XMM_FUNCS avx
1419
;-----------------------------------------------------------------------------
; 4x4 horizontal-up ("hu") VP9 intra predictor (MMX, ssse3).
; void vp9_ipred_hu_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
;                             const uint8_t *l)
; Uses only the left-edge pixels: pb_3to1_5x0 (table defined earlier in
; this file) reverses them with edge replication, pavgb builds the
; half-pel averages and LOWPASS the 3-tap filtered pixels; the two are
; interleaved and each output row is the previous one shifted by 2 bytes.
;-----------------------------------------------------------------------------
1420INIT_MMX ssse3
1421cglobal vp9_ipred_hu_4x4, 3, 3, 0, dst, stride, l
1422 movd m0, [lq]
1423 pshufb m0, [pb_3to1_5x0]
1424 psrlq m1, m0, 8
1425 psrlq m2, m1, 8
1426 LOWPASS 2, 1, 0, 3
1427 pavgb m1, m0
1428 DEFINE_ARGS dst, stride, stride3
1429 lea stride3q, [strideq*3]
1430 SBUTTERFLY bw, 1, 2, 0
; rows 0/1 come from the low dwords, rows 2/3 from the high dwords
; (punpckhdq duplicates the high half into movd range).
1431 palignr m2, m1, 2
1432 movd [dstq+strideq*0], m1
1433 movd [dstq+strideq*1], m2
1434 punpckhdq m1, m1
1435 punpckhdq m2, m2
1436 movd [dstq+strideq*2], m1
1437 movd [dstq+stride3q ], m2
1438 RET
1439
;-----------------------------------------------------------------------------
; VP9 horizontal-up ("hu") intra predictors, 8x8 / 16x16 / 32x32.
; Instantiated once per SIMD flavour via "HU_XMM_FUNCS ssse3|avx".
; void vp9_ipred_hu_NxN_<opt>(uint8_t *dst, ptrdiff_t stride,
;                             const uint8_t *l)
; Only the left-edge pixels are used. The pb_* shuffle tables (defined
; earlier in this file) reverse them and replicate the final pixel;
; pavgb/LOWPASS build the average and 3-tap rows, which are interleaved
; and then slid 2 bytes per output row (palignr / pshufb).
;-----------------------------------------------------------------------------
1440%macro HU_XMM_FUNCS 1
1441INIT_XMM %1
1442cglobal vp9_ipred_hu_8x8, 3, 4, 4, dst, stride, l
1443 movq m0, [lq]
1444 pshufb m0, [pb_7to1_9x0]
1445 psrldq m1, m0, 1
1446 psrldq m2, m1, 1
1447 LOWPASS 2, 1, 0, 3
1448 pavgb m1, m0
1449 DEFINE_ARGS dst, stride, stride3, dst4
1450 lea stride3q, [strideq*3]
1451 lea dst4q, [dstq+strideq*4]
1452 SBUTTERFLY bw, 1, 2, 0
; two 4-row groups (dstq / dst4q), top-down, +2 byte shift per row
1453 movq [dstq +strideq*0], m1
1454 movhps [dst4q+strideq*0], m1
1455 palignr m0, m2, m1, 2
1456 movq [dstq +strideq*1], m0
1457 movhps [dst4q+strideq*1], m0
1458 palignr m0, m2, m1, 4
1459 movq [dstq +strideq*2], m0
1460 movhps [dst4q+strideq*2], m0
1461 palignr m2, m1, 6
1462 movq [dstq +stride3q ], m2
1463 movhps [dst4q+stride3q ], m2
1464 RET
1465
1466INIT_XMM %1
1467cglobal vp9_ipred_hu_16x16, 3, 4, 5, dst, stride, l
1468 mova m0, [lq]
1469 pshufb m0, [pb_Fto0]
1470 mova m3, [pb_2toE_3xF]
1471 pshufb m1, m0, [pb_1toE_2xF]
1472 pshufb m2, m0, m3
1473 LOWPASS 2, 1, 0, 4
1474 pavgb m1, m0
1475 DEFINE_ARGS dst, stride, stride9, cnt
; stride9q = 9*stride (built as 3*3*stride); each iteration stores rows
; k, k+1, k+8 and k+9 before advancing dstq by 2 rows, 4 iterations.
1476 lea stride9q, [strideq *3]
1477 mov cntd, 4
1478 lea stride9q, [stride9q*3]
1479 SBUTTERFLY bw, 1, 2, 0
1480
1481.loop:
1482 mova [dstq+strideq*0], m1
1483 mova [dstq+strideq*8], m2
1484 palignr m0, m2, m1, 2
; pshufb with pb_2toE_3xF shifts m2 down 2 bytes while replicating the
; last pixel into the tail.
1485 pshufb m2, m3
1486 mova [dstq+strideq*1], m0
1487 mova [dstq+stride9q ], m2
1488 palignr m1, m2, m0, 2
1489 pshufb m2, m3
1490 lea dstq, [dstq+strideq*2]
1491 dec cntd
1492 jg .loop
1493 RET
1494
1495INIT_XMM %1
1496cglobal vp9_ipred_hu_32x32, 3, 7, 7, dst, stride, l
1497 mova m0, [lq]
1498 mova m1, [lq+16]
1499 mova m2, [pb_Fto0]
1500 mova m4, [pb_2toE_3xF]
; reverse both 16-pixel halves of the left edge, then lowpass/average
; each half (m2/m3 from the lower half, m0/m1 from the upper).
1501 pshufb m0, m2
1502 pshufb m1, m2
1503 palignr m2, m0, m1, 1
1504 palignr m3, m0, m1, 2
1505 LOWPASS 3, 2, 1, 5
1506 pavgb m2, m1
1507 pshufb m1, m0, m4
1508 pshufb m5, m0, [pb_1toE_2xF]
1509 LOWPASS 1, 5, 0, 6
1510 pavgb m0, m5
1511 DEFINE_ARGS dst, stride, cnt, stride0, dst8, dst16, dst24
; stride0q accumulates 0..7*stride: rows are written top-down in four
; 8-row groups over 8 iterations.
1512 mov cntd, 8
1513 xor stride0q, stride0q
1514 lea dst8q, [dstq +strideq*8]
1515 lea dst16q, [dst8q +strideq*8]
1516 lea dst24q, [dst16q+strideq*8]
1517 SBUTTERFLY bw, 0, 1, 5
1518 SBUTTERFLY bw, 2, 3, 5
; m6 = last pixel broadcast (pb_15 -- presumably 16 x byte index 15),
; used as the fully-replicated bottom-right filler.
1519 pshufb m6, m1, [pb_15]
1520
1521.loop:
1522 mova [dstq +stride0q+ 0], m2
1523 mova [dstq +stride0q+16], m3
1524 mova [dst8q +stride0q+ 0], m3
1525 mova [dst8q +stride0q+16], m0
1526 mova [dst16q+stride0q+ 0], m0
1527 mova [dst16q+stride0q+16], m1
1528 mova [dst24q+stride0q+ 0], m1
1529 mova [dst24q+stride0q+16], m6
1530%if cpuflag(avx)
1531 palignr m2, m3, m2, 2
1532 palignr m3, m0, m3, 2
1533 palignr m0, m1, m0, 2
1534 pshufb m1, m4
1535%else
; destructive SSSE3 palignr: shift chain leaves each reg holding its
; neighbour's new value, so movs rotate the m2/m3/m0/m1 window back.
1536 pshufb m5, m1, m4
1537 palignr m1, m0, 2
1538 palignr m0, m3, 2
1539 palignr m3, m2, 2
1540 mova m2, m3
1541 mova m3, m0
1542 mova m0, m1
1543 mova m1, m5
1544%endif
1545 add stride0q, strideq
1546 dec cntd
1547 jg .loop
1548 RET
1549%endmacro
1550
; Instantiate the horizontal-up predictors once per instruction set.
1551HU_XMM_FUNCS ssse3
1552HU_XMM_FUNCS avx
1553
1554; FIXME 127, 128, 129 ?