1 ;*****************************************************************************
2 ;* mc-a2.asm: x86 motion compensation
3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2013 x264 project
5 ;*
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Fiona Glaser <fiona@x264.com>
8 ;* Holger Lubitz <holger@lubitz.org>
9 ;* Mathieu Monnier <manao@melix.net>
10 ;* Oskar Arvidsson <oskar@irock.se>
11 ;*
12 ;* This program is free software; you can redistribute it and/or modify
13 ;* it under the terms of the GNU General Public License as published by
14 ;* the Free Software Foundation; either version 2 of the License, or
15 ;* (at your option) any later version.
16 ;*
17 ;* This program is distributed in the hope that it will be useful,
18 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
19 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 ;* GNU General Public License for more details.
21 ;*
22 ;* You should have received a copy of the GNU General Public License
23 ;* along with this program; if not, write to the Free Software
24 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
25 ;*
26 ;* This program is also available under a commercial proprietary license.
27 ;* For more information, contact us at license @ x265.com.
28 ;*****************************************************************************
29
30 %include "x86inc.asm"
31 %include "x86util.asm"
32
33 SECTION_RODATA 32
34
35 deinterleave_shuf: times 2 db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15
36
37 %if HIGH_BIT_DEPTH
38 deinterleave_shuf32a: SHUFFLE_MASK_W 0,2,4,6,8,10,12,14
39 deinterleave_shuf32b: SHUFFLE_MASK_W 1,3,5,7,9,11,13,15
40 %else
41 deinterleave_shuf32a: db 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30
42 deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31
43 %endif
44 pw_1024: times 16 dw 1024
45
46 pd_16: times 4 dd 16
47 pd_0f: times 4 dd 0xffff
48 pf_inv256: times 8 dd 0.00390625
49
50 SECTION .text
51
52 cextern pb_0
53 cextern pw_1
54 cextern pw_16
55 cextern pw_32
56 cextern pw_512
57 cextern pw_00ff
58 cextern pw_3fff
59 cextern pw_pixel_max
60 cextern pd_ffff
61
62 ;The hpel_filter routines use non-temporal writes for output.
63 ;The following defines may be uncommented for testing.
64 ;Doing the hpel_filter with temporal (cached) stores may be a win if the last-level
65 ;cache is big enough (preliminary benchmarks suggest on the order of 4x the frame size).
66
67 ;%define movntq movq
68 ;%define movntps movaps
69 ;%define sfence
70
71 %if HIGH_BIT_DEPTH == 0
72 %undef movntq
73 %undef movntps
74 %undef sfence
75 %endif ; !HIGH_BIT_DEPTH
76
77 ;-----------------------------------------------------------------------------
78 ; void plane_copy_core( pixel *dst, intptr_t i_dst,
79 ; pixel *src, intptr_t i_src, int w, int h )
80 ;-----------------------------------------------------------------------------
81 ; assumes i_dst and w are multiples of 16, and i_dst>w
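;
; Illustrative plain-C sketch of what this routine does (added for readability; `pixel`
; is uint8_t or uint16_t depending on HIGH_BIT_DEPTH). The asm streams whole 16- and
; 64-byte chunks with non-temporal stores, which is where the alignment and size
; requirements above come from:
;
;   static void plane_copy_core_c( pixel *dst, intptr_t i_dst,
;                                  pixel *src, intptr_t i_src, int w, int h )
;   {
;       for( int y = 0; y < h; y++, dst += i_dst, src += i_src )
;           for( int x = 0; x < w; x++ )
;               dst[x] = src[x];
;   }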
82 INIT_MMX
83 cglobal plane_copy_core_mmx2, 6,7
84 FIX_STRIDES r1, r3, r4d
85 %if HIGH_BIT_DEPTH == 0
86 movsxdifnidn r4, r4d
87 %endif
88 sub r1, r4
89 sub r3, r4
90 .loopy:
91 lea r6d, [r4-63]
92 .loopx:
93 prefetchnta [r2+256]
94 movq m0, [r2 ]
95 movq m1, [r2+ 8]
96 movntq [r0 ], m0
97 movntq [r0+ 8], m1
98 movq m2, [r2+16]
99 movq m3, [r2+24]
100 movntq [r0+16], m2
101 movntq [r0+24], m3
102 movq m4, [r2+32]
103 movq m5, [r2+40]
104 movntq [r0+32], m4
105 movntq [r0+40], m5
106 movq m6, [r2+48]
107 movq m7, [r2+56]
108 movntq [r0+48], m6
109 movntq [r0+56], m7
110 add r2, 64
111 add r0, 64
112 sub r6d, 64
113 jg .loopx
114 prefetchnta [r2+256]
115 add r6d, 63
116 jle .end16
117 .loop16:
118 movq m0, [r2 ]
119 movq m1, [r2+8]
120 movntq [r0 ], m0
121 movntq [r0+8], m1
122 add r2, 16
123 add r0, 16
124 sub r6d, 16
125 jg .loop16
126 .end16:
127 add r0, r1
128 add r2, r3
129 dec r5d
130 jg .loopy
131 sfence
132 emms
133 RET
134
135
136 %macro INTERLEAVE 4-5 ; dst, srcu, srcv, is_aligned, nt_hint
137 %if HIGH_BIT_DEPTH
138 %assign x 0
139 %rep 16/mmsize
140 mov%4 m0, [%2+(x/2)*mmsize]
141 mov%4 m1, [%3+(x/2)*mmsize]
142 punpckhwd m2, m0, m1
143 punpcklwd m0, m1
144 mov%5a [%1+(x+0)*mmsize], m0
145 mov%5a [%1+(x+1)*mmsize], m2
146 %assign x (x+2)
147 %endrep
148 %else
149 movq m0, [%2]
150 %if mmsize==16
151 %ifidn %4, a
152 punpcklbw m0, [%3]
153 %else
154 movq m1, [%3]
155 punpcklbw m0, m1
156 %endif
157 mov%5a [%1], m0
158 %else
159 movq m1, [%3]
160 punpckhbw m2, m0, m1
161 punpcklbw m0, m1
162 mov%5a [%1+0], m0
163 mov%5a [%1+8], m2
164 %endif
165 %endif ; HIGH_BIT_DEPTH
166 %endmacro
167
168 %macro DEINTERLEAVE 6 ; dstu, dstv, src, dstv==dstu+8, shuffle constant, is aligned
169 %if HIGH_BIT_DEPTH
170 %assign n 0
171 %rep 16/mmsize
172 mova m0, [%3+(n+0)*mmsize]
173 mova m1, [%3+(n+1)*mmsize]
174 psrld m2, m0, 16
175 psrld m3, m1, 16
176 pand m0, %5
177 pand m1, %5
178 packssdw m0, m1
179 packssdw m2, m3
180 mov%6 [%1+(n/2)*mmsize], m0
181 mov%6 [%2+(n/2)*mmsize], m2
182 %assign n (n+2)
183 %endrep
184 %else ; !HIGH_BIT_DEPTH
185 %if mmsize==16
186 mova m0, [%3]
187 %if cpuflag(ssse3)
188 pshufb m0, %5
189 %else
190 mova m1, m0
191 pand m0, %5
192 psrlw m1, 8
193 packuswb m0, m1
194 %endif
195 %if %4
196 mova [%1], m0
197 %else
198 movq [%1], m0
199 movhps [%2], m0
200 %endif
201 %else
202 mova m0, [%3]
203 mova m1, [%3+8]
204 mova m2, m0
205 mova m3, m1
206 pand m0, %5
207 pand m1, %5
208 psrlw m2, 8
209 psrlw m3, 8
210 packuswb m0, m1
211 packuswb m2, m3
212 mova [%1], m0
213 mova [%2], m2
214 %endif ; mmsize == 16
215 %endif ; HIGH_BIT_DEPTH
216 %endmacro
217
218 %macro PLANE_INTERLEAVE 0
219 ;-----------------------------------------------------------------------------
220 ; void plane_copy_interleave_core( uint8_t *dst, intptr_t i_dst,
221 ; uint8_t *srcu, intptr_t i_srcu,
222 ; uint8_t *srcv, intptr_t i_srcv, int w, int h )
223 ;-----------------------------------------------------------------------------
224 ; assumes i_dst and w are multiples of 16, and i_dst>2*w
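;
; Illustrative plain-C sketch of the interleave (the asm additionally pads the output
; row past 2*w with whatever is left in m0, which is why i_dst>2*w is required):
;
;   static void plane_copy_interleave_c( pixel *dst,  intptr_t i_dst,
;                                        pixel *srcu, intptr_t i_srcu,
;                                        pixel *srcv, intptr_t i_srcv, int w, int h )
;   {
;       for( int y = 0; y < h; y++, dst += i_dst, srcu += i_srcu, srcv += i_srcv )
;           for( int x = 0; x < w; x++ )
;           {
;               dst[2*x]   = srcu[x];
;               dst[2*x+1] = srcv[x];
;           }
;   }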
225 cglobal plane_copy_interleave_core, 6,9
226 mov r6d, r6m
227 %if HIGH_BIT_DEPTH
228 FIX_STRIDES r1, r3, r5, r6d
229 movifnidn r1mp, r1
230 movifnidn r3mp, r3
231 mov r6m, r6d
232 %endif
233 lea r0, [r0+r6*2]
234 add r2, r6
235 add r4, r6
236 %if ARCH_X86_64
237 DECLARE_REG_TMP 7,8
238 %else
239 DECLARE_REG_TMP 1,3
240 %endif
241 mov t1, r1
242 shr t1, SIZEOF_PIXEL
243 sub t1, r6
244 mov t0d, r7m
245 .loopy:
246 mov r6d, r6m
247 neg r6
248 .prefetch:
249 prefetchnta [r2+r6]
250 prefetchnta [r4+r6]
251 add r6, 64
252 jl .prefetch
253 mov r6d, r6m
254 neg r6
255 .loopx:
256 INTERLEAVE r0+r6*2+ 0*SIZEOF_PIXEL, r2+r6+0*SIZEOF_PIXEL, r4+r6+0*SIZEOF_PIXEL, u, nt
257 INTERLEAVE r0+r6*2+16*SIZEOF_PIXEL, r2+r6+8*SIZEOF_PIXEL, r4+r6+8*SIZEOF_PIXEL, u, nt
258 add r6, 16*SIZEOF_PIXEL
259 jl .loopx
260 .pad:
261 %assign n 0
262 %rep SIZEOF_PIXEL
263 %if mmsize==8
264 movntq [r0+r6*2+(n+ 0)], m0
265 movntq [r0+r6*2+(n+ 8)], m0
266 movntq [r0+r6*2+(n+16)], m0
267 movntq [r0+r6*2+(n+24)], m0
268 %else
269 movntdq [r0+r6*2+(n+ 0)], m0
270 movntdq [r0+r6*2+(n+16)], m0
271 %endif
272 %assign n n+32
273 %endrep
274 add r6, 16*SIZEOF_PIXEL
275 cmp r6, t1
276 jl .pad
277 add r0, r1mp
278 add r2, r3mp
279 add r4, r5
280 dec t0d
281 jg .loopy
282 sfence
283 emms
284 RET
285
286 ;-----------------------------------------------------------------------------
287 ; void store_interleave_chroma( uint8_t *dst, intptr_t i_dst, uint8_t *srcu, uint8_t *srcv, int height )
288 ;-----------------------------------------------------------------------------
289 cglobal store_interleave_chroma, 5,5
290 FIX_STRIDES r1
291 .loop:
292 INTERLEAVE r0+ 0, r2+ 0, r3+ 0, a
293 INTERLEAVE r0+r1, r2+FDEC_STRIDEB, r3+FDEC_STRIDEB, a
294 add r2, FDEC_STRIDEB*2
295 add r3, FDEC_STRIDEB*2
296 lea r0, [r0+r1*2]
297 sub r4d, 2
298 jg .loop
299 RET
300 %endmacro ; PLANE_INTERLEAVE
301
302 %macro DEINTERLEAVE_START 0
303 %if HIGH_BIT_DEPTH
304 mova m4, [pd_ffff]
305 %elif cpuflag(ssse3)
306 mova m4, [deinterleave_shuf]
307 %else
308 mova m4, [pw_00ff]
309 %endif ; HIGH_BIT_DEPTH
310 %endmacro
311
312 %macro PLANE_DEINTERLEAVE 0
313 ;-----------------------------------------------------------------------------
314 ; void plane_copy_deinterleave( pixel *dstu, intptr_t i_dstu,
315 ; pixel *dstv, intptr_t i_dstv,
316 ; pixel *src, intptr_t i_src, int w, int h )
317 ;-----------------------------------------------------------------------------
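;
; Illustrative plain-C sketch of the deinterleave (U from even positions, V from odd):
;
;   static void plane_copy_deinterleave_c( pixel *dstu, intptr_t i_dstu,
;                                          pixel *dstv, intptr_t i_dstv,
;                                          pixel *src,  intptr_t i_src, int w, int h )
;   {
;       for( int y = 0; y < h; y++, dstu += i_dstu, dstv += i_dstv, src += i_src )
;           for( int x = 0; x < w; x++ )
;           {
;               dstu[x] = src[2*x];
;               dstv[x] = src[2*x+1];
;           }
;   }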
318 cglobal plane_copy_deinterleave, 6,7
319 DEINTERLEAVE_START
320 mov r6d, r6m
321 FIX_STRIDES r1, r3, r5, r6d
322 %if HIGH_BIT_DEPTH
323 mov r6m, r6d
324 %endif
325 add r0, r6
326 add r2, r6
327 lea r4, [r4+r6*2]
328 .loopy:
329 mov r6d, r6m
330 neg r6
331 .loopx:
332 DEINTERLEAVE r0+r6+0*SIZEOF_PIXEL, r2+r6+0*SIZEOF_PIXEL, r4+r6*2+ 0*SIZEOF_PIXEL, 0, m4, u
333 DEINTERLEAVE r0+r6+8*SIZEOF_PIXEL, r2+r6+8*SIZEOF_PIXEL, r4+r6*2+16*SIZEOF_PIXEL, 0, m4, u
334 add r6, 16*SIZEOF_PIXEL
335 jl .loopx
336 add r0, r1
337 add r2, r3
338 add r4, r5
339 dec dword r7m
340 jg .loopy
341 RET
342
343 ;-----------------------------------------------------------------------------
344 ; void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, intptr_t i_src, int height )
345 ;-----------------------------------------------------------------------------
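; (Same split as plane_copy_deinterleave above, but into the fixed-stride FENC buffer:
;  U lands at dst and V at dst+FENC_STRIDEB/2, two source rows per loop iteration.
;  load_deinterleave_chroma_fdec below does the same for the FDEC-strided buffer.)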
346 cglobal load_deinterleave_chroma_fenc, 4,4
347 DEINTERLEAVE_START
348 FIX_STRIDES r2
349 .loop:
350 DEINTERLEAVE r0+ 0, r0+FENC_STRIDEB*1/2, r1+ 0, 1, m4, a
351 DEINTERLEAVE r0+FENC_STRIDEB, r0+FENC_STRIDEB*3/2, r1+r2, 1, m4, a
352 add r0, FENC_STRIDEB*2
353 lea r1, [r1+r2*2]
354 sub r3d, 2
355 jg .loop
356 RET
357
358 ;-----------------------------------------------------------------------------
359 ; void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, intptr_t i_src, int height )
360 ;-----------------------------------------------------------------------------
361 cglobal load_deinterleave_chroma_fdec, 4,4
362 DEINTERLEAVE_START
363 FIX_STRIDES r2
364 .loop:
365 DEINTERLEAVE r0+ 0, r0+FDEC_STRIDEB*1/2, r1+ 0, 0, m4, a
366 DEINTERLEAVE r0+FDEC_STRIDEB, r0+FDEC_STRIDEB*3/2, r1+r2, 0, m4, a
367 add r0, FDEC_STRIDEB*2
368 lea r1, [r1+r2*2]
369 sub r3d, 2
370 jg .loop
371 RET
372 %endmacro ; PLANE_DEINTERLEAVE
373
374 %if HIGH_BIT_DEPTH
375 INIT_MMX mmx2
376 PLANE_INTERLEAVE
377 INIT_MMX mmx
378 PLANE_DEINTERLEAVE
379 INIT_XMM sse2
380 PLANE_INTERLEAVE
381 PLANE_DEINTERLEAVE
382 INIT_XMM avx
383 PLANE_INTERLEAVE
384 PLANE_DEINTERLEAVE
385 %else
386 INIT_MMX mmx2
387 PLANE_INTERLEAVE
388 INIT_MMX mmx
389 PLANE_DEINTERLEAVE
390 INIT_XMM sse2
391 PLANE_INTERLEAVE
392 PLANE_DEINTERLEAVE
393 INIT_XMM ssse3
394 PLANE_DEINTERLEAVE
395 %endif
396
397 ; These functions are not general-use: not only do the SSE versions require aligned input,
398 ; they will also fail if given a size that is not a multiple of 16.
399 ; The SSE memzero will fail for sizes that are not a multiple of 128.
400
401 ;-----------------------------------------------------------------------------
402 ; void *memcpy_aligned( void *dst, const void *src, size_t n );
403 ;-----------------------------------------------------------------------------
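;
; Illustrative usage sketch for the restrictions noted above (ALIGN_UP is just
; shorthand for rounding up, not a macro defined in this codebase):
;
;   #define ALIGN_UP(x, a) ( ((x) + (a) - 1) & ~((size_t)(a) - 1) )
;   /* both pointers suitably aligned for the chosen SIMD width */
;   memcpy_aligned ( dst, src, ALIGN_UP( n, 16 ) );    /* size must be a multiple of 16  */
;   memzero_aligned( buf,      ALIGN_UP( n, 128 ) );   /* SSE memzero needs mod-128 size */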
404 %macro MEMCPY 0
405 cglobal memcpy_aligned, 3,3
406 %if mmsize == 16
407 test r2d, 16
408 jz .copy2
409 mova m0, [r1+r2-16]
410 mova [r0+r2-16], m0
411 sub r2d, 16
412 .copy2:
413 %endif
414 test r2d, 2*mmsize
415 jz .copy4start
416 mova m0, [r1+r2-1*mmsize]
417 mova m1, [r1+r2-2*mmsize]
418 mova [r0+r2-1*mmsize], m0
419 mova [r0+r2-2*mmsize], m1
420 sub r2d, 2*mmsize
421 .copy4start:
422 test r2d, r2d
423 jz .ret
424 .copy4:
425 mova m0, [r1+r2-1*mmsize]
426 mova m1, [r1+r2-2*mmsize]
427 mova m2, [r1+r2-3*mmsize]
428 mova m3, [r1+r2-4*mmsize]
429 mova [r0+r2-1*mmsize], m0
430 mova [r0+r2-2*mmsize], m1
431 mova [r0+r2-3*mmsize], m2
432 mova [r0+r2-4*mmsize], m3
433 sub r2d, 4*mmsize
434 jg .copy4
435 .ret:
436 REP_RET
437 %endmacro
438
439 INIT_MMX mmx
440 MEMCPY
441 INIT_XMM sse
442 MEMCPY
443
444 ;-----------------------------------------------------------------------------
445 ; void *memzero_aligned( void *dst, size_t n );
446 ;-----------------------------------------------------------------------------
447 %macro MEMZERO 1
448 cglobal memzero_aligned, 2,2
449 add r0, r1
450 neg r1
451 %if mmsize == 8
452 pxor m0, m0
453 %else
454 xorps m0, m0
455 %endif
456 .loop:
457 %assign i 0
458 %rep %1
459 mova [r0 + r1 + i], m0
460 %assign i i+mmsize
461 %endrep
462 add r1, mmsize*%1
463 jl .loop
464 RET
465 %endmacro
466
467 INIT_MMX mmx
468 MEMZERO 8
469 INIT_XMM sse
470 MEMZERO 8
471 INIT_YMM avx
472 MEMZERO 4
473
474 %if HIGH_BIT_DEPTH == 0
475 ;-----------------------------------------------------------------------------
476 ; void integral_init4h( uint16_t *sum, uint8_t *pix, intptr_t stride )
477 ;-----------------------------------------------------------------------------
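;
; Illustrative plain-C sketch: each output element is a horizontal sum of 4 pixels,
; accumulated onto the previous sum row (loop bounds and the exact row addressed by the
; pointer argument are simplified; the asm reads the row at sum and writes sum+stride):
;
;   static void integral_init4h_c( uint16_t *sum, uint8_t *pix, intptr_t stride )
;   {
;       int v = pix[0] + pix[1] + pix[2] + pix[3];
;       for( int x = 0; x < stride - 4; x++ )
;       {
;           sum[x] = v + sum[x - stride];
;           v += pix[x + 4] - pix[x];
;       }
;   }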
478 %macro INTEGRAL_INIT4H 0
479 cglobal integral_init4h, 3,4
480 lea r3, [r0+r2*2]
481 add r1, r2
482 neg r2
483 pxor m4, m4
484 .loop:
485 mova m0, [r1+r2]
486 %if mmsize==32
487 movu m1, [r1+r2+8]
488 %else
489 mova m1, [r1+r2+16]
490 palignr m1, m0, 8
491 %endif
492 mpsadbw m0, m4, 0
493 mpsadbw m1, m4, 0
494 paddw m0, [r0+r2*2]
495 paddw m1, [r0+r2*2+mmsize]
496 mova [r3+r2*2 ], m0
497 mova [r3+r2*2+mmsize], m1
498 add r2, mmsize
499 jl .loop
500 RET
501 %endmacro
502
503 INIT_XMM sse4
504 INTEGRAL_INIT4H
505 INIT_YMM avx2
506 INTEGRAL_INIT4H
507
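;-----------------------------------------------------------------------------
; void integral_init8h( uint16_t *sum, uint8_t *pix, intptr_t stride )
;-----------------------------------------------------------------------------
; (Presumed signature, by analogy with integral_init4h above: the same accumulation,
;  but of 8-wide horizontal sums; the extra mpsadbw pair adds the pix[x+4..x+7] half.)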
508 %macro INTEGRAL_INIT8H 0
509 cglobal integral_init8h, 3,4
510 lea r3, [r0+r2*2]
511 add r1, r2
512 neg r2
513 pxor m4, m4
514 .loop:
515 mova m0, [r1+r2]
516 %if mmsize==32
517 movu m1, [r1+r2+8]
518 mpsadbw m2, m0, m4, 100100b
519 mpsadbw m3, m1, m4, 100100b
520 %else
521 mova m1, [r1+r2+16]
522 palignr m1, m0, 8
523 mpsadbw m2, m0, m4, 100b
524 mpsadbw m3, m1, m4, 100b
525 %endif
526 mpsadbw m0, m4, 0
527 mpsadbw m1, m4, 0
528 paddw m0, [r0+r2*2]
529 paddw m1, [r0+r2*2+mmsize]
530 paddw m0, m2
531 paddw m1, m3
532 mova [r3+r2*2 ], m0
533 mova [r3+r2*2+mmsize], m1
534 add r2, mmsize
535 jl .loop
536 RET
537 %endmacro
538
539 INIT_XMM sse4
540 INTEGRAL_INIT8H
541 INIT_XMM avx
542 INTEGRAL_INIT8H
543 INIT_YMM avx2
544 INTEGRAL_INIT8H
545 %endif ; !HIGH_BIT_DEPTH
546
547 %macro INTEGRAL_INIT_8V 0
548 ;-----------------------------------------------------------------------------
549 ; void integral_init8v( uint16_t *sum8, intptr_t stride )
550 ;-----------------------------------------------------------------------------
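; Illustrative plain-C sketch (in place: each element becomes the difference between
; the cumulative row 8 rows further down and the current row, leaving 8-row sums):
;
;   static void integral_init8v_c( uint16_t *sum8, intptr_t stride )
;   {
;       for( int x = 0; x < stride; x++ )
;           sum8[x] = sum8[x + 8*stride] - sum8[x];
;   }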
551 cglobal integral_init8v, 3,3
552 add r1, r1
553 add r0, r1
554 lea r2, [r0+r1*8]
555 neg r1
556 .loop:
557 mova m0, [r2+r1]
558 mova m1, [r2+r1+mmsize]
559 psubw m0, [r0+r1]
560 psubw m1, [r0+r1+mmsize]
561 mova [r0+r1], m0
562 mova [r0+r1+mmsize], m1
563 add r1, 2*mmsize
564 jl .loop
565 RET
566 %endmacro
567
568 INIT_MMX mmx
569 INTEGRAL_INIT_8V
570 INIT_XMM sse2
571 INTEGRAL_INIT_8V
572 INIT_YMM avx2
573 INTEGRAL_INIT_8V
574
575 ;-----------------------------------------------------------------------------
576 ; void integral_init4v( uint16_t *sum8, uint16_t *sum4, intptr_t stride )
577 ;-----------------------------------------------------------------------------
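;
; Illustrative plain-C sketch (bounds simplified; the asm does both updates in one pass).
; On entry sum8 holds column-cumulative 4-wide horizontal sums; on exit sum4 holds 4x4
; box sums and sum8 holds 8x8 box sums:
;
;   static void integral_init4v_c( uint16_t *sum8, uint16_t *sum4, intptr_t stride )
;   {
;       for( int x = 0; x < stride; x++ )
;           sum4[x] = sum8[x + 4*stride] - sum8[x];
;       for( int x = 0; x < stride; x++ )
;           sum8[x] = sum8[x + 8*stride]     - sum8[x]
;                   + sum8[x + 8*stride + 4] - sum8[x + 4];
;   }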
578 INIT_MMX mmx
579 cglobal integral_init4v, 3,5
580 shl r2, 1
581 lea r3, [r0+r2*4]
582 lea r4, [r0+r2*8]
583 mova m0, [r0+r2]
584 mova m4, [r4+r2]
585 .loop:
586 mova m1, m4
587 psubw m1, m0
588 mova m4, [r4+r2-8]
589 mova m0, [r0+r2-8]
590 paddw m1, m4
591 mova m3, [r3+r2-8]
592 psubw m1, m0
593 psubw m3, m0
594 mova [r0+r2-8], m1
595 mova [r1+r2-8], m3
596 sub r2, 8
597 jge .loop
598 RET
599
600 INIT_XMM sse2
601 cglobal integral_init4v, 3,5
602 shl r2, 1
603 add r0, r2
604 add r1, r2
605 lea r3, [r0+r2*4]
606 lea r4, [r0+r2*8]
607 neg r2
608 .loop:
609 mova m0, [r0+r2]
610 mova m1, [r4+r2]
611 mova m2, m0
612 mova m4, m1
613 shufpd m0, [r0+r2+16], 1
614 shufpd m1, [r4+r2+16], 1
615 paddw m0, m2
616 paddw m1, m4
617 mova m3, [r3+r2]
618 psubw m1, m0
619 psubw m3, m2
620 mova [r0+r2], m1
621 mova [r1+r2], m3
622 add r2, 16
623 jl .loop
624 RET
625
626 INIT_XMM ssse3
627 cglobal integral_init4v, 3,5
628 shl r2, 1
629 add r0, r2
630 add r1, r2
631 lea r3, [r0+r2*4]
632 lea r4, [r0+r2*8]
633 neg r2
634 .loop:
635 mova m2, [r0+r2]
636 mova m0, [r0+r2+16]
637 mova m4, [r4+r2]
638 mova m1, [r4+r2+16]
639 palignr m0, m2, 8
640 palignr m1, m4, 8
641 paddw m0, m2
642 paddw m1, m4
643 mova m3, [r3+r2]
644 psubw m1, m0
645 psubw m3, m2
646 mova [r0+r2], m1
647 mova [r1+r2], m3
648 add r2, 16
649 jl .loop
650 RET
651
652 INIT_YMM avx2
653 cglobal integral_init4v, 3,5
654 add r2, r2
655 add r0, r2
656 add r1, r2
657 lea r3, [r0+r2*4]
658 lea r4, [r0+r2*8]
659 neg r2
660 .loop:
661 mova m2, [r0+r2]
662 movu m1, [r4+r2+8]
663 paddw m0, m2, [r0+r2+8]
664 paddw m1, [r4+r2]
665 mova m3, [r3+r2]
666 psubw m1, m0
667 psubw m3, m2
668 mova [r0+r2], m1
669 mova [r1+r2], m3
670 add r2, 32
671 jl .loop
672 RET
673
674 %macro FILT8x4 7
675 mova %3, [r0+%7]
676 mova %4, [r0+r5+%7]
677 pavgb %3, %4
678 pavgb %4, [r0+r5*2+%7]
679 PALIGNR %1, %3, 1, m6
680 PALIGNR %2, %4, 1, m6
681 %if cpuflag(xop)
682 pavgb %1, %3
683 pavgb %2, %4
684 %else
685 pavgb %1, %3
686 pavgb %2, %4
687 psrlw %5, %1, 8
688 psrlw %6, %2, 8
689 pand %1, m7
690 pand %2, m7
691 %endif
692 %endmacro
693
694 %macro FILT32x4U 4
695 mova m1, [r0+r5]
696 pavgb m0, m1, [r0]
697 movu m3, [r0+r5+1]
698 pavgb m2, m3, [r0+1]
699 pavgb m1, [r0+r5*2]
700 pavgb m3, [r0+r5*2+1]
701 pavgb m0, m2
702 pavgb m1, m3
703
704 mova m3, [r0+r5+mmsize]
705 pavgb m2, m3, [r0+mmsize]
706 movu m5, [r0+r5+1+mmsize]
707 pavgb m4, m5, [r0+1+mmsize]
708 pavgb m3, [r0+r5*2+mmsize]
709 pavgb m5, [r0+r5*2+1+mmsize]
710 pavgb m2, m4
711 pavgb m3, m5
712
713 pshufb m0, m7
714 pshufb m1, m7
715 pshufb m2, m7
716 pshufb m3, m7
717 punpckhqdq m4, m0, m2
718 punpcklqdq m0, m0, m2
719 punpckhqdq m5, m1, m3
720 punpcklqdq m2, m1, m3
721 vpermq m0, m0, q3120
722 vpermq m1, m4, q3120
723 vpermq m2, m2, q3120
724 vpermq m3, m5, q3120
725 mova [%1], m0
726 mova [%2], m1
727 mova [%3], m2
728 mova [%4], m3
729 %endmacro
730
731 %macro FILT16x2 4
732 mova m3, [r0+%4+mmsize]
733 mova m2, [r0+%4]
734 pavgb m3, [r0+%4+r5+mmsize]
735 pavgb m2, [r0+%4+r5]
736 PALIGNR %1, m3, 1, m6
737 pavgb %1, m3
738 PALIGNR m3, m2, 1, m6
739 pavgb m3, m2
740 %if cpuflag(xop)
741 vpperm m5, m3, %1, m7
742 vpperm m3, m3, %1, m6
743 %else
744 psrlw m5, m3, 8
745 psrlw m4, %1, 8
746 pand m3, m7
747 pand %1, m7
748 packuswb m3, %1
749 packuswb m5, m4
750 %endif
751 mova [%2], m3
752 mova [%3], m5
753 mova %1, m2
754 %endmacro
755
756 %macro FILT8x2U 3
757 mova m3, [r0+%3+8]
758 mova m2, [r0+%3]
759 pavgb m3, [r0+%3+r5+8]
760 pavgb m2, [r0+%3+r5]
761 mova m1, [r0+%3+9]
762 mova m0, [r0+%3+1]
763 pavgb m1, [r0+%3+r5+9]
764 pavgb m0, [r0+%3+r5+1]
765 pavgb m1, m3
766 pavgb m0, m2
767 psrlw m3, m1, 8
768 psrlw m2, m0, 8
769 pand m1, m7
770 pand m0, m7
771 packuswb m0, m1
772 packuswb m2, m3
773 mova [%1], m0
774 mova [%2], m2
775 %endmacro
776
777 %macro FILT8xU 3
778 mova m3, [r0+%3+8]
779 mova m2, [r0+%3]
780 pavgw m3, [r0+%3+r5+8]
781 pavgw m2, [r0+%3+r5]
782 movu m1, [r0+%3+10]
783 movu m0, [r0+%3+2]
784 pavgw m1, [r0+%3+r5+10]
785 pavgw m0, [r0+%3+r5+2]
786 pavgw m1, m3
787 pavgw m0, m2
788 psrld m3, m1, 16
789 psrld m2, m0, 16
790 pand m1, m7
791 pand m0, m7
792 packssdw m0, m1
793 packssdw m2, m3
794 movu [%1], m0
795 mova [%2], m2
796 %endmacro
797
798 %macro FILT8xA 4
799 mova m3, [r0+%4+mmsize]
800 mova m2, [r0+%4]
801 pavgw m3, [r0+%4+r5+mmsize]
802 pavgw m2, [r0+%4+r5]
803 PALIGNR %1, m3, 2, m6
804 pavgw %1, m3
805 PALIGNR m3, m2, 2, m6
806 pavgw m3, m2
807 %if cpuflag(xop)
808 vpperm m5, m3, %1, m7
809 vpperm m3, m3, %1, m6
810 %else
811 psrld m5, m3, 16
812 psrld m4, %1, 16
813 pand m3, m7
814 pand %1, m7
815 packssdw m3, %1
816 packssdw m5, m4
817 %endif
818 mova [%2], m3
819 mova [%3], m5
820 mova %1, m2
821 %endmacro
822
823 ;-----------------------------------------------------------------------------
824 ; void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
825 ; intptr_t src_stride, intptr_t dst_stride, int width, int height )
826 ;-----------------------------------------------------------------------------
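;
; Illustrative plain-C sketch of the half-pel lowres downsample implemented below
; (FILTER mirrors the pavgb/pavgw average-of-averages; exact edge handling and the
; in-place pointer arithmetic follow the asm, not this sketch):
;
;   #define FILTER(a,b,c,d) ( (((a+b+1)>>1) + ((c+d+1)>>1) + 1) >> 1 )
;   static void frame_init_lowres_core_c( pixel *src0, pixel *dst0, pixel *dsth,
;                                         pixel *dstv, pixel *dstc,
;                                         intptr_t src_stride, intptr_t dst_stride,
;                                         int width, int height )
;   {
;       for( int y = 0; y < height; y++ )
;       {
;           pixel *src1 = src0 + src_stride;
;           pixel *src2 = src1 + src_stride;
;           for( int x = 0; x < width; x++ )
;           {
;               dst0[x] = FILTER( src0[2*x],   src1[2*x],   src0[2*x+1], src1[2*x+1] );
;               dsth[x] = FILTER( src0[2*x+1], src1[2*x+1], src0[2*x+2], src1[2*x+2] );
;               dstv[x] = FILTER( src1[2*x],   src2[2*x],   src1[2*x+1], src2[2*x+1] );
;               dstc[x] = FILTER( src1[2*x+1], src2[2*x+1], src1[2*x+2], src2[2*x+2] );
;           }
;           src0 += src_stride*2;
;           dst0 += dst_stride; dsth += dst_stride;
;           dstv += dst_stride; dstc += dst_stride;
;       }
;   }
;   #undef FILTER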
827 %macro FRAME_INIT_LOWRES 0
828 cglobal frame_init_lowres_core, 6,7,(12-4*(BIT_DEPTH/9)) ; 8 for HIGH_BIT_DEPTH, 12 otherwise
829 %if HIGH_BIT_DEPTH
830 shl dword r6m, 1
831 FIX_STRIDES r5
832 shl dword r7m, 1
833 %endif
834 %if mmsize >= 16
835 add dword r7m, mmsize-1
836 and dword r7m, ~(mmsize-1)
837 %endif
838 ; src += 2*(height-1)*stride + 2*width
839 mov r6d, r8m
840 dec r6d
841 imul r6d, r5d
842 add r6d, r7m
843 lea r0, [r0+r6*2]
844 ; dst += (height-1)*stride + width
845 mov r6d, r8m
846 dec r6d
847 imul r6d, r6m
848 add r6d, r7m
849 add r1, r6
850 add r2, r6
851 add r3, r6
852 add r4, r6
853 ; gap = stride - width
854 mov r6d, r6m
855 sub r6d, r7m
856 PUSH r6
857 %define dst_gap [rsp+gprsize]
858 mov r6d, r5d
859 sub r6d, r7m
860 shl r6d, 1
861 PUSH r6
862 %define src_gap [rsp]
863 %if HIGH_BIT_DEPTH
864 %if cpuflag(xop)
865 mova m6, [deinterleave_shuf32a]
866 mova m7, [deinterleave_shuf32b]
867 %else
868 pcmpeqw m7, m7
869 psrld m7, 16
870 %endif
871 .vloop:
872 mov r6d, r7m
873 %ifnidn cpuname, mmx2
874 mova m0, [r0]
875 mova m1, [r0+r5]
876 pavgw m0, m1
877 pavgw m1, [r0+r5*2]
878 %endif
879 .hloop:
880 sub r0, mmsize*2
881 sub r1, mmsize
882 sub r2, mmsize
883 sub r3, mmsize
884 sub r4, mmsize
885 %ifidn cpuname, mmx2
886 FILT8xU r1, r2, 0
887 FILT8xU r3, r4, r5
888 %else
889 FILT8xA m0, r1, r2, 0
890 FILT8xA m1, r3, r4, r5
891 %endif
892 sub r6d, mmsize
893 jg .hloop
894 %else ; !HIGH_BIT_DEPTH
895 %if cpuflag(avx2)
896 mova m7, [deinterleave_shuf]
897 %elif cpuflag(xop)
898 mova m6, [deinterleave_shuf32a]
899 mova m7, [deinterleave_shuf32b]
900 %else
901 pcmpeqb m7, m7
902 psrlw m7, 8
903 %endif
904 .vloop:
905 mov r6d, r7m
906 %ifnidn cpuname, mmx2
907 %if mmsize <= 16
908 mova m0, [r0]
909 mova m1, [r0+r5]
910 pavgb m0, m1
911 pavgb m1, [r0+r5*2]
912 %endif
913 %endif
914 .hloop:
915 sub r0, mmsize*2
916 sub r1, mmsize
917 sub r2, mmsize
918 sub r3, mmsize
919 sub r4, mmsize
920 %if mmsize==32
921 FILT32x4U r1, r2, r3, r4
922 %elifdef m8
923 FILT8x4 m0, m1, m2, m3, m10, m11, mmsize
924 mova m8, m0
925 mova m9, m1
926 FILT8x4 m2, m3, m0, m1, m4, m5, 0
927 %if cpuflag(xop)
928 vpperm m4, m2, m8, m7
929 vpperm m2, m2, m8, m6
930 vpperm m5, m3, m9, m7
931 vpperm m3, m3, m9, m6
932 %else
933 packuswb m2, m8
934 packuswb m3, m9
935 packuswb m4, m10
936 packuswb m5, m11
937 %endif
938 mova [r1], m2
939 mova [r2], m4
940 mova [r3], m3
941 mova [r4], m5
942 %elifidn cpuname, mmx2
943 FILT8x2U r1, r2, 0
944 FILT8x2U r3, r4, r5
945 %else
946 FILT16x2 m0, r1, r2, 0
947 FILT16x2 m1, r3, r4, r5
948 %endif
949 sub r6d, mmsize
950 jg .hloop
951 %endif ; HIGH_BIT_DEPTH
952 .skip:
953 mov r6, dst_gap
954 sub r0, src_gap
955 sub r1, r6
956 sub r2, r6
957 sub r3, r6
958 sub r4, r6
959 dec dword r8m
960 jg .vloop
961 ADD rsp, 2*gprsize
962 emms
963 RET
964 %endmacro ; FRAME_INIT_LOWRES
965
966 INIT_MMX mmx2
967 FRAME_INIT_LOWRES
968 %if ARCH_X86_64 == 0
969 INIT_MMX cache32, mmx2
970 FRAME_INIT_LOWRES
971 %endif
972 INIT_XMM sse2
973 FRAME_INIT_LOWRES
974 INIT_XMM ssse3
975 FRAME_INIT_LOWRES
976 INIT_XMM avx
977 FRAME_INIT_LOWRES
978 INIT_XMM xop
979 FRAME_INIT_LOWRES
980 %if HIGH_BIT_DEPTH==0
981 INIT_YMM avx2
982 FRAME_INIT_LOWRES
983 %endif
984
985 ;-----------------------------------------------------------------------------
986 ; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
987 ; uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
988 ;-----------------------------------------------------------------------------
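;
; Illustrative plain-C sketch of the computation (0x3fff is the pw_3fff mask applied
; to inter_costs; the asm rounds the final result to nearest via cvtps2dq):
;
;   static void mbtree_propagate_cost_c( int *dst, uint16_t *propagate_in,
;                                        uint16_t *intra_costs, uint16_t *inter_costs,
;                                        uint16_t *inv_qscales, float *fps_factor, int len )
;   {
;       float fps = *fps_factor / 256.0f;
;       for( int i = 0; i < len; i++ )
;       {
;           int   intra     = intra_costs[i];                 /* assumed nonzero */
;           int   inter     = inter_costs[i] & 0x3fff;
;           float propagate = propagate_in[i] + intra * (float)inv_qscales[i] * fps;
;           dst[i] = (int)( propagate * (intra - inter) / intra + 0.5f );
;       }
;   }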
989 %macro MBTREE 0
990 cglobal mbtree_propagate_cost, 7,7,7
991 add r6d, r6d
992 lea r0, [r0+r6*2]
993 add r1, r6
994 add r2, r6
995 add r3, r6
996 add r4, r6
997 neg r6
998 pxor xmm4, xmm4
999 movss xmm6, [r5]
1000 shufps xmm6, xmm6, 0
1001 mulps xmm6, [pf_inv256]
1002 movdqa xmm5, [pw_3fff]
1003 .loop:
1004 movq xmm2, [r2+r6] ; intra
1005 movq xmm0, [r4+r6] ; invq
1006 movq xmm3, [r3+r6] ; inter
1007 movq xmm1, [r1+r6] ; prop
1008 punpcklwd xmm2, xmm4
1009 punpcklwd xmm0, xmm4
1010 pmaddwd xmm0, xmm2
1011 pand xmm3, xmm5
1012 punpcklwd xmm1, xmm4
1013 punpcklwd xmm3, xmm4
1014 %if cpuflag(fma4)
1015 cvtdq2ps xmm0, xmm0
1016 cvtdq2ps xmm1, xmm1
1017 fmaddps xmm0, xmm0, xmm6, xmm1
1018 cvtdq2ps xmm1, xmm2
1019 psubd xmm2, xmm3
1020 cvtdq2ps xmm2, xmm2
1021 rcpps xmm3, xmm1
1022 mulps xmm1, xmm3
1023 mulps xmm0, xmm2
1024 addps xmm2, xmm3, xmm3
1025 fnmaddps xmm3, xmm1, xmm3, xmm2
1026 mulps xmm0, xmm3
1027 %else
1028 cvtdq2ps xmm0, xmm0
1029 mulps xmm0, xmm6 ; intra*invq*fps_factor>>8
1030 cvtdq2ps xmm1, xmm1 ; prop
1031 addps xmm0, xmm1 ; prop + (intra*invq*fps_factor>>8)
1032 cvtdq2ps xmm1, xmm2 ; intra
1033 psubd xmm2, xmm3 ; intra - inter
1034 cvtdq2ps xmm2, xmm2 ; intra - inter
1035 rcpps xmm3, xmm1 ; 1 / intra 1st approximation
1036 mulps xmm1, xmm3 ; intra * (1/intra 1st approx)
1037 mulps xmm1, xmm3 ; intra * (1/intra 1st approx)^2
1038 mulps xmm0, xmm2 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
1039 addps xmm3, xmm3 ; 2 * (1/intra 1st approx)
1040 subps xmm3, xmm1 ; 2nd approximation for 1/intra
1041 mulps xmm0, xmm3 ; / intra
1042 %endif
1043 cvtps2dq xmm0, xmm0
1044 movdqa [r0+r6*2], xmm0
1045 add r6, 8
1046 jl .loop
1047 RET
1048 %endmacro
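; The rcpps sequence above is one Newton-Raphson refinement of the hardware reciprocal
; estimate: given x0 ~ 1/intra from rcpps (roughly 12 bits of precision), the code forms
; x1 = x0*(2 - intra*x0), which about doubles the number of correct bits; that is plenty
; for the integer result produced by cvtps2dq.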
1049
1050 INIT_XMM sse2
1051 MBTREE
1052 ; Bulldozer only has a 128-bit float unit, so the AVX version of this function is actually slower.
1053 INIT_XMM fma4
1054 MBTREE
1055
1056 %macro INT16_UNPACK 1
1057 vpunpckhwd xm4, xm%1, xm7
1058 vpunpcklwd xm%1, xm7
1059 vinsertf128 m%1, m%1, xm4, 1
1060 %endmacro
1061
1062 ; FIXME: align loads/stores to 16 bytes
1063 %macro MBTREE_AVX 0
1064 cglobal mbtree_propagate_cost, 7,7,8
1065 add r6d, r6d
1066 lea r0, [r0+r6*2]
1067 add r1, r6
1068 add r2, r6
1069 add r3, r6
1070 add r4, r6
1071 neg r6
1072 mova xm5, [pw_3fff]
1073 vbroadcastss m6, [r5]
1074 mulps m6, [pf_inv256]
1075 %if notcpuflag(avx2)
1076 pxor xm7, xm7
1077 %endif
1078 .loop:
1079 %if cpuflag(avx2)
1080 pmovzxwd m0, [r2+r6] ; intra
1081 pmovzxwd m1, [r4+r6] ; invq
1082 pmovzxwd m2, [r1+r6] ; prop
1083 pand xm3, xm5, [r3+r6] ; inter
1084 pmovzxwd m3, xm3
1085 pmaddwd m1, m0
1086 psubd m4, m0, m3
1087 cvtdq2ps m0, m0
1088 cvtdq2ps m1, m1
1089 cvtdq2ps m2, m2
1090 cvtdq2ps m4, m4
1091 fmaddps m1, m1, m6, m2
1092 rcpps m3, m0
1093 mulps m2, m0, m3
1094 mulps m1, m4
1095 addps m4, m3, m3
1096 fnmaddps m4, m2, m3, m4
1097 mulps m1, m4
1098 %else
1099 movu xm0, [r2+r6]
1100 movu xm1, [r4+r6]
1101 movu xm2, [r1+r6]
1102 pand xm3, xm5, [r3+r6]
1103 INT16_UNPACK 0
1104 INT16_UNPACK 1
1105 INT16_UNPACK 2
1106 INT16_UNPACK 3
1107 cvtdq2ps m0, m0
1108 cvtdq2ps m1, m1
1109 cvtdq2ps m2, m2
1110 cvtdq2ps m3, m3
1111 mulps m1, m0
1112 subps m4, m0, m3
1113 mulps m1, m6 ; intra*invq*fps_factor>>8
1114 addps m1, m2 ; prop + (intra*invq*fps_factor>>8)
1115 rcpps m3, m0 ; 1 / intra 1st approximation
1116 mulps m2, m0, m3 ; intra * (1/intra 1st approx)
1117 mulps m2, m3 ; intra * (1/intra 1st approx)^2
1118 mulps m1, m4 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
1119 addps m3, m3 ; 2 * (1/intra 1st approx)
1120 subps m3, m2 ; 2nd approximation for 1/intra
1121 mulps m1, m3 ; / intra
1122 %endif
1123 vcvtps2dq m1, m1
1124 movu [r0+r6*2], m1
1125 add r6, 16
1126 jl .loop
1127 RET
1128 %endmacro
1129
1130 INIT_YMM avx
1131 MBTREE_AVX
1132 INIT_YMM avx2,fma3
1133 MBTREE_AVX