1 ;*****************************************************************************
2 ;* mc-a2.asm: x86 motion compensation
3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2013 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Fiona Glaser <fiona@x264.com>
8 ;* Holger Lubitz <holger@lubitz.org>
9 ;* Mathieu Monnier <manao@melix.net>
10 ;* Oskar Arvidsson <oskar@irock.se>
12 ;* This program is free software; you can redistribute it and/or modify
13 ;* it under the terms of the GNU General Public License as published by
14 ;* the Free Software Foundation; either version 2 of the License, or
15 ;* (at your option) any later version.
17 ;* This program is distributed in the hope that it will be useful,
18 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
19 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 ;* GNU General Public License for more details.
22 ;* You should have received a copy of the GNU General Public License
23 ;* along with this program; if not, write to the Free Software
24 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
26 ;* This program is also available under a commercial proprietary license.
27 ;* For more information, contact us at license @ x265.com.
28 ;*****************************************************************************
31 %include "x86util.asm"
; Constant tables for the deinterleave / lowres / mbtree kernels below.
; NOTE(review): deinterleave_shuf32a/b are defined twice (word masks, then
; byte masks); presumably an elided %if HIGH_BIT_DEPTH selects one pair --
; confirm against the full file before editing.
; pshufb mask: split interleaved bytes into even lanes then odd lanes
35 deinterleave_shuf: times 2 db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15
; word-granularity even/odd selection masks (high-bit-depth path)
38 deinterleave_shuf32a: SHUFFLE_MASK_W 0,2,4,6,8,10,12,14
39 deinterleave_shuf32b: SHUFFLE_MASK_W 1,3,5,7,9,11,13,15
; byte-granularity even/odd selection masks across a 32-byte pair (8-bit path)
41 deinterleave_shuf32a: db 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30
42 deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31
44 pw_1024: times 16 dw 1024
; NOTE(review): label says pd_0f but the value is 0xffff (low-16-bit mask);
; the name is kept for compatibility with existing references.
47 pd_0f: times 4 dd 0xffff
; 1.0/256 as floats: applies the fps_factor>>8 scaling in mbtree_propagate_cost
48 pf_inv256: times 8 dd 0.00390625
62 ;The hpel_filter routines use non-temporal writes for output.
63 ;The following defines may be uncommented for testing.
64 ;Doing the hpel_filter temporal may be a win if the last level cache
65 ;is big enough (preliminary benching suggests on the order of 4* framesize).
68 ;%define movntps movaps
71 %if HIGH_BIT_DEPTH == 0
75 %endif ; !HIGH_BIT_DEPTH
77 ;-----------------------------------------------------------------------------
78 ; void plane_copy_core( pixel *dst, intptr_t i_dst,
79 ; pixel *src, intptr_t i_src, int w, int h )
80 ;-----------------------------------------------------------------------------
81 ; assumes i_dst and w are multiples of 16, and i_dst>w
; Row-by-row plane copy. Only the entry is visible in this excerpt; the copy
; loop itself is elided.
83 cglobal plane_copy_core_mmx2, 6,7
; Scale byte strides/width by sizeof(pixel) (x2 when HIGH_BIT_DEPTH).
84 FIX_STRIDES r1, r3, r4d
85 %if HIGH_BIT_DEPTH == 0
; Interleave one vector each of U (%2) and V (%3) into UV at %1.
; %4 = load alignment suffix (a/u), %5 = optional non-temporal store hint.
136 %macro INTERLEAVE 4-5 ; dst, srcu, srcv, is_aligned, nt_hint
; load U and V source vectors
140 mov%4 m0, [%2+(x/2)*mmsize]
141 mov%4 m1, [%3+(x/2)*mmsize]
; store low/high interleaved halves (m2 = high half, produced by an
; elided punpckl/punpckh step -- not visible in this excerpt; confirm)
144 mov%5a [%1+(x+0)*mmsize], m0
145 mov%5a [%1+(x+1)*mmsize], m2
165 %endif ; HIGH_BIT_DEPTH
; Split one run of interleaved UV at %3 into planar U (%1) and V (%2).
; %5 = shuffle-mask register, %6 = store alignment suffix (a/u).
168 %macro DEINTERLEAVE 6 ; dstu, dstv, src, dstv==dstu+8, shuffle constant, is aligned
; load two adjacent interleaved source vectors
172 mova m0, [%3+(n+0)*mmsize]
173 mova m1, [%3+(n+1)*mmsize]
; store separated planes; the even/odd split producing m0 (U) and m2 (V)
; via %5 is elided from this excerpt -- TODO confirm in full file
180 mov%6 [%1+(n/2)*mmsize], m0
181 mov%6 [%2+(n/2)*mmsize], m2
184 %else ; !HIGH_BIT_DEPTH
214 %endif ; mmsize == 16
215 %endif ; HIGH_BIT_DEPTH
; Instantiates plane_copy_interleave_core and store_interleave_chroma for the
; current INIT_* ISA. Large parts of the loop scaffolding are elided here.
218 %macro PLANE_INTERLEAVE 0
219 ;-----------------------------------------------------------------------------
220 ; void plane_copy_interleave_core( uint8_t *dst, intptr_t i_dst,
221 ; uint8_t *srcu, intptr_t i_srcu,
222 ; uint8_t *srcv, intptr_t i_srcv, int w, int h )
223 ;-----------------------------------------------------------------------------
224 ; assumes i_dst and w are multiples of 16, and i_dst>2*w
225 cglobal plane_copy_interleave_core, 6,9
; scale strides/width by sizeof(pixel)
228 FIX_STRIDES r1, r3, r5, r6d
; main loop body: interleave 16 pixels per iteration, unaligned loads,
; non-temporal stores (dst stride assumed mod-16)
256 INTERLEAVE r0+r6*2+ 0*SIZEOF_PIXEL, r2+r6+0*SIZEOF_PIXEL, r4+r6+0*SIZEOF_PIXEL, u, nt
257 INTERLEAVE r0+r6*2+16*SIZEOF_PIXEL, r2+r6+8*SIZEOF_PIXEL, r4+r6+8*SIZEOF_PIXEL, u, nt
258 add r6, 16*SIZEOF_PIXEL
; pad tail: replicate one value (m0) out to the mod-16 boundary; all four
; stores intentionally write the same register
264 movntq [r0+r6*2+(n+ 0)], m0
265 movntq [r0+r6*2+(n+ 8)], m0
266 movntq [r0+r6*2+(n+16)], m0
267 movntq [r0+r6*2+(n+24)], m0
; SSE2 variant of the same padding
269 movntdq [r0+r6*2+(n+ 0)], m0
270 movntdq [r0+r6*2+(n+16)], m0
274 add r6, 16*SIZEOF_PIXEL
286 ;-----------------------------------------------------------------------------
287 ; void store_interleave_chroma( uint8_t *dst, intptr_t i_dst, uint8_t *srcu, uint8_t *srcv, int height )
288 ;-----------------------------------------------------------------------------
; Interleaves two fdec-stride chroma rows per iteration, aligned accesses.
289 cglobal store_interleave_chroma, 5,5
292 INTERLEAVE r0+ 0, r2+ 0, r3+ 0, a
293 INTERLEAVE r0+r1, r2+FDEC_STRIDEB, r3+FDEC_STRIDEB, a
; advance source pointers by two rows
294 add r2, FDEC_STRIDEB*2
295 add r3, FDEC_STRIDEB*2
300 %endmacro ; PLANE_INTERLEAVE
; Loads the pshufb deinterleave mask into m4 for the 8-bit path; the
; HIGH_BIT_DEPTH branch of this conditional is elided from this excerpt.
302 %macro DEINTERLEAVE_START 0
306 mova m4, [deinterleave_shuf]
309 %endif ; HIGH_BIT_DEPTH
; Instantiates plane_copy_deinterleave and the load_deinterleave_chroma_*
; helpers for the current ISA. Loop scaffolding is elided in this excerpt.
312 %macro PLANE_DEINTERLEAVE 0
313 ;-----------------------------------------------------------------------------
314 ; void plane_copy_deinterleave( pixel *dstu, intptr_t i_dstu,
315 ; pixel *dstv, intptr_t i_dstv,
316 ; pixel *src, intptr_t i_src, int w, int h )
317 ;-----------------------------------------------------------------------------
318 cglobal plane_copy_deinterleave, 6,7
; scale strides/width by sizeof(pixel)
321 FIX_STRIDES r1, r3, r5, r6d
; split 16 interleaved pixels per iteration into the U and V planes
; (unaligned stores; m4 holds the shuffle mask from DEINTERLEAVE_START)
332 DEINTERLEAVE r0+r6+0*SIZEOF_PIXEL, r2+r6+0*SIZEOF_PIXEL, r4+r6*2+ 0*SIZEOF_PIXEL, 0, m4, u
333 DEINTERLEAVE r0+r6+8*SIZEOF_PIXEL, r2+r6+8*SIZEOF_PIXEL, r4+r6*2+16*SIZEOF_PIXEL, 0, m4, u
334 add r6, 16*SIZEOF_PIXEL
343 ;-----------------------------------------------------------------------------
344 ; void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, intptr_t i_src, int height )
345 ;-----------------------------------------------------------------------------
; U goes to dst, V to dst+FENC_STRIDEB/2 (the 4th arg "1" marks dstv==dstu+8)
346 cglobal load_deinterleave_chroma_fenc, 4,4
350 DEINTERLEAVE r0+ 0, r0+FENC_STRIDEB*1/2, r1+ 0, 1, m4, a
351 DEINTERLEAVE r0+FENC_STRIDEB, r0+FENC_STRIDEB*3/2, r1+r2, 1, m4, a
; advance destination by two fenc rows
352 add r0, FENC_STRIDEB*2
358 ;-----------------------------------------------------------------------------
359 ; void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, intptr_t i_src, int height )
360 ;-----------------------------------------------------------------------------
; same as above but with fdec strides and separate (non-adjacent) halves
361 cglobal load_deinterleave_chroma_fdec, 4,4
365 DEINTERLEAVE r0+ 0, r0+FDEC_STRIDEB*1/2, r1+ 0, 0, m4, a
366 DEINTERLEAVE r0+FDEC_STRIDEB, r0+FDEC_STRIDEB*3/2, r1+r2, 0, m4, a
367 add r0, FDEC_STRIDEB*2
372 %endmacro ; PLANE_DEINTERLEAVE
397 ; These functions are not general-use; not only do the SSE ones require aligned input,
398 ; but they also will fail if given a non-mod16 size.
399 ; memzero SSE will fail for non-mod128.
401 ;-----------------------------------------------------------------------------
402 ; void *memcpy_aligned( void *dst, const void *src, size_t n );
403 ;-----------------------------------------------------------------------------
; Copies backwards from dst+n / src+n using r2 (= n) as a descending offset;
; requires size and pointers aligned to the vector width.
405 cglobal memcpy_aligned, 3,3
; 2-register unrolled step (copies 2*mmsize per iteration)
416 mova m0, [r1+r2-1*mmsize]
417 mova m1, [r1+r2-2*mmsize]
418 mova [r0+r2-1*mmsize], m0
419 mova [r0+r2-2*mmsize], m1
; 4-register unrolled step (copies 4*mmsize per iteration)
425 mova m0, [r1+r2-1*mmsize]
426 mova m1, [r1+r2-2*mmsize]
427 mova m2, [r1+r2-3*mmsize]
428 mova m3, [r1+r2-4*mmsize]
429 mova [r0+r2-1*mmsize], m0
430 mova [r0+r2-2*mmsize], m1
431 mova [r0+r2-3*mmsize], m2
432 mova [r0+r2-4*mmsize], m3
444 ;-----------------------------------------------------------------------------
445 ; void *memzero_aligned( void *dst, size_t n );
446 ;-----------------------------------------------------------------------------
; Stores a zeroed vector (m0) across the buffer; per the note above, the SSE
; version requires n to be a multiple of 128. Loop setup is elided here.
448 cglobal memzero_aligned, 2,2
459 mova [r0 + r1 + i], m0
474 %if HIGH_BIT_DEPTH == 0
475 ;-----------------------------------------------------------------------------
476 ; void integral_init4h( uint16_t *sum, uint8_t *pix, intptr_t stride )
477 ;-----------------------------------------------------------------------------
; Builds one row of 4-pixel horizontal sums, accumulated on top of the
; previous row ([r0+...]) to extend the integral image vertically.
478 %macro INTEGRAL_INIT4H 0
479 cglobal integral_init4h, 3,4
; add the previous row's running sums into this row's horizontal sums
495 paddw m1, [r0+r2*2+mmsize]
497 mova [r3+r2*2+mmsize], m1
; Same as integral_init4h but for 8-pixel sums: two mpsadbw results (4-pixel
; SADs at offsets selected by the immediate) are combined into 8-pixel sums.
508 %macro INTEGRAL_INIT8H 0
509 cglobal integral_init8h, 3,4
; imm 100100b: both 128-bit lanes use the +4 source block offset
518 mpsadbw m2, m0, m4, 100100b
519 mpsadbw m3, m1, m4, 100100b
; imm 100b: lower-lane form of the same selection (non-AVX2 path, presumably)
523 mpsadbw m2, m0, m4, 100b
524 mpsadbw m3, m1, m4, 100b
; accumulate on top of the previous integral row and store
529 paddw m1, [r0+r2*2+mmsize]
533 mova [r3+r2*2+mmsize], m1
547 %macro INTEGRAL_INIT_8V 0
548 ;-----------------------------------------------------------------------------
549 ; void integral_init8v( uint16_t *sum8, intptr_t stride )
550 ;-----------------------------------------------------------------------------
; Converts running column sums into 8-row box sums in place:
; sum8[x] = sum[x + 8*stride] - sum[x]  (r2 presumably = r0 + 8 rows).
551 cglobal integral_init8v, 3,3
558 mova m1, [r2+r1+mmsize]
560 psubw m1, [r0+r1+mmsize]
562 mova [r0+r1+mmsize], m1
575 ;-----------------------------------------------------------------------------
576 ; void integral_init4v( uint16_t *sum8, uint16_t *sum4, intptr_t stride )
577 ;-----------------------------------------------------------------------------
; Several per-ISA implementations of the same entry point follow; only
; scattered lines of each are visible in this excerpt.
579 cglobal integral_init4v, 3,5
601 cglobal integral_init4v, 3,5
; shift in the next 16 bytes to form sums offset by 8 pixels
613 shufpd m0, [r0+r2+16], 1
614 shufpd m1, [r4+r2+16], 1
627 cglobal integral_init4v, 3,5
653 cglobal integral_init4v, 3,5
; 4-row box sum: current row plus the row 8 bytes (4 words) ahead
663 paddw m0, m2, [r0+r2+8]
; Fragments of the FILT8x4/FILT16x2/FILT32x4 helper macros used by
; frame_init_lowres_core: each lowres pixel is the pavg of a 2x2 source
; block (horizontal averages of vertical averages). Macro headers and much
; of each body are elided; treat register roles below as approximate.
; vertical average of the next source row pair
678 pavgb %4, [r0+r5*2+%7]
; shift by one byte to pair each pixel with its right neighbour
679 PALIGNR %1, %3, 1, m6
680 PALIGNR %2, %4, 1, m6
700 pavgb m3, [r0+r5*2+1]
; second vector of the row: rows r0/r0+r5/r0+r5*2 at +0 and +1 offsets
704 mova m3, [r0+r5+mmsize]
705 pavgb m2, m3, [r0+mmsize]
706 movu m5, [r0+r5+1+mmsize]
707 pavgb m4, m5, [r0+1+mmsize]
708 pavgb m3, [r0+r5*2+mmsize]
709 pavgb m5, [r0+r5*2+1+mmsize]
; regroup 64-bit halves so each output plane is contiguous
717 punpckhqdq m4, m0, m2
718 punpcklqdq m0, m0, m2
719 punpckhqdq m5, m1, m3
720 punpcklqdq m2, m1, m3
; 8-bit XOP path: vpperm with deinterleave masks (m6/m7) splits even/odd
732 mova m3, [r0+%4+mmsize]
734 pavgb m3, [r0+%4+r5+mmsize]
736 PALIGNR %1, m3, 1, m6
738 PALIGNR m3, m2, 1, m6
741 vpperm m5, m3, %1, m7
742 vpperm m3, m3, %1, m6
; MMX-width 8-bit path (8-byte offsets, byte shifts)
759 pavgb m3, [r0+%3+r5+8]
763 pavgb m1, [r0+%3+r5+9]
764 pavgb m0, [r0+%3+r5+1]
; high-bit-depth path: word averages, so offsets/shifts double (8->10, 1->2)
780 pavgw m3, [r0+%3+r5+8]
784 pavgw m1, [r0+%3+r5+10]
785 pavgw m0, [r0+%3+r5+2]
799 mova m3, [r0+%4+mmsize]
801 pavgw m3, [r0+%4+r5+mmsize]
803 PALIGNR %1, m3, 2, m6
805 PALIGNR m3, m2, 2, m6
808 vpperm m5, m3, %1, m7
809 vpperm m3, m3, %1, m6
823 ;-----------------------------------------------------------------------------
824 ; void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
825 ; intptr_t src_stride, intptr_t dst_stride, int width, int height )
826 ;-----------------------------------------------------------------------------
; Produces the four half-pel-offset lowres planes (dst0/dsth/dstv/dstc) by
; 2x2 averaging of src0. Processes bottom-up (pointers advanced to the end);
; most of the loop scaffolding is elided in this excerpt.
827 %macro FRAME_INIT_LOWRES 0
828 cglobal frame_init_lowres_core, 6,7,(12-4*(BIT_DEPTH/9)) ; 8 for HIGH_BIT_DEPTH, 12 otherwise
; round width up to a whole vector
835 add dword r7m, mmsize-1
836 and dword r7m, ~(mmsize-1)
838 ; src += 2*(height-1)*stride + 2*width
844 ; dst += (height-1)*stride + width
853 ; gap = stride - width
; per-row pointer rewind amounts kept on the stack
857 %define dst_gap [rsp+gprsize]
862 %define src_gap [rsp]
; HIGH_BIT_DEPTH path: word-wise deinterleave masks
865 mova m6, [deinterleave_shuf32a]
866 mova m7, [deinterleave_shuf32b]
873 %ifnidn cpuname, mmx2
889 FILT8xA m0, r1, r2, 0
890 FILT8xA m1, r3, r4, r5
894 %else ; !HIGH_BIT_DEPTH
; 8-bit path: pshufb mask, or byte-wise masks for the 32-wide variant
896 mova m7, [deinterleave_shuf]
898 mova m6, [deinterleave_shuf32a]
899 mova m7, [deinterleave_shuf32b]
906 %ifnidn cpuname, mmx2
921 FILT32x4U r1, r2, r3, r4
923 FILT8x4 m0, m1, m2, m3, m10, m11, mmsize
926 FILT8x4 m2, m3, m0, m1, m4, m5, 0
; XOP: vpperm splits even/odd bytes of the filtered pairs into the 4 planes
928 vpperm m4, m2, m8, m7
929 vpperm m2, m2, m8, m6
930 vpperm m5, m3, m9, m7
931 vpperm m3, m3, m9, m6
942 %elifidn cpuname, mmx2
946 FILT16x2 m0, r1, r2, 0
947 FILT16x2 m1, r3, r4, r5
951 %endif ; HIGH_BIT_DEPTH
964 %endmacro ; FRAME_INIT_LOWRES
; instantiation for the cache-split mmx2 variant
969 INIT_MMX cache32, mmx2
980 %if HIGH_BIT_DEPTH==0
985 ;-----------------------------------------------------------------------------
986 ; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
987 ; uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
988 ;-----------------------------------------------------------------------------
; Computes, per macroblock:
;   dst = (prop + intra*invq*fps_factor/256) * (intra - inter) / intra
; using a Newton-refined rcpps for the division. 4 elements per iteration.
990 cglobal mbtree_propagate_cost, 7,7,7
; broadcast *fps_factor and pre-scale by 1/256
1000 shufps xmm6, xmm6, 0
1001 mulps xmm6, [pf_inv256]
1002 movdqa xmm5, [pw_3fff]
; load 4 uint16 values of each input and widen to dwords (xmm4 = zero,
; presumably -- set in elided setup code; confirm)
1004 movq xmm2, [r2+r6] ; intra
1005 movq xmm0, [r4+r6] ; invq
1006 movq xmm3, [r3+r6] ; inter
1007 movq xmm1, [r1+r6] ; prop
1008 punpcklwd xmm2, xmm4
1009 punpcklwd xmm0, xmm4
1012 punpcklwd xmm1, xmm4
1013 punpcklwd xmm3, xmm4
; FMA4 variants of the multiply-add / negated multiply-add steps
1017 fmaddps xmm0, xmm0, xmm6, xmm1
1024 addps xmm2, xmm3, xmm3
1025 fnmaddps xmm3, xmm1, xmm3, xmm2
; scalar SSE sequence (each line's comment documents the running value)
1029 mulps xmm0, xmm6 ; intra*invq*fps_factor>>8
1030 cvtdq2ps xmm1, xmm1 ; prop
1031 addps xmm0, xmm1 ; prop + (intra*invq*fps_factor>>8)
1032 cvtdq2ps xmm1, xmm2 ; intra
1033 psubd xmm2, xmm3 ; intra - inter
1034 cvtdq2ps xmm2, xmm2 ; intra - inter
1035 rcpps xmm3, xmm1 ; 1 / intra 1st approximation
1036 mulps xmm1, xmm3 ; intra * (1/intra 1st approx)
1037 mulps xmm1, xmm3 ; intra * (1/intra 1st approx)^2
1038 mulps xmm0, xmm2 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
1039 addps xmm3, xmm3 ; 2 * (1/intra 1st approx)
1040 subps xmm3, xmm1 ; 2nd approximation for 1/intra
1041 mulps xmm0, xmm3 ; / intra
; store 4 int results (conversion to int is elided in this excerpt)
1044 movdqa [r0+r6*2], xmm0
1052 ; Bulldozer only has a 128-bit float unit, so the AVX version of this function is actually slower.
; Widen 8 packed uint16 in xm%1 to 8 dwords spread across ymm%1
; (pre-AVX2 path, where integer ops are 128-bit only).
1056 %macro INT16_UNPACK 1
1057 vpunpckhwd xm4, xm%1, xm7
1058 vpunpcklwd xm%1, xm7
1059 vinsertf128 m%1, m%1, xm4, 1
1062 ; FIXME: align loads/stores to 16 bytes
; 256-bit version of mbtree_propagate_cost; same formula as the SSE version
; above, 8 elements per iteration. Loop control is elided in this excerpt.
1064 cglobal mbtree_propagate_cost, 7,7,8
; broadcast *fps_factor and pre-scale by 1/256
1073 vbroadcastss m6, [r5]
1074 mulps m6, [pf_inv256]
1075 %if notcpuflag(avx2)
; AVX2 path: zero-extending loads; xm5 presumably holds the pw_3fff
; saturation mask for inter (set in elided code -- confirm)
1080 pmovzxwd m0, [r2+r6] ; intra
1081 pmovzxwd m1, [r4+r6] ; invq
1082 pmovzxwd m2, [r1+r6] ; prop
1083 pand xm3, xm5, [r3+r6] ; inter
; FMA3 forms of the multiply-add / negated multiply-add steps
1091 fmaddps m1, m1, m6, m2
1096 fnmaddps m4, m2, m3, m4
1102 pand xm3, xm5, [r3+r6]
; non-FMA sequence, mirroring the SSE version's Newton iteration
1113 mulps m1, m6 ; intra*invq*fps_factor>>8
1114 addps m1, m2 ; prop + (intra*invq*fps_factor>>8)
1115 rcpps m3, m0 ; 1 / intra 1st approximation
1116 mulps m2, m0, m3 ; intra * (1/intra 1st approx)
1117 mulps m2, m3 ; intra * (1/intra 1st approx)^2
1118 mulps m1, m4 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
1119 addps m3, m3 ; 2 * (1/intra 1st approx)
1120 subps m3, m2 ; 2nd approximation for 1/intra
1121 mulps m1, m3 ; / intra