; Imported Debian version 2.5.0~trusty1.1
; deb_ffmpeg.git / ffmpeg / libavcodec / x86 / lossless_audiodsp.asm
;******************************************************************************
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; Defect fixed: web-viewer export had fused the original line numbers onto the
; start of each line ("21%include" etc.), which is not valid NASM.
%include "libavutil/x86/x86util.asm"

SECTION_TEXT

;-----------------------------------------------------------------------------
; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
;                                     int order, int mul)
;
; Computes sum(v1[i]*v2[i]) and simultaneously performs v1[i] += mul * v3[i],
; processing 2*mmsize bytes of int16 per iteration.  Instantiated once for
; MMXEXT (mmsize == 8) and once for SSE2 (mmsize == 16).
;
; The SSE2 variant requires order*2 to be a multiple of 32 bytes; when order
; has bit 3 set it jumps to the MMXEXT build of this same macro, whose entry
; doubles as the global fallback label (also used by the ssse3 version below).
;
; Defects fixed vs. the exported text: fused line-number prefixes stripped,
; interleaved git-blame residue ("f6fa7814"/"DM" lines) removed, and the
; missing ':' restored on the fallback label definition (NASM otherwise only
; accepts it as an orphan label, with a warning).
;-----------------------------------------------------------------------------
%macro SCALARPRODUCT 0
cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
%if mmsize == 16
    test orderq, 8                      ; order not a multiple of 16 elements?
        jnz scalarproduct_and_madd_int16_fallback
%else
scalarproduct_and_madd_int16_fallback:  ; MMX path handles any order%8 == 0
%endif
    shl orderq, 1                       ; element count -> byte count
    movd    m7, mulm
%if mmsize == 16
    pshuflw m7, m7, 0                   ; splat mul across all 8 words
    punpcklqdq m7, m7
%else
    pshufw  m7, m7, 0                   ; splat mul across all 4 words
%endif
    pxor    m6, m6                      ; m6 = dot-product accumulator
    add v1q, orderq                     ; point past the ends, index negatively
    add v2q, orderq
    add v3q, orderq
    neg orderq
.loop:
    movu    m0, [v2q + orderq]
    movu    m1, [v2q + orderq + mmsize]
    mova    m4, [v1q + orderq]          ; v1 is assumed mmsize-aligned
    mova    m5, [v1q + orderq + mmsize]
    movu    m2, [v3q + orderq]
    movu    m3, [v3q + orderq + mmsize]
    pmaddwd m0, m4                      ; pairwise v1*v2 -> dwords
    pmaddwd m1, m5
    pmullw  m2, m7                      ; mul * v3
    pmullw  m3, m7
    paddd   m6, m0                      ; accumulate dot product
    paddd   m6, m1
    paddw   m2, m4                      ; v1 += mul * v3
    paddw   m3, m5
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    add     orderq, mmsize*2
    jl .loop
    HADDD   m6, m0                      ; horizontal sum of dword lanes
    movd    eax, m6
    RET
%endmacro
; Defect fixed: fused line-number prefixes ("72INIT_MMX" etc.) stripped.
; Instantiate the shared macro for both SIMD widths; the MMXEXT build also
; provides the scalarproduct_and_madd_int16_fallback entry point.
INIT_MMX mmxext
SCALARPRODUCT
INIT_XMM sse2
SCALARPRODUCT

;-----------------------------------------------------------------------------
; SCALARPRODUCT_LOOP %1
;
; One inner loop of the ssse3 scalarproduct_and_madd_int16, specialized for a
; fixed misalignment %1 (0..14 bytes) of v2/v3 relative to 16-byte alignment.
; For %1 != 0 it keeps the previous aligned loads in m4/m5 and uses palignr to
; reconstruct the unaligned data, avoiding movu.  Loops while orderq > 0, then
; (for %1 != 0) jumps to the shared .end epilogue; the %1 == 0 instance falls
; through to .end directly.
;
; Defect fixed vs. the exported text: fused line-number prefixes stripped.
;-----------------------------------------------------------------------------
%macro SCALARPRODUCT_LOOP 1
align 16
.loop%1:
    sub     orderq, mmsize*2
%if %1
    mova    m1, m4                      ; carry over previous v2 block
    mova    m4, [v2q + orderq]
    mova    m0, [v2q + orderq + mmsize]
    palignr m1, m0, %1                  ; stitch unaligned v2 data
    palignr m0, m4, %1
    mova    m3, m5                      ; carry over previous v3 block
    mova    m5, [v3q + orderq]
    mova    m2, [v3q + orderq + mmsize]
    palignr m3, m2, %1                  ; stitch unaligned v3 data
    palignr m2, m5, %1
%else
    mova    m0, [v2q + orderq]
    mova    m1, [v2q + orderq + mmsize]
    mova    m2, [v3q + orderq]
    mova    m3, [v3q + orderq + mmsize]
%endif
    %define t0  [v1q + orderq]
    %define t1  [v1q + orderq + mmsize]
%if ARCH_X86_64
    mova    m8, t0                      ; x86-64 has spare regs: load v1 once
    mova    m9, t1
    %define t0  m8
    %define t1  m9
%endif
    pmaddwd m0, t0                      ; pairwise v1*v2 -> dwords
    pmaddwd m1, t1
    pmullw  m2, m7                      ; mul * v3
    pmullw  m3, m7
    paddw   m2, t0                      ; v1 += mul * v3
    paddw   m3, t1
    paddd   m6, m0                      ; accumulate dot product
    paddd   m6, m1
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    jg  .loop%1
%if %1
    jmp .end
%endif
%endmacro

;-----------------------------------------------------------------------------
; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
;                                     int order, int mul)
;
; ssse3 version: v1 must be 16-byte aligned, but v2/v3 may not be.  The byte
; misalignment of v2 (r4d) selects one of eight palignr-specialized loops; the
; pointers are rounded down to alignment so all loads inside the loops are
; aligned.  Orders with bit 3 set (order not a multiple of 16 elements) are
; punted to the MMXEXT fallback defined by the SCALARPRODUCT macro above.
;
; Defects fixed vs. the exported text: fused line-number prefixes stripped and
; interleaved git-blame residue ("f6fa7814"/"2ba45a60"/"DM" lines) removed.
;-----------------------------------------------------------------------------
INIT_XMM ssse3
cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
    test orderq, 8                      ; odd multiple of 8 elements?
        jnz scalarproduct_and_madd_int16_fallback
    shl orderq, 1                       ; element count -> byte count
    movd    m7, mulm
    pshuflw m7, m7, 0                   ; splat mul across all 8 words
    punpcklqdq m7, m7
    pxor    m6, m6                      ; m6 = dot-product accumulator
    mov    r4d, v2d
    and    r4d, 15                      ; r4d = misalignment of v2 in bytes
    and    v2q, ~15                     ; round v2/v3 down to 16-byte alignment
    and    v3q, ~15
    mova    m4, [v2q + orderq]          ; prime the palignr carry registers
    mova    m5, [v3q + orderq]
    ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
    cmp    r4d, 0
    je .loop0
    cmp    r4d, 2
    je .loop2
    cmp    r4d, 4
    je .loop4
    cmp    r4d, 6
    je .loop6
    cmp    r4d, 8
    je .loop8
    cmp    r4d, 10
    je .loop10
    cmp    r4d, 12
    je .loop12
SCALARPRODUCT_LOOP 14
SCALARPRODUCT_LOOP 12
SCALARPRODUCT_LOOP 10
SCALARPRODUCT_LOOP 8
SCALARPRODUCT_LOOP 6
SCALARPRODUCT_LOOP 4
SCALARPRODUCT_LOOP 2
SCALARPRODUCT_LOOP 0
.end:
    HADDD m6, m0                        ; horizontal sum of dword lanes
    movd eax, m6
    RET