;******************************************************************************
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_TEXT

%macro SCALARPRODUCT 0
; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
;                                     int order, int mul)
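; Equivalent scalar operation (a reference sketch, not part of the original
; source; it assumes the usual semantics of this function: a dot product plus
; a multiply-accumulate back into v1):
;     int sum = 0;
;     for (i = 0; i < order; i++) {
;         sum   += v1[i] * v2[i];
;         v1[i] += mul * v3[i];
;     }
;     return sum;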
cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
    shl     orderq, 1
    movd    m7, mulm
%if mmsize == 16
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
%else
    pshufw  m7, m7, 0
%endif
    pxor    m6, m6
    add     v1q, orderq
    add     v2q, orderq
    add     v3q, orderq
    neg     orderq
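    ; m7 = mul broadcast to every 16-bit lane; m6 accumulates the dot product
    ; in 32-bit lanes. The three pointers now point just past the end of their
    ; arrays, and orderq is a negative byte offset counting up towards zero.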
.loop:
    movu    m0, [v2q + orderq]
    movu    m1, [v2q + orderq + mmsize]
    mova    m4, [v1q + orderq]
    mova    m5, [v1q + orderq + mmsize]
    movu    m2, [v3q + orderq]
    movu    m3, [v3q + orderq + mmsize]
    pmaddwd m0, m4
    pmaddwd m1, m5
    pmullw  m2, m7
    pmullw  m3, m7
    paddd   m6, m0
    paddd   m6, m1
    paddw   m2, m4
    paddw   m3, m5
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    add     orderq, mmsize*2
    jl      .loop
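    ; reduce the per-lane dword sums in m6 to a single 32-bit result in eax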
    HADDD   m6, m0
    movd    eax, m6
    RET
%endmacro

INIT_MMX mmxext
SCALARPRODUCT
INIT_XMM sse2
SCALARPRODUCT

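; %1 is the byte misalignment shared by v2 and v3 (0, 2, ..., 14); the
; non-zero offsets rebuild the misaligned rows from aligned loads with palignr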
%macro SCALARPRODUCT_LOOP 1
align 16
.loop%1:
    sub     orderq, mmsize*2
%if %1
    mova    m1, m4
    mova    m4, [v2q + orderq]
    mova    m0, [v2q + orderq + mmsize]
    palignr m1, m0, %1
    palignr m0, m4, %1
    mova    m3, m5
    mova    m5, [v3q + orderq]
    mova    m2, [v3q + orderq + mmsize]
    palignr m3, m2, %1
    palignr m2, m5, %1
%else
    mova    m0, [v2q + orderq]
    mova    m1, [v2q + orderq + mmsize]
    mova    m2, [v3q + orderq]
    mova    m3, [v3q + orderq + mmsize]
%endif
    %define t0  [v1q + orderq]
    %define t1  [v1q + orderq + mmsize]
%if ARCH_X86_64
    mova    m8, t0
    mova    m9, t1
    %define t0  m8
    %define t1  m9
%endif
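    ; t0/t1 are each used twice below; on x86_64 they are copied into m8/m9
    ; first so each v1 row is read from memory only once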
    pmaddwd m0, t0
    pmaddwd m1, t1
    pmullw  m2, m7
    pmullw  m3, m7
    paddw   m2, t0
    paddw   m3, t1
    paddd   m6, m0
    paddd   m6, m1
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    jg      .loop%1
%if %1
    jmp     .end
%endif
%endmacro

; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
;                                     int order, int mul)
INIT_XMM ssse3
cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
    shl     orderq, 1
    movd    m7, mulm
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
    pxor    m6, m6
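    ; v1 must be 16-byte aligned; v2 and v3 may be misaligned, but by the same
    ; amount (kept in r4d). Both pointers are aligned down, and the per-offset
    ; loops below re-align the loaded data with palignr.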
    mov     r4d, v2d
    and     r4d, 15
    and     v2q, ~15
    and     v3q, ~15
    mova    m4, [v2q + orderq]
    mova    m5, [v3q + orderq]
    ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
    cmp     r4d, 0
    je      .loop0
    cmp     r4d, 2
    je      .loop2
    cmp     r4d, 4
    je      .loop4
    cmp     r4d, 6
    je      .loop6
    cmp     r4d, 8
    je      .loop8
    cmp     r4d, 10
    je      .loop10
    cmp     r4d, 12
    je      .loop12
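; an offset of 14 falls straight through to the first loop below; every other
; offset has been dispatched by the compares above. The non-zero-offset loops
; jump to .end when done, while .loop0 simply falls through into it.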
SCALARPRODUCT_LOOP 14
SCALARPRODUCT_LOOP 12
SCALARPRODUCT_LOOP 10
SCALARPRODUCT_LOOP 8
SCALARPRODUCT_LOOP 6
SCALARPRODUCT_LOOP 4
SCALARPRODUCT_LOOP 2
SCALARPRODUCT_LOOP 0
.end:
    HADDD   m6, m0
    movd    eax, m6
    RET