;******************************************************************************
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_TEXT

;------------------------------------------------------------------------------
; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
;                                     int order, int mul)
;
; Computes the dot product of v1 and v2 while simultaneously doing
; v1[i] += mul * v3[i].  Returns the (wrapped-to-32-bit) dot product in eax.
; Instantiated once with INIT_MMX (mmsize == 8) and once with INIT_XMM
; (mmsize == 16); each iteration of .loop consumes mmsize*2 bytes, i.e.
; mmsize int16 elements per vector.
;
; Register roles inside the loop:
;   m7 = mul broadcast to every word lane
;   m6 = running dword dot-product accumulator
;   m4/m5 = v1 data (kept for both the madd and the store-back)
;------------------------------------------------------------------------------
%macro SCALARPRODUCT 0
cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
%if mmsize == 16
    ; The XMM loop handles 16 int16s per iteration.  If order is a multiple
    ; of 8 but not of 16, defer to the MMX version, which handles 8 at a time.
    test orderq, 8
        jnz scalarproduct_and_madd_int16_fallback
%else
; Entry point used by the SSE2/SSSE3 versions when order is not a multiple
; of their wider step.  Must be a label definition (note the colon).
scalarproduct_and_madd_int16_fallback:
%endif
    shl orderq, 1                      ; element count -> byte count
    movd    m7, mulm
%if mmsize == 16
    pshuflw m7, m7, 0                  ; splat mul across all 8 words
    punpcklqdq m7, m7
%else
    pshufw  m7, m7, 0                  ; splat mul across all 4 words
%endif
    pxor    m6, m6                     ; clear dot-product accumulator
    ; Point all three vectors at their ends and count orderq up from the
    ; negative byte length, so one register both indexes and terminates.
    add v1q, orderq
    add v2q, orderq
    add v3q, orderq
    neg orderq
.loop:
    movu    m0, [v2q + orderq]
    movu    m1, [v2q + orderq + mmsize]
    mova    m4, [v1q + orderq]         ; v1 is assumed mmsize-aligned
    mova    m5, [v1q + orderq + mmsize]
    movu    m2, [v3q + orderq]
    movu    m3, [v3q + orderq + mmsize]
    pmaddwd m0, m4                     ; partial dot products v1*v2
    pmaddwd m1, m5
    pmullw  m2, m7                     ; mul * v3 (low 16 bits, as in C)
    pmullw  m3, m7
    paddd   m6, m0                     ; accumulate dot product
    paddd   m6, m1
    paddw   m2, m4                     ; v1 += mul * v3
    paddw   m3, m5
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    add     orderq, mmsize*2
    jl .loop
    HADDD   m6, m0                     ; horizontal sum of m6, m0 is scratch
    movd    eax, m6                    ; return dot product
    RET
%endmacro

; Instantiate the generic loop for both SIMD widths.  The MMX build also
; defines the scalarproduct_and_madd_int16_fallback entry point that the
; wider versions branch to for orders they cannot handle.
INIT_MMX mmxext
SCALARPRODUCT
INIT_XMM sse2
SCALARPRODUCT

;------------------------------------------------------------------------------
; SCALARPRODUCT_LOOP %1
;
; One unrolled loop body of the SSSE3 scalarproduct_and_madd_int16, where
; %1 is the fixed misalignment (in bytes, 0..14, even) of v2/v3 relative to
; a 16-byte boundary.  For a nonzero %1, aligned loads plus palignr
; reconstruct the unaligned data; m4/m5 carry the previously loaded
; (higher-addressed) v2/v3 vectors between iterations.  All variants share
; accumulator m6, multiplier m7 and the .end label of the enclosing function.
;------------------------------------------------------------------------------
%macro SCALARPRODUCT_LOOP 1
align 16
.loop%1:
    sub     orderq, mmsize*2           ; walk downward, two vectors per pass
%if %1
    ; Rebuild unaligned v2 data from aligned loads: shift each pair of
    ; neighbouring aligned vectors together by the misalignment %1.
    mova    m1, m4
    mova    m4, [v2q + orderq]
    mova    m0, [v2q + orderq + mmsize]
    palignr m1, m0, %1
    palignr m0, m4, %1
    ; Same for v3.
    mova    m3, m5
    mova    m5, [v3q + orderq]
    mova    m2, [v3q + orderq + mmsize]
    palignr m3, m2, %1
    palignr m2, m5, %1
%else
    ; Already aligned: plain loads.
    mova    m0, [v2q + orderq]
    mova    m1, [v2q + orderq + mmsize]
    mova    m2, [v3q + orderq]
    mova    m3, [v3q + orderq + mmsize]
%endif
    %define t0  [v1q + orderq]
    %define t1  [v1q + orderq + mmsize]
%if ARCH_X86_64
    ; With 16 XMM registers available, load v1 once instead of reading the
    ; same memory operand twice below.
    mova    m8, t0
    mova    m9, t1
    %define t0  m8
    %define t1  m9
%endif
    pmaddwd m0, t0                     ; partial dot products v1*v2
    pmaddwd m1, t1
    pmullw  m2, m7                     ; mul * v3
    pmullw  m3, m7
    paddw   m2, t0                     ; v1 += mul * v3
    paddw   m3, t1
    paddd   m6, m0                     ; accumulate dot product
    paddd   m6, m1
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    jg .loop%1
%if %1
    jmp .end                           ; only the %1==0 body falls through
%endif
%endmacro

;------------------------------------------------------------------------------
; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
;                                     int order, int mul)
;
; SSSE3 version: v1 is assumed 16-byte aligned; v2 and v3 may be unaligned
; but are assumed to share the same misalignment (both are masked with ~15
; below and indexed identically — NOTE(review): confirm callers guarantee
; this).  The misalignment is resolved once up front, then a loop body
; specialized for that exact palignr shift is selected.
;------------------------------------------------------------------------------
INIT_XMM ssse3
cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
    ; This loop consumes 16 int16s per iteration; odd multiples of 8 go to
    ; the MMX fallback defined by the mmxext instantiation above.
    test    orderq, 8
        jnz scalarproduct_and_madd_int16_fallback
    shl     orderq, 1                  ; element count -> byte count
    movd    m7, mulm
    pshuflw m7, m7, 0                  ; splat mul across all 8 words
    punpcklqdq m7, m7
    pxor    m6, m6                     ; clear dot-product accumulator
    mov     r4d, v2d
    and     r4d, 15                    ; r4d = shared misalignment of v2/v3
    and     v2q, ~15                   ; round both pointers down to 16 bytes
    and     v3q, ~15
    ; Preload the vectors just above the range so the first palignr in the
    ; misaligned loop bodies has valid high-side data.
    mova    m4, [v2q + orderq]
    mova    m5, [v3q + orderq]
    ; linear is faster than branch tree or jump table, because the branches
    ; taken are cyclic (i.e. predictable)
    cmp r4d, 0
    je .loop0
    cmp r4d, 2
    je .loop2
    cmp r4d, 4
    je .loop4
    cmp r4d, 6
    je .loop6
    cmp r4d, 8
    je .loop8
    cmp r4d, 10
    je .loop10
    cmp r4d, 12
    je .loop12
    ; r4d == 14 falls through to the first body.
SCALARPRODUCT_LOOP 14
SCALARPRODUCT_LOOP 12
SCALARPRODUCT_LOOP 10
SCALARPRODUCT_LOOP 8
SCALARPRODUCT_LOOP 6
SCALARPRODUCT_LOOP 4
SCALARPRODUCT_LOOP 2
SCALARPRODUCT_LOOP 0
.end:
    HADDD   m6, m0                     ; horizontal sum of m6, m0 is scratch
    movd    eax, m6                    ; return dot product
    RET