;******************************************************************************
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_TEXT

%macro SCALARPRODUCT 0
; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
;                                     int order, int mul)
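;
; Reference semantics (a plain-C sketch added here for orientation, not part
; of the original source): the routine returns the dot product of v1 and v2
; over 'order' int16 elements and, as a side effect, adds mul*v3[i] to v1[i]
; using the old v1 values for the dot product. 'order' is assumed to be a
; multiple of the number of elements processed per loop iteration.
;
;   int32_t sum = 0;
;   for (int i = 0; i < order; i++) {
;       sum   += v1[i] * v2[i];      // accumulate before updating v1[i]
;       v1[i] += mul * v3[i];        // int16_t wrap-around arithmetic
;   }
;   return sum;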
cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
    shl     orderq, 1
    movd    m7, mulm
%if mmsize == 16
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
%else
    pshufw  m7, m7, 0
%endif
    pxor    m6, m6
    add     v1q, orderq
    add     v2q, orderq
    add     v3q, orderq
    neg     orderq
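; Main loop: orderq runs from -order*2 (bytes) up to 0 in steps of 2*mmsize,
; so each pass handles 2*mmsize bytes of every vector. v1 is accessed with
; aligned loads/stores (mova) and is therefore assumed mmsize-aligned, while
; v2 and v3 may be unaligned (movu).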
.loop:
    movu    m0, [v2q + orderq]
    movu    m1, [v2q + orderq + mmsize]
    mova    m4, [v1q + orderq]
    mova    m5, [v1q + orderq + mmsize]
    movu    m2, [v3q + orderq]
    movu    m3, [v3q + orderq + mmsize]
    pmaddwd m0, m4
    pmaddwd m1, m5
    pmullw  m2, m7
    pmullw  m3, m7
    paddd   m6, m0
    paddd   m6, m1
    paddw   m2, m4
    paddw   m3, m5
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    add     orderq, mmsize*2
    jl .loop
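; HADDD (from x86util.asm) horizontally adds the dword partial sums in m6,
; using m0 as scratch; the total ends up in the low dword and is returned
; in eax.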
    HADDD   m6, m0
    movd    eax, m6
    RET
%endmacro

INIT_MMX mmxext
SCALARPRODUCT
INIT_XMM sse2
SCALARPRODUCT

%macro SCALARPRODUCT_LOOP 1
align 16
.loop%1:
    sub     orderq, mmsize*2
%if %1
    mova    m1, m4
    mova    m4, [v2q + orderq]
    mova    m0, [v2q + orderq + mmsize]
    palignr m1, m0, %1
    palignr m0, m4, %1
    mova    m3, m5
    mova    m5, [v3q + orderq]
    mova    m2, [v3q + orderq + mmsize]
    palignr m3, m2, %1
    palignr m2, m5, %1
%else
    mova    m0, [v2q + orderq]
    mova    m1, [v2q + orderq + mmsize]
    mova    m2, [v3q + orderq]
    mova    m3, [v3q + orderq + mmsize]
%endif
    %define t0 [v1q + orderq]
    %define t1 [v1q + orderq + mmsize]
%if ARCH_X86_64
    mova    m8, t0
    mova    m9, t1
    %define t0 m8
    %define t1 m9
%endif
    pmaddwd m0, t0
    pmaddwd m1, t1
    pmullw  m2, m7
    pmullw  m3, m7
    paddw   m2, t0
    paddw   m3, t1
    paddd   m6, m0
    paddd   m6, m1
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    jg .loop%1
%if %1
    jmp .end
%endif
%endmacro
115 | ||
116 | ; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, | |
117 | ; int order, int mul) | |
118 | INIT_XMM ssse3 | |
119 | cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul | |
120 | shl orderq, 1 | |
121 | movd m7, mulm | |
122 | pshuflw m7, m7, 0 | |
123 | punpcklqdq m7, m7 | |
124 | pxor m6, m6 | |
125 | mov r4d, v2d | |
126 | and r4d, 15 | |
127 | and v2q, ~15 | |
128 | and v3q, ~15 | |
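; r4d now holds the byte misalignment of v2 relative to 16-byte alignment, and
; both v2 and v3 are rounded down to aligned addresses. This only works if v2
; and v3 share the same misalignment, which the callers appear to guarantee.
; The dispatch below selects the SCALARPRODUCT_LOOP specialized for that
; offset; m4/m5 preload the topmost aligned blocks for the palignr stitching.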
    mova    m4, [v2q + orderq]
    mova    m5, [v3q + orderq]
; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
    cmp     r4d, 0
    je .loop0
    cmp     r4d, 2
    je .loop2
    cmp     r4d, 4
    je .loop4
    cmp     r4d, 6
    je .loop6
    cmp     r4d, 8
    je .loop8
    cmp     r4d, 10
    je .loop10
    cmp     r4d, 12
    je .loop12
SCALARPRODUCT_LOOP 14
SCALARPRODUCT_LOOP 12
SCALARPRODUCT_LOOP 10
SCALARPRODUCT_LOOP 8
SCALARPRODUCT_LOOP 6
SCALARPRODUCT_LOOP 4
SCALARPRODUCT_LOOP 2
SCALARPRODUCT_LOOP 0
.end:
    HADDD   m6, m0
    movd    eax, m6
    RET