;******************************************************************************
;* linear least squares model
;*
;* Copyright (c) 2013 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86util.asm"

SECTION .text

; Covariance-matrix layout constants. Rows are padded from MAX_VARS to
; MAX_VARS_ALIGN entries so that each row stays 16/32-byte aligned for
; the vector loads/stores below.
%define MAX_VARS       32
%define MAX_VARS_ALIGN (MAX_VARS+4)
%define COVAR_STRIDE   MAX_VARS_ALIGN*8
; COVAR(x,y): address of covariance[y][x] relative to the current covarq base.
%define COVAR(x,y)     [covarq + (x)*8 + (y)*COVAR_STRIDE]

; NOTE(review): presumably mirrors the C-side LLSModel struct — the field
; order and sizes here must stay in sync with the C declaration.
struc LLSModel
    .covariance:  resq MAX_VARS_ALIGN*MAX_VARS_ALIGN
    .coeff:       resq MAX_VARS*MAX_VARS
    .variance:    resq MAX_VARS
    .indep_count: resd 1
endstruc

; ADDPD_MEM mem, reg
; reg += mem; mem = reg   (read-modify-write accumulate into memory)
; Uses the 3-operand VEX form when assembling the AVX variant so the
; legacy-SSE encoding is never emitted inside AVX code.
%macro ADDPD_MEM 2
%if cpuflag(avx)
    vaddpd %2, %2, %1
%else
    addpd  %2, %1
%endif
    mova   %1, %2
%endmacro

;------------------------------------------------------------------------------
; void update_lls(LLSModel *m, const double *var)
; Accumulate the pairwise products var[x]*var[y] into m->covariance,
; processing the matrix in 2-wide (xmm) strips. iq is biased negative
; (counts up towards 0) so loop termination is a compare against small
; immediates instead of indep_count.
; Clobbers: xmm0-7, flags.
;------------------------------------------------------------------------------
INIT_XMM sse2
%define movdqa movaps
cglobal update_lls, 2,5,8, ctx, var, i, j, covar2
%define covarq ctxq
    mov     id,      [ctxq + LLSModel.indep_count]
    lea     varq,    [varq + iq*8]
    neg     iq                      ; iq = -indep_count, counts up to 0
    mov     covar2q, covarq
.loopi:
    ; Compute all 3 pairwise products of a 2x2 block that lies on the diagonal
    mova    m1, [varq + iq*8]
    mova    m3, [varq + iq*8 + 16]
    pshufd  m4, m1, q1010           ; broadcast var[i]   across lanes
    pshufd  m5, m1, q3232           ; broadcast var[i+1] across lanes
    pshufd  m6, m3, q1010
    pshufd  m7, m3, q3232
    mulpd   m0, m1, m4
    mulpd   m1, m1, m5
    lea     covarq, [covar2q + 16]
    ADDPD_MEM COVAR(-2,0), m0
    ADDPD_MEM COVAR(-2,1), m1
    lea     jq, [iq + 2]
    cmp     jd, -2
    jg      .skip4x4
.loop4x4:
    ; Compute all 16 pairwise products of a 4x4 block
    mulpd   m0, m4, m3
    mulpd   m1, m5, m3
    mulpd   m2, m6, m3
    mulpd   m3, m3, m7
    ADDPD_MEM COVAR(0,0), m0
    ADDPD_MEM COVAR(0,1), m1
    ADDPD_MEM COVAR(0,2), m2
    ADDPD_MEM COVAR(0,3), m3
    mova    m3, [varq + jq*8 + 16]
    mulpd   m0, m4, m3
    mulpd   m1, m5, m3
    mulpd   m2, m6, m3
    mulpd   m3, m3, m7
    ADDPD_MEM COVAR(2,0), m0
    ADDPD_MEM COVAR(2,1), m1
    ADDPD_MEM COVAR(2,2), m2
    ADDPD_MEM COVAR(2,3), m3
    mova    m3, [varq + jq*8 + 32]
    add     covarq, 32
    add     jq, 4
    cmp     jd, -2
    jle     .loop4x4
.skip4x4:
    ; At most one 2-wide column remains in this strip
    test    jd, jd
    jg      .skip2x4
    mulpd   m4, m3
    mulpd   m5, m3
    mulpd   m6, m3
    mulpd   m7, m3
    ADDPD_MEM COVAR(0,0), m4
    ADDPD_MEM COVAR(0,1), m5
    ADDPD_MEM COVAR(0,2), m6
    ADDPD_MEM COVAR(0,3), m7
.skip2x4:
    add     iq, 4
    add     covar2q, 4*COVAR_STRIDE+32
    cmp     id, -2
    jle     .loopi
    test    id, id
    jg      .ret
    ; Tail: handle the final odd column one 2x1 product at a time
    mov     jq, iq
%define covarq covar2q
.loop2x1:
    movsd   m0, [varq + iq*8]
    movlhps m0, m0                  ; duplicate var[i] into both lanes
    mulpd   m0, [varq + jq*8]
    ADDPD_MEM COVAR(0,0), m0
    inc     iq
    add     covarq, COVAR_STRIDE
    test    id, id
    jle     .loop2x1
.ret:
    REP_RET

%if HAVE_AVX_EXTERNAL
;------------------------------------------------------------------------------
; void update_lls(LLSModel *m, const double *var)
; AVX variant: same accumulation as the SSE2 version but in 4-wide (ymm)
; strips, with iq counting up from 0 against count/count2 instead of the
; negative-bias trick.
; Clobbers: ymm0-7, flags.
;------------------------------------------------------------------------------
INIT_YMM avx
cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
%define covarq ctxq
    mov     countd,  [ctxq + LLSModel.indep_count]
    lea     count2d, [countq-2]     ; last index at which a full 4-wide load fits
    xor     id,      id
.loopi:
    ; Compute all 10 pairwise products of a 4x4 block that lies on the diagonal
    mova          ymm1, [varq + iq*8]
    vbroadcastsd  ymm4, [varq + iq*8]
    vbroadcastsd  ymm5, [varq + iq*8 + 8]
    vbroadcastsd  ymm6, [varq + iq*8 + 16]
    vbroadcastsd  ymm7, [varq + iq*8 + 24]
    vextractf128  xmm3, ymm1, 1
    vmulpd        ymm0, ymm1, ymm4
    vmulpd        ymm1, ymm1, ymm5
    vmulpd        xmm2, xmm3, xmm6
    vmulpd        xmm3, xmm3, xmm7
    ADDPD_MEM COVAR(iq  ,0), ymm0
    ADDPD_MEM COVAR(iq  ,1), ymm1
    ADDPD_MEM COVAR(iq+2,2), xmm2
    ADDPD_MEM COVAR(iq+2,3), xmm3
    lea     jd, [iq + 4]
    cmp     jd, count2d
    jg      .skip4x4
.loop4x4:
    ; Compute all 16 pairwise products of a 4x4 block
    mova    ymm3, [varq + jq*8]
    vmulpd  ymm0, ymm3, ymm4
    vmulpd  ymm1, ymm3, ymm5
    vmulpd  ymm2, ymm3, ymm6
    vmulpd  ymm3, ymm3, ymm7
    ADDPD_MEM COVAR(jq,0), ymm0
    ADDPD_MEM COVAR(jq,1), ymm1
    ADDPD_MEM COVAR(jq,2), ymm2
    ADDPD_MEM COVAR(jq,3), ymm3
    add     jd, 4
    cmp     jd, count2d
    jle     .loop4x4
.skip4x4:
    ; At most one 2-wide column remains in this strip
    cmp     jd, countd
    jg      .skip2x4
    mova    xmm3, [varq + jq*8]
    vmulpd  xmm0, xmm3, xmm4
    vmulpd  xmm1, xmm3, xmm5
    vmulpd  xmm2, xmm3, xmm6
    vmulpd  xmm3, xmm3, xmm7
    ADDPD_MEM COVAR(jq,0), xmm0
    ADDPD_MEM COVAR(jq,1), xmm1
    ADDPD_MEM COVAR(jq,2), xmm2
    ADDPD_MEM COVAR(jq,3), xmm3
.skip2x4:
    add     id, 4
    add     covarq, 4*COVAR_STRIDE
    cmp     id, count2d
    jle     .loopi
    cmp     id, countd
    jg      .ret
    ; Tail: handle the final columns one 2x1 product at a time
    mov     jd, id
.loop2x1:
    vmovddup xmm0, [varq + iq*8]    ; duplicate var[i] into both lanes
    vmulpd   xmm0, [varq + jq*8]
    ADDPD_MEM COVAR(jq,0), xmm0
    inc     id
    add     covarq, COVAR_STRIDE
    cmp     id, countd
    jle     .loop2x1
.ret:
    REP_RET
%endif

;------------------------------------------------------------------------------
; double evaluate_lls(LLSModel *m, const double *var, int order)
; Dot product of var[0..order] with m->coeff[order][0..order].
; Returns the sum in xmm0 (x86-64) or on the x87 stack (x86-32).
; Clobbers: xmm0-1, flags.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal evaluate_lls, 3,4,2, ctx, var, order, i
    ; This function is often called on the same buffer as update_lls, but with
    ; an offset. They can't both be aligned.
    ; Load halves rather than movu to avoid store-forwarding stalls, since the
    ; input was initialized immediately prior to this function using scalar math.
%define coefsq ctxq
    mov     id,     orderd
    imul    orderd, MAX_VARS
    lea     coefsq, [ctxq + LLSModel.coeff + orderq*8]  ; row `order` of coeff
    movsd   m0, [varq]
    movhpd  m0, [varq + 8]
    mulpd   m0, [coefsq]
    lea     coefsq, [coefsq + iq*8]
    lea     varq,   [varq + iq*8]
    neg     iq                      ; iq = -order, counts up to 0
    add     iq, 2                   ; first pair already consumed above
.loop:
    movsd   m1, [varq + iq*8]
    movhpd  m1, [varq + iq*8 + 8]
    mulpd   m1, [coefsq + iq*8]
    addpd   m0, m1
    add     iq, 2
    jl      .loop
    jg      .skip1
    ; order was even: one scalar element remains
    movsd   m1, [varq + iq*8]
    mulsd   m1, [coefsq + iq*8]
    addpd   m0, m1
.skip1:
    ; Horizontal add of the two accumulator lanes
    movhlps m1, m0
    addsd   m0, m1
%if ARCH_X86_32
    ; 32-bit ABI returns doubles on the x87 stack; bounce through memory
    movsd   r0m, m0
    fld     qword r0m
%endif
    RET
