Imported Debian version 2.4.3~trusty1
[deb_ffmpeg.git] / ffmpeg / libavutil / x86 / lls.asm
CommitLineData
2ba45a60
DM
1;******************************************************************************
2;* linear least squares model
3;*
4;* Copyright (c) 2013 Loren Merritt
5;*
6;* This file is part of FFmpeg.
7;*
8;* FFmpeg is free software; you can redistribute it and/or
9;* modify it under the terms of the GNU Lesser General Public
10;* License as published by the Free Software Foundation; either
11;* version 2.1 of the License, or (at your option) any later version.
12;*
13;* FFmpeg is distributed in the hope that it will be useful,
14;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16;* Lesser General Public License for more details.
17;*
18;* You should have received a copy of the GNU Lesser General Public
19;* License along with FFmpeg; if not, write to the Free Software
20;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21;******************************************************************************
22
23%include "x86util.asm"
24
25SECTION .text
26
; Sizing and addressing constants for the covariance matrix.
; MAX_VARS_ALIGN is MAX_VARS plus 4 extra entries per dimension.
; NOTE(review): the extra 4 entries are presumably padding so the vector loops
; may run slightly past the last used variable - confirm against the C side.
; COVAR_STRIDE is the size in bytes of one matrix row (MAX_VARS_ALIGN entries
; of 8 bytes each).
; COVAR(x,y) is the memory operand for column x of row y, relative to whatever
; register covarq is currently %define'd as.
27%define MAX_VARS 32
28%define MAX_VARS_ALIGN (MAX_VARS+4)
29%define COVAR_STRIDE MAX_VARS_ALIGN*8
30%define COVAR(x,y) [covarq + (x)*8 + (y)*COVAR_STRIDE]
31
; Field offsets of the LLSModel structure as accessed by the routines below.
; .covariance is the first field (offset 0), which is why update_lls can use
; the ctx pointer directly as the matrix base.
; NOTE(review): this layout presumably has to mirror a C-side definition - confirm.
32struc LLSModel
33 .covariance: resq MAX_VARS_ALIGN*MAX_VARS_ALIGN ; qword matrix, MAX_VARS_ALIGN entries per row
34 .coeff: resq MAX_VARS*MAX_VARS ; qword matrix, MAX_VARS entries per row
35 .variance: resq MAX_VARS ; MAX_VARS qwords (not referenced in this file section)
36 .indep_count: resd 1 ; one dword; loaded by update_lls as its loop bound
37endstruc
38
; ADDPD_MEM mem, reg
; Read-modify-write accumulate: reg = reg + mem (packed doubles), then reg is
; stored back to mem with mova.
;   %1 - memory operand that is both read and written
;   %2 - vector register holding the addend; it is overwritten with the sum
; When the avx cpuflag is active the explicit three-operand vaddpd is emitted,
; otherwise the two-operand addpd.
39%macro ADDPD_MEM 2
40%if cpuflag(avx)
41 vaddpd %2, %2, %1
42%else
43 addpd %2, %1
44%endif
45 mova %1, %2
46%endmacro
47
;------------------------------------------------------------------------------
; update_lls, SSE2 version.  Arguments: ctx, var.
; Adds products of pairs of var[] elements into the qword matrix that starts
; at ctx (LLSModel.covariance, offset 0).  The loop bound is read from
; LLSModel.indep_count.
;
; Register roles:
;   i      - negative index: starts at -indep_count and steps by 4.  varq is
;            moved to &var[indep_count], so [varq + iq*8] starts at var[0].
;   j      - negative column index used by the inner loops.
;   covar2 - address of the current diagonal block; advances by 4 rows and
;            4 columns per outer iteration.
;   covarq - alias of ctxq in the main loops, re-aliased to covar2q for the
;            tail loop.
;   m4..m7 - shuffled copies of the two current variable pairs, one register
;            per matrix row.
;   m3     - the pair of column variables currently being multiplied in.
; Each group of 4 rows is updated from its diagonal block rightwards; nothing
; to the left of the diagonal block is written.
;------------------------------------------------------------------------------
48INIT_XMM sse2
; NOTE(review): presumably makes mova (assumed to map to movdqa for XMM in the
; included macros) assemble as movaps - confirm.  The define stays in effect
; for the rest of the file.
49%define movdqa movaps
50cglobal update_lls, 2,5,8, ctx, var, i, j, covar2
51 %define covarq ctxq
52 mov id, [ctxq + LLSModel.indep_count] ; i = indep_count (32-bit load)
53 lea varq, [varq + iq*8] ; varq = &var[indep_count]
54 neg iq ; i = -indep_count
55 mov covar2q, covarq ; covar2 = first diagonal block
56.loopi:
57 ; Compute all 3 pairwise products of a 2x2 block that lies on the diagonal
58 mova m1, [varq + iq*8] ; m1 = current pair of variables
59 mova m3, [varq + iq*8 + 16] ; m3 = the pair after it
 ; NOTE(review): q1010 / q3232 come from the included macros and presumably
 ; duplicate the low / high qword of the source - confirm.
60 pshufd m4, m1, q1010
61 pshufd m5, m1, q3232
62 pshufd m6, m3, q1010
63 pshufd m7, m3, q3232
64 mulpd m0, m1, m4
65 mulpd m1, m1, m5
66 lea covarq, [covar2q + 16] ; covarq = diagonal block + 2 columns
 ; COVAR(-2,y) therefore addresses column 0 of the diagonal block in row y.
67 ADDPD_MEM COVAR(-2,0), m0
68 ADDPD_MEM COVAR(-2,1), m1
69 lea jq, [iq + 2]
70 cmp jd, -2
71 jg .skip4x4 ; skip unless j <= -2
72.loop4x4:
73 ; Compute all 16 pairwise products of a 4x4 block
 ; m3 is multiplied last into itself because that overwrites the column pair.
74 mulpd m0, m4, m3
75 mulpd m1, m5, m3
76 mulpd m2, m6, m3
77 mulpd m3, m3, m7
78 ADDPD_MEM COVAR(0,0), m0
79 ADDPD_MEM COVAR(0,1), m1
80 ADDPD_MEM COVAR(0,2), m2
81 ADDPD_MEM COVAR(0,3), m3
82 mova m3, [varq + jq*8 + 16] ; next column pair
83 mulpd m0, m4, m3
84 mulpd m1, m5, m3
85 mulpd m2, m6, m3
86 mulpd m3, m3, m7
87 ADDPD_MEM COVAR(2,0), m0
88 ADDPD_MEM COVAR(2,1), m1
89 ADDPD_MEM COVAR(2,2), m2
90 ADDPD_MEM COVAR(2,3), m3
 ; Column pair for the next iteration (or for the 2-column strip below).
 ; NOTE(review): with j <= -2 here this load can reach up to
 ; var[indep_count+3]; presumably callers pad var[] - confirm.
91 mova m3, [varq + jq*8 + 32]
92 add covarq, 32 ; advance 4 columns
93 add jq, 4
94 cmp jd, -2
95 jle .loop4x4
96.skip4x4:
97 test jd, jd
98 jg .skip2x4 ; j > 0: no 2-column strip left in this row group
 ; Final 2-column strip: m4..m7 are no longer needed, so multiply in place.
99 mulpd m4, m3
100 mulpd m5, m3
101 mulpd m6, m3
102 mulpd m7, m3
103 ADDPD_MEM COVAR(0,0), m4
104 ADDPD_MEM COVAR(0,1), m5
105 ADDPD_MEM COVAR(0,2), m6
106 ADDPD_MEM COVAR(0,3), m7
107.skip2x4:
108 add iq, 4
109 add covar2q, 4*COVAR_STRIDE+32 ; next diagonal block: 4 rows down, 4 columns right
110 cmp id, -2
111 jle .loopi
112 test id, id
113 jg .ret ; i > 0: no tail rows left
114 mov jq, iq ; j stays fixed while i walks the remaining rows
115 %define covarq covar2q
116.loop2x1:
 ; One row per iteration: duplicate var at index i into both lanes, multiply by
 ; the pair at index j, accumulate into the row's first two columns.
117 movsd m0, [varq + iq*8]
118 movlhps m0, m0
119 mulpd m0, [varq + jq*8]
120 ADDPD_MEM COVAR(0,0), m0
121 inc iq
122 add covarq, COVAR_STRIDE ; next row (covarq is covar2q here)
123 test id, id
124 jle .loop2x1
125.ret:
126 REP_RET
127
;------------------------------------------------------------------------------
; update_lls, AVX version (only assembled when HAVE_AVX_EXTERNAL is set).
; Same accumulation as the SSE2 version, but with positive indices and
; 4 doubles per YMM register.
;
; Register roles:
;   count  - LLSModel.indep_count
;   count2 - count - 2, bound for the 4-wide loops
;   i      - row-group index, starts at 0 and steps by 4
;   j      - column index
;   covarq - alias of ctxq; advanced by 4 rows per outer iteration, so the
;            column is selected through the x term of COVAR(x,y)
;   ymm4..ymm7 - broadcasts of the four variables of the current row group
;------------------------------------------------------------------------------
128%if HAVE_AVX_EXTERNAL
129INIT_YMM avx
; NOTE(review): three arguments are declared although count is overwritten by
; the load below before it is read; the SSE2 version declares two - confirm
; this is intentional.
130cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
131 %define covarq ctxq
132 mov countd, [ctxq + LLSModel.indep_count] ; count = indep_count
133 lea count2d, [countq-2] ; count2 = count - 2
134 xor id, id ; i = 0
135.loopi:
136 ; Compute all 10 pairwise products of a 4x4 block that lies on the diagonal
137 mova ymm1, [varq + iq*8] ; ymm1 = var[i..i+3]
138 vbroadcastsd ymm4, [varq + iq*8] ; ymm4 = var[i] in all lanes
139 vbroadcastsd ymm5, [varq + iq*8 + 8] ; ymm5 = var[i+1]
140 vbroadcastsd ymm6, [varq + iq*8 + 16] ; ymm6 = var[i+2]
141 vbroadcastsd ymm7, [varq + iq*8 + 24] ; ymm7 = var[i+3]
142 vextractf128 xmm3, ymm1, 1 ; xmm3 = var[i+2..i+3]
143 vmulpd ymm0, ymm1, ymm4
144 vmulpd ymm1, ymm1, ymm5
145 vmulpd xmm2, xmm3, xmm6
146 vmulpd xmm3, xmm3, xmm7
 ; Rows 0-1 get all 4 columns of the block; rows 2-3 only columns i+2..i+3.
147 ADDPD_MEM COVAR(iq ,0), ymm0
148 ADDPD_MEM COVAR(iq ,1), ymm1
149 ADDPD_MEM COVAR(iq+2,2), xmm2
150 ADDPD_MEM COVAR(iq+2,3), xmm3
151 lea jd, [iq + 4] ; j = i + 4
152 cmp jd, count2d
153 jg .skip4x4
154.loop4x4:
155 ; Compute all 16 pairwise products of a 4x4 block
156 mova ymm3, [varq + jq*8] ; ymm3 = var[j..j+3]
157 vmulpd ymm0, ymm3, ymm4
158 vmulpd ymm1, ymm3, ymm5
159 vmulpd ymm2, ymm3, ymm6
160 vmulpd ymm3, ymm3, ymm7
161 ADDPD_MEM COVAR(jq,0), ymm0
162 ADDPD_MEM COVAR(jq,1), ymm1
163 ADDPD_MEM COVAR(jq,2), ymm2
164 ADDPD_MEM COVAR(jq,3), ymm3
165 add jd, 4
166 cmp jd, count2d
167 jle .loop4x4
168.skip4x4:
169 cmp jd, countd
170 jg .skip2x4 ; j > count: no 2-column strip left
 ; Final 2-column strip of this row group, using the XMM halves.
171 mova xmm3, [varq + jq*8]
172 vmulpd xmm0, xmm3, xmm4
173 vmulpd xmm1, xmm3, xmm5
174 vmulpd xmm2, xmm3, xmm6
175 vmulpd xmm3, xmm3, xmm7
176 ADDPD_MEM COVAR(jq,0), xmm0
177 ADDPD_MEM COVAR(jq,1), xmm1
178 ADDPD_MEM COVAR(jq,2), xmm2
179 ADDPD_MEM COVAR(jq,3), xmm3
180.skip2x4:
181 add id, 4
182 add covarq, 4*COVAR_STRIDE ; move the row base down by 4 rows
183 cmp id, count2d
184 jle .loopi
185 cmp id, countd
186 jg .ret ; i > count: no tail rows left
187 mov jd, id ; j stays fixed while i walks the remaining rows
188.loop2x1:
 ; One row per iteration: var[i] duplicated, times var[j..j+1].
189 vmovddup xmm0, [varq + iq*8]
190 vmulpd xmm0, [varq + jq*8]
191 ADDPD_MEM COVAR(jq,0), xmm0
192 inc id
193 add covarq, COVAR_STRIDE ; next row
194 cmp id, countd
195 jle .loop2x1
196.ret:
197 REP_RET
198%endif
199
;------------------------------------------------------------------------------
; evaluate_lls, SSE2.  Arguments: ctx, var, order.
; Computes a dot product of var[] with row `order` of LLSModel.coeff (rows are
; MAX_VARS qwords apart).  Elements 0..1 are handled before the loop, then
; pairs from index 2 upward, plus one scalar element at index `order` when the
; pair loop ends exactly at i == 0.
; The two lanes of m0 are summed at the end; on x86-32 the sum is additionally
; stored through r0m and pushed onto the x87 stack with fld.
; NOTE(review): the pair loop is a do-while, so elements 2..3 are read even
; when order < 2; presumably those coefficients are zero - confirm.
;------------------------------------------------------------------------------
200INIT_XMM sse2
201cglobal evaluate_lls, 3,4,2, ctx, var, order, i
202 ; This function is often called on the same buffer as update_lls, but with
203 ; an offset. They can't both be aligned.
204 ; Load halves rather than movu to avoid store-forwarding stalls, since the
205 ; input was initialized immediately prior to this function using scalar math.
206 %define coefsq ctxq
207 mov id, orderd ; i = order
208 imul orderd, MAX_VARS ; order = index of the first element of row `order`
209 lea coefsq, [ctxq + LLSModel.coeff + orderq*8] ; coefs = start of that row
210 movsd m0, [varq] ; m0 = (var[0], var[1]), loaded as two halves
211 movhpd m0, [varq + 8]
212 mulpd m0, [coefsq] ; m0 = first two products
213 lea coefsq, [coefsq + iq*8] ; coefs = &row[order]
214 lea varq, [varq + iq*8] ; varq = &var[order]
215 neg iq
216 add iq, 2 ; i = 2 - order, i.e. element 2 relative to the moved bases
217.loop:
218 movsd m1, [varq + iq*8]
219 movhpd m1, [varq + iq*8 + 8]
220 mulpd m1, [coefsq + iq*8]
221 addpd m0, m1
222 add iq, 2
223 jl .loop ; i < 0: more pairs
224 jg .skip1 ; i > 0: no leftover element
 ; i == 0: one leftover element at index `order`.  movsd from memory clears the
 ; high lane, so the addpd below only changes the low lane.
225 movsd m1, [varq + iq*8]
226 mulsd m1, [coefsq + iq*8]
227 addpd m0, m1
228.skip1:
229 movhlps m1, m0 ; m1 low lane = m0 high lane
230 addsd m0, m1 ; m0 low lane = sum of both lanes
231%if ARCH_X86_32
232 movsd r0m, m0
233 fld qword r0m
234%endif
235 RET