| 1 | ;****************************************************************************** |
| 2 | ;* linear least squares model |
| 3 | ;* |
| 4 | ;* Copyright (c) 2013 Loren Merritt |
| 5 | ;* |
| 6 | ;* This file is part of FFmpeg. |
| 7 | ;* |
| 8 | ;* FFmpeg is free software; you can redistribute it and/or |
| 9 | ;* modify it under the terms of the GNU Lesser General Public |
| 10 | ;* License as published by the Free Software Foundation; either |
| 11 | ;* version 2.1 of the License, or (at your option) any later version. |
| 12 | ;* |
| 13 | ;* FFmpeg is distributed in the hope that it will be useful, |
| 14 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 15 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 16 | ;* Lesser General Public License for more details. |
| 17 | ;* |
| 18 | ;* You should have received a copy of the GNU Lesser General Public |
| 19 | ;* License along with FFmpeg; if not, write to the Free Software |
| 20 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| 21 | ;****************************************************************************** |
| 22 | |
| 23 | %include "x86util.asm" |
| 24 | |
; Everything below is emitted into the .text section.
| 25 | SECTION .text |
| 26 | |
; Dimensions and addressing helpers for the model arrays. Every element is
; 8 bytes wide (the LLSModel fields are declared with resq and the index
; below is scaled by 8).
;   MAX_VARS        row length of .coeff, element count of .variance
;   MAX_VARS_ALIGN  MAX_VARS + 4; row and column count of .covariance
;   COVAR_STRIDE    byte distance between two consecutive covariance rows
;   COVAR(x,y)      memory operand for column x of row y, relative to covarq.
;                   covarq is not defined here: each function %defines it
;                   to one of its own registers before using COVAR.
; NOTE(review): presumably these values must match a C-side definition of the
; same structure - confirm against the caller.
| 27 | %define MAX_VARS 32 |
| 28 | %define MAX_VARS_ALIGN (MAX_VARS+4) |
| 29 | %define COVAR_STRIDE MAX_VARS_ALIGN*8 |
| 30 | %define COVAR(x,y) [covarq + (x)*8 + (y)*COVAR_STRIDE] |
| 31 | |
; Field offsets of the model context that the functions below receive in ctx.
;   .covariance   MAX_VARS_ALIGN x MAX_VARS_ALIGN qwords, at offset 0 (which is
;                 why update_lls can use ctxq directly as covarq)
;   .coeff        MAX_VARS x MAX_VARS qwords, read by evaluate_lls
;   .variance     MAX_VARS qwords (not referenced in this file)
;   .indep_count  one dword, read by update_lls
; NOTE(review): presumably mirrors a C struct of the same name; the layout
; must match it exactly - confirm.
| 32 | struc LLSModel |
| 33 | .covariance: resq MAX_VARS_ALIGN*MAX_VARS_ALIGN |
| 34 | .coeff: resq MAX_VARS*MAX_VARS |
| 35 | .variance: resq MAX_VARS |
| 36 | .indep_count: resd 1 |
| 37 | endstruc |
| 38 | |
; ADDPD_MEM mem, reg
; Accumulate a vector of packed doubles into memory: reg = reg + mem, then
; the sum is written back to mem.
;   %1  memory operand (read, then overwritten with the sum)
;   %2  vector register holding the addend; clobbered, it holds the sum on exit
; The AVX build uses the three-operand vaddpd; otherwise the two-operand
; addpd is used.
; NOTE(review): the store uses mova, presumably an aligned move, so %1 is
; assumed to be aligned to the register width - confirm.
| 39 | %macro ADDPD_MEM 2 |
| 40 | %if cpuflag(avx) |
| 41 | vaddpd %2, %2, %1 |
| 42 | %else |
| 43 | addpd %2, %1 |
| 44 | %endif |
| 45 | mova %1, %2 |
| 46 | %endmacro |
| 47 | |
;------------------------------------------------------------------------------
; update_lls (SSE2): add the pairwise products var[a]*var[b] of one sample
; vector into ctx's covariance array.
;   ctx  pointer to an LLSModel; .indep_count is read from it
;   var  pointer to the input doubles
; Registers: i and j are element indices stored as negative offsets from the
; end of the processed part of var (they count up towards 0), covar2 points
; at the current diagonal block, covarq is the moving store pointer used by
; COVAR(). m4..m7 hold var[i]..var[i+3], each duplicated into both lanes.
; NOTE(review): the 2,5,8 after the name are x86inc's argument / GPR / vector
; register counts - confirm against x86inc. Loads and stores use mova,
; presumably aligned, so var and the covariance rows are assumed 16-byte
; aligned.
;------------------------------------------------------------------------------
| 48 | INIT_XMM sse2 |
; Redirect movdqa to movaps; the define stays in effect for the rest of the
; file. NOTE(review): presumably this is what mova expands to - confirm.
| 49 | %define movdqa movaps |
| 50 | cglobal update_lls, 2,5,8, ctx, var, i, j, covar2 |
| 51 | %define covarq ctxq |
; i = ctx->indep_count
| 52 | mov id, [ctxq + LLSModel.indep_count] |
; Point var i elements further on and negate i, so [varq + iq*8] starts at
; the original var[0] and the loop conditions compare against constants.
| 53 | lea varq, [varq + iq*8] |
| 54 | neg iq |
; covar2 = start of the covariance array (.covariance is at offset 0 of ctx)
| 55 | mov covar2q, covarq |
| 56 | .loopi: |
| 57 | ; Compute all 3 pairwise products of a 2x2 block that lies on the diagonal |
; m1 = var[i], var[i+1]     m3 = var[i+2], var[i+3]
| 58 | mova m1, [varq + iq*8] |
| 59 | mova m3, [varq + iq*8 + 16] |
; q1010 duplicates the low double, q3232 the high double:
; m4 = var[i], m5 = var[i+1], m6 = var[i+2], m7 = var[i+3] (each in both lanes)
| 60 | pshufd m4, m1, q1010 |
| 61 | pshufd m5, m1, q3232 |
| 62 | pshufd m6, m3, q1010 |
| 63 | pshufd m7, m3, q3232 |
| 64 | mulpd m0, m1, m4 |
| 65 | mulpd m1, m1, m5 |
; covarq is biased by 16 bytes (2 columns) so that COVAR(0,y) below addresses
; the columns of var[i+2]; COVAR(-2,y) is the diagonal block itself.
| 66 | lea covarq, [covar2q + 16] |
| 67 | ADDPD_MEM COVAR(-2,0), m0 |
| 68 | ADDPD_MEM COVAR(-2,1), m1 |
; j = i + 2; the 4x4 loop only runs while at least 4 columns start at or
; before offset -2.
| 69 | lea jq, [iq + 2] |
| 70 | cmp jd, -2 |
| 71 | jg .skip4x4 |
| 72 | .loop4x4: |
| 73 | ; Compute all 16 pairwise products of a 4x4 block |
; On entry m3 = var[j], var[j+1]. Rows 0..3 correspond to m4..m7.
| 74 | mulpd m0, m4, m3 |
| 75 | mulpd m1, m5, m3 |
| 76 | mulpd m2, m6, m3 |
| 77 | mulpd m3, m3, m7 |
| 78 | ADDPD_MEM COVAR(0,0), m0 |
| 79 | ADDPD_MEM COVAR(0,1), m1 |
| 80 | ADDPD_MEM COVAR(0,2), m2 |
| 81 | ADDPD_MEM COVAR(0,3), m3 |
; Second column pair of the block: m3 = var[j+2], var[j+3]
| 82 | mova m3, [varq + jq*8 + 16] |
| 83 | mulpd m0, m4, m3 |
| 84 | mulpd m1, m5, m3 |
| 85 | mulpd m2, m6, m3 |
| 86 | mulpd m3, m3, m7 |
| 87 | ADDPD_MEM COVAR(2,0), m0 |
| 88 | ADDPD_MEM COVAR(2,1), m1 |
| 89 | ADDPD_MEM COVAR(2,2), m2 |
| 90 | ADDPD_MEM COVAR(2,3), m3 |
; Preload var[j+4], var[j+5] for the next iteration or for the tail below,
; then step 4 columns (32 bytes) to the right.
| 91 | mova m3, [varq + jq*8 + 32] |
| 92 | add covarq, 32 |
| 93 | add jq, 4 |
| 94 | cmp jd, -2 |
| 95 | jle .loop4x4 |
| 96 | .skip4x4: |
; If j <= 0, one pair of columns is still pending for these four rows.
| 97 | test jd, jd |
| 98 | jg .skip2x4 |
; m3 still holds var[j], var[j+1]. The products overwrite m4..m7, which is
; fine because .loopi rebuilds them.
| 99 | mulpd m4, m3 |
| 100 | mulpd m5, m3 |
| 101 | mulpd m6, m3 |
| 102 | mulpd m7, m3 |
| 103 | ADDPD_MEM COVAR(0,0), m4 |
| 104 | ADDPD_MEM COVAR(0,1), m5 |
| 105 | ADDPD_MEM COVAR(0,2), m6 |
| 106 | ADDPD_MEM COVAR(0,3), m7 |
| 107 | .skip2x4: |
; Advance to the next diagonal block: 4 rows down and 4 columns right.
| 108 | add iq, 4 |
| 109 | add covar2q, 4*COVAR_STRIDE+32 |
| 110 | cmp id, -2 |
| 111 | jle .loopi |
; Nothing left once i has passed 0.
| 112 | test id, id |
| 113 | jg .ret |
; Remaining rows: j stays fixed at the first remaining index while i advances.
| 114 | mov jq, iq |
; From here on COVAR() is addressed through covar2q.
| 115 | %define covarq covar2q |
| 116 | .loop2x1: |
; m0 = var[i] in both lanes (movsd load + movlhps), times var[j], var[j+1]
| 117 | movsd m0, [varq + iq*8] |
| 118 | movlhps m0, m0 |
| 119 | mulpd m0, [varq + jq*8] |
| 120 | ADDPD_MEM COVAR(0,0), m0 |
; Next row; repeat while i <= 0.
| 121 | inc iq |
| 122 | add covarq, COVAR_STRIDE |
| 123 | test id, id |
| 124 | jle .loop2x1 |
| 125 | .ret: |
| 126 | REP_RET |
| 127 | |
;------------------------------------------------------------------------------
; update_lls (AVX): same accumulation of pairwise products var[a]*var[b] into
; ctx's covariance array, using 256-bit vectors for 4-wide blocks. Only
; assembled when HAVE_AVX_EXTERNAL is set.
;   ctx    pointer to an LLSModel; .indep_count is read from it
;   var    pointer to the input doubles
;   count  declared as a third argument, but overwritten with
;          ctx->indep_count before it is ever read
; Registers: i and j are element indices counting up from 0, count2 is
; count - 2 (bound for the 4-wide loops), covarq (= ctxq) advances by rows
; only; the column is selected by passing iq/jq as the x argument of COVAR().
; ymm4..ymm7 hold var[i]..var[i+3], each broadcast to all lanes.
; NOTE(review): mova is presumably an aligned move, so var and the covariance
; rows are assumed 32-byte aligned for the ymm accesses - confirm.
;------------------------------------------------------------------------------
| 128 | %if HAVE_AVX_EXTERNAL |
| 129 | INIT_YMM avx |
| 130 | cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2 |
| 131 | %define covarq ctxq |
; count = ctx->indep_count, count2 = count - 2, i = 0
| 132 | mov countd, [ctxq + LLSModel.indep_count] |
| 133 | lea count2d, [countq-2] |
| 134 | xor id, id |
| 135 | .loopi: |
| 136 | ; Compute all 10 pairwise products of a 4x4 block that lies on the diagonal |
; ymm1 = var[i..i+3]; ymm4..ymm7 = broadcasts of var[i]..var[i+3]
| 137 | mova ymm1, [varq + iq*8] |
| 138 | vbroadcastsd ymm4, [varq + iq*8] |
| 139 | vbroadcastsd ymm5, [varq + iq*8 + 8] |
| 140 | vbroadcastsd ymm6, [varq + iq*8 + 16] |
| 141 | vbroadcastsd ymm7, [varq + iq*8 + 24] |
; xmm3 = upper half of ymm1 = var[i+2], var[i+3]
| 142 | vextractf128 xmm3, ymm1, 1 |
; Rows 0 and 1 take the full 4-wide product; rows 2 and 3 only need the two
; columns starting at i+2.
| 143 | vmulpd ymm0, ymm1, ymm4 |
| 144 | vmulpd ymm1, ymm1, ymm5 |
| 145 | vmulpd xmm2, xmm3, xmm6 |
| 146 | vmulpd xmm3, xmm3, xmm7 |
| 147 | ADDPD_MEM COVAR(iq ,0), ymm0 |
| 148 | ADDPD_MEM COVAR(iq ,1), ymm1 |
| 149 | ADDPD_MEM COVAR(iq+2,2), xmm2 |
| 150 | ADDPD_MEM COVAR(iq+2,3), xmm3 |
; j = i + 4: first column to the right of the diagonal block
| 151 | lea jd, [iq + 4] |
| 152 | cmp jd, count2d |
| 153 | jg .skip4x4 |
| 154 | .loop4x4: |
| 155 | ; Compute all 16 pairwise products of a 4x4 block |
; ymm3 = var[j..j+3], multiplied by each row's broadcast value
| 156 | mova ymm3, [varq + jq*8] |
| 157 | vmulpd ymm0, ymm3, ymm4 |
| 158 | vmulpd ymm1, ymm3, ymm5 |
| 159 | vmulpd ymm2, ymm3, ymm6 |
| 160 | vmulpd ymm3, ymm3, ymm7 |
| 161 | ADDPD_MEM COVAR(jq,0), ymm0 |
| 162 | ADDPD_MEM COVAR(jq,1), ymm1 |
| 163 | ADDPD_MEM COVAR(jq,2), ymm2 |
| 164 | ADDPD_MEM COVAR(jq,3), ymm3 |
| 165 | add jd, 4 |
| 166 | cmp jd, count2d |
| 167 | jle .loop4x4 |
| 168 | .skip4x4: |
; If j <= count, one pair of columns is still pending for these four rows;
; handle it with 128-bit operations.
| 169 | cmp jd, countd |
| 170 | jg .skip2x4 |
| 171 | mova xmm3, [varq + jq*8] |
| 172 | vmulpd xmm0, xmm3, xmm4 |
| 173 | vmulpd xmm1, xmm3, xmm5 |
| 174 | vmulpd xmm2, xmm3, xmm6 |
| 175 | vmulpd xmm3, xmm3, xmm7 |
| 176 | ADDPD_MEM COVAR(jq,0), xmm0 |
| 177 | ADDPD_MEM COVAR(jq,1), xmm1 |
| 178 | ADDPD_MEM COVAR(jq,2), xmm2 |
| 179 | ADDPD_MEM COVAR(jq,3), xmm3 |
| 180 | .skip2x4: |
; Next group of 4 rows; the column offset comes from iq/jq, so only the row
; pointer moves.
| 181 | add id, 4 |
| 182 | add covarq, 4*COVAR_STRIDE |
| 183 | cmp id, count2d |
| 184 | jle .loopi |
; Nothing left once i has passed count.
| 185 | cmp id, countd |
| 186 | jg .ret |
; Remaining rows: j stays fixed at the first remaining index while i advances.
| 187 | mov jd, id |
| 188 | .loop2x1: |
; xmm0 = var[i] in both lanes, times var[j], var[j+1]
| 189 | vmovddup xmm0, [varq + iq*8] |
| 190 | vmulpd xmm0, [varq + jq*8] |
| 191 | ADDPD_MEM COVAR(jq,0), xmm0 |
; Next row; repeat while i <= count.
| 192 | inc id |
| 193 | add covarq, COVAR_STRIDE |
| 194 | cmp id, countd |
| 195 | jle .loop2x1 |
| 196 | .ret: |
| 197 | REP_RET |
| 198 | %endif |
| 199 | |
;------------------------------------------------------------------------------
; evaluate_lls (SSE2): dot product of var with row `order` of ctx's .coeff
; array, returned as a double.
;   ctx    pointer to an LLSModel
;   var    pointer to the input doubles
;   order  selects the coefficient row and the index of the last element used
; Result: low double of m0; on 32-bit x86 it is additionally pushed onto the
; x87 stack with fld.
; NOTE(review): .loop is entered without a bound check, so elements 2 and 3
; of var and of the coefficient row are always read. For order < 3 the sum
; therefore includes indices above `order`; presumably those coefficients are
; zero - confirm against the code that fills .coeff.
;------------------------------------------------------------------------------
| 200 | INIT_XMM sse2 |
| 201 | cglobal evaluate_lls, 3,4,2, ctx, var, order, i |
| 202 | ; This function is often called on the same buffer as update_lls, but with |
| 203 | ; an offset. They can't both be aligned. |
| 204 | ; Load halves rather than movu to avoid store-forwarding stalls, since the |
| 205 | ; input was initialized immediately prior to this function using scalar math. |
| 206 | %define coefsq ctxq |
; i = order; coefs = &ctx->coeff[order * MAX_VARS], i.e. the start of row
; `order` (rows are MAX_VARS qwords long).
| 207 | mov id, orderd |
| 208 | imul orderd, MAX_VARS |
| 209 | lea coefsq, [ctxq + LLSModel.coeff + orderq*8] |
; m0 = var[0..1] * coefs[0..1]; var is loaded one half at a time.
| 210 | movsd m0, [varq] |
| 211 | movhpd m0, [varq + 8] |
| 212 | mulpd m0, [coefsq] |
; Advance both pointers by `order` elements and use i = 2 - order as a
; negative offset, so that "add iq, 2" sets the flags for the loop exit.
| 213 | lea coefsq, [coefsq + iq*8] |
| 214 | lea varq, [varq + iq*8] |
| 215 | neg iq |
| 216 | add iq, 2 |
| 217 | .loop: |
; m0 += var[k..k+1] * coefs[k..k+1], with k = order + i
| 218 | movsd m1, [varq + iq*8] |
| 219 | movhpd m1, [varq + iq*8 + 8] |
| 220 | mulpd m1, [coefsq + iq*8] |
| 221 | addpd m0, m1 |
| 222 | add iq, 2 |
; Both branches test the flags of the add above:
; i < 0: more pairs remain; i > 0: done; i == 0: one element (index order) left.
| 223 | jl .loop |
| 224 | jg .skip1 |
; Scalar tail: movsd from memory clears the high lane, so the addpd below
; only changes the low lane of m0.
| 225 | movsd m1, [varq + iq*8] |
| 226 | mulsd m1, [coefsq + iq*8] |
| 227 | addpd m0, m1 |
| 228 | .skip1: |
; Horizontal sum: low(m0) += high(m0)
| 229 | movhlps m1, m0 |
| 230 | addsd m0, m1 |
| 231 | %if ARCH_X86_32 |
; 32-bit build: spill the result to r0m and load it onto the x87 stack.
; NOTE(review): r0m is presumably the stack slot of the first argument, used
; here as scratch space - confirm against x86inc.
| 232 | movsd r0m, m0 |
| 233 | fld qword r0m |
| 234 | %endif |
| 235 | RET |