| 1 | ;****************************************************************************** |
| 2 | ;* MMX/SSE2-optimized functions for the VP3 decoder |
| 3 | ;* Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org> |
| 4 | ;* |
| 5 | ;* This file is part of FFmpeg. |
| 6 | ;* |
| 7 | ;* FFmpeg is free software; you can redistribute it and/or |
| 8 | ;* modify it under the terms of the GNU Lesser General Public |
| 9 | ;* License as published by the Free Software Foundation; either |
| 10 | ;* version 2.1 of the License, or (at your option) any later version. |
| 11 | ;* |
| 12 | ;* FFmpeg is distributed in the hope that it will be useful, |
| 13 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 14 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 15 | ;* Lesser General Public License for more details. |
| 16 | ;* |
| 17 | ;* You should have received a copy of the GNU Lesser General Public |
| 18 | ;* License along with FFmpeg; if not, write to the Free Software |
| 19 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| 20 | ;****************************************************************************** |
| 21 | |
| 22 | %include "libavutil/x86/x86util.asm" |
| 23 | |
| 24 | ; MMX-optimized functions cribbed from the original VP3 source code. |
| 25 | |
| 26 | SECTION_RODATA |
| 27 | |
; IDCT cosine constants C(1)..C(7) = round(65536 * cos(k*pi/16)), 8 words each
; so a full row of coefficients is scaled per multiply.  pmulhw computes the
; signed (a*b) >> 16, so entries >= 0x8000 behave as (c - 1); the IDCT code
; compensates by re-adding the input afterwards (the "c*x - x" comments).
vp3_idct_data: times 8 dw 64277 ; C1 = cos(1*pi/16)
               times 8 dw 60547 ; C2 = cos(2*pi/16)
               times 8 dw 54491 ; C3 = cos(3*pi/16)
               times 8 dw 46341 ; C4 = cos(4*pi/16) = sqrt(2)/2
               times 8 dw 36410 ; C5 = cos(5*pi/16)
               times 8 dw 25080 ; C6 = cos(6*pi/16)
               times 8 dw 12785 ; C7 = cos(7*pi/16)

; byte constants for the loop filter
pb_7:  times 8 db 0x07          ; mask: low 3 bits
pb_1F: times 8 db 0x1f          ; mask: low 5 bits
pb_81: times 8 db 0x81          ; 128 + 1 bias used when splitting +/- deltas
| 39 | |
| 40 | cextern pb_1 |
| 41 | cextern pb_3 |
| 42 | cextern pb_80 |
| 43 | |
| 44 | cextern pw_8 |
| 45 | |
| 46 | SECTION .text |
| 47 | |
; VP3 deblocking filter core.  Approximates the filter delta
;   d = (p0 - 3*p1 + 3*p2 - p3 + 4) >> 3   (computed biased by 128+1)
; using pavgb averages, then bounds |d| with the filter limit and applies
; +/- d to the two pixels adjacent to the edge with unsigned saturation.
; Because of the pavgb approximation this is off by one or two for some
; cases when filter_limit is greater than 63.
; in:  p0 in mm6, p1 in mm4, p2 in mm2, p3 in mm1;
;      r2 = bounding values array (only the limit at [r2+516] is read)
; out: filtered p1 in mm4, filtered p2 in mm3; clobbers mm0-mm7
%macro VP3_LOOP_FILTER 0
    movq          m7, m6
    pand          m6, [pb_7]    ; p0&7
    psrlw         m7, 3
    pand          m7, [pb_1F]   ; p0>>3
    movq          m3, m2        ; p2
    pxor          m2, m4
    pand          m2, [pb_1]    ; (p2^p1)&1
    movq          m5, m2
    paddb         m2, m2
    paddb         m2, m5        ; 3*(p2^p1)&1
    paddb         m2, m6        ; extra bits lost in shifts
    pcmpeqb       m0, m0
    pxor          m1, m0        ; 255 - p3
    pavgb         m1, m2        ; (256 - p3 + extrabits) >> 1
    pxor          m0, m4        ; 255 - p1
    pavgb         m0, m3        ; (256 + p2-p1) >> 1
    paddb         m1, [pb_3]
    pavgb         m1, m0        ; 128+2+(  p2-p1  - p3) >> 2
    pavgb         m1, m0        ; 128+1+(3*(p2-p1) - p3) >> 3
    paddusb       m7, m1        ; d+128+1
    ; split the biased delta into its positive and negative magnitudes
    movq          m6, [pb_81]
    psubusb       m6, m7        ; m6 = max(-d, 0)
    psubusb       m7, [pb_81]   ; m7 = max(+d, 0)

    ; bound each magnitude: min(2*x, flim) - min(x, flim), the triangular
    ; VP3 bounding function (rises to flim, then falls back to zero)
    movq          m5, [r2+516]  ; flim
    pminub        m6, m5
    pminub        m7, m5
    movq          m0, m6
    movq          m1, m7
    paddb         m6, m6
    paddb         m7, m7
    pminub        m6, m5
    pminub        m7, m5
    psubb         m6, m0
    psubb         m7, m1
    ; apply the bounded delta with unsigned saturation
    paddusb       m4, m7        ; p1 += d
    psubusb       m4, m6        ; p1 -= -d
    psubusb       m3, m7        ; p2 -= d
    paddusb       m3, m6        ; p2 += -d
%endmacro
| 92 | |
; Scatter the four 16-bit lanes of %1 to four consecutive rows at column -1
; (the two filtered pixels straddling a vertical block edge).
; %1 = source mm register (destroyed: shifted right by 32)
; r0 = row pointer, r1 = stride, r3 = 3*stride; clobbers r2
%macro STORE_4_WORDS 1
    movd         r2d, %1
    mov   [r0     -1], r2w
    psrlq         %1, 32
    shr           r2, 16
    mov   [r0+r1  -1], r2w
    movd         r2d, %1
    mov   [r0+r1*2-1], r2w
    shr           r2, 16
    mov   [r0+r3  -1], r2w
%endmacro
| 104 | |
INIT_MMX mmxext
;------------------------------------------------------------------------------
; vp3_v_loop_filter(uint8_t *src, ptrdiff_t stride, int *bounding_values)
; Filters 8 pixels across the horizontal block edge just above src.
; r0 = src (first row below the edge), r1 = stride, r2 = bounding values
; (the filter limit is read from [r2+516] inside VP3_LOOP_FILTER).
;------------------------------------------------------------------------------
cglobal vp3_v_loop_filter, 3, 4
%if ARCH_X86_64
    movsxd        r1, r1d       ; stride arrives as 32-bit; sign-extend
%endif
    mov           r3, r1
    neg           r1            ; r1 = -stride, r3 = +stride
    movq          m6, [r0+r1*2] ; p0 = row -2
    movq          m4, [r0+r1  ] ; p1 = row -1
    movq          m2, [r0     ] ; p2 = row  0
    movq          m1, [r0+r3  ] ; p3 = row +1

    VP3_LOOP_FILTER

    movq    [r0+r1], m4         ; store filtered p1
    movq    [r0   ], m3         ; store filtered p2
    RET
| 122 | |
;------------------------------------------------------------------------------
; vp3_h_loop_filter(uint8_t *src, ptrdiff_t stride, int *bounding_values)
; Filters 8 rows across the vertical block edge just left of src.
; Loads the 4-wide strip at columns -2..+1 for all 8 rows, transposes it so
; the shared filter can operate "vertically", then scatters the two middle
; columns back with STORE_4_WORDS.
;------------------------------------------------------------------------------
cglobal vp3_h_loop_filter, 3, 4
%if ARCH_X86_64
    movsxd        r1, r1d       ; stride arrives as 32-bit; sign-extend
%endif
    lea           r3, [r1*3]

    ; rows 0-3 into the low dwords
    movd          m6, [r0     -2]
    movd          m4, [r0+r1  -2]
    movd          m2, [r0+r1*2-2]
    movd          m1, [r0+r3  -2]
    lea           r0, [r0+r1*4 ]
    ; interleave with rows 4-7
    punpcklbw     m6, [r0     -2]
    punpcklbw     m4, [r0+r1  -2]
    punpcklbw     m2, [r0+r1*2-2]
    punpcklbw     m1, [r0+r3  -2]
    sub           r0, r3
    sub           r0, r1        ; r0 back to the original src (4-3-1 = 0)

    TRANSPOSE4x4B 6, 4, 2, 1, 0
    VP3_LOOP_FILTER
    SBUTTERFLY bw, 4, 3, 5      ; re-interleave p1/p2 into per-row word pairs

    STORE_4_WORDS m4            ; rows 0-3
    lea           r0, [r0+r1*4 ]
    STORE_4_WORDS m3            ; rows 4-7
    RET
| 149 | |
; From the original comments: the macro does IDCT on 4 1-D DCTs.
; Shared first stage of the 8-point IDCT over 4 columns of 16-bit values.
; Callers must %define the accessors I(x)/J(x) (memory rows) and C(x)
; (cosine constants).  On exit: m0 = C., m1 = H., m2 = R2, m4 = E,
; m5 = B.., m6 = F., m7 = G; I(1) and I(2) hold C. and D. as scratch.
; Clobbers mm0-mm7.  (rN in the comments means mmN.)
%macro BeginIDCT 0
    movq          m2, I(3)
    movq          m6, C(3)
    movq          m4, m2
    movq          m7, J(5)
    pmulhw        m4, m6        ; r4 = c3*i3 - i3
    movq          m1, C(5)
    pmulhw        m6, m7        ; r6 = c3*i5 - i5
    movq          m5, m1
    pmulhw        m1, m2        ; r1 = c5*i3 - i3
    movq          m3, I(1)
    pmulhw        m5, m7        ; r5 = c5*i5 - i5
    movq          m0, C(1)
    paddw         m4, m2        ; r4 = c3*i3
    paddw         m6, m7        ; r6 = c3*i5
    paddw         m2, m1        ; r2 = c5*i3
    movq          m1, J(7)
    paddw         m7, m5        ; r7 = c5*i5
    movq          m5, m0        ; r5 = c1
    pmulhw        m0, m3        ; r0 = c1*i1 - i1
    paddsw        m4, m7        ; r4 = C = c3*i3 + c5*i5
    pmulhw        m5, m1        ; r5 = c1*i7 - i7
    movq          m7, C(7)
    psubsw        m6, m2        ; r6 = D = c3*i5 - c5*i3
    paddw         m0, m3        ; r0 = c1*i1
    pmulhw        m3, m7        ; r3 = c7*i1
    movq          m2, I(2)
    pmulhw        m7, m1        ; r7 = c7*i7
    paddw         m5, m1        ; r5 = c1*i7
    movq          m1, m2        ; r1 = i2
    pmulhw        m2, C(2)      ; r2 = c2*i2 - i2
    psubsw        m3, m5        ; r3 = B = c7*i1 - c1*i7
    movq          m5, J(6)
    paddsw        m0, m7        ; r0 = A = c1*i1 + c7*i7
    movq          m7, m5        ; r7 = i6
    psubsw        m0, m4        ; r0 = A - C
    pmulhw        m5, C(2)      ; r5 = c2*i6 - i6
    paddw         m2, m1        ; r2 = c2*i2
    pmulhw        m1, C(6)      ; r1 = c6*i2
    paddsw        m4, m4        ; r4 = C + C
    paddsw        m4, m0        ; r4 = C. = A + C
    psubsw        m3, m6        ; r3 = B - D
    paddw         m5, m7        ; r5 = c2*i6
    paddsw        m6, m6        ; r6 = D + D
    pmulhw        m7, C(6)      ; r7 = c6*i6
    paddsw        m6, m3        ; r6 = D. = B + D
    movq        I(1), m4        ; save C. at I(1)
    psubsw        m1, m5        ; r1 = H = c6*i2 - c2*i6
    movq          m4, C(4)
    movq          m5, m3        ; r5 = B - D
    pmulhw        m3, m4        ; r3 = (c4 - 1) * (B - D)
    paddsw        m7, m2        ; r7 = G = c2*i2 + c6*i6
    movq        I(2), m6        ; save D. at I(2)
    movq          m2, m0        ; r2 = A - C
    movq          m6, I(0)
    pmulhw        m0, m4        ; r0 = (c4 - 1) * (A - C)
    paddw         m5, m3        ; r5 = B. = c4 * (B - D)
    movq          m3, J(4)
    psubsw        m5, m1        ; r5 = B.. = B. - H
    paddw         m2, m0        ; r2 = A. = c4 * (A - C)
    psubsw        m6, m3        ; r6 = i0 - i4
    movq          m0, m6
    pmulhw        m6, m4        ; r6 = (c4 - 1) * (i0 - i4)
    paddsw        m3, m3        ; r3 = i4 + i4
    paddsw        m1, m1        ; r1 = H + H
    paddsw        m3, m0        ; r3 = i0 + i4
    paddsw        m1, m5        ; r1 = H. = B + H
    pmulhw        m4, m3        ; r4 = (c4 - 1) * (i0 + i4)
    paddsw        m6, m0        ; r6 = F = c4 * (i0 - i4)
    psubsw        m6, m2        ; r6 = F. = F - A.
    paddsw        m2, m2        ; r2 = A. + A.
    movq          m0, I(1)      ; r0 = C.
    paddsw        m2, m6        ; r2 = A.. = F + A.
    paddw         m4, m3        ; r4 = E = c4 * (i0 + i4)
    psubsw        m2, m1        ; r2 = R2 = A.. - H.
%endmacro
| 227 | |
; RowIDCT gets ready to transpose: finishes the butterfly stage of the 1-D
; IDCT without rounding/shifting and leaves the results in registers
; (R1 is spilled to I(1)) so Transpose can run next.
%macro RowIDCT 0
    BeginIDCT
    movq          m3, I(2)      ; r3 = D.
    psubsw        m4, m7        ; r4 = E. = E - G
    paddsw        m1, m1        ; r1 = H. + H.
    paddsw        m7, m7        ; r7 = G + G
    paddsw        m1, m2        ; r1 = R1 = A.. + H.
    paddsw        m7, m4        ; r7 = G. = E + G
    psubsw        m4, m3        ; r4 = R4 = E. - D.
    paddsw        m3, m3        ; r3 = D. + D.
    psubsw        m6, m5        ; r6 = R6 = F. - B..
    paddsw        m5, m5        ; r5 = B.. + B..
    paddsw        m3, m4        ; r3 = R3 = E. + D.
    paddsw        m5, m6        ; r5 = R5 = F. + B..
    psubsw        m7, m0        ; r7 = R7 = G. - C.
    paddsw        m0, m0        ; r0 = C. + C.
    movq        I(1), m1        ; save R1
    paddsw        m0, m7        ; r0 = R0 = G. + C.
%endmacro
| 248 | |
; Column IDCT: same butterfly stage, but normalizes each result with
; (x + 8) >> 4 (OC_8 must be %defined to [pw_8]) and stores all eight
; outputs through the I()/J() accessors.
%macro ColumnIDCT 0
    BeginIDCT
    paddsw        m2, OC_8      ; adjust R2 (and R1) for shift
    paddsw        m1, m1        ; r1 = H. + H.
    paddsw        m1, m2        ; r1 = R1 = A.. + H.
    psraw         m2, 4         ; r2 = NR2
    psubsw        m4, m7        ; r4 = E. = E - G
    psraw         m1, 4         ; r1 = NR1
    movq          m3, I(2)      ; r3 = D.
    paddsw        m7, m7        ; r7 = G + G
    movq        I(2), m2        ; store NR2 at I2
    paddsw        m7, m4        ; r7 = G. = E + G
    movq        I(1), m1        ; store NR1 at I1
    psubsw        m4, m3        ; r4 = R4 = E. - D.
    paddsw        m4, OC_8      ; adjust R4 (and R3) for shift
    paddsw        m3, m3        ; r3 = D. + D.
    paddsw        m3, m4        ; r3 = R3 = E. + D.
    psraw         m4, 4         ; r4 = NR4
    psubsw        m6, m5        ; r6 = R6 = F. - B..
    psraw         m3, 4         ; r3 = NR3
    paddsw        m6, OC_8      ; adjust R6 (and R5) for shift
    paddsw        m5, m5        ; r5 = B.. + B..
    paddsw        m5, m6        ; r5 = R5 = F. + B..
    psraw         m6, 4         ; r6 = NR6
    movq        J(4), m4        ; store NR4 at J4
    psraw         m5, 4         ; r5 = NR5
    movq        I(3), m3        ; store NR3 at I3
    psubsw        m7, m0        ; r7 = R7 = G. - C.
    paddsw        m7, OC_8      ; adjust R7 (and R0) for shift
    paddsw        m0, m0        ; r0 = C. + C.
    paddsw        m0, m7        ; r0 = R0 = G. + C.
    psraw         m7, 4         ; r7 = NR7
    movq        J(6), m6        ; store NR6 at J6
    psraw         m0, 4         ; r0 = NR0
    movq        J(5), m5        ; store NR5 at J5
    movq        J(7), m7        ; store NR7 at J7
    movq        I(0), m0        ; store NR0 at I0
%endmacro
| 288 | |
| 289 | ; Following macro does two 4x4 transposes in place. |
| 290 | ; |
| 291 | ; At entry (we assume): |
| 292 | ; |
| 293 | ; r0 = a3 a2 a1 a0 |
| 294 | ; I(1) = b3 b2 b1 b0 |
| 295 | ; r2 = c3 c2 c1 c0 |
| 296 | ; r3 = d3 d2 d1 d0 |
| 297 | ; |
| 298 | ; r4 = e3 e2 e1 e0 |
| 299 | ; r5 = f3 f2 f1 f0 |
| 300 | ; r6 = g3 g2 g1 g0 |
| 301 | ; r7 = h3 h2 h1 h0 |
| 302 | ; |
| 303 | ; At exit, we have: |
| 304 | ; |
| 305 | ; I(0) = d0 c0 b0 a0 |
| 306 | ; I(1) = d1 c1 b1 a1 |
| 307 | ; I(2) = d2 c2 b2 a2 |
| 308 | ; I(3) = d3 c3 b3 a3 |
| 309 | ; |
| 310 | ; J(4) = h0 g0 f0 e0 |
| 311 | ; J(5) = h1 g1 f1 e1 |
| 312 | ; J(6) = h2 g2 f2 e2 |
| 313 | ; J(7) = h3 g3 f3 e3 |
| 314 | ; |
| 315 | ; I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3. |
| 316 | ; J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7. |
| 317 | ; |
| 318 | ; Since r1 is free at entry, we calculate the Js first. |
; Performs the two in-place 4x4 word transposes described above.
; Clobbers mm0-mm7; uses I(0) as one-slot scratch for the a-row.
%macro Transpose 0
    movq          m1, m4        ; r1 = e3 e2 e1 e0
    punpcklwd     m4, m5        ; r4 = f1 e1 f0 e0
    movq        I(0), m0        ; save a3 a2 a1 a0
    punpckhwd     m1, m5        ; r1 = f3 e3 f2 e2
    movq          m0, m6        ; r0 = g3 g2 g1 g0
    punpcklwd     m6, m7        ; r6 = h1 g1 h0 g0
    movq          m5, m4        ; r5 = f1 e1 f0 e0
    punpckldq     m4, m6        ; r4 = h0 g0 f0 e0 = R4
    punpckhdq     m5, m6        ; r5 = h1 g1 f1 e1 = R5
    movq          m6, m1        ; r6 = f3 e3 f2 e2
    movq        J(4), m4
    punpckhwd     m0, m7        ; r0 = h3 g3 h2 g2
    movq        J(5), m5
    punpckhdq     m6, m0        ; r6 = h3 g3 f3 e3 = R7
    movq          m4, I(0)      ; r4 = a3 a2 a1 a0
    punpckldq     m1, m0        ; r1 = h2 g2 f2 e2 = R6
    movq          m5, I(1)      ; r5 = b3 b2 b1 b0
    movq          m0, m4        ; r0 = a3 a2 a1 a0
    movq        J(7), m6
    punpcklwd     m0, m5        ; r0 = b1 a1 b0 a0
    movq        J(6), m1
    punpckhwd     m4, m5        ; r4 = b3 a3 b2 a2
    movq          m5, m2        ; r5 = c3 c2 c1 c0
    punpcklwd     m2, m3        ; r2 = d1 c1 d0 c0
    movq          m1, m0        ; r1 = b1 a1 b0 a0
    punpckldq     m0, m2        ; r0 = d0 c0 b0 a0 = R0
    punpckhdq     m1, m2        ; r1 = d1 c1 b1 a1 = R1
    movq          m2, m4        ; r2 = b3 a3 b2 a2
    movq        I(0), m0
    punpckhwd     m5, m3        ; r5 = d3 c3 d2 c2
    movq        I(1), m1
    punpckhdq     m4, m5        ; r4 = d3 c3 b3 a3 = R3
    punpckldq     m2, m5        ; r2 = d2 c2 b2 a2 = R2
    movq        I(3), m4
    movq        I(2), m2
%endmacro
| 356 | |
; One full 1-D 8-point IDCT pass over 8 columns of 16-bit values (SSE2).
; Callers must %define: I(x) (input rows), C(x) (cosine constants), and the
; normalization hooks ADD(x)/SHIFT(x) — empty for the first pass, +8 / >>4
; for the second.  Results end up as rows R0..R7 in xmm0..xmm7;
; I(1) and I(2) are used as scratch.
%macro VP3_1D_IDCT_SSE2 0
    movdqa        m2, I(3)      ; xmm2 = i3
    movdqa        m6, C(3)      ; xmm6 = c3
    movdqa        m4, m2        ; xmm4 = i3
    movdqa        m7, I(5)      ; xmm7 = i5
    pmulhw        m4, m6        ; xmm4 = c3 * i3 - i3
    movdqa        m1, C(5)      ; xmm1 = c5
    pmulhw        m6, m7        ; xmm6 = c3 * i5 - i5
    movdqa        m5, m1        ; xmm5 = c5
    pmulhw        m1, m2        ; xmm1 = c5 * i3 - i3
    movdqa        m3, I(1)      ; xmm3 = i1
    pmulhw        m5, m7        ; xmm5 = c5 * i5 - i5
    movdqa        m0, C(1)      ; xmm0 = c1
    paddw         m4, m2        ; xmm4 = c3 * i3
    paddw         m6, m7        ; xmm6 = c3 * i5
    paddw         m2, m1        ; xmm2 = c5 * i3
    movdqa        m1, I(7)      ; xmm1 = i7
    paddw         m7, m5        ; xmm7 = c5 * i5
    movdqa        m5, m0        ; xmm5 = c1
    pmulhw        m0, m3        ; xmm0 = c1 * i1 - i1
    paddsw        m4, m7        ; xmm4 = c3 * i3 + c5 * i5 = C
    pmulhw        m5, m1        ; xmm5 = c1 * i7 - i7
    movdqa        m7, C(7)      ; xmm7 = c7
    psubsw        m6, m2        ; xmm6 = c3 * i5 - c5 * i3 = D
    paddw         m0, m3        ; xmm0 = c1 * i1
    pmulhw        m3, m7        ; xmm3 = c7 * i1
    movdqa        m2, I(2)      ; xmm2 = i2
    pmulhw        m7, m1        ; xmm7 = c7 * i7
    paddw         m5, m1        ; xmm5 = c1 * i7
    movdqa        m1, m2        ; xmm1 = i2
    pmulhw        m2, C(2)      ; xmm2 = c2 * i2 - i2
    psubsw        m3, m5        ; xmm3 = c7 * i1 - c1 * i7 = B
    movdqa        m5, I(6)      ; xmm5 = i6
    paddsw        m0, m7        ; xmm0 = c1 * i1 + c7 * i7 = A
    movdqa        m7, m5        ; xmm7 = i6
    psubsw        m0, m4        ; xmm0 = A - C
    pmulhw        m5, C(2)      ; xmm5 = c2 * i6 - i6
    paddw         m2, m1        ; xmm2 = c2 * i2
    pmulhw        m1, C(6)      ; xmm1 = c6 * i2
    paddsw        m4, m4        ; xmm4 = C + C
    paddsw        m4, m0        ; xmm4 = A + C = C.
    psubsw        m3, m6        ; xmm3 = B - D
    paddw         m5, m7        ; xmm5 = c2 * i6
    paddsw        m6, m6        ; xmm6 = D + D
    pmulhw        m7, C(6)      ; xmm7 = c6 * i6
    paddsw        m6, m3        ; xmm6 = B + D = D.
    movdqa      I(1), m4        ; save C. at I(1)
    psubsw        m1, m5        ; xmm1 = c6 * i2 - c2 * i6 = H
    movdqa        m4, C(4)      ; xmm4 = c4
    movdqa        m5, m3        ; xmm5 = B - D
    pmulhw        m3, m4        ; xmm3 = ( c4 - 1 ) * ( B - D )
    paddsw        m7, m2        ; xmm7 = c2 * i2 + c6 * i6 = G
    movdqa      I(2), m6        ; save D. at I(2)
    movdqa        m2, m0        ; xmm2 = A - C
    movdqa        m6, I(0)      ; xmm6 = i0
    pmulhw        m0, m4        ; xmm0 = ( c4 - 1 ) * ( A - C )
    paddw         m5, m3        ; xmm5 = c4 * ( B - D ) = B.
    movdqa        m3, I(4)      ; xmm3 = i4
    psubsw        m5, m1        ; xmm5 = B. - H = B..
    paddw         m2, m0        ; xmm2 = c4 * ( A - C ) = A.
    psubsw        m6, m3        ; xmm6 = i0 - i4
    movdqa        m0, m6        ; xmm0 = i0 - i4
    pmulhw        m6, m4        ; xmm6 = ( c4 - 1 ) * ( i0 - i4 )
    paddsw        m3, m3        ; xmm3 = i4 + i4
    paddsw        m1, m1        ; xmm1 = H + H
    paddsw        m3, m0        ; xmm3 = i0 + i4
    paddsw        m1, m5        ; xmm1 = B. + H = H.
    pmulhw        m4, m3        ; xmm4 = ( c4 - 1 ) * ( i0 + i4 )
    paddw         m6, m0        ; xmm6 = c4 * ( i0 - i4 ) = F
    psubsw        m6, m2        ; xmm6 = F - A. = F.
    paddsw        m2, m2        ; xmm2 = A. + A.
    movdqa        m0, I(1)      ; load C. from I(1)
    paddsw        m2, m6        ; xmm2 = F + A. = A..
    paddw         m4, m3        ; xmm4 = c4 * ( i0 + i4 ) = E
    psubsw        m2, m1        ; xmm2 = A.. - H. = R2
    ADD(m2)                     ; adjust R2 and R1 before shifting
    paddsw        m1, m1        ; xmm1 = H. + H.
    paddsw        m1, m2        ; xmm1 = A.. + H. = R1
    SHIFT(m2)                   ; xmm2 = op2
    psubsw        m4, m7        ; xmm4 = E - G = E.
    SHIFT(m1)                   ; xmm1 = op1
    movdqa        m3, I(2)      ; load D. from I(2)
    paddsw        m7, m7        ; xmm7 = G + G
    paddsw        m7, m4        ; xmm7 = E + G = G.
    psubsw        m4, m3        ; xmm4 = E. - D. = R4
    ADD(m4)                     ; adjust R4 and R3 before shifting
    paddsw        m3, m3        ; xmm3 = D. + D.
    paddsw        m3, m4        ; xmm3 = E. + D. = R3
    SHIFT(m4)                   ; xmm4 = op4
    psubsw        m6, m5        ; xmm6 = F. - B.. = R6
    SHIFT(m3)                   ; xmm3 = op3
    ADD(m6)                     ; adjust R6 and R5 before shifting
    paddsw        m5, m5        ; xmm5 = B.. + B..
    paddsw        m5, m6        ; xmm5 = F. + B.. = R5
    SHIFT(m6)                   ; xmm6 = op6
    SHIFT(m5)                   ; xmm5 = op5
    psubsw        m7, m0        ; xmm7 = G. - C. = R7
    ADD(m7)                     ; adjust R7 and R0 before shifting
    paddsw        m0, m0        ; xmm0 = C. + C.
    paddsw        m0, m7        ; xmm0 = G. + C. = R0
    SHIFT(m7)                   ; xmm7 = op7
    SHIFT(m0)                   ; xmm0 = op0
%endmacro
| 460 | |
; Store the eight result registers %1..%8 to the output rows O(0)..O(7).
; Expands to the same eight movdqa stores as writing them out longhand.
%macro PUT_BLOCK 8
%assign %%row 0
%rep 8
    movdqa O(%%row), m%1        ; row %%row <- next register argument
%assign %%row %%row+1
%rotate 1
%endrep
%endmacro
| 471 | |
; Full 2-D 8x8 IDCT, in place, on the 16-bit coefficient block at %1.
; SSE2 (mmsize == 16): two 1-D passes over whole rows with an 8x8 word
; transpose in between; only the second pass rounds (+8) and shifts (>>4).
; MMX (mmsize == 8): the rows are processed as two 4x8 halves with
; RowIDCT + Transpose, then the columns in two halves with ColumnIDCT.
%macro VP3_IDCT 1
%if mmsize == 16
%define I(x) [%1+16*x]
%define O(x) [%1+16*x]
%define C(x) [vp3_idct_data+16*(x-1)]
%define SHIFT(x)                ; first pass: no normalization
%define ADD(x)
    VP3_1D_IDCT_SSE2
%if ARCH_X86_64
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%1], [%1+16]
%endif
    PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7

%define SHIFT(x) psraw x, 4     ; second pass: normalize with (x + 8) >> 4
%define ADD(x)   paddsw x, [pw_8]
    VP3_1D_IDCT_SSE2
    PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7
%else ; mmsize == 8
    ; (The register notes from the original VP3 source referred to its own
    ;  eax/ebx/ecx/edx layout and do not apply here; %1 is the coefficient
    ;  block pointer and "rN" in the macro comments means mmN.)
%define OC_8 [pw_8]
%define C(x) [vp3_idct_data+16*(x-1)]

    ; at this point, the caller has completed dequantization + dezigzag +
    ; partial transposition; now do the idct itself.
    ; Left 4x8 half (low 8 bytes of each row):
%define I(x) [%1+16*x]
%define J(x) [%1+16*x]
    RowIDCT
    Transpose

    ; Right 4x8 half (high 8 bytes of each row):
%define I(x) [%1+16*x+8]
%define J(x) [%1+16*x+8]
    RowIDCT
    Transpose

    ; Column passes over the two halves of the transposed data:
%define I(x) [%1+16* x]
%define J(x) [%1+16*(x-4)+8]
    ColumnIDCT

%define I(x) [%1+16* x +64]
%define J(x) [%1+16*(x-4)+72]
    ColumnIDCT
%endif ; mmsize == 16/8
%endmacro
| 523 | |
; Instantiates vp3_idct_put and vp3_idct_add for the currently selected
; instruction set (INIT_MMX / INIT_XMM must be active).
%macro vp3_idct_funcs 0
;------------------------------------------------------------------------------
; vp3_idct_put(uint8_t *dest, ptrdiff_t stride, int16_t *block)
; Full IDCT of block, results stored to dest as unsigned bytes, then the
; coefficient block is cleared.  r0 = dest, r1 = stride, r2 = block.
;------------------------------------------------------------------------------
cglobal vp3_idct_put, 3, 4, 9
    VP3_IDCT      r2

    movsxdifnidn  r1, r1d
    mova          m4, [pb_80]   ; +128 bias: signed result -> unsigned pixel
    lea           r3, [r1*3]
%assign %%i 0
%rep 16/mmsize
    ; pack pairs of rows of 16-bit results down to signed bytes
    mova          m0, [r2+mmsize*0+%%i]
    mova          m1, [r2+mmsize*2+%%i]
    mova          m2, [r2+mmsize*4+%%i]
    mova          m3, [r2+mmsize*6+%%i]
%if mmsize == 8
    packsswb      m0, [r2+mmsize*8+%%i]
    packsswb      m1, [r2+mmsize*10+%%i]
    packsswb      m2, [r2+mmsize*12+%%i]
    packsswb      m3, [r2+mmsize*14+%%i]
%else
    packsswb      m0, [r2+mmsize*1+%%i]
    packsswb      m1, [r2+mmsize*3+%%i]
    packsswb      m2, [r2+mmsize*5+%%i]
    packsswb      m3, [r2+mmsize*7+%%i]
%endif
    paddb         m0, m4
    paddb         m1, m4
    paddb         m2, m4
    paddb         m3, m4
    movq   [r0     ], m0
%if mmsize == 8
    movq   [r0+r1  ], m1
    movq   [r0+r1*2], m2
    movq   [r0+r3  ], m3
%else
    movhps [r0+r1  ], m0
    movq   [r0+r1*2], m1
    movhps [r0+r3  ], m1
%endif
%if %%i == 0
    lea           r0, [r0+r1*4] ; advance to rows 4-7
%endif
%if mmsize == 16
    movq   [r0     ], m2
    movhps [r0+r1  ], m2
    movq   [r0+r1*2], m3
    movhps [r0+r3  ], m3
%endif
%assign %%i %%i+8
%endrep

    ; clear the whole 128-byte coefficient block
    pxor          m0, m0
%assign %%offset 0
%rep 128/mmsize
    mova [r2+%%offset], m0
%assign %%offset %%offset+mmsize
%endrep
    RET

;------------------------------------------------------------------------------
; vp3_idct_add(uint8_t *dest, ptrdiff_t stride, int16_t *block)
; Full IDCT of block, results added to the existing pixels in dest with
; unsigned saturation, then the coefficient block is cleared.
; r0 = dest, r1 = stride, r2 = block.
;------------------------------------------------------------------------------
cglobal vp3_idct_add, 3, 4, 9
    VP3_IDCT      r2

    movsxdifnidn  r1, r1d
    lea           r3, [r1*3]
    pxor          m4, m4        ; zero: used for unpacking and final clearing
%if mmsize == 16
%assign %%i 0
%rep 2
    ; 4 rows per iteration: widen pixels, add results, clamp, repack
    movq          m0, [r0]
    movq          m1, [r0+r1]
    movq          m2, [r0+r1*2]
    movq          m3, [r0+r3]
    punpcklbw     m0, m4
    punpcklbw     m1, m4
    punpcklbw     m2, m4
    punpcklbw     m3, m4
    paddsw        m0, [r2+ 0+%%i]
    paddsw        m1, [r2+16+%%i]
    paddsw        m2, [r2+32+%%i]
    paddsw        m3, [r2+48+%%i]
    packuswb      m0, m1        ; clamp to [0,255]
    packuswb      m2, m3
    movq   [r0     ], m0
    movhps [r0+r1  ], m0
    movq   [r0+r1*2], m2
    movhps [r0+r3  ], m2
%if %%i == 0
    lea           r0, [r0+r1*4]
%endif
%assign %%i %%i+64
%endrep
%else
%assign %%i 0
%rep 2
    ; 4 rows per iteration; the MMX IDCT left the right-half results at
    ; offset +64, hence the split low/high coefficient loads
    movq          m0, [r0]
    movq          m1, [r0+r1]
    movq          m2, [r0+r1*2]
    movq          m3, [r0+r3]
    movq          m5, m0
    movq          m6, m1
    movq          m7, m2
    punpcklbw     m0, m4        ; left 4 pixels of each row
    punpcklbw     m1, m4
    punpcklbw     m2, m4
    punpckhbw     m5, m4        ; right 4 pixels of each row
    punpckhbw     m6, m4
    punpckhbw     m7, m4
    paddsw        m0, [r2+ 0+%%i]
    paddsw        m1, [r2+16+%%i]
    paddsw        m2, [r2+32+%%i]
    paddsw        m5, [r2+64+%%i]
    paddsw        m6, [r2+80+%%i]
    paddsw        m7, [r2+96+%%i]
    packuswb      m0, m5        ; clamp to [0,255] and repack each row
    movq          m5, m3        ; row 3 processed last (register pressure)
    punpcklbw     m3, m4
    punpckhbw     m5, m4
    packuswb      m1, m6
    paddsw        m3, [r2+48+%%i]
    paddsw        m5, [r2+112+%%i]
    packuswb      m2, m7
    packuswb      m3, m5
    movq   [r0     ], m0
    movq   [r0+r1  ], m1
    movq   [r0+r1*2], m2
    movq   [r0+r3  ], m3
%if %%i == 0
    lea           r0, [r0+r1*4]
%endif
%assign %%i %%i+8
%endrep
%endif
    ; clear the whole 128-byte coefficient block
%assign %%i 0
%rep 128/mmsize
    mova [r2+%%i], m4
%assign %%i %%i+mmsize
%endrep
    RET
%endmacro
| 662 | |
; The MMX variant is only needed on x86-32; x86-64 guarantees SSE2.
%if ARCH_X86_32
INIT_MMX mmx
vp3_idct_funcs
%endif

INIT_XMM sse2
vp3_idct_funcs
| 670 | |
; Add a broadcast DC value to a 4-row strip of 8 pixels.
; m0 = max(dc, 0) in every byte, m1 = max(-dc, 0) in every byte, so the
; paddusb/psubusb pair implements a signed add clamped to [0,255].
; r0 = dest, r1 = stride, r2 = 3*stride; clobbers m2-m5.
%macro DC_ADD 0
    movq          m2, [r0     ]
    movq          m3, [r0+r1  ]
    paddusb       m2, m0
    movq          m4, [r0+r1*2]
    paddusb       m3, m0
    movq          m5, [r0+r2  ]
    paddusb       m4, m0
    paddusb       m5, m0
    psubusb       m2, m1
    psubusb       m3, m1
    movq   [r0     ], m2
    psubusb       m4, m1
    movq   [r0+r1  ], m3
    psubusb       m5, m1
    movq   [r0+r1*2], m4
    movq   [r0+r2  ], m5
%endmacro
| 689 | |
INIT_MMX mmxext
;------------------------------------------------------------------------------
; vp3_idct_dc_add(uint8_t *dest, ptrdiff_t stride, int16_t *block)
; DC-only inverse transform: dc = (block[0] + 15) >> 5 is added to all 8x8
; pixels of dest with saturation, and block[0] is cleared.
; Note r2 (block) is reused as 3*stride once the DC has been read.
;------------------------------------------------------------------------------
cglobal vp3_idct_dc_add, 3, 4
%if ARCH_X86_64
    movsxd        r1, r1d       ; stride arrives as 32-bit; sign-extend
%endif
    movsx         r3, word [r2] ; r3 = block[0]
    mov     word [r2], 0        ; clear the DC coefficient
    lea           r2, [r1*3]    ; r2 = 3*stride (block pointer no longer needed)
    add           r3, 15
    sar           r3, 5         ; dc = (block[0] + 15) >> 5
    movd          m0, r3d
    pshufw        m0, m0, 0x0   ; broadcast dc to all 4 words
    pxor          m1, m1
    psubw         m1, m0        ; m1 = -dc
    packuswb      m0, m0        ; m0 = clamp( dc, 0, 255) in every byte
    packuswb      m1, m1        ; m1 = clamp(-dc, 0, 255) in every byte
    DC_ADD                      ; rows 0-3
    lea           r0, [r0+r1*4]
    DC_ADD                      ; rows 4-7
    RET