| 1 | ;***************************************************************************** |
| 2 | ;* MMX/SSE2/SSSE3-optimized H.264 QPEL code |
| 3 | ;***************************************************************************** |
| 4 | ;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt |
| 5 | ;* Copyright (C) 2012 Daniel Kang |
| 6 | ;* |
| 7 | ;* Authors: Daniel Kang <daniel.d.kang@gmail.com> |
| 8 | ;* |
| 9 | ;* This file is part of FFmpeg. |
| 10 | ;* |
| 11 | ;* FFmpeg is free software; you can redistribute it and/or |
| 12 | ;* modify it under the terms of the GNU Lesser General Public |
| 13 | ;* License as published by the Free Software Foundation; either |
| 14 | ;* version 2.1 of the License, or (at your option) any later version. |
| 15 | ;* |
| 16 | ;* FFmpeg is distributed in the hope that it will be useful, |
| 17 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 18 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 19 | ;* Lesser General Public License for more details. |
| 20 | ;* |
| 21 | ;* You should have received a copy of the GNU Lesser General Public |
| 22 | ;* License along with FFmpeg; if not, write to the Free Software |
| 23 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| 24 | ;****************************************************************************** |
| 25 | |
| 26 | %include "libavutil/x86/x86util.asm" |
| 27 | |
| 28 | SECTION_RODATA 32 |
| 29 | |
| 30 | cextern pw_16 |
| 31 | cextern pw_5 |
| 32 | cextern pb_0 |
| 33 | |
| 34 | SECTION .text |
| 35 | |
| 36 | |
; op_avgh %1=result reg, %2=dst memory, %3=scratch reg
; "avg" store for half-register (movh-sized) data: byte-wise average %1
; with the bytes already at %2 (loaded into %3), write the result back.
%macro op_avgh 3
movh %3, %2
pavgb %1, %3
movh %2, %1
%endmacro
| 42 | |
; op_avg %1=result reg, %2=dst memory (optional %3 unused; kept so the
; call sites can pass a scratch reg uniformly with op_avgh)
; "avg" store for a full register: byte-wise average with memory, write back.
%macro op_avg 2-3
pavgb %1, %2
mova %2, %1
%endmacro
| 47 | |
; op_puth %1=result reg, %2=dst memory (optional %3 unused)
; "put" store of the low half (movh-sized part) of %1 to %2.
%macro op_puth 2-3
movh %2, %1
%endmacro
| 51 | |
; op_put %1=result reg, %2=dst memory (optional %3 unused)
; "put" store of a full register to %2.
%macro op_put 2-3
mova %2, %1
%endmacro
| 55 | |
; void %1_h264_qpel4_h_lowpass(uint8_t *dst, const uint8_t *src,
;                              int dstStride, int srcStride)
; Horizontal 6-tap qpel lowpass over a 4x4 block, one row per iteration:
;   out[i] = clip_u8((20*(s[0]+s[1]) - 5*(s[-1]+s[2]) + s[-2]+s[3] + 16) >> 5)
; %1 selects the store op: "put" overwrites dst, "avg" averages with dst.
%macro QPEL4_H_LOWPASS_OP 1
cglobal %1_h264_qpel4_h_lowpass, 4,5 ; dst, src, dstStride, srcStride
movsxdifnidn r2, r2d
movsxdifnidn r3, r3d
pxor m7, m7 ; m7 = 0, for byte->word unpack
mova m4, [pw_5] ; filter constant 5
mova m5, [pw_16] ; rounding constant 16
mov r4d, 4 ; row counter
.loop:
movh m1, [r1-1]
movh m2, [r1+0]
movh m3, [r1+1]
movh m0, [r1+2]
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
punpcklbw m0, m7
paddw m1, m0 ; m1 = s[-1]+s[2]
paddw m2, m3 ; m2 = s[0]+s[1]
movh m0, [r1-2]
movh m3, [r1+3]
punpcklbw m0, m7
punpcklbw m3, m7
paddw m0, m3 ; m0 = s[-2]+s[3]
psllw m2, 2 ; 4*(s[0]+s[1])
psubw m2, m1 ; 4*(s0+s1) - (s-1+s2)
pmullw m2, m4 ; *5 -> 20*(s0+s1) - 5*(s-1+s2)
paddw m0, m5 ; + 16 (rounding)
paddw m0, m2
psraw m0, 5
packuswb m0, m0 ; clip to u8
op_%1h m0, [r0], m6
add r0, r2
add r1, r3
dec r4d
jg .loop
REP_RET
%endmacro

INIT_MMX mmxext
QPEL4_H_LOWPASS_OP put
QPEL4_H_LOWPASS_OP avg
| 98 | |
; void %1_h264_qpel8_h_lowpass(uint8_t *dst, const uint8_t *src,
;                              int dstStride, int srcStride)
; Horizontal 6-tap qpel lowpass over an 8x8 block (MMX: the 8 source bytes
; are split into low/high word halves; m0 handles pixels 0-3, m1 pixels 4-7).
;   out[i] = clip_u8((20*(s[0]+s[1]) - 5*(s[-1]+s[2]) + s[-2]+s[3] + 16) >> 5)
%macro QPEL8_H_LOWPASS_OP 1
cglobal %1_h264_qpel8_h_lowpass, 4,5 ; dst, src, dstStride, srcStride
movsxdifnidn r2, r2d
movsxdifnidn r3, r3d
mov r4d, 8 ; row counter
pxor m7, m7 ; m7 = 0, for unpacking
mova m6, [pw_5] ; filter constant 5
.loop:
mova m0, [r1] ; s[0..7]
mova m2, [r1+1] ; s[1..8]
mova m1, m0
mova m3, m2
punpcklbw m0, m7 ; low:  s[0..3]
punpckhbw m1, m7 ; high: s[4..7]
punpcklbw m2, m7 ; low:  s[1..4]
punpckhbw m3, m7 ; high: s[5..8]
paddw m0, m2 ; s[0]+s[1] (low half)
paddw m1, m3 ; s[0]+s[1] (high half)
psllw m0, 2 ; *4
psllw m1, 2
mova m2, [r1-1] ; s[-1..6]
mova m4, [r1+2] ; s[2..9]
mova m3, m2
mova m5, m4
punpcklbw m2, m7 ; s[-1..2]
punpckhbw m3, m7 ; s[3..6]
punpcklbw m4, m7 ; s[2..5]
punpckhbw m5, m7 ; s[6..9]
paddw m2, m4 ; s[-1]+s[2] (low half)
paddw m5, m3 ; s[-1]+s[2] (high half)
psubw m0, m2 ; 4*(s0+s1) - (s-1+s2)
psubw m1, m5
pmullw m0, m6 ; *5 -> 20*(s0+s1) - 5*(s-1+s2)
pmullw m1, m6
movd m2, [r1-2] ; s[-2..1]
movd m5, [r1+7] ; s[7..10]
punpcklbw m2, m7
punpcklbw m5, m7
paddw m2, m3 ; s[-2]+s[3] (low half: m3 still holds s[3..6])
paddw m4, m5 ; s[-2]+s[3] (high half: m4 still holds s[2..5])
mova m5, [pw_16]
paddw m2, m5 ; + 16 (rounding)
paddw m4, m5
paddw m0, m2
paddw m1, m4
psraw m0, 5
psraw m1, 5
packuswb m0, m1 ; clip both halves, repack to 8 bytes
op_%1 m0, [r0], m4
add r0, r2
add r1, r3
dec r4d
jg .loop
REP_RET
%endmacro

INIT_MMX mmxext
QPEL8_H_LOWPASS_OP put
QPEL8_H_LOWPASS_OP avg
| 158 | |
; void %1_h264_qpel8_h_lowpass(uint8_t *dst, const uint8_t *src,
;                              int dstStride, int srcStride)   [SSSE3]
; Same 6-tap horizontal filter as the MMX version, but loads one unaligned
; 16-byte block src[-2..13], unpacks to words (m0 = s[-2..5], m1 = s[6..13]),
; and uses palignr to build each shifted tap vector:
;   palignr x, m0, 2*k  ->  words s[-2+k .. 5+k]
%macro QPEL8_H_LOWPASS_OP_XMM 1
cglobal %1_h264_qpel8_h_lowpass, 4,5,8 ; dst, src, dstStride, srcStride
movsxdifnidn r2, r2d
movsxdifnidn r3, r3d
mov r4d, 8 ; row counter
pxor m7, m7 ; m7 = 0, for unpacking
mova m6, [pw_5] ; filter constant 5
.loop:
movu m1, [r1-2] ; unaligned load s[-2..13]
mova m0, m1
punpckhbw m1, m7 ; m1 = words s[6..13]
punpcklbw m0, m7 ; m0 = words s[-2..5]
mova m2, m1
mova m3, m1
mova m4, m1
mova m5, m1
palignr m4, m0, 2 ; m4 = s[-1..6]
palignr m3, m0, 4 ; m3 = s[0..7]
palignr m2, m0, 6 ; m2 = s[1..8]
palignr m1, m0, 8 ; m1 = s[2..9]
palignr m5, m0, 10 ; m5 = s[3..10]
paddw m0, m5 ; s[-2]+s[3]
paddw m2, m3 ; s[0]+s[1]
paddw m1, m4 ; s[-1]+s[2]
psllw m2, 2 ; *4
psubw m2, m1
paddw m0, [pw_16] ; + 16 (rounding)
pmullw m2, m6 ; 20*(s0+s1) - 5*(s-1+s2)
paddw m2, m0
psraw m2, 5
packuswb m2, m2 ; clip to u8
op_%1h m2, [r0], m4
add r1, r3
add r0, r2
dec r4d
jne .loop
REP_RET
%endmacro

INIT_XMM ssse3
QPEL8_H_LOWPASS_OP_XMM put
QPEL8_H_LOWPASS_OP_XMM avg
| 201 | |
| 202 | |
; void %1_h264_qpel4_h_lowpass_l2(uint8_t *dst, const uint8_t *src,
;                                 const uint8_t *src2, int dstStride,
;                                 int src2Stride)
; Horizontal 6-tap lowpass (same filter as qpel4_h_lowpass) averaged with a
; second prediction src2 (pavgb) before the %1 (put/avg) store.
; Note: r3 strides both dst and src — dstStride and srcStride are assumed
; equal for the _l2 variants; r4 is the separate src2 stride.
%macro QPEL4_H_LOWPASS_L2_OP 1
cglobal %1_h264_qpel4_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, src2Stride
movsxdifnidn r3, r3d
movsxdifnidn r4, r4d
pxor m7, m7 ; m7 = 0, for unpacking
mova m4, [pw_5] ; filter constant 5
mova m5, [pw_16] ; rounding constant 16
mov r5d, 4 ; row counter
.loop:
movh m1, [r1-1]
movh m2, [r1+0]
movh m3, [r1+1]
movh m0, [r1+2]
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
punpcklbw m0, m7
paddw m1, m0 ; s[-1]+s[2]
paddw m2, m3 ; s[0]+s[1]
movh m0, [r1-2]
movh m3, [r1+3]
punpcklbw m0, m7
punpcklbw m3, m7
paddw m0, m3 ; s[-2]+s[3]
psllw m2, 2
psubw m2, m1
pmullw m2, m4 ; 20*(s0+s1) - 5*(s-1+s2)
paddw m0, m5 ; + 16
paddw m0, m2
movh m3, [r2] ; second prediction row
psraw m0, 5
packuswb m0, m0
pavgb m0, m3 ; average filter output with src2
op_%1h m0, [r0], m6
add r0, r3
add r1, r3
add r2, r4
dec r5d
jg .loop
REP_RET
%endmacro

INIT_MMX mmxext
QPEL4_H_LOWPASS_L2_OP put
QPEL4_H_LOWPASS_L2_OP avg
| 248 | |
| 249 | |
; void %1_h264_qpel8_h_lowpass_l2(uint8_t *dst, const uint8_t *src,
;                                 const uint8_t *src2, int dstStride,
;                                 int src2Stride)
; 8-wide horizontal 6-tap lowpass (low/high word halves as in
; qpel8_h_lowpass) averaged with src2 before the %1 store.
; r3 strides both dst and src (assumed equal); r4 is the src2 stride.
%macro QPEL8_H_LOWPASS_L2_OP 1
cglobal %1_h264_qpel8_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, src2Stride
movsxdifnidn r3, r3d
movsxdifnidn r4, r4d
mov r5d, 8 ; row counter
pxor m7, m7 ; m7 = 0, for unpacking
mova m6, [pw_5] ; filter constant 5
.loop:
mova m0, [r1] ; s[0..7]
mova m2, [r1+1] ; s[1..8]
mova m1, m0
mova m3, m2
punpcklbw m0, m7
punpckhbw m1, m7
punpcklbw m2, m7
punpckhbw m3, m7
paddw m0, m2 ; s[0]+s[1] (low half)
paddw m1, m3 ; s[0]+s[1] (high half)
psllw m0, 2
psllw m1, 2
mova m2, [r1-1] ; s[-1..6]
mova m4, [r1+2] ; s[2..9]
mova m3, m2
mova m5, m4
punpcklbw m2, m7
punpckhbw m3, m7
punpcklbw m4, m7
punpckhbw m5, m7
paddw m2, m4 ; s[-1]+s[2] (low half)
paddw m5, m3 ; s[-1]+s[2] (high half)
psubw m0, m2
psubw m1, m5
pmullw m0, m6 ; 20*(s0+s1) - 5*(s-1+s2)
pmullw m1, m6
movd m2, [r1-2] ; s[-2..1]
movd m5, [r1+7] ; s[7..10]
punpcklbw m2, m7
punpcklbw m5, m7
paddw m2, m3 ; s[-2]+s[3] (low half)
paddw m4, m5 ; s[-2]+s[3] (high half)
mova m5, [pw_16]
paddw m2, m5 ; + 16
paddw m4, m5
paddw m0, m2
paddw m1, m4
psraw m0, 5
psraw m1, 5
mova m4, [r2] ; second prediction row
packuswb m0, m1
pavgb m0, m4 ; average with src2
op_%1 m0, [r0], m4
add r0, r3
add r1, r3
add r2, r4
dec r5d
jg .loop
REP_RET
%endmacro

INIT_MMX mmxext
QPEL8_H_LOWPASS_L2_OP put
QPEL8_H_LOWPASS_L2_OP avg
| 312 | |
| 313 | |
; void %1_h264_qpel8_h_lowpass_l2(uint8_t *dst, const uint8_t *src,
;                                 const uint8_t *src2, int dstStride,
;                                 int src2Stride)   [SSSE3]
; palignr-based variant of the 8-wide l2 filter (tap construction as in
; QPEL8_H_LOWPASS_OP_XMM), averaged with src2 before the %1 store.
; r3 strides both dst and src (assumed equal); r4 is the src2 stride.
%macro QPEL8_H_LOWPASS_L2_OP_XMM 1
cglobal %1_h264_qpel8_h_lowpass_l2, 5,6,8 ; dst, src, src2, dstStride, src2Stride
movsxdifnidn r3, r3d
movsxdifnidn r4, r4d
mov r5d, 8 ; row counter
pxor m7, m7 ; m7 = 0, for unpacking
mova m6, [pw_5] ; filter constant 5
.loop:
lddqu m1, [r1-2] ; unaligned load s[-2..13]
mova m0, m1
punpckhbw m1, m7 ; m1 = words s[6..13]
punpcklbw m0, m7 ; m0 = words s[-2..5]
mova m2, m1
mova m3, m1
mova m4, m1
mova m5, m1
palignr m4, m0, 2 ; s[-1..6]
palignr m3, m0, 4 ; s[0..7]
palignr m2, m0, 6 ; s[1..8]
palignr m1, m0, 8 ; s[2..9]
palignr m5, m0, 10 ; s[3..10]
paddw m0, m5 ; s[-2]+s[3]
paddw m2, m3 ; s[0]+s[1]
paddw m1, m4 ; s[-1]+s[2]
psllw m2, 2
movh m3, [r2] ; second prediction row
psubw m2, m1
paddw m0, [pw_16] ; + 16
pmullw m2, m6 ; 20*(s0+s1) - 5*(s-1+s2)
paddw m2, m0
psraw m2, 5
packuswb m2, m2
pavgb m2, m3 ; average with src2
op_%1h m2, [r0], m4
add r1, r3
add r0, r3
add r2, r4
dec r5d
jg .loop
REP_RET
%endmacro

INIT_XMM ssse3
QPEL8_H_LOWPASS_L2_OP_XMM put
QPEL8_H_LOWPASS_L2_OP_XMM avg
| 359 | |
| 360 | |
| 361 | ; All functions that call this are required to have function arguments of |
| 362 | ; dst, src, dstStride, srcStride |
| 363 | %macro FILT_V 1 |
| 364 | mova m6, m2 |
| 365 | movh m5, [r1] |
| 366 | paddw m6, m3 |
| 367 | psllw m6, 2 |
| 368 | psubw m6, m1 |
| 369 | psubw m6, m4 |
| 370 | punpcklbw m5, m7 |
| 371 | pmullw m6, [pw_5] |
| 372 | paddw m0, [pw_16] |
| 373 | add r1, r3 |
| 374 | paddw m0, m5 |
| 375 | paddw m6, m0 |
| 376 | psraw m6, 5 |
| 377 | packuswb m6, m6 |
| 378 | op_%1h m6, [r0], m0 ; 1 |
| 379 | add r0, r2 |
| 380 | SWAP 0, 1, 2, 3, 4, 5 |
| 381 | %endmacro |
| 382 | |
; void %1_h264_qpel4_v_lowpass(uint8_t *dst, const uint8_t *src,
;                              int dstStride, int srcStride)
; Vertical 6-tap qpel lowpass over a 4x4 block: rewinds src by 2 rows,
; primes m0..m4 with word-expanded rows -2..+2, then emits the 4 output
; rows via FILT_V (which loads one additional row per call).
%macro QPEL4_V_LOWPASS_OP 1
cglobal %1_h264_qpel4_v_lowpass, 4,4 ; dst, src, dstStride, srcStride
movsxdifnidn r2, r2d
movsxdifnidn r3, r3d
sub r1, r3
sub r1, r3 ; src -= 2*srcStride (start at row -2)
pxor m7, m7 ; m7 = 0, for unpacking
movh m0, [r1] ; row -2
movh m1, [r1+r3] ; row -1
lea r1, [r1+2*r3]
movh m2, [r1] ; row 0
movh m3, [r1+r3] ; row +1
lea r1, [r1+2*r3]
movh m4, [r1] ; row +2
add r1, r3
punpcklbw m0, m7
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
punpcklbw m4, m7
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
RET
%endmacro

INIT_MMX mmxext
QPEL4_V_LOWPASS_OP put
QPEL4_V_LOWPASS_OP avg
| 413 | |
| 414 | |
| 415 | |
; void %1_h264_qpel8or16_v_lowpass(uint8_t *dst, const uint8_t *src,
;                                  int dstStride, int srcStride, int h)
; Vertical 6-tap lowpass, 8 pixels wide per call (movh = 8 bytes with
; INIT_XMM), for h = 8 or 16 rows. The SSE2 entry rewinds src by 2 rows
; itself; the mmxext entry is named *_op and omits the rewind —
; NOTE(review): its caller presumably pre-adjusts src; confirm against
; the C wrapper.
%macro QPEL8OR16_V_LOWPASS_OP 1
%if cpuflag(sse2)
cglobal %1_h264_qpel8or16_v_lowpass, 5,5,8 ; dst, src, dstStride, srcStride, h
movsxdifnidn r2, r2d
movsxdifnidn r3, r3d
sub r1, r3
sub r1, r3 ; src -= 2*srcStride (start at row -2)
%else
cglobal %1_h264_qpel8or16_v_lowpass_op, 5,5,8 ; dst, src, dstStride, srcStride, h
movsxdifnidn r2, r2d
movsxdifnidn r3, r3d
%endif
pxor m7, m7 ; m7 = 0, for unpacking
movh m0, [r1] ; prime the 5-row window: rows -2..+2
movh m1, [r1+r3]
lea r1, [r1+2*r3]
movh m2, [r1]
movh m3, [r1+r3]
lea r1, [r1+2*r3]
movh m4, [r1]
add r1, r3
punpcklbw m0, m7
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
punpcklbw m4, m7
FILT_V %1 ; rows 0..7
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
cmp r4d, 16
jne .end ; h == 8: done
FILT_V %1 ; rows 8..15
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
.end:
REP_RET
%endmacro

INIT_MMX mmxext
QPEL8OR16_V_LOWPASS_OP put
QPEL8OR16_V_LOWPASS_OP avg

INIT_XMM sse2
QPEL8OR16_V_LOWPASS_OP put
QPEL8OR16_V_LOWPASS_OP avg
| 471 | |
| 472 | |
| 473 | ; All functions that use this are required to have args: |
| 474 | ; src, tmp, srcSize |
; All functions that use this are required to have args:
; src, tmp, srcStride  (r0, r1, r2)
;
; FILT_HV %1=byte offset into tmp
; Vertical-pass step of the HV (center) filter. Like FILT_V, but the
; source is read from r0 and the result is NOT shifted/packed: the full
; 16-bit value 20*(m2+m3) - 5*(m1+m4) + m0+next + 16 is stored to
; [r1+%1] so the horizontal pass can work at intermediate precision.
; Rotates the 5-row register window via SWAP on exit.
%macro FILT_HV 1 ; offset
mova m6, m2
movh m5, [r0] ; load next source row
paddw m6, m3 ; center taps: m2+m3
psllw m6, 2 ; *4
paddw m0, [pw_16] ; + 16 (rounding, carried into tmp)
psubw m6, m1
psubw m6, m4 ; 4*(m2+m3) - (m1+m4)
punpcklbw m5, m7
pmullw m6, [pw_5] ; *5 -> 20*(m2+m3) - 5*(m1+m4)
paddw m0, m5
add r0, r2
paddw m6, m0
mova [r1+%1], m6 ; store 16-bit intermediate row
SWAP 0, 1, 2, 3, 4, 5 ; rotate the 5-row window
%endmacro
| 491 | |
; Two-pass 4x4 HV (center position) qpel filter.
;
; void %1_h264_qpel4_hv_lowpass_v(const uint8_t *src, int16_t *tmp,
;                                 int srcStride)
; Pass 1: vertical 6-tap filter into tmp as 16-bit words, one row per
; FILT_HV call, tmp rows 24 bytes apart.
%macro QPEL4_HV1_LOWPASS_OP 1
cglobal %1_h264_qpel4_hv_lowpass_v, 3,3 ; src, tmp, srcStride
movsxdifnidn r2, r2d
pxor m7, m7 ; m7 = 0, for unpacking
movh m0, [r0] ; prime rows -2..+2
movh m1, [r0+r2]
lea r0, [r0+2*r2]
movh m2, [r0]
movh m3, [r0+r2]
lea r0, [r0+2*r2]
movh m4, [r0]
add r0, r2
punpcklbw m0, m7
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
punpcklbw m4, m7
FILT_HV 0*24
FILT_HV 1*24
FILT_HV 2*24
FILT_HV 3*24
RET

; void %1_h264_qpel4_hv_lowpass_h(int16_t *tmp, uint8_t *dst,
;                                 int dstStride)
; Pass 2: horizontal 6-tap over the word intermediates. With
; a=t[i], b=t[i+1], c=t[i+2], d=t[i+3], e=t[i+4], f=t[i+5] this computes
; approximately (a+f - 5*(b+e) + 20*(c+d)) >> 10, staged as
; >>2, >>2, >>6 so the sums stay within 16-bit range (paddsw saturates
; the one addition that could overflow).
cglobal %1_h264_qpel4_hv_lowpass_h, 3,4 ; tmp, dst, dstStride
movsxdifnidn r2, r2d
mov r3d, 4 ; row counter
.loop:
mova m0, [r0]
paddw m0, [r0+10] ; a+f = t[i]+t[i+5]
mova m1, [r0+2]
paddw m1, [r0+8] ; b+e = t[i+1]+t[i+4]
mova m2, [r0+4]
paddw m2, [r0+6] ; c+d = t[i+2]+t[i+3]
psubw m0, m1
psraw m0, 2
psubw m0, m1
paddsw m0, m2 ; saturating: intermediate may overflow
psraw m0, 2
paddw m0, m2
psraw m0, 6
packuswb m0, m0 ; clip to u8
op_%1h m0, [r1], m7
add r0, 24 ; next tmp row (24-byte stride)
add r1, r2
dec r3d
jnz .loop
REP_RET
%endmacro

INIT_MMX mmxext
QPEL4_HV1_LOWPASS_OP put
QPEL4_HV1_LOWPASS_OP avg
| 544 | |
; void %1_h264_qpel8or16_hv1_lowpass(const uint8_t *src, int16_t *tmp,
;                                    int srcStride, int size)
; Pass 1 of the 8/16-wide HV filter: vertical 6-tap into 16-bit tmp rows
; spaced 48 bytes apart, size (8 or 16) rows. mmxext entry is named *_op
; (wrapped from C). Does not rewind src —
; NOTE(review): caller presumably pre-adjusts src by -2*srcStride; confirm
; against the C wrapper.
%macro QPEL8OR16_HV1_LOWPASS_OP 1
cglobal %1_h264_qpel8or16_hv1_lowpass_op, 4,4,8 ; src, tmp, srcStride, size
movsxdifnidn r2, r2d
pxor m7, m7 ; m7 = 0, for unpacking
movh m0, [r0] ; prime the 5-row window
movh m1, [r0+r2]
lea r0, [r0+2*r2]
movh m2, [r0]
movh m3, [r0+r2]
lea r0, [r0+2*r2]
movh m4, [r0]
add r0, r2
punpcklbw m0, m7
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
punpcklbw m4, m7
FILT_HV 0*48 ; tmp rows 0..7 (48-byte stride)
FILT_HV 1*48
FILT_HV 2*48
FILT_HV 3*48
FILT_HV 4*48
FILT_HV 5*48
FILT_HV 6*48
FILT_HV 7*48
cmp r3d, 16
jne .end ; size == 8: done
FILT_HV 8*48 ; tmp rows 8..15
FILT_HV 9*48
FILT_HV 10*48
FILT_HV 11*48
FILT_HV 12*48
FILT_HV 13*48
FILT_HV 14*48
FILT_HV 15*48
.end:
REP_RET
%endmacro

INIT_MMX mmxext
QPEL8OR16_HV1_LOWPASS_OP put
QPEL8OR16_HV1_LOWPASS_OP avg

INIT_XMM sse2
QPEL8OR16_HV1_LOWPASS_OP put
| 590 | |
| 591 | |
| 592 | |
; void %1_h264_qpel8or16_hv2_lowpass(uint8_t *dst, const int16_t *tmp,
;                                    int dstStride, int unused, int h)
; Pass 2 of the HV filter (MMX): horizontal 6-tap over 16-bit tmp rows
; (48-byte stride), 8 output pixels per row, h rows. Uses the same staged
; >>2 / >>2 / >>6 arithmetic as qpel4_hv_lowpass_h; m0 handles output
; pixels 0-3 and m3 pixels 4-7.
%macro QPEL8OR16_HV2_LOWPASS_OP 1
; unused is to match ssse3 and mmxext args
cglobal %1_h264_qpel8or16_hv2_lowpass_op, 5,5 ; dst, tmp, dstStride, unused, h
movsxdifnidn r2, r2d
.loop:
mova m0, [r1] ; t[0..3]
mova m3, [r1+8] ; t[4..7]
mova m1, [r1+2] ; t[1..4]
mova m4, [r1+10] ; t[5..8]
paddw m0, m4 ; a+f, low half
paddw m1, m3 ; b+e, low half
paddw m3, [r1+18] ; a+f, high half (t[4..7]+t[9..12])
paddw m4, [r1+16] ; b+e, high half (t[5..8]+t[8..11])
mova m2, [r1+4] ; t[2..5]
mova m5, [r1+12] ; t[6..9]
paddw m2, [r1+6] ; c+d, low half
paddw m5, [r1+14] ; c+d, high half
psubw m0, m1
psubw m3, m4
psraw m0, 2
psraw m3, 2
psubw m0, m1
psubw m3, m4
paddsw m0, m2 ; saturating add (may overflow otherwise)
paddsw m3, m5
psraw m0, 2
psraw m3, 2
paddw m0, m2
paddw m3, m5
psraw m0, 6
psraw m3, 6
packuswb m0, m3 ; clip and pack the 8 output bytes
op_%1 m0, [r0], m7
add r1, 48 ; next tmp row
add r0, r2
dec r4d
jne .loop
REP_RET
%endmacro

INIT_MMX mmxext
QPEL8OR16_HV2_LOWPASS_OP put
QPEL8OR16_HV2_LOWPASS_OP avg
| 636 | |
; void %1_h264_qpel8or16_hv2_lowpass(uint8_t *dst, const int16_t *tmp,
;                                    int dstStride, int tmpStride, int size)
; Pass 2 of the HV filter (SSSE3): horizontal 6-tap over 16-bit tmp rows
; (48-byte stride). size selects the 8-wide (.loop8) or 16-wide (.op16)
; path; size also serves as the row count. palignr over the word data
; builds the shifted tap vectors; same staged >>2 / >>2 / >>6 arithmetic
; as the MMX version.
%macro QPEL8OR16_HV2_LOWPASS_OP_XMM 1
cglobal %1_h264_qpel8or16_hv2_lowpass, 5,5,8 ; dst, tmp, dstStride, tmpStride, size
movsxdifnidn r2, r2d
movsxdifnidn r3, r3d
cmp r4d, 16
je .op16
.loop8: ; 8 output pixels per row
mova m1, [r1+16] ; t[8..15]
mova m0, [r1] ; t[0..7]
mova m2, m1
mova m3, m1
mova m4, m1
mova m5, m1
palignr m5, m0, 10 ; t[5..12]
palignr m4, m0, 8 ; t[4..11]
palignr m3, m0, 6 ; t[3..10]
palignr m2, m0, 4 ; t[2..9]
palignr m1, m0, 2 ; t[1..8]
paddw m0, m5 ; a+f
paddw m1, m4 ; b+e
paddw m2, m3 ; c+d
psubw m0, m1
psraw m0, 2
psubw m0, m1
paddw m0, m2
psraw m0, 2
paddw m0, m2
psraw m0, 6
packuswb m0, m0 ; clip to u8
op_%1h m0, [r0], m7
add r1, 48 ; next tmp row
add r0, r2
dec r4d
jne .loop8
jmp .done
.op16: ; 16 output pixels per row (loops back to .op16)
mova m4, [r1+32] ; t[16..23]
mova m5, [r1+16] ; t[8..15]
mova m7, [r1] ; t[0..7]
mova m3, m4
mova m2, m4
mova m1, m4
mova m0, m4
palignr m0, m5, 10 ; t[13..20]  (high 8 outputs)
palignr m1, m5, 8 ; t[12..19]
palignr m2, m5, 6 ; t[11..18]
palignr m3, m5, 4 ; t[10..17]
palignr m4, m5, 2 ; t[9..16]
paddw m0, m5 ; a+f, high half
paddw m1, m4 ; b+e, high half
paddw m2, m3 ; c+d, high half
mova m6, m5
mova m4, m5
mova m3, m5
palignr m4, m7, 8 ; t[4..11]   (low 8 outputs)
palignr m6, m7, 2 ; t[1..8]
palignr m3, m7, 10 ; t[5..12]
paddw m4, m6 ; b+e, low half
mova m6, m5
palignr m5, m7, 6 ; t[3..10]
palignr m6, m7, 4 ; t[2..9]
paddw m3, m7 ; a+f, low half
paddw m5, m6 ; c+d, low half
psubw m0, m1
psubw m3, m4
psraw m0, 2
psraw m3, 2
psubw m0, m1
psubw m3, m4
paddw m0, m2
paddw m3, m5
psraw m0, 2
psraw m3, 2
paddw m0, m2
paddw m3, m5
psraw m0, 6
psraw m3, 6
packuswb m3, m0 ; low half then high half -> 16 bytes
op_%1 m3, [r0], m7
add r1, 48 ; next tmp row
add r0, r2
dec r4d
jne .op16
.done:
REP_RET
%endmacro

INIT_XMM ssse3
QPEL8OR16_HV2_LOWPASS_OP_XMM put
QPEL8OR16_HV2_LOWPASS_OP_XMM avg
| 727 | |
| 728 | |
; void %1_pixels4_l2_shift5(uint8_t *dst, const int16_t *src16,
;                           const uint8_t *src8, int dstStride,
;                           int src8Stride, int h)
; Final combine for 4-wide qpel positions: take 16-bit intermediates
; (rows 24 bytes apart), >>5, clip to u8, average with the 8-bit
; prediction src8, then %1 (put/avg) store. Fully unrolled for 4 rows;
; the h argument is accepted but not read.
%macro PIXELS4_L2_SHIFT5 1
cglobal %1_pixels4_l2_shift5,6,6 ; dst, src16, src8, dstStride, src8Stride, h
movsxdifnidn r3, r3d
movsxdifnidn r4, r4d
mova m0, [r1] ; intermediate row 0
mova m1, [r1+24] ; intermediate row 1
psraw m0, 5
psraw m1, 5
packuswb m0, m0 ; clip to u8
packuswb m1, m1
pavgb m0, [r2] ; average with src8
pavgb m1, [r2+r4]
op_%1h m0, [r0], m4
op_%1h m1, [r0+r3], m5
lea r2, [r2+r4*2]
lea r0, [r0+r3*2]
mova m0, [r1+48] ; intermediate row 2
mova m1, [r1+72] ; intermediate row 3
psraw m0, 5
psraw m1, 5
packuswb m0, m0
packuswb m1, m1
pavgb m0, [r2]
pavgb m1, [r2+r4]
op_%1h m0, [r0], m4
op_%1h m1, [r0+r3], m5
RET
%endmacro

INIT_MMX mmxext
PIXELS4_L2_SHIFT5 put
PIXELS4_L2_SHIFT5 avg
| 761 | |
| 762 | |
; void %1_pixels8_l2_shift5(uint8_t *dst, const int16_t *src16,
;                           const uint8_t *src8, int dstStride,
;                           int src8Stride, int h)
; 8-wide version of pixels4_l2_shift5: 16-bit intermediate rows are
; 48 bytes apart (8 words low + 8 words high per row); >>5, clip,
; average with src8, store. Processes two rows per loop iteration,
; h must therefore be even.
%macro PIXELS8_L2_SHIFT5 1
cglobal %1_pixels8_l2_shift5, 6, 6 ; dst, src16, src8, dstStride, src8Stride, h
movsxdifnidn r3, r3d
movsxdifnidn r4, r4d
.loop:
mova m0, [r1] ; row n, pixels 0-3
mova m1, [r1+8] ; row n, pixels 4-7
mova m2, [r1+48] ; row n+1, pixels 0-3
mova m3, [r1+48+8] ; row n+1, pixels 4-7
psraw m0, 5
psraw m1, 5
psraw m2, 5
psraw m3, 5
packuswb m0, m1 ; clip and pack row n
packuswb m2, m3 ; clip and pack row n+1
pavgb m0, [r2] ; average with src8
pavgb m2, [r2+r4]
op_%1 m0, [r0], m4
op_%1 m2, [r0+r3], m5
lea r2, [r2+2*r4]
add r1, 48*2 ; advance two intermediate rows
lea r0, [r0+2*r3]
sub r5d, 2
jne .loop
REP_RET
%endmacro

INIT_MMX mmxext
PIXELS8_L2_SHIFT5 put
PIXELS8_L2_SHIFT5 avg
| 793 | |
| 794 | |
%if ARCH_X86_64
; void %1_h264_qpel16_h_lowpass_l2(uint8_t *dst, const uint8_t *src,
;                                  const uint8_t *src2, int dstStride,
;                                  int src2Stride)   [SSSE3, x86-64 only]
; 16-wide horizontal 6-tap lowpass averaged with src2. Needs 16 XMM
; registers, hence the x86-64 guard. Two unaligned loads cover
; src[-2..13] (m7/m0 after unpack) and src[6..21] (m1); palignr builds
; the tap vectors for the low 8 outputs (m6/m8/m9/m12 family, based on
; m7) and the high 8 outputs (m2/m3/m4/m11 family, based on m0/m1).
; r3 strides both dst and src (assumed equal); r4 is the src2 stride.
%macro QPEL16_H_LOWPASS_L2_OP 1
cglobal %1_h264_qpel16_h_lowpass_l2, 5, 6, 16 ; dst, src, src2, dstStride, src2Stride
movsxdifnidn r3, r3d
movsxdifnidn r4, r4d
mov r5d, 16 ; row counter
pxor m15, m15 ; m15 = 0, for unpacking
mova m14, [pw_5] ; filter constant 5
mova m13, [pw_16] ; rounding constant 16
.loop:
lddqu m1, [r1+6] ; s[6..21]
lddqu m7, [r1-2] ; s[-2..13]
mova m0, m1
punpckhbw m1, m15 ; m1 = words s[14..21]
punpcklbw m0, m15 ; m0 = words s[6..13]
punpcklbw m7, m15 ; m7 = words s[-2..5]
mova m2, m1
mova m6, m0
mova m3, m1
mova m8, m0
mova m4, m1
mova m9, m0
mova m12, m0
mova m11, m1
palignr m11, m0, 10 ; s[11..18] (high f-taps)
palignr m12, m7, 10 ; s[3..10]  (low f-taps)
palignr m4, m0, 2 ; s[7..14]
palignr m9, m7, 2 ; s[-1..6]
palignr m3, m0, 4 ; s[8..15]
palignr m8, m7, 4 ; s[0..7]
palignr m2, m0, 6 ; s[9..16]
palignr m6, m7, 6 ; s[1..8]
paddw m11, m0 ; a+f, high half
palignr m1, m0, 8 ; s[10..17]
palignr m0, m7, 8 ; s[2..9]
paddw m7, m12 ; a+f, low half
paddw m2, m3 ; c+d, high half
paddw m6, m8 ; c+d, low half
paddw m1, m4 ; b+e, high half
paddw m0, m9 ; b+e, low half
psllw m2, 2
psllw m6, 2
psubw m2, m1 ; 4*(c+d) - (b+e)
psubw m6, m0
paddw m11, m13 ; + 16
paddw m7, m13
pmullw m2, m14 ; *5 -> 20*(c+d) - 5*(b+e)
pmullw m6, m14
lddqu m3, [r2] ; second prediction row
paddw m2, m11
paddw m6, m7
psraw m2, 5
psraw m6, 5
packuswb m6, m2 ; low half then high half -> 16 bytes
pavgb m6, m3 ; average with src2
op_%1 m6, [r0], m11
add r1, r3
add r0, r3
add r2, r4
dec r5d
jg .loop
REP_RET
%endmacro

INIT_XMM ssse3
QPEL16_H_LOWPASS_L2_OP put
QPEL16_H_LOWPASS_L2_OP avg
%endif