| 1 | ;****************************************************************************** |
| 2 | ;* VP8 MMXEXT optimizations |
| 3 | ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com> |
| 4 | ;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com> |
| 5 | ;* |
| 6 | ;* This file is part of FFmpeg. |
| 7 | ;* |
| 8 | ;* FFmpeg is free software; you can redistribute it and/or |
| 9 | ;* modify it under the terms of the GNU Lesser General Public |
| 10 | ;* License as published by the Free Software Foundation; either |
| 11 | ;* version 2.1 of the License, or (at your option) any later version. |
| 12 | ;* |
| 13 | ;* FFmpeg is distributed in the hope that it will be useful, |
| 14 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 15 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 16 | ;* Lesser General Public License for more details. |
| 17 | ;* |
| 18 | ;* You should have received a copy of the GNU Lesser General Public |
| 19 | ;* License along with FFmpeg; if not, write to the Free Software |
| 20 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| 21 | ;****************************************************************************** |
| 22 | |
| 23 | %include "libavutil/x86/x86util.asm" |
| 24 | |
SECTION_RODATA

; 16-bit (word-lane) tap constants; presumably used by the mbedge filter
; further down in this file — TODO confirm against the full file
pw_27: times 8 dw 27
pw_63: times 8 dw 63

; 8-bit (byte-lane) constants for the filter arithmetic:
; pb_4/pb_F8 implement the (a+4)>>3 rounding+mask step, pb_FE masks the
; low bit before the /2 shift of abs(p1-q1)
pb_4: times 16 db 4
pb_F8: times 16 db 0xF8
pb_FE: times 16 db 0xFE
; interleaved {tap, 63} byte pairs; presumably weights for a
; pmaddubsw-style filter in the mbedge code — TODO confirm
pb_27_63: times 8 db 27, 63
pb_18_63: times 8 db 18, 63
pb_9_63: times 8 db 9, 63

; shared constants provided by libavutil/libavcodec
cextern pb_1
cextern pb_3
cextern pw_9
cextern pw_18
cextern pb_80

SECTION .text
| 44 | |
| 45 | ;----------------------------------------------------------------------------- |
| 46 | ; void ff_vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim); |
| 47 | ;----------------------------------------------------------------------------- |
| 48 | |
| 49 | ; macro called with 7 mm register indexes as argument, and 4 regular registers |
| 50 | ; |
| 51 | ; first 4 mm registers will carry the transposed pixel data |
| 52 | ; the other three are scratchspace (one would be sufficient, but this allows |
| 53 | ; for more spreading/pipelining and thus faster execution on OOE CPUs) |
| 54 | ; |
| 55 | ; first two regular registers are buf+4*stride and buf+5*stride |
| 56 | ; third is -stride, fourth is +stride |
; Load 8 rows (A-H) of 4 pixels each and byte-interleave them pairwise:
; on exit m%1=A/B, m%2=C/D, m%3=E/F, m%4=G/H (first stage of a 4x8
; transpose; TRANSPOSE4x4W finishes the job at the call site).
; %1-%4 = destination mm regs, %5-%7 = scratch mm regs
; %8 = buf+4*stride, %9 = buf+5*stride, %10 = -stride, %11 = +stride
%macro READ_8x4_INTERLEAVED 11
; interleave 8 (A-H) rows of 4 pixels each
movd m%1, [%8+%10*4] ; A0-3
movd m%5, [%9+%10*4] ; B0-3
movd m%2, [%8+%10*2] ; C0-3
movd m%6, [%8+%10] ; D0-3
movd m%3, [%8] ; E0-3
movd m%7, [%9] ; F0-3
movd m%4, [%9+%11] ; G0-3
punpcklbw m%1, m%5 ; A/B interleaved
movd m%5, [%9+%11*2] ; H0-3 (reuses %5 after A/B consumed it)
punpcklbw m%2, m%6 ; C/D interleaved
punpcklbw m%3, m%7 ; E/F interleaved
punpcklbw m%4, m%5 ; G/H interleaved
%endmacro
| 72 | |
; macro called with 7 mm register indexes as argument, and 5 regular registers
; first 11 mean the same as READ_8x4_INTERLEAVED above
; fifth regular register is scratchspace to reach the bottom 8 rows, it
; will be set to second regular register + 8*stride at the end
; Load 16 rows (A-P) of 4 pixels each and interleave them 4-way:
; on exit m%1=A/B/I/J, m%2=C/D/K/L, m%3=E/F/M/N, m%4=G/H/O/P.
; %1-%4 = destination xmm regs, %5-%7 = scratch xmm regs
; %8 = buf+4*stride, %9 = buf+5*stride, %10 = -stride, %11 = +stride
; %12 = scratch GPR; set to %9 + 8*stride on exit
; NOTE(review): the lea below hard-codes r0/r2, so this macro assumes
; %8 == r0 and stride == r2 (see the "%8 (=r0)" comment) — callers must
; pass exactly those registers
%macro READ_16x4_INTERLEAVED 12
; transpose 16 (A-P) rows of 4 pixels each
lea %12, [r0+8*r2]

; read (and interleave) those addressable by %8 (=r0), A/C/D/E/I/K/L/M
movd m%1, [%8+%10*4] ; A0-3
movd m%3, [%12+%10*4] ; I0-3
movd m%2, [%8+%10*2] ; C0-3
movd m%4, [%12+%10*2] ; K0-3
movd m%6, [%8+%10] ; D0-3
movd m%5, [%12+%10] ; L0-3
movd m%7, [%12] ; M0-3
add %12, %11 ; %12 now = buf+9*stride (points at rows F..P side)
punpcklbw m%1, m%3 ; A/I
movd m%3, [%8] ; E0-3
punpcklbw m%2, m%4 ; C/K
punpcklbw m%6, m%5 ; D/L
punpcklbw m%3, m%7 ; E/M
punpcklbw m%2, m%6 ; C/D/K/L interleaved

; read (and interleave) those addressable by %9 (=r4), B/F/G/H/J/N/O/P
movd m%5, [%9+%10*4] ; B0-3
movd m%4, [%12+%10*4] ; J0-3
movd m%7, [%9] ; F0-3
movd m%6, [%12] ; N0-3
punpcklbw m%5, m%4 ; B/J
punpcklbw m%7, m%6 ; F/N
punpcklbw m%1, m%5 ; A/B/I/J interleaved
punpcklbw m%3, m%7 ; E/F/M/N interleaved
movd m%4, [%9+%11] ; G0-3
movd m%6, [%12+%11] ; O0-3
movd m%5, [%9+%11*2] ; H0-3
movd m%7, [%12+%11*2] ; P0-3
punpcklbw m%4, m%6 ; G/O
punpcklbw m%5, m%7 ; H/P
punpcklbw m%4, m%5 ; G/H/O/P interleaved
%endmacro
| 114 | |
| 115 | ; write 4 mm registers of 2 dwords each |
| 116 | ; first four arguments are mm register indexes containing source data |
| 117 | ; last four are registers containing buf+4*stride, buf+5*stride, |
| 118 | ; -stride and +stride |
; Scatter 4 mm registers of 2 dwords each back into 8 rows of 4 pixels
; (inverse of READ_8x4_INTERLEAVED after the transpose): low dword of each
; register goes to one row, high dword (after punpckhdq) to another.
; %1-%4 = source mm register indexes
; %5 = buf+4*stride, %6 = buf+5*stride, %7 = -stride, %8 = +stride
%macro WRITE_4x2D 8
; write out (2 dwords per register)
movd [%5+%7*4], m%1
movd [%5+%7*2], m%2
movd [%5], m%3
movd [%6+%8], m%4
punpckhdq m%1, m%1 ; broadcast high dword to low half for second store
punpckhdq m%2, m%2
punpckhdq m%3, m%3
punpckhdq m%4, m%4
movd [%6+%7*4], m%1
movd [%5+%7], m%2
movd [%6], m%3
movd [%6+%8*2], m%4
%endmacro
| 134 | |
; write 4 xmm registers of 4 dwords each
; arguments same as WRITE_4x2D, but with an extra register, so that the 5 regular
; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride
; we add 1*stride to the third regular register in the process
; the 10th argument is 16 if it's a Y filter (i.e. all regular registers cover the
; same memory region), or 8 if they cover two separate buffers (third one points to
; a different memory region than the first two), allowing for more optimal code for
; the 16-width case
; Scatter 4 xmm registers of 4 dwords each back into 16 rows of 4 pixels.
; Successive dwords are exposed by psrldq and stored one row at a time.
; %1-%4 = source xmm register indexes
; %5 = buf+4*stride, %6 = buf+5*stride, %7 = buf+12*stride,
; %8 = -stride, %9 = +stride; %7 is advanced by 1*stride in the process.
; %10 = 16 for the Y (single-buffer) case, 8 for the chroma (two-buffer)
; case; in the 8 case one store must go through a GPR (%5d) because the
; target row is not addressable from the available pointers.
%macro WRITE_4x4D 10
; write out (4 dwords per register), start with dwords zero
movd [%5+%8*4], m%1
movd [%5], m%2
movd [%7+%8*4], m%3
movd [%7], m%4

; store dwords 1
psrldq m%1, 4
psrldq m%2, 4
psrldq m%3, 4
psrldq m%4, 4
movd [%6+%8*4], m%1
movd [%6], m%2
%if %10 == 16
movd [%6+%9*4], m%3
%endif
movd [%7+%9], m%4

; write dwords 2
psrldq m%1, 4
psrldq m%2, 4
%if %10 == 8
movd [%5+%8*2], m%1
movd %5d, m%3 ; stash dword in GPR; %5 pointer no longer needed
%endif
psrldq m%3, 4
psrldq m%4, 4
%if %10 == 16
movd [%5+%8*2], m%1
%endif
movd [%6+%9], m%2
movd [%7+%8*2], m%3
movd [%7+%9*2], m%4
add %7, %9 ; advance third pointer by one stride for the last group

; store dwords 3
psrldq m%1, 4
psrldq m%2, 4
psrldq m%3, 4
psrldq m%4, 4
%if %10 == 8
mov [%7+%8*4], %5d ; flush the stashed dword from above
movd [%6+%8*2], m%1
%else
movd [%5+%8], m%1
%endif
movd [%6+%9*2], m%2
movd [%7+%8*2], m%3
movd [%7+%9*2], m%4
%endmacro
| 194 | |
| 195 | ; write 4 or 8 words in the mmx/xmm registers as 8 lines |
| 196 | ; 1 and 2 are the registers to write, this can be the same (for SSE2) |
| 197 | ; for pre-SSE4: |
| 198 | ; 3 is a general-purpose register that we will clobber |
| 199 | ; for SSE4: |
| 200 | ; 3 is a pointer to the destination's 5th line |
| 201 | ; 4 is a pointer to the destination's 4th line |
| 202 | ; 5/6 is -stride and +stride |
; Store two mm registers of 4 words each as 8 rows of 2 pixels (pre-SSE4
; path): each word is moved to a GPR and written 16 bits at a time.
; %1/%2 = source mm registers, %3 = clobbered scratch GPR,
; %4 = pointer (advanced as we go), %5 = -stride, %6 = +stride
; On exit %4 has advanced by a net +stride.
%macro WRITE_2x4W 6
movd %3d, %1
punpckhdq %1, %1 ; expose words 2-3 of %1 for the next movd
mov [%4+%5*4], %3w
shr %3, 16
add %4, %6
mov [%4+%5*4], %3w

movd %3d, %1
add %4, %5
mov [%4+%5*2], %3w
shr %3, 16
mov [%4+%5 ], %3w

movd %3d, %2
punpckhdq %2, %2 ; expose words 2-3 of %2
mov [%4 ], %3w
shr %3, 16
mov [%4+%6 ], %3w

movd %3d, %2
add %4, %6
mov [%4+%6 ], %3w
shr %3, 16
mov [%4+%6*2], %3w
add %4, %5
%endmacro
| 230 | |
; Store one xmm register of 8 words as 8 rows of 2 pixels.
; SSE4 path: direct pextrw stores, no GPR round-trip, pointers untouched.
; Pre-SSE4 path: words are shifted down with psrldq and written via a GPR;
; %2 doubles as the scratch GPR there and %3 is advanced as we go.
; %1 = source xmm, %2/%3 = pointers (SSE4) or scratch GPR/pointer,
; %4 = -stride, %5 = +stride
%macro WRITE_8W 5
%if cpuflag(sse4)
pextrw [%3+%4*4], %1, 0
pextrw [%2+%4*4], %1, 1
pextrw [%3+%4*2], %1, 2
pextrw [%3+%4 ], %1, 3
pextrw [%3 ], %1, 4
pextrw [%2 ], %1, 5
pextrw [%2+%5 ], %1, 6
pextrw [%2+%5*2], %1, 7
%else
movd %2d, %1
psrldq %1, 4 ; shift next dword (2 words) into position
mov [%3+%4*4], %2w
shr %2, 16
add %3, %5
mov [%3+%4*4], %2w

movd %2d, %1
psrldq %1, 4
add %3, %4
mov [%3+%4*2], %2w
shr %2, 16
mov [%3+%4 ], %2w

movd %2d, %1
psrldq %1, 4
mov [%3 ], %2w
shr %2, 16
mov [%3+%5 ], %2w

movd %2d, %1
add %3, %5
mov [%3+%5 ], %2w
shr %2, 16
mov [%3+%5*2], %2w
%endif
%endmacro
| 269 | |
; Emits ff_vp8_%1_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim).
; %1 = direction (v = horizontal edge, h = vertical edge / transposed),
; %2 = GPR count for cglobal (h variants need extra pointer registers).
; Implements the VP8 "simple" filter: mask = 2*|p0-q0| + |p1-q1|/2 <= flim,
; then the 4-tap common filter adjusting p0/q0 only.
; mmx/mmxext process 8 pixels per iteration (2 iterations), sse2+ do all 16.
%macro SIMPLE_LOOPFILTER 2
cglobal vp8_%1_loop_filter_simple, 3, %2, 8, dst, stride, flim, cntr
%if mmsize == 8 ; mmx/mmxext
mov cntrq, 2
%endif
%if cpuflag(ssse3)
pxor m0, m0 ; zero reg needed by ssse3 SPLATB_REG (pshufb)
%endif
SPLATB_REG m7, flim, m0 ; splat "flim" into register

; set up indexes to address 4 rows
%if mmsize == 8
DEFINE_ARGS dst1, mstride, stride, cntr, dst2
%else
DEFINE_ARGS dst1, mstride, stride, dst3, dst2
%endif
mov strideq, mstrideq
neg mstrideq ; mstrideq = -stride, strideq = +stride
%ifidn %1, h
lea dst1q, [dst1q+4*strideq-2] ; center on the edge: 2 px left of column
%endif

%if mmsize == 8 ; mmx / mmxext
.next8px:
%endif
%ifidn %1, v
; read 4 half/full rows of pixels
mova m0, [dst1q+mstrideq*2] ; p1
mova m1, [dst1q+mstrideq] ; p0
mova m2, [dst1q] ; q0
mova m3, [dst1q+ strideq] ; q1
%else ; h
lea dst2q, [dst1q+ strideq]

%if mmsize == 8 ; mmx/mmxext
READ_8x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq
%else ; sse2
READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq, dst3q
%endif
TRANSPOSE4x4W 0, 1, 2, 3, 4 ; finish transpose: m0-m3 = p1/p0/q0/q1
%endif

; simple_limit
mova m5, m2 ; m5=backup of q0
mova m6, m1 ; m6=backup of p0
psubusb m1, m2 ; p0-q0
psubusb m2, m6 ; q0-p0
por m1, m2 ; FFABS(p0-q0) (unsigned-saturated subs or'd together)
paddusb m1, m1 ; m1=FFABS(p0-q0)*2

mova m4, m3
mova m2, m0
psubusb m3, m0 ; q1-p1
psubusb m0, m4 ; p1-q1
por m3, m0 ; FFABS(p1-q1)
mova m0, [pb_80]
pxor m2, m0 ; bias p1/q1 to signed range for the subtraction below
pxor m4, m0
psubsb m2, m4 ; m2=p1-q1 (signed) backup for below
pand m3, [pb_FE] ; clear low bit so the byte-wise /2 via psrlq is exact
psrlq m3, 1 ; m3=FFABS(p1-q1)/2, this can be used signed
paddusb m3, m1
psubusb m3, m7 ; saturates to 0 iff 2*|p0-q0| + |p1-q1|/2 <= flim
pxor m1, m1
pcmpeqb m3, m1 ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0)

; filter_common (use m2/p1-q1, m4=q0, m6=p0, m5/q0-p0 and m3/mask)
mova m4, m5
pxor m5, m0
pxor m0, m6
psubsb m5, m0 ; q0-p0 (signed)
paddsb m2, m5
paddsb m2, m5
paddsb m2, m5 ; a=(p1-q1) + 3*(q0-p0)
pand m2, m3 ; apply filter mask (m3)

mova m3, [pb_F8]
mova m1, m2
paddsb m2, [pb_4] ; f1<<3=a+4
paddsb m1, [pb_3] ; f2<<3=a+3
pand m2, m3 ; keep top 5 bits: value is still f<<3
pand m1, m3 ; cache f2<<3

; arithmetic >>3 on signed bytes, done as separate positive/negative
; lanes because there is no per-byte arithmetic shift
pxor m0, m0
pxor m3, m3
pcmpgtb m0, m2 ; which values are <0?
psubb m3, m2 ; -f1<<3
psrlq m2, 3 ; +f1
psrlq m3, 3 ; -f1
pand m3, m0 ; negative lanes
pandn m0, m2 ; positive lanes
psubusb m4, m0
paddusb m4, m3 ; q0-f1

pxor m0, m0
pxor m3, m3
pcmpgtb m0, m1 ; which values are <0?
psubb m3, m1 ; -f2<<3
psrlq m1, 3 ; +f2
psrlq m3, 3 ; -f2
pand m3, m0
pandn m0, m1
paddusb m6, m0
psubusb m6, m3 ; p0+f2

; store
%ifidn %1, v
mova [dst1q], m4
mova [dst1q+mstrideq], m6
%else ; h
inc dst1q
SBUTTERFLY bw, 6, 4, 0 ; re-interleave p0/q0 pairs for columnar store

%if mmsize == 16 ; sse2
%if cpuflag(sse4)
inc dst2q
%endif
WRITE_8W m6, dst2q, dst1q, mstrideq, strideq
lea dst2q, [dst3q+mstrideq+1]
%if cpuflag(sse4)
inc dst3q
%endif
WRITE_8W m4, dst3q, dst2q, mstrideq, strideq
%else ; mmx/mmxext
WRITE_2x4W m6, m4, dst2q, dst1q, mstrideq, strideq
%endif
%endif

%if mmsize == 8 ; mmx/mmxext
; next 8 pixels
%ifidn %1, v
add dst1q, 8 ; advance 8 cols = pixels
%else ; h
lea dst1q, [dst1q+strideq*8-1] ; advance 8 rows = lines
%endif
dec cntrq
jg .next8px
REP_RET
%else ; sse2
RET
%endif
%endmacro
| 412 | |
; Instantiate the simple loop filter per SIMD level. Second argument is the
; GPR count passed to cglobal (h needs more pointer registers than v).
; mmx/mmxext versions only exist on x86-32; x86-64 always has sse2.
%if ARCH_X86_32
INIT_MMX mmx
SIMPLE_LOOPFILTER v, 4
SIMPLE_LOOPFILTER h, 5
INIT_MMX mmxext
SIMPLE_LOOPFILTER v, 4
SIMPLE_LOOPFILTER h, 5
%endif

INIT_XMM sse2
SIMPLE_LOOPFILTER v, 3
SIMPLE_LOOPFILTER h, 5
INIT_XMM ssse3
SIMPLE_LOOPFILTER v, 3
SIMPLE_LOOPFILTER h, 5
INIT_XMM sse4
SIMPLE_LOOPFILTER h, 5 ; sse4 only helps the h case (pextrw stores)
| 430 | |
| 431 | ;----------------------------------------------------------------------------- |
| 432 | ; void ff_vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride, |
| 433 | ; int flimE, int flimI, int hev_thr); |
| 434 | ;----------------------------------------------------------------------------- |
| 435 | |
; Emits ff_vp8_%1_loop_filter<size>_inner_<opt>.
; %1 = direction (v/h), %2 = width (16 = luma, 8 = chroma u/v pair).
; Implements the VP8 "normal" inner filter: normal_limit (I/E thresholds),
; high-edge-variance test (hev_thr), then the common 4-tap filter on
; p1/p0/q0/q1. On x86-32 (or pre-sse2) intermediates that don't fit in
; 8 registers are spilled to the stack slots defined below; on x86-64
; sse2 they live in m8-m12.
%macro INNER_LOOPFILTER 2
%define stack_size 0
%ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr
%ifidn %1, v ; [3]=hev() result
%define stack_size mmsize * -4
%else ; h ; extra storage space for transposes
%define stack_size mmsize * -5
%endif
%endif

%if %2 == 8 ; chroma
cglobal vp8_%1_loop_filter8uv_inner, 6, 6, 13, stack_size, dst, dst8, stride, flimE, flimI, hevthr
%else ; luma
cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, stack_size, dst, stride, flimE, flimI, hevthr
%endif

%if cpuflag(ssse3)
pxor m7, m7 ; zero reg for ssse3 SPLATB_REG (pshufb)
%endif

%ifndef m8
; splat function arguments
SPLATB_REG m0, flimEq, m7 ; E
SPLATB_REG m1, flimIq, m7 ; I
SPLATB_REG m2, hevthrq, m7 ; hev_thresh

; note: m_maskres/m_p0backup and m_q0backup share slots; their live
; ranges do not overlap
%define m_flimE [rsp]
%define m_flimI [rsp+mmsize]
%define m_hevthr [rsp+mmsize*2]
%define m_maskres [rsp+mmsize*3]
%define m_p0backup [rsp+mmsize*3]
%define m_q0backup [rsp+mmsize*4]

mova m_flimE, m0
mova m_flimI, m1
mova m_hevthr, m2
%else
%define m_flimE m9
%define m_flimI m10
%define m_hevthr m11
%define m_maskres m12
%define m_p0backup m12
%define m_q0backup m8

; splat function arguments
SPLATB_REG m_flimE, flimEq, m7 ; E
SPLATB_REG m_flimI, flimIq, m7 ; I
SPLATB_REG m_hevthr, hevthrq, m7 ; hev_thresh
%endif

%if %2 == 8 ; chroma
DEFINE_ARGS dst1, dst8, mstride, stride, dst2
%elif mmsize == 8
DEFINE_ARGS dst1, mstride, stride, dst2, cntr
mov cntrq, 2
%else
DEFINE_ARGS dst1, mstride, stride, dst2, dst8
%endif
mov strideq, mstrideq
neg mstrideq ; mstrideq = -stride, strideq = +stride
%ifidn %1, h
lea dst1q, [dst1q+strideq*4-4] ; center: 4 px left of the column edge
%if %2 == 8 ; chroma
lea dst8q, [dst8q+strideq*4-4]
%endif
%endif

%if mmsize == 8
.next8px:
%endif
; read
lea dst2q, [dst1q+strideq]
%ifidn %1, v
%if %2 == 8 && mmsize == 16
%define movrow movh ; chroma/sse2: u in low half, v in high half
%else
%define movrow mova
%endif
movrow m0, [dst1q+mstrideq*4] ; p3
movrow m1, [dst2q+mstrideq*4] ; p2
movrow m2, [dst1q+mstrideq*2] ; p1
movrow m5, [dst2q] ; q1
movrow m6, [dst2q+ strideq*1] ; q2
movrow m7, [dst2q+ strideq*2] ; q3
%if mmsize == 16 && %2 == 8
movhps m0, [dst8q+mstrideq*4] ; second chroma plane into high halves
movhps m2, [dst8q+mstrideq*2]
add dst8q, strideq
movhps m1, [dst8q+mstrideq*4]
movhps m5, [dst8q]
movhps m6, [dst8q+ strideq ]
movhps m7, [dst8q+ strideq*2]
add dst8q, mstrideq ; undo the temporary advance
%endif
%elif mmsize == 8 ; mmx/mmxext (h)
; read 8 rows of 8px each
movu m0, [dst1q+mstrideq*4]
movu m1, [dst2q+mstrideq*4]
movu m2, [dst1q+mstrideq*2]
movu m3, [dst1q+mstrideq ]
movu m4, [dst1q]
movu m5, [dst2q]
movu m6, [dst2q+ strideq ]

; 8x8 transpose
TRANSPOSE4x4B 0, 1, 2, 3, 7
mova m_q0backup, m1 ; spill: not enough mm regs to hold all 8 rows
movu m7, [dst2q+ strideq*2]
TRANSPOSE4x4B 4, 5, 6, 7, 1
SBUTTERFLY dq, 0, 4, 1 ; p3/p2
SBUTTERFLY dq, 2, 6, 1 ; q0/q1
SBUTTERFLY dq, 3, 7, 1 ; q2/q3
mova m1, m_q0backup
mova m_q0backup, m2 ; store q0
SBUTTERFLY dq, 1, 5, 2 ; p1/p0
mova m_p0backup, m5 ; store p0
SWAP 1, 4
SWAP 2, 4
SWAP 6, 3
SWAP 5, 3
%else ; sse2 (h)
%if %2 == 16
lea dst8q, [dst1q+ strideq*8]
%endif

; read 16 rows of 8px each, interleave
movh m0, [dst1q+mstrideq*4]
movh m1, [dst8q+mstrideq*4]
movh m2, [dst1q+mstrideq*2]
movh m5, [dst8q+mstrideq*2]
movh m3, [dst1q+mstrideq ]
movh m6, [dst8q+mstrideq ]
movh m4, [dst1q]
movh m7, [dst8q]
punpcklbw m0, m1 ; A/I
punpcklbw m2, m5 ; C/K
punpcklbw m3, m6 ; D/L
punpcklbw m4, m7 ; E/M

add dst8q, strideq
movh m1, [dst2q+mstrideq*4]
movh m6, [dst8q+mstrideq*4]
movh m5, [dst2q]
movh m7, [dst8q]
punpcklbw m1, m6 ; B/J
punpcklbw m5, m7 ; F/N
movh m6, [dst2q+ strideq ]
movh m7, [dst8q+ strideq ]
punpcklbw m6, m7 ; G/O

; 8x16 transpose
TRANSPOSE4x4B 0, 1, 2, 3, 7
%ifdef m8
SWAP 1, 8 ; x86-64: keep spill in a register instead of memory
%else
mova m_q0backup, m1
%endif
movh m7, [dst2q+ strideq*2]
movh m1, [dst8q+ strideq*2]
punpcklbw m7, m1 ; H/P
TRANSPOSE4x4B 4, 5, 6, 7, 1
SBUTTERFLY dq, 0, 4, 1 ; p3/p2
SBUTTERFLY dq, 2, 6, 1 ; q0/q1
SBUTTERFLY dq, 3, 7, 1 ; q2/q3
%ifdef m8
SWAP 1, 8
SWAP 2, 8
%else
mova m1, m_q0backup
mova m_q0backup, m2 ; store q0
%endif
SBUTTERFLY dq, 1, 5, 2 ; p1/p0
%ifdef m12
SWAP 5, 12
%else
mova m_p0backup, m5 ; store p0
%endif
SWAP 1, 4
SWAP 2, 4
SWAP 6, 3
SWAP 5, 3
%endif

; from here on: m0=p3, m1=p2, m2=p1, m5=q1, m6=q2, m7=q3
; (p0/q0 are loaded/restored later)

; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
mova m4, m1
SWAP 4, 1
psubusb m4, m0 ; p2-p3
psubusb m0, m1 ; p3-p2
por m0, m4 ; abs(p3-p2)

mova m4, m2
SWAP 4, 2
psubusb m4, m1 ; p1-p2
psubusb m1, m2 ; p2-p1
por m1, m4 ; abs(p2-p1)

mova m4, m6
SWAP 4, 6
psubusb m4, m7 ; q2-q3
psubusb m7, m6 ; q3-q2
por m7, m4 ; abs(q3-q2)

mova m4, m5
SWAP 4, 5
psubusb m4, m6 ; q1-q2
psubusb m6, m5 ; q2-q1
por m6, m4 ; abs(q2-q1)

%if notcpuflag(mmxext)
; no pmaxub on plain mmx: compare each abs against I individually
mova m4, m_flimI
pxor m3, m3
psubusb m0, m4
psubusb m1, m4
psubusb m7, m4
psubusb m6, m4
pcmpeqb m0, m3 ; abs(p3-p2) <= I
pcmpeqb m1, m3 ; abs(p2-p1) <= I
pcmpeqb m7, m3 ; abs(q3-q2) <= I
pcmpeqb m6, m3 ; abs(q2-q1) <= I
pand m0, m1
pand m7, m6
pand m0, m7
%else ; mmxext/sse2
; fold all abs values into one max; compare against I once, later
pmaxub m0, m1
pmaxub m6, m7
pmaxub m0, m6
%endif

; normal_limit and high_edge_variance for p1-p0, q1-q0
SWAP 7, 3 ; now m7 is zero
%ifidn %1, v
movrow m3, [dst1q+mstrideq ] ; p0
%if mmsize == 16 && %2 == 8
movhps m3, [dst8q+mstrideq ]
%endif
%elifdef m12
SWAP 3, 12
%else
mova m3, m_p0backup
%endif

mova m1, m2
SWAP 1, 2
mova m6, m3
SWAP 3, 6
psubusb m1, m3 ; p1-p0
psubusb m6, m2 ; p0-p1
por m1, m6 ; abs(p1-p0)
%if notcpuflag(mmxext)
mova m6, m1
psubusb m1, m4
psubusb m6, m_hevthr
pcmpeqb m1, m7 ; abs(p1-p0) <= I
pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh
pand m0, m1
mova m_maskres, m6
%else ; mmxext/sse2
pmaxub m0, m1 ; max_I
SWAP 1, 4 ; max_hev_thresh
%endif

SWAP 6, 4 ; now m6 is I
%ifidn %1, v
movrow m4, [dst1q] ; q0
%if mmsize == 16 && %2 == 8
movhps m4, [dst8q]
%endif
%elifdef m8
SWAP 4, 8
%else
mova m4, m_q0backup
%endif
mova m1, m4
SWAP 1, 4
mova m7, m5
SWAP 7, 5
psubusb m1, m5 ; q0-q1
psubusb m7, m4 ; q1-q0
por m1, m7 ; abs(q1-q0)
%if notcpuflag(mmxext)
mova m7, m1
psubusb m1, m6
psubusb m7, m_hevthr
pxor m6, m6
pcmpeqb m1, m6 ; abs(q1-q0) <= I
pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh
mova m6, m_maskres
pand m0, m1 ; abs([pq][321]-[pq][210]) <= I
pand m6, m7
%else ; mmxext/sse2
pxor m7, m7
pmaxub m0, m1
pmaxub m6, m1
psubusb m0, m_flimI
psubusb m6, m_hevthr
pcmpeqb m0, m7 ; max(abs(..)) <= I
pcmpeqb m6, m7 ; !(max(abs..) > thresh)
%endif
%ifdef m12
SWAP 6, 12
%else
mova m_maskres, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
%endif

; simple_limit
mova m1, m3
SWAP 1, 3
mova m6, m4 ; keep copies of p0/q0 around for later use
SWAP 6, 4
psubusb m1, m4 ; p0-q0
psubusb m6, m3 ; q0-p0
por m1, m6 ; abs(q0-p0)
paddusb m1, m1 ; m1=2*abs(q0-p0)

mova m7, m2
SWAP 7, 2
mova m6, m5
SWAP 6, 5
psubusb m7, m5 ; p1-q1
psubusb m6, m2 ; q1-p1
por m7, m6 ; abs(q1-p1)
pxor m6, m6
pand m7, [pb_FE] ; clear low bit so byte-wise /2 via psrlq is exact
psrlq m7, 1 ; abs(q1-p1)/2
paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2
psubusb m7, m_flimE
pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
pand m0, m7 ; normal_limit result

; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
%ifdef m8 ; x86-64 && sse2
mova m8, [pb_80]
%define m_pb_80 m8
%else ; x86-32 or mmx/mmxext
%define m_pb_80 [pb_80]
%endif
mova m1, m4
mova m7, m3
pxor m1, m_pb_80 ; bias to signed range
pxor m7, m_pb_80
psubsb m1, m7 ; (signed) q0-p0
mova m6, m2
mova m7, m5
pxor m6, m_pb_80
pxor m7, m_pb_80
psubsb m6, m7 ; (signed) p1-q1
mova m7, m_maskres
pandn m7, m6 ; include (p1-q1) only where hev() is true
paddsb m7, m1
paddsb m7, m1
paddsb m7, m1 ; 3*(q0-p0)+is4tap?(p1-q1)

pand m7, m0 ; apply filter mask
mova m1, [pb_F8]
mova m6, m7
paddsb m7, [pb_3] ; f2<<3 = a+3
paddsb m6, [pb_4] ; f1<<3 = a+4
pand m7, m1
pand m6, m1

; arithmetic >>3 on signed bytes via separate +/- lanes
; (no per-byte arithmetic shift instruction)
pxor m1, m1
pxor m0, m0
pcmpgtb m1, m7
psubb m0, m7
psrlq m7, 3 ; +f2
psrlq m0, 3 ; -f2
pand m0, m1
pandn m1, m7
psubusb m3, m0
paddusb m3, m1 ; p0+f2

pxor m1, m1
pxor m0, m0
pcmpgtb m0, m6
psubb m1, m6
psrlq m6, 3 ; +f1
psrlq m1, 3 ; -f1
pand m1, m0
pandn m0, m6
psubusb m4, m0
paddusb m4, m1 ; q0-f1

; p1/q1 adjustment: a = (f1+1)>>1, applied only where !hev()
%ifdef m12
SWAP 6, 12
%else
mova m6, m_maskres
%endif
%if notcpuflag(mmxext)
mova m7, [pb_1]
%else ; mmxext/sse2
pxor m7, m7
%endif
pand m0, m6
pand m1, m6
%if notcpuflag(mmxext)
paddusb m0, m7
pand m1, [pb_FE]
pandn m7, m0
psrlq m1, 1
psrlq m7, 1
SWAP 0, 7
%else ; mmxext/sse2
psubusb m1, [pb_1]
pavgb m0, m7 ; a
pavgb m1, m7 ; -a
%endif
psubusb m5, m0
psubusb m2, m1
paddusb m5, m1 ; q1-a
paddusb m2, m0 ; p1+a

; store
%ifidn %1, v
movrow [dst1q+mstrideq*2], m2
movrow [dst1q+mstrideq ], m3
movrow [dst1q], m4
movrow [dst1q+ strideq ], m5
%if mmsize == 16 && %2 == 8
movhps [dst8q+mstrideq*2], m2
movhps [dst8q+mstrideq ], m3
movhps [dst8q], m4
movhps [dst8q+ strideq ], m5
%endif
%else ; h
add dst1q, 2
add dst2q, 2

; 4x8/16 transpose
TRANSPOSE4x4B 2, 3, 4, 5, 6

%if mmsize == 8 ; mmx/mmxext (h)
WRITE_4x2D 2, 3, 4, 5, dst1q, dst2q, mstrideq, strideq
%else ; sse2 (h)
lea dst8q, [dst8q+mstrideq +2]
WRITE_4x4D 2, 3, 4, 5, dst1q, dst2q, dst8q, mstrideq, strideq, %2
%endif
%endif

%if mmsize == 8
%if %2 == 8 ; chroma
%ifidn %1, h
sub dst1q, 2 ; undo the +2 from the store path
%endif
cmp dst1q, dst8q ; second pass runs on the v plane (dst8)
mov dst1q, dst8q
jnz .next8px
%else
%ifidn %1, h
lea dst1q, [dst1q+ strideq*8-2] ; advance 8 rows, undo +2
%else ; v
add dst1q, 8 ; advance 8 columns
%endif
dec cntrq
jg .next8px
%endif
REP_RET
%else ; mmsize == 16
RET
%endif
%endmacro
| 896 | |
; Instantiate the inner loop filter for v/h x luma(16)/chroma(8) at each
; SIMD level. mmx/mmxext versions only exist on x86-32.
%if ARCH_X86_32
INIT_MMX mmx
INNER_LOOPFILTER v, 16
INNER_LOOPFILTER h, 16
INNER_LOOPFILTER v, 8
INNER_LOOPFILTER h, 8

INIT_MMX mmxext
INNER_LOOPFILTER v, 16
INNER_LOOPFILTER h, 16
INNER_LOOPFILTER v, 8
INNER_LOOPFILTER h, 8
%endif

INIT_XMM sse2
INNER_LOOPFILTER v, 16
INNER_LOOPFILTER h, 16
INNER_LOOPFILTER v, 8
INNER_LOOPFILTER h, 8

INIT_XMM ssse3
INNER_LOOPFILTER v, 16
INNER_LOOPFILTER h, 16
INNER_LOOPFILTER v, 8
INNER_LOOPFILTER h, 8
| 922 | |
| 923 | ;----------------------------------------------------------------------------- |
| 924 | ; void ff_vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride, |
| 925 | ; int flimE, int flimI, int hev_thr); |
| 926 | ;----------------------------------------------------------------------------- |
| 927 | |
| 928 | %macro MBEDGE_LOOPFILTER 2 |
| 929 | %define stack_size 0 |
| 930 | %ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr |
| 931 | %if mmsize == 16 ; [3]=hev() result |
| 932 | ; [4]=filter tmp result |
| 933 | ; [5]/[6] = p2/q2 backup |
| 934 | ; [7]=lim_res sign result |
| 935 | %define stack_size mmsize * -7 |
| 936 | %else ; 8 ; extra storage space for transposes |
| 937 | %define stack_size mmsize * -8 |
| 938 | %endif |
| 939 | %endif |
| 940 | |
| 941 | %if %2 == 8 ; chroma |
| 942 | cglobal vp8_%1_loop_filter8uv_mbedge, 6, 6, 15, stack_size, dst1, dst8, stride, flimE, flimI, hevthr |
| 943 | %else ; luma |
| 944 | cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, stack_size, dst1, stride, flimE, flimI, hevthr |
| 945 | %endif |
| 946 | |
| 947 | %if cpuflag(ssse3) |
| 948 | pxor m7, m7 |
| 949 | %endif |
| 950 | |
| 951 | %ifndef m8 |
| 952 | ; splat function arguments |
| 953 | SPLATB_REG m0, flimEq, m7 ; E |
| 954 | SPLATB_REG m1, flimIq, m7 ; I |
| 955 | SPLATB_REG m2, hevthrq, m7 ; hev_thresh |
| 956 | |
| 957 | %define m_flimE [rsp] |
| 958 | %define m_flimI [rsp+mmsize] |
| 959 | %define m_hevthr [rsp+mmsize*2] |
| 960 | %define m_maskres [rsp+mmsize*3] |
| 961 | %define m_limres [rsp+mmsize*4] |
| 962 | %define m_p0backup [rsp+mmsize*3] |
| 963 | %define m_q0backup [rsp+mmsize*4] |
| 964 | %define m_p2backup [rsp+mmsize*5] |
| 965 | %define m_q2backup [rsp+mmsize*6] |
| 966 | %if mmsize == 16 |
| 967 | %define m_limsign [rsp] |
| 968 | %else |
| 969 | %define m_limsign [rsp+mmsize*7] |
| 970 | %endif |
| 971 | |
| 972 | mova m_flimE, m0 |
| 973 | mova m_flimI, m1 |
| 974 | mova m_hevthr, m2 |
| 975 | %else ; sse2 on x86-64 |
| 976 | %define m_flimE m9 |
| 977 | %define m_flimI m10 |
| 978 | %define m_hevthr m11 |
| 979 | %define m_maskres m12 |
| 980 | %define m_limres m8 |
| 981 | %define m_p0backup m12 |
| 982 | %define m_q0backup m8 |
| 983 | %define m_p2backup m13 |
| 984 | %define m_q2backup m14 |
| 985 | %define m_limsign m9 |
| 986 | |
| 987 | ; splat function arguments |
| 988 | SPLATB_REG m_flimE, flimEq, m7 ; E |
| 989 | SPLATB_REG m_flimI, flimIq, m7 ; I |
| 990 | SPLATB_REG m_hevthr, hevthrq, m7 ; hev_thresh |
| 991 | %endif |
| 992 | |
| 993 | %if %2 == 8 ; chroma |
| 994 | DEFINE_ARGS dst1, dst8, mstride, stride, dst2 |
| 995 | %elif mmsize == 8 |
| 996 | DEFINE_ARGS dst1, mstride, stride, dst2, cntr |
| 997 | mov cntrq, 2 |
| 998 | %else |
| 999 | DEFINE_ARGS dst1, mstride, stride, dst2, dst8 |
| 1000 | %endif |
| 1001 | mov strideq, mstrideq |
| 1002 | neg mstrideq |
| 1003 | %ifidn %1, h |
| 1004 | lea dst1q, [dst1q+strideq*4-4] |
| 1005 | %if %2 == 8 ; chroma |
| 1006 | lea dst8q, [dst8q+strideq*4-4] |
| 1007 | %endif |
| 1008 | %endif |
| 1009 | |
| 1010 | %if mmsize == 8 |
| 1011 | .next8px: |
| 1012 | %endif |
; read: load the rows straddling the edge into m0..m7 = p3..q3.
; In the vertical path p0/q0 are fetched later (registers are scarce);
; in the horizontal paths 8 (mmx) or 16 (sse2) rows are loaded and
; transposed so each register again holds one full row of p3..q3.
    lea       dst2q, [dst1q+ strideq ]
%ifidn %1, v
%if %2 == 8 && mmsize == 16
%define movrow movh                  ; chroma/xmm: 8 px per plane, low half only
%else
%define movrow mova
%endif
    movrow    m0, [dst1q+mstrideq*4] ; p3
    movrow    m1, [dst2q+mstrideq*4] ; p2
    movrow    m2, [dst1q+mstrideq*2] ; p1
    movrow    m5, [dst2q]            ; q1
    movrow    m6, [dst2q+ strideq ]  ; q2
    movrow    m7, [dst2q+ strideq*2] ; q3
%if mmsize == 16 && %2 == 8
    ; chroma/xmm: second plane occupies the high halves of the registers
    movhps    m0, [dst8q+mstrideq*4]
    movhps    m2, [dst8q+mstrideq*2]
    add       dst8q, strideq
    movhps    m1, [dst8q+mstrideq*4]
    movhps    m5, [dst8q]
    movhps    m6, [dst8q+ strideq ]
    movhps    m7, [dst8q+ strideq*2]
    add       dst8q, mstrideq
%endif
%elif mmsize == 8 ; mmx/mmxext (h)
    ; read 8 rows of 8px each
    movu      m0, [dst1q+mstrideq*4]
    movu      m1, [dst2q+mstrideq*4]
    movu      m2, [dst1q+mstrideq*2]
    movu      m3, [dst1q+mstrideq ]
    movu      m4, [dst1q]
    movu      m5, [dst2q]
    movu      m6, [dst2q+ strideq ]

    ; 8x8 transpose; q0 and p0 are spilled to their backup slots because
    ; only 8 mm registers are available
    TRANSPOSE4x4B 0, 1, 2, 3, 7
    mova      m_q0backup, m1
    movu      m7, [dst2q+ strideq*2]
    TRANSPOSE4x4B 4, 5, 6, 7, 1
    SBUTTERFLY dq, 0, 4, 1     ; p3/p2
    SBUTTERFLY dq, 2, 6, 1     ; q0/q1
    SBUTTERFLY dq, 3, 7, 1     ; q2/q3
    mova      m1, m_q0backup
    mova      m_q0backup, m2   ; store q0
    SBUTTERFLY dq, 1, 5, 2     ; p1/p0
    mova      m_p0backup, m5   ; store p0
    SWAP      1, 4
    SWAP      2, 4
    SWAP      6, 3
    SWAP      5, 3
%else ; sse2 (h)
%if %2 == 16
    lea       dst8q, [dst1q+ strideq*8 ]
%endif

    ; read 16 rows of 8px each, interleave (rows A..P end up paired as
    ; A/I, B/J, ... so one 8x16 transpose yields full 16-px rows)
    movh      m0, [dst1q+mstrideq*4]
    movh      m1, [dst8q+mstrideq*4]
    movh      m2, [dst1q+mstrideq*2]
    movh      m5, [dst8q+mstrideq*2]
    movh      m3, [dst1q+mstrideq ]
    movh      m6, [dst8q+mstrideq ]
    movh      m4, [dst1q]
    movh      m7, [dst8q]
    punpcklbw m0, m1           ; A/I
    punpcklbw m2, m5           ; C/K
    punpcklbw m3, m6           ; D/L
    punpcklbw m4, m7           ; E/M

    add       dst8q, strideq
    movh      m1, [dst2q+mstrideq*4]
    movh      m6, [dst8q+mstrideq*4]
    movh      m5, [dst2q]
    movh      m7, [dst8q]
    punpcklbw m1, m6           ; B/J
    punpcklbw m5, m7           ; F/N
    movh      m6, [dst2q+ strideq ]
    movh      m7, [dst8q+ strideq ]
    punpcklbw m6, m7           ; G/O

    ; 8x16 transpose; with 16 XMM regs (m8 defined) q0/p0 are parked in
    ; m8/m12 instead of memory
    TRANSPOSE4x4B 0, 1, 2, 3, 7
%ifdef m8
    SWAP      1, 8
%else
    mova      m_q0backup, m1
%endif
    movh      m7, [dst2q+ strideq*2]
    movh      m1, [dst8q+ strideq*2]
    punpcklbw m7, m1           ; H/P
    TRANSPOSE4x4B 4, 5, 6, 7, 1
    SBUTTERFLY dq, 0, 4, 1     ; p3/p2
    SBUTTERFLY dq, 2, 6, 1     ; q0/q1
    SBUTTERFLY dq, 3, 7, 1     ; q2/q3
%ifdef m8
    SWAP      1, 8
    SWAP      2, 8
%else
    mova      m1, m_q0backup
    mova      m_q0backup, m2   ; store q0
%endif
    SBUTTERFLY dq, 1, 5, 2     ; p1/p0
%ifdef m12
    SWAP      5, 12
%else
    mova      m_p0backup, m5   ; store p0
%endif
    SWAP      1, 4
    SWAP      2, 4
    SWAP      6, 3
    SWAP      5, 3
%endif
| 1125 | |
; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1:
; each absolute difference must be <= I.  The mova+SWAP pairs duplicate a
; value and then rotate the x264asm register-name mapping, so both the
; original and the copy remain addressable by later code.
    mova      m4, m1
    SWAP      4, 1
    psubusb   m4, m0           ; p2-p3
    psubusb   m0, m1           ; p3-p2
    por       m0, m4           ; abs(p3-p2)

    mova      m4, m2
    SWAP      4, 2
    psubusb   m4, m1           ; p1-p2
    mova      m_p2backup, m1   ; p2 is needed again by filter_mbedge (a2 stage)
    psubusb   m1, m2           ; p2-p1
    por       m1, m4           ; abs(p2-p1)

    mova      m4, m6
    SWAP      4, 6
    psubusb   m4, m7           ; q2-q3
    psubusb   m7, m6           ; q3-q2
    por       m7, m4           ; abs(q3-q2)

    mova      m4, m5
    SWAP      4, 5
    psubusb   m4, m6           ; q1-q2
    mova      m_q2backup, m6   ; q2 is needed again by filter_mbedge (a2 stage)
    psubusb   m6, m5           ; q2-q1
    por       m6, m4           ; abs(q2-q1)

%if notcpuflag(mmxext)
    ; plain mmx has no pmaxub: compare each difference against I separately
    ; (psubusb saturates to 0 when <= I) and AND the resulting masks
    mova      m4, m_flimI
    pxor      m3, m3
    psubusb   m0, m4
    psubusb   m1, m4
    psubusb   m7, m4
    psubusb   m6, m4
    pcmpeqb   m0, m3           ; abs(p3-p2) <= I
    pcmpeqb   m1, m3           ; abs(p2-p1) <= I
    pcmpeqb   m7, m3           ; abs(q3-q2) <= I
    pcmpeqb   m6, m3           ; abs(q2-q1) <= I
    pand      m0, m1
    pand      m7, m6
    pand      m0, m7
%else ; mmxext/sse2
    ; fold all differences into one running maximum; the single compare
    ; against I happens later
    pmaxub    m0, m1
    pmaxub    m6, m7
    pmaxub    m0, m6
%endif
| 1172 | |
; normal_limit and high_edge_variance for p1-p0, q1-q0:
; these two differences feed both the <= I filter mask and the
; hev (high edge variance) mask used to pick the filter strength.
    SWAP      7, 3             ; now m7 is zero
%ifidn %1, v
    movrow    m3, [dst1q+mstrideq ] ; p0
%if mmsize == 16 && %2 == 8
    movhps    m3, [dst8q+mstrideq ]
%endif
%elifdef m12
    SWAP      3, 12            ; p0 was parked in m12 during the transpose
%else
    mova      m3, m_p0backup
%endif

    mova      m1, m2
    SWAP      1, 2
    mova      m6, m3
    SWAP      3, 6
    psubusb   m1, m3           ; p1-p0
    psubusb   m6, m2           ; p0-p1
    por       m1, m6           ; abs(p1-p0)
%if notcpuflag(mmxext)
    mova      m6, m1
    psubusb   m1, m4
    psubusb   m6, m_hevthr
    pcmpeqb   m1, m7           ; abs(p1-p0) <= I
    pcmpeqb   m6, m7           ; abs(p1-p0) <= hev_thresh
    pand      m0, m1
    mova      m_maskres, m6
%else ; mmxext/sse2
    pmaxub    m0, m1           ; max_I
    SWAP      1, 4             ; max_hev_thresh
%endif

    SWAP      6, 4             ; now m6 is I
%ifidn %1, v
    movrow    m4, [dst1q]      ; q0
%if mmsize == 16 && %2 == 8
    movhps    m4, [dst8q]
%endif
%elifdef m8
    SWAP      4, 8             ; q0 was parked in m8 during the transpose
%else
    mova      m4, m_q0backup
%endif
    mova      m1, m4
    SWAP      1, 4
    mova      m7, m5
    SWAP      7, 5
    psubusb   m1, m5           ; q0-q1
    psubusb   m7, m4           ; q1-q0
    por       m1, m7           ; abs(q1-q0)
%if notcpuflag(mmxext)
    mova      m7, m1
    psubusb   m1, m6
    psubusb   m7, m_hevthr
    pxor      m6, m6
    pcmpeqb   m1, m6           ; abs(q1-q0) <= I
    pcmpeqb   m7, m6           ; abs(q1-q0) <= hev_thresh
    mova      m6, m_maskres
    pand      m0, m1           ; abs([pq][321]-[pq][210]) <= I
    pand      m6, m7
%else ; mmxext/sse2
    ; one compare resolves the accumulated maxima against both thresholds
    pxor      m7, m7
    pmaxub    m0, m1
    pmaxub    m6, m1
    psubusb   m0, m_flimI
    psubusb   m6, m_hevthr
    pcmpeqb   m0, m7           ; max(abs(..)) <= I
    pcmpeqb   m6, m7           ; !(max(abs..) > thresh)
%endif
%ifdef m12
    SWAP      6, 12
%else
    mova      m_maskres, m6    ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
%endif
| 1248 | |
; simple_limit: the filter only applies where
; 2*abs(q0-p0) + abs(q1-p1)/2 <= E; AND this into the filter mask.
    mova      m1, m3
    SWAP      1, 3
    mova      m6, m4           ; keep copies of p0/q0 around for later use
    SWAP      6, 4
    psubusb   m1, m4           ; p0-q0
    psubusb   m6, m3           ; q0-p0
    por       m1, m6           ; abs(q0-p0)
    paddusb   m1, m1           ; m1=2*abs(q0-p0)

    mova      m7, m2
    SWAP      7, 2
    mova      m6, m5
    SWAP      6, 5
    psubusb   m7, m5           ; p1-q1
    psubusb   m6, m2           ; q1-p1
    por       m7, m6           ; abs(q1-p1)
    pxor      m6, m6
    ; clear each byte's lsb first so the 64-bit psrlq cannot shift a
    ; neighbouring byte's bit into this byte (per-byte >>1 emulation)
    pand      m7, [pb_FE]
    psrlq     m7, 1            ; abs(q1-p1)/2
    paddusb   m7, m1           ; abs(q0-p0)*2+abs(q1-p1)/2
    psubusb   m7, m_flimE
    pcmpeqb   m7, m6           ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
    pand      m0, m7           ; normal_limit result
| 1273 | |
; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask.
; Compute w = clip(3*(q0-p0)+(p1-q1)) in the signed domain (bias by 0x80),
; then apply f2=(w+3)>>3 to p0 and f1=(w+4)>>3 to q0 where hev is set;
; the hev-clear lanes are saved in m_limres for filter_mbedge.
%ifdef m8 ; x86-64 && sse2
    mova      m8, [pb_80]
%define m_pb_80 m8
%else ; x86-32 or mmx/mmxext
%define m_pb_80 [pb_80]
%endif
    mova      m1, m4
    mova      m7, m3
    pxor      m1, m_pb_80      ; bias unsigned bytes into signed range
    pxor      m7, m_pb_80
    psubsb    m1, m7           ; (signed) q0-p0
    mova      m6, m2
    mova      m7, m5
    pxor      m6, m_pb_80
    pxor      m7, m_pb_80
    psubsb    m6, m7           ; (signed) p1-q1
    mova      m7, m_maskres
    paddsb    m6, m1
    paddsb    m6, m1
    paddsb    m6, m1           ; saturating 3*(q0-p0)+(p1-q1)
    pand      m6, m0
%ifdef m8
    mova      m_limres, m6     ; 3*(q0-p0)+(p1-q1) masked for filter_mbedge
    pand      m_limres, m7
%else
    mova      m0, m6
    pand      m0, m7
    mova      m_limres, m0
%endif
    pandn     m7, m6           ; 3*(q0-p0)+(p1-q1) masked for filter_common

    mova      m1, [pb_F8]
    mova      m6, m7
    paddsb    m7, [pb_3]       ; w+3 -> f2, applied to p0
    paddsb    m6, [pb_4]       ; w+4 -> f1, applied to q0
    ; clearing each byte's low 3 bits lets the 64-bit psrlq below act as a
    ; per-byte >>3 (the bits shifted in from the neighbour byte are zero)
    pand      m7, m1
    pand      m6, m1

    ; negative lanes are handled by shifting the negated value and
    ; selecting between +f and -f with the sign mask
    pxor      m1, m1
    pxor      m0, m0
    pcmpgtb   m1, m7           ; lanes where f2 < 0
    psubb     m0, m7
    psrlq     m7, 3            ; +f2
    psrlq     m0, 3            ; -f2
    pand      m0, m1
    pandn     m1, m7
    psubusb   m3, m0
    paddusb   m3, m1           ; p0+f2

    pxor      m1, m1
    pxor      m0, m0
    pcmpgtb   m0, m6           ; lanes where f1 < 0
    psubb     m1, m6
    psrlq     m6, 3            ; +f1
    psrlq     m1, 3            ; -f1
    pand      m1, m0
    pandn     m0, m6
    psubusb   m4, m0
    paddusb   m4, m1           ; q0-f1
| 1334 | |
; filter_mbedge (m2-m5 = p1-q1; lim_res carries w)
; For the hev-clear lanes, derive from w = clip(3*(q0-p0)+(p1-q1)):
;   a0 = (27*w+63)>>7 applied to p0/q0
;   a1 = (18*w+63)>>7 applied to p1/q1
;   a2 = ( 9*w+63)>>7 applied to p2/q2
; ssse3 computes k*w+63 in a single pmaddubsw against pb_K_63 (w bytes
; interleaved with 1s); older ISAs widen to words and use pmullw + pw_63.
%if cpuflag(ssse3)
    mova      m7, [pb_1]
%else
    mova      m7, [pw_63]
%endif
%ifdef m8
    SWAP      1, 8
%else
    mova      m1, m_limres
%endif
    pxor      m0, m0
    mova      m6, m1
    pcmpgtb   m0, m1           ; which are negative
%if cpuflag(ssse3)
    punpcklbw m6, m7           ; interleave with "1" for rounding
    punpckhbw m1, m7
%else
    punpcklbw m6, m0           ; signed byte->word
    punpckhbw m1, m0
%endif
    mova      m_limsign, m0
%if cpuflag(ssse3)
    mova      m7, [pb_27_63]
%ifndef m8
    mova      m_limres, m1
%endif
%ifdef m10
    SWAP      0, 10            ; don't lose lim_sign copy
%endif
    mova      m0, m7
    pmaddubsw m7, m6
    SWAP      6, 7
    pmaddubsw m0, m1
    SWAP      1, 0             ; m6/m1 = 27*w+63 (words)
%ifdef m10
    SWAP      0, 10
%else
    mova      m0, m_limsign
%endif
%else
    mova      m_maskres, m6    ; backup for later in filter
    mova      m_limres, m1
    pmullw    m6, [pw_27]
    pmullw    m1, [pw_27]
    paddw     m6, m7
    paddw     m1, m7
%endif
    psraw     m6, 7
    psraw     m1, 7
    packsswb  m6, m1           ; a0
    pxor      m1, m1
    psubb     m1, m6
    pand      m1, m0           ; -a0
    pandn     m0, m6           ; +a0
%if cpuflag(ssse3)
    mova      m6, [pb_18_63]   ; pipelining
%endif
    psubusb   m3, m1
    paddusb   m4, m1
    paddusb   m3, m0           ; p0+a0
    psubusb   m4, m0           ; q0-a0

%if cpuflag(ssse3)
    SWAP      6, 7
%ifdef m10
    SWAP      1, 10
%else
    mova      m1, m_limres
%endif
    mova      m0, m7
    pmaddubsw m7, m6
    SWAP      6, 7
    pmaddubsw m0, m1
    SWAP      1, 0             ; m6/m1 = 18*w+63 (words)
%ifdef m10
    SWAP      0, 10
%endif
    mova      m0, m_limsign
%else
    mova      m6, m_maskres
    mova      m1, m_limres
    pmullw    m6, [pw_18]
    pmullw    m1, [pw_18]
    paddw     m6, m7
    paddw     m1, m7
%endif
    ; reload lim_sign for the sign-select below; the pw_18 path needs it
    ; (m0 was clobbered), while after the ssse3 branch above it is redundant
    mova      m0, m_limsign
    psraw     m6, 7
    psraw     m1, 7
    packsswb  m6, m1           ; a1
    pxor      m1, m1
    psubb     m1, m6
    pand      m1, m0           ; -a1
    pandn     m0, m6           ; +a1
%if cpuflag(ssse3)
    mova      m6, [pb_9_63]
%endif
    psubusb   m2, m1
    paddusb   m5, m1
    paddusb   m2, m0           ; p1+a1
    psubusb   m5, m0           ; q1-a1

%if cpuflag(ssse3)
    SWAP      6, 7
%ifdef m10
    SWAP      1, 10
%else
    mova      m1, m_limres
%endif
    mova      m0, m7
    pmaddubsw m7, m6
    SWAP      6, 7
    pmaddubsw m0, m1
    SWAP      1, 0             ; m6/m1 = 9*w+63 (words)
%else
%ifdef m8
    SWAP      6, 12
    SWAP      1, 8
%else
    mova      m6, m_maskres
    mova      m1, m_limres
%endif
    pmullw    m6, [pw_9]
    pmullw    m1, [pw_9]
    paddw     m6, m7
    paddw     m1, m7
%endif
%ifdef m9
    SWAP      7, 9
%else
    mova      m7, m_limsign
%endif
    psraw     m6, 7
    psraw     m1, 7
    packsswb  m6, m1           ; a2
    pxor      m0, m0
    psubb     m0, m6
    pand      m0, m7           ; -a2
    pandn     m7, m6           ; +a2
%ifdef m8
    SWAP      1, 13            ; p2 backup (m13 on x86-64)
    SWAP      6, 14            ; q2 backup (m14 on x86-64)
%else
    mova      m1, m_p2backup
    mova      m6, m_q2backup
%endif
    psubusb   m1, m0
    paddusb   m6, m0
    paddusb   m1, m7           ; p2+a2
    psubusb   m6, m7           ; q2-a2
| 1486 | |
; store: write back the six modified rows m1..m6 = p2..q2
; (p3/q3 in m0/m7 are not modified by the mbedge filter)
%ifidn %1, v
    movrow [dst2q+mstrideq*4], m1  ; p2
    movrow [dst1q+mstrideq*2], m2  ; p1
    movrow [dst1q+mstrideq ], m3   ; p0
    movrow [dst1q], m4             ; q0
    movrow [dst2q], m5             ; q1
    movrow [dst2q+ strideq ], m6   ; q2
%if mmsize == 16 && %2 == 8
    ; chroma/xmm: high halves go to the second plane
    add       dst8q, mstrideq
    movhps [dst8q+mstrideq*2], m1
    movhps [dst8q+mstrideq ], m2
    movhps [dst8q], m3
    add       dst8q, strideq
    movhps [dst8q], m4
    movhps [dst8q+ strideq ], m5
    movhps [dst8q+ strideq*2], m6
%endif
%else ; h
    ; only 6 of the 8 loaded columns changed; shift right one column so the
    ; transposed write starts at p2
    inc       dst1q
    inc       dst2q

    ; 4x8/16 transpose
    TRANSPOSE4x4B 1, 2, 3, 4, 0
    SBUTTERFLY bw, 5, 6, 0

%if mmsize == 8 ; mmx/mmxext (h)
    WRITE_4x2D 1, 2, 3, 4, dst1q, dst2q, mstrideq, strideq
    add       dst1q, 4
    WRITE_2x4W m5, m6, dst2q, dst1q, mstrideq, strideq
%else ; sse2 (h)
    lea       dst8q, [dst8q+mstrideq+1]
    WRITE_4x4D 1, 2, 3, 4, dst1q, dst2q, dst8q, mstrideq, strideq, %2
    lea       dst1q, [dst2q+mstrideq+4]
    lea       dst8q, [dst8q+mstrideq+4]
%if cpuflag(sse4)
    ; sse4's WRITE_8W variant consumes its pointers differently
    add       dst2q, 4
%endif
    WRITE_8W  m5, dst2q, dst1q, mstrideq, strideq
%if cpuflag(sse4)
    lea       dst2q, [dst8q+ strideq ]
%endif
    WRITE_8W  m6, dst2q, dst8q, mstrideq, strideq
%endif
%endif
| 1532 | |
%if mmsize == 8
%if %2 == 8 ; chroma
    ; chroma on mmx: run the loop body once per plane.  The cmp/jnz pair
    ; brackets the mov (mov does not alter flags): loop back exactly once,
    ; after switching dst1 to the second plane.
%ifidn %1, h
    sub       dst1q, 5         ; undo the +1 and +4 applied during the h store
%endif
    cmp       dst1q, dst8q
    mov       dst1q, dst8q
    jnz       .next8px
%else
    ; luma on mmx: two 8-px passes (cntr initialised to 2)
%ifidn %1, h
    lea       dst1q, [dst1q+ strideq*8-5] ; next 8 rows; undo the store offset
%else ; v
    add       dst1q, 8         ; next 8 columns
%endif
    dec       cntrq
    jg        .next8px
%endif
    REP_RET                    ; x86inc ret-after-branch helper
%else ; mmsize == 16
    RET
%endif
%endmacro
| 1555 | |
; Instantiate the macro for each ISA level.  x86-64 always has SSE2, so the
; mmx/mmxext versions are only built for x86-32.
%if ARCH_X86_32
INIT_MMX mmx
MBEDGE_LOOPFILTER v, 16
MBEDGE_LOOPFILTER h, 16
MBEDGE_LOOPFILTER v, 8
MBEDGE_LOOPFILTER h, 8

INIT_MMX mmxext
MBEDGE_LOOPFILTER v, 16
MBEDGE_LOOPFILTER h, 16
MBEDGE_LOOPFILTER v, 8
MBEDGE_LOOPFILTER h, 8
%endif

INIT_XMM sse2
MBEDGE_LOOPFILTER v, 16
MBEDGE_LOOPFILTER h, 16
MBEDGE_LOOPFILTER v, 8
MBEDGE_LOOPFILTER h, 8

INIT_XMM ssse3
MBEDGE_LOOPFILTER v, 16
MBEDGE_LOOPFILTER h, 16
MBEDGE_LOOPFILTER v, 8
MBEDGE_LOOPFILTER h, 8

; sse4 differs only in the horizontal store path (see the cpuflag(sse4)
; branches inside the macro), so only the h variants are instantiated
INIT_XMM sse4
MBEDGE_LOOPFILTER h, 16
MBEDGE_LOOPFILTER h, 8