| 1 | ;****************************************************************************** |
| 2 | ;* x86-optimized functions for gradfun filter |
| 3 | ;* |
| 4 | ;* This file is part of FFmpeg. |
| 5 | ;* |
| 6 | ;* FFmpeg is free software; you can redistribute it and/or |
| 7 | ;* modify it under the terms of the GNU Lesser General Public |
| 8 | ;* License as published by the Free Software Foundation; either |
| 9 | ;* version 2.1 of the License, or (at your option) any later version. |
| 10 | ;* |
| 11 | ;* FFmpeg is distributed in the hope that it will be useful, |
| 12 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 13 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 14 | ;* Lesser General Public License for more details. |
| 15 | ;* |
| 16 | ;* You should have received a copy of the GNU Lesser General Public |
| 17 | ;* License along with FFmpeg; if not, write to the Free Software |
| 18 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| 19 | ;****************************************************************************** |
| 20 | |
| 21 | %include "libavutil/x86/x86util.asm" |
| 22 | |
; Read-only word constants used by the routines in this file.  Each label
; holds eight 16-bit words (16 bytes).
| 23 | SECTION_RODATA |
| 24 | |
; pw_7f: 0x007F in every word.  Loaded into m6 by both gradfun_filter_line
; variants and subtracted inside FILTER_LINE.
| 25 | pw_7f: times 8 dw 0x7F |
; pw_ff: 0x00FF in every word.  Loaded into m7 by BLUR_LINE and used as the
; mask that keeps the low byte of each word.
| 26 | pw_ff: times 8 dw 0xFF |
| 27 | |
| 28 | SECTION .text |
| 29 | |
; FILTER_LINE addend
;
; Emits one straight-line pass that reads bytes at [r2+r0] and 16-bit words
; at [r3+r0], and stores result bytes at [r1+r0].  Per 16-bit lane:
;
;   pix   = byte << 7
;   delta = ref - pix                          (ref = word from [r3+r0],
;                                               each loaded word used twice)
;   m     = min(((|delta| * m5) >> 16) - 0x7F, 0)
;   m     = low 16 bits of m * m
;   out   = sat_u8((pix + addend + (((delta << 2) * m) >> 16)) >> 7)
;
; movh and PABSW are macros from the included x86util.asm / x86inc.asm and
; are not visible in this file.  movh is assumed to move the low half of a
; vector register and PABSW to produce a per-word absolute value - TODO
; confirm against those headers.
;
; Argument:
;   %1 - operand added to the scaled pixels; both users in this file pass a
;        register that was loaded from [r5].
; Expected on entry (established by the users in this file):
;   r0 - byte offset shared by the r1 / r2 / r3 accesses
;   m5 - multiplier replicated in every word (broadcast from r4d)
;   m6 - pw_7f
;   m7 - all zero
; Clobbers m0, m1, m2.  r0 is not advanced here; the caller does that.
| 30 | %macro FILTER_LINE 1 |
; Load the input bytes and the reference words for this step.
| 31 | movh m0, [r2+r0] |
| 32 | movh m1, [r3+r0] |
; Interleaving with the zero register widens each input byte to a word.
| 33 | punpcklbw m0, m7 |
; Interleaving m1 with itself duplicates every loaded reference word, so one
; reference word lines up with two adjacent pixel lanes.
| 34 | punpcklwd m1, m1 |
; pix = byte << 7; delta = ref - pix.
| 35 | psllw m0, 7 |
| 36 | psubw m1, m0 |
; m = ((|delta| * m5) >> 16) - 0x7F, using the unsigned high half of the
; product.
| 37 | PABSW m2, m1 |
| 38 | pmulhuw m2, m5 |
| 39 | psubw m2, m6 |
; Signed minimum against zero: positive values become 0, so such lanes get no
; correction after the squaring below.
| 40 | pminsw m2, m7 |
; Square m (low 16 bits of the product are kept).
| 41 | pmullw m2, m2 |
; delta << 2, then add the caller-supplied addend to pix while the multiply
; inputs are being prepared.
| 42 | psllw m1, 2 |
| 43 | paddw m0, %1 |
; Signed high half of (delta << 2) * m, accumulated onto pix + addend.
| 44 | pmulhw m1, m2 |
| 45 | paddw m0, m1 |
; Undo the << 7 scaling with an arithmetic shift, then narrow words to bytes
; with unsigned saturation.
| 46 | psraw m0, 7 |
| 47 | packuswb m0, m0 |
; Store the low half of the packed result.
| 48 | movh [r1+r0], m0 |
| 49 | %endmacro |
| 50 | |
; gradfun_filter_line, built under INIT_MMX mmxext.
;
; cglobal, INIT_MMX, mova, movh and REP_RET come from x86inc / x86util and
; are not visible in this file.  The "6, 6" after the name presumably
; declares 6 arguments and 6 general-purpose registers, per x86inc
; convention - TODO confirm.
;
; Register roles as used below (the C prototype is not visible here):
;   r0  - running byte offset.  The loop continues only while r0 is still
;         negative after being advanced, so it is presumably passed in as a
;         negative offset with the pointers pre-advanced - TODO confirm
;         against the C caller.
;   r1  - base of the output bytes (written by FILTER_LINE)
;   r2  - base of the input bytes (read by FILTER_LINE)
;   r3  - base of the 16-bit reference words (read by FILTER_LINE)
;   r4d - scalar whose low 16 bits are broadcast into m5
;   r5  - pointer to 16 bytes of addends, read as two 8-byte halves
;
; NOTE(review): no emms appears in this routine's visible text; confirm that
; MMX state is cleared elsewhere (caller or the REP_RET expansion).
| 51 | INIT_MMX mmxext |
| 52 | cglobal gradfun_filter_line, 6, 6 |
; m5 = low word of r4d replicated into all four words; m7 = 0 as required by
; FILTER_LINE.
| 53 | movh m5, r4d |
| 54 | pxor m7, m7 |
| 55 | pshufw m5, m5,0 |
; m6 = 0x7F per word; m3 / m4 = first and second 8 bytes of the addends.
| 56 | mova m6, [pw_7f] |
| 57 | mova m3, [r5] |
| 58 | mova m4, [r5+8] |
; Each iteration runs FILTER_LINE twice, alternating the two addend halves and
; advancing r0 by 4 after each expansion.
| 59 | .loop: |
| 60 | FILTER_LINE m3 |
| 61 | add r0, 4 |
; Leave between the two halves if the offset is no longer negative.
| 62 | jge .end |
| 63 | FILTER_LINE m4 |
| 64 | add r0, 4 |
; Signed test: keep looping while the offset is still below zero.
| 65 | jl .loop |
| 66 | .end: |
| 67 | REP_RET |
| 68 | |
; gradfun_filter_line, built under INIT_XMM ssse3.
;
; Same register roles as the MMX variant in this file: r0 = running byte
; offset, r1 = output bytes, r2 = input bytes, r3 = 16-bit reference words,
; r4d = scalar broadcast into m5, r5 = pointer to the addends.  Here the
; 16 bytes at [r5] are loaded into a single register and r0 advances by 8 per
; FILTER_LINE expansion.
;
; cglobal, INIT_XMM, mova and REP_RET come from x86inc / x86util and are not
; visible in this file.  The "6, 6, 8" presumably declares 6 arguments,
; 6 general-purpose registers and 8 vector registers - TODO confirm.
; NOTE(review): mova under INIT_XMM is presumably an aligned 16-byte move, in
; which case [r5] must be 16-byte aligned - confirm with the caller.
| 69 | INIT_XMM ssse3 |
| 70 | cglobal gradfun_filter_line, 6, 6, 8 |
; m5 = low word of r4d in every lane: pshuflw replicates word 0 across the low
; four words, and the punpcklqdq further down copies that low qword into the
; high qword.  m7 = 0 as required by FILTER_LINE.
| 71 | movd m5, r4d |
| 72 | pxor m7, m7 |
| 73 | pshuflw m5, m5, 0 |
; m6 = 0x7F per word (independent of the m5 broadcast around it).
| 74 | mova m6, [pw_7f] |
| 75 | punpcklqdq m5, m5 |
; m4 = the 16 bytes of addends at [r5].
| 76 | mova m4, [r5] |
| 77 | .loop: |
| 78 | FILTER_LINE m4 |
| 79 | add r0, 8 |
; Signed test: keep looping while the offset is still below zero.
| 80 | jl .loop |
| 81 | REP_RET |
| 82 | |
; BLUR_LINE load_insn
;
; Defines gradfun_blur_line_<load_insn>.  The argument is the instruction
; mnemonic used for the two loads from [r4+r0] and [r5+r0]; every other
; memory access uses mova or a direct memory operand regardless of the
; argument.
;
; Per 16-byte step, for each 16-bit lane i (a = bytes at r4+r0, b = bytes at
; r5+r0):
;   sum            = a[2i] + a[2i+1] + b[2i] + b[2i+1]
;   new            = sum + word[r2+r0][i]
;   old            = word[r1+r0][i]
;   word[r1+r0][i] = new
;   word[r3+r0][i] = new - old
;
; r0 is a running byte offset advanced by 16 per step; the loop continues
; only while r0 is still negative after the advance, so it is presumably
; passed in negative with the pointers pre-advanced - TODO confirm against
; the C caller.
;
; cglobal, mova and REP_RET come from x86inc / x86util and are not visible in
; this file.  "6, 6, 8" presumably declares 6 arguments, 6 general-purpose
; registers and 8 vector registers - TODO confirm.
; NOTE(review): when assembled for SSE2 (as done at the bottom of this file),
; the mova accesses and the paddw memory operand presumably require r1, r2
; and r3 (+r0) to be 16-byte aligned - confirm with the caller.
| 83 | %macro BLUR_LINE 1 |
| 84 | cglobal gradfun_blur_line_%1, 6, 6, 8 |
; m7 = 0x00FF per word, the low-byte mask.
| 85 | mova m7, [pw_ff] |
| 86 | .loop: |
; Load 16 bytes from each of the two byte inputs using the caller-selected
; load instruction.
| 87 | %1 m0, [r4+r0] |
| 88 | %1 m1, [r5+r0] |
; Keep copies so that both the high and the low byte of every word can be
; extracted.
| 89 | mova m2, m0 |
| 90 | mova m3, m1 |
; m0 / m1 = high byte of each word (odd-indexed bytes).
| 91 | psrlw m0, 8 |
| 92 | psrlw m1, 8 |
; m2 / m3 = low byte of each word (even-indexed bytes).
| 93 | pand m2, m7 |
| 94 | pand m3, m7 |
; Sum the four bytes that belong to each word lane.
| 95 | paddw m0, m1 |
| 96 | paddw m2, m3 |
| 97 | paddw m0, m2 |
; Add the words at [r2+r0] to form the new value.
| 98 | paddw m0, [r2+r0] |
; Read the previous contents of [r1+r0] before overwriting them with the new
; value.
| 99 | mova m1, [r1+r0] |
| 100 | mova [r1+r0], m0 |
; Store new - previous at [r3+r0].
| 101 | psubw m0, m1 |
| 102 | mova [r3+r0], m0 |
| 103 | add r0, 16 |
; Signed test: keep looping while the offset is still below zero.
| 104 | jl .loop |
| 105 | REP_RET |
| 106 | %endmacro |
| 107 | |
; Instantiate the blur routine twice under INIT_XMM sse2.  The macro argument
; becomes both the function-name suffix and the instruction used to load the
; [r4+r0] / [r5+r0] inputs: movdqa (aligned load) for the first variant and
; movdqu (unaligned load) for the second.
| 108 | INIT_XMM sse2 |
| 109 | BLUR_LINE movdqa |
| 110 | BLUR_LINE movdqu |