| 1 | ;****************************************************************************** |
| 2 | ;* Copyright (c) 2012 Loren Merritt |
| 3 | ;* |
| 4 | ;* This file is part of FFmpeg. |
| 5 | ;* |
| 6 | ;* FFmpeg is free software; you can redistribute it and/or |
| 7 | ;* modify it under the terms of the GNU Lesser General Public |
| 8 | ;* License as published by the Free Software Foundation; either |
| 9 | ;* version 2.1 of the License, or (at your option) any later version. |
| 10 | ;* |
| 11 | ;* FFmpeg is distributed in the hope that it will be useful, |
| 12 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 13 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 14 | ;* Lesser General Public License for more details. |
| 15 | ;* |
| 16 | ;* You should have received a copy of the GNU Lesser General Public |
| 17 | ;* License along with FFmpeg; if not, write to the Free Software |
| 18 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| 19 | ;****************************************************************************** |
| 20 | |
| 21 | %include "libavutil/x86/x86util.asm" |
| 22 | |
| 23 | SECTION .text |
| 24 | |
;------------------------------------------------------------------------------
; LOWPASS prevsample, cursample, lut
;
;   prevsample = cursample + lut[(prevsample - cursample) >> (8 - lut_bits)]
;
; %1  prevsample  register name stem (no size suffix); overwritten with the
;                 filtered value.
; %2  cursample   register name stem; only read.
; %3  lut         register name stem holding the address of a table of signed
;                 16-bit entries. The index is the signed, scaled difference
;                 and may be negative.
;
; Depends on the assemble-time symbol lut_bits being assigned before the macro
; is expanded. Clobbers the flags.
;
; The table entry and the sum are produced at full register width on purpose.
; The result of one LOWPASS is fed as cursample into the full-width `sub` of
; the next one. On x86-64 a 32-bit destination zero-extends, so a sum that
; dipped below zero would look like a value near 2^32 and yield a table index
; far outside the table. Full-width operands keep the value sign-extended; the
; low 32 bits (all that is ever stored) are the same either way. On x86-32 the
; q and d register names are the same width under x86inc, so nothing changes.
;------------------------------------------------------------------------------
%macro LOWPASS 3 ; prevsample, cursample, lut
    sub    %1q, %2q                         ; signed difference prev - cur
%if lut_bits != 8
    sar    %1q, 8-lut_bits                  ; arithmetic shift keeps the sign
%endif
    movsx  %1q, word [%3q+%1q*2]            ; sign-extended 16-bit table entry
    add    %1q, %2q
%endmacro
| 33 | |
;------------------------------------------------------------------------------
; LOAD dstreg, x, bitdepth
;
; Fetches sample number x (an index expression, counted in samples) relative
; to srcq and widens it to a 16-bit scale.
;
; %1  dstreg    destination register, written with its size suffix
; %2  x         index expression; scaled by 2 when samples are words
; %3  bitdepth  8 selects byte samples, anything else word samples
;
; For depths below 16 the sample is shifted up to occupy the top of a 16-bit
; value and a bias of one less than half an input step is added. Depth 16 is
; loaded unchanged. Flags are clobbered whenever the shift/add is emitted.
;------------------------------------------------------------------------------
%macro LOAD 3 ; dstreg, x, bitdepth
%if %3 != 8
    movzx  %1, word [srcq+(%2)*2]           ; two bytes per sample
%else
    movzx  %1, byte [srcq+%2]               ; one byte per sample
%endif
%if %3 != 16
    %assign %%upshift 16-%3                 ; bits needed to reach 16-bit scale
    %assign %%bias    (1<<(%%upshift-1))-1  ; same value as (1<<(15-depth))-1
    shl    %1, %%upshift
    add    %1, %%bias
%endif
%endmacro
| 45 | |
;------------------------------------------------------------------------------
; HQDN3D_ROW bitdepth
;
; Emits the function hqdn3d_row_<bitdepth>_x86 taking seven arguments:
;   src, dst            input / output sample rows; 1 byte per sample when
;                       bitdepth is 8, otherwise 2 bytes (see bytedepth)
;   lineant, frameant   rows of 16-bit state, read and rewritten per sample
;   width               number of samples in the row; must be >= 1 (there is
;                       no check for zero or negative widths)
;   spatial, temporal   lookup-table addresses handed to LOWPASS
;
; With pixelant seeded from LOAD(src[0]), each sample x is processed as:
;   t1 = lineant[x]  = LOWPASS(lineant[x],  pixelant,       spatial)
;   pixelant         = LOWPASS(pixelant,    LOAD(src[x+1]), spatial)
;   t0 = frameant[x] = LOWPASS(frameant[x], t1,             temporal)
;   dst[x]           = t0 >> (16 - bitdepth)
; Only the low 16 bits of t1 / t0 are stored to lineant / frameant.
;
; All four row pointers are moved to the last sample and x counts from
; -(width-1) up to 0, so the `inc xq` at the bottom of the loop also produces
; the flags for loop control.
;
; x86-64 uses ten registers. x86-32 has only seven: the four row pointers are
; written to their *mp locations and reloaded through movifnidn right before
; each use, sharing r0, while r1 becomes pixelant and r2/r3 the temporaries.
; (Under x86inc, movifnidn emits nothing when both operands are the same name,
; which is presumably the case for the *mp names on x86-64.)
;------------------------------------------------------------------------------
%macro HQDN3D_ROW 1 ; bitdepth
%if ARCH_X86_64
cglobal hqdn3d_row_%1_x86, 7,10,0, src, dst, lineant, frameant, width, spatial, temporal, pixelant, t0, t1
%else
cglobal hqdn3d_row_%1_x86, 7,7,0, src, dst, lineant, frameant, width, spatial, temporal
%endif
    %assign bytedepth (%1+7)>>3             ; bytes per src/dst sample
    %assign lut_bits 4+4*(%1/16)            ; 8 for depth 16, otherwise 4
    dec    widthq                           ; index of the last sample
    lea    srcq, [srcq+widthq*bytedepth]
    lea    dstq, [dstq+widthq*bytedepth]
    lea    frameantq, [frameantq+widthq*2]
    lea    lineantq,  [lineantq+widthq*2]
    neg    widthq                           ; x = -(width-1), counts up to 0
    %define xq widthq
%if ARCH_X86_32
    ; Park the biased pointers, then remap the freed registers.
    mov    dstmp, dstq
    mov    srcmp, srcq
    mov    frameantmp, frameantq
    mov    lineantmp,  lineantq
    %define dstq r0
    %define frameantq r0
    %define lineantq  r0
    %define pixelantq r1
    %define pixelantd r1d
    DECLARE_REG_TMP 2,3
%endif
    LOAD   pixelantd, xq, %1
    ; A one-sample row starts with x == 0, i.e. already on its final sample.
    ; It has to bypass the lookahead load at .loop, exactly as the final
    ; iteration of a longer row does, or src[1] would be read past the row.
    test   xq, xq
    jz .single
ALIGN 16
.loop:
    movifnidn srcq, srcmp
    LOAD      t0d, xq+1, %1 ; skip on the last iteration to avoid overread
.loop2:
    movifnidn lineantq, lineantmp
    movzx     t1d, word [lineantq+xq*2]
    LOWPASS   t1, pixelant, spatial
    mov       [lineantq+xq*2], t1w
    LOWPASS   pixelant, t0, spatial         ; result unused after the last sample
    movifnidn frameantq, frameantmp
    movzx     t0d, word [frameantq+xq*2]
    LOWPASS   t0, t1, temporal
    mov       [frameantq+xq*2], t0w
    movifnidn dstq, dstmp
%if %1 != 16
    shr    t0d, 16-%1 ; could eliminate this by storing from t0h, but only with some constraints on register allocation
%endif
%if %1 == 8
    mov    [dstq+xq], t0b
%else
    mov    [dstq+xq*2], t0w
%endif
    inc    xq
    jl .loop                                ; x < 0: a following sample exists
    je .loop2                               ; x == 0: final sample, no lookahead
    REP_RET
.single:
    ; The lookahead LOWPASS at .loop2 still runs and builds a table index from
    ; t0, so give t0 a defined value. Reusing the current sample makes the
    ; difference zero; the result of that LOWPASS is never used.
    mov    t0d, pixelantd
    jmp .loop2
%endmacro ; HQDN3D_ROW

; One row filter per supported sample depth.
HQDN3D_ROW 8
HQDN3D_ROW 9
HQDN3D_ROW 10
HQDN3D_ROW 16