| 1 | /* |
| 2 | * Alpha optimized DSP utils |
| 3 | * Copyright (c) 2002 Falk Hueffner <falk@debian.org> |
| 4 | * |
| 5 | * This file is part of FFmpeg. |
| 6 | * |
| 7 | * FFmpeg is free software; you can redistribute it and/or |
| 8 | * modify it under the terms of the GNU Lesser General Public |
| 9 | * License as published by the Free Software Foundation; either |
| 10 | * version 2.1 of the License, or (at your option) any later version. |
| 11 | * |
| 12 | * FFmpeg is distributed in the hope that it will be useful, |
| 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 15 | * Lesser General Public License for more details. |
| 16 | * |
| 17 | * You should have received a copy of the GNU Lesser General Public |
| 18 | * License along with FFmpeg; if not, write to the Free Software |
| 19 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| 20 | */ |
| 21 | |
| 22 | #include "regdef.h" |
| 23 | |
| 24 | /* Some nicer register names: extra temporaries named in the style of t0-t9. */ |
| 25 | #define ta t10 |
| 26 | #define tb t11 |
| 27 | #define tc t12 |
| 28 | #define td AT /* the assembler temporary; see ".set noat" below */ |
| 29 | /* Danger: these overlap with the argument list (a3-a5) and the return value (v0), so each one is only safe as a temporary where the value it aliases is no longer needed. Of these, only te is used in the code visible here. */ |
| 30 | #define te a5 |
| 31 | #define tf a4 |
| 32 | #define tg a3 |
| 33 | #define th v0 |
| 34 | |
| 35 | .set noat /* AT is used explicitly (as td), so the assembler must leave it alone */ |
| 36 | .set noreorder /* ask the assembler to keep the instruction order as written */ |
| 37 | .arch pca56 /* the CPU the routine below was written for; it uses the MVI instruction perr */ |
| 38 | .text |
| 39 | |
| 40 | /***************************************************************************** |
| 41 | * int pix_abs16x16_mvi_asm(uint8_t *pix1, uint8_t *pix2, int line_size) |
| 42 | * |
| | * NOTE(review): the prototype above looks out of date. The code takes pix1 |
| | * from a1, pix2 from a2, the line stride from a3 and a row count h from a4, |
| | * and never reads the incoming a0. Presumably the C declaration has an extra |
| | * leading argument and a trailing "int h" - confirm against the C side. |
| | * |
| | * Result (v0): the running sum of all perr results. Each row contributes two |
| | * perr operations (left and right quadword), i.e. 16 bytes of pix1 against |
| | * 16 bytes of pix2. perr is the MVI "pixel error" instruction, which sums the |
| | * absolute differences of the eight byte pairs of its operands. |
| | * |
| | * Two loops are selected by (pix2 & 7): |
| | * $unaligned: 2 rows per iteration; pix2 is fetched with ldq_u and the two |
| | * quadwords of each row are assembled with extql/extqh/or. |
| | * $aligned: 4 rows per iteration; pix2 is fetched with plain ldq. |
| | * Both loops run until h reaches exactly zero, so h has to be a positive |
| | * multiple of 2 (unaligned) or 4 (aligned); there is no check for h == 0. |
| | * |
| | * Assumptions to confirm with the callers: pix1 is 8-byte aligned (it is |
| | * always read with ldq), and line_size is a multiple of 8 (the unaligned loop |
| | * takes its extql/extqh byte shift from a2 after a2 has already been advanced, |
| | * and the aligned loop keeps using ldq on pix2 after adding line_size). |
| | * |
| | * Clobbers t0-t12 (t10-t12 under the names ta-tc), AT (td), a0-a2, a4, |
| | * a5 (te) and v0. a3 is only read. No stack is used. |
| | * |
| 43 | * This code is written with a pca56 in mind. For ev6, one should |
| 44 | * really take the increased latency of 3 cycles for MVI instructions |
| 45 | * into account. |
| 46 | * |
| 47 | * It is important to keep the loading and first use of a register as |
| 48 | * far apart as possible, because if a register is accessed before it |
| 49 | * has been fetched from memory, the CPU will stall. |
| 50 | */ |
| 51 | .align 4 |
| 52 | .globl pix_abs16x16_mvi_asm |
| 53 | .ent pix_abs16x16_mvi_asm |
| 54 | pix_abs16x16_mvi_asm: |
| 55 | .frame sp, 0, ra, 0 /* frame size 0: nothing is stored on the stack */ |
| 56 | .prologue 0 |
| 57 | |
| 58 | and a2, 7, t0 /* t0 = pix2 & 7, nonzero when pix2 is not 8-byte aligned */ |
| 59 | clr v0 /* v0 = 0: start of the running sum */ |
| 60 | beq t0, $aligned /* aligned pix2: take the loop that uses plain ldq */ |
| 61 | .align 4 |
| 62 | $unaligned: |
| 63 | /* Registers: |
| 64 | line 0: |
| 65 | t0: left_u -> left lo -> left |
| 66 | t1: mid |
| 67 | t2: right_u -> right hi -> right |
| 68 | t3: ref left |
| 69 | t4: ref right |
| 70 | line 1: |
| 71 | t5: left_u -> left lo -> left |
| 72 | t6: mid |
| 73 | t7: right_u -> right hi -> right |
| 74 | t8: ref left |
| 75 | t9: ref right |
| 76 | temp: |
| 77 | ta: left hi |
| 78 | tb: right lo |
| 79 | tc: error left |
| 80 | td: error right */ |
| 81 | |
| 82 | /* load line 0 */ |
| 83 | ldq_u t0, 0(a2) # left_u: quadword containing pix2[0] |
| 84 | ldq_u t1, 8(a2) # mid: quadword containing pix2[8] |
| 85 | ldq_u t2, 16(a2) # right_u: quadword containing pix2[16] |
| 86 | ldq t3, 0(a1) # ref left: pix1[0..7] |
| 87 | ldq t4, 8(a1) # ref right: pix1[8..15] |
| 88 | addq a1, a3, a1 # pix1 += line_size |
| 89 | addq a2, a3, a2 # pix2 += line_size |
| 90 | /* load line 1 */ |
| 91 | ldq_u t5, 0(a2) # left_u: quadword containing pix2[0] |
| 92 | ldq_u t6, 8(a2) # mid: quadword containing pix2[8] |
| 93 | ldq_u t7, 16(a2) # right_u: quadword containing pix2[16] |
| 94 | ldq t8, 0(a1) # ref left: pix1[0..7] |
| 95 | ldq t9, 8(a1) # ref right: pix1[8..15] |
| 96 | addq a1, a3, a1 # pix1 += line_size |
| 97 | addq a2, a3, a2 # pix2 += line_size |
| 98 | /* calc line 0 */ |
| 99 | extql t0, a2, t0 # left lo (byte shift comes from the low bits of a2, already advanced two rows) |
| 100 | extqh t1, a2, ta # left hi, taken from mid |
| 101 | extql t1, a2, tb # right lo, taken from mid |
| 102 | or t0, ta, t0 # left = left lo | left hi |
| 103 | extqh t2, a2, t2 # right hi |
| 104 | perr t3, t0, tc # error left = perr(ref left, left) |
| 105 | or t2, tb, t2 # right = right hi | right lo |
| 106 | perr t4, t2, td # error right = perr(ref right, right) |
| 107 | addq v0, tc, v0 # add error left |
| 108 | addq v0, td, v0 # add error right |
| 109 | /* calc line 1 */ |
| 110 | extql t5, a2, t5 # left lo |
| 111 | extqh t6, a2, ta # left hi, taken from mid |
| 112 | extql t6, a2, tb # right lo, taken from mid |
| 113 | or t5, ta, t5 # left = left lo | left hi |
| 114 | extqh t7, a2, t7 # right hi |
| 115 | perr t8, t5, tc # error left = perr(ref left, left) |
| 116 | or t7, tb, t7 # right = right hi | right lo |
| 117 | perr t9, t7, td # error right = perr(ref right, right) |
| 118 | addq v0, tc, v0 # add error left |
| 119 | addq v0, td, v0 # add error right |
| 120 | /* loop */ |
| 121 | subq a4, 2, a4 # h -= 2 (two rows were handled above) |
| 122 | bne a4, $unaligned /* repeat until h is exactly 0 */ |
| 123 | ret /* result is in v0 */ |
| 124 | |
| 125 | .align 4 |
| 126 | $aligned: |
| 127 | /* load line 0 */ |
| 128 | ldq t0, 0(a2) # left: pix2[0..7] |
| 129 | ldq t1, 8(a2) # right: pix2[8..15] |
| 130 | addq a2, a3, a2 # pix2 += line_size |
| 131 | ldq t2, 0(a1) # ref left: pix1[0..7] |
| 132 | ldq t3, 8(a1) # ref right: pix1[8..15] |
| 133 | addq a1, a3, a1 # pix1 += line_size |
| 134 | /* load line 1 */ |
| 135 | ldq t4, 0(a2) # left |
| 136 | ldq t5, 8(a2) # right |
| 137 | addq a2, a3, a2 # pix2 += line_size |
| 138 | ldq t6, 0(a1) # ref left |
| 139 | ldq t7, 8(a1) # ref right |
| 140 | addq a1, a3, a1 # pix1 += line_size |
| 141 | /* load line 2 */ |
| 142 | ldq t8, 0(a2) # left |
| 143 | ldq t9, 8(a2) # right |
| 144 | addq a2, a3, a2 # pix2 += line_size |
| 145 | ldq ta, 0(a1) # ref left |
| 146 | ldq tb, 8(a1) # ref right |
| 147 | addq a1, a3, a1 # pix1 += line_size |
| 148 | /* load line 3 */ |
| 149 | ldq tc, 0(a2) # left |
| 150 | ldq td, 8(a2) # right (td is AT) |
| 151 | addq a2, a3, a2 # pix2 += line_size |
| 152 | ldq te, 0(a1) # ref left (te is a5) |
| 153 | ldq a0, 8(a1) # ref right (a0 reused as a temporary; its incoming value is never read) |
| 154 | /* calc line 0 */ |
| 155 | perr t0, t2, t0 # error left |
| 156 | addq a1, a3, a1 # pix1 += line_size (for line 3, placed between the perr instructions) |
| 157 | perr t1, t3, t1 # error right |
| 158 | addq v0, t0, v0 # add error left |
| 159 | /* calc line 1 */ |
| 160 | perr t4, t6, t0 # error left |
| 161 | addq v0, t1, v0 # add error right (line 0) |
| 162 | perr t5, t7, t1 # error right |
| 163 | addq v0, t0, v0 # add error left |
| 164 | /* calc line 2 */ |
| 165 | perr t8, ta, t0 # error left |
| 166 | addq v0, t1, v0 # add error right (line 1) |
| 167 | perr t9, tb, t1 # error right |
| 168 | addq v0, t0, v0 # add error left |
| 169 | /* calc line 3 */ |
| 170 | perr tc, te, t0 # error left |
| 171 | addq v0, t1, v0 # add error right (line 2) |
| 172 | perr td, a0, t1 # error right |
| 173 | addq v0, t0, v0 # add error left |
| 174 | addq v0, t1, v0 # add error right (line 3) |
| 175 | /* loop */ |
| 176 | subq a4, 4, a4 # h -= 4 (four rows were handled above) |
| 177 | bne a4, $aligned /* repeat until h is exactly 0 */ |
| 178 | ret /* result is in v0 */ |
| 179 | .end pix_abs16x16_mvi_asm |