| 1 | ;****************************************************************************** |
;* x86-optimized input routines: shuffle packed
;* YUV formats into individual planes, and convert packed
;* RGB formats into YUV planes.
| 5 | ;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com> |
| 6 | ;* |
| 7 | ;* This file is part of FFmpeg. |
| 8 | ;* |
| 9 | ;* FFmpeg is free software; you can redistribute it and/or |
| 10 | ;* modify it under the terms of the GNU Lesser General Public |
| 11 | ;* License as published by the Free Software Foundation; either |
| 12 | ;* version 2.1 of the License, or (at your option) any later version. |
| 13 | ;* |
| 14 | ;* FFmpeg is distributed in the hope that it will be useful, |
| 15 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 16 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 17 | ;* Lesser General Public License for more details. |
| 18 | ;* |
| 19 | ;* You should have received a copy of the GNU Lesser General Public |
| 20 | ;* License along with FFmpeg; if not, write to the Free Software |
| 21 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| 22 | ;****************************************************************************** |
| 23 | |
| 24 | %include "libavutil/x86/x86util.asm" |
| 25 | |
| 26 | SECTION_RODATA |
| 27 | |
| 28 | %define RY 0x20DE |
| 29 | %define GY 0x4087 |
| 30 | %define BY 0x0C88 |
| 31 | %define RU 0xECFF |
| 32 | %define GU 0xDAC8 |
| 33 | %define BU 0x3838 |
| 34 | %define RV 0x3838 |
| 35 | %define GV 0xD0E3 |
| 36 | %define BV 0xF6E4 |
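; These correspond to the BT.601 limited-range RGB->YUV coefficients scaled
; by 2^15 and stored as 16-bit words, with negative values in two's
; complement; e.g. RY = 0x20DE = 8414 ~= 0.299 * (219/255) * 32768, and
; GU = 0xDAC8 is the signed word -9528.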
| 37 | |
rgb_Yrnd: times 4 dd 0x80100 ; (16 << 15) + (1 << 8)
rgb_UVrnd: times 4 dd 0x400100 ; (128 << 15) + (1 << 8)
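; Each output sample below is a dword sum of Q15 products plus one of these
; rounding constants, shifted right by 9. As a worked sketch of what the
; loops compute (not additional code), the luma case is:
;   Y = (R*RY + G*GY + B*BY + (16 << 15) + (1 << 8)) >> 9
; i.e. a +16 black-level bias plus a half-unit rounding term for the >> 9.
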
| 40 | %define bgr_Ycoeff_12x4 16*4 + 16* 0 + tableq |
| 41 | %define bgr_Ycoeff_3x56 16*4 + 16* 1 + tableq |
| 42 | %define rgb_Ycoeff_12x4 16*4 + 16* 2 + tableq |
| 43 | %define rgb_Ycoeff_3x56 16*4 + 16* 3 + tableq |
| 44 | %define bgr_Ucoeff_12x4 16*4 + 16* 4 + tableq |
| 45 | %define bgr_Ucoeff_3x56 16*4 + 16* 5 + tableq |
| 46 | %define rgb_Ucoeff_12x4 16*4 + 16* 6 + tableq |
| 47 | %define rgb_Ucoeff_3x56 16*4 + 16* 7 + tableq |
| 48 | %define bgr_Vcoeff_12x4 16*4 + 16* 8 + tableq |
| 49 | %define bgr_Vcoeff_3x56 16*4 + 16* 9 + tableq |
| 50 | %define rgb_Vcoeff_12x4 16*4 + 16*10 + tableq |
| 51 | %define rgb_Vcoeff_3x56 16*4 + 16*11 + tableq |
| 52 | |
| 53 | %define rgba_Ycoeff_rb 16*4 + 16*12 + tableq |
| 54 | %define rgba_Ycoeff_br 16*4 + 16*13 + tableq |
| 55 | %define rgba_Ycoeff_ga 16*4 + 16*14 + tableq |
| 56 | %define rgba_Ycoeff_ag 16*4 + 16*15 + tableq |
| 57 | %define rgba_Ucoeff_rb 16*4 + 16*16 + tableq |
| 58 | %define rgba_Ucoeff_br 16*4 + 16*17 + tableq |
| 59 | %define rgba_Ucoeff_ga 16*4 + 16*18 + tableq |
| 60 | %define rgba_Ucoeff_ag 16*4 + 16*19 + tableq |
| 61 | %define rgba_Vcoeff_rb 16*4 + 16*20 + tableq |
| 62 | %define rgba_Vcoeff_br 16*4 + 16*21 + tableq |
| 63 | %define rgba_Vcoeff_ga 16*4 + 16*22 + tableq |
| 64 | %define rgba_Vcoeff_ag 16*4 + 16*23 + tableq |
| 65 | |
| 66 | ; bgr_Ycoeff_12x4: times 2 dw BY, GY, 0, BY |
| 67 | ; bgr_Ycoeff_3x56: times 2 dw RY, 0, GY, RY |
| 68 | ; rgb_Ycoeff_12x4: times 2 dw RY, GY, 0, RY |
| 69 | ; rgb_Ycoeff_3x56: times 2 dw BY, 0, GY, BY |
| 70 | ; bgr_Ucoeff_12x4: times 2 dw BU, GU, 0, BU |
| 71 | ; bgr_Ucoeff_3x56: times 2 dw RU, 0, GU, RU |
| 72 | ; rgb_Ucoeff_12x4: times 2 dw RU, GU, 0, RU |
| 73 | ; rgb_Ucoeff_3x56: times 2 dw BU, 0, GU, BU |
| 74 | ; bgr_Vcoeff_12x4: times 2 dw BV, GV, 0, BV |
| 75 | ; bgr_Vcoeff_3x56: times 2 dw RV, 0, GV, RV |
| 76 | ; rgb_Vcoeff_12x4: times 2 dw RV, GV, 0, RV |
| 77 | ; rgb_Vcoeff_3x56: times 2 dw BV, 0, GV, BV |
| 78 | |
| 79 | ; rgba_Ycoeff_rb: times 4 dw RY, BY |
| 80 | ; rgba_Ycoeff_br: times 4 dw BY, RY |
| 81 | ; rgba_Ycoeff_ga: times 4 dw GY, 0 |
| 82 | ; rgba_Ycoeff_ag: times 4 dw 0, GY |
| 83 | ; rgba_Ucoeff_rb: times 4 dw RU, BU |
| 84 | ; rgba_Ucoeff_br: times 4 dw BU, RU |
| 85 | ; rgba_Ucoeff_ga: times 4 dw GU, 0 |
| 86 | ; rgba_Ucoeff_ag: times 4 dw 0, GU |
| 87 | ; rgba_Vcoeff_rb: times 4 dw RV, BV |
| 88 | ; rgba_Vcoeff_br: times 4 dw BV, RV |
| 89 | ; rgba_Vcoeff_ga: times 4 dw GV, 0 |
| 90 | ; rgba_Vcoeff_ag: times 4 dw 0, GV |
| 91 | |
| 92 | shuf_rgb_12x4: db 0, 0x80, 1, 0x80, 2, 0x80, 3, 0x80, \ |
| 93 | 6, 0x80, 7, 0x80, 8, 0x80, 9, 0x80 |
| 94 | shuf_rgb_3x56: db 2, 0x80, 3, 0x80, 4, 0x80, 5, 0x80, \ |
| 95 | 8, 0x80, 9, 0x80, 10, 0x80, 11, 0x80 |
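
; The pshufb masks above drive the SSSE3 path below: byte indices with bit 7
; set (0x80) zero the destination byte, so each mask expands packed 24-bit
; pixels straight into words, e.g. shuf_rgb_12x4 turns the source bytes
; { B0, G0, R0, B1, G1, R1, B2, ... } into (word) { B0, G0, R0, B1, B2, G2, R2, B3 }.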
| 96 | |
| 97 | SECTION .text |
| 98 | |
| 99 | ;----------------------------------------------------------------------------- |
| 100 | ; RGB to Y/UV. |
| 101 | ; |
| 102 | ; void <fmt>ToY_<opt>(uint8_t *dst, const uint8_t *src, int w); |
| 103 | ; and |
; void <fmt>ToUV_<opt>(uint8_t *dstU, uint8_t *dstV, const uint8_t *src,
| 105 | ; const uint8_t *unused, int w); |
| 106 | ;----------------------------------------------------------------------------- |
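
; As a rough scalar reference (illustrative C only, not part of this file;
; the C versions live in libswscale/input.c), the bgr24 luma function
; computes the following, using the signed values of the RY/GY/BY constants
; above; dst receives 16-bit intermediate samples, which is why the code
; below doubles wq:
;
;   int16_t *dst16 = (int16_t *) dst;
;   for (int i = 0; i < w; i++) {
;       int b = src[3 * i + 0], g = src[3 * i + 1], r = src[3 * i + 2];
;       dst16[i] = (b * BY + g * GY + r * RY + 0x80100) >> 9;
;   }
;
; The ToUV functions do the same with the U and V coefficient sets and the
; 0x400100 rounding constant, writing to dstU and dstV.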
| 107 | |
| 108 | ; %1 = nr. of XMM registers |
| 109 | ; %2 = rgb or bgr |
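; %3 = if specified, the variant whose .body is reused via a tail jump
;      (only done when the coefficients live in registers, so the shared
;      body does not pick up the wrong memory operands)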
| 110 | %macro RGB24_TO_Y_FN 2-3 |
| 111 | cglobal %2 %+ 24ToY, 6, 6, %1, dst, src, u1, u2, w, table |
| 112 | %if mmsize == 8 |
| 113 | mova m5, [%2_Ycoeff_12x4] |
| 114 | mova m6, [%2_Ycoeff_3x56] |
| 115 | %define coeff1 m5 |
| 116 | %define coeff2 m6 |
| 117 | %elif ARCH_X86_64 |
| 118 | mova m8, [%2_Ycoeff_12x4] |
| 119 | mova m9, [%2_Ycoeff_3x56] |
| 120 | %define coeff1 m8 |
| 121 | %define coeff2 m9 |
| 122 | %else ; x86-32 && mmsize == 16 |
| 123 | %define coeff1 [%2_Ycoeff_12x4] |
| 124 | %define coeff2 [%2_Ycoeff_3x56] |
| 125 | %endif ; x86-32/64 && mmsize == 8/16 |
| 126 | %if (ARCH_X86_64 || mmsize == 8) && %0 == 3 |
| 127 | jmp mangle(private_prefix %+ _ %+ %3 %+ 24ToY %+ SUFFIX).body |
%else ; (ARCH_X86_64 || mmsize == 8) && %0 == 3
| 129 | .body: |
| 130 | %if cpuflag(ssse3) |
| 131 | mova m7, [shuf_rgb_12x4] |
| 132 | %define shuf_rgb1 m7 |
| 133 | %if ARCH_X86_64 |
| 134 | mova m10, [shuf_rgb_3x56] |
| 135 | %define shuf_rgb2 m10 |
| 136 | %else ; x86-32 |
| 137 | %define shuf_rgb2 [shuf_rgb_3x56] |
| 138 | %endif ; x86-32/64 |
| 139 | %endif ; cpuflag(ssse3) |
| 140 | %if ARCH_X86_64 |
| 141 | movsxd wq, wd |
| 142 | %endif |
| 143 | add wq, wq |
| 144 | add dstq, wq |
| 145 | neg wq |
| 146 | %if notcpuflag(ssse3) |
| 147 | pxor m7, m7 |
| 148 | %endif ; !cpuflag(ssse3) |
| 149 | mova m4, [rgb_Yrnd] |
| 150 | .loop: |
| 151 | %if cpuflag(ssse3) |
| 152 | movu m0, [srcq+0] ; (byte) { Bx, Gx, Rx }[0-3] |
| 153 | movu m2, [srcq+12] ; (byte) { Bx, Gx, Rx }[4-7] |
| 154 | pshufb m1, m0, shuf_rgb2 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 } |
| 155 | pshufb m0, shuf_rgb1 ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 } |
| 156 | pshufb m3, m2, shuf_rgb2 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 } |
| 157 | pshufb m2, shuf_rgb1 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 } |
| 158 | %else ; !cpuflag(ssse3) |
| 159 | movd m0, [srcq+0] ; (byte) { B0, G0, R0, B1 } |
| 160 | movd m1, [srcq+2] ; (byte) { R0, B1, G1, R1 } |
| 161 | movd m2, [srcq+6] ; (byte) { B2, G2, R2, B3 } |
| 162 | movd m3, [srcq+8] ; (byte) { R2, B3, G3, R3 } |
| 163 | %if mmsize == 16 ; i.e. sse2 |
| 164 | punpckldq m0, m2 ; (byte) { B0, G0, R0, B1, B2, G2, R2, B3 } |
| 165 | punpckldq m1, m3 ; (byte) { R0, B1, G1, R1, R2, B3, G3, R3 } |
| 166 | movd m2, [srcq+12] ; (byte) { B4, G4, R4, B5 } |
| 167 | movd m3, [srcq+14] ; (byte) { R4, B5, G5, R5 } |
| 168 | movd m5, [srcq+18] ; (byte) { B6, G6, R6, B7 } |
| 169 | movd m6, [srcq+20] ; (byte) { R6, B7, G7, R7 } |
| 170 | punpckldq m2, m5 ; (byte) { B4, G4, R4, B5, B6, G6, R6, B7 } |
| 171 | punpckldq m3, m6 ; (byte) { R4, B5, G5, R5, R6, B7, G7, R7 } |
| 172 | %endif ; mmsize == 16 |
| 173 | punpcklbw m0, m7 ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 } |
| 174 | punpcklbw m1, m7 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 } |
| 175 | punpcklbw m2, m7 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 } |
| 176 | punpcklbw m3, m7 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 } |
| 177 | %endif ; cpuflag(ssse3) |
| 178 | add srcq, 3 * mmsize / 2 |
| 179 | pmaddwd m0, coeff1 ; (dword) { B0*BY + G0*GY, B1*BY, B2*BY + G2*GY, B3*BY } |
pmaddwd m1, coeff2 ; (dword) { R0*RY, G1*GY + R1*RY, R2*RY, G3*GY + R3*RY }
| 181 | pmaddwd m2, coeff1 ; (dword) { B4*BY + G4*GY, B5*BY, B6*BY + G6*GY, B7*BY } |
pmaddwd m3, coeff2 ; (dword) { R4*RY, G5*GY + R5*RY, R6*RY, G7*GY + R7*RY }
| 183 | paddd m0, m1 ; (dword) { Bx*BY + Gx*GY + Rx*RY }[0-3] |
| 184 | paddd m2, m3 ; (dword) { Bx*BY + Gx*GY + Rx*RY }[4-7] |
| 185 | paddd m0, m4 ; += rgb_Yrnd, i.e. (dword) { Y[0-3] } |
| 186 | paddd m2, m4 ; += rgb_Yrnd, i.e. (dword) { Y[4-7] } |
| 187 | psrad m0, 9 |
| 188 | psrad m2, 9 |
| 189 | packssdw m0, m2 ; (word) { Y[0-7] } |
| 190 | mova [dstq+wq], m0 |
| 191 | add wq, mmsize |
| 192 | jl .loop |
| 193 | REP_RET |
%endif ; (ARCH_X86_64 || mmsize == 8) && %0 == 3
| 195 | %endmacro |
| 196 | |
| 197 | ; %1 = nr. of XMM registers |
| 198 | ; %2 = rgb or bgr |
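; %3 = if specified, the variant whose .body is reused via a tail jump
;      (x86-64 only, where all four coefficient sets are held in registers)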
| 199 | %macro RGB24_TO_UV_FN 2-3 |
| 200 | cglobal %2 %+ 24ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, table |
| 201 | %if ARCH_X86_64 |
| 202 | mova m8, [%2_Ucoeff_12x4] |
| 203 | mova m9, [%2_Ucoeff_3x56] |
| 204 | mova m10, [%2_Vcoeff_12x4] |
| 205 | mova m11, [%2_Vcoeff_3x56] |
| 206 | %define coeffU1 m8 |
| 207 | %define coeffU2 m9 |
| 208 | %define coeffV1 m10 |
| 209 | %define coeffV2 m11 |
| 210 | %else ; x86-32 |
| 211 | %define coeffU1 [%2_Ucoeff_12x4] |
| 212 | %define coeffU2 [%2_Ucoeff_3x56] |
| 213 | %define coeffV1 [%2_Vcoeff_12x4] |
| 214 | %define coeffV2 [%2_Vcoeff_3x56] |
| 215 | %endif ; x86-32/64 |
| 216 | %if ARCH_X86_64 && %0 == 3 |
| 217 | jmp mangle(private_prefix %+ _ %+ %3 %+ 24ToUV %+ SUFFIX).body |
| 218 | %else ; ARCH_X86_64 && %0 == 3 |
| 219 | .body: |
| 220 | %if cpuflag(ssse3) |
| 221 | mova m7, [shuf_rgb_12x4] |
| 222 | %define shuf_rgb1 m7 |
| 223 | %if ARCH_X86_64 |
| 224 | mova m12, [shuf_rgb_3x56] |
| 225 | %define shuf_rgb2 m12 |
| 226 | %else ; x86-32 |
| 227 | %define shuf_rgb2 [shuf_rgb_3x56] |
| 228 | %endif ; x86-32/64 |
| 229 | %endif ; cpuflag(ssse3) |
| 230 | %if ARCH_X86_64 |
| 231 | movsxd wq, dword r5m |
| 232 | %else ; x86-32 |
| 233 | mov wq, r5m |
| 234 | %endif |
| 235 | add wq, wq |
| 236 | add dstUq, wq |
| 237 | add dstVq, wq |
| 238 | neg wq |
| 239 | mova m6, [rgb_UVrnd] |
| 240 | %if notcpuflag(ssse3) |
| 241 | pxor m7, m7 |
| 242 | %endif |
| 243 | .loop: |
| 244 | %if cpuflag(ssse3) |
| 245 | movu m0, [srcq+0] ; (byte) { Bx, Gx, Rx }[0-3] |
| 246 | movu m4, [srcq+12] ; (byte) { Bx, Gx, Rx }[4-7] |
| 247 | pshufb m1, m0, shuf_rgb2 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 } |
| 248 | pshufb m0, shuf_rgb1 ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 } |
| 249 | %else ; !cpuflag(ssse3) |
| 250 | movd m0, [srcq+0] ; (byte) { B0, G0, R0, B1 } |
| 251 | movd m1, [srcq+2] ; (byte) { R0, B1, G1, R1 } |
| 252 | movd m4, [srcq+6] ; (byte) { B2, G2, R2, B3 } |
| 253 | movd m5, [srcq+8] ; (byte) { R2, B3, G3, R3 } |
| 254 | %if mmsize == 16 |
| 255 | punpckldq m0, m4 ; (byte) { B0, G0, R0, B1, B2, G2, R2, B3 } |
| 256 | punpckldq m1, m5 ; (byte) { R0, B1, G1, R1, R2, B3, G3, R3 } |
| 257 | movd m4, [srcq+12] ; (byte) { B4, G4, R4, B5 } |
| 258 | movd m5, [srcq+14] ; (byte) { R4, B5, G5, R5 } |
| 259 | %endif ; mmsize == 16 |
| 260 | punpcklbw m0, m7 ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 } |
| 261 | punpcklbw m1, m7 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 } |
| 262 | %endif ; cpuflag(ssse3) |
| 263 | pmaddwd m2, m0, coeffV1 ; (dword) { B0*BV + G0*GV, B1*BV, B2*BV + G2*GV, B3*BV } |
pmaddwd m3, m1, coeffV2 ; (dword) { R0*RV, G1*GV + R1*RV, R2*RV, G3*GV + R3*RV }
| 265 | pmaddwd m0, coeffU1 ; (dword) { B0*BU + G0*GU, B1*BU, B2*BU + G2*GU, B3*BU } |
pmaddwd m1, coeffU2 ; (dword) { R0*RU, G1*GU + R1*RU, R2*RU, G3*GU + R3*RU }
| 267 | paddd m0, m1 ; (dword) { Bx*BU + Gx*GU + Rx*RU }[0-3] |
| 268 | paddd m2, m3 ; (dword) { Bx*BV + Gx*GV + Rx*RV }[0-3] |
| 269 | %if cpuflag(ssse3) |
| 270 | pshufb m5, m4, shuf_rgb2 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 } |
| 271 | pshufb m4, shuf_rgb1 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 } |
| 272 | %else ; !cpuflag(ssse3) |
| 273 | %if mmsize == 16 |
| 274 | movd m1, [srcq+18] ; (byte) { B6, G6, R6, B7 } |
| 275 | movd m3, [srcq+20] ; (byte) { R6, B7, G7, R7 } |
| 276 | punpckldq m4, m1 ; (byte) { B4, G4, R4, B5, B6, G6, R6, B7 } |
| 277 | punpckldq m5, m3 ; (byte) { R4, B5, G5, R5, R6, B7, G7, R7 } |
| 278 | %endif ; mmsize == 16 && !cpuflag(ssse3) |
| 279 | punpcklbw m4, m7 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 } |
| 280 | punpcklbw m5, m7 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 } |
| 281 | %endif ; cpuflag(ssse3) |
| 282 | add srcq, 3 * mmsize / 2 |
| 283 | pmaddwd m1, m4, coeffU1 ; (dword) { B4*BU + G4*GU, B5*BU, B6*BU + G6*GU, B7*BU } |
pmaddwd m3, m5, coeffU2 ; (dword) { R4*RU, G5*GU + R5*RU, R6*RU, G7*GU + R7*RU }
| 285 | pmaddwd m4, coeffV1 ; (dword) { B4*BV + G4*GV, B5*BV, B6*BV + G6*GV, B7*BV } |
pmaddwd m5, coeffV2 ; (dword) { R4*RV, G5*GV + R5*RV, R6*RV, G7*GV + R7*RV }
| 287 | paddd m1, m3 ; (dword) { Bx*BU + Gx*GU + Rx*RU }[4-7] |
| 288 | paddd m4, m5 ; (dword) { Bx*BV + Gx*GV + Rx*RV }[4-7] |
| 289 | paddd m0, m6 ; += rgb_UVrnd, i.e. (dword) { U[0-3] } |
| 290 | paddd m2, m6 ; += rgb_UVrnd, i.e. (dword) { V[0-3] } |
| 291 | paddd m1, m6 ; += rgb_UVrnd, i.e. (dword) { U[4-7] } |
| 292 | paddd m4, m6 ; += rgb_UVrnd, i.e. (dword) { V[4-7] } |
| 293 | psrad m0, 9 |
| 294 | psrad m2, 9 |
| 295 | psrad m1, 9 |
| 296 | psrad m4, 9 |
| 297 | packssdw m0, m1 ; (word) { U[0-7] } |
| 298 | packssdw m2, m4 ; (word) { V[0-7] } |
mova [dstUq+wq], m0
mova [dstVq+wq], m2
| 306 | add wq, mmsize |
| 307 | jl .loop |
| 308 | REP_RET |
| 309 | %endif ; ARCH_X86_64 && %0 == 3 |
| 310 | %endmacro |
| 311 | |
| 312 | ; %1 = nr. of XMM registers for rgb-to-Y func |
| 313 | ; %2 = nr. of XMM registers for rgb-to-UV func |
| 314 | %macro RGB24_FUNCS 2 |
| 315 | RGB24_TO_Y_FN %1, rgb |
| 316 | RGB24_TO_Y_FN %1, bgr, rgb |
| 317 | RGB24_TO_UV_FN %2, rgb |
| 318 | RGB24_TO_UV_FN %2, bgr, rgb |
| 319 | %endmacro |
| 320 | |
| 321 | %if ARCH_X86_32 |
| 322 | INIT_MMX mmx |
| 323 | RGB24_FUNCS 0, 0 |
| 324 | %endif |
| 325 | |
| 326 | INIT_XMM sse2 |
| 327 | RGB24_FUNCS 10, 12 |
| 328 | |
| 329 | INIT_XMM ssse3 |
| 330 | RGB24_FUNCS 11, 13 |
| 331 | |
| 332 | %if HAVE_AVX_EXTERNAL |
| 333 | INIT_XMM avx |
| 334 | RGB24_FUNCS 11, 13 |
| 335 | %endif |
| 336 | |
| 337 | ; %1 = nr. of XMM registers |
| 338 | ; %2-5 = rgba, bgra, argb or abgr (in individual characters) |
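; %6 = if specified, the variant whose .body is reused via a tail jump
;      (the per-layout coefficients are loaded into m5/m6 first)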
| 339 | %macro RGB32_TO_Y_FN 5-6 |
| 340 | cglobal %2%3%4%5 %+ ToY, 6, 6, %1, dst, src, u1, u2, w, table |
| 341 | mova m5, [rgba_Ycoeff_%2%4] |
| 342 | mova m6, [rgba_Ycoeff_%3%5] |
| 343 | %if %0 == 6 |
| 344 | jmp mangle(private_prefix %+ _ %+ %6 %+ ToY %+ SUFFIX).body |
| 345 | %else ; %0 == 6 |
| 346 | .body: |
| 347 | %if ARCH_X86_64 |
| 348 | movsxd wq, wd |
| 349 | %endif |
| 350 | add wq, wq |
| 351 | sub wq, mmsize - 1 |
| 352 | lea srcq, [srcq+wq*2] |
| 353 | add dstq, wq |
| 354 | neg wq |
| 355 | mova m4, [rgb_Yrnd] |
| 356 | pcmpeqb m7, m7 |
| 357 | psrlw m7, 8 ; (word) { 0x00ff } x4 |
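; DEINTB (from x86util.asm) splits each 32-bit pixel into two word vectors:
; bytes 0/2 of every pixel (the B/R pair in the comments below) end up in
; m1/m3, and bytes 1/3 (G plus the ignored alpha byte) remain in m0/m2, so
; one pmaddwd against the two-colour coefficient pairs (m5) and one against
; the { G, 0 } pairs (m6) yields the complete Y dot product per pixel.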
| 358 | .loop: |
| 359 | ; FIXME check alignment and use mova |
| 360 | movu m0, [srcq+wq*2+0] ; (byte) { Bx, Gx, Rx, xx }[0-3] |
| 361 | movu m2, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7] |
| 362 | DEINTB 1, 0, 3, 2, 7 ; (word) { Gx, xx (m0/m2) or Bx, Rx (m1/m3) }[0-3]/[4-7] |
| 363 | pmaddwd m1, m5 ; (dword) { Bx*BY + Rx*RY }[0-3] |
| 364 | pmaddwd m0, m6 ; (dword) { Gx*GY }[0-3] |
| 365 | pmaddwd m3, m5 ; (dword) { Bx*BY + Rx*RY }[4-7] |
| 366 | pmaddwd m2, m6 ; (dword) { Gx*GY }[4-7] |
| 367 | paddd m0, m4 ; += rgb_Yrnd |
| 368 | paddd m2, m4 ; += rgb_Yrnd |
| 369 | paddd m0, m1 ; (dword) { Y[0-3] } |
| 370 | paddd m2, m3 ; (dword) { Y[4-7] } |
| 371 | psrad m0, 9 |
| 372 | psrad m2, 9 |
| 373 | packssdw m0, m2 ; (word) { Y[0-7] } |
| 374 | mova [dstq+wq], m0 |
| 375 | add wq, mmsize |
| 376 | jl .loop |
| 377 | sub wq, mmsize - 1 |
| 378 | jz .end |
| 379 | add srcq, 2*mmsize - 2 |
| 380 | add dstq, mmsize - 1 |
| 381 | .loop2: |
| 382 | movd m0, [srcq+wq*2+0] ; (byte) { Bx, Gx, Rx, xx }[0-3] |
| 383 | DEINTB 1, 0, 3, 2, 7 ; (word) { Gx, xx (m0/m2) or Bx, Rx (m1/m3) }[0-3]/[4-7] |
| 384 | pmaddwd m1, m5 ; (dword) { Bx*BY + Rx*RY }[0-3] |
| 385 | pmaddwd m0, m6 ; (dword) { Gx*GY }[0-3] |
| 386 | paddd m0, m4 ; += rgb_Yrnd |
| 387 | paddd m0, m1 ; (dword) { Y[0-3] } |
| 388 | psrad m0, 9 |
| 389 | packssdw m0, m0 ; (word) { Y[0-7] } |
| 390 | movd [dstq+wq], m0 |
| 391 | add wq, 2 |
| 392 | jl .loop2 |
| 393 | .end: |
| 394 | REP_RET |
%endif ; %0 == 6
| 396 | %endmacro |
| 397 | |
| 398 | ; %1 = nr. of XMM registers |
| 399 | ; %2-5 = rgba, bgra, argb or abgr (in individual characters) |
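; %6 = if specified, the variant whose .body is reused via a tail jump
;      (x86-64 only, where the coefficient sets are held in registers)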
| 400 | %macro RGB32_TO_UV_FN 5-6 |
| 401 | cglobal %2%3%4%5 %+ ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, table |
| 402 | %if ARCH_X86_64 |
| 403 | mova m8, [rgba_Ucoeff_%2%4] |
| 404 | mova m9, [rgba_Ucoeff_%3%5] |
| 405 | mova m10, [rgba_Vcoeff_%2%4] |
| 406 | mova m11, [rgba_Vcoeff_%3%5] |
| 407 | %define coeffU1 m8 |
| 408 | %define coeffU2 m9 |
| 409 | %define coeffV1 m10 |
| 410 | %define coeffV2 m11 |
| 411 | %else ; x86-32 |
| 412 | %define coeffU1 [rgba_Ucoeff_%2%4] |
| 413 | %define coeffU2 [rgba_Ucoeff_%3%5] |
| 414 | %define coeffV1 [rgba_Vcoeff_%2%4] |
| 415 | %define coeffV2 [rgba_Vcoeff_%3%5] |
| 416 | %endif ; x86-64/32 |
| 417 | %if ARCH_X86_64 && %0 == 6 |
| 418 | jmp mangle(private_prefix %+ _ %+ %6 %+ ToUV %+ SUFFIX).body |
| 419 | %else ; ARCH_X86_64 && %0 == 6 |
| 420 | .body: |
| 421 | %if ARCH_X86_64 |
| 422 | movsxd wq, dword r5m |
| 423 | %else ; x86-32 |
| 424 | mov wq, r5m |
| 425 | %endif |
| 426 | add wq, wq |
| 427 | sub wq, mmsize - 1 |
| 428 | add dstUq, wq |
| 429 | add dstVq, wq |
| 430 | lea srcq, [srcq+wq*2] |
| 431 | neg wq |
| 432 | pcmpeqb m7, m7 |
| 433 | psrlw m7, 8 ; (word) { 0x00ff } x4 |
| 434 | mova m6, [rgb_UVrnd] |
| 435 | .loop: |
| 436 | ; FIXME check alignment and use mova |
| 437 | movu m0, [srcq+wq*2+0] ; (byte) { Bx, Gx, Rx, xx }[0-3] |
| 438 | movu m4, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7] |
| 439 | DEINTB 1, 0, 5, 4, 7 ; (word) { Gx, xx (m0/m4) or Bx, Rx (m1/m5) }[0-3]/[4-7] |
| 440 | pmaddwd m3, m1, coeffV1 ; (dword) { Bx*BV + Rx*RV }[0-3] |
| 441 | pmaddwd m2, m0, coeffV2 ; (dword) { Gx*GV }[0-3] |
| 442 | pmaddwd m1, coeffU1 ; (dword) { Bx*BU + Rx*RU }[0-3] |
| 443 | pmaddwd m0, coeffU2 ; (dword) { Gx*GU }[0-3] |
| 444 | paddd m3, m6 ; += rgb_UVrnd |
| 445 | paddd m1, m6 ; += rgb_UVrnd |
| 446 | paddd m2, m3 ; (dword) { V[0-3] } |
| 447 | paddd m0, m1 ; (dword) { U[0-3] } |
| 448 | pmaddwd m3, m5, coeffV1 ; (dword) { Bx*BV + Rx*RV }[4-7] |
| 449 | pmaddwd m1, m4, coeffV2 ; (dword) { Gx*GV }[4-7] |
| 450 | pmaddwd m5, coeffU1 ; (dword) { Bx*BU + Rx*RU }[4-7] |
| 451 | pmaddwd m4, coeffU2 ; (dword) { Gx*GU }[4-7] |
| 452 | paddd m3, m6 ; += rgb_UVrnd |
| 453 | paddd m5, m6 ; += rgb_UVrnd |
| 454 | psrad m0, 9 |
| 455 | paddd m1, m3 ; (dword) { V[4-7] } |
| 456 | paddd m4, m5 ; (dword) { U[4-7] } |
| 457 | psrad m2, 9 |
| 458 | psrad m4, 9 |
| 459 | psrad m1, 9 |
| 460 | packssdw m0, m4 ; (word) { U[0-7] } |
| 461 | packssdw m2, m1 ; (word) { V[0-7] } |
mova [dstUq+wq], m0
mova [dstVq+wq], m2
| 469 | add wq, mmsize |
| 470 | jl .loop |
| 471 | sub wq, mmsize - 1 |
| 472 | jz .end |
add srcq, 2*mmsize - 2
| 474 | add dstUq, mmsize - 1 |
| 475 | add dstVq, mmsize - 1 |
| 476 | .loop2: |
| 477 | movd m0, [srcq+wq*2] ; (byte) { Bx, Gx, Rx, xx }[0-3] |
| 478 | DEINTB 1, 0, 5, 4, 7 ; (word) { Gx, xx (m0/m4) or Bx, Rx (m1/m5) }[0-3]/[4-7] |
| 479 | pmaddwd m3, m1, coeffV1 ; (dword) { Bx*BV + Rx*RV }[0-3] |
| 480 | pmaddwd m2, m0, coeffV2 ; (dword) { Gx*GV }[0-3] |
| 481 | pmaddwd m1, coeffU1 ; (dword) { Bx*BU + Rx*RU }[0-3] |
| 482 | pmaddwd m0, coeffU2 ; (dword) { Gx*GU }[0-3] |
| 483 | paddd m3, m6 ; += rgb_UVrnd |
| 484 | paddd m1, m6 ; += rgb_UVrnd |
| 485 | paddd m2, m3 ; (dword) { V[0-3] } |
| 486 | paddd m0, m1 ; (dword) { U[0-3] } |
| 487 | psrad m0, 9 |
| 488 | psrad m2, 9 |
| 489 | packssdw m0, m0 ; (word) { U[0-7] } |
| 490 | packssdw m2, m2 ; (word) { V[0-7] } |
| 491 | movd [dstUq+wq], m0 |
| 492 | movd [dstVq+wq], m2 |
| 493 | add wq, 2 |
| 494 | jl .loop2 |
| 495 | .end: |
| 496 | REP_RET |
%endif ; ARCH_X86_64 && %0 == 6
| 498 | %endmacro |
| 499 | |
| 500 | ; %1 = nr. of XMM registers for rgb-to-Y func |
| 501 | ; %2 = nr. of XMM registers for rgb-to-UV func |
| 502 | %macro RGB32_FUNCS 2 |
| 503 | RGB32_TO_Y_FN %1, r, g, b, a |
| 504 | RGB32_TO_Y_FN %1, b, g, r, a, rgba |
| 505 | RGB32_TO_Y_FN %1, a, r, g, b, rgba |
| 506 | RGB32_TO_Y_FN %1, a, b, g, r, rgba |
| 507 | |
| 508 | RGB32_TO_UV_FN %2, r, g, b, a |
| 509 | RGB32_TO_UV_FN %2, b, g, r, a, rgba |
| 510 | RGB32_TO_UV_FN %2, a, r, g, b, rgba |
| 511 | RGB32_TO_UV_FN %2, a, b, g, r, rgba |
| 512 | %endmacro |
| 513 | |
| 514 | %if ARCH_X86_32 |
| 515 | INIT_MMX mmx |
| 516 | RGB32_FUNCS 0, 0 |
| 517 | %endif |
| 518 | |
| 519 | INIT_XMM sse2 |
| 520 | RGB32_FUNCS 8, 12 |
| 521 | |
| 522 | %if HAVE_AVX_EXTERNAL |
| 523 | INIT_XMM avx |
| 524 | RGB32_FUNCS 8, 12 |
| 525 | %endif |
| 526 | |
| 527 | ;----------------------------------------------------------------------------- |
| 528 | ; YUYV/UYVY/NV12/NV21 packed pixel shuffling. |
| 529 | ; |
| 530 | ; void <fmt>ToY_<opt>(uint8_t *dst, const uint8_t *src, int w); |
| 531 | ; and |
; void <fmt>ToUV_<opt>(uint8_t *dstU, uint8_t *dstV, const uint8_t *src,
| 533 | ; const uint8_t *unused, int w); |
| 534 | ;----------------------------------------------------------------------------- |
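
; As a rough scalar reference (illustrative C only), the yuyv case boils
; down to plain byte de-interleaving:
;   for (int i = 0; i < w; i++) dst[i] = src[2 * i];            /* ToY  */
;   for (int i = 0; i < w; i++) { dstU[i] = src[4 * i + 1];     /* ToUV */
;                                 dstV[i] = src[4 * i + 3]; }
; uyvy has the luma and chroma byte positions swapped, and nv12/nv21 split
; an interleaved chroma plane into U and V (nv21 with the two swapped).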
| 535 | |
| 536 | ; %1 = a (aligned) or u (unaligned) |
| 537 | ; %2 = yuyv or uyvy |
| 538 | %macro LOOP_YUYV_TO_Y 2 |
| 539 | .loop_%1: |
| 540 | mov%1 m0, [srcq+wq*2] ; (byte) { Y0, U0, Y1, V0, ... } |
| 541 | mov%1 m1, [srcq+wq*2+mmsize] ; (byte) { Y8, U4, Y9, V4, ... } |
| 542 | %ifidn %2, yuyv |
| 543 | pand m0, m2 ; (word) { Y0, Y1, ..., Y7 } |
| 544 | pand m1, m2 ; (word) { Y8, Y9, ..., Y15 } |
| 545 | %else ; uyvy |
| 546 | psrlw m0, 8 ; (word) { Y0, Y1, ..., Y7 } |
| 547 | psrlw m1, 8 ; (word) { Y8, Y9, ..., Y15 } |
| 548 | %endif ; yuyv/uyvy |
| 549 | packuswb m0, m1 ; (byte) { Y0, ..., Y15 } |
| 550 | mova [dstq+wq], m0 |
| 551 | add wq, mmsize |
| 552 | jl .loop_%1 |
| 553 | REP_RET |
| 554 | %endmacro |
| 555 | |
| 556 | ; %1 = nr. of XMM registers |
| 557 | ; %2 = yuyv or uyvy |
| 558 | ; %3 = if specified, it means that unaligned and aligned code in loop |
| 559 | ; will be the same (i.e. YUYV+AVX), and thus we don't need to |
| 560 | ; split the loop in an aligned and unaligned case |
| 561 | %macro YUYV_TO_Y_FN 2-3 |
| 562 | cglobal %2ToY, 5, 5, %1, dst, unused0, unused1, src, w |
| 563 | %if ARCH_X86_64 |
| 564 | movsxd wq, wd |
| 565 | %endif |
| 566 | add dstq, wq |
| 567 | %if mmsize == 16 |
| 568 | test srcq, 15 |
| 569 | %endif |
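; for the mmsize == 16 case, the flags from the test above survive the
; lea/pcmpeqb/psrlw below (none of them touch EFLAGS), so the jnz further
; down still selects the unaligned loop when srcq is not 16-byte aligned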
| 570 | lea srcq, [srcq+wq*2] |
| 571 | %ifidn %2, yuyv |
| 572 | pcmpeqb m2, m2 ; (byte) { 0xff } x 16 |
| 573 | psrlw m2, 8 ; (word) { 0x00ff } x 8 |
| 574 | %endif ; yuyv |
| 575 | %if mmsize == 16 |
| 576 | jnz .loop_u_start |
| 577 | neg wq |
| 578 | LOOP_YUYV_TO_Y a, %2 |
| 579 | .loop_u_start: |
| 580 | neg wq |
| 581 | LOOP_YUYV_TO_Y u, %2 |
| 582 | %else ; mmsize == 8 |
| 583 | neg wq |
| 584 | LOOP_YUYV_TO_Y a, %2 |
| 585 | %endif ; mmsize == 8/16 |
| 586 | %endmacro |
| 587 | |
| 588 | ; %1 = a (aligned) or u (unaligned) |
| 589 | ; %2 = yuyv or uyvy |
| 590 | %macro LOOP_YUYV_TO_UV 2 |
| 591 | .loop_%1: |
| 592 | %ifidn %2, yuyv |
| 593 | mov%1 m0, [srcq+wq*4] ; (byte) { Y0, U0, Y1, V0, ... } |
| 594 | mov%1 m1, [srcq+wq*4+mmsize] ; (byte) { Y8, U4, Y9, V4, ... } |
| 595 | psrlw m0, 8 ; (word) { U0, V0, ..., U3, V3 } |
| 596 | psrlw m1, 8 ; (word) { U4, V4, ..., U7, V7 } |
| 597 | %else ; uyvy |
| 598 | %if cpuflag(avx) |
| 599 | vpand m0, m2, [srcq+wq*4] ; (word) { U0, V0, ..., U3, V3 } |
| 600 | vpand m1, m2, [srcq+wq*4+mmsize] ; (word) { U4, V4, ..., U7, V7 } |
| 601 | %else |
| 602 | mov%1 m0, [srcq+wq*4] ; (byte) { Y0, U0, Y1, V0, ... } |
| 603 | mov%1 m1, [srcq+wq*4+mmsize] ; (byte) { Y8, U4, Y9, V4, ... } |
| 604 | pand m0, m2 ; (word) { U0, V0, ..., U3, V3 } |
| 605 | pand m1, m2 ; (word) { U4, V4, ..., U7, V7 } |
| 606 | %endif |
| 607 | %endif ; yuyv/uyvy |
| 608 | packuswb m0, m1 ; (byte) { U0, V0, ..., U7, V7 } |
| 609 | pand m1, m0, m2 ; (word) { U0, U1, ..., U7 } |
| 610 | psrlw m0, 8 ; (word) { V0, V1, ..., V7 } |
| 611 | %if mmsize == 16 |
packuswb m1, m0 ; (byte) { U0, ... U7, V0, ... V7 }
| 613 | movh [dstUq+wq], m1 |
| 614 | movhps [dstVq+wq], m1 |
| 615 | %else ; mmsize == 8 |
| 616 | packuswb m1, m1 ; (byte) { U0, ... U3 } |
| 617 | packuswb m0, m0 ; (byte) { V0, ... V3 } |
| 618 | movh [dstUq+wq], m1 |
| 619 | movh [dstVq+wq], m0 |
| 620 | %endif ; mmsize == 8/16 |
| 621 | add wq, mmsize / 2 |
| 622 | jl .loop_%1 |
| 623 | REP_RET |
| 624 | %endmacro |
| 625 | |
| 626 | ; %1 = nr. of XMM registers |
| 627 | ; %2 = yuyv or uyvy |
| 628 | ; %3 = if specified, it means that unaligned and aligned code in loop |
| 629 | ; will be the same (i.e. UYVY+AVX), and thus we don't need to |
| 630 | ; split the loop in an aligned and unaligned case |
| 631 | %macro YUYV_TO_UV_FN 2-3 |
| 632 | cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w |
| 633 | %if ARCH_X86_64 |
| 634 | movsxd wq, dword r5m |
| 635 | %else ; x86-32 |
| 636 | mov wq, r5m |
| 637 | %endif |
| 638 | add dstUq, wq |
| 639 | add dstVq, wq |
| 640 | %if mmsize == 16 && %0 == 2 |
| 641 | test srcq, 15 |
| 642 | %endif |
| 643 | lea srcq, [srcq+wq*4] |
| 644 | pcmpeqb m2, m2 ; (byte) { 0xff } x 16 |
| 645 | psrlw m2, 8 ; (word) { 0x00ff } x 8 |
| 646 | ; NOTE: if uyvy+avx, u/a are identical |
| 647 | %if mmsize == 16 && %0 == 2 |
| 648 | jnz .loop_u_start |
| 649 | neg wq |
| 650 | LOOP_YUYV_TO_UV a, %2 |
| 651 | .loop_u_start: |
| 652 | neg wq |
| 653 | LOOP_YUYV_TO_UV u, %2 |
%else ; mmsize == 8 || %0 == 3
| 655 | neg wq |
| 656 | LOOP_YUYV_TO_UV a, %2 |
| 657 | %endif ; mmsize == 8/16 |
| 658 | %endmacro |
| 659 | |
| 660 | ; %1 = a (aligned) or u (unaligned) |
| 661 | ; %2 = nv12 or nv21 |
| 662 | %macro LOOP_NVXX_TO_UV 2 |
| 663 | .loop_%1: |
| 664 | mov%1 m0, [srcq+wq*2] ; (byte) { U0, V0, U1, V1, ... } |
| 665 | mov%1 m1, [srcq+wq*2+mmsize] ; (byte) { U8, V8, U9, V9, ... } |
| 666 | pand m2, m0, m5 ; (word) { U0, U1, ..., U7 } |
| 667 | pand m3, m1, m5 ; (word) { U8, U9, ..., U15 } |
| 668 | psrlw m0, 8 ; (word) { V0, V1, ..., V7 } |
| 669 | psrlw m1, 8 ; (word) { V8, V9, ..., V15 } |
| 670 | packuswb m2, m3 ; (byte) { U0, ..., U15 } |
| 671 | packuswb m0, m1 ; (byte) { V0, ..., V15 } |
| 672 | %ifidn %2, nv12 |
| 673 | mova [dstUq+wq], m2 |
| 674 | mova [dstVq+wq], m0 |
| 675 | %else ; nv21 |
| 676 | mova [dstVq+wq], m2 |
| 677 | mova [dstUq+wq], m0 |
| 678 | %endif ; nv12/21 |
| 679 | add wq, mmsize |
| 680 | jl .loop_%1 |
| 681 | REP_RET |
| 682 | %endmacro |
| 683 | |
| 684 | ; %1 = nr. of XMM registers |
| 685 | ; %2 = nv12 or nv21 |
| 686 | %macro NVXX_TO_UV_FN 2 |
| 687 | cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w |
| 688 | %if ARCH_X86_64 |
| 689 | movsxd wq, dword r5m |
| 690 | %else ; x86-32 |
| 691 | mov wq, r5m |
| 692 | %endif |
| 693 | add dstUq, wq |
| 694 | add dstVq, wq |
| 695 | %if mmsize == 16 |
| 696 | test srcq, 15 |
| 697 | %endif |
| 698 | lea srcq, [srcq+wq*2] |
| 699 | pcmpeqb m5, m5 ; (byte) { 0xff } x 16 |
| 700 | psrlw m5, 8 ; (word) { 0x00ff } x 8 |
| 701 | %if mmsize == 16 |
| 702 | jnz .loop_u_start |
| 703 | neg wq |
| 704 | LOOP_NVXX_TO_UV a, %2 |
| 705 | .loop_u_start: |
| 706 | neg wq |
| 707 | LOOP_NVXX_TO_UV u, %2 |
| 708 | %else ; mmsize == 8 |
| 709 | neg wq |
| 710 | LOOP_NVXX_TO_UV a, %2 |
| 711 | %endif ; mmsize == 8/16 |
| 712 | %endmacro |
| 713 | |
| 714 | %if ARCH_X86_32 |
| 715 | INIT_MMX mmx |
| 716 | YUYV_TO_Y_FN 0, yuyv |
| 717 | YUYV_TO_Y_FN 0, uyvy |
| 718 | YUYV_TO_UV_FN 0, yuyv |
| 719 | YUYV_TO_UV_FN 0, uyvy |
| 720 | NVXX_TO_UV_FN 0, nv12 |
| 721 | NVXX_TO_UV_FN 0, nv21 |
| 722 | %endif |
| 723 | |
| 724 | INIT_XMM sse2 |
| 725 | YUYV_TO_Y_FN 3, yuyv |
| 726 | YUYV_TO_Y_FN 2, uyvy |
| 727 | YUYV_TO_UV_FN 3, yuyv |
| 728 | YUYV_TO_UV_FN 3, uyvy |
| 729 | NVXX_TO_UV_FN 5, nv12 |
| 730 | NVXX_TO_UV_FN 5, nv21 |
| 731 | |
| 732 | %if HAVE_AVX_EXTERNAL |
| 733 | INIT_XMM avx |
| 734 | ; in theory, we could write a yuy2-to-y using vpand (i.e. AVX), but |
| 735 | ; that's not faster in practice |
| 736 | YUYV_TO_UV_FN 3, yuyv |
| 737 | YUYV_TO_UV_FN 3, uyvy, 1 |
| 738 | NVXX_TO_UV_FN 5, nv12 |
| 739 | NVXX_TO_UV_FN 5, nv21 |
| 740 | %endif |