Commit | Line | Data |
---|---|---|
f6fa7814 DM |
1 | ;****************************************************************************** |
2 | ;* V210 SIMD pack | |
3 | ;* Copyright (c) 2014 Kieran Kunhya <kierank@obe.tv> | |
4 | ;* | |
5 | ;* This file is part of FFmpeg. | |
6 | ;* | |
7 | ;* FFmpeg is free software; you can redistribute it and/or | |
8 | ;* modify it under the terms of the GNU Lesser General Public | |
9 | ;* License as published by the Free Software Foundation; either | |
10 | ;* version 2.1 of the License, or (at your option) any later version. | |
11 | ;* | |
12 | ;* FFmpeg is distributed in the hope that it will be useful, | |
13 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 | ;* Lesser General Public License for more details. | |
16 | ;* | |
17 | ;* You should have received a copy of the GNU Lesser General Public | |
18 | ;* License along with FFmpeg; if not, write to the Free Software | |
19 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
20 | ;****************************************************************************** | |
21 | ||
22 | %include "libavutil/x86/x86util.asm" | |
23 | ||
SECTION_RODATA

; ---------------------------------------------------------------------------
; 10-bit path
; ---------------------------------------------------------------------------

; Lower/upper clamp applied to every 16-bit input sample (used with CLIPW).
v210_enc_min_10:         times 8 dw 0x4
v210_enc_max_10:         times 8 dw 0x3fb

; Luma: pmullw by 4/1/16 is a left shift by 2/0/4 bits; the byte shuffle then
; moves each word one byte up where needed (index -1 yields a zero byte), so
; the six samples end up at bit 10, bit 0 or bit 20 of their output dword.
; Words 6 and 7 are multiplied by 0 and never referenced by the shuffle.
v210_enc_luma_mult_10:   dw 4,1,16,4,1,16,0,0
v210_enc_luma_shuf_10:   db -1,0,1,-1,2,3,4,5,-1,6,7,-1,8,9,10,11

; Chroma: input register holds u0..u3 in words 0-3 and v0..v3 in words 4-7;
; only three samples per plane are kept (words 3 and 7 are multiplied by 0).
; The shuffle interleaves u and v into the gaps left by the luma pattern.
v210_enc_chroma_mult_10: dw 1,4,16,0,16,1,4,0
v210_enc_chroma_shuf_10: db 0,1,8,9,-1,2,3,-1,10,11,4,5,-1,12,13,-1

; ---------------------------------------------------------------------------
; 8-bit path
; ---------------------------------------------------------------------------

; Lower/upper clamp applied to every 8-bit input sample (used with CLIPUB).
v210_enc_min_8:          times 16 db 0x1
v210_enc_max_8:          times 16 db 0xfe

; Zero-extends luma bytes 6..11 to words (second group of six samples).
; The multipliers are 4x the 10-bit ones, i.e. two extra bits of left shift,
; so the result can be fed through v210_enc_luma_shuf_10 unchanged.
v210_enc_luma_shuf_8:    db 6,-1,7,-1,8,-1,9,-1,10,-1,11,-1,-1,-1,-1,-1
v210_enc_luma_mult_8:    dw 16,4,64,16,4,64,0,0

; Zero-extend chroma bytes to words: shuf1 selects u0..u3 / v0..v3 for the
; first output group, shuf2 selects u3..u5 / v3..v5 (plus one unused byte
; each) for the second. Input has u in bytes 0-7 and v in bytes 8-15.
v210_enc_chroma_shuf1_8: db 0,-1,1,-1,2,-1,3,-1,8,-1,9,-1,10,-1,11,-1
v210_enc_chroma_shuf2_8: db 3,-1,4,-1,5,-1,7,-1,11,-1,12,-1,13,-1,15,-1

; 4x the 10-bit chroma multipliers; words 3 and 7 are discarded.
v210_enc_chroma_mult_8:  dw 4,16,64,0,64,4,16,0

SECTION .text
47 | ||
%macro v210_planar_pack_10 0

; void v210_planar_pack_10(const uint16_t *y, const uint16_t *u,
;                          const uint16_t *v, uint8_t *dst, ptrdiff_t width)
;
; Each loop iteration takes 6 luma words and 3 words from each chroma plane,
; clamps them to [v210_enc_min_10, v210_enc_max_10] and stores one mmsize-byte
; group made of four little-endian dwords:
;
;   dword 0:  u0 | y0 << 10 | v0 << 20
;   dword 1:  y1 | u1 << 10 | y2 << 20
;   dword 2:  v1 | y3 << 10 | u2 << 20
;   dword 3:  y4 | v2 << 10 | y5 << 20
;
; The three source pointers are first moved to the end of their data and
; widthq is negated, so one negative index counting up to zero addresses all
; inputs (luma with scale 2, chroma with scale 1).
;
; Nothing is read or written when width <= 0.
;
; NOTE(review): the loads fetch more than is consumed (16 luma bytes for 12
; used, 8 bytes per chroma plane for 6 used) and width is processed in whole
; groups of 6 -- presumably the caller guarantees readable padding and a
; width that is a multiple of 6; confirm against the C caller.
cglobal v210_planar_pack_10, 5, 5, 4, y, u, v, dst, width
    lea     yq, [yq+2*widthq]               ; end of luma: width samples, 2 bytes each
    add     uq, widthq                      ; end of chroma: width bytes per plane
    add     vq, widthq
    neg     widthq                          ; index now runs from -width up to 0
    jge     .end                            ; flags from neg: -width >= 0 -> nothing to pack

    mova    m2, [v210_enc_min_10]
    mova    m3, [v210_enc_max_10]

.loop:
    movu    m0, [yq+2*widthq]               ; y0..y7 (only y0..y5 are used)
    CLIPW   m0, m2, m3

    movq    m1, [uq+widthq]                 ; low half:  u0..u3
    movhps  m1, [vq+widthq]                 ; high half: v0..v3
    CLIPW   m1, m2, m3

    ; shift each sample to its bit offset, then place it in its dword
    pmullw  m0, [v210_enc_luma_mult_10]
    pshufb  m0, [v210_enc_luma_shuf_10]

    pmullw  m1, [v210_enc_chroma_mult_10]
    pshufb  m1, [v210_enc_chroma_shuf_10]

    por     m0, m1                          ; luma and chroma occupy disjoint bits

    movu    [dstq], m0

    add     dstq, mmsize
    add     widthq, 6                       ; 6 luma samples consumed per group
    jl      .loop

.end:
    RET
%endmacro

INIT_XMM ssse3
v210_planar_pack_10
87 | ||
%macro v210_planar_pack_8 0

; void v210_planar_pack_8(const uint8_t *y, const uint8_t *u,
;                         const uint8_t *v, uint8_t *dst, ptrdiff_t width)
;
; Each loop iteration takes 12 luma bytes and 6 bytes from each chroma plane,
; clamps them to [v210_enc_min_8, v210_enc_max_8] and stores two mmsize-byte
; groups. The bytes are zero-extended to words and multiplied by 4x the
; 10-bit multipliers, so every sample s is stored as (s << 2) in the same
; dword layout the 10-bit packer produces:
;
;   dword 0:  u0 | y0 << 10 | v0 << 20
;   dword 1:  y1 | u1 << 10 | y2 << 20
;   dword 2:  v1 | y3 << 10 | u2 << 20
;   dword 3:  y4 | v2 << 10 | y5 << 20
;
; The source pointers are moved to the end of their data and the index is
; negated; widthq is halved first, so luma is addressed with scale 2 and
; chroma with scale 1.
;
; Nothing is read or written when width >> 1 is zero.
;
; NOTE(review): the loads fetch more than is consumed (16 luma bytes for 12
; used, 8 bytes per chroma plane for 6 used) and width is processed in whole
; groups of 12 -- presumably the caller guarantees readable padding and a
; width that is a multiple of 12; confirm against the C caller.
cglobal v210_planar_pack_8, 5, 5, 7, y, u, v, dst, width
    add     yq, widthq                      ; end of luma: width bytes
    shr     widthq, 1
    add     uq, widthq                      ; end of chroma: width/2 bytes per plane
    add     vq, widthq
    neg     widthq                          ; index now runs from -(width/2) up to 0
    jge     .end                            ; flags from neg: nothing to pack

    mova    m4, [v210_enc_min_8]
    mova    m5, [v210_enc_max_8]
    pxor    m6, m6                          ; zero register for byte -> word unpack

.loop:
    movu    m1, [yq+2*widthq]               ; y0..y15 (only y0..y11 are used)
    CLIPUB  m1, m4, m5

    punpcklbw m0, m1, m6                    ; m0 = y0..y7 as words
    ; can't unpack high bytes in the same way because we process
    ; only six bytes at a time
    pshufb  m1, [v210_enc_luma_shuf_8]      ; m1 = y6..y11 as words

    pmullw  m0, [v210_enc_luma_mult_8]
    pmullw  m1, [v210_enc_luma_mult_8]
    pshufb  m0, [v210_enc_luma_shuf_10]
    pshufb  m1, [v210_enc_luma_shuf_10]

    movq    m3, [uq+widthq]                 ; low half:  u0..u7
    movhps  m3, [vq+widthq]                 ; high half: v0..v7
    CLIPUB  m3, m4, m5

    ; shuffle and multiply to get the same packing as in 10-bit
    pshufb  m2, m3, [v210_enc_chroma_shuf1_8] ; u0..u3 / v0..v3 as words
    pshufb  m3, [v210_enc_chroma_shuf2_8]     ; u3..u5 / v3..v5 as words

    pmullw  m2, [v210_enc_chroma_mult_8]
    pmullw  m3, [v210_enc_chroma_mult_8]
    pshufb  m2, [v210_enc_chroma_shuf_10]
    pshufb  m3, [v210_enc_chroma_shuf_10]

    por     m0, m2                          ; first group:  samples 0..5 + chroma 0..2
    por     m1, m3                          ; second group: samples 6..11 + chroma 3..5

    movu    [dstq], m0
    movu    [dstq+mmsize], m1

    add     dstq, 2*mmsize
    add     widthq, 6                       ; 6 chroma (= 12 luma) samples consumed
    jl      .loop

.end:
    RET
%endmacro

INIT_XMM ssse3
v210_planar_pack_8
INIT_XMM avx
v210_planar_pack_8