;******************************************************************************
;* SIMD-optimized HuffYUV functions
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2014 Christophe Gisquet
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
pb_f:                times 16 db 15
pb_zzzzzzzz77777777: times 8 db -1
pb_7:                times 8 db 7
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
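; pshufb control masks for the left-prediction prefix sums below: a byte of -1
; has its MSB set, which makes pshufb zero that lane, while the other bytes
; select the partial sum that must be propagated into the upper lanes. Note
; that pb_zzzzzzzz77777777 is immediately followed by pb_7, so a 16-byte load
; from pb_zzzzzzzz77777777 also yields the sse4 mask that zeroes the low 8
; bytes and broadcasts byte 7 into the high 8 bytes.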

SECTION_TEXT

; void ff_add_hfyu_median_pred_mmxext(uint8_t *dst, const uint8_t *top,
;                                     const uint8_t *diff, int w,
;                                     int *left, int *left_top)
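;
; Median prediction: pred = mid_pred(left, top, left + top - topleft), and each
; output byte becomes the new left value. Conceptually this matches the scalar
; C fallback (a sketch, not the exact FFmpeg source):
;     l = *left; tl = *left_top;
;     for (i = 0; i < w; i++) {
;         l      = mid_pred(l, top[i], (l + top[i] - tl) & 0xff) + diff[i];
;         tl     = top[i];
;         dst[i] = l;
;     }
;     *left = l; *left_top = tl;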
%macro HFYU_MEDIAN 0
cglobal add_hfyu_median_pred, 6,6,8, dst, top, diff, w, left, left_top
    movu    m0, [topq]
    mova    m2, m0
    movd    m4, [left_topq]
    LSHIFT  m2, 1
    mova    m1, m0
    por     m4, m2
    movd    m3, [leftq]
    psubb   m0, m4 ; t-tl
    add     dstq, wq
    add     topq, wq
    add     diffq, wq
    neg     wq
    jmp .skip
.loop:
    movu    m4, [topq+wq]
    mova    m0, m4
    LSHIFT  m4, 1
    por     m4, m1
    mova    m1, m0 ; t
    psubb   m0, m4 ; t-tl
.skip:
    movu    m2, [diffq+wq]
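    ; The median depends on the previous output byte (the running left value
    ; kept in m3), so the bytes of each block are computed serially: RSHIFT
    ; walks the inputs one byte at a time and the results are reassembled
    ; into m7.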
%assign i 0
%rep mmsize
    mova    m4, m0
    paddb   m4, m3 ; t-tl+l
    mova    m5, m3
    pmaxub  m3, m1
    pminub  m5, m1
    pminub  m3, m4
    pmaxub  m3, m5 ; median
    paddb   m3, m2 ; +residual
%if i==0
    mova    m7, m3
    LSHIFT  m7, mmsize-1
%else
    mova    m6, m3
    RSHIFT  m7, 1
    LSHIFT  m6, mmsize-1
    por     m7, m6
%endif
%if i<mmsize-1
    RSHIFT  m0, 1
    RSHIFT  m1, 1
    RSHIFT  m2, 1
%endif
%assign i i+1
%endrep
    movu    [dstq+wq], m7
    add     wq, mmsize
    jl .loop
    movzx   r2d, byte [dstq-1]
    mov     [leftq], r2d
    movzx   r2d, byte [topq-1]
    mov     [left_topq], r2d
    RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmxext
HFYU_MEDIAN
%endif
INIT_XMM sse2
HFYU_MEDIAN


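; Left prediction is a running byte-wise prefix sum (mod 256):
; dst[i] = dst[i-1] + src[i], seeded with the incoming left value. Each
; iteration computes the prefix sum of one register in log2(mmsize) shift+add
; steps (psllw plus the pshufb masks above), then adds the carry from the
; previous register, which is broadcast from its last output byte via m5.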
%macro ADD_HFYU_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
    add     srcq, wq
    add     dstq, wq
    neg     wq
%%.loop:
%if %2
    mova    m1, [srcq+wq]
%else
    movu    m1, [srcq+wq]
%endif
    mova    m2, m1
    psllw   m1, 8
    paddb   m1, m2
    mova    m2, m1
    pshufb  m1, m3
    paddb   m1, m2
    pshufb  m0, m5
    mova    m2, m1
    pshufb  m1, m4
    paddb   m1, m2
%if mmsize == 16
    mova    m2, m1
    pshufb  m1, m6
    paddb   m1, m2
%endif
    paddb   m0, m1
%if %1
    mova    [dstq+wq], m0
%else
    movq    [dstq+wq], m0
    movhps  [dstq+wq+8], m0
%endif
    add     wq, mmsize
    jl %%.loop
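    ; locate the last output byte, extract it with pshufb and return it in
    ; eax as the new left value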
    mov     eax, mmsize-1
    sub     eax, wd
    movd    m1, eax
    pshufb  m0, m1
    movd    eax, m0
    RET
%endmacro

; int ff_add_hfyu_left_pred(uint8_t *dst, const uint8_t *src, int w, int left)
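; The mmx/ssse3 version handles 8 bytes per iteration; the xmm/sse4 version
; handles 16 and, based on the src/dst alignment tests below, expands an
; aligned or unaligned variant of ADD_HFYU_LEFT_LOOP.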
INIT_MMX ssse3
cglobal add_hfyu_left_pred, 3,3,7, dst, src, w, left
.skip_prologue:
    mova    m5, [pb_7]
    mova    m4, [pb_zzzz3333zzzzbbbb]
    mova    m3, [pb_zz11zz55zz99zzdd]
    movd    m0, leftm
    psllq   m0, 56
    ADD_HFYU_LEFT_LOOP 1, 1

INIT_XMM sse4
cglobal add_hfyu_left_pred, 3,3,7, dst, src, w, left
    mova    m5, [pb_f]
    mova    m6, [pb_zzzzzzzz77777777]
    mova    m4, [pb_zzzz3333zzzzbbbb]
    mova    m3, [pb_zz11zz55zz99zzdd]
    movd    m0, leftm
    pslldq  m0, 15
    test    srcq, 15
    jnz .src_unaligned
    test    dstq, 15
    jnz .dst_unaligned
    ADD_HFYU_LEFT_LOOP 1, 1
.dst_unaligned:
    ADD_HFYU_LEFT_LOOP 0, 1
.src_unaligned:
    ADD_HFYU_LEFT_LOOP 0, 0

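; add_bytes(dst, src, w): dst[i] += src[i] for 0 <= i < w (byte-wise, mod 256).
; The main loop processes 2*mmsize bytes per iteration; the remainder is
; handled one byte at a time.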
%macro ADD_BYTES 0
cglobal add_bytes, 3,4,2, dst, src, w, size
    mov     sizeq, wq
    and     sizeq, -2*mmsize
    jz .2
    add     dstq, sizeq
    add     srcq, sizeq
    neg     sizeq
.1:
    mova    m0, [srcq + sizeq]
    mova    m1, [srcq + sizeq + mmsize]
    paddb   m0, [dstq + sizeq]
    paddb   m1, [dstq + sizeq + mmsize]
    mova    [dstq + sizeq], m0
    mova    [dstq + sizeq + mmsize], m1
    add     sizeq, 2*mmsize
    jl .1
.2:
    and     wq, 2*mmsize-1
    jz .end
    add     dstq, wq
    add     srcq, wq
    neg     wq
.3:
    mov     sizeb, [srcq + wq]
    add     [dstq + wq], sizeb
    inc     wq
    jl .3
.end:
    REP_RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
ADD_BYTES
%endif
INIT_XMM sse2
ADD_BYTES

; void add_hfyu_left_pred_bgr32(uint8_t *dst, const uint8_t *src,
;                               intptr_t w, uint8_t *left)
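; Same idea as add_hfyu_left_pred, but the running sum is kept per 4-byte BGR32
; pixel: each pixel adds the bytes of the previous pixel. The in-register
; prefix sum uses 4- and 8-byte shifts, and the carried-in left pixel is
; broadcast with punpckhdq (mmx) or pshufd q3333 (sse2).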
%macro LEFT_BGR32 0
cglobal add_hfyu_left_pred_bgr32, 4,4,3, dst, src, w, left
    shl     wq, 2
    movd    m0, [leftq]
    lea     dstq, [dstq + wq]
    lea     srcq, [srcq + wq]
    LSHIFT  m0, mmsize-4
    neg     wq
.loop:
    movu    m1, [srcq+wq]
    mova    m2, m1
%if mmsize == 8
    punpckhdq m0, m0
%endif
    LSHIFT  m1, 4
    paddb   m1, m2
%if mmsize == 16
    pshufd  m0, m0, q3333
    mova    m2, m1
    LSHIFT  m1, 8
    paddb   m1, m2
%endif
    paddb   m0, m1
    movu    [dstq+wq], m0
    add     wq, mmsize
    jl .loop
    movd    m0, [dstq-4]
    movd    [leftq], m0
    REP_RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
LEFT_BGR32
%endif
INIT_XMM sse2
LEFT_BGR32