;******************************************************************************
;* SIMD lossless video DSP utils
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2014 Michael Niedermayer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

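; pshufb control vectors for the left-prediction prefix sum below:
; pb_67 / pb_ef broadcast the last 16-bit lane of an mm / xmm register,
; and the pb_zzzz* masks propagate the partial sums across lanes.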
pb_ef: times 8 db 14,15
pb_67: times 8 db 6, 7
pb_zzzz2323zzzzabab: db -1,-1,-1,-1, 2, 3, 2, 3,-1,-1,-1,-1,10,11,10,11
pb_zzzzzzzz67676767: db -1,-1,-1,-1,-1,-1,-1,-1, 6, 7, 6, 7, 6, 7, 6, 7

SECTION_TEXT

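; INT16_LOOP expands to the body of add_int16/diff_int16, i.e. roughly the
; C loops
;     for (i = 0; i < w; i++) dst[i] = (dst[i]  + src[i])  & mask;  // add
;     for (i = 0; i < w; i++) dst[i] = (src1[i] - src2[i]) & mask;  // sub
; A scalar word loop first trims elements off the end until the remaining
; byte count is a multiple of 2*mmsize; the main loop then processes two
; registers per iteration.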
%macro INT16_LOOP 2 ; %1 = a/u (aligned/unaligned), %2 = add/sub
    movd    m4, maskd
    SPLATW  m4, m4
    add     wd, wd
    test    wq, 2*mmsize - 1
    jz %%.tomainloop
    push    tmpq
%%.wordloop:
    sub     wq, 2
%ifidn %2, add
    mov     tmpw, [srcq+wq]
    add     tmpw, [dstq+wq]
%else
    mov     tmpw, [src1q+wq]
    sub     tmpw, [src2q+wq]
%endif
    and     tmpw, maskw
    mov     [dstq+wq], tmpw
    test    wq, 2*mmsize - 1
    jnz %%.wordloop
    pop     tmpq
%%.tomainloop:
%ifidn %2, add
    add     srcq, wq
%else
    add     src1q, wq
    add     src2q, wq
%endif
    add     dstq, wq
    neg     wq
    jz %%.end
%%.loop:
%ifidn %2, add
    mov%1   m0, [srcq+wq]
    mov%1   m1, [dstq+wq]
    mov%1   m2, [srcq+wq+mmsize]
    mov%1   m3, [dstq+wq+mmsize]
%else
    mov%1   m0, [src1q+wq]
    mov%1   m1, [src2q+wq]
    mov%1   m2, [src1q+wq+mmsize]
    mov%1   m3, [src2q+wq+mmsize]
%endif
    p%2w    m0, m1
    p%2w    m2, m3
    pand    m0, m4
    pand    m2, m4
    mov%1   [dstq+wq], m0
    mov%1   [dstq+wq+mmsize], m2
    add     wq, 2*mmsize
    jl %%.loop
%%.end:
    RET
%endmacro

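; void add_int16(uint16_t *dst, const uint16_t *src, unsigned mask, int w)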
INIT_MMX mmx
cglobal add_int16, 4,4,5, dst, src, mask, w, tmp
    INT16_LOOP a, add

INIT_XMM sse2
cglobal add_int16, 4,4,5, dst, src, mask, w, tmp
    test    srcq, mmsize-1
    jnz .unaligned
    test    dstq, mmsize-1
    jnz .unaligned
    INT16_LOOP a, add
.unaligned:
    INT16_LOOP u, add

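; void diff_int16(uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
;                 unsigned mask, int w)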
INIT_MMX mmx
cglobal diff_int16, 5,5,5, dst, src1, src2, mask, w, tmp
    INT16_LOOP a, sub

INIT_XMM sse2
cglobal diff_int16, 5,5,5, dst, src1, src2, mask, w, tmp
    test    src1q, mmsize-1
    jnz .unaligned
    test    src2q, mmsize-1
    jnz .unaligned
    test    dstq, mmsize-1
    jnz .unaligned
    INT16_LOOP a, sub
.unaligned:
    INT16_LOOP u, sub

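; ADD_HFYU_LEFT_LOOP_INT16 computes a running left prediction over 16-bit
; elements, i.e. roughly the C loop
;     for (i = 0; i < w; i++)
;         dst[i] = left = (left + src[i]) & mask;
; and returns the final left value in eax. The prefix sum inside each register
; is built in log2(lanes) shift/shuffle+add steps using the pshufb masks
; defined above, with m0 carrying the accumulated left value between
; iterations.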
%macro ADD_HFYU_LEFT_LOOP_INT16 2 ; %1 = dst alignment (a/u), %2 = src alignment (a/u)
    add     wd, wd
    add     srcq, wq
    add     dstq, wq
    neg     wq
%%.loop:
    mov%2   m1, [srcq+wq]
    mova    m2, m1
    pslld   m1, 16
    paddw   m1, m2
    mova    m2, m1

    pshufb  m1, m3
    paddw   m1, m2
    pshufb  m0, m5
%if mmsize == 16
    mova    m2, m1
    pshufb  m1, m4
    paddw   m1, m2
%endif
    paddw   m0, m1
    pand    m0, m7
%ifidn %1, a
    mova    [dstq+wq], m0
%else
    movq    [dstq+wq], m0
    movhps  [dstq+wq+8], m0
%endif
    add     wq, mmsize
    jl %%.loop
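; return the last predicted value: build a pshufb control that moves the
; final 16-bit lane of m0 into lane 0, then read it out through eax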
    mov     eax, mmsize-1
    sub     eax, wd
    mov     wd, eax
    shl     wd, 8
    lea     eax, [wd+eax-1]
    movd    m1, eax
    pshufb  m0, m1
    movd    eax, m0
    RET
%endmacro

; int add_hfyu_left_pred_int16(uint16_t *dst, const uint16_t *src, unsigned mask, int w, int left)
INIT_MMX ssse3
cglobal add_hfyu_left_pred_int16, 4,4,8, dst, src, mask, w, left
.skip_prologue:
    mova    m5, [pb_67]
    mova    m3, [pb_zzzz2323zzzzabab]
    movd    m0, leftm
    psllq   m0, 48
    movd    m7, maskm
    SPLATW  m7, m7
    ADD_HFYU_LEFT_LOOP_INT16 a, a

INIT_XMM sse4
cglobal add_hfyu_left_pred_int16, 4,4,8, dst, src, mask, w, left
    mova    m5, [pb_ef]
    mova    m4, [pb_zzzzzzzz67676767]
    mova    m3, [pb_zzzz2323zzzzabab]
    movd    m0, leftm
    pslldq  m0, 14
    movd    m7, maskm
    SPLATW  m7, m7
    test    srcq, 15
    jnz .src_unaligned
    test    dstq, 15
    jnz .dst_unaligned
    ADD_HFYU_LEFT_LOOP_INT16 a, a
.dst_unaligned:
    ADD_HFYU_LEFT_LOOP_INT16 u, a
.src_unaligned:
    ADD_HFYU_LEFT_LOOP_INT16 u, u

; void add_hfyu_median_pred_int16(uint16_t *dst, const uint16_t *top, const uint16_t *diff, unsigned mask, int w, int *left, int *left_top)
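; Median prediction as in HuffYUV: for each element,
;     pred   = mid(l, t, (l + t - tl) & mask)   with t = top[i], tl = top[i-1]
;     dst[i] = l = (pred + diff[i]) & mask
; Each decoded pixel feeds the next prediction, so the four words of an mm
; register are processed serially by the %rep block below.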
INIT_MMX mmxext
cglobal add_hfyu_median_pred_int16, 7,7,0, dst, top, diff, mask, w, left, left_top
    add     wd, wd
    movd    mm6, maskd
    SPLATW  mm6, mm6
    movq    mm0, [topq]
    movq    mm2, mm0
    movd    mm4, [left_topq]
    psllq   mm2, 16
    movq    mm1, mm0
    por     mm4, mm2
    movd    mm3, [leftq]
    psubw   mm0, mm4 ; t-tl
    add     dstq, wq
    add     topq, wq
    add     diffq, wq
    neg     wq
    jmp .skip
.loop:
    movq    mm4, [topq+wq]
    movq    mm0, mm4
    psllq   mm4, 16
    por     mm4, mm1
    movq    mm1, mm0 ; t
    psubw   mm0, mm4 ; t-tl
.skip:
    movq    mm2, [diffq+wq]
%assign i 0
%rep 4
    movq    mm4, mm0
    paddw   mm4, mm3 ; t-tl+l
    pand    mm4, mm6
    movq    mm5, mm3
    pmaxsw  mm3, mm1
    pminsw  mm5, mm1
    pminsw  mm3, mm4
    pmaxsw  mm3, mm5 ; median
    paddw   mm3, mm2 ; +residual
    pand    mm3, mm6
%if i==0
    movq    mm7, mm3
    psllq   mm7, 48
%else
    movq    mm4, mm3
    psrlq   mm7, 16
    psllq   mm4, 48
    por     mm7, mm4
%endif
%if i<3
    psrlq   mm0, 16
    psrlq   mm1, 16
    psrlq   mm2, 16
%endif
%assign i i+1
%endrep
    movq    [dstq+wq], mm7
    add     wq, 8
    jl .loop
    movzx   r2d, word [dstq-2]
    mov     [leftq], r2d
    movzx   r2d, word [topq-2]
    mov     [left_topq], r2d
    RET

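; void sub_hfyu_median_pred_int16(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w, int *left, int *left_top)
; Inverse of the above: dst[i] = (src2[i] - pred) & mask, with pred the median
; of l = src2[i-1], t = src1[i] and (l + t - tl) & mask; the initial l and tl
; come from *left and *left_top.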
cglobal sub_hfyu_median_pred_int16, 7,7,0, dst, src1, src2, mask, w, left, left_top
    add     wd, wd
    movd    mm7, maskd
    SPLATW  mm7, mm7
    movq    mm0, [src1q]
    movq    mm2, [src2q]
    psllq   mm0, 16
    psllq   mm2, 16
    movd    mm6, [left_topq]
    por     mm0, mm6
    movd    mm6, [leftq]
    por     mm2, mm6
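; the mask now lives in mm7, so maskq is free to serve as the byte index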
    xor     maskq, maskq
.loop:
    movq    mm1, [src1q + maskq]
    movq    mm3, [src2q + maskq]
    movq    mm4, mm2
    psubw   mm2, mm0
    paddw   mm2, mm1
    pand    mm2, mm7
    movq    mm5, mm4
    pmaxsw  mm4, mm1
    pminsw  mm1, mm5
    pminsw  mm4, mm2
    pmaxsw  mm4, mm1
    psubw   mm3, mm4
    pand    mm3, mm7
    movq    [dstq + maskq], mm3
    add     maskq, 8
    movq    mm0, [src1q + maskq - 2]
    movq    mm2, [src2q + maskq - 2]
    cmp     maskq, wq
    jb .loop
    movzx   maskd, word [src1q + wq - 2]
    mov     [left_topq], maskd
    movzx   maskd, word [src2q + wq - 2]
    mov     [leftq], maskd
    RET