;******************************************************************************
;* SIMD-optimized HuffYUV functions
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2014 Christophe Gisquet
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
pb_f:                times 16 db 15
pb_zzzzzzzz77777777: times 8 db -1
pb_7:                times 8 db 7
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1, 3, 3, 3, 3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1, 1, 1,-1,-1, 5, 5,-1,-1, 9, 9,-1,-1,13,13
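; Naming convention for the pshufb masks above: each character names one
; output byte, 'z' (-1) zeroes that lane, and a hex digit selects the source
; byte at that index. Note that pb_zzzzzzzz77777777 deliberately runs into
; pb_7, so a 16-byte load of it picks up the eight 7s that follow.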

SECTION_TEXT

; void ff_add_hfyu_median_pred_mmxext(uint8_t *dst, const uint8_t *top,
;                                     const uint8_t *diff, int w,
;                                     int *left, int *left_top)
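;
; dst[i] = median(l, t, l + t - tl) + diff[i], where l is the previous
; output byte and t/tl are the bytes above and above-left of dst[i].
; A minimal scalar sketch of the recurrence (hedged; the authoritative C
; fallback is add_hfyu_median_pred_c() in libavcodec/huffyuvdsp.c):
;
;     int l = *left, lt = *left_top;
;     for (i = 0; i < w; i++) {
;         l      = (mid_pred(l, top[i], (l + top[i] - lt) & 0xFF) + diff[i]) & 0xFF;
;         lt     = top[i];
;         dst[i] = l;
;     }
;     *left = l; *left_top = lt;
;
; mid_pred() (libavutil) returns the median of its three arguments.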
%macro HFYU_MEDIAN 0
cglobal add_hfyu_median_pred, 6,6,8, dst, top, diff, w, left, left_top
    movu    m0, [topq]
    mova    m2, m0
    movd    m4, [left_topq]
    LSHIFT  m2, 1
    mova    m1, m0
    por     m4, m2          ; tl = top shifted up a byte, *left_top spliced in
    movd    m3, [leftq]
    psubb   m0, m4          ; t-tl
    add     dstq, wq
    add     topq, wq
    add     diffq, wq
    neg     wq
    jmp .skip
.loop:
    movu    m4, [topq+wq]
    mova    m0, m4
    LSHIFT  m4, 1
    por     m4, m1          ; tl, low byte taken from the previous vector
    mova    m1, m0          ; t
    psubb   m0, m4          ; t-tl
.skip:
    movu    m2, [diffq+wq]
; the prediction is strictly sequential in l, so produce one output byte per
; iteration, shifting t, tl and diff down as they are consumed
%assign i 0
%rep mmsize
    mova    m4, m0
    paddb   m4, m3          ; t-tl+l
    mova    m5, m3
    pmaxub  m3, m1
    pminub  m5, m1
    pminub  m3, m4
    pmaxub  m3, m5          ; median
    paddb   m3, m2          ; +residual
%if i==0
    mova    m7, m3
    LSHIFT  m7, mmsize-1
%else
    mova    m6, m3
    RSHIFT  m7, 1
    LSHIFT  m6, mmsize-1
    por     m7, m6          ; accumulate the new byte at the top of m7
%endif
%if i<mmsize-1
    RSHIFT  m0, 1
    RSHIFT  m1, 1
    RSHIFT  m2, 1
%endif
%assign i i+1
%endrep
    movu [dstq+wq], m7
    add     wq, mmsize
    jl .loop
    movzx   r2d, byte [dstq-1]
    mov     [leftq], r2d        ; write back the final left value
    movzx   r2d, byte [topq-1]
    mov     [left_topq], r2d    ; and the final top-left value
    RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmxext
HFYU_MEDIAN
%endif
INIT_XMM sse2
HFYU_MEDIAN

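; ADD_HFYU_LEFT_LOOP computes a running byte-wise prefix sum: each output
; byte is the sum of all input bytes up to and including it, seeded with the
; "left" value carried in m0. Rather than a serial loop, it doubles the
; summed span each step: pair sums via psllw+paddb, then spans of 4 and 8
; (and 16 for XMM) via pshufb broadcasts of the last byte of each completed
; group -- log2(mmsize) shuffle/add steps per vector.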
%macro ADD_HFYU_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
    add     srcq, wq
    add     dstq, wq
    neg     wq
%%.loop:
%if %2
    mova    m1, [srcq+wq]
%else
    movu    m1, [srcq+wq]
%endif
    mova    m2, m1
    psllw   m1, 8           ; move each even byte onto its odd neighbour
    paddb   m1, m2          ; -> pair sums in the odd positions
    mova    m2, m1
    pshufb  m1, m3          ; replicate each pair sum over the following pair
    paddb   m1, m2          ; -> prefix sums within groups of 4
    pshufb  m0, m5          ; broadcast last output byte of previous vector
    mova    m2, m1
    pshufb  m1, m4          ; replicate each group-of-4 sum over the next 4
    paddb   m1, m2          ; -> prefix sums within groups of 8
%if mmsize == 16
    mova    m2, m1
    pshufb  m1, m6          ; replicate the low-half sum over the high half
    paddb   m1, m2          ; -> prefix sums across all 16 bytes
%endif
    paddb   m0, m1          ; add the running left value
%if %1
    mova    [dstq+wq], m0
%else
    movq    [dstq+wq], m0
    movhps  [dstq+wq+8], m0
%endif
    add     wq, mmsize
    jl %%.loop
    mov     eax, mmsize-1
    sub     eax, wd
    movd    m1, eax
    pshufb  m0, m1          ; extract the last output byte
    movd    eax, m0         ; returned as the new left value
    RET
%endmacro

; int ff_add_hfyu_left_pred(uint8_t *dst, const uint8_t *src, int w, int left)
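;
; A scalar sketch of the semantics (hedged; the canonical C fallback is
; add_hfyu_left_pred_c() in libavcodec/huffyuvdsp.c):
;
;     int acc = left;
;     for (i = 0; i < w; i++) {
;         acc    = (acc + src[i]) & 0xFF;
;         dst[i] = acc;
;     }
;     return acc;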
INIT_MMX ssse3
cglobal add_hfyu_left_pred, 3,3,7, dst, src, w, left
.skip_prologue:
    mova    m5, [pb_7]
    mova    m4, [pb_zzzz3333zzzzbbbb]
    mova    m3, [pb_zz11zz55zz99zzdd]
    movd    m0, leftm
    psllq   m0, 56          ; seed: left value in the top byte
    ADD_HFYU_LEFT_LOOP 1, 1

INIT_XMM sse4
cglobal add_hfyu_left_pred, 3,3,7, dst, src, w, left
    mova    m5, [pb_f]
    mova    m6, [pb_zzzzzzzz77777777]
    mova    m4, [pb_zzzz3333zzzzbbbb]
    mova    m3, [pb_zz11zz55zz99zzdd]
    movd    m0, leftm
    pslldq  m0, 15          ; seed: left value in the top byte
    test    srcq, 15
    jnz .src_unaligned
    test    dstq, 15
    jnz .dst_unaligned
    ADD_HFYU_LEFT_LOOP 1, 1
.dst_unaligned:
    ADD_HFYU_LEFT_LOOP 0, 1
.src_unaligned:
    ADD_HFYU_LEFT_LOOP 0, 0

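; void ff_add_bytes(uint8_t *dst, uint8_t *src, intptr_t w)
;
; Adds src to dst byte by byte; the scalar equivalent is simply (a sketch,
; assuming the prototype declared in libavcodec/huffyuvdsp.h):
;
;     for (i = 0; i < w; i++)
;         dst[i] += src[i];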
%macro ADD_BYTES 0
cglobal add_bytes, 3,4,2, dst, src, w, size
    mov     sizeq, wq
    and     sizeq, -2*mmsize    ; round w down to a multiple of 2*mmsize
    jz .2
    add     dstq, sizeq
    add     srcq, sizeq
    neg     sizeq
.1:                             ; vector loop, 2*mmsize bytes per iteration
    mova    m0, [srcq + sizeq]
    mova    m1, [srcq + sizeq + mmsize]
    paddb   m0, [dstq + sizeq]
    paddb   m1, [dstq + sizeq + mmsize]
    mova   [dstq + sizeq], m0
    mova   [dstq + sizeq + mmsize], m1
    add     sizeq, 2*mmsize
    jl .1
.2:                             ; deal with the last < 2*mmsize bytes
    and     wq, 2*mmsize-1
    jz .end
    add     dstq, wq
    add     srcq, wq
    neg     wq
.3:
    mov     sizeb, [srcq + wq]
    add     [dstq + wq], sizeb
    inc     wq
    jl .3
.end:
    REP_RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
ADD_BYTES
%endif
INIT_XMM sse2
ADD_BYTES

; void add_hfyu_left_pred_bgr32(uint8_t *dst, const uint8_t *src,
;                               intptr_t w, uint8_t *left)
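;
; Left prediction on packed 32-bit pixels: each of the four bytes in a pixel
; accumulates independently down its own channel. A scalar sketch (hedged;
; cf. add_hfyu_left_pred_bgr32_c() in libavcodec/huffyuvdsp.c):
;
;     uint8_t b = left[0], g = left[1], r = left[2], a = left[3];
;     for (i = 0; i < w; i++) {
;         b += src[4 * i + 0]; g += src[4 * i + 1];
;         r += src[4 * i + 2]; a += src[4 * i + 3];
;         dst[4 * i + 0] = b;  dst[4 * i + 1] = g;
;         dst[4 * i + 2] = r;  dst[4 * i + 3] = a;
;     }
;     left[0] = b; left[1] = g; left[2] = r; left[3] = a;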
%macro LEFT_BGR32 0
cglobal add_hfyu_left_pred_bgr32, 4,4,3, dst, src, w, left
    shl     wq, 2               ; width in bytes
    movd    m0, [leftq]
    lea     dstq, [dstq + wq]
    lea     srcq, [srcq + wq]
    LSHIFT  m0, mmsize-4        ; running left pixel in the top dword
    neg     wq
.loop:
    movu    m1, [srcq+wq]
    mova    m2, m1
%if mmsize == 8
    punpckhdq m0, m0            ; broadcast the previous output pixel
%endif
    LSHIFT  m1, 4
    paddb   m1, m2              ; prefix sums over pixel pairs
%if mmsize == 16
    pshufd  m0, m0, q3333       ; broadcast the previous output pixel
    mova    m2, m1
    LSHIFT  m1, 8
    paddb   m1, m2              ; prefix sums over all 4 pixels
%endif
    paddb   m0, m1              ; add the running left pixel
    movu    [dstq+wq], m0
    add     wq, mmsize
    jl .loop
    movd    m0, [dstq-4]
    movd    [leftq], m0         ; write the final pixel back to *left
    REP_RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
LEFT_BGR32
%endif
INIT_XMM sse2
LEFT_BGR32