Imported Debian version 2.4.3~trusty1
[deb_ffmpeg.git] / ffmpeg / libavcodec / x86 / h264_weight.asm
CommitLineData
2ba45a60
DM
1;*****************************************************************************
2;* SSE2-optimized weighted prediction code
3;*****************************************************************************
4;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
5;* Copyright (C) 2010 Eli Friedman <eli.friedman@gmail.com>
6;*
7;* This file is part of FFmpeg.
8;*
9;* FFmpeg is free software; you can redistribute it and/or
10;* modify it under the terms of the GNU Lesser General Public
11;* License as published by the Free Software Foundation; either
12;* version 2.1 of the License, or (at your option) any later version.
13;*
14;* FFmpeg is distributed in the hope that it will be useful,
15;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17;* Lesser General Public License for more details.
18;*
19;* You should have received a copy of the GNU Lesser General Public
20;* License along with FFmpeg; if not, write to the Free Software
21;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22;******************************************************************************
23
24%include "libavutil/x86/x86util.asm"
25
26SECTION .text
27
28;-----------------------------------------------------------------------------
29; biweight pred:
30;
31; void ff_h264_biweight_16_sse2(uint8_t *dst, uint8_t *src, int stride,
32; int height, int log2_denom, int weightd,
33; int weights, int offset);
34; and
35; void ff_h264_weight_16_sse2(uint8_t *dst, int stride, int height,
36; int log2_denom, int weight, int offset);
37;-----------------------------------------------------------------------------
38
;-----------------------------------------------------------------------------
; WEIGHT_SETUP
; Builds the constants shared by the h264_weight_* kernels.
; In:   r3d = log2_denom, r4d = weight, r5 = offset
; Out:  m3  = low 16 bits of weight, replicated into every word lane
;       m5  = low 16 bits of (((2*offset + 1) << log2_denom) >> 1),
;             replicated into every word lane (offset plus rounding term)
;       m6  = log2_denom, used as the psraw shift count
;       m7  = 0, used as the zero half when widening bytes to words
; Clobbers: r5
;-----------------------------------------------------------------------------
%macro WEIGHT_SETUP 0
    pxor         m7, m7                 ; zero register for punpcklbw
    movd         m6, r3d                ; shift count = log2_denom
    movd         m3, r4d                ; weight (scalar, broadcast below)

    lea          r5, [r5*2+1]           ; r5 = 2*offset + 1
    movd         m5, r5d
    pslld        m5, m6                 ; (2*offset + 1) << log2_denom
    psrld        m5, 1                  ; ... >> 1

    ; broadcast word 0 of m3 and m5 across the whole register
%if mmsize == 16
    pshuflw      m3, m3, 0
    punpcklqdq   m3, m3
    pshuflw      m5, m5, 0
    punpcklqdq   m5, m5
%else
    pshufw       m3, m3, 0
    pshufw       m5, m5, 0
%endif
%endmacro
58
;-----------------------------------------------------------------------------
; WEIGHT_OP off_a, off_b
; Weights two half-register groups of pixels and packs them into m0.
;   %1, %2 = byte offsets (immediate or register) added to r0 for the two loads
; In:   r0 = pixel pointer; m3/m5/m6/m7 as produced by WEIGHT_SETUP
; Out:  m0 = unsigned-saturated bytes; low half from [r0+%1], high half from
;            [r0+%2], each computed as (pixel * m3 + m5) >> m6
; Clobbers: m1
;-----------------------------------------------------------------------------
%macro WEIGHT_OP 2
    ; first group -> m0
    movh         m0, [r0+%1]
    punpcklbw    m0, m7                 ; bytes -> words
    pmullw       m0, m3                 ; * weight
    paddsw       m0, m5                 ; + offset/rounding (signed saturation)
    psraw        m0, m6                 ; >> log2_denom

    ; second group -> m1
    movh         m1, [r0+%2]
    punpcklbw    m1, m7
    pmullw       m1, m3
    paddsw       m1, m5
    psraw        m1, m6

    packuswb     m0, m1                 ; clamp both groups to 0..255
%endmacro
72
;-----------------------------------------------------------------------------
; h264_weight_16 (MMXEXT): in-place weighted prediction, 16 pixels per row.
; Registers (argument order of the weight prototype documented at the top of
; the file): r0 = dst, r1 = stride, r2d = height,
;            r3d = log2_denom, r4d = weight, r5 = offset
; Each iteration rewrites one 16-byte row as two 8-byte halves.
;-----------------------------------------------------------------------------
INIT_MMX mmxext
cglobal h264_weight_16, 6, 6, 0
    WEIGHT_SETUP
.row:
    WEIGHT_OP     0,  4                 ; pixels 0..7
    mova     [r0  ], m0
    WEIGHT_OP     8, 12                 ; pixels 8..15
    mova     [r0+8], m0
    add          r0, r1                 ; advance to the next row
    dec         r2d
    jnz .row
    REP_RET
85
;-----------------------------------------------------------------------------
; WEIGHT_FUNC_MM width, xmm_regs
; Emits h264_weight_<width> for the currently selected instruction set.
; One full register (mmsize bytes) is weighted in place per row.
;   %1 = block width used in the function name
;   %2 = number of XMM registers declared to cglobal
; Registers: r0 = dst, r1 = stride, r2d = height (loop counter);
;            r3d/r4d/r5 are consumed by WEIGHT_SETUP.
;-----------------------------------------------------------------------------
%macro WEIGHT_FUNC_MM 2
cglobal h264_weight_%1, 6, 6, %2
    WEIGHT_SETUP
.row:
    WEIGHT_OP     0, mmsize/2           ; low and high halves of the row
    mova       [r0], m0
    add          r0, r1                 ; advance to the next row
    dec         r2d
    jnz .row
    REP_RET
%endmacro

; 8-wide rows with MMX registers, 16-wide rows with XMM registers
INIT_MMX mmxext
WEIGHT_FUNC_MM  8, 0
INIT_XMM sse2
WEIGHT_FUNC_MM 16, 8
102
;-----------------------------------------------------------------------------
; WEIGHT_FUNC_HALF_MM width, xmm_regs
; Emits h264_weight_<width> for rows that fill only half a register, so two
; rows are weighted per loop iteration.
;   %1 = block width used in the function name
;   %2 = number of XMM registers declared to cglobal
; Registers: r0 = dst, r1 = stride, r2d = height (halved into a pair counter),
;            r3 = 2*stride (reused once WEIGHT_SETUP has copied log2_denom
;            into m6).
;-----------------------------------------------------------------------------
%macro WEIGHT_FUNC_HALF_MM 2
cglobal h264_weight_%1, 6, 6, %2
    WEIGHT_SETUP
    sar         r2d, 1                  ; number of row pairs
    lea          r3, [r1*2]             ; step for two rows
.rowpair:
    WEIGHT_OP     0, r1                 ; low half = row 0, high half = row 1
    movh       [r0], m0                 ; store row 0
%if mmsize == 16
    movhps  [r0+r1], m0                 ; store row 1 from the high quadword
%else
    psrlq        m0, 32                 ; bring row 1 down to the low dword
    movh    [r0+r1], m0
%endif
    add          r0, r3
    dec         r2d
    jnz .rowpair
    REP_RET
%endmacro

; 4-wide rows with MMX registers, 8-wide rows with XMM registers
INIT_MMX mmxext
WEIGHT_FUNC_HALF_MM 4, 0
INIT_XMM sse2
WEIGHT_FUNC_HALF_MM 8, 8
127
;-----------------------------------------------------------------------------
; BIWEIGHT_SETUP
; Builds the constants shared by the h264_biweight_* kernels.
; In:   r4d = log2_denom, r5d = weightd, r6d = weights, r7m = offset
; Out:  m5  = rounding/offset term, replicated into every word lane:
;             low 16 bits of ((((offset + 1) | 1) << shift) >> 1)
;       m6  = shift, the psraw count (log2_denom + 1, or log2_denom when the
;             weights were halved, see below)
;       SSSE3:     m4 = (weightd, weights) byte pairs for pmaddubsw
;       otherwise: m3 = weightd words, m4 = weights words, m7 = 0
; Clobbers: r4d, r5d, r6d, off_regd, flags (and m0 in the SSSE3 path)
;
; off_regd is the scratch GPR holding the offset: r7d on x86-64, r3d on
; x86-32. Callers reload r3d (height) from r3m after this macro.
;
; If either weight equals 128, both weights and the offset are halved and the
; shift is reduced by one. 128 does not fit the signed byte operand that
; pmaddubsw reads in the SSSE3 path. The test uses 32-bit registers because
; the arguments are C ints, whose upper register half is not defined on
; x86-64.
;-----------------------------------------------------------------------------
%macro BIWEIGHT_SETUP 0
%if ARCH_X86_64
%define off_regd r7d
%else
%define off_regd r3d
%endif
    mov    off_regd, r7m
    add    off_regd, 1
    or     off_regd, 1                  ; off = (offset + 1) | 1
    add         r4d, 1                  ; shift = log2_denom + 1
    cmp         r6d, 128                ; weights == 128 ?
    je .halve
    cmp         r5d, 128                ; weightd == 128 ?
    jne .normal
.halve:
    sar         r5d, 1
    sar         r6d, 1
    sar    off_regd, 1
    sub         r4d, 1                  ; compensate for the halved operands
.normal:
%if cpuflag(ssse3)
    movd         m4, r5d
    movd         m0, r6d
%else
    movd         m3, r5d
    movd         m4, r6d
%endif
    movd         m5, off_regd
    movd         m6, r4d
    pslld        m5, m6                 ; off << shift
    psrld        m5, 1                  ; ... >> 1
%if cpuflag(ssse3)
    punpcklbw    m4, m0                 ; byte 0 = weightd, byte 1 = weights
    pshuflw      m4, m4, 0              ; broadcast the pair and the rounding
    pshuflw      m5, m5, 0              ; term across the register
    punpcklqdq   m4, m4
    punpcklqdq   m5, m5

%else
%if mmsize == 16
    pshuflw      m3, m3, 0
    pshuflw      m4, m4, 0
    pshuflw      m5, m5, 0
    punpcklqdq   m3, m3
    punpcklqdq   m4, m4
    punpcklqdq   m5, m5
%else
    pshufw       m3, m3, 0
    pshufw       m4, m4, 0
    pshufw       m5, m5, 0
%endif
    pxor         m7, m7                 ; zero register for punpcklbw
%endif
%endmacro
179
;-----------------------------------------------------------------------------
; BIWEIGHT_STEPA acc, tmp, off
; Computes the weighted sum of one half-register group from both inputs.
;   %1 = register number receiving the sum
;   %2 = register number used as a temporary (must differ from %1)
;   %3 = byte offset (immediate or register) applied to both r0 and r1
; In:   m3 = word weight for the [r0] pixels, m4 = word weight for the [r1]
;       pixels, m7 = 0
; Out:  m%1 = [r0+%3] * m3 + [r1+%3] * m4 (words, signed saturating add)
;-----------------------------------------------------------------------------
%macro BIWEIGHT_STEPA 3
    ; contribution of the r0 pixels
    movh         m%1, [r0+%3]
    punpcklbw    m%1, m7                ; bytes -> words
    pmullw       m%1, m3

    ; contribution of the r1 pixels
    movh         m%2, [r1+%3]
    punpcklbw    m%2, m7
    pmullw       m%2, m4

    paddsw       m%1, m%2
%endmacro
189
;-----------------------------------------------------------------------------
; BIWEIGHT_STEPB
; Finishes two word sums left in m0 and m1 by BIWEIGHT_STEPA.
; In:   m0, m1 = weighted sums, m5 = rounding/offset words, m6 = shift count
; Out:  m0 = (m0 + m5) >> m6 in the low half and (m1 + m5) >> m6 in the high
;            half, packed to bytes with unsigned saturation
; Clobbers: m1
;-----------------------------------------------------------------------------
%macro BIWEIGHT_STEPB 0
    paddsw       m0, m5
    psraw        m0, m6

    paddsw       m1, m5
    psraw        m1, m6

    packuswb     m0, m1                 ; clamp to 0..255
%endmacro
197
;-----------------------------------------------------------------------------
; h264_biweight_16 (MMXEXT): bi-directional weighted prediction, 16 pixels
; per row, result written over dst.
; Registers (argument order of the biweight prototype documented at the top
; of the file): r0 = dst, r1 = src, r2 = stride, r3d = height;
;               r4d/r5d/r6d and r7m are consumed by BIWEIGHT_SETUP.
;-----------------------------------------------------------------------------
INIT_MMX mmxext
cglobal h264_biweight_16, 7, 8, 0
    BIWEIGHT_SETUP
    movifnidn   r3d, r3m                ; (re)load height after the setup
.row:
    ; pixels 0..7
    BIWEIGHT_STEPA 0, 1, 0
    BIWEIGHT_STEPA 1, 2, 4
    BIWEIGHT_STEPB
    mova     [r0  ], m0
    ; pixels 8..15
    BIWEIGHT_STEPA 0, 1, 8
    BIWEIGHT_STEPA 1, 2, 12
    BIWEIGHT_STEPB
    mova     [r0+8], m0
    ; next row in both buffers
    add          r0, r2
    add          r1, r2
    dec         r3d
    jnz .row
    REP_RET
216
;-----------------------------------------------------------------------------
; BIWEIGHT_FUNC_MM width, xmm_regs
; Emits h264_biweight_<width> for the currently selected instruction set.
; One full register (mmsize bytes) of dst is produced per row.
;   %1 = block width used in the function name
;   %2 = number of XMM registers declared to cglobal
; Registers: r0 = dst, r1 = src, r2 = stride, r3d = height (loop counter).
;-----------------------------------------------------------------------------
%macro BIWEIGHT_FUNC_MM 2
cglobal h264_biweight_%1, 7, 8, %2
    BIWEIGHT_SETUP
    movifnidn   r3d, r3m                ; (re)load height after the setup
.row:
    BIWEIGHT_STEPA 0, 1, 0              ; low half of the row  -> m0
    BIWEIGHT_STEPA 1, 2, mmsize/2       ; high half of the row -> m1
    BIWEIGHT_STEPB
    mova       [r0], m0
    ; next row in both buffers
    add          r0, r2
    add          r1, r2
    dec         r3d
    jnz .row
    REP_RET
%endmacro

; 8-wide rows with MMX registers, 16-wide rows with XMM registers
INIT_MMX mmxext
BIWEIGHT_FUNC_MM  8, 0
INIT_XMM sse2
BIWEIGHT_FUNC_MM 16, 8
237
;-----------------------------------------------------------------------------
; BIWEIGHT_FUNC_HALF_MM width, xmm_regs
; Emits h264_biweight_<width> for rows that fill only half a register, so two
; rows are processed per loop iteration.
;   %1 = block width used in the function name
;   %2 = number of XMM registers declared to cglobal
; Registers: r0 = dst, r1 = src, r2 = stride, r3d = height (halved into a
;            pair counter), r4 = 2*stride (reused once BIWEIGHT_SETUP has
;            copied the shift count into m6).
;-----------------------------------------------------------------------------
%macro BIWEIGHT_FUNC_HALF_MM 2
cglobal h264_biweight_%1, 7, 8, %2
    BIWEIGHT_SETUP
    movifnidn   r3d, r3m                ; (re)load height after the setup
    ; Halve with a 32-bit shift: height is a C int and only r3d is used as
    ; the counter. A 64-bit shift could move an undefined bit 32 into r3d.
    sar         r3d, 1                  ; number of row pairs
    lea          r4, [r2*2]             ; step for two rows
.nextrow:
    BIWEIGHT_STEPA 0, 1, 0              ; row 0 -> m0
    BIWEIGHT_STEPA 1, 2, r2             ; row 1 -> m1
    BIWEIGHT_STEPB
    movh       [r0], m0                 ; store row 0
%if mmsize == 16
    movhps  [r0+r2], m0                 ; store row 1 from the high quadword
%else
    psrlq        m0, 32                 ; bring row 1 down to the low dword
    movh    [r0+r2], m0
%endif
    add          r0, r4
    add          r1, r4
    dec         r3d
    jnz .nextrow
    REP_RET
%endmacro

; 4-wide rows with MMX registers, 8-wide rows with XMM registers
INIT_MMX mmxext
BIWEIGHT_FUNC_HALF_MM 4, 0
INIT_XMM sse2
BIWEIGHT_FUNC_HALF_MM 8, 8
266
;-----------------------------------------------------------------------------
; BIWEIGHT_SSSE3_OP
; Weights, rounds, shifts and packs two registers of interleaved pixel bytes.
; In:   m0, m2 = byte pairs (pixel from r0, pixel from r1)
;       m4 = matching weight byte pairs, m5 = rounding/offset words,
;       m6 = shift count
; Out:  m0 = packed result bytes; low half from m0, high half from m2
; Clobbers: m2
;-----------------------------------------------------------------------------
%macro BIWEIGHT_SSSE3_OP 0
    pmaddubsw    m0, m4                 ; multiply each byte pair, add the pair
    paddsw       m0, m5
    psraw        m0, m6

    pmaddubsw    m2, m4
    paddsw       m2, m5
    psraw        m2, m6

    packuswb     m0, m2                 ; clamp to 0..255
%endmacro
276
;-----------------------------------------------------------------------------
; h264_biweight_16 (SSSE3): bi-directional weighted prediction, 16 pixels per
; row, using pmaddubsw on interleaved dst/src bytes.
; Registers: r0 = dst, r1 = src, r2 = stride, r3d = height (loop counter).
; NOTE(review): punpcklbw takes [r1] as a full-width memory operand and the
; result is stored with mova, so both pointers are presumably expected to be
; 16-byte aligned. Confirm against the callers.
;-----------------------------------------------------------------------------
INIT_XMM ssse3
cglobal h264_biweight_16, 7, 8, 8
    BIWEIGHT_SETUP
    movifnidn   r3d, r3m                ; (re)load height after the setup

.row:
    ; pixels 0..7: interleave dst and src bytes
    movh         m0, [r0]
    punpcklbw    m0, [r1]
    ; pixels 8..15
    movh         m2, [r0+8]
    movh         m3, [r1+8]
    punpcklbw    m2, m3
    BIWEIGHT_SSSE3_OP
    mova       [r0], m0
    ; next row in both buffers
    add          r0, r2
    add          r1, r2
    dec         r3d
    jnz .row
    REP_RET
295
;-----------------------------------------------------------------------------
; h264_biweight_8 (SSSE3): bi-directional weighted prediction, 8 pixels per
; row, two rows per loop iteration.
; Registers: r0 = dst, r1 = src, r2 = stride, r3d = height (halved into a
;            pair counter), r4 = 2*stride (reused once BIWEIGHT_SETUP has
;            copied the shift count into m6).
;-----------------------------------------------------------------------------
INIT_XMM ssse3
cglobal h264_biweight_8, 7, 8, 8
    BIWEIGHT_SETUP
    movifnidn   r3d, r3m                ; (re)load height after the setup
    ; Halve with a 32-bit shift: height is a C int and only r3d is used as
    ; the counter. A 64-bit shift could move an undefined bit 32 into r3d.
    sar         r3d, 1                  ; number of row pairs
    lea          r4, [r2*2]             ; step for two rows

.nextrow:
    movh         m0, [r0]               ; row 0: dst
    movh         m1, [r1]               ; row 0: src
    movh         m2, [r0+r2]            ; row 1: dst
    movh         m3, [r1+r2]            ; row 1: src
    punpcklbw    m0, m1                 ; interleave dst/src bytes per row
    punpcklbw    m2, m3
    BIWEIGHT_SSSE3_OP
    movh       [r0], m0                 ; store row 0
    movhps  [r0+r2], m0                 ; store row 1 from the high quadword
    add          r0, r4
    add          r1, r4
    dec         r3d
    jnz .nextrow
    REP_RET