Imported Debian version 2.5.0~trusty1.1
[deb_ffmpeg.git] / ffmpeg / libavcodec / x86 / idctdsp.asm
CommitLineData
2ba45a60
DM
1;******************************************************************************
2;* SIMD-optimized IDCT-related routines
3;* Copyright (c) 2008 Loren Merritt
4;* Copyright (c) 2003-2013 Michael Niedermayer
5;* Copyright (c) 2013 Daniel Kang
6;*
7;* This file is part of FFmpeg.
8;*
9;* FFmpeg is free software; you can redistribute it and/or
10;* modify it under the terms of the GNU Lesser General Public
11;* License as published by the Free Software Foundation; either
12;* version 2.1 of the License, or (at your option) any later version.
13;*
14;* FFmpeg is distributed in the hope that it will be useful,
15;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17;* Lesser General Public License for more details.
18;*
19;* You should have received a copy of the GNU Lesser General Public
20;* License along with FFmpeg; if not, write to the Free Software
21;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22;******************************************************************************
23
24%include "libavutil/x86/x86util.asm"
25
26SECTION_RODATA
27
28cextern pb_80
29
30SECTION_TEXT
31
;--------------------------------------------------------------------------
; void ff_put_signed_pixels_clamped(const int16_t *block, uint8_t *pixels,
;                                   ptrdiff_t line_size);
;--------------------------------------------------------------------------
36
; PUT_SIGNED_PIXELS_CLAMPED_HALF <block byte offset>
;
; Emits the code for four 8-byte output rows.  Reads 64 bytes of 16-bit
; coefficients starting at blockq+%1, narrows them to bytes with signed
; saturation (packsswb), adds the byte vector held in m0 to every result
; (paddb, wrapping) and stores one 8-byte row at each of
; pixelsq + {0, 1, 2} * lsizeq and pixelsq + lsize3q.
;
; Expected from the invoking macro:
;   m0      = bias vector, loaded from pb_80 by PUT_SIGNED_PIXELS_CLAMPED
;             (presumably 0x80 in every byte, going by the name; pb_80 is
;             defined outside this file)
;   lsize3q = lsizeq * 3
; Overwrites m1-m4 when mmsize == 8, m1-m2 otherwise.
; NOTE(review): mova and the memory-operand packs presumably require the
; block to be mmsize-aligned - confirm against callers.
%macro PUT_SIGNED_PIXELS_CLAMPED_HALF 1
%if mmsize == 8
    ; 8-byte registers: one register per output row.
    mova        m1, [blockq + %1 + mmsize*0]
    mova        m2, [blockq + %1 + mmsize*2]
    mova        m3, [blockq + %1 + mmsize*4]
    mova        m4, [blockq + %1 + mmsize*6]
    packsswb    m1, [blockq + %1 + mmsize*1]
    packsswb    m2, [blockq + %1 + mmsize*3]
    packsswb    m3, [blockq + %1 + mmsize*5]
    packsswb    m4, [blockq + %1 + mmsize*7]
    paddb       m1, m0
    paddb       m2, m0
    paddb       m3, m0
    paddb       m4, m0
    movq        [pixelsq+lsizeq*0], m1
    movq        [pixelsq+lsizeq*1], m2
    movq        [pixelsq+lsizeq*2], m3
    movq        [pixelsq+lsize3q ], m4
%else
    ; 16-byte registers: each register carries two output rows, the first
    ; in its low half (stored with movq) and the second in its high half
    ; (stored with movhps).
    mova        m1, [blockq + %1 + mmsize*0]
    mova        m2, [blockq + %1 + mmsize*2]
    packsswb    m1, [blockq + %1 + mmsize*1]
    packsswb    m2, [blockq + %1 + mmsize*3]
    paddb       m1, m0
    paddb       m2, m0
    movq        [pixelsq+lsizeq*0], m1
    movhps      [pixelsq+lsizeq*1], m1
    movq        [pixelsq+lsizeq*2], m2
    movhps      [pixelsq+lsize3q ], m2
%endif
%endmacro
66
; PUT_SIGNED_PIXELS_CLAMPED <XMM register count passed on to cglobal>
;
; Defines put_signed_pixels_clamped(block, pixels, lsize) for the
; instruction set selected by the preceding INIT_* line.  The function
; consumes 128 bytes of coefficients from block and writes eight 8-byte
; rows to pixels, lsize bytes apart, in two halves of four rows each.
;
; cglobal arguments (x86inc convention): 3 function arguments, 4 general
; purpose registers (the fourth is the lsize3 scratch), %1 vector registers.
%macro PUT_SIGNED_PIXELS_CLAMPED 1
cglobal put_signed_pixels_clamped, 3, 4, %1, block, pixels, lsize, lsize3
    lea         lsize3q, [lsizeq+lsizeq*2]  ; lsize3 = 3 * lsize, row 3 of each half
    mova        m0, [pb_80]                 ; bias added to every packed byte
    PUT_SIGNED_PIXELS_CLAMPED_HALF 0        ; rows 0-3: block bytes 0..63
    lea         pixelsq, [pixelsq+lsizeq*4] ; advance the destination by four rows
    PUT_SIGNED_PIXELS_CLAMPED_HALF 64       ; rows 4-7: block bytes 64..127
    RET
%endmacro
76
; Instantiate put_signed_pixels_clamped once per instruction set.
; The macro argument is the vector register count given to cglobal.

; 8-byte (MMX) registers: zero XMM registers are requested.
INIT_MMX mmx
PUT_SIGNED_PIXELS_CLAMPED 0

; 16-byte (SSE2) registers: the body uses m0-m2, so three are requested.
INIT_XMM sse2
PUT_SIGNED_PIXELS_CLAMPED 3
81
82;--------------------------------------------------------------------------
83; void ff_put_pixels_clamped(const int16_t *block, uint8_t *pixels,
84; ptrdiff_t line_size);
85;--------------------------------------------------------------------------
; PUT_PIXELS_CLAMPED_HALF <block byte offset>
;
; Emits the code for four 8-byte output rows.  Reads 64 bytes of 16-bit
; coefficients starting at blockq+%1, narrows them to bytes with unsigned
; saturation (packuswb) and stores one 8-byte row at each of
; pixelsq + {0, 1, 2} * lsizeq and pixelsq + lsize3q.
;
; Expected from the invoking macro:
;   lsize3q = lsizeq * 3
; Overwrites m0-m3 when mmsize == 8, m0-m1 otherwise.
%macro PUT_PIXELS_CLAMPED_HALF 1
%if mmsize == 8
    ; 8-byte registers: one register per output row.
    mova        m0, [blockq + %1 + mmsize*0]
    mova        m1, [blockq + %1 + mmsize*2]
    mova        m2, [blockq + %1 + mmsize*4]
    mova        m3, [blockq + %1 + mmsize*6]
    packuswb    m0, [blockq + %1 + mmsize*1]
    packuswb    m1, [blockq + %1 + mmsize*3]
    packuswb    m2, [blockq + %1 + mmsize*5]
    packuswb    m3, [blockq + %1 + mmsize*7]
    movq        [pixelsq], m0
    movq        [lsizeq+pixelsq], m1
    movq        [2*lsizeq+pixelsq], m2
    movq        [lsize3q+pixelsq], m3
%else
    ; 16-byte registers: each register carries two output rows, the first
    ; in its low half (stored with movq) and the second in its high half
    ; (stored with movhps).
    mova        m0, [blockq + %1 + mmsize*0]
    mova        m1, [blockq + %1 + mmsize*2]
    packuswb    m0, [blockq + %1 + mmsize*1]
    packuswb    m1, [blockq + %1 + mmsize*3]
    movq        [pixelsq], m0
    movhps      [lsizeq+pixelsq], m0
    movq        [2*lsizeq+pixelsq], m1
    movhps      [lsize3q+pixelsq], m1
%endif
%endmacro
110
; PUT_PIXELS_CLAMPED
;
; Defines put_pixels_clamped(block, pixels, lsize) for the instruction set
; selected by the preceding INIT_* line.  The function consumes 128 bytes
; of coefficients from block and writes eight 8-byte rows to pixels, lsize
; bytes apart, in two halves of four rows each.
;
; cglobal arguments (x86inc convention): 3 function arguments, 4 general
; purpose registers (the fourth is the lsize3 scratch), 2 vector registers.
%macro PUT_PIXELS_CLAMPED 0
cglobal put_pixels_clamped, 3, 4, 2, block, pixels, lsize, lsize3
    lea         lsize3q, [lsizeq+lsizeq*2]  ; lsize3 = 3 * lsize, row 3 of each half
    PUT_PIXELS_CLAMPED_HALF 0               ; rows 0-3: block bytes 0..63
    lea         pixelsq, [pixelsq+lsizeq*4] ; advance the destination by four rows
    PUT_PIXELS_CLAMPED_HALF 64              ; rows 4-7: block bytes 64..127
    RET
%endmacro
119
; Instantiate put_pixels_clamped once per instruction set.

; 8-byte (MMX) registers.
INIT_MMX mmx
PUT_PIXELS_CLAMPED

; 16-byte (SSE2) registers.
INIT_XMM sse2
PUT_PIXELS_CLAMPED
124
125;--------------------------------------------------------------------------
126; void ff_add_pixels_clamped(const int16_t *block, uint8_t *pixels,
127; ptrdiff_t line_size);
128;--------------------------------------------------------------------------
; ADD_PIXELS_CLAMPED <block byte offset>      (one-parameter overload)
;
; Emits the code for two 8-byte rows.  Loads the rows at pixelsq and
; pixelsq+lsizeq, widens their bytes to words by interleaving with m4,
; adds the 32 bytes of 16-bit coefficients at blockq+%1 with signed
; saturation (paddsw), narrows back to bytes with unsigned saturation
; (packuswb) and writes both rows back in place.
;
; Expected from the invoking macro:
;   m4 = all zero (set with pxor by the zero-parameter ADD_PIXELS_CLAMPED)
; Overwrites m0-m3 and m5-m7 when mmsize == 8, m0-m3 otherwise.
%macro ADD_PIXELS_CLAMPED 1
%if mmsize == 8
    ; 8-byte registers: each row of coefficients spans two registers
    ; (m0/m1 for the first row, m5/m6 for the second).
    mova        m0, [blockq + %1 + mmsize*0]
    mova        m1, [blockq + %1 + mmsize*1]
    mova        m5, [blockq + %1 + mmsize*2]
    mova        m6, [blockq + %1 + mmsize*3]
    movq        m2, [pixelsq]
    movq        m3, [pixelsq+lsizeq]
    mova        m7, m2                  ; keep a copy for the high four pixels
    punpcklbw   m2, m4                  ; first row, pixels 0-3 as words
    punpckhbw   m7, m4                  ; first row, pixels 4-7 as words
    paddsw      m0, m2
    paddsw      m1, m7
    mova        m7, m3                  ; same again for the second row
    punpcklbw   m3, m4
    punpckhbw   m7, m4
    paddsw      m5, m3
    paddsw      m6, m7
    packuswb    m0, m1                  ; first row back to 8 bytes
    packuswb    m5, m6                  ; second row back to 8 bytes
    movq        [pixelsq], m0
    movq        [pixelsq+lsizeq], m5
%else
    ; 16-byte registers: one register per row of coefficients; both result
    ; rows end up in m0 (first row low half, second row high half).
    mova        m0, [blockq + %1 + mmsize*0]
    mova        m1, [blockq + %1 + mmsize*1]
    movq        m2, [pixelsq]
    movq        m3, [pixelsq+lsizeq]
    punpcklbw   m2, m4                  ; first row, 8 pixels as words
    punpcklbw   m3, m4                  ; second row, 8 pixels as words
    paddsw      m0, m2
    paddsw      m1, m3
    packuswb    m0, m1
    movq        [pixelsq], m0
    movhps      [pixelsq+lsizeq], m0
%endif
%endmacro
166
; ADD_PIXELS_CLAMPED                          (zero-parameter overload)
;
; Defines add_pixels_clamped(block, pixels, lsize) for the instruction set
; selected by the preceding INIT_* line.  The function adds 128 bytes of
; 16-bit coefficients from block onto eight 8-byte rows at pixels, lsize
; bytes apart, two rows per step.
;
; This shares its name with the one-parameter macro that emits a single
; two-row step; NASM picks the overload by the number of arguments.
;
; cglobal arguments (x86inc convention): 3 function arguments, 3 general
; purpose registers, 5 vector registers.
%macro ADD_PIXELS_CLAMPED 0
cglobal add_pixels_clamped, 3, 3, 5, block, pixels, lsize
    pxor        m4, m4                      ; zero source for the byte->word unpacks
%assign %%pair_offset 0
%rep 4
%if %%pair_offset != 0
    lea         pixelsq, [pixelsq+lsizeq*2] ; move down to the next pair of rows
%endif
    ADD_PIXELS_CLAMPED %%pair_offset        ; block bytes +0, +32, +64, +96
%assign %%pair_offset %%pair_offset+32
%endrep
    RET
%endmacro
179
; Instantiate add_pixels_clamped once per instruction set.

; 8-byte (MMX) registers.
INIT_MMX mmx
ADD_PIXELS_CLAMPED

; 16-byte (SSE2) registers.
INIT_XMM sse2
ADD_PIXELS_CLAMPED