; Imported Debian version 2.4.3~trusty1
; deb_ffmpeg.git / ffmpeg / libavcodec / x86 / rv34dsp.asm
;******************************************************************************
;* MMX/SSE2-optimized functions for the RV30 and RV40 decoders
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

22%include "libavutil/x86/x86util.asm"
23
24SECTION_RODATA
25pw_row_coeffs: times 4 dw 13
26 times 4 dw 17
27 times 4 dw 7
28pd_512: times 2 dd 0x200
29pw_col_coeffs: dw 13, 13, 13, -13
30 dw 17, 7, 7, -17
31 dw 13, -13, 13, 13
32 dw -7, 17, -17, -7
33
34SECTION .text
35
36%macro IDCT_DC_NOROUND 1
37 imul %1, 13*13*3
38 sar %1, 11
39%endmacro
40
41%macro IDCT_DC_ROUND 1
42 imul %1, 13*13
43 add %1, 0x200
44 sar %1, 10
45%endmacro
46
47%macro rv34_idct 1
48cglobal rv34_idct_%1, 1, 2, 0
49 movsx r1, word [r0]
50 IDCT_DC r1
51 movd m0, r1d
52 pshufw m0, m0, 0
53 movq [r0+ 0], m0
54 movq [r0+ 8], m0
55 movq [r0+16], m0
56 movq [r0+24], m0
57 REP_RET
58%endmacro
59
60INIT_MMX mmxext
61%define IDCT_DC IDCT_DC_ROUND
62rv34_idct dc
63%define IDCT_DC IDCT_DC_NOROUND
64rv34_idct dc_noround
65
66; ff_rv34_idct_dc_add_mmx(uint8_t *dst, int stride, int dc);
67INIT_MMX mmx
68cglobal rv34_idct_dc_add, 3, 3
69 ; calculate DC
70 IDCT_DC_ROUND r2
71 pxor m1, m1
72 movd m0, r2d
73 psubw m1, m0
74 packuswb m0, m0
75 packuswb m1, m1
76 punpcklbw m0, m0
77 punpcklbw m1, m1
78 punpcklwd m0, m0
79 punpcklwd m1, m1
80
81 ; add DC
82 lea r2, [r0+r1*2]
83 movh m2, [r0]
84 movh m3, [r0+r1]
85 movh m4, [r2]
86 movh m5, [r2+r1]
87 paddusb m2, m0
88 paddusb m3, m0
89 paddusb m4, m0
90 paddusb m5, m0
91 psubusb m2, m1
92 psubusb m3, m1
93 psubusb m4, m1
94 psubusb m5, m1
95 movh [r0], m2
96 movh [r0+r1], m3
97 movh [r2], m4
98 movh [r2+r1], m5
99 RET
100
101; Load coeffs and perform row transform
102; Output: coeffs in mm[0467], rounder in mm5
103%macro ROW_TRANSFORM 1
104 pxor mm7, mm7
105 mova mm0, [%1+ 0*8]
106 mova mm1, [%1+ 1*8]
107 mova mm2, [%1+ 2*8]
108 mova mm3, [%1+ 3*8]
109 mova [%1+ 0*8], mm7
110 mova [%1+ 1*8], mm7
111 mova [%1+ 2*8], mm7
112 mova [%1+ 3*8], mm7
113 mova mm4, mm0
114 mova mm6, [pw_row_coeffs+ 0]
115 paddsw mm0, mm2 ; b0 + b2
116 psubsw mm4, mm2 ; b0 - b2
117 pmullw mm0, mm6 ; *13 = z0
118 pmullw mm4, mm6 ; *13 = z1
119 mova mm5, mm1
120 pmullw mm1, [pw_row_coeffs+ 8] ; b1*17
121 pmullw mm5, [pw_row_coeffs+16] ; b1* 7
122 mova mm7, mm3
123 pmullw mm3, [pw_row_coeffs+ 8] ; b3*17
124 pmullw mm7, [pw_row_coeffs+16] ; b3* 7
125 paddsw mm1, mm7 ; z3 = b1*17 + b3* 7
126 psubsw mm5, mm3 ; z2 = b1* 7 - b3*17
127 mova mm7, mm0
128 mova mm6, mm4
129 paddsw mm0, mm1 ; z0 + z3
130 psubsw mm7, mm1 ; z0 - z3
131 paddsw mm4, mm5 ; z1 + z2
132 psubsw mm6, mm5 ; z1 - z2
133 mova mm5, [pd_512] ; 0x200
134%endmacro
135
136; ff_rv34_idct_add_mmxext(uint8_t *dst, ptrdiff_t stride, int16_t *block);
137%macro COL_TRANSFORM 4
138 pshufw mm3, %2, 0xDD ; col. 1,3,1,3
139 pshufw %2, %2, 0x88 ; col. 0,2,0,2
140 pmaddwd %2, %3 ; 13*c0+13*c2 | 13*c0-13*c2 = z0 | z1
141 pmaddwd mm3, %4 ; 17*c1+ 7*c3 | 7*c1-17*c3 = z3 | z2
142 paddd %2, mm5
143 pshufw mm1, %2, 01001110b ; z1 | z0
144 pshufw mm2, mm3, 01001110b ; z2 | z3
145 paddd %2, mm3 ; z0+z3 | z1+z2
146 psubd mm1, mm2 ; z1-z2 | z0-z3
147 movd mm3, %1
148 psrad %2, 10
149 pxor mm2, mm2
150 psrad mm1, 10
151 punpcklbw mm3, mm2
152 packssdw %2, mm1
153 paddw %2, mm3
154 packuswb %2, %2
155 movd %1, %2
156%endmacro
157INIT_MMX mmxext
158cglobal rv34_idct_add, 3,3,0, d, s, b
159 ROW_TRANSFORM bq
160 COL_TRANSFORM [dq], mm0, [pw_col_coeffs+ 0], [pw_col_coeffs+ 8]
161 mova mm0, [pw_col_coeffs+ 0]
162 COL_TRANSFORM [dq+sq], mm4, mm0, [pw_col_coeffs+ 8]
163 mova mm4, [pw_col_coeffs+ 8]
164 lea dq, [dq + 2*sq]
165 COL_TRANSFORM [dq], mm6, mm0, mm4
166 COL_TRANSFORM [dq+sq], mm7, mm0, mm4
167 ret
168
169; ff_rv34_idct_dc_add_sse4(uint8_t *dst, int stride, int dc);
170INIT_XMM sse4
171cglobal rv34_idct_dc_add, 3, 3, 6
172 ; load data
173 IDCT_DC_ROUND r2
174 pxor m1, m1
175
176 ; calculate DC
177 movd m0, r2d
178 lea r2, [r0+r1*2]
179 movd m2, [r0]
180 movd m3, [r0+r1]
181 pshuflw m0, m0, 0
182 movd m4, [r2]
183 movd m5, [r2+r1]
184 punpcklqdq m0, m0
185 punpckldq m2, m3
186 punpckldq m4, m5
187 punpcklbw m2, m1
188 punpcklbw m4, m1
189 paddw m2, m0
190 paddw m4, m0
191 packuswb m2, m4
192 movd [r0], m2
193 pextrd [r0+r1], m2, 1
194 pextrd [r2], m2, 2
195 pextrd [r2+r1], m2, 3
196 RET