; Imported Debian version 2.4.3~trusty1
; [deb_ffmpeg.git] / ffmpeg / libavcodec / x86 / ttadsp.asm
; CommitLineData
; 2ba45a60
; DM
;******************************************************************************
;* TTA DSP SIMD optimizations
;*
;* Copyright (C) 2014 James Almer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; Per-lane constants applied to (dl[4..7] >> 30) when the new dx[4..7] are
; built: each lane is OR-ed with pd_1224 and then AND-ed with pd_n0113.
; Both tables are used as direct memory operands of por/pand.
; NOTE(review): that relies on SECTION_RODATA providing 16-byte alignment --
; confirm against x86inc.
pd_n0113: dd ~0, ~1, ~1, ~3
pd_1224:  dd  1,  2,  2,  4

SECTION .text

;------------------------------------------------------------------------------
; TTA_FILTER cpu, xmm_regs
;
; Emits one build of ttafilter_process_dec.
;   %1 - instruction-set name handed to INIT_XMM
;   %2 - XMM register count handed to cglobal
;
; Arguments, in cglobal order: qm, dx, dl, error, in, shift, round
;   qm, dx, dl - pointers to 8 dwords each, read and written back. Every
;                access uses mova, so they are presumably required to be
;                16-byte aligned -- confirm against the caller.
;   error, in  - pointers to one dword each; both are read and written.
;   shift      - arithmetic right-shift count applied to the sum (shiftm).
;   round      - value the sum is seeded with (roundm).
;
; Register roles:
;   m2:m3 qm[0..7], later the products   m4:m5 dx[0..7]
;   m0:m1 dl[0..7] once loaded           m6    *error splat, later the sum
;   m7    scratch, non-SSE4 multiply only
;
; The pseudo-C in the comments is written in terms of the argument names.
;------------------------------------------------------------------------------
%macro TTA_FILTER 2
INIT_XMM %1
cglobal ttafilter_process_dec, 5,5,%2, qm, dx, dl, error, in, shift, round
    mova       m2, [qmq       ]
    mova       m3, [qmq + 0x10]
    mova       m4, [dxq       ]
    mova       m5, [dxq + 0x10]

    ; Weight adaptation. psignd negates, zeroes or keeps each dx[i] depending
    ; on the sign of the matching lane of m6, so a single paddd covers the
    ; "< 0", "== 0" and "> 0" cases. SPLATD (from x86util.asm) is expected to
    ; replicate *error into every lane.
    movd       m6, [errorq]         ; if (*error < 0) {
    SPLATD     m6                   ;     for (i = 0; i < 8; i++)
    psignd     m0, m4, m6           ;         qm[i] -= dx[i];
    psignd     m1, m5, m6           ; } else if (*error > 0) {
    paddd      m2, m0               ;     for (i = 0; i < 8; i++)
    paddd      m3, m1               ;         qm[i] += dx[i];
    mova       [qmq       ], m2     ; }
    mova       [qmq + 0x10], m3     ;

    mova       m0, [dlq       ]
    mova       m1, [dlq + 0x10]

    ; m2:m3 = low 32 bits of dl[i] * qm[i] for i = 0..7.
%if cpuflag(sse4)
    pmulld     m2, m0
    pmulld     m3, m1
%else
    ; Without pmulld: pmuludq only multiplies lanes 0 and 2 (64-bit results),
    ; so even and odd lanes are multiplied separately and the low dwords are
    ; interleaved back into order. The low 32 bits of a product do not depend
    ; on signedness, so the unsigned multiply is sufficient.
    pshufd     m6, m0, 0xb1         ; swap lanes inside each pair: odd -> even
    pshufd     m7, m2, 0xb1
    pmuludq    m6, m7               ; products of lanes 1 and 3
    pshufd     m6, m6, 0xd8         ; their low dwords -> lanes 0-1
    pmuludq    m2, m0               ; products of lanes 0 and 2
    pshufd     m2, m2, 0xd8         ; their low dwords -> lanes 0-1
    punpckldq  m2, m6               ; m2 = { p0, p1, p2, p3 }

    pshufd     m6, m1, 0xb1         ; same again for elements 4..7
    pshufd     m7, m3, 0xb1
    pmuludq    m6, m7
    pshufd     m6, m6, 0xd8
    pmuludq    m3, m1
    pshufd     m3, m3, 0xd8
    punpckldq  m3, m6               ; m3 = { p4, p5, p6, p7 }
%endif
    ; Using horizontal add (phaddd) seems to be slower than shuffling stuff around
    paddd      m2, m3               ; int sum = round +
                                    ;           dl[0] * qm[0] +
    pshufd     m3, m2, 0xe          ;           dl[1] * qm[1] +
    paddd      m2, m3               ;           dl[2] * qm[2] +
                                    ;           dl[3] * qm[3] +
    movd       m6, roundm           ;           dl[4] * qm[4] +
    paddd      m6, m2               ;           dl[5] * qm[5] +
    pshufd     m2, m2, 0x1          ;           dl[6] * qm[6] +
    paddd      m6, m2               ;           dl[7] * qm[7];
                                    ; (only lane 0 of m6 is meaningful)

    ; Slide both histories down by one element across the 16-byte boundary.
    palignr    m5, m4, 4            ; dx[0] = dx[1]; dx[1] = dx[2];
                                    ; dx[2] = dx[3]; dx[3] = dx[4];

    palignr    m2, m1, m0, 4        ; dl[0] = dl[1]; dl[1] = dl[2];
                                    ; dl[2] = dl[3]; dl[3] = dl[4];

    ; New dx[4..7] come from the not-yet-updated dl[4..7] still held in m1.
    psrad      m4, m1, 30           ; dx[4] = ((dl[4] >> 30) | 1);
    por        m4, [pd_1224 ]       ; dx[5] = ((dl[5] >> 30) | 2) & ~1;
    pand       m4, [pd_n0113]       ; dx[6] = ((dl[6] >> 30) | 2) & ~1;
                                    ; dx[7] = ((dl[7] >> 30) | 4) & ~3;

    mova       [dlq       ], m2
    mova       [dxq       ], m5
    mova       [dxq + 0x10], m4
    movd       m0, [inq]            ; *error = *in;
    movd       [errorq], m0         ;

    movd       m2, shiftm           ; *in += (sum >> shift);
    psrad      m6, m2               ;
    paddd      m0, m6               ; lanes 1-3 of m0 are junk from here on
    movd       [inq], m0            ;

    ; Rebuild dl[4..7] from the old dl[5..7] (m1) and the new *in (m0 lane 0):
    ;     dl[4] = -dl[5];
    ;     dl[5] = -dl[6];
    ;     dl[6] = *in - dl[7];
    ;     dl[7] = *in;
    ;     dl[5] += dl[6];
    ;     dl[4] += dl[5];
    ; Lane contents are listed low lane first; dlN is the old dl[N].
    psrldq     m1, 4                ; m1 = { dl5, dl6, dl7, 0 }
    pslldq     m0, 12               ; m0 = { 0, 0, 0, *in } (junk lanes dropped)
    pshufd     m0, m0, 0xf0         ; m0 = { 0, 0, *in, *in }
    psubd      m0, m1               ; m0 = { -dl5, -dl6, *in - dl7, *in }
    psrldq     m1, m0, 4            ; m1 = { -dl6, *in - dl7, *in, 0 }
    pshufd     m1, m1, 0xf4         ; m1 = { -dl6, *in - dl7, 0, 0 }
    paddd      m0, m1               ; lanes 0 and 1 each gain their neighbour
    psrldq     m1, 4                ; m1 = { *in - dl7, 0, 0, 0 }
    paddd      m0, m1               ; lane 0 gains the final term
    mova       [dlq + 0x10], m0     ;
    RET
%endmacro

; Second argument is the XMM register count passed on to cglobal.
; The SSSE3 build takes the pmuludq multiply path, which uses m7 as scratch,
; so it needs 8 registers; the SSE4 build uses pmulld and stops at m6.
TTA_FILTER ssse3, 8
TTA_FILTER sse4,  7