Commit | Line | Data |
---|---|---|
2ba45a60 DM |
1 | ;***************************************************************************** |
2 | ;* SIMD-optimized MPEG encoding functions | |
3 | ;***************************************************************************** | |
4 | ;* Copyright (c) 2000, 2001 Fabrice Bellard | |
5 | ;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> | |
6 | ;* | |
7 | ;* This file is part of FFmpeg. | |
8 | ;* | |
9 | ;* FFmpeg is free software; you can redistribute it and/or | |
10 | ;* modify it under the terms of the GNU Lesser General Public | |
11 | ;* License as published by the Free Software Foundation; either | |
12 | ;* version 2.1 of the License, or (at your option) any later version. | |
13 | ;* | |
14 | ;* FFmpeg is distributed in the hope that it will be useful, | |
15 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
16 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
17 | ;* Lesser General Public License for more details. | |
18 | ;* | |
19 | ;* You should have received a copy of the GNU Lesser General Public | |
20 | ;* License along with FFmpeg; if not, write to the Free Software | |
21 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
22 | ;***************************************************************************** | |
23 | ||
24 | %include "libavutil/x86/x86util.asm" | |
25 | ||
26 | SECTION_RODATA | |
27 | ||
28 | cextern pw_1 | |
29 | ||
30 | SECTION .text | |
;-----------------------------------------------------------------------------
; int ff_pix_sum16_mmx(uint8_t *pix, int line_size)
; (the same prototype applies to every variant instantiated below)
;
; Adds up all bytes of a block that is 16 bytes wide and returns the total
; in eax. With the loop counts used below, every variant covers 16 rows.
;
; Macro parameters:
;   %1 = number of xmm registers used (forwarded to cglobal)
;   %2 = number of loop iterations
;   %3 = number of GPRs used (forwarded to cglobal)
;   %4 = rows consumed per iteration; used as the scale on line_size when
;        the row pointer is advanced (ignored when mmsize == 8)
;
; Register roles:
;   r0 = current row pointer       r1 = line_size
;   r2 = iterations left           r3 = 3 * line_size (XOP only)
;   m4 = running total             m5 = zero (non-XOP only)
;
; NOTE(review): rows are read with mova, so the XMM variants presumably
; require 16-byte aligned rows -- confirm against the callers.
;-----------------------------------------------------------------------------
%macro PIX_SUM16 4
cglobal pix_sum16, 2, %3, %1
    movsxdifnidn r1, r1d                ; line_size arrives as an int; widen it for addressing
    pxor         m4, m4                 ; clear the running total
%if cpuflag(xop)
    lea          r3, [r1*3]             ; offset of the fourth row of each pass
%else
    pxor         m5, m5                 ; zero source for widening bytes to words
%endif
    mov          r2, %2
.next_rows:
%if cpuflag(xop)
    ; Four rows per pass; vphaddubq adds each group of 8 bytes into a qword.
    vphaddubq    m0, [r0]
    vphaddubq    m1, [r0+r1]
    vphaddubq    m2, [r0+r1*2]
    vphaddubq    m3, [r0+r3]
%else
    ; Two registers' worth of pixels per pass, widened into four registers
    ; of 16-bit lanes (m0/m2 from the first load, m1/m3 from the second).
    mova         m0, [r0]
    punpckhbw    m2, m0, m5
    punpcklbw    m0, m5
%if mmsize == 8
    mova         m1, [r0+8]             ; MMX: second half of the same row
%else
    mova         m1, [r0+r1]            ; XMM: the following row
%endif
    punpckhbw    m3, m1, m5
    punpcklbw    m1, m5
%endif ; cpuflag(xop)
    ; Fold the four partial registers into the running total.
    paddw        m3, m2
    paddw        m1, m0
    paddw        m3, m1
    paddw        m4, m3
    ; Step to the first row of the next pass (must precede dec: add writes flags).
%if mmsize == 8
    add          r0, r1
%else
    lea          r0, [r0+r1*%4]
%endif
    dec          r2
    jnz          .next_rows
%if cpuflag(xop)
    pshufd       m0, m4, q0032          ; bring the upper qword's partial sum down
    paddd        m4, m0                 ; low dword now holds the full total
%else
    ; x86util helper; presumably collapses the word lanes of m4 into its
    ; low dword using m5 as scratch -- see HADDW in x86util.asm.
    HADDW        m4, m5
%endif
    movd         eax, m4
    RET
%endmacro

; Instantiations. Arguments: xmm registers, iterations, GPRs, rows per pass.

; MMX: one 16-byte row per pass (two 8-byte loads), 16 passes.
INIT_MMX mmx
PIX_SUM16 0, 16, 3, 0
; SSE2: two rows per pass, 8 passes; uses m0-m5.
INIT_XMM sse2
PIX_SUM16 6, 8,  3, 2
; XOP: four rows per pass, 4 passes; needs r3 and uses m0-m4.
%if HAVE_XOP_EXTERNAL
INIT_XMM xop
PIX_SUM16 5, 4,  4, 4
%endif
92 | ||
;-----------------------------------------------------------------------------
; int ff_pix_norm1_mmx(uint8_t *pix, int line_size)
; (the same prototype applies to every variant instantiated below)
;
; Adds up the squares of all bytes of a block that is 16 bytes wide and
; returns the total in eax. With the loop counts used below, every variant
; covers 16 rows.
;
; Macro parameters:
;   %1 = number of xmm registers used (forwarded to cglobal)
;   %2 = number of loop iterations (one row per iteration when mmsize == 8,
;        two rows per iteration otherwise)
;
; Register roles:
;   r0 = current row pointer       r1 = line_size
;   r2 = iterations left
;   m0 = zero                      m5 = running total (32-bit lanes)
;
; NOTE(review): rows are read with mova, so the XMM variant presumably
; requires 16-byte aligned rows -- confirm against the callers.
;-----------------------------------------------------------------------------
%macro PIX_NORM1 2
cglobal pix_norm1, 2, 3, %1
    movsxdifnidn r1, r1d                ; line_size arrives as an int; widen it for addressing
    pxor         m5, m5                 ; clear the running total
    pxor         m0, m0                 ; zero source for widening bytes to words
    mov          r2, %2
.next_rows:
    ; First load: widen to 16-bit lanes in m1 (high half) and m2 (low half).
    mova         m2, [r0+0]
    punpckhbw    m1, m2, m0
    punpcklbw    m2, m0
    ; Second load: widen to 16-bit lanes in m4 (high half) and m3 (low half).
%if mmsize == 8
    mova         m3, [r0+8]             ; MMX: second half of the same row
%else
    mova         m3, [r0+r1]            ; XMM: the following row
%endif
    punpckhbw    m4, m3, m0
    punpcklbw    m3, m0
    ; pmaddwd of a register with itself: each dword = a*a + b*b for one
    ; pair of adjacent pixels.
    pmaddwd      m1, m1
    pmaddwd      m2, m2
    paddd        m2, m1
    pmaddwd      m3, m3
    pmaddwd      m4, m4
    paddd        m4, m3
    paddd        m5, m2
    paddd        m5, m4
    ; Step to the first row of the next pass (must precede dec: add writes flags).
%if mmsize == 8
    add          r0, r1
%else
    lea          r0, [r0+r1*2]
%endif
    dec          r2
    jnz          .next_rows
    ; x86util helper; presumably collapses the dword lanes of m5 into its
    ; low dword using m1 as scratch -- see HADDD in x86util.asm.
    HADDD        m5, m1
    movd         eax, m5
    RET
%endmacro

; Instantiations. Arguments: xmm registers, iterations.

; MMX: one 16-byte row per pass (two 8-byte loads), 16 passes.
INIT_MMX mmx
PIX_NORM1 0, 16
; SSE2: two rows per pass, 8 passes; uses m0-m5.
INIT_XMM sse2
PIX_NORM1 6, 8
137 |