;*****************************************************************************
;* SIMD-optimized MPEG encoding functions
;*****************************************************************************
;* Copyright (c) 2000, 2001 Fabrice Bellard
;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

cextern pw_1

SECTION .text
31 | ; int ff_pix_sum16_mmx(uint8_t *pix, int line_size) | |
f6fa7814 DM |
32 | ; %1 = number of loops |
33 | ; %2 = number of GPRs used | |
34 | %macro PIX_SUM16 3 | |
35 | cglobal pix_sum16, 2, %2, 6 | |
2ba45a60 | 36 | movsxdifnidn r1, r1d |
f6fa7814 DM |
37 | mov r2, %1 |
38 | %if mmsize == 16 | |
2ba45a60 | 39 | lea r3, [r1*3] |
f6fa7814 DM |
40 | %endif |
41 | %if notcpuflag(xop) | |
2ba45a60 DM |
42 | pxor m5, m5 |
43 | %endif | |
44 | pxor m4, m4 | |
45 | .loop: | |
46 | %if cpuflag(xop) | |
47 | vphaddubq m0, [r0] | |
48 | vphaddubq m1, [r0+r1] | |
49 | vphaddubq m2, [r0+r1*2] | |
50 | vphaddubq m3, [r0+r3] | |
51 | %else | |
52 | mova m0, [r0] | |
53 | %if mmsize == 8 | |
54 | mova m1, [r0+8] | |
f6fa7814 DM |
55 | %if cpuflag(mmxext) |
56 | mova m2, [r0+r1] | |
57 | mova m3, [r0+r1+8] | |
58 | %endif | |
59 | %else ; sse2 | |
2ba45a60 | 60 | mova m1, [r0+r1] |
f6fa7814 DM |
61 | mova m2, [r0+r1*2] |
62 | mova m3, [r0+r3] | |
2ba45a60 | 63 | %endif |
f6fa7814 DM |
64 | %if cpuflag(mmxext) |
65 | psadbw m0, m5 | |
66 | psadbw m1, m5 | |
67 | psadbw m2, m5 | |
68 | psadbw m3, m5 | |
69 | %else ; mmx | |
2ba45a60 DM |
70 | punpckhbw m2, m0, m5 |
71 | punpcklbw m0, m5 | |
72 | punpckhbw m3, m1, m5 | |
73 | punpcklbw m1, m5 | |
f6fa7814 | 74 | %endif ; cpuflag(mmxext) |
2ba45a60 DM |
75 | %endif ; cpuflag(xop) |
76 | paddw m1, m0 | |
77 | paddw m3, m2 | |
78 | paddw m3, m1 | |
79 | paddw m4, m3 | |
f6fa7814 DM |
80 | %if cpuflag(mmxext) |
81 | lea r0, [r0+r1*%3] | |
2ba45a60 | 82 | %else |
f6fa7814 | 83 | add r0, r1 |
2ba45a60 DM |
84 | %endif |
85 | dec r2 | |
86 | jne .loop | |
f6fa7814 | 87 | %if mmsize == 16 |
2ba45a60 DM |
88 | pshufd m0, m4, q0032 |
89 | paddd m4, m0 | |
f6fa7814 | 90 | %elif notcpuflag(mmxext) |
2ba45a60 DM |
91 | HADDW m4, m5 |
92 | %endif | |
93 | movd eax, m4 | |
94 | RET | |
95 | %endmacro | |
96 | ||
f6fa7814 | 97 | %if ARCH_X86_32 |
2ba45a60 | 98 | INIT_MMX mmx |
f6fa7814 DM |
99 | PIX_SUM16 16, 3, 0 |
100 | INIT_MMX mmxext | |
101 | PIX_SUM16 8, 4, 2 | |
102 | %endif | |
2ba45a60 | 103 | INIT_XMM sse2 |
f6fa7814 | 104 | PIX_SUM16 4, 4, 4 |
2ba45a60 DM |
105 | %if HAVE_XOP_EXTERNAL |
106 | INIT_XMM xop | |
f6fa7814 | 107 | PIX_SUM16 4, 4, 4 |
2ba45a60 DM |
108 | %endif |
109 | ||
110 | ; int ff_pix_norm1_mmx(uint8_t *pix, int line_size) | |
111 | ; %1 = number of xmm registers used | |
112 | ; %2 = number of loops | |
113 | %macro PIX_NORM1 2 | |
114 | cglobal pix_norm1, 2, 3, %1 | |
115 | movsxdifnidn r1, r1d | |
116 | mov r2, %2 | |
117 | pxor m0, m0 | |
118 | pxor m5, m5 | |
119 | .loop: | |
120 | mova m2, [r0+0] | |
121 | %if mmsize == 8 | |
122 | mova m3, [r0+8] | |
123 | %else | |
124 | mova m3, [r0+r1] | |
125 | %endif | |
126 | punpckhbw m1, m2, m0 | |
127 | punpcklbw m2, m0 | |
128 | punpckhbw m4, m3, m0 | |
129 | punpcklbw m3, m0 | |
130 | pmaddwd m1, m1 | |
131 | pmaddwd m2, m2 | |
132 | pmaddwd m3, m3 | |
133 | pmaddwd m4, m4 | |
134 | paddd m2, m1 | |
135 | paddd m4, m3 | |
136 | paddd m5, m2 | |
137 | paddd m5, m4 | |
138 | %if mmsize == 8 | |
139 | add r0, r1 | |
140 | %else | |
141 | lea r0, [r0+r1*2] | |
142 | %endif | |
143 | dec r2 | |
144 | jne .loop | |
145 | HADDD m5, m1 | |
146 | movd eax, m5 | |
147 | RET | |
148 | %endmacro | |
149 | ||
150 | INIT_MMX mmx | |
151 | PIX_NORM1 0, 16 | |
152 | INIT_XMM sse2 | |
153 | PIX_NORM1 6, 8 | |
154 |