Commit | Line | Data |
---|---|---|
2ba45a60 DM |
1 | /* |
2 | * SIMD-optimized halfpel functions are compiled twice for rnd/no_rnd | |
3 | * Copyright (c) 2000, 2001 Fabrice Bellard | |
4 | * Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at> | |
5 | * | |
6 | * MMX optimization by Nick Kurshev <nickols_k@mail.ru> | |
7 | * mostly rewritten by Michael Niedermayer <michaelni@gmx.at> | |
8 | * and improved by Zdenek Kabelac <kabi@users.sf.net> | |
9 | * | |
10 | * This file is part of FFmpeg. | |
11 | * | |
12 | * FFmpeg is free software; you can redistribute it and/or | |
13 | * modify it under the terms of the GNU Lesser General Public | |
14 | * License as published by the Free Software Foundation; either | |
15 | * version 2.1 of the License, or (at your option) any later version. | |
16 | * | |
17 | * FFmpeg is distributed in the hope that it will be useful, | |
18 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
19 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
20 | * Lesser General Public License for more details. | |
21 | * | |
22 | * You should have received a copy of the GNU Lesser General Public | |
23 | * License along with FFmpeg; if not, write to the Free Software | |
24 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
25 | */ | |
26 | ||
27 | #include <stddef.h> | |
28 | #include <stdint.h> | |
29 | ||
30 | // put_pixels | |
/*
 * DEF(put, pixels8_x2): fills an 8-byte-wide block in which every output row
 * is produced by PAVGBP from the source row read at pixels and the same row
 * read one byte further right (pixels + 1).
 *
 * block     - destination; 8 bytes are stored per row
 * pixels    - source; two overlapping 8-byte loads (offsets 0 and 1) touch
 *             9 bytes per row
 * line_size - byte distance between consecutive rows; the same value is
 *             applied to both block and pixels
 * h         - number of rows; every loop pass handles 4 rows and the loop
 *             ends only when h reaches exactly 0, so callers are expected to
 *             pass a positive multiple of 4
 *
 * DEF, MOVQ_BFE, PAVGBP and REG_a are defined outside this view. Presumably
 * PAVGBP averages two pairs of byte vectors (rounding chosen by the including
 * file, as the file header suggests) and MOVQ_BFE prepares a constant in mm6
 * for it - confirm against their definitions.
 */
31 | static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) | |
32 | { | |
33 | MOVQ_BFE(mm6); | |
34 | __asm__ volatile( | |
     /* Operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size. */
     /* REG_a = 2 * line_size, the pointer step for two rows. */
35 | "lea (%3, %3), %%"REG_a" \n\t" | |
36 | ".p2align 3 \n\t" | |
37 | "1: \n\t" | |
     /* Rows 0 and 1: load each row at byte offsets 0 and 1. */
38 | "movq (%1), %%mm0 \n\t" | |
39 | "movq 1(%1), %%mm1 \n\t" | |
40 | "movq (%1, %3), %%mm2 \n\t" | |
41 | "movq 1(%1, %3), %%mm3 \n\t" | |
     /* Pair (mm0, mm1) yields mm4 and pair (mm2, mm3) yields mm5, as the stores below show. */
42 | PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |
43 | "movq %%mm4, (%2) \n\t" | |
44 | "movq %%mm5, (%2, %3) \n\t" | |
     /* Step source and destination down by two rows. */
45 | "add %%"REG_a", %1 \n\t" | |
46 | "add %%"REG_a", %2 \n\t" | |
     /* Rows 2 and 3: identical sequence, unrolled. */
47 | "movq (%1), %%mm0 \n\t" | |
48 | "movq 1(%1), %%mm1 \n\t" | |
49 | "movq (%1, %3), %%mm2 \n\t" | |
50 | "movq 1(%1, %3), %%mm3 \n\t" | |
51 | PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |
52 | "movq %%mm4, (%2) \n\t" | |
53 | "movq %%mm5, (%2, %3) \n\t" | |
54 | "add %%"REG_a", %1 \n\t" | |
55 | "add %%"REG_a", %2 \n\t" | |
     /* Four rows are done; loop again while the counter is non-zero. */
56 | "subl $4, %0 \n\t" | |
57 | "jnz 1b \n\t" | |
     /* h, pixels and block are read-write operands: the asm modifies all three. */
58 | :"+g"(h), "+S"(pixels), "+D"(block) | |
59 | :"r"((x86_reg)line_size) | |
     /* REG_a is used as scratch; the MMX registers used above are not listed as clobbers. */
60 | :REG_a, "memory"); | |
61 | } | |
62 | ||
/*
 * DEF(put, pixels16_x2): 16-byte-wide counterpart of the 8-wide x2 routine.
 * Every output row is produced by PAVGBP from the source row read at pixels
 * and the same row read at pixels + 1; each row is processed as two 8-byte
 * halves (byte offsets 0 and 8).
 *
 * block     - destination; 16 bytes are stored per row
 * pixels    - source; the 8-byte loads at offsets 0, 1, 8 and 9 touch
 *             17 bytes per row
 * line_size - byte distance between consecutive rows; the same value is
 *             applied to both block and pixels
 * h         - number of rows; every loop pass handles 4 rows and the loop
 *             ends only when h reaches exactly 0, so callers are expected to
 *             pass a positive multiple of 4
 *
 * DEF, MOVQ_BFE, PAVGBP and REG_a are defined outside this view. Presumably
 * PAVGBP averages two pairs of byte vectors and MOVQ_BFE prepares a constant
 * in mm6 for it - confirm against their definitions.
 */
63 | static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) | |
64 | { | |
65 | MOVQ_BFE(mm6); | |
66 | __asm__ volatile( | |
     /* Operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size. */
     /* REG_a = 2 * line_size, the pointer step for two rows. */
67 | "lea (%3, %3), %%"REG_a" \n\t" | |
68 | ".p2align 3 \n\t" | |
69 | "1: \n\t" | |
     /* Rows 0 and 1, left half (bytes 0..7). */
70 | "movq (%1), %%mm0 \n\t" | |
71 | "movq 1(%1), %%mm1 \n\t" | |
72 | "movq (%1, %3), %%mm2 \n\t" | |
73 | "movq 1(%1, %3), %%mm3 \n\t" | |
74 | PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |
75 | "movq %%mm4, (%2) \n\t" | |
76 | "movq %%mm5, (%2, %3) \n\t" | |
     /* Rows 0 and 1, right half (bytes 8..15). */
77 | "movq 8(%1), %%mm0 \n\t" | |
78 | "movq 9(%1), %%mm1 \n\t" | |
79 | "movq 8(%1, %3), %%mm2 \n\t" | |
80 | "movq 9(%1, %3), %%mm3 \n\t" | |
81 | PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |
82 | "movq %%mm4, 8(%2) \n\t" | |
83 | "movq %%mm5, 8(%2, %3) \n\t" | |
     /* Step source and destination down by two rows. */
84 | "add %%"REG_a", %1 \n\t" | |
85 | "add %%"REG_a", %2 \n\t" | |
     /* Rows 2 and 3, left half. */
86 | "movq (%1), %%mm0 \n\t" | |
87 | "movq 1(%1), %%mm1 \n\t" | |
88 | "movq (%1, %3), %%mm2 \n\t" | |
89 | "movq 1(%1, %3), %%mm3 \n\t" | |
90 | PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |
91 | "movq %%mm4, (%2) \n\t" | |
92 | "movq %%mm5, (%2, %3) \n\t" | |
     /* Rows 2 and 3, right half. */
93 | "movq 8(%1), %%mm0 \n\t" | |
94 | "movq 9(%1), %%mm1 \n\t" | |
95 | "movq 8(%1, %3), %%mm2 \n\t" | |
96 | "movq 9(%1, %3), %%mm3 \n\t" | |
97 | PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |
98 | "movq %%mm4, 8(%2) \n\t" | |
99 | "movq %%mm5, 8(%2, %3) \n\t" | |
100 | "add %%"REG_a", %1 \n\t" | |
101 | "add %%"REG_a", %2 \n\t" | |
     /* Four rows are done; loop again while the counter is non-zero. */
102 | "subl $4, %0 \n\t" | |
103 | "jnz 1b \n\t" | |
     /* h, pixels and block are read-write operands: the asm modifies all three. */
104 | :"+g"(h), "+S"(pixels), "+D"(block) | |
105 | :"r"((x86_reg)line_size) | |
     /* REG_a is used as scratch; the MMX registers used above are not listed as clobbers. */
106 | :REG_a, "memory"); | |
107 | } | |
108 | ||
/*
 * DEF(put, pixels8_y2): fills an 8-byte-wide block in which every output row
 * is produced by PAVGBP from a source row and the row directly below it
 * (pixels + line_size).
 *
 * block     - destination; 8 bytes are stored per row
 * pixels    - source; 8 bytes are read per row, and one row more than h is
 *             read in total because the last output row needs the row below
 * line_size - byte distance between consecutive rows; the same value is
 *             applied to both block and pixels
 * h         - number of rows; every loop pass handles 4 rows and the loop
 *             ends only when h reaches exactly 0, so callers are expected to
 *             pass a positive multiple of 4
 *
 * DEF, MOVQ_BFE, PAVGBP and REG_a are defined outside this view. Presumably
 * PAVGBP averages two pairs of byte vectors and MOVQ_BFE prepares a constant
 * in mm6 for it - confirm against their definitions.
 */
109 | static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) | |
110 | { | |
111 | MOVQ_BFE(mm6); | |
112 | __asm__ volatile( | |
     /* Operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size. */
     /* REG_a = 2 * line_size, the pointer step for two rows. */
113 | "lea (%3, %3), %%"REG_a" \n\t" | |
     /* Row 0 is loaded once up front; later passes inherit it in mm0. */
114 | "movq (%1), %%mm0 \n\t" | |
115 | ".p2align 3 \n\t" | |
116 | "1: \n\t" | |
     /* mm1 = row 1, mm2 = row 2 (relative to the current pixels). */
117 | "movq (%1, %3), %%mm1 \n\t" | |
118 | "movq (%1, %%"REG_a"),%%mm2 \n\t" | |
     /* Pair (row 1, row 0) yields mm4 and pair (row 2, row 1) yields mm5. */
119 | PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5) | |
120 | "movq %%mm4, (%2) \n\t" | |
121 | "movq %%mm5, (%2, %3) \n\t" | |
     /* Step down two rows; mm2 now holds the row pixels points at. */
122 | "add %%"REG_a", %1 \n\t" | |
123 | "add %%"REG_a", %2 \n\t" | |
     /* Second half: mm2 takes the role of row 0, new rows go to mm1 and mm0. */
124 | "movq (%1, %3), %%mm1 \n\t" | |
125 | "movq (%1, %%"REG_a"),%%mm0 \n\t" | |
126 | PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5) | |
127 | "movq %%mm4, (%2) \n\t" | |
128 | "movq %%mm5, (%2, %3) \n\t" | |
     /* After this step mm0 again holds the row pixels points at, ready for the next pass. */
129 | "add %%"REG_a", %1 \n\t" | |
130 | "add %%"REG_a", %2 \n\t" | |
     /* Four rows are done; loop again while the counter is non-zero. */
131 | "subl $4, %0 \n\t" | |
132 | "jnz 1b \n\t" | |
     /* h, pixels and block are read-write operands: the asm modifies all three. */
133 | :"+g"(h), "+S"(pixels), "+D"(block) | |
134 | :"r"((x86_reg)line_size) | |
     /* REG_a is used as scratch; the MMX registers used above are not listed as clobbers. */
135 | :REG_a, "memory"); | |
136 | } | |
137 | ||
/*
 * DEF(avg, pixels16_x2): for each of h rows, combines the source row read at
 * pixels with the same row read at pixels + 1 (PAVGB), then combines that
 * result with the bytes already present in block (PAVGB_MMX) and writes it
 * back. Each 16-byte row is handled as two 8-byte halves.
 *
 * block     - destination; 16 bytes per row are read and then overwritten
 * pixels    - source; the 8-byte loads at offsets 0, 1, 8 and 9 touch
 *             17 bytes per row
 * line_size - byte distance between consecutive rows; the same value is
 *             applied to both block and pixels
 * h         - number of rows; the loop body runs before h is tested, so h
 *             has to be at least 1
 *
 * DEF, MOVQ_BFE, JUMPALIGN, PAVGB and PAVGB_MMX are defined outside this
 * view. Presumably both PAVGB variants compute a byte-wise average using the
 * constant MOVQ_BFE placed in mm6 - confirm against their definitions.
 */
138 | static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) | |
139 | { | |
140 | MOVQ_BFE(mm6); | |
141 | JUMPALIGN(); | |
142 | do { | |
143 | __asm__ volatile( | |
     /*
      * %0 is the memory operand *block, %1 is the memory operand *pixels.
      * A literal digit written directly before the operand is pasted onto
      * its text to form a displacement: 1%1 is meant as pixels + 1 and
      * 8%0 as block + 8.
      * NOTE(review): this gives the intended address only if the compiler
      * prints the operand without a displacement of its own - verify.
      */
     /* Left half: source at offsets 0 and 1, current destination in mm3. */
144 | "movq %1, %%mm0 \n\t" | |
145 | "movq 1%1, %%mm1 \n\t" | |
146 | "movq %0, %%mm3 \n\t" | |
     /* mm2 receives the combination of the two source loads. */
147 | PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) | |
     /* mm0 receives the combination of mm2 with the old destination bytes. */
148 | PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6) | |
149 | "movq %%mm0, %0 \n\t" | |
     /* Right half: the same sequence at byte offset 8. */
150 | "movq 8%1, %%mm0 \n\t" | |
151 | "movq 9%1, %%mm1 \n\t" | |
152 | "movq 8%0, %%mm3 \n\t" | |
153 | PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) | |
154 | PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6) | |
155 | "movq %%mm0, 8%0 \n\t" | |
     /* Each operand names only the first byte of its row; the "memory" clobber is listed as well. */
156 | :"+m"(*block) | |
157 | :"m"(*pixels) | |
158 | :"memory"); | |
     /* Advance both pointers by one row in C; the asm handles a single row. */
159 | pixels += line_size; | |
160 | block += line_size; | |
161 | } while (--h); | |
162 | } | |
163 | ||
/*
 * DEF(avg, pixels8_y2): for an 8-byte-wide block, combines every source row
 * with the row directly below it (PAVGBP), then combines that result with the
 * bytes already present in block (PAVGB_MMX) and writes it back.
 *
 * block     - destination; 8 bytes per row are read and then overwritten
 * pixels    - source; 8 bytes are read per row, and one row more than h is
 *             read in total because the last output row needs the row below
 * line_size - byte distance between consecutive rows; the same value is
 *             applied to both block and pixels
 * h         - number of rows; every loop pass handles 4 rows and the loop
 *             ends only when h reaches exactly 0, so callers are expected to
 *             pass a positive multiple of 4
 *
 * DEF, MOVQ_BFE, PAVGBP, PAVGB_MMX and REG_a are defined outside this view.
 * Presumably the PAVG* macros compute byte-wise averages using the constant
 * MOVQ_BFE placed in mm6 - confirm against their definitions.
 */
164 | static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) | |
165 | { | |
166 | MOVQ_BFE(mm6); | |
167 | __asm__ volatile( | |
     /* Operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size. */
     /* REG_a = 2 * line_size, the pointer step for two rows. */
168 | "lea (%3, %3), %%"REG_a" \n\t" | |
     /* Row 0 is loaded once up front; later passes inherit it in mm0. */
169 | "movq (%1), %%mm0 \n\t" | |
170 | ".p2align 3 \n\t" | |
171 | "1: \n\t" | |
     /* mm1 = row 1, mm2 = row 2 (relative to the current pixels). */
172 | "movq (%1, %3), %%mm1 \n\t" | |
173 | "movq (%1, %%"REG_a"), %%mm2 \n\t" | |
     /* Pair (row 1, row 0) yields mm4 and pair (row 2, row 1) yields mm5. */
174 | PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5) | |
     /* Fold in the existing destination rows; mm0 and mm1 are reused for the results. */
175 | "movq (%2), %%mm3 \n\t" | |
176 | PAVGB_MMX(%%mm3, %%mm4, %%mm0, %%mm6) | |
177 | "movq (%2, %3), %%mm3 \n\t" | |
178 | PAVGB_MMX(%%mm3, %%mm5, %%mm1, %%mm6) | |
179 | "movq %%mm0, (%2) \n\t" | |
180 | "movq %%mm1, (%2, %3) \n\t" | |
     /* Step down two rows; mm2 still holds the row pixels now points at. */
181 | "add %%"REG_a", %1 \n\t" | |
182 | "add %%"REG_a", %2 \n\t" | |
183 | ||
     /* Second half: mm2 takes the role of row 0, new rows go to mm1 and mm0. */
184 | "movq (%1, %3), %%mm1 \n\t" | |
185 | "movq (%1, %%"REG_a"), %%mm0 \n\t" | |
186 | PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5) | |
     /* Results land in mm2 and mm1 here, so mm0 survives into the next pass. */
187 | "movq (%2), %%mm3 \n\t" | |
188 | PAVGB_MMX(%%mm3, %%mm4, %%mm2, %%mm6) | |
189 | "movq (%2, %3), %%mm3 \n\t" | |
190 | PAVGB_MMX(%%mm3, %%mm5, %%mm1, %%mm6) | |
191 | "movq %%mm2, (%2) \n\t" | |
192 | "movq %%mm1, (%2, %3) \n\t" | |
193 | "add %%"REG_a", %1 \n\t" | |
194 | "add %%"REG_a", %2 \n\t" | |
195 | ||
     /* Four rows are done; loop again while the counter is non-zero. */
196 | "subl $4, %0 \n\t" | |
197 | "jnz 1b \n\t" | |
     /* h, pixels and block are read-write operands: the asm modifies all three. */
198 | :"+g"(h), "+S"(pixels), "+D"(block) | |
199 | :"r"((x86_reg)line_size) | |
     /* REG_a is used as scratch; the MMX registers used above are not listed as clobbers. */
200 | :REG_a, "memory"); | |
201 | } | |