Commit | Line | Data |
---|---|---|
2ba45a60 DM |
1 | /* |
2 | * SIMD-optimized halfpel functions are compiled twice for rnd/no_rnd | |
3 | * Copyright (c) 2000, 2001 Fabrice Bellard | |
4 | * Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at> | |
5 | * | |
6 | * MMX optimization by Nick Kurshev <nickols_k@mail.ru> | |
7 | * mostly rewritten by Michael Niedermayer <michaelni@gmx.at> | |
8 | * and improved by Zdenek Kabelac <kabi@users.sf.net> | |
9 | * | |
10 | * This file is part of FFmpeg. | |
11 | * | |
12 | * FFmpeg is free software; you can redistribute it and/or | |
13 | * modify it under the terms of the GNU Lesser General Public | |
14 | * License as published by the Free Software Foundation; either | |
15 | * version 2.1 of the License, or (at your option) any later version. | |
16 | * | |
17 | * FFmpeg is distributed in the hope that it will be useful, | |
18 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
19 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
20 | * Lesser General Public License for more details. | |
21 | * | |
22 | * You should have received a copy of the GNU Lesser General Public | |
23 | * License along with FFmpeg; if not, write to the Free Software | |
24 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
25 | */ | |
26 | ||
27 | #include <stddef.h> | |
28 | #include <stdint.h> | |
29 | ||
30 | #include "inline_asm.h" | |
31 | ||
32 | // put_pixels | |
// Writes h rows of 8 bytes to block. Each output byte is
//     (p[y][x] + p[y][x+1] + p[y+1][x] + p[y+1][x+1] + rnd) >> 2
// where p is the source at pixels and rnd is the constant SET_RND leaves in
// mm6 (see the comment on SET_RND below; the macro is defined outside this
// file).
//
// block     - destination; 8 bytes are stored per row.
// pixels    - source; 9 bytes (offsets 0..8) are read from each of h + 1 rows.
// line_size - byte distance between rows; the same value is used to step
//             through both block and pixels.
// h         - number of output rows. Two rows are produced per loop
//             iteration and the loop ends only when h reaches exactly 0, so
//             h is expected to be a positive multiple of 2.
//
// NOTE(review): mm0-mm5 are written by the asm below and mm6/mm7 are expected
// to still hold what SET_RND/MOVQ_ZERO left in them, yet no MMX register is
// listed as a clobber - confirm this is safe with the compilers in use.
33 | STATIC void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, | |
34 | ptrdiff_t line_size, int h) | |
35 | { | |
// mm7 is used below as the second operand of every punpck*bw, which widens
// bytes to words only if mm7 is all zero - presumably what MOVQ_ZERO does.
36 | MOVQ_ZERO(mm7); | |
37 | SET_RND(mm6); // =2 for rnd and =1 for no_rnd version | |
// Operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
// REG_a holds the byte offset of the current row from both %1 and %2.
38 | __asm__ volatile( | |
// Prologue: horizontal pair sums of source row 0 as 16-bit words,
// mm4 = p[x] + p[x+1] for the low 4 pixels, mm5 = same for the high 4.
39 | "movq (%1), %%mm0 \n\t" | |
40 | "movq 1(%1), %%mm4 \n\t" | |
41 | "movq %%mm0, %%mm1 \n\t" | |
42 | "movq %%mm4, %%mm5 \n\t" | |
43 | "punpcklbw %%mm7, %%mm0 \n\t" | |
44 | "punpcklbw %%mm7, %%mm4 \n\t" | |
45 | "punpckhbw %%mm7, %%mm1 \n\t" | |
46 | "punpckhbw %%mm7, %%mm5 \n\t" | |
47 | "paddusw %%mm0, %%mm4 \n\t" | |
48 | "paddusw %%mm1, %%mm5 \n\t" | |
// Row offset = 0; advance the source pointer so that (%1, REG_a) addresses
// the row below the one whose sums are currently held in registers.
49 | "xor %%"REG_a", %%"REG_a" \n\t" | |
50 | "add %3, %1 \n\t" | |
51 | ".p2align 3 \n\t" | |
52 | "1: \n\t" | |
// First half: pair sums of the next source row into mm0 (low) / mm1 (high).
53 | "movq (%1, %%"REG_a"), %%mm0 \n\t" | |
54 | "movq 1(%1, %%"REG_a"), %%mm2 \n\t" | |
55 | "movq %%mm0, %%mm1 \n\t" | |
56 | "movq %%mm2, %%mm3 \n\t" | |
57 | "punpcklbw %%mm7, %%mm0 \n\t" | |
58 | "punpcklbw %%mm7, %%mm2 \n\t" | |
59 | "punpckhbw %%mm7, %%mm1 \n\t" | |
60 | "punpckhbw %%mm7, %%mm3 \n\t" | |
61 | "paddusw %%mm2, %%mm0 \n\t" | |
62 | "paddusw %%mm3, %%mm1 \n\t" | |
// Previous row's sums (mm4/mm5) + rnd + this row's sums, then >> 2.
// mm0/mm1 are left intact so the second half can reuse them.
63 | "paddusw %%mm6, %%mm4 \n\t" | |
64 | "paddusw %%mm6, %%mm5 \n\t" | |
65 | "paddusw %%mm0, %%mm4 \n\t" | |
66 | "paddusw %%mm1, %%mm5 \n\t" | |
67 | "psrlw $2, %%mm4 \n\t" | |
68 | "psrlw $2, %%mm5 \n\t" | |
// Narrow the 8 words back to bytes, store one output row, step to next row.
69 | "packuswb %%mm5, %%mm4 \n\t" | |
70 | "movq %%mm4, (%2, %%"REG_a") \n\t" | |
71 | "add %3, %%"REG_a" \n\t" | |
72 | ||
// Second half: same computation with the register roles exchanged. The new
// row's sums land in mm4/mm5 (ready for the next iteration's first half)
// and are combined with the sums kept in mm0/mm1.
73 | "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 | |
74 | "movq 1(%1, %%"REG_a"), %%mm4 \n\t" | |
75 | "movq %%mm2, %%mm3 \n\t" | |
76 | "movq %%mm4, %%mm5 \n\t" | |
77 | "punpcklbw %%mm7, %%mm2 \n\t" | |
78 | "punpcklbw %%mm7, %%mm4 \n\t" | |
79 | "punpckhbw %%mm7, %%mm3 \n\t" | |
80 | "punpckhbw %%mm7, %%mm5 \n\t" | |
81 | "paddusw %%mm2, %%mm4 \n\t" | |
82 | "paddusw %%mm3, %%mm5 \n\t" | |
83 | "paddusw %%mm6, %%mm0 \n\t" | |
84 | "paddusw %%mm6, %%mm1 \n\t" | |
85 | "paddusw %%mm4, %%mm0 \n\t" | |
86 | "paddusw %%mm5, %%mm1 \n\t" | |
87 | "psrlw $2, %%mm0 \n\t" | |
88 | "psrlw $2, %%mm1 \n\t" | |
89 | "packuswb %%mm1, %%mm0 \n\t" | |
90 | "movq %%mm0, (%2, %%"REG_a") \n\t" | |
91 | "add %3, %%"REG_a" \n\t" | |
92 | ||
// Two rows written; repeat until the row counter reaches zero.
93 | "subl $2, %0 \n\t" | |
94 | "jnz 1b \n\t" | |
// h and pixels are read-write because the asm modifies both; block and
// line_size are inputs only. REG_a and memory are declared clobbered.
95 | :"+g"(h), "+S"(pixels) | |
96 | :"D"(block), "r"((x86_reg)line_size) | |
97 | :REG_a, "memory"); | |
98 | } | |
99 | ||
100 | // avg_pixels | |
101 | // this routine is 'slightly' suboptimal but mostly unused | |
// For each of h rows, computes the 8 interpolated bytes
//     (p[y][x] + p[y][x+1] + p[y+1][x] + p[y+1][x+1] + rnd) >> 2
// from the source at pixels (rnd is the constant SET_RND leaves in mm6),
// then combines them with the 8 bytes already stored in block through
// PAVGB_MMX and writes the result back to block.
//
// PAVGB_MMX is defined outside this file. Judging by its name and how it is
// used here, it presumably produces a byte-wise average of its first two
// registers in the third, using the fourth as a mask - confirm the exact
// rounding against the macro definition.
//
// block     - destination; 8 bytes per row are read, then overwritten.
// pixels    - source; 9 bytes (offsets 0..8) are read from each of h + 1 rows.
// line_size - byte distance between rows; the same value is used to step
//             through both block and pixels.
// h         - number of output rows. Two rows are produced per loop
//             iteration and the loop ends only when h reaches exactly 0, so
//             h is expected to be a positive multiple of 2.
//
// NOTE(review): mm0-mm5 are written by the asm below and mm6/mm7 are expected
// to still hold what SET_RND/MOVQ_ZERO left in them, yet no MMX register is
// listed as a clobber - confirm this is safe with the compilers in use.
102 | STATIC void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, | |
103 | ptrdiff_t line_size, int h) | |
104 | { | |
// mm7 is used below as the second operand of every punpck*bw, which widens
// bytes to words only if mm7 is all zero - presumably what MOVQ_ZERO does.
105 | MOVQ_ZERO(mm7); | |
106 | SET_RND(mm6); // =2 for rnd and =1 for no_rnd version | |
// Operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
// REG_a holds the byte offset of the current row from both %1 and %2.
107 | __asm__ volatile( | |
// Prologue: horizontal pair sums of source row 0 as 16-bit words,
// mm4 = p[x] + p[x+1] for the low 4 pixels, mm5 = same for the high 4.
108 | "movq (%1), %%mm0 \n\t" | |
109 | "movq 1(%1), %%mm4 \n\t" | |
110 | "movq %%mm0, %%mm1 \n\t" | |
111 | "movq %%mm4, %%mm5 \n\t" | |
112 | "punpcklbw %%mm7, %%mm0 \n\t" | |
113 | "punpcklbw %%mm7, %%mm4 \n\t" | |
114 | "punpckhbw %%mm7, %%mm1 \n\t" | |
115 | "punpckhbw %%mm7, %%mm5 \n\t" | |
116 | "paddusw %%mm0, %%mm4 \n\t" | |
117 | "paddusw %%mm1, %%mm5 \n\t" | |
// Row offset = 0; advance the source pointer so that (%1, REG_a) addresses
// the row below the one whose sums are currently held in registers.
118 | "xor %%"REG_a", %%"REG_a" \n\t" | |
119 | "add %3, %1 \n\t" | |
120 | ".p2align 3 \n\t" | |
121 | "1: \n\t" | |
// First half: pair sums of the next source row into mm0 (low) / mm1 (high).
122 | "movq (%1, %%"REG_a"), %%mm0 \n\t" | |
123 | "movq 1(%1, %%"REG_a"), %%mm2 \n\t" | |
124 | "movq %%mm0, %%mm1 \n\t" | |
125 | "movq %%mm2, %%mm3 \n\t" | |
126 | "punpcklbw %%mm7, %%mm0 \n\t" | |
127 | "punpcklbw %%mm7, %%mm2 \n\t" | |
128 | "punpckhbw %%mm7, %%mm1 \n\t" | |
129 | "punpckhbw %%mm7, %%mm3 \n\t" | |
130 | "paddusw %%mm2, %%mm0 \n\t" | |
131 | "paddusw %%mm3, %%mm1 \n\t" | |
// Previous row's sums (mm4/mm5) + rnd + this row's sums, then >> 2.
// mm0/mm1 are left intact so the second half can reuse them.
132 | "paddusw %%mm6, %%mm4 \n\t" | |
133 | "paddusw %%mm6, %%mm5 \n\t" | |
134 | "paddusw %%mm0, %%mm4 \n\t" | |
135 | "paddusw %%mm1, %%mm5 \n\t" | |
136 | "psrlw $2, %%mm4 \n\t" | |
137 | "psrlw $2, %%mm5 \n\t" | |
// mm2/mm3 were already folded into mm0/mm1, so they are reused here:
// mm3 = current destination bytes, mm4 = interpolated bytes (packed),
// mm2 = all ones doubled per byte, i.e. 0xFE in every byte.
138 | "movq (%2, %%"REG_a"), %%mm3 \n\t" | |
139 | "packuswb %%mm5, %%mm4 \n\t" | |
140 | "pcmpeqd %%mm2, %%mm2 \n\t" | |
141 | "paddb %%mm2, %%mm2 \n\t" | |
// Combine destination and interpolated bytes; the value stored is mm5.
142 | PAVGB_MMX(%%mm3, %%mm4, %%mm5, %%mm2) | |
143 | "movq %%mm5, (%2, %%"REG_a") \n\t" | |
144 | "add %3, %%"REG_a" \n\t" | |
145 | ||
// Second half: same computation with the register roles exchanged. The new
// row's sums land in mm4/mm5 (ready for the next iteration's first half)
// and are combined with the sums kept in mm0/mm1.
146 | "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 | |
147 | "movq 1(%1, %%"REG_a"), %%mm4 \n\t" | |
148 | "movq %%mm2, %%mm3 \n\t" | |
149 | "movq %%mm4, %%mm5 \n\t" | |
150 | "punpcklbw %%mm7, %%mm2 \n\t" | |
151 | "punpcklbw %%mm7, %%mm4 \n\t" | |
152 | "punpckhbw %%mm7, %%mm3 \n\t" | |
153 | "punpckhbw %%mm7, %%mm5 \n\t" | |
154 | "paddusw %%mm2, %%mm4 \n\t" | |
155 | "paddusw %%mm3, %%mm5 \n\t" | |
156 | "paddusw %%mm6, %%mm0 \n\t" | |
157 | "paddusw %%mm6, %%mm1 \n\t" | |
158 | "paddusw %%mm4, %%mm0 \n\t" | |
159 | "paddusw %%mm5, %%mm1 \n\t" | |
160 | "psrlw $2, %%mm0 \n\t" | |
161 | "psrlw $2, %%mm1 \n\t" | |
// mm2/mm3 were already folded into mm4/mm5, so they are reused here:
// mm3 = current destination bytes, mm0 = interpolated bytes (packed),
// mm2 = 0xFE in every byte. mm4/mm5 must survive for the next iteration.
162 | "movq (%2, %%"REG_a"), %%mm3 \n\t" | |
163 | "packuswb %%mm1, %%mm0 \n\t" | |
164 | "pcmpeqd %%mm2, %%mm2 \n\t" | |
165 | "paddb %%mm2, %%mm2 \n\t" | |
// Combine destination and interpolated bytes; the value stored is mm1.
166 | PAVGB_MMX(%%mm3, %%mm0, %%mm1, %%mm2) | |
167 | "movq %%mm1, (%2, %%"REG_a") \n\t" | |
168 | "add %3, %%"REG_a" \n\t" | |
169 | ||
// Two rows written; repeat until the row counter reaches zero.
170 | "subl $2, %0 \n\t" | |
171 | "jnz 1b \n\t" | |
// h and pixels are read-write because the asm modifies both; block and
// line_size are inputs only. REG_a and memory are declared clobbered.
172 | :"+g"(h), "+S"(pixels) | |
173 | :"D"(block), "r"((x86_reg)line_size) | |
174 | :REG_a, "memory"); | |
175 | } | |
175 | } |