/* ffmpeg/libavcodec/ppc/h264chroma_template.c */
/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mem.h"

/* this code assumes that stride % 16 == 0 */

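/* For reference, a minimal scalar sketch of what the AltiVec code below
 * computes: each of the 8 output pixels per row is the bilinear blend
 *
 *     dst[j] = (A*src[j] + B*src[j+1] + C*src[j+stride] + D*src[j+stride+1] + 32) >> 6
 *
 * with A = (8-x)(8-y), B = x(8-y), C = (8-x)y, D = xy.  The sketch shows the
 * "put" variant; the avg variant instead averages this result with the
 * existing dst, which is what OP_U8_ALTIVEC abstracts.  The helper name is
 * illustrative only (not part of FFmpeg) and is kept inside #if 0 so this
 * template, which is included more than once, builds unchanged.
 */
#if 0
static void chroma_mc8_put_c_sketch(uint8_t *dst, const uint8_t *src,
                                    int stride, int h, int x, int y)
{
    const int A = (8 - x) * (8 - y);
    const int B = (    x) * (8 - y);
    const int C = (8 - x) * (    y);
    const int D = (    x) * (    y);
    int i, j;

    for (i = 0; i < h; i++) {
        for (j = 0; j < 8; j++)
            dst[j] = (A * src[j]          + B * src[j + 1] +
                      C * src[j + stride] + D * src[j + stride + 1] + 32) >> 6;
        dst += stride;
        src += stride;
    }
}
#endif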
/* CHROMA_MC8_ALTIVEC_CORE emits one row of the 2D filter: widen the source
 * vectors to 16 bits, accumulate the A/B/C/D taps with vec_mladd, apply the
 * rounding bias (BIAS1/BIAS2) and the >> 6 shift, then blend the packed
 * result into the destination vector through fperm before storing. */
#define CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2) \
        vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\
        vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, BIAS1);\
        psum = vec_mladd(vB, vsrc1ssH, psum);\
        psum = vec_mladd(vC, vsrc2ssH, psum);\
        psum = vec_mladd(vD, vsrc3ssH, psum);\
        psum = BIAS2(psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        vsrc0ssH = vsrc2ssH;\
        vsrc1ssH = vsrc3ssH;\
\
        dst += stride;\
        src += stride;

/* CHROMA_MC8_ALTIVEC_CORE_SIMPLE handles the 1D cases (x == 0 or y == 0),
 * where only two taps contribute: vA and vE = vB + vC (one of which is zero). */
#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
\
        vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\
        vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, v32ss);\
        psum = vec_mladd(vE, vsrc1ssH, psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        dst += stride;\
        src += stride;

/* Rounding-bias hooks passed as the BIAS2 argument of the core macro:
 * noop for H.264 (the bias of 32 comes in through BIAS1), add28 for VC-1. */
#define noop(a) a
#define add28(a) vec_add(v28ss, a)

#ifdef PREFIX_h264_chroma_mc8_altivec
static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                                           int stride, int h, int x, int y) {
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    /* ABCD holds the four bilinear weights; on big-endian AltiVec the low
     * halfword of 32-bit element n sits at halfword index 2n + 1, hence the
     * odd splat indices below. */
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
    const vec_u16 v6us = vec_splat_u16(6);
    /* Each output row reads 9 source bytes; loadSecond is set when they may
     * straddle a 16-byte boundary and a second vector load is needed.  When
     * src % 16 == 15, vec_lvsl(1, src) wraps around to an identity permute,
     * so reallyBadAlign makes the code use the second load directly. */
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;

    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);

    if (ABCD[3]) {
        if (!loadSecond) { // -> !reallyBadAlign
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
            }
        } else {
            vec_u8 vsrcDuc;
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrcDuc = vec_ld(stride + 16, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                if (reallyBadAlign)
                    vsrc3uc = vsrcDuc;
                else
                    vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
            }
        }
    } else {
        const vec_s16 vE = vec_add(vB, vC);
        if (ABCD[2]) { // x == 0, B == 0
            if (!loadSecond) { // -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrcDuc = vec_ld(stride + 15, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            }
        } else { // y == 0, C == 0
            if (!loadSecond) { // -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrcDuc = vec_ld(15, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    if (reallyBadAlign)
                        vsrc1uc = vsrcDuc;
                    else
                        vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            }
        }
    }
}
#endif

/* this code assumes that stride % 16 == 0 */
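/* Scalar sketch of the difference in the VC-1 "no rounding" variant below
 * (illustrative only): the per-pixel sum is the same as for the H.264
 * function above, but the rounding bias is 28 (= 32 - 4) instead of 32,
 * supplied through v28ss and the add28() wrapper rather than v32ss:
 *
 *     dst[j] = (A * src[j]          + B * src[j + 1] +
 *               C * src[j + stride] + D * src[j + stride + 1] + 28) >> 6;
 */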
#ifdef PREFIX_no_rnd_vc1_chroma_mc8_altivec
static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                                                 int stride, int h, int x, int y) {
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
    const vec_u16 v6us = vec_splat_u16(6);
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;

    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc);

    if (!loadSecond) { // -> !reallyBadAlign
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
            vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
        }
    } else {
        vec_u8 vsrcDuc;
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);
            vsrcDuc = vec_ld(stride + 16, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
            if (reallyBadAlign)
                vsrc3uc = vsrcDuc;
            else
                vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
        }
    }
}
#endif

#undef noop
#undef add28
#undef CHROMA_MC8_ALTIVEC_CORE