0059b3b448281bd71e886aae04522890b79d2ef4
/*
 * Copyright (c) 2002 Brian Foley
 * Copyright (c) 2002 Dieter Shirley
 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"

#include <string.h>

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/mem.h"
#include "libavutil/ppc/cpu.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavcodec/blockdsp.h"

/* ***** WARNING ***** WARNING ***** WARNING ***** */
/*
 * clear_blocks_dcbz32_ppc will not work properly on PowerPC processors with
 * a cache line size not equal to 32 bytes. Fortunately all processors used
 * by Apple up to at least the 7450 (AKA second generation G4) use 32-byte
 * cache lines. This is due to the use of the 'dcbz' instruction. It simply
 * clears a single cache line to zero, so you need to know the cache line
 * size to use it! It's absurd, but it's fast...
 *
 * update 24/06/2003: Apple released the G5 yesterday, with a PPC970.
 * cache line size: 128 bytes. Oops.
 * The semantics of dcbz were changed: it now always clears 32 bytes. So the
 * function below will work, but will be slow. So I fixed check_dcbzl_effect
 * to use dcbzl, which is defined to clear a cache line (as dcbz did before).
 * So we can still distinguish, and use dcbz (32 bytes) or dcbzl (one cache
 * line) as required.
 *
 * see <http://developer.apple.com/technotes/tn/tn2087.html>
 * and <http://developer.apple.com/technotes/tn/tn2086.html>
 */
/**
 * Zero a buffer of 6 * 64 int16_t coefficients (768 bytes) with 'dcbz',
 * treating each dcbz as clearing exactly 32 bytes.
 *
 * When bit 4 of the address is set the buffer does not start on a 32-byte
 * boundary: the first and last 16 bytes are then cleared with ordinary
 * stores and dcbz covers the 32-byte chunks in between.
 *
 * NOTE(review): the misalignment test only looks at bit 4, so the buffer is
 * presumably at least 16-byte aligned - confirm against the callers.
 * NOTE(review): the `if (misal)` guards and the 16-byte start offset were
 * missing from the reviewed text and are restored from the surrounding logic;
 * confirm against upstream.
 *
 * @param blocks start of the 768-byte coefficient buffer to clear
 */
static void clear_blocks_dcbz32_ppc(int16_t *blocks)
{
    const size_t total = sizeof(int16_t) * 6 * 64;
    const int misal    = ((uintptr_t) blocks & 0x10) != 0;
    size_t i           = 0;

    if (misal) {
        /* Fixed 16-byte clear; the former `unsigned long` stores covered
         * 16 bytes only where long is 4 bytes wide. */
        memset(blocks, 0, 16);
        i = 16;
    }

    /* Stop early enough that a 32-byte clear never runs past the buffer. */
    for (; i < total - 31; i += 32)
        __asm__ volatile ("dcbz %0,%1" :: "b" (blocks), "r" (i) : "memory");

    if (misal)
        memset((uint8_t *) blocks + total - 16, 0, 16);
}

/**
 * Same job as clear_blocks_dcbz32_ppc(), for CPUs where dcbzl clears a whole
 * 128-byte cache line, i.e. the PPC970 AKA G5.
 *
 * dcbzl is only used when the buffer starts on a 128-byte boundary; the
 * 768-byte buffer is then exactly six cache lines. A misaligned buffer, or a
 * build without dcbzl support (HAVE_DCBZL unset), falls back to memset().
 *
 * NOTE(review): the HAVE_DCBZL guard and the aligned/misaligned split were
 * missing from the reviewed text and are restored here; confirm against
 * upstream.
 *
 * @param blocks start of the 6 * 64 int16_t coefficient buffer to clear
 */
static void clear_blocks_dcbz128_ppc(int16_t *blocks)
{
    const size_t total = sizeof(int16_t) * 6 * 64;
#if HAVE_DCBZL
    size_t i;

    if ((uintptr_t) blocks & 0x7f) {
        /* We could probably also optimize this case,
         * but there's not much point as the machines
         * aren't available yet (2003-06-26). */
        memset(blocks, 0, total);
        return;
    }

    for (i = 0; i < total; i += 128)
        __asm__ volatile ("dcbzl %0,%1" :: "b" (blocks), "r" (i) : "memory");
#else
    memset(blocks, 0, total);
#endif
}

/**
 * Report how many bytes a single dcbzl sets to 0.
 *
 * update 24/06/2003: dcbz was replaced by dcbzl to get the intended effect
 * (Apple "fixed" dcbz). Unfortunately this cannot be used unless the
 * assembler knows about dcbzl, hence the HAVE_DCBZL guard.
 *
 * A 1024-byte scratch buffer is filled with 0xFF, one dcbzl is issued on its
 * middle, and the zero bytes are counted afterwards.
 *
 * NOTE(review): the HAVE_DCBZL guard, allocation check, counter, free and
 * return were missing from the reviewed text and are restored here; confirm
 * against upstream.
 *
 * @return number of zeroed bytes found in the scratch buffer; 0 when dcbzl
 *         is unavailable or the scratch buffer cannot be allocated
 */
static long check_dcbzl_effect(void)
{
    long count = 0;
#if HAVE_DCBZL
    char *fakedata = av_malloc(1024);
    char *fakedata_middle;
    long zero = 0;
    int i;

    if (!fakedata)
        return 0;

    fakedata_middle = fakedata + 512;

    memset(fakedata, 0xFF, 1024);

    /* Below the constraint "b" seems to mean "address base register"
     * in gcc-3.3 / RS/6000 speaks. Seems to avoid using r0, so....
     * The "memory" clobber tells the compiler the buffer contents changed,
     * so the count below cannot be folded from the memset() above. */
    __asm__ volatile ("dcbzl %0, %1"
                      :: "b" (fakedata_middle), "r" (zero) : "memory");

    for (i = 0; i < 1024; i++)
        if (fakedata[i] == (char) 0)
            count++;

    av_free(fakedata);
#endif

    return count;
}

#if HAVE_ALTIVEC
/**
 * Zero one block of 64 int16_t coefficients (128 bytes) with eight 16-byte
 * AltiVec vector stores.
 *
 * NOTE(review): LOAD_ZERO is assumed to declare the zero vector behind
 * zero_s16v (both presumably from libavutil/ppc/types_altivec.h); the
 * declaration and the opening `#if HAVE_ALTIVEC` were missing from the
 * reviewed text - confirm against upstream.
 * NOTE(review): vec_st presumably needs a 16-byte aligned block - confirm.
 *
 * @param block start of the 128-byte coefficient block to clear
 */
static void clear_block_altivec(int16_t *block)
{
    int offset;

    LOAD_ZERO;

    /* Byte offsets 0, 16, ..., 112: one vector store per 16 bytes. */
    for (offset = 0; offset < 128; offset += 16)
        vec_st(zero_s16v, offset, block);
}
#endif /* HAVE_ALTIVEC */

146 av_cold
void ff_blockdsp_init_ppc(BlockDSPContext
*c
, unsigned high_bit_depth
)
148 // common optimizations whether AltiVec is available or not
149 if (!high_bit_depth
) {
150 switch (check_dcbzl_effect()) {
152 c
->clear_blocks
= clear_blocks_dcbz32_ppc
;
155 c
->clear_blocks
= clear_blocks_dcbz128_ppc
;
163 if (!PPC_ALTIVEC(av_get_cpu_flags()))
167 c
->clear_block
= clear_block_altivec
;
168 #endif /* HAVE_ALTIVEC */