Commit | Line | Data |
---|---|---|
2ba45a60 DM |
1 | /* |
2 | * Copyright (c) 2002 Brian Foley | |
3 | * Copyright (c) 2002 Dieter Shirley | |
4 | * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org> | |
5 | * | |
6 | * This file is part of FFmpeg. | |
7 | * | |
8 | * FFmpeg is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License as published by the Free Software Foundation; either | |
11 | * version 2.1 of the License, or (at your option) any later version. | |
12 | * | |
13 | * FFmpeg is distributed in the hope that it will be useful, | |
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
16 | * Lesser General Public License for more details. | |
17 | * | |
18 | * You should have received a copy of the GNU Lesser General Public | |
19 | * License along with FFmpeg; if not, write to the Free Software | |
20 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
21 | */ | |
22 | ||
23 | #include "config.h" | |
24 | #if HAVE_ALTIVEC_H | |
25 | #include <altivec.h> | |
26 | #endif | |
27 | #include <string.h> | |
28 | ||
29 | #include "libavutil/attributes.h" | |
30 | #include "libavutil/cpu.h" | |
31 | #include "libavutil/mem.h" | |
32 | #include "libavutil/ppc/cpu.h" | |
33 | #include "libavutil/ppc/types_altivec.h" | |
34 | #include "libavcodec/blockdsp.h" | |
35 | ||
36 | /* ***** WARNING ***** WARNING ***** WARNING ***** */ | |
37 | /* | |
38 | * clear_blocks_dcbz32_ppc will not work properly on PowerPC processors with | |
39 | * a cache line size not equal to 32 bytes. Fortunately all processors used | |
40 | * by Apple up to at least the 7450 (AKA second generation G4) use 32-byte | |
41 | * cache lines. This is due to the use of the 'dcbz' instruction. It simply | |
42 | * clears a single cache line to zero, so you need to know the cache line | |
43 | * size to use it! It's absurd, but it's fast... | |
44 | * | |
45 | * update 24/06/2003: Apple released the G5 yesterday, with a PPC970. | |
46 | * cache line size: 128 bytes. Oops. | |
47 | * The semantics of dcbz were changed; it always clears 32 bytes. So the function | |
48 | * below will work, but will be slow. So I fixed check_dcbz_effect to use dcbzl, | |
49 | * which is defined to clear a cache line (as dcbz before). So we can still | |
50 | * distinguish, and use dcbz (32 bytes) or dcbzl (one cache line) as required. | |
51 | * | |
52 | * see <http://developer.apple.com/technotes/tn/tn2087.html> | |
53 | * and <http://developer.apple.com/technotes/tn/tn2086.html> | |
54 | */ | |
55 | static void clear_blocks_dcbz32_ppc(int16_t *blocks) | |
56 | { | |
57 | register int misal = (unsigned long) blocks & 0x00000010, i = 0; | |
58 | ||
59 | if (misal) { | |
60 | ((unsigned long *) blocks)[0] = 0L; | |
61 | ((unsigned long *) blocks)[1] = 0L; | |
62 | ((unsigned long *) blocks)[2] = 0L; | |
63 | ((unsigned long *) blocks)[3] = 0L; | |
64 | i += 16; | |
65 | } | |
66 | for (; i < sizeof(int16_t) * 6 * 64 - 31; i += 32) | |
67 | __asm__ volatile ("dcbz %0,%1" :: "b" (blocks), "r" (i) : "memory"); | |
68 | if (misal) { | |
69 | ((unsigned long *) blocks)[188] = 0L; | |
70 | ((unsigned long *) blocks)[189] = 0L; | |
71 | ((unsigned long *) blocks)[190] = 0L; | |
72 | ((unsigned long *) blocks)[191] = 0L; | |
73 | i += 16; | |
74 | } | |
75 | } | |
76 | ||
/* Counterpart of clear_blocks_dcbz32_ppc for CPUs on which dcbzl zeroes a
 * whole 128-byte cache line, i.e. the PPC970 AKA G5.
 * Uses memset() when dcbzl is not available at build time or when the
 * buffer does not start on a 128-byte boundary. */
static void clear_blocks_dcbz128_ppc(int16_t *blocks)
{
#if HAVE_DCBZL
    register int offset = 0;
    register int unaligned = (unsigned long) blocks & 0x0000007f;

    if (!unaligned) {
        /* Aligned buffer: one dcbzl per 128-byte line. */
        while (offset < sizeof(int16_t) * 6 * 64) {
            __asm__ volatile ("dcbzl %0,%1" :: "b" (blocks), "r" (offset) : "memory");
            offset += 128;
        }
        return;
    }
    /* The unaligned case has no dedicated dcbzl path and is handled by the
     * plain memset() below. */
#endif
    memset(blocks, 0, sizeof(int16_t) * 6 * 64);
}
97 | ||
/* Report how many bytes a single dcbzl instruction sets to zero.
 * update 24/06/2003: dcbzl is used instead of dcbz to get the intended
 * effect (Apple "fixed" dcbz). Unfortunately this cannot be used unless the
 * assembler knows about dcbzl ...
 *
 * Returns the number of zero bytes found in a 1024-byte scratch buffer after
 * one dcbzl on its middle, or 0 when dcbzl support is not compiled in or the
 * scratch buffer cannot be allocated. */
static long check_dcbzl_effect(void)
{
    long count = 0;
#if HAVE_DCBZL
    register char *fakedata = av_malloc(1024);
    register char *fakedata_middle;
    register long zero = 0, i = 0;

    if (!fakedata)
        return 0L;

    fakedata_middle = fakedata + 512;

    /* Fill with a non-zero pattern so that zeroed bytes can be counted. */
    memset(fakedata, 0xFF, 1024);

    /* Below the constraint "b" seems to mean "address base register"
     * in gcc-3.3 / RS/6000 speaks. Seems to avoid using r0, so....
     * The "memory" clobber tells the compiler that the instruction writes
     * memory; without it the loads below could be assumed to still see the
     * 0xFF pattern stored by memset(). */
    __asm__ volatile ("dcbzl %0, %1" :: "b" (fakedata_middle), "r" (zero) : "memory");

    for (i = 0; i < 1024; i++)
        if (fakedata[i] == (char) 0)
            count++;

    av_free(fakedata);
#endif

    return count;
}
130 | ||
#if HAVE_ALTIVEC
/* Zero a single coefficient block with eight vec_st stores at byte offsets
 * 0, 16, ..., 112. zero_s16v is expected to be made available by LOAD_ZERO,
 * which is defined outside this file. */
static void clear_block_altivec(int16_t *block)
{
    LOAD_ZERO;
    int offset;

    for (offset = 0; offset < 128; offset += 16)
        vec_st(zero_s16v, offset, block);
}
#endif /* HAVE_ALTIVEC */
145 | ||
146 | av_cold void ff_blockdsp_init_ppc(BlockDSPContext *c, unsigned high_bit_depth) | |
147 | { | |
148 | // common optimizations whether AltiVec is available or not | |
149 | if (!high_bit_depth) { | |
150 | switch (check_dcbzl_effect()) { | |
151 | case 32: | |
152 | c->clear_blocks = clear_blocks_dcbz32_ppc; | |
153 | break; | |
154 | case 128: | |
155 | c->clear_blocks = clear_blocks_dcbz128_ppc; | |
156 | break; | |
157 | default: | |
158 | break; | |
159 | } | |
160 | } | |
161 | ||
162 | #if HAVE_ALTIVEC | |
163 | if (!PPC_ALTIVEC(av_get_cpu_flags())) | |
164 | return; | |
165 | ||
166 | if (!high_bit_depth) | |
167 | c->clear_block = clear_block_altivec; | |
168 | #endif /* HAVE_ALTIVEC */ | |
169 | } |