Commit | Line | Data |
---|---|---|
2ba45a60 DM |
1 | /* |
2 | * MMX optimized discrete wavelet transform | |
3 | * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> | |
4 | * Copyright (c) 2010 David Conrad | |
5 | * | |
6 | * This file is part of FFmpeg. | |
7 | * | |
8 | * FFmpeg is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License as published by the Free Software Foundation; either | |
11 | * version 2.1 of the License, or (at your option) any later version. | |
12 | * | |
13 | * FFmpeg is distributed in the hope that it will be useful, | |
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
16 | * Lesser General Public License for more details. | |
17 | * | |
18 | * You should have received a copy of the GNU Lesser General Public | |
19 | * License along with FFmpeg; if not, write to the Free Software | |
20 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
21 | */ | |
22 | ||
23 | #include "libavutil/x86/asm.h" | |
24 | #include "libavutil/x86/cpu.h" | |
25 | #include "dirac_dwt.h" | |
26 | ||
/**
 * Generate the C wrappers around one family of externally defined
 * (presumably assembly) inverse-DWT compose routines.
 *
 * @param ext   suffix pasted onto every generated and referenced function
 *              name (this file uses _mmx and _sse2)
 * @param align block size used to split the work between the external routine
 *              and the scalar C code; it is used as a bit mask, so it is
 *              expected to be a power of two
 *
 * Vertical wrappers: the trailing (width % align) columns are computed in C
 * with the COMPOSE_* macros, then the external routine is called with the
 * width rounded down to a multiple of align.
 *
 * Horizontal Haar wrappers: the external routine is called with the full
 * width first; the C loop afterwards writes the last (w/2 % align) output
 * pairs from tmp[] and the second half of b[].
 * NOTE(review): this relies on the external routine having filled tmp[] for
 * those positions -- confirm against the assembly.
 */
#define COMPOSE_VERTICAL(ext, align) \
void ff_vertical_compose53iL0##ext(IDWTELEM *b0, IDWTELEM *b1, \
                                   IDWTELEM *b2, int width); \
void ff_vertical_compose_dirac53iH0##ext(IDWTELEM *b0, IDWTELEM *b1, \
                                         IDWTELEM *b2, int width); \
void ff_vertical_compose_dd137iL0##ext(IDWTELEM *b0, IDWTELEM *b1, \
                                       IDWTELEM *b2, IDWTELEM *b3, \
                                       IDWTELEM *b4, int width); \
void ff_vertical_compose_dd97iH0##ext(IDWTELEM *b0, IDWTELEM *b1, \
                                      IDWTELEM *b2, IDWTELEM *b3, \
                                      IDWTELEM *b4, int width); \
void ff_vertical_compose_haar##ext(IDWTELEM *b0, IDWTELEM *b1, int width); \
void ff_horizontal_compose_haar0i##ext(IDWTELEM *b, IDWTELEM *tmp, int w); \
void ff_horizontal_compose_haar1i##ext(IDWTELEM *b, IDWTELEM *tmp, int w); \
 \
static void vertical_compose53iL0##ext(IDWTELEM *b0, IDWTELEM *b1, \
                                       IDWTELEM *b2, int width) \
{ \
    const int simd_width = width - (width & ((align) - 1)); \
    int col; \
 \
    /* scalar tail: columns beyond the last full block */ \
    for (col = simd_width; col < width; col++) \
        b1[col] = COMPOSE_53iL0(b0[col], b1[col], b2[col]); \
 \
    ff_vertical_compose53iL0##ext(b0, b1, b2, simd_width); \
} \
 \
static void vertical_compose_dirac53iH0##ext(IDWTELEM *b0, IDWTELEM *b1, \
                                             IDWTELEM *b2, int width) \
{ \
    const int simd_width = width - (width & ((align) - 1)); \
    int col; \
 \
    for (col = simd_width; col < width; col++) \
        b1[col] = COMPOSE_DIRAC53iH0(b0[col], b1[col], b2[col]); \
 \
    ff_vertical_compose_dirac53iH0##ext(b0, b1, b2, simd_width); \
} \
 \
static void vertical_compose_dd137iL0##ext(IDWTELEM *b0, IDWTELEM *b1, \
                                           IDWTELEM *b2, IDWTELEM *b3, \
                                           IDWTELEM *b4, int width) \
{ \
    const int simd_width = width - (width & ((align) - 1)); \
    int col; \
 \
    for (col = simd_width; col < width; col++) \
        b2[col] = COMPOSE_DD137iL0(b0[col], b1[col], b2[col], \
                                   b3[col], b4[col]); \
 \
    ff_vertical_compose_dd137iL0##ext(b0, b1, b2, b3, b4, simd_width); \
} \
 \
static void vertical_compose_dd97iH0##ext(IDWTELEM *b0, IDWTELEM *b1, \
                                          IDWTELEM *b2, IDWTELEM *b3, \
                                          IDWTELEM *b4, int width) \
{ \
    const int simd_width = width - (width & ((align) - 1)); \
    int col; \
 \
    for (col = simd_width; col < width; col++) \
        b2[col] = COMPOSE_DD97iH0(b0[col], b1[col], b2[col], \
                                  b3[col], b4[col]); \
 \
    ff_vertical_compose_dd97iH0##ext(b0, b1, b2, b3, b4, simd_width); \
} \
 \
static void vertical_compose_haar##ext(IDWTELEM *b0, IDWTELEM *b1, int width) \
{ \
    const int simd_width = width - (width & ((align) - 1)); \
    int col; \
 \
    for (col = simd_width; col < width; col++) { \
        /* b1 is derived from the freshly updated b0, so order matters */ \
        b0[col] = COMPOSE_HAARiL0(b0[col], b1[col]); \
        b1[col] = COMPOSE_HAARiH0(b1[col], b0[col]); \
    } \
 \
    ff_vertical_compose_haar##ext(b0, b1, simd_width); \
} \
 \
static void horizontal_compose_haar0i##ext(IDWTELEM *b, IDWTELEM *tmp, int w) \
{ \
    const int half = w >> 1; \
    int k = half & ~((align) - 1); \
 \
    ff_horizontal_compose_haar0i##ext(b, tmp, w); \
 \
    /* write the output pairs past the last full block in C */ \
    while (k < half) { \
        b[2 * k]     = tmp[k]; \
        b[2 * k + 1] = COMPOSE_HAARiH0(b[k + half], tmp[k]); \
        k++; \
    } \
} \
 \
static void horizontal_compose_haar1i##ext(IDWTELEM *b, IDWTELEM *tmp, int w) \
{ \
    const int half = w >> 1; \
    int k = half & ~((align) - 1); \
 \
    ff_horizontal_compose_haar1i##ext(b, tmp, w); \
 \
    /* same as haar0i, with each result rounded and halved */ \
    while (k < half) { \
        b[2 * k]     = (tmp[k] + 1) >> 1; \
        b[2 * k + 1] = (COMPOSE_HAARiH0(b[k + half], tmp[k]) + 1) >> 1; \
        k++; \
    } \
}
111 | ||
#if HAVE_YASM
/* Instantiate the wrapper set once per SIMD flavour; the MMX variants are
 * only built for non-x86-64 targets. */
#if !ARCH_X86_64
COMPOSE_VERTICAL(_mmx, 4)
#endif
COMPOSE_VERTICAL(_sse2, 8)


void ff_horizontal_compose_dd97i_ssse3(IDWTELEM *b, IDWTELEM *tmp, int w);

/**
 * Horizontal DD9/7 compose wrapper for the SSSE3 routine.
 *
 * Calls the external routine with the full width, then writes the last
 * (w/2 % 8) output pairs in C, each rounded and shifted right by one.
 *
 * @param b   coefficient row; outputs are written interleaved at b[2*k] and
 *            b[2*k+1], and the second half b[k + w/2] is read as input
 * @param tmp scratch row; the tail loop reads tmp[k-1] through tmp[k+2]
 * @param w   row width in coefficients
 *
 * NOTE(review): the tail loop reads tmp[] just outside [0, w/2); presumably
 * the external routine fills tmp[] including that edge extension -- confirm
 * against the assembly.
 */
static void horizontal_compose_dd97i_ssse3(IDWTELEM *b, IDWTELEM *tmp, int w)
{
    const int half = w >> 1;
    int k = half & ~7;      /* first index past the last full block of 8 */

    ff_horizontal_compose_dd97i_ssse3(b, tmp, w);

    while (k < half) {
        b[2 * k]     = (tmp[k] + 1) >> 1;
        b[2 * k + 1] = (COMPOSE_DD97iH0(tmp[k - 1], tmp[k], b[k + half],
                                        tmp[k + 1], tmp[k + 2]) + 1) >> 1;
        k++;
    }
}
#endif
133 | ||
134 | void ff_spatial_idwt_init_mmx(DWTContext *d, enum dwt_type type) | |
135 | { | |
136 | #if HAVE_YASM | |
137 | int mm_flags = av_get_cpu_flags(); | |
138 | ||
139 | #if !ARCH_X86_64 | |
140 | if (!(mm_flags & AV_CPU_FLAG_MMX)) | |
141 | return; | |
142 | ||
143 | switch (type) { | |
144 | case DWT_DIRAC_DD9_7: | |
145 | d->vertical_compose_l0 = (void*)vertical_compose53iL0_mmx; | |
146 | d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_mmx; | |
147 | break; | |
148 | case DWT_DIRAC_LEGALL5_3: | |
149 | d->vertical_compose_l0 = (void*)vertical_compose53iL0_mmx; | |
150 | d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0_mmx; | |
151 | break; | |
152 | case DWT_DIRAC_DD13_7: | |
153 | d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0_mmx; | |
154 | d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_mmx; | |
155 | break; | |
156 | case DWT_DIRAC_HAAR0: | |
157 | d->vertical_compose = (void*)vertical_compose_haar_mmx; | |
158 | d->horizontal_compose = horizontal_compose_haar0i_mmx; | |
159 | break; | |
160 | case DWT_DIRAC_HAAR1: | |
161 | d->vertical_compose = (void*)vertical_compose_haar_mmx; | |
162 | d->horizontal_compose = horizontal_compose_haar1i_mmx; | |
163 | break; | |
164 | } | |
165 | #endif | |
166 | ||
167 | if (!(mm_flags & AV_CPU_FLAG_SSE2)) | |
168 | return; | |
169 | ||
170 | switch (type) { | |
171 | case DWT_DIRAC_DD9_7: | |
172 | d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2; | |
173 | d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2; | |
174 | break; | |
175 | case DWT_DIRAC_LEGALL5_3: | |
176 | d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2; | |
177 | d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0_sse2; | |
178 | break; | |
179 | case DWT_DIRAC_DD13_7: | |
180 | d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0_sse2; | |
181 | d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2; | |
182 | break; | |
183 | case DWT_DIRAC_HAAR0: | |
184 | d->vertical_compose = (void*)vertical_compose_haar_sse2; | |
185 | d->horizontal_compose = horizontal_compose_haar0i_sse2; | |
186 | break; | |
187 | case DWT_DIRAC_HAAR1: | |
188 | d->vertical_compose = (void*)vertical_compose_haar_sse2; | |
189 | d->horizontal_compose = horizontal_compose_haar1i_sse2; | |
190 | break; | |
191 | } | |
192 | ||
193 | if (!(mm_flags & AV_CPU_FLAG_SSSE3)) | |
194 | return; | |
195 | ||
196 | switch (type) { | |
197 | case DWT_DIRAC_DD9_7: | |
198 | d->horizontal_compose = horizontal_compose_dd97i_ssse3; | |
199 | break; | |
200 | } | |
201 | #endif // HAVE_YASM | |
202 | } |