/*
 * Copyright (c) 2012 Stefano Sabatini
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
/**
 * @file
 * flite voice synth source
 */
26 #include <flite/flite.h>
27 #include "libavutil/channel_layout.h"
28 #include "libavutil/file.h"
29 #include "libavutil/opt.h"
41 int16_t *wave_samples
;
45 struct voice_entry
*voice_entry
;
47 int frame_nb_samples
; ///< number of samples per frame
/* Byte offset of member x inside FliteContext, used by the option table. */
#define OFFSET(x) offsetof(FliteContext, x)
/* Flags applied to every option. The expansion is parenthesized so that the
 * '|' cannot regroup with operators surrounding a use of the macro. */
#define FLAGS (AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_FILTERING_PARAM)
53 static const AVOption flite_options
[] = {
54 { "list_voices", "list voices and exit", OFFSET(list_voices
), AV_OPT_TYPE_INT
, {.i64
=0}, 0, 1, FLAGS
},
55 { "nb_samples", "set number of samples per frame", OFFSET(frame_nb_samples
), AV_OPT_TYPE_INT
, {.i64
=512}, 0, INT_MAX
, FLAGS
},
56 { "n", "set number of samples per frame", OFFSET(frame_nb_samples
), AV_OPT_TYPE_INT
, {.i64
=512}, 0, INT_MAX
, FLAGS
},
57 { "text", "set text to speak", OFFSET(text
), AV_OPT_TYPE_STRING
, {.str
=NULL
}, CHAR_MIN
, CHAR_MAX
, FLAGS
},
58 { "textfile", "set filename of the text to speak", OFFSET(textfile
), AV_OPT_TYPE_STRING
, {.str
=NULL
}, CHAR_MIN
, CHAR_MAX
, FLAGS
},
59 { "v", "set voice", OFFSET(voice_str
), AV_OPT_TYPE_STRING
, {.str
="kal"}, CHAR_MIN
, CHAR_MAX
, FLAGS
},
60 { "voice", "set voice", OFFSET(voice_str
), AV_OPT_TYPE_STRING
, {.str
="kal"}, CHAR_MIN
, CHAR_MAX
, FLAGS
},
64 AVFILTER_DEFINE_CLASS(flite
);
/* Non-zero once the flite library has been initialized.
 * NOTE(review): presumably checked in init() so that flite_init() runs only
 * once per process -- confirm against the caller. */
static volatile int flite_inited = 0;
68 /* declare functions for all the supported voices */
69 #define DECLARE_REGISTER_VOICE_FN(name) \
70 cst_voice *register_cmu_us_## name(const char *); \
71 void unregister_cmu_us_## name(cst_voice *);
72 DECLARE_REGISTER_VOICE_FN(awb
);
73 DECLARE_REGISTER_VOICE_FN(kal
);
74 DECLARE_REGISTER_VOICE_FN(kal16
);
75 DECLARE_REGISTER_VOICE_FN(rms
);
76 DECLARE_REGISTER_VOICE_FN(slt
);
80 cst_voice
* (*register_fn
)(const char *);
81 void (*unregister_fn
)(cst_voice
*);
86 #define MAKE_VOICE_STRUCTURE(voice_name) { \
87 .name = #voice_name, \
88 .register_fn = register_cmu_us_ ## voice_name, \
89 .unregister_fn = unregister_cmu_us_ ## voice_name, \
91 static struct voice_entry voice_entries
[] = {
92 MAKE_VOICE_STRUCTURE(awb
),
93 MAKE_VOICE_STRUCTURE(kal
),
94 MAKE_VOICE_STRUCTURE(kal16
),
95 MAKE_VOICE_STRUCTURE(rms
),
96 MAKE_VOICE_STRUCTURE(slt
),
99 static void list_voices(void *log_ctx
, const char *sep
)
101 int i
, n
= FF_ARRAY_ELEMS(voice_entries
);
102 for (i
= 0; i
< n
; i
++)
103 av_log(log_ctx
, AV_LOG_INFO
, "%s%s",
104 voice_entries
[i
].name
, i
< (n
-1) ? sep
: "\n");
107 static int select_voice(struct voice_entry
**entry_ret
, const char *voice_name
, void *log_ctx
)
111 for (i
= 0; i
< FF_ARRAY_ELEMS(voice_entries
); i
++) {
112 struct voice_entry
*entry
= &voice_entries
[i
];
113 if (!strcmp(entry
->name
, voice_name
)) {
115 entry
->voice
= entry
->register_fn(NULL
);
117 av_log(log_ctx
, AV_LOG_ERROR
,
118 "Could not register voice '%s'\n", voice_name
);
119 return AVERROR_UNKNOWN
;
121 entry
->usage_count
++;
127 av_log(log_ctx
, AV_LOG_ERROR
, "Could not find voice '%s'\n", voice_name
);
128 av_log(log_ctx
, AV_LOG_INFO
, "Choose between the voices: ");
129 list_voices(log_ctx
, ", ");
131 return AVERROR(EINVAL
);
134 static av_cold
int init(AVFilterContext
*ctx
)
136 FliteContext
*flite
= ctx
->priv
;
139 if (flite
->list_voices
) {
140 list_voices(ctx
, "\n");
145 if (flite_init() < 0) {
146 av_log(ctx
, AV_LOG_ERROR
, "flite initialization failed\n");
147 return AVERROR_UNKNOWN
;
152 if ((ret
= select_voice(&flite
->voice_entry
, flite
->voice_str
, ctx
)) < 0)
154 flite
->voice
= flite
->voice_entry
->voice
;
156 if (flite
->textfile
&& flite
->text
) {
157 av_log(ctx
, AV_LOG_ERROR
,
158 "Both text and textfile options set: only one must be specified\n");
159 return AVERROR(EINVAL
);
162 if (flite
->textfile
) {
166 if ((ret
= av_file_map(flite
->textfile
, &textbuf
, &textbuf_size
, 0, ctx
)) < 0) {
167 av_log(ctx
, AV_LOG_ERROR
,
168 "The text file '%s' could not be read: %s\n",
169 flite
->textfile
, av_err2str(ret
));
173 if (!(flite
->text
= av_malloc(textbuf_size
+1)))
174 return AVERROR(ENOMEM
);
175 memcpy(flite
->text
, textbuf
, textbuf_size
);
176 flite
->text
[textbuf_size
] = 0;
177 av_file_unmap(textbuf
, textbuf_size
);
181 av_log(ctx
, AV_LOG_ERROR
,
182 "No speech text specified, specify the 'text' or 'textfile' option\n");
183 return AVERROR(EINVAL
);
186 /* synth all the file data in block */
187 flite
->wave
= flite_text_to_wave(flite
->text
, flite
->voice
);
188 flite
->wave_samples
= flite
->wave
->samples
;
189 flite
->wave_nb_samples
= flite
->wave
->num_samples
;
193 static av_cold
void uninit(AVFilterContext
*ctx
)
195 FliteContext
*flite
= ctx
->priv
;
197 if (!--flite
->voice_entry
->usage_count
)
198 flite
->voice_entry
->unregister_fn(flite
->voice
);
200 flite
->voice_entry
= NULL
;
201 delete_wave(flite
->wave
);
205 static int query_formats(AVFilterContext
*ctx
)
207 FliteContext
*flite
= ctx
->priv
;
209 AVFilterChannelLayouts
*chlayouts
= NULL
;
210 int64_t chlayout
= av_get_default_channel_layout(flite
->wave
->num_channels
);
211 AVFilterFormats
*sample_formats
= NULL
;
212 AVFilterFormats
*sample_rates
= NULL
;
214 ff_add_channel_layout(&chlayouts
, chlayout
);
215 ff_set_common_channel_layouts(ctx
, chlayouts
);
216 ff_add_format(&sample_formats
, AV_SAMPLE_FMT_S16
);
217 ff_set_common_formats(ctx
, sample_formats
);
218 ff_add_format(&sample_rates
, flite
->wave
->sample_rate
);
219 ff_set_common_samplerates (ctx
, sample_rates
);
224 static int config_props(AVFilterLink
*outlink
)
226 AVFilterContext
*ctx
= outlink
->src
;
227 FliteContext
*flite
= ctx
->priv
;
229 outlink
->sample_rate
= flite
->wave
->sample_rate
;
230 outlink
->time_base
= (AVRational
){1, flite
->wave
->sample_rate
};
232 av_log(ctx
, AV_LOG_VERBOSE
, "voice:%s fmt:%s sample_rate:%d\n",
234 av_get_sample_fmt_name(outlink
->format
), outlink
->sample_rate
);
238 static int request_frame(AVFilterLink
*outlink
)
241 FliteContext
*flite
= outlink
->src
->priv
;
242 int nb_samples
= FFMIN(flite
->wave_nb_samples
, flite
->frame_nb_samples
);
247 samplesref
= ff_get_audio_buffer(outlink
, nb_samples
);
249 return AVERROR(ENOMEM
);
251 memcpy(samplesref
->data
[0], flite
->wave_samples
,
252 nb_samples
* flite
->wave
->num_channels
* 2);
253 samplesref
->pts
= flite
->pts
;
254 av_frame_set_pkt_pos(samplesref
, -1);
255 av_frame_set_sample_rate(samplesref
, flite
->wave
->sample_rate
);
256 flite
->pts
+= nb_samples
;
257 flite
->wave_samples
+= nb_samples
* flite
->wave
->num_channels
;
258 flite
->wave_nb_samples
-= nb_samples
;
260 return ff_filter_frame(outlink
, samplesref
);
263 static const AVFilterPad flite_outputs
[] = {
266 .type
= AVMEDIA_TYPE_AUDIO
,
267 .config_props
= config_props
,
268 .request_frame
= request_frame
,
273 AVFilter ff_asrc_flite
= {
275 .description
= NULL_IF_CONFIG_SMALL("Synthesize voice from text using libflite."),
276 .query_formats
= query_formats
,
279 .priv_size
= sizeof(FliteContext
),
281 .outputs
= flite_outputs
,
282 .priv_class
= &flite_class
,