2 * Copyright (c) 2012 Pavel Koshevoy <pkoshevoy at gmail dot com>
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 * tempo scaling audio filter -- an implementation of WSOLA algorithm
25 * Based on MIT licensed yaeAudioTempoFilter.h and yaeAudioFragment.h
26 * from Apprentice Video player by Pavel Koshevoy.
27 * https://sourceforge.net/projects/apprenticevideo/
29 * An explanation of SOLA algorithm is available at
30 * http://www.surina.net/article/time-and-pitch-scaling.html
32 * WSOLA is very similar to SOLA, only one major difference exists between
33 * these algorithms. SOLA shifts audio fragments along the output stream,
34 * whereas WSOLA shifts audio fragments along the input stream.
36 * The advantage of WSOLA algorithm is that the overlap region size is
37 * always the same, therefore the blending function is constant and can be pre-computed.
42 #include "libavcodec/avfft.h"
43 #include "libavutil/avassert.h"
44 #include "libavutil/avstring.h"
45 #include "libavutil/channel_layout.h"
46 #include "libavutil/eval.h"
47 #include "libavutil/opt.h"
48 #include "libavutil/samplefmt.h"
54 * A fragment of audio waveform
57 // index of the first sample of this fragment in the overall waveform;
58 // 0: input sample position
59 // 1: output sample position
62 // original packed multi-channel samples:
65 // number of samples in this fragment:
68 // rDFT transform of the down-mixed mono fragment, used for
69 // fast waveform alignment via correlation in frequency domain:
74 * Filter state machine states
// Only one enumerator is visible here; the code further down also uses
// YAE_LOAD_FRAGMENT, YAE_ADJUST_POSITION, YAE_RELOAD_FRAGMENT and
// YAE_FLUSH_OUTPUT as values of the filter state.
80 YAE_OUTPUT_OVERLAP_ADD
,
85 * Filter state machine
90 // ring-buffer of input samples, necessary because sometimes the
91 // input fragment position may be adjusted backwards:
94 // ring-buffer maximum capacity, expressed in sample rate time base:
97 // ring-buffer housekeeping:
102 // 0: input sample position corresponding to the ring buffer tail
103 // 1: output sample position
// packed sample format the filter was configured for:
107 enum AVSampleFormat format
;
109 // number of channels:
112 // number of bytes to skip from one sample to the next, across multiple channels;
113 // stride = (number-of-channels * bits-per-sample-per-channel) / 8
116 // fragment window size, power-of-two integer:
119 // Hann window coefficients, for feathering
120 // (blending) the overlapping fragment region:
123 // tempo scaling factor:
126 // a snapshot of previous fragment input and output position values
127 // captured when the tempo scale factor was set most recently:
130 // two-slot buffer holding the current and the previous fragment:
131 AudioFragment frag
[2];
133 // current fragment index (its parity selects the slot in frag[]):
139 // for fast correlation calculation in frequency domain:
140 RDFTContext
*real_to_complex
;
141 RDFTContext
*complex_to_real
;
142 FFTSample
*correlation
;
144 // for managing AVFilterPad.request_frame and AVFilterPad.filter_frame
// running totals of samples received from upstream and sent downstream:
148 uint64_t nsamples_in
;
149 uint64_t nsamples_out
;
// Byte offset of a member inside ATempoContext, used by the option table.
152 #define OFFSET(x) offsetof(ATempoContext, x)
// Option table: a single "tempo" option stored in ATempoContext.tempo,
// of type double, default 1.0, accepted range [0.5, 2.0].
154 static const AVOption atempo_options
[] = {
155 { "tempo", "set tempo scale factor",
156 OFFSET(tempo
), AV_OPT_TYPE_DOUBLE
, { .dbl
= 1.0 }, 0.5, 2.0,
157 AV_OPT_FLAG_AUDIO_PARAM
| AV_OPT_FLAG_FILTERING_PARAM
},
// Presumably defines the `atempo_class` object that the filter definition
// points its .priv_class at -- confirm against the macro definition.
161 AVFILTER_DEFINE_CLASS(atempo
);
// Return a pointer to the current fragment: the slot of atempo->frag[]
// selected by the parity of atempo->nfrag.
163 inline static AudioFragment
*yae_curr_frag(ATempoContext
*atempo
)
165 return &atempo
->frag
[atempo
->nfrag
% 2];
// Return a pointer to the previous fragment: the slot of atempo->frag[]
// that yae_curr_frag() does NOT select (opposite parity of atempo->nfrag).
168 inline static AudioFragment
*yae_prev_frag(ATempoContext
*atempo
)
170 return &atempo
->frag
[(atempo
->nfrag
+ 1) % 2];
174 * Reset filter to initial state, do not deallocate existing local buffers.
// Put the filter back into its starting state without freeing the sample
// buffers: state, positions, origins, per-fragment bookkeeping and the
// in/out sample counters are reset, and any pending output frame is dropped.
// NOTE(review): some statements of this function may be missing from this
// view; only the resets shown below are documented.
176 static void yae_clear(ATempoContext
*atempo
)
// restart the state machine at the fragment-loading step:
183 atempo
->state
= YAE_LOAD_FRAGMENT
;
// input [0] and output [1] stream positions:
185 atempo
->position
[0] = 0;
186 atempo
->position
[1] = 0;
// reference positions used when measuring drift:
188 atempo
->origin
[0] = 0;
189 atempo
->origin
[1] = 0;
// both fragment slots become empty
// (frag[0] positions are overwritten again further down):
191 atempo
->frag
[0].position
[0] = 0;
192 atempo
->frag
[0].position
[1] = 0;
193 atempo
->frag
[0].nsamples
= 0;
195 atempo
->frag
[1].position
[0] = 0;
196 atempo
->frag
[1].position
[1] = 0;
197 atempo
->frag
[1].nsamples
= 0;
199 // shift left position of 1st fragment by half a window
200 // so that no re-normalization would be required for
201 // the left half of the 1st fragment:
202 atempo
->frag
[0].position
[0] = -(int64_t)(atempo
->window
/ 2);
203 atempo
->frag
[0].position
[1] = -(int64_t)(atempo
->window
/ 2);
// discard any partially filled output frame and its end marker:
205 av_frame_free(&atempo
->dst_buffer
);
207 atempo
->dst_end
= NULL
;
// restart the sample counters:
209 atempo
->nsamples_in
= 0;
210 atempo
->nsamples_out
= 0;
214 * Reset filter to initial state and deallocate all buffers.
// Free every heap buffer owned by the context (fragment sample data and
// spectra, ring buffer, Hann table, correlation scratch) and destroy both
// rDFT contexts. av_freep() is given the address of each pointer and the
// rDFT pointers are set to NULL explicitly.
216 static void yae_release_buffers(ATempoContext
*atempo
)
// per-fragment packed samples:
220 av_freep(&atempo
->frag
[0].data
);
221 av_freep(&atempo
->frag
[1].data
);
// per-fragment rDFT data:
222 av_freep(&atempo
->frag
[0].xdat
);
223 av_freep(&atempo
->frag
[1].xdat
);
// ring buffer, window coefficients and correlation scratch space:
225 av_freep(&atempo
->buffer
);
226 av_freep(&atempo
->hann
);
227 av_freep(&atempo
->correlation
);
// tear down the forward and inverse transforms:
229 av_rdft_end(atempo
->real_to_complex
);
230 atempo
->real_to_complex
= NULL
;
232 av_rdft_end(atempo
->complex_to_real
);
233 atempo
->complex_to_real
= NULL
;
236 /* av_realloc is not aligned enough; fortunately, the data does not need to be preserved */
// Allocate `field` with av_malloc(field_size). In the visible failure path
// all filter buffers are released and AVERROR(ENOMEM) is returned from the
// function that expands the macro. The expansion refers to a variable named
// `atempo`, so the macro is only usable where one is in scope.
238 #define RE_MALLOC_OR_FAIL(field, field_size) \
241 field = av_malloc(field_size); \
243 yae_release_buffers(atempo); \
244 return AVERROR(ENOMEM); \
249 * Prepare filter for processing audio data of given format,
250 * sample rate and number of channels.
// Configure the filter for a sample format, sample rate and channel count:
// derive the stride and window size, (re)allocate the fragment, correlation,
// ring and Hann buffers, and create the forward/inverse rDFT contexts.
// Returns AVERROR(ENOMEM) when an allocation or rDFT setup fails; the
// success return is not visible in this view.
252 static int yae_reset(ATempoContext
*atempo
,
253 enum AVSampleFormat format
,
257 const int sample_size
= av_get_bytes_per_sample(format
);
258 uint32_t nlevels
= 0;
262 atempo
->format
= format
;
263 atempo
->channels
= channels
;
// bytes occupied by one multi-channel sample:
264 atempo
->stride
= sample_size
* channels
;
266 // pick a segment window size:
267 atempo
->window
= sample_rate
/ 24;
269 // adjust window size to be a power-of-two integer:
270 nlevels
= av_log2(atempo
->window
);
// `pot` is assigned on a line not visible here; presumably 1 << nlevels
// -- TODO confirm.
272 av_assert0(pot
<= atempo
->window
);
// window is not already a power of two: round it up to the next one
274 if (pot
< atempo
->window
) {
275 atempo
->window
= pot
* 2;
279 // initialize audio fragment buffers:
280 RE_MALLOC_OR_FAIL(atempo
->frag
[0].data
, atempo
->window
* atempo
->stride
);
281 RE_MALLOC_OR_FAIL(atempo
->frag
[1].data
, atempo
->window
* atempo
->stride
);
282 RE_MALLOC_OR_FAIL(atempo
->frag
[0].xdat
, atempo
->window
* sizeof(FFTComplex
));
283 RE_MALLOC_OR_FAIL(atempo
->frag
[1].xdat
, atempo
->window
* sizeof(FFTComplex
));
285 // initialize rDFT contexts:
// drop any contexts left over from a previous configuration first
286 av_rdft_end(atempo
->real_to_complex
);
287 atempo
->real_to_complex
= NULL
;
289 av_rdft_end(atempo
->complex_to_real
);
290 atempo
->complex_to_real
= NULL
;
// forward (real -> complex) transform:
292 atempo
->real_to_complex
= av_rdft_init(nlevels
+ 1, DFT_R2C
);
293 if (!atempo
->real_to_complex
) {
294 yae_release_buffers(atempo
);
295 return AVERROR(ENOMEM
);
// inverse (complex -> real) transform:
298 atempo
->complex_to_real
= av_rdft_init(nlevels
+ 1, IDFT_C2R
);
299 if (!atempo
->complex_to_real
) {
300 yae_release_buffers(atempo
);
301 return AVERROR(ENOMEM
);
// scratch buffer receiving the cross-correlation result:
304 RE_MALLOC_OR_FAIL(atempo
->correlation
, atempo
->window
* sizeof(FFTComplex
));
// the ring buffer holds three windows worth of input samples:
306 atempo
->ring
= atempo
->window
* 3;
307 RE_MALLOC_OR_FAIL(atempo
->buffer
, atempo
->ring
* atempo
->stride
);
309 // initialize the Hann window function:
310 RE_MALLOC_OR_FAIL(atempo
->hann
, atempo
->window
* sizeof(float));
// hann[i] = 0.5 * (1 - cos(2 * pi * i / (window - 1)))
// NOTE(review): the divisor (window - 1) is zero when window is 1, which
// sample_rate / 24 can produce for very low sample rates -- confirm that
// such rates cannot reach this function.
312 for (i
= 0; i
< atempo
->window
; i
++) {
313 double t
= (double)i
/ (double)(atempo
->window
- 1);
314 double h
= 0.5 * (1.0 - cos(2.0 * M_PI
* t
));
315 atempo
->hann
[i
] = (float)h
;
// Parse a tempo value from text and apply it to the filter.
// Returns AVERROR(EINVAL) when the text is rejected or the value lies
// outside [0.5, 2.0]; the success return is not visible in this view.
322 static int yae_set_tempo(AVFilterContext
*ctx
, const char *arg_tempo
)
324 const AudioFragment
*prev
;
325 ATempoContext
*atempo
= ctx
->priv
;
// `tail` receives the end-of-parse pointer from av_strtod; the check that
// guards the error below is on a line not visible here.
327 double tempo
= av_strtod(arg_tempo
, &tail
);
330 av_log(ctx
, AV_LOG_ERROR
, "Invalid tempo value '%s'\n", arg_tempo
);
331 return AVERROR(EINVAL
);
// same limits as declared for the "tempo" option
334 if (tempo
< 0.5 || tempo
> 2.0) {
335 av_log(ctx
, AV_LOG_ERROR
, "Tempo value %f exceeds [0.5, 2.0] range\n",
337 return AVERROR(EINVAL
);
// remember where the previous fragment's midpoint sits in the input and
// output streams, so drift is measured from the point of the tempo change
340 prev
= yae_prev_frag(atempo
);
341 atempo
->origin
[0] = prev
->position
[0] + atempo
->window
/ 2;
342 atempo
->origin
[1] = prev
->position
[1] + atempo
->window
/ 2;
343 atempo
->tempo
= tempo
;
348 * A helper macro for initializing complex data buffer with scalar data
// Fill frag->xdat with one FFTSample per multi-channel sample of the fragment,
// reading `scalar_type` values from `src`. It expects `atempo`, `frag`, `src`
// and the temporaries it uses (`tmp`, `i`) to exist at the expansion site.
// Single channel: each sample is converted to FFTSample and stored as is.
// Several channels: for every sample the visible lines compute clamped
// magnitudes (`s`, `si`, limited to scalar_max) of each channel's value; the
// lines that pick the stored value are not visible in this view.
351 #define yae_init_xdat(scalar_type, scalar_max) \
353 const uint8_t *src_end = src + \
354 frag->nsamples * atempo->channels * sizeof(scalar_type); \
356 FFTSample *xdat = frag->xdat; \
359 if (atempo->channels == 1) { \
360 for (; src < src_end; xdat++) { \
361 tmp = *(const scalar_type *)src; \
362 src += sizeof(scalar_type); \
364 *xdat = (FFTSample)tmp; \
367 FFTSample s, max, ti, si; \
370 for (; src < src_end; xdat++) { \
371 tmp = *(const scalar_type *)src; \
372 src += sizeof(scalar_type); \
374 max = (FFTSample)tmp; \
375 s = FFMIN((FFTSample)scalar_max, \
376 (FFTSample)fabsf(max)); \
378 for (i = 1; i < atempo->channels; i++) { \
379 tmp = *(const scalar_type *)src; \
380 src += sizeof(scalar_type); \
382 ti = (FFTSample)tmp; \
383 si = FFMIN((FFTSample)scalar_max, \
384 (FFTSample)fabsf(ti)); \
398 * Initialize complex data buffer of a given audio fragment
399 * with down-mixed mono data of appropriate scalar type.
// Build the single-channel analysis signal of a fragment in frag->xdat:
// the buffer is zeroed first, then filled by yae_init_xdat using the scalar
// type that matches the configured sample format.
401 static void yae_downmix(ATempoContext
*atempo
, AudioFragment
*frag
)
404 const uint8_t *src
= frag
->data
;
406 // init complex data buffer used for FFT and Correlation:
407 memset(frag
->xdat
, 0, sizeof(FFTComplex
) * atempo
->window
);
// second macro argument is the clamp limit used for that sample type
409 if (atempo
->format
== AV_SAMPLE_FMT_U8
) {
410 yae_init_xdat(uint8_t, 127);
411 } else if (atempo
->format
== AV_SAMPLE_FMT_S16
) {
412 yae_init_xdat(int16_t, 32767);
413 } else if (atempo
->format
== AV_SAMPLE_FMT_S32
) {
414 yae_init_xdat(int, 2147483647);
415 } else if (atempo
->format
== AV_SAMPLE_FMT_FLT
) {
416 yae_init_xdat(float, 1);
417 } else if (atempo
->format
== AV_SAMPLE_FMT_DBL
) {
418 yae_init_xdat(double, 1);
423 * Populate the internal data buffer on as-needed basis.
426 * 0 if requested data was already available or was successfully loaded,
427 * AVERROR(EAGAIN) if more input data is required.
// Copy input samples from [*src_ref, src_end) into the ring buffer until the
// input position reaches `stop_here` or the source runs out.
// Returns 0 once atempo->position[0] equals stop_here, AVERROR(EAGAIN)
// otherwise.
429 static int yae_load_data(ATempoContext
*atempo
,
430 const uint8_t **src_ref
,
431 const uint8_t *src_end
,
435 const uint8_t *src
= *src_ref
;
// number of samples still needed to reach stop_here:
436 const int read_size
= stop_here
- atempo
->position
[0];
// nothing to load when the requested position was already reached
// (the body of this branch is not visible in this view)
438 if (stop_here
<= atempo
->position
[0]) {
442 // samples are not expected to be skipped:
443 av_assert0(read_size
<= atempo
->ring
);
445 while (atempo
->position
[0] < stop_here
&& src
< src_end
) {
// whole samples remaining in the source:
446 int src_samples
= (src_end
- src
) / atempo
->stride
;
448 // load data piece-wise, in order to avoid complicating the logic:
449 int nsamples
= FFMIN(read_size
, src_samples
);
// na: samples that fit between the tail and the end of the ring buffer;
// nb: the remainder, which wraps around to the start of the buffer
453 nsamples
= FFMIN(nsamples
, atempo
->ring
);
454 na
= FFMIN(nsamples
, atempo
->ring
- atempo
->tail
);
455 nb
= FFMIN(nsamples
- na
, atempo
->ring
);
// first piece: written at the tail
458 uint8_t *a
= atempo
->buffer
+ atempo
->tail
* atempo
->stride
;
459 memcpy(a
, src
, na
* atempo
->stride
);
461 src
+= na
* atempo
->stride
;
462 atempo
->position
[0] += na
;
// the fill level saturates at the ring capacity; the tail wraps around
464 atempo
->size
= FFMIN(atempo
->size
+ na
, atempo
->ring
);
465 atempo
->tail
= (atempo
->tail
+ na
) % atempo
->ring
;
467 atempo
->size
< atempo
->ring
?
468 atempo
->tail
- atempo
->size
:
// second piece: written at the start of the ring buffer
473 uint8_t *b
= atempo
->buffer
;
474 memcpy(b
, src
, nb
* atempo
->stride
);
476 src
+= nb
* atempo
->stride
;
477 atempo
->position
[0] += nb
;
479 atempo
->size
= FFMIN(atempo
->size
+ nb
, atempo
->ring
);
480 atempo
->tail
= (atempo
->tail
+ nb
) % atempo
->ring
;
482 atempo
->size
< atempo
->ring
?
483 atempo
->tail
- atempo
->size
:
488 // pass back the updated source buffer pointer:
492 av_assert0(atempo
->position
[0] <= stop_here
);
494 return atempo
->position
[0] == stop_here
? 0 : AVERROR(EAGAIN
);
498 * Populate current audio fragment data buffer.
501 * 0 when the fragment is ready,
502 * AVERROR(EAGAIN) if more input data is required.
// Fill the current fragment's sample buffer with one window of input that
// starts at frag->position[0], reading from the ring buffer. When src_ref is
// non-NULL, more input is pulled into the ring buffer first, and
// AVERROR(EAGAIN) is returned if that does not reach the end of the window.
// Samples positioned before the oldest buffered sample are written as zeros.
504 static int yae_load_frag(ATempoContext
*atempo
,
505 const uint8_t **src_ref
,
506 const uint8_t *src_end
)
509 AudioFragment
*frag
= yae_curr_frag(atempo
);
511 int64_t missing
, start
, zeros
;
513 const uint8_t *a
, *b
;
514 int i0
, i1
, n0
, n1
, na
, nb
;
// input position just past the end of this fragment's window:
516 int64_t stop_here
= frag
->position
[0] + atempo
->window
;
517 if (src_ref
&& yae_load_data(atempo
, src_ref
, src_end
, stop_here
) != 0) {
518 return AVERROR(EAGAIN
);
521 // calculate the number of samples we don't have:
523 stop_here
> atempo
->position
[0] ?
524 stop_here
- atempo
->position
[0] : 0;
// samples of the window that are actually available:
527 missing
< (int64_t)atempo
->window
?
528 (uint32_t)(atempo
->window
- missing
) : 0;
530 // setup the output buffer:
531 frag
->nsamples
= nsamples
;
// input position of the oldest sample still held in the ring buffer:
534 start
= atempo
->position
[0] - atempo
->size
;
537 if (frag
->position
[0] < start
) {
538 // what we don't have we substitute with zeros:
539 zeros
= FFMIN(start
- frag
->position
[0], (int64_t)nsamples
);
540 av_assert0(zeros
!= nsamples
);
542 memset(dst
, 0, zeros
* atempo
->stride
);
543 dst
+= zeros
* atempo
->stride
;
// the whole fragment was zero-filled
// (the body of this branch is not visible in this view)
546 if (zeros
== nsamples
) {
550 // get the remaining data from the ring buffer:
// na: samples from head up to the tail, or up to the end of the buffer
// when the data wraps around
551 na
= (atempo
->head
< atempo
->tail
?
552 atempo
->tail
- atempo
->head
:
553 atempo
->ring
- atempo
->head
);
// nb: samples stored at the start of the buffer after wrapping
555 nb
= atempo
->head
< atempo
->tail
? 0 : atempo
->tail
;
558 av_assert0(nsamples
<= zeros
+ na
+ nb
);
560 a
= atempo
->buffer
+ atempo
->head
* atempo
->stride
;
// i0 / i1: sample offsets into the first / second ring-buffer segment
563 i0
= frag
->position
[0] + zeros
- start
;
564 i1
= i0
< na
? 0 : i0
- na
;
// n0 / n1: how many samples to take from each segment
566 n0
= i0
< na
? FFMIN(na
- i0
, (int)(nsamples
- zeros
)) : 0;
567 n1
= nsamples
- zeros
- n0
;
570 memcpy(dst
, a
+ i0
* atempo
->stride
, n0
* atempo
->stride
);
571 dst
+= n0
* atempo
->stride
;
575 memcpy(dst
, b
+ i1
* atempo
->stride
, n1
* atempo
->stride
);
582 * Prepare for loading next audio fragment.
// Position the next fragment relative to the previous one: the input
// position advances by tempo * (window / 2) samples (truncated to an
// integer), the output position by exactly window / 2 samples.
// NOTE(review): the statement that switches which slot is "current" is not
// visible in this view.
584 static void yae_advance_to_next_frag(ATempoContext
*atempo
)
586 const double fragment_step
= atempo
->tempo
* (double)(atempo
->window
/ 2);
588 const AudioFragment
*prev
;
592 prev
= yae_prev_frag(atempo
);
593 frag
= yae_curr_frag(atempo
);
595 frag
->position
[0] = prev
->position
[0] + (int64_t)fragment_step
;
596 frag
->position
[1] = prev
->position
[1] + atempo
->window
/ 2;
601 * Calculate cross-correlation via rDFT.
603 * Multiply two vectors of complex numbers (result of real_to_complex rDFT)
604 * and transform back via complex_to_real rDFT.
// Compute the cross-correlation of two signals from their rDFT spectra:
// each element of `xa` is multiplied by the complex conjugate of the matching
// element of `xb`, the products are written into `xcorr`, and the inverse
// rDFT is then applied to `xcorr` in place.
606 static void yae_xcorr_via_rdft(FFTSample
*xcorr
,
607 RDFTContext
*complex_to_real
,
608 const FFTComplex
*xa
,
609 const FFTComplex
*xb
,
// view the output buffer as an array of complex values:
612 FFTComplex
*xc
= (FFTComplex
*)xcorr
;
615 // NOTE: first element requires special care -- Given Y = rDFT(X),
616 // Im(Y[0]) and Im(Y[N/2]) are always zero, therefore av_rdft_calc
617 // stores Re(Y[N/2]) in place of Im(Y[0]).
// so the two halves of element 0 are multiplied independently, as reals
619 xc
->re
= xa
->re
* xb
->re
;
620 xc
->im
= xa
->im
* xb
->im
;
// remaining elements: xc = xa * conj(xb)
625 for (i
= 1; i
< window
; i
++, xa
++, xb
++, xc
++) {
626 xc
->re
= (xa
->re
* xb
->re
+ xa
->im
* xb
->im
);
627 xc
->im
= (xa
->im
* xb
->re
- xa
->re
* xb
->im
);
630 // apply inverse rDFT:
631 av_rdft_calc(complex_to_real
, xcorr
);
635 * Calculate alignment offset for given fragment
636 * relative to the previous fragment.
638 * @return alignment offset of current fragment relative to previous.
// Search the cross-correlation of the previous and current fragment for the
// best alignment offset. The search covers [i0, i1), a window of +/- delta_max
// around window / 2 shifted by -drift, clamped to the valid index range. Each
// correlation value is weighted before comparison, and the best index is
// reported relative to window / 2. If no candidate wins, the offset stays at
// its initial value -drift.
640 static int yae_align(AudioFragment
*frag
,
641 const AudioFragment
*prev
,
645 FFTSample
*correlation
,
646 RDFTContext
*complex_to_real
)
648 int best_offset
= -drift
;
649 FFTSample best_metric
= -FLT_MAX
;
// correlate the spectra of the two fragments; result lands in `correlation`
656 yae_xcorr_via_rdft(correlation
,
658 (const FFTComplex
*)prev
->xdat
,
659 (const FFTComplex
*)frag
->xdat
,
662 // identify search window boundaries:
663 i0
= FFMAX(window
/ 2 - delta_max
- drift
, 0);
664 i0
= FFMIN(i0
, window
);
666 i1
= FFMIN(window
/ 2 + delta_max
- drift
, window
- window
/ 16);
669 // identify cross-correlation peaks within search window:
670 xcorr
= correlation
+ i0
;
672 for (i
= i0
; i
< i1
; i
++, xcorr
++) {
673 FFTSample metric
= *xcorr
;
// weight: (drift + i) * (i - i0) * (i1 - i); the last two factors vanish
// at the edges of the search window
676 FFTSample drifti
= (FFTSample
)(drift
+ i
);
677 metric
*= drifti
* (FFTSample
)(i
- i0
) * (FFTSample
)(i1
- i
);
679 if (metric
> best_metric
) {
680 best_metric
= metric
;
681 best_offset
= i
- window
/ 2;
689 * Adjust current fragment position for better alignment
690 * with previous fragment.
692 * @return alignment correction.
// Estimate how far the output has drifted from its ideal position, ask
// yae_align() for the best alignment of the current fragment against the
// previous one, and move the current fragment's input position by that
// correction. Callers treat a non-zero result as "fragment must be reloaded".
694 static int yae_adjust_position(ATempoContext
*atempo
)
696 const AudioFragment
*prev
= yae_prev_frag(atempo
);
697 AudioFragment
*frag
= yae_curr_frag(atempo
);
// output position of the previous fragment's midpoint, relative to origin:
699 const double prev_output_position
=
700 (double)(prev
->position
[1] - atempo
->origin
[1] + atempo
->window
/ 2);
// input position of the same midpoint, relative to origin, divided by a
// value on a line not visible here (presumably the tempo -- TODO confirm):
702 const double ideal_output_position
=
703 (double)(prev
->position
[0] - atempo
->origin
[0] + atempo
->window
/ 2) /
706 const int drift
= (int)(prev_output_position
- ideal_output_position
);
// search at most half a window in either direction:
708 const int delta_max
= atempo
->window
/ 2;
709 const int correction
= yae_align(frag
,
715 atempo
->complex_to_real
);
718 // adjust fragment position:
719 frag
->position
[0] -= correction
;
721 // clear so that the fragment can be reloaded:
729 * A helper macro for blending the overlap region of previous
730 * and current audio fragment.
// Blend the overlapping samples of the previous (`a`) and current (`b`)
// fragment into `dst`, channel by channel, for the given scalar sample type.
// It stops after `overlap` samples or when the destination is full, and
// advances atempo->position[1] and both window pointers once per sample.
// The visible output expression is t0 * w0 + t1 * w1; another case, taken
// when frag->position[0] + i is negative, is on a line not visible here.
// Expects `a`, `b`, `dst`, `dst_end`, `wa`, `wb`, `overlap`, `frag`, `atempo`
// and the loop counters to exist at the expansion site.
732 #define yae_blend(scalar_type) \
734 const scalar_type *aaa = (const scalar_type *)a; \
735 const scalar_type *bbb = (const scalar_type *)b; \
737 scalar_type *out = (scalar_type *)dst; \
738 scalar_type *out_end = (scalar_type *)dst_end; \
741 for (i = 0; i < overlap && out < out_end; \
742 i++, atempo->position[1]++, wa++, wb++) { \
747 for (j = 0; j < atempo->channels; \
748 j++, aaa++, bbb++, out++) { \
749 float t0 = (float)*aaa; \
750 float t1 = (float)*bbb; \
753 frag->position[0] + i < 0 ? \
755 (scalar_type)(t0 * w0 + t1 * w1); \
758 dst = (uint8_t *)out; \
762 * Blend the overlap region of previous and current audio fragment
763 * and output the results to the given destination buffer.
766 * 0 if the overlap region was completely stored in the dst buffer,
767 * AVERROR(EAGAIN) if more destination buffer space is required.
// Write the blended overlap of the previous and current fragment to the
// destination buffer, starting at the current output position.
// Returns 0 when the output position reached the end of the overlap,
// AVERROR(EAGAIN) when the destination filled up first.
769 static int yae_overlap_add(ATempoContext
*atempo
,
774 const AudioFragment
*prev
= yae_prev_frag(atempo
);
775 const AudioFragment
*frag
= yae_curr_frag(atempo
);
// first output position to produce (second FFMAX operand is not visible):
777 const int64_t start_here
= FFMAX(atempo
->position
[1],
// the overlap ends where the shorter of the two fragments ends:
780 const int64_t stop_here
= FFMIN(prev
->position
[1] + prev
->nsamples
,
781 frag
->position
[1] + frag
->nsamples
);
783 const int64_t overlap
= stop_here
- start_here
;
// sample offsets of start_here inside the previous / current fragment:
785 const int64_t ia
= start_here
- prev
->position
[1];
786 const int64_t ib
= start_here
- frag
->position
[1];
// matching Hann coefficients for each fragment:
788 const float *wa
= atempo
->hann
+ ia
;
789 const float *wb
= atempo
->hann
+ ib
;
// matching sample data for each fragment:
791 const uint8_t *a
= prev
->data
+ ia
* atempo
->stride
;
792 const uint8_t *b
= frag
->data
+ ib
* atempo
->stride
;
794 uint8_t *dst
= *dst_ref
;
796 av_assert0(start_here
<= stop_here
&&
797 frag
->position
[1] <= start_here
&&
798 overlap
<= frag
->nsamples
);
// per-format dispatch; the statement inside each branch is not visible in
// this view (presumably yae_blend with the matching scalar type)
800 if (atempo
->format
== AV_SAMPLE_FMT_U8
) {
802 } else if (atempo
->format
== AV_SAMPLE_FMT_S16
) {
804 } else if (atempo
->format
== AV_SAMPLE_FMT_S32
) {
806 } else if (atempo
->format
== AV_SAMPLE_FMT_FLT
) {
808 } else if (atempo
->format
== AV_SAMPLE_FMT_DBL
) {
812 // pass-back the updated destination buffer pointer:
815 return atempo
->position
[1] == stop_here
? 0 : AVERROR(EAGAIN
);
819 * Feed as much data to the filter as it is able to consume
820 * and receive as much processed data in the destination buffer
821 * as it is able to produce or store.
// Drive the filter state machine with the given source and destination
// ranges. The visible steps are: load fragment -> adjust position ->
// (reload fragment when a correction was applied) -> overlap-add -> advance
// to the next fragment and start over. The handling of a step that cannot
// finish (not enough input or output space) is on lines not visible here.
824 yae_apply(ATempoContext
*atempo
,
825 const uint8_t **src_ref
,
826 const uint8_t *src_end
,
831 if (atempo
->state
== YAE_LOAD_FRAGMENT
) {
832 // load additional data for the current fragment:
833 if (yae_load_frag(atempo
, src_ref
, src_end
) != 0) {
// build the analysis signal of the fragment and transform it:
838 yae_downmix(atempo
, yae_curr_frag(atempo
));
841 av_rdft_calc(atempo
->real_to_complex
, yae_curr_frag(atempo
)->xdat
);
843 // must load the second fragment before alignment can start:
844 if (!atempo
->nfrag
) {
845 yae_advance_to_next_frag(atempo
);
849 atempo
->state
= YAE_ADJUST_POSITION
;
852 if (atempo
->state
== YAE_ADJUST_POSITION
) {
853 // adjust position for better alignment:
854 if (yae_adjust_position(atempo
)) {
855 // reload the fragment at the corrected position, so that the
856 // Hann window blending would not require normalization:
857 atempo
->state
= YAE_RELOAD_FRAGMENT
;
// no correction needed: go straight to output
859 atempo
->state
= YAE_OUTPUT_OVERLAP_ADD
;
863 if (atempo
->state
== YAE_RELOAD_FRAGMENT
) {
864 // load additional data if necessary due to position adjustment:
865 if (yae_load_frag(atempo
, src_ref
, src_end
) != 0) {
// refresh the analysis signal and spectrum of the reloaded fragment:
870 yae_downmix(atempo
, yae_curr_frag(atempo
));
873 av_rdft_calc(atempo
->real_to_complex
, yae_curr_frag(atempo
)->xdat
);
875 atempo
->state
= YAE_OUTPUT_OVERLAP_ADD
;
878 if (atempo
->state
== YAE_OUTPUT_OVERLAP_ADD
) {
879 // overlap-add and output the result:
880 if (yae_overlap_add(atempo
, dst_ref
, dst_end
) != 0) {
884 // advance to the next fragment, repeat:
885 yae_advance_to_next_frag(atempo
);
886 atempo
->state
= YAE_LOAD_FRAGMENT
;
892 * Flush any buffered data from the filter.
895 * 0 if all data was completely stored in the dst buffer,
896 * AVERROR(EAGAIN) if more destination buffer space is required.
// Drain what is still buffered once no more input will arrive: finish the
// current (possibly partial) fragment from the ring buffer, align it, emit
// the overlap with the previous fragment and then copy the rest of the
// fragment to the destination.
// Returns 0 when the output position reached the end of the fragment,
// AVERROR(EAGAIN) when more destination space or another call is needed.
898 static int yae_flush(ATempoContext
*atempo
,
902 AudioFragment
*frag
= yae_curr_frag(atempo
);
915 atempo
->state
= YAE_FLUSH_OUTPUT
;
917 if (atempo
->position
[0] == frag
->position
[0] + frag
->nsamples
&&
918 atempo
->position
[1] == frag
->position
[1] + frag
->nsamples
) {
919 // the current fragment is already flushed:
923 if (frag
->position
[0] + frag
->nsamples
< atempo
->position
[0]) {
924 // finish loading the current (possibly partial) fragment:
// NULL source: only data already in the ring buffer is used
925 yae_load_frag(atempo
, NULL
, NULL
);
929 yae_downmix(atempo
, frag
);
932 av_rdft_calc(atempo
->real_to_complex
, frag
->xdat
);
934 // align current fragment to previous fragment:
935 if (yae_adjust_position(atempo
)) {
936 // reload the current fragment due to adjusted position:
937 yae_load_frag(atempo
, NULL
, NULL
);
942 // flush the overlap region:
943 overlap_end
= frag
->position
[1] + FFMIN(atempo
->window
/ 2,
946 while (atempo
->position
[1] < overlap_end
) {
947 if (yae_overlap_add(atempo
, dst_ref
, dst_end
) != 0) {
948 return AVERROR(EAGAIN
);
952 // check whether all of the input samples have been consumed:
953 if (frag
->position
[0] + frag
->nsamples
< atempo
->position
[0]) {
// buffered input remains: move on and ask to be called again
954 yae_advance_to_next_frag(atempo
);
955 return AVERROR(EAGAIN
);
958 // flush the remainder of the current fragment:
959 start_here
= FFMAX(atempo
->position
[1], overlap_end
);
960 stop_here
= frag
->position
[1] + frag
->nsamples
;
961 offset
= start_here
- frag
->position
[1];
962 av_assert0(start_here
<= stop_here
&& frag
->position
[1] <= start_here
);
964 src
= frag
->data
+ offset
* atempo
->stride
;
965 dst
= (uint8_t *)*dst_ref
;
// copy as many bytes as both the fragment and the destination allow:
967 src_size
= (int)(stop_here
- start_here
) * atempo
->stride
;
968 dst_size
= dst_end
- dst
;
969 nbytes
= FFMIN(src_size
, dst_size
);
971 memcpy(dst
, src
, nbytes
);
974 atempo
->position
[1] += (nbytes
/ atempo
->stride
);
976 // pass-back the updated destination buffer pointer:
977 *dst_ref
= (uint8_t *)dst
;
979 return atempo
->position
[1] == stop_here
? 0 : AVERROR(EAGAIN
);
// Filter init callback: marks the sample format as not yet known and puts
// the state machine at its first step. The return statement is not visible
// in this view.
982 static av_cold
int init(AVFilterContext
*ctx
)
984 ATempoContext
*atempo
= ctx
->priv
;
985 atempo
->format
= AV_SAMPLE_FMT_NONE
;
986 atempo
->state
= YAE_LOAD_FRAGMENT
;
// Filter uninit callback: releases every buffer owned by the private context.
990 static av_cold
void uninit(AVFilterContext
*ctx
)
992 ATempoContext
*atempo
= ctx
->priv
;
993 yae_release_buffers(atempo
);
// Declare what the filter accepts: all channel layouts, the sample formats
// listed in sample_fmts (entries not visible in this view), and all sample
// rates. Each AVERROR(ENOMEM) return presumably follows a failed list
// allocation -- the guarding checks are on lines not visible here.
996 static int query_formats(AVFilterContext
*ctx
)
998 AVFilterChannelLayouts
*layouts
= NULL
;
999 AVFilterFormats
*formats
= NULL
;
1001 // WSOLA necessitates an internal sliding window ring buffer
1002 // for incoming audio stream.
1004 // Planar sample formats are too cumbersome to store in a ring buffer,
1005 // therefore planar sample formats are not supported.
1007 static const enum AVSampleFormat sample_fmts
[] = {
// channel layouts:
1016 layouts
= ff_all_channel_layouts();
1018 return AVERROR(ENOMEM
);
1020 ff_set_common_channel_layouts(ctx
, layouts
);
// sample formats:
1022 formats
= ff_make_format_list(sample_fmts
);
1024 return AVERROR(ENOMEM
);
1026 ff_set_common_formats(ctx
, formats
);
// sample rates (the `formats` variable is reused for this list):
1028 formats
= ff_all_samplerates();
1030 return AVERROR(ENOMEM
);
1032 ff_set_common_samplerates(ctx
, formats
);
// Input link configuration callback: reads the negotiated sample format,
// sample rate and channel count from the link, flags the output link with
// FF_LINK_FLAG_REQUEST_LOOP, and (re)initialises the filter through
// yae_reset(), whose result is returned.
1037 static int config_props(AVFilterLink
*inlink
)
1039 AVFilterContext
*ctx
= inlink
->dst
;
1040 ATempoContext
*atempo
= ctx
->priv
;
1042 enum AVSampleFormat format
= inlink
->format
;
1043 int sample_rate
= (int)inlink
->sample_rate
;
// channel count derived from the link's channel layout:
1044 int channels
= av_get_channel_layout_nb_channels(inlink
->channel_layout
);
1046 ctx
->outputs
[0]->flags
|= FF_LINK_FLAG_REQUEST_LOOP
;
1048 return yae_reset(atempo
, format
, sample_rate
, channels
);
// Finalise the pending output frame and send it downstream: set its sample
// rate and sample count, derive its pts from the number of samples output so
// far, pass it to ff_filter_frame(), then forget the frame and advance the
// output sample counter by n_out.
1051 static int push_samples(ATempoContext
*atempo
,
1052 AVFilterLink
*outlink
,
1057 atempo
->dst_buffer
->sample_rate
= outlink
->sample_rate
;
1058 atempo
->dst_buffer
->nb_samples
= n_out
;
// convert the running output sample count from units of 1/sample_rate
// into the output link's time base:
1061 atempo
->dst_buffer
->pts
=
1062 av_rescale_q(atempo
->nsamples_out
,
1063 (AVRational
){ 1, outlink
->sample_rate
},
1064 outlink
->time_base
);
// the frame pointer is dropped after the call; presumably ownership passes
// to ff_filter_frame -- TODO confirm
1066 ret
= ff_filter_frame(outlink
, atempo
->dst_buffer
);
1067 atempo
->dst_buffer
= NULL
;
1069 atempo
->dst_end
= NULL
;
1073 atempo
->nsamples_out
+= n_out
;
// Input pad callback: feed one input frame through the tempo filter.
// Output frames are sized to n_in / tempo samples (rounded) and are pushed
// downstream each time one fills up. The input frame is freed at the end.
1077 static int filter_frame(AVFilterLink
*inlink
, AVFrame
*src_buffer
)
1079 AVFilterContext
*ctx
= inlink
->dst
;
1080 ATempoContext
*atempo
= ctx
->priv
;
1081 AVFilterLink
*outlink
= ctx
->outputs
[0];
1084 int n_in
= src_buffer
->nb_samples
;
// expected number of output samples for this input, rounded to nearest:
1085 int n_out
= (int)(0.5 + ((double)n_in
) / atempo
->tempo
);
// only data[0] is read: samples are handled as one packed buffer
1087 const uint8_t *src
= src_buffer
->data
[0];
1088 const uint8_t *src_end
= src
+ n_in
* atempo
->stride
;
1090 while (src
< src_end
) {
// start a new output frame when none is pending
1091 if (!atempo
->dst_buffer
) {
1092 atempo
->dst_buffer
= ff_get_audio_buffer(outlink
, n_out
);
// NOTE(review): this return comes before the only visible
// av_frame_free(&src_buffer), so the input frame looks leaked on
// allocation failure -- confirm against the full function.
1093 if (!atempo
->dst_buffer
)
1094 return AVERROR(ENOMEM
);
1095 av_frame_copy_props(atempo
->dst_buffer
, src_buffer
);
1097 atempo
->dst
= atempo
->dst_buffer
->data
[0];
1098 atempo
->dst_end
= atempo
->dst
+ n_out
* atempo
->stride
;
// consume input / produce output; both cursors are advanced by the call
1101 yae_apply(atempo
, &src
, src_end
, &atempo
->dst
, atempo
->dst_end
);
// output frame is full: send it downstream
1103 if (atempo
->dst
== atempo
->dst_end
) {
1104 int n_samples
= ((atempo
->dst
- atempo
->dst_buffer
->data
[0]) /
1106 ret
= push_samples(atempo
, outlink
, n_samples
);
1112 atempo
->nsamples_in
+= n_in
;
1114 av_frame_free(&src_buffer
);
// Output pad callback: request a frame from the input; when the input
// reports end of stream, flush what the filter still holds, using output
// frames of up to `ring` samples, until yae_flush() stops returning
// AVERROR(EAGAIN).
1118 static int request_frame(AVFilterLink
*outlink
)
1120 AVFilterContext
*ctx
= outlink
->src
;
1121 ATempoContext
*atempo
= ctx
->priv
;
1124 ret
= ff_request_frame(ctx
->inputs
[0]);
1126 if (ret
== AVERROR_EOF
) {
1127 // flush the filter:
// each flush frame holds at most one ring buffer's worth of samples
1128 int n_max
= atempo
->ring
;
1130 int err
= AVERROR(EAGAIN
);
1132 while (err
== AVERROR(EAGAIN
)) {
1133 if (!atempo
->dst_buffer
) {
1134 atempo
->dst_buffer
= ff_get_audio_buffer(outlink
, n_max
);
1135 if (!atempo
->dst_buffer
)
1136 return AVERROR(ENOMEM
);
1138 atempo
->dst
= atempo
->dst_buffer
->data
[0];
1139 atempo
->dst_end
= atempo
->dst
+ n_max
* atempo
->stride
;
1142 err
= yae_flush(atempo
, &atempo
->dst
, atempo
->dst_end
);
// bytes written so far, converted to samples by a divisor on a line not
// visible here
1144 n_out
= ((atempo
->dst
- atempo
->dst_buffer
->data
[0]) /
1148 ret
= push_samples(atempo
, outlink
, n_out
);
// drop whatever output frame is still pending
1152 av_frame_free(&atempo
->dst_buffer
);
1154 atempo
->dst_end
= NULL
;
// Runtime command handler: the "tempo" command is forwarded to
// yae_set_tempo(); any other command yields AVERROR(ENOSYS).
1162 static int process_command(AVFilterContext
*ctx
,
1169 return !strcmp(cmd
, "tempo") ? yae_set_tempo(ctx
, arg
) : AVERROR(ENOSYS
);
// Input pad table: an audio pad whose frames go to filter_frame() and whose
// link configuration is handled by config_props().
1172 static const AVFilterPad atempo_inputs
[] = {
1175 .type
= AVMEDIA_TYPE_AUDIO
,
1176 .filter_frame
= filter_frame
,
1177 .config_props
= config_props
,
// Output pad table: an audio pad whose frame requests are served by
// request_frame().
1182 static const AVFilterPad atempo_outputs
[] = {
1185 .request_frame
= request_frame
,
1186 .type
= AVMEDIA_TYPE_AUDIO
,
// Filter definition: ties together the description, format negotiation,
// runtime command handler, private context size/class and the pad tables.
// Other members (such as the name and init/uninit hooks) are not visible in
// this view.
1191 AVFilter ff_af_atempo
= {
1193 .description
= NULL_IF_CONFIG_SMALL("Adjust audio tempo."),
1196 .query_formats
= query_formats
,
1197 .process_command
= process_command
,
1198 .priv_size
= sizeof(ATempoContext
),
1199 .priv_class
= &atempo_class
,
1200 .inputs
= atempo_inputs
,
1201 .outputs
= atempo_outputs
,