Toward more accurate timing in the face of varying pitch ratio

2021-05-13 18:04:43 +01:00
parent c79c426e80
commit d06b4efc16
9 changed files with 242 additions and 56 deletions
--- a/main/main.cpp
+++ b/main/main.cpp
@@ -500,11 +500,12 @@ int main(int argc, char **argv)
    if (shortwin)    options |= RubberBandStretcher::OptionWindowShort;
    if (smoothing)   options |= RubberBandStretcher::OptionSmoothingOn;
    if (formant)     options |= RubberBandStretcher::OptionFormantPreserved;
-    if (hqpitch)     options |= RubberBandStretcher::OptionPitchHighQuality;
    if (together)    options |= RubberBandStretcher::OptionChannelsTogether;

    if (freqOrPitchMapSpecified) {
        options |= RubberBandStretcher::OptionPitchHighConsistency;
+    } else if (hqpitch) {
+        options |= RubberBandStretcher::OptionPitchHighQuality;
    }
    
    switch (threading) {
@@ -647,13 +648,13 @@ int main(int argc, char **argv)
            int thisBlockSize = ibs;

            while (freqMapItr != freqMap.end()) {
-                size_t nextFreqFrame = freqMapItr->first + ts.getLatency();
+                size_t nextFreqFrame = freqMapItr->first; // + ts.getLatency();
                if (nextFreqFrame <= countIn) {
                    double s = frequencyshift * freqMapItr->second;
                    if (debug > 0) {
                        cerr << "at frame " << countIn
                             << " (requested at " << freqMapItr->first
-                             << " plus latency " << ts.getLatency()
+                             << " [NOT] plus latency " << ts.getLatency()
                             << ") updating frequency ratio to " << s << endl;
                    }
                    ts.setPitchScale(s);
--- a/src/StretchCalculator.cpp
+++ b/src/StretchCalculator.cpp
@@ -44,9 +44,13 @@ StretchCalculator::StretchCalculator(size_t sampleRate,
    m_divergence(0),
    m_recovery(0),
    m_prevRatio(1.0),
+    m_prevTimeRatio(1.0),
    m_transientAmnesty(0),
    m_debugLevel(0),
-    m_useHardPeaks(useHardPeaks)
+    m_useHardPeaks(useHardPeaks),
+    m_inFrameCounter(0),
+    m_frameCheckpoint(0, 0),
+    m_outFrameCounter(0)
 {
 //    std::cerr << "StretchCalculator::StretchCalculator: useHardPeaks = " << useHardPeaks << std::endl;
 }    
@@ -318,17 +322,106 @@ StretchCalculator::mapPeaks(std::vector<Peak> &peaks,
    }
 }    

-int
-StretchCalculator::calculateSingle(double ratio,
-                                   float df,
-                                   size_t increment)
+int64_t // expected output frame for inFrame, extrapolated at timeRatio from the last checkpoint
+StretchCalculator::expectedOutFrame(int64_t inFrame, double timeRatio)
 {
+    int64_t checkpointedAt = m_frameCheckpoint.first; // input frame recorded at the checkpoint
+    int64_t checkpointed = m_frameCheckpoint.second; // expected output frame recorded at the checkpoint
+    return int64_t(round(checkpointed + (inFrame - checkpointedAt) * timeRatio)); // rounded to the nearest whole frame
+}
+
+int
+StretchCalculator::calculateSingle(double timeRatio,
+                                   double effectivePitchRatio,
+                                   float df,
+                                   size_t inIncrement,
+                                   size_t analysisWindowSize,
+                                   size_t synthesisWindowSize)
+{
+    double ratio = timeRatio / effectivePitchRatio;
+    
+    int increment = int(inIncrement);
    if (increment == 0) increment = m_increment;

+    int outIncrement = lrint(increment * ratio); // the normal case
    bool isTransient = false;
    
    // We want to ensure, as close as possible, that the phase reset
-    // points appear at _exactly_ the right audio frame numbers.
+    // points appear at the right audio frame numbers. To this end we
+    // track the incoming frame number, its corresponding expected
+    // output frame number, and the actual output frame number
+    // projected based on the ratios provided.
+    //
+    // There are two subtleties:
+    // 
+    // (1) on a ratio change, we need to checkpoint the expected
+    // output frame number reached so far and start counting again
+    // with the new ratio. We could do this with a reset to zero, but
+    // it's easier to reason about absolute input/output frame
+    // matches, so for the moment at least we're doing this by
+    // explicitly checkpointing the current numbers (hence the use of
+    // the above expectedOutFrame() function which refers to the
+    // last checkpointed values).
+    //
+    // (2) in the case of a pitch shift in a configuration where
+    // resampling occurs after stretching, all of our output
+    // increments will be effectively modified by resampling after we
+    // return. This is why we separate out timeRatio and
+    // effectivePitchRatio arguments - the former is the ratio that
+    // has already been applied and the latter is the ratio that will
+    // be applied by any subsequent resampling step (which will be 1.0
+    // / pitchScale if resampling is happening after stretching). So
+    // the overall ratio is timeRatio / effectivePitchRatio.
+
+    bool ratioChanged = (ratio != m_prevRatio);
+    if (ratioChanged) {
+        // Checkpoint our frame counters at the ratio change.
+
+        // m_outFrameCounter tracks the frames counted at output from
+        // this function, which normally precedes resampling - hence
+        // the use of timeRatio rather than ratio here
+
+        if (m_debugLevel > 1) {
+            std::cerr << "StretchCalculator: ratio changed from " << m_prevRatio << " to " << ratio << std::endl;
+        }
+
+        int64_t toCheckpoint = expectedOutFrame
+            (m_inFrameCounter, m_prevTimeRatio);
+        m_frameCheckpoint =
+            std::pair<int64_t, int64_t>(m_inFrameCounter, toCheckpoint);
+    }
+    
+    m_prevRatio = ratio;
+    m_prevTimeRatio = timeRatio;
+
+    if (m_debugLevel > 2) {
+        std::cerr << "StretchCalculator::calculateSingle: timeRatio = "
+                  << timeRatio << ", effectivePitchRatio = "
+                  << effectivePitchRatio << " (that's 1.0 / "
+                  << (1.0 / effectivePitchRatio)
+                  << "), ratio = " << ratio << ", df = " << df
+                  << ", inIncrement = " << inIncrement
+                  << ", default outIncrement = " << outIncrement
+                  << ", analysisWindowSize = " << analysisWindowSize
+                  << ", synthesisWindowSize = " << synthesisWindowSize
+                  << std::endl;
+
+        std::cerr << "inFrameCounter = " << m_inFrameCounter
+                  << ", outFrameCounter = " << m_outFrameCounter
+                  << std::endl;
+
+        std::cerr << "The next sample out is input sample " << m_inFrameCounter << std::endl;
+    }
+    
+    int64_t intended = expectedOutFrame
+        (m_inFrameCounter + analysisWindowSize/4, timeRatio);
+    int64_t projected = int64_t
+        (round(m_outFrameCounter + (synthesisWindowSize/4 * effectivePitchRatio)));
+    m_divergence = projected - intended;
+
+    if (m_debugLevel > 2) {
+        std::cerr << "for current frame + quarter frame: intended " << intended << ", projected " << projected << ", divergence " << m_divergence << std::endl;
+    }
    
    // In principle, the threshold depends on chunk size: larger chunk
    // sizes need higher thresholds.  Since chunk size depends on
@@ -350,53 +443,81 @@ StretchCalculator::calculateSingle(double ratio,

    m_prevDf = df;

-    bool ratioChanged = (ratio != m_prevRatio);
-    m_prevRatio = ratio;
-
-    if (isTransient && m_transientAmnesty == 0) {
-        if (m_debugLevel > 1) {
-            std::cerr << "StretchCalculator::calculateSingle: transient (df " << df << ", threshold " << transientThreshold << ")" << std::endl;
+    if (m_transientAmnesty > 0) {
+        if (isTransient) {
+            if (m_debugLevel > 1) {
+                std::cerr << "StretchCalculator::calculateSingle: transient, but we have an amnesty (df " << df << ", threshold " << transientThreshold << ")" << std::endl;
+            }
+            isTransient = false;
+        }
+        --m_transientAmnesty;
+    }
+            
+    if (isTransient) {
+        if (m_debugLevel > 1) {
+            std::cerr << "StretchCalculator::calculateSingle: transient at (df " << df << ", threshold " << transientThreshold << ")" << std::endl;
        }
-        m_divergence += increment - (increment * ratio);

        // as in offline mode, 0.05 sec approx min between transients
        m_transientAmnesty =
            lrint(ceil(double(m_sampleRate) / (20 * double(increment))));

        m_recovery = m_divergence / ((m_sampleRate / 10.0) / increment);
-        return -int(increment);
+
+        outIncrement = increment;
+
+    } else {
+
+        if (ratioChanged) {
+            m_recovery = m_divergence / ((m_sampleRate / 10.0) / increment);
+        }
+
+        int incr = lrint(outIncrement - m_recovery);
+        if (m_debugLevel > 2 || (m_debugLevel > 1 && m_divergence != 0)) {
+            std::cerr << "divergence = " << m_divergence << ", recovery = " << m_recovery << ", incr = " << incr << ", ";
+        }
+        if (incr < lrint((increment * ratio) / 2)) {
+            incr = lrint((increment * ratio) / 2);
+        } else if (incr > lrint(increment * ratio * 2)) {
+            incr = lrint(increment * ratio * 2);
+        }
+
+        double divdiff = (increment * ratio) - incr;
+
+        if (m_debugLevel > 2 || (m_debugLevel > 1 && m_divergence != 0)) {
+            std::cerr << "possibly clamped to " << incr << ", divdiff = " << divdiff << std::endl;
+        }
+
+        double prevDivergence = m_divergence;
+        m_divergence -= divdiff;
+        if ((prevDivergence < 0 && m_divergence > 0) ||
+            (prevDivergence > 0 && m_divergence < 0)) {
+            m_recovery = m_divergence / ((m_sampleRate / 10.0) / increment);
+        }
+
+        if (incr < 0) {
+            std::cerr << "WARNING: internal error: incr < 0 in calculateSingle"
+                      << std::endl;
+            outIncrement = 0;
+        } else {
+            outIncrement = incr;
+        }
    }

-    if (ratioChanged) {
-        m_recovery = m_divergence / ((m_sampleRate / 10.0) / increment);
+    if (m_debugLevel > 1) {
+        std::cerr << "StretchCalculator::calculateSingle: returning isTransient = "
+                  << isTransient << ", outIncrement = " << outIncrement
+                  << std::endl;
    }

-    if (m_transientAmnesty > 0) --m_transientAmnesty;
+    m_inFrameCounter += inIncrement;
+    m_outFrameCounter += outIncrement * effectivePitchRatio;
    
-    int incr = lrint(increment * ratio - m_recovery);
-    if (m_debugLevel > 2 || (m_debugLevel > 1 && m_divergence != 0)) {
-        std::cerr << "divergence = " << m_divergence << ", recovery = " << m_recovery << ", incr = " << incr << ", ";
+    if (isTransient) {
+        return -outIncrement;
+    } else {
+        return outIncrement;
    }
-    if (incr < lrint((increment * ratio) / 2)) {
-        incr = lrint((increment * ratio) / 2);
-    } else if (incr > lrint(increment * ratio * 2)) {
-        incr = lrint(increment * ratio * 2);
-    }
-
-    double divdiff = (increment * ratio) - incr;
-
-    if (m_debugLevel > 2 || (m_debugLevel > 1 && m_divergence != 0)) {
-        std::cerr << "divdiff = " << divdiff << std::endl;
-    }
-
-    double prevDivergence = m_divergence;
-    m_divergence -= divdiff;
-    if ((prevDivergence < 0 && m_divergence > 0) ||
-        (prevDivergence > 0 && m_divergence < 0)) {
-        m_recovery = m_divergence / ((m_sampleRate / 10.0) / increment);
-    }
-
-    return incr;
 }

 void
--- a/src/StretchCalculator.h
+++ b/src/StretchCalculator.h
@@ -68,8 +68,12 @@ public:
     * If increment is non-zero, use it for the input increment for
     * this block in preference to m_increment.
     */
-    int calculateSingle(double ratio, float curveValue,
-                        size_t increment = 0);
+    int calculateSingle(double timeRatio,            // time ratio that has already been applied
+                        double effectivePitchRatio,  // ratio that any subsequent resampling step will apply
+                        float curveValue,            // received as df by the implementation
+                        size_t increment,            // input increment; 0 means fall back to m_increment
+                        size_t analysisWindowSize,   // a quarter of this offsets the intended output frame
+                        size_t synthesisWindowSize); // a quarter of this offsets the projected output frame

    void setUseHardPeaks(bool use) { m_useHardPeaks = use; }

@@ -105,11 +109,16 @@ protected:
    size_t m_increment;
    float m_prevDf;
    double m_divergence;
-    float m_recovery;
-    float m_prevRatio;
+    double m_recovery;
+    double m_prevRatio;
+    double m_prevTimeRatio;
    int m_transientAmnesty; // only in RT mode; handled differently offline
    int m_debugLevel;
    bool m_useHardPeaks;
+    int64_t m_inFrameCounter;
+    std::pair<int64_t, int64_t> m_frameCheckpoint;
+    int64_t expectedOutFrame(int64_t inFrame, double timeRatio);
+    double m_outFrameCounter;

    std::map<size_t, size_t> m_keyFrameMap;
    std::vector<Peak> m_peaks;
--- a/src/StretcherImpl.cpp
+++ b/src/StretcherImpl.cpp
@@ -738,11 +738,12 @@ RubberBandStretcher::Impl::configure()
    // number of onset detector chunks will be the number of audio
    // samples input, divided by the input increment, plus one.

+    //!!! The !m_realtime guard below is commented out, so this prefill currently runs in RT mode too:
    // In real-time mode, we don't do this prefill -- it's better to
    // start with a swoosh than introduce more latency, and we don't
    // want gaps when the ratio changes.

-    if (!m_realtime) {
+//    if (!m_realtime) {
        if (m_debugLevel > 1) {
            cerr << "Not real time mode: prefilling" << endl;
        }
@@ -750,7 +751,7 @@ RubberBandStretcher::Impl::configure()
            m_channelData[c]->reset();
            m_channelData[c]->inbuf->zero(m_aWindowSize/2);
        }
-    }
+//    }
 }


@@ -777,6 +778,8 @@ RubberBandStretcher::Impl::reconfigure()

    calculateSizes();

+    bool somethingChanged = false;
+    
    // There are various allocations in this function, but they should
    // never happen in normal use -- they just recover from the case
    // where not all of the things we need were correctly created when
@@ -811,12 +814,15 @@ RubberBandStretcher::Impl::reconfigure()
            m_channelData[c]->setSizes(std::max(m_aWindowSize, m_sWindowSize),
                                       m_fftSize);
        }
+
+        somethingChanged = true;
    }

    if (m_outbufSize != prevOutbufSize) {
        for (size_t c = 0; c < m_channels; ++c) {
            m_channelData[c]->setOutbufSize(m_outbufSize);
        }
+        somethingChanged = true;
    }

    if (m_pitchScale != 1.0) {
@@ -839,11 +845,22 @@ RubberBandStretcher::Impl::reconfigure()
                lrintf(ceil((m_increment * m_timeRatio * 2) / m_pitchScale));
            if (rbs < m_increment * 16) rbs = m_increment * 16;
            m_channelData[c]->setResampleBufSize(rbs);
+
+            somethingChanged = true;
        }
    }

    if (m_fftSize != prevFftSize) {
        m_phaseResetAudioCurve->setFftSize(m_fftSize);
+        somethingChanged = true;
+    }
+
+    if (m_debugLevel > 0) {
+        if (somethingChanged) {
+            std::cerr << "reconfigure: at least one parameter changed" << std::endl;
+        } else {
+            std::cerr << "reconfigure: nothing changed" << std::endl;
+        }
    }
 }

--- a/src/StretcherProcess.cpp
+++ b/src/StretcherProcess.cpp
@@ -616,8 +616,14 @@ RubberBandStretcher::Impl::calculateIncrements(size_t &phaseIncrementRtn,
        }
    }

+    double effectivePitchRatio = 1.0 / m_pitchScale;
+    if (cd.resampler) {
+        effectivePitchRatio = cd.resampler->getEffectiveRatio(effectivePitchRatio);
+    }
+    
    int incr = m_stretchCalculator->calculateSingle
-        (getEffectiveRatio(), df, m_increment);
+        (m_timeRatio, effectivePitchRatio, df, m_increment,
+         m_aWindowSize, m_sWindowSize);

    if (m_lastProcessPhaseResetDf.getWriteSpace() > 0) {
        m_lastProcessPhaseResetDf.write(&df, 1);
@@ -1142,11 +1148,13 @@ RubberBandStretcher::Impl::writeOutput(RingBuffer<float> &to, float *from, size_
    // samples, because the first chunk is centred on the start of the
    // output.  In RT mode we didn't apply any pre-padding in
    // configure(), so we don't want to remove any here.
-
+//!!! The !m_realtime guard below is commented out, so startSkip currently applies in RT mode too
    size_t startSkip = 0;
-    if (!m_realtime) {
+//    if (!m_realtime) {
+    //!!! lock down the latency to this initial value in RT mode
        startSkip = lrintf((m_sWindowSize/2) / m_pitchScale);
-    }
+//    startSkip = m_sWindowSize/2;
+//    }

    if (outCount > startSkip) {
        
--- a/src/dsp/BQResampler.cpp
+++ b/src/dsp/BQResampler.cpp
@@ -220,6 +220,15 @@ BQResampler::resampleInterleaved(float *const out,
    return o / m_channels;
 }

+double // the `effective` parameter value associated with the requested ratio
+BQResampler::getEffectiveRatio(double ratio) const {
+    if (m_initialised && ratio == m_s->parameters.ratio) { // exact match with the ratio already held in m_s
+        return m_s->parameters.effective; // reuse the stored value
+    } else {
+        return pick_params(ratio).effective; // otherwise obtain it from pick_params() (defined outside this diff)
+    }
+}
+    
 int
 BQResampler::gcd(int a, int b) const
 {
--- a/src/dsp/BQResampler.h
+++ b/src/dsp/BQResampler.h
@@ -60,6 +60,8 @@ public:
                            const float *const in, int incount,
                            double ratio, bool final);

+    double getEffectiveRatio(double ratio) const;
+    
    void reset();

 private:
--- a/src/dsp/Resampler.cpp
+++ b/src/dsp/Resampler.cpp
@@ -99,6 +99,7 @@ public:
                                    bool final) = 0;

    virtual int getChannelCount() const = 0;
+    virtual double getEffectiveRatio(double ratio) const = 0;

    virtual void reset() = 0;
 };
@@ -130,6 +131,7 @@ public:
                            bool final = false);

    int getChannelCount() const { return m_channels; }
+    double getEffectiveRatio(double ratio) const { return ratio; } // reports the requested ratio unchanged

    void reset();

@@ -561,6 +563,7 @@ public:
                            bool final = false);

    int getChannelCount() const { return m_channels; }
+    double getEffectiveRatio(double ratio) const { return ratio; } // reports the requested ratio unchanged

    void reset();

@@ -785,6 +788,7 @@ public:
                            bool final);

    int getChannelCount() const { return m_channels; }
+    double getEffectiveRatio(double ratio) const { return ratio; } // reports the requested ratio unchanged

    void reset();

@@ -970,7 +974,13 @@ public:
                            double ratio,
                            bool final = false);

-    int getChannelCount() const { return m_channels; }
+    int getChannelCount() const {
+        return m_channels;
+    }
+
+    double getEffectiveRatio(double ratio) const { // forwards the query to m_resampler
+        return m_resampler->getEffectiveRatio(ratio);
+    }

    void reset();

@@ -1121,6 +1131,7 @@ public:
                            bool final = false);

    int getChannelCount() const { return m_channels; }
+    double getEffectiveRatio(double ratio) const { return ratio; } // reports the requested ratio unchanged

    void reset();

@@ -1545,6 +1556,12 @@ Resampler::getChannelCount() const
    return d->getChannelCount();
 }

+double
+Resampler::getEffectiveRatio(double ratio) const
+{
+    return d->getEffectiveRatio(ratio); // delegate to d; the result is whatever that object reports for this ratio
+}
+
 void
 Resampler::reset()
 {
--- a/src/dsp/Resampler.h
+++ b/src/dsp/Resampler.h
@@ -148,6 +148,8 @@ public:

    int getChannelCount() const;

+    double getEffectiveRatio(double ratio) const;
+    
    void reset();

    class Impl;