Pull out per-channel analysis and resynthesis functions

This commit is contained in:
Chris Cannam
2022-05-25 11:26:16 +01:00
parent 47476b9088
commit f5b381e086
2 changed files with 279 additions and 265 deletions

View File

@@ -179,9 +179,7 @@ void
R3StretcherImpl::consume() R3StretcherImpl::consume()
{ {
double ratio = getEffectiveRatio(); double ratio = getEffectiveRatio();
int longest = m_guideConfiguration.longestFftSize; int longest = m_guideConfiguration.longestFftSize;
int classify = m_guideConfiguration.classificationFftSize;
m_calculator->setDebugLevel(3); m_calculator->setDebugLevel(3);
@@ -194,11 +192,14 @@ R3StretcherImpl::consume()
std::cout << "outhop = " << outhop << std::endl; std::cout << "outhop = " << outhop << std::endl;
double instantaneousRatio = double(m_prevOuthop) / double(m_inhop);
m_prevOuthop = outhop;
while (m_channelData.at(0)->outbuf->getWriteSpace() >= outhop) { while (m_channelData.at(0)->outbuf->getWriteSpace() >= outhop) {
// NB our ChannelData, ScaleData, and ChannelScaleData maps
// contain shared_ptrs; whenever we retain one of them in a
// variable, we do so by reference to avoid copying the
// shared_ptr (as that is not realtime safe). Same goes for
// the map iterators
int readSpace = m_channelData.at(0)->inbuf->getReadSpace(); int readSpace = m_channelData.at(0)->inbuf->getReadSpace();
if (readSpace < longest) { if (readSpace < longest) {
if (m_draining) { if (m_draining) {
@@ -210,158 +211,10 @@ R3StretcherImpl::consume()
} }
} }
// Analysis. This is per channel // Analysis
for (int c = 0; c < m_parameters.channels; ++c) { for (int c = 0; c < m_parameters.channels; ++c) {
analyseChannel(c, m_prevOuthop);
// Our ChannelData, ScaleData, and ChannelScaleData maps
// contain shared_ptrs; whenever we retain one of them in
// a variable here, we do so by reference to avoid copying
// the shared_ptr (as that is not realtime safe). Same
// goes for the map iterators
auto &cd = m_channelData.at(c);
auto &longestScale = cd->scales.at(longest);
double *buf = longestScale->timeDomain.data();
if (readSpace < longest) {
cd->inbuf->peek(buf, readSpace);
v_zero(buf + readSpace, longest - readSpace);
} else {
cd->inbuf->peek(buf, longest);
}
// We have a single unwindowed frame at the longest FFT
// size ("scale"). Populate the shorter FFT sizes from the
// centre of it, windowing as we copy. The classification
// scale is handled separately because it has readahead,
// so skip it here as well as the longest. (In practice
// this means we are probably only populating one scale)
for (auto &it: cd->scales) {
int fftSize = it.first;
if (fftSize == classify || fftSize == longest) continue;
int offset = (longest - fftSize) / 2;
m_scaleData.at(fftSize)->analysisWindow.cut
(buf + offset, it.second->timeDomain.data());
}
// The classification scale has a one-hop readahead (note
// that inhop is fixed), so populate its current data from
// the readahead and the readahead from further down the
// long unwindowed frame.
auto &classifyScale = cd->scales.at(classify);
ClassificationReadaheadData &readahead = cd->readahead;
m_scaleData.at(classify)->analysisWindow.cut
(buf + (longest - classify) / 2 + m_inhop,
readahead.timeDomain.data());
// Finally window the longest scale
m_scaleData.at(longest)->analysisWindow.cut(buf);
// FFT shift, forward FFT, and carry out cartesian-polar
// conversion for each FFT size.
// For the classification scale we need magnitudes for the
// full range (polar only in a subset) and we operate in
// the readahead, pulling current values from the existing
// readahead
v_fftshift(readahead.timeDomain.data(), classify);
v_copy(classifyScale->mag.data(),
readahead.mag.data(),
classifyScale->bufSize);
v_copy(classifyScale->phase.data(),
readahead.phase.data(),
classifyScale->bufSize);
m_scaleData.at(classify)->fft.forward(readahead.timeDomain.data(),
classifyScale->real.data(),
classifyScale->imag.data());
for (const auto &b : m_guideConfiguration.fftBandLimits) {
if (b.fftSize == classify) {
if (b.b0min > 0) {
v_cartesian_to_magnitudes(readahead.mag.data(),
classifyScale->real.data(),
classifyScale->imag.data(),
b.b0min);
}
v_cartesian_to_polar(readahead.mag.data() + b.b0min,
readahead.phase.data() + b.b0min,
classifyScale->real.data() + b.b0min,
classifyScale->imag.data() + b.b0min,
b.b1max - b.b0min);
if (b.b1max < classify/2 + 1) {
v_cartesian_to_magnitudes
(readahead.mag.data() + b.b1max,
classifyScale->real.data() + b.b1max,
classifyScale->imag.data() + b.b1max,
classify/2 + 1 - b.b1max);
}
v_scale(classifyScale->mag.data(),
1.0 / double(classify),
classifyScale->mag.size());
break;
}
}
// For the others we operate directly in the scale data
// and restrict the range for cartesian-polar conversion
for (auto &it: cd->scales) {
int fftSize = it.first;
if (fftSize == classify) continue;
auto &scale = it.second;
v_fftshift(scale->timeDomain.data(), fftSize);
m_scaleData.at(fftSize)->fft.forward(scale->timeDomain.data(),
scale->real.data(),
scale->imag.data());
//!!! This should be a map
for (const auto &b : m_guideConfiguration.fftBandLimits) {
if (b.fftSize == fftSize) {
v_cartesian_to_polar(scale->mag.data() + b.b0min,
scale->phase.data() + b.b0min,
scale->real.data() + b.b0min,
scale->imag.data() + b.b0min,
b.b1max - b.b0min);
v_scale(scale->mag.data() + b.b0min,
1.0 / double(fftSize),
b.b1max - b.b0min);
break;
}
}
}
// Use the classification scale to get a bin segmentation
// and calculate the adaptive frequency guide for this
// channel
cd->prevSegmentation = cd->segmentation;
cd->segmentation = cd->nextSegmentation;
cd->nextSegmentation = cd->segmenter->segment(readahead.mag.data());
m_troughPicker.findNearestAndNextPeaks
(classifyScale->mag.data(), 3, nullptr,
classifyScale->troughs.data());
m_guide.calculate(instantaneousRatio,
classifyScale->mag.data(),
classifyScale->troughs.data(),
classifyScale->prevMag.data(),
cd->segmentation,
cd->prevSegmentation,
cd->nextSegmentation,
cd->guidance);
} }
// Phase update. This is synchronised across all channels // Phase update. This is synchronised across all channels
@@ -370,11 +223,11 @@ R3StretcherImpl::consume()
int fftSize = it.first; int fftSize = it.first;
for (int c = 0; c < m_parameters.channels; ++c) { for (int c = 0; c < m_parameters.channels; ++c) {
auto &cd = m_channelData.at(c); auto &cd = m_channelData.at(c);
auto &classifyScale = cd->scales.at(fftSize); auto &scale = cd->scales.at(fftSize);
m_channelAssembly.mag[c] = classifyScale->mag.data(); m_channelAssembly.mag[c] = scale->mag.data();
m_channelAssembly.phase[c] = classifyScale->phase.data(); m_channelAssembly.phase[c] = scale->phase.data();
m_channelAssembly.guidance[c] = &cd->guidance; m_channelAssembly.guidance[c] = &cd->guidance;
m_channelAssembly.outPhase[c] = classifyScale->advancedPhase.data(); m_channelAssembly.outPhase[c] = scale->advancedPhase.data();
} }
m_scaleData.at(fftSize)->guided.advance m_scaleData.at(fftSize)->guided.advance
(m_channelAssembly.outPhase.data(), (m_channelAssembly.outPhase.data(),
@@ -386,134 +239,293 @@ R3StretcherImpl::consume()
outhop); outhop);
} }
// Resynthesis. This is per channel // Resynthesis
for (int c = 0; c < m_parameters.channels; ++c) { for (int c = 0; c < m_parameters.channels; ++c) {
synthesiseChannel(c, outhop);
}
}
auto &cd = m_channelData.at(c); m_prevOuthop = outhop;
}
for (auto &it : cd->scales) { void
auto &scale = it.second; R3StretcherImpl::analyseChannel(int c, int prevOuthop)
int bufSize = scale->bufSize; {
// copy to prevMag before filtering int longest = m_guideConfiguration.longestFftSize;
v_copy(scale->prevMag.data(), int classify = m_guideConfiguration.classificationFftSize;
scale->mag.data(),
bufSize);
}
for (const auto &band : cd->guidance.fftBands) {
int fftSize = band.fftSize;
auto &scale = cd->scales.at(fftSize);
auto &scaleData = m_scaleData.at(fftSize);
//!!! messy and slow, but leave it until we've auto &cd = m_channelData.at(c);
//!!! discovered whether we need a window accumulator double *buf = cd->scales.at(longest)->timeDomain.data();
//!!! (we probably do)
int analysisWindowSize = scaleData->analysisWindow.getSize();
int synthesisWindowSize = scaleData->synthesisWindow.getSize();
int offset = (analysisWindowSize - synthesisWindowSize) / 2;
double winscale = 0.0;
for (int i = 0; i < synthesisWindowSize; ++i) {
winscale += scaleData->analysisWindow.getValue(i + offset) *
scaleData->synthesisWindow.getValue(i);
}
winscale = double(outhop) / winscale;
// The frequency filter is applied naively in the int readSpace = cd->inbuf->getReadSpace();
// frequency domain. Aliasing is reduced by the if (readSpace < longest) {
// shorter resynthesis window cd->inbuf->peek(buf, readSpace);
v_zero(buf + readSpace, longest - readSpace);
double factor = m_parameters.sampleRate / double(fftSize); } else {
for (int i = 0; i < fftSize/2 + 1; ++i) { cd->inbuf->peek(buf, longest);
double f = double(i) * factor; }
if (f >= band.f0 && f < band.f1) {
//!!! check the mod 2 bit from stretch-fn // We have a single unwindowed frame at the longest FFT
scale->mag[i] *= winscale; // size ("scale"). Populate the shorter FFT sizes from the
} else { // centre of it, windowing as we copy. The classification
scale->mag[i] = 0.f; // scale is handled separately because it has readahead,
} // so skip it here as well as the longest. (In practice
} // this means we are probably only populating one scale)
}
// Resynthesise each FFT size (scale) individually, then for (auto &it: cd->scales) {
// sum. This is easier to manage scaling for in situations int fftSize = it.first;
// with a varying resynthesis hop if (fftSize == classify || fftSize == longest) continue;
int offset = (longest - fftSize) / 2;
m_scaleData.at(fftSize)->analysisWindow.cut
(buf + offset, it.second->timeDomain.data());
}
// The classification scale has a one-hop readahead (note
// that inhop is fixed), so populate its current data from
// the readahead and the readahead from further down the
// long unwindowed frame.
auto &classifyScale = cd->scales.at(classify);
ClassificationReadaheadData &readahead = cd->readahead;
m_scaleData.at(classify)->analysisWindow.cut
(buf + (longest - classify) / 2 + m_inhop,
readahead.timeDomain.data());
for (auto &it : cd->scales) { // Finally window the longest scale
int fftSize = it.first; m_scaleData.at(longest)->analysisWindow.cut(buf);
auto &scale = it.second;
auto &scaleData = m_scaleData.at(fftSize);
for (const auto &b : m_guideConfiguration.fftBandLimits) {
if (b.fftSize == fftSize) {
int offset = b.b0min;
v_zero(scale->real.data(), fftSize/2 + 1);
v_zero(scale->imag.data(), fftSize/2 + 1);
v_polar_to_cartesian
(scale->real.data() + offset,
scale->imag.data() + offset,
scale->mag.data() + offset,
scale->advancedPhase.data() + offset,
b.b1max - offset);
break;
}
}
scaleData->fft.inverse(scale->real.data(), // FFT shift, forward FFT, and carry out cartesian-polar
scale->imag.data(), // conversion for each FFT size.
scale->timeDomain.data());
v_fftshift(scale->timeDomain.data(), fftSize); // For the classification scale we need magnitudes for the
// full range (polar only in a subset) and we operate in
// the readahead, pulling current values from the existing
// readahead
// Synthesis window is shorter than analysis window, v_fftshift(readahead.timeDomain.data(), classify);
// so copy and cut only from the middle of the
// time-domain frame; and the accumulator length
// always matches the longest FFT size, so as to make
// mixing straightforward, so there is an additional
// offset needed for the target
int synthesisWindowSize = scaleData->synthesisWindow.getSize();
int fromOffset = (fftSize - synthesisWindowSize) / 2;
int toOffset = (m_guideConfiguration.longestFftSize -
synthesisWindowSize) / 2;
scaleData->synthesisWindow.cutAndAdd v_copy(classifyScale->mag.data(),
(scale->timeDomain.data() + fromOffset, readahead.mag.data(),
scale->accumulator.data() + toOffset); classifyScale->bufSize);
}
// Mix and emit this channel
double *mixptr = cd->mixdown.data(); v_copy(classifyScale->phase.data(),
v_zero(mixptr, outhop); readahead.phase.data(),
classifyScale->bufSize);
for (auto &it : cd->scales) { m_scaleData.at(classify)->fft.forward(readahead.timeDomain.data(),
auto &scale = it.second; classifyScale->real.data(),
v_add(mixptr, scale->accumulator.data(), outhop); classifyScale->imag.data());
for (const auto &b : m_guideConfiguration.fftBandLimits) {
if (b.fftSize == classify) {
if (b.b0min > 0) {
v_cartesian_to_magnitudes(readahead.mag.data(),
classifyScale->real.data(),
classifyScale->imag.data(),
b.b0min);
} }
cd->outbuf->write(mixptr, outhop); v_cartesian_to_polar(readahead.mag.data() + b.b0min,
readahead.phase.data() + b.b0min,
for (auto &it : cd->scales) { classifyScale->real.data() + b.b0min,
int fftSize = it.first; classifyScale->imag.data() + b.b0min,
auto &scale = it.second; b.b1max - b.b0min);
double *accptr = scale->accumulator.data();
if (b.b1max < classify/2 + 1) {
int n = scale->accumulator.size() - outhop; v_cartesian_to_magnitudes
v_move(accptr, accptr + outhop, n); (readahead.mag.data() + b.b1max,
v_zero(accptr + n, outhop); classifyScale->real.data() + b.b1max,
classifyScale->imag.data() + b.b1max,
classify/2 + 1 - b.b1max);
} }
v_scale(classifyScale->mag.data(),
1.0 / double(classify),
classifyScale->mag.size());
break;
}
}
// For the others we operate directly in the scale data
// and restrict the range for cartesian-polar conversion
if (readSpace < m_inhop) { for (auto &it: cd->scales) {
// This should happen only when draining int fftSize = it.first;
cd->inbuf->skip(readSpace); if (fftSize == classify) continue;
} else { auto &scale = it.second;
cd->inbuf->skip(m_inhop);
v_fftshift(scale->timeDomain.data(), fftSize);
m_scaleData.at(fftSize)->fft.forward(scale->timeDomain.data(),
scale->real.data(),
scale->imag.data());
//!!! This should be a map
for (const auto &b : m_guideConfiguration.fftBandLimits) {
if (b.fftSize == fftSize) {
v_cartesian_to_polar(scale->mag.data() + b.b0min,
scale->phase.data() + b.b0min,
scale->real.data() + b.b0min,
scale->imag.data() + b.b0min,
b.b1max - b.b0min);
v_scale(scale->mag.data() + b.b0min,
1.0 / double(fftSize),
b.b1max - b.b0min);
break;
} }
} }
} }
// Use the classification scale to get a bin segmentation
// and calculate the adaptive frequency guide for this
// channel
cd->prevSegmentation = cd->segmentation;
cd->segmentation = cd->nextSegmentation;
cd->nextSegmentation = cd->segmenter->segment(readahead.mag.data());
m_troughPicker.findNearestAndNextPeaks
(classifyScale->mag.data(), 3, nullptr,
classifyScale->troughs.data());
double instantaneousRatio = double(prevOuthop) / double(m_inhop);
m_guide.calculate(instantaneousRatio,
classifyScale->mag.data(),
classifyScale->troughs.data(),
classifyScale->prevMag.data(),
cd->segmentation,
cd->prevSegmentation,
cd->nextSegmentation,
cd->guidance);
} }
void
R3StretcherImpl::synthesiseChannel(int c, int outhop)
{
int longest = m_guideConfiguration.longestFftSize;
auto &cd = m_channelData.at(c);
for (auto &it : cd->scales) {
auto &scale = it.second;
int bufSize = scale->bufSize;
// copy to prevMag before filtering
v_copy(scale->prevMag.data(),
scale->mag.data(),
bufSize);
}
for (const auto &band : cd->guidance.fftBands) {
int fftSize = band.fftSize;
auto &scale = cd->scales.at(fftSize);
auto &scaleData = m_scaleData.at(fftSize);
//!!! messy and slow, but leave it until we've
//!!! discovered whether we need a window accumulator
//!!! (we probably do)
int analysisWindowSize = scaleData->analysisWindow.getSize();
int synthesisWindowSize = scaleData->synthesisWindow.getSize();
int offset = (analysisWindowSize - synthesisWindowSize) / 2;
double winscale = 0.0;
for (int i = 0; i < synthesisWindowSize; ++i) {
winscale += scaleData->analysisWindow.getValue(i + offset) *
scaleData->synthesisWindow.getValue(i);
}
winscale = double(outhop) / winscale;
// The frequency filter is applied naively in the
// frequency domain. Aliasing is reduced by the
// shorter resynthesis window
double factor = m_parameters.sampleRate / double(fftSize);
for (int i = 0; i < fftSize/2 + 1; ++i) {
double f = double(i) * factor;
if (f >= band.f0 && f < band.f1) {
//!!! check the mod 2 bit from stretch-fn
scale->mag[i] *= winscale;
} else {
scale->mag[i] = 0.f;
}
}
}
// Resynthesise each FFT size (scale) individually, then
// sum. This is easier to manage scaling for in situations
// with a varying resynthesis hop
for (auto &it : cd->scales) {
int fftSize = it.first;
auto &scale = it.second;
auto &scaleData = m_scaleData.at(fftSize);
for (const auto &b : m_guideConfiguration.fftBandLimits) {
if (b.fftSize == fftSize) {
int offset = b.b0min;
v_zero(scale->real.data(), fftSize/2 + 1);
v_zero(scale->imag.data(), fftSize/2 + 1);
v_polar_to_cartesian
(scale->real.data() + offset,
scale->imag.data() + offset,
scale->mag.data() + offset,
scale->advancedPhase.data() + offset,
b.b1max - offset);
break;
}
}
scaleData->fft.inverse(scale->real.data(),
scale->imag.data(),
scale->timeDomain.data());
v_fftshift(scale->timeDomain.data(), fftSize);
// Synthesis window is shorter than analysis window,
// so copy and cut only from the middle of the
// time-domain frame; and the accumulator length
// always matches the longest FFT size, so as to make
// mixing straightforward, so there is an additional
// offset needed for the target
int synthesisWindowSize = scaleData->synthesisWindow.getSize();
int fromOffset = (fftSize - synthesisWindowSize) / 2;
int toOffset = (longest - synthesisWindowSize) / 2;
scaleData->synthesisWindow.cutAndAdd
(scale->timeDomain.data() + fromOffset,
scale->accumulator.data() + toOffset);
}
// Mix and emit this channel
double *mixptr = cd->mixdown.data();
v_zero(mixptr, outhop);
for (auto &it : cd->scales) {
auto &scale = it.second;
v_add(mixptr, scale->accumulator.data(), outhop);
}
cd->outbuf->write(mixptr, outhop);
for (auto &it : cd->scales) {
int fftSize = it.first;
auto &scale = it.second;
double *accptr = scale->accumulator.data();
int n = scale->accumulator.size() - outhop;
v_move(accptr, accptr + outhop, n);
v_zero(accptr + n, outhop);
}
int readSpace = cd->inbuf->getReadSpace();
if (readSpace < m_inhop) {
// This should happen only when draining
cd->inbuf->skip(readSpace);
} else {
cd->inbuf->skip(m_inhop);
}
}
} }

View File

@@ -248,6 +248,8 @@ protected:
void consume(); void consume();
void calculateHop(); void calculateHop();
void analyseChannel(int channel, int prevOuthop);
void synthesiseChannel(int channel, int outhop);
double getEffectiveRatio() const { double getEffectiveRatio() const {
return m_timeRatio * m_pitchScale; return m_timeRatio * m_pitchScale;