Reorganise into faster (R2) and finer (R3)

2022-05-19 13:34:51 +01:00
parent e9264ae909
commit e9ad04e2b4
66 changed files with 89 additions and 127 deletions
--- a/src/common/VectorOps.h
+++ b/src/common/VectorOps.h
@@ -0,0 +1,869 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Rubber Band Library
+    An audio time-stretching and pitch-shifting library.
+    Copyright 2007-2022 Particular Programs Ltd.
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+
+    Alternatively, if you have a valid commercial licence for the
+    Rubber Band Library obtained by agreement with the copyright
+    holders, you may redistribute and/or modify it under the terms
+    described in that licence.
+
+    If you wish to distribute code using the Rubber Band Library
+    under terms other than those of the GNU General Public License,
+    you must obtain a valid commercial licence before doing so.
+*/
+
+#ifndef RUBBERBAND_VECTOR_OPS_H
+#define RUBBERBAND_VECTOR_OPS_H
+
+#ifdef HAVE_IPP
+#ifndef _MSC_VER
+#include <inttypes.h>
+#endif
+#include <ippversion.h>
+#include <ipps.h>
+#if (IPP_VERSION_MAJOR <= 7)
+// Deprecated in v8, removed in v9
+#include <ippac.h>
+#endif
+#endif
+
+#ifdef HAVE_VDSP
+#include <Accelerate/Accelerate.h>
+#include <TargetConditionals.h>
+#include <alloca.h>
+#endif
+
+#include <cstring>
+#include "sysutils.h"
+
+namespace RubberBand {
+
+// Note that all functions with a "target" vector have their arguments
+// in the same order as memcpy and friends, i.e. target vector first.
+// This is the reverse order from the IPP functions.
+
+// The ideal here is to write the basic loops in such a way as to be
+// auto-vectorizable by a sensible compiler (definitely gcc-4.3 on
+// Linux, ideally also gcc-4.0 on OS/X).
+
+template<typename T>
+inline void v_zero(T *const R__ ptr,
+                   const int count)
+{
+    const T value = T(0);
+    for (int i = 0; i < count; ++i) {
+        ptr[i] = value;
+    }
+}
+
+#if defined HAVE_IPP
+template<> 
+inline void v_zero(float *const R__ ptr, 
+                   const int count)
+{
+    ippsZero_32f(ptr, count);
+}
+template<> 
+inline void v_zero(double *const R__ ptr,
+                   const int count)
+{
+    ippsZero_64f(ptr, count);
+}
+#elif defined HAVE_VDSP
+template<> 
+inline void v_zero(float *const R__ ptr, 
+                   const int count)
+{
+    vDSP_vclr(ptr, 1, count);
+}
+template<> 
+inline void v_zero(double *const R__ ptr,
+                   const int count)
+{
+    vDSP_vclrD(ptr, 1, count);
+}
+#endif
+
+template<typename T>
+inline void v_zero_channels(T *const R__ *const R__ ptr,
+                            const int channels,
+                            const int count)
+{
+    for (int c = 0; c < channels; ++c) {
+        v_zero(ptr[c], count);
+    }
+}
+
+template<typename T>
+inline void v_set(T *const R__ ptr,
+                  const T value,
+                  const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        ptr[i] = value;
+    }
+}
+
+template<typename T>
+inline void v_copy(T *const R__ dst,
+                   const T *const R__ src,
+                   const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        dst[i] = src[i];
+    }
+}
+
+#if defined HAVE_IPP
+template<>
+inline void v_copy(float *const R__ dst,
+                   const float *const R__ src,
+                   const int count)
+{
+    ippsCopy_32f(src, dst, count);
+}
+template<>
+inline void v_copy(double *const R__ dst,
+                   const double *const R__ src,
+                   const int count)
+{
+    ippsCopy_64f(src, dst, count);
+}
+#endif
+
+template<typename T>
+inline void v_copy_channels(T *const R__ *const R__ dst,
+                            const T *const R__ *const R__ src,
+                            const int channels,
+                            const int count)
+{
+    for (int c = 0; c < channels; ++c) {
+        v_copy(dst[c], src[c], count);
+    }
+}
+
+// src and dst alias by definition, so not restricted
+template<typename T>
+inline void v_move(T *const dst,
+                   const T *const src,
+                   const int count)
+{
+    memmove(dst, src, count * sizeof(T));
+}
+
+#if defined HAVE_IPP
+template<>
+inline void v_move(float *const dst,
+                   const float *const src,
+                   const int count)
+{
+    ippsMove_32f(src, dst, count);
+}
+template<>
+inline void v_move(double *const dst,
+                   const double *const src,
+                   const int count)
+{
+    ippsMove_64f(src, dst, count);
+}
+#endif
+
+template<typename T, typename U>
+inline void v_convert(U *const R__ dst,
+                      const T *const R__ src,
+                      const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        dst[i] = U(src[i]);
+    }
+}
+
+template<>
+inline void v_convert(float *const R__ dst,
+                      const float *const R__ src,
+                      const int count)
+{
+    v_copy(dst, src, count);
+}
+template<>
+inline void v_convert(double *const R__ dst,
+                      const double *const R__ src,
+                      const int count)
+{
+    v_copy(dst, src, count);
+}
+
+#if defined HAVE_IPP
+template<>
+inline void v_convert(double *const R__ dst,
+                      const float *const R__ src,
+                      const int count)
+{
+    ippsConvert_32f64f(src, dst, count);
+}
+template<>
+inline void v_convert(float *const R__ dst,
+                      const double *const R__ src,
+                      const int count)
+{
+    ippsConvert_64f32f(src, dst, count);
+}
+#elif defined HAVE_VDSP
+template<>
+inline void v_convert(double *const R__ dst,
+                      const float *const R__ src,
+                      const int count)
+{
+    vDSP_vspdp((float *)src, 1, dst, 1, count);
+}
+template<>
+inline void v_convert(float *const R__ dst,
+                      const double *const R__ src,
+                      const int count)
+{
+    vDSP_vdpsp((double *)src, 1, dst, 1, count);
+}
+#endif
+
+template<typename T, typename U>
+inline void v_convert_channels(U *const R__ *const R__ dst,
+                               const T *const R__ *const R__ src,
+                               const int channels,
+                               const int count)
+{
+    for (int c = 0; c < channels; ++c) {
+        v_convert(dst[c], src[c], count);
+    }
+}
+
+template<typename T>
+inline void v_add(T *const R__ dst,
+                  const T *const R__ src,
+                  const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        dst[i] += src[i];
+    }
+}
+
+template<typename T>
+inline void v_add(T *const R__ dst,
+                  const T value,
+                  const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        dst[i] += value;
+    }
+}
+
+#if defined HAVE_IPP
+template<>
+inline void v_add(float *const R__ dst,
+                  const float *const R__ src,
+                  const int count)
+{
+    ippsAdd_32f_I(src, dst, count);
+}    
+inline void v_add(double *const R__ dst,
+                  const double *const R__ src,
+                  const int count)
+{
+    ippsAdd_64f_I(src, dst, count);
+}    
+#endif
+
+template<typename T>
+inline void v_add_channels(T *const R__ *const R__ dst,
+                           const T *const R__ *const R__ src,
+                           const int channels, const int count)
+{
+    for (int c = 0; c < channels; ++c) {
+        v_add(dst[c], src[c], count);
+    }
+}
+
+template<typename T, typename G>
+inline void v_add_with_gain(T *const R__ dst,
+                            const T *const R__ src,
+                            const G gain,
+                            const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        dst[i] += src[i] * gain;
+    }
+}
+
+template<typename T, typename G>
+inline void v_add_channels_with_gain(T *const R__ *const R__ dst,
+                                     const T *const R__ *const R__ src,
+                                     const G gain,
+                                     const int channels,
+                                     const int count)
+{
+    for (int c = 0; c < channels; ++c) {
+        v_add_with_gain(dst[c], src[c], gain, count);
+    }
+}
+
+template<typename T>
+inline void v_subtract(T *const R__ dst,
+                       const T *const R__ src,
+                       const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        dst[i] -= src[i];
+    }
+}
+
+#if defined HAVE_IPP
+template<>
+inline void v_subtract(float *const R__ dst,
+                       const float *const R__ src,
+                       const int count)
+{
+    ippsSub_32f_I(src, dst, count);
+}    
+inline void v_subtract(double *const R__ dst,
+                       const double *const R__ src,
+                       const int count)
+{
+    ippsSub_64f_I(src, dst, count);
+}    
+#endif
+
+template<typename T, typename G>
+inline void v_scale(T *const R__ dst,
+                    const G gain,
+                    const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        dst[i] *= gain;
+    }
+}
+
+#if defined HAVE_IPP 
+template<>
+inline void v_scale(float *const R__ dst,
+                    const float gain,
+                    const int count)
+{
+    ippsMulC_32f_I(gain, dst, count);
+}
+template<>
+inline void v_scale(double *const R__ dst,
+                    const double gain,
+                    const int count)
+{
+    ippsMulC_64f_I(gain, dst, count);
+}
+#endif
+
+template<typename T, typename S>
+inline void v_multiply(T *const R__ srcdst,
+                       const S *const R__ src,
+                       const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        srcdst[i] *= src[i];
+    }
+}
+
+#if defined HAVE_IPP 
+template<>
+inline void v_multiply(float *const R__ srcdst,
+                       const float *const R__ src,
+                       const int count)
+{
+    ippsMul_32f_I(src, srcdst, count);
+}
+template<>
+inline void v_multiply(double *const R__ srcdst,
+                       const double *const R__ src,
+                       const int count)
+{
+    ippsMul_64f_I(src, srcdst, count);
+}
+#endif // HAVE_IPP
+
+template<typename T>
+inline void v_multiply(T *const R__ dst,
+                       const T *const R__ src1,
+                       const T *const R__ src2,
+                       const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        dst[i] = src1[i] * src2[i];
+    }
+}
+
+template<typename T>
+inline void v_divide(T *const R__ dst,
+                     const T *const R__ src,
+                     const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        dst[i] /= src[i];
+    }
+}
+
+#if defined HAVE_IPP 
+template<>
+inline void v_divide(float *const R__ dst,
+                     const float *const R__ src,
+                     const int count)
+{
+    ippsDiv_32f_I(src, dst, count);
+}
+template<>
+inline void v_divide(double *const R__ dst,
+                     const double *const R__ src,
+                     const int count)
+{
+    ippsDiv_64f_I(src, dst, count);
+}
+#endif
+
+#if defined HAVE_IPP 
+template<>
+inline void v_multiply(float *const R__ dst,
+                       const float *const R__ src1,
+                       const float *const R__ src2,
+                       const int count)
+{
+    ippsMul_32f(src1, src2, dst, count);
+}    
+template<>
+inline void v_multiply(double *const R__ dst,
+                       const double *const R__ src1,
+                       const double *const R__ src2,
+                       const int count)
+{
+    ippsMul_64f(src1, src2, dst, count);
+}
+#endif
+
+template<typename T>
+inline void v_multiply_and_add(T *const R__ dst,
+                               const T *const R__ src1,
+                               const T *const R__ src2,
+                               const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        dst[i] += src1[i] * src2[i];
+    }
+}
+
+#if defined HAVE_IPP
+template<>
+inline void v_multiply_and_add(float *const R__ dst,
+                               const float *const R__ src1,
+                               const float *const R__ src2,
+                               const int count)
+{
+    ippsAddProduct_32f(src1, src2, dst, count);
+}
+template<>
+inline void v_multiply_and_add(double *const R__ dst,
+                               const double *const R__ src1,
+                               const double *const R__ src2,
+                               const int count)
+{
+    ippsAddProduct_64f(src1, src2, dst, count);
+}
+#endif
+
+template<typename T>
+inline T v_sum(const T *const R__ src,
+               const int count)
+{
+    T result = T();
+    for (int i = 0; i < count; ++i) {
+        result += src[i];
+    }
+    return result;
+}
+
+template<typename T>
+inline T v_multiply_and_sum(const T *const R__ src1,
+                            const T *const R__ src2,
+                            const int count)
+{
+    T result = T();
+    for (int i = 0; i < count; ++i) {
+        result += src1[i] * src2[i];
+    }
+    return result;
+}
+
+#if defined HAVE_IPP
+template<>
+inline float v_multiply_and_sum(const float *const R__ src1,
+                                const float *const R__ src2,
+                                const int count)
+{
+    float dp;
+    ippsDotProd_32f(src1, src2, count, &dp);
+    return dp;
+}
+template<>
+inline double v_multiply_and_sum(const double *const R__ src1,
+                                 const double *const R__ src2,
+                                 const int count)
+{
+    double dp;
+    ippsDotProd_64f(src1, src2, count, &dp);
+    return dp;
+}
+#elif defined HAVE_VDSP
+template<>
+inline float v_multiply_and_sum(const float *const R__ src1,
+                                const float *const R__ src2,
+                                const int count)
+{
+    float dp;
+    vDSP_dotpr(src1, 1, src2, 1, &dp, count);
+    return dp;
+}
+template<>
+inline double v_multiply_and_sum(const double *const R__ src1,
+                                 const double *const R__ src2,
+                                 const int count)
+{
+    double dp;
+    vDSP_dotprD(src1, 1, src2, 1, &dp, count);
+    return dp;
+}
+#endif
+
+template<typename T>
+inline void v_log(T *const R__ dst,
+                  const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        dst[i] = log(dst[i]);
+    }
+}
+
+#if defined HAVE_IPP
+template<>
+inline void v_log(float *const R__ dst,
+                  const int count)
+{
+    ippsLn_32f_I(dst, count);
+}
+template<>
+inline void v_log(double *const R__ dst,
+                  const int count)
+{
+    ippsLn_64f_I(dst, count);
+}
+#elif defined HAVE_VDSP
+// no in-place vForce functions for these -- can we use the
+// out-of-place functions with equal input and output vectors? can we
+// use an out-of-place one with temporary buffer and still be faster
+// than doing it any other way?
+template<>
+inline void v_log(float *const R__ dst,
+                  const int count)
+{
+    float *tmp = (float *)alloca(count * sizeof(float));
+    vvlogf(tmp, dst, &count);
+    v_copy(dst, tmp, count);
+}
+template<>
+inline void v_log(double *const R__ dst,
+                  const int count)
+{
+    double *tmp = (double *)alloca(count * sizeof(double));
+    vvlog(tmp, dst, &count);
+    v_copy(dst, tmp, count);
+}
+#endif
+
+template<typename T>
+inline void v_exp(T *const R__ dst,
+                  const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        dst[i] = exp(dst[i]);
+    }
+}
+
+#if defined HAVE_IPP
+template<>
+inline void v_exp(float *const R__ dst,
+                  const int count)
+{
+    ippsExp_32f_I(dst, count);
+}
+template<>
+inline void v_exp(double *const R__ dst,
+                  const int count)
+{
+    ippsExp_64f_I(dst, count);
+}
+#elif defined HAVE_VDSP
+// no in-place vForce functions for these -- can we use the
+// out-of-place functions with equal input and output vectors? can we
+// use an out-of-place one with temporary buffer and still be faster
+// than doing it any other way?
+template<>
+inline void v_exp(float *const R__ dst,
+                  const int count)
+{
+    float *tmp = (float *)alloca(count * sizeof(float));
+    vvexpf(tmp, dst, &count);
+    v_copy(dst, tmp, count);
+}
+template<>
+inline void v_exp(double *const R__ dst,
+                  const int count)
+{
+    double *tmp = (double *)alloca(count * sizeof(double));
+    vvexp(tmp, dst, &count);
+    v_copy(dst, tmp, count);
+}
+#endif
+
+template<typename T>
+inline void v_sqrt(T *const R__ dst,
+                   const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        dst[i] = sqrt(dst[i]);
+    }
+}
+
+#if defined HAVE_IPP
+template<>
+inline void v_sqrt(float *const R__ dst,
+                   const int count)
+{
+    ippsSqrt_32f_I(dst, count);
+}
+template<>
+inline void v_sqrt(double *const R__ dst,
+                   const int count)
+{
+    ippsSqrt_64f_I(dst, count);
+}
+#elif defined HAVE_VDSP
+// no in-place vForce functions for these -- can we use the
+// out-of-place functions with equal input and output vectors? can we
+// use an out-of-place one with temporary buffer and still be faster
+// than doing it any other way?
+template<>
+inline void v_sqrt(float *const R__ dst,
+                   const int count)
+{
+    float *tmp = (float *)alloca(count * sizeof(float));
+    vvsqrtf(tmp, dst, &count);
+    v_copy(dst, tmp, count);
+}
+template<>
+inline void v_sqrt(double *const R__ dst,
+                   const int count)
+{
+    double *tmp = (double *)alloca(count * sizeof(double));
+    vvsqrt(tmp, dst, &count);
+    v_copy(dst, tmp, count);
+}
+#endif
+
+template<typename T>
+inline void v_square(T *const R__ dst,
+                   const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        dst[i] = dst[i] * dst[i];
+    }
+}
+
+#if defined HAVE_IPP
+template<>
+inline void v_square(float *const R__ dst,
+                   const int count)
+{
+    ippsSqr_32f_I(dst, count);
+}
+template<>
+inline void v_square(double *const R__ dst,
+                   const int count)
+{
+    ippsSqr_64f_I(dst, count);
+}
+#endif
+
+template<typename T>
+inline void v_abs(T *const R__ dst,
+                  const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        dst[i] = fabs(dst[i]);
+    }
+}
+
+#if defined HAVE_IPP
+template<>
+inline void v_abs(float *const R__ dst,
+                  const int count)
+{
+    ippsAbs_32f_I(dst, count);
+}
+template<>
+inline void v_abs(double *const R__ dst,
+                  const int count)
+{
+    ippsAbs_64f_I(dst, count);
+}
+#elif defined HAVE_VDSP
+template<>
+inline void v_abs(float *const R__ dst,
+                  const int count)
+{
+    float *tmp = (float *)alloca(count * sizeof(float));
+#if TARGET_OS_IPHONE
+    vvfabsf(tmp, dst, &count);
+#elif (defined(MAC_OS_X_VERSION_MIN_REQUIRED) && MAC_OS_X_VERSION_MIN_REQUIRED < 1070)
+    vvfabf(tmp, dst, &count);
+#else
+    vvfabsf(tmp, dst, &count);
+#endif
+    v_copy(dst, tmp, count);
+}
+#endif
+
+template<typename T>
+inline void v_interleave(T *const R__ dst,
+                         const T *const R__ *const R__ src,
+                         const int channels, 
+                         const int count)
+{
+    int idx = 0;
+    switch (channels) {
+    case 2:
+        // common case, may be vectorized by compiler if hardcoded
+        for (int i = 0; i < count; ++i) {
+            for (int j = 0; j < 2; ++j) {
+                dst[idx++] = src[j][i];
+            }
+        }
+        return;
+    case 1:
+        v_copy(dst, src[0], count);
+        return;
+    default:
+        for (int i = 0; i < count; ++i) {
+            for (int j = 0; j < channels; ++j) {
+                dst[idx++] = src[j][i];
+            }
+        }
+    }
+}
+
+#if defined HAVE_IPP 
+#if (IPP_VERSION_MAJOR <= 7)
+// Deprecated in v8, removed in v9
+template<>
+inline void v_interleave(float *const R__ dst,
+                         const float *const R__ *const R__ src,
+                         const int channels, 
+                         const int count)
+{
+    ippsInterleave_32f((const Ipp32f **)src, channels, count, dst);
+}
+// IPP does not (currently?) provide double-precision interleave
+#endif
+#endif
+
+template<typename T>
+inline void v_deinterleave(T *const R__ *const R__ dst,
+                           const T *const R__ src,
+                           const int channels, 
+                           const int count)
+{
+    int idx = 0;
+    switch (channels) {
+    case 2:
+        // common case, may be vectorized by compiler if hardcoded
+        for (int i = 0; i < count; ++i) {
+            for (int j = 0; j < 2; ++j) {
+                dst[j][i] = src[idx++];
+            }
+        }
+        return;
+    case 1:
+        v_copy(dst[0], src, count);
+        return;
+    default:
+        for (int i = 0; i < count; ++i) {
+            for (int j = 0; j < channels; ++j) {
+                dst[j][i] = src[idx++];
+            }
+        }
+    }
+}
+
+#if defined HAVE_IPP
+#if (IPP_VERSION_MAJOR <= 7)
+// Deprecated in v8, removed in v9
+template<>
+inline void v_deinterleave(float *const R__ *const R__ dst,
+                           const float *const R__ src,
+                           const int channels, 
+                           const int count)
+{
+    ippsDeinterleave_32f((const Ipp32f *)src, channels, count, (Ipp32f **)dst);
+}
+// IPP does not (currently?) provide double-precision deinterleave
+#endif
+#endif
+
+template<typename T>
+inline void v_fftshift(T *const R__ ptr,
+                       const int count)
+{
+    const int hs = count/2;
+    for (int i = 0; i < hs; ++i) {
+        T t = ptr[i];
+        ptr[i] = ptr[i + hs];
+        ptr[i + hs] = t;
+    }
+}
+
+template<typename T>
+inline T v_mean(const T *const R__ ptr, const int count)
+{
+    T t = T(0);
+    for (int i = 0; i < count; ++i) {
+        t += ptr[i];
+    }
+    t /= T(count);
+    return t;
+}
+
+template<typename T>
+inline T v_mean_channels(const T *const R__ *const R__ ptr,
+                         const int channels,
+                         const int count)
+{
+    T t = T(0);
+    for (int c = 0; c < channels; ++c) {
+        t += v_mean(ptr[c], count);
+    }
+    t /= T(channels);
+    return t;
+}
+
+}
+
+#endif