librubberband/src/common/VectorOps.h

/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */

/*
    Rubber Band Library
    An audio time-stretching and pitch-shifting library.
    Copyright 2007-2022 Particular Programs Ltd.

    This program is free software; you can redistribute it and/or
    modify it under the terms of the GNU General Public License as
    published by the Free Software Foundation; either version 2 of the
    License, or (at your option) any later version.  See the file
    COPYING included with this distribution for more information.

    Alternatively, if you have a valid commercial licence for the
    Rubber Band Library obtained by agreement with the copyright
    holders, you may redistribute and/or modify it under the terms
    described in that licence.

    If you wish to distribute code using the Rubber Band Library
    under terms other than those of the GNU General Public License,
    you must obtain a valid commercial licence before doing so.
*/

#ifndef RUBBERBAND_VECTOR_OPS_H
#define RUBBERBAND_VECTOR_OPS_H

#ifdef HAVE_IPP
#ifndef _MSC_VER
#include <inttypes.h>
#endif
#include <ippversion.h>
#include <ipps.h>
#if (IPP_VERSION_MAJOR <= 7)
// Deprecated in v8, removed in v9
#include <ippac.h>
#endif
#endif

#ifdef HAVE_VDSP
#include <Accelerate/Accelerate.h>
#include <TargetConditionals.h>
#include <alloca.h>
#endif

#include <cstring>
#include "sysutils.h"

namespace RubberBand {

// Note that all functions with a "target" vector have their arguments
// in the same order as memcpy and friends, i.e. target vector first.
// This is the reverse order from the IPP functions.

// The ideal here is to write the basic loops in such a way as to be
// auto-vectorizable by a sensible compiler (definitely gcc-4.3 on
// Linux, ideally also gcc-4.0 on OS/X).

template<typename T>
inline void v_zero(T *const R__ ptr,
                   const int count)
{
    const T value = T(0);
    for (int i = 0; i < count; ++i) {
        ptr[i] = value;
    }
}

#if defined HAVE_IPP
template<>
inline void v_zero(float *const R__ ptr,
                   const int count)
{
    ippsZero_32f(ptr, count);
}
template<>
inline void v_zero(double *const R__ ptr,
                   const int count)
{
    ippsZero_64f(ptr, count);
}
#elif defined HAVE_VDSP
template<>
inline void v_zero(float *const R__ ptr,
                   const int count)
{
    vDSP_vclr(ptr, 1, count);
}
template<>
inline void v_zero(double *const R__ ptr,
                   const int count)
{
    vDSP_vclrD(ptr, 1, count);
}
#endif

template<typename T>
inline void v_zero_channels(T *const R__ *const R__ ptr,
                            const int channels,
                            const int count)
{
    for (int c = 0; c < channels; ++c) {
        v_zero(ptr[c], count);
    }
}

template<typename T>
inline void v_set(T *const R__ ptr,
                  const T value,
                  const int count)
{
    for (int i = 0; i < count; ++i) {
        ptr[i] = value;
    }
}

template<typename T>
inline void v_copy(T *const R__ dst,
                   const T *const R__ src,
                   const int count)
{
    for (int i = 0; i < count; ++i) {
        dst[i] = src[i];
    }
}

#if defined HAVE_IPP
template<>
inline void v_copy(float *const R__ dst,
                   const float *const R__ src,
                   const int count)
{
    ippsCopy_32f(src, dst, count);
}
template<>
inline void v_copy(double *const R__ dst,
                   const double *const R__ src,
                   const int count)
{
    ippsCopy_64f(src, dst, count);
}
#endif

template<typename T>
inline void v_copy_channels(T *const R__ *const R__ dst,
                            const T *const R__ *const R__ src,
                            const int channels,
                            const int count)
{
    for (int c = 0; c < channels; ++c) {
        v_copy(dst[c], src[c], count);
    }
}

// src and dst alias by definition, so not restricted
template<typename T>
inline void v_move(T *const dst,
                   const T *const src,
                   const int count)
{
    memmove(dst, src, count * sizeof(T));
}

#if defined HAVE_IPP
template<>
inline void v_move(float *const dst,
                   const float *const src,
                   const int count)
{
    ippsMove_32f(src, dst, count);
}
template<>
inline void v_move(double *const dst,
                   const double *const src,
                   const int count)
{
    ippsMove_64f(src, dst, count);
}
#endif

template<typename T, typename U>
inline void v_convert(U *const R__ dst,
                      const T *const R__ src,
                      const int count)
{
    for (int i = 0; i < count; ++i) {
        dst[i] = U(src[i]);
    }
}

template<>
inline void v_convert(float *const R__ dst,
                      const float *const R__ src,
                      const int count)
{
    v_copy(dst, src, count);
}
template<>
inline void v_convert(double *const R__ dst,
                      const double *const R__ src,
                      const int count)
{
    v_copy(dst, src, count);
}

#if defined HAVE_IPP
template<>
inline void v_convert(double *const R__ dst,
                      const float *const R__ src,
                      const int count)
{
    ippsConvert_32f64f(src, dst, count);
}
template<>
inline void v_convert(float *const R__ dst,
                      const double *const R__ src,
                      const int count)
{
    ippsConvert_64f32f(src, dst, count);
}
#elif defined HAVE_VDSP
template<>
inline void v_convert(double *const R__ dst,
                      const float *const R__ src,
                      const int count)
{
    vDSP_vspdp((float *)src, 1, dst, 1, count);
}
template<>
inline void v_convert(float *const R__ dst,
                      const double *const R__ src,
                      const int count)
{
    vDSP_vdpsp((double *)src, 1, dst, 1, count);
}
#endif

template<typename T, typename U>
inline void v_convert_channels(U *const R__ *const R__ dst,
                               const T *const R__ *const R__ src,
                               const int channels,
                               const int count)
{
    for (int c = 0; c < channels; ++c) {
        v_convert(dst[c], src[c], count);
    }
}

template<typename T>
inline void v_add(T *const R__ dst,
                  const T *const R__ src,
                  const int count)
{
    for (int i = 0; i < count; ++i) {
        dst[i] += src[i];
    }
}

template<typename T>
inline void v_add(T *const R__ dst,
                  const T value,
                  const int count)
{
    for (int i = 0; i < count; ++i) {
        dst[i] += value;
    }
}

#if defined HAVE_IPP
template<>
inline void v_add(float *const R__ dst,
                  const float *const R__ src,
                  const int count)
{
    ippsAdd_32f_I(src, dst, count);
}
inline void v_add(double *const R__ dst,
                  const double *const R__ src,
                  const int count)
{
    ippsAdd_64f_I(src, dst, count);
}
#endif

template<typename T>
inline void v_add_channels(T *const R__ *const R__ dst,
                           const T *const R__ *const R__ src,
                           const int channels, const int count)
{
    for (int c = 0; c < channels; ++c) {
        v_add(dst[c], src[c], count);
    }
}

template<typename T, typename G>
inline void v_add_with_gain(T *const R__ dst,
                            const T *const R__ src,
                            const G gain,
                            const int count)
{
    for (int i = 0; i < count; ++i) {
        dst[i] += src[i] * gain;
    }
}

template<typename T, typename G>
inline void v_add_channels_with_gain(T *const R__ *const R__ dst,
                                     const T *const R__ *const R__ src,
                                     const G gain,
                                     const int channels,
                                     const int count)
{
    for (int c = 0; c < channels; ++c) {
        v_add_with_gain(dst[c], src[c], gain, count);
    }
}

template<typename T>
inline void v_subtract(T *const R__ dst,
                       const T *const R__ src,
                       const int count)
{
    for (int i = 0; i < count; ++i) {
        dst[i] -= src[i];
    }
}

#if defined HAVE_IPP
template<>
inline void v_subtract(float *const R__ dst,
                       const float *const R__ src,
                       const int count)
{
    ippsSub_32f_I(src, dst, count);
}
inline void v_subtract(double *const R__ dst,
                       const double *const R__ src,
                       const int count)
{
    ippsSub_64f_I(src, dst, count);
}
#endif

template<typename T, typename G>
inline void v_scale(T *const R__ dst,
                    const G gain,
                    const int count)
{
    for (int i = 0; i < count; ++i) {
        dst[i] *= gain;
    }
}

#if defined HAVE_IPP
template<>
inline void v_scale(float *const R__ dst,
                    const float gain,
                    const int count)
{
    ippsMulC_32f_I(gain, dst, count);
}
template<>
inline void v_scale(double *const R__ dst,
                    const double gain,
                    const int count)
{
    ippsMulC_64f_I(gain, dst, count);
}
#endif

template<typename T, typename S>
inline void v_multiply(T *const R__ srcdst,
                       const S *const R__ src,
                       const int count)
{
    for (int i = 0; i < count; ++i) {
        srcdst[i] *= src[i];
    }
}

#if defined HAVE_IPP
template<>
inline void v_multiply(float *const R__ srcdst,
                       const float *const R__ src,
                       const int count)
{
    ippsMul_32f_I(src, srcdst, count);
}
template<>
inline void v_multiply(double *const R__ srcdst,
                       const double *const R__ src,
                       const int count)
{
    ippsMul_64f_I(src, srcdst, count);
}
#endif // HAVE_IPP

template<typename T>
inline void v_multiply(T *const R__ dst,
                       const T *const R__ src1,
                       const T *const R__ src2,
                       const int count)
{
    for (int i = 0; i < count; ++i) {
        dst[i] = src1[i] * src2[i];
    }
}

template<typename T>
inline void v_divide(T *const R__ dst,
                     const T *const R__ src,
                     const int count)
{
    for (int i = 0; i < count; ++i) {
        dst[i] /= src[i];
    }
}

#if defined HAVE_IPP
template<>
inline void v_divide(float *const R__ dst,
                     const float *const R__ src,
                     const int count)
{
    ippsDiv_32f_I(src, dst, count);
}
template<>
inline void v_divide(double *const R__ dst,
                     const double *const R__ src,
                     const int count)
{
    ippsDiv_64f_I(src, dst, count);
}
#endif

#if defined HAVE_IPP
template<>
inline void v_multiply(float *const R__ dst,
                       const float *const R__ src1,
                       const float *const R__ src2,
                       const int count)
{
    ippsMul_32f(src1, src2, dst, count);
}
template<>
inline void v_multiply(double *const R__ dst,
                       const double *const R__ src1,
                       const double *const R__ src2,
                       const int count)
{
    ippsMul_64f(src1, src2, dst, count);
}
#endif

template<typename T>
inline void v_multiply_and_add(T *const R__ dst,
                               const T *const R__ src1,
                               const T *const R__ src2,
                               const int count)
{
    for (int i = 0; i < count; ++i) {
        dst[i] += src1[i] * src2[i];
    }
}

#if defined HAVE_IPP
template<>
inline void v_multiply_and_add(float *const R__ dst,
                               const float *const R__ src1,
                               const float *const R__ src2,
                               const int count)
{
    ippsAddProduct_32f(src1, src2, dst, count);
}
template<>
inline void v_multiply_and_add(double *const R__ dst,
                               const double *const R__ src1,
                               const double *const R__ src2,
                               const int count)
{
    ippsAddProduct_64f(src1, src2, dst, count);
}
#endif

template<typename T>
inline T v_sum(const T *const R__ src,
               const int count)
{
    T result = T();
    for (int i = 0; i < count; ++i) {
        result += src[i];
    }
    return result;
}

template<typename T>
inline T v_multiply_and_sum(const T *const R__ src1,
                            const T *const R__ src2,
                            const int count)
{
    T result = T();
    for (int i = 0; i < count; ++i) {
        result += src1[i] * src2[i];
    }
    return result;
}

#if defined HAVE_IPP
template<>
inline float v_multiply_and_sum(const float *const R__ src1,
                                const float *const R__ src2,
                                const int count)
{
    float dp;
    ippsDotProd_32f(src1, src2, count, &dp);
    return dp;
}
template<>
inline double v_multiply_and_sum(const double *const R__ src1,
                                 const double *const R__ src2,
                                 const int count)
{
    double dp;
    ippsDotProd_64f(src1, src2, count, &dp);
    return dp;
}
#elif defined HAVE_VDSP
template<>
inline float v_multiply_and_sum(const float *const R__ src1,
                                const float *const R__ src2,
                                const int count)
{
    float dp;
    vDSP_dotpr(src1, 1, src2, 1, &dp, count);
    return dp;
}
template<>
inline double v_multiply_and_sum(const double *const R__ src1,
                                 const double *const R__ src2,
                                 const int count)
{
    double dp;
    vDSP_dotprD(src1, 1, src2, 1, &dp, count);
    return dp;
}
#endif

template<typename T>
inline void v_log(T *const R__ dst,
                  const int count)
{
    for (int i = 0; i < count; ++i) {
        dst[i] = log(dst[i]);
    }
}

#if defined HAVE_IPP
template<>
inline void v_log(float *const R__ dst,
                  const int count)
{
    ippsLn_32f_I(dst, count);
}
template<>
inline void v_log(double *const R__ dst,
                  const int count)
{
    ippsLn_64f_I(dst, count);
}
#elif defined HAVE_VDSP
// no in-place vForce functions for these -- can we use the
// out-of-place functions with equal input and output vectors? can we
// use an out-of-place one with temporary buffer and still be faster
// than doing it any other way?
template<>
inline void v_log(float *const R__ dst,
                  const int count)
{
    float *tmp = (float *)alloca(count * sizeof(float));
    vvlogf(tmp, dst, &count);
    v_copy(dst, tmp, count);
}
template<>
inline void v_log(double *const R__ dst,
                  const int count)
{
    double *tmp = (double *)alloca(count * sizeof(double));
    vvlog(tmp, dst, &count);
    v_copy(dst, tmp, count);
}
#endif

template<typename T>
inline void v_exp(T *const R__ dst,
                  const int count)
{
    for (int i = 0; i < count; ++i) {
        dst[i] = exp(dst[i]);
    }
}

#if defined HAVE_IPP
template<>
inline void v_exp(float *const R__ dst,
                  const int count)
{
    ippsExp_32f_I(dst, count);
}
template<>
inline void v_exp(double *const R__ dst,
                  const int count)
{
    ippsExp_64f_I(dst, count);
}
#elif defined HAVE_VDSP
// no in-place vForce functions for these -- can we use the
// out-of-place functions with equal input and output vectors? can we
// use an out-of-place one with temporary buffer and still be faster
// than doing it any other way?
template<>
inline void v_exp(float *const R__ dst,
                  const int count)
{
    float *tmp = (float *)alloca(count * sizeof(float));
    vvexpf(tmp, dst, &count);
    v_copy(dst, tmp, count);
}
template<>
inline void v_exp(double *const R__ dst,
                  const int count)
{
    double *tmp = (double *)alloca(count * sizeof(double));
    vvexp(tmp, dst, &count);
    v_copy(dst, tmp, count);
}
#endif

template<typename T>
inline void v_sqrt(T *const R__ dst,
                   const int count)
{
    for (int i = 0; i < count; ++i) {
        dst[i] = sqrt(dst[i]);
    }
}

#if defined HAVE_IPP
template<>
inline void v_sqrt(float *const R__ dst,
                   const int count)
{
    ippsSqrt_32f_I(dst, count);
}
template<>
inline void v_sqrt(double *const R__ dst,
                   const int count)
{
    ippsSqrt_64f_I(dst, count);
}
#elif defined HAVE_VDSP
// no in-place vForce functions for these -- can we use the
// out-of-place functions with equal input and output vectors? can we
// use an out-of-place one with temporary buffer and still be faster
// than doing it any other way?
template<>
inline void v_sqrt(float *const R__ dst,
                   const int count)
{
    float *tmp = (float *)alloca(count * sizeof(float));
    vvsqrtf(tmp, dst, &count);
    v_copy(dst, tmp, count);
}
template<>
inline void v_sqrt(double *const R__ dst,
                   const int count)
{
    double *tmp = (double *)alloca(count * sizeof(double));
    vvsqrt(tmp, dst, &count);
    v_copy(dst, tmp, count);
}
#endif

template<typename T>
inline void v_square(T *const R__ dst,
                   const int count)
{
    for (int i = 0; i < count; ++i) {
        dst[i] = dst[i] * dst[i];
    }
}

#if defined HAVE_IPP
template<>
inline void v_square(float *const R__ dst,
                   const int count)
{
    ippsSqr_32f_I(dst, count);
}
template<>
inline void v_square(double *const R__ dst,
                   const int count)
{
    ippsSqr_64f_I(dst, count);
}
#endif

template<typename T>
inline void v_abs(T *const R__ dst,
                  const int count)
{
    for (int i = 0; i < count; ++i) {
        dst[i] = fabs(dst[i]);
    }
}

#if defined HAVE_IPP
template<>
inline void v_abs(float *const R__ dst,
                  const int count)
{
    ippsAbs_32f_I(dst, count);
}
template<>
inline void v_abs(double *const R__ dst,
                  const int count)
{
    ippsAbs_64f_I(dst, count);
}
#elif defined HAVE_VDSP
template<>
inline void v_abs(float *const R__ dst,
                  const int count)
{
    float *tmp = (float *)alloca(count * sizeof(float));
#if TARGET_OS_IPHONE
    vvfabsf(tmp, dst, &count);
#elif (defined(MAC_OS_X_VERSION_MIN_REQUIRED) && MAC_OS_X_VERSION_MIN_REQUIRED < 1070)
    vvfabf(tmp, dst, &count);
#else
    vvfabsf(tmp, dst, &count);
#endif
    v_copy(dst, tmp, count);
}
#endif

template<typename T>
inline void v_interleave(T *const R__ dst,
                         const T *const R__ *const R__ src,
                         const int channels,
                         const int count)
{
    int idx = 0;
    switch (channels) {
    case 2:
        // common case, may be vectorized by compiler if hardcoded
        for (int i = 0; i < count; ++i) {
            for (int j = 0; j < 2; ++j) {
                dst[idx++] = src[j][i];
            }
        }
        return;
    case 1:
        v_copy(dst, src[0], count);
        return;
    default:
        for (int i = 0; i < count; ++i) {
            for (int j = 0; j < channels; ++j) {
                dst[idx++] = src[j][i];
            }
        }
    }
}

#if defined HAVE_IPP
#if (IPP_VERSION_MAJOR <= 7)
// Deprecated in v8, removed in v9
template<>
inline void v_interleave(float *const R__ dst,
                         const float *const R__ *const R__ src,
                         const int channels,
                         const int count)
{
    ippsInterleave_32f((const Ipp32f **)src, channels, count, dst);
}
// IPP does not (currently?) provide double-precision interleave
#endif
#endif

template<typename T>
inline void v_deinterleave(T *const R__ *const R__ dst,
                           const T *const R__ src,
                           const int channels,
                           const int count)
{
    int idx = 0;
    switch (channels) {
    case 2:
        // common case, may be vectorized by compiler if hardcoded
        for (int i = 0; i < count; ++i) {
            for (int j = 0; j < 2; ++j) {
                dst[j][i] = src[idx++];
            }
        }
        return;
    case 1:
        v_copy(dst[0], src, count);
        return;
    default:
        for (int i = 0; i < count; ++i) {
            for (int j = 0; j < channels; ++j) {
                dst[j][i] = src[idx++];
            }
        }
    }
}

#if defined HAVE_IPP
#if (IPP_VERSION_MAJOR <= 7)
// Deprecated in v8, removed in v9
template<>
inline void v_deinterleave(float *const R__ *const R__ dst,
                           const float *const R__ src,
                           const int channels,
                           const int count)
{
    ippsDeinterleave_32f((const Ipp32f *)src, channels, count, (Ipp32f **)dst);
}
// IPP does not (currently?) provide double-precision deinterleave
#endif
#endif

template<typename T>
inline void v_fftshift(T *const R__ ptr,
                       const int count)
{
    const int hs = count/2;
    for (int i = 0; i < hs; ++i) {
        T t = ptr[i];
        ptr[i] = ptr[i + hs];
        ptr[i + hs] = t;
    }
}

template<typename T>
inline T v_mean(const T *const R__ ptr, const int count)
{
    T t = T(0);
    for (int i = 0; i < count; ++i) {
        t += ptr[i];
    }
    t /= T(count);
    return t;
}

template<typename T>
inline T v_mean_channels(const T *const R__ *const R__ ptr,
                         const int channels,
                         const int count)
{
    T t = T(0);
    for (int c = 0; c < channels; ++c) {
        t += v_mean(ptr[c], count);
    }
    t /= T(channels);
    return t;
}

}

#endif