chromium/third_party/pffft/src/pffft.c

/* Copyright (c) 2013  Julien Pommier ( [email protected] )

   Based on original fortran 77 code from FFTPACKv4 from NETLIB
   (http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber
   of NCAR, in 1985.

   As confirmed by the NCAR fftpack software curators, the following
   FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
   released under the same terms.

   FFTPACK license:

   http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html

   Copyright (c) 2004 the University Corporation for Atmospheric
   Research ("UCAR"). All rights reserved. Developed by NCAR's
   Computational and Information Systems Laboratory, UCAR,
   www.cisl.ucar.edu.

   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:

   - Neither the names of NCAR's Computational and Information Systems
   Laboratory, the University Corporation for Atmospheric Research,
   nor the names of its sponsors or contributors may be used to
   endorse or promote products derived from this Software without
   specific prior written permission.  

   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.

   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.


   PFFFT : a Pretty Fast FFT.

   This file is largely based on the original FFTPACK implementation, modified
   to take advantage of the SIMD instructions of modern CPUs.
*/

/*
  ChangeLog: 
  - 2011/10/02, version 1: This is the very first release of this file.
*/

#include "pffft.h"
#include <stdint.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>

/* detect compiler flavour */
#if defined(_MSC_VER)
#define COMPILER_MSVC
#elif defined(__GNUC__)
#define COMPILER_GCC
#endif

#if defined(COMPILER_GCC)
#define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline))
#define NEVER_INLINE(return_type) return_type __attribute__ ((noinline))
#define RESTRICT __restrict
#define VLA_ARRAY_ON_STACK(type__, varname__, size__) \
  type__ *varname__ = (type__*)malloc(size__ * sizeof(type__))
#define VLA_ARRAY_ON_STACK_FREE(varname__) free(varname__)
#elif defined(COMPILER_MSVC)
#include <malloc.h>
#define ALWAYS_INLINE(return_type) __forceinline return_type
#define NEVER_INLINE(return_type) __declspec(noinline) return_type
#define RESTRICT __restrict
#define VLA_ARRAY_ON_STACK(type__, varname__, size__) \
  type__ *varname__ = (type__*)malloc(size__ * sizeof(type__))
#define VLA_ARRAY_ON_STACK_FREE(varname__) free(varname__)
#endif


/* 
   vector support macros: the rest of the code is independent of
   SSE/Altivec/NEON -- adding support for other platforms with 4-element
   vectors should be limited to these macros 
*/


// define PFFFT_SIMD_DISABLE if you want to use scalar code instead of SIMD code
//#define PFFFT_SIMD_DISABLE
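// For instance (an illustrative command line, not part of the Chromium build
// files): cc -O3 -DPFFFT_SIMD_DISABLE -c pffft.c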

/*
   Altivec support macros 
*/
#if !defined(PFFFT_SIMD_DISABLE) && (defined(__ppc__) || defined(__ppc64__))
typedef vector float v4sf;
#define SIMD_SZ 4
#define VZERO() ((vector float) vec_splat_u8(0))
#define VMUL(a,b) vec_madd(a,b, VZERO())
#define VADD(a,b) vec_add(a,b)
#define VMADD(a,b,c) vec_madd(a,b,c)
#define VSUB(a,b) vec_sub(a,b)
inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_perm(v, v, vec_lvsl(0, p)), 0); }
#define LD_PS1(p) ld_ps1(&p)
#define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = vec_mergeh(in1, in2); out2 = vec_mergel(in1, in2); out1 = tmp__; }
#define UNINTERLEAVE2(in1, in2, out1, out2) {                           \
    vector unsigned char vperm1 = (vector unsigned char){0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27}; \
    vector unsigned char vperm2 = (vector unsigned char){4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31}; \
    v4sf tmp__ = vec_perm(in1, in2, vperm1); out2 = vec_perm(in1, in2, vperm2); out1 = tmp__; \
  }
#define VTRANSPOSE4(x0,x1,x2,x3) {              \
    v4sf y0 = vec_mergeh(x0, x2);               \
    v4sf y1 = vec_mergel(x0, x2);               \
    v4sf y2 = vec_mergeh(x1, x3);               \
    v4sf y3 = vec_mergel(x1, x3);               \
    x0 = vec_mergeh(y0, y2);                    \
    x1 = vec_mergel(y0, y2);                    \
    x2 = vec_mergeh(y1, y3);                    \
    x3 = vec_mergel(y1, y3);                    \
  }
#define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char){16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15})
#define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0)

/*
  SSE1 support macros
*/
#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(__i386__) || defined(_M_IX86))

#include <xmmintrin.h>
typedef __m128 v4sf;
#define SIMD_SZ 4 // 4 floats per SIMD vector
#define VZERO() _mm_setzero_ps()
#define VMUL(a,b) _mm_mul_ps(a,b)
#define VADD(a,b) _mm_add_ps(a,b)
#define VMADD(a,b,c) _mm_add_ps(_mm_mul_ps(a,b), c)
#define VSUB(a,b) _mm_sub_ps(a,b)
#define LD_PS1(p) _mm_set1_ps(p)
#define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; }
#define UNINTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; }
#define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3)
#define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0))
#define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0)

/*
  ARM NEON support macros
*/
#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__arm__) || defined(__ARMEL__) || defined(__aarch64__) || defined(_M_ARM64))
#  include <arm_neon.h>
typedef float32x4_t v4sf;
#define SIMD_SZ 4
#define VZERO() vdupq_n_f32(0)
#define VMUL(a,b) vmulq_f32(a,b)
#define VADD(a,b) vaddq_f32(a,b)
#define VMADD(a,b,c) vmlaq_f32(c,a,b)
#define VSUB(a,b) vsubq_f32(a,b)
#define LD_PS1(p) vld1q_dup_f32(&(p))
#define INTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
#define UNINTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
#define VTRANSPOSE4(x0,x1,x2,x3) {                                        \
    float32x4x2_t t0_ = vzipq_f32(x0, x2);                                \
    float32x4x2_t t1_ = vzipq_f32(x1, x3);                                \
    float32x4x2_t u0_ = vzipq_f32(t0_.val[0], t1_.val[0]);                \
    float32x4x2_t u1_ = vzipq_f32(t0_.val[1], t1_.val[1]);                \
    x0 = u0_.val[0]; x1 = u0_.val[1]; x2 = u1_.val[0]; x3 = u1_.val[1];   \
  }
// marginally faster version
//#  define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); }
#define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a))
#define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x3) == 0)
#else
#  if !defined(PFFFT_SIMD_DISABLE)
#    warning "building with simd disabled !\n";
#define PFFFT_SIMD_DISABLE
#  endif
#endif

// fallback mode for situations where SSE/Altivec/NEON are not available, use scalar mode instead
#ifdef PFFFT_SIMD_DISABLE
typedef float v4sf;
#define SIMD_SZ 1
#define VZERO() 0.f
#define VMUL(a,b) ((a)*(b))
#define VADD(a,b) ((a)+(b))
#define VMADD(a,b,c) ((a)*(b)+(c))
#define VSUB(a,b) ((a)-(b))
#define LD_PS1(p) (p)
#define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x3) == 0)
#endif

// shortcuts for complex multiplications
#define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); }
#define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); }
#ifndef SVMUL
// multiply a scalar with a vector
#define SVMUL(f,v) VMUL(LD_PS1(f),v)
#endif
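/* Illustration (not used by the kernels below): VCPLXMUL computes the complex
   product (ar + i*ai)*(br + i*bi) in place, one complex value per SIMD lane:
   ar' = ar*br - ai*bi, ai' = ar*bi + ai*br. VCPLXMULCONJ multiplies by the
   conjugate (br - i*bi) instead. A minimal scalar-mode sanity check, assuming
   PFFFT_SIMD_DISABLE is defined:

     float ar = 1.f, ai = 2.f, br = 3.f, bi = 4.f;
     VCPLXMUL(ar, ai, br, bi);           // (1+2i)*(3+4i) = -5+10i
     assert(ar == -5.f && ai == 10.f);
*/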

#if !defined(PFFFT_SIMD_DISABLE)
typedef union v4sf_union {
  v4sf  v;
  float f[4];
} v4sf_union;

#include <string.h>

#define assertv4(v,f0,f1,f2,f3) assert(v.f[0] == (f0) && v.f[1] == (f1) && v.f[2] == (f2) && v.f[3] == (f3))

/* detect bugs with the vector support macros */
void validate_pffft_simd() {}
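/* A flavour of the checks such a validator typically performs (a sketch, not
   the elided body): splat a scalar into all lanes and verify each lane.

     float f = 42.f;
     v4sf_union u;
     u.v = LD_PS1(f);
     assertv4(u, 42, 42, 42, 42);
*/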
#endif //!PFFFT_SIMD_DISABLE

/* SSE and co like 16-byte aligned pointers */
#define MALLOC_V4SF_ALIGNMENT 64 // with a 64-byte alignment, we are even aligned on L2 cache lines...
void *pffft_aligned_malloc(size_t nb_bytes) {}

void pffft_aligned_free(void *p) {}

int pffft_simd_size() { return SIMD_SZ; }
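/* A common way to implement the malloc/free pair above (a sketch of the usual
   over-allocation scheme, not necessarily the exact elided code): allocate
   extra room, round the pointer up to the alignment boundary, and stash the
   original pointer just below the returned block so free() can recover it. */
#if 0 /* illustrative only */
static void *aligned_malloc_sketch(size_t nb_bytes) {
  void *p, *p0 = malloc(nb_bytes + MALLOC_V4SF_ALIGNMENT);
  if (!p0) return (void *)0;
  p = (void *)(((size_t)p0 + MALLOC_V4SF_ALIGNMENT) &
               (~((size_t)(MALLOC_V4SF_ALIGNMENT - 1))));
  *((void **)p - 1) = p0; // remember the base pointer for the free routine
  return p;
}
static void aligned_free_sketch(void *p) {
  if (p) free(*((void **)p - 1));
}
#endif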

/*
  passf2 and passb2 have been merged here, fsign = -1 for passf2, +1 for passb2
*/
static NEVER_INLINE(void) passf2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1, float fsign) {}
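/* How the fsign merge works (a sketch of the idea, not the elided kernel):
   each pass rotates the butterfly outputs by twiddle factors taken from wa1,
   and conjugating those twiddles turns the forward transform into the
   backward one. Schematically, inside the loop over i:

     v4sf wr = LD_PS1(wa1[i]);
     v4sf wi = SVMUL(fsign, LD_PS1(wa1[i+1]));  // fsign flips Im(w)
     VCPLXMUL(tr2, ti2, wr, wi);                // tr2/ti2: butterfly outputs
*/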

/*
  passf3 and passb3 have been merged here, fsign = -1 for passf3, +1 for passb3
*/
static NEVER_INLINE(void) passf3_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
                                    const float *wa1, const float *wa2, float fsign) {} /* passf3 */

static NEVER_INLINE(void) passf4_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
                                    const float *wa1, const float *wa2, const float *wa3, float fsign) {} /* passf4 */

/*
  passf5 and passb5 have been merged here, fsign = -1 for passf5, +1 for passb5
*/
static NEVER_INLINE(void) passf5_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
                                    const float *wa1, const float *wa2, 
                                    const float *wa3, const float *wa4, float fsign) {}

static NEVER_INLINE(void) radf2_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch, const float *wa1) {} /* radf2 */


static NEVER_INLINE(void) radb2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1) {} /* radb2 */

static void radf3_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
                     const float *wa1, const float *wa2) {} /* radf3 */


static void radb3_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
                     const float *wa1, const float *wa2)
{} /* radb3 */

static NEVER_INLINE(void) radf4_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf * RESTRICT ch,
                                   const float * RESTRICT wa1, const float * RESTRICT wa2, const float * RESTRICT wa3)
{} /* radf4 */


static NEVER_INLINE(void) radb4_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
                                   const float * RESTRICT wa1, const float * RESTRICT wa2, const float *RESTRICT wa3)
{} /* radb4 */

static void radf5_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch, 
                     const float *wa1, const float *wa2, const float *wa3, const float *wa4)
{} /* radf5 */

static void radb5_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, 
                  const float *wa1, const float *wa2, const float *wa3, const float *wa4)
{} /* radb5 */

static NEVER_INLINE(v4sf *) rfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, 
                                      const float *wa, const int *ifac) {} /* rfftf1 */

static NEVER_INLINE(v4sf *) rfftb1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, 
                                      const float *wa, const int *ifac) {}

static int decompose(int n, int *ifac, const int *ntryh) {}
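/* decompose() fills the FFTPACK factor table by trial division against the
   radices listed in ntryh (e.g. {4,2,3,5}), preferring earlier entries. A
   simplified sketch of the technique, assuming the usual FFTPACK layout
   ifac[0] = n, ifac[1] = number of factors, ifac[2..] = the factors:

     int nl = n, nf = 0, j;
     for (j = 0; ntryh[j] != 0; ++j)       // sentinel-terminated radix list
       while (nl % ntryh[j] == 0) {
         nl /= ntryh[j];
         ifac[2 + nf++] = ntryh[j];
       }
     ifac[0] = n; ifac[1] = nf;            // nl == 1 iff n fully factored
*/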



static void rffti1_ps(int n, float *wa, int *ifac)
{} /* rffti1 */

void cffti1_ps(int n, float *wa, int *ifac)
{} /* cffti1 */


v4sf *cfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, const float *wa, const int *ifac, int isign) {}


struct PFFFT_Setup {
  int N;
  int Ncvec; // nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL)
  int ifac[15];
  pffft_transform_t transform;
  v4sf *data;     // allocated room for the twiddle coefs
  float *e;       // points into 'data', N/4*3 elements
  float *twiddle; // points into 'data', N/4 elements
};

PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform) {}


void pffft_destroy_setup(PFFFT_Setup *s) {}
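/* Typical lifetime of a setup object (a usage sketch; see pffft.h for the
   exact constraints on N, which must factor into powers of 2, 3 and 5):

     PFFFT_Setup *s = pffft_new_setup(1024, PFFFT_REAL);
     if (s) {
       // ... run transforms with s ...
       pffft_destroy_setup(s);
     }
*/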

#if !defined(PFFFT_SIMD_DISABLE)

/* [0 0 1 2 3 4 5 6 7 8] -> [0 8 7 6 5 4 3 2 1] */
static void reversed_copy(int N, const v4sf *in, int in_stride, v4sf *out) {}
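/* Scalar picture of the above (a gloss, not the elided body): out[0] keeps
   the first element, and every following element is written in reverse order,
   reading the input with the given stride. */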

static void unreversed_copy(int N, const v4sf *in, v4sf *out, int out_stride) {}

void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) {}

void pffft_cplx_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {}

void pffft_cplx_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {}


static ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, const v4sf *in,
                            const v4sf *e, v4sf *out) {}

static NEVER_INLINE(void) pffft_real_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {}

static ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in, 
                                             const v4sf *e, v4sf *out, int first) {}

static NEVER_INLINE(void) pffft_real_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {}


void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *foutput, v4sf *scratch,
                             pffft_direction_t direction, int ordered) {}

void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab, float scaling) {}


#else // defined(PFFFT_SIMD_DISABLE)

// standard routine using scalar floats, without SIMD stuff.

#define pffft_zreorder_nosimd pffft_zreorder
void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) {
  int k, N = setup->N;
  if (setup->transform == PFFFT_COMPLEX) {
    for (k=0; k < 2*N; ++k) out[k] = in[k];
    return;
  }
  else if (direction == PFFFT_FORWARD) {
    float x_N = in[N-1];
    for (k=N-1; k > 1; --k) out[k] = in[k-1]; 
    out[0] = in[0];
    out[1] = x_N;
  } else {
    float x_N = in[1];
    for (k=1; k < N-1; ++k) out[k] = in[k+1]; 
    out[0] = in[0];
    out[N-1] = x_N;
  }
}
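// In other words, for the real transform the "ordered" layout is
// [X(0), X(N/2), Re X(1), Im X(1), Re X(2), Im X(2), ...]: FFTPACK stores the
// purely real DC bin first and the purely real Nyquist bin last, and the
// reorder step moves the Nyquist bin next to DC.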

#define pffft_transform_internal_nosimd pffft_transform_internal
void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, float *output, float *scratch,
                                    pffft_direction_t direction, int ordered) {
  int Ncvec   = setup->Ncvec;
  int nf_odd = (setup->ifac[1] & 1);

  // a temporary buffer is allocated if the scratch pointer is NULL
  int stack_allocate = (scratch == 0 ? Ncvec*2 : 1);
  VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate);
  float *buff[2];
  int ib;
  if (scratch == 0) scratch = scratch_on_stack;
  buff[0] = output; buff[1] = scratch;

  if (setup->transform == PFFFT_COMPLEX) ordered = 0; // it is always ordered.
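  // Pick the starting ping-pong buffer so that, after the passes performed by
  // the fft stages (nf_odd) plus the optional reordering step, the final
  // result lands in 'output' rather than in 'scratch'.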
  ib = (nf_odd ^ ordered ? 1 : 0);

  if (direction == PFFFT_FORWARD) {
    if (setup->transform == PFFFT_REAL) { 
      ib = (rfftf1_ps(Ncvec*2, input, buff[ib], buff[!ib],
                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);      
    } else {
      ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], 
                      setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1);
    }
    if (ordered) {
      pffft_zreorder(setup, buff[ib], buff[!ib], PFFFT_FORWARD); ib = !ib;
    }
  } else {    
    if (input == buff[ib]) { 
      ib = !ib; // may happen when finput == foutput
    }
    if (ordered) {
      pffft_zreorder(setup, input, buff[!ib], PFFFT_BACKWARD); 
      input = buff[!ib];
    }
    if (setup->transform == PFFFT_REAL) {
      ib = (rfftb1_ps(Ncvec*2, input, buff[ib], buff[!ib], 
                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
    } else {
      ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], 
                      setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1);
    }
  }
  if (buff[ib] != output) {
    int k;
    // extra copy required -- this situation should happen only when finput == foutput
    assert(input==output);
    for (k=0; k < Ncvec; ++k) {
      float a = buff[ib][2*k], b = buff[ib][2*k+1];
      output[2*k] = a; output[2*k+1] = b;
    }
    ib = !ib;
  }
  assert(buff[ib] == output);

  VLA_ARRAY_ON_STACK_FREE(scratch_on_stack);
}

#define pffft_zconvolve_accumulate_nosimd pffft_zconvolve_accumulate
void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup *s, const float *a, const float *b,
                                       float *ab, float scaling) {
  int i, Ncvec = s->Ncvec;

  if (s->transform == PFFFT_REAL) {
    // take care of the fftpack ordering: the DC and Nyquist bins are stored
    // as pure reals in the first and last slots, so accumulate them separately
    ab[0] += a[0]*b[0]*scaling;
    ab[2*Ncvec-1] += a[2*Ncvec-1]*b[2*Ncvec-1]*scaling;
    ++ab; ++a; ++b; --Ncvec;
  }
  for (i=0; i < Ncvec; ++i) {
    float ar, ai, br, bi;
    ar = a[2*i+0]; ai = a[2*i+1];
    br = b[2*i+0]; bi = b[2*i+1];
    VCPLXMUL(ar, ai, br, bi);
    ab[2*i+0] += ar*scaling;
    ab[2*i+1] += ai*scaling;
  }
}

#endif // defined(PFFFT_SIMD_DISABLE)

void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) {}

void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) {}
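/* End-to-end usage of the public API above (a sketch; per pffft.h, each float
   buffer holds N floats for PFFFT_REAL, 2*N for PFFFT_COMPLEX, and should be
   obtained from pffft_aligned_malloc):

     int N = 1024;
     PFFFT_Setup *s = pffft_new_setup(N, PFFFT_REAL);
     float *in   = (float*)pffft_aligned_malloc(N * sizeof(float));
     float *out  = (float*)pffft_aligned_malloc(N * sizeof(float));
     float *work = (float*)pffft_aligned_malloc(N * sizeof(float));
     // ... fill 'in', then:
     pffft_transform_ordered(s, in, out, work, PFFFT_FORWARD);
     // ... use the spectrum in 'out', then:
     pffft_transform_ordered(s, out, in, work, PFFFT_BACKWARD);
     // note: the forward+backward round trip is scaled by N (no 1/N anywhere)
     pffft_aligned_free(work); pffft_aligned_free(out); pffft_aligned_free(in);
     pffft_destroy_setup(s);
*/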