Skip to content

Commit

Permalink
Add unordered transforms and convolution (#3)
Browse files Browse the repository at this point in the history
* Implementing frequency-domain convolution

* Test convolution with complex I/O

* Convolution for SSE and AVX
  • Loading branch information
jatinchowdhury18 authored Oct 25, 2024
1 parent 0e96713 commit e6b65ea
Show file tree
Hide file tree
Showing 6 changed files with 355 additions and 17 deletions.
87 changes: 82 additions & 5 deletions chowdsp_fft.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,8 @@ SOFTWARE.

#include <cassert>
#include <cmath>
#include <cstdlib>
#include <cstdint>
#include <cstdlib>

#if defined(_MSC_VER)
// Contains the definition of __cpuidex
Expand Down Expand Up @@ -89,7 +89,8 @@ struct FFT_Setup;
FFT_Setup* fft_new_setup (int N, fft_transform_t transform);
void fft_destroy_setup (FFT_Setup* s);
void pffft_transform_internal (FFT_Setup* setup, const float* finput, float* foutput, void* scratch, fft_direction_t direction, int ordered);
}
void pffft_convolve_internal (FFT_Setup* setup, const float* a, const float* b, float* ab, float scaling);
} // namespace chowdsp::fft::avx
static constexpr uintptr_t address_mask = ~static_cast<uintptr_t> (3);
static constexpr uintptr_t typeid_mask = static_cast<uintptr_t> (3);
#endif
Expand Down Expand Up @@ -187,19 +188,19 @@ static bool cpu_supports_avx()
[[maybe_unused]] const auto fma3_avx = avx && fma3_sse42;

int regs8[4];
get_cpuid(regs8, 0x80000001);
get_cpuid (regs8, 0x80000001);
[[maybe_unused]] const auto fma4 = regs8[2] >> 16 & avx_state_os_enabled;

// sse4a = regs[2] >> 6 & 1;

// xop = regs[2] >> 11 & 1;

int regs7[4];
get_cpuid(regs7, 0x7);
get_cpuid (regs7, 0x7);
const auto avx2 = regs7[1] >> 5 & avx_state_os_enabled;

int regs7a[4];
get_cpuid(regs7a, 0x7, 0x1);
get_cpuid (regs7a, 0x7, 0x1);
[[maybe_unused]] const auto avxvnni = regs7a[0] >> 4 & avx_state_os_enabled;

const auto fma3_avx2 = avx2 && fma3_sse42;
Expand Down Expand Up @@ -305,4 +306,80 @@ void fft_transform (void* setup, const float* input, float* output, float* work,
1);
#endif
}

void fft_transform_unordered (void* setup, const float* input, float* output, float* work, fft_direction_t direction)
{
#if defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64)
#if CHOWDSP_FFT_COMPILER_SUPPORTS_AVX
if (check_is_pointer_sse_setup (setup))
{
sse::pffft_transform_internal (reinterpret_cast<sse::FFT_Setup*> (get_setup_pointer (setup)),
input,
output,
(__m128*) work,
direction,
0);
}
else
{
avx::pffft_transform_internal (reinterpret_cast<avx::FFT_Setup*> (get_setup_pointer (setup)),
input,
output,
work,
direction,
0);
}
#else
sse::pffft_transform_internal (reinterpret_cast<sse::FFT_Setup*> (setup),
input,
output,
(__m128*) work,
direction,
0);
#endif
#elif defined(__ARM_NEON__)
neon::pffft_transform_internal (reinterpret_cast<neon::FFT_Setup*> (setup),
input,
output,
(float32x4_t*) work,
direction,
0);
#endif
}

void fft_convolve_unordered (void* setup, const float* a, const float* b, float* ab, float scaling)
{
#if defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64)
#if CHOWDSP_FFT_COMPILER_SUPPORTS_AVX
if (check_is_pointer_sse_setup (setup))
{
sse::pffft_convolve_internal (reinterpret_cast<sse::FFT_Setup*> (get_setup_pointer (setup)),
a,
b,
ab,
scaling);
}
else
{
avx::pffft_convolve_internal (reinterpret_cast<avx::FFT_Setup*> (get_setup_pointer (setup)),
a,
b,
ab,
scaling);
}
#else
sse::pffft_convolve_internal (reinterpret_cast<sse::FFT_Setup*> (setup),
a,
b,
ab,
scaling);
#endif
#elif defined(__ARM_NEON__)
neon::pffft_convolve_internal (reinterpret_cast<neon::FFT_Setup*> (setup),
a,
b,
ab,
scaling);
#endif
}
} // namespace chowdsp::fft
12 changes: 12 additions & 0 deletions chowdsp_fft.h
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,18 @@ void fft_destroy_setup (void*);
*/
void fft_transform (void* setup, const float* input, float* output, float* work, fft_direction_t direction);

/**
* Same as the above method, but without necessarily maintaining the correct data order
* in the frequency domain. This may be useful if your frequency-domain operations
* are order-independent, for example, convolution.
*/
void fft_transform_unordered (void* setup, const float* input, float* output, float* work, fft_direction_t direction);

/**
* Convolve two (unordered) frequency-domain signals, with some scale factor.
*/
void fft_convolve_unordered (void* setup, const float* a, const float* b, float* ab, float scaling);

void* aligned_malloc (size_t nb_bytes);
void aligned_free (void*);

Expand Down
44 changes: 44 additions & 0 deletions simd/chowdsp_fft_impl_avx.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1984,5 +1984,49 @@ void pffft_transform_internal (FFT_Setup* setup, const float* finput, float* fou
}
assert (buff[ib] == voutput);
}

void pffft_convolve_internal (FFT_Setup* setup, const float* a, const float* b, float* ab, float scaling)
{
int Ncvec = setup->Ncvec;
auto* va = reinterpret_cast<const __m256*> (a);
auto* vb = reinterpret_cast<const __m256*> (b);
auto* vab = reinterpret_cast<__m256*> (ab);

float ar0, ai0, br0, bi0, abr0, abi0;
const auto vscal = _mm256_set1_ps (scaling);
int i;

ar0 = reinterpret_cast<const float*> (&va[0])[0];
ai0 = reinterpret_cast<const float*> (&va[1])[0];
br0 = reinterpret_cast<const float*> (&vb[0])[0];
bi0 = reinterpret_cast<const float*> (&vb[1])[0];
abr0 = reinterpret_cast<const float*> (&vab[0])[0];
abi0 = reinterpret_cast<const float*> (&vab[1])[0];

for (i = 0; i < Ncvec; i += 2)
{
__m256 ar, ai, br, bi;
ar = va[2 * i + 0];
ai = va[2 * i + 1];
br = vb[2 * i + 0];
bi = vb[2 * i + 1];
cplx_mul_v (ar, ai, br, bi);
vab[2 * i + 0] = _mm256_fmadd_ps (ar, vscal, vab[2 * i + 0]);
vab[2 * i + 1] = _mm256_fmadd_ps (ai, vscal, vab[2 * i + 1]);
ar = va[2 * i + 2];
ai = va[2 * i + 3];
br = vb[2 * i + 2];
bi = vb[2 * i + 3];
cplx_mul_v (ar, ai, br, bi);
vab[2 * i + 2] = _mm256_fmadd_ps (ar, vscal, vab[2 * i + 2]);
vab[2 * i + 3] = _mm256_fmadd_ps (ai, vscal, vab[2 * i + 3]);
}

if (setup->transform == FFT_REAL)
{
reinterpret_cast<float*> (&vab[0])[0] = abr0 + ar0 * br0 * scaling;
reinterpret_cast<float*> (&vab[1])[0] = abi0 + ai0 * bi0 * scaling;
}
}
} // namespace chowdsp::fft::avx
#endif
47 changes: 44 additions & 3 deletions simd/chowdsp_fft_impl_neon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1558,9 +1558,6 @@ void pffft_transform_internal (FFT_Setup* setup, const float* finput, float* fou
float32x4_t* buff[2] = { voutput, scratch ? scratch : scratch_on_stack };
int ib = (nf_odd ^ ordered ? 1 : 0);

// assert (VALIGNED (finput) && VALIGNED (foutput));

//assert(finput != foutput);
if (direction == FFT_FORWARD)
{
ib = ! ib;
Expand Down Expand Up @@ -1628,4 +1625,48 @@ void pffft_transform_internal (FFT_Setup* setup, const float* finput, float* fou
}
assert (buff[ib] == voutput);
}

void pffft_convolve_internal (FFT_Setup* setup, const float* a, const float* b, float* ab, float scaling)
{
int Ncvec = setup->Ncvec;
auto* va = (const float32x4_t*) a;
auto* vb = (const float32x4_t*) b;
auto* vab = (float32x4_t*) ab;

float ar0, ai0, br0, bi0, abr0, abi0;
const auto vscal = vld1q_dup_f32 (&scaling);
int i;

ar0 = reinterpret_cast<const float*> (&va[0])[0];
ai0 = reinterpret_cast<const float*> (&va[1])[0];
br0 = reinterpret_cast<const float*> (&vb[0])[0];
bi0 = reinterpret_cast<const float*> (&vb[1])[0];
abr0 = reinterpret_cast<const float*> (&vab[0])[0];
abi0 = reinterpret_cast<const float*> (&vab[1])[0];

for (i = 0; i < Ncvec; i += 2)
{
float32x4_t ar, ai, br, bi;
ar = va[2 * i + 0];
ai = va[2 * i + 1];
br = vb[2 * i + 0];
bi = vb[2 * i + 1];
std::tie (ar, ai) = cplx_mul_v (ar, ai, br, bi);
vab[2 * i + 0] = vmlaq_f32 (vab[2 * i + 0], ar, vscal);
vab[2 * i + 1] = vmlaq_f32 (vab[2 * i + 1], ai, vscal);
ar = va[2 * i + 2];
ai = va[2 * i + 3];
br = vb[2 * i + 2];
bi = vb[2 * i + 3];
std::tie (ar, ai) = cplx_mul_v (ar, ai, br, bi);
vab[2 * i + 2] = vmlaq_f32 (vab[2 * i + 2], ar, vscal);
vab[2 * i + 3] = vmlaq_f32 (vab[2 * i + 3], ai, vscal);
}

if (setup->transform == FFT_REAL)
{
reinterpret_cast<float*> (&vab[0])[0] = abr0 + ar0 * br0 * scaling;
reinterpret_cast<float*> (&vab[1])[0] = abi0 + ai0 * bi0 * scaling;
}
}
} // namespace chowdsp::fft::neon
44 changes: 44 additions & 0 deletions simd/chowdsp_fft_impl_sse.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1645,4 +1645,48 @@ void pffft_transform_internal (FFT_Setup* setup, const float* finput, float* fou
}
assert (buff[ib] == voutput);
}

void pffft_convolve_internal (FFT_Setup* setup, const float* a, const float* b, float* ab, float scaling)
{
int Ncvec = setup->Ncvec;
auto* va = reinterpret_cast<const __m128*> (a);
auto* vb = reinterpret_cast<const __m128*> (b);
auto* vab = reinterpret_cast<__m128*> (ab);

float ar0, ai0, br0, bi0, abr0, abi0;
const auto vscal = _mm_set1_ps (scaling);
int i;

ar0 = reinterpret_cast<const float*> (&va[0])[0];
ai0 = reinterpret_cast<const float*> (&va[1])[0];
br0 = reinterpret_cast<const float*> (&vb[0])[0];
bi0 = reinterpret_cast<const float*> (&vb[1])[0];
abr0 = reinterpret_cast<const float*> (&vab[0])[0];
abi0 = reinterpret_cast<const float*> (&vab[1])[0];

for (i = 0; i < Ncvec; i += 2)
{
__m128 ar, ai, br, bi;
ar = va[2 * i + 0];
ai = va[2 * i + 1];
br = vb[2 * i + 0];
bi = vb[2 * i + 1];
std::tie (ar, ai) = cplx_mul_v (ar, ai, br, bi);
vab[2 * i + 0] = _mm_add_ps (vab[2 * i + 0], _mm_mul_ps (ar, vscal));
vab[2 * i + 1] = _mm_add_ps (vab[2 * i + 1], _mm_mul_ps (ai, vscal));
ar = va[2 * i + 2];
ai = va[2 * i + 3];
br = vb[2 * i + 2];
bi = vb[2 * i + 3];
std::tie (ar, ai) = cplx_mul_v (ar, ai, br, bi);
vab[2 * i + 2] = _mm_add_ps (vab[2 * i + 2], _mm_mul_ps (ar, vscal));
vab[2 * i + 3] = _mm_add_ps (vab[2 * i + 3], _mm_mul_ps (ai, vscal));
}

if (setup->transform == FFT_REAL)
{
reinterpret_cast<float*> (&vab[0])[0] = abr0 + ar0 * br0 * scaling;
reinterpret_cast<float*> (&vab[1])[0] = abi0 + ai0 * bi0 * scaling;
}
}
} // namespace chowdsp::fft::sse
Loading

0 comments on commit e6b65ea

Please sign in to comment.