Add unordered transforms and convolution (#3)

* Implementing frequency-domain convolution * Test convolution with complex I/O * Convolution for SSE and AVX
Chowdhury-DSP · Oct 25, 2024 · e6b65ea · e6b65ea
1 parent 0e96713
commit e6b65ea
Show file tree

Hide file tree

Showing 6 changed files with 355 additions and 17 deletions.
diff --git a/chowdsp_fft.cpp b/chowdsp_fft.cpp
@@ -50,8 +50,8 @@ SOFTWARE.
 
 #include <cassert>
 #include <cmath>
-#include <cstdlib>
 #include <cstdint>
+#include <cstdlib>
 
 #if defined(_MSC_VER)
 // Contains the definition of __cpuidex
@@ -89,7 +89,8 @@ struct FFT_Setup;
 FFT_Setup* fft_new_setup (int N, fft_transform_t transform);
 void fft_destroy_setup (FFT_Setup* s);
 void pffft_transform_internal (FFT_Setup* setup, const float* finput, float* foutput, void* scratch, fft_direction_t direction, int ordered);
-}
+void pffft_convolve_internal (FFT_Setup* setup, const float* a, const float* b, float* ab, float scaling);
+} // namespace chowdsp::fft::avx
 static constexpr uintptr_t address_mask = ~static_cast<uintptr_t> (3);
 static constexpr uintptr_t typeid_mask = static_cast<uintptr_t> (3);
 #endif
@@ -187,19 +188,19 @@ static bool cpu_supports_avx()
     [[maybe_unused]] const auto fma3_avx = avx && fma3_sse42;
 
     int regs8[4];
-    get_cpuid(regs8, 0x80000001);
+    get_cpuid (regs8, 0x80000001);
     [[maybe_unused]] const auto fma4 = regs8[2] >> 16 & avx_state_os_enabled;
 
     // sse4a = regs[2] >> 6 & 1;
 
     // xop = regs[2] >> 11 & 1;
 
     int regs7[4];
-    get_cpuid(regs7, 0x7);
+    get_cpuid (regs7, 0x7);
     const auto avx2 = regs7[1] >> 5 & avx_state_os_enabled;
 
     int regs7a[4];
-    get_cpuid(regs7a, 0x7, 0x1);
+    get_cpuid (regs7a, 0x7, 0x1);
     [[maybe_unused]] const auto avxvnni = regs7a[0] >> 4 & avx_state_os_enabled;
 
     const auto fma3_avx2 = avx2 && fma3_sse42;
@@ -305,4 +306,80 @@ void fft_transform (void* setup, const float* input, float* output, float* work,
                                     1);
 #endif
 }
+
+void fft_transform_unordered (void* setup, const float* input, float* output, float* work, fft_direction_t direction)
+{
+#if defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64)
+#if CHOWDSP_FFT_COMPILER_SUPPORTS_AVX
+    if (check_is_pointer_sse_setup (setup))
+    {
+        sse::pffft_transform_internal (reinterpret_cast<sse::FFT_Setup*> (get_setup_pointer (setup)),
+                                       input,
+                                       output,
+                                       (__m128*) work,
+                                       direction,
+                                       0);
+    }
+    else
+    {
+        avx::pffft_transform_internal (reinterpret_cast<avx::FFT_Setup*> (get_setup_pointer (setup)),
+                                       input,
+                                       output,
+                                       work,
+                                       direction,
+                                       0);
+    }
+#else
+    sse::pffft_transform_internal (reinterpret_cast<sse::FFT_Setup*> (setup),
+                                   input,
+                                   output,
+                                   (__m128*) work,
+                                   direction,
+                                   0);
+#endif
+#elif defined(__ARM_NEON__)
+    neon::pffft_transform_internal (reinterpret_cast<neon::FFT_Setup*> (setup),
+                                    input,
+                                    output,
+                                    (float32x4_t*) work,
+                                    direction,
+                                    0);
+#endif
+}
+
+void fft_convolve_unordered (void* setup, const float* a, const float* b, float* ab, float scaling)
+{
+#if defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64)
+#if CHOWDSP_FFT_COMPILER_SUPPORTS_AVX
+    if (check_is_pointer_sse_setup (setup))
+    {
+        sse::pffft_convolve_internal (reinterpret_cast<sse::FFT_Setup*> (get_setup_pointer (setup)),
+                                      a,
+                                      b,
+                                      ab,
+                                      scaling);
+    }
+    else
+    {
+        avx::pffft_convolve_internal (reinterpret_cast<avx::FFT_Setup*> (get_setup_pointer (setup)),
+                                      a,
+                                      b,
+                                      ab,
+                                      scaling);
+    }
+#else
+    sse::pffft_convolve_internal (reinterpret_cast<sse::FFT_Setup*> (setup),
+                                  a,
+                                  b,
+                                  ab,
+                                  scaling);
+#endif
+#elif defined(__ARM_NEON__)
+    neon::pffft_convolve_internal (reinterpret_cast<neon::FFT_Setup*> (setup),
+                                   a,
+                                   b,
+                                   ab,
+                                   scaling);
+#endif
+}
 } // namespace chowdsp::fft
diff --git a/chowdsp_fft.h b/chowdsp_fft.h
@@ -102,6 +102,18 @@ void fft_destroy_setup (void*);
 */
 void fft_transform (void* setup, const float* input, float* output, float* work, fft_direction_t direction);
 
+/**
+ * Same as the above method, but without necessarily maintaining the correct data order
+ * in the frequency domain. This may be useful if your frequency-domain operations
+ * are order-independent, for example, convolution.
+ */
+void fft_transform_unordered (void* setup, const float* input, float* output, float* work, fft_direction_t direction);
+
+/**
+ * Convolve two (unordered) frequency-domain signals, with some scale factor.
+ */
+void fft_convolve_unordered (void* setup, const float* a, const float* b, float* ab, float scaling);
+
 void* aligned_malloc (size_t nb_bytes);
 void aligned_free (void*);
 

diff --git a/simd/chowdsp_fft_impl_avx.cpp b/simd/chowdsp_fft_impl_avx.cpp
@@ -1984,5 +1984,49 @@ void pffft_transform_internal (FFT_Setup* setup, const float* finput, float* fou
     }
     assert (buff[ib] == voutput);
 }
+
+void pffft_convolve_internal (FFT_Setup* setup, const float* a, const float* b, float* ab, float scaling)
+{
+    int Ncvec = setup->Ncvec;
+    auto* va = reinterpret_cast<const __m256*> (a);
+    auto* vb = reinterpret_cast<const __m256*> (b);
+    auto* vab = reinterpret_cast<__m256*> (ab);
+
+    float ar0, ai0, br0, bi0, abr0, abi0;
+    const auto vscal = _mm256_set1_ps (scaling);
+    int i;
+
+    ar0 = reinterpret_cast<const float*> (&va[0])[0];
+    ai0 = reinterpret_cast<const float*> (&va[1])[0];
+    br0 = reinterpret_cast<const float*> (&vb[0])[0];
+    bi0 = reinterpret_cast<const float*> (&vb[1])[0];
+    abr0 = reinterpret_cast<const float*> (&vab[0])[0];
+    abi0 = reinterpret_cast<const float*> (&vab[1])[0];
+
+    for (i = 0; i < Ncvec; i += 2)
+    {
+        __m256 ar, ai, br, bi;
+        ar = va[2 * i + 0];
+        ai = va[2 * i + 1];
+        br = vb[2 * i + 0];
+        bi = vb[2 * i + 1];
+        cplx_mul_v (ar, ai, br, bi);
+        vab[2 * i + 0] = _mm256_fmadd_ps (ar, vscal, vab[2 * i + 0]);
+        vab[2 * i + 1] = _mm256_fmadd_ps (ai, vscal, vab[2 * i + 1]);
+        ar = va[2 * i + 2];
+        ai = va[2 * i + 3];
+        br = vb[2 * i + 2];
+        bi = vb[2 * i + 3];
+        cplx_mul_v (ar, ai, br, bi);
+        vab[2 * i + 2] = _mm256_fmadd_ps (ar, vscal, vab[2 * i + 2]);
+        vab[2 * i + 3] = _mm256_fmadd_ps (ai, vscal, vab[2 * i + 3]);
+    }
+
+    if (setup->transform == FFT_REAL)
+    {
+        reinterpret_cast<float*> (&vab[0])[0] = abr0 + ar0 * br0 * scaling;
+        reinterpret_cast<float*> (&vab[1])[0] = abi0 + ai0 * bi0 * scaling;
+    }
+}
 } // namespace chowdsp::fft::avx
 #endif
diff --git a/simd/chowdsp_fft_impl_neon.cpp b/simd/chowdsp_fft_impl_neon.cpp
@@ -1558,9 +1558,6 @@ void pffft_transform_internal (FFT_Setup* setup, const float* finput, float* fou
     float32x4_t* buff[2] = { voutput, scratch ? scratch : scratch_on_stack };
     int ib = (nf_odd ^ ordered ? 1 : 0);
 
-    // assert (VALIGNED (finput) && VALIGNED (foutput));
-
-    //assert(finput != foutput);
     if (direction == FFT_FORWARD)
     {
         ib = ! ib;
@@ -1628,4 +1625,48 @@ void pffft_transform_internal (FFT_Setup* setup, const float* finput, float* fou
     }
     assert (buff[ib] == voutput);
 }
+
+void pffft_convolve_internal (FFT_Setup* setup, const float* a, const float* b, float* ab, float scaling)
+{
+    int Ncvec = setup->Ncvec;
+    auto* va = (const float32x4_t*) a;
+    auto* vb = (const float32x4_t*) b;
+    auto* vab = (float32x4_t*) ab;
+
+    float ar0, ai0, br0, bi0, abr0, abi0;
+    const auto vscal = vld1q_dup_f32 (&scaling);
+    int i;
+
+    ar0 = reinterpret_cast<const float*> (&va[0])[0];
+    ai0 = reinterpret_cast<const float*> (&va[1])[0];
+    br0 = reinterpret_cast<const float*> (&vb[0])[0];
+    bi0 = reinterpret_cast<const float*> (&vb[1])[0];
+    abr0 = reinterpret_cast<const float*> (&vab[0])[0];
+    abi0 = reinterpret_cast<const float*> (&vab[1])[0];
+
+    for (i = 0; i < Ncvec; i += 2)
+    {
+        float32x4_t ar, ai, br, bi;
+        ar = va[2 * i + 0];
+        ai = va[2 * i + 1];
+        br = vb[2 * i + 0];
+        bi = vb[2 * i + 1];
+        std::tie (ar, ai) = cplx_mul_v (ar, ai, br, bi);
+        vab[2 * i + 0] = vmlaq_f32 (vab[2 * i + 0], ar, vscal);
+        vab[2 * i + 1] = vmlaq_f32 (vab[2 * i + 1], ai, vscal);
+        ar = va[2 * i + 2];
+        ai = va[2 * i + 3];
+        br = vb[2 * i + 2];
+        bi = vb[2 * i + 3];
+        std::tie (ar, ai) = cplx_mul_v (ar, ai, br, bi);
+        vab[2 * i + 2] = vmlaq_f32 (vab[2 * i + 2], ar, vscal);
+        vab[2 * i + 3] = vmlaq_f32 (vab[2 * i + 3], ai, vscal);
+    }
+
+    if (setup->transform == FFT_REAL)
+    {
+        reinterpret_cast<float*> (&vab[0])[0] = abr0 + ar0 * br0 * scaling;
+        reinterpret_cast<float*> (&vab[1])[0] = abi0 + ai0 * bi0 * scaling;
+    }
+}
 } // namespace chowdsp::fft::neon
diff --git a/simd/chowdsp_fft_impl_sse.cpp b/simd/chowdsp_fft_impl_sse.cpp
@@ -1645,4 +1645,48 @@ void pffft_transform_internal (FFT_Setup* setup, const float* finput, float* fou
     }
     assert (buff[ib] == voutput);
 }
+
+void pffft_convolve_internal (FFT_Setup* setup, const float* a, const float* b, float* ab, float scaling)
+{
+    int Ncvec = setup->Ncvec;
+    auto* va = reinterpret_cast<const __m128*> (a);
+    auto* vb = reinterpret_cast<const __m128*> (b);
+    auto* vab = reinterpret_cast<__m128*> (ab);
+
+    float ar0, ai0, br0, bi0, abr0, abi0;
+    const auto vscal = _mm_set1_ps (scaling);
+    int i;
+
+    ar0 = reinterpret_cast<const float*> (&va[0])[0];
+    ai0 = reinterpret_cast<const float*> (&va[1])[0];
+    br0 = reinterpret_cast<const float*> (&vb[0])[0];
+    bi0 = reinterpret_cast<const float*> (&vb[1])[0];
+    abr0 = reinterpret_cast<const float*> (&vab[0])[0];
+    abi0 = reinterpret_cast<const float*> (&vab[1])[0];
+
+    for (i = 0; i < Ncvec; i += 2)
+    {
+        __m128 ar, ai, br, bi;
+        ar = va[2 * i + 0];
+        ai = va[2 * i + 1];
+        br = vb[2 * i + 0];
+        bi = vb[2 * i + 1];
+        std::tie (ar, ai) = cplx_mul_v (ar, ai, br, bi);
+        vab[2 * i + 0] = _mm_add_ps  (vab[2 * i + 0], _mm_mul_ps (ar, vscal));
+        vab[2 * i + 1] = _mm_add_ps  (vab[2 * i + 1], _mm_mul_ps (ai, vscal));
+        ar = va[2 * i + 2];
+        ai = va[2 * i + 3];
+        br = vb[2 * i + 2];
+        bi = vb[2 * i + 3];
+        std::tie (ar, ai) = cplx_mul_v (ar, ai, br, bi);
+        vab[2 * i + 2] = _mm_add_ps (vab[2 * i + 2], _mm_mul_ps (ar, vscal));
+        vab[2 * i + 3] = _mm_add_ps (vab[2 * i + 3], _mm_mul_ps (ai, vscal));
+    }
+
+    if (setup->transform == FFT_REAL)
+    {
+        reinterpret_cast<float*> (&vab[0])[0] = abr0 + ar0 * br0 * scaling;
+        reinterpret_cast<float*> (&vab[1])[0] = abi0 + ai0 * bi0 * scaling;
+    }
+}
 } // namespace chowdsp::fft::sse