diff --git a/chowdsp_fft.cpp b/chowdsp_fft.cpp index c62e368..0fac38d 100644 --- a/chowdsp_fft.cpp +++ b/chowdsp_fft.cpp @@ -50,8 +50,8 @@ SOFTWARE. #include #include -#include #include +#include #if defined(_MSC_VER) // Contains the definition of __cpuidex @@ -89,7 +89,8 @@ struct FFT_Setup; FFT_Setup* fft_new_setup (int N, fft_transform_t transform); void fft_destroy_setup (FFT_Setup* s); void pffft_transform_internal (FFT_Setup* setup, const float* finput, float* foutput, void* scratch, fft_direction_t direction, int ordered); -} +void pffft_convolve_internal (FFT_Setup* setup, const float* a, const float* b, float* ab, float scaling); +} // namespace chowdsp::fft::avx static constexpr uintptr_t address_mask = ~static_cast (3); static constexpr uintptr_t typeid_mask = static_cast (3); #endif @@ -187,7 +188,7 @@ static bool cpu_supports_avx() [[maybe_unused]] const auto fma3_avx = avx && fma3_sse42; int regs8[4]; - get_cpuid(regs8, 0x80000001); + get_cpuid (regs8, 0x80000001); [[maybe_unused]] const auto fma4 = regs8[2] >> 16 & avx_state_os_enabled; // sse4a = regs[2] >> 6 & 1; @@ -195,11 +196,11 @@ static bool cpu_supports_avx() // xop = regs[2] >> 11 & 1; int regs7[4]; - get_cpuid(regs7, 0x7); + get_cpuid (regs7, 0x7); const auto avx2 = regs7[1] >> 5 & avx_state_os_enabled; int regs7a[4]; - get_cpuid(regs7a, 0x7, 0x1); + get_cpuid (regs7a, 0x7, 0x1); [[maybe_unused]] const auto avxvnni = regs7a[0] >> 4 & avx_state_os_enabled; const auto fma3_avx2 = avx2 && fma3_sse42; @@ -305,4 +306,80 @@ void fft_transform (void* setup, const float* input, float* output, float* work, 1); #endif } + +void fft_transform_unordered (void* setup, const float* input, float* output, float* work, fft_direction_t direction) +{ +#if defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) +#if CHOWDSP_FFT_COMPILER_SUPPORTS_AVX + if (check_is_pointer_sse_setup (setup)) + { + sse::pffft_transform_internal (reinterpret_cast (get_setup_pointer (setup)), + input, + output, + (__m128*) work, + direction, + 0); + } + else + { + avx::pffft_transform_internal (reinterpret_cast (get_setup_pointer (setup)), + input, + output, + work, + direction, + 0); + } +#else + sse::pffft_transform_internal (reinterpret_cast (setup), + input, + output, + (__m128*) work, + direction, + 0); +#endif +#elif defined(__ARM_NEON__) + neon::pffft_transform_internal (reinterpret_cast (setup), + input, + output, + (float32x4_t*) work, + direction, + 0); +#endif +} + +void fft_convolve_unordered (void* setup, const float* a, const float* b, float* ab, float scaling) +{ +#if defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) +#if CHOWDSP_FFT_COMPILER_SUPPORTS_AVX + if (check_is_pointer_sse_setup (setup)) + { + sse::pffft_convolve_internal (reinterpret_cast (get_setup_pointer (setup)), + a, + b, + ab, + scaling); + } + else + { + avx::pffft_convolve_internal (reinterpret_cast (get_setup_pointer (setup)), + a, + b, + ab, + scaling); + } +#else + sse::pffft_convolve_internal (reinterpret_cast (setup), + a, + b, + ab, + scaling); +#endif +#elif defined(__ARM_NEON__) + neon::pffft_convolve_internal (reinterpret_cast (setup), + a, + b, + ab, + scaling); +#endif +} } // namespace chowdsp::fft diff --git a/chowdsp_fft.h b/chowdsp_fft.h index 92597db..f9ce206 100644 --- a/chowdsp_fft.h +++ b/chowdsp_fft.h @@ -102,6 +102,18 @@ void fft_destroy_setup (void*); */ void fft_transform (void* setup, const float* input, float* output, float* work, fft_direction_t direction); +/** + * Same as the above method, but without necessarily maintaining the correct data order + * in the frequency domain. This may be useful if your frequency-domain operations + * are order-independent, for example, convolution. + */ +void fft_transform_unordered (void* setup, const float* input, float* output, float* work, fft_direction_t direction); + +/** + * Convolve two (unordered) frequency-domain signals, with some scale factor. + */ +void fft_convolve_unordered (void* setup, const float* a, const float* b, float* ab, float scaling); + void* aligned_malloc (size_t nb_bytes); void aligned_free (void*); diff --git a/simd/chowdsp_fft_impl_avx.cpp b/simd/chowdsp_fft_impl_avx.cpp index 4b8b7c3..4cd2d6b 100644 --- a/simd/chowdsp_fft_impl_avx.cpp +++ b/simd/chowdsp_fft_impl_avx.cpp @@ -1984,5 +1984,49 @@ void pffft_transform_internal (FFT_Setup* setup, const float* finput, float* fou } assert (buff[ib] == voutput); } + +void pffft_convolve_internal (FFT_Setup* setup, const float* a, const float* b, float* ab, float scaling) +{ + int Ncvec = setup->Ncvec; + auto* va = reinterpret_cast (a); + auto* vb = reinterpret_cast (b); + auto* vab = reinterpret_cast<__m256*> (ab); + + float ar0, ai0, br0, bi0, abr0, abi0; + const auto vscal = _mm256_set1_ps (scaling); + int i; + + ar0 = reinterpret_cast (&va[0])[0]; + ai0 = reinterpret_cast (&va[1])[0]; + br0 = reinterpret_cast (&vb[0])[0]; + bi0 = reinterpret_cast (&vb[1])[0]; + abr0 = reinterpret_cast (&vab[0])[0]; + abi0 = reinterpret_cast (&vab[1])[0]; + + for (i = 0; i < Ncvec; i += 2) + { + __m256 ar, ai, br, bi; + ar = va[2 * i + 0]; + ai = va[2 * i + 1]; + br = vb[2 * i + 0]; + bi = vb[2 * i + 1]; + cplx_mul_v (ar, ai, br, bi); + vab[2 * i + 0] = _mm256_fmadd_ps (ar, vscal, vab[2 * i + 0]); + vab[2 * i + 1] = _mm256_fmadd_ps (ai, vscal, vab[2 * i + 1]); + ar = va[2 * i + 2]; + ai = va[2 * i + 3]; + br = vb[2 * i + 2]; + bi = vb[2 * i + 3]; + cplx_mul_v (ar, ai, br, bi); + vab[2 * i + 2] = _mm256_fmadd_ps (ar, vscal, vab[2 * i + 2]); + vab[2 * i + 3] = _mm256_fmadd_ps (ai, vscal, vab[2 * i + 3]); + } + + if (setup->transform == FFT_REAL) + { + reinterpret_cast (&vab[0])[0] = abr0 + ar0 * br0 * scaling; + reinterpret_cast (&vab[1])[0] = abi0 + ai0 * bi0 * scaling; + } +} } // namespace chowdsp::fft::avx #endif diff --git a/simd/chowdsp_fft_impl_neon.cpp b/simd/chowdsp_fft_impl_neon.cpp index 296d864..a12c73f 100644 --- a/simd/chowdsp_fft_impl_neon.cpp +++ b/simd/chowdsp_fft_impl_neon.cpp @@ -1558,9 +1558,6 @@ void pffft_transform_internal (FFT_Setup* setup, const float* finput, float* fou float32x4_t* buff[2] = { voutput, scratch ? scratch : scratch_on_stack }; int ib = (nf_odd ^ ordered ? 1 : 0); - // assert (VALIGNED (finput) && VALIGNED (foutput)); - - //assert(finput != foutput); if (direction == FFT_FORWARD) { ib = ! ib; @@ -1628,4 +1625,48 @@ void pffft_transform_internal (FFT_Setup* setup, const float* finput, float* fou } assert (buff[ib] == voutput); } + +void pffft_convolve_internal (FFT_Setup* setup, const float* a, const float* b, float* ab, float scaling) +{ + int Ncvec = setup->Ncvec; + auto* va = (const float32x4_t*) a; + auto* vb = (const float32x4_t*) b; + auto* vab = (float32x4_t*) ab; + + float ar0, ai0, br0, bi0, abr0, abi0; + const auto vscal = vld1q_dup_f32 (&scaling); + int i; + + ar0 = reinterpret_cast (&va[0])[0]; + ai0 = reinterpret_cast (&va[1])[0]; + br0 = reinterpret_cast (&vb[0])[0]; + bi0 = reinterpret_cast (&vb[1])[0]; + abr0 = reinterpret_cast (&vab[0])[0]; + abi0 = reinterpret_cast (&vab[1])[0]; + + for (i = 0; i < Ncvec; i += 2) + { + float32x4_t ar, ai, br, bi; + ar = va[2 * i + 0]; + ai = va[2 * i + 1]; + br = vb[2 * i + 0]; + bi = vb[2 * i + 1]; + std::tie (ar, ai) = cplx_mul_v (ar, ai, br, bi); + vab[2 * i + 0] = vmlaq_f32 (vab[2 * i + 0], ar, vscal); + vab[2 * i + 1] = vmlaq_f32 (vab[2 * i + 1], ai, vscal); + ar = va[2 * i + 2]; + ai = va[2 * i + 3]; + br = vb[2 * i + 2]; + bi = vb[2 * i + 3]; + std::tie (ar, ai) = cplx_mul_v (ar, ai, br, bi); + vab[2 * i + 2] = vmlaq_f32 (vab[2 * i + 2], ar, vscal); + vab[2 * i + 3] = vmlaq_f32 (vab[2 * i + 3], ai, vscal); + } + + if (setup->transform == FFT_REAL) + { + reinterpret_cast (&vab[0])[0] = abr0 + ar0 * br0 * scaling; + reinterpret_cast (&vab[1])[0] = abi0 + ai0 * bi0 * scaling; + } +} } // namespace chowdsp::fft::neon diff --git a/simd/chowdsp_fft_impl_sse.cpp b/simd/chowdsp_fft_impl_sse.cpp index d6b1493..4691594 100644 --- a/simd/chowdsp_fft_impl_sse.cpp +++ b/simd/chowdsp_fft_impl_sse.cpp @@ -1645,4 +1645,48 @@ void pffft_transform_internal (FFT_Setup* setup, const float* finput, float* fou } assert (buff[ib] == voutput); } + +void pffft_convolve_internal (FFT_Setup* setup, const float* a, const float* b, float* ab, float scaling) +{ + int Ncvec = setup->Ncvec; + auto* va = reinterpret_cast (a); + auto* vb = reinterpret_cast (b); + auto* vab = reinterpret_cast<__m128*> (ab); + + float ar0, ai0, br0, bi0, abr0, abi0; + const auto vscal = _mm_set1_ps (scaling); + int i; + + ar0 = reinterpret_cast (&va[0])[0]; + ai0 = reinterpret_cast (&va[1])[0]; + br0 = reinterpret_cast (&vb[0])[0]; + bi0 = reinterpret_cast (&vb[1])[0]; + abr0 = reinterpret_cast (&vab[0])[0]; + abi0 = reinterpret_cast (&vab[1])[0]; + + for (i = 0; i < Ncvec; i += 2) + { + __m128 ar, ai, br, bi; + ar = va[2 * i + 0]; + ai = va[2 * i + 1]; + br = vb[2 * i + 0]; + bi = vb[2 * i + 1]; + std::tie (ar, ai) = cplx_mul_v (ar, ai, br, bi); + vab[2 * i + 0] = _mm_add_ps (vab[2 * i + 0], _mm_mul_ps (ar, vscal)); + vab[2 * i + 1] = _mm_add_ps (vab[2 * i + 1], _mm_mul_ps (ai, vscal)); + ar = va[2 * i + 2]; + ai = va[2 * i + 3]; + br = vb[2 * i + 2]; + bi = vb[2 * i + 3]; + std::tie (ar, ai) = cplx_mul_v (ar, ai, br, bi); + vab[2 * i + 2] = _mm_add_ps (vab[2 * i + 2], _mm_mul_ps (ar, vscal)); + vab[2 * i + 3] = _mm_add_ps (vab[2 * i + 3], _mm_mul_ps (ai, vscal)); + } + + if (setup->transform == FFT_REAL) + { + reinterpret_cast (&vab[0])[0] = abr0 + ar0 * br0 * scaling; + reinterpret_cast (&vab[1])[0] = abi0 + ai0 * bi0 * scaling; + } +} } // namespace chowdsp::fft::sse diff --git a/test/test.cpp b/test/test.cpp index 3c963e9..8d171c1 100644 --- a/test/test.cpp +++ b/test/test.cpp @@ -8,12 +8,12 @@ void compare (const float* ref, const float* test, int N) { - const auto tol = 1.0e-6f * (float) N / 8.0f; + const auto tol = 2.0e-7f * (float) N; for (int n = 0; n < N; ++n) REQUIRE (test[n] == Catch::Approx { ref[n] }.margin(tol)); } -void test_complex (int N, bool use_avx = false) +void test_fft_complex (int N, bool use_avx = false) { auto* data = (float*) chowdsp::fft::aligned_malloc (sizeof (float) * N * 2); auto* data_ref = (float*) pffft_aligned_malloc (sizeof (float) * N * 2); @@ -28,7 +28,7 @@ void test_complex (int N, bool use_avx = false) std::copy (data, data + N * 2, data_ref); auto* fft_setup = chowdsp::fft::fft_new_setup (N, chowdsp::fft::FFT_COMPLEX, use_avx); - assert (fft_setup != nullptr); + REQUIRE (fft_setup != nullptr); auto* pffft_setup = pffft_new_setup (N, PFFFT_COMPLEX); chowdsp::fft::fft_transform (fft_setup, data, data, work_data, chowdsp::fft::FFT_FORWARD); @@ -56,7 +56,7 @@ void test_complex (int N, bool use_avx = false) pffft_aligned_free (work_data_ref); } -void test_real (int N, bool use_avx = false) +void test_fft_real (int N, bool use_avx = false) { auto* data = (float*) chowdsp::fft::aligned_malloc (sizeof (float) * N); auto* data_ref = (float*) pffft_aligned_malloc (sizeof (float) * N); @@ -70,7 +70,7 @@ void test_real (int N, bool use_avx = false) std::copy (data, data + N, data_ref); auto* fft_setup = chowdsp::fft::fft_new_setup (N, chowdsp::fft::FFT_REAL, use_avx); - assert (fft_setup != nullptr); + REQUIRE (fft_setup != nullptr); auto* pffft_setup = pffft_new_setup (N, PFFFT_REAL); chowdsp::fft::fft_transform (fft_setup, data, data, work_data, chowdsp::fft::FFT_FORWARD); @@ -98,6 +98,106 @@ void test_real (int N, bool use_avx = false) pffft_aligned_free (work_data_ref); } +void test_convolution_complex (int N, bool use_avx = false) +{ + auto* sine1 = (float*) chowdsp::fft::aligned_malloc (sizeof (float) * N * 2); + auto* sine1_ref = (float*) pffft_aligned_malloc (sizeof (float) * N * 2); + auto* sine2 = (float*) chowdsp::fft::aligned_malloc (sizeof (float) * N * 2); + auto* sine2_ref = (float*) pffft_aligned_malloc (sizeof (float) * N * 2); + auto* out = (float*) chowdsp::fft::aligned_malloc (sizeof (float) * N * 2); + auto* out_ref = (float*) pffft_aligned_malloc (sizeof (float) * N * 2); + auto* work_data = (float*) chowdsp::fft::aligned_malloc (sizeof (float) * N * 2); + auto* work_data_ref = (float*) pffft_aligned_malloc (sizeof (float) * N * 2); + + for (int i = 0; i < N; ++i) + { + sine1[i * 2] = std::sin (3.14f * (100.0f / 48000.0f) * (float) i); + sine1[i * 2 + 1] = std::cos (3.14f * (100.0f / 48000.0f) * (float) i); + sine2[i * 2] = std::sin (3.14f * (200.0f / 48000.0f) * (float) i); + sine2[i * 2 + 1] = std::cos (3.14f * (200.0f / 48000.0f) * (float) i); + } + std::copy (sine1, sine1 + N * 2, sine1_ref); + std::copy (sine2, sine2 + N * 2, sine2_ref); + std::fill_n (out, N * 2, 0.0f); + std::fill_n (out_ref, N * 2, 0.0f); + const auto norm_gain = 1.0f / static_cast (N); + + auto* pffft_setup = pffft_new_setup (N, PFFFT_COMPLEX); + pffft_transform (pffft_setup, sine1_ref, sine1_ref, work_data_ref, PFFFT_FORWARD); + pffft_transform (pffft_setup, sine2_ref, sine2_ref, work_data_ref, PFFFT_FORWARD); + pffft_zconvolve_accumulate (pffft_setup, sine1_ref, sine2_ref, out_ref, norm_gain); + pffft_transform (pffft_setup, out_ref, out_ref, work_data_ref, PFFFT_BACKWARD); + + auto* fft_setup = chowdsp::fft::fft_new_setup (N, chowdsp::fft::FFT_COMPLEX, use_avx); + REQUIRE (fft_setup != nullptr); + chowdsp::fft::fft_transform_unordered (fft_setup, sine1, sine1, work_data, chowdsp::fft::FFT_FORWARD); + chowdsp::fft::fft_transform_unordered (fft_setup, sine2, sine2, work_data, chowdsp::fft::FFT_FORWARD); + chowdsp::fft::fft_convolve_unordered (fft_setup, sine1, sine2, out, norm_gain); + chowdsp::fft::fft_transform_unordered (fft_setup, out, out, work_data, chowdsp::fft::FFT_BACKWARD); + + compare (out_ref, out, N); + + chowdsp::fft::fft_destroy_setup (fft_setup); + pffft_destroy_setup (pffft_setup); + chowdsp::fft::aligned_free (sine1); + pffft_aligned_free (sine1_ref); + chowdsp::fft::aligned_free (sine2); + pffft_aligned_free (sine2_ref); + chowdsp::fft::aligned_free (out); + pffft_aligned_free (out_ref); + chowdsp::fft::aligned_free (work_data); + pffft_aligned_free (work_data_ref); +} + +void test_convolution_real (int N, bool use_avx = false) +{ + auto* sine1 = (float*) chowdsp::fft::aligned_malloc (sizeof (float) * N); + auto* sine1_ref = (float*) pffft_aligned_malloc (sizeof (float) * N); + auto* sine2 = (float*) chowdsp::fft::aligned_malloc (sizeof (float) * N); + auto* sine2_ref = (float*) pffft_aligned_malloc (sizeof (float) * N); + auto* out = (float*) chowdsp::fft::aligned_malloc (sizeof (float) * N); + auto* out_ref = (float*) pffft_aligned_malloc (sizeof (float) * N); + auto* work_data = (float*) chowdsp::fft::aligned_malloc (sizeof (float) * N); + auto* work_data_ref = (float*) pffft_aligned_malloc (sizeof (float) * N); + + for (int i = 0; i < N; ++i) + { + sine1[i] = std::sin (3.14f * (100.0f / 48000.0f) * (float) i); + sine2[i] = std::sin (3.14f * (200.0f / 48000.0f) * (float) i); + } + std::copy (sine1, sine1 + N, sine1_ref); + std::copy (sine2, sine2 + N, sine2_ref); + std::fill_n (out, N, 0.0f); + std::fill_n (out_ref, N, 0.0f); + const auto norm_gain = 1.0f / static_cast (N); + + auto* pffft_setup = pffft_new_setup (N, PFFFT_REAL); + pffft_transform (pffft_setup, sine1_ref, sine1_ref, work_data_ref, PFFFT_FORWARD); + pffft_transform (pffft_setup, sine2_ref, sine2_ref, work_data_ref, PFFFT_FORWARD); + pffft_zconvolve_accumulate (pffft_setup, sine1_ref, sine2_ref, out_ref, norm_gain); + pffft_transform (pffft_setup, out_ref, out_ref, work_data_ref, PFFFT_BACKWARD); + + auto* fft_setup = chowdsp::fft::fft_new_setup (N, chowdsp::fft::FFT_REAL, use_avx); + REQUIRE (fft_setup != nullptr); + chowdsp::fft::fft_transform_unordered (fft_setup, sine1, sine1, work_data, chowdsp::fft::FFT_FORWARD); + chowdsp::fft::fft_transform_unordered (fft_setup, sine2, sine2, work_data, chowdsp::fft::FFT_FORWARD); + chowdsp::fft::fft_convolve_unordered (fft_setup, sine1, sine2, out, norm_gain); + chowdsp::fft::fft_transform_unordered (fft_setup, out, out, work_data, chowdsp::fft::FFT_BACKWARD); + + compare (out_ref, out, N); + + chowdsp::fft::fft_destroy_setup (fft_setup); + pffft_destroy_setup (pffft_setup); + chowdsp::fft::aligned_free (sine1); + pffft_aligned_free (sine1_ref); + chowdsp::fft::aligned_free (sine2); + pffft_aligned_free (sine2_ref); + chowdsp::fft::aligned_free (out); + pffft_aligned_free (out_ref); + chowdsp::fft::aligned_free (work_data); + pffft_aligned_free (work_data_ref); +} + TEST_CASE("FFT SSE/NEON") { for (int i = 5; i < 20; ++i) @@ -105,12 +205,22 @@ TEST_CASE("FFT SSE/NEON") const auto fft_size = 1 << i; SECTION ("Testing Complex FFT with size: " + std::to_string (fft_size)) { - test_complex (fft_size); + test_fft_complex (fft_size); } SECTION ("Testing Real FFT with size: " + std::to_string (fft_size)) { - test_real (fft_size); + test_fft_real (fft_size); + } + + SECTION ("Testing Complex Convolution with size: " + std::to_string (fft_size)) + { + test_convolution_complex (fft_size); + } + + SECTION ("Testing Real Convolution with size: " + std::to_string (fft_size)) + { + test_convolution_real (fft_size); } } } @@ -123,12 +233,22 @@ TEST_CASE("FFT AVX") const auto fft_size = 1 << i; SECTION ("Testing Complex FFT with size: " + std::to_string (fft_size)) { - test_complex (fft_size, true); + test_fft_complex (fft_size, true); } SECTION ("Testing Real FFT with size: " + std::to_string (fft_size)) { - test_real (fft_size, true); + test_fft_real (fft_size, true); + } + + SECTION ("Testing Complex Convolution with size: " + std::to_string (fft_size)) + { + test_convolution_complex (fft_size, true); + } + + SECTION ("Testing Real Convolution with size: " + std::to_string (fft_size)) + { + test_convolution_real (fft_size, true); } } }