From bd71665232b76e4dbca6c98e696083b12fc494ee Mon Sep 17 00:00:00 2001 From: johnfea <11298478+johnfea@users.noreply.github.com> Date: Tue, 3 Sep 2024 20:07:51 -0400 Subject: [PATCH] feat(batch): Add support for b4_SSE2 batched mode. (#1825) Add support for b4_SSE2 batched mode, enabling batched execution for all x86-64 CPUs that don't support AVX. Quick tests were run to see that output with procedural and texture based materials looked ok and proper SSE2 batched code was being generated for wide/ functions. EDIT: Additionally, all tests pass now. --------- Signed-off-by: Tuomas Tonteri --- .github/workflows/ci.yml | 14 +- INSTALL.md | 6 +- src/cmake/compiler.cmake | 2 +- src/include/OSL/batched_texture.h | 9 +- src/include/OSL/llvm_util.h | 3 + src/include/OSL/rendererservices.h | 1 + src/liboslexec/CMakeLists.txt | 4 + src/liboslexec/batched_analysis.cpp | 8 +- src/liboslexec/batched_backendllvm.cpp | 1 + src/liboslexec/batched_llvm_instance.cpp | 44 +++ src/liboslexec/batched_rendservices.cpp | 1 + src/liboslexec/context.cpp | 1 + src/liboslexec/llvm_passes.h | 2 + src/liboslexec/llvm_util.cpp | 337 ++++++++++++++++-- src/liboslexec/rendservices.cpp | 7 + src/liboslexec/shadingsys.cpp | 31 +- src/testshade/batched_simplerend.cpp | 1 + src/testshade/simplerend.cpp | 4 +- src/testshade/simplerend.h | 5 + src/testshade/testshade.cpp | 19 +- .../oslbatcheddeformer.cpp | 16 +- 21 files changed, 463 insertions(+), 53 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c9c94385d..b30a36fed 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -50,8 +50,18 @@ jobs: pybind11_ver: v2.7.0 simd: sse4.2 batched: b8_AVX2_noFMA - setenvs: export ENABLE_OPENVDB=0 - - desc: gcc9/C++17 llvm13 py3.9 oiio-rel avx2 + - desc: gcc9/C++17 llvm11 py3.7 exr2.5 oiio2.3 sse2 batch-b4sse2 + nametag: linux-vfx2021 + runner: ubuntu-latest + container: aswftesting/ci-osl:2021-clang11 + vfxyear: 2021 + cxx_std: 17 + openimageio_ver: v2.4.13.0 + 
python_ver: 3.7 + pybind11_ver: v2.7.0 + simd: sse2 + batched: b4_SSE2 + - desc: gcc9/C++17 llvm13 py3.9 exr3.1 oiio-rel avx2 nametag: linux-vfx2022 runner: ubuntu-latest container: aswftesting/ci-osl:2022-clang13 diff --git a/INSTALL.md b/INSTALL.md index 33fba370c..ef7c6f057 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -9,9 +9,9 @@ and aarch64), and Windows (x86_64). It may build and run on other platforms as well, but we don't officially support or test other than these platforms. Shader execution is supported on the native architectures of those x86_64 and -aarch64 platforms, a special batched 8- or 16-wide SIMD execution mode -requiring x86_64 with AVX2 or AVX-512 instructions, as well as on NVIDIA GPUs -using Cuda+OptiX. +aarch64 platforms, a special batched 4-, 8- or 16-wide SIMD execution mode +requiring x86_64 with SSE2, AVX/AVX2 or AVX-512 instructions, as well as on +NVIDIA GPUs using Cuda+OptiX. Dependencies ------------ diff --git a/src/cmake/compiler.cmake b/src/cmake/compiler.cmake index b70bef2c1..fbd4aa10f 100644 --- a/src/cmake/compiler.cmake +++ b/src/cmake/compiler.cmake @@ -338,7 +338,7 @@ endif () # # The USE_BATCHED option may be set to indicate that support for batched # SIMD shader execution be compiled along with targe specific libraries -set (USE_BATCHED "" CACHE STRING "Build batched SIMD shader execution for (0, b8_AVX, b8_AVX2, b8_AVX2_noFMA, b8_AVX512, b8_AVX512_noFMA, b16_AVX512, b16_AVX512_noFMA)") +set (USE_BATCHED "" CACHE STRING "Build batched SIMD shader execution for (0, b4_SSE2, b8_AVX, b8_AVX2, b8_AVX2_noFMA, b8_AVX512, b8_AVX512_noFMA, b16_AVX512, b16_AVX512_noFMA)") option (VEC_REPORT "Enable compiler's reporting system for vectorization" OFF) set (BATCHED_SUPPORT_DEFINES "") set (BATCHED_TARGET_LIBS "") diff --git a/src/include/OSL/batched_texture.h b/src/include/OSL/batched_texture.h index c720e9bed..787664472 100644 --- a/src/include/OSL/batched_texture.h +++ b/src/include/OSL/batched_texture.h @@ -49,6 +49,9 @@ 
static_assert(std::alignment_of>::value static_assert(std::alignment_of>::value == VecReg<8>::alignment, "Expect alignment of data member to set alignment of struct"); +static_assert(std::alignment_of>::value + == VecReg<4>::alignment, + "Expect alignment of data member to set alignment of struct"); template struct BatchedTextureOptions { VaryingTextureOptions varying; @@ -90,11 +93,15 @@ static_assert(std::alignment_of>::value static_assert(std::alignment_of>::value == VecReg<8>::alignment, "Expect alignment of data member to set alignment of struct"); +static_assert(std::alignment_of>::value + == VecReg<4>::alignment, + "Expect alignment of data member to set alignment of struct"); #ifdef OIIO_TEXTURE_SIMD_BATCH_WIDTH // Code here is to validate our OSL BatchedTextureOptions is binary compatible // and safe to reinterpret_cast -static_assert((OIIO::Tex::BatchWidth == 16) || (OIIO::Tex::BatchWidth == 8), +static_assert((OIIO::Tex::BatchWidth == 16) || (OIIO::Tex::BatchWidth == 8) + || (OIIO::Tex::BatchWidth == 4), "This validation requires OIIO_TEXTURE_SIMD_BATCH_WIDTH=16"); namespace validate_offsets { diff --git a/src/include/OSL/llvm_util.h b/src/include/OSL/llvm_util.h index 7f112ccf5..49df62891 100644 --- a/src/include/OSL/llvm_util.h +++ b/src/include/OSL/llvm_util.h @@ -693,6 +693,8 @@ class OSLEXECPUBLIC LLVM_Util { llvm::Constant* constant(uint32_t i); /// Return an llvm::Constant holding the given integer constant. 
+ llvm::Constant* constant4(int8_t i); + llvm::Constant* constant4(uint8_t i); llvm::Constant* constant8(int8_t i); llvm::Constant* constant8(uint8_t i); llvm::Constant* constant16(int16_t i); @@ -1229,6 +1231,7 @@ class OSLEXECPUBLIC LLVM_Util { llvm::Value* op_linearize_16x_indices(llvm::Value* wide_index); llvm::Value* op_linearize_8x_indices(llvm::Value* wide_index); + llvm::Value* op_linearize_4x_indices(llvm::Value* wide_index); std::array op_split_16x(llvm::Value* vector_val); std::array op_split_8x(llvm::Value* vector_val); std::array op_quarter_16x(llvm::Value* vector_val); diff --git a/src/include/OSL/rendererservices.h b/src/include/OSL/rendererservices.h index 04e5269ae..62a6b6179 100644 --- a/src/include/OSL/rendererservices.h +++ b/src/include/OSL/rendererservices.h @@ -601,6 +601,7 @@ class OSLEXECPUBLIC RendererServices { /// Unless overridden, a nullptr is returned. virtual BatchedRendererServices<16>* batched(WidthOf<16>); virtual BatchedRendererServices<8>* batched(WidthOf<8>); + virtual BatchedRendererServices<4>* batched(WidthOf<4>); protected: TextureSystem* m_texturesys; // A place to hold a TextureSystem diff --git a/src/liboslexec/CMakeLists.txt b/src/liboslexec/CMakeLists.txt index 28c0ad09e..0e47deb16 100644 --- a/src/liboslexec/CMakeLists.txt +++ b/src/liboslexec/CMakeLists.txt @@ -379,6 +379,8 @@ foreach(batched_target ${BATCHED_TARGET_LIST}) list (APPEND TARGET_CXX_OPTS "-march=core-avx2") elseif (${TARGET_OPT_ISA} STREQUAL "AVX") list (APPEND TARGET_CXX_OPTS "-march=corei7-avx") + elseif (${TARGET_OPT_ISA} STREQUAL "SSE2") + list (APPEND TARGET_CXX_OPTS "-march=x86-64") else () message (FATAL_ERROR "Unknown ISA=${TARGET_OPT_ISA} extract from USE_BATCHED entry ${batched_target}") endif () @@ -454,6 +456,8 @@ foreach(batched_target ${BATCHED_TARGET_LIST}) list (APPEND TARGET_CXX_OPTS "-march=haswell") elseif (${TARGET_OPT_ISA} STREQUAL "AVX") list (APPEND TARGET_CXX_OPTS "-march=sandybridge") + elseif (${TARGET_OPT_ISA} STREQUAL "SSE2") 
+ list (APPEND TARGET_CXX_OPTS "-march=x86-64") else () message (FATAL_ERROR "Unknown ISA=${TARGET_OPT_ISA} extract from USE_BATCHED entry ${batched_target}") endif () diff --git a/src/liboslexec/batched_analysis.cpp b/src/liboslexec/batched_analysis.cpp index 888f19874..9f76c1acf 100644 --- a/src/liboslexec/batched_analysis.cpp +++ b/src/liboslexec/batched_analysis.cpp @@ -1813,10 +1813,16 @@ struct Analyzer { // specific BatchedRendererServices. // Right here we don't know which width will be used, // so we will just require all widths provide the same answer + auto rs4 = m_ba.renderer()->batched(WidthOf<4>()); auto rs8 = m_ba.renderer()->batched(WidthOf<8>()); auto rs16 = m_ba.renderer()->batched(WidthOf<16>()); - if (rs8 || rs16) { + if (rs4 || rs8 || rs16) { get_attr_is_uniform = true; + if (rs4) { + get_attr_is_uniform + &= rs4->is_attribute_uniform(obj_name, + attr_name); + } if (rs8) { get_attr_is_uniform &= rs8->is_attribute_uniform(obj_name, diff --git a/src/liboslexec/batched_backendllvm.cpp b/src/liboslexec/batched_backendllvm.cpp index e94122ef4..79f87ca90 100644 --- a/src/liboslexec/batched_backendllvm.cpp +++ b/src/liboslexec/batched_backendllvm.cpp @@ -141,6 +141,7 @@ BatchedBackendLLVM::BatchedBackendLLVM(ShadingSystemImpl& shadingsys, switch (vector_width()) { case 16: m_true_mask_value = Mask<16>(true).value(); break; case 8: m_true_mask_value = Mask<8>(true).value(); break; + case 4: m_true_mask_value = Mask<4>(true).value(); break; default: OSL_ASSERT(0 && "unsupported vector width"); } ll.dumpasm(shadingsys.m_llvm_dumpasm); diff --git a/src/liboslexec/batched_llvm_instance.cpp b/src/liboslexec/batched_llvm_instance.cpp index b2cb6c7cd..f6f325bbb 100644 --- a/src/liboslexec/batched_llvm_instance.cpp +++ b/src/liboslexec/batched_llvm_instance.cpp @@ -537,6 +537,33 @@ const char* = "b8_AVX_"; #endif +#ifdef __OSL_SUPPORTS_b4_SSE2 +template<> +const NameAndSignature + ConcreteTargetLibraryHelper<4, TargetISA::x64>::library_functions[] + = { +# 
define DECL_INDIRECT(name, signature) \ + NameAndSignature { #name, signature }, +# define DECL(name, signature) DECL_INDIRECT(name, signature) +# define __OSL_WIDTH 4 +# define __OSL_TARGET_ISA SSE2 +// Don't allow order of xmacro includes be rearranged +// clang-format off +# include "wide/define_opname_macros.h" +# include "builtindecl_wide_xmacro.h" +# include "wide/undef_opname_macros.h" +// clang-format on +# undef __OSL_TARGET_ISA +# undef __OSL_WIDTH +# undef DECL +# undef DECL_INDIRECT + }; +template<> +const char* + ConcreteTargetLibraryHelper<4, TargetISA::x64>::library_selector_string + = "b4_SSE2_"; +#endif + std::unique_ptr @@ -592,6 +619,17 @@ BatchedBackendLLVM::TargetLibraryHelper::build(ShadingContext* context, default: break; } break; + case 4: + switch (target_isa) { +#ifdef __OSL_SUPPORTS_b4_SSE2 + case TargetISA::x64: + return RetType( + new ConcreteTargetLibraryHelper<4, TargetISA::x64>()); +#endif + default: break; + } + break; + default: OSL_ASSERT(0 && "unsupported vector width"); } std::cerr << "Build is not configured to support TargetISA of " @@ -735,6 +773,9 @@ BatchedBackendLLVM::llvm_type_batched_texture_options() { std::vector offset_by_index; switch (m_width) { + case 4: + build_offsets_of_BatchedTextureOptions<4>(offset_by_index); + break; case 8: build_offsets_of_BatchedTextureOptions<8>(offset_by_index); break; @@ -2698,6 +2739,9 @@ BatchedBackendLLVM::run() { std::vector offset_by_index; switch (m_width) { + case 4: + build_offsets_of_BatchedShaderGlobals<4>(offset_by_index); + break; case 8: build_offsets_of_BatchedShaderGlobals<8>(offset_by_index); break; diff --git a/src/liboslexec/batched_rendservices.cpp b/src/liboslexec/batched_rendservices.cpp index 878d02f3f..e36dff3a2 100644 --- a/src/liboslexec/batched_rendservices.cpp +++ b/src/liboslexec/batched_rendservices.cpp @@ -347,5 +347,6 @@ BatchedRendererServices::getmessage(BatchedShaderGlobals* bsg, // Explicitly instantiate BatchedRendererServices template template class 
OSLEXECPUBLIC BatchedRendererServices<16>; template class OSLEXECPUBLIC BatchedRendererServices<8>; +template class OSLEXECPUBLIC BatchedRendererServices<4>; OSL_NAMESPACE_EXIT diff --git a/src/liboslexec/context.cpp b/src/liboslexec/context.cpp index a97b427e1..b001315a8 100644 --- a/src/liboslexec/context.cpp +++ b/src/liboslexec/context.cpp @@ -674,6 +674,7 @@ osl_incr_layers_executed(ShaderGlobals* sg) // Explicit template instantiation for supported batch sizes template class ShadingContext::Batched<16>; template class ShadingContext::Batched<8>; +template class ShadingContext::Batched<4>; #endif diff --git a/src/liboslexec/llvm_passes.h b/src/liboslexec/llvm_passes.h index 852ec82f9..43c7a7289 100644 --- a/src/liboslexec/llvm_passes.h +++ b/src/liboslexec/llvm_passes.h @@ -435,6 +435,8 @@ class LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks final // including this file will need its own static members defined. LLVM will // assign IDs when they get registered, so this initialization value is not // important. 
+template<> char LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<4>::ID = 0; + template<> char LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<8>::ID = 0; template<> char LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<16>::ID = 0; diff --git a/src/liboslexec/llvm_util.cpp b/src/liboslexec/llvm_util.cpp index 3dd888cab..de41e217f 100644 --- a/src/liboslexec/llvm_util.cpp +++ b/src/liboslexec/llvm_util.cpp @@ -619,6 +619,12 @@ LLVM_Util::SetupLLVM() #ifndef OSL_LLVM_NEW_PASS_MANAGER // LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks + static llvm::RegisterPass< + LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<4>> + sRegCustomPass2( + "PreventBitMasksFromBeingLiveinsToBasicBlocks<4>", + "Prevent Bit Masks <4xi1> From Being Liveins To Basic Blocks Pass", + false /* Only looks at CFG */, false /* Analysis Pass */); static llvm::RegisterPass< LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<8>> sRegCustomPass0( @@ -2305,7 +2311,11 @@ LLVM_Util::setup_new_optimization_passes(int optlevel, bool target_host) break; } case 4: - // We don't use masking or SIMD shading for 4-wide + // MUST BE THE FINAL PASS! + m_new_pass_manager->module_pass_manager.addPass( + createModuleToFunctionPassAdaptor( + NewPreventBitMasksFromBeingLiveinsToBasicBlocks<4>( + context()))); break; default: std::cout << "m_vector_width = " << m_vector_width << "\n"; @@ -2618,7 +2628,9 @@ LLVM_Util::setup_legacy_optimization_passes(int optlevel, bool target_host) new LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<8>()); break; case 4: - // We don't use masking or SIMD shading for 4-wide + // MUST BE THE FINAL PASS! 
+ mpm.add( + new LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<4>()); break; default: std::cout << "m_vector_width = " << m_vector_width << "\n"; @@ -3393,6 +3405,19 @@ LLVM_Util::constant(uint32_t i) return llvm::ConstantInt::get(context(), llvm::APInt(32, i)); } +llvm::Constant* +LLVM_Util::constant4(int8_t i) +{ + return llvm::ConstantInt::get(context(), + llvm::APInt(4, i, true /*signed*/)); +} + +llvm::Constant* +LLVM_Util::constant4(uint8_t i) +{ + return llvm::ConstantInt::get(context(), llvm::APInt(4, i)); +} + llvm::Constant* LLVM_Util::constant8(int8_t i) { @@ -3592,6 +3617,11 @@ LLVM_Util::mask_as_int(llvm::Value* mask) // and all types are happy intMaskType = type_int8(); break; + case 4: + // We can just reinterpret cast a 4 bit mask to a 8 bit integer + // and all types are happy + intMaskType = type_int8(); + break; default: OSL_ASSERT(0 && "unsupported native bit mask width"); }; @@ -3659,6 +3689,34 @@ LLVM_Util::mask_as_int(llvm::Value* mask) int8_mask = builder().CreateCall(func, toArrayRef(args)); return int8_mask; } + case 4: { + // We need to do more than a simple cast to an int. Since we + // know vectorized comparison for SSE2 ends up setting 4 + // 32 bit integers to 0xFFFFFFFF or 0x00000000, We need to + // do more than a simple cast to an int. + + // Convert <4 x i1> -> <4 x i32> + llvm::Value* w4_int_mask = builder().CreateSExt(mask, + type_wide_int()); + + // Now we will use the horizontal sign extraction intrinsic + // to build a 32 bit mask value. However the only 128bit + // version works on floats, so we will cast from int32 to + // float beforehand + llvm::Type* w4_float_type = llvm_vector_type(m_llvm_type_float, 4); + llvm::Value* w4_float_mask = builder().CreateBitCast(w4_int_mask, + w4_float_type); + + // Now we will use the horizontal sign extraction intrinsic + // to build a 32 bit mask value. 
+ llvm::Function* func = llvm::Intrinsic::getDeclaration( + module(), llvm::Intrinsic::x86_sse_movmsk_ps); + + llvm::Value* args[1] = { w4_float_mask }; + llvm::Value* int8_mask; + int8_mask = builder().CreateCall(func, toArrayRef(args)); + return int8_mask; + } default: { OSL_ASSERT(0 && "unsupported native bit mask width"); return mask; @@ -3748,17 +3806,19 @@ LLVM_Util::mask_as_int(llvm::Value* mask) // do more than a simple cast to an int. // Convert <4 x i1> -> <4 x i32> - llvm::Value* wide_int_mask = builder().CreateSExt(mask, - type_wide_int()); + llvm::Value* w4_int_mask = builder().CreateSExt(mask, + type_wide_int()); // Now we will use the horizontal sign extraction intrinsic - // to build a 32 bit mask value. However the only 128bit + // to build a 32 bit mask value. However the only 128bit // version works on floats, so we will cast from int32 to // float beforehand llvm::Type* w4_float_type = llvm_vector_type(m_llvm_type_float, 4); - llvm::Value* w4_float_mask = builder().CreateBitCast(wide_int_mask, + llvm::Value* w4_float_mask = builder().CreateBitCast(w4_int_mask, w4_float_type); + // Now we will use the horizontal sign extraction intrinsic + // to build a 32 bit mask value. 
llvm::Function* func = llvm::Intrinsic::getDeclaration( module(), llvm::Intrinsic::x86_sse_movmsk_ps); @@ -3797,13 +3857,36 @@ LLVM_Util::mask_as_int8(llvm::Value* mask) llvm::Value* LLVM_Util::mask4_as_int8(llvm::Value* mask) { - OSL_ASSERT(m_supports_llvm_bit_masks_natively); - // combine <4xi1> mask with <4xi1> zero init to get <8xi1> and cast it - // to i8 - llvm::Value* zero_mask4 - = llvm::ConstantDataVector::getSplat(4, constant_bool(false)); - return builder().CreateBitCast(op_combine_4x_vectors(mask, zero_mask4), - type_int8()); + if (m_supports_llvm_bit_masks_natively) { + // combine <4xi1> mask with <4xi1> zero init to get <8xi1> and cast it + // to i8 + llvm::Value* zero_mask4 + = llvm::ConstantDataVector::getSplat(4, constant_bool(false)); + return builder().CreateBitCast(op_combine_4x_vectors(mask, zero_mask4), + type_int8()); + } else { + // Convert <4 x i1> -> <4 x i32> + llvm::Value* w4_int_mask = builder().CreateSExt(mask, type_wide_int()); + + // Now we will use the horizontal sign extraction intrinsic + // to build a 32 bit mask value. However the only 128bit + // version works on floats, so we will cast from int32 to + // float beforehand + llvm::Type* w4_float_type = llvm_vector_type(m_llvm_type_float, 4); + llvm::Value* w4_float_mask = builder().CreateBitCast(w4_int_mask, + w4_float_type); + + // Now we will use the horizontal sign extraction intrinsic + // to build a 32 bit mask value. 
+ llvm::Function* func = llvm::Intrinsic::getDeclaration( + module(), llvm::Intrinsic::x86_sse_movmsk_ps); + + llvm::Value* args[1] = { w4_float_mask }; + llvm::Value* int32 = builder().CreateCall(func, toArrayRef(args)); + llvm::Value* i8 = builder().CreateIntCast(int32, type_int8(), true); + + return i8; + } } @@ -3828,14 +3911,19 @@ LLVM_Util::int_as_mask(llvm::Value* value) // and all types are happy intMaskType = type_int8(); break; + case 4: + // We can just reinterpret cast a 8 bit integer to a 4 bit mask + // and all types are happy + intMaskType = type_int8(); + break; default: OSL_ASSERT(0 && "unsupported native bit mask width"); }; llvm::Value* intMask = builder().CreateTrunc(value, intMaskType); result = builder().CreateBitCast(intMask, type_wide_bool()); } else { - // Since we know vectorized comparisons for AVX&AVX2 end up setting - // 8 32 bit integers to 0xFFFFFFFF or 0x00000000, We need to do more + // Since we know vectorized comparisons for SSE2&AVX&AVX2 end up setting + // 32 bit integers to 0xFFFFFFFF or 0x00000000, We need to do more // than a simple cast to an int. 
// Broadcast out the int32 mask to all data lanes @@ -3939,34 +4027,25 @@ LLVM_Util::op_1st_active_lane_of(llvm::Value* mask) // Assumes mask is not empty llvm::Type* intMaskType = nullptr; + llvm::Value* int_mask = nullptr; switch (m_vector_width) { case 16: // We can just reinterpret cast a 16 bit mask to a 16 bit integer // and all types are happy intMaskType = type_int16(); + int_mask = builder().CreateBitCast(mask, intMaskType); break; case 8: // We can just reinterpret cast a 8 bit mask to a 8 bit integer // and all types are happy intMaskType = type_int8(); + int_mask = builder().CreateBitCast(mask, intMaskType); break; -#if 0 // WIP - case 4: - { - // We can just reinterpret cast a 8 bit mask to a 8 bit integer - // and all types are happy - intMaskType = type_int8(); - -// extended_int_vector_type = (llvm::Type *) llvm::VectorType::get(llvm::Type::getInt32Ty (*m_llvm_context), m_vector_width); -// llvm::Value * wide_int_mask = builder().CreateSExt(mask, extended_int_vector_type); -// -// int_reinterpret_cast_vector_type = (llvm::Type *) llvm::Type::getInt128Ty (*m_llvm_context); -// zeroConstant = constant128(0); -// -// llvm::Value * mask_as_int = builder().CreateBitCast (wide_int_mask, int_reinterpret_cast_vector_type); - break; - } -#endif + case 4: { + intMaskType = type_int8(); + int_mask = mask4_as_int8(mask); + break; + } default: OSL_ASSERT(0 && "unsupported native bit mask width"); }; @@ -3976,8 +4055,7 @@ LLVM_Util::op_1st_active_lane_of(llvm::Value* mask) = llvm::Intrinsic::getDeclaration(module(), llvm::Intrinsic::cttz, toArrayRef(types)); - llvm::Value* int_mask = builder().CreateBitCast(mask, intMaskType); - llvm::Value* args[2] = { int_mask, constant_bool(true) }; + llvm::Value* args[2] = { int_mask, constant_bool(true) }; llvm::Value* firstNonZeroIndex = builder().CreateCall(func_cttz, toArrayRef(args)); @@ -4455,6 +4533,19 @@ LLVM_Util::op_linearize_8x_indices(llvm::Value* wide_index) } +llvm::Value* 
+LLVM_Util::op_linearize_4x_indices(llvm::Value* wide_index) +{ + llvm::Value* strided_indices = op_mul(wide_index, wide_constant(4, 4)); + llvm::Constant* offsets_to_lane[4] = { constant(0), constant(1), + constant(2), constant(3) }; + llvm::Value* const_vec_offsets = llvm::ConstantVector::get( + llvm::ArrayRef(&offsets_to_lane[0], 4)); + + return op_add(strided_indices, const_vec_offsets); +} + + std::array LLVM_Util::op_split_16x(llvm::Value* vector_val) { @@ -4613,6 +4704,7 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr, module(), llvm::Intrinsic::x86_avx512_gather_dpi_512); break; case 8: + case 4: int_mask = mask_as_int8(current_mask()); func_avx512_gather_pi = llvm::Intrinsic::getDeclaration( module(), llvm::Intrinsic::x86_avx512_gather3siv8_si); @@ -4663,6 +4755,16 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr, toArrayRef(args)); return gather_result; } + case 4: { + llvm::Value* args[] = { avx2_unmasked_value, void_ptr(src_ptr), + wide_index, wide_int_mask, + constant4((uint8_t)4) }; + llvm::Value* gather_result + = builder().CreateCall(func_avx2_gather_pi, + toArrayRef(args)); + return gather_result; + } + default: OSL_ASSERT(0 && "unsupported width"); }; } else { @@ -4680,6 +4782,7 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr, module(), llvm::Intrinsic::x86_avx512_gather_dps_512); break; case 8: + case 4: int_mask = mask_as_int8(current_mask()); func_avx512_gather_ps = llvm::Intrinsic::getDeclaration( module(), llvm::Intrinsic::x86_avx512_gather3siv8_sf); @@ -4739,6 +4842,17 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr, toArrayRef(args)); return gather; } + case 4: { + llvm::Value* args[] = { + avx2_unmasked_value, void_ptr(src_ptr), wide_index, + builder().CreateBitCast(wide_int_mask, + llvm_vector_type(type_float(), 4)), + constant4((uint8_t)4) + }; + llvm::Value* gather = builder().CreateCall(func_avx2_gather_ps, + toArrayRef(args)); + return gather; + } } } else { 
return clamped_gather_from_uniform(type_wide_float()); @@ -4805,6 +4919,29 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr, gather2), type_wide_ustring()); } + case 4: { + // Gather 64bit integer, as that is binary compatible with 64bit pointers of ustring + llvm::Function* func_avx512_gather_dpq + = llvm::Intrinsic::getDeclaration( + module(), llvm::Intrinsic::x86_avx512_gather3siv4_di); + OSL_ASSERT(func_avx512_gather_dpq); + + auto w4_bit_masks = current_mask(); + auto w4_int_indices = wide_index; + + llvm::Value* unmasked_value + = builder().CreateVectorSplat(4, constant64((uint64_t)0)); + llvm::Value* args[] + = { unmasked_value, void_ptr(src_ptr), w4_int_indices, + mask4_as_int8(w4_bit_masks), constant(4) }; + llvm::Value* gather1 + = builder().CreateCall(func_avx512_gather_dpq, + toArrayRef(args)); + args[2] = w4_int_indices; + args[3] = mask4_as_int8(w4_bit_masks); + + return builder().CreateIntToPtr(gather1, type_wide_ustring()); + } default: OSL_ASSERT(0 && "unsupported native bit mask width"); } } else { @@ -4841,6 +4978,20 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr, return builder().CreateCall(func_avx512_gather_ps, toArrayRef(args)); } + case 4: { + llvm::Function* func_avx512_gather_ps + = llvm::Intrinsic::getDeclaration( + module(), llvm::Intrinsic::x86_avx512_gather3siv8_sf); + OSL_ASSERT(func_avx512_gather_ps); + + llvm::Value* unmasked_value = wide_constant(0.0f); + llvm::Value* args[] = { unmasked_value, void_ptr(src_ptr), + op_linearize_4x_indices(wide_index), + mask_as_int8(current_mask()), + constant(4) }; + return builder().CreateCall(func_avx512_gather_ps, + toArrayRef(args)); + } default: OSL_ASSERT(0 && "unsupported native bit mask width"); }; @@ -4889,6 +5040,19 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr, toArrayRef(args)); return gather_result; } + case 4: { + auto int_indices = op_linearize_4x_indices(wide_index); + llvm::Value* args[] = { + avx2_unmasked_value, 
void_ptr(src_ptr), int_indices, + builder().CreateBitCast(wide_int_mask, + llvm_vector_type(type_float(), 4)), + constant8((uint8_t)4) + }; + llvm::Value* gather_result + = builder().CreateCall(func_avx2_gather_ps, + toArrayRef(args)); + return gather_result; + } default: OSL_ASSERT(0 && "unsupported vector width for avx2 gather"); } @@ -4926,6 +5090,20 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr, return builder().CreateCall(func_avx512_gather_pi, toArrayRef(args)); } + case 4: { + llvm::Function* func_avx512_gather_pi + = llvm::Intrinsic::getDeclaration( + module(), llvm::Intrinsic::x86_avx512_gather3siv8_si); + OSL_ASSERT(func_avx512_gather_pi); + + llvm::Value* unmasked_value = wide_constant(0); + llvm::Value* args[] = { unmasked_value, void_ptr(src_ptr), + op_linearize_4x_indices(wide_index), + mask_as_int8(current_mask()), + constant(4) }; + return builder().CreateCall(func_avx512_gather_pi, + toArrayRef(args)); + } default: OSL_ASSERT(0 && "unsupported native bit mask width"); } } else if (m_supports_avx2) { @@ -4975,6 +5153,26 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr, toArrayRef(args)); return gather_result; } + case 4: { + llvm::Function* func_avx2_gather_pi + = llvm::Intrinsic::getDeclaration( + module(), llvm::Intrinsic::x86_avx2_gather_d_d_256); + OSL_ASSERT(func_avx2_gather_pi); + + llvm::Constant* avx2_unmasked_value = wide_constant(8, 0); + + // Convert <4 x i1> -> <4 x i32> + llvm::Value* wide_int_mask + = builder().CreateSExt(current_mask(), type_wide_int()); + auto int_indices = op_linearize_4x_indices(wide_index); + llvm::Value* args[] = { avx2_unmasked_value, void_ptr(src_ptr), + int_indices, wide_int_mask, + constant8((uint8_t)4) }; + llvm::Value* gather_result + = builder().CreateCall(func_avx2_gather_pi, + toArrayRef(args)); + return gather_result; + } default: OSL_ASSERT(0 && "unsupported vector width for avx2 gather"); } @@ -5017,7 +5215,8 @@ 
LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr, gather2), type_wide_ustring()); } - case 8: { + case 8: + case 4: { // Gather 64bit integer, as that is binary compatible with 64bit pointers of ustring llvm::Function* func_avx512_gather_dpq = llvm::Intrinsic::getDeclaration( @@ -5093,6 +5292,7 @@ LLVM_Util::op_scatter(llvm::Value* wide_val, llvm::Type* src_type, linear_indices = op_linearize_16x_indices(wide_index); break; case 8: linear_indices = op_linearize_8x_indices(wide_index); break; + case 4: linear_indices = op_linearize_4x_indices(wide_index); break; default: OSL_ASSERT(0 && "unsupported vector width for scatter"); }; } else { @@ -5150,6 +5350,7 @@ LLVM_Util::op_scatter(llvm::Value* wide_val, llvm::Type* src_type, module(), llvm::Intrinsic::x86_avx512_scatter_dps_512); break; case 8: + case 4: int_mask = mask_as_int8(current_mask()); func_avx512_scatter_ps = llvm::Intrinsic::getDeclaration( module(), llvm::Intrinsic::x86_avx512_scattersiv8_sf); @@ -5182,6 +5383,7 @@ LLVM_Util::op_scatter(llvm::Value* wide_val, llvm::Type* src_type, module(), llvm::Intrinsic::x86_avx512_scatter_dpi_512); break; case 8: + case 4: int_mask = mask_as_int8(current_mask()); func_avx512_scatter_pi = llvm::Intrinsic::getDeclaration( module(), llvm::Intrinsic::x86_avx512_scattersiv8_si); @@ -5256,6 +5458,25 @@ LLVM_Util::op_scatter(llvm::Value* wide_val, llvm::Type* src_type, builder().CreateCall(func_avx512_scatter_dpq, toArrayRef(args)); return; } + case 4: { + llvm::Value* linear_indices = wide_index; + + llvm::Function* func_avx512_scatter_dpq + = llvm::Intrinsic::getDeclaration( + module(), llvm::Intrinsic::x86_avx512_scatter_dpq_512); + OSL_ASSERT(func_avx512_scatter_dpq); + + llvm::Type* wide_address_int_type + = llvm_vector_type(type_addrint(), 4); + llvm::Value* address_int_val + = builder().CreatePtrToInt(wide_val, wide_address_int_type); + + llvm::Value* args[] + = { void_ptr(src_ptr), mask_as_int8(current_mask()), + linear_indices, address_int_val, 
constant(4) }; + builder().CreateCall(func_avx512_scatter_dpq, toArrayRef(args)); + return; + } default: OSL_ASSERT(0 && "incomplete vector width for AVX512 scatter"); } @@ -5295,6 +5516,19 @@ LLVM_Util::op_scatter(llvm::Value* wide_val, llvm::Type* src_type, builder().CreateCall(func_avx512_scatter_ps, toArrayRef(args)); return; } + case 4: { + llvm::Function* func_avx512_scatter_ps + = llvm::Intrinsic::getDeclaration( + module(), llvm::Intrinsic::x86_avx512_scattersiv8_sf); + OSL_ASSERT(func_avx512_scatter_ps); + + llvm::Value* args[] = { void_ptr(src_ptr), + mask_as_int8(current_mask()), + op_linearize_4x_indices(wide_index), + wide_val, constant(4) }; + builder().CreateCall(func_avx512_scatter_ps, toArrayRef(args)); + return; + } default: OSL_ASSERT(0 && "incomplete vector width for AVX512 scatter"); } @@ -5338,6 +5572,19 @@ LLVM_Util::op_scatter(llvm::Value* wide_val, llvm::Type* src_type, builder().CreateCall(func_avx512_scatter_pi, toArrayRef(args)); return; } + case 4: { + llvm::Function* func_avx512_scatter_pi + = llvm::Intrinsic::getDeclaration( + module(), llvm::Intrinsic::x86_avx512_scattersiv8_si); + OSL_ASSERT(func_avx512_scatter_pi); + + llvm::Value* args[] = { void_ptr(src_ptr), + mask_as_int8(current_mask()), + op_linearize_4x_indices(wide_index), + wide_val, constant(4) }; + builder().CreateCall(func_avx512_scatter_pi, toArrayRef(args)); + return; + } default: OSL_ASSERT(0 && "incomplete vector width for AVX512 scatter"); } @@ -5407,6 +5654,26 @@ LLVM_Util::op_scatter(llvm::Value* wide_val, llvm::Type* src_type, builder().CreateCall(func_avx512_scatter_dpq, toArrayRef(args)); return; } + case 4: { + llvm::Value* linear_indices = op_linearize_4x_indices( + wide_index); + + llvm::Function* func_avx512_scatter_dpq + = llvm::Intrinsic::getDeclaration( + module(), llvm::Intrinsic::x86_avx512_scatter_dpq_512); + OSL_ASSERT(func_avx512_scatter_dpq); + + llvm::Type* wide_address_int_type + = llvm_vector_type(type_addrint(), 4); + llvm::Value* 
address_int_val + = builder().CreatePtrToInt(wide_val, wide_address_int_type); + + llvm::Value* args[] + = { void_ptr(src_ptr), mask_as_int8(current_mask()), + linear_indices, address_int_val, constant(4) }; + builder().CreateCall(func_avx512_scatter_dpq, toArrayRef(args)); + return; + } default: OSL_ASSERT(0 && "incomplete vector width for AVX512 scatter"); } diff --git a/src/liboslexec/rendservices.cpp b/src/liboslexec/rendservices.cpp index f27307d31..d97376011 100644 --- a/src/liboslexec/rendservices.cpp +++ b/src/liboslexec/rendservices.cpp @@ -542,4 +542,11 @@ RendererServices::batched(WidthOf<8>) return nullptr; } +BatchedRendererServices<4>* +RendererServices::batched(WidthOf<4>) +{ + // No default implementation for batched services + return nullptr; +} + OSL_NAMESPACE_EXIT diff --git a/src/liboslexec/shadingsys.cpp b/src/liboslexec/shadingsys.cpp index 597c6010b..0a96599fa 100644 --- a/src/liboslexec/shadingsys.cpp +++ b/src/liboslexec/shadingsys.cpp @@ -618,6 +618,29 @@ ShadingSystem::configure_batch_execution_at(int width) m_impl->attribute("llvm_jit_fma", 0); return true; } +# endif + if (target_requested) { + break; + } + // fallthrough + default: return false; + }; + return false; + case 4: + switch (requestedISA) { + case TargetISA::UNKNOWN: + // fallthrough + case TargetISA::x64: +# ifdef __OSL_SUPPORTS_b4_SSE2 + if (LLVM_Util::supports_isa(TargetISA::x64)) { + if (!target_requested) + m_impl->attribute("llvm_jit_target", + LLVM_Util::target_isa_name( + TargetISA::x64)); + // SSE2 doesn't support FMA + m_impl->attribute("llvm_jit_fma", 0); + return true; + } # endif if (target_requested) { break; @@ -885,6 +908,7 @@ ShadingSystem::BatchedExecutor::jit_all_groups(int nthreads) // Explicitly instantiate template class ShadingSystem::BatchedExecutor<16>; template class ShadingSystem::BatchedExecutor<8>; +template class ShadingSystem::BatchedExecutor<4>; #endif @@ -1079,7 +1103,8 @@ ShadingSystemImpl::ShadingSystemImpl(RendererServices* renderer, , 
m_opt_groupdata(true) #if OSL_USE_BATCHED , m_opt_batched_analysis((renderer->batched(WidthOf<16>()) != nullptr) - || (renderer->batched(WidthOf<8>()) != nullptr)) + || (renderer->batched(WidthOf<8>()) != nullptr) + || (renderer->batched(WidthOf<4>()) != nullptr)) #else , m_opt_batched_analysis(false) #endif @@ -3809,7 +3834,8 @@ ShadingSystemImpl::optimize_group(ShaderGroup& group, ShadingContext* ctx, // the batch jit has already happened, // as it requires the ops so we can't delete them yet! if (((renderer()->batched(WidthOf<16>()) == nullptr) - && (renderer()->batched(WidthOf<8>()) == nullptr)) + && (renderer()->batched(WidthOf<8>()) == nullptr) + && (renderer()->batched(WidthOf<4>()) == nullptr)) || group.batch_jitted()) { group_post_jit_cleanup(group); } @@ -4030,6 +4056,7 @@ ShadingSystemImpl::Batched::jit_all_groups(int nthreads, int mythread, // machine as well, start with just the batch size template class pvt::ShadingSystemImpl::Batched<16>; template class pvt::ShadingSystemImpl::Batched<8>; +template class pvt::ShadingSystemImpl::Batched<4>; #endif int diff --git a/src/testshade/batched_simplerend.cpp b/src/testshade/batched_simplerend.cpp index 937655af4..ea2acbdf9 100644 --- a/src/testshade/batched_simplerend.cpp +++ b/src/testshade/batched_simplerend.cpp @@ -1001,6 +1001,7 @@ BatchedSimpleRenderer::get_camera_screen_window(ustringhash /*object*/, // Explicitly instantiate BatchedSimpleRenderer template template class BatchedSimpleRenderer<16>; template class BatchedSimpleRenderer<8>; +template class BatchedSimpleRenderer<4>; OSL_NAMESPACE_EXIT diff --git a/src/testshade/simplerend.cpp b/src/testshade/simplerend.cpp index 65862c2db..3582c9cc4 100644 --- a/src/testshade/simplerend.cpp +++ b/src/testshade/simplerend.cpp @@ -218,7 +218,9 @@ register_closures(OSL::ShadingSystem* shadingsys) SimpleRenderer::SimpleRenderer() #if OSL_USE_BATCHED - : m_batch_16_simple_renderer(*this), m_batch_8_simple_renderer(*this) + : m_batch_16_simple_renderer(*this) + , 
m_batch_8_simple_renderer(*this) + , m_batch_4_simple_renderer(*this) #endif { Matrix44 M; diff --git a/src/testshade/simplerend.h b/src/testshade/simplerend.h index 87d0b96dd..8ebe1c1fc 100644 --- a/src/testshade/simplerend.h +++ b/src/testshade/simplerend.h @@ -177,12 +177,17 @@ class SimpleRenderer : public RendererServices { { return &m_batch_8_simple_renderer; } + BatchedRendererServices<4>* batched(WidthOf<4>) override + { + return &m_batch_4_simple_renderer; + } #endif protected: #if OSL_USE_BATCHED BatchedSimpleRenderer<16> m_batch_16_simple_renderer; BatchedSimpleRenderer<8> m_batch_8_simple_renderer; + BatchedSimpleRenderer<4> m_batch_4_simple_renderer; #endif // Camera parameters diff --git a/src/testshade/testshade.cpp b/src/testshade/testshade.cpp index 1d4d770ad..c2c3203f1 100644 --- a/src/testshade/testshade.cpp +++ b/src/testshade/testshade.cpp @@ -305,6 +305,9 @@ set_shadingsys_options() } else if ((!batch_size_requested || batch_size == 8) && shadingsys->configure_batch_execution_at(8)) { batch_size = 8; + } else if ((!batch_size_requested || batch_size == 4) + && shadingsys->configure_batch_execution_at(4)) { + batch_size = 4; } else { OSL::print( "WARNING: Hardware or library requirements to utilize batched execution"); @@ -1197,9 +1200,11 @@ setup_output_images(SimpleRenderer* rend, ShadingSystem* shadingsys, // jit_group will optimize the group if necesssary if (batch_size == 16) { shadingsys->batched<16>().jit_group(shadergroup.get(), ctx); - } else { - ASSERT((batch_size == 8) && "Unsupported batch size"); + } else if (batch_size == 8) { shadingsys->batched<8>().jit_group(shadergroup.get(), ctx); + } else { + ASSERT((batch_size == 4) && "Unsupported batch size"); + shadingsys->batched<4>().jit_group(shadergroup.get(), ctx); } } else #endif @@ -2203,13 +2208,19 @@ test_shade(int argc, const char* argv[]) batched_shade_region<16>(rend, shadergroup.get(), sub_roi, save); }); - } else { - ASSERT((batch_size == 8) && "Unsupported batch size"); + 
} else if (batch_size == 8) { OIIO::ImageBufAlgo::parallel_image( roi, num_threads, [&](OIIO::ROI sub_roi) -> void { batched_shade_region<8>(rend, shadergroup.get(), sub_roi, save); }); + } else { + ASSERT((batch_size == 4) && "Unsupported batch size"); + OIIO::ImageBufAlgo::parallel_image( + roi, num_threads, [&](OIIO::ROI sub_roi) -> void { + batched_shade_region<4>(rend, shadergroup.get(), + sub_roi, save); + }); } } else # endif diff --git a/testsuite/example-batched-deformer/oslbatcheddeformer.cpp b/testsuite/example-batched-deformer/oslbatcheddeformer.cpp index 0b7af16e4..449f06f59 100644 --- a/testsuite/example-batched-deformer/oslbatcheddeformer.cpp +++ b/testsuite/example-batched-deformer/oslbatcheddeformer.cpp @@ -182,10 +182,15 @@ class MyRendererServices final : public OSL::RendererServices { { return &m_batch_8_rs; } + OSL::BatchedRendererServices<4>* batched(OSL::WidthOf<4>) override + { + return &m_batch_4_rs; + } private: MyBatchedRendererServices<16> m_batch_16_rs; MyBatchedRendererServices<8> m_batch_8_rs; + MyBatchedRendererServices<4> m_batch_4_rs; }; @@ -232,11 +237,13 @@ main(int argc, char* argv[]) batch_width = 16; } else if (shadsys->configure_batch_execution_at(8)) { batch_width = 8; + } else if (shadsys->configure_batch_execution_at(4)) { + batch_width = 4; } else { std::cout - << "Error: Hardware doesn't support 8 or 16 wide SIMD or the OSL has not been configured and built with a proper USE_BATCHED." + << "Error: Hardware doesn't support 4, 8 or 16 wide SIMD or the OSL has not been configured and built with a proper USE_BATCHED." 
<< std::endl; - std::cout << "Error: e.g.: USE_BATCHED=b8_AVX2,b8_AVX512,b16_AVX512" + std::cout << "Error: e.g.: USE_BATCHED=b4_SSE2,b8_AVX2,b8_AVX512,b16_AVX512" << std::endl; return -1; } @@ -432,8 +439,11 @@ main(int argc, char* argv[]) if (batch_width == 16) { batched_shadepoints(std::integral_constant {}); - } else { + } + else if (batch_width == 8) { batched_shadepoints(std::integral_constant {}); + } else { + batched_shadepoints(std::integral_constant {}); } // Print some results to prove that we generated an expected Pout.