Add support for b4_SSE2 batched mode (2)

AcademySoftwareFoundation · Jun 8, 2024 · 52fb623 · 52fb623
1 parent 0d122e7
commit 52fb623
Show file tree

Hide file tree

Showing 30 changed files with 754 additions and 17 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -74,6 +74,17 @@ jobs:
             pybind11_ver: v2.5.0
             simd: sse4.2
             setenvs: export CONAN_LLVM_VERSION=10.0.1
+          - desc: gcc9/C++17 llvm11 py3.7 exr2.5 oiio2.3 sse2 batch-b4sse2
+            nametag: linux-vfx2021
+            runner: ubuntu-latest
+            container: aswftesting/ci-osl:2021-clang11
+            vfxyear: 2021
+            cxx_std: 17
+            openimageio_ver: v2.4.13.0
+            python_ver: 3.7
+            pybind11_ver: v2.7.0
+            simd: sse2
+            batched: b4_SSE2
           - desc: gcc9/C++17 llvm11 py3.7 exr2.5 oiio2.3 avx2 batch-b8avx2
             nametag: linux-vfx2021
             runner: ubuntu-latest

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -95,7 +95,7 @@ else ()
 endif ()
 set (OSL_LIBNAME_SUFFIX "" CACHE STRING
      "Optional name appended to ${PROJECT_NAME} libraries that are built")
-option (OSL_BUILD_TESTS "Build the unit tests, testshade, testrender" ON)
+option (OSL_BUILD_TESTS "Build the unit tests, testminimal, testshade, testrender" ON)
 if (WIN32)
     option (USE_LLVM_BITCODE "Generate embedded LLVM bitcode" OFF)
 else ()
@@ -220,6 +220,7 @@ add_subdirectory (src/oslc)
 add_subdirectory (src/oslinfo)
 
 if (OSL_BUILD_TESTS AND BUILD_TESTING)
+    add_subdirectory (src/testminimal)
     add_subdirectory (src/testshade)
     add_subdirectory (src/testrender)
 endif ()

diff --git a/src/cmake/compiler.cmake b/src/cmake/compiler.cmake
@@ -329,7 +329,7 @@ endif ()
 #
 # The USE_BATCHED option may be set to indicate that support for batched
-# SIMD shader execution be compiled along with targe specific libraries
+# SIMD shader execution be compiled along with target-specific libraries
-set (USE_BATCHED "" CACHE STRING "Build batched SIMD shader execution for (0, b8_AVX, b8_AVX2, b8_AVX2_noFMA, b8_AVX512, b8_AVX512_noFMA, b16_AVX512, b16_AVX512_noFMA)")
+set (USE_BATCHED "" CACHE STRING "Build batched SIMD shader execution for (0, b4_SSE2, b8_AVX, b8_AVX2, b8_AVX2_noFMA, b8_AVX512, b8_AVX512_noFMA, b16_AVX512, b16_AVX512_noFMA)")
 option (VEC_REPORT "Enable compiler's reporting system for vectorization" OFF)
 set (BATCHED_SUPPORT_DEFINES "")
 set (BATCHED_TARGET_LIBS "")

diff --git a/src/cmake/testing.cmake b/src/cmake/testing.cmake
@@ -270,7 +270,7 @@ macro (osl_add_all_tests)
                 bug-array-heapoffsets bug-locallifetime bug-outputinit
                 bug-param-duplicate bug-peep bug-return
                 calculatenormal-reg
-                cellnoise closure closure-array closure-layered closure-parameters closure-zero closure-conditional
+                cellnoise closure closure-array closure-layered closure-parameters closure-string closure-zero closure-conditional
                 color color-reg colorspace comparison
                 complement-reg compile-buffer compassign-bool compassign-reg
                 component-range

diff --git a/src/include/OSL/batched_texture.h b/src/include/OSL/batched_texture.h
@@ -49,6 +49,9 @@ static_assert(std::alignment_of<VaryingTextureOptions<16>>::value
 static_assert(std::alignment_of<VaryingTextureOptions<8>>::value
                   == VecReg<8>::alignment,
               "Expect alignment of data member to set alignment of struct");
+static_assert(std::alignment_of<VaryingTextureOptions<4>>::value
+                  == VecReg<4>::alignment,
+              "Expect alignment of data member to set alignment of struct");
 
 template<int WidthT> struct BatchedTextureOptions {
     VaryingTextureOptions<WidthT> varying;
@@ -90,11 +93,14 @@ static_assert(std::alignment_of<BatchedTextureOptions<16>>::value
 static_assert(std::alignment_of<BatchedTextureOptions<8>>::value
                   == VecReg<8>::alignment,
               "Expect alignment of data member to set alignment of struct");
+static_assert(std::alignment_of<BatchedTextureOptions<4>>::value
+                  == VecReg<4>::alignment,
+              "Expect alignment of data member to set alignment of struct");
 
 #ifdef OIIO_TEXTURE_SIMD_BATCH_WIDTH
 // Code here is to validate our OSL BatchedTextureOptions<WidthT> is binary compatible
 // and safe to reinterpret_cast<TextureOptBatch*>
-static_assert((OIIO::Tex::BatchWidth == 16) || (OIIO::Tex::BatchWidth == 8),
-              "This validation requires OIIO_TEXTURE_SIMD_BATCH_WIDTH=16");
+static_assert((OIIO::Tex::BatchWidth == 16) || (OIIO::Tex::BatchWidth == 8) || (OIIO::Tex::BatchWidth == 4),
+              "This validation requires OIIO_TEXTURE_SIMD_BATCH_WIDTH of 16, 8, or 4");
 
 namespace validate_offsets {

diff --git a/src/include/OSL/rendererservices.h b/src/include/OSL/rendererservices.h
@@ -601,6 +601,7 @@ class OSLEXECPUBLIC RendererServices {
     /// Unless overridden, a nullptr is returned.
     virtual BatchedRendererServices<16>* batched(WidthOf<16>);
     virtual BatchedRendererServices<8>* batched(WidthOf<8>);
+    virtual BatchedRendererServices<4>* batched(WidthOf<4>);
 
 protected:
     TextureSystem* m_texturesys;  // A place to hold a TextureSystem

diff --git a/src/liboslexec/CMakeLists.txt b/src/liboslexec/CMakeLists.txt
@@ -380,6 +380,8 @@ foreach(batched_target ${BATCHED_TARGET_LIST})
             list (APPEND TARGET_CXX_OPTS "-march=core-avx2")
         elseif (${TARGET_OPT_ISA} STREQUAL "AVX")
             list (APPEND TARGET_CXX_OPTS "-march=corei7-avx")
+        elseif (${TARGET_OPT_ISA} STREQUAL "SSE2")
+            list (APPEND TARGET_CXX_OPTS "-march=core2")
         else ()
             message (FATAL_ERROR "Unknown ISA=${TARGET_OPT_ISA} extract from USE_BATCHED entry ${batched_target}")
         endif ()
@@ -455,6 +457,8 @@ foreach(batched_target ${BATCHED_TARGET_LIST})
             list (APPEND TARGET_CXX_OPTS "-march=haswell")
         elseif (${TARGET_OPT_ISA} STREQUAL "AVX")
             list (APPEND TARGET_CXX_OPTS "-march=sandybridge")
+        elseif (${TARGET_OPT_ISA} STREQUAL "SSE2")
+            list (APPEND TARGET_CXX_OPTS "-march=core2")
         else ()
             message (FATAL_ERROR "Unknown ISA=${TARGET_OPT_ISA} extract from USE_BATCHED entry ${batched_target}")
         endif ()

diff --git a/src/liboslexec/batched_analysis.cpp b/src/liboslexec/batched_analysis.cpp
@@ -1813,10 +1813,16 @@ struct Analyzer {
                     // specific BatchedRendererServices.
                     // Right here we don't know which width will be used,
                     // so we will just require all widths provide the same answer
+                    auto rs4  = m_ba.renderer()->batched(WidthOf<4>());
                     auto rs8  = m_ba.renderer()->batched(WidthOf<8>());
                     auto rs16 = m_ba.renderer()->batched(WidthOf<16>());
-                    if (rs8 || rs16) {
+                    if (rs4 || rs8 || rs16) {
                         get_attr_is_uniform = true;
+                        if (rs4) {
+                            get_attr_is_uniform
+                                &= rs4->is_attribute_uniform(obj_name,
+                                                             attr_name);
+                        }
                         if (rs8) {
                             get_attr_is_uniform
                                 &= rs8->is_attribute_uniform(obj_name,

diff --git a/src/liboslexec/batched_backendllvm.cpp b/src/liboslexec/batched_backendllvm.cpp
@@ -141,6 +141,7 @@ BatchedBackendLLVM::BatchedBackendLLVM(ShadingSystemImpl& shadingsys,
     switch (vector_width()) {
     case 16: m_true_mask_value = Mask<16>(true).value(); break;
     case 8: m_true_mask_value = Mask<8>(true).value(); break;
+    case 4: m_true_mask_value = Mask<4>(true).value(); break;
     default: OSL_ASSERT(0 && "unsupported vector width");
     }
     ll.dumpasm(shadingsys.m_llvm_dumpasm);

diff --git a/src/liboslexec/batched_llvm_instance.cpp b/src/liboslexec/batched_llvm_instance.cpp
@@ -537,6 +537,33 @@ const char*
     = "b8_AVX_";
 #endif
 
+#ifdef __OSL_SUPPORTS_b4_SSE2
+template<>
+const NameAndSignature
+    ConcreteTargetLibraryHelper<4, TargetISA::x64>::library_functions[]
+    = {
+#    define DECL_INDIRECT(name, signature) \
+        NameAndSignature { #name, signature },
+#    define DECL(name, signature) DECL_INDIRECT(name, signature)
+#    define __OSL_WIDTH           4
+#    define __OSL_TARGET_ISA      SSE2
+// Don't allow order of xmacro includes be rearranged
+// clang-format off
+#    include "wide/define_opname_macros.h"
+#    include "builtindecl_wide_xmacro.h"
+#    include "wide/undef_opname_macros.h"
+// clang-format on
+#    undef __OSL_TARGET_ISA
+#    undef __OSL_WIDTH
+#    undef DECL
+#    undef DECL_INDIRECT
+      };
+template<>
+const char*
+    ConcreteTargetLibraryHelper<4, TargetISA::x64>::library_selector_string
+    = "b4_SSE2_";
+#endif
+
 
 
 std::unique_ptr<BatchedBackendLLVM::TargetLibraryHelper>
@@ -592,6 +619,17 @@ BatchedBackendLLVM::TargetLibraryHelper::build(ShadingContext* context,
         default: break;
         }
         break;
+    case 4:
+        switch (target_isa) {
+#ifdef __OSL_SUPPORTS_b4_SSE2
+        case TargetISA::x64:
+            return RetType(
+                new ConcreteTargetLibraryHelper<4, TargetISA::x64>());
+#endif
+        default: break;
+        }
+        break;
+
     default: OSL_ASSERT(0 && "unsupported vector width");
     }
     std::cerr << "Build is not configured to support TargetISA of "
@@ -735,6 +773,9 @@ BatchedBackendLLVM::llvm_type_batched_texture_options()
     {
         std::vector<unsigned int> offset_by_index;
         switch (m_width) {
+        case 4:
+            build_offsets_of_BatchedTextureOptions<4>(offset_by_index);
+            break;
         case 8:
             build_offsets_of_BatchedTextureOptions<8>(offset_by_index);
             break;
@@ -2698,6 +2739,9 @@ BatchedBackendLLVM::run()
     {
         std::vector<unsigned int> offset_by_index;
         switch (m_width) {
+        case 4:
+            build_offsets_of_BatchedShaderGlobals<4>(offset_by_index);
+            break;
         case 8:
             build_offsets_of_BatchedShaderGlobals<8>(offset_by_index);
             break;

diff --git a/src/liboslexec/batched_rendservices.cpp b/src/liboslexec/batched_rendservices.cpp
@@ -328,5 +328,6 @@ BatchedRendererServices<WidthT>::getmessage(BatchedShaderGlobals* bsg,
 // Explicitly instantiate BatchedRendererServices template
 template class OSLEXECPUBLIC BatchedRendererServices<16>;
 template class OSLEXECPUBLIC BatchedRendererServices<8>;
+template class OSLEXECPUBLIC BatchedRendererServices<4>;
 
 OSL_NAMESPACE_EXIT
diff --git a/src/liboslexec/context.cpp b/src/liboslexec/context.cpp
@@ -674,6 +674,7 @@ osl_incr_layers_executed(ShaderGlobals* sg)
 // Explicit template instantiation for supported batch sizes
 template class ShadingContext::Batched<16>;
 template class ShadingContext::Batched<8>;
+template class ShadingContext::Batched<4>;
 #endif
 
 

diff --git a/src/liboslexec/llvm_passes.h b/src/liboslexec/llvm_passes.h
@@ -435,6 +435,8 @@ class LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks final
 // including this file will need its own static members defined. LLVM will
 // assign IDs when they get registered, so this initialization value is not
 // important.
+template<> char LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<4>::ID = 0;
+
 template<> char LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<8>::ID = 0;
 
 template<> char LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<16>::ID = 0;

diff --git a/src/liboslexec/llvm_util.cpp b/src/liboslexec/llvm_util.cpp
@@ -619,6 +619,12 @@ LLVM_Util::SetupLLVM()
 
 #ifndef OSL_LLVM_NEW_PASS_MANAGER
     // LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks
+    static llvm::RegisterPass<
+        LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<4>>
+        sRegCustomPass2(
+            "PreventBitMasksFromBeingLiveinsToBasicBlocks<4>",
+            "Prevent Bit Masks <4xi1> From Being Liveins To Basic Blocks Pass",
+            false /* Only looks at CFG */, false /* Analysis Pass */);
     static llvm::RegisterPass<
         LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<8>>
         sRegCustomPass0(
@@ -2305,7 +2311,11 @@ LLVM_Util::setup_new_optimization_passes(int optlevel, bool target_host)
                 break;
             }
             case 4:
-                // We don't use masking or SIMD shading for 4-wide
+                // MUST BE THE FINAL PASS!
+                m_new_pass_manager->module_pass_manager.addPass(
+                    createModuleToFunctionPassAdaptor(
+                        NewPreventBitMasksFromBeingLiveinsToBasicBlocks<4>(
+                            context())));
                 break;
             default:
                 std::cout << "m_vector_width = " << m_vector_width << "\n";
@@ -2618,7 +2628,9 @@ LLVM_Util::setup_legacy_optimization_passes(int optlevel, bool target_host)
                     new LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<8>());
                 break;
             case 4:
-                // We don't use masking or SIMD shading for 4-wide
+                // MUST BE THE FINAL PASS!
+                mpm.add(
+                    new LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<4>());
                 break;
             default:
                 std::cout << "m_vector_width = " << m_vector_width << "\n";
@@ -3592,6 +3604,11 @@ LLVM_Util::mask_as_int(llvm::Value* mask)
             // and all types are happy
             intMaskType = type_int8();
             break;
+        case 4:
+            // We can just reinterpret cast a 4-bit mask to an 8-bit integer
+            // and all types are happy
+            intMaskType = type_int8();
+            break;
         default: OSL_ASSERT(0 && "unsupported native bit mask width");
         };
 
@@ -3950,10 +3967,9 @@ LLVM_Util::op_1st_active_lane_of(llvm::Value* mask)
         // and all types are happy
         intMaskType = type_int8();
         break;
-#if 0  // WIP
         case 4:
         {
-            // We can just reinterpret cast a 8 bit mask to a 8 bit integer
+            // We can just reinterpret cast a 4-bit mask to an 8-bit integer
             // and all types are happy
             intMaskType = type_int8();
 
@@ -3966,7 +3982,6 @@ LLVM_Util::op_1st_active_lane_of(llvm::Value* mask)
 //            llvm::Value * mask_as_int =  builder().CreateBitCast (wide_int_mask, int_reinterpret_cast_vector_type);
             break;
         }
-#endif
     default: OSL_ASSERT(0 && "unsupported native bit mask width");
     };
 

diff --git a/src/liboslexec/rendservices.cpp b/src/liboslexec/rendservices.cpp
@@ -524,4 +524,11 @@ RendererServices::batched(WidthOf<8>)
     return nullptr;
 }
 
+BatchedRendererServices<4>*
+RendererServices::batched(WidthOf<4>)
+{
+    // No default implementation for batched services
+    return nullptr;
+}
+
 OSL_NAMESPACE_EXIT
diff --git a/src/liboslexec/shadingsys.cpp b/src/liboslexec/shadingsys.cpp
@@ -618,6 +618,29 @@ ShadingSystem::configure_batch_execution_at(int width)
                 m_impl->attribute("llvm_jit_fma", 0);
                 return true;
             }
+#    endif
+            if (target_requested) {
+                break;
+            }
+            // fallthrough
+        default: return false;
+        };
+        return false;
+    case 4:
+        switch (requestedISA) {
+        case TargetISA::UNKNOWN:
+            // fallthrough
+        case TargetISA::x64:
+#    ifdef __OSL_SUPPORTS_b4_SSE2
+            if (LLVM_Util::supports_isa(TargetISA::x64)) {
+                if (!target_requested)
+                    m_impl->attribute("llvm_jit_target",
+                                      LLVM_Util::target_isa_name(
+                                          TargetISA::x64));
+                // SSE2 doesn't support FMA
+                m_impl->attribute("llvm_jit_fma", 0);
+                return true;
+            }
 #    endif
             if (target_requested) {
                 break;
@@ -885,6 +908,7 @@ ShadingSystem::BatchedExecutor<WidthT>::jit_all_groups(int nthreads)
 // Explicitly instantiate
 template class ShadingSystem::BatchedExecutor<16>;
 template class ShadingSystem::BatchedExecutor<8>;
+template class ShadingSystem::BatchedExecutor<4>;
 #endif
 
 
@@ -1079,7 +1103,8 @@ ShadingSystemImpl::ShadingSystemImpl(RendererServices* renderer,
     , m_opt_groupdata(true)
 #if OSL_USE_BATCHED
     , m_opt_batched_analysis((renderer->batched(WidthOf<16>()) != nullptr)
-                             || (renderer->batched(WidthOf<8>()) != nullptr))
+                             || (renderer->batched(WidthOf<8>()) != nullptr)
+                             || (renderer->batched(WidthOf<4>()) != nullptr))
 #else
     , m_opt_batched_analysis(false)
 #endif
@@ -3794,7 +3819,8 @@ ShadingSystemImpl::optimize_group(ShaderGroup& group, ShadingContext* ctx,
         // the batch jit has already happened,
         // as it requires the ops so we can't delete them yet!
         if (((renderer()->batched(WidthOf<16>()) == nullptr)
-             && (renderer()->batched(WidthOf<8>()) == nullptr))
+             && (renderer()->batched(WidthOf<8>()) == nullptr)
+             && (renderer()->batched(WidthOf<4>()) == nullptr))
             || group.batch_jitted()) {
             group_post_jit_cleanup(group);
         }
@@ -4015,6 +4041,7 @@ ShadingSystemImpl::Batched<WidthT>::jit_all_groups(int nthreads, int mythread,
 // machine as well, start with just the batch size
 template class pvt::ShadingSystemImpl::Batched<16>;
 template class pvt::ShadingSystemImpl::Batched<8>;
+template class pvt::ShadingSystemImpl::Batched<4>;
 #endif
 
 int