diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 3fb26aefdf..1874ede0a4 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -74,6 +74,17 @@ jobs:
             pybind11_ver: v2.5.0
             simd: sse4.2
             setenvs: export CONAN_LLVM_VERSION=10.0.1
+          - desc: gcc9/C++17 llvm11 py3.7 exr2.5 oiio2.4 sse2 batch-b4sse2
+            nametag: linux-vfx2021
+            runner: ubuntu-latest
+            container: aswftesting/ci-osl:2021-clang11
+            vfxyear: 2021
+            cxx_std: 17
+            openimageio_ver: v2.4.13.0
+            python_ver: 3.7
+            pybind11_ver: v2.7.0
+            simd: sse2
+            batched: b4_SSE2
           - desc: gcc9/C++17 llvm11 py3.7 exr2.5 oiio2.3 avx2 batch-b8avx2
             nametag: linux-vfx2021
             runner: ubuntu-latest
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9c612a27e2..ee864a5c6d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -95,7 +95,7 @@ else ()
 endif ()
 set (OSL_LIBNAME_SUFFIX "" CACHE STRING
      "Optional name appended to ${PROJECT_NAME} libraries that are built")
-option (OSL_BUILD_TESTS "Build the unit tests, testshade, testrender" ON)
+option (OSL_BUILD_TESTS "Build the unit tests, testminimal, testshade, testrender" ON)
 if (WIN32)
     option (USE_LLVM_BITCODE "Generate embedded LLVM bitcode" OFF)
 else ()
@@ -220,6 +220,7 @@ add_subdirectory (src/oslc)
 add_subdirectory (src/oslinfo)
 
 if (OSL_BUILD_TESTS AND BUILD_TESTING)
+    add_subdirectory (src/testminimal)
     add_subdirectory (src/testshade)
     add_subdirectory (src/testrender)
 endif ()
diff --git a/src/cmake/compiler.cmake b/src/cmake/compiler.cmake
index c97316681c..172bf1a818 100644
--- a/src/cmake/compiler.cmake
+++ b/src/cmake/compiler.cmake
@@ -329,7 +329,7 @@ endif ()
 #
 # The USE_BATCHED option may be set to indicate that support for batched
 # SIMD shader execution be compiled along with targe specific libraries
-set (USE_BATCHED "" CACHE STRING "Build batched SIMD shader execution for (0, b8_AVX, b8_AVX2, b8_AVX2_noFMA, b8_AVX512, b8_AVX512_noFMA, b16_AVX512, b16_AVX512_noFMA)")
+set (USE_BATCHED "" CACHE STRING "Build batched SIMD shader execution for (0, b4_SSE2, b8_AVX, b8_AVX2, b8_AVX2_noFMA, b8_AVX512, b8_AVX512_noFMA, b16_AVX512, b16_AVX512_noFMA)")
 option (VEC_REPORT "Enable compiler's reporting system for vectorization" OFF)
 set (BATCHED_SUPPORT_DEFINES "")
 set (BATCHED_TARGET_LIBS "")
diff --git a/src/cmake/testing.cmake b/src/cmake/testing.cmake
index fc5e956b05..c3c0bee88e 100644
--- a/src/cmake/testing.cmake
+++ b/src/cmake/testing.cmake
@@ -270,7 +270,7 @@ macro (osl_add_all_tests)
                 bug-array-heapoffsets bug-locallifetime bug-outputinit
                 bug-param-duplicate bug-peep bug-return
                 calculatenormal-reg
-                cellnoise closure closure-array closure-layered closure-parameters closure-zero closure-conditional
+                cellnoise closure closure-array closure-layered closure-parameters closure-string closure-zero closure-conditional
                 color color-reg colorspace comparison
                 complement-reg compile-buffer compassign-bool compassign-reg
                 component-range
diff --git a/src/include/OSL/batched_texture.h b/src/include/OSL/batched_texture.h
index c720e9bedc..7876644720 100644
--- a/src/include/OSL/batched_texture.h
+++ b/src/include/OSL/batched_texture.h
@@ -49,6 +49,9 @@ static_assert(std::alignment_of<VaryingTextureOptions<16>>::value
 static_assert(std::alignment_of<VaryingTextureOptions<8>>::value
                   == VecReg<8>::alignment,
               "Expect alignment of data member to set alignment of struct");
+static_assert(std::alignment_of<VaryingTextureOptions<4>>::value
+                  == VecReg<4>::alignment,
+              "Expect alignment of data member to set alignment of struct");
 
 template<int WidthT> struct BatchedTextureOptions {
     VaryingTextureOptions<WidthT> varying;
@@ -90,11 +93,15 @@ static_assert(std::alignment_of<BatchedTextureOptions<16>>::value
 static_assert(std::alignment_of<BatchedTextureOptions<8>>::value
                   == VecReg<8>::alignment,
               "Expect alignment of data member to set alignment of struct");
+static_assert(std::alignment_of<BatchedTextureOptions<4>>::value
+                  == VecReg<4>::alignment,
+              "Expect alignment of data member to set alignment of struct");
 
 #ifdef OIIO_TEXTURE_SIMD_BATCH_WIDTH
 // Code here is to validate our OSL BatchedTextureOptions<WidthT> is binary compatible
 // and safe to reinterpret_cast<TextureOptBatch*>
-static_assert((OIIO::Tex::BatchWidth == 16) || (OIIO::Tex::BatchWidth == 8),
-              "This validation requires OIIO_TEXTURE_SIMD_BATCH_WIDTH=16");
+static_assert((OIIO::Tex::BatchWidth == 16) || (OIIO::Tex::BatchWidth == 8)
+                  || (OIIO::Tex::BatchWidth == 4),
+              "OIIO_TEXTURE_SIMD_BATCH_WIDTH must be 4, 8, or 16");
 
 namespace validate_offsets {
diff --git a/src/include/OSL/llvm_util.h b/src/include/OSL/llvm_util.h
index 7f112ccf52..49df628917 100644
--- a/src/include/OSL/llvm_util.h
+++ b/src/include/OSL/llvm_util.h
@@ -693,6 +693,8 @@ class OSLEXECPUBLIC LLVM_Util {
     llvm::Constant* constant(uint32_t i);
 
     /// Return an llvm::Constant holding the given integer constant.
+    llvm::Constant* constant4(int8_t i);
+    llvm::Constant* constant4(uint8_t i);
     llvm::Constant* constant8(int8_t i);
     llvm::Constant* constant8(uint8_t i);
     llvm::Constant* constant16(int16_t i);
@@ -1229,6 +1231,7 @@ class OSLEXECPUBLIC LLVM_Util {
 
     llvm::Value* op_linearize_16x_indices(llvm::Value* wide_index);
     llvm::Value* op_linearize_8x_indices(llvm::Value* wide_index);
+    llvm::Value* op_linearize_4x_indices(llvm::Value* wide_index);
     std::array<llvm::Value*, 2> op_split_16x(llvm::Value* vector_val);
     std::array<llvm::Value*, 2> op_split_8x(llvm::Value* vector_val);
     std::array<llvm::Value*, 4> op_quarter_16x(llvm::Value* vector_val);
diff --git a/src/include/OSL/rendererservices.h b/src/include/OSL/rendererservices.h
index 04e5269ae0..62a6b61793 100644
--- a/src/include/OSL/rendererservices.h
+++ b/src/include/OSL/rendererservices.h
@@ -601,6 +601,7 @@ class OSLEXECPUBLIC RendererServices {
     /// Unless overridden, a nullptr is returned.
     virtual BatchedRendererServices<16>* batched(WidthOf<16>);
     virtual BatchedRendererServices<8>* batched(WidthOf<8>);
+    virtual BatchedRendererServices<4>* batched(WidthOf<4>);
 
 protected:
     TextureSystem* m_texturesys;  // A place to hold a TextureSystem
diff --git a/src/liboslexec/CMakeLists.txt b/src/liboslexec/CMakeLists.txt
index 328565af68..5f2bd048a4 100644
--- a/src/liboslexec/CMakeLists.txt
+++ b/src/liboslexec/CMakeLists.txt
@@ -380,6 +380,8 @@ foreach(batched_target ${BATCHED_TARGET_LIST})
             list (APPEND TARGET_CXX_OPTS "-march=core-avx2")
         elseif (${TARGET_OPT_ISA} STREQUAL "AVX")
             list (APPEND TARGET_CXX_OPTS "-march=corei7-avx")
+        elseif (${TARGET_OPT_ISA} STREQUAL "SSE2")
+            list (APPEND TARGET_CXX_OPTS "-march=core2")
         else ()
             message (FATAL_ERROR "Unknown ISA=${TARGET_OPT_ISA} extract from USE_BATCHED entry ${batched_target}")
         endif ()
@@ -455,6 +457,8 @@ foreach(batched_target ${BATCHED_TARGET_LIST})
             list (APPEND TARGET_CXX_OPTS "-march=haswell")
         elseif (${TARGET_OPT_ISA} STREQUAL "AVX")
             list (APPEND TARGET_CXX_OPTS "-march=sandybridge")
+        elseif (${TARGET_OPT_ISA} STREQUAL "SSE2")
+            list (APPEND TARGET_CXX_OPTS "-march=core2")
         else ()
             message (FATAL_ERROR "Unknown ISA=${TARGET_OPT_ISA} extract from USE_BATCHED entry ${batched_target}")
         endif ()
diff --git a/src/liboslexec/batched_analysis.cpp b/src/liboslexec/batched_analysis.cpp
index 888f198741..9f76c1acf6 100644
--- a/src/liboslexec/batched_analysis.cpp
+++ b/src/liboslexec/batched_analysis.cpp
@@ -1813,10 +1813,16 @@ struct Analyzer {
                     // specific BatchedRendererServices.
                     // Right here we don't know which width will be used,
                     // so we will just require all widths provide the same answer
+                    auto rs4  = m_ba.renderer()->batched(WidthOf<4>());
                     auto rs8  = m_ba.renderer()->batched(WidthOf<8>());
                     auto rs16 = m_ba.renderer()->batched(WidthOf<16>());
-                    if (rs8 || rs16) {
+                    if (rs4 || rs8 || rs16) {
                         get_attr_is_uniform = true;
+                        if (rs4) {
+                            get_attr_is_uniform
+                                &= rs4->is_attribute_uniform(obj_name,
+                                                             attr_name);
+                        }
                         if (rs8) {
                             get_attr_is_uniform
                                 &= rs8->is_attribute_uniform(obj_name,
diff --git a/src/liboslexec/batched_backendllvm.cpp b/src/liboslexec/batched_backendllvm.cpp
index e94122ef43..79f87ca900 100644
--- a/src/liboslexec/batched_backendllvm.cpp
+++ b/src/liboslexec/batched_backendllvm.cpp
@@ -141,6 +141,7 @@ BatchedBackendLLVM::BatchedBackendLLVM(ShadingSystemImpl& shadingsys,
     switch (vector_width()) {
     case 16: m_true_mask_value = Mask<16>(true).value(); break;
     case 8: m_true_mask_value = Mask<8>(true).value(); break;
+    case 4: m_true_mask_value = Mask<4>(true).value(); break;
     default: OSL_ASSERT(0 && "unsupported vector width");
     }
     ll.dumpasm(shadingsys.m_llvm_dumpasm);
diff --git a/src/liboslexec/batched_llvm_instance.cpp b/src/liboslexec/batched_llvm_instance.cpp
index 8e6ff0a76d..2180637861 100644
--- a/src/liboslexec/batched_llvm_instance.cpp
+++ b/src/liboslexec/batched_llvm_instance.cpp
@@ -537,6 +537,33 @@ const char*
     = "b8_AVX_";
 #endif
 
+#ifdef __OSL_SUPPORTS_b4_SSE2
+template<>
+const NameAndSignature
+    ConcreteTargetLibraryHelper<4, TargetISA::x64>::library_functions[]
+    = {
+#    define DECL_INDIRECT(name, signature) \
+        NameAndSignature { #name, signature },
+#    define DECL(name, signature) DECL_INDIRECT(name, signature)
+#    define __OSL_WIDTH           4
+#    define __OSL_TARGET_ISA      SSE2
+// Don't allow order of xmacro includes be rearranged
+// clang-format off
+#    include "wide/define_opname_macros.h"
+#    include "builtindecl_wide_xmacro.h"
+#    include "wide/undef_opname_macros.h"
+// clang-format on
+#    undef __OSL_TARGET_ISA
+#    undef __OSL_WIDTH
+#    undef DECL
+#    undef DECL_INDIRECT
+      };
+template<>
+const char*
+    ConcreteTargetLibraryHelper<4, TargetISA::x64>::library_selector_string
+    = "b4_SSE2_";
+#endif
+
 
 
 std::unique_ptr<BatchedBackendLLVM::TargetLibraryHelper>
@@ -592,6 +619,17 @@ BatchedBackendLLVM::TargetLibraryHelper::build(ShadingContext* context,
         default: break;
         }
         break;
+    case 4:
+        switch (target_isa) {
+#ifdef __OSL_SUPPORTS_b4_SSE2
+        case TargetISA::x64:
+            return RetType(
+                new ConcreteTargetLibraryHelper<4, TargetISA::x64>());
+#endif
+        default: break;
+        }
+        break;
+
     default: OSL_ASSERT(0 && "unsupported vector width");
     }
     std::cerr << "Build is not configured to support TargetISA of "
@@ -735,6 +773,9 @@ BatchedBackendLLVM::llvm_type_batched_texture_options()
     {
         std::vector<unsigned int> offset_by_index;
         switch (m_width) {
+        case 4:
+            build_offsets_of_BatchedTextureOptions<4>(offset_by_index);
+            break;
         case 8:
             build_offsets_of_BatchedTextureOptions<8>(offset_by_index);
             break;
@@ -2698,6 +2739,9 @@ BatchedBackendLLVM::run()
     {
         std::vector<unsigned int> offset_by_index;
         switch (m_width) {
+        case 4:
+            build_offsets_of_BatchedShaderGlobals<4>(offset_by_index);
+            break;
         case 8:
             build_offsets_of_BatchedShaderGlobals<8>(offset_by_index);
             break;
diff --git a/src/liboslexec/batched_rendservices.cpp b/src/liboslexec/batched_rendservices.cpp
index fbff377b25..1c5fcaa4a6 100644
--- a/src/liboslexec/batched_rendservices.cpp
+++ b/src/liboslexec/batched_rendservices.cpp
@@ -328,5 +328,6 @@ BatchedRendererServices<WidthT>::getmessage(BatchedShaderGlobals* bsg,
 // Explicitly instantiate BatchedRendererServices template
 template class OSLEXECPUBLIC BatchedRendererServices<16>;
 template class OSLEXECPUBLIC BatchedRendererServices<8>;
+template class OSLEXECPUBLIC BatchedRendererServices<4>;
 
 OSL_NAMESPACE_EXIT
diff --git a/src/liboslexec/context.cpp b/src/liboslexec/context.cpp
index a97b427e1b..b001315a8e 100644
--- a/src/liboslexec/context.cpp
+++ b/src/liboslexec/context.cpp
@@ -674,6 +674,7 @@ osl_incr_layers_executed(ShaderGlobals* sg)
 // Explicit template instantiation for supported batch sizes
 template class ShadingContext::Batched<16>;
 template class ShadingContext::Batched<8>;
+template class ShadingContext::Batched<4>;
 #endif
 
 
diff --git a/src/liboslexec/llvm_passes.h b/src/liboslexec/llvm_passes.h
index 852ec82f94..43c7a72894 100644
--- a/src/liboslexec/llvm_passes.h
+++ b/src/liboslexec/llvm_passes.h
@@ -435,6 +435,8 @@ class LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks final
 // including this file will need its own static members defined. LLVM will
 // assign IDs when they get registered, so this initialization value is not
 // important.
+template<> char LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<4>::ID = 0;
+
 template<> char LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<8>::ID = 0;
 
 template<> char LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<16>::ID = 0;
diff --git a/src/liboslexec/llvm_util.cpp b/src/liboslexec/llvm_util.cpp
index 3dd888cab0..2e758ec309 100644
--- a/src/liboslexec/llvm_util.cpp
+++ b/src/liboslexec/llvm_util.cpp
@@ -619,6 +619,12 @@ LLVM_Util::SetupLLVM()
 
 #ifndef OSL_LLVM_NEW_PASS_MANAGER
     // LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks
+    static llvm::RegisterPass<
+        LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<4>>
+        sRegCustomPass2(
+            "PreventBitMasksFromBeingLiveinsToBasicBlocks<4>",
+            "Prevent Bit Masks <4xi1> From Being Liveins To Basic Blocks Pass",
+            false /* Only looks at CFG */, false /* Analysis Pass */);
     static llvm::RegisterPass<
         LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<8>>
         sRegCustomPass0(
@@ -2305,7 +2311,11 @@ LLVM_Util::setup_new_optimization_passes(int optlevel, bool target_host)
                 break;
             }
             case 4:
-                // We don't use masking or SIMD shading for 4-wide
+                // MUST BE THE FINAL PASS!
+                m_new_pass_manager->module_pass_manager.addPass(
+                    createModuleToFunctionPassAdaptor(
+                        NewPreventBitMasksFromBeingLiveinsToBasicBlocks<4>(
+                            context())));
                 break;
             default:
                 std::cout << "m_vector_width = " << m_vector_width << "\n";
@@ -2618,7 +2628,9 @@ LLVM_Util::setup_legacy_optimization_passes(int optlevel, bool target_host)
                     new LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<8>());
                 break;
             case 4:
-                // We don't use masking or SIMD shading for 4-wide
+                // MUST BE THE FINAL PASS!
+                mpm.add(
+                    new LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<4>());
                 break;
             default:
                 std::cout << "m_vector_width = " << m_vector_width << "\n";
@@ -3393,6 +3405,19 @@ LLVM_Util::constant(uint32_t i)
     return llvm::ConstantInt::get(context(), llvm::APInt(32, i));
 }
 
+// 4-bit wide integer constants (signed and unsigned overloads).
+llvm::Constant*
+LLVM_Util::constant4(int8_t i)
+{
+    return llvm::ConstantInt::get(context(), llvm::APInt(4, i, true));
+}
+
+llvm::Constant*
+LLVM_Util::constant4(uint8_t i)
+{
+    return llvm::ConstantInt::get(context(), llvm::APInt(4, i, false));
+}
+
 llvm::Constant*
 LLVM_Util::constant8(int8_t i)
 {
@@ -3592,6 +3617,11 @@ LLVM_Util::mask_as_int(llvm::Value* mask)
             // and all types are happy
             intMaskType = type_int8();
             break;
+        case 4:
+            // We can just reinterpret cast a 4 bit mask to a 8 bit integer
+            // and all types are happy
+            intMaskType = type_int8();
+            break;
         default: OSL_ASSERT(0 && "unsupported native bit mask width");
         };
 
@@ -3659,6 +3689,33 @@ LLVM_Util::mask_as_int(llvm::Value* mask)
             int8_mask = builder().CreateCall(func, toArrayRef(args));
             return int8_mask;
         }
+        case 4: {
+            // Vectorized comparisons for SSE2 end up setting each of the 4
+            // 32 bit integers to 0xFFFFFFFF or 0x00000000, so we need to
+            // do more than a simple cast to an int to collapse the lanes
+            // into a bit mask.
+
+            // Convert <4 x i1> -> <4 x i32>
+            llvm::Value* wide_int_mask = builder().CreateSExt(mask,
+                                                              type_wide_int());
+
+            // Convert <4 x i32> -> <4 x f32>
+            // Now we will use the horizontal sign extraction intrinsic
+            // (SSE movmskps) to build the mask value. That intrinsic
+            // takes a 128bit vector of 4 floats, so we will cast from
+            // int32 to float beforehand
+            llvm::Type* w4_float_type  = llvm_vector_type(m_llvm_type_float, 4);
+            llvm::Value* w4_float_mask = builder().CreateBitCast(wide_int_mask,
+                                                                 w4_float_type);
+
+            llvm::Function* func = llvm::Intrinsic::getDeclaration(
+                module(), llvm::Intrinsic::x86_sse_movmsk_ps);
+
+            llvm::Value* args[1] = { w4_float_mask };
+            llvm::Value* int8_mask;
+            int8_mask = builder().CreateCall(func, toArrayRef(args));
+            return int8_mask;
+        }
         default: {
             OSL_ASSERT(0 && "unsupported native bit mask width");
             return mask;
@@ -3828,14 +3885,19 @@ LLVM_Util::int_as_mask(llvm::Value* value)
             // and all types are happy
             intMaskType = type_int8();
             break;
+        case 4:
+            // We can just reinterpret cast a 8 bit integer to a 4 bit mask
+            // and all types are happy
+            intMaskType = type_int8();
+            break;
         default: OSL_ASSERT(0 && "unsupported native bit mask width");
         };
         llvm::Value* intMask = builder().CreateTrunc(value, intMaskType);
 
         result = builder().CreateBitCast(intMask, type_wide_bool());
     } else {
-        // Since we know vectorized comparisons for AVX&AVX2 end up setting
-        // 8 32 bit integers to 0xFFFFFFFF or 0x00000000, We need to do more
+        // Since we know vectorized comparisons for SSE2&AVX&AVX2 end up setting
+        // 32 bit integers to 0xFFFFFFFF or 0x00000000, We need to do more
         // than a simple cast to an int.
 
         // Broadcast out the int32 mask to all data lanes
@@ -3950,23 +4012,20 @@ LLVM_Util::op_1st_active_lane_of(llvm::Value* mask)
         // and all types are happy
         intMaskType = type_int8();
         break;
-#if 0  // WIP
-        case 4:
-        {
-            // We can just reinterpret cast a 8 bit mask to a 8 bit integer
-            // and all types are happy
-            intMaskType = type_int8();
+    case 4: {
+        // We can just reinterpret cast a 4 bit mask to a 8 bit integer
+        // and all types are happy
+        intMaskType = type_int8();
 
-//            extended_int_vector_type = (llvm::Type *) llvm::VectorType::get(llvm::Type::getInt32Ty (*m_llvm_context), m_vector_width);
-//            llvm::Value * wide_int_mask = builder().CreateSExt(mask, extended_int_vector_type);
-//
-//            int_reinterpret_cast_vector_type = (llvm::Type *) llvm::Type::getInt128Ty (*m_llvm_context);
-//            zeroConstant = constant128(0);
-//
-//            llvm::Value * mask_as_int =  builder().CreateBitCast (wide_int_mask, int_reinterpret_cast_vector_type);
-            break;
-        }
-#endif
+        //            extended_int_vector_type = (llvm::Type *) llvm::VectorType::get(llvm::Type::getInt32Ty (*m_llvm_context), m_vector_width);
+        //            llvm::Value * wide_int_mask = builder().CreateSExt(mask, extended_int_vector_type);
+        //
+        //            int_reinterpret_cast_vector_type = (llvm::Type *) llvm::Type::getInt128Ty (*m_llvm_context);
+        //            zeroConstant = constant128(0);
+        //
+        //            llvm::Value * mask_as_int =  builder().CreateBitCast (wide_int_mask, int_reinterpret_cast_vector_type);
+        break;
+    }
     default: OSL_ASSERT(0 && "unsupported native bit mask width");
     };
 
@@ -4455,6 +4514,19 @@ LLVM_Util::op_linearize_8x_indices(llvm::Value* wide_index)
 }
 
 
+llvm::Value*
+LLVM_Util::op_linearize_4x_indices(llvm::Value* wide_index)
+{
+    // Per lane N (0..3) the result is wide_index[N] * 4 + N: the index is
+    // multiplied by 4 and the lane number is added as a constant offset.
+    llvm::Constant* lane_ids[4] = { constant(0), constant(1), constant(2),
+                                    constant(3) };
+    llvm::Value* lane_offsets   = llvm::ConstantVector::get(
+        llvm::ArrayRef<llvm::Constant*>(&lane_ids[0], 4));
+    return op_add(op_mul(wide_index, wide_constant(4, 4)), lane_offsets);
+}
+
+
 std::array<llvm::Value*, 2>
 LLVM_Util::op_split_16x(llvm::Value* vector_val)
 {
@@ -4613,6 +4685,7 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr,
                     module(), llvm::Intrinsic::x86_avx512_gather_dpi_512);
                 break;
             case 8:
+            case 4:
                 int_mask              = mask_as_int8(current_mask());
                 func_avx512_gather_pi = llvm::Intrinsic::getDeclaration(
                     module(), llvm::Intrinsic::x86_avx512_gather3siv8_si);
@@ -4663,6 +4736,16 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr,
                                            toArrayRef(args));
                 return gather_result;
             }
+            case 4: {
+                // The AVX2 gather intrinsic takes its scale operand as an
+                // i8, so build it with constant8 (constant4 yields an i4).
+                llvm::Value* args[] = { avx2_unmasked_value, void_ptr(src_ptr),
+                                        wide_index, wide_int_mask,
+                                        constant8((uint8_t)4) };
+                return builder().CreateCall(func_avx2_gather_pi,
+                                            toArrayRef(args));
+            }
+
             default: OSL_ASSERT(0 && "unsupported width");
             };
         } else {
@@ -4680,6 +4763,7 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr,
                     module(), llvm::Intrinsic::x86_avx512_gather_dps_512);
                 break;
             case 8:
+            case 4:
                 int_mask              = mask_as_int8(current_mask());
                 func_avx512_gather_ps = llvm::Intrinsic::getDeclaration(
                     module(), llvm::Intrinsic::x86_avx512_gather3siv8_sf);
@@ -4739,6 +4823,17 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr,
                                                            toArrayRef(args));
                 return gather;
             }
+            case 4: {
+                // Scale operand must be an i8: constant8, not constant4.
+                llvm::Value* args[] = {
+                    avx2_unmasked_value, void_ptr(src_ptr), wide_index,
+                    builder().CreateBitCast(wide_int_mask,
+                                            llvm_vector_type(type_float(), 4)),
+                    constant8((uint8_t)4)
+                };
+                return builder().CreateCall(func_avx2_gather_ps,
+                                            toArrayRef(args));
+            }
             }
         } else {
             return clamped_gather_from_uniform(type_wide_float());
@@ -4805,6 +4900,29 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr,
                                                                       gather2),
                                                 type_wide_ustring());
             }
+            case 4: {
+                // Gather 64bit integer, as that is binary compatible with 64bit pointers of ustring
+                // All 4 lanes are fetched by one gather, so (unlike the
+                // split wider case) there is no second gather to set up.
+                llvm::Function* func_avx512_gather_dpq
+                    = llvm::Intrinsic::getDeclaration(
+                        module(), llvm::Intrinsic::x86_avx512_gather3siv4_di);
+                OSL_ASSERT(func_avx512_gather_dpq);
+
+                auto w4_bit_masks   = current_mask();
+                auto w4_int_indices = wide_index;
+
+                llvm::Value* unmasked_value
+                    = builder().CreateVectorSplat(4, constant64((uint64_t)0));
+                llvm::Value* args[]
+                    = { unmasked_value, void_ptr(src_ptr), w4_int_indices,
+                        mask4_as_int8(w4_bit_masks), constant(4) };
+                llvm::Value* gather1
+                    = builder().CreateCall(func_avx512_gather_dpq,
+                                           toArrayRef(args));
+
+                return builder().CreateIntToPtr(gather1, type_wide_ustring());
+            }
             default: OSL_ASSERT(0 && "unsupported native bit mask width");
             }
         } else {
@@ -4841,6 +4959,20 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr,
                 return builder().CreateCall(func_avx512_gather_ps,
                                             toArrayRef(args));
             }
+            case 4: {
+                // 4 float lanes need the 4-lane (siv4) form of the gather
+                llvm::Function* func_avx512_gather_ps
+                    = llvm::Intrinsic::getDeclaration(
+                        module(), llvm::Intrinsic::x86_avx512_gather3siv4_sf);
+                OSL_ASSERT(func_avx512_gather_ps);
+                llvm::Value* unmasked_value = wide_constant(0.0f);
+                llvm::Value* args[] = { unmasked_value, void_ptr(src_ptr),
+                                        op_linearize_4x_indices(wide_index),
+                                        mask_as_int8(current_mask()),
+                                        constant(4) };
+                return builder().CreateCall(func_avx512_gather_ps,
+                                            toArrayRef(args));
+            }
             default: OSL_ASSERT(0 && "unsupported native bit mask width");
             };
 
@@ -4889,6 +5021,19 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr,
                                            toArrayRef(args));
                 return gather_result;
             }
+            case 4: {
+                auto int_indices    = op_linearize_4x_indices(wide_index);
+                llvm::Value* args[] = {
+                    avx2_unmasked_value, void_ptr(src_ptr), int_indices,
+                    builder().CreateBitCast(wide_int_mask,
+                                            llvm_vector_type(type_float(), 4)),
+                    constant8((uint8_t)4)
+                };
+                llvm::Value* gather_result
+                    = builder().CreateCall(func_avx2_gather_ps,
+                                           toArrayRef(args));
+                return gather_result;
+            }
             default:
                 OSL_ASSERT(0 && "unsupported vector width for avx2 gather");
             }
@@ -4926,6 +5071,20 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr,
                 return builder().CreateCall(func_avx512_gather_pi,
                                             toArrayRef(args));
             }
+            case 4: {
+                // 4 int32 lanes need the 4-lane (siv4) form of the gather
+                llvm::Function* func_avx512_gather_pi
+                    = llvm::Intrinsic::getDeclaration(
+                        module(), llvm::Intrinsic::x86_avx512_gather3siv4_si);
+                OSL_ASSERT(func_avx512_gather_pi);
+                llvm::Value* unmasked_value = wide_constant(0);
+                llvm::Value* args[] = { unmasked_value, void_ptr(src_ptr),
+                                        op_linearize_4x_indices(wide_index),
+                                        mask_as_int8(current_mask()),
+                                        constant(4) };
+                return builder().CreateCall(func_avx512_gather_pi,
+                                            toArrayRef(args));
+            }
             default: OSL_ASSERT(0 && "unsupported native bit mask width");
             }
         } else if (m_supports_avx2) {
@@ -4975,6 +5134,26 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr,
                                            toArrayRef(args));
                 return gather_result;
             }
+            case 4: {
+                // 4 int32 lanes use the 128bit form of the AVX2 gather
+                llvm::Function* func_avx2_gather_pi
+                    = llvm::Intrinsic::getDeclaration(
+                        module(), llvm::Intrinsic::x86_avx2_gather_d_d);
+                OSL_ASSERT(func_avx2_gather_pi);
+                llvm::Constant* avx2_unmasked_value = wide_constant(4, 0);
+
+                // Convert <4 x i1> -> <4 x i32>
+                llvm::Value* wide_int_mask
+                    = builder().CreateSExt(current_mask(), type_wide_int());
+                auto int_indices    = op_linearize_4x_indices(wide_index);
+                llvm::Value* args[] = { avx2_unmasked_value, void_ptr(src_ptr),
+                                        int_indices, wide_int_mask,
+                                        constant8((uint8_t)4) };
+                llvm::Value* gather_result
+                    = builder().CreateCall(func_avx2_gather_pi,
+                                           toArrayRef(args));
+                return gather_result;
+            }
             default:
                 OSL_ASSERT(0 && "unsupported vector width for avx2 gather");
             }
@@ -5017,7 +5196,8 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr,
                                                                       gather2),
                                                 type_wide_ustring());
             }
-            case 8: {
+            case 8:
+            case 4: {
                 // Gather 64bit integer, as that is binary compatible with 64bit pointers of ustring
                 llvm::Function* func_avx512_gather_dpq
                     = llvm::Intrinsic::getDeclaration(
@@ -5093,6 +5273,7 @@ LLVM_Util::op_scatter(llvm::Value* wide_val, llvm::Type* src_type,
                 linear_indices = op_linearize_16x_indices(wide_index);
                 break;
             case 8: linear_indices = op_linearize_8x_indices(wide_index); break;
+            case 4: linear_indices = op_linearize_4x_indices(wide_index); break;
             default: OSL_ASSERT(0 && "unsupported vector width for scatter");
             };
         } else {
@@ -5150,6 +5331,7 @@ LLVM_Util::op_scatter(llvm::Value* wide_val, llvm::Type* src_type,
                     module(), llvm::Intrinsic::x86_avx512_scatter_dps_512);
                 break;
             case 8:
+            case 4:
                 int_mask               = mask_as_int8(current_mask());
                 func_avx512_scatter_ps = llvm::Intrinsic::getDeclaration(
                     module(), llvm::Intrinsic::x86_avx512_scattersiv8_sf);
@@ -5182,6 +5364,7 @@ LLVM_Util::op_scatter(llvm::Value* wide_val, llvm::Type* src_type,
                     module(), llvm::Intrinsic::x86_avx512_scatter_dpi_512);
                 break;
             case 8:
+            case 4:
                 int_mask               = mask_as_int8(current_mask());
                 func_avx512_scatter_pi = llvm::Intrinsic::getDeclaration(
                     module(), llvm::Intrinsic::x86_avx512_scattersiv8_si);
@@ -5256,6 +5439,25 @@ LLVM_Util::op_scatter(llvm::Value* wide_val, llvm::Type* src_type,
                 builder().CreateCall(func_avx512_scatter_dpq, toArrayRef(args));
                 return;
             }
+            case 4: {
+                llvm::Value* linear_indices = wide_index;
+
+                llvm::Function* func_avx512_scatter_dpq
+                    = llvm::Intrinsic::getDeclaration(
+                        module(), llvm::Intrinsic::x86_avx512_scatter_dpq_512);
+                OSL_ASSERT(func_avx512_scatter_dpq);
+
+                llvm::Type* wide_address_int_type
+                    = llvm_vector_type(type_addrint(), 4);
+                llvm::Value* address_int_val
+                    = builder().CreatePtrToInt(wide_val, wide_address_int_type);
+
+                llvm::Value* args[]
+                    = { void_ptr(src_ptr), mask_as_int8(current_mask()),
+                        linear_indices, address_int_val, constant(4) };
+                builder().CreateCall(func_avx512_scatter_dpq, toArrayRef(args));
+                return;
+            }
             default:
                 OSL_ASSERT(0 && "incomplete vector width for AVX512 scatter");
             }
@@ -5295,6 +5497,19 @@ LLVM_Util::op_scatter(llvm::Value* wide_val, llvm::Type* src_type,
                 builder().CreateCall(func_avx512_scatter_ps, toArrayRef(args));
                 return;
             }
+            case 4: {
+                llvm::Function* func_avx512_scatter_ps
+                    = llvm::Intrinsic::getDeclaration(
+                        module(), llvm::Intrinsic::x86_avx512_scattersiv8_sf);
+                OSL_ASSERT(func_avx512_scatter_ps);
+
+                llvm::Value* args[] = { void_ptr(src_ptr),
+                                        mask_as_int8(current_mask()),
+                                        op_linearize_4x_indices(wide_index),
+                                        wide_val, constant(4) };
+                builder().CreateCall(func_avx512_scatter_ps, toArrayRef(args));
+                return;
+            }
             default:
                 OSL_ASSERT(0 && "incomplete vector width for AVX512 scatter");
             }
@@ -5338,6 +5553,19 @@ LLVM_Util::op_scatter(llvm::Value* wide_val, llvm::Type* src_type,
                 builder().CreateCall(func_avx512_scatter_pi, toArrayRef(args));
                 return;
             }
+            case 4: {
+                llvm::Function* func_avx512_scatter_pi
+                    = llvm::Intrinsic::getDeclaration(
+                        module(), llvm::Intrinsic::x86_avx512_scattersiv8_si);
+                OSL_ASSERT(func_avx512_scatter_pi);
+
+                llvm::Value* args[] = { void_ptr(src_ptr),
+                                        mask_as_int8(current_mask()),
+                                        op_linearize_4x_indices(wide_index),
+                                        wide_val, constant(4) };
+                builder().CreateCall(func_avx512_scatter_pi, toArrayRef(args));
+                return;
+            }
             default:
                 OSL_ASSERT(0 && "incomplete vector width for AVX512 scatter");
             }
@@ -5407,6 +5635,26 @@ LLVM_Util::op_scatter(llvm::Value* wide_val, llvm::Type* src_type,
                 builder().CreateCall(func_avx512_scatter_dpq, toArrayRef(args));
                 return;
             }
+            case 4: {
+                llvm::Value* linear_indices = op_linearize_8x_indices(
+                    wide_index);
+
+                llvm::Function* func_avx512_scatter_dpq
+                    = llvm::Intrinsic::getDeclaration(
+                        module(), llvm::Intrinsic::x86_avx512_scatter_dpq_512);
+                OSL_ASSERT(func_avx512_scatter_dpq);
+
+                llvm::Type* wide_address_int_type
+                    = llvm_vector_type(type_addrint(), 4);
+                llvm::Value* address_int_val
+                    = builder().CreatePtrToInt(wide_val, wide_address_int_type);
+
+                llvm::Value* args[]
+                    = { void_ptr(src_ptr), mask_as_int8(current_mask()),
+                        linear_indices, address_int_val, constant(4) };
+                builder().CreateCall(func_avx512_scatter_dpq, toArrayRef(args));
+                return;
+            }
             default:
                 OSL_ASSERT(0 && "incomplete vector width for AVX512 scatter");
             }
diff --git a/src/liboslexec/rendservices.cpp b/src/liboslexec/rendservices.cpp
index c0c84b03d6..b3bd5c8989 100644
--- a/src/liboslexec/rendservices.cpp
+++ b/src/liboslexec/rendservices.cpp
@@ -524,4 +524,11 @@ RendererServices::batched(WidthOf<8>)
     return nullptr;
 }
 
+BatchedRendererServices<4>*
+RendererServices::batched(WidthOf<4>)
+{
+    // Base class default: no 4-wide batched services are provided here.
+    return nullptr;
+}
+
 OSL_NAMESPACE_EXIT
diff --git a/src/liboslexec/shadingsys.cpp b/src/liboslexec/shadingsys.cpp
index 307d57355e..620c09cee8 100644
--- a/src/liboslexec/shadingsys.cpp
+++ b/src/liboslexec/shadingsys.cpp
@@ -618,6 +618,29 @@ ShadingSystem::configure_batch_execution_at(int width)
                 m_impl->attribute("llvm_jit_fma", 0);
                 return true;
             }
+#    endif
+            if (target_requested) {
+                break;
+            }
+            // fallthrough
+        default: return false;
+        };
+        return false;
+    case 4:
+        switch (requestedISA) {
+        case TargetISA::UNKNOWN:
+            // fallthrough
+        case TargetISA::x64:
+#    ifdef __OSL_SUPPORTS_b4_SSE2
+            if (LLVM_Util::supports_isa(TargetISA::x64)) {
+                if (!target_requested)
+                    m_impl->attribute("llvm_jit_target",
+                                      LLVM_Util::target_isa_name(
+                                          TargetISA::x64));
+                // SSE2 doesn't support FMA
+                m_impl->attribute("llvm_jit_fma", 0);
+                return true;
+            }
 #    endif
             if (target_requested) {
                 break;
@@ -885,6 +908,7 @@ ShadingSystem::BatchedExecutor<WidthT>::jit_all_groups(int nthreads)
 // Explicitly instantiate
 template class ShadingSystem::BatchedExecutor<16>;
 template class ShadingSystem::BatchedExecutor<8>;
+template class ShadingSystem::BatchedExecutor<4>;
 #endif
 
 
@@ -1079,7 +1103,8 @@ ShadingSystemImpl::ShadingSystemImpl(RendererServices* renderer,
     , m_opt_groupdata(true)
 #if OSL_USE_BATCHED
     , m_opt_batched_analysis((renderer->batched(WidthOf<16>()) != nullptr)
-                             || (renderer->batched(WidthOf<8>()) != nullptr))
+                             || (renderer->batched(WidthOf<8>()) != nullptr)
+                             || (renderer->batched(WidthOf<4>()) != nullptr))
 #else
     , m_opt_batched_analysis(false)
 #endif
@@ -3794,7 +3819,8 @@ ShadingSystemImpl::optimize_group(ShaderGroup& group, ShadingContext* ctx,
         // the batch jit has already happened,
         // as it requires the ops so we can't delete them yet!
         if (((renderer()->batched(WidthOf<16>()) == nullptr)
-             && (renderer()->batched(WidthOf<8>()) == nullptr))
+             && (renderer()->batched(WidthOf<8>()) == nullptr)
+             && (renderer()->batched(WidthOf<4>()) == nullptr))
             || group.batch_jitted()) {
             group_post_jit_cleanup(group);
         }
@@ -4015,6 +4041,7 @@ ShadingSystemImpl::Batched<WidthT>::jit_all_groups(int nthreads, int mythread,
 // machine as well, start with just the batch size
 template class pvt::ShadingSystemImpl::Batched<16>;
 template class pvt::ShadingSystemImpl::Batched<8>;
+template class pvt::ShadingSystemImpl::Batched<4>;
 #endif
 
 int
diff --git a/src/testminimal/CMakeLists.txt b/src/testminimal/CMakeLists.txt
new file mode 100644
index 0000000000..42e6e9d115
--- /dev/null
+++ b/src/testminimal/CMakeLists.txt
@@ -0,0 +1,30 @@
+# Copyright Contributors to the Open Shading Language project.
+# SPDX-License-Identifier: BSD-3-Clause
+# https://github.com/AcademySoftwareFoundation/OpenShadingLanguage
+
+# Build and install the 'testminimal' test driver.
+set (testminimal_srcs testminimal.cpp oslmaterial.cpp)
+
+# Include directories handed to EMBED_LLVM_BITCODE_IN_CPP below.
+set (include_dirs
+     ${CMAKE_CURRENT_SOURCE_DIR}
+     ${CMAKE_SOURCE_DIR}/src/include
+     ${CMAKE_BINARY_DIR}/include
+     ${IMATH_INCLUDES}
+     ${OPENEXR_INCLUDES}
+     ${OpenImageIO_INCLUDES})
+
+# oslmaterial.cpp is also passed to EMBED_LLVM_BITCODE_IN_CPP, together with
+# the name of the source list (testminimal_srcs) and the include directories.
+set (rs_srcs oslmaterial.cpp)
+EMBED_LLVM_BITCODE_IN_CPP ("${rs_srcs}" "_host" "testminimal_llvm_compiled_rs"
+                           testminimal_srcs "-DOSL_HOST_RS_BITCODE=1"
+                           "${include_dirs}")
+
+add_executable (testminimal ${testminimal_srcs})
+
+target_link_libraries (testminimal PRIVATE oslexec oslquery)
+
+install (TARGETS testminimal RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+
+osl_optix_target (testminimal)
diff --git a/src/testminimal/oslmaterial.cpp b/src/testminimal/oslmaterial.cpp
new file mode 100644
index 0000000000..dd509aa6e7
--- /dev/null
+++ b/src/testminimal/oslmaterial.cpp
@@ -0,0 +1,205 @@
+// Copyright Contributors to the Open Shading Language project.
+// SPDX-License-Identifier: BSD-3-Clause
+// https://github.com/AcademySoftwareFoundation/OpenShadingLanguage
+
+
+#include "oslmaterial.h"
+#include <iostream>
+
+using std::cout;
+using std::endl;
+
+#if OSL_USE_BATCHED
+template<int batch_width>
+CustomBatchedRendererServices<batch_width>::CustomBatchedRendererServices(
+    BatchedOSLMaterial<batch_width>& m)
+    : OSL::BatchedRendererServices<batch_width>(m.texturesys()), m_sr(m)
+{
+}
+#endif
+
+OSLMaterial::OSLMaterial() {}
+
+#if OSL_USE_BATCHED
+template<int batch_width>
+BatchedOSLMaterial<batch_width>::BatchedOSLMaterial() : m_batch(*this)
+{
+}
+
+template BatchedOSLMaterial<4>::BatchedOSLMaterial();
+template BatchedOSLMaterial<8>::BatchedOSLMaterial();
+template BatchedOSLMaterial<16>::BatchedOSLMaterial();
+#endif
+
+// Supported closures and parameters
+struct EmptyParams {};  // used for closures registered with no parameters
+
+enum ClosureIDs {
+    EMISSION_ID,    // registered under the name "emission"
+    BACKGROUND_ID,  // registered under the name "background"
+    MICROFACET_ID,  // registered under the name "microfacet"
+};
+
+struct MicrofacetParams {
+    OSL::ustringhash dist;  // compared against the hash of "ggx" when parsed
+    OSL::Vec3 N, U;
+    float xalpha, yalpha, eta;
+    int refract;
+};
+
+void
+register_closures(OSL::ShadingSystem* ss)
+{
+    using namespace OSL;
+
+    // One row per closure: name, id, and the memory layout of its parameters.
+    constexpr int kParamSlots = 32;  // upper bound on the length of a row
+    struct ClosureRow {
+        const char* name;
+        int id;
+        ClosureParam params[kParamSlots];
+    };
+
+    const ClosureRow rows[] = {
+        { "emission", EMISSION_ID, { CLOSURE_FINISH_PARAM(EmptyParams) } },
+        { "background", BACKGROUND_ID, { CLOSURE_FINISH_PARAM(EmptyParams) } },
+        { "microfacet",
+          MICROFACET_ID,
+          { CLOSURE_STRING_PARAM(MicrofacetParams, dist),
+            CLOSURE_VECTOR_PARAM(MicrofacetParams, N),
+            CLOSURE_VECTOR_PARAM(MicrofacetParams, U),
+            CLOSURE_FLOAT_PARAM(MicrofacetParams, xalpha),
+            CLOSURE_FLOAT_PARAM(MicrofacetParams, yalpha),
+            CLOSURE_FLOAT_PARAM(MicrofacetParams, eta),
+            CLOSURE_INT_PARAM(MicrofacetParams, refract),
+            CLOSURE_FINISH_PARAM(MicrofacetParams) } },
+    };
+
+    // Registering a closure enables it when executing or compiling a shader.
+    for (const ClosureRow& row : rows)
+        ss->register_closure(row.name, row.id, row.params, nullptr, nullptr);
+}
+
+// Walk a closure tree depth-first and print, for each leaf component, which
+// closure it is. A microfacet leaf additionally reports whether its `dist`
+// parameter hashes to "ggx".
+void
+process_bsdf_closure(const OSL::ClosureColor* closure)
+{
+    static const ::OSL::ustringhash uh_ggx(OIIO::Strutil::strhash("ggx"));
+    if (closure == nullptr)
+        return;
+
+    // Interior nodes: recurse into the operand(s).
+    if (closure->id == OSL::ClosureColor::MUL) {
+        process_bsdf_closure(closure->as_mul()->closure);
+        return;
+    }
+    if (closure->id == OSL::ClosureColor::ADD) {
+        const auto* sum = closure->as_add();
+        process_bsdf_closure(sum->closureA);
+        process_bsdf_closure(sum->closureB);
+        return;
+    }
+
+    // Every other id is treated as a leaf component.
+    const OSL::ClosureComponent* leaf = closure->as_comp();
+    if (leaf->id == EMISSION_ID) {
+        cout << "parsing emission closure" << endl;
+    } else if (leaf->id == MICROFACET_ID) {
+        cout << "parsing microfacet closure" << endl;
+        const MicrofacetParams* params = leaf->as<MicrofacetParams>();
+        // `dist` is stored as a ustringhash, so compare hash to hash.
+        const bool is_ggx = params->dist.hash() == uh_ggx.hash();
+        cout << (is_ggx ? "uh_ggx" : "uh_beckmann or default") << endl;
+    } else {
+        // BACKGROUND_ID and any other id end up here.
+        OSL_ASSERT(false && "Invalid closure invoked in surface shader");
+    }
+}
+
+// Build a one-layer "surface" group from `shader_name`, execute it once on the
+// synthetic hit from globals_from_hit(), and print the closures found in Ci.
+// `thread_info` is currently unused.
+void
+OSLMaterial::run_test(OSL::ShadingSystem* ss, OSL::PerThreadInfo* thread_info,
+                      OSL::ShadingContext* context, char* shader_name)
+{
+    register_closures(ss);
+
+    // Value-initialize first: globals_from_hit() does not assign every field,
+    // and none of them should reach execute() with an indeterminate value.
+    OSL::ShaderGlobals globals {};
+    globals_from_hit(globals);
+
+    // Create a new shader group and keep it alive in m_shaders. The group is
+    // read back from the slot just appended, so calling run_test() again
+    // does not overwrite a group stored by an earlier call.
+    m_shaders.push_back(ss->ShaderGroupBegin(std::to_string(0)));
+    OSL::ShaderGroupRef group = m_shaders.back();
+
+    // No compilation happens here; `shader_name` is handed to `ss` as is.
+    ss->Shader(*group, "surface", shader_name, "Test");
+    ss->ShaderGroupEnd(*group);
+
+    ss->execute(context, *group, globals);
+    const OSL::ClosureColor* closure = globals.Ci;
+    process_bsdf_closure(closure);
+}
+
+#if OSL_USE_BATCHED
+// Batched counterpart of OSLMaterial::run_test(): builds a one-layer "surface"
+// group from `shader_name`, executes it once with all `batch_width` lanes, and
+// prints the closures found in lane 0 of Ci. `thread_info` is currently unused.
+template<int batch_width>
+void
+BatchedOSLMaterial<batch_width>::run_test(OSL::ShadingSystem* ss,
+                                          OSL::PerThreadInfo* thread_info,
+                                          OSL::ShadingContext* context,
+                                          char* shader_name)
+{
+    register_closures(ss);
+    OSL::BatchedShaderGlobals<batch_width> batched_globals;
+    m_batch.globals_from_hit(batched_globals);
+
+    // Create a new shader group and keep it alive in m_shaders. The group is
+    // read back from the slot just appended, so calling run_test() again
+    // does not overwrite a group stored by an earlier call.
+    m_shaders.push_back(ss->ShaderGroupBegin(std::to_string(0)));
+    OSL::ShaderGroupRef group = m_shaders.back();
+
+    // No compilation happens here; `shader_name` is handed to `ss` as is.
+    ss->Shader(*group, "surface", shader_name, "Test");
+    ss->ShaderGroupEnd(*group);
+
+    // Give every lane a defined shade index (the block starts uninitialized).
+    OSL::Block<int, batch_width> wide_shadeindex_block;
+    for (int lane = 0; lane < batch_width; ++lane)
+        wide_shadeindex_block[lane] = lane;
+
+    // Run the shader that was just created; no userdata or output buffers.
+    char* userdata_base_ptr = NULL;
+    char* output_base_ptr   = NULL;
+    ss->batched<batch_width>().execute(*context, *group, batch_width,
+                                       wide_shadeindex_block, batched_globals,
+                                       userdata_base_ptr, output_base_ptr);
+    const OSL::ClosureColor* closure = batched_globals.varying.Ci[0];
+    process_bsdf_closure(closure);
+}
+
+template void
+BatchedOSLMaterial<4>::run_test(OSL::ShadingSystem* ss,
+                                OSL::PerThreadInfo* thread_info,
+                                OSL::ShadingContext* context,
+                                char* shader_name);
+template void
+BatchedOSLMaterial<8>::run_test(OSL::ShadingSystem* ss,
+                                OSL::PerThreadInfo* thread_info,
+                                OSL::ShadingContext* context,
+                                char* shader_name);
+template void
+BatchedOSLMaterial<16>::run_test(OSL::ShadingSystem* ss,
+                                 OSL::PerThreadInfo* thread_info,
+                                 OSL::ShadingContext* context,
+                                 char* shader_name);
+#endif
diff --git a/src/testminimal/oslmaterial.h b/src/testminimal/oslmaterial.h
new file mode 100644
index 0000000000..12e49fca33
--- /dev/null
+++ b/src/testminimal/oslmaterial.h
@@ -0,0 +1,235 @@
+// Copyright Contributors to the Open Shading Language project.
+// SPDX-License-Identifier: BSD-3-Clause
+// https://github.com/AcademySoftwareFoundation/OpenShadingLanguage
+
+
+#pragma once
+#include <OSL/dual_vec.h>
+#include <OSL/genclosure.h>
+#include <OSL/oslclosure.h>
+#include <OSL/oslcomp.h>
+#include <OSL/oslconfig.h>
+#include <OSL/oslexec.h>
+#include <OSL/rendererservices.h>
+
+#if OSL_USE_BATCHED
+#    include <OSL/batched_rendererservices.h>
+#    include <OSL/batched_shaderglobals.h>
+#endif
+
+class OSLMaterial;
+
+#if OSL_USE_BATCHED
+template<int batch_width> class BatchedOSLMaterial;
+
+using OSL::Vec3;
+
+/// Custom BatchedRendererServices
+template<int batch_width>
+class CustomBatchedRendererServices
+    : public OSL::BatchedRendererServices<batch_width> {
+public:
+    explicit CustomBatchedRendererServices(BatchedOSLMaterial<batch_width>& m);
+
+    /// Fill `bsg` with the fixed, synthetic hit point this test shades.
+    ///
+    /// The uniform block is zeroed and its raytype set to 1. Every lane of
+    /// the varying block receives the same values: P at the origin, I along
+    /// -Z, N and Ng along +Z, u = v = 0.5, dPdu/dPdv along +X/+Y, unit
+    /// surfacearea, a null Ci, and zero for the x/y differentials and time.
+    ///
+    /// `bsg` is only written, never read; the caller hands it to the batched
+    /// execute() call afterwards.
+    void globals_from_hit(OSL::BatchedShaderGlobals<batch_width>& bsg)
+    {
+        // Uniform
+        auto& usg = bsg.uniform;
+        // Zero it all
+        std::memset(&usg, 0, sizeof(OSL::UniformShaderGlobals));
+        usg.raytype = 1;  // 1 stands for camera ray?
+
+        // Varying: each assign_all() below writes one value to every lane.
+        auto& vsg = bsg.varying;
+
+        // NOTE(review): vsg.shader2common and vsg.object2common are not
+        // assigned here -- confirm that nothing this test runs reads them.
+
+        // Surface position and its derivatives. dPdx and dPdy are zeroed
+        // explicitly so that no lane is left with indeterminate values.
+        assign_all(vsg.P, Vec3(0.0f, 0.0f, 0.0f));
+        assign_all(vsg.dPdx, Vec3(0.0f, 0.0f, 0.0f));
+        assign_all(vsg.dPdy, Vec3(0.0f, 0.0f, 0.0f));
+        assign_all(vsg.dPdz,
+                   Vec3(0.0f, 0.0f, 0.0f));  // just use 0 for volume tangent
+
+        // Incident ray travels along -Z. It is assigned exactly once, so the
+        // direction is not overwritten by a later zero assignment.
+        assign_all(vsg.I, Vec3(0.0f, 0.0f, -1.0f));
+        assign_all(vsg.dIdx, Vec3(0, 0, 0));
+        assign_all(vsg.dIdy, Vec3(0, 0, 0));
+
+        // Shading normal and true geometric normal both point along +Z.
+        assign_all(vsg.N, Vec3(0.0f, 0.0f, 1.0f));
+        assign_all(vsg.Ng, Vec3(0.0f, 0.0f, 1.0f));
+
+        // 2D surface parameter u, and its differentials.
+        assign_all(vsg.u, 0.5f);
+        assign_all(vsg.dudx, 0.0f);
+        assign_all(vsg.dudy, 0.0f);
+
+        // 2D surface parameter v, and its differentials.
+        assign_all(vsg.v, 0.5f);
+        assign_all(vsg.dvdx, 0.0f);
+        assign_all(vsg.dvdy, 0.0f);
+
+        // Tangents of P with respect to surface u,v
+        assign_all(vsg.dPdu, Vec3(1.0f, 0.0f, 0.0f));
+        assign_all(vsg.dPdv, Vec3(0.0f, 1.0f, 0.0f));
+
+        // Time, and the derivative of P with respect to time.
+        // NOTE(review): dtime is 0 here but 0.001f in
+        // OSLMaterial::globals_from_hit() -- confirm the difference is intended.
+        assign_all(vsg.time, 0.0f);
+        assign_all(vsg.dtime, 0.0f);
+        assign_all(vsg.dPdtime, Vec3(0, 0, 0));
+
+        // Ps and its derivatives.
+        assign_all(vsg.Ps, Vec3(0, 0, 0));
+        assign_all(vsg.dPsdx, Vec3(0, 0, 0));
+        assign_all(vsg.dPsdy, Vec3(0, 0, 0));
+
+        // Unit surface area, handedness not flipped, front-facing hit.
+        assign_all(vsg.surfacearea, 1.0f);
+        assign_all(vsg.flipHandedness, 0);
+        assign_all(vsg.backfacing, 0);
+
+        // Output closure starts out null in every lane.
+        assign_all(vsg.Ci, (::OSL::ClosureColor*)NULL);
+    }
+
+    // None of the optional batched services are overridden, so each query
+    // below reports false.
+    bool is_overridden_get_inverse_matrix_WmWxWf() const override
+    {
+        return false;
+    }
+    bool is_overridden_get_matrix_WmWsWf() const override { return false; }
+    bool is_overridden_get_inverse_matrix_WmsWf() const override
+    {
+        return false;
+    }
+    bool is_overridden_get_inverse_matrix_WmWsWf() const override
+    {
+        return false;
+    }
+    bool is_overridden_texture() const override { return false; }
+    bool is_overridden_texture3d() const override { return false; }
+    bool is_overridden_environment() const override { return false; }
+    bool is_overridden_pointcloud_search() const override { return false; }
+    bool is_overridden_pointcloud_get() const override { return false; }
+    bool is_overridden_pointcloud_write() const override { return false; }
+
+    BatchedOSLMaterial<batch_width>& m_sr;
+};
+#endif
+
+/// Custom RendererServices for non-batched case
+class OSLMaterial : public OSL::RendererServices {
+public:
+    OSLMaterial();
+
+    /// Shade `shader_name` once and print the closures it produces.
+    void run_test(OSL::ShadingSystem* ss, OSL::PerThreadInfo* thread_info,
+                  OSL::ShadingContext* context, char* shader_name);
+
+    /// Never null: m_errhandler is created together with the object.
+    OIIO::ErrorHandler& errhandler() const { return *m_errhandler; }
+
+    /// Turn information at hitpoint into ShaderGlobals for OSL
+    void globals_from_hit(OSL::ShaderGlobals& sg)
+    {
+        sg.P    = { 0.0f, 0.0f, 0.0f };  // surface pos
+        sg.dPdx = { 0.0f, 0.0f, 0.0f };
+        sg.dPdy = { 0.0f, 0.0f, 0.0f };
+        sg.dPdz = { 0.0f, 0.0f, 0.0f };  // for volume shading only
+
+        sg.I    = { 0.0f, 0.0f, -1.0f };  // incident ray
+        sg.dIdx = { 0.0f, 0.0f, 0.0f };
+        sg.dIdy = { 0.0f, 0.0f, 0.0f };
+
+        sg.N  = { 0.0f, 0.0f, 1.0f };  // shading normal
+        sg.Ng = { 0.0f, 0.0f, 1.0f };  // true geometric normal
+
+        sg.u    = 0.5f;  // 2D surface parameter u, and its differentials.
+        sg.dudx = 0.0f;
+        sg.dudy = 0.0f;
+        sg.v    = 0.5f;  // 2D surface parameter v, and its differentials.
+        sg.dvdx = 0.0f;
+        sg.dvdy = 0.0f;
+
+        // Surface tangents: derivative of P with respect to surface u and v.
+        sg.dPdu = { 1.0f, 0.0f, 0.0f };
+        sg.dPdv = { 0.0f, 1.0f, 0.0f };
+
+        sg.time  = 0.0f;
+        sg.dtime = 0.001f;
+
+        // Velocity vector: derivative of position P with respect to time.
+        sg.dPdtime = { 0.0f, 0.0f, 0.0f };
+
+        // For lights or light attenuation shaders: the point being illuminated (???)
+        sg.Ps    = { 0.0f, 0.0f, 0.0f };
+        sg.dPsdx = { 0.0f, 0.0f, 0.0f };
+        sg.dPsdy = { 0.0f, 0.0f, 0.0f };
+
+        // Renderer user pointers, and the renderer itself
+        sg.renderstate = NULL;
+        sg.tracedata   = NULL;
+        sg.objdata     = NULL;
+        sg.renderer    = this;
+
+        sg.raytype        = 1;  // 1 stands for camera ray?
+        sg.flipHandedness = 0;
+        sg.backfacing     = 0;
+        // output closure, needs to be null initialized
+        sg.Ci = NULL;
+    }
+
+    // ShaderGroupRef storage
+    std::vector<OSL::ShaderGroupRef>& shaders() { return m_shaders; }
+    std::vector<OSL::ShaderGroupRef> m_shaders;
+
+private:
+    std::unique_ptr<OIIO::ErrorHandler> m_errhandler { new OIIO::ErrorHandler };
+};
+
+#if OSL_USE_BATCHED
+
+/// Custom RendererServices for batched case
+template<int batch_width>
+class BatchedOSLMaterial : public OSL::RendererServices {
+public:
+    BatchedOSLMaterial();
+
+    void run_test(OSL::ShadingSystem* ss, OSL::PerThreadInfo* thread_info,
+                  OSL::ShadingContext* context, char* shader_name);
+
+    /// Never null: m_errhandler is created together with the object.
+    OIIO::ErrorHandler& errhandler() const { return *m_errhandler; }
+
+    // ShaderGroupRef storage
+    std::vector<OSL::ShaderGroupRef>& shaders() { return m_shaders; }
+    std::vector<OSL::ShaderGroupRef> m_shaders;
+
+    OSL::BatchedRendererServices<batch_width>*
+    batched(OSL::WidthOf<batch_width>) override
+    {
+        return &m_batch;
+    }
+
+    CustomBatchedRendererServices<batch_width> m_batch;
+private:
+    std::unique_ptr<OIIO::ErrorHandler> m_errhandler { new OIIO::ErrorHandler };
+};
+
+#endif
diff --git a/src/testminimal/testminimal.cpp b/src/testminimal/testminimal.cpp
new file mode 100644
index 0000000000..6f9a5fb0c6
--- /dev/null
+++ b/src/testminimal/testminimal.cpp
@@ -0,0 +1,139 @@
+// Copyright Contributors to the Open Shading Language project.
+// SPDX-License-Identifier: BSD-3-Clause
+// https://github.com/AcademySoftwareFoundation/OpenShadingLanguage
+
+
+#include <OpenImageIO/filesystem.h>
+#include <OpenImageIO/sysutil.h>
+#include <iostream>
+#include "oslmaterial.h"
+
+using namespace OSL;
+
+// Usage: testminimal shader_name [batch_width]
+// Shades one shader through the scalar path (width 1) or a batched path
+// (4/8/16); with no width given, the widest width `ss` accepts is used.
+int
+main(int argc, char** argv)
+{
+    if (argc < 2) {
+        std::cout
+            << "usage: shader_name(without .osl) [+optional] batch_width (0/4/8/16)"
+            << std::endl;
+        return 0;
+    }
+    char* shader_name = argv[1];
+    int batch_width   = -1;  // -1: probe for the widest usable width below
+    if (argc >= 3) {
+        // Any value other than 4, 8 or 16 selects the scalar path.
+        batch_width = std::max(atoi(argv[2]), 1);
+        if (batch_width != 4 && batch_width != 8 && batch_width != 16)
+            batch_width = 1;
+    }
+
+    OSLMaterial* oslmat = NULL;
+#if OSL_USE_BATCHED
+    BatchedOSLMaterial<4>* boslmat4   = NULL;
+    BatchedOSLMaterial<8>* boslmat8   = NULL;
+    BatchedOSLMaterial<16>* boslmat16 = NULL;
+#endif
+
+    TextureSystem* texturesys = TextureSystem::create();
+    ShadingSystem* ss         = NULL;
+
+    if (batch_width == -1) {
+#if OSL_USE_BATCHED
+        oslmat = new OSLMaterial();
+        ss     = new ShadingSystem(oslmat, NULL, &oslmat->errhandler());
+        if (ss->configure_batch_execution_at(16))
+            batch_width = 16;
+        else if (ss->configure_batch_execution_at(8))
+            batch_width = 8;
+        else if (ss->configure_batch_execution_at(4))
+            batch_width = 4;
+        else
+            batch_width = 1;
+        delete oslmat;
+        oslmat = NULL;
+        delete ss;
+        ss = NULL;
+#else
+        batch_width = 1;
+#endif
+    }
+
+    switch (batch_width) {
+#if OSL_USE_BATCHED
+    case 4:
+        boslmat4 = new BatchedOSLMaterial<4>();
+        ss = new ShadingSystem(boslmat4, texturesys, &boslmat4->errhandler());
+        break;
+    case 8:
+        boslmat8 = new BatchedOSLMaterial<8>();
+        ss = new ShadingSystem(boslmat8, texturesys, &boslmat8->errhandler());
+        break;
+    case 16:
+        boslmat16 = new BatchedOSLMaterial<16>();
+        ss = new ShadingSystem(boslmat16, texturesys, &boslmat16->errhandler());
+        break;
+#endif
+    default:
+        // Scalar path. It is also the fallback when a batch width was asked
+        // for but batched support is compiled out, so `ss` is never null.
+        batch_width = 1;
+        oslmat      = new OSLMaterial();
+        ss = new ShadingSystem(oslmat, texturesys, &oslmat->errhandler());
+        break;
+    }
+
+#if OSL_USE_BATCHED
+    if (batch_width > 1) {
+        ss->configure_batch_execution_at(batch_width);
+
+        // build searchpath for ISA specific OSL shared libraries based on expected
+        // location of library directories relative to the executables path.
+        static const char* relative_lib_dirs[] =
+#    if (defined(_WIN32) || defined(_WIN64))
+            { "\\..\\lib64", "\\..\\lib" };
+#    else
+            { "/../lib64", "/../lib" };
+#    endif
+        auto executable_directory = OIIO::Filesystem::parent_path(
+            OIIO::Sysutil::this_program_path());
+        int dirNum = 0;
+        std::string librarypath;
+        for (const char* relative_lib_dir : relative_lib_dirs) {
+            if (dirNum++ > 0)
+                librarypath += ":";
+            librarypath += executable_directory + relative_lib_dir;
+        }
+        ss->attribute("searchpath:library", librarypath);
+    }
+#endif
+
+    PerThreadInfo* thread_info = ss->create_thread_info();
+    ShadingContext* context    = ss->get_context(thread_info);
+
+    switch (batch_width) {
+    case 1: oslmat->run_test(ss, thread_info, context, shader_name); break;
+#if OSL_USE_BATCHED
+    case 4: boslmat4->run_test(ss, thread_info, context, shader_name); break;
+    case 8: boslmat8->run_test(ss, thread_info, context, shader_name); break;
+    case 16: boslmat16->run_test(ss, thread_info, context, shader_name); break;
+#endif
+    }
+
+    ss->release_context(context);
+    ss->destroy_thread_info(thread_info);
+
+    // The materials hold the ShaderGroupRefs; they are deleted before `ss`.
+    delete oslmat;
+#if OSL_USE_BATCHED
+    delete boslmat4;
+    delete boslmat8;
+    delete boslmat16;
+#endif
+    delete ss;
+
+    return 0;
+}
diff --git a/src/testshade/batched_simplerend.cpp b/src/testshade/batched_simplerend.cpp
index 937655af4d..ea2acbdf97 100644
--- a/src/testshade/batched_simplerend.cpp
+++ b/src/testshade/batched_simplerend.cpp
@@ -1001,6 +1001,7 @@ BatchedSimpleRenderer<WidthT>::get_camera_screen_window(ustringhash /*object*/,
 // Explicitly instantiate BatchedSimpleRenderer template
 template class BatchedSimpleRenderer<16>;
 template class BatchedSimpleRenderer<8>;
+template class BatchedSimpleRenderer<4>;
 
 
 OSL_NAMESPACE_EXIT
diff --git a/src/testshade/simplerend.cpp b/src/testshade/simplerend.cpp
index 65862c2dba..3582c9cc48 100644
--- a/src/testshade/simplerend.cpp
+++ b/src/testshade/simplerend.cpp
@@ -218,7 +218,9 @@ register_closures(OSL::ShadingSystem* shadingsys)
 
 SimpleRenderer::SimpleRenderer()
 #if OSL_USE_BATCHED
-    : m_batch_16_simple_renderer(*this), m_batch_8_simple_renderer(*this)
+    : m_batch_16_simple_renderer(*this)
+    , m_batch_8_simple_renderer(*this)
+    , m_batch_4_simple_renderer(*this)
 #endif
 {
     Matrix44 M;
diff --git a/src/testshade/simplerend.h b/src/testshade/simplerend.h
index 87d0b96dda..8ebe1c1fc4 100644
--- a/src/testshade/simplerend.h
+++ b/src/testshade/simplerend.h
@@ -177,12 +177,17 @@ class SimpleRenderer : public RendererServices {
     {
         return &m_batch_8_simple_renderer;
     }
+    BatchedRendererServices<4>* batched(WidthOf<4>) override
+    {
+        return &m_batch_4_simple_renderer;
+    }
 #endif
 
 protected:
 #if OSL_USE_BATCHED
     BatchedSimpleRenderer<16> m_batch_16_simple_renderer;
     BatchedSimpleRenderer<8> m_batch_8_simple_renderer;
+    BatchedSimpleRenderer<4> m_batch_4_simple_renderer;
 #endif
 
     // Camera parameters
diff --git a/src/testshade/testshade.cpp b/src/testshade/testshade.cpp
index 39834f6380..bd80a0f415 100644
--- a/src/testshade/testshade.cpp
+++ b/src/testshade/testshade.cpp
@@ -306,6 +306,9 @@ set_shadingsys_options()
         } else if ((!batch_size_requested || batch_size == 8)
                    && shadingsys->configure_batch_execution_at(8)) {
             batch_size = 8;
+        } else if ((!batch_size_requested || batch_size == 4)
+                   && shadingsys->configure_batch_execution_at(4)) {
+            batch_size = 4;
         } else {
             OSL::print(
                 "WARNING:  Hardware or library requirements to utilize batched execution");
@@ -1194,9 +1197,11 @@ setup_output_images(SimpleRenderer* rend, ShadingSystem* shadingsys,
             // jit_group will optimize the group if necesssary
             if (batch_size == 16) {
                 shadingsys->batched<16>().jit_group(shadergroup.get(), ctx);
-            } else {
-                ASSERT((batch_size == 8) && "Unsupported batch size");
+            } else if (batch_size == 8) {
                 shadingsys->batched<8>().jit_group(shadergroup.get(), ctx);
+            } else {
+                ASSERT((batch_size == 4) && "Unsupported batch size");
+                shadingsys->batched<4>().jit_group(shadergroup.get(), ctx);
             }
         } else
 #endif
@@ -2195,13 +2200,19 @@ test_shade(int argc, const char* argv[])
                             batched_shade_region<16>(rend, shadergroup.get(),
                                                      sub_roi, save);
                         });
-                } else {
-                    ASSERT((batch_size == 8) && "Unsupported batch size");
+                } else if (batch_size == 8) {
                     OIIO::ImageBufAlgo::parallel_image(
                         roi, num_threads, [&](OIIO::ROI sub_roi) -> void {
                             batched_shade_region<8>(rend, shadergroup.get(),
                                                     sub_roi, save);
                         });
+                } else {
+                    ASSERT((batch_size == 4) && "Unsupported batch size");
+                    OIIO::ImageBufAlgo::parallel_image(
+                        roi, num_threads, [&](OIIO::ROI sub_roi) -> void {
+                            batched_shade_region<4>(rend, shadergroup.get(),
+                                                    sub_roi, save);
+                        });
                 }
             } else
 #    endif
diff --git a/testsuite/closure-string/BATCHED b/testsuite/closure-string/BATCHED
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/testsuite/closure-string/ref/out.txt b/testsuite/closure-string/ref/out.txt
new file mode 100644
index 0000000000..d497c44d99
--- /dev/null
+++ b/testsuite/closure-string/ref/out.txt
@@ -0,0 +1,3 @@
+Compiled test.osl -> test.oso
+parsing microfacet closure
+uh_ggx
diff --git a/testsuite/closure-string/run.py b/testsuite/closure-string/run.py
new file mode 100755
index 0000000000..5e688f5cb9
--- /dev/null
+++ b/testsuite/closure-string/run.py
@@ -0,0 +1,7 @@
+#!/usr/bin/env python
+
+# Copyright Contributors to the Open Shading Language project.
+# SPDX-License-Identifier: BSD-3-Clause
+# https://github.com/AcademySoftwareFoundation/OpenShadingLanguage
+
+command = testminimal("test")
diff --git a/testsuite/closure-string/test.osl b/testsuite/closure-string/test.osl
new file mode 100644
index 0000000000..4071f5d9e4
--- /dev/null
+++ b/testsuite/closure-string/test.osl
@@ -0,0 +1,4 @@
+shader test(string distribution = "ggx")
+{
+    Ci = microfacet(distribution, N, N, 0.1, 0.1, 0.0, 0);  // string parameter forwarded as the closure's first argument
+}
diff --git a/testsuite/example-batched-deformer/oslbatcheddeformer.cpp b/testsuite/example-batched-deformer/oslbatcheddeformer.cpp
index 0b7af16e40..fe620fefab 100644
--- a/testsuite/example-batched-deformer/oslbatcheddeformer.cpp
+++ b/testsuite/example-batched-deformer/oslbatcheddeformer.cpp
@@ -182,10 +182,15 @@ class MyRendererServices final : public OSL::RendererServices {
     {
         return &m_batch_8_rs;
     }
+    OSL::BatchedRendererServices<4>* batched(OSL::WidthOf<4>) override
+    {
+        return &m_batch_4_rs;  // the MyBatchedRendererServices<4> member
+    }
 
 private:
     MyBatchedRendererServices<16> m_batch_16_rs;
     MyBatchedRendererServices<8> m_batch_8_rs;
+    MyBatchedRendererServices<4> m_batch_4_rs;
 };
 
 
diff --git a/testsuite/runtest.py b/testsuite/runtest.py
index befe278a46..eceec81be2 100755
--- a/testsuite/runtest.py
+++ b/testsuite/runtest.py
@@ -249,6 +249,14 @@ def oiiodiff (fileA, fileB, extraargs="", silent=True, concat=True) :
         command += " ;\n"
     return command
 
+# Build the command string that runs testminimal with the given arguments,
+# appending its output to the file "out.txt".  When the environment variable
+# OSL_TESTMINIMAL_NAME is set, its value (plus a separating space) is used as
+# the executable; otherwise osl_app("testminimal") supplies it.
+def testminimal (args) :
+    override = os.environ.get('OSL_TESTMINIMAL_NAME')
+    launcher = osl_app("testminimal") if override is None else override + " "
+    return (launcher + args + redirect + " ;\n")
 
 # Construct a command that run testshade with the specified arguments,
 # appending output to the file "out.txt".