Skip to content

Commit

Permalink
Add support for b4_SSE2 batched mode (2)
Browse files Browse the repository at this point in the history
  • Loading branch information
johnfea committed Jun 8, 2024
1 parent 0d122e7 commit 52fb623
Show file tree
Hide file tree
Showing 30 changed files with 754 additions and 17 deletions.
11 changes: 11 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,17 @@ jobs:
pybind11_ver: v2.5.0
simd: sse4.2
setenvs: export CONAN_LLVM_VERSION=10.0.1
- desc: gcc9/C++17 llvm11 py3.7 exr2.5 oiio2.4 sse2 batch-b4sse2
nametag: linux-vfx2021
runner: ubuntu-latest
container: aswftesting/ci-osl:2021-clang11
vfxyear: 2021
cxx_std: 17
openimageio_ver: v2.4.13.0
python_ver: 3.7
pybind11_ver: v2.7.0
simd: sse2
batched: b4_SSE2
- desc: gcc9/C++17 llvm11 py3.7 exr2.5 oiio2.3 avx2 batch-b8avx2
nametag: linux-vfx2021
runner: ubuntu-latest
Expand Down
3 changes: 2 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ else ()
endif ()
set (OSL_LIBNAME_SUFFIX "" CACHE STRING
"Optional name appended to ${PROJECT_NAME} libraries that are built")
option (OSL_BUILD_TESTS "Build the unit tests, testshade, testrender" ON)
option (OSL_BUILD_TESTS "Build the unit tests, testminimal, testshade, testrender" ON)
if (WIN32)
option (USE_LLVM_BITCODE "Generate embedded LLVM bitcode" OFF)
else ()
Expand Down Expand Up @@ -220,6 +220,7 @@ add_subdirectory (src/oslc)
add_subdirectory (src/oslinfo)

if (OSL_BUILD_TESTS AND BUILD_TESTING)
add_subdirectory (src/testminimal)
add_subdirectory (src/testshade)
add_subdirectory (src/testrender)
endif ()
Expand Down
2 changes: 1 addition & 1 deletion src/cmake/compiler.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -329,7 +329,7 @@ endif ()
#
# The USE_BATCHED option may be set to indicate that support for batched
# SIMD shader execution should be compiled along with target-specific libraries
set (USE_BATCHED "" CACHE STRING "Build batched SIMD shader execution for (0, b8_AVX, b8_AVX2, b8_AVX2_noFMA, b8_AVX512, b8_AVX512_noFMA, b16_AVX512, b16_AVX512_noFMA)")
set (USE_BATCHED "" CACHE STRING "Build batched SIMD shader execution for (0, b4_SSE2, b8_AVX, b8_AVX2, b8_AVX2_noFMA, b8_AVX512, b8_AVX512_noFMA, b16_AVX512, b16_AVX512_noFMA)")
option (VEC_REPORT "Enable compiler's reporting system for vectorization" OFF)
set (BATCHED_SUPPORT_DEFINES "")
set (BATCHED_TARGET_LIBS "")
Expand Down
2 changes: 1 addition & 1 deletion src/cmake/testing.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,7 @@ macro (osl_add_all_tests)
bug-array-heapoffsets bug-locallifetime bug-outputinit
bug-param-duplicate bug-peep bug-return
calculatenormal-reg
cellnoise closure closure-array closure-layered closure-parameters closure-zero closure-conditional
cellnoise closure closure-array closure-layered closure-parameters closure-string closure-zero closure-conditional
color color-reg colorspace comparison
complement-reg compile-buffer compassign-bool compassign-reg
component-range
Expand Down
8 changes: 7 additions & 1 deletion src/include/OSL/batched_texture.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ static_assert(std::alignment_of<VaryingTextureOptions<16>>::value
static_assert(std::alignment_of<VaryingTextureOptions<8>>::value
== VecReg<8>::alignment,
"Expect alignment of data member to set alignment of struct");
static_assert(std::alignment_of<VaryingTextureOptions<4>>::value
== VecReg<4>::alignment,
"Expect alignment of data member to set alignment of struct");

template<int WidthT> struct BatchedTextureOptions {
VaryingTextureOptions<WidthT> varying;
Expand Down Expand Up @@ -90,11 +93,14 @@ static_assert(std::alignment_of<BatchedTextureOptions<16>>::value
static_assert(std::alignment_of<BatchedTextureOptions<8>>::value
== VecReg<8>::alignment,
"Expect alignment of data member to set alignment of struct");
static_assert(std::alignment_of<BatchedTextureOptions<4>>::value
== VecReg<4>::alignment,
"Expect alignment of data member to set alignment of struct");

#ifdef OIIO_TEXTURE_SIMD_BATCH_WIDTH
// Code here is to validate our OSL BatchedTextureOptions<WidthT> is binary compatible
// and safe to reinterpret_cast<TextureOptBatch*>
static_assert((OIIO::Tex::BatchWidth == 16) || (OIIO::Tex::BatchWidth == 8),
// Only the batch widths listed in the condition (16, 8, 4) are accepted here;
// the message names all of them so a failed build reports the real requirement.
static_assert((OIIO::Tex::BatchWidth == 16) || (OIIO::Tex::BatchWidth == 8) || (OIIO::Tex::BatchWidth == 4),
"This validation requires OIIO_TEXTURE_SIMD_BATCH_WIDTH to be 16, 8, or 4");

namespace validate_offsets {
Expand Down
1 change: 1 addition & 0 deletions src/include/OSL/rendererservices.h
Original file line number Diff line number Diff line change
Expand Up @@ -601,6 +601,7 @@ class OSLEXECPUBLIC RendererServices {
/// Unless overridden, a nullptr is returned.
virtual BatchedRendererServices<16>* batched(WidthOf<16>);
virtual BatchedRendererServices<8>* batched(WidthOf<8>);
virtual BatchedRendererServices<4>* batched(WidthOf<4>);

protected:
TextureSystem* m_texturesys; // A place to hold a TextureSystem
Expand Down
4 changes: 4 additions & 0 deletions src/liboslexec/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -380,6 +380,8 @@ foreach(batched_target ${BATCHED_TARGET_LIST})
list (APPEND TARGET_CXX_OPTS "-march=core-avx2")
elseif (${TARGET_OPT_ISA} STREQUAL "AVX")
list (APPEND TARGET_CXX_OPTS "-march=corei7-avx")
elseif (${TARGET_OPT_ISA} STREQUAL "SSE2")
list (APPEND TARGET_CXX_OPTS "-march=core2")
else ()
message (FATAL_ERROR "Unknown ISA=${TARGET_OPT_ISA} extract from USE_BATCHED entry ${batched_target}")
endif ()
Expand Down Expand Up @@ -455,6 +457,8 @@ foreach(batched_target ${BATCHED_TARGET_LIST})
list (APPEND TARGET_CXX_OPTS "-march=haswell")
elseif (${TARGET_OPT_ISA} STREQUAL "AVX")
list (APPEND TARGET_CXX_OPTS "-march=sandybridge")
elseif (${TARGET_OPT_ISA} STREQUAL "SSE2")
list (APPEND TARGET_CXX_OPTS "-march=core2")
else ()
message (FATAL_ERROR "Unknown ISA=${TARGET_OPT_ISA} extract from USE_BATCHED entry ${batched_target}")
endif ()
Expand Down
8 changes: 7 additions & 1 deletion src/liboslexec/batched_analysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1813,10 +1813,16 @@ struct Analyzer {
// specific BatchedRendererServices.
// Right here we don't know which width will be used,
// so we will just require all widths provide the same answer
auto rs4 = m_ba.renderer()->batched(WidthOf<4>());
auto rs8 = m_ba.renderer()->batched(WidthOf<8>());
auto rs16 = m_ba.renderer()->batched(WidthOf<16>());
if (rs8 || rs16) {
if (rs4 || rs8 || rs16) {
get_attr_is_uniform = true;
if (rs4) {
get_attr_is_uniform
&= rs4->is_attribute_uniform(obj_name,
attr_name);
}
if (rs8) {
get_attr_is_uniform
&= rs8->is_attribute_uniform(obj_name,
Expand Down
1 change: 1 addition & 0 deletions src/liboslexec/batched_backendllvm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,7 @@ BatchedBackendLLVM::BatchedBackendLLVM(ShadingSystemImpl& shadingsys,
switch (vector_width()) {
case 16: m_true_mask_value = Mask<16>(true).value(); break;
case 8: m_true_mask_value = Mask<8>(true).value(); break;
case 4: m_true_mask_value = Mask<4>(true).value(); break;
default: OSL_ASSERT(0 && "unsupported vector width");
}
ll.dumpasm(shadingsys.m_llvm_dumpasm);
Expand Down
44 changes: 44 additions & 0 deletions src/liboslexec/batched_llvm_instance.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -537,6 +537,33 @@ const char*
= "b8_AVX_";
#endif

// Width-4 / TargetISA::x64 specialization of the target library helper's
// static tables.  Only compiled when the build defines __OSL_SUPPORTS_b4_SSE2.
#ifdef __OSL_SUPPORTS_b4_SSE2
template<>
const NameAndSignature
ConcreteTargetLibraryHelper<4, TargetISA::x64>::library_functions[]
= {
// The array contents come from the xmacro includes below: each
// DECL(name, signature) expands to one NameAndSignature { "name", signature }
// initializer.  DECL forwards to DECL_INDIRECT so that its arguments are
// macro-expanded before #name stringifies them (presumably the opname macros
// from wide/define_opname_macros.h -- TODO confirm).
# define DECL_INDIRECT(name, signature) \
NameAndSignature { #name, signature },
# define DECL(name, signature) DECL_INDIRECT(name, signature)
// Width and ISA selected for this expansion; both are #undef'd again after
// the includes so they do not leak into the rest of the file.
# define __OSL_WIDTH 4
# define __OSL_TARGET_ISA SSE2
// Don't allow order of xmacro includes be rearranged
// clang-format off
# include "wide/define_opname_macros.h"
# include "builtindecl_wide_xmacro.h"
# include "wide/undef_opname_macros.h"
// clang-format on
# undef __OSL_TARGET_ISA
# undef __OSL_WIDTH
# undef DECL
# undef DECL_INDIRECT
};
// Selector string associated with this width/ISA specialization.
template<>
const char*
ConcreteTargetLibraryHelper<4, TargetISA::x64>::library_selector_string
= "b4_SSE2_";
#endif



std::unique_ptr<BatchedBackendLLVM::TargetLibraryHelper>
Expand Down Expand Up @@ -592,6 +619,17 @@ BatchedBackendLLVM::TargetLibraryHelper::build(ShadingContext* context,
default: break;
}
break;
case 4:
switch (target_isa) {
#ifdef __OSL_SUPPORTS_b4_SSE2
case TargetISA::x64:
return RetType(
new ConcreteTargetLibraryHelper<4, TargetISA::x64>());
#endif
default: break;
}
break;

default: OSL_ASSERT(0 && "unsupported vector width");
}
std::cerr << "Build is not configured to support TargetISA of "
Expand Down Expand Up @@ -735,6 +773,9 @@ BatchedBackendLLVM::llvm_type_batched_texture_options()
{
std::vector<unsigned int> offset_by_index;
switch (m_width) {
case 4:
build_offsets_of_BatchedTextureOptions<4>(offset_by_index);
break;
case 8:
build_offsets_of_BatchedTextureOptions<8>(offset_by_index);
break;
Expand Down Expand Up @@ -2698,6 +2739,9 @@ BatchedBackendLLVM::run()
{
std::vector<unsigned int> offset_by_index;
switch (m_width) {
case 4:
build_offsets_of_BatchedShaderGlobals<4>(offset_by_index);
break;
case 8:
build_offsets_of_BatchedShaderGlobals<8>(offset_by_index);
break;
Expand Down
1 change: 1 addition & 0 deletions src/liboslexec/batched_rendservices.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -328,5 +328,6 @@ BatchedRendererServices<WidthT>::getmessage(BatchedShaderGlobals* bsg,
// Explicitly instantiate BatchedRendererServices template
template class OSLEXECPUBLIC BatchedRendererServices<16>;
template class OSLEXECPUBLIC BatchedRendererServices<8>;
template class OSLEXECPUBLIC BatchedRendererServices<4>;

OSL_NAMESPACE_EXIT
1 change: 1 addition & 0 deletions src/liboslexec/context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -674,6 +674,7 @@ osl_incr_layers_executed(ShaderGlobals* sg)
// Explicit template instantiation for supported batch sizes
template class ShadingContext::Batched<16>;
template class ShadingContext::Batched<8>;
template class ShadingContext::Batched<4>;
#endif


Expand Down
2 changes: 2 additions & 0 deletions src/liboslexec/llvm_passes.h
Original file line number Diff line number Diff line change
Expand Up @@ -435,6 +435,8 @@ class LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks final
// including this file will need its own static members defined. LLVM will
// assign IDs when they get registered, so this initialization value is not
// important.
template<> char LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<4>::ID = 0;

template<> char LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<8>::ID = 0;

template<> char LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<16>::ID = 0;
Expand Down
27 changes: 22 additions & 5 deletions src/liboslexec/llvm_util.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -619,6 +619,12 @@ LLVM_Util::SetupLLVM()

#ifndef OSL_LLVM_NEW_PASS_MANAGER
// LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks
static llvm::RegisterPass<
LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<4>>
sRegCustomPass2(
"PreventBitMasksFromBeingLiveinsToBasicBlocks<4>",
"Prevent Bit Masks <4xi1> From Being Liveins To Basic Blocks Pass",
false /* Only looks at CFG */, false /* Analysis Pass */);
static llvm::RegisterPass<
LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<8>>
sRegCustomPass0(
Expand Down Expand Up @@ -2305,7 +2311,11 @@ LLVM_Util::setup_new_optimization_passes(int optlevel, bool target_host)
break;
}
case 4:
// We don't use masking or SIMD shading for 4-wide
// MUST BE THE FINAL PASS!
m_new_pass_manager->module_pass_manager.addPass(
createModuleToFunctionPassAdaptor(
NewPreventBitMasksFromBeingLiveinsToBasicBlocks<4>(
context())));
break;
default:
std::cout << "m_vector_width = " << m_vector_width << "\n";
Expand Down Expand Up @@ -2618,7 +2628,9 @@ LLVM_Util::setup_legacy_optimization_passes(int optlevel, bool target_host)
new LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<8>());
break;
case 4:
// We don't use masking or SIMD shading for 4-wide
// MUST BE THE FINAL PASS!
mpm.add(
new LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<4>());
break;
default:
std::cout << "m_vector_width = " << m_vector_width << "\n";
Expand Down Expand Up @@ -3592,6 +3604,11 @@ LLVM_Util::mask_as_int(llvm::Value* mask)
// and all types are happy
intMaskType = type_int8();
break;
case 4:
// We can just reinterpret cast a 4-bit mask to an 8-bit integer
// and all types are happy
intMaskType = type_int8();
break;
default: OSL_ASSERT(0 && "unsupported native bit mask width");
};

Expand Down Expand Up @@ -3950,10 +3967,10 @@ LLVM_Util::op_1st_active_lane_of(llvm::Value* mask)
// and all types are happy
intMaskType = type_int8();
break;
#if 0 // WIP
//#if 0 // WIP
case 4:
{
// We can just reinterpret cast a 8 bit mask to a 8 bit integer
// We can just reinterpret cast a 4-bit mask to an 8-bit integer
// and all types are happy
intMaskType = type_int8();

Expand All @@ -3966,7 +3983,7 @@ LLVM_Util::op_1st_active_lane_of(llvm::Value* mask)
// llvm::Value * mask_as_int = builder().CreateBitCast (wide_int_mask, int_reinterpret_cast_vector_type);
break;
}
#endif
//#endif
default: OSL_ASSERT(0 && "unsupported native bit mask width");
};

Expand Down
7 changes: 7 additions & 0 deletions src/liboslexec/rendservices.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -524,4 +524,11 @@ RendererServices::batched(WidthOf<8>)
return nullptr;
}

// Base-class accessor for 4-wide batched renderer services.  This default
// always returns nullptr; the method is declared virtual, so a renderer can
// override it to supply its own BatchedRendererServices<4>.
BatchedRendererServices<4>*
RendererServices::batched(WidthOf<4>)
{
// No default implementation for batched services
return nullptr;
}

OSL_NAMESPACE_EXIT
31 changes: 29 additions & 2 deletions src/liboslexec/shadingsys.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -618,6 +618,29 @@ ShadingSystem::configure_batch_execution_at(int width)
m_impl->attribute("llvm_jit_fma", 0);
return true;
}
# endif
if (target_requested) {
break;
}
// fallthrough
default: return false;
};
return false;
case 4:
switch (requestedISA) {
case TargetISA::UNKNOWN:
// fallthrough
case TargetISA::x64:
# ifdef __OSL_SUPPORTS_b4_SSE2
if (LLVM_Util::supports_isa(TargetISA::x64)) {
if (!target_requested)
m_impl->attribute("llvm_jit_target",
LLVM_Util::target_isa_name(
TargetISA::x64));
// SSE2 doesn't support FMA
m_impl->attribute("llvm_jit_fma", 0);
return true;
}
# endif
if (target_requested) {
break;
Expand Down Expand Up @@ -885,6 +908,7 @@ ShadingSystem::BatchedExecutor<WidthT>::jit_all_groups(int nthreads)
// Explicitly instantiate
template class ShadingSystem::BatchedExecutor<16>;
template class ShadingSystem::BatchedExecutor<8>;
template class ShadingSystem::BatchedExecutor<4>;
#endif


Expand Down Expand Up @@ -1079,7 +1103,8 @@ ShadingSystemImpl::ShadingSystemImpl(RendererServices* renderer,
, m_opt_groupdata(true)
#if OSL_USE_BATCHED
, m_opt_batched_analysis((renderer->batched(WidthOf<16>()) != nullptr)
|| (renderer->batched(WidthOf<8>()) != nullptr))
|| (renderer->batched(WidthOf<8>()) != nullptr)
|| (renderer->batched(WidthOf<4>()) != nullptr))
#else
, m_opt_batched_analysis(false)
#endif
Expand Down Expand Up @@ -3794,7 +3819,8 @@ ShadingSystemImpl::optimize_group(ShaderGroup& group, ShadingContext* ctx,
// the batch jit has already happened,
// as it requires the ops so we can't delete them yet!
if (((renderer()->batched(WidthOf<16>()) == nullptr)
&& (renderer()->batched(WidthOf<8>()) == nullptr))
&& (renderer()->batched(WidthOf<8>()) == nullptr)
&& (renderer()->batched(WidthOf<4>()) == nullptr))
|| group.batch_jitted()) {
group_post_jit_cleanup(group);
}
Expand Down Expand Up @@ -4015,6 +4041,7 @@ ShadingSystemImpl::Batched<WidthT>::jit_all_groups(int nthreads, int mythread,
// machine as well, start with just the batch size
template class pvt::ShadingSystemImpl::Batched<16>;
template class pvt::ShadingSystemImpl::Batched<8>;
template class pvt::ShadingSystemImpl::Batched<4>;
#endif

int
Expand Down
Loading

0 comments on commit 52fb623

Please sign in to comment.