From ec77cef76f40ad7104ddb2f55afa54ac7fd0815d Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Sat, 16 Dec 2023 11:45:57 +0100 Subject: [PATCH 01/59] Refactor to support indirect mesh. --- assets/shaders/inc/meshlet_render.h | 19 +-- assets/shaders/inc/meshlet_render_types.h | 37 +++++ tests/assets/shaders/meshlet_cull.comp | 30 ++-- tests/assets/shaders/meshlet_debug.mesh | 20 +-- tests/assets/shaders/meshlet_debug.task | 11 +- tests/assets/shaders/meshlet_debug.vert | 15 +- tests/meshlet_viewer.cpp | 164 ++++++++++++++-------- 7 files changed, 177 insertions(+), 119 deletions(-) create mode 100644 assets/shaders/inc/meshlet_render_types.h diff --git a/assets/shaders/inc/meshlet_render.h b/assets/shaders/inc/meshlet_render.h index 144ae7ac..7c4ea25a 100644 --- a/assets/shaders/inc/meshlet_render.h +++ b/assets/shaders/inc/meshlet_render.h @@ -25,16 +25,7 @@ #error "Must define MESHLET_RENDER_TASKS_BINDING before including meshlet_render.h" #endif -struct AABB -{ - vec3 lo; float pad0; vec3 hi; float pad; -}; - -struct Bound -{ - vec4 center_radius; - vec4 cone; -}; +#include "meshlet_render_types.h" layout(set = MESHLET_RENDER_DESCRIPTOR_SET, binding = MESHLET_RENDER_BOUND_BINDING, std430) readonly buffer Bounds { @@ -56,14 +47,6 @@ layout(set = MESHLET_RENDER_DESCRIPTOR_SET, binding = MESHLET_RENDER_FRUSTUM_BIN vec4 planes[6]; } frustum; -struct TaskInfo -{ - uint aabb_instance; - uint node_instance; - uint node_count_material_index; // Skinning - uint mesh_index_count; -}; - layout(set = MESHLET_RENDER_DESCRIPTOR_SET, binding = MESHLET_RENDER_TASKS_BINDING, std430) readonly buffer Tasks { TaskInfo data[]; diff --git a/assets/shaders/inc/meshlet_render_types.h b/assets/shaders/inc/meshlet_render_types.h new file mode 100644 index 00000000..76da97ca --- /dev/null +++ b/assets/shaders/inc/meshlet_render_types.h @@ -0,0 +1,37 @@ +#ifndef MESHLET_RENDER_TYPES_H_ +#define MESHLET_RENDER_TYPES_H_ + +struct AABB +{ + vec3 lo; float pad0; vec3 hi; float pad; +}; + +struct Bound +{ + vec4 center_radius; + vec4 cone; +}; + +struct TaskInfo +{ + uint aabb_instance; + uint node_instance; + uint node_count_material_index; // Skinning + uint mesh_index_count; +}; + +struct CompactedDrawInfo +{ + uint meshlet_index; + uint node_offset; + uint node_count_material_offset; +}; + +#if defined(MESHLET_RENDER_DRAW_WORDS) && MESHLET_RENDER_DRAW_WORDS +struct MeshletDrawCommand +{ + uint payload[MESHLET_RENDER_DRAW_WORDS]; +}; +#endif + +#endif \ No newline at end of file diff --git a/tests/assets/shaders/meshlet_cull.comp b/tests/assets/shaders/meshlet_cull.comp index 9546f566..aead9f95 100644 --- a/tests/assets/shaders/meshlet_cull.comp +++ b/tests/assets/shaders/meshlet_cull.comp @@ -16,26 +16,29 @@ layout(local_size_x = 32) in; #define MESHLET_RENDER_TASKS_BINDING 2 #include "meshlet_render.h" -struct Draw -{ - uint payload[5]; -}; - +#if defined(MESHLET_RENDER_DRAW_WORDS) && MESHLET_RENDER_DRAW_WORDS layout(set = 0, binding = 3, std430) readonly buffer InputDraws { - Draw data[]; + MeshletDrawCommand data[]; } input_draws; layout(set = 0, binding = 4, std430) writeonly buffer OutputDraws { uint count; uint padding[256 / 4 - 1]; - Draw data[]; + MeshletDrawCommand data[]; +} output_draws; +#else +layout(set = 0, binding = 4, std430) writeonly buffer OutputDraws +{ + uint count; + uint y, z; } output_draws; +#endif -layout(set = 0, binding = 5, scalar) writeonly buffer CompactedDraws +layout(set = 0, binding = 5, std430) writeonly buffer CompactedDraws { - uvec3 data[]; + CompactedDrawInfo data[]; } output_draw_info; layout(push_constant, std430) uniform Registers @@ -47,14 +50,15 @@ layout(push_constant, std430) uniform Registers #if !MESHLET_PAYLOAD_SUBGROUP shared uint ballot_value; shared uint global_offset; + uvec4 ballot(bool v) { barrier(); if (gl_LocalInvocationIndex == 0) - ballot_value = 0; + ballot_value = 0; barrier(); if (v) - atomicOr(ballot_value, 1u << gl_LocalInvocationIndex); + atomicOr(ballot_value, 1u << gl_LocalInvocationIndex); barrier(); return uvec4(ballot_value, 0, 0, 0); } @@ -140,8 +144,10 @@ void main() if (alloc_draw) { +#if defined(MESHLET_RENDER_DRAW_WORDS) && MESHLET_RENDER_DRAW_WORDS output_draws.data[global_offset + local_offset] = input_draws.data[meshlet_index]; - output_draw_info.data[global_offset + local_offset] = uvec3(node_instance, node_count_material_index, meshlet_index); +#endif + output_draw_info.data[global_offset + local_offset] = CompactedDrawInfo(meshlet_index, node_instance, node_count_material_index); } } } \ No newline at end of file diff --git a/tests/assets/shaders/meshlet_debug.mesh b/tests/assets/shaders/meshlet_debug.mesh index dd557420..e0839170 100644 --- a/tests/assets/shaders/meshlet_debug.mesh +++ b/tests/assets/shaders/meshlet_debug.mesh @@ -45,24 +45,28 @@ layout(set = 1, binding = 0) uniform UBO mat4 VP; }; -struct MeshTask +#ifndef MESHLET_RENDER_TASK +#error "Must define MESHLET_RENDER_TASK" +#endif + +#if MESHLET_RENDER_TASK +taskPayloadSharedEXT CompactedDrawInfo mesh_payload[32 * 32]; +#else +layout(set = 0, binding = 10) readonly buffer DrawInfos { - uint meshlet_index; - uint node_instance; - uint node_count_material_index; + CompactedDrawInfo mesh_payload[]; }; - -taskPayloadSharedEXT MeshTask mesh_payload[32 * 32]; +#endif void main() { - MeshTask task = mesh_payload[gl_WorkGroupID.x]; + CompactedDrawInfo task = mesh_payload[gl_WorkGroupID.x]; MeshletMetaRuntime meta = meshlet_metas_runtime.data[task.meshlet_index]; meshlet_init_workgroup(meta.stream_offset); SetMeshOutputsEXT(meta.num_attributes, meta.num_primitives); - mat4 M = transforms.data[task.node_instance]; + mat4 M = transforms.data[task.node_offset]; #define INDEX(index, value) \ if (index < meta.num_primitives) \ diff --git a/tests/assets/shaders/meshlet_debug.task b/tests/assets/shaders/meshlet_debug.task index 419d69a0..dab1e96a 100644 --- a/tests/assets/shaders/meshlet_debug.task +++ b/tests/assets/shaders/meshlet_debug.task @@ -23,14 +23,7 @@ layout(push_constant, std430) uniform Registers uint count; } registers; -struct MeshTask -{ - uint meshlet_index; - uint node_instance; - uint node_count_material_index; -}; - -taskPayloadSharedEXT MeshTask mesh_payload[32 * 32]; +taskPayloadSharedEXT CompactedDrawInfo mesh_payload[32 * 32]; layout(set = 0, binding = 9) buffer Counter { uint task_counter; }; @@ -118,7 +111,7 @@ void main() uint local_offset = ballotExclusiveBitCount(ballot); if (alloc_draw) - mesh_payload[payload_offset + local_offset] = MeshTask(meshlet_index, node_instance, node_count_material_index); + mesh_payload[payload_offset + local_offset] = CompactedDrawInfo(meshlet_index, node_instance, node_count_material_index); payload_offset += draw_count; } diff --git a/tests/assets/shaders/meshlet_debug.vert b/tests/assets/shaders/meshlet_debug.vert index 1cf61803..303b1917 100644 --- a/tests/assets/shaders/meshlet_debug.vert +++ b/tests/assets/shaders/meshlet_debug.vert @@ -1,6 +1,8 @@ #version 450 #extension GL_ARB_shader_draw_parameters : require +#include "meshlet_render_types.h" + layout(location = 0) in vec3 POS; #if 0 layout(location = 1) in mediump vec3 N; @@ -23,13 +25,6 @@ layout(set = 1, binding = 0) uniform UBO mat4 VP; }; -struct CompactedDrawInfo -{ - uint node_offset; - uint node_count_material_offset; - uint meshlet_id; -}; - layout(set = 0, binding = 0) readonly buffer DrawParameters { CompactedDrawInfo data[]; @@ -45,13 +40,13 @@ void main() mat4 M = transforms.data[draw_info.data[gl_DrawIDARB].node_offset]; vec3 world_pos = (M * vec4(POS, 1.0)).xyz; vWorldPos = world_pos; - vDrawID = draw_info.data[gl_DrawIDARB].meshlet_id; + vDrawID = draw_info.data[gl_DrawIDARB].meshlet_index; gl_Position = VP * vec4(world_pos, 1.0); - #if 0 +#if 0 vNormal = mat3(M) * N; vTangent = vec4(mat3(M) * T.xyz, T.w); vUV = UV; MaterialOffset = bitfieldExtract(draw_info.data[gl_DrawIDARB].node_count_material_offset, 8, 24); - #endif +#endif } diff --git a/tests/meshlet_viewer.cpp b/tests/meshlet_viewer.cpp index 46999e88..8af8fa2b 100644 --- a/tests/meshlet_viewer.cpp +++ b/tests/meshlet_viewer.cpp @@ -272,9 +272,9 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V struct DrawParameters { + uint32_t meshlet_index; // Debug uint32_t node_instance; uint32_t node_count; // Skinning - uint32_t meshlet_index; // Debug }; std::vector task_params; @@ -311,7 +311,15 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V return; } - BufferHandle task_buffer, cached_transform_buffer, aabb_buffer; + BufferHandle task_buffer, cached_transform_buffer, aabb_buffer, compacted_params, indirect_draws; + + { + BufferCreateInfo info; + info.size = max_draws * sizeof(DrawParameters); + info.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; + info.domain = BufferDomain::Device; + compacted_params = device.create_buffer(info); + } { BufferCreateInfo info; @@ -360,8 +368,80 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V } push; push.camera_pos = render_context.get_render_parameters().camera_position; + const bool use_meshlets = manager.get_mesh_encoding() == Vulkan::ResourceManager::MeshEncoding::Meshlet; + const bool use_preculling = !use_meshlets || true; - if (manager.get_mesh_encoding() == Vulkan::ResourceManager::MeshEncoding::Meshlet) + if (use_preculling) + { + BufferCreateInfo info; + if (use_meshlets) + info.size = 16; + else + info.size = max_draws * sizeof(VkDrawIndexedIndirectCommand) + 256; + + info.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | + VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT | + VK_BUFFER_USAGE_TRANSFER_SRC_BIT | + VK_BUFFER_USAGE_TRANSFER_DST_BIT; + info.domain = BufferDomain::Device; + indirect_draws = device.create_buffer(info); + + if (use_meshlets) + { + cmd->fill_buffer(*indirect_draws, 0, 0, 4); + cmd->fill_buffer(*indirect_draws, 1, 4, 8); + } + else + { + cmd->fill_buffer(*indirect_draws, 0, 0, 256); + } + + cmd->barrier(VK_PIPELINE_STAGE_2_COPY_BIT, VK_ACCESS_TRANSFER_WRITE_BIT, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + VK_ACCESS_2_SHADER_STORAGE_READ_BIT | + VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT); + } + + if (use_preculling) + { + auto *indirect = manager.get_indirect_buffer(); + bool supports_subgroup_path = device.supports_subgroup_size_log2(true, 5, 5, VK_SHADER_STAGE_COMPUTE_BIT); + + if (supports_subgroup_path) + { + cmd->enable_subgroup_size_control(true); + cmd->set_subgroup_size_log2(true, 5, 5, VK_SHADER_STAGE_COMPUTE_BIT); + } + + auto command_words = use_meshlets ? 0 : (sizeof(VkDrawIndexedIndirectCommand) / sizeof(uint32_t)); + + cmd->set_program("assets://shaders/meshlet_cull.comp", + {{"MESHLET_PAYLOAD_SUBGROUP", int(supports_subgroup_path)}, + {"MESHLET_RENDER_DRAW_WORDS", int(command_words)}}); + cmd->set_storage_buffer(0, 0, *aabb_buffer); + cmd->set_storage_buffer(0, 1, *cached_transform_buffer); + cmd->set_storage_buffer(0, 2, *task_buffer); + if (!use_meshlets) + cmd->set_storage_buffer(0, 3, *indirect); + cmd->set_storage_buffer(0, 4, *indirect_draws); + cmd->set_storage_buffer(0, 5, *compacted_params); + cmd->set_storage_buffer(0, 6, *manager.get_cluster_bounds_buffer()); + memcpy(cmd->allocate_typed_constant_data(0, 7, 6), + render_context.get_visibility_frustum().get_planes(), + 6 * sizeof(vec4)); + + uint32_t count = task_params.size(); + push.count = count; + cmd->push_constants(&push, 0, sizeof(push)); + + cmd->dispatch((count + 31) / 32, 1, 1); + + cmd->barrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT, + VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT | VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT, + VK_ACCESS_INDIRECT_COMMAND_READ_BIT | VK_ACCESS_2_SHADER_STORAGE_READ_BIT); + } + + if (use_meshlets) { auto *header_buffer = manager.get_meshlet_header_buffer(); auto *stream_header_buffer = manager.get_meshlet_stream_header_buffer(); @@ -390,10 +470,12 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V cmd->set_subgroup_size_log2(true, 5, 5, VK_SHADER_STAGE_MESH_BIT_EXT); } - cmd->set_program("assets://shaders/meshlet_debug.task", "assets://shaders/meshlet_debug.mesh", + cmd->set_program(use_preculling ? "" : "assets://shaders/meshlet_debug.task", + "assets://shaders/meshlet_debug.mesh", "assets://shaders/meshlet_debug.mesh.frag", {{"MESHLET_PAYLOAD_LARGE_WORKGROUP", int(large_workgroup)}, - {"MESHLET_PAYLOAD_SUBGROUP", int(supports_subgroup_path)}}); + {"MESHLET_PAYLOAD_SUBGROUP", int(supports_subgroup_path)}, + {"MESHLET_RENDER_TASK", int(!use_preculling)}}); cmd->set_storage_buffer(0, 0, *aabb_buffer); cmd->set_storage_buffer(0, 1, *cached_transform_buffer); @@ -410,17 +492,25 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V 6 * sizeof(vec4)); cmd->set_storage_buffer(0, 9, *readback_counter); - - uint32_t count = task_params.size(); - push.count = count; - cmd->push_constants(&push, 0, sizeof(push)); + cmd->set_storage_buffer(0, 10, *compacted_params); GRANITE_MATERIAL_MANAGER()->set_bindless(*cmd, 2); cmd->set_specialization_constant_mask(1); cmd->set_specialization_constant(0, style_to_u32_streams(MeshStyle::Wireframe)); - cmd->draw_mesh_tasks((count + 31) / 32, 1, 1); + if (use_preculling) + { + cmd->draw_mesh_tasks_indirect(*indirect_draws, 0, 1, sizeof(VkDrawMeshTasksIndirectCommandEXT)); + } + else + { + uint32_t count = task_params.size(); + push.count = count; + cmd->push_constants(&push, 0, sizeof(push)); + cmd->draw_mesh_tasks((count + 31) / 32, 1, 1); + } + cmd->end_render_pass(); cmd->barrier(VK_PIPELINE_STAGE_TASK_SHADER_BIT_EXT, VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT, @@ -434,57 +524,6 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V auto *ibo = manager.get_index_buffer(); auto *pos = manager.get_position_buffer(); auto *attr = manager.get_attribute_buffer(); - auto *indirect = manager.get_indirect_buffer(); - - BufferHandle indirect_draws, compacted_params; - { - BufferCreateInfo info; - info.size = max_draws * sizeof(VkDrawIndexedIndirectCommand) + 256; - info.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT; - info.domain = BufferDomain::Device; - info.misc = BUFFER_MISC_ZERO_INITIALIZE_BIT; - indirect_draws = device.create_buffer(info); - } - - { - BufferCreateInfo info; - info.size = max_draws * sizeof(DrawParameters); - info.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; - info.domain = BufferDomain::Device; - compacted_params = device.create_buffer(info); - } - - bool supports_subgroup_path = device.supports_subgroup_size_log2(true, 5, 5, VK_SHADER_STAGE_COMPUTE_BIT); - - if (supports_subgroup_path) - { - cmd->enable_subgroup_size_control(true); - cmd->set_subgroup_size_log2(true, 5, 5, VK_SHADER_STAGE_COMPUTE_BIT); - } - - cmd->set_program("assets://shaders/meshlet_cull.comp", - {{"MESHLET_PAYLOAD_SUBGROUP", int(supports_subgroup_path)}}); - cmd->set_storage_buffer(0, 0, *aabb_buffer); - cmd->set_storage_buffer(0, 1, *cached_transform_buffer); - cmd->set_storage_buffer(0, 2, *task_buffer); - cmd->set_storage_buffer(0, 3, *indirect); - cmd->set_storage_buffer(0, 4, *indirect_draws); - cmd->set_storage_buffer(0, 5, *compacted_params); - cmd->set_storage_buffer(0, 6, *manager.get_cluster_bounds_buffer()); - memcpy(cmd->allocate_typed_constant_data(0, 7, 6), - render_context.get_visibility_frustum().get_planes(), - 6 * sizeof(vec4)); - - uint32_t count = task_params.size(); - push.count = count; - cmd->push_constants(&push, 0, sizeof(push)); - - cmd->dispatch((count + 31) / 32, 1, 1); - - cmd->barrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT, - VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT | VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | - VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, - VK_ACCESS_INDIRECT_COMMAND_READ_BIT | VK_ACCESS_2_SHADER_STORAGE_READ_BIT); cmd->begin_render_pass(device.get_swapchain_render_pass(SwapchainRenderPass::Depth)); camera.set_aspect(cmd->get_viewport().width / cmd->get_viewport().height); @@ -507,7 +546,8 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V GRANITE_MATERIAL_MANAGER()->set_bindless(*cmd, 2); cmd->draw_indexed_multi_indirect(*indirect_draws, - 256, max_draws, sizeof(VkDrawIndexedIndirectCommand), + 256, max_draws, + sizeof(VkDrawIndexedIndirectCommand), *indirect_draws, 0); cmd->end_render_pass(); From 45d9aa75459663a346f7ac43c9c2b910932a4bf7 Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Sat, 16 Dec 2023 12:20:06 +0100 Subject: [PATCH 02/59] Experiment with 128-wide meshlets. --- assets/shaders/inc/meshlet_payload_constants.h | 4 ++-- scene-export/meshlet_export.cpp | 5 ++--- tests/assets/shaders/meshlet_debug.mesh | 2 +- tests/meshlet_viewer.cpp | 9 ++++----- vulkan/mesh/meshlet.hpp | 2 +- 5 files changed, 10 insertions(+), 12 deletions(-) diff --git a/assets/shaders/inc/meshlet_payload_constants.h b/assets/shaders/inc/meshlet_payload_constants.h index 2a91ff53..eb8b5644 100644 --- a/assets/shaders/inc/meshlet_payload_constants.h +++ b/assets/shaders/inc/meshlet_payload_constants.h @@ -1,8 +1,8 @@ #ifndef MESHLET_PAYLOAD_CONSTANTS_H_ #define MESHLET_PAYLOAD_CONSTANTS_H_ -#define MESHLET_PAYLOAD_MAX_ELEMENTS 256 -#define MESHLET_PAYLOAD_NUM_CHUNKS 8 +#define MESHLET_PAYLOAD_MAX_ELEMENTS 128 +#define MESHLET_PAYLOAD_NUM_CHUNKS 4 #define MESHLET_PAYLOAD_MAX_STREAMS 16 #endif \ No newline at end of file diff --git a/scene-export/meshlet_export.cpp b/scene-export/meshlet_export.cpp index ba66a4f5..3905f1c9 100644 --- a/scene-export/meshlet_export.cpp +++ b/scene-export/meshlet_export.cpp @@ -671,9 +671,8 @@ bool export_mesh_to_meshlet(const std::string &path, SceneFormats::Mesh mesh, Me for (auto &p : positions) position_buffer.push_back(decode_snorm_exp(p)); - // Special meshoptimizer limit. - constexpr unsigned max_vertices = 255; - constexpr unsigned max_primitives = 256; + constexpr unsigned max_vertices = 128; + constexpr unsigned max_primitives = 128; size_t num_meshlets = meshopt_buildMeshletsBound(mesh.count, max_vertices, max_primitives); std::vector out_vertex_redirection_buffer(num_meshlets * max_vertices); diff --git a/tests/assets/shaders/meshlet_debug.mesh b/tests/assets/shaders/meshlet_debug.mesh index e0839170..7e9257cd 100644 --- a/tests/assets/shaders/meshlet_debug.mesh +++ b/tests/assets/shaders/meshlet_debug.mesh @@ -1,7 +1,7 @@ #version 450 #extension GL_EXT_mesh_shader : require -layout(max_primitives = 256, max_vertices = 255, triangles) out; +layout(max_primitives = 128, max_vertices = 128, triangles) out; #include "meshlet_payload_constants.h" diff --git a/tests/meshlet_viewer.cpp b/tests/meshlet_viewer.cpp index 8af8fa2b..a1f3e9d6 100644 --- a/tests/meshlet_viewer.cpp +++ b/tests/meshlet_viewer.cpp @@ -457,8 +457,7 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V 6 * sizeof(vec4)); bool large_workgroup = - device.get_device_features().mesh_shader_properties.maxPreferredMeshWorkGroupInvocations > 32 && - device.get_device_features().mesh_shader_properties.maxMeshWorkGroupInvocations >= 256; + device.get_device_features().mesh_shader_properties.maxPreferredMeshWorkGroupInvocations > 32; bool supports_subgroup_path = device.supports_subgroup_size_log2(true, 5, 5, VK_SHADER_STAGE_MESH_BIT_EXT) && device.supports_subgroup_size_log2(true, 5, 5, VK_SHADER_STAGE_TASK_BIT_EXT); @@ -560,9 +559,9 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V Fence fence; device.submit(cmd, &fence); - fence->wait(); - LOGI("Number of draws: %u\n", - *static_cast(device.map_host_buffer(*readback, MEMORY_ACCESS_READ_BIT))); + //fence->wait(); + //LOGI("Number of draws: %u\n", + // *static_cast(device.map_host_buffer(*readback, MEMORY_ACCESS_READ_BIT))); } void message(const std::string &tag, uint32_t code, uint32_t x, uint32_t y, uint32_t z, uint32_t, diff --git a/vulkan/mesh/meshlet.hpp b/vulkan/mesh/meshlet.hpp index 8d2c12f0..dc9f6c4a 100644 --- a/vulkan/mesh/meshlet.hpp +++ b/vulkan/mesh/meshlet.hpp @@ -41,7 +41,7 @@ namespace Vulkan namespace Meshlet { static constexpr unsigned MaxU32Streams = 16; -static constexpr unsigned MaxElements = 256; +static constexpr unsigned MaxElements = 128; static constexpr unsigned MaxPrimitives = MaxElements; static constexpr unsigned MaxVertices = MaxElements; From fc80a2fbc41b8cc7bfcc3c24a9bdc08090dae536 Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Sat, 16 Dec 2023 15:47:02 +0100 Subject: [PATCH 03/59] Noodle with per-primitive culling. --- assets/shaders/inc/meshlet_payload_decode.h | 10 +- tests/assets/shaders/meshlet_debug.mesh | 170 +++++++++++++++++--- tests/meshlet_viewer.cpp | 13 +- 3 files changed, 161 insertions(+), 32 deletions(-) diff --git a/assets/shaders/inc/meshlet_payload_decode.h b/assets/shaders/inc/meshlet_payload_decode.h index efbb73aa..7aa1e626 100644 --- a/assets/shaders/inc/meshlet_payload_decode.h +++ b/assets/shaders/inc/meshlet_payload_decode.h @@ -336,8 +336,7 @@ uint meshlet_get_linear_index() #if !MESHLET_PAYLOAD_SUBGROUP return gl_LocalInvocationIndex; #elif MESHLET_PAYLOAD_LARGE_WORKGROUP - // Rely on SubgroupInvocationID == LocalInvocationID.x here. - return gl_WorkGroupSize.x * gl_LocalInvocationID.y + gl_SubgroupInvocationID; + return gl_SubgroupSize * gl_SubgroupID + gl_SubgroupInvocationID; #else return gl_SubgroupInvocationID; #endif @@ -391,8 +390,7 @@ uint meshlet_decode_stream_32_wg256(uint base_stream_index, uint stream_index) uint unrolled_stream_index = base_stream_index + stream_index; uint linear_index = meshlet_get_linear_index(); - // Some compilers don't understand this is implicitly scalar. - uint chunk_id = wgx_mark_uniform(gl_LocalInvocationID.y); + uint chunk_id = gl_SubgroupID; MESHLET_PAYLOAD_DECL_STREAM(unrolled_stream_index, 0); MESHLET_PAYLOAD_PROCESS_CHUNK(unrolled_stream_index, stream_index, chunk_id, 0); @@ -414,9 +412,7 @@ uvec2 meshlet_decode_stream_64_wg256(uint base_stream_index, uint stream_index) // Dual-pump the computation. VGPR use is quite low either way, so this is fine. uint unrolled_stream_index = base_stream_index + stream_index; uint linear_index = meshlet_get_linear_index(); - - // Some compilers don't understand this is implicitly scalar. - uint chunk_id = wgx_mark_uniform(gl_LocalInvocationID.y); + uint chunk_id = gl_SubgroupID; MESHLET_PAYLOAD_DECL_STREAM(unrolled_stream_index, 0); MESHLET_PAYLOAD_DECL_STREAM(unrolled_stream_index + 1, 1); diff --git a/tests/assets/shaders/meshlet_debug.mesh b/tests/assets/shaders/meshlet_debug.mesh index 7e9257cd..eeedb1e6 100644 --- a/tests/assets/shaders/meshlet_debug.mesh +++ b/tests/assets/shaders/meshlet_debug.mesh @@ -1,6 +1,8 @@ #version 450 #extension GL_EXT_mesh_shader : require +#pragma optimize off + layout(max_primitives = 128, max_vertices = 128, triangles) out; #include "meshlet_payload_constants.h" @@ -45,6 +47,11 @@ layout(set = 1, binding = 0) uniform UBO mat4 VP; }; +layout(set = 1, binding = 2) uniform UBOViewport +{ + vec4 viewport; +}; + #ifndef MESHLET_RENDER_TASK #error "Must define MESHLET_RENDER_TASK" #endif @@ -58,40 +65,157 @@ layout(set = 0, binding = 10) readonly buffer DrawInfos }; #endif +shared vec2 shared_window_positions[MESHLET_PAYLOAD_NUM_CHUNKS * 32]; +shared uint8_t shared_clip_code[MESHLET_PAYLOAD_NUM_CHUNKS * 32]; +shared uvec4 shared_active_vert; +shared uvec4 shared_active_prim; +shared uvec4 shared_active_vert_count; +shared uvec4 shared_active_prim_count; + +const uint CLIP_CODE_NEGATIVE_W = 1; +const uint CLIP_CODE_INACCURATE = 2; + +uint compacted_vertex_output(uint index) +{ + return shared_active_vert_count[index / 32u] + bitCount(shared_active_vert[index / 32u] & ((1u << (index & 31u)) - 1u)); +} + +uint compacted_index_output(uint index) +{ + return shared_active_prim_count[index / 32u] + bitCount(shared_active_prim[index / 32u] & ((1u << (index & 31u)) - 1u)); +} + +bool lane_has_active_vert(uint index) +{ + return (shared_active_vert[index / 32u] & (1u << (index & 31u))) != 0u; +} + +uvec3 remap_index_buffer(uvec3 prim) +{ + return uvec3(compacted_vertex_output(prim.x), + compacted_vertex_output(prim.y), + compacted_vertex_output(prim.z)); +} + +bool cull_triangle(vec2 a, vec2 b, vec2 c) +{ + // To be completely accurate, this should be done in fixed point, + // but we can YOLO a bit since glitches in extreme edge cases are considered okay. + precise vec2 ab = b - a; + precise vec2 ac = c - a; + + // This is 100% accurate as long as the primitive is no larger than ~4k subpixels, i.e. 16x16 pixels. + // Accuracy decays after that, but any error causing wrong rendering is imperceptible. + precise float neg_area = ab.x * ac.y; + precise float pos_area = ab.y * ac.x; + if (neg_area >= pos_area) + return false; + + // Micropoly test. + vec2 lo = floor(ldexp(min(min(a, b), c), ivec2(-8))); + vec2 hi = floor(ldexp(max(max(a, b), c), ivec2(-8))); + return all(notEqual(lo, hi)); +} + void main() { + if (gl_LocalInvocationIndex < MESHLET_PAYLOAD_NUM_CHUNKS) + { + shared_active_vert[gl_LocalInvocationIndex] = 0; + shared_active_prim[gl_LocalInvocationIndex] = 0; + } + CompactedDrawInfo task = mesh_payload[gl_WorkGroupID.x]; MeshletMetaRuntime meta = meshlet_metas_runtime.data[task.meshlet_index]; meshlet_init_workgroup(meta.stream_offset); - SetMeshOutputsEXT(meta.num_attributes, meta.num_primitives); - + uint linear_index = meshlet_get_linear_index(); mat4 M = transforms.data[task.node_offset]; -#define INDEX(index, value) \ - if (index < meta.num_primitives) \ - { \ - gl_PrimitiveTriangleIndicesEXT[index] = uvec4(unpack8(value)).xyz; \ - } + uvec3 prim; +#define INDEX(index, value) prim = uvec3(unpack8(value).xyz) MESHLET_DECODE_STREAM_32(meta.stream_offset, 0, INDEX); -#if MESHLET_PAYLOAD_LARGE_WORKGROUP - if (gl_LocalInvocationIndex < meta.num_primitives) - vDrawID[gl_LocalInvocationIndex] = task.meshlet_index; -#else - for (uint i = gl_LocalInvocationIndex; i < meta.num_primitives; i += 32) - vDrawID[i] = task.meshlet_index; -#endif + vec3 pos; +#define POSITION(index, value) pos = attribute_decode_snorm_exp_position(value) + MESHLET_DECODE_STREAM_64(meta.stream_offset, 1, POSITION); -#define POSITION(index, value) \ - if (index < meta.num_attributes) \ - { \ - vec3 pos = attribute_decode_snorm_exp_position(value); \ - vec3 world_pos = (M * vec4(pos, 1.0)).xyz; \ - vWorldPos[index] = world_pos; \ - gl_MeshVerticesEXT[index].gl_Position = VP * vec4(world_pos, 1.0); \ + vec3 world_pos = (M * vec4(pos, 1.0)).xyz; + vec4 clip_pos = VP * vec4(world_pos, 1.0); + vec2 c = clip_pos.xy / clip_pos.w; + uint clip_code = clip_pos.w <= 0.0 ? CLIP_CODE_NEGATIVE_W : 0; + if (any(greaterThan(abs(c), vec2(4.0)))) + clip_code |= CLIP_CODE_INACCURATE; + vec2 window = roundEven(c * viewport.zw + viewport.xy); + shared_window_positions[linear_index] = window; + shared_clip_code[linear_index] = uint8_t(clip_code); + + barrier(); + + bool is_active_prim = false; + if (linear_index < meta.num_primitives) + { + vec2 a = shared_window_positions[prim.x]; + vec2 b = shared_window_positions[prim.y]; + vec2 c = shared_window_positions[prim.z]; + uint code_a = shared_clip_code[prim.x]; + uint code_b = shared_clip_code[prim.y]; + uint code_c = shared_clip_code[prim.z]; + + uint or_code_a = code_a | code_b | code_c; + uint and_code_a = code_a & code_b & code_c; + + if ((and_code_a & CLIP_CODE_NEGATIVE_W) == 0) + { + if ((or_code_a & CLIP_CODE_INACCURATE) != 0 || cull_triangle(a, b, c)) + { + is_active_prim = true; + atomicOr(shared_active_prim[linear_index / 32], 1u << (linear_index & 31)); + atomicOr(shared_active_vert[prim.x / 32], 1u << (prim.x & 31)); + atomicOr(shared_active_vert[prim.y / 32], 1u << (prim.y & 31)); + atomicOr(shared_active_vert[prim.z / 32], 1u << (prim.z & 31)); + } + } } - MESHLET_DECODE_STREAM_64(meta.stream_offset, 1, POSITION); + + barrier(); + + if (gl_LocalInvocationID.x == 0 && gl_LocalInvocationID.y == 0) + { + uvec3 num_active_prim = bitCount(shared_active_prim.xyz); + num_active_prim.y += num_active_prim.x; + num_active_prim.z += num_active_prim.y; + shared_active_prim_count = uvec4(0, num_active_prim); + } + else if (gl_LocalInvocationID.x == 0 && gl_LocalInvocationID.y == 1) + { + uvec3 num_active_vert = bitCount(shared_active_vert.xyz); + num_active_vert.y += num_active_vert.x; + num_active_vert.z += num_active_vert.y; + shared_active_vert_count = uvec4(0, num_active_vert); + } + + barrier(); + + uint num_verts = shared_active_vert_count.w + bitCount(shared_active_vert.w); + uint num_prims = shared_active_prim_count.w + bitCount(shared_active_prim.w); + + SetMeshOutputsEXT(num_verts, num_prims); + + if (is_active_prim) + gl_PrimitiveTriangleIndicesEXT[compacted_index_output(linear_index)] = remap_index_buffer(prim); + + if (gl_LocalInvocationIndex < num_prims) + vDrawID[gl_LocalInvocationIndex] = task.meshlet_index; + + bool has_active_vert = lane_has_active_vert(linear_index); + uint out_vert_index = compacted_vertex_output(linear_index); + + if (has_active_vert) + { + gl_MeshVerticesEXT[out_vert_index].gl_Position = clip_pos; + vWorldPos[out_vert_index] = world_pos; + } #if 0 #define NORMAL(index, value) \ @@ -116,4 +240,4 @@ void main() } MESHLET_DECODE_STREAM_64(meta.stream_offset, 5, UV); #endif -} \ No newline at end of file +} diff --git a/tests/meshlet_viewer.cpp b/tests/meshlet_viewer.cpp index a1f3e9d6..7ea713c2 100644 --- a/tests/meshlet_viewer.cpp +++ b/tests/meshlet_viewer.cpp @@ -172,6 +172,7 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V auto &scene_nodes = parser.get_scenes()[parser.get_default_scene()]; auto root = scene.create_node(); +#if 1 for (int z = -10; z <= 10; z++) for (int y = -10; y <= 10; y++) for (int x = -10; x <= 10; x++) @@ -189,6 +190,7 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V renderable->flags |= RENDERABLE_FORCE_VISIBLE_BIT; scene.create_renderable(std::move(renderable), nodeptr.get()); } +#endif if (false) { @@ -456,8 +458,15 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V memcpy(cmd->allocate_typed_constant_data(1, 1, 6), render_context.get_visibility_frustum().get_planes(), 6 * sizeof(vec4)); - bool large_workgroup = - device.get_device_features().mesh_shader_properties.maxPreferredMeshWorkGroupInvocations > 32; + *cmd->allocate_typed_constant_data(1, 2, 1) = + float(1 << 8 /* shader assumes 8 */) * + vec4(cmd->get_viewport().x + 0.5f * cmd->get_viewport().width - 0.5f, + cmd->get_viewport().y + 0.5f * cmd->get_viewport().height - 0.5f, + 0.5f * cmd->get_viewport().width, + 0.5f * cmd->get_viewport().height) - vec4(1.0f, 1.0f, 0.0f, 0.0f); + + bool large_workgroup = true; + //device.get_device_features().mesh_shader_properties.maxPreferredMeshWorkGroupInvocations > 32; bool supports_subgroup_path = device.supports_subgroup_size_log2(true, 5, 5, VK_SHADER_STAGE_MESH_BIT_EXT) && device.supports_subgroup_size_log2(true, 5, 5, VK_SHADER_STAGE_TASK_BIT_EXT); From 8e8a7023f75e2ca76d569494d0909ef4c2b31112 Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Sat, 16 Dec 2023 16:56:18 +0100 Subject: [PATCH 04/59] Fixups for 128. --- assets/shaders/inc/meshlet_payload_decode.h | 85 ++++----------------- 1 file changed, 13 insertions(+), 72 deletions(-) diff --git a/assets/shaders/inc/meshlet_payload_decode.h b/assets/shaders/inc/meshlet_payload_decode.h index 7aa1e626..074b43cb 100644 --- a/assets/shaders/inc/meshlet_payload_decode.h +++ b/assets/shaders/inc/meshlet_payload_decode.h @@ -49,7 +49,7 @@ struct MeshletStream u16vec4 predictor_b; u8vec4 initial_value; uint offset_from_base; - uint16_t bitplane_meta[MESHLET_PAYLOAD_NUM_CHUNKS]; + u16vec4 bitplane_meta; }; struct MeshletMetaRaw @@ -88,7 +88,7 @@ layout(set = MESHLET_PAYLOAD_DESCRIPTOR_SET, binding = MESHLET_PAYLOAD_PAYLOAD_B } payload; #if MESHLET_PAYLOAD_LARGE_WORKGROUP -shared uint shared_chunk_offset[MESHLET_PAYLOAD_NUM_U32_STREAMS][MESHLET_PAYLOAD_NUM_CHUNKS]; +shared uvec4 shared_chunk_offset[MESHLET_PAYLOAD_NUM_U32_STREAMS]; shared uvec2 chunk_values0[MESHLET_PAYLOAD_NUM_CHUNKS]; shared uvec2 chunk_values1[MESHLET_PAYLOAD_NUM_CHUNKS]; #endif @@ -132,37 +132,6 @@ uvec4 wgx_shuffle(uvec4 v, uint lane) #define wgx_broadcast_last(v) wgx_shuffle(v, gl_WorkGroupSize.x - 1) -uint wgx_exclusive_add8(uint v) -{ - // WAR hazard. - barrier(); - wave_buffer_x[gl_LocalInvocationID.y][gl_LocalInvocationID.x] = v; - barrier(); - - uint idx = gl_LocalInvocationID.x; - - [[unroll]] - for (int chunk_size = 2; chunk_size <= 8; chunk_size *= 2) - { - int upper_mask = chunk_size >> 1; - int lower_mask = upper_mask - 1; - int chunk_mask = ~(chunk_size - 1); - if ((idx & upper_mask) != 0) - { - v += wave_buffer_x[gl_LocalInvocationID.y][(idx & chunk_mask) + lower_mask]; - wave_buffer_x[gl_LocalInvocationID.y][idx] = v; - } - barrier(); - } - - if (idx > 0) - v = wave_buffer_x[gl_LocalInvocationID.y][idx - 1]; - else - v = 0; - - return v; -} - uvec2 wgx_inclusive_add(uvec2 v) { barrier(); @@ -238,7 +207,6 @@ uvec4 wgx_inclusive_add(uvec4 v) #define wgx_num_subgroups (gl_WorkGroupSize.y) #define wgx_mark_uniform(v) (v) #else -#define wgx_exclusive_add8(v) subgroupExclusiveAdd(v) #define wgx_inclusive_add(v) subgroupInclusiveAdd(v) #define wgx_subgroup_invocation_id gl_SubgroupInvocationID #define wgx_subgroup_size gl_SubgroupSize @@ -282,51 +250,24 @@ void meshlet_compute_stream_counts(uint bitplane_value, out uint out_total_bits, void meshlet_init_workgroup(uint base_stream_index) { #if MESHLET_PAYLOAD_LARGE_WORKGROUP -#if MESHLET_PAYLOAD_SUBGROUP for (uint stream_index = wgx_subgroup_id; stream_index < MESHLET_PAYLOAD_NUM_U32_STREAMS; stream_index += wgx_num_subgroups) { - if (wgx_subgroup_invocation_id < MESHLET_PAYLOAD_NUM_CHUNKS) + if (gl_LocalInvocationID.x == 0) { - uvec4 bit_counts; - uint total_bits; - uint unrolled_stream_index = base_stream_index + stream_index; - uint bitplane_value = uint(meshlet_streams.data[unrolled_stream_index].bitplane_meta[wgx_subgroup_invocation_id]); - meshlet_compute_stream_counts(bitplane_value, total_bits, bit_counts); + uvec4 bitplane_values = uvec4(meshlet_streams.data[unrolled_stream_index].bitplane_meta); - uint chunk_offset = meshlet_streams.data[unrolled_stream_index].offset_from_base + wgx_exclusive_add8(total_bits); - // Start by decoding the offset for bitplanes for all u32 streams. - shared_chunk_offset[stream_index][wgx_subgroup_invocation_id] = chunk_offset; - } - } -#else - for (uint uniform_stream_index = 0; uniform_stream_index < MESHLET_PAYLOAD_NUM_U32_STREAMS; uniform_stream_index += wgx_num_subgroups) - { - uint stream_index = uniform_stream_index + wgx_subgroup_id; - uint bitplane_value; - uvec4 bit_counts; - uint total_bits; - - uint unrolled_stream_index = base_stream_index + stream_index; - bool active_lane = stream_index < MESHLET_PAYLOAD_NUM_U32_STREAMS && wgx_subgroup_invocation_id < MESHLET_PAYLOAD_NUM_CHUNKS; - - if (active_lane) - { - bitplane_value = uint(meshlet_streams.data[unrolled_stream_index].bitplane_meta[wgx_subgroup_invocation_id]); - meshlet_compute_stream_counts(bitplane_value, total_bits, bit_counts); - } - - // This needs to happen in dynamically uniform control flow. - uint chunk_offset = wgx_exclusive_add8(total_bits); - - if (active_lane) - { - chunk_offset += meshlet_streams.data[unrolled_stream_index].offset_from_base; - // Start by decoding the offset for bitplanes for all u32 streams. - shared_chunk_offset[stream_index][wgx_subgroup_invocation_id] = chunk_offset; + uvec3 total_bits; + uvec4 bit_counts; + meshlet_compute_stream_counts(bitplane_values.x, total_bits.x, bit_counts); + meshlet_compute_stream_counts(bitplane_values.y, total_bits.y, bit_counts); + meshlet_compute_stream_counts(bitplane_values.z, total_bits.z, bit_counts); + total_bits.y += total_bits.x; + total_bits.z += total_bits.y; + uint chunk_offset = meshlet_streams.data[unrolled_stream_index].offset_from_base; + shared_chunk_offset[stream_index] = chunk_offset + uvec4(0, total_bits); } } -#endif barrier(); #endif } From 19f69d2f12be8a2a5326509c4cb81a752a3f98b6 Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Sat, 16 Dec 2023 17:49:46 +0100 Subject: [PATCH 05/59] Fixup non-subgroup build. --- assets/shaders/inc/meshlet_payload_decode.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/assets/shaders/inc/meshlet_payload_decode.h b/assets/shaders/inc/meshlet_payload_decode.h index 074b43cb..1e47f7cf 100644 --- a/assets/shaders/inc/meshlet_payload_decode.h +++ b/assets/shaders/inc/meshlet_payload_decode.h @@ -331,7 +331,7 @@ uint meshlet_decode_stream_32_wg256(uint base_stream_index, uint stream_index) uint unrolled_stream_index = base_stream_index + stream_index; uint linear_index = meshlet_get_linear_index(); - uint chunk_id = gl_SubgroupID; + uint chunk_id = wgx_subgroup_id; MESHLET_PAYLOAD_DECL_STREAM(unrolled_stream_index, 0); MESHLET_PAYLOAD_PROCESS_CHUNK(unrolled_stream_index, stream_index, chunk_id, 0); @@ -353,7 +353,7 @@ uvec2 meshlet_decode_stream_64_wg256(uint base_stream_index, uint stream_index) // Dual-pump the computation. VGPR use is quite low either way, so this is fine. uint unrolled_stream_index = base_stream_index + stream_index; uint linear_index = meshlet_get_linear_index(); - uint chunk_id = gl_SubgroupID; + uint chunk_id = wgx_subgroup_id; MESHLET_PAYLOAD_DECL_STREAM(unrolled_stream_index, 0); MESHLET_PAYLOAD_DECL_STREAM(unrolled_stream_index + 1, 1); From 2d922c45b06396675e092b4449f2d61f8f60a69e Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Sat, 16 Dec 2023 20:22:32 +0100 Subject: [PATCH 06/59] Experiment with generic subgroup path. --- assets/shaders/inc/meshlet_payload_decode.h | 298 ++++---------------- tests/assets/shaders/meshlet_debug.mesh | 23 +- tests/meshlet_viewer.cpp | 10 +- 3 files changed, 66 insertions(+), 265 deletions(-) diff --git a/assets/shaders/inc/meshlet_payload_decode.h b/assets/shaders/inc/meshlet_payload_decode.h index 1e47f7cf..ed406658 100644 --- a/assets/shaders/inc/meshlet_payload_decode.h +++ b/assets/shaders/inc/meshlet_payload_decode.h @@ -5,6 +5,10 @@ #extension GL_EXT_shader_explicit_arithmetic_types_int8 : require #extension GL_EXT_scalar_block_layout : require #extension GL_EXT_control_flow_attributes : require +#extension GL_KHR_shader_subgroup_arithmetic : require +#extension GL_KHR_shader_subgroup_ballot : require +#extension GL_KHR_shader_subgroup_shuffle : require +#extension GL_KHR_shader_subgroup_basic : require #include "meshlet_payload_constants.h" @@ -16,17 +20,6 @@ #error "Must define MESHLET_PAYLOAD_LARGE_WORKGROUP" #endif -#ifndef MESHLET_PAYLOAD_SUBGROUP -#error "Must define MESHLET_PAYLOAD_SUBGROUP" -#endif - -#if MESHLET_PAYLOAD_SUBGROUP -#extension GL_KHR_shader_subgroup_arithmetic : require -#extension GL_KHR_shader_subgroup_ballot : require -#extension GL_KHR_shader_subgroup_shuffle : require -#extension GL_KHR_shader_subgroup_basic : require -#endif - #ifndef MESHLET_PAYLOAD_DESCRIPTOR_SET #error "Must define MESHLET_PAYLOAD_DESCRIPTOR_SET" #endif @@ -87,77 +80,30 @@ layout(set = MESHLET_PAYLOAD_DESCRIPTOR_SET, binding = MESHLET_PAYLOAD_PAYLOAD_B uint data[]; } payload; -#if MESHLET_PAYLOAD_LARGE_WORKGROUP shared uvec4 shared_chunk_offset[MESHLET_PAYLOAD_NUM_U32_STREAMS]; shared uvec2 chunk_values0[MESHLET_PAYLOAD_NUM_CHUNKS]; shared uvec2 chunk_values1[MESHLET_PAYLOAD_NUM_CHUNKS]; -#endif - -#if !MESHLET_PAYLOAD_SUBGROUP -shared uint wave_buffer_x[gl_WorkGroupSize.y][gl_WorkGroupSize.x]; -shared uint wave_buffer_y[gl_WorkGroupSize.y][gl_WorkGroupSize.x]; -shared uint wave_buffer_z[gl_WorkGroupSize.y][gl_WorkGroupSize.x]; -shared uint wave_buffer_w[gl_WorkGroupSize.y][gl_WorkGroupSize.x]; -shared uvec4 wave_broadcast_value[gl_WorkGroupSize.y]; - -uint wgx_shuffle(uint v, uint lane) -{ - // WAR hazard. - barrier(); - if (gl_LocalInvocationID.x == lane) - wave_broadcast_value[gl_LocalInvocationID.y].x = v; - barrier(); - return wave_broadcast_value[gl_LocalInvocationID.y].x; -} -uvec2 wgx_shuffle(uvec2 v, uint lane) -{ - // WAR hazard. - barrier(); - if (gl_LocalInvocationID.x == lane) - wave_broadcast_value[gl_LocalInvocationID.y].xy = v; - barrier(); - return wave_broadcast_value[gl_LocalInvocationID.y].xy; -} - -uvec4 wgx_shuffle(uvec4 v, uint lane) -{ - // WAR hazard. - barrier(); - if (gl_LocalInvocationID.x == lane) - wave_broadcast_value[gl_LocalInvocationID.y] = v; - barrier(); - return wave_broadcast_value[gl_LocalInvocationID.y]; -} - -#define wgx_broadcast_last(v) wgx_shuffle(v, gl_WorkGroupSize.x - 1) +shared uint wave_buffer_x[MESHLET_PAYLOAD_NUM_CHUNKS]; +shared uint wave_buffer_y[MESHLET_PAYLOAD_NUM_CHUNKS]; +shared uint wave_buffer_z[MESHLET_PAYLOAD_NUM_CHUNKS]; +shared uint wave_buffer_w[MESHLET_PAYLOAD_NUM_CHUNKS]; uvec2 wgx_inclusive_add(uvec2 v) { - barrier(); - wave_buffer_x[gl_LocalInvocationID.y][gl_LocalInvocationID.x] = v.x; - wave_buffer_y[gl_LocalInvocationID.y][gl_LocalInvocationID.x] = v.y; + v = subgroupInclusiveAdd(v); + if (gl_SubgroupInvocationID == gl_SubgroupSize - 1) + { + wave_buffer_x[gl_SubgroupID] = v.x; + wave_buffer_y[gl_SubgroupID] = v.y; + } - uint idx = gl_LocalInvocationID.x; + barrier(); - [[unroll]] - for (int chunk_size = 2; chunk_size <= 32; chunk_size *= 2) + for (uint i = 0; i < gl_SubgroupID; i++) { - int upper_mask = chunk_size >> 1; - int lower_mask = upper_mask - 1; - int chunk_mask = ~(chunk_size - 1); - barrier(); - if ((idx & upper_mask) != 0) - { - v.x += wave_buffer_x[gl_LocalInvocationID.y][(idx & chunk_mask) | lower_mask]; - v.y += wave_buffer_y[gl_LocalInvocationID.y][(idx & chunk_mask) | lower_mask]; - - if (chunk_size != 32) - { - wave_buffer_x[gl_LocalInvocationID.y][idx] = v.x; - wave_buffer_y[gl_LocalInvocationID.y][idx] = v.y; - } - } + v.x += wave_buffer_x[i]; + v.y += wave_buffer_y[i]; } return v; @@ -165,58 +111,28 @@ uvec2 wgx_inclusive_add(uvec2 v) uvec4 wgx_inclusive_add(uvec4 v) { - barrier(); - wave_buffer_x[gl_LocalInvocationID.y][gl_LocalInvocationID.x] = v.x; - wave_buffer_y[gl_LocalInvocationID.y][gl_LocalInvocationID.x] = v.y; - wave_buffer_z[gl_LocalInvocationID.y][gl_LocalInvocationID.x] = v.z; - wave_buffer_w[gl_LocalInvocationID.y][gl_LocalInvocationID.x] = v.w; + v = subgroupInclusiveAdd(v); + if (gl_SubgroupInvocationID == gl_SubgroupSize - 1) + { + wave_buffer_x[gl_SubgroupID] = v.x; + wave_buffer_y[gl_SubgroupID] = v.y; + wave_buffer_z[gl_SubgroupID] = v.z; + wave_buffer_w[gl_SubgroupID] = v.w; + } - uint idx = gl_LocalInvocationID.x; + barrier(); - [[unroll]] - for (int chunk_size = 2; chunk_size <= 32; chunk_size *= 2) + for (uint i = 0; i < gl_SubgroupID; i++) { - int upper_mask = chunk_size >> 1; - int lower_mask = upper_mask - 1; - int chunk_mask = ~(chunk_size - 1); - barrier(); - if ((idx & upper_mask) != 0) - { - v.x += wave_buffer_x[gl_LocalInvocationID.y][(idx & chunk_mask) | lower_mask]; - v.y += wave_buffer_y[gl_LocalInvocationID.y][(idx & chunk_mask) | lower_mask]; - v.z += wave_buffer_z[gl_LocalInvocationID.y][(idx & chunk_mask) | lower_mask]; - v.w += wave_buffer_w[gl_LocalInvocationID.y][(idx & chunk_mask) | lower_mask]; - - if (chunk_size != 32) - { - wave_buffer_x[gl_LocalInvocationID.y][idx] = v.x; - wave_buffer_y[gl_LocalInvocationID.y][idx] = v.y; - wave_buffer_z[gl_LocalInvocationID.y][idx] = v.z; - wave_buffer_w[gl_LocalInvocationID.y][idx] = v.w; - } - } + v.x += wave_buffer_x[i]; + v.y += wave_buffer_y[i]; + v.z += wave_buffer_z[i]; + v.w += wave_buffer_w[i]; } return v; } - -#define wgx_subgroup_invocation_id (gl_LocalInvocationID.x) -#define wgx_subgroup_size (gl_WorkGroupSize.x) -#define wgx_subgroup_id (gl_LocalInvocationID.y) -#define wgx_num_subgroups (gl_WorkGroupSize.y) -#define wgx_mark_uniform(v) (v) -#else -#define wgx_inclusive_add(v) subgroupInclusiveAdd(v) -#define wgx_subgroup_invocation_id gl_SubgroupInvocationID -#define wgx_subgroup_size gl_SubgroupSize -#define wgx_subgroup_id gl_SubgroupID -#define wgx_num_subgroups gl_NumSubgroups -#define wgx_broadcast_last(v) subgroupBroadcast(v, 31) -#define wgx_mark_uniform(v) subgroupBroadcastFirst(v) -#define wgx_shuffle(v, lane) subgroupShuffle(v, lane) -#endif - // Hardcodes wave32 atm. Need fallback. uvec2 pack_u16vec4_to_uvec2(u16vec4 v) @@ -249,38 +165,27 @@ void meshlet_compute_stream_counts(uint bitplane_value, out uint out_total_bits, void meshlet_init_workgroup(uint base_stream_index) { -#if MESHLET_PAYLOAD_LARGE_WORKGROUP - for (uint stream_index = wgx_subgroup_id; stream_index < MESHLET_PAYLOAD_NUM_U32_STREAMS; stream_index += wgx_num_subgroups) + if (gl_LocalInvocationIndex < MESHLET_PAYLOAD_NUM_U32_STREAMS) { - if (gl_LocalInvocationID.x == 0) - { - uint unrolled_stream_index = base_stream_index + stream_index; - uvec4 bitplane_values = uvec4(meshlet_streams.data[unrolled_stream_index].bitplane_meta); - - uvec3 total_bits; - uvec4 bit_counts; - meshlet_compute_stream_counts(bitplane_values.x, total_bits.x, bit_counts); - meshlet_compute_stream_counts(bitplane_values.y, total_bits.y, bit_counts); - meshlet_compute_stream_counts(bitplane_values.z, total_bits.z, bit_counts); - total_bits.y += total_bits.x; - total_bits.z += total_bits.y; - uint chunk_offset = meshlet_streams.data[unrolled_stream_index].offset_from_base; - shared_chunk_offset[stream_index] = chunk_offset + uvec4(0, total_bits); - } + uint unrolled_stream_index = base_stream_index + gl_LocalInvocationIndex; + uvec4 bitplane_values = uvec4(meshlet_streams.data[unrolled_stream_index].bitplane_meta); + + uvec3 total_bits; + uvec4 bit_counts; + meshlet_compute_stream_counts(bitplane_values.x, total_bits.x, bit_counts); + meshlet_compute_stream_counts(bitplane_values.y, total_bits.y, bit_counts); + meshlet_compute_stream_counts(bitplane_values.z, total_bits.z, bit_counts); + total_bits.y += total_bits.x; + total_bits.z += total_bits.y; + uint chunk_offset = meshlet_streams.data[unrolled_stream_index].offset_from_base; + shared_chunk_offset[gl_LocalInvocationIndex] = chunk_offset + uvec4(0, total_bits); } barrier(); -#endif } uint meshlet_get_linear_index() { -#if !MESHLET_PAYLOAD_SUBGROUP - return gl_LocalInvocationIndex; -#elif MESHLET_PAYLOAD_LARGE_WORKGROUP return gl_SubgroupSize * gl_SubgroupID + gl_SubgroupInvocationID; -#else - return gl_SubgroupInvocationID; -#endif } // Overlap load with consumption. @@ -288,7 +193,7 @@ uint meshlet_get_linear_index() #define MESHLET_FETCH_BITPLANES(decoded_value, counts, payload_value, offset) \ for (int i = 0; i < counts; i++) \ { \ - decoded_value |= bitfieldExtract(payload_value, int(wgx_subgroup_invocation_id), 1) << i; \ + decoded_value |= bitfieldExtract(payload_value, local_chunk_index, 1) << i; \ payload_value = payload.data[++offset]; \ } \ decoded_value = bitfieldExtract(int(decoded_value), 0, counts) @@ -301,16 +206,10 @@ uint meshlet_get_linear_index() u8vec4 initial_value_##iter = meshlet_streams.data[unrolled_stream_index].initial_value; \ uvec2 initial_value##iter = pack_u16vec4_to_uvec2(u16vec4(initial_value_##iter)) -#if MESHLET_PAYLOAD_LARGE_WORKGROUP #define MESHLET_PAYLOAD_DECL_CHUNK_OFFSETS(unrolled_stream_index, stream_index, chunk_id, iter) \ uint bitplane_offsets##iter = shared_chunk_offset[stream_index][chunk_id]; \ uint bitplane_value##iter = uint(meshlet_streams.data[unrolled_stream_index].bitplane_meta[chunk_id]); \ ivec4 bit_counts##iter = ivec4(meshlet_decode_bit_counts(bitplane_value##iter)) -#else -#define MESHLET_PAYLOAD_DECL_CHUNK_OFFSETS(unrolled_stream_index, stream_index, chunk_id, iter) \ - uint bitplane_offsets##iter = wgx_shuffle(shared_chunk_offset##iter, chunk_id); \ - ivec4 bit_counts##iter = ivec4(wgx_shuffle(shared_chunk_bit_counts##iter, chunk_id)) -#endif #define MESHLET_PAYLOAD_PROCESS_CHUNK(unrolled_stream_index, stream_index, chunk_id, iter) \ uvec4 decoded##iter = ivec4(0); \ @@ -325,26 +224,16 @@ uint meshlet_get_linear_index() packed_decoded##iter += initial_value##iter; \ packed_decoded##iter += pack_u16vec4_to_uvec2((predictor_a##iter + predictor_b##iter * uint16_t(linear_index)) >> 8us) -#if MESHLET_PAYLOAD_LARGE_WORKGROUP uint meshlet_decode_stream_32_wg256(uint base_stream_index, uint stream_index) { uint unrolled_stream_index = base_stream_index + stream_index; uint linear_index = meshlet_get_linear_index(); - - uint chunk_id = wgx_subgroup_id; + uint chunk_id = linear_index / 32u; + int local_chunk_index = int(linear_index & 31); MESHLET_PAYLOAD_DECL_STREAM(unrolled_stream_index, 0); MESHLET_PAYLOAD_PROCESS_CHUNK(unrolled_stream_index, stream_index, chunk_id, 0); packed_decoded0 = wgx_inclusive_add(packed_decoded0); - - barrier(); // Resolve WAR hazard from last iteration. - if (wgx_subgroup_invocation_id == wgx_subgroup_size - 1) - chunk_values0[chunk_id] = packed_decoded0 & 0xff00ffu; - barrier(); - - for (uint i = 0; i < chunk_id; i++) - packed_decoded0 += chunk_values0[i]; - return repack_uint(packed_decoded0); } @@ -353,106 +242,23 @@ uvec2 meshlet_decode_stream_64_wg256(uint base_stream_index, uint stream_index) // Dual-pump the computation. VGPR use is quite low either way, so this is fine. uint unrolled_stream_index = base_stream_index + stream_index; uint linear_index = meshlet_get_linear_index(); - uint chunk_id = wgx_subgroup_id; + uint chunk_id = linear_index / 32u; + int local_chunk_index = int(linear_index & 31); MESHLET_PAYLOAD_DECL_STREAM(unrolled_stream_index, 0); MESHLET_PAYLOAD_DECL_STREAM(unrolled_stream_index + 1, 1); MESHLET_PAYLOAD_PROCESS_CHUNK(unrolled_stream_index, stream_index, chunk_id, 0); MESHLET_PAYLOAD_PROCESS_CHUNK(unrolled_stream_index + 1, stream_index + 1, chunk_id, 1); uvec4 packed_decoded = wgx_inclusive_add(uvec4(packed_decoded0, packed_decoded1)); - - barrier(); // Resolve WAR hazard from last iteration. - if (wgx_subgroup_invocation_id == wgx_subgroup_size - 1) - { - chunk_values0[chunk_id] = packed_decoded.xy & 0xff00ffu; - chunk_values1[chunk_id] = packed_decoded.zw & 0xff00ffu; - } - barrier(); - - for (uint i = 0; i < chunk_id; i++) - { - packed_decoded.xy += chunk_values0[i]; - packed_decoded.zw += chunk_values1[i]; - } - return uvec2(repack_uint(packed_decoded.xy), repack_uint(packed_decoded.zw)); } -// For large workgroups, we imply AMD, where LocalInvocationIndex indexing is preferred. -// We assume that SubgroupInvocationID == LocalInvocationID.x here since it's the only reasonable it would work. #define MESHLET_DECODE_STREAM_32(meshlet_index, stream_index, report_cb) { \ uint value = meshlet_decode_stream_32_wg256(meshlet_index, stream_index); \ - report_cb(gl_LocalInvocationIndex, value); } + report_cb(meshlet_get_linear_index(), value); } #define MESHLET_DECODE_STREAM_64(meshlet_index, stream_index, report_cb) { \ uvec2 value = meshlet_decode_stream_64_wg256(meshlet_index, stream_index); \ - report_cb(gl_LocalInvocationIndex, value); } - -#else - -// Have to iterate and report once per chunk. Avoids having to spend a lot of LDS memory. -#define MESHLET_DECODE_STREAM_32(base_stream_index, stream_index, report_cb) { \ - uint unrolled_stream_index = base_stream_index + stream_index; \ - uint linear_index = meshlet_get_linear_index(); \ - uvec2 prev_value = uvec2(0); \ - uint shared_chunk_offset0; \ - uvec4 shared_chunk_bit_counts0; \ - uint total_bits0; \ - uint bitplane_value0; \ - if (wgx_subgroup_invocation_id < MESHLET_PAYLOAD_NUM_CHUNKS) \ - bitplane_value0 = uint(meshlet_streams.data[unrolled_stream_index].bitplane_meta[wgx_subgroup_invocation_id]); \ - meshlet_compute_stream_counts(bitplane_value0, total_bits0, shared_chunk_bit_counts0); \ - shared_chunk_offset0 = wgx_exclusive_add8(total_bits0) + meshlet_streams.data[unrolled_stream_index].offset_from_base; \ - MESHLET_PAYLOAD_DECL_STREAM(unrolled_stream_index, 0); \ - [[loop]] \ - for (uint chunk_id = 0; chunk_id < MESHLET_PAYLOAD_NUM_CHUNKS; chunk_id++) \ - { \ - MESHLET_PAYLOAD_PROCESS_CHUNK(unrolled_stream_index, stream_index, chunk_id, 0); \ - packed_decoded0 = wgx_inclusive_add(packed_decoded0); \ - packed_decoded0 += prev_value; \ - prev_value = wgx_broadcast_last(packed_decoded0) & 0xff00ffu; \ - report_cb(linear_index, repack_uint(packed_decoded0)); \ - linear_index += wgx_subgroup_size; \ - } \ -} - -// Have to iterate and report once per chunk. Avoids having to spend a lot of LDS memory. -#define MESHLET_DECODE_STREAM_64(base_stream_index, stream_index, report_cb) { \ - uint unrolled_stream_index = base_stream_index + stream_index; \ - uint linear_index = meshlet_get_linear_index(); \ - uvec4 prev_value = uvec4(0); \ - uint shared_chunk_offset0; \ - uvec4 shared_chunk_bit_counts0; \ - uint shared_chunk_offset1; \ - uvec4 shared_chunk_bit_counts1; \ - uint total_bits0; \ - uint total_bits1; \ - uint bitplane_value0; \ - uint bitplane_value1; \ - if (wgx_subgroup_invocation_id < MESHLET_PAYLOAD_NUM_CHUNKS) \ - { \ - bitplane_value0 = uint(meshlet_streams.data[unrolled_stream_index].bitplane_meta[wgx_subgroup_invocation_id]); \ - bitplane_value1 = uint(meshlet_streams.data[unrolled_stream_index + 1].bitplane_meta[wgx_subgroup_invocation_id]); \ - } \ - meshlet_compute_stream_counts(bitplane_value0, total_bits0, shared_chunk_bit_counts0); \ - meshlet_compute_stream_counts(bitplane_value1, total_bits1, shared_chunk_bit_counts1); \ - shared_chunk_offset0 = wgx_exclusive_add8(total_bits0) + meshlet_streams.data[unrolled_stream_index].offset_from_base; \ - shared_chunk_offset1 = wgx_exclusive_add8(total_bits1) + meshlet_streams.data[unrolled_stream_index + 1].offset_from_base; \ - MESHLET_PAYLOAD_DECL_STREAM(unrolled_stream_index, 0); \ - MESHLET_PAYLOAD_DECL_STREAM(unrolled_stream_index + 1, 1); \ - [[loop]] \ - for (uint chunk_id = 0; chunk_id < MESHLET_PAYLOAD_NUM_CHUNKS; chunk_id++) \ - { \ - MESHLET_PAYLOAD_PROCESS_CHUNK(unrolled_stream_index, stream_index, chunk_id, 0); \ - MESHLET_PAYLOAD_PROCESS_CHUNK(unrolled_stream_index + 1, stream_index + 1, chunk_id, 1); \ - uvec4 packed_decoded = wgx_inclusive_add(uvec4(packed_decoded0, packed_decoded1)); \ - packed_decoded += prev_value; \ - prev_value = wgx_broadcast_last(packed_decoded) & 0xff00ffu; \ - report_cb(linear_index, uvec2(repack_uint(packed_decoded.xy), repack_uint(packed_decoded.zw))); \ - linear_index += wgx_subgroup_size; \ - } \ -} - -#endif + report_cb(meshlet_get_linear_index(), value); } #endif diff --git a/tests/assets/shaders/meshlet_debug.mesh b/tests/assets/shaders/meshlet_debug.mesh index eeedb1e6..219e3a06 100644 --- a/tests/assets/shaders/meshlet_debug.mesh +++ b/tests/assets/shaders/meshlet_debug.mesh @@ -1,18 +1,9 @@ #version 450 #extension GL_EXT_mesh_shader : require -#pragma optimize off - layout(max_primitives = 128, max_vertices = 128, triangles) out; - #include "meshlet_payload_constants.h" - -#if MESHLET_PAYLOAD_LARGE_WORKGROUP -#define MESHLET_PAYLOAD_WG_Y MESHLET_PAYLOAD_NUM_CHUNKS -#else -#define MESHLET_PAYLOAD_WG_Y 1 -#endif -layout(local_size_x = 32, local_size_y = MESHLET_PAYLOAD_WG_Y) in; +layout(local_size_x = 128) in; layout(constant_id = 0) const uint NUM_U32_STREAMS = MESHLET_PAYLOAD_MAX_STREAMS; #define MESHLET_PAYLOAD_NUM_U32_STREAMS NUM_U32_STREAMS @@ -105,10 +96,10 @@ bool cull_triangle(vec2 a, vec2 b, vec2 c) precise vec2 ac = c - a; // This is 100% accurate as long as the primitive is no larger than ~4k subpixels, i.e. 16x16 pixels. - // Accuracy decays after that, but any error causing wrong rendering is imperceptible. + // Normally, we'd be able to do GEQ test, but GE test is conservative, even with FP error in play. precise float neg_area = ab.x * ac.y; precise float pos_area = ab.y * ac.x; - if (neg_area >= pos_area) + if (neg_area > pos_area) return false; // Micropoly test. @@ -180,15 +171,13 @@ void main() barrier(); - if (gl_LocalInvocationID.x == 0 && gl_LocalInvocationID.y == 0) + if (gl_LocalInvocationIndex == 0) { uvec3 num_active_prim = bitCount(shared_active_prim.xyz); num_active_prim.y += num_active_prim.x; num_active_prim.z += num_active_prim.y; shared_active_prim_count = uvec4(0, num_active_prim); - } - else if (gl_LocalInvocationID.x == 0 && gl_LocalInvocationID.y == 1) - { + uvec3 num_active_vert = bitCount(shared_active_vert.xyz); num_active_vert.y += num_active_vert.x; num_active_vert.z += num_active_vert.y; @@ -209,10 +198,10 @@ void main() vDrawID[gl_LocalInvocationIndex] = task.meshlet_index; bool has_active_vert = lane_has_active_vert(linear_index); - uint out_vert_index = compacted_vertex_output(linear_index); if (has_active_vert) { + uint out_vert_index = compacted_vertex_output(linear_index); gl_MeshVerticesEXT[out_vert_index].gl_Position = clip_pos; vWorldPos[out_vert_index] = world_pos; } diff --git a/tests/meshlet_viewer.cpp b/tests/meshlet_viewer.cpp index 7ea713c2..27a880b3 100644 --- a/tests/meshlet_viewer.cpp +++ b/tests/meshlet_viewer.cpp @@ -249,9 +249,10 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V allocator.reset(); } - void render_frame(double, double) override + void render_frame(double frame_time, double) override { scene.update_all_transforms(); + LOGI("Frame time: %.3f ms.\n", frame_time * 1e3); auto &wsi = get_wsi(); auto &device = wsi.get_device(); @@ -371,7 +372,7 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V push.camera_pos = render_context.get_render_parameters().camera_position; const bool use_meshlets = manager.get_mesh_encoding() == Vulkan::ResourceManager::MeshEncoding::Meshlet; - const bool use_preculling = !use_meshlets || true; + const bool use_preculling = !use_meshlets || false; if (use_preculling) { @@ -477,6 +478,11 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V cmd->set_subgroup_size_log2(true, 5, 5, VK_SHADER_STAGE_TASK_BIT_EXT); cmd->set_subgroup_size_log2(true, 5, 5, VK_SHADER_STAGE_MESH_BIT_EXT); } + else + { + cmd->enable_subgroup_size_control(true); + cmd->set_subgroup_size_log2(true, 5, 7, VK_SHADER_STAGE_MESH_BIT_EXT); + } cmd->set_program(use_preculling ? "" : "assets://shaders/meshlet_debug.task", "assets://shaders/meshlet_debug.mesh", From 16d8c69609230c3301d1ef4a7292676dc4405c61 Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Sun, 17 Dec 2023 12:14:48 +0100 Subject: [PATCH 07/59] Use explicit wave32 path. --- assets/shaders/inc/meshlet_payload_decode.h | 12 ++++++++++++ tests/assets/shaders/meshlet_cull.comp | 8 ++++---- tests/assets/shaders/meshlet_debug.task | 6 +++--- tests/meshlet_viewer.cpp | 16 ++++++++-------- 4 files changed, 27 insertions(+), 15 deletions(-) diff --git a/assets/shaders/inc/meshlet_payload_decode.h b/assets/shaders/inc/meshlet_payload_decode.h index ed406658..8bb60387 100644 --- a/assets/shaders/inc/meshlet_payload_decode.h +++ b/assets/shaders/inc/meshlet_payload_decode.h @@ -228,8 +228,14 @@ uint meshlet_decode_stream_32_wg256(uint base_stream_index, uint stream_index) { uint unrolled_stream_index = base_stream_index + stream_index; uint linear_index = meshlet_get_linear_index(); + +#if MESHLET_PAYLOAD_WAVE32 + uint chunk_id = gl_SubgroupID; + int local_chunk_index = int(gl_SubgroupInvocationID); +#else uint chunk_id = linear_index / 32u; int local_chunk_index = int(linear_index & 31); +#endif MESHLET_PAYLOAD_DECL_STREAM(unrolled_stream_index, 0); MESHLET_PAYLOAD_PROCESS_CHUNK(unrolled_stream_index, stream_index, chunk_id, 0); @@ -242,8 +248,14 @@ uvec2 meshlet_decode_stream_64_wg256(uint base_stream_index, uint stream_index) // Dual-pump the computation. VGPR use is quite low either way, so this is fine. uint unrolled_stream_index = base_stream_index + stream_index; uint linear_index = meshlet_get_linear_index(); + +#if MESHLET_PAYLOAD_WAVE32 + uint chunk_id = gl_SubgroupID; + int local_chunk_index = int(gl_SubgroupInvocationID); +#else uint chunk_id = linear_index / 32u; int local_chunk_index = int(linear_index & 31); +#endif MESHLET_PAYLOAD_DECL_STREAM(unrolled_stream_index, 0); MESHLET_PAYLOAD_DECL_STREAM(unrolled_stream_index + 1, 1); diff --git a/tests/assets/shaders/meshlet_cull.comp b/tests/assets/shaders/meshlet_cull.comp index aead9f95..1b4a5d85 100644 --- a/tests/assets/shaders/meshlet_cull.comp +++ b/tests/assets/shaders/meshlet_cull.comp @@ -1,6 +1,6 @@ #version 450 #extension GL_EXT_scalar_block_layout : require -#if MESHLET_PAYLOAD_SUBGROUP +#if MESHLET_PAYLOAD_WAVE32 #extension GL_KHR_shader_subgroup_ballot : require #extension GL_KHR_shader_subgroup_vote : require #extension GL_KHR_shader_subgroup_shuffle : require @@ -47,7 +47,7 @@ layout(push_constant, std430) uniform Registers uint count; } registers; -#if !MESHLET_PAYLOAD_SUBGROUP +#if !MESHLET_PAYLOAD_WAVE32 shared uint ballot_value; shared uint global_offset; @@ -102,7 +102,7 @@ void main() int lane = findLSB(b); b &= ~(1u << lane); -#if MESHLET_PAYLOAD_SUBGROUP +#if MESHLET_PAYLOAD_WAVE32 uint node_instance = subgroupShuffle(task.node_instance, lane); uint node_count_material_index = subgroupShuffle(task.node_count_material_index, lane); uint mesh_index_count = subgroupShuffle(task.mesh_index_count, lane); @@ -130,7 +130,7 @@ void main() uint draw_count = ballotBitCount(ballot); uint local_offset = ballotExclusiveBitCount(ballot); -#if MESHLET_PAYLOAD_SUBGROUP +#if MESHLET_PAYLOAD_WAVE32 uint global_offset; if (subgroupElect()) global_offset = atomicAdd(output_draws.count, draw_count); diff --git a/tests/assets/shaders/meshlet_debug.task b/tests/assets/shaders/meshlet_debug.task index dab1e96a..3f101f72 100644 --- a/tests/assets/shaders/meshlet_debug.task +++ b/tests/assets/shaders/meshlet_debug.task @@ -1,7 +1,7 @@ #version 450 #extension GL_EXT_mesh_shader : require -#if MESHLET_PAYLOAD_SUBGROUP +#if MESHLET_PAYLOAD_WAVE32 #extension GL_KHR_shader_subgroup_arithmetic : require #extension GL_KHR_shader_subgroup_ballot : require #extension GL_KHR_shader_subgroup_shuffle : require @@ -27,7 +27,7 @@ taskPayloadSharedEXT CompactedDrawInfo mesh_payload[32 * 32]; layout(set = 0, binding = 9) buffer Counter { uint task_counter; }; -#if !MESHLET_PAYLOAD_SUBGROUP +#if !MESHLET_PAYLOAD_WAVE32 shared uint ballot_value; uvec4 ballot(bool v) { @@ -82,7 +82,7 @@ void main() int lane = findLSB(b); b &= ~(1u << lane); -#if MESHLET_PAYLOAD_SUBGROUP +#if MESHLET_PAYLOAD_WAVE32 uint node_instance = subgroupShuffle(task.node_instance, lane); uint node_count_material_index = subgroupShuffle(task.node_count_material_index, lane); uint mesh_index_count = subgroupShuffle(task.mesh_index_count, lane); diff --git a/tests/meshlet_viewer.cpp b/tests/meshlet_viewer.cpp index 27a880b3..cf9b7a53 100644 --- a/tests/meshlet_viewer.cpp +++ b/tests/meshlet_viewer.cpp @@ -372,7 +372,7 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V push.camera_pos = render_context.get_render_parameters().camera_position; const bool use_meshlets = manager.get_mesh_encoding() == Vulkan::ResourceManager::MeshEncoding::Meshlet; - const bool use_preculling = !use_meshlets || false; + const bool use_preculling = !use_meshlets || true; if (use_preculling) { @@ -408,9 +408,9 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V if (use_preculling) { auto *indirect = manager.get_indirect_buffer(); - bool supports_subgroup_path = device.supports_subgroup_size_log2(true, 5, 5, VK_SHADER_STAGE_COMPUTE_BIT); + bool supports_wave32 = device.supports_subgroup_size_log2(true, 5, 5, VK_SHADER_STAGE_COMPUTE_BIT); - if (supports_subgroup_path) + if (supports_wave32) { cmd->enable_subgroup_size_control(true); cmd->set_subgroup_size_log2(true, 5, 5, VK_SHADER_STAGE_COMPUTE_BIT); @@ -419,7 +419,7 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V auto command_words = use_meshlets ? 0 : (sizeof(VkDrawIndexedIndirectCommand) / sizeof(uint32_t)); cmd->set_program("assets://shaders/meshlet_cull.comp", - {{"MESHLET_PAYLOAD_SUBGROUP", int(supports_subgroup_path)}, + {{"MESHLET_PAYLOAD_WAVE32", int(supports_wave32)}, {"MESHLET_RENDER_DRAW_WORDS", int(command_words)}}); cmd->set_storage_buffer(0, 0, *aabb_buffer); cmd->set_storage_buffer(0, 1, *cached_transform_buffer); @@ -469,10 +469,10 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V bool large_workgroup = true; //device.get_device_features().mesh_shader_properties.maxPreferredMeshWorkGroupInvocations > 32; - bool supports_subgroup_path = device.supports_subgroup_size_log2(true, 5, 5, VK_SHADER_STAGE_MESH_BIT_EXT) && - device.supports_subgroup_size_log2(true, 5, 5, VK_SHADER_STAGE_TASK_BIT_EXT); + bool supports_wave32 = device.supports_subgroup_size_log2(true, 5, 5, VK_SHADER_STAGE_MESH_BIT_EXT) && + device.supports_subgroup_size_log2(true, 5, 5, VK_SHADER_STAGE_TASK_BIT_EXT); - if (supports_subgroup_path) + if (supports_wave32) { cmd->enable_subgroup_size_control(true); cmd->set_subgroup_size_log2(true, 5, 5, VK_SHADER_STAGE_TASK_BIT_EXT); @@ -488,7 +488,7 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V "assets://shaders/meshlet_debug.mesh", "assets://shaders/meshlet_debug.mesh.frag", {{"MESHLET_PAYLOAD_LARGE_WORKGROUP", int(large_workgroup)}, - {"MESHLET_PAYLOAD_SUBGROUP", int(supports_subgroup_path)}, + {"MESHLET_PAYLOAD_WAVE32", int(supports_wave32)}, {"MESHLET_RENDER_TASK", int(!use_preculling)}}); cmd->set_storage_buffer(0, 0, *aabb_buffer); From c7a492cb6e9721370e2eadef586fc7b24eb63131 Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Sun, 17 Dec 2023 12:38:58 +0100 Subject: [PATCH 08/59] Improve culling test. --- tests/assets/shaders/meshlet_debug.mesh | 67 +++++++++++++++++++------ 1 file changed, 51 insertions(+), 16 deletions(-) diff --git a/tests/assets/shaders/meshlet_debug.mesh b/tests/assets/shaders/meshlet_debug.mesh index 219e3a06..b2599a7a 100644 --- a/tests/assets/shaders/meshlet_debug.mesh +++ b/tests/assets/shaders/meshlet_debug.mesh @@ -63,8 +63,13 @@ shared uvec4 shared_active_prim; shared uvec4 shared_active_vert_count; shared uvec4 shared_active_prim_count; -const uint CLIP_CODE_NEGATIVE_W = 1; -const uint CLIP_CODE_INACCURATE = 2; +const uint CLIP_CODE_INACCURATE = 1 << 0; +const uint CLIP_CODE_NEGATIVE_W = 1 << 1; +const uint CLIP_CODE_NEGATIVE_X = 1 << 2; +const uint CLIP_CODE_NEGATIVE_Y = 1 << 3; +const uint CLIP_CODE_POSITIVE_X = 1 << 4; +const uint CLIP_CODE_POSITIVE_Y = 1 << 5; +const uint CLIP_CODE_PLANES = uint(-1) & ~CLIP_CODE_INACCURATE; uint compacted_vertex_output(uint index) { @@ -97,15 +102,26 @@ bool cull_triangle(vec2 a, vec2 b, vec2 c) // This is 100% accurate as long as the primitive is no larger than ~4k subpixels, i.e. 16x16 pixels. // Normally, we'd be able to do GEQ test, but GE test is conservative, even with FP error in play. - precise float neg_area = ab.x * ac.y; precise float pos_area = ab.y * ac.x; - if (neg_area > pos_area) - return false; + precise float neg_area = ab.x * ac.y; - // Micropoly test. - vec2 lo = floor(ldexp(min(min(a, b), c), ivec2(-8))); - vec2 hi = floor(ldexp(max(max(a, b), c), ivec2(-8))); - return all(notEqual(lo, hi)); + // If the pos value is (-2^24, +2^24), the FP math is exact, if not, we have to be conservative. + // Less-than check is there to ensure that 1.0 delta in neg_area *will* resolve to a different value. + bool active_primitive; + if (abs(pos_area) < 16777216.0) + active_primitive = pos_area > neg_area; + else + active_primitive = pos_area >= neg_area; + + if (active_primitive) + { + // Micropoly test. + vec2 lo = floor(ldexp(min(min(a, b), c), ivec2(-8))); + vec2 hi = floor(ldexp(max(max(a, b), c), ivec2(-8))); + active_primitive = all(notEqual(lo, hi)); + } + + return active_primitive; } void main() @@ -134,9 +150,19 @@ void main() vec3 world_pos = (M * vec4(pos, 1.0)).xyz; vec4 clip_pos = VP * vec4(world_pos, 1.0); vec2 c = clip_pos.xy / clip_pos.w; + uint clip_code = clip_pos.w <= 0.0 ? CLIP_CODE_NEGATIVE_W : 0; if (any(greaterThan(abs(c), vec2(4.0)))) clip_code |= CLIP_CODE_INACCURATE; + if (c.x <= -1.0) + clip_code |= CLIP_CODE_NEGATIVE_X; + if (c.y <= -1.0) + clip_code |= CLIP_CODE_NEGATIVE_Y; + if (c.x >= 1.0) + clip_code |= CLIP_CODE_POSITIVE_X; + if (c.y >= 1.0) + clip_code |= CLIP_CODE_POSITIVE_Y; + vec2 window = roundEven(c * viewport.zw + viewport.xy); shared_window_positions[linear_index] = window; shared_clip_code[linear_index] = uint8_t(clip_code); @@ -146,19 +172,28 @@ void main() bool is_active_prim = false; if (linear_index < meta.num_primitives) { - vec2 a = shared_window_positions[prim.x]; - vec2 b = shared_window_positions[prim.y]; - vec2 c = shared_window_positions[prim.z]; uint code_a = shared_clip_code[prim.x]; uint code_b = shared_clip_code[prim.y]; uint code_c = shared_clip_code[prim.z]; - uint or_code_a = code_a | code_b | code_c; - uint and_code_a = code_a & code_b & code_c; + uint or_code = code_a | code_b | code_c; + uint and_code = code_a & code_b & code_c; + + bool culled_planes = (and_code & CLIP_CODE_PLANES) != 0; - if ((and_code_a & CLIP_CODE_NEGATIVE_W) == 0) + if (!culled_planes) { - if ((or_code_a & CLIP_CODE_INACCURATE) != 0 || cull_triangle(a, b, c)) + bool force_accept = (or_code & (CLIP_CODE_INACCURATE | CLIP_CODE_NEGATIVE_W)) != 0; + + if (!force_accept) + { + vec2 a = shared_window_positions[prim.x]; + vec2 b = shared_window_positions[prim.y]; + vec2 c = shared_window_positions[prim.z]; + force_accept = cull_triangle(a, b, c); + } + + if (force_accept) { is_active_prim = true; atomicOr(shared_active_prim[linear_index / 32], 1u << (linear_index & 31)); From 81868a26f2d768f9e3da68d6a4e277afd58a6365 Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Sun, 17 Dec 2023 12:43:02 +0100 Subject: [PATCH 09/59] Get rid of "minus_1". --- assets/shaders/decode/meshlet_decode.comp | 18 +++++++++--------- assets/shaders/inc/meshlet_payload_decode.h | 4 ++-- scene-export/meshlet_export.cpp | 4 ++-- vulkan/managers/resource_manager.cpp | 4 ++-- vulkan/mesh/meshlet.cpp | 8 ++++---- vulkan/mesh/meshlet.hpp | 4 ++-- 6 files changed, 21 insertions(+), 21 deletions(-) diff --git a/assets/shaders/decode/meshlet_decode.comp b/assets/shaders/decode/meshlet_decode.comp index 078bf34e..0581609f 100644 --- a/assets/shaders/decode/meshlet_decode.comp +++ b/assets/shaders/decode/meshlet_decode.comp @@ -106,7 +106,7 @@ void main() if (!RAW_PAYLOAD) { IndirectIndexedDraw draw; - draw.indexCount = 3 * (meta.num_primitives_minus_1 + 1); + draw.indexCount = 3 * meta.num_primitives; draw.instanceCount = 1; draw.vertexOffset = meta.base_vertex_offset + registers.vertex_offset; draw.firstIndex = 3 * (output_offset_strides.data[meshlet_index].x + registers.primitive_offset); @@ -121,12 +121,12 @@ void main() indices += meta.base_vertex_offset + registers.vertex_offset; \ output_offset = output_offset_strides.data[meshlet_index * NUM_OUTPUT_U32_STREAMS].x; \ output_offset += registers.primitive_offset; \ - if (linear_index <= uint(meta.num_primitives_minus_1)) \ + if (linear_index < uint(meta.num_primitives)) \ output_indices32.data[output_offset + linear_index] = indices; \ } else { \ output_offset = output_offset_strides.data[meshlet_index].x; \ output_offset += registers.primitive_offset; \ - if (linear_index <= uint(meta.num_primitives_minus_1)) \ + if (linear_index < uint(meta.num_primitives)) \ output_indices8.data[output_offset + linear_index] = unpack8(packed_indices).xyz; \ } \ } @@ -140,7 +140,7 @@ void main() #define ATTR(linear_index, packed_decoded) { \ uvec2 output_offset_stride0 = output_offset_strides.data[meshlet_index * NUM_OUTPUT_U32_STREAMS + i]; \ output_offset_stride0.x += registers.vertex_offset; \ - if (linear_index <= uint(meta.num_attributes_minus_1)) \ + if (linear_index < uint(meta.num_attributes)) \ output_stream_raw.data[output_offset_stride0.x + linear_index * output_offset_stride0.y] = packed_decoded; \ } @@ -155,30 +155,30 @@ void main() output_offset += registers.vertex_offset; #define POS(linear_index, packed_decoded) { \ - if (linear_index <= uint(meta.num_attributes_minus_1)) \ + if (linear_index < uint(meta.num_attributes)) \ output_stream_pos.data[output_offset + linear_index] = attribute_decode_snorm_exp_position(packed_decoded); \ } #define NORMAL(linear_index, packed_decoded) { \ - if (linear_index <= uint(meta.num_attributes_minus_1)) { \ + if (linear_index < uint(meta.num_attributes)) { \ output_stream_textured_attr.data[output_offset + linear_index].normal = pack_a2bgr10(attribute_decode_oct8_normal_tangent(packed_decoded)); \ } \ } #define TANGENT(linear_index, packed_decoded) { \ - if (linear_index <= uint(meta.num_attributes_minus_1)) { \ + if (linear_index < uint(meta.num_attributes)) { \ output_stream_textured_attr.data[output_offset + linear_index].tangent = pack_a2bgr10(attribute_decode_oct8_normal_tangent(packed_decoded)); \ } \ } #define UV(linear_index, packed_decoded) { \ - if (linear_index <= uint(meta.num_attributes_minus_1)) { \ + if (linear_index < uint(meta.num_attributes)) { \ output_stream_textured_attr.data[output_offset + linear_index].uv = attribute_decode_snorm_exp_uv(packed_decoded); \ } \ } #define SKIN(linear_index, packed_decoded) { \ - if (linear_index <= uint(meta.num_attributes_minus_1)) { \ + if (linear_index < uint(meta.num_attributes)) { \ output_stream_skin.data[output_offset + linear_index] = packed_decoded; \ } \ } diff --git a/assets/shaders/inc/meshlet_payload_decode.h b/assets/shaders/inc/meshlet_payload_decode.h index 8bb60387..b14e2575 100644 --- a/assets/shaders/inc/meshlet_payload_decode.h +++ b/assets/shaders/inc/meshlet_payload_decode.h @@ -48,8 +48,8 @@ struct MeshletStream struct MeshletMetaRaw { uint base_vertex_offset; - uint8_t num_primitives_minus_1; - uint8_t num_attributes_minus_1; + uint8_t num_primitives; + uint8_t num_attributes; uint16_t reserved; }; diff --git a/scene-export/meshlet_export.cpp b/scene-export/meshlet_export.cpp index 3905f1c9..bd4f08a6 100644 --- a/scene-export/meshlet_export.cpp +++ b/scene-export/meshlet_export.cpp @@ -471,8 +471,8 @@ static void encode_mesh(Encoded &encoded, u8vec4 stream_buffer[MaxElements]; meshlet.base_vertex_offset = base_vertex_offset; - meshlet.num_primitives_minus_1 = analysis_result.num_primitives - 1; - meshlet.num_attributes_minus_1 = analysis_result.num_vertices - 1; + meshlet.num_primitives = analysis_result.num_primitives; + meshlet.num_attributes = analysis_result.num_vertices; meshlet.reserved = 0; // Encode index buffer. diff --git a/vulkan/managers/resource_manager.cpp b/vulkan/managers/resource_manager.cpp index 587c47cc..ac3996dc 100644 --- a/vulkan/managers/resource_manager.cpp +++ b/vulkan/managers/resource_manager.cpp @@ -451,8 +451,8 @@ void ResourceManager::instantiate_asset_mesh(Granite::AssetManager &manager_, for (uint32_t i = 0, n = view.format_header->meshlet_count; i < n; i++) { headers[i].stream_offset = asset.mesh.attr_or_stream.offset + i * view.format_header->u32_stream_count; - headers[i].num_attributes = view.headers[i].num_attributes_minus_1 + 1; - headers[i].num_primitives = view.headers[i].num_primitives_minus_1 + 1; + headers[i].num_attributes = view.headers[i].num_attributes; + headers[i].num_primitives = view.headers[i].num_primitives; } auto *bounds = static_cast( diff --git a/vulkan/mesh/meshlet.cpp b/vulkan/mesh/meshlet.cpp index ce98aaad..546af8d0 100644 --- a/vulkan/mesh/meshlet.cpp +++ b/vulkan/mesh/meshlet.cpp @@ -78,8 +78,8 @@ MeshView create_mesh_view(const Granite::FileMapping &mapping) for (uint32_t i = 0, n = view.format_header->meshlet_count; i < n; i++) { - view.total_primitives += view.headers[i].num_primitives_minus_1 + 1; - view.total_vertices += view.headers[i].num_attributes_minus_1 + 1; + view.total_primitives += view.headers[i].num_primitives; + view.total_vertices += view.headers[i].num_attributes; } return view; @@ -178,7 +178,7 @@ bool decode_mesh(CommandBuffer &cmd, const DecodeInfo &info, const MeshView &vie for (uint32_t i = 0; i < view.format_header->meshlet_count; i++) { decode_offsets.push_back({ index_count, 0 }); - index_count += view.headers[i].num_primitives_minus_1 + 1; + index_count += view.headers[i].num_primitives; for (uint32_t j = 0; j < output_u32_streams; j++) decode_offsets.push_back({ view.headers[i].base_vertex_offset * output_u32_streams + j, output_u32_streams }); } @@ -214,7 +214,7 @@ bool decode_mesh(CommandBuffer &cmd, const DecodeInfo &info, const MeshView &vie for (uint32_t i = 0; i < view.format_header->meshlet_count; i++) { decode_offsets.push_back({ index_count, view.headers[i].base_vertex_offset }); - index_count += view.headers[i].num_primitives_minus_1 + 1; + index_count += view.headers[i].num_primitives; } cmd.set_specialization_constant(1, uint32_t(info.target_style)); diff --git a/vulkan/mesh/meshlet.hpp b/vulkan/mesh/meshlet.hpp index dc9f6c4a..1319d002 100644 --- a/vulkan/mesh/meshlet.hpp +++ b/vulkan/mesh/meshlet.hpp @@ -55,8 +55,8 @@ struct Stream struct Header { uint32_t base_vertex_offset; - uint8_t num_primitives_minus_1; - uint8_t num_attributes_minus_1; + uint8_t num_primitives; + uint8_t num_attributes; uint16_t reserved; }; From a8a9f5a9c52c62199601e808e0467c3fd37f5a2a Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Sun, 17 Dec 2023 12:49:21 +0100 Subject: [PATCH 10/59] Move total count to scalar math. --- tests/assets/shaders/meshlet_debug.mesh | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/assets/shaders/meshlet_debug.mesh b/tests/assets/shaders/meshlet_debug.mesh index b2599a7a..a5a934c6 100644 --- a/tests/assets/shaders/meshlet_debug.mesh +++ b/tests/assets/shaders/meshlet_debug.mesh @@ -62,6 +62,8 @@ shared uvec4 shared_active_vert; shared uvec4 shared_active_prim; shared uvec4 shared_active_vert_count; shared uvec4 shared_active_prim_count; +shared uint shared_active_vert_count_total; +shared uint shared_active_prim_count_total; const uint CLIP_CODE_INACCURATE = 1 << 0; const uint CLIP_CODE_NEGATIVE_W = 1 << 1; @@ -217,12 +219,15 @@ void main() num_active_vert.y += num_active_vert.x; num_active_vert.z += num_active_vert.y; shared_active_vert_count = uvec4(0, num_active_vert); + + shared_active_prim_count_total = num_active_prim.z + bitCount(shared_active_prim.w); + shared_active_vert_count_total = num_active_vert.z + bitCount(shared_active_vert.w); } barrier(); - uint num_verts = shared_active_vert_count.w + bitCount(shared_active_vert.w); - uint num_prims = shared_active_prim_count.w + bitCount(shared_active_prim.w); + uint num_verts = shared_active_vert_count_total; + uint num_prims = shared_active_prim_count_total; SetMeshOutputsEXT(num_verts, num_prims); From fb6fe8bc9c499d3fc28ae3d1c7c5027a6d21dbe2 Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Sun, 17 Dec 2023 13:16:39 +0100 Subject: [PATCH 11/59] Experiment with local-invocation indexing --- tests/assets/shaders/meshlet_debug.mesh | 40 +++++++++++++++++-------- 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/tests/assets/shaders/meshlet_debug.mesh b/tests/assets/shaders/meshlet_debug.mesh index a5a934c6..a122016e 100644 --- a/tests/assets/shaders/meshlet_debug.mesh +++ b/tests/assets/shaders/meshlet_debug.mesh @@ -64,6 +64,11 @@ shared uvec4 shared_active_vert_count; shared uvec4 shared_active_prim_count; shared uint shared_active_vert_count_total; shared uint shared_active_prim_count_total; +shared vec4 shared_clip_positions[MESHLET_PAYLOAD_NUM_CHUNKS * 32]; +shared vec3 shared_positions[MESHLET_PAYLOAD_NUM_CHUNKS * 32]; +shared uint shared_attr_x[MESHLET_PAYLOAD_NUM_CHUNKS * 32]; +shared uint shared_attr_y[MESHLET_PAYLOAD_NUM_CHUNKS * 32]; +shared u8vec4 shared_prim_u8[MESHLET_PAYLOAD_NUM_CHUNKS * 32]; const uint CLIP_CODE_INACCURATE = 1 << 0; const uint CLIP_CODE_NEGATIVE_W = 1 << 1; @@ -88,11 +93,10 @@ bool lane_has_active_vert(uint index) return (shared_active_vert[index / 32u] & (1u << (index & 31u))) != 0u; } -uvec3 remap_index_buffer(uvec3 prim) +u8vec4 remap_index_buffer(u8vec4 prim_u8) { - return uvec3(compacted_vertex_output(prim.x), - compacted_vertex_output(prim.y), - compacted_vertex_output(prim.z)); + uvec3 prim = uvec3(prim_u8.xyz); + return u8vec4(uvec4(compacted_vertex_output(prim.x), compacted_vertex_output(prim.y), compacted_vertex_output(prim.z), 0)); } bool cull_triangle(vec2 a, vec2 b, vec2 c) @@ -141,8 +145,8 @@ void main() uint linear_index = meshlet_get_linear_index(); mat4 M = transforms.data[task.node_offset]; - uvec3 prim; -#define INDEX(index, value) prim = uvec3(unpack8(value).xyz) + u8vec4 prim_u8; +#define INDEX(index, value) prim_u8 = unpack8(value) MESHLET_DECODE_STREAM_32(meta.stream_offset, 0, INDEX); vec3 pos; @@ -174,6 +178,7 @@ void main() bool is_active_prim = false; if (linear_index < meta.num_primitives) { + uvec3 prim = uvec3(prim_u8.xyz); uint code_a = shared_clip_code[prim.x]; uint code_b = shared_clip_code[prim.y]; uint code_c = shared_clip_code[prim.z]; @@ -232,20 +237,31 @@ void main() SetMeshOutputsEXT(num_verts, num_prims); if (is_active_prim) - gl_PrimitiveTriangleIndicesEXT[compacted_index_output(linear_index)] = remap_index_buffer(prim); - - if (gl_LocalInvocationIndex < num_prims) - vDrawID[gl_LocalInvocationIndex] = task.meshlet_index; + shared_prim_u8[compacted_index_output(linear_index)] = remap_index_buffer(prim_u8); bool has_active_vert = lane_has_active_vert(linear_index); if (has_active_vert) { uint out_vert_index = compacted_vertex_output(linear_index); - gl_MeshVerticesEXT[out_vert_index].gl_Position = clip_pos; - vWorldPos[out_vert_index] = world_pos; + shared_positions[out_vert_index] = world_pos; + shared_clip_positions[out_vert_index] = clip_pos; } + barrier(); + + if (gl_LocalInvocationIndex < num_prims) + { + gl_PrimitiveTriangleIndicesEXT[gl_LocalInvocationIndex] = uvec3(shared_prim_u8[gl_LocalInvocationIndex].xyz); + vDrawID[gl_LocalInvocationIndex] = task.meshlet_index; + } + + if (gl_LocalInvocationIndex < num_verts) + { + gl_MeshVerticesEXT[gl_LocalInvocationIndex].gl_Position = shared_clip_positions[gl_LocalInvocationIndex]; + vWorldPos[gl_LocalInvocationIndex] = shared_positions[gl_LocalInvocationIndex]; + } + #if 0 #define NORMAL(index, value) \ if (index < meta.num_attributes) \ From 16b5ce070f6a0e82d87a7337dfe396bdc8af07d9 Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Sun, 17 Dec 2023 13:16:49 +0100 Subject: [PATCH 12/59] Revert "Experiment with local-invocation indexing" This reverts commit fb6fe8bc9c499d3fc28ae3d1c7c5027a6d21dbe2. --- tests/assets/shaders/meshlet_debug.mesh | 40 ++++++++----------------- 1 file changed, 12 insertions(+), 28 deletions(-) diff --git a/tests/assets/shaders/meshlet_debug.mesh b/tests/assets/shaders/meshlet_debug.mesh index a122016e..a5a934c6 100644 --- a/tests/assets/shaders/meshlet_debug.mesh +++ b/tests/assets/shaders/meshlet_debug.mesh @@ -64,11 +64,6 @@ shared uvec4 shared_active_vert_count; shared uvec4 shared_active_prim_count; shared uint shared_active_vert_count_total; shared uint shared_active_prim_count_total; -shared vec4 shared_clip_positions[MESHLET_PAYLOAD_NUM_CHUNKS * 32]; -shared vec3 shared_positions[MESHLET_PAYLOAD_NUM_CHUNKS * 32]; -shared uint shared_attr_x[MESHLET_PAYLOAD_NUM_CHUNKS * 32]; -shared uint shared_attr_y[MESHLET_PAYLOAD_NUM_CHUNKS * 32]; -shared u8vec4 shared_prim_u8[MESHLET_PAYLOAD_NUM_CHUNKS * 32]; const uint CLIP_CODE_INACCURATE = 1 << 0; const uint CLIP_CODE_NEGATIVE_W = 1 << 1; @@ -93,10 +88,11 @@ bool lane_has_active_vert(uint index) return (shared_active_vert[index / 32u] & (1u << (index & 31u))) != 0u; } -u8vec4 remap_index_buffer(u8vec4 prim_u8) +uvec3 remap_index_buffer(uvec3 prim) { - uvec3 prim = uvec3(prim_u8.xyz); - return u8vec4(uvec4(compacted_vertex_output(prim.x), compacted_vertex_output(prim.y), compacted_vertex_output(prim.z), 0)); + return uvec3(compacted_vertex_output(prim.x), + compacted_vertex_output(prim.y), + compacted_vertex_output(prim.z)); } bool cull_triangle(vec2 a, vec2 b, vec2 c) @@ -145,8 +141,8 @@ void main() uint linear_index = meshlet_get_linear_index(); mat4 M = transforms.data[task.node_offset]; - u8vec4 prim_u8; -#define INDEX(index, value) prim_u8 = unpack8(value) + uvec3 prim; +#define INDEX(index, value) prim = uvec3(unpack8(value).xyz) MESHLET_DECODE_STREAM_32(meta.stream_offset, 0, INDEX); vec3 pos; @@ -178,7 +174,6 @@ void main() bool is_active_prim = false; if (linear_index < meta.num_primitives) { - uvec3 prim = uvec3(prim_u8.xyz); uint code_a = shared_clip_code[prim.x]; uint code_b = shared_clip_code[prim.y]; uint code_c = shared_clip_code[prim.z]; @@ -237,31 +232,20 @@ void main() SetMeshOutputsEXT(num_verts, num_prims); if (is_active_prim) - shared_prim_u8[compacted_index_output(linear_index)] = remap_index_buffer(prim_u8); + gl_PrimitiveTriangleIndicesEXT[compacted_index_output(linear_index)] = remap_index_buffer(prim); + + if (gl_LocalInvocationIndex < num_prims) + vDrawID[gl_LocalInvocationIndex] = task.meshlet_index; bool has_active_vert = lane_has_active_vert(linear_index); if (has_active_vert) { uint out_vert_index = compacted_vertex_output(linear_index); - shared_positions[out_vert_index] = world_pos; - shared_clip_positions[out_vert_index] = clip_pos; + gl_MeshVerticesEXT[out_vert_index].gl_Position = clip_pos; + vWorldPos[out_vert_index] = world_pos; } - barrier(); - - if (gl_LocalInvocationIndex < num_prims) - { - gl_PrimitiveTriangleIndicesEXT[gl_LocalInvocationIndex] = uvec3(shared_prim_u8[gl_LocalInvocationIndex].xyz); - vDrawID[gl_LocalInvocationIndex] = task.meshlet_index; - } - - if (gl_LocalInvocationIndex < num_verts) - { - gl_MeshVerticesEXT[gl_LocalInvocationIndex].gl_Position = shared_clip_positions[gl_LocalInvocationIndex]; - vWorldPos[gl_LocalInvocationIndex] = shared_positions[gl_LocalInvocationIndex]; - } - #if 0 #define NORMAL(index, value) \ if (index < meta.num_attributes) \ From 77768dc2fefc678b3fef5dd51edfc5a6c6fe454f Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Sun, 17 Dec 2023 14:17:53 +0100 Subject: [PATCH 13/59] Experiment with shading directly from decoded data. --- assets/shaders/decode/meshlet_decode.comp | 24 ++ assets/shaders/inc/meshlet_render_types.h | 8 + tests/assets/shaders/meshlet_debug_plain.mesh | 264 ++++++++++++++++++ tests/meshlet_viewer.cpp | 93 ++---- vulkan/managers/resource_manager.cpp | 66 +++-- vulkan/managers/resource_manager.hpp | 12 +- vulkan/mesh/meshlet.cpp | 21 +- vulkan/mesh/meshlet.hpp | 15 + 8 files changed, 382 insertions(+), 121 deletions(-) create mode 100644 tests/assets/shaders/meshlet_debug_plain.mesh diff --git a/assets/shaders/decode/meshlet_decode.comp b/assets/shaders/decode/meshlet_decode.comp index 0581609f..d02d89ad 100644 --- a/assets/shaders/decode/meshlet_decode.comp +++ b/assets/shaders/decode/meshlet_decode.comp @@ -70,6 +70,20 @@ layout(set = 0, binding = 7, std430) readonly buffer OutputOffsets uvec2 data[]; } output_offset_strides; +#if MESHLET_PAYLOAD_RUNTIME_MESH +struct IndirectDrawMesh +{ + uint primitive_offset; + uint vertex_offset; + uint num_primitives; + uint num_attributes; +}; + +layout(set = 0, binding = 8, std430) writeonly buffer IndirectCommands +{ + IndirectDrawMesh draws[]; +} indirect_commands_mesh; +#else struct IndirectIndexedDraw { uint indexCount; @@ -83,6 +97,7 @@ layout(set = 0, binding = 8, std430) writeonly buffer IndirectCommands { IndirectIndexedDraw draws[]; } indirect_commands; +#endif layout(push_constant, std430) uniform Registers { @@ -105,6 +120,14 @@ void main() if (!RAW_PAYLOAD) { +#if MESHLET_PAYLOAD_RUNTIME_MESH + IndirectDrawMesh draw; + draw.primitive_offset = output_offset_strides.data[meshlet_index].x + registers.primitive_offset; + draw.vertex_offset = meta.base_vertex_offset + registers.vertex_offset; + draw.num_primitives = meta.num_primitives; + draw.num_attributes = meta.num_attributes; + indirect_commands_mesh.draws[meshlet_index + registers.meshlet_offset] = draw; +#else IndirectIndexedDraw draw; draw.indexCount = 3 * meta.num_primitives; draw.instanceCount = 1; @@ -112,6 +135,7 @@ void main() draw.firstIndex = 3 * (output_offset_strides.data[meshlet_index].x + registers.primitive_offset); draw.firstInstance = 0; indirect_commands.draws[meshlet_index + registers.meshlet_offset] = draw; +#endif } #define INDEX(linear_index, packed_indices) { \ diff --git a/assets/shaders/inc/meshlet_render_types.h b/assets/shaders/inc/meshlet_render_types.h index 76da97ca..d2e785a6 100644 --- a/assets/shaders/inc/meshlet_render_types.h +++ b/assets/shaders/inc/meshlet_render_types.h @@ -27,6 +27,14 @@ struct CompactedDrawInfo uint node_count_material_offset; }; +struct IndirectDrawMesh +{ + uint primitive_offset; + uint vertex_offset; + uint num_primitives; + uint num_attributes; +}; + #if defined(MESHLET_RENDER_DRAW_WORDS) && MESHLET_RENDER_DRAW_WORDS struct MeshletDrawCommand { diff --git a/tests/assets/shaders/meshlet_debug_plain.mesh b/tests/assets/shaders/meshlet_debug_plain.mesh new file mode 100644 index 00000000..60b2083f --- /dev/null +++ b/tests/assets/shaders/meshlet_debug_plain.mesh @@ -0,0 +1,264 @@ +#version 450 +#extension GL_EXT_mesh_shader : require +#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require +#extension GL_EXT_scalar_block_layout : require + +layout(max_primitives = 128, max_vertices = 128, triangles) out; +layout(local_size_x = 128) in; + +#include "meshlet_render_types.h" + +layout(location = 0) out vec3 vWorldPos[]; +layout(location = 1) perprimitiveEXT out uint vDrawID[]; + +layout(set = 1, binding = 0) uniform UBO +{ + mat4 VP; +}; + +layout(set = 1, binding = 2) uniform UBOViewport +{ + vec4 viewport; +}; + +layout(set = 0, binding = 0, scalar) readonly buffer IBO +{ + u8vec3 data[]; +} ibo; + +layout(set = 0, binding = 1, scalar) readonly buffer VBOPOS +{ + vec3 data[]; +} pos; + +layout(set = 0, binding = 3, std430) readonly buffer IndirectCommands +{ + layout(offset = 256) IndirectDrawMesh draws[]; +} indirect_commands_mesh; + +layout(set = 0, binding = 4, std430) readonly buffer CompactedDraws +{ + CompactedDrawInfo mesh_payload[]; +}; + +layout(set = 0, binding = 5, std430) readonly buffer Transforms +{ + mat4 data[]; +} transforms; + +#define MESHLET_CULL 1 + +#if MESHLET_CULL +shared vec2 shared_window_positions[128]; +shared uint8_t shared_clip_code[128]; +shared uvec4 shared_active_vert; +shared uvec4 shared_active_prim; +shared uvec4 shared_active_vert_count; +shared uvec4 shared_active_prim_count; +shared uint shared_active_vert_count_total; +shared uint shared_active_prim_count_total; + +const uint CLIP_CODE_INACCURATE = 1 << 0; +const uint CLIP_CODE_NEGATIVE_W = 1 << 1; +const uint CLIP_CODE_NEGATIVE_X = 1 << 2; +const uint CLIP_CODE_NEGATIVE_Y = 1 << 3; +const uint CLIP_CODE_POSITIVE_X = 1 << 4; +const uint CLIP_CODE_POSITIVE_Y = 1 << 5; +const uint CLIP_CODE_PLANES = uint(-1) & ~CLIP_CODE_INACCURATE; + +uint compacted_vertex_output(uint index) +{ + return shared_active_vert_count[index / 32u] + bitCount(shared_active_vert[index / 32u] & ((1u << (index & 31u)) - 1u)); +} + +uint compacted_index_output(uint index) +{ + return shared_active_prim_count[index / 32u] + bitCount(shared_active_prim[index / 32u] & ((1u << (index & 31u)) - 1u)); +} + +bool lane_has_active_vert(uint index) +{ + return (shared_active_vert[index / 32u] & (1u << (index & 31u))) != 0u; +} + +uvec3 remap_index_buffer(uvec3 prim) +{ + return uvec3(compacted_vertex_output(prim.x), + compacted_vertex_output(prim.y), + compacted_vertex_output(prim.z)); +} + +bool cull_triangle(vec2 a, vec2 b, vec2 c) +{ + // To be completely accurate, this should be done in fixed point, + // but we can YOLO a bit since glitches in extreme edge cases are considered okay. + precise vec2 ab = b - a; + precise vec2 ac = c - a; + + // This is 100% accurate as long as the primitive is no larger than ~4k subpixels, i.e. 16x16 pixels. + // Normally, we'd be able to do GEQ test, but GE test is conservative, even with FP error in play. + precise float pos_area = ab.y * ac.x; + precise float neg_area = ab.x * ac.y; + + // If the pos value is (-2^24, +2^24), the FP math is exact, if not, we have to be conservative. + // Less-than check is there to ensure that 1.0 delta in neg_area *will* resolve to a different value. + bool active_primitive; + if (abs(pos_area) < 16777216.0) + active_primitive = pos_area > neg_area; + else + active_primitive = pos_area >= neg_area; + + if (active_primitive) + { + // Micropoly test. + vec2 lo = floor(ldexp(min(min(a, b), c), ivec2(-8))); + vec2 hi = floor(ldexp(max(max(a, b), c), ivec2(-8))); + active_primitive = all(notEqual(lo, hi)); + } + + return active_primitive; +} +#endif + +void main() +{ +#if MESHLET_CULL + if (gl_LocalInvocationIndex < 4) + { + shared_active_vert[gl_LocalInvocationIndex] = 0; + shared_active_prim[gl_LocalInvocationIndex] = 0; + } +#endif + + IndirectDrawMesh meshlet = indirect_commands_mesh.draws[gl_WorkGroupID.x]; + CompactedDrawInfo task = mesh_payload[gl_WorkGroupID.x]; + uint linear_index = gl_LocalInvocationIndex; + mat4 M = transforms.data[task.node_offset]; + +#if MESHLET_CULL + vec3 world_pos; + vec4 clip_pos; + uvec3 prim; + + if (linear_index < meshlet.num_attributes) + { + vec3 pos = pos.data[meshlet.vertex_offset + linear_index]; + world_pos = (M * vec4(pos, 1.0)).xyz; + clip_pos = VP * vec4(world_pos, 1.0); + + vec2 c = clip_pos.xy / clip_pos.w; + + uint clip_code = clip_pos.w <= 0.0 ? CLIP_CODE_NEGATIVE_W : 0; + if (any(greaterThan(abs(c), vec2(4.0)))) + clip_code |= CLIP_CODE_INACCURATE; + if (c.x <= -1.0) + clip_code |= CLIP_CODE_NEGATIVE_X; + if (c.y <= -1.0) + clip_code |= CLIP_CODE_NEGATIVE_Y; + if (c.x >= 1.0) + clip_code |= CLIP_CODE_POSITIVE_X; + if (c.y >= 1.0) + clip_code |= CLIP_CODE_POSITIVE_Y; + + vec2 window = roundEven(c * viewport.zw + viewport.xy); + shared_window_positions[linear_index] = window; + shared_clip_code[linear_index] = uint8_t(clip_code); + } + + barrier(); + + bool is_active_prim = false; + if (linear_index < meshlet.num_primitives) + { + prim = uvec3(ibo.data[meshlet.primitive_offset + linear_index]); + uint code_a = shared_clip_code[prim.x]; + uint code_b = shared_clip_code[prim.y]; + uint code_c = shared_clip_code[prim.z]; + + uint or_code = code_a | code_b | code_c; + uint and_code = code_a & code_b & code_c; + + bool culled_planes = (and_code & CLIP_CODE_PLANES) != 0; + + if (!culled_planes) + { + bool force_accept = (or_code & (CLIP_CODE_INACCURATE | CLIP_CODE_NEGATIVE_W)) != 0; + + if (!force_accept) + { + vec2 a = shared_window_positions[prim.x]; + vec2 b = shared_window_positions[prim.y]; + vec2 c = shared_window_positions[prim.z]; + force_accept = cull_triangle(a, b, c); + } + + if (force_accept) + { + is_active_prim = true; + atomicOr(shared_active_prim[linear_index / 32], 1u << (linear_index & 31)); + atomicOr(shared_active_vert[prim.x / 32], 1u << (prim.x & 31)); + atomicOr(shared_active_vert[prim.y / 32], 1u << (prim.y & 31)); + atomicOr(shared_active_vert[prim.z / 32], 1u << (prim.z & 31)); + } + } + } + + barrier(); + + if (gl_LocalInvocationIndex == 0) + { + uvec3 num_active_prim = bitCount(shared_active_prim.xyz); + num_active_prim.y += num_active_prim.x; + num_active_prim.z += num_active_prim.y; + shared_active_prim_count = uvec4(0, num_active_prim); + + uvec3 num_active_vert = bitCount(shared_active_vert.xyz); + num_active_vert.y += num_active_vert.x; + num_active_vert.z += num_active_vert.y; + shared_active_vert_count = uvec4(0, num_active_vert); + + shared_active_prim_count_total = num_active_prim.z + bitCount(shared_active_prim.w); + shared_active_vert_count_total = num_active_vert.z + bitCount(shared_active_vert.w); + } + + barrier(); + + uint num_verts = shared_active_vert_count_total; + uint num_prims = shared_active_prim_count_total; + + SetMeshOutputsEXT(num_verts, num_prims); + + if (is_active_prim) + gl_PrimitiveTriangleIndicesEXT[compacted_index_output(linear_index)] = remap_index_buffer(prim); + + if (gl_LocalInvocationIndex < num_prims) + vDrawID[gl_LocalInvocationIndex] = task.meshlet_index; + + bool has_active_vert = lane_has_active_vert(linear_index); + if (has_active_vert) + { + uint out_vert_index = compacted_vertex_output(linear_index); + gl_MeshVerticesEXT[out_vert_index].gl_Position = clip_pos; + vWorldPos[out_vert_index] = world_pos; + } +#else + SetMeshOutputsEXT(meshlet.num_attributes, meshlet.num_primitives); + if (linear_index < meshlet.num_attributes) + { + vec3 pos = pos.data[meshlet.vertex_offset + linear_index]; + vec3 world_pos = (M * vec4(pos, 1.0)).xyz; + vec4 clip_pos = VP * vec4(world_pos, 1.0); + + gl_MeshVerticesEXT[linear_index].gl_Position = clip_pos; + vWorldPos[linear_index] = world_pos; + } + + if (linear_index < meshlet.num_primitives) + { + uvec3 prim = uvec3(ibo.data[meshlet.primitive_offset + linear_index]); + gl_PrimitiveTriangleIndicesEXT[linear_index] = prim; + vDrawID[linear_index] = task.meshlet_index; + } +#endif +} diff --git a/tests/meshlet_viewer.cpp b/tests/meshlet_viewer.cpp index cf9b7a53..19acdc80 100644 --- a/tests/meshlet_viewer.cpp +++ b/tests/meshlet_viewer.cpp @@ -378,7 +378,7 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V { BufferCreateInfo info; if (use_meshlets) - info.size = 16; + info.size = max_draws * sizeof(Vulkan::Meshlet::RuntimeHeaderDecoded) + 256; else info.size = max_draws * sizeof(VkDrawIndexedIndirectCommand) + 256; @@ -416,7 +416,9 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V cmd->set_subgroup_size_log2(true, 5, 5, VK_SHADER_STAGE_COMPUTE_BIT); } - auto command_words = use_meshlets ? 0 : (sizeof(VkDrawIndexedIndirectCommand) / sizeof(uint32_t)); + auto command_words = (use_meshlets ? + sizeof(Vulkan::Meshlet::RuntimeHeaderDecoded) : + sizeof(VkDrawIndexedIndirectCommand)) / sizeof(uint32_t); cmd->set_program("assets://shaders/meshlet_cull.comp", {{"MESHLET_PAYLOAD_WAVE32", int(supports_wave32)}, @@ -424,8 +426,7 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V cmd->set_storage_buffer(0, 0, *aabb_buffer); cmd->set_storage_buffer(0, 1, *cached_transform_buffer); cmd->set_storage_buffer(0, 2, *task_buffer); - if (!use_meshlets) - cmd->set_storage_buffer(0, 3, *indirect); + cmd->set_storage_buffer(0, 3, *indirect); cmd->set_storage_buffer(0, 4, *indirect_draws); cmd->set_storage_buffer(0, 5, *compacted_params); cmd->set_storage_buffer(0, 6, *manager.get_cluster_bounds_buffer()); @@ -444,12 +445,12 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V VK_ACCESS_INDIRECT_COMMAND_READ_BIT | VK_ACCESS_2_SHADER_STORAGE_READ_BIT); } + auto *ibo = manager.get_index_buffer(); + auto *pos = manager.get_position_buffer(); + auto *attr = manager.get_attribute_buffer(); + if (use_meshlets) { - auto *header_buffer = manager.get_meshlet_header_buffer(); - auto *stream_header_buffer = manager.get_meshlet_stream_header_buffer(); - auto *payload_buffer = manager.get_meshlet_payload_buffer(); - cmd->begin_render_pass(device.get_swapchain_render_pass(SwapchainRenderPass::Depth)); camera.set_aspect(cmd->get_viewport().width / cmd->get_viewport().height); render_context.set_camera(camera); @@ -466,79 +467,21 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V 0.5f * cmd->get_viewport().width, 0.5f * cmd->get_viewport().height) - vec4(1.0f, 1.0f, 0.0f, 0.0f); - bool large_workgroup = true; - //device.get_device_features().mesh_shader_properties.maxPreferredMeshWorkGroupInvocations > 32; - - bool supports_wave32 = device.supports_subgroup_size_log2(true, 5, 5, VK_SHADER_STAGE_MESH_BIT_EXT) && - device.supports_subgroup_size_log2(true, 5, 5, VK_SHADER_STAGE_TASK_BIT_EXT); - - if (supports_wave32) - { - cmd->enable_subgroup_size_control(true); - cmd->set_subgroup_size_log2(true, 5, 5, VK_SHADER_STAGE_TASK_BIT_EXT); - cmd->set_subgroup_size_log2(true, 5, 5, VK_SHADER_STAGE_MESH_BIT_EXT); - } - else - { - cmd->enable_subgroup_size_control(true); - cmd->set_subgroup_size_log2(true, 5, 7, VK_SHADER_STAGE_MESH_BIT_EXT); - } - - cmd->set_program(use_preculling ? "" : "assets://shaders/meshlet_debug.task", - "assets://shaders/meshlet_debug.mesh", - "assets://shaders/meshlet_debug.mesh.frag", - {{"MESHLET_PAYLOAD_LARGE_WORKGROUP", int(large_workgroup)}, - {"MESHLET_PAYLOAD_WAVE32", int(supports_wave32)}, - {"MESHLET_RENDER_TASK", int(!use_preculling)}}); - - cmd->set_storage_buffer(0, 0, *aabb_buffer); - cmd->set_storage_buffer(0, 1, *cached_transform_buffer); - cmd->set_storage_buffer(0, 2, *task_buffer); - cmd->set_storage_buffer(0, 3, *header_buffer); - cmd->set_storage_buffer(0, 4, *stream_header_buffer); - cmd->set_storage_buffer(0, 5, *payload_buffer); - - cmd->set_sampler(0, 6, StockSampler::DefaultGeometryFilterWrap); - - cmd->set_storage_buffer(0, 7, *manager.get_cluster_bounds_buffer()); - memcpy(cmd->allocate_typed_constant_data(0, 8, 6), - render_context.get_visibility_frustum().get_planes(), - 6 * sizeof(vec4)); - - cmd->set_storage_buffer(0, 9, *readback_counter); - cmd->set_storage_buffer(0, 10, *compacted_params); + cmd->set_program("", "assets://shaders/meshlet_debug_plain.mesh", + "assets://shaders/meshlet_debug.mesh.frag"); + cmd->set_storage_buffer(0, 0, *ibo); + cmd->set_storage_buffer(0, 1, *pos); + cmd->set_storage_buffer(0, 2, *attr); + cmd->set_storage_buffer(0, 3, *indirect_draws); + cmd->set_storage_buffer(0, 4, *compacted_params); + cmd->set_storage_buffer(0, 5, *cached_transform_buffer); GRANITE_MATERIAL_MANAGER()->set_bindless(*cmd, 2); - - cmd->set_specialization_constant_mask(1); - cmd->set_specialization_constant(0, style_to_u32_streams(MeshStyle::Wireframe)); - - if (use_preculling) - { - cmd->draw_mesh_tasks_indirect(*indirect_draws, 0, 1, sizeof(VkDrawMeshTasksIndirectCommandEXT)); - } - else - { - uint32_t count = task_params.size(); - push.count = count; - cmd->push_constants(&push, 0, sizeof(push)); - cmd->draw_mesh_tasks((count + 31) / 32, 1, 1); - } - + cmd->draw_mesh_tasks_indirect(*indirect_draws, 0, 1, sizeof(VkDrawMeshTasksIndirectCommandEXT)); cmd->end_render_pass(); - - cmd->barrier(VK_PIPELINE_STAGE_TASK_SHADER_BIT_EXT, VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT, - VK_PIPELINE_STAGE_2_COPY_BIT, VK_ACCESS_TRANSFER_READ_BIT); - cmd->copy_buffer(*readback, 0, *readback_counter, 0, sizeof(uint32_t)); - cmd->barrier(VK_PIPELINE_STAGE_2_COPY_BIT, VK_ACCESS_TRANSFER_WRITE_BIT, - VK_PIPELINE_STAGE_HOST_BIT, VK_ACCESS_HOST_READ_BIT); } else { - auto *ibo = manager.get_index_buffer(); - auto *pos = manager.get_position_buffer(); - auto *attr = manager.get_attribute_buffer(); - cmd->begin_render_pass(device.get_swapchain_render_pass(SwapchainRenderPass::Depth)); camera.set_aspect(cmd->get_viewport().width / cmd->get_viewport().height); cmd->set_opaque_state(); diff --git a/vulkan/managers/resource_manager.cpp b/vulkan/managers/resource_manager.cpp index ac3996dc..374ae252 100644 --- a/vulkan/managers/resource_manager.cpp +++ b/vulkan/managers/resource_manager.cpp @@ -39,9 +39,9 @@ ResourceManager::ResourceManager(Device *device_) , index_buffer_allocator(*device_, 256, 17) , attribute_buffer_allocator(*device_, 256, 17) , indirect_buffer_allocator(*device_, 32, 15) - , mesh_header_allocator(*device_, 32, 15) - , mesh_stream_allocator(*device_, 8, 17) - , mesh_payload_allocator(*device_, 128, 17) + //, mesh_header_allocator(*device_, 32, 15) + //, mesh_stream_allocator(*device_, 8, 17) + //, mesh_payload_allocator(*device_, 128, 17) { // Simplified style. index_buffer_allocator.set_element_size(0, 3); // 8-bit indices. @@ -51,9 +51,9 @@ ResourceManager::ResourceManager(Device *device_) attribute_buffer_allocator.set_element_size(2, sizeof(uint32_t) * 2); indirect_buffer_allocator.set_element_size(0, sizeof(VkDrawIndexedIndirectCommand)); - mesh_header_allocator.set_element_size(0, sizeof(Meshlet::RuntimeHeader)); - mesh_stream_allocator.set_element_size(0, sizeof(Meshlet::Stream)); - mesh_payload_allocator.set_element_size(0, sizeof(uint32_t)); + //mesh_header_allocator.set_element_size(0, sizeof(Meshlet::RuntimeHeader)); + //mesh_stream_allocator.set_element_size(0, sizeof(Meshlet::Stream)); + //mesh_payload_allocator.set_element_size(0, sizeof(uint32_t)); assets.reserve(Granite::AssetID::MaxIDs); } @@ -180,27 +180,18 @@ void ResourceManager::init() { mesh_encoding = MeshEncoding::Meshlet; LOGI("Opting in to meshlet path.\n"); + indirect_buffer_allocator.set_element_size(0, sizeof(Meshlet::RuntimeHeaderDecoded)); + } - mesh_header_allocator.set_soa_count(2); - mesh_header_allocator.set_element_size(1, sizeof(Meshlet::Bound)); + indirect_buffer_allocator.set_soa_count(2); + indirect_buffer_allocator.set_element_size(1, sizeof(Meshlet::Bound)); - opaque.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; - mesh_header_allocator.prime(&opaque); - mesh_stream_allocator.prime(&opaque); - mesh_payload_allocator.prime(&opaque); - } - else - { - indirect_buffer_allocator.set_soa_count(2); - indirect_buffer_allocator.set_element_size(1, sizeof(Meshlet::Bound)); - - opaque.usage = VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; - index_buffer_allocator.prime(&opaque); - opaque.usage = VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; - attribute_buffer_allocator.prime(&opaque); - opaque.usage = VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; - indirect_buffer_allocator.prime(&opaque); - } + opaque.usage = VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; + index_buffer_allocator.prime(&opaque); + opaque.usage = VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; + attribute_buffer_allocator.prime(&opaque); + opaque.usage = VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; + indirect_buffer_allocator.prime(&opaque); } ImageHandle ResourceManager::create_gtx(const MemoryMappedTexture &mapped_file, Granite::AssetID id) @@ -360,7 +351,7 @@ bool ResourceManager::allocate_asset_mesh(Granite::AssetID id, const Meshlet::Me bool ret = true; - if (mesh_encoding == MeshEncoding::VBOAndIBOMDI) + //if (mesh_encoding == MeshEncoding::VBOAndIBOMDI) { if (ret) ret = index_buffer_allocator.allocate(view.total_primitives, &asset.mesh.index_or_payload); @@ -369,6 +360,7 @@ bool ResourceManager::allocate_asset_mesh(Granite::AssetID id, const Meshlet::Me if (ret) ret = indirect_buffer_allocator.allocate(view.format_header->meshlet_count, &asset.mesh.indirect_or_header); } +#if 0 else { if (ret) @@ -384,6 +376,7 @@ bool ResourceManager::allocate_asset_mesh(Granite::AssetID id, const Meshlet::Me if (ret) ret = mesh_payload_allocator.allocate(view.format_header->payload_size_words, &asset.mesh.index_or_payload); } +#endif asset.mesh.draw = { asset.mesh.indirect_or_header.offset, @@ -392,18 +385,20 @@ bool ResourceManager::allocate_asset_mesh(Granite::AssetID id, const Meshlet::Me if (!ret) { - if (mesh_encoding == MeshEncoding::VBOAndIBOMDI) + //if (mesh_encoding == MeshEncoding::VBOAndIBOMDI) { index_buffer_allocator.free(asset.mesh.index_or_payload); attribute_buffer_allocator.free(asset.mesh.attr_or_stream); indirect_buffer_allocator.free(asset.mesh.indirect_or_header); } +#if 0 else { mesh_payload_allocator.free(asset.mesh.index_or_payload); mesh_stream_allocator.free(asset.mesh.attr_or_stream); mesh_header_allocator.free(asset.mesh.indirect_or_header); } +#endif asset.mesh = {}; } @@ -434,6 +429,7 @@ void ResourceManager::instantiate_asset_mesh(Granite::AssetManager &manager_, if (ret) { +#if 0 if (mesh_encoding == MeshEncoding::Meshlet) { auto cmd = device->request_command_buffer(CommandBuffer::Type::AsyncTransfer); @@ -484,6 +480,7 @@ void ResourceManager::instantiate_asset_mesh(Granite::AssetManager &manager_, VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, false); } else +#endif { auto cmd = device->request_command_buffer(CommandBuffer::Type::AsyncCompute); @@ -507,6 +504,9 @@ void ResourceManager::instantiate_asset_mesh(Granite::AssetManager &manager_, info.push.primitive_offset = asset.mesh.index_or_payload.offset; info.push.vertex_offset = asset.mesh.attr_or_stream.offset; + info.runtime_style = mesh_encoding == MeshEncoding::Meshlet ? + Meshlet::RuntimeStyle::Meshlet : Meshlet::RuntimeStyle::MDI; + auto *bounds = static_cast( cmd->update_buffer(*indirect_buffer_allocator.get_buffer(0, 1), asset.mesh.indirect_or_header.offset * sizeof(Meshlet::Bound), @@ -529,6 +529,7 @@ void ResourceManager::instantiate_asset_mesh(Granite::AssetManager &manager_, uint64_t cost = 0; if (ret) { +#if 0 if (mesh_encoding == MeshEncoding::Meshlet) { cost += view.format_header->payload_size_words * mesh_payload_allocator.get_element_size(0); @@ -537,6 +538,7 @@ void ResourceManager::instantiate_asset_mesh(Granite::AssetManager &manager_, cost += view.format_header->meshlet_count * view.format_header->u32_stream_count * mesh_stream_allocator.get_element_size(0); } else +#endif { cost += view.total_primitives * index_buffer_allocator.get_element_size(0); cost += view.total_vertices * attribute_buffer_allocator.get_element_size(0); @@ -624,6 +626,7 @@ void ResourceManager::latch_handles() { { std::lock_guard holder_alloc{mesh_allocator_lock}; +#if 0 if (mesh_encoding == MeshEncoding::Meshlet) { mesh_payload_allocator.free(asset.mesh.index_or_payload); @@ -631,6 +634,7 @@ void ResourceManager::latch_handles() mesh_header_allocator.free(asset.mesh.indirect_or_header); } else +#endif { index_buffer_allocator.free(asset.mesh.index_or_payload); attribute_buffer_allocator.free(asset.mesh.attr_or_stream); @@ -689,6 +693,7 @@ const Buffer *ResourceManager::get_indirect_buffer() const return indirect_buffer_allocator.get_buffer(0, 0); } +#if 0 const Buffer *ResourceManager::get_meshlet_payload_buffer() const { return mesh_payload_allocator.get_buffer(0, 0); @@ -703,12 +708,13 @@ const Buffer *ResourceManager::get_meshlet_stream_header_buffer() const { return mesh_stream_allocator.get_buffer(0, 0); } +#endif const Buffer *ResourceManager::get_cluster_bounds_buffer() const { - if (mesh_encoding == MeshEncoding::Meshlet) - return mesh_header_allocator.get_buffer(0, 1); - else + //if (mesh_encoding == MeshEncoding::Meshlet) + // return mesh_header_allocator.get_buffer(0, 1); + //else return indirect_buffer_allocator.get_buffer(0, 1); } diff --git a/vulkan/managers/resource_manager.hpp b/vulkan/managers/resource_manager.hpp index 1d3756d4..f246d287 100644 --- a/vulkan/managers/resource_manager.hpp +++ b/vulkan/managers/resource_manager.hpp @@ -125,9 +125,9 @@ class ResourceManager final : private Granite::AssetInstantiatorInterface const Buffer *get_skinning_buffer() const; const Buffer *get_indirect_buffer() const; - const Buffer *get_meshlet_payload_buffer() const; - const Buffer *get_meshlet_header_buffer() const; - const Buffer *get_meshlet_stream_header_buffer() const; + //const Buffer *get_meshlet_payload_buffer() const; + //const Buffer *get_meshlet_header_buffer() const; + //const Buffer *get_meshlet_stream_header_buffer() const; const Buffer *get_cluster_bounds_buffer() const; @@ -181,9 +181,9 @@ class ResourceManager final : private Granite::AssetInstantiatorInterface MeshBufferAllocator index_buffer_allocator; MeshBufferAllocator attribute_buffer_allocator; MeshBufferAllocator indirect_buffer_allocator; - MeshBufferAllocator mesh_header_allocator; - MeshBufferAllocator mesh_stream_allocator; - MeshBufferAllocator mesh_payload_allocator; + //MeshBufferAllocator mesh_header_allocator; + //MeshBufferAllocator mesh_stream_allocator; + //MeshBufferAllocator mesh_payload_allocator; MeshEncoding mesh_encoding = MeshEncoding::VBOAndIBOMDI; diff --git a/vulkan/mesh/meshlet.cpp b/vulkan/mesh/meshlet.cpp index 546af8d0..aa688d32 100644 --- a/vulkan/mesh/meshlet.cpp +++ b/vulkan/mesh/meshlet.cpp @@ -87,10 +87,9 @@ MeshView create_mesh_view(const Granite::FileMapping &mapping) bool decode_mesh(CommandBuffer &cmd, const DecodeInfo &info, const MeshView &view) { - // TODO: Implement LDS fallback. - if (!cmd.get_device().supports_subgroup_size_log2(true, 5, 5)) + if (!cmd.get_device().supports_subgroup_size_log2(true, 5, 7)) { - LOGE("Device does not support Wave32.\n"); + LOGE("Device does not support subgroup paths.\n"); return false; } @@ -123,15 +122,17 @@ bool decode_mesh(CommandBuffer &cmd, const DecodeInfo &info, const MeshView &vie struct DecodeOffset { uint32_t arg0, arg1; }; std::vector decode_offsets; - bool supports_subgroup_path = cmd.get_device().supports_subgroup_size_log2(true, 5, 5, VK_SHADER_STAGE_COMPUTE_BIT); + bool supports_wave32 = cmd.get_device().supports_subgroup_size_log2(true, 5, 5, VK_SHADER_STAGE_COMPUTE_BIT); + bool meshlet_runtime = (info.flags & DECODE_MODE_RAW_PAYLOAD) == 0 && info.runtime_style == RuntimeStyle::Meshlet; cmd.set_program("builtin://shaders/decode/meshlet_decode.comp", - {{"MESHLET_PAYLOAD_SUBGROUP", int(supports_subgroup_path) }}); + {{"MESHLET_PAYLOAD_WAVE32", int(supports_wave32) }, + {"MESHLET_PAYLOAD_RUNTIME_MESH", int(meshlet_runtime)}}); - if (supports_subgroup_path) - { - cmd.enable_subgroup_size_control(true); - cmd.set_subgroup_size_log2(true, 5, 5); - } + cmd.enable_subgroup_size_control(true); + if (supports_wave32) + cmd.set_subgroup_size_log2(true, 5, 5, VK_SHADER_STAGE_COMPUTE_BIT); + else + cmd.set_subgroup_size_log2(true, 5, 7, VK_SHADER_STAGE_COMPUTE_BIT); cmd.set_storage_buffer(0, 0, *meshlet_meta_buffer); cmd.set_storage_buffer(0, 1, *meshlet_stream_buffer); diff --git a/vulkan/mesh/meshlet.hpp b/vulkan/mesh/meshlet.hpp index 1319d002..f15c0140 100644 --- a/vulkan/mesh/meshlet.hpp +++ b/vulkan/mesh/meshlet.hpp @@ -68,6 +68,14 @@ struct RuntimeHeader uint16_t num_attributes; }; +struct RuntimeHeaderDecoded +{ + uint32_t primitive_offset; + uint32_t vertex_offset; + uint32_t num_primitives; + uint32_t num_attributes; +}; + struct Bound { float center[3]; @@ -122,11 +130,18 @@ enum DecodeModeFlagBits : uint32_t }; using DecodeModeFlags = uint32_t; +enum class RuntimeStyle +{ + MDI, + Meshlet +}; + struct DecodeInfo { const Vulkan::Buffer *ibo, *streams[3], *indirect, *payload; DecodeModeFlags flags; MeshStyle target_style; + RuntimeStyle runtime_style; struct { From 94a3f63bf948ab58587252e818038d7ed6daf4be Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Sun, 17 Dec 2023 15:40:34 +0100 Subject: [PATCH 14/59] Experiment with small task shaders. --- tests/assets/shaders/meshlet_debug_plain.mesh | 18 ++++-- tests/assets/shaders/meshlet_debug_plain.task | 63 +++++++++++++++++++ tests/meshlet_viewer.cpp | 28 ++++++--- 3 files changed, 94 insertions(+), 15 deletions(-) create mode 100644 tests/assets/shaders/meshlet_debug_plain.task diff --git a/tests/assets/shaders/meshlet_debug_plain.mesh b/tests/assets/shaders/meshlet_debug_plain.mesh index 60b2083f..9e064aa4 100644 --- a/tests/assets/shaders/meshlet_debug_plain.mesh +++ b/tests/assets/shaders/meshlet_debug_plain.mesh @@ -32,17 +32,22 @@ layout(set = 0, binding = 1, scalar) readonly buffer VBOPOS vec3 data[]; } pos; +// binding = 2 for attributes + layout(set = 0, binding = 3, std430) readonly buffer IndirectCommands { - layout(offset = 256) IndirectDrawMesh draws[]; + IndirectDrawMesh draws[]; } indirect_commands_mesh; -layout(set = 0, binding = 4, std430) readonly buffer CompactedDraws +struct TaskPayload { - CompactedDrawInfo mesh_payload[]; + CompactedDrawInfo info; + uint8_t indices[32]; }; -layout(set = 0, binding = 5, std430) readonly buffer Transforms +taskPayloadSharedEXT TaskPayload mesh_payload; + +layout(set = 0, binding = 4, std430) readonly buffer Transforms { mat4 data[]; } transforms; @@ -131,8 +136,9 @@ void main() } #endif - IndirectDrawMesh meshlet = indirect_commands_mesh.draws[gl_WorkGroupID.x]; - CompactedDrawInfo task = mesh_payload[gl_WorkGroupID.x]; + CompactedDrawInfo task = mesh_payload.info; + task.meshlet_index += mesh_payload.indices[gl_WorkGroupID.x]; + IndirectDrawMesh meshlet = indirect_commands_mesh.draws[task.meshlet_index]; uint linear_index = gl_LocalInvocationIndex; mat4 M = transforms.data[task.node_offset]; diff --git a/tests/assets/shaders/meshlet_debug_plain.task b/tests/assets/shaders/meshlet_debug_plain.task new file mode 100644 index 00000000..c1c61867 --- /dev/null +++ b/tests/assets/shaders/meshlet_debug_plain.task @@ -0,0 +1,63 @@ +#version 450 +#extension GL_EXT_mesh_shader : require +#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require +#extension GL_KHR_shader_subgroup_arithmetic : require +#extension GL_KHR_shader_subgroup_ballot : require + +layout(local_size_x = 32) in; + +#define MESHLET_RENDER_DESCRIPTOR_SET 0 +#define MESHLET_RENDER_TRANSFORM_BINDING 4 +#define MESHLET_RENDER_AABB_BINDING 5 +#define MESHLET_RENDER_TASKS_BINDING 6 +#define MESHLET_RENDER_BOUND_BINDING 7 +#define MESHLET_RENDER_FRUSTUM_BINDING 8 +#include "meshlet_render.h" + +layout(push_constant, std430) uniform Registers +{ + vec3 camera_pos; + uint offset; +} registers; + +struct TaskPayload +{ + CompactedDrawInfo info; + uint8_t indices[32]; +}; + +taskPayloadSharedEXT TaskPayload mesh_payload; + +void main() +{ + uint task_index = gl_WorkGroupID.x + registers.offset; + TaskInfo task = task_info.data[task_index]; + + // Precull the group. + uint offset = task.mesh_index_count & ~31u; + uint count = bitfieldExtract(task.mesh_index_count, 0, 5) + 1; + uint meshlet_index = offset + gl_SubgroupInvocationID; + + if (gl_SubgroupInvocationID == 0) + { + mesh_payload.info = + CompactedDrawInfo(offset, task.node_instance, task.node_count_material_index); + } + + bool alloc_draw = false; + if (gl_SubgroupInvocationID < count) + { + mat4 M = transforms.data[task.node_instance]; + Bound b = bounds.data[meshlet_index]; + alloc_draw = cluster_cull(M, b, registers.camera_pos); + } + + uvec4 ballot = subgroupBallot(alloc_draw); + uint draw_count = subgroupBallotBitCount(ballot); + uint local_offset = subgroupBallotExclusiveBitCount(ballot); + + if (alloc_draw) + mesh_payload.indices[local_offset] = uint8_t(gl_SubgroupInvocationID); + + EmitMeshTasksEXT(draw_count, 1, 1); +} diff --git a/tests/meshlet_viewer.cpp b/tests/meshlet_viewer.cpp index 19acdc80..8fc6a223 100644 --- a/tests/meshlet_viewer.cpp +++ b/tests/meshlet_viewer.cpp @@ -372,7 +372,7 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V push.camera_pos = render_context.get_render_parameters().camera_position; const bool use_meshlets = manager.get_mesh_encoding() == Vulkan::ResourceManager::MeshEncoding::Meshlet; - const bool use_preculling = !use_meshlets || true; + const bool use_preculling = !use_meshlets || false; if (use_preculling) { @@ -457,8 +457,6 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V cmd->set_opaque_state(); *cmd->allocate_typed_constant_data(1, 0, 1) = render_context.get_render_parameters().view_projection; - memcpy(cmd->allocate_typed_constant_data(1, 1, 6), render_context.get_visibility_frustum().get_planes(), - 6 * sizeof(vec4)); *cmd->allocate_typed_constant_data(1, 2, 1) = float(1 << 8 /* shader assumes 8 */) * @@ -467,17 +465,29 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V 0.5f * cmd->get_viewport().width, 0.5f * cmd->get_viewport().height) - vec4(1.0f, 1.0f, 0.0f, 0.0f); - cmd->set_program("", "assets://shaders/meshlet_debug_plain.mesh", + cmd->set_program("assets://shaders/meshlet_debug_plain.task", "assets://shaders/meshlet_debug_plain.mesh", "assets://shaders/meshlet_debug.mesh.frag"); cmd->set_storage_buffer(0, 0, *ibo); cmd->set_storage_buffer(0, 1, *pos); - cmd->set_storage_buffer(0, 2, *attr); - cmd->set_storage_buffer(0, 3, *indirect_draws); - cmd->set_storage_buffer(0, 4, *compacted_params); - cmd->set_storage_buffer(0, 5, *cached_transform_buffer); + //cmd->set_storage_buffer(0, 2, *attr); + cmd->set_storage_buffer(0, 3, *manager.get_indirect_buffer()); + cmd->set_storage_buffer(0, 4, *cached_transform_buffer); + cmd->set_storage_buffer(0, 5, *aabb_buffer); + cmd->set_storage_buffer(0, 6, *task_buffer); + cmd->set_storage_buffer(0, 7, *manager.get_cluster_bounds_buffer()); + + memcpy(cmd->allocate_typed_constant_data(0, 8, 6), render_context.get_visibility_frustum().get_planes(), + 6 * sizeof(vec4)); + GRANITE_MATERIAL_MANAGER()->set_bindless(*cmd, 2); - cmd->draw_mesh_tasks_indirect(*indirect_draws, 0, 1, sizeof(VkDrawMeshTasksIndirectCommandEXT)); + for (uint32_t i = 0, n = task_params.size(); i < n; i += device.get_device_features().mesh_shader_properties.maxTaskWorkGroupCount[0]) + { + push.count = i; + uint32_t to_draw = std::min(n - i, device.get_device_features().mesh_shader_properties.maxTaskWorkGroupCount[0]); + cmd->push_constants(&push, 0, sizeof(push)); + cmd->draw_mesh_tasks(to_draw, 1, 1); + } cmd->end_render_pass(); } else From 9c1ae96dad31e1566412b12b1d3ce50c74c0021f Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Sun, 17 Dec 2023 15:40:42 +0100 Subject: [PATCH 15/59] Revert "Experiment with small task shaders." This reverts commit 94a3f63bf948ab58587252e818038d7ed6daf4be. --- tests/assets/shaders/meshlet_debug_plain.mesh | 18 ++---- tests/assets/shaders/meshlet_debug_plain.task | 63 ------------------- tests/meshlet_viewer.cpp | 28 +++------ 3 files changed, 15 insertions(+), 94 deletions(-) delete mode 100644 tests/assets/shaders/meshlet_debug_plain.task diff --git a/tests/assets/shaders/meshlet_debug_plain.mesh b/tests/assets/shaders/meshlet_debug_plain.mesh index 9e064aa4..60b2083f 100644 --- a/tests/assets/shaders/meshlet_debug_plain.mesh +++ b/tests/assets/shaders/meshlet_debug_plain.mesh @@ -32,22 +32,17 @@ layout(set = 0, binding = 1, scalar) readonly buffer VBOPOS vec3 data[]; } pos; -// binding = 2 for attributes - layout(set = 0, binding = 3, std430) readonly buffer IndirectCommands { - IndirectDrawMesh draws[]; + layout(offset = 256) IndirectDrawMesh draws[]; } indirect_commands_mesh; -struct TaskPayload +layout(set = 0, binding = 4, std430) readonly buffer CompactedDraws { - CompactedDrawInfo info; - uint8_t indices[32]; + CompactedDrawInfo mesh_payload[]; }; -taskPayloadSharedEXT TaskPayload mesh_payload; - -layout(set = 0, binding = 4, std430) readonly buffer Transforms +layout(set = 0, binding = 5, std430) readonly buffer Transforms { mat4 data[]; } transforms; @@ -136,9 +131,8 @@ void main() } #endif - CompactedDrawInfo task = mesh_payload.info; - task.meshlet_index += mesh_payload.indices[gl_WorkGroupID.x]; - IndirectDrawMesh meshlet = indirect_commands_mesh.draws[task.meshlet_index]; + IndirectDrawMesh meshlet = indirect_commands_mesh.draws[gl_WorkGroupID.x]; + CompactedDrawInfo task = mesh_payload[gl_WorkGroupID.x]; uint linear_index = gl_LocalInvocationIndex; mat4 M = transforms.data[task.node_offset]; diff --git a/tests/assets/shaders/meshlet_debug_plain.task b/tests/assets/shaders/meshlet_debug_plain.task deleted file mode 100644 index c1c61867..00000000 --- a/tests/assets/shaders/meshlet_debug_plain.task +++ /dev/null @@ -1,63 +0,0 @@ -#version 450 -#extension GL_EXT_mesh_shader : require -#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require -#extension GL_KHR_shader_subgroup_arithmetic : require -#extension GL_KHR_shader_subgroup_ballot : require - -layout(local_size_x = 32) in; - -#define MESHLET_RENDER_DESCRIPTOR_SET 0 -#define MESHLET_RENDER_TRANSFORM_BINDING 4 -#define MESHLET_RENDER_AABB_BINDING 5 -#define MESHLET_RENDER_TASKS_BINDING 6 -#define MESHLET_RENDER_BOUND_BINDING 7 -#define MESHLET_RENDER_FRUSTUM_BINDING 8 -#include "meshlet_render.h" - -layout(push_constant, std430) uniform Registers -{ - vec3 camera_pos; - uint offset; -} registers; - -struct TaskPayload -{ - CompactedDrawInfo info; - uint8_t indices[32]; -}; - -taskPayloadSharedEXT TaskPayload mesh_payload; - -void main() -{ - uint task_index = gl_WorkGroupID.x + registers.offset; - TaskInfo task = task_info.data[task_index]; - - // Precull the group. - uint offset = task.mesh_index_count & ~31u; - uint count = bitfieldExtract(task.mesh_index_count, 0, 5) + 1; - uint meshlet_index = offset + gl_SubgroupInvocationID; - - if (gl_SubgroupInvocationID == 0) - { - mesh_payload.info = - CompactedDrawInfo(offset, task.node_instance, task.node_count_material_index); - } - - bool alloc_draw = false; - if (gl_SubgroupInvocationID < count) - { - mat4 M = transforms.data[task.node_instance]; - Bound b = bounds.data[meshlet_index]; - alloc_draw = cluster_cull(M, b, registers.camera_pos); - } - - uvec4 ballot = subgroupBallot(alloc_draw); - uint draw_count = subgroupBallotBitCount(ballot); - uint local_offset = subgroupBallotExclusiveBitCount(ballot); - - if (alloc_draw) - mesh_payload.indices[local_offset] = uint8_t(gl_SubgroupInvocationID); - - EmitMeshTasksEXT(draw_count, 1, 1); -} diff --git a/tests/meshlet_viewer.cpp b/tests/meshlet_viewer.cpp index 8fc6a223..19acdc80 100644 --- a/tests/meshlet_viewer.cpp +++ b/tests/meshlet_viewer.cpp @@ -372,7 +372,7 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V push.camera_pos = render_context.get_render_parameters().camera_position; const bool use_meshlets = manager.get_mesh_encoding() == Vulkan::ResourceManager::MeshEncoding::Meshlet; - const bool use_preculling = !use_meshlets || false; + const bool use_preculling = !use_meshlets || true; if (use_preculling) { @@ -457,6 +457,8 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V cmd->set_opaque_state(); *cmd->allocate_typed_constant_data(1, 0, 1) = render_context.get_render_parameters().view_projection; + memcpy(cmd->allocate_typed_constant_data(1, 1, 6), render_context.get_visibility_frustum().get_planes(), + 6 * sizeof(vec4)); *cmd->allocate_typed_constant_data(1, 2, 1) = float(1 << 8 /* shader assumes 8 */) * @@ -465,29 +467,17 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V 0.5f * cmd->get_viewport().width, 0.5f * cmd->get_viewport().height) - vec4(1.0f, 1.0f, 0.0f, 0.0f); - cmd->set_program("assets://shaders/meshlet_debug_plain.task", "assets://shaders/meshlet_debug_plain.mesh", + cmd->set_program("", "assets://shaders/meshlet_debug_plain.mesh", "assets://shaders/meshlet_debug.mesh.frag"); cmd->set_storage_buffer(0, 0, *ibo); cmd->set_storage_buffer(0, 1, *pos); - //cmd->set_storage_buffer(0, 2, *attr); - cmd->set_storage_buffer(0, 3, *manager.get_indirect_buffer()); - cmd->set_storage_buffer(0, 4, *cached_transform_buffer); - cmd->set_storage_buffer(0, 5, *aabb_buffer); - cmd->set_storage_buffer(0, 6, *task_buffer); - cmd->set_storage_buffer(0, 7, *manager.get_cluster_bounds_buffer()); - - memcpy(cmd->allocate_typed_constant_data(0, 8, 6), render_context.get_visibility_frustum().get_planes(), - 6 * sizeof(vec4)); - + cmd->set_storage_buffer(0, 2, *attr); + cmd->set_storage_buffer(0, 3, *indirect_draws); + cmd->set_storage_buffer(0, 4, *compacted_params); + cmd->set_storage_buffer(0, 5, *cached_transform_buffer); GRANITE_MATERIAL_MANAGER()->set_bindless(*cmd, 2); - for (uint32_t i = 0, n = task_params.size(); i < n; i += device.get_device_features().mesh_shader_properties.maxTaskWorkGroupCount[0]) - { - push.count = i; - uint32_t to_draw = std::min(n - i, device.get_device_features().mesh_shader_properties.maxTaskWorkGroupCount[0]); - cmd->push_constants(&push, 0, sizeof(push)); - cmd->draw_mesh_tasks(to_draw, 1, 1); - } + cmd->draw_mesh_tasks_indirect(*indirect_draws, 0, 1, sizeof(VkDrawMeshTasksIndirectCommandEXT)); cmd->end_render_pass(); } else From 7a4df928e0f2b7b1b38ab03772a188ad6fda953a Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Mon, 18 Dec 2023 11:07:11 +0100 Subject: [PATCH 16/59] Get rid of unnecessary reserved field. --- assets/shaders/inc/meshlet_payload_decode.h | 5 ++--- scene-export/meshlet_export.cpp | 1 - vulkan/mesh/meshlet.hpp | 5 ++--- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/assets/shaders/inc/meshlet_payload_decode.h b/assets/shaders/inc/meshlet_payload_decode.h index b14e2575..aa28b269 100644 --- a/assets/shaders/inc/meshlet_payload_decode.h +++ b/assets/shaders/inc/meshlet_payload_decode.h @@ -48,9 +48,8 @@ struct MeshletStream struct MeshletMetaRaw { uint base_vertex_offset; - uint8_t num_primitives; - uint8_t num_attributes; - uint16_t reserved; + uint16_t num_primitives; + uint16_t num_attributes; }; struct MeshletMetaRuntime diff --git a/scene-export/meshlet_export.cpp b/scene-export/meshlet_export.cpp index bd4f08a6..c38a4270 100644 --- a/scene-export/meshlet_export.cpp +++ b/scene-export/meshlet_export.cpp @@ -473,7 +473,6 @@ static void encode_mesh(Encoded &encoded, meshlet.base_vertex_offset = base_vertex_offset; meshlet.num_primitives = analysis_result.num_primitives; meshlet.num_attributes = analysis_result.num_vertices; - meshlet.reserved = 0; // Encode index buffer. for (uint32_t i = 0; i < analysis_result.num_primitives; i++) diff --git a/vulkan/mesh/meshlet.hpp b/vulkan/mesh/meshlet.hpp index f15c0140..ea727de2 100644 --- a/vulkan/mesh/meshlet.hpp +++ b/vulkan/mesh/meshlet.hpp @@ -55,9 +55,8 @@ struct Stream struct Header { uint32_t base_vertex_offset; - uint8_t num_primitives; - uint8_t num_attributes; - uint16_t reserved; + uint16_t num_primitives; + uint16_t num_attributes; }; // For GPU use From 4e4bec3d34070084b4020190f3408c4bc004e0ba Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Mon, 18 Dec 2023 11:08:33 +0100 Subject: [PATCH 17/59] Refactor meshlet_compute_stream_counts. --- assets/shaders/inc/meshlet_payload_decode.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/assets/shaders/inc/meshlet_payload_decode.h b/assets/shaders/inc/meshlet_payload_decode.h index aa28b269..f7367bfd 100644 --- a/assets/shaders/inc/meshlet_payload_decode.h +++ b/assets/shaders/inc/meshlet_payload_decode.h @@ -155,9 +155,9 @@ uvec4 meshlet_decode_bit_counts(uint bitplane_value) return out_bit_counts; } -void meshlet_compute_stream_counts(uint bitplane_value, out uint out_total_bits, out uvec4 out_bit_counts) +void meshlet_compute_stream_counts(uint bitplane_value, out uint out_total_bits) { - out_bit_counts = meshlet_decode_bit_counts(bitplane_value); + uvec4 out_bit_counts = meshlet_decode_bit_counts(bitplane_value); uvec2 bit_counts2 = out_bit_counts.xy + out_bit_counts.zw; out_total_bits = bit_counts2.x + bit_counts2.y; } @@ -170,10 +170,9 @@ void meshlet_init_workgroup(uint base_stream_index) uvec4 bitplane_values = uvec4(meshlet_streams.data[unrolled_stream_index].bitplane_meta); uvec3 total_bits; - uvec4 bit_counts; - meshlet_compute_stream_counts(bitplane_values.x, total_bits.x, bit_counts); - meshlet_compute_stream_counts(bitplane_values.y, total_bits.y, bit_counts); - meshlet_compute_stream_counts(bitplane_values.z, total_bits.z, bit_counts); + meshlet_compute_stream_counts(bitplane_values.x, total_bits.x); + meshlet_compute_stream_counts(bitplane_values.y, total_bits.y); + meshlet_compute_stream_counts(bitplane_values.z, total_bits.z); total_bits.y += total_bits.x; total_bits.z += total_bits.y; uint chunk_offset = meshlet_streams.data[unrolled_stream_index].offset_from_base; From a358d98cede9c142542ee431bbf7130c5784df42 Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Mon, 18 Dec 2023 11:15:37 +0100 Subject: [PATCH 18/59] Check for subgroup size in shader. --- assets/shaders/inc/meshlet_payload_decode.h | 38 +++++++++++++-------- tests/meshlet_viewer.cpp | 9 +++-- 2 files changed, 28 insertions(+), 19 deletions(-) diff --git a/assets/shaders/inc/meshlet_payload_decode.h b/assets/shaders/inc/meshlet_payload_decode.h index f7367bfd..738ad52d 100644 --- a/assets/shaders/inc/meshlet_payload_decode.h +++ b/assets/shaders/inc/meshlet_payload_decode.h @@ -227,13 +227,18 @@ uint meshlet_decode_stream_32_wg256(uint base_stream_index, uint stream_index) uint unrolled_stream_index = base_stream_index + stream_index; uint linear_index = meshlet_get_linear_index(); -#if MESHLET_PAYLOAD_WAVE32 - uint chunk_id = gl_SubgroupID; - int local_chunk_index = int(gl_SubgroupInvocationID); -#else - uint chunk_id = linear_index / 32u; - int local_chunk_index = int(linear_index & 31); -#endif + int local_chunk_index; + uint chunk_id; + if (gl_SubgroupSize == 32) + { + chunk_id = gl_SubgroupID; + local_chunk_index = int(gl_SubgroupInvocationID); + } + else + { + chunk_id = linear_index / 32u; + local_chunk_index = int(linear_index & 31); + } MESHLET_PAYLOAD_DECL_STREAM(unrolled_stream_index, 0); MESHLET_PAYLOAD_PROCESS_CHUNK(unrolled_stream_index, stream_index, chunk_id, 0); @@ -247,13 +252,18 @@ uvec2 meshlet_decode_stream_64_wg256(uint base_stream_index, uint stream_index) uint unrolled_stream_index = base_stream_index + stream_index; uint linear_index = meshlet_get_linear_index(); -#if MESHLET_PAYLOAD_WAVE32 - uint chunk_id = gl_SubgroupID; - int local_chunk_index = int(gl_SubgroupInvocationID); -#else - uint chunk_id = linear_index / 32u; - int local_chunk_index = int(linear_index & 31); -#endif + int local_chunk_index; + uint chunk_id; + if (gl_SubgroupSize == 32) + { + chunk_id = gl_SubgroupID; + local_chunk_index = int(gl_SubgroupInvocationID); + } + else + { + chunk_id = linear_index / 32u; + local_chunk_index = int(linear_index & 31); + } MESHLET_PAYLOAD_DECL_STREAM(unrolled_stream_index, 0); MESHLET_PAYLOAD_DECL_STREAM(unrolled_stream_index + 1, 1); diff --git a/tests/meshlet_viewer.cpp b/tests/meshlet_viewer.cpp index 19acdc80..6a43ed6f 100644 --- a/tests/meshlet_viewer.cpp +++ b/tests/meshlet_viewer.cpp @@ -410,19 +410,18 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V auto *indirect = manager.get_indirect_buffer(); bool supports_wave32 = device.supports_subgroup_size_log2(true, 5, 5, VK_SHADER_STAGE_COMPUTE_BIT); + cmd->enable_subgroup_size_control(true); if (supports_wave32) - { - cmd->enable_subgroup_size_control(true); cmd->set_subgroup_size_log2(true, 5, 5, VK_SHADER_STAGE_COMPUTE_BIT); - } + else + cmd->set_subgroup_size_log2(true, 5, 7, VK_SHADER_STAGE_COMPUTE_BIT); auto command_words = (use_meshlets ? sizeof(Vulkan::Meshlet::RuntimeHeaderDecoded) : sizeof(VkDrawIndexedIndirectCommand)) / sizeof(uint32_t); cmd->set_program("assets://shaders/meshlet_cull.comp", - {{"MESHLET_PAYLOAD_WAVE32", int(supports_wave32)}, - {"MESHLET_RENDER_DRAW_WORDS", int(command_words)}}); + {{"MESHLET_RENDER_DRAW_WORDS", int(command_words)}}); cmd->set_storage_buffer(0, 0, *aabb_buffer); cmd->set_storage_buffer(0, 1, *cached_transform_buffer); cmd->set_storage_buffer(0, 2, *task_buffer); From 438361bbb98e41845490baea545a5b12774f1e49 Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Mon, 18 Dec 2023 13:49:43 +0100 Subject: [PATCH 19/59] Experiment with 256 pre-decoded. --- assets/shaders/decode/meshlet_decode.comp | 2 +- .../shaders/inc/meshlet_payload_constants.h | 4 +- assets/shaders/inc/meshlet_payload_decode.h | 64 +++++++++++++++---- scene-export/meshlet_export.cpp | 5 +- tests/assets/shaders/meshlet_debug_plain.mesh | 47 +++++++------- tests/meshopt_sandbox.cpp | 4 +- vulkan/mesh/meshlet.hpp | 2 +- 7 files changed, 85 insertions(+), 43 deletions(-) diff --git a/assets/shaders/decode/meshlet_decode.comp b/assets/shaders/decode/meshlet_decode.comp index d02d89ad..166ca870 100644 --- a/assets/shaders/decode/meshlet_decode.comp +++ b/assets/shaders/decode/meshlet_decode.comp @@ -10,7 +10,7 @@ #else #define MESHLET_PAYLOAD_WG_Y 1 #endif -layout(local_size_x = 32, local_size_y = MESHLET_PAYLOAD_WG_Y) in; +layout(local_size_x = 256) in; layout(constant_id = 0) const uint NUM_U32_STREAMS = MESHLET_PAYLOAD_MAX_STREAMS; layout(constant_id = 1) const uint NUM_OUTPUT_U32_STREAMS = 1; diff --git a/assets/shaders/inc/meshlet_payload_constants.h b/assets/shaders/inc/meshlet_payload_constants.h index eb8b5644..2a91ff53 100644 --- a/assets/shaders/inc/meshlet_payload_constants.h +++ b/assets/shaders/inc/meshlet_payload_constants.h @@ -1,8 +1,8 @@ #ifndef MESHLET_PAYLOAD_CONSTANTS_H_ #define MESHLET_PAYLOAD_CONSTANTS_H_ -#define MESHLET_PAYLOAD_MAX_ELEMENTS 128 -#define MESHLET_PAYLOAD_NUM_CHUNKS 4 +#define MESHLET_PAYLOAD_MAX_ELEMENTS 256 +#define MESHLET_PAYLOAD_NUM_CHUNKS 8 #define MESHLET_PAYLOAD_MAX_STREAMS 16 #endif \ No newline at end of file diff --git a/assets/shaders/inc/meshlet_payload_decode.h b/assets/shaders/inc/meshlet_payload_decode.h index 738ad52d..0e6ee02f 100644 --- a/assets/shaders/inc/meshlet_payload_decode.h +++ b/assets/shaders/inc/meshlet_payload_decode.h @@ -42,7 +42,7 @@ struct MeshletStream u16vec4 predictor_b; u8vec4 initial_value; uint offset_from_base; - u16vec4 bitplane_meta; + uint16_t bitplane_meta[MESHLET_PAYLOAD_NUM_CHUNKS]; }; struct MeshletMetaRaw @@ -79,10 +79,7 @@ layout(set = MESHLET_PAYLOAD_DESCRIPTOR_SET, binding = MESHLET_PAYLOAD_PAYLOAD_B uint data[]; } payload; -shared uvec4 shared_chunk_offset[MESHLET_PAYLOAD_NUM_U32_STREAMS]; -shared uvec2 chunk_values0[MESHLET_PAYLOAD_NUM_CHUNKS]; -shared uvec2 chunk_values1[MESHLET_PAYLOAD_NUM_CHUNKS]; - +shared uint shared_chunk_offset[MESHLET_PAYLOAD_NUM_U32_STREAMS][MESHLET_PAYLOAD_NUM_CHUNKS]; shared uint wave_buffer_x[MESHLET_PAYLOAD_NUM_CHUNKS]; shared uint wave_buffer_y[MESHLET_PAYLOAD_NUM_CHUNKS]; shared uint wave_buffer_z[MESHLET_PAYLOAD_NUM_CHUNKS]; @@ -90,6 +87,7 @@ shared uint wave_buffer_w[MESHLET_PAYLOAD_NUM_CHUNKS]; uvec2 wgx_inclusive_add(uvec2 v) { + v &= 0xff00ffu; v = subgroupInclusiveAdd(v); if (gl_SubgroupInvocationID == gl_SubgroupSize - 1) { @@ -110,6 +108,7 @@ uvec2 wgx_inclusive_add(uvec2 v) uvec4 wgx_inclusive_add(uvec4 v) { + v &= 0xff00ffu; v = subgroupInclusiveAdd(v); if (gl_SubgroupInvocationID == gl_SubgroupSize - 1) { @@ -164,20 +163,59 @@ void meshlet_compute_stream_counts(uint bitplane_value, out uint out_total_bits) void meshlet_init_workgroup(uint base_stream_index) { +#if 0 if (gl_LocalInvocationIndex < MESHLET_PAYLOAD_NUM_U32_STREAMS) { uint unrolled_stream_index = base_stream_index + gl_LocalInvocationIndex; - uvec4 bitplane_values = uvec4(meshlet_streams.data[unrolled_stream_index].bitplane_meta); - - uvec3 total_bits; - meshlet_compute_stream_counts(bitplane_values.x, total_bits.x); - meshlet_compute_stream_counts(bitplane_values.y, total_bits.y); - meshlet_compute_stream_counts(bitplane_values.z, total_bits.z); + uint chunk_offset = meshlet_streams.data[unrolled_stream_index].offset_from_base; + uvec4 bitplane_values0 = uvec4( + meshlet_streams.data[unrolled_stream_index].bitplane_meta[0], + meshlet_streams.data[unrolled_stream_index].bitplane_meta[1], + meshlet_streams.data[unrolled_stream_index].bitplane_meta[2], + meshlet_streams.data[unrolled_stream_index].bitplane_meta[3]); + uvec3 bitplane_values1 = uvec3( + meshlet_streams.data[unrolled_stream_index].bitplane_meta[4], + meshlet_streams.data[unrolled_stream_index].bitplane_meta[5], + meshlet_streams.data[unrolled_stream_index].bitplane_meta[6]); + + uvec4 total_bits; + meshlet_compute_stream_counts(bitplane_values0.x, total_bits.x); + meshlet_compute_stream_counts(bitplane_values0.y, total_bits.y); + meshlet_compute_stream_counts(bitplane_values0.z, total_bits.z); + meshlet_compute_stream_counts(bitplane_values0.w, total_bits.w); total_bits.y += total_bits.x; total_bits.z += total_bits.y; - uint chunk_offset = meshlet_streams.data[unrolled_stream_index].offset_from_base; - shared_chunk_offset[gl_LocalInvocationIndex] = chunk_offset + uvec4(0, total_bits); + total_bits.w += total_bits.z; + shared_chunk_offset[gl_LocalInvocationIndex][0] = chunk_offset; + shared_chunk_offset[gl_LocalInvocationIndex][1] = chunk_offset + total_bits.x; + shared_chunk_offset[gl_LocalInvocationIndex][2] = chunk_offset + total_bits.y; + shared_chunk_offset[gl_LocalInvocationIndex][3] = chunk_offset + total_bits.z; + chunk_offset += total_bits.w; + + meshlet_compute_stream_counts(bitplane_values1.x, total_bits.x); + meshlet_compute_stream_counts(bitplane_values1.y, total_bits.y); + meshlet_compute_stream_counts(bitplane_values1.z, total_bits.z); + total_bits.y += total_bits.x; + total_bits.z += total_bits.y; + shared_chunk_offset[gl_LocalInvocationIndex][4] = chunk_offset; + shared_chunk_offset[gl_LocalInvocationIndex][5] = chunk_offset + total_bits.x; + shared_chunk_offset[gl_LocalInvocationIndex][6] = chunk_offset + total_bits.y; + shared_chunk_offset[gl_LocalInvocationIndex][7] = chunk_offset + total_bits.z; + } +#else + for (uint i = gl_SubgroupID; i < MESHLET_PAYLOAD_NUM_U32_STREAMS; i += gl_NumSubgroups) + { + if (gl_SubgroupInvocationID < MESHLET_PAYLOAD_NUM_CHUNKS) + { + uint unrolled_stream_index = base_stream_index + i; + uint chunk_offset = meshlet_streams.data[unrolled_stream_index].offset_from_base; + uint bitplane = uint(meshlet_streams.data[unrolled_stream_index].bitplane_meta[gl_SubgroupInvocationID]); + uint total_bits; + meshlet_compute_stream_counts(bitplane, total_bits); + shared_chunk_offset[i][gl_SubgroupInvocationID] = chunk_offset + subgroupExclusiveAdd(total_bits); + } } +#endif barrier(); } diff --git a/scene-export/meshlet_export.cpp b/scene-export/meshlet_export.cpp index c38a4270..ced85ade 100644 --- a/scene-export/meshlet_export.cpp +++ b/scene-export/meshlet_export.cpp @@ -670,8 +670,8 @@ bool export_mesh_to_meshlet(const std::string &path, SceneFormats::Mesh mesh, Me for (auto &p : positions) position_buffer.push_back(decode_snorm_exp(p)); - constexpr unsigned max_vertices = 128; - constexpr unsigned max_primitives = 128; + constexpr unsigned max_vertices = 255; + constexpr unsigned max_primitives = 256; size_t num_meshlets = meshopt_buildMeshletsBound(mesh.count, max_vertices, max_primitives); std::vector out_vertex_redirection_buffer(num_meshlets * max_vertices); @@ -690,6 +690,7 @@ bool export_mesh_to_meshlet(const std::string &path, SceneFormats::Mesh mesh, Me std::vector out_index_buffer; out_meshlets.reserve(num_meshlets); + for (auto &meshlet : meshlets) { Meshlet m = {}; diff --git a/tests/assets/shaders/meshlet_debug_plain.mesh b/tests/assets/shaders/meshlet_debug_plain.mesh index 60b2083f..390e7ccd 100644 --- a/tests/assets/shaders/meshlet_debug_plain.mesh +++ b/tests/assets/shaders/meshlet_debug_plain.mesh @@ -3,9 +3,11 @@ #extension GL_EXT_shader_explicit_arithmetic_types_int16 : require #extension GL_EXT_shader_explicit_arithmetic_types_int8 : require #extension GL_EXT_scalar_block_layout : require +#extension GL_KHR_shader_subgroup_arithmetic : require +#extension GL_KHR_shader_subgroup_basic : require -layout(max_primitives = 128, max_vertices = 128, triangles) out; -layout(local_size_x = 128) in; +layout(max_primitives = 256, max_vertices = 255, triangles) out; +layout(local_size_x = 64, local_size_y = 4) in; #include "meshlet_render_types.h" @@ -50,12 +52,12 @@ layout(set = 0, binding = 5, std430) readonly buffer Transforms #define MESHLET_CULL 1 #if MESHLET_CULL -shared vec2 shared_window_positions[128]; -shared uint8_t shared_clip_code[128]; -shared uvec4 shared_active_vert; -shared uvec4 shared_active_prim; -shared uvec4 shared_active_vert_count; -shared uvec4 shared_active_prim_count; +shared vec2 shared_window_positions[255]; +shared uint8_t shared_clip_code[255]; +shared uint shared_active_vert[8]; +shared uint shared_active_prim[8]; +shared uint shared_active_vert_count[8]; +shared uint shared_active_prim_count[8]; shared uint shared_active_vert_count_total; shared uint shared_active_prim_count_total; @@ -124,7 +126,7 @@ bool cull_triangle(vec2 a, vec2 b, vec2 c) void main() { #if MESHLET_CULL - if (gl_LocalInvocationIndex < 4) + if (gl_LocalInvocationIndex < 8) { shared_active_vert[gl_LocalInvocationIndex] = 0; shared_active_prim[gl_LocalInvocationIndex] = 0; @@ -206,20 +208,21 @@ void main() barrier(); - if (gl_LocalInvocationIndex == 0) + if (gl_SubgroupInvocationID < 8 && gl_SubgroupID == 0) { - uvec3 num_active_prim = bitCount(shared_active_prim.xyz); - num_active_prim.y += num_active_prim.x; - num_active_prim.z += num_active_prim.y; - shared_active_prim_count = uvec4(0, num_active_prim); - - uvec3 num_active_vert = bitCount(shared_active_vert.xyz); - num_active_vert.y += num_active_vert.x; - num_active_vert.z += num_active_vert.y; - shared_active_vert_count = uvec4(0, num_active_vert); - - shared_active_prim_count_total = num_active_prim.z + bitCount(shared_active_prim.w); - shared_active_vert_count_total = num_active_vert.z + bitCount(shared_active_vert.w); + uint local_active_prim = bitCount(shared_active_prim[gl_SubgroupInvocationID]); + uint active_prim = subgroupInclusiveAdd(local_active_prim); + shared_active_prim_count[gl_SubgroupInvocationID] = active_prim - local_active_prim; + if (gl_SubgroupInvocationID == 7) + shared_active_prim_count_total = active_prim; + } + else if (gl_SubgroupInvocationID < 8 && gl_SubgroupID == 1) + { + uint local_active_vert = bitCount(shared_active_vert[gl_SubgroupInvocationID]); + uint active_vert = subgroupInclusiveAdd(local_active_vert); + shared_active_vert_count[gl_SubgroupInvocationID] = active_vert - local_active_vert; + if (gl_SubgroupInvocationID == 7) + shared_active_vert_count_total = active_vert; } barrier(); diff --git a/tests/meshopt_sandbox.cpp b/tests/meshopt_sandbox.cpp index 84180e1e..90531ef1 100644 --- a/tests/meshopt_sandbox.cpp +++ b/tests/meshopt_sandbox.cpp @@ -86,7 +86,7 @@ static void decode_mesh(std::vector &out_index_buffer, std::vector &out_index_buffer, std::vector Date: Mon, 18 Dec 2023 14:32:56 +0100 Subject: [PATCH 20/59] Experiment with wg256 on NV. --- tests/assets/shaders/meshlet_debug_plain.mesh | 72 ++++++++++++------- 1 file changed, 45 insertions(+), 27 deletions(-) diff --git a/tests/assets/shaders/meshlet_debug_plain.mesh b/tests/assets/shaders/meshlet_debug_plain.mesh index 390e7ccd..8b4746a9 100644 --- a/tests/assets/shaders/meshlet_debug_plain.mesh +++ b/tests/assets/shaders/meshlet_debug_plain.mesh @@ -7,7 +7,8 @@ #extension GL_KHR_shader_subgroup_basic : require layout(max_primitives = 256, max_vertices = 255, triangles) out; -layout(local_size_x = 64, local_size_y = 4) in; +//layout(local_size_x = 64, local_size_y = 4) in; +layout(local_size_x = 32) in; #include "meshlet_render_types.h" @@ -60,6 +61,7 @@ shared uint shared_active_vert_count[8]; shared uint shared_active_prim_count[8]; shared uint shared_active_vert_count_total; shared uint shared_active_prim_count_total; +shared uint8_t shared_vertex_map[255]; const uint CLIP_CODE_INACCURATE = 1 << 0; const uint CLIP_CODE_NEGATIVE_W = 1 << 1; @@ -84,6 +86,11 @@ bool lane_has_active_vert(uint index) return (shared_active_vert[index / 32u] & (1u << (index & 31u))) != 0u; } +bool lane_has_active_prim(uint index) +{ + return (shared_active_prim[index / 32u] & (1u << (index & 31u))) != 0u; +} + uvec3 remap_index_buffer(uvec3 prim) { return uvec3(compacted_vertex_output(prim.x), @@ -143,9 +150,9 @@ void main() vec4 clip_pos; uvec3 prim; - if (linear_index < meshlet.num_attributes) + for (uint i = linear_index; i < meshlet.num_attributes; i += gl_WorkGroupSize.x) { - vec3 pos = pos.data[meshlet.vertex_offset + linear_index]; + vec3 pos = pos.data[meshlet.vertex_offset + i]; world_pos = (M * vec4(pos, 1.0)).xyz; clip_pos = VP * vec4(world_pos, 1.0); @@ -164,16 +171,15 @@ void main() clip_code |= CLIP_CODE_POSITIVE_Y; vec2 window = roundEven(c * viewport.zw + viewport.xy); - shared_window_positions[linear_index] = window; - shared_clip_code[linear_index] = uint8_t(clip_code); + shared_window_positions[i] = window; + shared_clip_code[i] = uint8_t(clip_code); } barrier(); - bool is_active_prim = false; - if (linear_index < meshlet.num_primitives) + for (uint i = linear_index; i < meshlet.num_primitives; i += gl_WorkGroupSize.x) { - prim = uvec3(ibo.data[meshlet.primitive_offset + linear_index]); + prim = uvec3(ibo.data[meshlet.primitive_offset + i]); uint code_a = shared_clip_code[prim.x]; uint code_b = shared_clip_code[prim.y]; uint code_c = shared_clip_code[prim.z]; @@ -197,8 +203,7 @@ void main() if (force_accept) { - is_active_prim = true; - atomicOr(shared_active_prim[linear_index / 32], 1u << (linear_index & 31)); + atomicOr(shared_active_prim[i / 32], 1u << (i & 31)); atomicOr(shared_active_vert[prim.x / 32], 1u << (prim.x & 31)); atomicOr(shared_active_vert[prim.y / 32], 1u << (prim.y & 31)); atomicOr(shared_active_vert[prim.z / 32], 1u << (prim.z & 31)); @@ -215,14 +220,12 @@ void main() shared_active_prim_count[gl_SubgroupInvocationID] = active_prim - local_active_prim; if (gl_SubgroupInvocationID == 7) shared_active_prim_count_total = active_prim; - } - else if (gl_SubgroupInvocationID < 8 && gl_SubgroupID == 1) - { - uint local_active_vert = bitCount(shared_active_vert[gl_SubgroupInvocationID]); - uint active_vert = subgroupInclusiveAdd(local_active_vert); - shared_active_vert_count[gl_SubgroupInvocationID] = active_vert - local_active_vert; - if (gl_SubgroupInvocationID == 7) - shared_active_vert_count_total = active_vert; + + uint local_active_vert = bitCount(shared_active_vert[gl_SubgroupInvocationID]); + uint active_vert = subgroupInclusiveAdd(local_active_vert); + shared_active_vert_count[gl_SubgroupInvocationID] = active_vert - local_active_vert; + if (gl_SubgroupInvocationID == 7) + shared_active_vert_count_total = active_vert; } barrier(); @@ -232,19 +235,34 @@ void main() SetMeshOutputsEXT(num_verts, num_prims); - if (is_active_prim) - gl_PrimitiveTriangleIndicesEXT[compacted_index_output(linear_index)] = remap_index_buffer(prim); + for (uint i = linear_index; i < meshlet.num_primitives; i += gl_WorkGroupSize.x) + { + if (lane_has_active_prim(i)) + { + prim = uvec3(ibo.data[meshlet.primitive_offset + i]); + gl_PrimitiveTriangleIndicesEXT[compacted_index_output(i)] = remap_index_buffer(prim); + vDrawID[compacted_index_output(i)] = task.meshlet_index; + } + } - if (gl_LocalInvocationIndex < num_prims) - vDrawID[gl_LocalInvocationIndex] = task.meshlet_index; + for (uint i = linear_index; i < meshlet.num_attributes; i += gl_WorkGroupSize.x) + if (lane_has_active_vert(i)) + shared_vertex_map[compacted_vertex_output(i)] = uint8_t(i); + + barrier(); - bool has_active_vert = lane_has_active_vert(linear_index); - if (has_active_vert) + for (uint i = linear_index; i < num_verts; i += gl_WorkGroupSize.x) { - uint out_vert_index = compacted_vertex_output(linear_index); - gl_MeshVerticesEXT[out_vert_index].gl_Position = clip_pos; - vWorldPos[out_vert_index] = world_pos; + uint remapped_index = uint(shared_vertex_map[i]); + + vec3 pos = pos.data[meshlet.vertex_offset + remapped_index]; + world_pos = (M * vec4(pos, 1.0)).xyz; + clip_pos = VP * vec4(world_pos, 1.0); + + gl_MeshVerticesEXT[i].gl_Position = clip_pos; + vWorldPos[i] = world_pos; } + #else SetMeshOutputsEXT(meshlet.num_attributes, meshlet.num_primitives); if (linear_index < meshlet.num_attributes) From 54fe53cd69d9deb4b520fbecdb1df0fdce7918dc Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Mon, 18 Dec 2023 14:38:11 +0100 Subject: [PATCH 21/59] Revert "Experiment with wg256 on NV." This reverts commit 9e033b90becccf7a3734ad1e87cc4e475500d896. --- tests/assets/shaders/meshlet_debug_plain.mesh | 72 +++++++------------ 1 file changed, 27 insertions(+), 45 deletions(-) diff --git a/tests/assets/shaders/meshlet_debug_plain.mesh b/tests/assets/shaders/meshlet_debug_plain.mesh index 8b4746a9..390e7ccd 100644 --- a/tests/assets/shaders/meshlet_debug_plain.mesh +++ b/tests/assets/shaders/meshlet_debug_plain.mesh @@ -7,8 +7,7 @@ #extension GL_KHR_shader_subgroup_basic : require layout(max_primitives = 256, max_vertices = 255, triangles) out; -//layout(local_size_x = 64, local_size_y = 4) in; -layout(local_size_x = 32) in; +layout(local_size_x = 64, local_size_y = 4) in; #include "meshlet_render_types.h" @@ -61,7 +60,6 @@ shared uint shared_active_vert_count[8]; shared uint shared_active_prim_count[8]; shared uint shared_active_vert_count_total; shared uint shared_active_prim_count_total; -shared uint8_t shared_vertex_map[255]; const uint CLIP_CODE_INACCURATE = 1 << 0; const uint CLIP_CODE_NEGATIVE_W = 1 << 1; @@ -86,11 +84,6 @@ bool lane_has_active_vert(uint index) return (shared_active_vert[index / 32u] & (1u << (index & 31u))) != 0u; } -bool lane_has_active_prim(uint index) -{ - return (shared_active_prim[index / 32u] & (1u << (index & 31u))) != 0u; -} - uvec3 remap_index_buffer(uvec3 prim) { return uvec3(compacted_vertex_output(prim.x), @@ -150,9 +143,9 @@ void main() vec4 clip_pos; uvec3 prim; - for (uint i = linear_index; i < meshlet.num_attributes; i += gl_WorkGroupSize.x) + if (linear_index < meshlet.num_attributes) { - vec3 pos = pos.data[meshlet.vertex_offset + i]; + vec3 pos = pos.data[meshlet.vertex_offset + linear_index]; world_pos = (M * vec4(pos, 1.0)).xyz; clip_pos = VP * vec4(world_pos, 1.0); @@ -171,15 +164,16 @@ void main() clip_code |= CLIP_CODE_POSITIVE_Y; vec2 window = roundEven(c * viewport.zw + viewport.xy); - shared_window_positions[i] = window; - shared_clip_code[i] = uint8_t(clip_code); + shared_window_positions[linear_index] = window; + shared_clip_code[linear_index] = uint8_t(clip_code); } barrier(); - for (uint i = linear_index; i < meshlet.num_primitives; i += gl_WorkGroupSize.x) + bool is_active_prim = false; + if (linear_index < meshlet.num_primitives) { - prim = uvec3(ibo.data[meshlet.primitive_offset + i]); + prim = uvec3(ibo.data[meshlet.primitive_offset + linear_index]); uint code_a = shared_clip_code[prim.x]; uint code_b = shared_clip_code[prim.y]; uint code_c = shared_clip_code[prim.z]; @@ -203,7 +197,8 @@ void main() if (force_accept) { - atomicOr(shared_active_prim[i / 32], 1u << (i & 31)); + is_active_prim = true; + atomicOr(shared_active_prim[linear_index / 32], 1u << (linear_index & 31)); atomicOr(shared_active_vert[prim.x / 32], 1u << (prim.x & 31)); atomicOr(shared_active_vert[prim.y / 32], 1u << (prim.y & 31)); atomicOr(shared_active_vert[prim.z / 32], 1u << (prim.z & 31)); @@ -220,12 +215,14 @@ void main() shared_active_prim_count[gl_SubgroupInvocationID] = active_prim - local_active_prim; if (gl_SubgroupInvocationID == 7) shared_active_prim_count_total = active_prim; - - uint local_active_vert = bitCount(shared_active_vert[gl_SubgroupInvocationID]); - uint active_vert = subgroupInclusiveAdd(local_active_vert); - shared_active_vert_count[gl_SubgroupInvocationID] = active_vert - local_active_vert; - if (gl_SubgroupInvocationID == 7) - shared_active_vert_count_total = active_vert; + } + else if (gl_SubgroupInvocationID < 8 && gl_SubgroupID == 1) + { + uint local_active_vert = bitCount(shared_active_vert[gl_SubgroupInvocationID]); + uint active_vert = subgroupInclusiveAdd(local_active_vert); + shared_active_vert_count[gl_SubgroupInvocationID] = active_vert - local_active_vert; + if (gl_SubgroupInvocationID == 7) + shared_active_vert_count_total = active_vert; } barrier(); @@ -235,34 +232,19 @@ void main() SetMeshOutputsEXT(num_verts, num_prims); - for (uint i = linear_index; i < meshlet.num_primitives; i += gl_WorkGroupSize.x) - { - if (lane_has_active_prim(i)) - { - prim = uvec3(ibo.data[meshlet.primitive_offset + i]); - gl_PrimitiveTriangleIndicesEXT[compacted_index_output(i)] = remap_index_buffer(prim); - vDrawID[compacted_index_output(i)] = task.meshlet_index; - } - } + if (is_active_prim) + gl_PrimitiveTriangleIndicesEXT[compacted_index_output(linear_index)] = remap_index_buffer(prim); - for (uint i = linear_index; i < meshlet.num_attributes; i += gl_WorkGroupSize.x) - if (lane_has_active_vert(i)) - shared_vertex_map[compacted_vertex_output(i)] = uint8_t(i); - - barrier(); + if (gl_LocalInvocationIndex < num_prims) + vDrawID[gl_LocalInvocationIndex] = task.meshlet_index; - for (uint i = linear_index; i < num_verts; i += gl_WorkGroupSize.x) + bool has_active_vert = lane_has_active_vert(linear_index); + if (has_active_vert) { - uint remapped_index = uint(shared_vertex_map[i]); - - vec3 pos = pos.data[meshlet.vertex_offset + remapped_index]; - world_pos = (M * vec4(pos, 1.0)).xyz; - clip_pos = VP * vec4(world_pos, 1.0); - - gl_MeshVerticesEXT[i].gl_Position = clip_pos; - vWorldPos[i] = world_pos; + uint out_vert_index = compacted_vertex_output(linear_index); + gl_MeshVerticesEXT[out_vert_index].gl_Position = clip_pos; + vWorldPos[out_vert_index] = world_pos; } - #else SetMeshOutputsEXT(meshlet.num_attributes, meshlet.num_primitives); if (linear_index < meshlet.num_attributes) From 6fe9819defc01fed5bcb94238e183ad9ddc16282 Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Tue, 19 Dec 2023 15:34:20 +0100 Subject: [PATCH 22/59] Start rewriting everything for new meshlet format. --- physics/physics_system.cpp | 2 +- physics/physics_system.hpp | 2 +- scene-export/meshlet_export.cpp | 675 ++++++++++++++------------- tests/meshopt_sandbox.cpp | 103 ++-- viewer/physics_sandbox.cpp | 2 +- vulkan/managers/resource_manager.cpp | 2 +- vulkan/mesh/meshlet.cpp | 16 +- vulkan/mesh/meshlet.hpp | 36 +- 8 files changed, 431 insertions(+), 407 deletions(-) diff --git a/physics/physics_system.cpp b/physics/physics_system.cpp index c206b996..57e804ca 100644 --- a/physics/physics_system.cpp +++ b/physics/physics_system.cpp @@ -439,7 +439,7 @@ unsigned PhysicsSystem::register_collision_mesh(const CollisionMesh &mesh) auto *index_vertex_array = new btTriangleIndexVertexArray(mesh.num_triangles, const_cast(reinterpret_cast(mesh.indices)), mesh.index_stride_triangle, - mesh.num_vertices, + mesh.num_attributes, const_cast(mesh.positions), mesh.position_stride); const vec3 &lo = mesh.aabb.get_minimum(); diff --git a/physics/physics_system.hpp b/physics/physics_system.hpp index ee703542..94a0f059 100644 --- a/physics/physics_system.hpp +++ b/physics/physics_system.hpp @@ -175,7 +175,7 @@ class PhysicsSystem final : public PhysicsSystemInterface struct CollisionMesh { unsigned num_triangles = 0; - unsigned num_vertices = 0; + unsigned num_attributes = 0; const uint32_t *indices = nullptr; size_t index_stride_triangle = 0; const float *positions = nullptr; diff --git a/scene-export/meshlet_export.cpp b/scene-export/meshlet_export.cpp index ced85ade..175980e3 100644 --- a/scene-export/meshlet_export.cpp +++ b/scene-export/meshlet_export.cpp @@ -36,7 +36,7 @@ using namespace Vulkan::Meshlet; struct Metadata : Header { Bound bound; - Stream u32_streams[MaxU32Streams]; + Stream streams[MaxStreams]; }; struct CombinedMesh @@ -49,60 +49,51 @@ struct CombinedMesh struct Encoded { - std::vector payload; + std::vector payload; CombinedMesh mesh; }; struct Meshlet { uint32_t offset; - uint32_t count; + uint32_t primitive_count; + uint32_t vertex_count; }; struct PrimitiveAnalysisResult { uint32_t num_primitives; - uint32_t num_vertices; + uint32_t num_attributes; }; -static i16vec4 encode_vec3_to_snorm_exp(vec3 v) +static i16vec3 encode_vec3_to_snorm_exp(vec3 v, int scale_log2) { - vec3 vabs = abs(v); - float max_scale = max(max(vabs.x, vabs.y), vabs.z); - int max_scale_log2 = int(muglm::floor(log2(max_scale))); - int scale_log2 = 14 - max_scale_log2; - - // Maximum component should have range of [1, 2) since we use floor of log2, so scale with 2^14 instead of 15. v.x = ldexpf(v.x, scale_log2); v.y = ldexpf(v.y, scale_log2); v.z = ldexpf(v.z, scale_log2); v = clamp(round(v), vec3(-0x8000), vec3(0x7fff)); - - return i16vec4(i16vec3(v), int16_t(-scale_log2)); + return i16vec3(v); } -static i16vec3 encode_vec2_to_snorm_exp(vec2 v) +static i16vec2 encode_vec2_to_snorm_exp(vec2 v, int scale_log2) { - vec2 vabs = abs(v); - float max_scale = max(vabs.x, vabs.y); - int max_scale_log2 = int(muglm::floor(log2(max_scale))); - int scale_log2 = 14 - max_scale_log2; - - // UVs are unorm scaled, don't need more accuracy than this. - // If all UVs are in range of [0, 1] space, we should get a constant exponent which aids compression. - scale_log2 = min(scale_log2, 15); - - // Maximum component should have range of [1, 2) since we use floor of log2, so scale with 2^14 instead of 15. v.x = ldexpf(v.x, scale_log2); v.y = ldexpf(v.y, scale_log2); v = clamp(round(v), vec2(-0x8000), vec2(0x7fff)); + return i16vec2(v); +} - return i16vec3(i16vec2(v), int16_t(-scale_log2)); +static int compute_log2_scale(float max_value) +{ + // Maximum component should have range of [1, 2) since we use floor of log2, so scale with 2^14 instead of 15. + int max_scale_log2 = int(muglm::floor(muglm::log2(max_value))); + int scale_log2 = 14 - max_scale_log2; + return scale_log2; } -static std::vector mesh_extract_position_snorm_exp(const SceneFormats::Mesh &mesh) +static std::vector mesh_extract_position_snorm_exp(const SceneFormats::Mesh &mesh, int &exp) { - std::vector encoded_positions; + std::vector encoded_positions; std::vector positions; size_t num_positions = mesh.positions.size() / mesh.position_stride; @@ -113,7 +104,11 @@ static std::vector mesh_extract_position_snorm_exp(const SceneFormats:: if (fmt == VK_FORMAT_R32G32B32A32_SFLOAT || fmt == VK_FORMAT_R32G32B32_SFLOAT) { for (size_t i = 0; i < num_positions; i++) - memcpy(positions[i].data, mesh.positions.data() + i * mesh.position_stride + layout.offset, sizeof(float) * 3); + { + memcpy(positions[i].data, + mesh.positions.data() + i * mesh.position_stride + layout.offset, + sizeof(float) * 3); + } } else if (fmt == VK_FORMAT_UNDEFINED) return {}; @@ -123,75 +118,108 @@ static std::vector mesh_extract_position_snorm_exp(const SceneFormats:: return {}; } + vec3 max_extent = vec3(0.0f); + for (auto &p : positions) + max_extent = max(max_extent, abs(p)); + + float max_value = max(max(max_extent.x, max_extent.y), max_extent.z); + int log2_scale = compute_log2_scale(max_value); + + log2_scale = std::min(log2_scale, 12); + encoded_positions.reserve(positions.size()); for (auto &pos : positions) - encoded_positions.push_back(encode_vec3_to_snorm_exp(pos)); + encoded_positions.push_back(encode_vec3_to_snorm_exp(pos, log2_scale)); + exp = -log2_scale; return encoded_positions; } -static std::vector mesh_extract_normal_tangent_oct8(const SceneFormats::Mesh &mesh, MeshAttribute attr) +struct NormalTangent { - std::vector encoded_attributes; - std::vector normals; + i8vec2 n; + i8vec2 t; + bool t_sign; +}; - auto &layout = mesh.attribute_layout[Util::ecast(attr)]; - auto fmt = layout.format; +static std::vector mesh_extract_normal_tangent_oct8(const SceneFormats::Mesh &mesh) +{ + std::vector encoded_attributes; + std::vector normals; + std::vector tangents; + + auto &normal = mesh.attribute_layout[Util::ecast(MeshAttribute::Normal)]; + auto &tangent = mesh.attribute_layout[Util::ecast(MeshAttribute::Tangent)]; size_t num_attrs = mesh.attributes.size() / mesh.attribute_stride; normals.resize(num_attrs); + tangents.resize(num_attrs); - if (fmt == VK_FORMAT_R32G32B32_SFLOAT) + if (normal.format == VK_FORMAT_R32G32B32_SFLOAT || normal.format == VK_FORMAT_R32G32B32A32_SFLOAT) { for (size_t i = 0; i < num_attrs; i++) { memcpy(normals[i].data, - mesh.attributes.data() + i * mesh.attribute_stride + layout.offset, + mesh.attributes.data() + i * mesh.attribute_stride + normal.offset, + sizeof(float) * 3); + } + } + else if (normal.format == VK_FORMAT_UNDEFINED) + return {}; + else + { + LOGE("Unexpected format %u.\n", normal.format); + return {}; + } + + if (tangent.format == VK_FORMAT_R32G32B32_SFLOAT) + { + for (size_t i = 0; i < num_attrs; i++) + { + memcpy(tangents[i].data, + mesh.attributes.data() + i * mesh.attribute_stride + tangent.offset, sizeof(float) * 3); - normals[i].w = 0.0f; + tangents[i].w = 0.0f; } } - else if (fmt == VK_FORMAT_R32G32B32A32_SFLOAT) + else if (tangent.format == VK_FORMAT_R32G32B32A32_SFLOAT) { for (size_t i = 0; i < num_attrs; i++) { memcpy(normals[i].data, - mesh.attributes.data() + i * mesh.attribute_stride + layout.offset, + mesh.attributes.data() + i * mesh.attribute_stride + tangent.offset, sizeof(float) * 4); } } - else if (fmt == VK_FORMAT_UNDEFINED) + else if (tangent.format == VK_FORMAT_UNDEFINED) return {}; else { - LOGE("Unexpected format %u.\n", fmt); + LOGE("Unexpected format %u.\n", tangent.format); return {}; } encoded_attributes.resize(normals.size()); - meshopt_encodeFilterOct(encoded_attributes.data(), encoded_attributes.size(), - sizeof(i8vec4), 8, normals[0].data); - for (auto &n : encoded_attributes) - n.w = n.w <= 0 ? -1 : 0; - return encoded_attributes; -} + std::vector n(encoded_attributes.size()); + std::vector t(encoded_attributes.size()); + meshopt_encodeFilterOct(n.data(), n.size(), sizeof(i8vec4), 8, normals[0].data); + meshopt_encodeFilterOct(t.data(), t.size(), sizeof(i8vec4), 8, tangents[0].data); -static i16vec4 encode_uv_to_snorm_scale(vec2 uv) -{ - // UVs tend to be in [0, 1] range. Readjust to use more of the available range. - uv = 2.0f * uv - 1.0f; - return i16vec4(encode_vec2_to_snorm_exp(uv), 0); + for (size_t i = 0, size = encoded_attributes.size(); i < size; i++) + encoded_attributes.push_back({ n[i].xy(), t[i].xy(), tangents[i].w < 0.0f }); + + return encoded_attributes; } -static std::vector mesh_extract_uv_snorm_scale(const SceneFormats::Mesh &mesh) +static std::vector mesh_extract_uv_snorm_scale(const SceneFormats::Mesh &mesh, int &exp) { - std::vector encoded_uvs; + std::vector encoded_uvs; std::vector uvs; size_t num_uvs = mesh.attributes.size() / mesh.attribute_stride; uvs.resize(num_uvs); - auto &layout = mesh.attribute_layout[Util::ecast(MeshAttribute::UV)]; + auto &layout = mesh.attribute_layout[int(MeshAttribute::UV)]; auto fmt = layout.format; if (fmt == VK_FORMAT_R32G32_SFLOAT) @@ -216,23 +244,26 @@ static std::vector mesh_extract_uv_snorm_scale(const SceneFormats::Mesh return {}; } + vec2 max_extent = vec2(0.0f); + for (auto &uv : uvs) + { + // UVs tend to be in [0, 1] range. Readjust to use more of the available range. + uv = 2.0f * uv - 1.0f; + max_extent = max(max_extent, abs(uv)); + } + + float max_value = max(max_extent.x, max_extent.y); + int log2_scale = compute_log2_scale(max_value); + encoded_uvs.reserve(uvs.size()); for (auto &uv : uvs) - encoded_uvs.push_back(encode_uv_to_snorm_scale(uv)); + encoded_uvs.push_back(encode_vec2_to_snorm_exp(uv, log2_scale)); + exp = -log2_scale; return encoded_uvs; } -static vec3 decode_snorm_exp(i16vec4 p) -{ - vec3 result; - result.x = ldexpf(float(p.x), p.w); - result.y = ldexpf(float(p.y), p.w); - result.z = ldexpf(float(p.z), p.w); - return result; -} - -static PrimitiveAnalysisResult analyze_primitive_count(std::unordered_map &vertex_remap, +static PrimitiveAnalysisResult analyze_primitive_count(std::unordered_map &vertex_remap, const uint32_t *index_buffer, uint32_t max_num_primitives) { PrimitiveAnalysisResult result = {}; @@ -264,260 +295,275 @@ static PrimitiveAnalysisResult analyze_primitive_count(std::unordered_map = 0x80u) - v ^= 0xffu; - result[i] = v == 0 ? 1 : (33 - leading_zeroes(v)); - } - } - return result; + vec3 result; + result.x = ldexpf(float(p.x), exp); + result.y = ldexpf(float(p.y), exp); + result.z = ldexpf(float(p.z), exp); + return result; } -static uint32_t extract_bit_plane(const uint8_t *bytes, unsigned bit_index) +static void encode_index_stream(std::vector &out_payload_buffer, + u8vec3 (&stream_buffer)[ElementsPerChunk]) { - uint32_t u32 = 0; - for (unsigned i = 0; i < 32; i++) - u32 |= ((bytes[4 * i] >> bit_index) & 1u) << i; - return u32; -} + PayloadB128 p0{}; + PayloadB128 p1{}; + PayloadB128 p2{}; + PayloadB128 p3{}; -static void find_linear_predictor(uint16_t *predictor, - const u8vec4 (&stream_buffer)[MaxElements], - unsigned num_elements) -{ - // Sign-extend since the deltas are considered to be signed ints. - ivec4 unrolled_data[MaxElements]; - for (unsigned i = 0; i < num_elements; i++) - unrolled_data[i] = ivec4(i8vec4(stream_buffer[i])); - - // Simple linear regression. - // Pilfered from: https://www.codesansar.com/numerical-methods/linear-regression-method-using-c-programming.htm - ivec4 x{0}, x2{0}, y{0}, xy{0}; - for (unsigned i = 0; i < num_elements; i++) + for (unsigned i = 0; i < 32; i++) { - x += int(i); - x2 += int(i * i); - y += unrolled_data[i]; - xy += int(i) * unrolled_data[i]; + u8vec3 indices = stream_buffer[i]; + assert(all(lessThan(indices, u8vec3(32)))); + + p0.words[0] |= ((indices.x >> 0u) & 1u) << i; + p0.words[1] |= ((indices.x >> 1u) & 1u) << i; + p0.words[2] |= ((indices.x >> 2u) & 1u) << i; + p0.words[3] |= ((indices.x >> 3u) & 1u) << i; + p3.words[0] |= ((indices.x >> 4u) & 1u) << i; + + p1.words[0] |= ((indices.y >> 0u) & 1u) << i; + p1.words[1] |= ((indices.y >> 1u) & 1u) << i; + p1.words[2] |= ((indices.y >> 2u) & 1u) << i; + p1.words[3] |= ((indices.y >> 3u) & 1u) << i; + p3.words[1] |= ((indices.y >> 4u) & 1u) << i; + + p2.words[0] |= ((indices.z >> 0u) & 1u) << i; + p2.words[1] |= ((indices.z >> 1u) & 1u) << i; + p2.words[2] |= ((indices.z >> 2u) & 1u) << i; + p2.words[3] |= ((indices.z >> 3u) & 1u) << i; + p3.words[2] |= ((indices.z >> 4u) & 1u) << i; } - int n = int(num_elements); - ivec4 b_denom = (n * x2 - x * x); - b_denom = select(b_denom, ivec4(1), equal(ivec4(0), b_denom)); - - // Encode in u8.8 fixed point. - ivec4 b = (ivec4(256) * (n * xy - x * y)) / b_denom; - ivec4 a = ((ivec4(256) * y - b * x)) / n; - - for (unsigned i = 0; i < 4; i++) - predictor[i] = uint16_t(a[i]); - for (unsigned i = 0; i < 4; i++) - predictor[4 + i] = uint16_t(b[i]); + out_payload_buffer.push_back(p0); + out_payload_buffer.push_back(p1); + out_payload_buffer.push_back(p2); + out_payload_buffer.push_back(p3); } -static size_t encode_stream(std::vector &out_payload_buffer, - Stream &stream, u8vec4 (&stream_buffer)[MaxElements], - unsigned num_elements) +static void encode_attribute_stream(std::vector &out_payload_buffer, + Stream &stream, + const u16vec3 *raw_positions, + uint32_t chunk_index, const uint32_t *vbo_remap, + uint32_t num_attributes) { - stream.offset_from_base_u32 = uint32_t(out_payload_buffer.size()); + u16vec3 positions[ElementsPerChunk]; + for (uint32_t i = 0; i < num_attributes; i++) + positions[i] = raw_positions[vbo_remap[i]]; + for (uint32_t i = num_attributes; i < ElementsPerChunk; i++) + positions[i] = positions[0]; - // Delta-encode - u8vec4 current_value; - if (num_elements > 1) - current_value = u8vec4(2) * stream_buffer[0] - stream_buffer[1]; - else - current_value = stream_buffer[0]; - u8vec4 bias_value = current_value; + u16vec3 lo{0xffff}; + u16vec3 hi{0}; - for (unsigned i = 0; i < num_elements; i++) + for (auto &p : positions) { - u8vec4 next_value = stream_buffer[i]; - stream_buffer[i] = next_value - current_value; - current_value = next_value; + lo = min(lo, p); + hi = max(hi, p); } - // Find optimal linear predictor. - find_linear_predictor(stream.predictor, stream_buffer, num_elements); - - // u8.8 fixed point. - auto base_predictor = u16vec4(stream.predictor[0], stream.predictor[1], stream.predictor[2], stream.predictor[3]); - auto linear_predictor = u16vec4(stream.predictor[4], stream.predictor[5], stream.predictor[6], stream.predictor[7]); + u16vec3 diff = hi - lo; + u16vec3 diff_rev = lo - hi; - for (unsigned i = 0; i < num_elements; i++) + unsigned diff3 = max(max(diff.x, diff.y), diff.z); + unsigned diff3_rev = max(max(diff_rev.x, diff_rev.y), diff_rev.z); + if (diff3_rev < diff3) { - // Only predict in-bounds elements, since we want all out of bounds elements to be encoded to 0 delta - // without having them affect the predictor. - stream_buffer[i] -= u8vec4((base_predictor + linear_predictor * uint16_t(i)) >> uint16_t(8)); + std::swap(lo, hi); + diff3 = diff3_rev; } - for (unsigned i = num_elements; i < MaxElements; i++) - stream_buffer[i] = u8vec4(0); + unsigned bits = compute_required_bits_unsigned(diff3); + unsigned encoded_bits = (bits + 1) / 2; - // Try to adjust the range such that it can fit in fewer bits. - // We can use the constant term in the linear predictor to nudge values in place. - i8vec4 lo(127); - i8vec4 hi(-128); + stream.bit_plane_config0 |= encoded_bits << (4 * chunk_index); - for (unsigned i = 0; i < num_elements; i++) + stream.base_value_or_vertex_offset[chunk_index] = uint32_t(lo.x) | (uint32_t(lo.y) << 16); + stream.base_value_or_vertex_offset[chunk_index / 2 + 8] |= uint32_t(lo.z) << (16 * (chunk_index & 1)); + for (auto &p : positions) + p -= lo; + + if (encoded_bits == 8) { - lo = min(lo, i8vec4(stream_buffer[i])); - hi = max(hi, i8vec4(stream_buffer[i])); + // Plain write. + PayloadB128 p[12]; + + for (uint32_t i = 0; i < ElementsPerChunk; i++) + { + u16vec2 d = positions[i].xy(); + p[i / 4].words[i % 4] = uint32_t(d.x) | (uint32_t(d.y) << 16); + } + + for (uint32_t i = 0; i < ElementsPerChunk / 2; i++) + { + u16vec2 d = u16vec2(positions[2 * i].z, positions[2 * i + 1].z); + p[8 + i / 4].words[i % 4] = uint32_t(d.x) | (uint32_t(d.y) << 16); + } + + out_payload_buffer.insert(out_payload_buffer.end(), p, p + 12); } + else + { + unsigned bit_offset = 0; - uvec4 full_bits = compute_required_bits_unsigned(u8vec4(hi - lo)); - u8vec4 target_lo_value = u8vec4(-((uvec4(1) << full_bits) >> 1u)); - u8vec4 bias = target_lo_value - u8vec4(lo); + if (encoded_bits & 4) + { + PayloadB128 p[6]{}; - for (unsigned i = 0; i < num_elements; i++) - stream_buffer[i] += bias; + for (uint32_t i = 0; i < ElementsPerChunk; i++) + { + u16vec3 d = positions[i]; + for (int c = 0; c < 3; c++) + for (int b = 0; b < 8; b++) + p[c * 2 + b / 4].words[b % 4] |= ((d[c] >> (bit_offset + b)) & 1u) << i; + } - for (unsigned i = 0; i < 4; i++) - stream.predictor[i] -= uint16_t(bias[i]) << 8; + for (auto v : p) + out_payload_buffer.push_back(v); + bit_offset += 8; + } - // Based on the linear predictor, it's possible that the encoded value in stream_buffer[0] becomes non-zero again. - // This is undesirable, since we can use the initial value to force a delta of 0 here, saving precious bits. - bias_value += stream_buffer[0]; - stream_buffer[0] = u8vec4(0); + if (encoded_bits & 2) + { + PayloadB128 p[3]{}; - // Simple linear predictor, base equal elements[0], gradient = 0. - stream.predictor[8] = uint16_t((bias_value.y << 8) | bias_value.x); - stream.predictor[9] = uint16_t((bias_value.w << 8) | bias_value.z); + for (uint32_t i = 0; i < ElementsPerChunk; i++) + { + u16vec3 d = positions[i]; + for (int c = 0; c < 3; c++) + for (int b = 0; b < 4; b++) + p[c].words[b] |= ((d[c] >> (bit_offset + b)) & 1u) << i; + } - // Encode 32 elements at once. - for (unsigned chunk_index = 0; chunk_index < MaxElements / 32; chunk_index++) - { - uvec4 required_bits = {}; - for (unsigned i = 0; i < 32; i++) - required_bits = max(required_bits, compute_required_bits_signed(stream_buffer[chunk_index * 32 + i])); - - // Encode bit counts. - stream.bitplane_meta[chunk_index] = uint16_t((required_bits.x << 0) | (required_bits.y << 4) | - (required_bits.z << 8) | (required_bits.w << 12)); - - for (unsigned i = 0; i < required_bits.x; i++) - out_payload_buffer.push_back(extract_bit_plane(&stream_buffer[chunk_index * 32][0], i)); - for (unsigned i = 0; i < required_bits.y; i++) - out_payload_buffer.push_back(extract_bit_plane(&stream_buffer[chunk_index * 32][1], i)); - for (unsigned i = 0; i < required_bits.z; i++) - out_payload_buffer.push_back(extract_bit_plane(&stream_buffer[chunk_index * 32][2], i)); - for (unsigned i = 0; i < required_bits.w; i++) - out_payload_buffer.push_back(extract_bit_plane(&stream_buffer[chunk_index * 32][3], i)); - } + for (auto v : p) + out_payload_buffer.push_back(v); + bit_offset += 4; + } - return out_payload_buffer.size() - stream.offset_from_base_u32; + if (encoded_bits & 1) + { + PayloadB128 p[2]{}; + for (uint32_t i = 0; i < ElementsPerChunk; i++) + { + u16vec3 d = positions[i]; + + p[0].words[0] |= ((d.x >> bit_offset) & 1u) << i; + p[0].words[1] |= ((d.x >> (bit_offset + 1)) & 1u) << i; + p[0].words[2] |= ((d.y >> bit_offset) & 1u) << i; + p[0].words[3] |= ((d.y >> (bit_offset + 1)) & 1u) << i; + p[1].words[0] |= ((d.z >> bit_offset) & 1u) << i; + p[1].words[1] |= ((d.z >> (bit_offset + 1)) & 1u) << i; + } + + for (auto v : p) + out_payload_buffer.push_back(v); + bit_offset += 2; + } + } } static void encode_mesh(Encoded &encoded, const Meshlet *meshlets, size_t num_meshlets, - const uint32_t *index_buffer, uint32_t primitive_count, - const uint32_t *attributes, - unsigned num_u32_streams) + const void * const *pp_data, + const int *p_aux, + unsigned num_streams) { encoded = {}; auto &mesh = encoded.mesh; - mesh.stream_count = num_u32_streams + 1; - mesh.meshlets.reserve(num_meshlets); - uint32_t base_vertex_offset = 0; - - std::unordered_map vbo_remap; - uint32_t primitive_index = 0; - size_t words_per_stream[MaxU32Streams] = {}; + assert(num_streams > 0); + mesh.stream_count = num_streams; - for (uint32_t meshlet_index = 0; meshlet_index < num_meshlets; meshlet_index++) - { - uint32_t primitives_to_process = min(primitive_count - primitive_index, meshlets[meshlet_index].count); - assert(primitives_to_process); - assert(primitive_count > primitive_index); + size_t num_full_meshlets = (num_meshlets + NumChunks - 1) / NumChunks; + mesh.meshlets.reserve(num_full_meshlets); + uint32_t base_vertex_offset = 0; - primitive_index = meshlets[meshlet_index].offset; + auto *index_buffer = static_cast(pp_data[0]); - auto analysis_result = analyze_primitive_count( - vbo_remap, index_buffer + 3 * primitive_index, - primitives_to_process); + std::unordered_map vbo_remap; - assert(analysis_result.num_primitives); - assert(analysis_result.num_vertices); + for (uint32_t full_meshlet_index = 0; full_meshlet_index < num_full_meshlets; full_meshlet_index++) + { + Metadata out_meshlet = {}; + out_meshlet.base_vertex_offset = base_vertex_offset; - primitives_to_process = analysis_result.num_primitives; + uint32_t num_chunks = std::min(num_meshlets - full_meshlet_index * NumChunks, NumChunks); + for (uint32_t chunk_index = 0; chunk_index < num_chunks; chunk_index++) + { + auto &meshlet = meshlets[full_meshlet_index * NumChunks + chunk_index]; - Metadata meshlet = {}; - u8vec4 stream_buffer[MaxElements]; + uint32_t primitive_index = meshlets[full_meshlet_index].offset; - meshlet.base_vertex_offset = base_vertex_offset; - meshlet.num_primitives = analysis_result.num_primitives; - meshlet.num_attributes = analysis_result.num_vertices; + auto analysis_result = analyze_primitive_count( + vbo_remap, index_buffer + 3 * primitive_index, + meshlet.primitive_count); + assert(analysis_result.num_primitives <= ElementsPerChunk); + assert(analysis_result.num_attributes <= ElementsPerChunk); - // Encode index buffer. - for (uint32_t i = 0; i < analysis_result.num_primitives; i++) - { - uint8_t i0 = vbo_remap[index_buffer[3 * (primitive_index + i) + 0]]; - uint8_t i1 = vbo_remap[index_buffer[3 * (primitive_index + i) + 1]]; - uint8_t i2 = vbo_remap[index_buffer[3 * (primitive_index + i) + 2]]; - stream_buffer[i] = u8vec4(i0, i1, i2, 0); - } + // Encode index buffer + { + u8vec3 index_stream_buffer[ElementsPerChunk]; + for (uint32_t i = 0; i < analysis_result.num_primitives; i++) + { + uint8_t i0 = vbo_remap.at(index_buffer[3 * (primitive_index + i) + 0]); + uint8_t i1 = vbo_remap.at(index_buffer[3 * (primitive_index + i) + 1]); + uint8_t i2 = vbo_remap.at(index_buffer[3 * (primitive_index + i) + 2]); + index_stream_buffer[i] = u8vec3(i0, i1, i2); + } + + auto &index_stream = out_meshlet.streams[0]; + index_stream.base_value_or_vertex_offset[chunk_index] = out_meshlet.num_attributes; + index_stream.offset_in_b128 = uint32_t(encoded.payload.size()); + encode_index_stream(encoded.payload, index_stream_buffer); + } - words_per_stream[0] += - encode_stream(encoded.payload, meshlet.u32_streams[0], stream_buffer, analysis_result.num_primitives); + uint64_t vbo_remapping[ElementsPerChunk]; + unsigned vbo_index = 0; + for (auto &v : vbo_remap) + { + assert(vbo_index < ElementsPerChunk); + vbo_remapping[vbo_index++] = (uint64_t(v.second) << 32) | v.first; + } + std::sort(vbo_remapping, vbo_remapping + vbo_index); - // Handle spill region just in case. - uint64_t vbo_remapping[MaxVertices + 3]; - unsigned vbo_index = 0; - for (auto &v : vbo_remap) - { - assert(vbo_index < MaxVertices + 3); - vbo_remapping[vbo_index++] = (uint64_t(v.second) << 32) | v.first; - } - std::sort(vbo_remapping, vbo_remapping + vbo_index); + uint32_t vbo_table[ElementsPerChunk]; + for (unsigned i = 0; i < ElementsPerChunk; i++) + vbo_table[i] = uint32_t(vbo_remapping[i]); - for (uint32_t stream_index = 0; stream_index < num_u32_streams; stream_index++) - { - for (uint32_t i = 0; i < analysis_result.num_vertices; i++) + for (uint32_t stream_index = 1; stream_index < num_streams; stream_index++) { - auto vertex_index = uint32_t(vbo_remapping[i]); - uint32_t payload = attributes[stream_index + num_u32_streams * vertex_index]; - memcpy(stream_buffer[i].data, &payload, sizeof(payload)); + out_meshlet.streams[stream_index].aux = p_aux[stream_index]; + + switch (StreamType(stream_index)) + { + case StreamType::Position: + encode_attribute_stream(encoded.payload, out_meshlet.streams[stream_index], + static_cast(pp_data[stream_index]), + chunk_index, vbo_table, analysis_result.num_attributes); + break; + + default: + break; + } } - words_per_stream[stream_index + 1] += - encode_stream(encoded.payload, meshlet.u32_streams[stream_index + 1], stream_buffer, - analysis_result.num_vertices); + out_meshlet.num_primitives += analysis_result.num_primitives; + out_meshlet.num_attributes += analysis_result.num_attributes; } - mesh.meshlets.push_back(meshlet); - base_vertex_offset += analysis_result.num_vertices; - primitive_index += primitives_to_process; + mesh.meshlets.push_back(out_meshlet); + base_vertex_offset += out_meshlet.num_attributes; } - - for (unsigned i = 0; i < MaxU32Streams; i++) - if (words_per_stream[i]) - LOGI("Stream[%u] = %zu bytes.\n", i, words_per_stream[i] * sizeof(uint32_t)); } static bool export_encoded_mesh(const std::string &path, const Encoded &encoded) @@ -527,9 +573,9 @@ static bool export_encoded_mesh(const std::string &path, const Encoded &encoded) FormatHeader header = {}; header.style = encoded.mesh.mesh_style; - header.u32_stream_count = encoded.mesh.stream_count; + header.stream_count = encoded.mesh.stream_count; header.meshlet_count = uint32_t(encoded.mesh.meshlets.size()); - header.payload_size_words = uint32_t(encoded.payload.size()); + header.payload_size_b128 = uint32_t(encoded.payload.size()); required_size += sizeof(magic); required_size += sizeof(FormatHeader); @@ -545,7 +591,7 @@ static bool export_encoded_mesh(const std::string &path, const Encoded &encoded) // Payload. // Need a padding word to speed up decoder. - required_size += (encoded.payload.size() + 1) * sizeof(uint32_t); + required_size += (encoded.payload.size() + 1) * sizeof(PayloadB128); auto file = GRANITE_FILESYSTEM()->open(path, FileMode::WriteOnly); if (!file) @@ -578,16 +624,16 @@ static bool export_encoded_mesh(const std::string &path, const Encoded &encoded) for (uint32_t i = 0; i < header.meshlet_count; i++) { - for (uint32_t j = 0; j < header.u32_stream_count; j++) + for (uint32_t j = 0; j < header.stream_count; j++) { - memcpy(ptr, &encoded.mesh.meshlets[i].u32_streams[j], sizeof(Stream)); + memcpy(ptr, &encoded.mesh.meshlets[i].streams[j], sizeof(Stream)); ptr += sizeof(Stream); } } - memcpy(ptr, encoded.payload.data(), encoded.payload.size() * sizeof(uint32_t)); + memcpy(ptr, encoded.payload.data(), encoded.payload.size() * sizeof(PayloadB128)); ptr += encoded.payload.size() * sizeof(uint32_t); - memset(ptr, 0, sizeof(uint32_t)); + memset(ptr, 0, sizeof(PayloadB128)); return true; } @@ -597,10 +643,13 @@ bool export_mesh_to_meshlet(const std::string &path, SceneFormats::Mesh mesh, Me if (!mesh_optimize_index_buffer(mesh, {})) return false; - std::vector positions, uv; - std::vector normals, tangent; + std::vector positions; + std::vector uv; + std::vector normal_tangent; - unsigned num_u32_streams = 0; + unsigned num_attribute_streams = 0; + int aux[MaxStreams] = {}; + const void *p_data[MaxStreams] = {}; switch (style) { @@ -608,29 +657,31 @@ bool export_mesh_to_meshlet(const std::string &path, SceneFormats::Mesh mesh, Me LOGE("Unimplemented.\n"); return false; case MeshStyle::Textured: - uv = mesh_extract_uv_snorm_scale(mesh); - num_u32_streams += 4; + uv = mesh_extract_uv_snorm_scale(mesh, aux[int(StreamType::UV)]); + num_attribute_streams += 2; if (uv.empty()) { LOGE("No UVs.\n"); return false; } - normals = mesh_extract_normal_tangent_oct8(mesh, MeshAttribute::Normal); - tangent = mesh_extract_normal_tangent_oct8(mesh, MeshAttribute::Tangent); - if (normals.empty() || tangent.empty()) + normal_tangent = mesh_extract_normal_tangent_oct8(mesh); + if (normal_tangent.empty()) { LOGE("No tangent or normal.\n"); return false; } + p_data[int(StreamType::UV)] = uv.data(); + p_data[int(StreamType::NormalTangentOct8)] = normal_tangent.data(); // Fallthrough case MeshStyle::Wireframe: - positions = mesh_extract_position_snorm_exp(mesh); + positions = mesh_extract_position_snorm_exp(mesh, aux[int(StreamType::Position)]); if (positions.empty()) { LOGE("No positions.\n"); return false; } - num_u32_streams += 2; + p_data[int(StreamType::Position)] = positions.data(); + num_attribute_streams += 1; break; default: @@ -638,40 +689,14 @@ bool export_mesh_to_meshlet(const std::string &path, SceneFormats::Mesh mesh, Me return false; } - std::vector attributes(num_u32_streams * positions.size()); - uint32_t *ptr = attributes.data(); - for (size_t i = 0, n = positions.size(); i < n; i++) - { - memcpy(ptr, positions[i].data, sizeof(positions.front())); - ptr += sizeof(positions.front()) / sizeof(uint32_t); - - if (!normals.empty()) - { - memcpy(ptr, normals[i].data, sizeof(normals.front())); - ptr += sizeof(normals.front()) / sizeof(uint32_t); - } - - if (!tangent.empty()) - { - memcpy(ptr, tangent[i].data, sizeof(tangent.front())); - ptr += sizeof(tangent.front()) / sizeof(uint32_t); - } - - if (!uv.empty()) - { - memcpy(ptr, uv[i].data, sizeof(uv.front())); - ptr += sizeof(uv.front()) / sizeof(uint32_t); - } - } - // Use quantized position to guide the clustering. std::vector position_buffer; position_buffer.reserve(positions.size()); for (auto &p : positions) - position_buffer.push_back(decode_snorm_exp(p)); + position_buffer.push_back(decode_snorm_exp(p, aux[int(StreamType::Position)])); - constexpr unsigned max_vertices = 255; - constexpr unsigned max_primitives = 256; + constexpr unsigned max_vertices = 32; + constexpr unsigned max_primitives = 32; size_t num_meshlets = meshopt_buildMeshletsBound(mesh.count, max_vertices, max_primitives); std::vector out_vertex_redirection_buffer(num_meshlets * max_vertices); @@ -695,7 +720,8 @@ bool export_mesh_to_meshlet(const std::string &path, SceneFormats::Mesh mesh, Me { Meshlet m = {}; m.offset = uint32_t(out_index_buffer.size()); - m.count = meshlet.triangle_count; + m.primitive_count = meshlet.triangle_count; + m.vertex_count = meshlet.vertex_count; out_meshlets.push_back(m); auto *local_indices = local_index_buffer.data() + meshlet.triangle_offset; @@ -708,23 +734,40 @@ bool export_mesh_to_meshlet(const std::string &path, SceneFormats::Mesh mesh, Me } } + p_data[0] = out_index_buffer.data(); + + Encoded encoded; + encode_mesh(encoded, out_meshlets.data(), out_meshlets.size(), + p_data, aux, num_attribute_streams + 1); + encoded.mesh.mesh_style = style; + + // Compute bounds std::vector bounds; bounds.clear(); - bounds.reserve(num_meshlets); - for (auto &meshlet : out_meshlets) + bounds.reserve((num_meshlets + NumChunks - 1) / NumChunks); + + // Fuse 8 32-size meshlets together to form a 256 meshlet. + for (size_t i = 0, n = out_meshlets.size(); i < n; i += NumChunks) { + size_t num_chunks = std::min(n - i, NumChunks); + uint32_t total_count = 0; + uvec3 tmp_indices[256]; + + for (size_t chunk = 0; chunk < num_chunks; chunk++) + { + auto &meshlet = out_meshlets[i + chunk]; + memcpy(tmp_indices[total_count].data, + out_index_buffer[meshlet.offset].data, + meshlet.primitive_count * sizeof(tmp_indices[0].data)); + total_count += meshlet.primitive_count; + } + auto bound = meshopt_computeClusterBounds( - out_index_buffer[meshlet.offset].data, meshlet.count * 3, + tmp_indices[0].data, total_count * 3, position_buffer[0].data, positions.size(), sizeof(vec3)); bounds.push_back(bound); } - Encoded encoded; - encode_mesh(encoded, out_meshlets.data(), out_meshlets.size(), - out_index_buffer[0].data, out_index_buffer.size(), - attributes.data(), num_u32_streams); - encoded.mesh.mesh_style = style; - assert(bounds.size() == encoded.mesh.meshlets.size()); const auto *pbounds = bounds.data(); for (auto &meshlet : encoded.mesh.meshlets) @@ -738,7 +781,7 @@ bool export_mesh_to_meshlet(const std::string &path, SceneFormats::Mesh mesh, Me LOGI("Exported meshlet:\n"); LOGI(" %zu meshlets\n", encoded.mesh.meshlets.size()); - LOGI(" %zu payload bytes\n", encoded.payload.size() * sizeof(uint32_t)); + LOGI(" %zu payload bytes\n", encoded.payload.size() * sizeof(PayloadB128)); LOGI(" %u total indices\n", mesh.count); LOGI(" %zu total attributes\n", mesh.positions.size() / mesh.position_stride); diff --git a/tests/meshopt_sandbox.cpp b/tests/meshopt_sandbox.cpp index 90531ef1..6f5b837b 100644 --- a/tests/meshopt_sandbox.cpp +++ b/tests/meshopt_sandbox.cpp @@ -16,93 +16,63 @@ static void decode_mesh_setup_buffers( std::vector &out_index_buffer, std::vector &out_u32_stream, const MeshView &mesh) { - assert(mesh.format_header->u32_stream_count > 1); + assert(mesh.format_header->stream_count > 1); out_index_buffer.clear(); out_u32_stream.clear(); out_index_buffer.resize(mesh.total_primitives * 3); - out_u32_stream.resize(mesh.total_vertices * (mesh.format_header->u32_stream_count - 1)); + out_u32_stream.resize(mesh.total_vertices * (mesh.format_header->stream_count - 1)); } -static void decode_mesh(std::vector &out_index_buffer, std::vector &out_u32_stream, - const MeshView &mesh) +static void decode_mesh_index_buffer(std::vector &out_index_buffer, const MeshView &mesh) { - decode_mesh_setup_buffers(out_index_buffer, out_u32_stream, mesh); out_index_buffer.clear(); - const unsigned u32_stride = mesh.format_header->u32_stream_count - 1; + out_index_buffer.reserve(mesh.total_primitives * 3); for (uint32_t meshlet_index = 0; meshlet_index < mesh.format_header->meshlet_count; meshlet_index++) { auto &meshlet = mesh.headers[meshlet_index]; - for (unsigned stream_index = 0; stream_index < mesh.format_header->u32_stream_count; stream_index++) + auto &stream = mesh.streams[meshlet_index * mesh.format_header->stream_count + int(StreamType::Primitive)]; + const auto *pdata = mesh.payload + stream.offset_in_b128; + + for (uint32_t i = 0; i < meshlet.num_primitives; i += 32, pdata += 4) { - auto &stream = mesh.streams[meshlet_index * mesh.format_header->u32_stream_count + stream_index]; - const uint32_t *pdata = mesh.payload + stream.offset_from_base_u32; - - u8vec4 deltas[MaxElements] = {}; - const u16vec4 base_predictor = u16vec4( - stream.predictor[0], stream.predictor[1], - stream.predictor[2], stream.predictor[3]); - const u16vec4 linear_predictor = u16vec4( - stream.predictor[4], stream.predictor[5], - stream.predictor[6], stream.predictor[7]); - const u8vec4 initial_value = - u8vec4(u16vec2(stream.predictor[8], stream.predictor[9]).xxyy() >> u16vec4(0, 8, 0, 8)); - - for (unsigned chunk = 0; chunk < (MaxElements / 32); chunk++) + auto p0 = pdata[0]; + auto p1 = pdata[1]; + auto p2 = pdata[2]; + auto p3 = pdata[3]; + + for (uint32_t j = 0; j + i < meshlet.num_primitives && j < 32; j++) { - auto bits_per_u8 = (uvec4(stream.bitplane_meta[chunk]) >> uvec4(0, 4, 8, 12)) & 0xfu; - uvec4 bitplanes[8] = {}; - - for (unsigned comp = 0; comp < 4; comp++) - { - for (unsigned bit = 0; bit < bits_per_u8[comp]; bit++) - bitplanes[bit][comp] = *pdata++; - - // Sign-extend. - - unsigned bit_count = bits_per_u8[comp]; - if (bit_count) - for (unsigned bit = bit_count; bit < 8; bit++) - bitplanes[bit][comp] = bitplanes[bit_count - 1][comp]; - } - - for (unsigned i = 0; i < 32; i++) - { - for (uint32_t bit = 0; bit < 8; bit++) - deltas[chunk * 32 + i] |= u8vec4(((bitplanes[bit] >> i) & 1u) << bit); - } - } + uint32_t v = 0; + v |= ((p0.words[0] >> j) & 1u) << 0u; + v |= ((p0.words[1] >> j) & 1u) << 1u; + v |= ((p0.words[2] >> j) & 1u) << 2u; + v |= ((p0.words[3] >> j) & 1u) << 3u; - // Apply predictors. - deltas[0] += initial_value; - for (unsigned i = 0; i < MaxElements; i++) - deltas[i] += u8vec4((base_predictor + linear_predictor * u16vec4(i)) >> u16vec4(8)); + v |= ((p1.words[0] >> j) & 1u) << 8u; + v |= ((p1.words[1] >> j) & 1u) << 9u; + v |= ((p1.words[2] >> j) & 1u) << 10u; + v |= ((p1.words[3] >> j) & 1u) << 11u; - // Resolve deltas. - for (unsigned i = 1; i < MaxElements; i++) - deltas[i] += deltas[i - 1]; + v |= ((p2.words[0] >> j) & 1u) << 16u; + v |= ((p2.words[1] >> j) & 1u) << 17u; + v |= ((p2.words[2] >> j) & 1u) << 18u; + v |= ((p2.words[3] >> j) & 1u) << 19u; - if (stream_index == 0) - { - // Index decode. - unsigned num_primitives = meshlet.num_primitives; - for (unsigned i = 0; i < num_primitives; i++) - for (unsigned j = 0; j < 3; j++) - out_index_buffer.push_back(deltas[i][j] + meshlet.base_vertex_offset); - } - else - { - // Attributes. - unsigned num_attributes = meshlet.num_attributes; - auto *out_attr = out_u32_stream.data() + meshlet.base_vertex_offset * u32_stride + (stream_index - 1); - for (unsigned i = 0; i < num_attributes; i++, out_attr += u32_stride) - memcpy(out_attr, deltas[i].data, sizeof(*out_attr)); + v |= ((p3.words[0] >> j) & 1u) << 4u; + v |= ((p3.words[1] >> j) & 1u) << 12u; + v |= ((p3.words[2] >> j) & 1u) << 20u; + + v += stream.base_value_or_vertex_offset[i] * 0x010101u; + + out_index_buffer.push_back(v); } } } } +#if 0 static void decode_mesh_gpu( Vulkan::Device &dev, std::vector &out_index_buffer, std::vector &out_u32_stream, @@ -201,12 +171,14 @@ static bool validate_mesh_decode(const std::vector &decoded_index_buff return true; } +#endif int main(int argc, char *argv[]) { if (argc != 2) return EXIT_FAILURE; +#if 0 Global::init(Global::MANAGER_FEATURE_FILESYSTEM_BIT); Filesystem::setup_default_filesystem(GRANITE_FILESYSTEM(), ASSET_DIRECTORY); @@ -280,6 +252,7 @@ int main(int argc, char *argv[]) memcpy(ptr, reference_index_buffer.data(), reference_index_buffer.size() * sizeof(uint32_t)); memcpy(ptr + reference_index_buffer.size(), reference_attributes.data(), reference_attributes.size() * sizeof(uint32_t)); } +#endif return 0; } \ No newline at end of file diff --git a/viewer/physics_sandbox.cpp b/viewer/physics_sandbox.cpp index 761f5e7f..d2a10145 100644 --- a/viewer/physics_sandbox.cpp +++ b/viewer/physics_sandbox.cpp @@ -219,7 +219,7 @@ struct PhysicsSandboxApplication : Application, EventHandler c.indices = collision_mesh.indices.data(); c.num_triangles = collision_mesh.indices.size() / 3; c.index_stride_triangle = 3 * sizeof(uint32_t); - c.num_vertices = collision_mesh.positions.size(); + c.num_attributes = collision_mesh.positions.size(); c.positions = collision_mesh.positions.front().data; c.position_stride = sizeof(vec4); c.aabb = mesh.static_aabb; diff --git a/vulkan/managers/resource_manager.cpp b/vulkan/managers/resource_manager.cpp index 374ae252..740b48ce 100644 --- a/vulkan/managers/resource_manager.cpp +++ b/vulkan/managers/resource_manager.cpp @@ -486,7 +486,7 @@ void ResourceManager::instantiate_asset_mesh(Granite::AssetManager &manager_, BufferCreateInfo buf = {}; buf.domain = BufferDomain::Host; - buf.size = view.format_header->payload_size_words * sizeof(uint32_t); + buf.size = view.format_header->payload_size_b128 * sizeof(Meshlet::PayloadB128); buf.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; auto payload = device->create_buffer(buf, view.payload); diff --git a/vulkan/mesh/meshlet.cpp b/vulkan/mesh/meshlet.cpp index aa688d32..661f90bb 100644 --- a/vulkan/mesh/meshlet.cpp +++ b/vulkan/mesh/meshlet.cpp @@ -64,17 +64,17 @@ MeshView create_mesh_view(const Granite::FileMapping &mapping) view.bounds = reinterpret_cast(ptr); ptr += view.format_header->meshlet_count * sizeof(Bound); - if (end_ptr - ptr < ptrdiff_t(view.format_header->meshlet_count * view.format_header->u32_stream_count * sizeof(Stream))) + if (end_ptr - ptr < ptrdiff_t(view.format_header->meshlet_count * view.format_header->stream_count * sizeof(Stream))) return {}; view.streams = reinterpret_cast(ptr); - ptr += view.format_header->meshlet_count * view.format_header->u32_stream_count * sizeof(Stream); + ptr += view.format_header->meshlet_count * view.format_header->stream_count * sizeof(Stream); - if (!view.format_header->payload_size_words) + if (!view.format_header->payload_size_b128) return {}; - if (end_ptr - ptr < ptrdiff_t(view.format_header->payload_size_words * sizeof(uint32_t))) + if (end_ptr - ptr < ptrdiff_t(view.format_header->payload_size_b128 * sizeof(uint32_t))) return {}; - view.payload = reinterpret_cast(ptr); + view.payload = reinterpret_cast(ptr); for (uint32_t i = 0, n = view.format_header->meshlet_count; i < n; i++) { @@ -114,7 +114,7 @@ bool decode_mesh(CommandBuffer &cmd, const DecodeInfo &info, const MeshView &vie buf_info.size = view.format_header->meshlet_count * sizeof(*view.headers); auto meshlet_meta_buffer = cmd.get_device().create_buffer(buf_info, view.headers); - buf_info.size = view.format_header->meshlet_count * view.format_header->u32_stream_count * sizeof(*view.streams); + buf_info.size = view.format_header->meshlet_count * view.format_header->stream_count * sizeof(*view.streams); auto meshlet_stream_buffer = cmd.get_device().create_buffer(buf_info, view.streams); // For Raw mode -> offset/stride @@ -140,7 +140,7 @@ bool decode_mesh(CommandBuffer &cmd, const DecodeInfo &info, const MeshView &vie cmd.set_storage_buffer(0, 3, *info.ibo); cmd.set_specialization_constant_mask(0x7); - cmd.set_specialization_constant(0, view.format_header->u32_stream_count); + cmd.set_specialization_constant(0, view.format_header->stream_count); cmd.set_specialization_constant(2, (info.flags & DECODE_MODE_RAW_PAYLOAD) != 0); if ((info.flags & DECODE_MODE_RAW_PAYLOAD) != 0) @@ -164,7 +164,7 @@ bool decode_mesh(CommandBuffer &cmd, const DecodeInfo &info, const MeshView &vie return false; } - if (output_u32_streams + 1 > view.format_header->u32_stream_count) + if (output_u32_streams + 1 > view.format_header->stream_count) { LOGE("Trying to decode more streams than exist in payload.\n"); return false; diff --git a/vulkan/mesh/meshlet.hpp b/vulkan/mesh/meshlet.hpp index 0997dbdf..bd012ea1 100644 --- a/vulkan/mesh/meshlet.hpp +++ b/vulkan/mesh/meshlet.hpp @@ -40,16 +40,20 @@ namespace Vulkan // MESHLET1 format. namespace Meshlet { -static constexpr unsigned MaxU32Streams = 16; +static constexpr unsigned MaxStreams = 8; static constexpr unsigned MaxElements = 256; +static constexpr unsigned ElementsPerChunk = 32; +static constexpr unsigned NumChunks = MaxElements / ElementsPerChunk; static constexpr unsigned MaxPrimitives = MaxElements; static constexpr unsigned MaxVertices = MaxElements; struct Stream { - uint16_t predictor[4 * 2 + 2]; - uint32_t offset_from_base_u32; - uint16_t bitplane_meta[MaxElements / 32]; + uint32_t base_value_or_vertex_offset[12]; + uint32_t bit_plane_config0; + uint32_t bit_plane_config1; + uint32_t aux; + uint32_t offset_in_b128; }; struct Header @@ -84,13 +88,12 @@ struct Bound enum class StreamType { - Primitive = 0, // R8G8B8X8_UINT - PositionE16, // RGB16_SSCALED * 2^(A16_SINT) - NormalOct8, // Octahedron encoding in RG8. - TangentOct8, // Octahedron encoding in RG8, sign bit in B8 (if not zero, +1, otherwise -1). - UV, // R16G16_SNORM * B16_SSCALED + Primitive = 0, // RGB8_UINT (fixed 5-bit encoding, fixed base value of 0) + Position, // RGB16_SINT * 2^aux + NormalTangentOct8, // Octahedron encoding in RG8, BA8 for tangent. Following uvec4 encodes 1-bit sign. + UV, // (0.5 * (R16G16_SINT * 2^aux) + 0.5 BoneIndices, // RGBA8_UINT - BoneWeights, // RGB8_UNORM (sums to 1, A is implied). + BoneWeights, // RGBA8_UNORM }; enum class MeshStyle : uint32_t @@ -103,9 +106,14 @@ enum class MeshStyle : uint32_t struct FormatHeader { MeshStyle style; - uint32_t u32_stream_count; + uint32_t stream_count; uint32_t meshlet_count; - uint32_t payload_size_words; + uint32_t payload_size_b128; +}; + +struct PayloadB128 +{ + uint32_t words[4]; }; struct MeshView @@ -114,12 +122,12 @@ struct MeshView const Header *headers; const Bound *bounds; const Stream *streams; - const uint32_t *payload; + const PayloadB128 *payload; uint32_t total_primitives; uint32_t total_vertices; }; -static const char magic[8] = { 'M', 'E', 'S', 'H', 'L', 'E', 'T', '1' }; +static const char magic[8] = { 'M', 'E', 'S', 'H', 'L', 'E', 'T', '2' }; MeshView create_mesh_view(const Granite::FileMapping &mapping); From ebb258afdabdb44457256ed8fe53c947128ee906 Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Wed, 20 Dec 2023 14:21:14 +0100 Subject: [PATCH 23/59] Rewrite starting to come together. --- scene-export/meshlet_export.cpp | 218 ++++++++++++++++---------------- tests/meshopt_sandbox.cpp | 202 +++++++++++++++++++++++------ vulkan/mesh/meshlet.cpp | 15 ++- vulkan/mesh/meshlet.hpp | 18 +-- 4 files changed, 284 insertions(+), 169 deletions(-) diff --git a/scene-export/meshlet_export.cpp b/scene-export/meshlet_export.cpp index 175980e3..bffd00b9 100644 --- a/scene-export/meshlet_export.cpp +++ b/scene-export/meshlet_export.cpp @@ -55,9 +55,12 @@ struct Encoded struct Meshlet { - uint32_t offset; + uint32_t global_indices_offset; uint32_t primitive_count; uint32_t vertex_count; + + const unsigned char *local_indices; + const uint32_t *attribute_remap; }; struct PrimitiveAnalysisResult @@ -91,6 +94,25 @@ static int compute_log2_scale(float max_value) return scale_log2; } +template +static void adjust_quant(std::vector &values, int &exp) +{ + uint32_t active_bits = 0; + for (auto &value : values) + for (auto &c : value.data) + active_bits |= c; + + if (active_bits == 0) + return; + + int extra_shift = trailing_zeroes(active_bits); + for (auto &value : values) + for (auto &c : value.data) + c >>= extra_shift; + + exp += extra_shift; +} + static std::vector mesh_extract_position_snorm_exp(const SceneFormats::Mesh &mesh, int &exp) { std::vector encoded_positions; @@ -132,6 +154,8 @@ static std::vector mesh_extract_position_snorm_exp(const SceneFormats:: encoded_positions.push_back(encode_vec3_to_snorm_exp(pos, log2_scale)); exp = -log2_scale; + adjust_quant(encoded_positions, exp); + return encoded_positions; } @@ -260,43 +284,9 @@ static std::vector mesh_extract_uv_snorm_scale(const SceneFormats::Mesh encoded_uvs.push_back(encode_vec2_to_snorm_exp(uv, log2_scale)); exp = -log2_scale; - return encoded_uvs; -} - -static PrimitiveAnalysisResult analyze_primitive_count(std::unordered_map &vertex_remap, - const uint32_t *index_buffer, uint32_t max_num_primitives) -{ - PrimitiveAnalysisResult result = {}; - uint32_t vertex_count = 0; + adjust_quant(encoded_uvs, exp); - // We can reference a maximum of 256 vertices. - vertex_remap.clear(); - - for (uint32_t i = 0; i < max_num_primitives; i++) - { - uint32_t index0 = index_buffer[3 * i + 0]; - uint32_t index1 = index_buffer[3 * i + 1]; - uint32_t index2 = index_buffer[3 * i + 2]; - - vertex_count = uint32_t(vertex_remap.size()); - - vertex_remap.insert({index0, uint32_t(vertex_remap.size())}); - vertex_remap.insert({index1, uint32_t(vertex_remap.size())}); - vertex_remap.insert({index2, uint32_t(vertex_remap.size())}); - - // If this primitive causes us to go out of bounds, reset. - if (vertex_remap.size() > MaxVertices) - { - max_num_primitives = i; - break; - } - - vertex_count = uint32_t(vertex_remap.size()); - } - - result.num_primitives = max_num_primitives; - result.num_attributes = vertex_count; - return result; + return encoded_uvs; } // Analyze bits required to encode a delta. @@ -322,10 +312,10 @@ static void encode_index_stream(std::vector &out_payload_buffer, PayloadB128 p2{}; PayloadB128 p3{}; - for (unsigned i = 0; i < 32; i++) + for (unsigned i = 0; i < ElementsPerChunk; i++) { u8vec3 indices = stream_buffer[i]; - assert(all(lessThan(indices, u8vec3(32)))); + assert(all(lessThan(indices, u8vec3(ElementsPerChunk)))); p0.words[0] |= ((indices.x >> 0u) & 1u) << i; p0.words[1] |= ((indices.x >> 1u) & 1u) << i; @@ -364,35 +354,41 @@ static void encode_attribute_stream(std::vector &out_payload_buffer for (uint32_t i = num_attributes; i < ElementsPerChunk; i++) positions[i] = positions[0]; - u16vec3 lo{0xffff}; - u16vec3 hi{0}; + u16vec3 ulo{0xffff}; + u16vec3 uhi{0}; + i16vec3 slo{0x7fff}; + i16vec3 shi{-0x8000}; for (auto &p : positions) { - lo = min(lo, p); - hi = max(hi, p); + ulo = min(ulo, p); + uhi = max(uhi, p); + slo = min(slo, i16vec3(p)); + shi = max(shi, i16vec3(p)); } - u16vec3 diff = hi - lo; - u16vec3 diff_rev = lo - hi; + const auto max3 = [](u16vec3 v) { return max(max(v.x, v.y), v.z); }; + u16vec3 diff_unsigned = uhi - ulo; + u16vec3 diff_signed = u16vec3(shi) - u16vec3(slo); - unsigned diff3 = max(max(diff.x, diff.y), diff.z); - unsigned diff3_rev = max(max(diff_rev.x, diff_rev.y), diff_rev.z); - if (diff3_rev < diff3) + unsigned diff3_unsigned = max3(diff_unsigned); + unsigned diff3_signed = max3(diff_signed); + if (diff3_signed < diff3_unsigned) { - std::swap(lo, hi); - diff3 = diff3_rev; + ulo = u16vec3(slo); + uhi = u16vec3(shi); + diff3_unsigned = diff3_signed; } - unsigned bits = compute_required_bits_unsigned(diff3); + unsigned bits = compute_required_bits_unsigned(diff3_unsigned); unsigned encoded_bits = (bits + 1) / 2; - stream.bit_plane_config0 |= encoded_bits << (4 * chunk_index); + stream.bit_plane_config |= encoded_bits << (4 * chunk_index); + stream.u.base_value[chunk_index] = uint32_t(ulo.x) | (uint32_t(ulo.y) << 16); + stream.u.base_value[chunk_index / 2 + 8] |= uint32_t(ulo.z) << (16 * (chunk_index & 1)); - stream.base_value_or_vertex_offset[chunk_index] = uint32_t(lo.x) | (uint32_t(lo.y) << 16); - stream.base_value_or_vertex_offset[chunk_index / 2 + 8] |= uint32_t(lo.z) << (16 * (chunk_index & 1)); for (auto &p : positions) - p -= lo; + p -= ulo; if (encoded_bits == 8) { @@ -454,16 +450,19 @@ static void encode_attribute_stream(std::vector &out_payload_buffer if (encoded_bits & 1) { PayloadB128 p[2]{}; + uint32_t *words = &p[0].words[0]; + for (uint32_t i = 0; i < ElementsPerChunk; i++) { u16vec3 d = positions[i]; - - p[0].words[0] |= ((d.x >> bit_offset) & 1u) << i; - p[0].words[1] |= ((d.x >> (bit_offset + 1)) & 1u) << i; - p[0].words[2] |= ((d.y >> bit_offset) & 1u) << i; - p[0].words[3] |= ((d.y >> (bit_offset + 1)) & 1u) << i; - p[1].words[0] |= ((d.z >> bit_offset) & 1u) << i; - p[1].words[1] |= ((d.z >> (bit_offset + 1)) & 1u) << i; + for (int c = 0; c < 3; c++) + { + for (int b = 0; b < 2; b++) + { + int word = c * 2 + b; + words[word] |= ((d[c] >> (bit_offset + b)) & 1u) << i; + } + } } for (auto v : p) @@ -488,81 +487,74 @@ static void encode_mesh(Encoded &encoded, mesh.meshlets.reserve(num_full_meshlets); uint32_t base_vertex_offset = 0; - auto *index_buffer = static_cast(pp_data[0]); - - std::unordered_map vbo_remap; - for (uint32_t full_meshlet_index = 0; full_meshlet_index < num_full_meshlets; full_meshlet_index++) { Metadata out_meshlet = {}; out_meshlet.base_vertex_offset = base_vertex_offset; uint32_t num_chunks = std::min(num_meshlets - full_meshlet_index * NumChunks, NumChunks); - for (uint32_t chunk_index = 0; chunk_index < num_chunks; chunk_index++) - { - auto &meshlet = meshlets[full_meshlet_index * NumChunks + chunk_index]; - - uint32_t primitive_index = meshlets[full_meshlet_index].offset; + out_meshlet.num_chunks = num_chunks; - auto analysis_result = analyze_primitive_count( - vbo_remap, index_buffer + 3 * primitive_index, - meshlet.primitive_count); - assert(analysis_result.num_primitives <= ElementsPerChunk); - assert(analysis_result.num_attributes <= ElementsPerChunk); + { + auto &index_stream = out_meshlet.streams[int(StreamType::Primitive)]; + index_stream.offset_in_b128 = uint32_t(encoded.payload.size()); + uint32_t num_attributes = 0; + uint32_t num_primitives = 0; - // Encode index buffer + for (uint32_t chunk_index = 0; chunk_index < num_chunks; chunk_index++) { + auto &meshlet = meshlets[full_meshlet_index * NumChunks + chunk_index]; + u8vec3 index_stream_buffer[ElementsPerChunk]; - for (uint32_t i = 0; i < analysis_result.num_primitives; i++) - { - uint8_t i0 = vbo_remap.at(index_buffer[3 * (primitive_index + i) + 0]); - uint8_t i1 = vbo_remap.at(index_buffer[3 * (primitive_index + i) + 1]); - uint8_t i2 = vbo_remap.at(index_buffer[3 * (primitive_index + i) + 2]); - index_stream_buffer[i] = u8vec3(i0, i1, i2); - } + for (uint32_t i = 0; i < meshlet.primitive_count; i++) + memcpy(index_stream_buffer[i].data, meshlet.local_indices + 3 * i, 3); + for (uint32_t i = meshlet.primitive_count; i < ElementsPerChunk; i++) + index_stream_buffer[i] = u8vec3(0); + + auto &offsets = index_stream.u.offsets[chunk_index]; + offsets.attr_offset = num_attributes; + offsets.prim_offset = num_primitives; - auto &index_stream = out_meshlet.streams[0]; - index_stream.base_value_or_vertex_offset[chunk_index] = out_meshlet.num_attributes; - index_stream.offset_in_b128 = uint32_t(encoded.payload.size()); encode_index_stream(encoded.payload, index_stream_buffer); + num_primitives += meshlet.primitive_count; + num_attributes += meshlet.vertex_count; } - uint64_t vbo_remapping[ElementsPerChunk]; - unsigned vbo_index = 0; - for (auto &v : vbo_remap) + for (uint32_t chunk_index = num_chunks; chunk_index <= NumChunks; chunk_index++) { - assert(vbo_index < ElementsPerChunk); - vbo_remapping[vbo_index++] = (uint64_t(v.second) << 32) | v.first; + auto &offsets = index_stream.u.offsets[chunk_index]; + offsets.attr_offset = num_attributes; + offsets.prim_offset = num_primitives; } - std::sort(vbo_remapping, vbo_remapping + vbo_index); - uint32_t vbo_table[ElementsPerChunk]; - for (unsigned i = 0; i < ElementsPerChunk; i++) - vbo_table[i] = uint32_t(vbo_remapping[i]); + base_vertex_offset += num_attributes; + } + + for (uint32_t stream_index = 1; stream_index < num_streams; stream_index++) + { + auto &stream = out_meshlet.streams[stream_index]; + stream.aux = p_aux[stream_index]; + stream.offset_in_b128 = uint32_t(encoded.payload.size()); - for (uint32_t stream_index = 1; stream_index < num_streams; stream_index++) + for (uint32_t chunk_index = 0; chunk_index < num_chunks; chunk_index++) { - out_meshlet.streams[stream_index].aux = p_aux[stream_index]; + auto &meshlet = meshlets[full_meshlet_index * NumChunks + chunk_index]; switch (StreamType(stream_index)) { case StreamType::Position: - encode_attribute_stream(encoded.payload, out_meshlet.streams[stream_index], + encode_attribute_stream(encoded.payload, stream, static_cast(pp_data[stream_index]), - chunk_index, vbo_table, analysis_result.num_attributes); + chunk_index, meshlet.attribute_remap, meshlet.vertex_count); break; default: break; } } - - out_meshlet.num_primitives += analysis_result.num_primitives; - out_meshlet.num_attributes += analysis_result.num_attributes; } mesh.meshlets.push_back(out_meshlet); - base_vertex_offset += out_meshlet.num_attributes; } } @@ -632,7 +624,7 @@ static bool export_encoded_mesh(const std::string &path, const Encoded &encoded) } memcpy(ptr, encoded.payload.data(), encoded.payload.size() * sizeof(PayloadB128)); - ptr += encoded.payload.size() * sizeof(uint32_t); + ptr += encoded.payload.size() * sizeof(PayloadB128); memset(ptr, 0, sizeof(PayloadB128)); return true; } @@ -719,12 +711,14 @@ bool export_mesh_to_meshlet(const std::string &path, SceneFormats::Mesh mesh, Me for (auto &meshlet : meshlets) { Meshlet m = {}; - m.offset = uint32_t(out_index_buffer.size()); + + auto *local_indices = local_index_buffer.data() + meshlet.triangle_offset; + m.local_indices = local_indices; + m.attribute_remap = out_vertex_redirection_buffer.data() + meshlet.vertex_offset; m.primitive_count = meshlet.triangle_count; m.vertex_count = meshlet.vertex_count; - out_meshlets.push_back(m); + m.global_indices_offset = uint32_t(out_index_buffer.size()); - auto *local_indices = local_index_buffer.data() + meshlet.triangle_offset; for (unsigned i = 0; i < meshlet.triangle_count; i++) { out_index_buffer.emplace_back( @@ -732,9 +726,9 @@ bool export_mesh_to_meshlet(const std::string &path, SceneFormats::Mesh mesh, Me out_vertex_redirection_buffer[local_indices[3 * i + 1] + meshlet.vertex_offset], out_vertex_redirection_buffer[local_indices[3 * i + 2] + meshlet.vertex_offset]); } - } - p_data[0] = out_index_buffer.data(); + out_meshlets.push_back(m); + } Encoded encoded; encode_mesh(encoded, out_meshlets.data(), out_meshlets.size(), @@ -757,8 +751,8 @@ bool export_mesh_to_meshlet(const std::string &path, SceneFormats::Mesh mesh, Me { auto &meshlet = out_meshlets[i + chunk]; memcpy(tmp_indices[total_count].data, - out_index_buffer[meshlet.offset].data, - meshlet.primitive_count * sizeof(tmp_indices[0].data)); + out_index_buffer[meshlet.global_indices_offset].data, + meshlet.primitive_count * sizeof(tmp_indices[0])); total_count += meshlet.primitive_count; } diff --git a/tests/meshopt_sandbox.cpp b/tests/meshopt_sandbox.cpp index 6f5b837b..f7921e5d 100644 --- a/tests/meshopt_sandbox.cpp +++ b/tests/meshopt_sandbox.cpp @@ -12,66 +12,152 @@ using namespace Granite; using namespace Vulkan::Meshlet; -static void decode_mesh_setup_buffers( - std::vector &out_index_buffer, std::vector &out_u32_stream, - const MeshView &mesh) +static void decode_mesh_index_buffer(std::vector &out_index_buffer, const MeshView &mesh, uint32_t meshlet_index) { - assert(mesh.format_header->stream_count > 1); + auto &meshlet = mesh.headers[meshlet_index]; + auto &stream = mesh.streams[meshlet_index * mesh.format_header->stream_count + int(StreamType::Primitive)]; + const auto *pdata = mesh.payload + stream.offset_in_b128; + + for (uint32_t chunk_index = 0; chunk_index < meshlet.num_chunks; chunk_index++) + { + auto p0 = pdata[0]; + auto p1 = pdata[1]; + auto p2 = pdata[2]; + auto p3 = pdata[3]; + + pdata += 4; + + uint32_t num_primitives_for_chunk = stream.u.offsets[chunk_index + 1].prim_offset - + stream.u.offsets[chunk_index].prim_offset; + + for (uint32_t i = 0; i < num_primitives_for_chunk; i++) + { + uint32_t v = 0; + v |= ((p0.words[0] >> i) & 1u) << 0u; + v |= ((p0.words[1] >> i) & 1u) << 1u; + v |= ((p0.words[2] >> i) & 1u) << 2u; + v |= ((p0.words[3] >> i) & 1u) << 3u; - out_index_buffer.clear(); - out_u32_stream.clear(); - out_index_buffer.resize(mesh.total_primitives * 3); - out_u32_stream.resize(mesh.total_vertices * (mesh.format_header->stream_count - 1)); + v |= ((p1.words[0] >> i) & 1u) << 8u; + v |= ((p1.words[1] >> i) & 1u) << 9u; + v |= ((p1.words[2] >> i) & 1u) << 10u; + v |= ((p1.words[3] >> i) & 1u) << 11u; + + v |= ((p2.words[0] >> i) & 1u) << 16u; + v |= ((p2.words[1] >> i) & 1u) << 17u; + v |= ((p2.words[2] >> i) & 1u) << 18u; + v |= ((p2.words[3] >> i) & 1u) << 19u; + + v |= ((p3.words[0] >> i) & 1u) << 4u; + v |= ((p3.words[1] >> i) & 1u) << 12u; + v |= ((p3.words[2] >> i) & 1u) << 20u; + + v += stream.u.offsets[chunk_index].attr_offset * 0x010101u; + + uint32_t x = v & 0xffu; + uint32_t y = (v >> 8u) & 0xffu; + uint32_t z = (v >> 16u) & 0xffu; + + out_index_buffer.push_back(uvec3(x, y, z) + meshlet.base_vertex_offset); + } + } } -static void decode_mesh_index_buffer(std::vector &out_index_buffer, const MeshView &mesh) +template +static void decode_bitfield_block_16(T *block, const PayloadB128 *&pdata, unsigned config) { - out_index_buffer.clear(); - out_index_buffer.reserve(mesh.total_primitives * 3); + unsigned bit_offset = 0; - for (uint32_t meshlet_index = 0; meshlet_index < mesh.format_header->meshlet_count; meshlet_index++) + for (int mask = 4; mask; mask >>= 1) { - auto &meshlet = mesh.headers[meshlet_index]; - auto &stream = mesh.streams[meshlet_index * mesh.format_header->stream_count + int(StreamType::Primitive)]; - const auto *pdata = mesh.payload + stream.offset_in_b128; - - for (uint32_t i = 0; i < meshlet.num_primitives; i += 32, pdata += 4) + if (config & mask) { - auto p0 = pdata[0]; - auto p1 = pdata[1]; - auto p2 = pdata[2]; - auto p3 = pdata[3]; + const uint32_t *words = &pdata->words[0]; + int bits = mask * 2; - for (uint32_t j = 0; j + i < meshlet.num_primitives && j < 32; j++) + for (uint32_t i = 0; i < ElementsPerChunk; i++) { - uint32_t v = 0; - v |= ((p0.words[0] >> j) & 1u) << 0u; - v |= ((p0.words[1] >> j) & 1u) << 1u; - v |= ((p0.words[2] >> j) & 1u) << 2u; - v |= ((p0.words[3] >> j) & 1u) << 3u; - - v |= ((p1.words[0] >> j) & 1u) << 8u; - v |= ((p1.words[1] >> j) & 1u) << 9u; - v |= ((p1.words[2] >> j) & 1u) << 10u; - v |= ((p1.words[3] >> j) & 1u) << 11u; + T &d = block[i]; + for (int c = 0; c < Components; c++) + { + for (int b = 0; b < bits; b++) + { + int word = c * bits + b; + d[c] |= ((words[word] >> i) & 1u) << (bit_offset + b); + } + } + } - v |= ((p2.words[0] >> j) & 1u) << 16u; - v |= ((p2.words[1] >> j) & 1u) << 17u; - v |= ((p2.words[2] >> j) & 1u) << 18u; - v |= ((p2.words[3] >> j) & 1u) << 19u; + int num_words = (mask * Components + 1) / 2; + pdata += num_words; + bit_offset += bits; + } + } +} - v |= ((p3.words[0] >> j) & 1u) << 4u; - v |= ((p3.words[1] >> j) & 1u) << 12u; - v |= ((p3.words[2] >> j) & 1u) << 20u; +static void decode_attribute_buffer(std::vector &out_positions, const MeshView &mesh, uint32_t meshlet_index, StreamType type) +{ + auto &meshlet = mesh.headers[meshlet_index]; + auto &index_stream = mesh.streams[meshlet_index * mesh.format_header->stream_count + int(StreamType::Primitive)]; + auto &stream = mesh.streams[meshlet_index * mesh.format_header->stream_count + int(type)]; + const auto *pdata = mesh.payload + stream.offset_in_b128; - v += stream.base_value_or_vertex_offset[i] * 0x010101u; + for (uint32_t chunk = 0; chunk < meshlet.num_chunks; chunk++) + { + u16vec3 positions[ElementsPerChunk]{}; + unsigned config = (stream.bit_plane_config >> (4 * chunk)) & 0xf; - out_index_buffer.push_back(v); + if (config == 8) + { + for (uint32_t i = 0; i < ElementsPerChunk; i++) + { + memcpy(positions[i].data, &pdata[i / 4].words[i % 4], 2 * sizeof(uint16_t)); + memcpy(&positions[i].z, + reinterpret_cast(&pdata[8]) + sizeof(uint16_t) * i, + sizeof(uint16_t)); } + + pdata += 12; + } + else + { + decode_bitfield_block_16<3>(positions, pdata, config); + } + + u16vec3 base; + memcpy(base.data, &stream.u.offsets[chunk].attr_offset, sizeof(uint16_t) * 2); + memcpy(&base.z, reinterpret_cast(&stream.u.offsets[8].attr_offset) + + sizeof(uint16_t) * chunk, + sizeof(uint16_t)); + + for (auto &p : positions) + p += base; + + uint32_t num_attributes_for_chunk = index_stream.u.offsets[chunk + 1].attr_offset - + index_stream.u.offsets[chunk].attr_offset; + + for (uint32_t i = 0; i < num_attributes_for_chunk; i++) + { + vec3 float_pos = vec3(i16vec3(positions[i])); + float_pos.x = ldexpf(float_pos.x, stream.aux); + float_pos.y = ldexpf(float_pos.y, stream.aux); + float_pos.z = ldexpf(float_pos.z, stream.aux); + out_positions.push_back(float_pos); } } } +static void decode_mesh(std::vector &out_index_buffer, + std::vector &out_positions, + const MeshView &mesh) +{ + for (uint32_t meshlet_index = 0; meshlet_index < mesh.format_header->meshlet_count; meshlet_index++) + { + decode_mesh_index_buffer(out_index_buffer, mesh, meshlet_index); + decode_attribute_buffer(out_positions, mesh, meshlet_index, StreamType::Position); + } +} + #if 0 static void decode_mesh_gpu( Vulkan::Device &dev, @@ -178,10 +264,42 @@ int main(int argc, char *argv[]) if (argc != 2) return EXIT_FAILURE; -#if 0 Global::init(Global::MANAGER_FEATURE_FILESYSTEM_BIT); Filesystem::setup_default_filesystem(GRANITE_FILESYSTEM(), ASSET_DIRECTORY); + SceneFormats::Mesh mesh; + vec3 pos[30]; + + mesh.index_type = VK_INDEX_TYPE_UINT8_EXT; + mesh.count = 30; + mesh.topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST; + for (unsigned i = 0; i < mesh.count; i++) + { + mesh.indices.push_back(i); + pos[i] = vec3(float(i)); + } + mesh.attribute_layout[int(MeshAttribute::Position)].format = VK_FORMAT_R32G32B32_SFLOAT; + mesh.position_stride = sizeof(vec3); + mesh.positions.resize(sizeof(pos)); + memcpy(mesh.positions.data(), pos[0].data, sizeof(pos)); + + if (!Meshlet::export_mesh_to_meshlet("/tmp/export.msh2", mesh, MeshStyle::Wireframe)) + return EXIT_FAILURE; + + auto file = GRANITE_FILESYSTEM()->open("/tmp/export.msh2", FileMode::ReadOnly); + if (!file) + return EXIT_FAILURE; + + auto mapped = file->map(); + if (!mapped) + return EXIT_FAILURE; + + std::vector reference_index_buffer; + std::vector reference_positions; + auto view = create_mesh_view(*mapped); + decode_mesh(reference_index_buffer, reference_positions, view); + +#if 0 GLTF::Parser parser(argv[1]); Vulkan::Context ctx; diff --git a/vulkan/mesh/meshlet.cpp b/vulkan/mesh/meshlet.cpp index 661f90bb..aa10aee8 100644 --- a/vulkan/mesh/meshlet.cpp +++ b/vulkan/mesh/meshlet.cpp @@ -36,7 +36,7 @@ MeshView create_mesh_view(const Granite::FileMapping &mapping) if (mapping.get_size() < sizeof(magic) + sizeof(FormatHeader)) { - LOGE("MESHLET1 file too small.\n"); + LOGE("MESHLET2 file too small.\n"); return view; } @@ -45,7 +45,7 @@ MeshView create_mesh_view(const Granite::FileMapping &mapping) if (memcmp(ptr, magic, sizeof(magic)) != 0) { - LOGE("Invalid MESHLET1 magic.\n"); + LOGE("Invalid MESHLET2 magic.\n"); return {}; } @@ -72,14 +72,15 @@ MeshView create_mesh_view(const Granite::FileMapping &mapping) if (!view.format_header->payload_size_b128) return {}; - if (end_ptr - ptr < ptrdiff_t(view.format_header->payload_size_b128 * sizeof(uint32_t))) + if (end_ptr - ptr < ptrdiff_t(view.format_header->payload_size_b128 * sizeof(PayloadB128))) return {}; view.payload = reinterpret_cast(ptr); for (uint32_t i = 0, n = view.format_header->meshlet_count; i < n; i++) { - view.total_primitives += view.headers[i].num_primitives; - view.total_vertices += view.headers[i].num_attributes; + auto offsets = view.streams[i * view.format_header->stream_count].u.offsets[NumChunks]; + view.total_primitives += offsets.prim_offset; + view.total_vertices += offsets.attr_offset; } return view; @@ -179,7 +180,7 @@ bool decode_mesh(CommandBuffer &cmd, const DecodeInfo &info, const MeshView &vie for (uint32_t i = 0; i < view.format_header->meshlet_count; i++) { decode_offsets.push_back({ index_count, 0 }); - index_count += view.headers[i].num_primitives; + index_count += view.streams[i * view.format_header->stream_count].u.offsets[NumChunks].prim_offset; for (uint32_t j = 0; j < output_u32_streams; j++) decode_offsets.push_back({ view.headers[i].base_vertex_offset * output_u32_streams + j, output_u32_streams }); } @@ -215,7 +216,7 @@ bool decode_mesh(CommandBuffer &cmd, const DecodeInfo &info, const MeshView &vie for (uint32_t i = 0; i < view.format_header->meshlet_count; i++) { decode_offsets.push_back({ index_count, view.headers[i].base_vertex_offset }); - index_count += view.headers[i].num_primitives; + index_count += view.streams[i * view.format_header->stream_count].u.offsets[NumChunks].prim_offset; } cmd.set_specialization_constant(1, uint32_t(info.target_style)); diff --git a/vulkan/mesh/meshlet.hpp b/vulkan/mesh/meshlet.hpp index bd012ea1..a2b47de3 100644 --- a/vulkan/mesh/meshlet.hpp +++ b/vulkan/mesh/meshlet.hpp @@ -44,23 +44,25 @@ static constexpr unsigned MaxStreams = 8; static constexpr unsigned MaxElements = 256; static constexpr unsigned ElementsPerChunk = 32; static constexpr unsigned NumChunks = MaxElements / ElementsPerChunk; -static constexpr unsigned MaxPrimitives = MaxElements; -static constexpr unsigned MaxVertices = MaxElements; struct Stream { - uint32_t base_value_or_vertex_offset[12]; - uint32_t bit_plane_config0; - uint32_t bit_plane_config1; - uint32_t aux; + union + { + uint32_t base_value[12]; + struct { uint16_t prim_offset; uint16_t attr_offset; } offsets[12]; + } u; + uint32_t bit_plane_config; + uint32_t reserved; + int32_t aux; uint32_t offset_in_b128; }; +static_assert(sizeof(Stream) == 64, "Unexpected Stream size."); struct Header { uint32_t base_vertex_offset; - uint16_t num_primitives; - uint16_t num_attributes; + uint32_t num_chunks; }; // For GPU use From 18ce10d4ec211454123d138880fc1b4edc4f157b Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Wed, 20 Dec 2023 14:46:04 +0100 Subject: [PATCH 24/59] Fix some decode shenanigans. --- tests/meshopt_sandbox.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/meshopt_sandbox.cpp b/tests/meshopt_sandbox.cpp index f7921e5d..19178855 100644 --- a/tests/meshopt_sandbox.cpp +++ b/tests/meshopt_sandbox.cpp @@ -88,7 +88,7 @@ static void decode_bitfield_block_16(T *block, const PayloadB128 *&pdata, unsign } } - int num_words = (mask * Components + 1) / 2; + int num_words = (bits * Components + 3) / 4; pdata += num_words; bit_offset += bits; } @@ -125,8 +125,8 @@ static void decode_attribute_buffer(std::vector &out_positions, const Mesh } u16vec3 base; - memcpy(base.data, &stream.u.offsets[chunk].attr_offset, sizeof(uint16_t) * 2); - memcpy(&base.z, reinterpret_cast(&stream.u.offsets[8].attr_offset) + + memcpy(base.data, &stream.u.base_value[chunk], sizeof(uint16_t) * 2); + memcpy(&base.z, reinterpret_cast(&stream.u.base_value[NumChunks]) + sizeof(uint16_t) * chunk, sizeof(uint16_t)); @@ -268,10 +268,10 @@ int main(int argc, char *argv[]) Filesystem::setup_default_filesystem(GRANITE_FILESYSTEM(), ASSET_DIRECTORY); SceneFormats::Mesh mesh; - vec3 pos[30]; + vec3 pos[255]; mesh.index_type = VK_INDEX_TYPE_UINT8_EXT; - mesh.count = 30; + mesh.count = 255; mesh.topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST; for (unsigned i = 0; i < mesh.count; i++) { From e21836c52d7d8c0de1273a026a350c1650b7c7c6 Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Wed, 20 Dec 2023 15:33:57 +0100 Subject: [PATCH 25/59] Stamp out some tests. --- tests/meshopt_sandbox.cpp | 89 ++++++++++++++++++++++++++++++++------- 1 file changed, 73 insertions(+), 16 deletions(-) diff --git a/tests/meshopt_sandbox.cpp b/tests/meshopt_sandbox.cpp index 19178855..49936495 100644 --- a/tests/meshopt_sandbox.cpp +++ b/tests/meshopt_sandbox.cpp @@ -1,5 +1,6 @@ #include "logging.hpp" #include +#include #include "math.hpp" #include "device.hpp" #include "context.hpp" @@ -259,6 +260,58 @@ static bool validate_mesh_decode(const std::vector &decoded_index_buff } #endif +static void build_reference_mesh(std::vector &indices, std::vector &positions) +{ + for (unsigned i = 0; i < 256; i++) + positions.push_back(vec3(-40.0f + i)); + + for (unsigned i = 0; i < 254; i++) + indices.push_back(uvec3(i, i + 1, i + 2)); +} + +static bool validate_mesh(std::vector &reference_indices, + std::vector &reference_positions, + std::vector &decoded_indices, + std::vector &decoded_positions) +{ + if (reference_indices.size() != decoded_indices.size()) + { + LOGE("Mismatch in index buffer size.\n"); + return false; + } + + std::sort(reference_indices.begin(), reference_indices.end(), [&](const uvec3 &a, const uvec3 &b) { + float za = reference_positions[a.z].z; + float zb = reference_positions[b.z].z; + return za < zb; + }); + + std::sort(decoded_indices.begin(), decoded_indices.end(), [&](const uvec3 &a, const uvec3 &b) { + float za = decoded_positions[a.z].z; + float zb = decoded_positions[b.z].z; + return za < zb; + }); + + for (size_t i = 0, n = decoded_indices.size(); i < n; i++) + { + uvec3 ref_i = reference_indices[i]; + uvec3 decode_i = decoded_indices[i]; + + for (int c = 0; c < 3; c++) + { + vec3 ref_pos = reference_positions[ref_i[c]]; + vec3 decode_pos = decoded_positions[decode_i[c]]; + if (any(notEqual(ref_pos, decode_pos))) + { + LOGE("Mismatch in primitive %zu, c = %d.\n", i, c); + return false; + } + } + } + + return true; +} + int main(int argc, char *argv[]) { if (argc != 2) @@ -268,22 +321,23 @@ int main(int argc, char *argv[]) Filesystem::setup_default_filesystem(GRANITE_FILESYSTEM(), ASSET_DIRECTORY); SceneFormats::Mesh mesh; - vec3 pos[255]; - mesh.index_type = VK_INDEX_TYPE_UINT8_EXT; - mesh.count = 255; + std::vector reference_indices; + std::vector reference_positions; + build_reference_mesh(reference_indices, reference_positions); + + mesh.index_type = VK_INDEX_TYPE_UINT32; + mesh.count = 3 * reference_indices.size(); mesh.topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST; - for (unsigned i = 0; i < mesh.count; i++) - { - mesh.indices.push_back(i); - pos[i] = vec3(float(i)); - } + mesh.indices.resize(mesh.count * sizeof(uint32_t)); + memcpy(mesh.indices.data(), reference_indices.data(), mesh.count * sizeof(uint32_t)); + mesh.attribute_layout[int(MeshAttribute::Position)].format = VK_FORMAT_R32G32B32_SFLOAT; mesh.position_stride = sizeof(vec3); - mesh.positions.resize(sizeof(pos)); - memcpy(mesh.positions.data(), pos[0].data, sizeof(pos)); + mesh.positions.resize(reference_positions.size() * sizeof(vec3)); + memcpy(mesh.positions.data(), reference_positions.data(), reference_positions.size() * sizeof(vec3)); - if (!Meshlet::export_mesh_to_meshlet("/tmp/export.msh2", mesh, MeshStyle::Wireframe)) + if (!Meshlet::export_mesh_to_meshlet("/tmp/export.msh2", std::move(mesh), MeshStyle::Wireframe)) return EXIT_FAILURE; auto file = GRANITE_FILESYSTEM()->open("/tmp/export.msh2", FileMode::ReadOnly); @@ -294,11 +348,16 @@ int main(int argc, char *argv[]) if (!mapped) return EXIT_FAILURE; - std::vector reference_index_buffer; - std::vector reference_positions; + std::vector decoded_index_buffer; + std::vector decoded_positions; auto view = create_mesh_view(*mapped); - decode_mesh(reference_index_buffer, reference_positions, view); + decode_mesh(decoded_index_buffer, decoded_positions, view); + + if (!validate_mesh(reference_indices, reference_positions, + decoded_index_buffer, decoded_positions)) + return EXIT_FAILURE; + return 0; #if 0 GLTF::Parser parser(argv[1]); @@ -371,6 +430,4 @@ int main(int argc, char *argv[]) memcpy(ptr + reference_index_buffer.size(), reference_attributes.data(), reference_attributes.size() * sizeof(uint32_t)); } #endif - - return 0; } \ No newline at end of file From 96f2ff92117b9b1279371857bff1647c2f99239e Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Wed, 20 Dec 2023 15:40:54 +0100 Subject: [PATCH 26/59] Test full encode path. --- tests/meshopt_sandbox.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tests/meshopt_sandbox.cpp b/tests/meshopt_sandbox.cpp index 49936495..61418a9f 100644 --- a/tests/meshopt_sandbox.cpp +++ b/tests/meshopt_sandbox.cpp @@ -263,7 +263,16 @@ static bool validate_mesh_decode(const std::vector &decoded_index_buff static void build_reference_mesh(std::vector &indices, std::vector &positions) { for (unsigned i = 0; i < 256; i++) - positions.push_back(vec3(-40.0f + i)); + { + vec3 p; + p.x = -40.0f + float(i); + p.y = float(i); + p.z = -30.0f + float(i); + + if (i == 8) + p.y = 20000.0f; + positions.push_back(p); + } for (unsigned i = 0; i < 254; i++) indices.push_back(uvec3(i, i + 1, i + 2)); From a752473d7be9028ae3b7b9c1ce13732f79c9d61b Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Thu, 21 Dec 2023 11:32:19 +0100 Subject: [PATCH 27/59] Refactor out bitplane encode logic. --- scene-export/meshlet_export.cpp | 159 ++++++++++++++++---------------- 1 file changed, 78 insertions(+), 81 deletions(-) diff --git a/scene-export/meshlet_export.cpp b/scene-export/meshlet_export.cpp index bffd00b9..21736df2 100644 --- a/scene-export/meshlet_export.cpp +++ b/scene-export/meshlet_export.cpp @@ -342,6 +342,83 @@ static void encode_index_stream(std::vector &out_payload_buffer, out_payload_buffer.push_back(p3); } +template +static void encode_bitplane_16_inner(std::vector &out_payload_buffer, + const T *values, unsigned encoded_bits) +{ + static_assert(Components == 2 || Components == 3, "Components must be 2 or 3."); + + if (encoded_bits == 8) + { + // Plain write. + PayloadB128 p[4 * Components]; + + for (uint32_t i = 0; i < ElementsPerChunk; i++) + { + auto d = values[i].xy(); + p[i / 4].words[i % 4] = uint32_t(d.x) | (uint32_t(d.y) << 16); + } + + if (Components == 3) + { + for (uint32_t i = 0; i < ElementsPerChunk / 2; i++) + { + u16vec2 d = u16vec2(values[2 * i][2], values[2 * i + 1][2]); + p[8 + i / 4].words[i % 4] = uint32_t(d.x) | (uint32_t(d.y) << 16); + } + } + + out_payload_buffer.insert(out_payload_buffer.end(), p, p + 4 * Components); + } + else + { + unsigned bit_offset = 0; + PayloadB128 p[6]; + + for (int mask = 4; mask; mask >>= 1) + { + if (encoded_bits & mask) + { + uint32_t *words = &p[0].words[0]; + int bits = mask * 2; + int num_words = (bits * Components + 3) / 4; + + for (int i = 0; i < num_words; i++) + p[i] = {}; + + for (uint32_t i = 0; i < ElementsPerChunk; i++) + { + auto d = values[i]; + for (int c = 0; c < Components; c++) + { + for (int b = 0; b < bits; b++) + { + int word = c * bits + b; + words[word] |= ((d[c] >> (bit_offset + b)) & 1u) << i; + } + } + } + + for (int i = 0; i < num_words; i++) + out_payload_buffer.push_back(p[i]); + bit_offset += bits; + } + } + } +} + +static void encode_bitplane_16(std::vector &out_payload_buffer, + const u16vec3 *values, unsigned encoded_bits) +{ + encode_bitplane_16_inner<3>(out_payload_buffer, values, encoded_bits); +} + +static void encode_bitplane_16(std::vector &out_payload_buffer, + const u16vec2 *values, unsigned encoded_bits) +{ + encode_bitplane_16_inner<2>(out_payload_buffer, values, encoded_bits); +} + static void encode_attribute_stream(std::vector &out_payload_buffer, Stream &stream, const u16vec3 *raw_positions, @@ -376,7 +453,6 @@ static void encode_attribute_stream(std::vector &out_payload_buffer if (diff3_signed < diff3_unsigned) { ulo = u16vec3(slo); - uhi = u16vec3(shi); diff3_unsigned = diff3_signed; } @@ -390,86 +466,7 @@ static void encode_attribute_stream(std::vector &out_payload_buffer for (auto &p : positions) p -= ulo; - if (encoded_bits == 8) - { - // Plain write. - PayloadB128 p[12]; - - for (uint32_t i = 0; i < ElementsPerChunk; i++) - { - u16vec2 d = positions[i].xy(); - p[i / 4].words[i % 4] = uint32_t(d.x) | (uint32_t(d.y) << 16); - } - - for (uint32_t i = 0; i < ElementsPerChunk / 2; i++) - { - u16vec2 d = u16vec2(positions[2 * i].z, positions[2 * i + 1].z); - p[8 + i / 4].words[i % 4] = uint32_t(d.x) | (uint32_t(d.y) << 16); - } - - out_payload_buffer.insert(out_payload_buffer.end(), p, p + 12); - } - else - { - unsigned bit_offset = 0; - - if (encoded_bits & 4) - { - PayloadB128 p[6]{}; - - for (uint32_t i = 0; i < ElementsPerChunk; i++) - { - u16vec3 d = positions[i]; - for (int c = 0; c < 3; c++) - for (int b = 0; b < 8; b++) - p[c * 2 + b / 4].words[b % 4] |= ((d[c] >> (bit_offset + b)) & 1u) << i; - } - - for (auto v : p) - out_payload_buffer.push_back(v); - bit_offset += 8; - } - - if (encoded_bits & 2) - { - PayloadB128 p[3]{}; - - for (uint32_t i = 0; i < ElementsPerChunk; i++) - { - u16vec3 d = positions[i]; - for (int c = 0; c < 3; c++) - for (int b = 0; b < 4; b++) - p[c].words[b] |= ((d[c] >> (bit_offset + b)) & 1u) << i; - } - - for (auto v : p) - out_payload_buffer.push_back(v); - bit_offset += 4; - } - - if (encoded_bits & 1) - { - PayloadB128 p[2]{}; - uint32_t *words = &p[0].words[0]; - - for (uint32_t i = 0; i < ElementsPerChunk; i++) - { - u16vec3 d = positions[i]; - for (int c = 0; c < 3; c++) - { - for (int b = 0; b < 2; b++) - { - int word = c * 2 + b; - words[word] |= ((d[c] >> (bit_offset + b)) & 1u) << i; - } - } - } - - for (auto v : p) - out_payload_buffer.push_back(v); - bit_offset += 2; - } - } + encode_bitplane_16(out_payload_buffer, positions, encoded_bits); } static void encode_mesh(Encoded &encoded, From 8efd3efbd577a9381733108fb85edb3e1c2fd30a Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Thu, 21 Dec 2023 11:53:06 +0100 Subject: [PATCH 28/59] Make bitplane encoder more generic. --- scene-export/meshlet_export.cpp | 70 +++++++++++++++++++++++---------- 1 file changed, 50 insertions(+), 20 deletions(-) diff --git a/scene-export/meshlet_export.cpp b/scene-export/meshlet_export.cpp index 21736df2..09293826 100644 --- a/scene-export/meshlet_export.cpp +++ b/scene-export/meshlet_export.cpp @@ -26,6 +26,8 @@ #include "math.hpp" #include "filesystem.hpp" #include "meshlet.hpp" +#include +#include namespace Granite { @@ -419,49 +421,77 @@ static void encode_bitplane_16(std::vector &out_payload_buffer, encode_bitplane_16_inner<2>(out_payload_buffer, values, encoded_bits); } +template struct to_signed_vector {}; +template struct to_components {}; + +template <> struct to_signed_vector { using type = i16vec3; }; +template <> struct to_signed_vector { using type = i16vec2; }; +template <> struct to_components { enum { components = 3 }; }; +template <> struct to_components { enum { components = 2 }; }; + +template +static auto max_component(T value) -> std::remove_reference_t +{ + std::remove_reference_t val = 0; + for (auto v : value.data) + val = std::max(val, v); + return val; +} + +template static void encode_attribute_stream(std::vector &out_payload_buffer, Stream &stream, - const u16vec3 *raw_positions, + const T *raw_positions, uint32_t chunk_index, const uint32_t *vbo_remap, uint32_t num_attributes) { - u16vec3 positions[ElementsPerChunk]; + using SignedT = typename to_signed_vector::type; + using UnsignedScalar = std::remove_reference_t; + using SignedScalar = std::remove_reference_t; + static_assert(sizeof(T) == 4 || sizeof(T) == 6, "Encoded type must be 32 or 48 bits."); + + T positions[ElementsPerChunk]; for (uint32_t i = 0; i < num_attributes; i++) positions[i] = raw_positions[vbo_remap[i]]; for (uint32_t i = num_attributes; i < ElementsPerChunk; i++) positions[i] = positions[0]; - u16vec3 ulo{0xffff}; - u16vec3 uhi{0}; - i16vec3 slo{0x7fff}; - i16vec3 shi{-0x8000}; + T ulo{std::numeric_limits::max()}; + T uhi{std::numeric_limits::min()}; + SignedT slo{std::numeric_limits::max()}; + SignedT shi{std::numeric_limits::min()}; for (auto &p : positions) { ulo = min(ulo, p); uhi = max(uhi, p); - slo = min(slo, i16vec3(p)); - shi = max(shi, i16vec3(p)); + slo = min(slo, SignedT(p)); + shi = max(shi, SignedT(p)); } - const auto max3 = [](u16vec3 v) { return max(max(v.x, v.y), v.z); }; - u16vec3 diff_unsigned = uhi - ulo; - u16vec3 diff_signed = u16vec3(shi) - u16vec3(slo); + T diff_unsigned = uhi - ulo; + T diff_signed = T(shi) - T(slo); - unsigned diff3_unsigned = max3(diff_unsigned); - unsigned diff3_signed = max3(diff_signed); - if (diff3_signed < diff3_unsigned) + unsigned diff_max_unsigned = max_component(diff_unsigned); + unsigned diff_max_signed = max_component(diff_signed); + if (diff_max_signed < diff_max_unsigned) { - ulo = u16vec3(slo); - diff3_unsigned = diff3_signed; + ulo = T(slo); + diff_max_unsigned = diff_max_signed; } - unsigned bits = compute_required_bits_unsigned(diff3_unsigned); - unsigned encoded_bits = (bits + 1) / 2; + constexpr unsigned bits_per_component = sizeof(UnsignedScalar) * 8; + + unsigned bits = compute_required_bits_unsigned(diff_max_unsigned); + unsigned encoded_bits = (bits + sizeof(UnsignedScalar) - 1) / sizeof(UnsignedScalar); stream.bit_plane_config |= encoded_bits << (4 * chunk_index); - stream.u.base_value[chunk_index] = uint32_t(ulo.x) | (uint32_t(ulo.y) << 16); - stream.u.base_value[chunk_index / 2 + 8] |= uint32_t(ulo.z) << (16 * (chunk_index & 1)); + memcpy(&stream.u.base_value[chunk_index], ulo.data, sizeof(uint32_t)); + if (to_components::components == 3 && bits_per_component == 16) + { + memcpy(reinterpret_cast(&stream.u.base_value[8]) + sizeof(uint16_t) * chunk_index, + &ulo.z, sizeof(uint16_t)); + } for (auto &p : positions) p -= ulo; From bc7c2a52940428b38fc9a4d02c38714fbeaa3ace Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Thu, 21 Dec 2023 14:08:14 +0100 Subject: [PATCH 29/59] Rewrite decode shader. --- assets/shaders/decode/meshlet_decode.comp | 159 +++----- assets/shaders/inc/meshlet_payload_decode.h | 400 +++++++++----------- tests/meshopt_sandbox.cpp | 99 ++--- vulkan/mesh/meshlet.cpp | 118 ++---- vulkan/mesh/meshlet.hpp | 5 +- 5 files changed, 293 insertions(+), 488 deletions(-) diff --git a/assets/shaders/decode/meshlet_decode.comp b/assets/shaders/decode/meshlet_decode.comp index 166ca870..eee4405d 100644 --- a/assets/shaders/decode/meshlet_decode.comp +++ b/assets/shaders/decode/meshlet_decode.comp @@ -2,20 +2,14 @@ #extension GL_EXT_scalar_block_layout : require #include "../inc/meshlet_payload_constants.h" +#extension GL_KHR_shader_subgroup_basic : require -#define MESHLET_PAYLOAD_LARGE_WORKGROUP 1 - -#if MESHLET_PAYLOAD_LARGE_WORKGROUP -#define MESHLET_PAYLOAD_WG_Y MESHLET_PAYLOAD_NUM_CHUNKS -#else -#define MESHLET_PAYLOAD_WG_Y 1 -#endif layout(local_size_x = 256) in; layout(constant_id = 0) const uint NUM_U32_STREAMS = MESHLET_PAYLOAD_MAX_STREAMS; -layout(constant_id = 1) const uint NUM_OUTPUT_U32_STREAMS = 1; -layout(constant_id = 2) const bool RAW_PAYLOAD = false; -#define MESHLET_PAYLOAD_NUM_U32_STREAMS NUM_U32_STREAMS +layout(constant_id = 1) const bool UNROLLED_MESH = false; +layout(constant_id = 2) const uint TARGET_MESH_STYLE = 0; + #define MESHLET_PAYLOAD_DESCRIPTOR_SET 0 #define MESHLET_PAYLOAD_META_BINDING 0 #define MESHLET_PAYLOAD_STREAM_BINDING 1 @@ -23,7 +17,6 @@ layout(constant_id = 2) const bool RAW_PAYLOAD = false; #include "../inc/meshlet_payload_decode.h" #include "../inc/meshlet_attribute_decode.h" -const int MESH_STYLE = int(NUM_OUTPUT_U32_STREAMS); const int MESH_STYLE_WIREFRAME = 0; const int MESH_STYLE_TEXTURED = 1; const int MESH_STYLE_SKINNED = 2; @@ -38,11 +31,6 @@ layout(set = 0, binding = 3, scalar) writeonly buffer OutputIndices8 u8vec3 data[]; } output_indices8; -layout(set = 0, binding = 4, std430) writeonly buffer OutputStream0 -{ - uint data[]; -} output_stream_raw; - layout(set = 0, binding = 4, scalar) writeonly buffer OutputStreamPos { vec3 data[]; @@ -67,8 +55,8 @@ layout(set = 0, binding = 6, std430) writeonly buffer OutputStreamSkin layout(set = 0, binding = 7, std430) readonly buffer OutputOffsets { - uvec2 data[]; -} output_offset_strides; + uint data[]; +} primitive_output_offsets; #if MESHLET_PAYLOAD_RUNTIME_MESH struct IndirectDrawMesh @@ -115,111 +103,80 @@ uint pack_a2bgr10(vec4 v) void main() { uint meshlet_index = gl_WorkGroupID.x; - meshlet_init_workgroup(meshlet_index * NUM_U32_STREAMS); MeshletMetaRaw meta = meshlet_metas_raw.data[meshlet_index]; - if (!RAW_PAYLOAD) + int lane_index; + uint chunk_index; + uint linear_index; + + if (gl_SubgroupSize == 32) + { + chunk_index = gl_SubgroupID; + lane_index = int(gl_SubgroupInvocationID); + linear_index = chunk_index * gl_SubgroupSize + lane_index; + } + else + { + linear_index = gl_LocalInvocationIndex; + chunk_index = linear_index / 32u; + lane_index = int(linear_index & 31u); + } + + uint primitive_output_offset = primitive_output_offsets.data[meshlet_index] + registers.primitive_offset; + + if (!UNROLLED_MESH && gl_LocalInvocationIndex == 0) { + MeshletInfo info = meshlet_get_meshlet_info(meshlet_index * NUM_U32_STREAMS); #if MESHLET_PAYLOAD_RUNTIME_MESH IndirectDrawMesh draw; - draw.primitive_offset = output_offset_strides.data[meshlet_index].x + registers.primitive_offset; + draw.primitive_offset = primitive_output_offset; draw.vertex_offset = meta.base_vertex_offset + registers.vertex_offset; - draw.num_primitives = meta.num_primitives; - draw.num_attributes = meta.num_attributes; + draw.num_primitives = info.num_primitives; + draw.num_attributes = info.num_attributes; indirect_commands_mesh.draws[meshlet_index + registers.meshlet_offset] = draw; #else IndirectIndexedDraw draw; - draw.indexCount = 3 * meta.num_primitives; + draw.indexCount = 3 * info.primitive_count; draw.instanceCount = 1; draw.vertexOffset = meta.base_vertex_offset + registers.vertex_offset; - draw.firstIndex = 3 * (output_offset_strides.data[meshlet_index].x + registers.primitive_offset); + draw.firstIndex = 3 * primitive_output_offset; draw.firstInstance = 0; indirect_commands.draws[meshlet_index + registers.meshlet_offset] = draw; #endif } -#define INDEX(linear_index, packed_indices) { \ - uint output_offset; \ - if (RAW_PAYLOAD) { \ - uvec3 indices = uvec4(unpack8(packed_indices)).xyz; \ - indices += meta.base_vertex_offset + registers.vertex_offset; \ - output_offset = output_offset_strides.data[meshlet_index * NUM_OUTPUT_U32_STREAMS].x; \ - output_offset += registers.primitive_offset; \ - if (linear_index < uint(meta.num_primitives)) \ - output_indices32.data[output_offset + linear_index] = indices; \ - } else { \ - output_offset = output_offset_strides.data[meshlet_index].x; \ - output_offset += registers.primitive_offset; \ - if (linear_index < uint(meta.num_primitives)) \ - output_indices8.data[output_offset + linear_index] = unpack8(packed_indices).xyz; \ - } \ -} - - { - MESHLET_DECODE_STREAM_32(meshlet_index * NUM_U32_STREAMS, 0, INDEX); - } + if (chunk_index >= meta.num_chunks) + return; - if (RAW_PAYLOAD) - { -#define ATTR(linear_index, packed_decoded) { \ - uvec2 output_offset_stride0 = output_offset_strides.data[meshlet_index * NUM_OUTPUT_U32_STREAMS + i]; \ - output_offset_stride0.x += registers.vertex_offset; \ - if (linear_index < uint(meta.num_attributes)) \ - output_stream_raw.data[output_offset_stride0.x + linear_index * output_offset_stride0.y] = packed_decoded; \ -} + MeshletChunkInfo chunk_info = meshlet_get_chunk_info(meshlet_index * NUM_U32_STREAMS, chunk_index); - for (uint i = 1; i < NUM_OUTPUT_U32_STREAMS; i++) - { - MESHLET_DECODE_STREAM_32(meshlet_index * NUM_U32_STREAMS, i, ATTR); - } - } - else + // Index + if (lane_index < chunk_info.primitive_count) { - uint output_offset = output_offset_strides.data[meshlet_index].y; - output_offset += registers.vertex_offset; + uint decoded_index_buffer = meshlet_decode_index_buffer(meshlet_index * NUM_U32_STREAMS, + chunk_index, lane_index); -#define POS(linear_index, packed_decoded) { \ - if (linear_index < uint(meta.num_attributes)) \ - output_stream_pos.data[output_offset + linear_index] = attribute_decode_snorm_exp_position(packed_decoded); \ -} - -#define NORMAL(linear_index, packed_decoded) { \ - if (linear_index < uint(meta.num_attributes)) { \ - output_stream_textured_attr.data[output_offset + linear_index].normal = pack_a2bgr10(attribute_decode_oct8_normal_tangent(packed_decoded)); \ - } \ -} + primitive_output_offset += chunk_info.primitive_offset; -#define TANGENT(linear_index, packed_decoded) { \ - if (linear_index < uint(meta.num_attributes)) { \ - output_stream_textured_attr.data[output_offset + linear_index].tangent = pack_a2bgr10(attribute_decode_oct8_normal_tangent(packed_decoded)); \ - } \ -} + uvec3 indices; + indices.x = bitfieldExtract(decoded_index_buffer, 0, 8); + indices.y = bitfieldExtract(decoded_index_buffer, 8, 8); + indices.z = bitfieldExtract(decoded_index_buffer, 16, 8); + indices += chunk_info.vertex_offset; -#define UV(linear_index, packed_decoded) { \ - if (linear_index < uint(meta.num_attributes)) { \ - output_stream_textured_attr.data[output_offset + linear_index].uv = attribute_decode_snorm_exp_uv(packed_decoded); \ - } \ -} + if (UNROLLED_MESH) + output_indices32.data[primitive_output_offset + lane_index] = indices + meta.base_vertex_offset + registers.vertex_offset; + else + output_indices8.data[primitive_output_offset + lane_index] = u8vec3(indices); + } -#define SKIN(linear_index, packed_decoded) { \ - if (linear_index < uint(meta.num_attributes)) { \ - output_stream_skin.data[output_offset + linear_index] = packed_decoded; \ - } \ -} - { - MESHLET_DECODE_STREAM_64(meshlet_index * NUM_U32_STREAMS, 1, POS); - } - - if (MESH_STYLE >= MESH_STYLE_TEXTURED) - { - MESHLET_DECODE_STREAM_32(meshlet_index * NUM_U32_STREAMS, 3, NORMAL); - MESHLET_DECODE_STREAM_32(meshlet_index * NUM_U32_STREAMS, 4, TANGENT); - MESHLET_DECODE_STREAM_64(meshlet_index * NUM_U32_STREAMS, 5, UV); - } - - if (MESH_STYLE >= MESH_STYLE_SKINNED) - { - MESHLET_DECODE_STREAM_64(meshlet_index * NUM_U32_STREAMS, 7, SKIN); - } + // Attributes + if (lane_index < chunk_info.vertex_count) + { + int exponent; + i16vec3 pos = meshlet_decode_snorm_scaled_i16x3(meshlet_index * NUM_U32_STREAMS + 1, chunk_index, lane_index, exponent); + vec3 fp_pos = ldexp(vec3(pos), ivec3(exponent)); + uint vertex_output_offset = registers.vertex_offset + meta.base_vertex_offset + chunk_info.vertex_offset; + output_stream_pos.data[vertex_output_offset + lane_index] = fp_pos; } } diff --git a/assets/shaders/inc/meshlet_payload_decode.h b/assets/shaders/inc/meshlet_payload_decode.h index 0e6ee02f..b2af6435 100644 --- a/assets/shaders/inc/meshlet_payload_decode.h +++ b/assets/shaders/inc/meshlet_payload_decode.h @@ -5,21 +5,9 @@ #extension GL_EXT_shader_explicit_arithmetic_types_int8 : require #extension GL_EXT_scalar_block_layout : require #extension GL_EXT_control_flow_attributes : require -#extension GL_KHR_shader_subgroup_arithmetic : require -#extension GL_KHR_shader_subgroup_ballot : require -#extension GL_KHR_shader_subgroup_shuffle : require -#extension GL_KHR_shader_subgroup_basic : require #include "meshlet_payload_constants.h" -#ifndef MESHLET_PAYLOAD_NUM_U32_STREAMS -#error "Must define MESHLET_PAYLOAD_NUM_U32_STREAMS before including meshlet_payload_decode.h" -#endif - -#ifndef MESHLET_PAYLOAD_LARGE_WORKGROUP -#error "Must define MESHLET_PAYLOAD_LARGE_WORKGROUP" -#endif - #ifndef MESHLET_PAYLOAD_DESCRIPTOR_SET #error "Must define MESHLET_PAYLOAD_DESCRIPTOR_SET" #endif @@ -38,25 +26,37 @@ struct MeshletStream { - u16vec4 predictor_a; - u16vec4 predictor_b; - u8vec4 initial_value; - uint offset_from_base; - uint16_t bitplane_meta[MESHLET_PAYLOAD_NUM_CHUNKS]; + uint base_value_or_offsets[12]; + uint bit_plane_config; + uint reserved; + int aux; + uint offset_in_b128; }; struct MeshletMetaRaw { uint base_vertex_offset; - uint16_t num_primitives; - uint16_t num_attributes; + uint num_chunks; }; struct MeshletMetaRuntime { uint stream_offset; - uint16_t num_primitives; - uint16_t num_attributes; + uint num_chunks; +}; + +struct MeshletChunkInfo +{ + uint primitive_count; + uint primitive_offset; + uint vertex_count; + uint vertex_offset; +}; + +struct MeshletInfo +{ + uint primitive_count; + uint vertex_count; }; layout(set = MESHLET_PAYLOAD_DESCRIPTOR_SET, binding = MESHLET_PAYLOAD_META_BINDING, std430) readonly buffer MeshletMetasRaw @@ -76,247 +76,191 @@ layout(set = MESHLET_PAYLOAD_DESCRIPTOR_SET, binding = MESHLET_PAYLOAD_STREAM_BI layout(set = MESHLET_PAYLOAD_DESCRIPTOR_SET, binding = MESHLET_PAYLOAD_PAYLOAD_BINDING, std430) readonly buffer Payload { - uint data[]; + uvec4 data[]; } payload; -shared uint shared_chunk_offset[MESHLET_PAYLOAD_NUM_U32_STREAMS][MESHLET_PAYLOAD_NUM_CHUNKS]; -shared uint wave_buffer_x[MESHLET_PAYLOAD_NUM_CHUNKS]; -shared uint wave_buffer_y[MESHLET_PAYLOAD_NUM_CHUNKS]; -shared uint wave_buffer_z[MESHLET_PAYLOAD_NUM_CHUNKS]; -shared uint wave_buffer_w[MESHLET_PAYLOAD_NUM_CHUNKS]; +layout(set = MESHLET_PAYLOAD_DESCRIPTOR_SET, binding = MESHLET_PAYLOAD_PAYLOAD_BINDING, std430) readonly buffer PayloadU32 +{ + uint data[]; +} payload_u32; -uvec2 wgx_inclusive_add(uvec2 v) +layout(set = MESHLET_PAYLOAD_DESCRIPTOR_SET, binding = MESHLET_PAYLOAD_PAYLOAD_BINDING, std430) readonly buffer PayloadU16 { - v &= 0xff00ffu; - v = subgroupInclusiveAdd(v); - if (gl_SubgroupInvocationID == gl_SubgroupSize - 1) - { - wave_buffer_x[gl_SubgroupID] = v.x; - wave_buffer_y[gl_SubgroupID] = v.y; - } + uint16_t data[]; +} payload_u16; + +MeshletInfo meshlet_get_meshlet_info(uint stream_index) +{ + MeshletInfo info; + uint v = meshlet_streams.data[stream_index].base_value_or_offsets[MESHLET_PAYLOAD_NUM_CHUNKS]; + uint prim_offset = bitfieldExtract(v, 0, 16); + uint vert_offset = bitfieldExtract(v, 16, 16); + info.primitive_count = prim_offset; + info.vertex_count = vert_offset; + return info; +} - barrier(); +MeshletChunkInfo meshlet_get_chunk_info(uint stream_index, uint chunk_index) +{ + MeshletChunkInfo info; - for (uint i = 0; i < gl_SubgroupID; i++) - { - v.x += wave_buffer_x[i]; - v.y += wave_buffer_y[i]; - } + uint v0 = meshlet_streams.data[stream_index].base_value_or_offsets[chunk_index]; + uint v1 = meshlet_streams.data[stream_index].base_value_or_offsets[chunk_index + 1]; + + uint prim_offset0 = bitfieldExtract(v0, 0, 16); + uint vert_offset0 = bitfieldExtract(v0, 16, 16); + uint prim_offset1 = bitfieldExtract(v1, 0, 16); + uint vert_offset1 = bitfieldExtract(v1, 16, 16); - return v; + info.primitive_count = prim_offset1 - prim_offset0; + info.primitive_offset = prim_offset0; + info.vertex_count = vert_offset1 - vert_offset0; + info.vertex_offset = vert_offset0; + + return info; } -uvec4 wgx_inclusive_add(uvec4 v) +uint meshlet_decode_index_buffer(uint stream_index, uint chunk_index, int lane_index) { - v &= 0xff00ffu; - v = subgroupInclusiveAdd(v); - if (gl_SubgroupInvocationID == gl_SubgroupSize - 1) - { - wave_buffer_x[gl_SubgroupID] = v.x; - wave_buffer_y[gl_SubgroupID] = v.y; - wave_buffer_z[gl_SubgroupID] = v.z; - wave_buffer_w[gl_SubgroupID] = v.w; - } + uint offset_in_b128 = meshlet_streams.data[stream_index].offset_in_b128; - barrier(); + // Fixed 5-bit encoding. + offset_in_b128 += 4 * chunk_index; - for (uint i = 0; i < gl_SubgroupID; i++) - { - v.x += wave_buffer_x[i]; - v.y += wave_buffer_y[i]; - v.z += wave_buffer_z[i]; - v.w += wave_buffer_w[i]; - } + // Scalar load. 64 bytes in one go. + uvec4 p0 = payload.data[offset_in_b128 + 0]; + uvec4 p1 = payload.data[offset_in_b128 + 1]; + uvec4 p2 = payload.data[offset_in_b128 + 2]; + uvec4 p3 = payload.data[offset_in_b128 + 3]; - return v; -} + uint indices = 0; -// Hardcodes wave32 atm. Need fallback. + indices |= bitfieldExtract(p0.x, lane_index, 1) << 0u; + indices |= bitfieldExtract(p0.y, lane_index, 1) << 1u; + indices |= bitfieldExtract(p0.z, lane_index, 1) << 2u; + indices |= bitfieldExtract(p0.w, lane_index, 1) << 3u; -uvec2 pack_u16vec4_to_uvec2(u16vec4 v) -{ - return uvec2(pack32(v.xy), pack32(v.zw)); -} + indices |= bitfieldExtract(p1.x, lane_index, 1) << 8u; + indices |= bitfieldExtract(p1.y, lane_index, 1) << 9u; + indices |= bitfieldExtract(p1.z, lane_index, 1) << 10u; + indices |= bitfieldExtract(p1.w, lane_index, 1) << 11u; -uint repack_uint(uvec2 v) -{ - u16vec4 v16 = u16vec4(unpack16(v.x), unpack16(v.y)); - return pack32(u8vec4(v16)); -} + indices |= bitfieldExtract(p2.x, lane_index, 1) << 16u; + indices |= bitfieldExtract(p2.y, lane_index, 1) << 17u; + indices |= bitfieldExtract(p2.z, lane_index, 1) << 18u; + indices |= bitfieldExtract(p2.w, lane_index, 1) << 19u; -uvec4 meshlet_decode_bit_counts(uint bitplane_value) -{ - uvec4 out_bit_counts; - out_bit_counts.x = bitfieldExtract(bitplane_value, 0, 4); - out_bit_counts.y = bitfieldExtract(bitplane_value, 4, 4); - out_bit_counts.z = bitfieldExtract(bitplane_value, 8, 4); - out_bit_counts.w = bitfieldExtract(bitplane_value, 12, 4); - return out_bit_counts; -} + indices |= bitfieldExtract(p3.x, lane_index, 1) << 4u; + indices |= bitfieldExtract(p3.y, lane_index, 1) << 12u; + indices |= bitfieldExtract(p3.z, lane_index, 1) << 20u; -void meshlet_compute_stream_counts(uint bitplane_value, out uint out_total_bits) -{ - uvec4 out_bit_counts = meshlet_decode_bit_counts(bitplane_value); - uvec2 bit_counts2 = out_bit_counts.xy + out_bit_counts.zw; - out_total_bits = bit_counts2.x + bit_counts2.y; + return indices; } -void meshlet_init_workgroup(uint base_stream_index) +i16vec3 meshlet_decode_snorm_scaled_i16x3(uint stream_index, uint chunk_index, int lane_index, out int exponent) { -#if 0 - if (gl_LocalInvocationIndex < MESHLET_PAYLOAD_NUM_U32_STREAMS) - { - uint unrolled_stream_index = base_stream_index + gl_LocalInvocationIndex; - uint chunk_offset = meshlet_streams.data[unrolled_stream_index].offset_from_base; - uvec4 bitplane_values0 = uvec4( - meshlet_streams.data[unrolled_stream_index].bitplane_meta[0], - meshlet_streams.data[unrolled_stream_index].bitplane_meta[1], - meshlet_streams.data[unrolled_stream_index].bitplane_meta[2], - meshlet_streams.data[unrolled_stream_index].bitplane_meta[3]); - uvec3 bitplane_values1 = uvec3( - meshlet_streams.data[unrolled_stream_index].bitplane_meta[4], - meshlet_streams.data[unrolled_stream_index].bitplane_meta[5], - meshlet_streams.data[unrolled_stream_index].bitplane_meta[6]); - - uvec4 total_bits; - meshlet_compute_stream_counts(bitplane_values0.x, total_bits.x); - meshlet_compute_stream_counts(bitplane_values0.y, total_bits.y); - meshlet_compute_stream_counts(bitplane_values0.z, total_bits.z); - meshlet_compute_stream_counts(bitplane_values0.w, total_bits.w); - total_bits.y += total_bits.x; - total_bits.z += total_bits.y; - total_bits.w += total_bits.z; - shared_chunk_offset[gl_LocalInvocationIndex][0] = chunk_offset; - shared_chunk_offset[gl_LocalInvocationIndex][1] = chunk_offset + total_bits.x; - shared_chunk_offset[gl_LocalInvocationIndex][2] = chunk_offset + total_bits.y; - shared_chunk_offset[gl_LocalInvocationIndex][3] = chunk_offset + total_bits.z; - chunk_offset += total_bits.w; - - meshlet_compute_stream_counts(bitplane_values1.x, total_bits.x); - meshlet_compute_stream_counts(bitplane_values1.y, total_bits.y); - meshlet_compute_stream_counts(bitplane_values1.z, total_bits.z); - total_bits.y += total_bits.x; - total_bits.z += total_bits.y; - shared_chunk_offset[gl_LocalInvocationIndex][4] = chunk_offset; - shared_chunk_offset[gl_LocalInvocationIndex][5] = chunk_offset + total_bits.x; - shared_chunk_offset[gl_LocalInvocationIndex][6] = chunk_offset + total_bits.y; - shared_chunk_offset[gl_LocalInvocationIndex][7] = chunk_offset + total_bits.z; - } -#else - for (uint i = gl_SubgroupID; i < MESHLET_PAYLOAD_NUM_U32_STREAMS; i += gl_NumSubgroups) + uint offset_in_b128 = meshlet_streams.data[stream_index].offset_in_b128; + uint bit_plane_config = meshlet_streams.data[stream_index].bit_plane_config; + exponent = meshlet_streams.data[stream_index].aux; + + // Scalar math. + if (chunk_index != 0) { - if (gl_SubgroupInvocationID < MESHLET_PAYLOAD_NUM_CHUNKS) - { - uint unrolled_stream_index = base_stream_index + i; - uint chunk_offset = meshlet_streams.data[unrolled_stream_index].offset_from_base; - uint bitplane = uint(meshlet_streams.data[unrolled_stream_index].bitplane_meta[gl_SubgroupInvocationID]); - uint total_bits; - meshlet_compute_stream_counts(bitplane, total_bits); - shared_chunk_offset[i][gl_SubgroupInvocationID] = chunk_offset + subgroupExclusiveAdd(total_bits); - } + uint prev_bit_mask = bitfieldExtract(bit_plane_config, 0, int((chunk_index - 1) * 4)); + offset_in_b128 += bitCount(prev_bit_mask & 0x88888888) * 12; + offset_in_b128 += bitCount(prev_bit_mask & 0x44444444) * 6; + offset_in_b128 += bitCount(prev_bit_mask & 0x22222222) * 3; + offset_in_b128 += bitCount(prev_bit_mask & 0x11111111) * 2; } -#endif - barrier(); -} -uint meshlet_get_linear_index() -{ - return gl_SubgroupSize * gl_SubgroupID + gl_SubgroupInvocationID; -} + // Scalar math. + uint encoded_bits = bitfieldExtract(bit_plane_config, int(chunk_index * 4), 4); + uint base_value_xy = meshlet_streams.data[stream_index].base_value_or_offsets[chunk_index]; + uint base_value_z = meshlet_streams.data[stream_index].base_value_or_offsets[8 + chunk_index / 2]; + uint base_value_x = bitfieldExtract(base_value_xy, 0, 16); + uint base_value_y = bitfieldExtract(base_value_xy, 16, 16); + base_value_z = bitfieldExtract(base_value_z, int(16 * (chunk_index & 1)), 16); + uvec3 base_value = uvec3(base_value_x, base_value_y, base_value_z); -// Overlap load with consumption. -// Helps RDNA2 quite a lot here! -#define MESHLET_FETCH_BITPLANES(decoded_value, counts, payload_value, offset) \ - for (int i = 0; i < counts; i++) \ - { \ - decoded_value |= bitfieldExtract(payload_value, local_chunk_index, 1) << i; \ - payload_value = payload.data[++offset]; \ - } \ - decoded_value = bitfieldExtract(int(decoded_value), 0, counts) - -// Add some specialized variants. - -#define MESHLET_PAYLOAD_DECL_STREAM(unrolled_stream_index, iter) \ - u16vec4 predictor_a##iter = meshlet_streams.data[unrolled_stream_index].predictor_a; \ - u16vec4 predictor_b##iter = meshlet_streams.data[unrolled_stream_index].predictor_b; \ - u8vec4 initial_value_##iter = meshlet_streams.data[unrolled_stream_index].initial_value; \ - uvec2 initial_value##iter = pack_u16vec4_to_uvec2(u16vec4(initial_value_##iter)) - -#define MESHLET_PAYLOAD_DECL_CHUNK_OFFSETS(unrolled_stream_index, stream_index, chunk_id, iter) \ - uint bitplane_offsets##iter = shared_chunk_offset[stream_index][chunk_id]; \ - uint bitplane_value##iter = uint(meshlet_streams.data[unrolled_stream_index].bitplane_meta[chunk_id]); \ - ivec4 bit_counts##iter = ivec4(meshlet_decode_bit_counts(bitplane_value##iter)) - -#define MESHLET_PAYLOAD_PROCESS_CHUNK(unrolled_stream_index, stream_index, chunk_id, iter) \ - uvec4 decoded##iter = ivec4(0); \ - MESHLET_PAYLOAD_DECL_CHUNK_OFFSETS(unrolled_stream_index, stream_index, chunk_id, iter); \ - uint value##iter = payload.data[bitplane_offsets##iter]; \ - MESHLET_FETCH_BITPLANES(decoded##iter.x, bit_counts##iter.x, value##iter, bitplane_offsets##iter); \ - MESHLET_FETCH_BITPLANES(decoded##iter.y, bit_counts##iter.y, value##iter, bitplane_offsets##iter); \ - MESHLET_FETCH_BITPLANES(decoded##iter.z, bit_counts##iter.z, value##iter, bitplane_offsets##iter); \ - MESHLET_FETCH_BITPLANES(decoded##iter.w, bit_counts##iter.w, value##iter, bitplane_offsets##iter); \ - uvec2 packed_decoded##iter = pack_u16vec4_to_uvec2(u16vec4(decoded##iter)) & 0xff00ffu; \ - if (linear_index == 0) \ - packed_decoded##iter += initial_value##iter; \ - packed_decoded##iter += pack_u16vec4_to_uvec2((predictor_a##iter + predictor_b##iter * uint16_t(linear_index)) >> 8us) - -uint meshlet_decode_stream_32_wg256(uint base_stream_index, uint stream_index) -{ - uint unrolled_stream_index = base_stream_index + stream_index; - uint linear_index = meshlet_get_linear_index(); + uvec3 value = uvec3(0); - int local_chunk_index; - uint chunk_id; - if (gl_SubgroupSize == 32) + if (encoded_bits == 8) { - chunk_id = gl_SubgroupID; - local_chunk_index = int(gl_SubgroupInvocationID); + // Vector loads. + uint value_xy = payload_u32.data[offset_in_b128 * 4 + lane_index]; + uint value_z = uint(payload_u16.data[offset_in_b128 * 8 + 64 + lane_index]); + + value.x = bitfieldExtract(value_xy, 0, 16); + value.y = bitfieldExtract(value_xy, 16, 16); + value.z = value_z; } - else + else if (encoded_bits != 0) { - chunk_id = linear_index / 32u; - local_chunk_index = int(linear_index & 31); - } + uvec4 p0, p1, p2, p3, p4, p5; + + // Scalar loads, vector math. + // Preload early. Also helps compiler prove it can use common descriptor (RADV thing). + p0 = payload.data[offset_in_b128]; + offset_in_b128 += 1; + +#define UNROLL_BITS_4(out_value, bit_offset, p) \ + out_value |= bitfieldExtract(p.x, lane_index, 1) << ((bit_offset) + 0); \ + out_value |= bitfieldExtract(p.y, lane_index, 1) << ((bit_offset) + 1); \ + out_value |= bitfieldExtract(p.z, lane_index, 1) << ((bit_offset) + 2); \ + out_value |= bitfieldExtract(p.w, lane_index, 1) << ((bit_offset) + 3) +#define UNROLL_BITS_8(out_value, bit_offset, p0, p1) \ + UNROLL_BITS_4(out_value, bit_offset, p0); \ + UNROLL_BITS_4(out_value, (bit_offset) + 4, p1) + + int bit_offset = 0; + if ((encoded_bits & 4) != 0) + { + p1 = payload.data[offset_in_b128 + 0]; + p2 = payload.data[offset_in_b128 + 1]; + p3 = payload.data[offset_in_b128 + 2]; + p4 = payload.data[offset_in_b128 + 3]; + p5 = payload.data[offset_in_b128 + 4]; - MESHLET_PAYLOAD_DECL_STREAM(unrolled_stream_index, 0); - MESHLET_PAYLOAD_PROCESS_CHUNK(unrolled_stream_index, stream_index, chunk_id, 0); - packed_decoded0 = wgx_inclusive_add(packed_decoded0); - return repack_uint(packed_decoded0); -} + UNROLL_BITS_8(value.x, 0, p0, p1); + UNROLL_BITS_8(value.y, 0, p2, p3); + UNROLL_BITS_8(value.z, 0, p4, p5); -uvec2 meshlet_decode_stream_64_wg256(uint base_stream_index, uint stream_index) -{ - // Dual-pump the computation. VGPR use is quite low either way, so this is fine. - uint unrolled_stream_index = base_stream_index + stream_index; - uint linear_index = meshlet_get_linear_index(); + // Preload for next iteration. + p0 = payload.data[offset_in_b128 + 5]; - int local_chunk_index; - uint chunk_id; - if (gl_SubgroupSize == 32) - { - chunk_id = gl_SubgroupID; - local_chunk_index = int(gl_SubgroupInvocationID); - } - else - { - chunk_id = linear_index / 32u; - local_chunk_index = int(linear_index & 31); - } + offset_in_b128 += 6; + bit_offset += 8; + } - MESHLET_PAYLOAD_DECL_STREAM(unrolled_stream_index, 0); - MESHLET_PAYLOAD_DECL_STREAM(unrolled_stream_index + 1, 1); - MESHLET_PAYLOAD_PROCESS_CHUNK(unrolled_stream_index, stream_index, chunk_id, 0); - MESHLET_PAYLOAD_PROCESS_CHUNK(unrolled_stream_index + 1, stream_index + 1, chunk_id, 1); - uvec4 packed_decoded = wgx_inclusive_add(uvec4(packed_decoded0, packed_decoded1)); - return uvec2(repack_uint(packed_decoded.xy), repack_uint(packed_decoded.zw)); -} + if ((encoded_bits & 2) != 0) + { + p1 = payload.data[offset_in_b128 + 0]; + p2 = payload.data[offset_in_b128 + 1]; + + UNROLL_BITS_4(value.x, bit_offset, p0); + UNROLL_BITS_4(value.y, bit_offset, p1); + UNROLL_BITS_4(value.z, bit_offset, p2); + + // Preload for next iteration. + p0 = payload.data[offset_in_b128 + 2]; + offset_in_b128 += 3; + bit_offset += 4; + } -#define MESHLET_DECODE_STREAM_32(meshlet_index, stream_index, report_cb) { \ - uint value = meshlet_decode_stream_32_wg256(meshlet_index, stream_index); \ - report_cb(meshlet_get_linear_index(), value); } + if ((encoded_bits & 1) != 0) + { + p1 = payload.data[offset_in_b128]; + value.x |= bitfieldExtract(p0.x, lane_index, 1) << (bit_offset + 0); + value.x |= bitfieldExtract(p0.y, lane_index, 1) << (bit_offset + 1); + value.y |= bitfieldExtract(p0.z, lane_index, 1) << (bit_offset + 0); + value.y |= bitfieldExtract(p0.w, lane_index, 1) << (bit_offset + 1); + value.z |= bitfieldExtract(p1.x, lane_index, 1) << (bit_offset + 0); + value.z |= bitfieldExtract(p1.y, lane_index, 1) << (bit_offset + 1); + } + } -#define MESHLET_DECODE_STREAM_64(meshlet_index, stream_index, report_cb) { \ - uvec2 value = meshlet_decode_stream_64_wg256(meshlet_index, stream_index); \ - report_cb(meshlet_get_linear_index(), value); } + value += base_value; + return i16vec3(value); +} #endif diff --git a/tests/meshopt_sandbox.cpp b/tests/meshopt_sandbox.cpp index 61418a9f..28ac8eed 100644 --- a/tests/meshopt_sandbox.cpp +++ b/tests/meshopt_sandbox.cpp @@ -159,28 +159,28 @@ static void decode_mesh(std::vector &out_index_buffer, } } -#if 0 static void decode_mesh_gpu( Vulkan::Device &dev, - std::vector &out_index_buffer, std::vector &out_u32_stream, + std::vector &out_index_buffer, std::vector &out_pos_buffer, const MeshView &mesh) { - decode_mesh_setup_buffers(out_index_buffer, out_u32_stream, mesh); + out_index_buffer.resize(mesh.total_primitives); + out_pos_buffer.resize(mesh.total_vertices); Vulkan::BufferCreateInfo buf_info = {}; buf_info.domain = Vulkan::BufferDomain::LinkedDeviceHost; buf_info.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; - buf_info.size = mesh.format_header->payload_size_words * sizeof(uint32_t); + buf_info.size = mesh.format_header->payload_size_b128 * sizeof(PayloadB128); auto payload_buffer = dev.create_buffer(buf_info, mesh.payload); - buf_info.size = out_index_buffer.size() * sizeof(uint32_t); + buf_info.size = out_index_buffer.size() * sizeof(uvec3); buf_info.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; buf_info.domain = Vulkan::BufferDomain::CachedHost; auto readback_decoded_index_buffer = dev.create_buffer(buf_info); - buf_info.size = out_u32_stream.size() * sizeof(uint32_t); + buf_info.size = out_pos_buffer.size() * sizeof(vec3); buf_info.domain = Vulkan::BufferDomain::CachedHost; - auto readback_decoded_u32_buffer = dev.create_buffer(buf_info); + auto readback_decoded_pos_buffer = dev.create_buffer(buf_info); bool has_renderdoc = Vulkan::Device::init_renderdoc_capture(); if (has_renderdoc) @@ -190,10 +190,10 @@ static void decode_mesh_gpu( DecodeInfo info = {}; info.ibo = readback_decoded_index_buffer.get(); - info.streams[0] = readback_decoded_u32_buffer.get(); + info.streams[0] = readback_decoded_pos_buffer.get(); info.target_style = mesh.format_header->style; info.payload = payload_buffer.get(); - info.flags = DECODE_MODE_RAW_PAYLOAD; + info.flags = DECODE_MODE_UNROLLED_MESH; decode_mesh(*cmd, info, mesh); cmd->barrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT, @@ -206,59 +206,12 @@ static void decode_mesh_gpu( memcpy(out_index_buffer.data(), dev.map_host_buffer(*readback_decoded_index_buffer, Vulkan::MEMORY_ACCESS_READ_BIT), - out_index_buffer.size() * sizeof(uint32_t)); - - memcpy(out_u32_stream.data(), - dev.map_host_buffer(*readback_decoded_u32_buffer, Vulkan::MEMORY_ACCESS_READ_BIT), - out_u32_stream.size() * sizeof(uint32_t)); -} - -static bool validate_mesh_decode(const std::vector &decoded_index_buffer, - const std::vector &decoded_u32_stream, - const std::vector &reference_index_buffer, - const std::vector &reference_u32_stream, unsigned u32_stride) -{ - std::vector decoded_output; - std::vector reference_output; - - if (decoded_index_buffer.size() != reference_index_buffer.size()) - return false; - - size_t count = decoded_index_buffer.size(); - - decoded_output.reserve(count * u32_stride); - reference_output.reserve(count * u32_stride); - for (size_t i = 0; i < count; i++) - { - uint32_t decoded_index = decoded_index_buffer[i]; - decoded_output.insert(decoded_output.end(), - decoded_u32_stream.data() + decoded_index * u32_stride, - decoded_u32_stream.data() + (decoded_index + 1) * u32_stride); - - uint32_t reference_index = reference_index_buffer[i]; - reference_output.insert(reference_output.end(), - reference_u32_stream.data() + reference_index * u32_stride, - reference_u32_stream.data() + (reference_index + 1) * u32_stride); - } + out_index_buffer.size() * sizeof(uvec3)); - for (size_t i = 0; i < count; i++) - { - for (unsigned j = 0; j < u32_stride; j++) - { - uint32_t decoded_value = decoded_output[i * u32_stride + j]; - uint32_t reference_value = reference_output[i * u32_stride + j]; - if (decoded_value != reference_value) - { - LOGI("Error in index %zu (prim %zu), word %u, expected %x, got %x.\n", - i, i / 3, j, reference_value, decoded_value); - return false; - } - } - } - - return true; + memcpy(out_pos_buffer.data(), + dev.map_host_buffer(*readback_decoded_pos_buffer, Vulkan::MEMORY_ACCESS_READ_BIT), + out_pos_buffer.size() * sizeof(vec3)); } -#endif static void build_reference_mesh(std::vector &indices, std::vector &positions) { @@ -362,14 +315,6 @@ int main(int argc, char *argv[]) auto view = create_mesh_view(*mapped); decode_mesh(decoded_index_buffer, decoded_positions, view); - if (!validate_mesh(reference_indices, reference_positions, - decoded_index_buffer, decoded_positions)) - return EXIT_FAILURE; - - return 0; -#if 0 - GLTF::Parser parser(argv[1]); - Vulkan::Context ctx; Vulkan::Device dev; if (!Vulkan::Context::init_loader(nullptr)) @@ -381,6 +326,24 @@ int main(int argc, char *argv[]) if (!ctx.init_instance_and_device(nullptr, 0, nullptr, 0)) return EXIT_FAILURE; dev.set_context(ctx); + + std::vector gpu_index_buffer; + std::vector gpu_positions; + decode_mesh_gpu(dev, gpu_index_buffer, gpu_positions, view); + + if (!validate_mesh(reference_indices, reference_positions, + decoded_index_buffer, decoded_positions)) + return EXIT_FAILURE; + + if (!validate_mesh(reference_indices, reference_positions, + gpu_index_buffer, gpu_positions)) + return EXIT_FAILURE; + + return 0; +#if 0 + GLTF::Parser parser(argv[1]); + + dev.init_frame_contexts(4); auto mesh = parser.get_meshes().front(); diff --git a/vulkan/mesh/meshlet.cpp b/vulkan/mesh/meshlet.cpp index aa10aee8..26e0812c 100644 --- a/vulkan/mesh/meshlet.cpp +++ b/vulkan/mesh/meshlet.cpp @@ -118,22 +118,14 @@ bool decode_mesh(CommandBuffer &cmd, const DecodeInfo &info, const MeshView &vie buf_info.size = view.format_header->meshlet_count * view.format_header->stream_count * sizeof(*view.streams); auto meshlet_stream_buffer = cmd.get_device().create_buffer(buf_info, view.streams); - // For Raw mode -> offset/stride - // For typed mode -> index offset / vertex offset - struct DecodeOffset { uint32_t arg0, arg1; }; - std::vector decode_offsets; + std::vector decode_offsets; - bool supports_wave32 = cmd.get_device().supports_subgroup_size_log2(true, 5, 5, VK_SHADER_STAGE_COMPUTE_BIT); - bool meshlet_runtime = (info.flags & DECODE_MODE_RAW_PAYLOAD) == 0 && info.runtime_style == RuntimeStyle::Meshlet; + bool meshlet_runtime = info.runtime_style == RuntimeStyle::Meshlet; cmd.set_program("builtin://shaders/decode/meshlet_decode.comp", - {{"MESHLET_PAYLOAD_WAVE32", int(supports_wave32) }, - {"MESHLET_PAYLOAD_RUNTIME_MESH", int(meshlet_runtime)}}); + {{"MESHLET_PAYLOAD_RUNTIME_MESH", int(meshlet_runtime)}}); cmd.enable_subgroup_size_control(true); - if (supports_wave32) - cmd.set_subgroup_size_log2(true, 5, 5, VK_SHADER_STAGE_COMPUTE_BIT); - else - cmd.set_subgroup_size_log2(true, 5, 7, VK_SHADER_STAGE_COMPUTE_BIT); + cmd.set_subgroup_size_log2(true, 5, 7); cmd.set_storage_buffer(0, 0, *meshlet_meta_buffer); cmd.set_storage_buffer(0, 1, *meshlet_stream_buffer); @@ -142,92 +134,42 @@ bool decode_mesh(CommandBuffer &cmd, const DecodeInfo &info, const MeshView &vie cmd.set_specialization_constant_mask(0x7); cmd.set_specialization_constant(0, view.format_header->stream_count); - cmd.set_specialization_constant(2, (info.flags & DECODE_MODE_RAW_PAYLOAD) != 0); + cmd.set_specialization_constant(1, (info.flags & DECODE_MODE_UNROLLED_MESH) != 0); + cmd.set_specialization_constant(2, uint32_t(info.target_style)); - if ((info.flags & DECODE_MODE_RAW_PAYLOAD) != 0) + for (unsigned i = 0; i < 3; i++) + cmd.set_storage_buffer(0, 4 + i, *info.streams[0]); + + switch (info.target_style) { - uint32_t output_u32_streams; - switch (info.target_style) - { - case MeshStyle::Wireframe: - output_u32_streams = 2; - break; - - case MeshStyle::Textured: - output_u32_streams = 6; - break; - - case MeshStyle::Skinned: - output_u32_streams = 8; - break; - - default: - return false; - } - - if (output_u32_streams + 1 > view.format_header->stream_count) - { - LOGE("Trying to decode more streams than exist in payload.\n"); - return false; - } - - for (unsigned i = 0; i < 3; i++) - cmd.set_storage_buffer(0, 4 + i, *info.streams[0]); - - decode_offsets.reserve(view.format_header->meshlet_count * (output_u32_streams + 1)); - uint32_t index_count = 0; - - for (uint32_t i = 0; i < view.format_header->meshlet_count; i++) - { - decode_offsets.push_back({ index_count, 0 }); - index_count += view.streams[i * view.format_header->stream_count].u.offsets[NumChunks].prim_offset; - for (uint32_t j = 0; j < output_u32_streams; j++) - decode_offsets.push_back({ view.headers[i].base_vertex_offset * output_u32_streams + j, output_u32_streams }); - } - - cmd.set_specialization_constant(1, output_u32_streams + 1); - - // Dummy bind for indirect_buffer. - cmd.set_storage_buffer(0, 8, *info.streams[0]); + case MeshStyle::Skinned: + cmd.set_storage_buffer(0, 6, *info.streams[2]); + // Fallthrough + case MeshStyle::Textured: + cmd.set_storage_buffer(0, 5, *info.streams[1]); + // Fallthrough + case MeshStyle::Wireframe: + cmd.set_storage_buffer(0, 4, *info.streams[0]); + break; + + default: + return false; } - else + + decode_offsets.reserve(view.format_header->meshlet_count); + uint32_t index_count = 0; + for (uint32_t i = 0; i < view.format_header->meshlet_count; i++) { - for (unsigned i = 0; i < 3; i++) - cmd.set_storage_buffer(0, 4 + i, *info.streams[0]); - - switch (info.target_style) - { - case MeshStyle::Skinned: - cmd.set_storage_buffer(0, 6, *info.streams[2]); - // Fallthrough - case MeshStyle::Textured: - cmd.set_storage_buffer(0, 5, *info.streams[1]); - // Fallthrough - case MeshStyle::Wireframe: - cmd.set_storage_buffer(0, 4, *info.streams[0]); - break; - - default: - return false; - } - - decode_offsets.reserve(view.format_header->meshlet_count); - uint32_t index_count = 0; - for (uint32_t i = 0; i < view.format_header->meshlet_count; i++) - { - decode_offsets.push_back({ index_count, view.headers[i].base_vertex_offset }); - index_count += view.streams[i * view.format_header->stream_count].u.offsets[NumChunks].prim_offset; - } - cmd.set_specialization_constant(1, uint32_t(info.target_style)); - - cmd.set_storage_buffer(0, 8, *info.indirect); + decode_offsets.push_back(index_count); + index_count += view.streams[i * view.format_header->stream_count].u.offsets[NumChunks].prim_offset; } buf_info.domain = BufferDomain::LinkedDeviceHost; - buf_info.size = decode_offsets.size() * sizeof(DecodeOffset); + buf_info.size = decode_offsets.size() * sizeof(uint32_t); auto output_offset_strides_buffer = cmd.get_device().create_buffer(buf_info, decode_offsets.data()); cmd.set_storage_buffer(0, 7, *output_offset_strides_buffer); + cmd.set_storage_buffer(0, 8, info.indirect ? *info.indirect : *info.streams[0]); // TODO: Split dispatches for big chungus meshes. // (Starts to become a problem around 8-16 million primitives per dispatch). diff --git a/vulkan/mesh/meshlet.hpp b/vulkan/mesh/meshlet.hpp index a2b47de3..67cb5055 100644 --- a/vulkan/mesh/meshlet.hpp +++ b/vulkan/mesh/meshlet.hpp @@ -69,8 +69,7 @@ struct Header struct RuntimeHeader { uint32_t stream_offset; - uint16_t num_primitives; - uint16_t num_attributes; + uint32_t num_chunks; }; struct RuntimeHeaderDecoded @@ -135,7 +134,7 @@ MeshView create_mesh_view(const Granite::FileMapping &mapping); enum DecodeModeFlagBits : uint32_t { - DECODE_MODE_RAW_PAYLOAD = 1 << 0, + DECODE_MODE_UNROLLED_MESH = 1 << 0, }; using DecodeModeFlags = uint32_t; From aca0dbfc663dd9714f951dba150e3832a3570d5b Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Thu, 21 Dec 2023 14:33:00 +0100 Subject: [PATCH 30/59] Test compute path. --- assets/shaders/inc/meshlet_payload_decode.h | 2 +- tests/meshopt_sandbox.cpp | 38 +++++++++++++-------- 2 files changed, 25 insertions(+), 15 deletions(-) diff --git a/assets/shaders/inc/meshlet_payload_decode.h b/assets/shaders/inc/meshlet_payload_decode.h index b2af6435..662902f1 100644 --- a/assets/shaders/inc/meshlet_payload_decode.h +++ b/assets/shaders/inc/meshlet_payload_decode.h @@ -166,7 +166,7 @@ i16vec3 meshlet_decode_snorm_scaled_i16x3(uint stream_index, uint chunk_index, i // Scalar math. if (chunk_index != 0) { - uint prev_bit_mask = bitfieldExtract(bit_plane_config, 0, int((chunk_index - 1) * 4)); + uint prev_bit_mask = bitfieldExtract(bit_plane_config, 0, int(chunk_index) * 4); offset_in_b128 += bitCount(prev_bit_mask & 0x88888888) * 12; offset_in_b128 += bitCount(prev_bit_mask & 0x44444444) * 6; offset_in_b128 += bitCount(prev_bit_mask & 0x22222222) * 3; diff --git a/tests/meshopt_sandbox.cpp b/tests/meshopt_sandbox.cpp index 28ac8eed..4e5bb53d 100644 --- a/tests/meshopt_sandbox.cpp +++ b/tests/meshopt_sandbox.cpp @@ -217,6 +217,7 @@ static void build_reference_mesh(std::vector &indices, std::vector { for (unsigned i = 0; i < 256; i++) { +#if 1 vec3 p; p.x = -40.0f + float(i); p.y = float(i); @@ -224,6 +225,9 @@ static void build_reference_mesh(std::vector &indices, std::vector if (i == 8) p.y = 20000.0f; +#else + vec3 p = vec3(-40.0f + float(i)); +#endif positions.push_back(p); } @@ -234,7 +238,8 @@ static void build_reference_mesh(std::vector &indices, std::vector static bool validate_mesh(std::vector &reference_indices, std::vector &reference_positions, std::vector &decoded_indices, - std::vector &decoded_positions) + std::vector &decoded_positions, + bool need_sorting) { if (reference_indices.size() != decoded_indices.size()) { @@ -242,17 +247,22 @@ static bool validate_mesh(std::vector &reference_indices, return false; } - std::sort(reference_indices.begin(), reference_indices.end(), [&](const uvec3 &a, const uvec3 &b) { - float za = reference_positions[a.z].z; - float zb = reference_positions[b.z].z; - return za < zb; - }); + if (need_sorting) + { + std::sort(reference_indices.begin(), reference_indices.end(), [&](const uvec3 &a, const uvec3 &b) + { + float za = reference_positions[a.z].z; + float zb = reference_positions[b.z].z; + return za < zb; + }); - std::sort(decoded_indices.begin(), decoded_indices.end(), [&](const uvec3 &a, const uvec3 &b) { - float za = decoded_positions[a.z].z; - float zb = decoded_positions[b.z].z; - return za < zb; - }); + std::sort(decoded_indices.begin(), decoded_indices.end(), [&](const uvec3 &a, const uvec3 &b) + { + float za = decoded_positions[a.z].z; + float zb = decoded_positions[b.z].z; + return za < zb; + }); + } for (size_t i = 0, n = decoded_indices.size(); i < n; i++) { @@ -331,12 +341,12 @@ int main(int argc, char *argv[]) std::vector gpu_positions; decode_mesh_gpu(dev, gpu_index_buffer, gpu_positions, view); - if (!validate_mesh(reference_indices, reference_positions, - decoded_index_buffer, decoded_positions)) + if (!validate_mesh(decoded_index_buffer, decoded_positions, + gpu_index_buffer, gpu_positions, false)) return EXIT_FAILURE; if (!validate_mesh(reference_indices, reference_positions, - gpu_index_buffer, gpu_positions)) + decoded_index_buffer, decoded_positions, true)) return EXIT_FAILURE; return 0; From ce15cbc50eb3cefd2f6f89743bbe20d366f98a5f Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Thu, 21 Dec 2023 14:40:41 +0100 Subject: [PATCH 31/59] Mesh decode path is working again. --- assets/shaders/decode/meshlet_decode.comp | 47 ++++++++++++----------- tests/meshlet_viewer.cpp | 2 +- vulkan/mesh/meshlet.cpp | 6 +-- 3 files changed, 29 insertions(+), 26 deletions(-) diff --git a/assets/shaders/decode/meshlet_decode.comp b/assets/shaders/decode/meshlet_decode.comp index eee4405d..37f99885 100644 --- a/assets/shaders/decode/meshlet_decode.comp +++ b/assets/shaders/decode/meshlet_decode.comp @@ -9,6 +9,7 @@ layout(local_size_x = 256) in; layout(constant_id = 0) const uint NUM_U32_STREAMS = MESHLET_PAYLOAD_MAX_STREAMS; layout(constant_id = 1) const bool UNROLLED_MESH = false; layout(constant_id = 2) const uint TARGET_MESH_STYLE = 0; +layout(constant_id = 3) const bool RUNTIME_MESH = false; #define MESHLET_PAYLOAD_DESCRIPTOR_SET 0 #define MESHLET_PAYLOAD_META_BINDING 0 @@ -58,7 +59,6 @@ layout(set = 0, binding = 7, std430) readonly buffer OutputOffsets uint data[]; } primitive_output_offsets; -#if MESHLET_PAYLOAD_RUNTIME_MESH struct IndirectDrawMesh { uint primitive_offset; @@ -67,11 +67,11 @@ struct IndirectDrawMesh uint num_attributes; }; -layout(set = 0, binding = 8, std430) writeonly buffer IndirectCommands +layout(set = 0, binding = 8, std430) writeonly buffer IndirectCommandsMesh { IndirectDrawMesh draws[]; } indirect_commands_mesh; -#else + struct IndirectIndexedDraw { uint indexCount; @@ -81,11 +81,10 @@ struct IndirectIndexedDraw uint firstInstance; }; -layout(set = 0, binding = 8, std430) writeonly buffer IndirectCommands +layout(set = 0, binding = 8, std430) writeonly buffer IndirectCommandsMDI { IndirectIndexedDraw draws[]; -} indirect_commands; -#endif +} indirect_commands_mdi; layout(push_constant, std430) uniform Registers { @@ -127,22 +126,26 @@ void main() if (!UNROLLED_MESH && gl_LocalInvocationIndex == 0) { MeshletInfo info = meshlet_get_meshlet_info(meshlet_index * NUM_U32_STREAMS); -#if MESHLET_PAYLOAD_RUNTIME_MESH - IndirectDrawMesh draw; - draw.primitive_offset = primitive_output_offset; - draw.vertex_offset = meta.base_vertex_offset + registers.vertex_offset; - draw.num_primitives = info.num_primitives; - draw.num_attributes = info.num_attributes; - indirect_commands_mesh.draws[meshlet_index + registers.meshlet_offset] = draw; -#else - IndirectIndexedDraw draw; - draw.indexCount = 3 * info.primitive_count; - draw.instanceCount = 1; - draw.vertexOffset = meta.base_vertex_offset + registers.vertex_offset; - draw.firstIndex = 3 * primitive_output_offset; - draw.firstInstance = 0; - indirect_commands.draws[meshlet_index + registers.meshlet_offset] = draw; -#endif + + if (RUNTIME_MESH) + { + IndirectDrawMesh draw; + draw.primitive_offset = primitive_output_offset; + draw.vertex_offset = meta.base_vertex_offset + registers.vertex_offset; + draw.num_primitives = info.primitive_count; + draw.num_attributes = info.vertex_count; + indirect_commands_mesh.draws[meshlet_index + registers.meshlet_offset] = draw; + } + else + { + IndirectIndexedDraw draw; + draw.indexCount = 3 * info.primitive_count; + draw.instanceCount = 1; + draw.vertexOffset = meta.base_vertex_offset + registers.vertex_offset; + draw.firstIndex = 3 * primitive_output_offset; + draw.firstInstance = 0; + indirect_commands_mdi.draws[meshlet_index + registers.meshlet_offset] = draw; + } } if (chunk_index >= meta.num_chunks) diff --git a/tests/meshlet_viewer.cpp b/tests/meshlet_viewer.cpp index 6a43ed6f..14084acd 100644 --- a/tests/meshlet_viewer.cpp +++ b/tests/meshlet_viewer.cpp @@ -172,7 +172,7 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V auto &scene_nodes = parser.get_scenes()[parser.get_default_scene()]; auto root = scene.create_node(); -#if 1 +#if 0 for (int z = -10; z <= 10; z++) for (int y = -10; y <= 10; y++) for (int x = -10; x <= 10; x++) diff --git a/vulkan/mesh/meshlet.cpp b/vulkan/mesh/meshlet.cpp index 26e0812c..306f38ae 100644 --- a/vulkan/mesh/meshlet.cpp +++ b/vulkan/mesh/meshlet.cpp @@ -121,8 +121,7 @@ bool decode_mesh(CommandBuffer &cmd, const DecodeInfo &info, const MeshView &vie std::vector decode_offsets; bool meshlet_runtime = info.runtime_style == RuntimeStyle::Meshlet; - cmd.set_program("builtin://shaders/decode/meshlet_decode.comp", - {{"MESHLET_PAYLOAD_RUNTIME_MESH", int(meshlet_runtime)}}); + cmd.set_program("builtin://shaders/decode/meshlet_decode.comp"); cmd.enable_subgroup_size_control(true); cmd.set_subgroup_size_log2(true, 5, 7); @@ -132,10 +131,11 @@ bool decode_mesh(CommandBuffer &cmd, const DecodeInfo &info, const MeshView &vie cmd.set_storage_buffer(0, 2, *info.payload); cmd.set_storage_buffer(0, 3, *info.ibo); - cmd.set_specialization_constant_mask(0x7); + cmd.set_specialization_constant_mask(0xf); cmd.set_specialization_constant(0, view.format_header->stream_count); cmd.set_specialization_constant(1, (info.flags & DECODE_MODE_UNROLLED_MESH) != 0); cmd.set_specialization_constant(2, uint32_t(info.target_style)); + cmd.set_specialization_constant(3, uint32_t(meshlet_runtime)); for (unsigned i = 0; i < 3; i++) cmd.set_storage_buffer(0, 4 + i, *info.streams[0]); From e3a3300d77943ccdee077f62fa305829c99937ed Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Thu, 21 Dec 2023 15:10:49 +0100 Subject: [PATCH 32/59] Prepare to support encoded meshlets again. --- tests/meshlet_viewer.cpp | 48 ++++++++-- vulkan/managers/resource_manager.cpp | 133 ++++++++++++++------------- vulkan/managers/resource_manager.hpp | 15 +-- 3 files changed, 116 insertions(+), 80 deletions(-) diff --git a/tests/meshlet_viewer.cpp b/tests/meshlet_viewer.cpp index 14084acd..d67d257d 100644 --- a/tests/meshlet_viewer.cpp +++ b/tests/meshlet_viewer.cpp @@ -371,8 +371,8 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V } push; push.camera_pos = render_context.get_render_parameters().camera_position; - const bool use_meshlets = manager.get_mesh_encoding() == Vulkan::ResourceManager::MeshEncoding::Meshlet; - const bool use_preculling = !use_meshlets || true; + const bool use_meshlets = manager.get_mesh_encoding() != Vulkan::ResourceManager::MeshEncoding::VBOAndIBOMDI; + const bool use_preculling = manager.get_mesh_encoding() != Vulkan::ResourceManager::MeshEncoding::MeshletEncoded; if (use_preculling) { @@ -382,17 +382,16 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V else info.size = max_draws * sizeof(VkDrawIndexedIndirectCommand) + 256; - info.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | - VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT | - VK_BUFFER_USAGE_TRANSFER_SRC_BIT | - VK_BUFFER_USAGE_TRANSFER_DST_BIT; + info.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT | + VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; info.domain = BufferDomain::Device; indirect_draws = device.create_buffer(info); if (use_meshlets) { cmd->fill_buffer(*indirect_draws, 0, 0, 4); - cmd->fill_buffer(*indirect_draws, 1, 4, 8); + cmd->fill_buffer(*indirect_draws, 1, 4, 4); + cmd->fill_buffer(*indirect_draws, 1, 8, 4); } else { @@ -448,7 +447,7 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V auto *pos = manager.get_position_buffer(); auto *attr = manager.get_attribute_buffer(); - if (use_meshlets) + if (use_meshlets && !use_preculling) { cmd->begin_render_pass(device.get_swapchain_render_pass(SwapchainRenderPass::Depth)); camera.set_aspect(cmd->get_viewport().width / cmd->get_viewport().height); @@ -456,6 +455,39 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V cmd->set_opaque_state(); *cmd->allocate_typed_constant_data(1, 0, 1) = render_context.get_render_parameters().view_projection; + + memcpy(cmd->allocate_typed_constant_data(1, 1, 6), render_context.get_visibility_frustum().get_planes(), + 6 * sizeof(vec4)); + + *cmd->allocate_typed_constant_data(1, 2, 1) = + float(1 << 8 /* shader assumes 8 */) * + vec4(cmd->get_viewport().x + 0.5f * cmd->get_viewport().width - 0.5f, + cmd->get_viewport().y + 0.5f * cmd->get_viewport().height - 0.5f, + 0.5f * cmd->get_viewport().width, + 0.5f * cmd->get_viewport().height) - vec4(1.0f, 1.0f, 0.0f, 0.0f); + + cmd->set_program("assets://shaders/meshlet_debug.task", "assets://shaders/meshlet_debug.task", + "assets://shaders/meshlet_debug.mesh.frag"); + + cmd->set_storage_buffer(0, 0, *ibo); + cmd->set_storage_buffer(0, 1, *pos); + cmd->set_storage_buffer(0, 2, *attr); + cmd->set_storage_buffer(0, 3, *indirect_draws); + cmd->set_storage_buffer(0, 4, *compacted_params); + cmd->set_storage_buffer(0, 5, *cached_transform_buffer); + GRANITE_MATERIAL_MANAGER()->set_bindless(*cmd, 2); + cmd->draw_mesh_tasks_indirect(*indirect_draws, 0, 1, sizeof(VkDrawMeshTasksIndirectCommandEXT)); + cmd->end_render_pass(); + } + else if (use_meshlets && use_preculling) + { + cmd->begin_render_pass(device.get_swapchain_render_pass(SwapchainRenderPass::Depth)); + camera.set_aspect(cmd->get_viewport().width / cmd->get_viewport().height); + render_context.set_camera(camera); + cmd->set_opaque_state(); + + *cmd->allocate_typed_constant_data(1, 0, 1) = render_context.get_render_parameters().view_projection; + memcpy(cmd->allocate_typed_constant_data(1, 1, 6), render_context.get_visibility_frustum().get_planes(), 6 * sizeof(vec4)); diff --git a/vulkan/managers/resource_manager.cpp b/vulkan/managers/resource_manager.cpp index 740b48ce..fb9811ab 100644 --- a/vulkan/managers/resource_manager.cpp +++ b/vulkan/managers/resource_manager.cpp @@ -39,9 +39,9 @@ ResourceManager::ResourceManager(Device *device_) , index_buffer_allocator(*device_, 256, 17) , attribute_buffer_allocator(*device_, 256, 17) , indirect_buffer_allocator(*device_, 32, 15) - //, mesh_header_allocator(*device_, 32, 15) - //, mesh_stream_allocator(*device_, 8, 17) - //, mesh_payload_allocator(*device_, 128, 17) + , mesh_header_allocator(*device_, 32, 15) + , mesh_stream_allocator(*device_, 8, 17) + , mesh_payload_allocator(*device_, 32, 17) { // Simplified style. index_buffer_allocator.set_element_size(0, 3); // 8-bit indices. @@ -51,9 +51,9 @@ ResourceManager::ResourceManager(Device *device_) attribute_buffer_allocator.set_element_size(2, sizeof(uint32_t) * 2); indirect_buffer_allocator.set_element_size(0, sizeof(VkDrawIndexedIndirectCommand)); - //mesh_header_allocator.set_element_size(0, sizeof(Meshlet::RuntimeHeader)); - //mesh_stream_allocator.set_element_size(0, sizeof(Meshlet::Stream)); - //mesh_payload_allocator.set_element_size(0, sizeof(uint32_t)); + mesh_header_allocator.set_element_size(0, sizeof(Meshlet::RuntimeHeader)); + mesh_stream_allocator.set_element_size(0, sizeof(Meshlet::Stream)); + mesh_payload_allocator.set_element_size(0, sizeof(Meshlet::PayloadB128)); assets.reserve(Granite::AssetID::MaxIDs); } @@ -178,20 +178,36 @@ void ResourceManager::init() if (device->get_device_features().mesh_shader_features.taskShader && device->get_device_features().mesh_shader_features.meshShader) { - mesh_encoding = MeshEncoding::Meshlet; + mesh_encoding = MeshEncoding::MeshletDecoded; LOGI("Opting in to meshlet path.\n"); - indirect_buffer_allocator.set_element_size(0, sizeof(Meshlet::RuntimeHeaderDecoded)); } - indirect_buffer_allocator.set_soa_count(2); - indirect_buffer_allocator.set_element_size(1, sizeof(Meshlet::Bound)); + if (mesh_encoding == MeshEncoding::MeshletDecoded) + indirect_buffer_allocator.set_element_size(0, sizeof(Meshlet::RuntimeHeaderDecoded)); - opaque.usage = VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; - index_buffer_allocator.prime(&opaque); - opaque.usage = VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; - attribute_buffer_allocator.prime(&opaque); - opaque.usage = VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; - indirect_buffer_allocator.prime(&opaque); + if (mesh_encoding == MeshEncoding::MeshletEncoded) + { + mesh_header_allocator.set_soa_count(2); + mesh_header_allocator.set_element_size(1, sizeof(Meshlet::Bound)); + + opaque.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; + mesh_header_allocator.prime(&opaque); + mesh_stream_allocator.prime(&opaque); + mesh_payload_allocator.prime(&opaque); + } + else + { + indirect_buffer_allocator.set_soa_count(2); + indirect_buffer_allocator.set_element_size(1, sizeof(Meshlet::Bound)); + + opaque.usage = VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; + index_buffer_allocator.prime(&opaque); + opaque.usage = VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; + attribute_buffer_allocator.prime(&opaque); + opaque.usage = VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | + VK_BUFFER_USAGE_TRANSFER_DST_BIT; + indirect_buffer_allocator.prime(&opaque); + } } ImageHandle ResourceManager::create_gtx(const MemoryMappedTexture &mapped_file, Granite::AssetID id) @@ -351,17 +367,7 @@ bool ResourceManager::allocate_asset_mesh(Granite::AssetID id, const Meshlet::Me bool ret = true; - //if (mesh_encoding == MeshEncoding::VBOAndIBOMDI) - { - if (ret) - ret = index_buffer_allocator.allocate(view.total_primitives, &asset.mesh.index_or_payload); - if (ret) - ret = attribute_buffer_allocator.allocate(view.total_vertices, &asset.mesh.attr_or_stream); - if (ret) - ret = indirect_buffer_allocator.allocate(view.format_header->meshlet_count, &asset.mesh.indirect_or_header); - } -#if 0 - else + if (mesh_encoding == MeshEncoding::MeshletEncoded) { if (ret) ret = mesh_header_allocator.allocate(view.format_header->meshlet_count, &asset.mesh.indirect_or_header); @@ -369,14 +375,22 @@ bool ResourceManager::allocate_asset_mesh(Granite::AssetID id, const Meshlet::Me if (ret) { ret = mesh_stream_allocator.allocate( - view.format_header->meshlet_count * view.format_header->u32_stream_count, + view.format_header->meshlet_count * view.format_header->stream_count, &asset.mesh.attr_or_stream); } if (ret) - ret = mesh_payload_allocator.allocate(view.format_header->payload_size_words, &asset.mesh.index_or_payload); + ret = mesh_payload_allocator.allocate(view.format_header->payload_size_b128, &asset.mesh.index_or_payload); + } + else + { + if (ret) + ret = index_buffer_allocator.allocate(view.total_primitives, &asset.mesh.index_or_payload); + if (ret) + ret = attribute_buffer_allocator.allocate(view.total_vertices, &asset.mesh.attr_or_stream); + if (ret) + ret = indirect_buffer_allocator.allocate(view.format_header->meshlet_count, &asset.mesh.indirect_or_header); } -#endif asset.mesh.draw = { asset.mesh.indirect_or_header.offset, @@ -385,20 +399,18 @@ bool ResourceManager::allocate_asset_mesh(Granite::AssetID id, const Meshlet::Me if (!ret) { - //if (mesh_encoding == MeshEncoding::VBOAndIBOMDI) - { - index_buffer_allocator.free(asset.mesh.index_or_payload); - attribute_buffer_allocator.free(asset.mesh.attr_or_stream); - indirect_buffer_allocator.free(asset.mesh.indirect_or_header); - } -#if 0 - else + if (mesh_encoding == MeshEncoding::MeshletEncoded) { mesh_payload_allocator.free(asset.mesh.index_or_payload); mesh_stream_allocator.free(asset.mesh.attr_or_stream); mesh_header_allocator.free(asset.mesh.indirect_or_header); } -#endif + else + { + index_buffer_allocator.free(asset.mesh.index_or_payload); + attribute_buffer_allocator.free(asset.mesh.attr_or_stream); + indirect_buffer_allocator.free(asset.mesh.indirect_or_header); + } asset.mesh = {}; } @@ -429,15 +441,14 @@ void ResourceManager::instantiate_asset_mesh(Granite::AssetManager &manager_, if (ret) { -#if 0 - if (mesh_encoding == MeshEncoding::Meshlet) + if (mesh_encoding == MeshEncoding::MeshletEncoded) { auto cmd = device->request_command_buffer(CommandBuffer::Type::AsyncTransfer); void *payload_data = cmd->update_buffer(*mesh_payload_allocator.get_buffer(0, 0), - asset.mesh.index_or_payload.offset * sizeof(uint32_t), - view.format_header->payload_size_words * sizeof(uint32_t)); - memcpy(payload_data, view.payload, view.format_header->payload_size_words * sizeof(uint32_t)); + asset.mesh.index_or_payload.offset * sizeof(Meshlet::PayloadB128), + view.format_header->payload_size_b128 * sizeof(Meshlet::PayloadB128)); + memcpy(payload_data, view.payload, view.format_header->payload_size_b128 * sizeof(Meshlet::PayloadB128)); auto *headers = static_cast( cmd->update_buffer(*mesh_header_allocator.get_buffer(0, 0), @@ -446,9 +457,8 @@ void ResourceManager::instantiate_asset_mesh(Granite::AssetManager &manager_, for (uint32_t i = 0, n = view.format_header->meshlet_count; i < n; i++) { - headers[i].stream_offset = asset.mesh.attr_or_stream.offset + i * view.format_header->u32_stream_count; - headers[i].num_attributes = view.headers[i].num_attributes; - headers[i].num_primitives = view.headers[i].num_primitives; + headers[i].stream_offset = asset.mesh.attr_or_stream.offset + i * view.format_header->stream_count; + headers[i].num_chunks = view.headers[i].num_chunks; } auto *bounds = static_cast( @@ -460,13 +470,13 @@ void ResourceManager::instantiate_asset_mesh(Granite::AssetManager &manager_, auto *streams = static_cast( cmd->update_buffer(*mesh_stream_allocator.get_buffer(0, 0), asset.mesh.attr_or_stream.offset * sizeof(Meshlet::Stream), - view.format_header->meshlet_count * view.format_header->u32_stream_count * + view.format_header->meshlet_count * view.format_header->stream_count * sizeof(Meshlet::Stream))); - for (uint32_t i = 0, n = view.format_header->meshlet_count * view.format_header->u32_stream_count; i < n; i++) + for (uint32_t i = 0, n = view.format_header->meshlet_count * view.format_header->stream_count; i < n; i++) { auto in_stream = view.streams[i]; - in_stream.offset_from_base_u32 += asset.mesh.index_or_payload.offset; + in_stream.offset_in_b128 += asset.mesh.index_or_payload.offset; streams[i] = in_stream; } @@ -480,7 +490,6 @@ void ResourceManager::instantiate_asset_mesh(Granite::AssetManager &manager_, VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, false); } else -#endif { auto cmd = device->request_command_buffer(CommandBuffer::Type::AsyncCompute); @@ -504,7 +513,7 @@ void ResourceManager::instantiate_asset_mesh(Granite::AssetManager &manager_, info.push.primitive_offset = asset.mesh.index_or_payload.offset; info.push.vertex_offset = asset.mesh.attr_or_stream.offset; - info.runtime_style = mesh_encoding == MeshEncoding::Meshlet ? + info.runtime_style = mesh_encoding == MeshEncoding::MeshletDecoded ? Meshlet::RuntimeStyle::Meshlet : Meshlet::RuntimeStyle::MDI; auto *bounds = static_cast( @@ -529,16 +538,14 @@ void ResourceManager::instantiate_asset_mesh(Granite::AssetManager &manager_, uint64_t cost = 0; if (ret) { -#if 0 - if (mesh_encoding == MeshEncoding::Meshlet) + if (mesh_encoding == MeshEncoding::MeshletEncoded) { - cost += view.format_header->payload_size_words * mesh_payload_allocator.get_element_size(0); + cost += view.format_header->payload_size_b128 * mesh_payload_allocator.get_element_size(0); cost += view.format_header->meshlet_count * mesh_header_allocator.get_element_size(0); cost += view.format_header->meshlet_count * mesh_header_allocator.get_element_size(1); - cost += view.format_header->meshlet_count * view.format_header->u32_stream_count * mesh_stream_allocator.get_element_size(0); + cost += view.format_header->meshlet_count * view.format_header->stream_count * mesh_stream_allocator.get_element_size(0); } else -#endif { cost += view.total_primitives * index_buffer_allocator.get_element_size(0); cost += view.total_vertices * attribute_buffer_allocator.get_element_size(0); @@ -626,15 +633,13 @@ void ResourceManager::latch_handles() { { std::lock_guard holder_alloc{mesh_allocator_lock}; -#if 0 - if (mesh_encoding == MeshEncoding::Meshlet) + if (mesh_encoding == MeshEncoding::MeshletEncoded) { mesh_payload_allocator.free(asset.mesh.index_or_payload); mesh_stream_allocator.free(asset.mesh.attr_or_stream); mesh_header_allocator.free(asset.mesh.indirect_or_header); } else -#endif { index_buffer_allocator.free(asset.mesh.index_or_payload); attribute_buffer_allocator.free(asset.mesh.attr_or_stream); @@ -693,7 +698,6 @@ const Buffer *ResourceManager::get_indirect_buffer() const return indirect_buffer_allocator.get_buffer(0, 0); } -#if 0 const Buffer *ResourceManager::get_meshlet_payload_buffer() const { return mesh_payload_allocator.get_buffer(0, 0); @@ -708,13 +712,12 @@ const Buffer *ResourceManager::get_meshlet_stream_header_buffer() const { return mesh_stream_allocator.get_buffer(0, 0); } -#endif const Buffer *ResourceManager::get_cluster_bounds_buffer() const { - //if (mesh_encoding == MeshEncoding::Meshlet) - // return mesh_header_allocator.get_buffer(0, 1); - //else + if (mesh_encoding == MeshEncoding::MeshletEncoded) + return mesh_header_allocator.get_buffer(0, 1); + else return indirect_buffer_allocator.get_buffer(0, 1); } diff --git a/vulkan/managers/resource_manager.hpp b/vulkan/managers/resource_manager.hpp index f246d287..c482aeb3 100644 --- a/vulkan/managers/resource_manager.hpp +++ b/vulkan/managers/resource_manager.hpp @@ -84,7 +84,8 @@ class ResourceManager final : private Granite::AssetInstantiatorInterface enum class MeshEncoding { - Meshlet, + MeshletEncoded, + MeshletDecoded, VBOAndIBOMDI, }; @@ -125,9 +126,9 @@ class ResourceManager final : private Granite::AssetInstantiatorInterface const Buffer *get_skinning_buffer() const; const Buffer *get_indirect_buffer() const; - //const Buffer *get_meshlet_payload_buffer() const; - //const Buffer *get_meshlet_header_buffer() const; - //const Buffer *get_meshlet_stream_header_buffer() const; + const Buffer *get_meshlet_payload_buffer() const; + const Buffer *get_meshlet_header_buffer() const; + const Buffer *get_meshlet_stream_header_buffer() const; const Buffer *get_cluster_bounds_buffer() const; @@ -181,9 +182,9 @@ class ResourceManager final : private Granite::AssetInstantiatorInterface MeshBufferAllocator index_buffer_allocator; MeshBufferAllocator attribute_buffer_allocator; MeshBufferAllocator indirect_buffer_allocator; - //MeshBufferAllocator mesh_header_allocator; - //MeshBufferAllocator mesh_stream_allocator; - //MeshBufferAllocator mesh_payload_allocator; + MeshBufferAllocator mesh_header_allocator; + MeshBufferAllocator mesh_stream_allocator; + MeshBufferAllocator mesh_payload_allocator; MeshEncoding mesh_encoding = MeshEncoding::VBOAndIBOMDI; From 9670244349d4ee2667159ef0f61c4cf0bbc633b1 Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Fri, 22 Dec 2023 11:11:59 +0100 Subject: [PATCH 33/59] Get rid of any compile time defines for meshlet_cull.comp. --- assets/shaders/inc/meshlet_render_types.h | 7 --- tests/assets/shaders/meshlet_cull.comp | 58 ++++++----------------- tests/meshlet_viewer.cpp | 32 ++++++++----- vulkan/managers/resource_manager.cpp | 2 +- 4 files changed, 37 insertions(+), 62 deletions(-) diff --git a/assets/shaders/inc/meshlet_render_types.h b/assets/shaders/inc/meshlet_render_types.h index d2e785a6..6d0db5d2 100644 --- a/assets/shaders/inc/meshlet_render_types.h +++ b/assets/shaders/inc/meshlet_render_types.h @@ -35,11 +35,4 @@ struct IndirectDrawMesh uint num_attributes; }; -#if defined(MESHLET_RENDER_DRAW_WORDS) && MESHLET_RENDER_DRAW_WORDS -struct MeshletDrawCommand -{ - uint payload[MESHLET_RENDER_DRAW_WORDS]; -}; #endif - -#endif \ No newline at end of file diff --git a/tests/assets/shaders/meshlet_cull.comp b/tests/assets/shaders/meshlet_cull.comp index 1b4a5d85..378bc0f1 100644 --- a/tests/assets/shaders/meshlet_cull.comp +++ b/tests/assets/shaders/meshlet_cull.comp @@ -1,10 +1,5 @@ #version 450 #extension GL_EXT_scalar_block_layout : require -#if MESHLET_PAYLOAD_WAVE32 -#extension GL_KHR_shader_subgroup_ballot : require -#extension GL_KHR_shader_subgroup_vote : require -#extension GL_KHR_shader_subgroup_shuffle : require -#endif layout(local_size_x = 32) in; @@ -16,25 +11,19 @@ layout(local_size_x = 32) in; #define MESHLET_RENDER_TASKS_BINDING 2 #include "meshlet_render.h" -#if defined(MESHLET_RENDER_DRAW_WORDS) && MESHLET_RENDER_DRAW_WORDS +layout(constant_id = 0) const int MESHLET_RENDER_DRAW_WORDS = 0; +layout(constant_id = 1) const int MESHLET_APPEND_ATOMIC_INDEX = 0; + layout(set = 0, binding = 3, std430) readonly buffer InputDraws { - MeshletDrawCommand data[]; + uint draws[]; } input_draws; layout(set = 0, binding = 4, std430) writeonly buffer OutputDraws { - uint count; - uint padding[256 / 4 - 1]; - MeshletDrawCommand data[]; -} output_draws; -#else -layout(set = 0, binding = 4, std430) writeonly buffer OutputDraws -{ - uint count; - uint y, z; + uint count[3]; + layout(offset = 256) uint draws[]; } output_draws; -#endif layout(set = 0, binding = 5, std430) writeonly buffer CompactedDraws { @@ -47,7 +36,6 @@ layout(push_constant, std430) uniform Registers uint count; } registers; -#if !MESHLET_PAYLOAD_WAVE32 shared uint ballot_value; shared uint global_offset; @@ -74,12 +62,6 @@ uint ballotExclusiveBitCount(uvec4 v) return bitCount(v.x & mask); } #define local_invocation_id gl_LocalInvocationIndex -#else -#define ballot(v) subgroupBallot(v) -#define ballotBitCount(v) subgroupBallotBitCount(v) -#define ballotExclusiveBitCount(v) subgroupBallotExclusiveBitCount(v) -#define local_invocation_id gl_SubgroupInvocationID -#endif void main() { @@ -102,16 +84,10 @@ void main() int lane = findLSB(b); b &= ~(1u << lane); -#if MESHLET_PAYLOAD_WAVE32 - uint node_instance = subgroupShuffle(task.node_instance, lane); - uint node_count_material_index = subgroupShuffle(task.node_count_material_index, lane); - uint mesh_index_count = subgroupShuffle(task.mesh_index_count, lane); -#else TaskInfo tmp_task = task_info.data[gl_WorkGroupID.x * gl_WorkGroupSize.x + lane]; uint node_instance = tmp_task.node_instance; uint node_count_material_index = tmp_task.node_count_material_index; uint mesh_index_count = tmp_task.mesh_index_count; -#endif uint offset = mesh_index_count & ~31u; uint count = bitfieldExtract(mesh_index_count, 0, 5) + 1; @@ -130,24 +106,20 @@ void main() uint draw_count = ballotBitCount(ballot); uint local_offset = ballotExclusiveBitCount(ballot); -#if MESHLET_PAYLOAD_WAVE32 - uint global_offset; - if (subgroupElect()) - global_offset = atomicAdd(output_draws.count, draw_count); - global_offset = subgroupBroadcastFirst(global_offset); -#else // WAR barrier is implied here in earlier ballot. if (gl_LocalInvocationIndex == 0) - global_offset = atomicAdd(output_draws.count, draw_count); + global_offset = atomicAdd(output_draws.count[MESHLET_APPEND_ATOMIC_INDEX], draw_count); barrier(); -#endif if (alloc_draw) { -#if defined(MESHLET_RENDER_DRAW_WORDS) && MESHLET_RENDER_DRAW_WORDS - output_draws.data[global_offset + local_offset] = input_draws.data[meshlet_index]; -#endif - output_draw_info.data[global_offset + local_offset] = CompactedDrawInfo(meshlet_index, node_instance, node_count_material_index); + uint dst_offset = MESHLET_RENDER_DRAW_WORDS * (global_offset + local_offset); + uint src_offset = meshlet_index * MESHLET_RENDER_DRAW_WORDS; + for (int i = 0; i < MESHLET_RENDER_DRAW_WORDS; i++) + output_draws.draws[dst_offset + i] = input_draws.draws[src_offset + i]; + + output_draw_info.data[global_offset + local_offset] = + CompactedDrawInfo(meshlet_index, node_instance, node_count_material_index); } } -} \ No newline at end of file +} diff --git a/tests/meshlet_viewer.cpp b/tests/meshlet_viewer.cpp index d67d257d..bb8be1e0 100644 --- a/tests/meshlet_viewer.cpp +++ b/tests/meshlet_viewer.cpp @@ -374,6 +374,12 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V const bool use_meshlets = manager.get_mesh_encoding() != Vulkan::ResourceManager::MeshEncoding::VBOAndIBOMDI; const bool use_preculling = manager.get_mesh_encoding() != Vulkan::ResourceManager::MeshEncoding::MeshletEncoded; + uint32_t target_meshlet_workgroup_size = + max(32u, device.get_device_features().mesh_shader_properties.maxPreferredMeshWorkGroupInvocations); + target_meshlet_workgroup_size = min(256u, target_meshlet_workgroup_size); + target_meshlet_workgroup_size = 1u << Util::floor_log2(target_meshlet_workgroup_size); + uint32_t num_chunk_workgroups = 256u / target_meshlet_workgroup_size; + if (use_preculling) { BufferCreateInfo info; @@ -389,8 +395,16 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V if (use_meshlets) { - cmd->fill_buffer(*indirect_draws, 0, 0, 4); - cmd->fill_buffer(*indirect_draws, 1, 4, 4); + if (num_chunk_workgroups == 1) + { + cmd->fill_buffer(*indirect_draws, 0, 0, 4); + cmd->fill_buffer(*indirect_draws, 1, 4, 4); + } + else + { + cmd->fill_buffer(*indirect_draws, num_chunk_workgroups, 0, 4); + cmd->fill_buffer(*indirect_draws, 0, 4, 4); + } cmd->fill_buffer(*indirect_draws, 1, 8, 4); } else @@ -407,20 +421,16 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V if (use_preculling) { auto *indirect = manager.get_indirect_buffer(); - bool supports_wave32 = device.supports_subgroup_size_log2(true, 5, 5, VK_SHADER_STAGE_COMPUTE_BIT); - - cmd->enable_subgroup_size_control(true); - if (supports_wave32) - cmd->set_subgroup_size_log2(true, 5, 5, VK_SHADER_STAGE_COMPUTE_BIT); - else - cmd->set_subgroup_size_log2(true, 5, 7, VK_SHADER_STAGE_COMPUTE_BIT); auto command_words = (use_meshlets ? sizeof(Vulkan::Meshlet::RuntimeHeaderDecoded) : sizeof(VkDrawIndexedIndirectCommand)) / sizeof(uint32_t); - cmd->set_program("assets://shaders/meshlet_cull.comp", - {{"MESHLET_RENDER_DRAW_WORDS", int(command_words)}}); + cmd->set_specialization_constant_mask(3); + cmd->set_specialization_constant(0, uint32_t(command_words)); + cmd->set_specialization_constant(1, (!use_meshlets || num_chunk_workgroups == 1) ? 0 : 1); + + cmd->set_program("assets://shaders/meshlet_cull.comp"); cmd->set_storage_buffer(0, 0, *aabb_buffer); cmd->set_storage_buffer(0, 1, *cached_transform_buffer); cmd->set_storage_buffer(0, 2, *task_buffer); diff --git a/vulkan/managers/resource_manager.cpp b/vulkan/managers/resource_manager.cpp index fb9811ab..3c34e50a 100644 --- a/vulkan/managers/resource_manager.cpp +++ b/vulkan/managers/resource_manager.cpp @@ -175,7 +175,7 @@ void ResourceManager::init() opaque.domain = BufferDomain::Device; opaque.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; - if (device->get_device_features().mesh_shader_features.taskShader && + if (false && device->get_device_features().mesh_shader_features.taskShader && device->get_device_features().mesh_shader_features.meshShader) { mesh_encoding = MeshEncoding::MeshletDecoded; From 49d38c095510a5b9827b0e6b23a71dd76d9d9f6f Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Fri, 22 Dec 2023 12:56:27 +0100 Subject: [PATCH 34/59] Get MeshletDecode path working. --- assets/shaders/decode/meshlet_decode.comp | 34 ++- assets/shaders/inc/meshlet_render_types.h | 2 - tests/assets/shaders/meshlet_debug_plain.mesh | 261 ++++++++---------- tests/meshlet_viewer.cpp | 4 +- vulkan/managers/resource_manager.cpp | 28 +- vulkan/mesh/meshlet.cpp | 12 +- vulkan/mesh/meshlet.hpp | 2 - 7 files changed, 175 insertions(+), 168 deletions(-) diff --git a/assets/shaders/decode/meshlet_decode.comp b/assets/shaders/decode/meshlet_decode.comp index 37f99885..0ce23779 100644 --- a/assets/shaders/decode/meshlet_decode.comp +++ b/assets/shaders/decode/meshlet_decode.comp @@ -63,8 +63,6 @@ struct IndirectDrawMesh { uint primitive_offset; uint vertex_offset; - uint num_primitives; - uint num_attributes; }; layout(set = 0, binding = 8, std430) writeonly buffer IndirectCommandsMesh @@ -131,9 +129,8 @@ void main() { IndirectDrawMesh draw; draw.primitive_offset = primitive_output_offset; - draw.vertex_offset = meta.base_vertex_offset + registers.vertex_offset; - draw.num_primitives = info.primitive_count; - draw.num_attributes = info.vertex_count; + // Unrolled. Always allocate full 256 entries. + draw.vertex_offset = gl_WorkGroupSize.x * meshlet_index + registers.vertex_offset; indirect_commands_mesh.draws[meshlet_index + registers.meshlet_offset] = draw; } else @@ -148,38 +145,49 @@ void main() } } - if (chunk_index >= meta.num_chunks) - return; - MeshletChunkInfo chunk_info = meshlet_get_chunk_info(meshlet_index * NUM_U32_STREAMS, chunk_index); // Index - if (lane_index < chunk_info.primitive_count) + if (chunk_index < meta.num_chunks && lane_index < chunk_info.primitive_count) { uint decoded_index_buffer = meshlet_decode_index_buffer(meshlet_index * NUM_U32_STREAMS, chunk_index, lane_index); - primitive_output_offset += chunk_info.primitive_offset; + if (RUNTIME_MESH) + primitive_output_offset += 32u * chunk_index; + else + primitive_output_offset += chunk_info.primitive_offset; uvec3 indices; indices.x = bitfieldExtract(decoded_index_buffer, 0, 8); indices.y = bitfieldExtract(decoded_index_buffer, 8, 8); indices.z = bitfieldExtract(decoded_index_buffer, 16, 8); - indices += chunk_info.vertex_offset; + if (!RUNTIME_MESH) + indices += chunk_info.vertex_offset; if (UNROLLED_MESH) output_indices32.data[primitive_output_offset + lane_index] = indices + meta.base_vertex_offset + registers.vertex_offset; else output_indices8.data[primitive_output_offset + lane_index] = u8vec3(indices); } + else if (RUNTIME_MESH) + { + output_indices8.data[primitive_output_offset + 32u * chunk_index + lane_index] = u8vec3(0); + } // Attributes - if (lane_index < chunk_info.vertex_count) + if (chunk_index < meta.num_chunks && lane_index < chunk_info.vertex_count) { int exponent; i16vec3 pos = meshlet_decode_snorm_scaled_i16x3(meshlet_index * NUM_U32_STREAMS + 1, chunk_index, lane_index, exponent); vec3 fp_pos = ldexp(vec3(pos), ivec3(exponent)); - uint vertex_output_offset = registers.vertex_offset + meta.base_vertex_offset + chunk_info.vertex_offset; + uint vertex_output_offset; + if (RUNTIME_MESH) + vertex_output_offset = gl_WorkGroupSize.x * meshlet_index + 32u * chunk_index + registers.vertex_offset; + else + vertex_output_offset = registers.vertex_offset + meta.base_vertex_offset + chunk_info.vertex_offset; output_stream_pos.data[vertex_output_offset + lane_index] = fp_pos; } + else if (RUNTIME_MESH) + output_stream_pos.data[gl_WorkGroupSize.x * meshlet_index + 32u * chunk_index + registers.vertex_offset + lane_index] = vec3(intBitsToFloat(-1)); } diff --git a/assets/shaders/inc/meshlet_render_types.h b/assets/shaders/inc/meshlet_render_types.h index 6d0db5d2..06db9649 100644 --- a/assets/shaders/inc/meshlet_render_types.h +++ b/assets/shaders/inc/meshlet_render_types.h @@ -31,8 +31,6 @@ struct IndirectDrawMesh { uint primitive_offset; uint vertex_offset; - uint num_primitives; - uint num_attributes; }; #endif diff --git a/tests/assets/shaders/meshlet_debug_plain.mesh b/tests/assets/shaders/meshlet_debug_plain.mesh index 390e7ccd..62849ac0 100644 --- a/tests/assets/shaders/meshlet_debug_plain.mesh +++ b/tests/assets/shaders/meshlet_debug_plain.mesh @@ -3,11 +3,13 @@ #extension GL_EXT_shader_explicit_arithmetic_types_int16 : require #extension GL_EXT_shader_explicit_arithmetic_types_int8 : require #extension GL_EXT_scalar_block_layout : require -#extension GL_KHR_shader_subgroup_arithmetic : require -#extension GL_KHR_shader_subgroup_basic : require -layout(max_primitives = 256, max_vertices = 255, triangles) out; -layout(local_size_x = 64, local_size_y = 4) in; +#ifndef MESHLET_SIZE +#error "Must define MESHLET_SIZE" +#endif + +layout(max_primitives = MESHLET_SIZE, max_vertices = MESHLET_SIZE, triangles) out; +layout(local_size_x = 32, local_size_y_id = 0) in; #include "meshlet_render_types.h" @@ -49,15 +51,12 @@ layout(set = 0, binding = 5, std430) readonly buffer Transforms mat4 data[]; } transforms; -#define MESHLET_CULL 1 - -#if MESHLET_CULL -shared vec2 shared_window_positions[255]; -shared uint8_t shared_clip_code[255]; -shared uint shared_active_vert[8]; -shared uint shared_active_prim[8]; -shared uint shared_active_vert_count[8]; -shared uint shared_active_prim_count[8]; +shared vec2 shared_window_positions[MESHLET_SIZE]; +shared uint8_t shared_clip_code[MESHLET_SIZE]; +shared uint shared_active_vert[MESHLET_SIZE / 32]; +shared uint shared_active_prim[MESHLET_SIZE / 32]; +shared uint shared_active_vert_count[MESHLET_SIZE / 32]; +shared uint shared_active_prim_count[MESHLET_SIZE / 32]; shared uint shared_active_vert_count_total; shared uint shared_active_prim_count_total; @@ -71,45 +70,43 @@ const uint CLIP_CODE_PLANES = uint(-1) & ~CLIP_CODE_INACCURATE; uint compacted_vertex_output(uint index) { - return shared_active_vert_count[index / 32u] + bitCount(shared_active_vert[index / 32u] & ((1u << (index & 31u)) - 1u)); + return shared_active_vert_count[index / 32u] + bitCount(shared_active_vert[index / 32u] & ((1u << (index & 31u)) - 1u)); } uint compacted_index_output(uint index) { - return shared_active_prim_count[index / 32u] + bitCount(shared_active_prim[index / 32u] & ((1u << (index & 31u)) - 1u)); + return shared_active_prim_count[index / 32u] + bitCount(shared_active_prim[index / 32u] & ((1u << (index & 31u)) - 1u)); } bool lane_has_active_vert(uint index) { - return (shared_active_vert[index / 32u] & (1u << (index & 31u))) != 0u; + return (shared_active_vert[index / 32u] & (1u << (index & 31u))) != 0u; } uvec3 remap_index_buffer(uvec3 prim) { - return uvec3(compacted_vertex_output(prim.x), - compacted_vertex_output(prim.y), - compacted_vertex_output(prim.z)); + return uvec3(compacted_vertex_output(prim.x), compacted_vertex_output(prim.y), compacted_vertex_output(prim.z)); } bool cull_triangle(vec2 a, vec2 b, vec2 c) { - // To be completely accurate, this should be done in fixed point, - // but we can YOLO a bit since glitches in extreme edge cases are considered okay. - precise vec2 ab = b - a; - precise vec2 ac = c - a; + // To be completely accurate, this should be done in fixed point, + // but we can YOLO a bit since glitches in extreme edge cases are considered okay. + precise vec2 ab = b - a; + precise vec2 ac = c - a; - // This is 100% accurate as long as the primitive is no larger than ~4k subpixels, i.e. 16x16 pixels. - // Normally, we'd be able to do GEQ test, but GE test is conservative, even with FP error in play. - precise float pos_area = ab.y * ac.x; - precise float neg_area = ab.x * ac.y; + // This is 100% accurate as long as the primitive is no larger than ~4k subpixels, i.e. 16x16 pixels. + // Normally, we'd be able to do GEQ test, but GE test is conservative, even with FP error in play. + precise float pos_area = ab.y * ac.x; + precise float neg_area = ab.x * ac.y; // If the pos value is (-2^24, +2^24), the FP math is exact, if not, we have to be conservative. // Less-than check is there to ensure that 1.0 delta in neg_area *will* resolve to a different value. bool active_primitive; - if (abs(pos_area) < 16777216.0) - active_primitive = pos_area > neg_area; + if (abs(pos_area) < 16777216.0) + active_primitive = pos_area > neg_area; else - active_primitive = pos_area >= neg_area; + active_primitive = pos_area >= neg_area; if (active_primitive) { @@ -117,151 +114,129 @@ bool cull_triangle(vec2 a, vec2 b, vec2 c) vec2 lo = floor(ldexp(min(min(a, b), c), ivec2(-8))); vec2 hi = floor(ldexp(max(max(a, b), c), ivec2(-8))); active_primitive = all(notEqual(lo, hi)); - } + } - return active_primitive; + return active_primitive; } -#endif void main() { -#if MESHLET_CULL - if (gl_LocalInvocationIndex < 8) + uint linear_index = gl_LocalInvocationIndex; + + if (linear_index < MESHLET_SIZE / 32) { - shared_active_vert[gl_LocalInvocationIndex] = 0; - shared_active_prim[gl_LocalInvocationIndex] = 0; + shared_active_vert[linear_index] = 0; + shared_active_prim[linear_index] = 0; } + +#if MESHLET_SIZE != 256 + uint meshlet_index = gl_WorkGroupID.y; + uint base_chunk_index = gl_WorkGroupID.x * (MESHLET_SIZE / 32); +#else + uint meshlet_index = gl_WorkGroupID.x; + uint base_chunk_index = 0u; #endif - IndirectDrawMesh meshlet = indirect_commands_mesh.draws[gl_WorkGroupID.x]; - CompactedDrawInfo task = mesh_payload[gl_WorkGroupID.x]; - uint linear_index = gl_LocalInvocationIndex; + IndirectDrawMesh meshlet = indirect_commands_mesh.draws[meshlet_index]; + CompactedDrawInfo task = mesh_payload[meshlet_index]; mat4 M = transforms.data[task.node_offset]; -#if MESHLET_CULL - vec3 world_pos; - vec4 clip_pos; - uvec3 prim; - - if (linear_index < meshlet.num_attributes) - { - vec3 pos = pos.data[meshlet.vertex_offset + linear_index]; - world_pos = (M * vec4(pos, 1.0)).xyz; - clip_pos = VP * vec4(world_pos, 1.0); - - vec2 c = clip_pos.xy / clip_pos.w; - - uint clip_code = clip_pos.w <= 0.0 ? CLIP_CODE_NEGATIVE_W : 0; - if (any(greaterThan(abs(c), vec2(4.0)))) - clip_code |= CLIP_CODE_INACCURATE; - if (c.x <= -1.0) - clip_code |= CLIP_CODE_NEGATIVE_X; - if (c.y <= -1.0) - clip_code |= CLIP_CODE_NEGATIVE_Y; - if (c.x >= 1.0) - clip_code |= CLIP_CODE_POSITIVE_X; - if (c.y >= 1.0) - clip_code |= CLIP_CODE_POSITIVE_Y; - - vec2 window = roundEven(c * viewport.zw + viewport.xy); - shared_window_positions[linear_index] = window; - shared_clip_code[linear_index] = uint8_t(clip_code); - } + // Transform positions. + vec3 pos = pos.data[meshlet.vertex_offset + linear_index + 32u * base_chunk_index]; + vec3 world_pos = (M * vec4(pos, 1.0)).xyz; + vec4 clip_pos = VP * vec4(world_pos, 1.0); + + vec2 c = clip_pos.xy / clip_pos.w; + + uint clip_code = clip_pos.w <= 0.0 ? CLIP_CODE_NEGATIVE_W : 0; + if (any(greaterThan(abs(c), vec2(4.0)))) + clip_code |= CLIP_CODE_INACCURATE; + if (c.x <= -1.0) + clip_code |= CLIP_CODE_NEGATIVE_X; + if (c.y <= -1.0) + clip_code |= CLIP_CODE_NEGATIVE_Y; + if (c.x >= 1.0) + clip_code |= CLIP_CODE_POSITIVE_X; + if (c.y >= 1.0) + clip_code |= CLIP_CODE_POSITIVE_Y; + + vec2 window = roundEven(c * viewport.zw + viewport.xy); + shared_window_positions[linear_index] = window; + shared_clip_code[linear_index] = uint8_t(clip_code); barrier(); - bool is_active_prim = false; - if (linear_index < meshlet.num_primitives) - { - prim = uvec3(ibo.data[meshlet.primitive_offset + linear_index]); - uint code_a = shared_clip_code[prim.x]; - uint code_b = shared_clip_code[prim.y]; - uint code_c = shared_clip_code[prim.z]; + uvec3 prim = uvec3(ibo.data[meshlet.primitive_offset + linear_index + 32u * base_chunk_index]); + prim += 32u * gl_LocalInvocationID.y; - uint or_code = code_a | code_b | code_c; - uint and_code = code_a & code_b & code_c; + uint code_a = shared_clip_code[prim.x]; + uint code_b = shared_clip_code[prim.y]; + uint code_c = shared_clip_code[prim.z]; + + uint or_code = code_a | code_b | code_c; + uint and_code = code_a & code_b & code_c; + + bool culled_planes = (and_code & CLIP_CODE_PLANES) != 0; + + bool is_active_prim = false; + if (!culled_planes) + { + bool force_accept = (or_code & (CLIP_CODE_INACCURATE | CLIP_CODE_NEGATIVE_W)) != 0; - bool culled_planes = (and_code & CLIP_CODE_PLANES) != 0; + if (!force_accept) + { + vec2 a = shared_window_positions[prim.x]; + vec2 b = shared_window_positions[prim.y]; + vec2 c = shared_window_positions[prim.z]; + force_accept = cull_triangle(a, b, c); + } - if (!culled_planes) + if (force_accept) { - bool force_accept = (or_code & (CLIP_CODE_INACCURATE | CLIP_CODE_NEGATIVE_W)) != 0; - - if (!force_accept) - { - vec2 a = shared_window_positions[prim.x]; - vec2 b = shared_window_positions[prim.y]; - vec2 c = shared_window_positions[prim.z]; - force_accept = cull_triangle(a, b, c); - } - - if (force_accept) - { - is_active_prim = true; - atomicOr(shared_active_prim[linear_index / 32], 1u << (linear_index & 31)); - atomicOr(shared_active_vert[prim.x / 32], 1u << (prim.x & 31)); - atomicOr(shared_active_vert[prim.y / 32], 1u << (prim.y & 31)); - atomicOr(shared_active_vert[prim.z / 32], 1u << (prim.z & 31)); - } + is_active_prim = true; + atomicOr(shared_active_prim[linear_index / 32], 1u << (linear_index & 31)); + atomicOr(shared_active_vert[prim.x / 32], 1u << (prim.x & 31)); + atomicOr(shared_active_vert[prim.y / 32], 1u << (prim.y & 31)); + atomicOr(shared_active_vert[prim.z / 32], 1u << (prim.z & 31)); } } barrier(); - if (gl_SubgroupInvocationID < 8 && gl_SubgroupID == 0) + if (linear_index == 0) { - uint local_active_prim = bitCount(shared_active_prim[gl_SubgroupInvocationID]); - uint active_prim = subgroupInclusiveAdd(local_active_prim); - shared_active_prim_count[gl_SubgroupInvocationID] = active_prim - local_active_prim; - if (gl_SubgroupInvocationID == 7) - shared_active_prim_count_total = active_prim; - } - else if (gl_SubgroupInvocationID < 8 && gl_SubgroupID == 1) - { - uint local_active_vert = bitCount(shared_active_vert[gl_SubgroupInvocationID]); - uint active_vert = subgroupInclusiveAdd(local_active_vert); - shared_active_vert_count[gl_SubgroupInvocationID] = active_vert - local_active_vert; - if (gl_SubgroupInvocationID == 7) - shared_active_vert_count_total = active_vert; - } + uint active_prim = 0; + uint active_vert = 0; + for (uint i = 0; i < gl_WorkGroupSize.y; i++) + { + shared_active_prim_count[i] = active_prim; + shared_active_vert_count[i] = active_vert; + active_prim += bitCount(shared_active_prim[i]); + active_vert += bitCount(shared_active_vert[i]); + } + + shared_active_prim_count_total = active_prim; + shared_active_vert_count_total = active_vert; + } barrier(); - uint num_verts = shared_active_vert_count_total; - uint num_prims = shared_active_prim_count_total; + uint num_verts = shared_active_vert_count_total; + uint num_prims = shared_active_prim_count_total; SetMeshOutputsEXT(num_verts, num_prims); - if (is_active_prim) - gl_PrimitiveTriangleIndicesEXT[compacted_index_output(linear_index)] = remap_index_buffer(prim); - - if (gl_LocalInvocationIndex < num_prims) - vDrawID[gl_LocalInvocationIndex] = task.meshlet_index; - - bool has_active_vert = lane_has_active_vert(linear_index); - if (has_active_vert) - { - uint out_vert_index = compacted_vertex_output(linear_index); - gl_MeshVerticesEXT[out_vert_index].gl_Position = clip_pos; - vWorldPos[out_vert_index] = world_pos; - } -#else - SetMeshOutputsEXT(meshlet.num_attributes, meshlet.num_primitives); - if (linear_index < meshlet.num_attributes) - { - vec3 pos = pos.data[meshlet.vertex_offset + linear_index]; - vec3 world_pos = (M * vec4(pos, 1.0)).xyz; - vec4 clip_pos = VP * vec4(world_pos, 1.0); + if (is_active_prim) + gl_PrimitiveTriangleIndicesEXT[compacted_index_output(linear_index)] = remap_index_buffer(prim); - gl_MeshVerticesEXT[linear_index].gl_Position = clip_pos; - vWorldPos[linear_index] = world_pos; - } + if (linear_index < num_prims) + vDrawID[linear_index] = task.meshlet_index; - if (linear_index < meshlet.num_primitives) + bool has_active_vert = lane_has_active_vert(linear_index); + if (has_active_vert) { - uvec3 prim = uvec3(ibo.data[meshlet.primitive_offset + linear_index]); - gl_PrimitiveTriangleIndicesEXT[linear_index] = prim; - vDrawID[linear_index] = task.meshlet_index; + uint out_vert_index = compacted_vertex_output(linear_index); + gl_MeshVerticesEXT[out_vert_index].gl_Position = clip_pos; + vWorldPos[out_vert_index] = world_pos; } -#endif } diff --git a/tests/meshlet_viewer.cpp b/tests/meshlet_viewer.cpp index bb8be1e0..cb1e4b5b 100644 --- a/tests/meshlet_viewer.cpp +++ b/tests/meshlet_viewer.cpp @@ -508,8 +508,10 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V 0.5f * cmd->get_viewport().width, 0.5f * cmd->get_viewport().height) - vec4(1.0f, 1.0f, 0.0f, 0.0f); + cmd->set_specialization_constant_mask(1); + cmd->set_specialization_constant(0, target_meshlet_workgroup_size / 32); cmd->set_program("", "assets://shaders/meshlet_debug_plain.mesh", - "assets://shaders/meshlet_debug.mesh.frag"); + "assets://shaders/meshlet_debug.mesh.frag", {{ "MESHLET_SIZE", int(target_meshlet_workgroup_size) }}); cmd->set_storage_buffer(0, 0, *ibo); cmd->set_storage_buffer(0, 1, *pos); diff --git a/vulkan/managers/resource_manager.cpp b/vulkan/managers/resource_manager.cpp index 3c34e50a..1debda56 100644 --- a/vulkan/managers/resource_manager.cpp +++ b/vulkan/managers/resource_manager.cpp @@ -175,7 +175,7 @@ void ResourceManager::init() opaque.domain = BufferDomain::Device; opaque.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; - if (false && device->get_device_features().mesh_shader_features.taskShader && + if (device->get_device_features().mesh_shader_features.taskShader && device->get_device_features().mesh_shader_features.meshShader) { mesh_encoding = MeshEncoding::MeshletDecoded; @@ -384,10 +384,28 @@ bool ResourceManager::allocate_asset_mesh(Granite::AssetID id, const Meshlet::Me } else { - if (ret) - ret = index_buffer_allocator.allocate(view.total_primitives, &asset.mesh.index_or_payload); - if (ret) - ret = attribute_buffer_allocator.allocate(view.total_vertices, &asset.mesh.attr_or_stream); + if (mesh_encoding == MeshEncoding::MeshletDecoded) + { + // Need to lay out meshes in memory so that we can process individual chunks. + // Culling is expected, so primitive buffer will be filled with degenerate primitives as padding. + if (ret) + { + ret = index_buffer_allocator.allocate(view.format_header->meshlet_count * Meshlet::MaxElements, + &asset.mesh.index_or_payload); + } + if (ret) + { + ret = attribute_buffer_allocator.allocate(view.format_header->meshlet_count * Meshlet::MaxElements, + &asset.mesh.attr_or_stream); + } + } + else + { + if (ret) + ret = index_buffer_allocator.allocate(view.total_primitives, &asset.mesh.index_or_payload); + if (ret) + ret = attribute_buffer_allocator.allocate(view.total_vertices, &asset.mesh.attr_or_stream); + } if (ret) ret = indirect_buffer_allocator.allocate(view.format_header->meshlet_count, &asset.mesh.indirect_or_header); } diff --git a/vulkan/mesh/meshlet.cpp b/vulkan/mesh/meshlet.cpp index 306f38ae..0ad9580a 100644 --- a/vulkan/mesh/meshlet.cpp +++ b/vulkan/mesh/meshlet.cpp @@ -124,7 +124,10 @@ bool decode_mesh(CommandBuffer &cmd, const DecodeInfo &info, const MeshView &vie cmd.set_program("builtin://shaders/decode/meshlet_decode.comp"); cmd.enable_subgroup_size_control(true); - cmd.set_subgroup_size_log2(true, 5, 7); + if (cmd.get_device().supports_subgroup_size_log2(true, 5, 5)) + cmd.set_subgroup_size_log2(true, 5, 5); + else + cmd.set_subgroup_size_log2(true, 5, 7); cmd.set_storage_buffer(0, 0, *meshlet_meta_buffer); cmd.set_storage_buffer(0, 1, *meshlet_stream_buffer); @@ -161,7 +164,12 @@ bool decode_mesh(CommandBuffer &cmd, const DecodeInfo &info, const MeshView &vie for (uint32_t i = 0; i < view.format_header->meshlet_count; i++) { decode_offsets.push_back(index_count); - index_count += view.streams[i * view.format_header->stream_count].u.offsets[NumChunks].prim_offset; + + // Unroll all elements as-is. + if (meshlet_runtime) + index_count += MaxElements; + else + index_count += view.streams[i * view.format_header->stream_count].u.offsets[NumChunks].prim_offset; } buf_info.domain = BufferDomain::LinkedDeviceHost; diff --git a/vulkan/mesh/meshlet.hpp b/vulkan/mesh/meshlet.hpp index 67cb5055..11cc4935 100644 --- a/vulkan/mesh/meshlet.hpp +++ b/vulkan/mesh/meshlet.hpp @@ -76,8 +76,6 @@ struct RuntimeHeaderDecoded { uint32_t primitive_offset; uint32_t vertex_offset; - uint32_t num_primitives; - uint32_t num_attributes; }; struct Bound From 6237af76863da5263b6039e5a6a7381a5c02b101 Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Fri, 22 Dec 2023 13:39:54 +0100 Subject: [PATCH 35/59] Refactor out primitive culling code. --- assets/shaders/inc/meshlet_primitive_cull.h | 196 ++++++++++++++++++ tests/assets/shaders/meshlet_debug_plain.mesh | 174 +--------------- 2 files changed, 204 insertions(+), 166 deletions(-) create mode 100644 assets/shaders/inc/meshlet_primitive_cull.h diff --git a/assets/shaders/inc/meshlet_primitive_cull.h b/assets/shaders/inc/meshlet_primitive_cull.h new file mode 100644 index 00000000..e95ec1b3 --- /dev/null +++ b/assets/shaders/inc/meshlet_primitive_cull.h @@ -0,0 +1,196 @@ +#ifndef MESHLET_PRIMITIVE_CULL_H_ +#define MESHLET_PRIMITIVE_CULL_H_ + +#ifndef MESHLET_SIZE +#error "Must define MESHLET_SIZE" +#endif + +shared vec2 shared_window_positions[gl_WorkGroupSize.y][gl_WorkGroupSize.x]; +shared uint8_t shared_clip_code[gl_WorkGroupSize.y][gl_WorkGroupSize.x]; +shared uint shared_active_vert[gl_WorkGroupSize.y]; +shared uint shared_active_prim[gl_WorkGroupSize.y]; +shared uint shared_active_vert_count[gl_WorkGroupSize.y]; +shared uint shared_active_prim_count[gl_WorkGroupSize.y]; +shared uint shared_active_vert_count_total; +shared uint shared_active_prim_count_total; + +const uint CLIP_CODE_INACCURATE = 1 << 0; +const uint CLIP_CODE_NEGATIVE_W = 1 << 1; +const uint CLIP_CODE_NEGATIVE_X = 1 << 2; +const uint CLIP_CODE_NEGATIVE_Y = 1 << 3; +const uint CLIP_CODE_POSITIVE_X = 1 << 4; +const uint CLIP_CODE_POSITIVE_Y = 1 << 5; +const uint CLIP_CODE_PLANES = uint(-1) & ~CLIP_CODE_INACCURATE; + +uint compacted_vertex_output(uint index) +{ + return shared_active_vert_count[gl_LocalInvocationID.y] + + bitCount(bitfieldExtract(shared_active_vert[gl_LocalInvocationID.y], 0, int(index))); +} + +uint meshlet_compacted_vertex_output() +{ + return compacted_vertex_output(gl_LocalInvocationID.x); +} + +uint compacted_index_output() +{ + return shared_active_prim_count[gl_LocalInvocationID.y] + + bitCount(bitfieldExtract(shared_active_prim[gl_LocalInvocationID.y], 0, int(gl_LocalInvocationID.x))); +} + +bool meshlet_lane_has_active_vert() +{ + return (shared_active_vert[gl_LocalInvocationID.y] & (1u << gl_LocalInvocationID.x)) != 0u; +} + +uvec3 remap_index_buffer(uvec3 prim) +{ + return uvec3(compacted_vertex_output(prim.x), compacted_vertex_output(prim.y), compacted_vertex_output(prim.z)); +} + +bool cull_triangle(vec2 a, vec2 b, vec2 c) +{ + // To be completely accurate, this should be done in fixed point, + // but we can YOLO a bit since glitches in extreme edge cases are considered okay. + precise vec2 ab = b - a; + precise vec2 ac = c - a; + + // This is 100% accurate as long as the primitive is no larger than ~4k subpixels, i.e. 16x16 pixels. + // Normally, we'd be able to do GEQ test, but GE test is conservative, even with FP error in play. + precise float pos_area = ab.y * ac.x; + precise float neg_area = ab.x * ac.y; + + // If the pos value is (-2^24, +2^24), the FP math is exact, if not, we have to be conservative. + // Less-than check is there to ensure that 1.0 delta in neg_area *will* resolve to a different value. + bool active_primitive; + if (abs(pos_area) < 16777216.0) + active_primitive = pos_area > neg_area; + else + active_primitive = pos_area >= neg_area; + + if (active_primitive) + { + // Micropoly test. + const int SUBPIXEL_BITS = 8; + vec2 lo = floor(ldexp(min(min(a, b), c), ivec2(-SUBPIXEL_BITS))); + vec2 hi = floor(ldexp(max(max(a, b), c), ivec2(-SUBPIXEL_BITS))); + active_primitive = all(notEqual(lo, hi)); + } + + return active_primitive; +} + +void meshlet_init_shared() +{ + if (gl_LocalInvocationIndex < MESHLET_SIZE / 32) + { + shared_active_vert[gl_LocalInvocationIndex] = 0; + shared_active_prim[gl_LocalInvocationIndex] = 0; + } +} + +uint meshlet_get_meshlet_index() +{ +#if MESHLET_SIZE != 256 + return gl_WorkGroupID.y; +#else + return gl_WorkGroupID.x; +#endif +} + +uint meshlet_get_base_chunk_index() +{ +#if MESHLET_SIZE != 256 + return gl_WorkGroupID.x * (MESHLET_SIZE / 32); +#else + return 0; +#endif +} + +void meshlet_emit_clip_pos(vec4 clip_pos, vec4 viewport) +{ + meshlet_init_shared(); + vec2 c = clip_pos.xy / clip_pos.w; + + uint clip_code = clip_pos.w <= 0.0 ? CLIP_CODE_NEGATIVE_W : 0; + if (any(greaterThan(abs(c), vec2(4.0)))) + clip_code |= CLIP_CODE_INACCURATE; + if (c.x <= -1.0) + clip_code |= CLIP_CODE_NEGATIVE_X; + if (c.y <= -1.0) + clip_code |= CLIP_CODE_NEGATIVE_Y; + if (c.x >= 1.0) + clip_code |= CLIP_CODE_POSITIVE_X; + if (c.y >= 1.0) + clip_code |= CLIP_CODE_POSITIVE_Y; + + vec2 window = roundEven(c * viewport.zw + viewport.xy); + shared_window_positions[gl_LocalInvocationID.y][gl_LocalInvocationID.x] = window; + shared_clip_code[gl_LocalInvocationID.y][gl_LocalInvocationID.x] = uint8_t(clip_code); + + barrier(); +} + +void meshlet_emit_primitive(uvec3 prim) +{ + uint code_a = shared_clip_code[gl_LocalInvocationID.y][prim.x]; + uint code_b = shared_clip_code[gl_LocalInvocationID.y][prim.y]; + uint code_c = shared_clip_code[gl_LocalInvocationID.y][prim.z]; + + uint or_code = code_a | code_b | code_c; + uint and_code = code_a & code_b & code_c; + + bool culled_planes = (and_code & CLIP_CODE_PLANES) != 0; + bool is_active_prim = false; + + if (!culled_planes) + { + bool force_accept = (or_code & (CLIP_CODE_INACCURATE | CLIP_CODE_NEGATIVE_W)) != 0; + + if (!force_accept) + { + vec2 a = shared_window_positions[gl_LocalInvocationID.y][prim.x]; + vec2 b = shared_window_positions[gl_LocalInvocationID.y][prim.y]; + vec2 c = shared_window_positions[gl_LocalInvocationID.y][prim.z]; + force_accept = cull_triangle(a, b, c); + } + + if (force_accept) + { + is_active_prim = true; + atomicOr(shared_active_prim[gl_LocalInvocationID.y], 1u << gl_LocalInvocationID.x); + atomicOr(shared_active_vert[gl_LocalInvocationID.y], 1u << prim.x); + atomicOr(shared_active_vert[gl_LocalInvocationID.y], 1u << prim.y); + atomicOr(shared_active_vert[gl_LocalInvocationID.y], 1u << prim.z); + } + } + + barrier(); + + if (gl_LocalInvocationIndex == 0) + { + uint active_prim = 0; + uint active_vert = 0; + + for (uint i = 0; i < gl_WorkGroupSize.y; i++) + { + shared_active_prim_count[i] = active_prim; + shared_active_vert_count[i] = active_vert; + active_prim += bitCount(shared_active_prim[i]); + active_vert += bitCount(shared_active_vert[i]); + } + + shared_active_prim_count_total = active_prim; + shared_active_vert_count_total = active_vert; + } + + barrier(); + + SetMeshOutputsEXT(shared_active_vert_count_total, shared_active_prim_count_total); + + if (is_active_prim) + gl_PrimitiveTriangleIndicesEXT[compacted_index_output()] = remap_index_buffer(prim); +} + +#endif diff --git a/tests/assets/shaders/meshlet_debug_plain.mesh b/tests/assets/shaders/meshlet_debug_plain.mesh index 62849ac0..dd4a9dac 100644 --- a/tests/assets/shaders/meshlet_debug_plain.mesh +++ b/tests/assets/shaders/meshlet_debug_plain.mesh @@ -12,6 +12,7 @@ layout(max_primitives = MESHLET_SIZE, max_vertices = MESHLET_SIZE, triangles) ou layout(local_size_x = 32, local_size_y_id = 0) in; #include "meshlet_render_types.h" +#include "meshlet_primitive_cull.h" layout(location = 0) out vec3 vWorldPos[]; layout(location = 1) perprimitiveEXT out uint vDrawID[]; @@ -51,91 +52,11 @@ layout(set = 0, binding = 5, std430) readonly buffer Transforms mat4 data[]; } transforms; -shared vec2 shared_window_positions[MESHLET_SIZE]; -shared uint8_t shared_clip_code[MESHLET_SIZE]; -shared uint shared_active_vert[MESHLET_SIZE / 32]; -shared uint shared_active_prim[MESHLET_SIZE / 32]; -shared uint shared_active_vert_count[MESHLET_SIZE / 32]; -shared uint shared_active_prim_count[MESHLET_SIZE / 32]; -shared uint shared_active_vert_count_total; -shared uint shared_active_prim_count_total; - -const uint CLIP_CODE_INACCURATE = 1 << 0; -const uint CLIP_CODE_NEGATIVE_W = 1 << 1; -const uint CLIP_CODE_NEGATIVE_X = 1 << 2; -const uint CLIP_CODE_NEGATIVE_Y = 1 << 3; -const uint CLIP_CODE_POSITIVE_X = 1 << 4; -const uint CLIP_CODE_POSITIVE_Y = 1 << 5; -const uint CLIP_CODE_PLANES = uint(-1) & ~CLIP_CODE_INACCURATE; - -uint compacted_vertex_output(uint index) -{ - return shared_active_vert_count[index / 32u] + bitCount(shared_active_vert[index / 32u] & ((1u << (index & 31u)) - 1u)); -} - -uint compacted_index_output(uint index) -{ - return shared_active_prim_count[index / 32u] + bitCount(shared_active_prim[index / 32u] & ((1u << (index & 31u)) - 1u)); -} - -bool lane_has_active_vert(uint index) -{ - return (shared_active_vert[index / 32u] & (1u << (index & 31u))) != 0u; -} - -uvec3 remap_index_buffer(uvec3 prim) -{ - return uvec3(compacted_vertex_output(prim.x), compacted_vertex_output(prim.y), compacted_vertex_output(prim.z)); -} - -bool cull_triangle(vec2 a, vec2 b, vec2 c) -{ - // To be completely accurate, this should be done in fixed point, - // but we can YOLO a bit since glitches in extreme edge cases are considered okay. - precise vec2 ab = b - a; - precise vec2 ac = c - a; - - // This is 100% accurate as long as the primitive is no larger than ~4k subpixels, i.e. 16x16 pixels. - // Normally, we'd be able to do GEQ test, but GE test is conservative, even with FP error in play. - precise float pos_area = ab.y * ac.x; - precise float neg_area = ab.x * ac.y; - - // If the pos value is (-2^24, +2^24), the FP math is exact, if not, we have to be conservative. - // Less-than check is there to ensure that 1.0 delta in neg_area *will* resolve to a different value. - bool active_primitive; - if (abs(pos_area) < 16777216.0) - active_primitive = pos_area > neg_area; - else - active_primitive = pos_area >= neg_area; - - if (active_primitive) - { - // Micropoly test. - vec2 lo = floor(ldexp(min(min(a, b), c), ivec2(-8))); - vec2 hi = floor(ldexp(max(max(a, b), c), ivec2(-8))); - active_primitive = all(notEqual(lo, hi)); - } - - return active_primitive; -} - void main() { uint linear_index = gl_LocalInvocationIndex; - - if (linear_index < MESHLET_SIZE / 32) - { - shared_active_vert[linear_index] = 0; - shared_active_prim[linear_index] = 0; - } - -#if MESHLET_SIZE != 256 - uint meshlet_index = gl_WorkGroupID.y; - uint base_chunk_index = gl_WorkGroupID.x * (MESHLET_SIZE / 32); -#else - uint meshlet_index = gl_WorkGroupID.x; - uint base_chunk_index = 0u; -#endif + uint meshlet_index = meshlet_get_meshlet_index(); + uint base_chunk_index = meshlet_get_base_chunk_index(); IndirectDrawMesh meshlet = indirect_commands_mesh.draws[meshlet_index]; CompactedDrawInfo task = mesh_payload[meshlet_index]; @@ -146,96 +67,17 @@ void main() vec3 world_pos = (M * vec4(pos, 1.0)).xyz; vec4 clip_pos = VP * vec4(world_pos, 1.0); - vec2 c = clip_pos.xy / clip_pos.w; - - uint clip_code = clip_pos.w <= 0.0 ? CLIP_CODE_NEGATIVE_W : 0; - if (any(greaterThan(abs(c), vec2(4.0)))) - clip_code |= CLIP_CODE_INACCURATE; - if (c.x <= -1.0) - clip_code |= CLIP_CODE_NEGATIVE_X; - if (c.y <= -1.0) - clip_code |= CLIP_CODE_NEGATIVE_Y; - if (c.x >= 1.0) - clip_code |= CLIP_CODE_POSITIVE_X; - if (c.y >= 1.0) - clip_code |= CLIP_CODE_POSITIVE_Y; - - vec2 window = roundEven(c * viewport.zw + viewport.xy); - shared_window_positions[linear_index] = window; - shared_clip_code[linear_index] = uint8_t(clip_code); - - barrier(); + meshlet_emit_clip_pos(clip_pos, viewport); uvec3 prim = uvec3(ibo.data[meshlet.primitive_offset + linear_index + 32u * base_chunk_index]); - prim += 32u * gl_LocalInvocationID.y; - - uint code_a = shared_clip_code[prim.x]; - uint code_b = shared_clip_code[prim.y]; - uint code_c = shared_clip_code[prim.z]; - - uint or_code = code_a | code_b | code_c; - uint and_code = code_a & code_b & code_c; - - bool culled_planes = (and_code & CLIP_CODE_PLANES) != 0; - - bool is_active_prim = false; - if (!culled_planes) - { - bool force_accept = (or_code & (CLIP_CODE_INACCURATE | CLIP_CODE_NEGATIVE_W)) != 0; - - if (!force_accept) - { - vec2 a = shared_window_positions[prim.x]; - vec2 b = shared_window_positions[prim.y]; - vec2 c = shared_window_positions[prim.z]; - force_accept = cull_triangle(a, b, c); - } - - if (force_accept) - { - is_active_prim = true; - atomicOr(shared_active_prim[linear_index / 32], 1u << (linear_index & 31)); - atomicOr(shared_active_vert[prim.x / 32], 1u << (prim.x & 31)); - atomicOr(shared_active_vert[prim.y / 32], 1u << (prim.y & 31)); - atomicOr(shared_active_vert[prim.z / 32], 1u << (prim.z & 31)); - } - } - - barrier(); - - if (linear_index == 0) - { - uint active_prim = 0; - uint active_vert = 0; - for (uint i = 0; i < gl_WorkGroupSize.y; i++) - { - shared_active_prim_count[i] = active_prim; - shared_active_vert_count[i] = active_vert; - active_prim += bitCount(shared_active_prim[i]); - active_vert += bitCount(shared_active_vert[i]); - } - - shared_active_prim_count_total = active_prim; - shared_active_vert_count_total = active_vert; - } - - barrier(); - - uint num_verts = shared_active_vert_count_total; - uint num_prims = shared_active_prim_count_total; - - SetMeshOutputsEXT(num_verts, num_prims); - - if (is_active_prim) - gl_PrimitiveTriangleIndicesEXT[compacted_index_output(linear_index)] = remap_index_buffer(prim); + meshlet_emit_primitive(prim); - if (linear_index < num_prims) + if (linear_index < shared_active_prim_count_total) vDrawID[linear_index] = task.meshlet_index; - bool has_active_vert = lane_has_active_vert(linear_index); - if (has_active_vert) + if (meshlet_lane_has_active_vert()) { - uint out_vert_index = compacted_vertex_output(linear_index); + uint out_vert_index = meshlet_compacted_vertex_output(); gl_MeshVerticesEXT[out_vert_index].gl_Position = clip_pos; vWorldPos[out_vert_index] = world_pos; } From 58882b42e0d5c8d77d15acf59539f6eaf0bdfbce Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Fri, 22 Dec 2023 15:04:27 +0100 Subject: [PATCH 36/59] Task shader path is working for plain mesh. --- assets/shaders/inc/meshlet_render_types.h | 15 ++ tests/assets/shaders/meshlet_debug.task | 137 ++++++++++-------- tests/assets/shaders/meshlet_debug_plain.mesh | 22 ++- tests/meshlet_viewer.cpp | 102 +++++++++---- 4 files changed, 184 insertions(+), 92 deletions(-) diff --git a/assets/shaders/inc/meshlet_render_types.h b/assets/shaders/inc/meshlet_render_types.h index 06db9649..c6953328 100644 --- a/assets/shaders/inc/meshlet_render_types.h +++ b/assets/shaders/inc/meshlet_render_types.h @@ -27,6 +27,21 @@ struct CompactedDrawInfo uint node_count_material_offset; }; +#ifdef MESHLET_RENDER_TASK_HIERARCHICAL +#if MESHLET_RENDER_TASK_HIERARCHICAL +struct CompactedDrawInfoPayload +{ + CompactedDrawInfo infos[32 * 32]; +}; +#else +struct CompactedDrawInfoPayload +{ + CompactedDrawInfo info; + uint8_t offsets[32]; +}; +#endif +#endif + struct IndirectDrawMesh { uint primitive_offset; diff --git a/tests/assets/shaders/meshlet_debug.task b/tests/assets/shaders/meshlet_debug.task index 3f101f72..dbe2969a 100644 --- a/tests/assets/shaders/meshlet_debug.task +++ b/tests/assets/shaders/meshlet_debug.task @@ -1,33 +1,32 @@ #version 450 #extension GL_EXT_mesh_shader : require - -#if MESHLET_PAYLOAD_WAVE32 -#extension GL_KHR_shader_subgroup_arithmetic : require -#extension GL_KHR_shader_subgroup_ballot : require -#extension GL_KHR_shader_subgroup_shuffle : require -#endif +#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require layout(local_size_x = 32) in; #define MESHLET_RENDER_DESCRIPTOR_SET 0 -#define MESHLET_RENDER_AABB_BINDING 0 -#define MESHLET_RENDER_TRANSFORM_BINDING 1 -#define MESHLET_RENDER_TASKS_BINDING 2 -#define MESHLET_RENDER_BOUND_BINDING 7 -#define MESHLET_RENDER_FRUSTUM_BINDING 8 +#define MESHLET_RENDER_TRANSFORM_BINDING 5 +#define MESHLET_RENDER_AABB_BINDING 6 +#define MESHLET_RENDER_TASKS_BINDING 7 +#define MESHLET_RENDER_BOUND_BINDING 8 +#define MESHLET_RENDER_FRUSTUM_BINDING 9 #include "meshlet_render.h" +#ifndef MESHLET_RENDER_TASK_HIERARCHICAL +#error "Must define MESHLET_RENDER_TASK_HIERARCHICAL" +#endif + layout(push_constant, std430) uniform Registers { vec3 camera_pos; uint count; + uint offset; } registers; -taskPayloadSharedEXT CompactedDrawInfo mesh_payload[32 * 32]; +taskPayloadSharedEXT CompactedDrawInfoPayload mesh_payload; -layout(set = 0, binding = 9) buffer Counter { uint task_counter; }; +layout(constant_id = 1) const int NUM_CHUNK_WORKGROUPS = 0; -#if !MESHLET_PAYLOAD_WAVE32 shared uint ballot_value; uvec4 ballot(bool v) { @@ -48,26 +47,57 @@ uint ballotBitCount(uvec4 v) uint ballotExclusiveBitCount(uvec4 v) { - uint mask = (1u << gl_LocalInvocationIndex) - 1u; - return bitCount(v.x & mask); + return bitCount(bitfieldExtract(v.x, 0, int(gl_LocalInvocationIndex))); } -#define local_invocation_id gl_LocalInvocationIndex + +uint payload_offset = 0; + +void process_task(TaskInfo task) +{ + uint node_instance = task.node_instance; + uint node_count_material_index = task.node_count_material_index; + uint mesh_index_count = task.mesh_index_count; + + uint offset = mesh_index_count & ~31u; + uint count = bitfieldExtract(mesh_index_count, 0, 5) + 1; + uint meshlet_index = offset + gl_LocalInvocationIndex; + + bool alloc_draw = false; + if (gl_LocalInvocationIndex < count) + { + mat4 M = transforms.data[node_instance]; + Bound b = bounds.data[meshlet_index]; + alloc_draw = cluster_cull(M, b, registers.camera_pos); + } + + uvec4 ballot = ballot(alloc_draw); + uint draw_count = ballotBitCount(ballot); + uint local_offset = ballotExclusiveBitCount(ballot); + +#if MESHLET_RENDER_TASK_HIERARCHICAL + if (alloc_draw) + { + mesh_payload.infos[payload_offset + local_offset] = + CompactedDrawInfo(meshlet_index, node_instance, node_count_material_index); + } #else -#define ballot(v) subgroupBallot(v) -#define ballotBitCount(v) subgroupBallotBitCount(v) -#define ballotExclusiveBitCount(v) subgroupBallotExclusiveBitCount(v) -#define local_invocation_id gl_SubgroupInvocationID + if (gl_LocalInvocationIndex == 0) + mesh_payload.info = CompactedDrawInfo(offset, node_instance, node_count_material_index); + if (alloc_draw) + mesh_payload.offsets[local_offset] = uint8_t(gl_LocalInvocationIndex); #endif + payload_offset += draw_count; +} + void main() { - TaskInfo task; - uint task_index = gl_WorkGroupID.x * gl_WorkGroupSize.x + local_invocation_id; - +#if MESHLET_RENDER_TASK_HIERARCHICAL + uint task_index = gl_GlobalInvocationID.x + registers.offset * gl_WorkGroupSize.x; bool task_needs_work = false; if (task_index < registers.count) { - task = task_info.data[task_index]; + TaskInfo task = task_info.data[task_index]; // Precull the group. AABB aabb = aabb.data[task.aabb_instance]; @@ -76,48 +106,33 @@ void main() uint b = ballot(task_needs_work).x; - uint payload_offset = 0; while (b != 0) { int lane = findLSB(b); b &= ~(1u << lane); - -#if MESHLET_PAYLOAD_WAVE32 - uint node_instance = subgroupShuffle(task.node_instance, lane); - uint node_count_material_index = subgroupShuffle(task.node_count_material_index, lane); - uint mesh_index_count = subgroupShuffle(task.mesh_index_count, lane); -#else TaskInfo tmp_task = task_info.data[gl_WorkGroupID.x * gl_WorkGroupSize.x + lane]; - uint node_instance = tmp_task.node_instance; - uint node_count_material_index = tmp_task.node_count_material_index; - uint mesh_index_count = tmp_task.mesh_index_count; + process_task(tmp_task); + } +#else + uint task_index = gl_WorkGroupID.x + registers.offset; + if (task_index < registers.count) + { + TaskInfo task = task_info.data[task_index]; + process_task(task); + } #endif - uint offset = mesh_index_count & ~31u; - uint count = bitfieldExtract(mesh_index_count, 0, 5) + 1; - - uint meshlet_index = offset + local_invocation_id; - - bool alloc_draw = false; - if (local_invocation_id < count) - { - mat4 M = transforms.data[node_instance]; - Bound b = bounds.data[meshlet_index]; - alloc_draw = cluster_cull(M, b, registers.camera_pos); - } - - uvec4 ballot = ballot(alloc_draw); - uint draw_count = ballotBitCount(ballot); - uint local_offset = ballotExclusiveBitCount(ballot); - - if (alloc_draw) - mesh_payload[payload_offset + local_offset] = CompactedDrawInfo(meshlet_index, node_instance, node_count_material_index); - - payload_offset += draw_count; + uint wg_x, wg_y; + if (NUM_CHUNK_WORKGROUPS == 1) + { + wg_x = payload_offset; + wg_y = 1; + } + else + { + wg_x = NUM_CHUNK_WORKGROUPS; + wg_y = payload_offset; } - if (gl_LocalInvocationIndex == 0) - atomicAdd(task_counter, payload_offset); - - EmitMeshTasksEXT(payload_offset, 1, 1); -} \ No newline at end of file + EmitMeshTasksEXT(wg_x, wg_y, 1); +} diff --git a/tests/assets/shaders/meshlet_debug_plain.mesh b/tests/assets/shaders/meshlet_debug_plain.mesh index dd4a9dac..8cf81daa 100644 --- a/tests/assets/shaders/meshlet_debug_plain.mesh +++ b/tests/assets/shaders/meshlet_debug_plain.mesh @@ -39,13 +39,17 @@ layout(set = 0, binding = 1, scalar) readonly buffer VBOPOS layout(set = 0, binding = 3, std430) readonly buffer IndirectCommands { - layout(offset = 256) IndirectDrawMesh draws[]; + IndirectDrawMesh draws[]; } indirect_commands_mesh; +#ifdef MESHLET_RENDER_TASK_HIERARCHICAL +taskPayloadSharedEXT CompactedDrawInfoPayload mesh_payload; +#else layout(set = 0, binding = 4, std430) readonly buffer CompactedDraws { - CompactedDrawInfo mesh_payload[]; -}; + CompactedDrawInfo infos[]; +} mesh_payload; +#endif layout(set = 0, binding = 5, std430) readonly buffer Transforms { @@ -55,11 +59,17 @@ layout(set = 0, binding = 5, std430) readonly buffer Transforms void main() { uint linear_index = gl_LocalInvocationIndex; - uint meshlet_index = meshlet_get_meshlet_index(); + uint compacted_meshlet_index = meshlet_get_meshlet_index(); uint base_chunk_index = meshlet_get_base_chunk_index(); - IndirectDrawMesh meshlet = indirect_commands_mesh.draws[meshlet_index]; - CompactedDrawInfo task = mesh_payload[meshlet_index]; +#if defined(MESHLET_RENDER_TASK_HIERARCHICAL) && !MESHLET_RENDER_TASK_HIERARCHICAL + CompactedDrawInfo task = mesh_payload.info; + task.meshlet_index += uint(mesh_payload.offsets[compacted_meshlet_index]); +#else + CompactedDrawInfo task = mesh_payload.infos[compacted_meshlet_index]; +#endif + + IndirectDrawMesh meshlet = indirect_commands_mesh.draws[task.meshlet_index]; mat4 M = transforms.data[task.node_offset]; // Transform positions. diff --git a/tests/meshlet_viewer.cpp b/tests/meshlet_viewer.cpp index cb1e4b5b..2be94f1b 100644 --- a/tests/meshlet_viewer.cpp +++ b/tests/meshlet_viewer.cpp @@ -172,7 +172,7 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V auto &scene_nodes = parser.get_scenes()[parser.get_default_scene()]; auto root = scene.create_node(); -#if 0 +#if 1 for (int z = -10; z <= 10; z++) for (int y = -10; y <= 10; y++) for (int x = -10; x <= 10; x++) @@ -316,14 +316,6 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V BufferHandle task_buffer, cached_transform_buffer, aabb_buffer, compacted_params, indirect_draws; - { - BufferCreateInfo info; - info.size = max_draws * sizeof(DrawParameters); - info.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; - info.domain = BufferDomain::Device; - compacted_params = device.create_buffer(info); - } - { BufferCreateInfo info; info.size = task_params.size() * sizeof(task_params.front()); @@ -368,11 +360,12 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V { vec3 camera_pos; uint32_t count; + uint32_t offset; } push; push.camera_pos = render_context.get_render_parameters().camera_position; const bool use_meshlets = manager.get_mesh_encoding() != Vulkan::ResourceManager::MeshEncoding::VBOAndIBOMDI; - const bool use_preculling = manager.get_mesh_encoding() != Vulkan::ResourceManager::MeshEncoding::MeshletEncoded; + const bool use_preculling = !use_meshlets; uint32_t target_meshlet_workgroup_size = max(32u, device.get_device_features().mesh_shader_properties.maxPreferredMeshWorkGroupInvocations); @@ -384,7 +377,7 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V { BufferCreateInfo info; if (use_meshlets) - info.size = max_draws * sizeof(Vulkan::Meshlet::RuntimeHeaderDecoded) + 256; + info.size = sizeof(VkDrawMeshTasksIndirectCommandEXT); else info.size = max_draws * sizeof(VkDrawIndexedIndirectCommand) + 256; @@ -416,6 +409,14 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_ACCESS_2_SHADER_STORAGE_READ_BIT | VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT); + + { + BufferCreateInfo info; + info.size = max_draws * sizeof(DrawParameters); + info.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; + info.domain = BufferDomain::Device; + compacted_params = device.create_buffer(info); + } } if (use_preculling) @@ -457,7 +458,7 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V auto *pos = manager.get_position_buffer(); auto *attr = manager.get_attribute_buffer(); - if (use_meshlets && !use_preculling) + if (manager.get_mesh_encoding() == Vulkan::ResourceManager::MeshEncoding::MeshletEncoded) { cmd->begin_render_pass(device.get_swapchain_render_pass(SwapchainRenderPass::Depth)); camera.set_aspect(cmd->get_viewport().width / cmd->get_viewport().height); @@ -476,20 +477,30 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V 0.5f * cmd->get_viewport().width, 0.5f * cmd->get_viewport().height) - vec4(1.0f, 1.0f, 0.0f, 0.0f); - cmd->set_program("assets://shaders/meshlet_debug.task", "assets://shaders/meshlet_debug.task", - "assets://shaders/meshlet_debug.mesh.frag"); + if (use_preculling) + { + + } + else + { + cmd->set_program("assets://shaders/meshlet_debug.task", "assets://shaders/meshlet_debug.task", + "assets://shaders/meshlet_debug.mesh.frag"); + } cmd->set_storage_buffer(0, 0, *ibo); cmd->set_storage_buffer(0, 1, *pos); cmd->set_storage_buffer(0, 2, *attr); cmd->set_storage_buffer(0, 3, *indirect_draws); - cmd->set_storage_buffer(0, 4, *compacted_params); + if (use_preculling) + cmd->set_storage_buffer(0, 4, *compacted_params); cmd->set_storage_buffer(0, 5, *cached_transform_buffer); + + GRANITE_MATERIAL_MANAGER()->set_bindless(*cmd, 2); cmd->draw_mesh_tasks_indirect(*indirect_draws, 0, 1, sizeof(VkDrawMeshTasksIndirectCommandEXT)); cmd->end_render_pass(); } - else if (use_meshlets && use_preculling) + else if (manager.get_mesh_encoding() == Vulkan::ResourceManager::MeshEncoding::MeshletDecoded) { cmd->begin_render_pass(device.get_swapchain_render_pass(SwapchainRenderPass::Depth)); camera.set_aspect(cmd->get_viewport().width / cmd->get_viewport().height); @@ -498,9 +509,6 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V *cmd->allocate_typed_constant_data(1, 0, 1) = render_context.get_render_parameters().view_projection; - memcpy(cmd->allocate_typed_constant_data(1, 1, 6), render_context.get_visibility_frustum().get_planes(), - 6 * sizeof(vec4)); - *cmd->allocate_typed_constant_data(1, 2, 1) = float(1 << 8 /* shader assumes 8 */) * vec4(cmd->get_viewport().x + 0.5f * cmd->get_viewport().width - 0.5f, @@ -508,19 +516,63 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V 0.5f * cmd->get_viewport().width, 0.5f * cmd->get_viewport().height) - vec4(1.0f, 1.0f, 0.0f, 0.0f); - cmd->set_specialization_constant_mask(1); + bool use_hierarchical = device.get_device_features().driver_id != VK_DRIVER_ID_NVIDIA_PROPRIETARY; + + cmd->set_specialization_constant_mask(3); cmd->set_specialization_constant(0, target_meshlet_workgroup_size / 32); - cmd->set_program("", "assets://shaders/meshlet_debug_plain.mesh", - "assets://shaders/meshlet_debug.mesh.frag", {{ "MESHLET_SIZE", int(target_meshlet_workgroup_size) }}); + cmd->set_specialization_constant(1, num_chunk_workgroups); cmd->set_storage_buffer(0, 0, *ibo); cmd->set_storage_buffer(0, 1, *pos); cmd->set_storage_buffer(0, 2, *attr); - cmd->set_storage_buffer(0, 3, *indirect_draws); - cmd->set_storage_buffer(0, 4, *compacted_params); + cmd->set_storage_buffer(0, 3, *manager.get_indirect_buffer()); + if (use_preculling) + cmd->set_storage_buffer(0, 4, *compacted_params); cmd->set_storage_buffer(0, 5, *cached_transform_buffer); GRANITE_MATERIAL_MANAGER()->set_bindless(*cmd, 2); - cmd->draw_mesh_tasks_indirect(*indirect_draws, 0, 1, sizeof(VkDrawMeshTasksIndirectCommandEXT)); + + if (use_preculling) + { + cmd->set_program("", "assets://shaders/meshlet_debug_plain.mesh", + "assets://shaders/meshlet_debug.mesh.frag", + { { "MESHLET_SIZE", int(target_meshlet_workgroup_size) } }); + memcpy(cmd->allocate_typed_constant_data(1, 1, 6), + render_context.get_visibility_frustum().get_planes(), 6 * sizeof(vec4)); + } + else + { + cmd->set_program("assets://shaders/meshlet_debug.task", "assets://shaders/meshlet_debug_plain.mesh", + "assets://shaders/meshlet_debug.mesh.frag", + { { "MESHLET_SIZE", int(target_meshlet_workgroup_size) }, + { "MESHLET_RENDER_TASK_HIERARCHICAL", int(use_hierarchical) } }); + + cmd->set_storage_buffer(0, 6, *aabb_buffer); + cmd->set_storage_buffer(0, 7, *task_buffer); + cmd->set_storage_buffer(0, 8, *manager.get_cluster_bounds_buffer()); + memcpy(cmd->allocate_typed_constant_data(0, 9, 6), + render_context.get_visibility_frustum().get_planes(), 6 * sizeof(vec4)); + } + + if (use_preculling) + { + cmd->draw_mesh_tasks_indirect(*indirect_draws, 0, 1, sizeof(VkDrawMeshTasksIndirectCommandEXT)); + } + else + { + uint32_t workgroups = task_params.size(); + push.count = workgroups; + + if (use_hierarchical) + workgroups = (workgroups + 31) / 32; + + for (uint32_t i = 0; i < workgroups; i += device.get_device_features().mesh_shader_properties.maxTaskWorkGroupCount[0]) + { + uint32_t to_dispatch = std::min(workgroups - i, device.get_device_features().mesh_shader_properties.maxTaskWorkGroupCount[0]); + push.offset = i; + cmd->push_constants(&push, 0, sizeof(push)); + cmd->draw_mesh_tasks(to_dispatch, 1, 1); + } + } cmd->end_render_pass(); } else From fae8bd0647c28236bda59a3c8f2f2979474118e9 Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Fri, 22 Dec 2023 15:57:23 +0100 Subject: [PATCH 37/59] Get encoded path working. --- tests/assets/shaders/meshlet_debug.mesh | 286 ++++++------------------ tests/meshlet_viewer.cpp | 77 ++----- vulkan/managers/resource_manager.cpp | 2 +- 3 files changed, 87 insertions(+), 278 deletions(-) diff --git a/tests/assets/shaders/meshlet_debug.mesh b/tests/assets/shaders/meshlet_debug.mesh index a5a934c6..f30b67a6 100644 --- a/tests/assets/shaders/meshlet_debug.mesh +++ b/tests/assets/shaders/meshlet_debug.mesh @@ -1,37 +1,28 @@ #version 450 #extension GL_EXT_mesh_shader : require +#extension GL_KHR_shader_subgroup_basic : require -layout(max_primitives = 128, max_vertices = 128, triangles) out; -#include "meshlet_payload_constants.h" -layout(local_size_x = 128) in; +#ifndef MESHLET_SIZE +#error "Must define MESHLET_SIZE" +#endif +layout(max_primitives = MESHLET_SIZE, max_vertices = MESHLET_SIZE, triangles) out; +layout(local_size_x = 32, local_size_y_id = 0) in; + +#include "meshlet_payload_constants.h" layout(constant_id = 0) const uint NUM_U32_STREAMS = MESHLET_PAYLOAD_MAX_STREAMS; -#define MESHLET_PAYLOAD_NUM_U32_STREAMS NUM_U32_STREAMS #define MESHLET_PAYLOAD_DESCRIPTOR_SET 0 -#define MESHLET_PAYLOAD_META_BINDING 3 -#define MESHLET_PAYLOAD_STREAM_BINDING 4 -#define MESHLET_PAYLOAD_PAYLOAD_BINDING 5 +#define MESHLET_PAYLOAD_META_BINDING 0 +#define MESHLET_PAYLOAD_STREAM_BINDING 1 +#define MESHLET_PAYLOAD_PAYLOAD_BINDING 2 #include "meshlet_payload_decode.h" #include "meshlet_attribute_decode.h" +#include "meshlet_render_types.h" +#include "meshlet_primitive_cull.h" -#define MESHLET_RENDER_DESCRIPTOR_SET 0 -#define MESHLET_RENDER_AABB_BINDING 0 -#define MESHLET_RENDER_TRANSFORM_BINDING 1 -#define MESHLET_RENDER_TASKS_BINDING 2 -#define MESHLET_RENDER_BOUND_BINDING 7 -#define MESHLET_RENDER_FRUSTUM_BINDING 8 -#include "meshlet_render.h" - -#if 0 -layout(location = 0) out mediump vec3 vNormal[]; -layout(location = 1) out mediump vec4 vTangent[]; -layout(location = 2) out vec2 vUV[]; -layout(location = 3) perprimitiveEXT out uint MaterialOffset[]; -#else layout(location = 0) out vec3 vWorldPos[]; layout(location = 1) perprimitiveEXT out uint vDrawID[]; -#endif layout(set = 1, binding = 0) uniform UBO { @@ -43,230 +34,79 @@ layout(set = 1, binding = 2) uniform UBOViewport vec4 viewport; }; -#ifndef MESHLET_RENDER_TASK -#error "Must define MESHLET_RENDER_TASK" -#endif - -#if MESHLET_RENDER_TASK -taskPayloadSharedEXT CompactedDrawInfo mesh_payload[32 * 32]; +#ifdef MESHLET_RENDER_TASK_HIERARCHICAL +taskPayloadSharedEXT CompactedDrawInfoPayload mesh_payload; #else -layout(set = 0, binding = 10) readonly buffer DrawInfos +layout(set = 0, binding = 4, std430) readonly buffer CompactedDraws { - CompactedDrawInfo mesh_payload[]; -}; + CompactedDrawInfo infos[]; +} mesh_payload; #endif -shared vec2 shared_window_positions[MESHLET_PAYLOAD_NUM_CHUNKS * 32]; -shared uint8_t shared_clip_code[MESHLET_PAYLOAD_NUM_CHUNKS * 32]; -shared uvec4 shared_active_vert; -shared uvec4 shared_active_prim; -shared uvec4 shared_active_vert_count; -shared uvec4 shared_active_prim_count; -shared uint shared_active_vert_count_total; -shared uint shared_active_prim_count_total; - -const uint CLIP_CODE_INACCURATE = 1 << 0; -const uint CLIP_CODE_NEGATIVE_W = 1 << 1; -const uint CLIP_CODE_NEGATIVE_X = 1 << 2; -const uint CLIP_CODE_NEGATIVE_Y = 1 << 3; -const uint CLIP_CODE_POSITIVE_X = 1 << 4; -const uint CLIP_CODE_POSITIVE_Y = 1 << 5; -const uint CLIP_CODE_PLANES = uint(-1) & ~CLIP_CODE_INACCURATE; - -uint compacted_vertex_output(uint index) +layout(set = 0, binding = 5, std430) readonly buffer Transforms { - return shared_active_vert_count[index / 32u] + bitCount(shared_active_vert[index / 32u] & ((1u << (index & 31u)) - 1u)); -} - -uint compacted_index_output(uint index) -{ - return shared_active_prim_count[index / 32u] + bitCount(shared_active_prim[index / 32u] & ((1u << (index & 31u)) - 1u)); -} + mat4 data[]; +} transforms; -bool lane_has_active_vert(uint index) -{ - return (shared_active_vert[index / 32u] & (1u << (index & 31u))) != 0u; -} - -uvec3 remap_index_buffer(uvec3 prim) +void main() { - return uvec3(compacted_vertex_output(prim.x), - compacted_vertex_output(prim.y), - compacted_vertex_output(prim.z)); -} + uint linear_index = gl_LocalInvocationIndex; + uint compacted_meshlet_index = meshlet_get_meshlet_index(); + uint base_chunk_index = meshlet_get_base_chunk_index(); -bool cull_triangle(vec2 a, vec2 b, vec2 c) -{ - // To be completely accurate, this should be done in fixed point, - // but we can YOLO a bit since glitches in extreme edge cases are considered okay. - precise vec2 ab = b - a; - precise vec2 ac = c - a; +#if defined(MESHLET_RENDER_TASK_HIERARCHICAL) && !MESHLET_RENDER_TASK_HIERARCHICAL + CompactedDrawInfo task = mesh_payload.info; + task.meshlet_index += uint(mesh_payload.offsets[compacted_meshlet_index]); +#else + CompactedDrawInfo task = mesh_payload.infos[compacted_meshlet_index]; +#endif - // This is 100% accurate as long as the primitive is no larger than ~4k subpixels, i.e. 16x16 pixels. - // Normally, we'd be able to do GEQ test, but GE test is conservative, even with FP error in play. - precise float pos_area = ab.y * ac.x; - precise float neg_area = ab.x * ac.y; + MeshletMetaRuntime meta = meshlet_metas_runtime.data[task.meshlet_index]; + mat4 M = transforms.data[task.node_offset]; - // If the pos value is (-2^24, +2^24), the FP math is exact, if not, we have to be conservative. - // Less-than check is there to ensure that 1.0 delta in neg_area *will* resolve to a different value. - bool active_primitive; - if (abs(pos_area) < 16777216.0) - active_primitive = pos_area > neg_area; - else - active_primitive = pos_area >= neg_area; + int lane_index; + uint chunk_index; - if (active_primitive) + if (gl_SubgroupSize == 32) { - // Micropoly test. - vec2 lo = floor(ldexp(min(min(a, b), c), ivec2(-8))); - vec2 hi = floor(ldexp(max(max(a, b), c), ivec2(-8))); - active_primitive = all(notEqual(lo, hi)); - } - - return active_primitive; -} - -void main() -{ - if (gl_LocalInvocationIndex < MESHLET_PAYLOAD_NUM_CHUNKS) + chunk_index = gl_SubgroupID + base_chunk_index; + lane_index = int(gl_SubgroupInvocationID); + } + else { - shared_active_vert[gl_LocalInvocationIndex] = 0; - shared_active_prim[gl_LocalInvocationIndex] = 0; + chunk_index = gl_LocalInvocationID.y + base_chunk_index; + lane_index = int(gl_LocalInvocationID.x); } - CompactedDrawInfo task = mesh_payload[gl_WorkGroupID.x]; - MeshletMetaRuntime meta = meshlet_metas_runtime.data[task.meshlet_index]; - meshlet_init_workgroup(meta.stream_offset); - - uint linear_index = meshlet_get_linear_index(); - mat4 M = transforms.data[task.node_offset]; - - uvec3 prim; -#define INDEX(index, value) prim = uvec3(unpack8(value).xyz) - MESHLET_DECODE_STREAM_32(meta.stream_offset, 0, INDEX); - - vec3 pos; -#define POSITION(index, value) pos = attribute_decode_snorm_exp_position(value) - MESHLET_DECODE_STREAM_64(meta.stream_offset, 1, POSITION); + MeshletChunkInfo index_chunk_info = meshlet_get_chunk_info(meta.stream_offset, chunk_index); + uint decoded_index_buffer = 0; + vec3 world_pos; + vec4 clip_pos = vec4(-1.0); - vec3 world_pos = (M * vec4(pos, 1.0)).xyz; - vec4 clip_pos = VP * vec4(world_pos, 1.0); - vec2 c = clip_pos.xy / clip_pos.w; - - uint clip_code = clip_pos.w <= 0.0 ? CLIP_CODE_NEGATIVE_W : 0; - if (any(greaterThan(abs(c), vec2(4.0)))) - clip_code |= CLIP_CODE_INACCURATE; - if (c.x <= -1.0) - clip_code |= CLIP_CODE_NEGATIVE_X; - if (c.y <= -1.0) - clip_code |= CLIP_CODE_NEGATIVE_Y; - if (c.x >= 1.0) - clip_code |= CLIP_CODE_POSITIVE_X; - if (c.y >= 1.0) - clip_code |= CLIP_CODE_POSITIVE_Y; - - vec2 window = roundEven(c * viewport.zw + viewport.xy); - shared_window_positions[linear_index] = window; - shared_clip_code[linear_index] = uint8_t(clip_code); - - barrier(); - - bool is_active_prim = false; - if (linear_index < meta.num_primitives) + if (chunk_index < meta.num_chunks) { - uint code_a = shared_clip_code[prim.x]; - uint code_b = shared_clip_code[prim.y]; - uint code_c = shared_clip_code[prim.z]; + if (lane_index < index_chunk_info.primitive_count) + decoded_index_buffer = meshlet_decode_index_buffer(meta.stream_offset, chunk_index, lane_index); - uint or_code = code_a | code_b | code_c; - uint and_code = code_a & code_b & code_c; - - bool culled_planes = (and_code & CLIP_CODE_PLANES) != 0; - - if (!culled_planes) + if (lane_index < index_chunk_info.vertex_count) { - bool force_accept = (or_code & (CLIP_CODE_INACCURATE | CLIP_CODE_NEGATIVE_W)) != 0; - - if (!force_accept) - { - vec2 a = shared_window_positions[prim.x]; - vec2 b = shared_window_positions[prim.y]; - vec2 c = shared_window_positions[prim.z]; - force_accept = cull_triangle(a, b, c); - } - - if (force_accept) - { - is_active_prim = true; - atomicOr(shared_active_prim[linear_index / 32], 1u << (linear_index & 31)); - atomicOr(shared_active_vert[prim.x / 32], 1u << (prim.x & 31)); - atomicOr(shared_active_vert[prim.y / 32], 1u << (prim.y & 31)); - atomicOr(shared_active_vert[prim.z / 32], 1u << (prim.z & 31)); - } + int exponent; + i16vec3 ipos = meshlet_decode_snorm_scaled_i16x3(meta.stream_offset + 1, chunk_index, lane_index, exponent); + vec3 pos = ldexp(vec3(ipos), ivec3(exponent)); + world_pos = (M * vec4(pos, 1.0)).xyz; + clip_pos = VP * vec4(world_pos, 1.0); } } - barrier(); + meshlet_emit_clip_pos(clip_pos, viewport); + meshlet_emit_primitive(unpack8(decoded_index_buffer).xyz); + if (linear_index < shared_active_prim_count_total) + vDrawID[linear_index] = task.meshlet_index; - if (gl_LocalInvocationIndex == 0) + if (meshlet_lane_has_active_vert()) { - uvec3 num_active_prim = bitCount(shared_active_prim.xyz); - num_active_prim.y += num_active_prim.x; - num_active_prim.z += num_active_prim.y; - shared_active_prim_count = uvec4(0, num_active_prim); - - uvec3 num_active_vert = bitCount(shared_active_vert.xyz); - num_active_vert.y += num_active_vert.x; - num_active_vert.z += num_active_vert.y; - shared_active_vert_count = uvec4(0, num_active_vert); - - shared_active_prim_count_total = num_active_prim.z + bitCount(shared_active_prim.w); - shared_active_vert_count_total = num_active_vert.z + bitCount(shared_active_vert.w); - } - - barrier(); - - uint num_verts = shared_active_vert_count_total; - uint num_prims = shared_active_prim_count_total; - - SetMeshOutputsEXT(num_verts, num_prims); - - if (is_active_prim) - gl_PrimitiveTriangleIndicesEXT[compacted_index_output(linear_index)] = remap_index_buffer(prim); - - if (gl_LocalInvocationIndex < num_prims) - vDrawID[gl_LocalInvocationIndex] = task.meshlet_index; - - bool has_active_vert = lane_has_active_vert(linear_index); - - if (has_active_vert) - { - uint out_vert_index = compacted_vertex_output(linear_index); - gl_MeshVerticesEXT[out_vert_index].gl_Position = clip_pos; - vWorldPos[out_vert_index] = world_pos; - } - -#if 0 -#define NORMAL(index, value) \ - if (index < meta.num_attributes) \ - { \ - vNormal[index] = mat3(M) * attribute_decode_oct8_normal_tangent(value).xyz; \ - } - MESHLET_DECODE_STREAM_32(meta.stream_offset, 3, NORMAL); - -#define TANGENT(index, value) \ - if (index < meta.num_attributes) \ - { \ - mediump vec4 T = attribute_decode_oct8_normal_tangent(value); \ - vTangent[index] = vec4(mat3(M) * T.xyz, T.w); \ + uint out_vert_index = meshlet_compacted_vertex_output(); + gl_MeshVerticesEXT[out_vert_index].gl_Position = clip_pos; + vWorldPos[out_vert_index] = world_pos; } - MESHLET_DECODE_STREAM_32(meta.stream_offset, 4, TANGENT); - -#define UV(index, value) \ - if (index < meta.num_attributes) \ - { \ - vUV[index] = attribute_decode_snorm_exp_uv(value); \ - } - MESHLET_DECODE_STREAM_64(meta.stream_offset, 5, UV); -#endif } diff --git a/tests/meshlet_viewer.cpp b/tests/meshlet_viewer.cpp index 2be94f1b..e4c4aaa1 100644 --- a/tests/meshlet_viewer.cpp +++ b/tests/meshlet_viewer.cpp @@ -423,9 +423,7 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V { auto *indirect = manager.get_indirect_buffer(); - auto command_words = (use_meshlets ? - sizeof(Vulkan::Meshlet::RuntimeHeaderDecoded) : - sizeof(VkDrawIndexedIndirectCommand)) / sizeof(uint32_t); + auto command_words = use_meshlets ? 0 : (sizeof(VkDrawIndexedIndirectCommand) / sizeof(uint32_t)); cmd->set_specialization_constant_mask(3); cmd->set_specialization_constant(0, uint32_t(command_words)); @@ -435,7 +433,7 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V cmd->set_storage_buffer(0, 0, *aabb_buffer); cmd->set_storage_buffer(0, 1, *cached_transform_buffer); cmd->set_storage_buffer(0, 2, *task_buffer); - cmd->set_storage_buffer(0, 3, *indirect); + cmd->set_storage_buffer(0, 3, indirect ? *indirect : *indirect_draws); cmd->set_storage_buffer(0, 4, *indirect_draws); cmd->set_storage_buffer(0, 5, *compacted_params); cmd->set_storage_buffer(0, 6, *manager.get_cluster_bounds_buffer()); @@ -458,49 +456,7 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V auto *pos = manager.get_position_buffer(); auto *attr = manager.get_attribute_buffer(); - if (manager.get_mesh_encoding() == Vulkan::ResourceManager::MeshEncoding::MeshletEncoded) - { - cmd->begin_render_pass(device.get_swapchain_render_pass(SwapchainRenderPass::Depth)); - camera.set_aspect(cmd->get_viewport().width / cmd->get_viewport().height); - render_context.set_camera(camera); - cmd->set_opaque_state(); - - *cmd->allocate_typed_constant_data(1, 0, 1) = render_context.get_render_parameters().view_projection; - - memcpy(cmd->allocate_typed_constant_data(1, 1, 6), render_context.get_visibility_frustum().get_planes(), - 6 * sizeof(vec4)); - - *cmd->allocate_typed_constant_data(1, 2, 1) = - float(1 << 8 /* shader assumes 8 */) * - vec4(cmd->get_viewport().x + 0.5f * cmd->get_viewport().width - 0.5f, - cmd->get_viewport().y + 0.5f * cmd->get_viewport().height - 0.5f, - 0.5f * cmd->get_viewport().width, - 0.5f * cmd->get_viewport().height) - vec4(1.0f, 1.0f, 0.0f, 0.0f); - - if (use_preculling) - { - - } - else - { - cmd->set_program("assets://shaders/meshlet_debug.task", "assets://shaders/meshlet_debug.task", - "assets://shaders/meshlet_debug.mesh.frag"); - } - - cmd->set_storage_buffer(0, 0, *ibo); - cmd->set_storage_buffer(0, 1, *pos); - cmd->set_storage_buffer(0, 2, *attr); - cmd->set_storage_buffer(0, 3, *indirect_draws); - if (use_preculling) - cmd->set_storage_buffer(0, 4, *compacted_params); - cmd->set_storage_buffer(0, 5, *cached_transform_buffer); - - - GRANITE_MATERIAL_MANAGER()->set_bindless(*cmd, 2); - cmd->draw_mesh_tasks_indirect(*indirect_draws, 0, 1, sizeof(VkDrawMeshTasksIndirectCommandEXT)); - cmd->end_render_pass(); - } - else if (manager.get_mesh_encoding() == Vulkan::ResourceManager::MeshEncoding::MeshletDecoded) + if (use_meshlets) { cmd->begin_render_pass(device.get_swapchain_render_pass(SwapchainRenderPass::Depth)); camera.set_aspect(cmd->get_viewport().width / cmd->get_viewport().height); @@ -517,31 +473,44 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V 0.5f * cmd->get_viewport().height) - vec4(1.0f, 1.0f, 0.0f, 0.0f); bool use_hierarchical = device.get_device_features().driver_id != VK_DRIVER_ID_NVIDIA_PROPRIETARY; + bool use_encoded = manager.get_mesh_encoding() == Vulkan::ResourceManager::MeshEncoding::MeshletEncoded; cmd->set_specialization_constant_mask(3); cmd->set_specialization_constant(0, target_meshlet_workgroup_size / 32); cmd->set_specialization_constant(1, num_chunk_workgroups); - cmd->set_storage_buffer(0, 0, *ibo); - cmd->set_storage_buffer(0, 1, *pos); - cmd->set_storage_buffer(0, 2, *attr); - cmd->set_storage_buffer(0, 3, *manager.get_indirect_buffer()); + if (use_encoded) + { + cmd->set_storage_buffer(0, 0, *manager.get_meshlet_header_buffer()); + cmd->set_storage_buffer(0, 1, *manager.get_meshlet_stream_header_buffer()); + cmd->set_storage_buffer(0, 2, *manager.get_meshlet_payload_buffer()); + } + else + { + cmd->set_storage_buffer(0, 0, *ibo); + cmd->set_storage_buffer(0, 1, *pos); + cmd->set_storage_buffer(0, 2, *attr); + } + + if (!use_encoded) + cmd->set_storage_buffer(0, 3, *manager.get_indirect_buffer()); if (use_preculling) cmd->set_storage_buffer(0, 4, *compacted_params); cmd->set_storage_buffer(0, 5, *cached_transform_buffer); GRANITE_MATERIAL_MANAGER()->set_bindless(*cmd, 2); + const char *mesh_path = use_encoded ? "assets://shaders/meshlet_debug.mesh" : "assets://shaders/meshlet_debug_plain.mesh"; + if (use_preculling) { - cmd->set_program("", "assets://shaders/meshlet_debug_plain.mesh", - "assets://shaders/meshlet_debug.mesh.frag", + cmd->set_program("", mesh_path, "assets://shaders/meshlet_debug.mesh.frag", { { "MESHLET_SIZE", int(target_meshlet_workgroup_size) } }); memcpy(cmd->allocate_typed_constant_data(1, 1, 6), render_context.get_visibility_frustum().get_planes(), 6 * sizeof(vec4)); } else { - cmd->set_program("assets://shaders/meshlet_debug.task", "assets://shaders/meshlet_debug_plain.mesh", + cmd->set_program("assets://shaders/meshlet_debug.task", mesh_path, "assets://shaders/meshlet_debug.mesh.frag", { { "MESHLET_SIZE", int(target_meshlet_workgroup_size) }, { "MESHLET_RENDER_TASK_HIERARCHICAL", int(use_hierarchical) } }); diff --git a/vulkan/managers/resource_manager.cpp b/vulkan/managers/resource_manager.cpp index 1debda56..28332ab7 100644 --- a/vulkan/managers/resource_manager.cpp +++ b/vulkan/managers/resource_manager.cpp @@ -178,7 +178,7 @@ void ResourceManager::init() if (device->get_device_features().mesh_shader_features.taskShader && device->get_device_features().mesh_shader_features.meshShader) { - mesh_encoding = MeshEncoding::MeshletDecoded; + mesh_encoding = MeshEncoding::MeshletEncoded; LOGI("Opting in to meshlet path.\n"); } From 8839d78419f3b4af270778b848c101fa7598421e Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Fri, 22 Dec 2023 16:13:06 +0100 Subject: [PATCH 38/59] Explicitly set up local invocation. --- assets/shaders/inc/meshlet_primitive_cull.h | 43 +++++++++++-------- tests/assets/shaders/meshlet_debug.mesh | 7 ++- tests/assets/shaders/meshlet_debug_plain.mesh | 2 + tests/meshlet_viewer.cpp | 11 +++++ 4 files changed, 43 insertions(+), 20 deletions(-) diff --git a/assets/shaders/inc/meshlet_primitive_cull.h b/assets/shaders/inc/meshlet_primitive_cull.h index e95ec1b3..a5b852e7 100644 --- a/assets/shaders/inc/meshlet_primitive_cull.h +++ b/assets/shaders/inc/meshlet_primitive_cull.h @@ -22,26 +22,33 @@ const uint CLIP_CODE_POSITIVE_X = 1 << 4; const uint CLIP_CODE_POSITIVE_Y = 1 << 5; const uint CLIP_CODE_PLANES = uint(-1) & ~CLIP_CODE_INACCURATE; +uvec2 LocalInvocationID; + +void meshlet_setup_local_invocation(uvec2 local_id) +{ + LocalInvocationID = local_id; +} + uint compacted_vertex_output(uint index) { - return shared_active_vert_count[gl_LocalInvocationID.y] + - bitCount(bitfieldExtract(shared_active_vert[gl_LocalInvocationID.y], 0, int(index))); + return shared_active_vert_count[LocalInvocationID.y] + + bitCount(bitfieldExtract(shared_active_vert[LocalInvocationID.y], 0, int(index))); } uint meshlet_compacted_vertex_output() { - return compacted_vertex_output(gl_LocalInvocationID.x); + return compacted_vertex_output(LocalInvocationID.x); } uint compacted_index_output() { - return shared_active_prim_count[gl_LocalInvocationID.y] + - bitCount(bitfieldExtract(shared_active_prim[gl_LocalInvocationID.y], 0, int(gl_LocalInvocationID.x))); + return shared_active_prim_count[LocalInvocationID.y] + + bitCount(bitfieldExtract(shared_active_prim[LocalInvocationID.y], 0, int(LocalInvocationID.x))); } bool meshlet_lane_has_active_vert() { - return (shared_active_vert[gl_LocalInvocationID.y] & (1u << gl_LocalInvocationID.x)) != 0u; + return (shared_active_vert[LocalInvocationID.y] & (1u << LocalInvocationID.x)) != 0u; } uvec3 remap_index_buffer(uvec3 prim) @@ -126,17 +133,17 @@ void meshlet_emit_clip_pos(vec4 clip_pos, vec4 viewport) clip_code |= CLIP_CODE_POSITIVE_Y; vec2 window = roundEven(c * viewport.zw + viewport.xy); - shared_window_positions[gl_LocalInvocationID.y][gl_LocalInvocationID.x] = window; - shared_clip_code[gl_LocalInvocationID.y][gl_LocalInvocationID.x] = uint8_t(clip_code); + shared_window_positions[LocalInvocationID.y][LocalInvocationID.x] = window; + shared_clip_code[LocalInvocationID.y][LocalInvocationID.x] = uint8_t(clip_code); barrier(); } void meshlet_emit_primitive(uvec3 prim) { - uint code_a = shared_clip_code[gl_LocalInvocationID.y][prim.x]; - uint code_b = shared_clip_code[gl_LocalInvocationID.y][prim.y]; - uint code_c = shared_clip_code[gl_LocalInvocationID.y][prim.z]; + uint code_a = shared_clip_code[LocalInvocationID.y][prim.x]; + uint code_b = shared_clip_code[LocalInvocationID.y][prim.y]; + uint code_c = shared_clip_code[LocalInvocationID.y][prim.z]; uint or_code = code_a | code_b | code_c; uint and_code = code_a & code_b & code_c; @@ -150,19 +157,19 @@ void meshlet_emit_primitive(uvec3 prim) if (!force_accept) { - vec2 a = shared_window_positions[gl_LocalInvocationID.y][prim.x]; - vec2 b = shared_window_positions[gl_LocalInvocationID.y][prim.y]; - vec2 c = shared_window_positions[gl_LocalInvocationID.y][prim.z]; + vec2 a = shared_window_positions[LocalInvocationID.y][prim.x]; + vec2 b = shared_window_positions[LocalInvocationID.y][prim.y]; + vec2 c = shared_window_positions[LocalInvocationID.y][prim.z]; force_accept = cull_triangle(a, b, c); } if (force_accept) { is_active_prim = true; - atomicOr(shared_active_prim[gl_LocalInvocationID.y], 1u << gl_LocalInvocationID.x); - atomicOr(shared_active_vert[gl_LocalInvocationID.y], 1u << prim.x); - atomicOr(shared_active_vert[gl_LocalInvocationID.y], 1u << prim.y); - atomicOr(shared_active_vert[gl_LocalInvocationID.y], 1u << prim.z); + atomicOr(shared_active_prim[LocalInvocationID.y], 1u << LocalInvocationID.x); + atomicOr(shared_active_vert[LocalInvocationID.y], 1u << prim.x); + atomicOr(shared_active_vert[LocalInvocationID.y], 1u << prim.y); + atomicOr(shared_active_vert[LocalInvocationID.y], 1u << prim.z); } } diff --git a/tests/assets/shaders/meshlet_debug.mesh b/tests/assets/shaders/meshlet_debug.mesh index f30b67a6..fc3464cf 100644 --- a/tests/assets/shaders/meshlet_debug.mesh +++ b/tests/assets/shaders/meshlet_debug.mesh @@ -69,15 +69,18 @@ void main() if (gl_SubgroupSize == 32) { - chunk_index = gl_SubgroupID + base_chunk_index; + chunk_index = gl_SubgroupID; lane_index = int(gl_SubgroupInvocationID); } else { - chunk_index = gl_LocalInvocationID.y + base_chunk_index; + chunk_index = gl_LocalInvocationID.y; lane_index = int(gl_LocalInvocationID.x); } + meshlet_setup_local_invocation(uvec2(lane_index, chunk_index)); + chunk_index += base_chunk_index; + MeshletChunkInfo index_chunk_info = meshlet_get_chunk_info(meta.stream_offset, chunk_index); uint decoded_index_buffer = 0; vec3 world_pos; diff --git a/tests/assets/shaders/meshlet_debug_plain.mesh b/tests/assets/shaders/meshlet_debug_plain.mesh index 8cf81daa..8cf633a3 100644 --- a/tests/assets/shaders/meshlet_debug_plain.mesh +++ b/tests/assets/shaders/meshlet_debug_plain.mesh @@ -77,6 +77,8 @@ void main() vec3 world_pos = (M * vec4(pos, 1.0)).xyz; vec4 clip_pos = VP * vec4(world_pos, 1.0); + meshlet_setup_local_invocation(gl_LocalInvocationID.xy); + meshlet_emit_clip_pos(clip_pos, viewport); uvec3 prim = uvec3(ibo.data[meshlet.primitive_offset + linear_index + 32u * base_chunk_index]); diff --git a/tests/meshlet_viewer.cpp b/tests/meshlet_viewer.cpp index e4c4aaa1..ac1f7655 100644 --- a/tests/meshlet_viewer.cpp +++ b/tests/meshlet_viewer.cpp @@ -522,6 +522,17 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V render_context.get_visibility_frustum().get_planes(), 6 * sizeof(vec4)); } + if (device.supports_subgroup_size_log2(true, 5, 5, VK_SHADER_STAGE_MESH_BIT_EXT)) + { + cmd->enable_subgroup_size_control(true, VK_SHADER_STAGE_MESH_BIT_EXT); + cmd->set_subgroup_size_log2(true, 5, 5, VK_SHADER_STAGE_MESH_BIT_EXT); + } + else if (device.supports_subgroup_size_log2(true, 0, 7, VK_SHADER_STAGE_MESH_BIT_EXT)) + { + cmd->enable_subgroup_size_control(true, VK_SHADER_STAGE_MESH_BIT_EXT); + cmd->set_subgroup_size_log2(true, 0, 7, VK_SHADER_STAGE_MESH_BIT_EXT); + } + if (use_preculling) { cmd->draw_mesh_tasks_indirect(*indirect_draws, 0, 1, sizeof(VkDrawMeshTasksIndirectCommandEXT)); From a5cd296ac08de98c0c1d08731884b6bd90f3053c Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Fri, 22 Dec 2023 19:10:39 +0100 Subject: [PATCH 39/59] Fix misc bugs. --- assets/shaders/inc/meshlet_primitive_cull.h | 196 +++++++++++++----- tests/assets/shaders/meshlet_debug.mesh | 10 +- tests/assets/shaders/meshlet_debug_plain.mesh | 18 +- tests/meshlet_viewer.cpp | 8 +- 4 files changed, 177 insertions(+), 55 deletions(-) diff --git a/assets/shaders/inc/meshlet_primitive_cull.h b/assets/shaders/inc/meshlet_primitive_cull.h index a5b852e7..22b57720 100644 --- a/assets/shaders/inc/meshlet_primitive_cull.h +++ b/assets/shaders/inc/meshlet_primitive_cull.h @@ -1,18 +1,49 @@ #ifndef MESHLET_PRIMITIVE_CULL_H_ #define MESHLET_PRIMITIVE_CULL_H_ -#ifndef MESHLET_SIZE -#error "Must define MESHLET_SIZE" +#pragma optimize off + +#define CULL_MODE_WG32 0 +#define CULL_MODE_WAVE32 1 +#define CULL_MODE_GENERIC 2 + +#if defined(MESHLET_PRIMITIVE_CULL_WG32) && MESHLET_PRIMITIVE_CULL_WG32 +#define CULL_MODE CULL_MODE_WG32 +#elif defined(MESHLET_PRIMITIVE_CULL_WAVE32) && MESHLET_PRIMITIVE_CULL_WAVE32 +#define CULL_MODE CULL_MODE_WAVE32 +#else +#define CULL_MODE CULL_MODE_GENERIC #endif -shared vec2 shared_window_positions[gl_WorkGroupSize.y][gl_WorkGroupSize.x]; -shared uint8_t shared_clip_code[gl_WorkGroupSize.y][gl_WorkGroupSize.x]; -shared uint shared_active_vert[gl_WorkGroupSize.y]; -shared uint shared_active_prim[gl_WorkGroupSize.y]; +#if CULL_MODE == CULL_MODE_WG32 +uint shared_active_vert_count_total; +uint shared_active_prim_count_total; +#else shared uint shared_active_vert_count[gl_WorkGroupSize.y]; shared uint shared_active_prim_count[gl_WorkGroupSize.y]; shared uint shared_active_vert_count_total; shared uint shared_active_prim_count_total; +#endif + +#if CULL_MODE != CULL_MODE_GENERIC +uint shared_active_vert_mask; +uint shared_active_prim_offset; +#endif + +#if CULL_MODE == CULL_MODE_GENERIC +shared uint shared_active_vert_mask[gl_WorkGroupSize.y]; +shared uint shared_active_prim_mask[gl_WorkGroupSize.y]; +shared vec2 shared_window_positions[gl_WorkGroupSize.y][gl_WorkGroupSize.x]; +shared uint8_t shared_clip_code[gl_WorkGroupSize.y][gl_WorkGroupSize.x]; +uvec2 LocalInvocationID; + +void meshlet_setup_local_invocation(uvec2 local_id) +{ + LocalInvocationID = local_id; +} +#else +#define meshlet_setup_local_invocation(id) +#endif const uint CLIP_CODE_INACCURATE = 1 << 0; const uint CLIP_CODE_NEGATIVE_W = 1 << 1; @@ -22,17 +53,51 @@ const uint CLIP_CODE_POSITIVE_X = 1 << 4; const uint CLIP_CODE_POSITIVE_Y = 1 << 5; const uint CLIP_CODE_PLANES = uint(-1) & ~CLIP_CODE_INACCURATE; -uvec2 LocalInvocationID; +#if CULL_MODE == CULL_MODE_WG32 +uint compacted_vertex_output(uint index) +{ + return bitCount(bitfieldExtract(shared_active_vert_mask, 0, int(index))); +} -void meshlet_setup_local_invocation(uvec2 local_id) +uint meshlet_compacted_vertex_output() { - LocalInvocationID = local_id; + return compacted_vertex_output(gl_SubgroupInvocationID); +} + +uint compacted_index_output() +{ + return shared_active_prim_offset; +} + +bool meshlet_lane_has_active_vert() +{ + return (shared_active_vert_mask & (1u << gl_SubgroupInvocationID)) != 0u; +} +#elif CULL_MODE == CULL_MODE_WAVE32 +uint compacted_vertex_output(uint index) +{ + return shared_active_vert_count[gl_SubgroupID] + bitCount(bitfieldExtract(shared_active_vert_mask, 0, int(index))); +} + +uint meshlet_compacted_vertex_output() +{ + return compacted_vertex_output(gl_SubgroupInvocationID); +} + +uint compacted_index_output() +{ + return shared_active_prim_count[gl_SubgroupID] + shared_active_prim_offset; } +bool meshlet_lane_has_active_vert() +{ + return (shared_active_vert_mask & (1u << gl_SubgroupInvocationID)) != 0u; +} +#else uint compacted_vertex_output(uint index) { return shared_active_vert_count[LocalInvocationID.y] + - bitCount(bitfieldExtract(shared_active_vert[LocalInvocationID.y], 0, int(index))); + bitCount(bitfieldExtract(shared_active_vert_mask[LocalInvocationID.y], 0, int(index))); } uint meshlet_compacted_vertex_output() @@ -43,13 +108,14 @@ uint meshlet_compacted_vertex_output() uint compacted_index_output() { return shared_active_prim_count[LocalInvocationID.y] + - bitCount(bitfieldExtract(shared_active_prim[LocalInvocationID.y], 0, int(LocalInvocationID.x))); + bitCount(bitfieldExtract(shared_active_prim_mask[LocalInvocationID.y], 0, int(LocalInvocationID.x))); } bool meshlet_lane_has_active_vert() { - return (shared_active_vert[LocalInvocationID.y] & (1u << LocalInvocationID.x)) != 0u; + return (shared_active_vert_mask[LocalInvocationID.y] & (1u << LocalInvocationID.x)) != 0u; } +#endif uvec3 remap_index_buffer(uvec3 prim) { @@ -88,36 +154,35 @@ bool cull_triangle(vec2 a, vec2 b, vec2 c) return active_primitive; } +#if CULL_MODE == CULL_MODE_GENERIC void meshlet_init_shared() { - if (gl_LocalInvocationIndex < MESHLET_SIZE / 32) + if (gl_LocalInvocationIndex < gl_WorkGroupSize.y) { - shared_active_vert[gl_LocalInvocationIndex] = 0; - shared_active_prim[gl_LocalInvocationIndex] = 0; + shared_active_vert_mask[gl_LocalInvocationIndex] = 0; + shared_active_prim_mask[gl_LocalInvocationIndex] = 0; } } +#endif uint meshlet_get_meshlet_index() { -#if MESHLET_SIZE != 256 - return gl_WorkGroupID.y; -#else - return gl_WorkGroupID.x; -#endif + if (gl_WorkGroupSize.y != 8) + return gl_WorkGroupID.y; + else + return gl_WorkGroupID.x; } uint meshlet_get_base_chunk_index() { -#if MESHLET_SIZE != 256 - return gl_WorkGroupID.x * (MESHLET_SIZE / 32); -#else - return 0; -#endif + if (gl_WorkGroupSize.y != 8) + return gl_WorkGroupID.x * gl_WorkGroupSize.y; + else + return 0; } -void meshlet_emit_clip_pos(vec4 clip_pos, vec4 viewport) +void meshlet_emit_primitive(uvec3 prim, vec4 clip_pos, vec4 viewport) { - meshlet_init_shared(); vec2 c = clip_pos.xy / clip_pos.w; uint clip_code = clip_pos.w <= 0.0 ? CLIP_CODE_NEGATIVE_W : 0; @@ -133,17 +198,25 @@ void meshlet_emit_clip_pos(vec4 clip_pos, vec4 viewport) clip_code |= CLIP_CODE_POSITIVE_Y; vec2 window = roundEven(c * viewport.zw + viewport.xy); + +#if CULL_MODE != CULL_MODE_GENERIC + vec2 window_a = subgroupShuffle(window, prim.x); + vec2 window_b = subgroupShuffle(window, prim.y); + vec2 window_c = subgroupShuffle(window, prim.z); + uint code_a = subgroupShuffle(clip_code, prim.x); + uint code_b = subgroupShuffle(clip_code, prim.y); + uint code_c = subgroupShuffle(clip_code, prim.z); +#else + meshlet_init_shared(); shared_window_positions[LocalInvocationID.y][LocalInvocationID.x] = window; shared_clip_code[LocalInvocationID.y][LocalInvocationID.x] = uint8_t(clip_code); barrier(); -} -void meshlet_emit_primitive(uvec3 prim) -{ uint code_a = shared_clip_code[LocalInvocationID.y][prim.x]; uint code_b = shared_clip_code[LocalInvocationID.y][prim.y]; uint code_c = shared_clip_code[LocalInvocationID.y][prim.z]; +#endif uint or_code = code_a | code_b | code_c; uint and_code = code_a & code_b & code_c; @@ -153,26 +226,47 @@ void meshlet_emit_primitive(uvec3 prim) if (!culled_planes) { - bool force_accept = (or_code & (CLIP_CODE_INACCURATE | CLIP_CODE_NEGATIVE_W)) != 0; + is_active_prim = (or_code & (CLIP_CODE_INACCURATE | CLIP_CODE_NEGATIVE_W)) != 0; - if (!force_accept) + if (!is_active_prim) { - vec2 a = shared_window_positions[LocalInvocationID.y][prim.x]; - vec2 b = shared_window_positions[LocalInvocationID.y][prim.y]; - vec2 c = shared_window_positions[LocalInvocationID.y][prim.z]; - force_accept = cull_triangle(a, b, c); - } - - if (force_accept) - { - is_active_prim = true; - atomicOr(shared_active_prim[LocalInvocationID.y], 1u << LocalInvocationID.x); - atomicOr(shared_active_vert[LocalInvocationID.y], 1u << prim.x); - atomicOr(shared_active_vert[LocalInvocationID.y], 1u << prim.y); - atomicOr(shared_active_vert[LocalInvocationID.y], 1u << prim.z); +#if CULL_MODE == CULL_MODE_GENERIC + vec2 window_a = shared_window_positions[LocalInvocationID.y][prim.x]; + vec2 window_b = shared_window_positions[LocalInvocationID.y][prim.y]; + vec2 window_c = shared_window_positions[LocalInvocationID.y][prim.z]; +#endif + is_active_prim = cull_triangle(window_a, window_b, window_c); } } + uint vert_mask = 0u; + if (is_active_prim) + vert_mask = (1u << prim.x) | (1u << prim.y) | (1u << prim.z); + +#if CULL_MODE != CULL_MODE_GENERIC + uvec4 prim_ballot = subgroupBallot(is_active_prim); + shared_active_prim_offset = subgroupBallotExclusiveBitCount(prim_ballot); + shared_active_vert_mask = subgroupOr(vert_mask); +#endif + +#if CULL_MODE == CULL_MODE_WG32 + shared_active_prim_count_total = subgroupBallotBitCount(prim_ballot); + shared_active_vert_count_total = bitCount(shared_active_vert_mask); +#elif CULL_MODE == CULL_MODE_WAVE32 + if (subgroupElect()) + { + shared_active_prim_count[gl_SubgroupID] = subgroupBallotBitCount(prim_ballot); + shared_active_vert_count[gl_SubgroupID] = bitCount(shared_active_vert_mask); + } +#else + if (is_active_prim) + { + atomicOr(shared_active_prim_mask[LocalInvocationID.y], 1u << LocalInvocationID.x); + atomicOr(shared_active_vert_mask[LocalInvocationID.y], vert_mask); + } +#endif + +#if CULL_MODE != CULL_MODE_WG32 barrier(); if (gl_LocalInvocationIndex == 0) @@ -182,10 +276,17 @@ void meshlet_emit_primitive(uvec3 prim) for (uint i = 0; i < gl_WorkGroupSize.y; i++) { +#if CULL_MODE == CULL_MODE_WAVE32 + uint prim_count = shared_active_prim_count[i]; + uint vert_count = shared_active_vert_count[i]; +#else + uint prim_count = bitCount(shared_active_prim_mask[i]); + uint vert_count = bitCount(shared_active_vert_mask[i]); +#endif shared_active_prim_count[i] = active_prim; shared_active_vert_count[i] = active_vert; - active_prim += bitCount(shared_active_prim[i]); - active_vert += bitCount(shared_active_vert[i]); + active_prim += prim_count; + active_vert += vert_count; } shared_active_prim_count_total = active_prim; @@ -193,6 +294,7 @@ void meshlet_emit_primitive(uvec3 prim) } barrier(); +#endif SetMeshOutputsEXT(shared_active_vert_count_total, shared_active_prim_count_total); diff --git a/tests/assets/shaders/meshlet_debug.mesh b/tests/assets/shaders/meshlet_debug.mesh index fc3464cf..0a2960df 100644 --- a/tests/assets/shaders/meshlet_debug.mesh +++ b/tests/assets/shaders/meshlet_debug.mesh @@ -1,6 +1,12 @@ #version 450 #extension GL_EXT_mesh_shader : require + #extension GL_KHR_shader_subgroup_basic : require +#if defined(MESHLET_PRIMITIVE_CULL_WAVE32) && MESHLET_PRIMITIVE_CULL_WAVE32 +#extension GL_KHR_shader_subgroup_shuffle : require +#extension GL_KHR_shader_subgroup_ballot : require +#extension GL_KHR_shader_subgroup_arithmetic : require +#endif #ifndef MESHLET_SIZE #error "Must define MESHLET_SIZE" @@ -10,7 +16,6 @@ layout(max_primitives = MESHLET_SIZE, max_vertices = MESHLET_SIZE, triangles) ou layout(local_size_x = 32, local_size_y_id = 0) in; #include "meshlet_payload_constants.h" -layout(constant_id = 0) const uint NUM_U32_STREAMS = MESHLET_PAYLOAD_MAX_STREAMS; #define MESHLET_PAYLOAD_DESCRIPTOR_SET 0 #define MESHLET_PAYLOAD_META_BINDING 0 @@ -101,8 +106,7 @@ void main() } } - meshlet_emit_clip_pos(clip_pos, viewport); - meshlet_emit_primitive(unpack8(decoded_index_buffer).xyz); + meshlet_emit_primitive(unpack8(decoded_index_buffer).xyz, clip_pos, viewport); if (linear_index < shared_active_prim_count_total) vDrawID[linear_index] = task.meshlet_index; diff --git a/tests/assets/shaders/meshlet_debug_plain.mesh b/tests/assets/shaders/meshlet_debug_plain.mesh index 8cf633a3..cc7e5585 100644 --- a/tests/assets/shaders/meshlet_debug_plain.mesh +++ b/tests/assets/shaders/meshlet_debug_plain.mesh @@ -4,6 +4,13 @@ #extension GL_EXT_shader_explicit_arithmetic_types_int8 : require #extension GL_EXT_scalar_block_layout : require +#extension GL_KHR_shader_subgroup_basic : require +#if defined(MESHLET_PRIMITIVE_CULL_WAVE32) && MESHLET_PRIMITIVE_CULL_WAVE32 +#extension GL_KHR_shader_subgroup_shuffle : require +#extension GL_KHR_shader_subgroup_ballot : require +#extension GL_KHR_shader_subgroup_arithmetic : require +#endif + #ifndef MESHLET_SIZE #error "Must define MESHLET_SIZE" #endif @@ -58,7 +65,6 @@ layout(set = 0, binding = 5, std430) readonly buffer Transforms void main() { - uint linear_index = gl_LocalInvocationIndex; uint compacted_meshlet_index = meshlet_get_meshlet_index(); uint base_chunk_index = meshlet_get_base_chunk_index(); @@ -69,6 +75,12 @@ void main() CompactedDrawInfo task = mesh_payload.infos[compacted_meshlet_index]; #endif +#if defined(MESHLET_PRIMITIVE_CULL_WAVE32) && MESHLET_PRIMITIVE_CULL_WAVE32 + uint linear_index = gl_SubgroupID * gl_SubgroupSize + gl_SubgroupInvocationID; +#else + uint linear_index = gl_LocalInvocationIndex; +#endif + IndirectDrawMesh meshlet = indirect_commands_mesh.draws[task.meshlet_index]; mat4 M = transforms.data[task.node_offset]; @@ -79,10 +91,8 @@ void main() meshlet_setup_local_invocation(gl_LocalInvocationID.xy); - meshlet_emit_clip_pos(clip_pos, viewport); - uvec3 prim = uvec3(ibo.data[meshlet.primitive_offset + linear_index + 32u * base_chunk_index]); - meshlet_emit_primitive(prim); + meshlet_emit_primitive(prim, clip_pos, viewport); if (linear_index < shared_active_prim_count_total) vDrawID[linear_index] = task.meshlet_index; diff --git a/tests/meshlet_viewer.cpp b/tests/meshlet_viewer.cpp index ac1f7655..661e0676 100644 --- a/tests/meshlet_viewer.cpp +++ b/tests/meshlet_viewer.cpp @@ -369,6 +369,7 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V uint32_t target_meshlet_workgroup_size = max(32u, device.get_device_features().mesh_shader_properties.maxPreferredMeshWorkGroupInvocations); + target_meshlet_workgroup_size = min(256u, target_meshlet_workgroup_size); target_meshlet_workgroup_size = 1u << Util::floor_log2(target_meshlet_workgroup_size); uint32_t num_chunk_workgroups = 256u / target_meshlet_workgroup_size; @@ -501,6 +502,9 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V const char *mesh_path = use_encoded ? "assets://shaders/meshlet_debug.mesh" : "assets://shaders/meshlet_debug_plain.mesh"; + bool supports_wave32 = device.supports_subgroup_size_log2(true, 5, 5, VK_SHADER_STAGE_MESH_BIT_EXT); + bool supports_wg32 = supports_wave32 && target_meshlet_workgroup_size == 32; + if (use_preculling) { cmd->set_program("", mesh_path, "assets://shaders/meshlet_debug.mesh.frag", @@ -513,7 +517,9 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V cmd->set_program("assets://shaders/meshlet_debug.task", mesh_path, "assets://shaders/meshlet_debug.mesh.frag", { { "MESHLET_SIZE", int(target_meshlet_workgroup_size) }, - { "MESHLET_RENDER_TASK_HIERARCHICAL", int(use_hierarchical) } }); + { "MESHLET_RENDER_TASK_HIERARCHICAL", int(use_hierarchical) }, + { "MESHLET_PRIMITIVE_CULL_WG32", int(supports_wg32) }, + { "MESHLET_PRIMITIVE_CULL_WAVE32", int(supports_wave32) } }); cmd->set_storage_buffer(0, 6, *aabb_buffer); cmd->set_storage_buffer(0, 7, *task_buffer); From 9d4176800df6f794428dc007cd2248e982aeeeea Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Sat, 23 Dec 2023 14:30:30 +0100 Subject: [PATCH 40/59] Add some statistics. --- assets/shaders/inc/meshlet_primitive_cull.h | 2 - tests/assets/shaders/meshlet_debug.mesh | 14 +++ tests/assets/shaders/meshlet_debug.mesh.frag | 8 +- tests/assets/shaders/meshlet_debug_plain.mesh | 14 +++ tests/meshlet_viewer.cpp | 97 ++++++++++++------- 5 files changed, 95 insertions(+), 40 deletions(-) diff --git a/assets/shaders/inc/meshlet_primitive_cull.h b/assets/shaders/inc/meshlet_primitive_cull.h index 22b57720..f8675e0a 100644 --- a/assets/shaders/inc/meshlet_primitive_cull.h +++ b/assets/shaders/inc/meshlet_primitive_cull.h @@ -1,8 +1,6 @@ #ifndef MESHLET_PRIMITIVE_CULL_H_ #define MESHLET_PRIMITIVE_CULL_H_ -#pragma optimize off - #define CULL_MODE_WG32 0 #define CULL_MODE_WAVE32 1 #define CULL_MODE_GENERIC 2 diff --git a/tests/assets/shaders/meshlet_debug.mesh b/tests/assets/shaders/meshlet_debug.mesh index 0a2960df..b5311c66 100644 --- a/tests/assets/shaders/meshlet_debug.mesh +++ b/tests/assets/shaders/meshlet_debug.mesh @@ -53,6 +53,13 @@ layout(set = 0, binding = 5, std430) readonly buffer Transforms mat4 data[]; } transforms; +layout(set = 0, binding = 10) buffer Stats +{ + uint invocations; + uint prim; + uint vert; +} stats; + void main() { uint linear_index = gl_LocalInvocationIndex; @@ -116,4 +123,11 @@ void main() gl_MeshVerticesEXT[out_vert_index].gl_Position = clip_pos; vWorldPos[out_vert_index] = world_pos; } + + if (gl_LocalInvocationIndex == 0) + { + atomicAdd(stats.invocations, gl_WorkGroupSize.x * gl_WorkGroupSize.y); + atomicAdd(stats.prim, shared_active_prim_count_total); + atomicAdd(stats.vert, shared_active_vert_count_total); + } } diff --git a/tests/assets/shaders/meshlet_debug.mesh.frag b/tests/assets/shaders/meshlet_debug.mesh.frag index a804f252..5cf99a06 100644 --- a/tests/assets/shaders/meshlet_debug.mesh.frag +++ b/tests/assets/shaders/meshlet_debug.mesh.frag @@ -19,11 +19,11 @@ void main() vec3 normal = normalize(cross(vWorldPos[1] - vWorldPos[0], vWorldPos[2] - vWorldPos[0])); - FragColor = 0.1 * (0.5 * normal + 0.5); + FragColor = 0.5 * (0.5 * normal + 0.5); FragColor.rg += 0.2 * highlight; uint hashed = vDrawID ^ (vDrawID * 23423465); - FragColor.r += 0.5 * float(hashed % 19) / 19.0; - FragColor.g += 0.5 * float(hashed % 29) / 29.0; - FragColor.b += 0.5 * float(hashed % 131) / 131.0; + FragColor.r += 0.05 * float(hashed % 19) / 19.0; + FragColor.g += 0.05 * float(hashed % 29) / 29.0; + FragColor.b += 0.05 * float(hashed % 131) / 131.0; } \ No newline at end of file diff --git a/tests/assets/shaders/meshlet_debug_plain.mesh b/tests/assets/shaders/meshlet_debug_plain.mesh index cc7e5585..73a241ed 100644 --- a/tests/assets/shaders/meshlet_debug_plain.mesh +++ b/tests/assets/shaders/meshlet_debug_plain.mesh @@ -63,6 +63,13 @@ layout(set = 0, binding = 5, std430) readonly buffer Transforms mat4 data[]; } transforms; +layout(set = 0, binding = 10) buffer Stats +{ + uint invocations; + uint prim; + uint vert; +} stats; + void main() { uint compacted_meshlet_index = meshlet_get_meshlet_index(); @@ -103,4 +110,11 @@ void main() gl_MeshVerticesEXT[out_vert_index].gl_Position = clip_pos; vWorldPos[out_vert_index] = world_pos; } + + if (gl_LocalInvocationIndex == 0) + { + atomicAdd(stats.invocations, gl_WorkGroupSize.x * gl_WorkGroupSize.y); + atomicAdd(stats.prim, shared_active_prim_count_total); + atomicAdd(stats.vert, shared_active_vert_count_total); + } } diff --git a/tests/meshlet_viewer.cpp b/tests/meshlet_viewer.cpp index 661e0676..8e7ce502 100644 --- a/tests/meshlet_viewer.cpp +++ b/tests/meshlet_viewer.cpp @@ -36,6 +36,8 @@ #include "render_context.hpp" #include "material_manager.hpp" #include "mesh_util.hpp" +#include "flat_renderer.hpp" +#include "ui_manager.hpp" #include "gltf.hpp" #include #include @@ -45,21 +47,6 @@ using namespace Granite; using namespace Vulkan; using namespace Vulkan::Meshlet; -static uint32_t style_to_u32_streams(MeshStyle style) -{ - switch (style) - { - case MeshStyle::Wireframe: - return 3; - case MeshStyle::Textured: - return 7; - case MeshStyle::Skinned: - return 9; - default: - return 0; - } -} - struct MeshletRenderable : AbstractRenderable { AssetID mesh; @@ -142,7 +129,7 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V auto &node_transform = nodeptr->get_transform(); node_transform.translation = node.transform.translation; node_transform.rotation = node.transform.rotation; - node_transform.scale = node.transform.scale; + node_transform.scale = node.transform.scale /** vec3(0.01f) */; nodes.push_back(std::move(nodeptr)); } @@ -173,15 +160,16 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V auto root = scene.create_node(); #if 1 - for (int z = -10; z <= 10; z++) - for (int y = -10; y <= 10; y++) - for (int x = -10; x <= 10; x++) + for (int z = -3; z <= 3; z++) + for (int y = -3; y <= 3; y++) + for (int x = -3; x <= 3; x++) { if (!x && !y && !z) continue; auto nodeptr = scene.create_node(); auto &node_transform = nodeptr->get_transform(); node_transform.translation = vec3(x, y, z) * 3.0f; + //node_transform.scale = vec3(0.01f); root->add_child(nodeptr); auto renderable = Util::make_handle(); @@ -228,6 +216,7 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V root->add_child(nodes[scene_node_index]); scene.set_root_node(std::move(root)); + camera.look_at(vec3(1.5f, 1.5f, 1.5f), vec3(0.0f)); EVENT_MANAGER_REGISTER_LATCH(MeshletViewerApplication, on_device_create, on_device_destroy, DeviceCreatedEvent); } @@ -314,6 +303,8 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V return; } + auto start_ts = cmd->write_timestamp(VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT); + BufferHandle task_buffer, cached_transform_buffer, aabb_buffer, compacted_params, indirect_draws; { @@ -345,9 +336,9 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V BufferHandle readback_counter, readback; { BufferCreateInfo info; - info.size = 4; + info.size = 12; info.usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT; - info.domain = BufferDomain::LinkedDeviceHost; + info.domain = BufferDomain::CachedHost; readback = device.create_buffer(info); info.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; @@ -367,8 +358,9 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V const bool use_meshlets = manager.get_mesh_encoding() != Vulkan::ResourceManager::MeshEncoding::VBOAndIBOMDI; const bool use_preculling = !use_meshlets; - uint32_t target_meshlet_workgroup_size = - max(32u, device.get_device_features().mesh_shader_properties.maxPreferredMeshWorkGroupInvocations); + //uint32_t target_meshlet_workgroup_size = + // max(32u, device.get_device_features().mesh_shader_properties.maxPreferredMeshWorkGroupInvocations); + uint32_t target_meshlet_workgroup_size = 32; target_meshlet_workgroup_size = min(256u, target_meshlet_workgroup_size); target_meshlet_workgroup_size = 1u << Util::floor_log2(target_meshlet_workgroup_size); @@ -438,6 +430,7 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V cmd->set_storage_buffer(0, 4, *indirect_draws); cmd->set_storage_buffer(0, 5, *compacted_params); cmd->set_storage_buffer(0, 6, *manager.get_cluster_bounds_buffer()); + cmd->set_storage_buffer(0, 10, *readback_counter); memcpy(cmd->allocate_typed_constant_data(0, 7, 6), render_context.get_visibility_frustum().get_planes(), 6 * sizeof(vec4)); @@ -498,6 +491,7 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V if (use_preculling) cmd->set_storage_buffer(0, 4, *compacted_params); cmd->set_storage_buffer(0, 5, *cached_transform_buffer); + cmd->set_storage_buffer(0, 10, *readback_counter); GRANITE_MATERIAL_MANAGER()->set_bindless(*cmd, 2); const char *mesh_path = use_encoded ? "assets://shaders/meshlet_debug.mesh" : "assets://shaders/meshlet_debug_plain.mesh"; @@ -559,7 +553,6 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V cmd->draw_mesh_tasks(to_dispatch, 1, 1); } } - cmd->end_render_pass(); } else { @@ -587,22 +580,58 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V 256, max_draws, sizeof(VkDrawIndexedIndirectCommand), *indirect_draws, 0); - - cmd->end_render_pass(); - cmd->barrier(VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT, 0, - VK_PIPELINE_STAGE_2_COPY_BIT, VK_ACCESS_TRANSFER_READ_BIT); - cmd->copy_buffer(*readback, 0, *indirect_draws, 0, sizeof(uint32_t)); - cmd->barrier(VK_PIPELINE_STAGE_2_COPY_BIT, VK_ACCESS_TRANSFER_WRITE_BIT, - VK_PIPELINE_STAGE_HOST_BIT, VK_ACCESS_HOST_READ_BIT); } + flat_renderer.begin(); + flat_renderer.render_quad(vec3(0.0f, 0.0f, 0.5f), vec2(600.0f, 140.0f), vec4(0.0f, 0.0f, 0.0f, 0.5f)); + char text[256]; + snprintf(text, sizeof(text), "Mesh shader invocations: %.3f M / %.3f M", 1e-6 * last_mesh_invocations, 1e-6 * double(max_draws * MaxElements)); + flat_renderer.render_text(GRANITE_UI_MANAGER()->get_font(UI::FontSize::Large), + text, vec3(10.0f, 10.0f, 0.0f), vec2(1000.0f)); + snprintf(text, sizeof(text), "Primitives: %.3f M", 1e-6 * last_prim); + flat_renderer.render_text(GRANITE_UI_MANAGER()->get_font(UI::FontSize::Large), + text, vec3(10.0f, 50.0f, 0.0f), vec2(1000.0f)); + snprintf(text, sizeof(text), "Vertices: %.3f M", 1e-6 * last_vert); + flat_renderer.render_text(GRANITE_UI_MANAGER()->get_font(UI::FontSize::Large), + text, vec3(10.0f, 90.0f, 0.0f), vec2(1000.0f)); + flat_renderer.flush(*cmd, vec3(0.0f), vec3(cmd->get_viewport().width, cmd->get_viewport().height, 1.0f)); + + cmd->end_render_pass(); + + auto end_ts = cmd->write_timestamp(VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT); + device.register_time_interval("GPU", std::move(start_ts), std::move(end_ts), "Render"); + + cmd->barrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT, + VK_PIPELINE_STAGE_2_COPY_BIT, VK_ACCESS_TRANSFER_READ_BIT); + cmd->copy_buffer(*readback, *readback_counter); + cmd->barrier(VK_PIPELINE_STAGE_2_COPY_BIT, VK_ACCESS_TRANSFER_WRITE_BIT, + VK_PIPELINE_STAGE_HOST_BIT, VK_ACCESS_HOST_READ_BIT); + Fence fence; device.submit(cmd, &fence); - //fence->wait(); - //LOGI("Number of draws: %u\n", - // *static_cast(device.map_host_buffer(*readback, MEMORY_ACCESS_READ_BIT))); + + readback_ring[readback_index] = std::move(readback); + readback_fence[readback_index] = std::move(fence); + readback_index = (readback_index + 1) & 3; + + if (readback_fence[readback_index]) + { + readback_fence[readback_index]->wait(); + auto *mapped = static_cast(device.map_host_buffer(*readback_ring[readback_index], MEMORY_ACCESS_READ_BIT)); + last_mesh_invocations = mapped[0]; + last_prim = mapped[1]; + last_vert = mapped[2]; + } } + BufferHandle readback_ring[4]; + Fence readback_fence[4]; + unsigned readback_index = 0; + unsigned last_mesh_invocations = 0; + unsigned last_prim = 0; + unsigned last_vert = 0; + FlatRenderer flat_renderer; + void message(const std::string &tag, uint32_t code, uint32_t x, uint32_t y, uint32_t z, uint32_t, const Word *words) override { From b68ac93ef2cf149af5474c5734f9605314288cec Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Sun, 24 Dec 2023 12:57:45 +0100 Subject: [PATCH 41/59] Add path for classic direct rendering as well. --- assets/shaders/decode/meshlet_decode.comp | 2 +- tests/assets/shaders/meshlet_debug.frag | 23 ++- tests/assets/shaders/meshlet_debug.mesh.frag | 8 +- tests/assets/shaders/meshlet_debug.vert | 30 ++- tests/meshlet_viewer.cpp | 194 +++++++++++++------ vulkan/managers/resource_manager.cpp | 111 ++++++----- vulkan/managers/resource_manager.hpp | 22 ++- 7 files changed, 249 insertions(+), 141 deletions(-) diff --git a/assets/shaders/decode/meshlet_decode.comp b/assets/shaders/decode/meshlet_decode.comp index 0ce23779..a9dbdf4a 100644 --- a/assets/shaders/decode/meshlet_decode.comp +++ b/assets/shaders/decode/meshlet_decode.comp @@ -166,7 +166,7 @@ void main() indices += chunk_info.vertex_offset; if (UNROLLED_MESH) - output_indices32.data[primitive_output_offset + lane_index] = indices + meta.base_vertex_offset + registers.vertex_offset; + output_indices32.data[primitive_output_offset + lane_index] = indices + meta.base_vertex_offset; else output_indices8.data[primitive_output_offset + lane_index] = u8vec3(indices); } diff --git a/tests/assets/shaders/meshlet_debug.frag b/tests/assets/shaders/meshlet_debug.frag index e19200e6..a23559c2 100644 --- a/tests/assets/shaders/meshlet_debug.frag +++ b/tests/assets/shaders/meshlet_debug.frag @@ -3,12 +3,25 @@ #extension GL_EXT_fragment_shader_barycentric : require layout(location = 0) pervertexEXT in vec3 vWorldPos[]; + +#if !SINGLE_INSTANCE_RENDER layout(location = 1) flat in uint vDrawID; +#else +struct CompactedDrawInfo { uint meshlet_index; uint node_offset; uint material_index; }; +layout(push_constant) uniform Registers +{ + CompactedDrawInfo draw; +} registers; +#endif layout(location = 0) out vec3 FragColor; void main() { +#if SINGLE_INSTANCE_RENDER + uint vDrawID = registers.draw.meshlet_index; +#endif + vec3 dd = fwidth(gl_BaryCoordEXT); float d = max(max(dd.x, dd.y), dd.z); float l = min(min(gl_BaryCoordEXT.x, gl_BaryCoordEXT.y), gl_BaryCoordEXT.z); @@ -18,11 +31,11 @@ void main() vec3 normal = normalize(cross(vWorldPos[1] - vWorldPos[0], vWorldPos[2] - vWorldPos[0])); - FragColor = 0.1 * (0.5 * normal + 0.5); - FragColor.rg += 0.2 * highlight; + FragColor = 0.5 * (0.5 * normal + 0.5); + FragColor.rg += 0.05 * highlight; uint hashed = vDrawID ^ (vDrawID * 23423465); - FragColor.r += 0.5 * float(hashed % 19) / 19.0; - FragColor.g += 0.5 * float(hashed % 29) / 29.0; - FragColor.b += 0.5 * float(hashed % 131) / 131.0; + FragColor.r += 0.01 * float(hashed % 19) / 19.0; + FragColor.g += 0.01 * float(hashed % 29) / 29.0; + FragColor.b += 0.01 * float(hashed % 131) / 131.0; } diff --git a/tests/assets/shaders/meshlet_debug.mesh.frag b/tests/assets/shaders/meshlet_debug.mesh.frag index 5cf99a06..b785c964 100644 --- a/tests/assets/shaders/meshlet_debug.mesh.frag +++ b/tests/assets/shaders/meshlet_debug.mesh.frag @@ -20,10 +20,10 @@ void main() vec3 normal = normalize(cross(vWorldPos[1] - vWorldPos[0], vWorldPos[2] - vWorldPos[0])); FragColor = 0.5 * (0.5 * normal + 0.5); - FragColor.rg += 0.2 * highlight; + FragColor.rg += 0.05 * highlight; uint hashed = vDrawID ^ (vDrawID * 23423465); - FragColor.r += 0.05 * float(hashed % 19) / 19.0; - FragColor.g += 0.05 * float(hashed % 29) / 29.0; - FragColor.b += 0.05 * float(hashed % 131) / 131.0; + FragColor.r += 0.02 * float(hashed % 19) / 19.0; + FragColor.g += 0.02 * float(hashed % 29) / 29.0; + FragColor.b += 0.02 * float(hashed % 131) / 131.0; } \ No newline at end of file diff --git a/tests/assets/shaders/meshlet_debug.vert b/tests/assets/shaders/meshlet_debug.vert index 303b1917..5ba65b65 100644 --- a/tests/assets/shaders/meshlet_debug.vert +++ b/tests/assets/shaders/meshlet_debug.vert @@ -4,19 +4,8 @@ #include "meshlet_render_types.h" layout(location = 0) in vec3 POS; -#if 0 -layout(location = 1) in mediump vec3 N; -layout(location = 2) in mediump vec4 T; -layout(location = 3) in vec2 UV; -#endif - -#if 0 -layout(location = 0) out mediump vec3 vNormal; -layout(location = 1) out mediump vec4 vTangent; -layout(location = 2) out vec2 vUV; -layout(location = 3) flat out uint MaterialOffset; -#else layout(location = 0) out vec3 vWorldPos; +#if !SINGLE_INSTANCE_RENDER layout(location = 1) flat out uint vDrawID; #endif @@ -25,6 +14,12 @@ layout(set = 1, binding = 0) uniform UBO mat4 VP; }; +#if SINGLE_INSTANCE_RENDER +layout(set = 1, binding = 1) uniform DrawParameters +{ + mat4 M; +}; +#else layout(set = 0, binding = 0) readonly buffer DrawParameters { CompactedDrawInfo data[]; @@ -34,19 +29,18 @@ layout(set = 0, binding = 1) readonly buffer Transforms { mat4 data[]; } transforms; +#endif void main() { +#if !SINGLE_INSTANCE_RENDER mat4 M = transforms.data[draw_info.data[gl_DrawIDARB].node_offset]; +#endif vec3 world_pos = (M * vec4(POS, 1.0)).xyz; vWorldPos = world_pos; +#if !SINGLE_INSTANCE_RENDER vDrawID = draw_info.data[gl_DrawIDARB].meshlet_index; +#endif gl_Position = VP * vec4(world_pos, 1.0); -#if 0 - vNormal = mat3(M) * N; - vTangent = vec4(mat3(M) * T.xyz, T.w); - vUV = UV; - MaterialOffset = bitfieldExtract(draw_info.data[gl_DrawIDARB].node_count_material_offset, 8, 24); -#endif } diff --git a/tests/meshlet_viewer.cpp b/tests/meshlet_viewer.cpp index 8e7ce502..ac1e1b6e 100644 --- a/tests/meshlet_viewer.cpp +++ b/tests/meshlet_viewer.cpp @@ -160,9 +160,9 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V auto root = scene.create_node(); #if 1 - for (int z = -3; z <= 3; z++) - for (int y = -3; y <= 3; y++) - for (int x = -3; x <= 3; x++) + for (int z = -10; z <= 10; z++) + for (int y = -10; y <= 10; y++) + for (int x = -10; x <= 10; x++) { if (!x && !y && !z) continue; @@ -175,7 +175,7 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V auto renderable = Util::make_handle(); renderable->mesh = mesh_assets.front(); renderable->aabb = parser.get_meshes()[0].static_aabb; - renderable->flags |= RENDERABLE_FORCE_VISIBLE_BIT; + //renderable->flags |= RENDERABLE_FORCE_VISIBLE_BIT; scene.create_renderable(std::move(renderable), nodeptr.get()); } #endif @@ -254,6 +254,8 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V list.clear(); scene.gather_visible_opaque_renderables(render_context.get_visibility_frustum(), list); + bool indirect_rendering = device.get_resource_manager().get_mesh_encoding() != ResourceManager::MeshEncoding::Classic; + struct TaskParameters { uint32_t aabb_instance; @@ -272,41 +274,45 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V std::vector task_params; uint32_t max_draws = 0; - for (auto &vis : list) + if (indirect_rendering) { - auto *meshlet = static_cast(vis.renderable); - auto range = device.get_resource_manager().get_mesh_draw_range(meshlet->mesh); + for (auto &vis: list) + { + auto *meshlet = static_cast(vis.renderable); + auto range = device.get_resource_manager().get_mesh_draw_range(meshlet->mesh); - TaskParameters draw = {}; - draw.aabb_instance = vis.transform->aabb.offset; - auto *node = vis.transform->scene_node; - auto *skin = node->get_skin(); - draw.node_instance = skin ? skin->transform.offset : node->transform.offset; - draw.node_count_material_index = skin ? skin->transform.count : 1; - draw.node_count_material_index |= meshlet->material.texture_offset << 8; - assert((range.offset & 31) == 0); + TaskParameters draw = {}; + draw.aabb_instance = vis.transform->aabb.offset; + auto *node = vis.transform->scene_node; + auto *skin = node->get_skin(); + draw.node_instance = skin ? skin->transform.offset : node->transform.offset; + draw.node_count_material_index = skin ? skin->transform.count : 1; + draw.node_count_material_index |= meshlet->material.texture_offset << 8; + assert((range.meshlet.offset & 31) == 0); - max_draws += range.count; + max_draws += range.meshlet.count; - for (uint32_t i = 0; i < range.count; i += 32) - { - draw.mesh_index_count = range.offset + i + (std::min(range.count - i, 32u) - 1); - task_params.push_back(draw); + for (uint32_t i = 0; i < range.meshlet.count; i += 32) + { + draw.mesh_index_count = range.meshlet.offset + i + (std::min(range.meshlet.count - i, 32u) - 1); + task_params.push_back(draw); + } } - } - if (task_params.empty()) - { - cmd->begin_render_pass(device.get_swapchain_render_pass(SwapchainRenderPass::Depth)); - cmd->end_render_pass(); - device.submit(cmd); - return; + if (task_params.empty()) + { + cmd->begin_render_pass(device.get_swapchain_render_pass(SwapchainRenderPass::Depth)); + cmd->end_render_pass(); + device.submit(cmd); + return; + } } auto start_ts = cmd->write_timestamp(VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT); BufferHandle task_buffer, cached_transform_buffer, aabb_buffer, compacted_params, indirect_draws; + if (indirect_rendering) { BufferCreateInfo info; info.size = task_params.size() * sizeof(task_params.front()); @@ -315,6 +321,7 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V task_buffer = device.create_buffer(info, task_params.data()); } + if (indirect_rendering) { BufferCreateInfo info; info.size = scene.get_transforms().get_count() * sizeof(*scene.get_transforms().get_cached_transforms()); @@ -323,6 +330,7 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V cached_transform_buffer = device.create_buffer(info, scene.get_transforms().get_cached_transforms()); } + if (indirect_rendering) { BufferCreateInfo info; info.size = scene.get_aabbs().get_count() * sizeof(*scene.get_aabbs().get_aabbs()); @@ -334,6 +342,7 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V auto &manager = device.get_resource_manager(); BufferHandle readback_counter, readback; + if (indirect_rendering) { BufferCreateInfo info; info.size = 12; @@ -355,8 +364,8 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V } push; push.camera_pos = render_context.get_render_parameters().camera_position; - const bool use_meshlets = manager.get_mesh_encoding() != Vulkan::ResourceManager::MeshEncoding::VBOAndIBOMDI; - const bool use_preculling = !use_meshlets; + const bool use_meshlets = indirect_rendering && manager.get_mesh_encoding() != ResourceManager::MeshEncoding::VBOAndIBOMDI; + const bool use_preculling = !use_meshlets && indirect_rendering; //uint32_t target_meshlet_workgroup_size = // max(32u, device.get_device_features().mesh_shader_properties.maxPreferredMeshWorkGroupInvocations); @@ -403,13 +412,10 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V VK_ACCESS_2_SHADER_STORAGE_READ_BIT | VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT); - { - BufferCreateInfo info; - info.size = max_draws * sizeof(DrawParameters); - info.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; - info.domain = BufferDomain::Device; - compacted_params = device.create_buffer(info); - } + info.size = max_draws * sizeof(DrawParameters); + info.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; + info.domain = BufferDomain::Device; + compacted_params = device.create_buffer(info); } if (use_preculling) @@ -554,7 +560,7 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V } } } - else + else if (manager.get_mesh_encoding() == ResourceManager::MeshEncoding::VBOAndIBOMDI) { cmd->begin_render_pass(device.get_swapchain_render_pass(SwapchainRenderPass::Depth)); camera.set_aspect(cmd->get_viewport().width / cmd->get_viewport().height); @@ -562,7 +568,8 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V *cmd->allocate_typed_constant_data(1, 0, 1) = render_context.get_render_parameters().view_projection; - cmd->set_program("assets://shaders/meshlet_debug.vert", "assets://shaders/meshlet_debug.frag"); + cmd->set_program("assets://shaders/meshlet_debug.vert", "assets://shaders/meshlet_debug.frag", + {{ "SINGLE_INSTANCE_RENDER", 0}}); cmd->set_index_buffer(*ibo, 0, VK_INDEX_TYPE_UINT8_EXT); cmd->set_vertex_binding(0, *pos, 0, 12); cmd->set_vertex_binding(1, *attr, 0, 16); @@ -581,46 +588,111 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V sizeof(VkDrawIndexedIndirectCommand), *indirect_draws, 0); } + else + { + cmd->begin_render_pass(device.get_swapchain_render_pass(SwapchainRenderPass::Depth)); + camera.set_aspect(cmd->get_viewport().width / cmd->get_viewport().height); + cmd->set_opaque_state(); + + *cmd->allocate_typed_constant_data(1, 0, 1) = render_context.get_render_parameters().view_projection; + + cmd->set_program("assets://shaders/meshlet_debug.vert", "assets://shaders/meshlet_debug.frag", + {{ "SINGLE_INSTANCE_RENDER", 1}}); + cmd->set_index_buffer(*ibo, 0, VK_INDEX_TYPE_UINT32); + cmd->set_vertex_binding(0, *pos, 0, 12); + cmd->set_vertex_binding(1, *attr, 0, 16); + cmd->set_vertex_attrib(0, 0, VK_FORMAT_R32G32B32_SFLOAT, 0); + cmd->set_vertex_attrib(1, 1, VK_FORMAT_A2B10G10R10_SNORM_PACK32, 0); + cmd->set_vertex_attrib(2, 1, VK_FORMAT_A2B10G10R10_SNORM_PACK32, 4); + cmd->set_vertex_attrib(3, 1, VK_FORMAT_R32G32_SFLOAT, 8); + + cmd->set_sampler(0, 2, StockSampler::DefaultGeometryFilterWrap); + GRANITE_MATERIAL_MANAGER()->set_bindless(*cmd, 2); + + last_mesh_invocations = 0; + last_vert = 0; + last_prim = 0; + for (auto &draw : list) + { + auto *render = static_cast(draw.renderable); + auto indexed = manager.get_mesh_draw_range(render->mesh).indexed; + + *cmd->allocate_typed_constant_data(1, 1, 1) = draw.transform->get_world_transform(); + + DrawParameters params = {}; + params.meshlet_index = unsigned(&draw - list.data()); + params.node_count = 1; + params.node_instance = 0; + cmd->push_constants(¶ms, 0, sizeof(params)); + + last_mesh_invocations += indexed.indexCount / 3; + + cmd->draw_indexed(indexed.indexCount, indexed.instanceCount, + indexed.firstIndex, indexed.vertexOffset, + indexed.firstInstance); + } + } flat_renderer.begin(); flat_renderer.render_quad(vec3(0.0f, 0.0f, 0.5f), vec2(600.0f, 140.0f), vec4(0.0f, 0.0f, 0.0f, 0.5f)); char text[256]; - snprintf(text, sizeof(text), "Mesh shader invocations: %.3f M / %.3f M", 1e-6 * last_mesh_invocations, 1e-6 * double(max_draws * MaxElements)); + + if (indirect_rendering) + { + snprintf(text, sizeof(text), "Mesh shader invocations: %.3f M / %.3f M", 1e-6 * last_mesh_invocations, + 1e-6 * double(max_draws * MaxElements)); + } + else + { + snprintf(text, sizeof(text), "Direct primitives: %.3f M", 1e-6 * last_mesh_invocations); + } + flat_renderer.render_text(GRANITE_UI_MANAGER()->get_font(UI::FontSize::Large), text, vec3(10.0f, 10.0f, 0.0f), vec2(1000.0f)); - snprintf(text, sizeof(text), "Primitives: %.3f M", 1e-6 * last_prim); - flat_renderer.render_text(GRANITE_UI_MANAGER()->get_font(UI::FontSize::Large), - text, vec3(10.0f, 50.0f, 0.0f), vec2(1000.0f)); - snprintf(text, sizeof(text), "Vertices: %.3f M", 1e-6 * last_vert); - flat_renderer.render_text(GRANITE_UI_MANAGER()->get_font(UI::FontSize::Large), - text, vec3(10.0f, 90.0f, 0.0f), vec2(1000.0f)); - flat_renderer.flush(*cmd, vec3(0.0f), vec3(cmd->get_viewport().width, cmd->get_viewport().height, 1.0f)); + if (indirect_rendering) + { + snprintf(text, sizeof(text), "Primitives: %.3f M", 1e-6 * last_prim); + flat_renderer.render_text(GRANITE_UI_MANAGER()->get_font(UI::FontSize::Large), + text, vec3(10.0f, 50.0f, 0.0f), vec2(1000.0f)); + snprintf(text, sizeof(text), "Vertices: %.3f M", 1e-6 * last_vert); + flat_renderer.render_text(GRANITE_UI_MANAGER()->get_font(UI::FontSize::Large), + text, vec3(10.0f, 90.0f, 0.0f), vec2(1000.0f)); + } + + flat_renderer.flush(*cmd, vec3(0.0f), vec3(cmd->get_viewport().width, cmd->get_viewport().height, 1.0f)); cmd->end_render_pass(); auto end_ts = cmd->write_timestamp(VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT); device.register_time_interval("GPU", std::move(start_ts), std::move(end_ts), "Render"); - cmd->barrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT, - VK_PIPELINE_STAGE_2_COPY_BIT, VK_ACCESS_TRANSFER_READ_BIT); - cmd->copy_buffer(*readback, *readback_counter); - cmd->barrier(VK_PIPELINE_STAGE_2_COPY_BIT, VK_ACCESS_TRANSFER_WRITE_BIT, - VK_PIPELINE_STAGE_HOST_BIT, VK_ACCESS_HOST_READ_BIT); + if (indirect_rendering) + { + cmd->barrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT, + VK_PIPELINE_STAGE_2_COPY_BIT, VK_ACCESS_TRANSFER_READ_BIT); + cmd->copy_buffer(*readback, *readback_counter); + cmd->barrier(VK_PIPELINE_STAGE_2_COPY_BIT, VK_ACCESS_TRANSFER_WRITE_BIT, + VK_PIPELINE_STAGE_HOST_BIT, VK_ACCESS_HOST_READ_BIT); + } Fence fence; device.submit(cmd, &fence); - readback_ring[readback_index] = std::move(readback); - readback_fence[readback_index] = std::move(fence); - readback_index = (readback_index + 1) & 3; - - if (readback_fence[readback_index]) + if (indirect_rendering) { - readback_fence[readback_index]->wait(); - auto *mapped = static_cast(device.map_host_buffer(*readback_ring[readback_index], MEMORY_ACCESS_READ_BIT)); - last_mesh_invocations = mapped[0]; - last_prim = mapped[1]; - last_vert = mapped[2]; + readback_ring[readback_index] = std::move(readback); + readback_fence[readback_index] = std::move(fence); + readback_index = (readback_index + 1) & 3; + + if (readback_fence[readback_index]) + { + readback_fence[readback_index]->wait(); + auto *mapped = static_cast(device.map_host_buffer(*readback_ring[readback_index], + MEMORY_ACCESS_READ_BIT)); + last_mesh_invocations = mapped[0]; + last_prim = mapped[1]; + last_vert = mapped[2]; + } } } diff --git a/vulkan/managers/resource_manager.cpp b/vulkan/managers/resource_manager.cpp index 28332ab7..930a25ab 100644 --- a/vulkan/managers/resource_manager.cpp +++ b/vulkan/managers/resource_manager.cpp @@ -43,18 +43,6 @@ ResourceManager::ResourceManager(Device *device_) , mesh_stream_allocator(*device_, 8, 17) , mesh_payload_allocator(*device_, 32, 17) { - // Simplified style. - index_buffer_allocator.set_element_size(0, 3); // 8-bit indices. - attribute_buffer_allocator.set_soa_count(3); - attribute_buffer_allocator.set_element_size(0, sizeof(float) * 3); - attribute_buffer_allocator.set_element_size(1, sizeof(float) * 2 + sizeof(uint32_t) * 2); - attribute_buffer_allocator.set_element_size(2, sizeof(uint32_t) * 2); - indirect_buffer_allocator.set_element_size(0, sizeof(VkDrawIndexedIndirectCommand)); - - mesh_header_allocator.set_element_size(0, sizeof(Meshlet::RuntimeHeader)); - mesh_stream_allocator.set_element_size(0, sizeof(Meshlet::Stream)); - mesh_payload_allocator.set_element_size(0, sizeof(Meshlet::PayloadB128)); - assets.reserve(Granite::AssetID::MaxIDs); } @@ -175,18 +163,47 @@ void ResourceManager::init() opaque.domain = BufferDomain::Device; opaque.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; - if (device->get_device_features().mesh_shader_features.taskShader && + if (false && device->get_device_features().mesh_shader_features.taskShader && device->get_device_features().mesh_shader_features.meshShader) { mesh_encoding = MeshEncoding::MeshletEncoded; LOGI("Opting in to meshlet path.\n"); } - if (mesh_encoding == MeshEncoding::MeshletDecoded) - indirect_buffer_allocator.set_element_size(0, sizeof(Meshlet::RuntimeHeaderDecoded)); + if (mesh_encoding != MeshEncoding::MeshletEncoded) + { + unsigned index_size = mesh_encoding == MeshEncoding::Classic ? sizeof(uint32_t) : sizeof(uint8_t); + index_buffer_allocator.set_element_size(0, 3 * index_size); // 8-bit or 32-bit indices. + attribute_buffer_allocator.set_soa_count(3); + attribute_buffer_allocator.set_element_size(0, sizeof(float) * 3); + attribute_buffer_allocator.set_element_size(1, sizeof(float) * 2 + sizeof(uint32_t) * 2); + attribute_buffer_allocator.set_element_size(2, sizeof(uint32_t) * 2); - if (mesh_encoding == MeshEncoding::MeshletEncoded) + opaque.usage = VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; + index_buffer_allocator.prime(&opaque); + opaque.usage = VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; + attribute_buffer_allocator.prime(&opaque); + + if (mesh_encoding != MeshEncoding::Classic) + { + auto element_size = mesh_encoding == MeshEncoding::MeshletDecoded ? + sizeof(Meshlet::RuntimeHeaderDecoded) : sizeof(VkDrawIndexedIndirectCommand); + + indirect_buffer_allocator.set_soa_count(2); + indirect_buffer_allocator.set_element_size(0, element_size); + indirect_buffer_allocator.set_element_size(1, sizeof(Meshlet::Bound)); + + opaque.usage = VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | + VK_BUFFER_USAGE_TRANSFER_DST_BIT; + indirect_buffer_allocator.prime(&opaque); + } + } + else { + mesh_header_allocator.set_element_size(0, sizeof(Meshlet::RuntimeHeader)); + mesh_stream_allocator.set_element_size(0, sizeof(Meshlet::Stream)); + mesh_payload_allocator.set_element_size(0, sizeof(Meshlet::PayloadB128)); + mesh_header_allocator.set_soa_count(2); mesh_header_allocator.set_element_size(1, sizeof(Meshlet::Bound)); @@ -195,19 +212,6 @@ void ResourceManager::init() mesh_stream_allocator.prime(&opaque); mesh_payload_allocator.prime(&opaque); } - else - { - indirect_buffer_allocator.set_soa_count(2); - indirect_buffer_allocator.set_element_size(1, sizeof(Meshlet::Bound)); - - opaque.usage = VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; - index_buffer_allocator.prime(&opaque); - opaque.usage = VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; - attribute_buffer_allocator.prime(&opaque); - opaque.usage = VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | - VK_BUFFER_USAGE_TRANSFER_DST_BIT; - indirect_buffer_allocator.prime(&opaque); - } } ImageHandle ResourceManager::create_gtx(const MemoryMappedTexture &mapped_file, Granite::AssetID id) @@ -406,14 +410,27 @@ bool ResourceManager::allocate_asset_mesh(Granite::AssetID id, const Meshlet::Me if (ret) ret = attribute_buffer_allocator.allocate(view.total_vertices, &asset.mesh.attr_or_stream); } - if (ret) + + if (ret && mesh_encoding != MeshEncoding::Classic) ret = indirect_buffer_allocator.allocate(view.format_header->meshlet_count, &asset.mesh.indirect_or_header); } - asset.mesh.draw = { - asset.mesh.indirect_or_header.offset, - view.format_header->meshlet_count, - }; + if (mesh_encoding == MeshEncoding::Classic) + { + asset.mesh.draw.indexed = { + view.total_primitives * 3, 1, + asset.mesh.index_or_payload.offset, + int32_t(asset.mesh.attr_or_stream.offset), 0, + }; + } + else + { + asset.mesh.draw.meshlet = { + asset.mesh.indirect_or_header.offset, + view.format_header->meshlet_count, + view.format_header->style, + }; + } if (!ret) { @@ -519,13 +536,14 @@ void ResourceManager::instantiate_asset_mesh(Granite::AssetManager &manager_, Meshlet::DecodeInfo info = {}; info.target_style = Meshlet::MeshStyle::Textured; + if (mesh_encoding == MeshEncoding::Classic) + info.flags |= Meshlet::DECODE_MODE_UNROLLED_MESH; info.ibo = index_buffer_allocator.get_buffer(0, 0); for (unsigned i = 0; i < 3; i++) info.streams[i] = attribute_buffer_allocator.get_buffer(0, i); info.payload = payload.get(); - info.indirect = indirect_buffer_allocator.get_buffer(0, 0); info.push.meshlet_offset = asset.mesh.indirect_or_header.offset; info.push.primitive_offset = asset.mesh.index_or_payload.offset; @@ -534,11 +552,15 @@ void ResourceManager::instantiate_asset_mesh(Granite::AssetManager &manager_, info.runtime_style = mesh_encoding == MeshEncoding::MeshletDecoded ? Meshlet::RuntimeStyle::Meshlet : Meshlet::RuntimeStyle::MDI; - auto *bounds = static_cast( - cmd->update_buffer(*indirect_buffer_allocator.get_buffer(0, 1), - asset.mesh.indirect_or_header.offset * sizeof(Meshlet::Bound), - view.format_header->meshlet_count * sizeof(Meshlet::Bound))); - memcpy(bounds, view.bounds, view.format_header->meshlet_count * sizeof(Meshlet::Bound)); + if (mesh_encoding != MeshEncoding::Classic) + { + info.indirect = indirect_buffer_allocator.get_buffer(0, 0); + auto *bounds = static_cast( + cmd->update_buffer(*indirect_buffer_allocator.get_buffer(0, 1), + asset.mesh.indirect_or_header.offset * sizeof(Meshlet::Bound), + view.format_header->meshlet_count * sizeof(Meshlet::Bound))); + memcpy(bounds, view.bounds, view.format_header->meshlet_count * sizeof(Meshlet::Bound)); + } Meshlet::decode_mesh(*cmd, info, view); @@ -569,11 +591,12 @@ void ResourceManager::instantiate_asset_mesh(Granite::AssetManager &manager_, cost += view.total_vertices * attribute_buffer_allocator.get_element_size(0); cost += view.total_vertices * attribute_buffer_allocator.get_element_size(1); cost += view.total_vertices * attribute_buffer_allocator.get_element_size(2); - cost += view.format_header->meshlet_count * indirect_buffer_allocator.get_element_size(0); - cost += view.format_header->meshlet_count * indirect_buffer_allocator.get_element_size(1); + if (mesh_encoding != MeshEncoding::Classic) + { + cost += view.format_header->meshlet_count * indirect_buffer_allocator.get_element_size(0); + cost += view.format_header->meshlet_count * indirect_buffer_allocator.get_element_size(1); + } } - - asset.mesh.draw.style = view.format_header->style; } std::lock_guard holder{lock}; diff --git a/vulkan/managers/resource_manager.hpp b/vulkan/managers/resource_manager.hpp index c482aeb3..3a44112a 100644 --- a/vulkan/managers/resource_manager.hpp +++ b/vulkan/managers/resource_manager.hpp @@ -87,6 +87,7 @@ class ResourceManager final : private Granite::AssetInstantiatorInterface MeshletEncoded, MeshletDecoded, VBOAndIBOMDI, + Classic }; inline const Vulkan::ImageView *get_image_view(Granite::AssetID id) const @@ -101,13 +102,18 @@ class ResourceManager final : private Granite::AssetInstantiatorInterface struct DrawRange { - uint32_t offset = 0; - uint32_t count = 0; - uint32_t bounds_offset = 0; - Meshlet::MeshStyle style = Meshlet::MeshStyle::Wireframe; + uint32_t offset; + uint32_t count; + Meshlet::MeshStyle style; }; - inline DrawRange get_mesh_draw_range(Granite::AssetID id) const + union DrawCall + { + DrawRange meshlet; + VkDrawIndexedIndirectCommand indexed; + }; + + inline DrawCall get_mesh_draw_range(Granite::AssetID id) const { if (id.id < draws.size()) return draws[id.id]; @@ -150,7 +156,7 @@ class ResourceManager final : private Granite::AssetInstantiatorInterface struct { Util::AllocatedSlice index_or_payload, attr_or_stream, indirect_or_header; - DrawRange draw; + DrawCall draw; } mesh; Granite::AssetClass asset_class = Granite::AssetClass::ImageZeroable; bool latchable = false; @@ -161,7 +167,7 @@ class ResourceManager final : private Granite::AssetInstantiatorInterface std::vector assets; std::vector views; - std::vector draws; + std::vector draws; std::vector updates; ImageHandle fallback_color; @@ -186,7 +192,7 @@ class ResourceManager final : private Granite::AssetInstantiatorInterface MeshBufferAllocator mesh_stream_allocator; MeshBufferAllocator mesh_payload_allocator; - MeshEncoding mesh_encoding = MeshEncoding::VBOAndIBOMDI; + MeshEncoding mesh_encoding = MeshEncoding::Classic; bool allocate_asset_mesh(Granite::AssetID id, const Meshlet::MeshView &view); }; From f4a47645dcb5e9ced1d4c71874423f55bf1d4565 Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Sun, 24 Dec 2023 13:20:34 +0100 Subject: [PATCH 42/59] Add more metrics and overrides. --- tests/meshlet_viewer.cpp | 84 ++++++++++++++++++---------- vulkan/managers/resource_manager.cpp | 16 +++++- 2 files changed, 69 insertions(+), 31 deletions(-) diff --git a/tests/meshlet_viewer.cpp b/tests/meshlet_viewer.cpp index ac1e1b6e..ee9681eb 100644 --- a/tests/meshlet_viewer.cpp +++ b/tests/meshlet_viewer.cpp @@ -340,21 +340,8 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V } auto &manager = device.get_resource_manager(); - - BufferHandle readback_counter, readback; - if (indirect_rendering) - { - BufferCreateInfo info; - info.size = 12; - info.usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT; - info.domain = BufferDomain::CachedHost; - readback = device.create_buffer(info); - - info.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; - info.domain = BufferDomain::Device; - info.misc = BUFFER_MISC_ZERO_INITIALIZE_BIT; - readback_counter = device.create_buffer(info); - } + const bool use_meshlets = indirect_rendering && manager.get_mesh_encoding() != ResourceManager::MeshEncoding::VBOAndIBOMDI; + const bool use_preculling = !use_meshlets && indirect_rendering; struct { @@ -364,8 +351,6 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V } push; push.camera_pos = render_context.get_render_parameters().camera_position; - const bool use_meshlets = indirect_rendering && manager.get_mesh_encoding() != ResourceManager::MeshEncoding::VBOAndIBOMDI; - const bool use_preculling = !use_meshlets && indirect_rendering; //uint32_t target_meshlet_workgroup_size = // max(32u, device.get_device_features().mesh_shader_properties.maxPreferredMeshWorkGroupInvocations); @@ -418,6 +403,24 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V compacted_params = device.create_buffer(info); } + BufferHandle readback_counter, readback; + if (indirect_rendering) + { + BufferCreateInfo info; + info.size = use_meshlets ? 12 : indirect_draws->get_create_info().size; + info.usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT; + info.domain = BufferDomain::CachedHost; + readback = device.create_buffer(info); + + if (use_meshlets) + { + info.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; + info.domain = BufferDomain::Device; + info.misc = BUFFER_MISC_ZERO_INITIALIZE_BIT; + readback_counter = device.create_buffer(info); + } + } + if (use_preculling) { auto *indirect = manager.get_indirect_buffer(); @@ -436,7 +439,6 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V cmd->set_storage_buffer(0, 4, *indirect_draws); cmd->set_storage_buffer(0, 5, *compacted_params); cmd->set_storage_buffer(0, 6, *manager.get_cluster_bounds_buffer()); - cmd->set_storage_buffer(0, 10, *readback_counter); memcpy(cmd->allocate_typed_constant_data(0, 7, 6), render_context.get_visibility_frustum().get_planes(), 6 * sizeof(vec4)); @@ -634,30 +636,37 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V } flat_renderer.begin(); - flat_renderer.render_quad(vec3(0.0f, 0.0f, 0.5f), vec2(600.0f, 140.0f), vec4(0.0f, 0.0f, 0.0f, 0.5f)); + flat_renderer.render_quad(vec3(0.0f, 0.0f, 0.5f), + vec2(350.0f, 80.0f), + vec4(0.0f, 0.0f, 0.0f, 0.8f)); char text[256]; - if (indirect_rendering) + if (use_meshlets) { snprintf(text, sizeof(text), "Mesh shader invocations: %.3f M / %.3f M", 1e-6 * last_mesh_invocations, 1e-6 * double(max_draws * MaxElements)); } + else if (indirect_rendering) + { + snprintf(text, sizeof(text), "MDI primitives: %.3f M / %.3f M", 1e-6 * last_mesh_invocations, + 1e-6 * double(max_draws * MaxElements)); + } else { snprintf(text, sizeof(text), "Direct primitives: %.3f M", 1e-6 * last_mesh_invocations); } - flat_renderer.render_text(GRANITE_UI_MANAGER()->get_font(UI::FontSize::Large), + flat_renderer.render_text(GRANITE_UI_MANAGER()->get_font(UI::FontSize::Normal), text, vec3(10.0f, 10.0f, 0.0f), vec2(1000.0f)); - if (indirect_rendering) + if (use_meshlets) { snprintf(text, sizeof(text), "Primitives: %.3f M", 1e-6 * last_prim); - flat_renderer.render_text(GRANITE_UI_MANAGER()->get_font(UI::FontSize::Large), - text, vec3(10.0f, 50.0f, 0.0f), vec2(1000.0f)); + flat_renderer.render_text(GRANITE_UI_MANAGER()->get_font(UI::FontSize::Normal), + text, vec3(10.0f, 30.0f, 0.0f), vec2(1000.0f)); snprintf(text, sizeof(text), "Vertices: %.3f M", 1e-6 * last_vert); - flat_renderer.render_text(GRANITE_UI_MANAGER()->get_font(UI::FontSize::Large), - text, vec3(10.0f, 90.0f, 0.0f), vec2(1000.0f)); + flat_renderer.render_text(GRANITE_UI_MANAGER()->get_font(UI::FontSize::Normal), + text, vec3(10.0f, 50.0f, 0.0f), vec2(1000.0f)); } flat_renderer.flush(*cmd, vec3(0.0f), vec3(cmd->get_viewport().width, cmd->get_viewport().height, 1.0f)); @@ -670,7 +679,10 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V { cmd->barrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT, VK_PIPELINE_STAGE_2_COPY_BIT, VK_ACCESS_TRANSFER_READ_BIT); - cmd->copy_buffer(*readback, *readback_counter); + if (use_meshlets) + cmd->copy_buffer(*readback, *readback_counter); + else + cmd->copy_buffer(*readback, *indirect_draws); cmd->barrier(VK_PIPELINE_STAGE_2_COPY_BIT, VK_ACCESS_TRANSFER_WRITE_BIT, VK_PIPELINE_STAGE_HOST_BIT, VK_ACCESS_HOST_READ_BIT); } @@ -689,9 +701,21 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V readback_fence[readback_index]->wait(); auto *mapped = static_cast(device.map_host_buffer(*readback_ring[readback_index], MEMORY_ACCESS_READ_BIT)); - last_mesh_invocations = mapped[0]; - last_prim = mapped[1]; - last_vert = mapped[2]; + + if (use_meshlets) + { + last_mesh_invocations = mapped[0]; + last_prim = mapped[1]; + last_vert = mapped[2]; + } + else + { + last_mesh_invocations = 0; + uint32_t draws = mapped[0]; + mapped += 256 / sizeof(uint32_t); + for (uint32_t i = 0; i < draws; i++, mapped += sizeof(VkDrawIndexedIndirectCommand) / sizeof(uint32_t)) + last_mesh_invocations += mapped[0] / 3; + } } } } diff --git a/vulkan/managers/resource_manager.cpp b/vulkan/managers/resource_manager.cpp index 930a25ab..4dc5e7b2 100644 --- a/vulkan/managers/resource_manager.cpp +++ b/vulkan/managers/resource_manager.cpp @@ -163,13 +163,27 @@ void ResourceManager::init() opaque.domain = BufferDomain::Device; opaque.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; - if (false && device->get_device_features().mesh_shader_features.taskShader && + if (device->get_device_features().mesh_shader_features.taskShader && device->get_device_features().mesh_shader_features.meshShader) { mesh_encoding = MeshEncoding::MeshletEncoded; LOGI("Opting in to meshlet path.\n"); } + if (const char *env = getenv("GRANITE_MESH_ENCODING")) + { + if (strcmp(env, "encoded") == 0) + mesh_encoding = MeshEncoding::MeshletEncoded; + else if (strcmp(env, "decoded") == 0) + mesh_encoding = MeshEncoding::MeshletDecoded; + else if (strcmp(env, "mdi") == 0) + mesh_encoding = MeshEncoding::VBOAndIBOMDI; + else if (strcmp(env, "classic") == 0) + mesh_encoding = MeshEncoding::Classic; + else + LOGE("Unknown encoding: %s\n", env); + } + if (mesh_encoding != MeshEncoding::MeshletEncoded) { unsigned index_size = mesh_encoding == MeshEncoding::Classic ? sizeof(uint32_t) : sizeof(uint8_t); From cccc3605ff57922f80a02e467e59dd21113d3be6 Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Sun, 24 Dec 2023 13:56:49 +0100 Subject: [PATCH 43/59] Report export statistics per stream. --- scene-export/meshlet_export.cpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/scene-export/meshlet_export.cpp b/scene-export/meshlet_export.cpp index 09293826..c2c88206 100644 --- a/scene-export/meshlet_export.cpp +++ b/scene-export/meshlet_export.cpp @@ -514,6 +514,8 @@ static void encode_mesh(Encoded &encoded, mesh.meshlets.reserve(num_full_meshlets); uint32_t base_vertex_offset = 0; + uint32_t stream_payload_count[MaxStreams] = {}; + for (uint32_t full_meshlet_index = 0; full_meshlet_index < num_full_meshlets; full_meshlet_index++) { Metadata out_meshlet = {}; @@ -542,7 +544,12 @@ static void encode_mesh(Encoded &encoded, offsets.attr_offset = num_attributes; offsets.prim_offset = num_primitives; + auto start_count = encoded.payload.size(); encode_index_stream(encoded.payload, index_stream_buffer); + auto end_count = encoded.payload.size(); + + stream_payload_count[int(StreamType::Primitive)] += end_count - start_count; + num_primitives += meshlet.primitive_count; num_attributes += meshlet.vertex_count; } @@ -563,6 +570,7 @@ static void encode_mesh(Encoded &encoded, stream.aux = p_aux[stream_index]; stream.offset_in_b128 = uint32_t(encoded.payload.size()); + uint32_t start_count = encoded.payload.size(); for (uint32_t chunk_index = 0; chunk_index < num_chunks; chunk_index++) { auto &meshlet = meshlets[full_meshlet_index * NumChunks + chunk_index]; @@ -579,10 +587,16 @@ static void encode_mesh(Encoded &encoded, break; } } + uint32_t end_count = encoded.payload.size(); + stream_payload_count[stream_index] += end_count - start_count; } mesh.meshlets.push_back(out_meshlet); } + + for (unsigned i = 0; i < MaxStreams; i++) + if (stream_payload_count[i]) + LOGI("Stream %u: %zu bytes.\n", i, stream_payload_count[i] * sizeof(PayloadB128)); } static bool export_encoded_mesh(const std::string &path, const Encoded &encoded) From b2ff0c4c6465e3375d6384cd7f0a32d0b615d105 Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Sun, 24 Dec 2023 13:57:15 +0100 Subject: [PATCH 44/59] Scale positions directly instead of using nodes in obj-to-gltf. --- scene-export/obj.cpp | 4 ++-- scene-export/obj.hpp | 2 +- tools/obj_to_gltf.cpp | 6 ++---- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/scene-export/obj.cpp b/scene-export/obj.cpp index 7af375bb..93ee60a1 100644 --- a/scene-export/obj.cpp +++ b/scene-export/obj.cpp @@ -455,7 +455,7 @@ void Parser::load_material_library(const std::string &path) emit_gltf_base_color(base_color, alpha_mask); } -Parser::Parser(const std::string &path) +Parser::Parser(const std::string &path, float position_scale) { std::string obj; if (!GRANITE_FILESYSTEM()->read_file_to_string(path, obj)) @@ -477,7 +477,7 @@ Parser::Parser(const std::string &path) if (ident == "mtllib") load_material_library(Path::relpath(path, elements.at(1))); else if (ident == "v") - positions.push_back(vec3(stof(elements.at(1)), stof(elements.at(2)), stof(elements.at(3)))); + positions.push_back(position_scale * vec3(stof(elements.at(1)), stof(elements.at(2)), stof(elements.at(3)))); else if (ident == "vn") normals.push_back(vec3(stof(elements.at(1)), stof(elements.at(2)), stof(elements.at(3)))); else if (ident == "vt") diff --git a/scene-export/obj.hpp b/scene-export/obj.hpp index 65b2233b..42685f19 100644 --- a/scene-export/obj.hpp +++ b/scene-export/obj.hpp @@ -35,7 +35,7 @@ using namespace Granite::SceneFormats; class Parser { public: - explicit Parser(const std::string &path); + Parser(const std::string &path, float position_scale = 1.0f); const std::vector &get_meshes() const { diff --git a/tools/obj_to_gltf.cpp b/tools/obj_to_gltf.cpp index 63405296..6057f06f 100644 --- a/tools/obj_to_gltf.cpp +++ b/tools/obj_to_gltf.cpp @@ -62,16 +62,14 @@ int main(int argc, char *argv[]) return 1; } - OBJ::Parser parser(args.input); + OBJ::Parser parser(args.input, args.scale); SceneFormats::SceneInformation info; info.materials = parser.get_materials(); info.meshes = parser.get_meshes(); SceneFormats::ExportOptions options; - std::vector nodes = parser.get_nodes(); - nodes.front().transform.scale = vec3(args.scale); - info.nodes = nodes; + info.nodes = parser.get_nodes(); if (!SceneFormats::export_scene_to_glb(info, args.output, options)) { From f4ab98ee3858dadb8e9fdc95b8bad295b10bf06d Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Sun, 24 Dec 2023 13:57:32 +0100 Subject: [PATCH 45/59] Report more stats in UI. --- tests/meshlet_viewer.cpp | 41 ++++++++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/tests/meshlet_viewer.cpp b/tests/meshlet_viewer.cpp index ee9681eb..b50ab914 100644 --- a/tests/meshlet_viewer.cpp +++ b/tests/meshlet_viewer.cpp @@ -129,7 +129,7 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V auto &node_transform = nodeptr->get_transform(); node_transform.translation = node.transform.translation; node_transform.rotation = node.transform.rotation; - node_transform.scale = node.transform.scale /** vec3(0.01f) */; + node_transform.scale = node.transform.scale; nodes.push_back(std::move(nodeptr)); } @@ -169,7 +169,6 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V auto nodeptr = scene.create_node(); auto &node_transform = nodeptr->get_transform(); node_transform.translation = vec3(x, y, z) * 3.0f; - //node_transform.scale = vec3(0.01f); root->add_child(nodeptr); auto renderable = Util::make_handle(); @@ -216,7 +215,7 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V root->add_child(nodes[scene_node_index]); scene.set_root_node(std::move(root)); - camera.look_at(vec3(1.5f, 1.5f, 1.5f), vec3(0.0f)); + camera.look_at(vec3(0, 0, 50), vec3(0)); EVENT_MANAGER_REGISTER_LATCH(MeshletViewerApplication, on_device_create, on_device_destroy, DeviceCreatedEvent); } @@ -352,11 +351,11 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V push.camera_pos = render_context.get_render_parameters().camera_position; - //uint32_t target_meshlet_workgroup_size = - // max(32u, device.get_device_features().mesh_shader_properties.maxPreferredMeshWorkGroupInvocations); uint32_t target_meshlet_workgroup_size = 32; + if (const char *env = getenv("MESHLET_SIZE")) + target_meshlet_workgroup_size = strtoul(env, nullptr, 0); - target_meshlet_workgroup_size = min(256u, target_meshlet_workgroup_size); + target_meshlet_workgroup_size = max(32u, min(256u, target_meshlet_workgroup_size)); target_meshlet_workgroup_size = 1u << Util::floor_log2(target_meshlet_workgroup_size); uint32_t num_chunk_workgroups = 256u / target_meshlet_workgroup_size; @@ -637,10 +636,32 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V flat_renderer.begin(); flat_renderer.render_quad(vec3(0.0f, 0.0f, 0.5f), - vec2(350.0f, 80.0f), + vec2(350.0f, 100.0f), vec4(0.0f, 0.0f, 0.0f, 0.8f)); char text[256]; + switch (manager.get_mesh_encoding()) + { + case ResourceManager::MeshEncoding::MeshletEncoded: + snprintf(text, sizeof(text), "Meshlet (%u prim/vert) | Inline Decoding", target_meshlet_workgroup_size); + break; + + case ResourceManager::MeshEncoding::MeshletDecoded: + snprintf(text, sizeof(text), "Meshlet (%u prim/vert) | VBO Fetch", target_meshlet_workgroup_size); + break; + + case ResourceManager::MeshEncoding::VBOAndIBOMDI: + strcpy(text, "MultiDrawIndirect"); + break; + + default: + strcpy(text, "Classic Direct Draw"); + break; + } + + flat_renderer.render_text(GRANITE_UI_MANAGER()->get_font(UI::FontSize::Normal), + text, vec3(10.0f, 10.0f, 0.0f), vec2(1000.0f)); + if (use_meshlets) { snprintf(text, sizeof(text), "Mesh shader invocations: %.3f M / %.3f M", 1e-6 * last_mesh_invocations, @@ -657,16 +678,16 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V } flat_renderer.render_text(GRANITE_UI_MANAGER()->get_font(UI::FontSize::Normal), - text, vec3(10.0f, 10.0f, 0.0f), vec2(1000.0f)); + text, vec3(10.0f, 30.0f, 0.0f), vec2(1000.0f)); if (use_meshlets) { snprintf(text, sizeof(text), "Primitives: %.3f M", 1e-6 * last_prim); flat_renderer.render_text(GRANITE_UI_MANAGER()->get_font(UI::FontSize::Normal), - text, vec3(10.0f, 30.0f, 0.0f), vec2(1000.0f)); + text, vec3(10.0f, 50.0f, 0.0f), vec2(1000.0f)); snprintf(text, sizeof(text), "Vertices: %.3f M", 1e-6 * last_vert); flat_renderer.render_text(GRANITE_UI_MANAGER()->get_font(UI::FontSize::Normal), - text, vec3(10.0f, 50.0f, 0.0f), vec2(1000.0f)); + text, vec3(10.0f, 70.0f, 0.0f), vec2(1000.0f)); } flat_renderer.flush(*cmd, vec3(0.0f), vec3(cmd->get_viewport().width, cmd->get_viewport().height, 1.0f)); From ab8d272d60503bee94e821630a09b53591ca7e2b Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Sun, 24 Dec 2023 14:08:23 +0100 Subject: [PATCH 46/59] Report timestamps directly in UI. --- tests/meshlet_viewer.cpp | 37 +++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/tests/meshlet_viewer.cpp b/tests/meshlet_viewer.cpp index b50ab914..3a35be6f 100644 --- a/tests/meshlet_viewer.cpp +++ b/tests/meshlet_viewer.cpp @@ -69,7 +69,7 @@ struct MeshletRenderable : AbstractRenderable } }; -struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, Vulkan::DebugChannelInterface +struct MeshletViewerApplication : Granite::Application, Granite::EventHandler //, Vulkan::DebugChannelInterface { explicit MeshletViewerApplication(const char *path) { @@ -643,19 +643,21 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V switch (manager.get_mesh_encoding()) { case ResourceManager::MeshEncoding::MeshletEncoded: - snprintf(text, sizeof(text), "Meshlet (%u prim/vert) | Inline Decoding", target_meshlet_workgroup_size); + snprintf(text, sizeof(text), "%.3f ms | Meshlet (%u prim/vert) | Inline Decoding", + last_frame_time * 1e3, target_meshlet_workgroup_size); break; case ResourceManager::MeshEncoding::MeshletDecoded: - snprintf(text, sizeof(text), "Meshlet (%u prim/vert) | VBO Fetch", target_meshlet_workgroup_size); + snprintf(text, sizeof(text), "%.3f ms | Meshlet (%u prim/vert) | VBO Fetch", + last_frame_time * 1e3, target_meshlet_workgroup_size); break; case ResourceManager::MeshEncoding::VBOAndIBOMDI: - strcpy(text, "MultiDrawIndirect"); + snprintf(text, sizeof(text), "%.3f ms | MultiDrawIndirect", last_frame_time * 1e3); break; default: - strcpy(text, "Classic Direct Draw"); + snprintf(text, sizeof(text), "%.3f ms | Classic Direct Draw", last_frame_time * 1e3); break; } @@ -694,7 +696,6 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V cmd->end_render_pass(); auto end_ts = cmd->write_timestamp(VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT); - device.register_time_interval("GPU", std::move(start_ts), std::move(end_ts), "Render"); if (indirect_rendering) { @@ -711,12 +712,22 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V Fence fence; device.submit(cmd, &fence); - if (indirect_rendering) + start_timestamps[readback_index] = std::move(start_ts); + end_timestamps[readback_index] = std::move(end_ts); + readback_ring[readback_index] = std::move(readback); + readback_fence[readback_index] = std::move(fence); + readback_index = (readback_index + 1) & 3; + + if (start_timestamps[readback_index] && start_timestamps[readback_index]->is_signalled() && + end_timestamps[readback_index] && end_timestamps[readback_index]->is_signalled()) { - readback_ring[readback_index] = std::move(readback); - readback_fence[readback_index] = std::move(fence); - readback_index = (readback_index + 1) & 3; + last_frame_time = device.convert_device_timestamp_delta( + start_timestamps[readback_index]->get_timestamp_ticks(), + end_timestamps[readback_index]->get_timestamp_ticks()); + } + if (indirect_rendering) + { if (readback_fence[readback_index]) { readback_fence[readback_index]->wait(); @@ -747,8 +758,13 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V unsigned last_mesh_invocations = 0; unsigned last_prim = 0; unsigned last_vert = 0; + double last_frame_time = 0.0; FlatRenderer flat_renderer; + QueryPoolHandle start_timestamps[4]; + QueryPoolHandle end_timestamps[4]; + +#if 0 void message(const std::string &tag, uint32_t code, uint32_t x, uint32_t y, uint32_t z, uint32_t, const Word *words) override { @@ -757,6 +773,7 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler, V LOGI("%.3f %.3f %.3f %.3f\n", words[0].f32, words[1].f32, words[2].f32, words[3].f32); } +#endif }; namespace Granite From 895e6b6c2aae47c5e8dc38687bd483f7c55d37eb Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Sun, 24 Dec 2023 14:53:07 +0100 Subject: [PATCH 47/59] Add more CLI overrides. --- tests/meshlet_viewer.cpp | 48 +++++++++++++++++++++++++++++++--------- 1 file changed, 37 insertions(+), 11 deletions(-) diff --git a/tests/meshlet_viewer.cpp b/tests/meshlet_viewer.cpp index 3a35be6f..2c731837 100644 --- a/tests/meshlet_viewer.cpp +++ b/tests/meshlet_viewer.cpp @@ -39,6 +39,7 @@ #include "flat_renderer.hpp" #include "ui_manager.hpp" #include "gltf.hpp" +#include "cli_parser.hpp" #include #include #include @@ -160,9 +161,9 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler // auto root = scene.create_node(); #if 1 - for (int z = -10; z <= 10; z++) - for (int y = -10; y <= 10; y++) - for (int x = -10; x <= 10; x++) + for (int z = -6; z <= 6; z++) + for (int y = -6; y <= 6; y++) + for (int x = -6; x <= 6; x++) { if (!x && !y && !z) continue; @@ -215,7 +216,7 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler // root->add_child(nodes[scene_node_index]); scene.set_root_node(std::move(root)); - camera.look_at(vec3(0, 0, 50), vec3(0)); + camera.look_at(vec3(0, 0, 30), vec3(0)); EVENT_MANAGER_REGISTER_LATCH(MeshletViewerApplication, on_device_create, on_device_destroy, DeviceCreatedEvent); } @@ -340,7 +341,10 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler // auto &manager = device.get_resource_manager(); const bool use_meshlets = indirect_rendering && manager.get_mesh_encoding() != ResourceManager::MeshEncoding::VBOAndIBOMDI; - const bool use_preculling = !use_meshlets && indirect_rendering; + bool use_preculling = !use_meshlets && indirect_rendering; + + if (const char *env = getenv("PRECULL")) + use_preculling = indirect_rendering && strtoul(env, nullptr, 0) != 0; struct { @@ -457,6 +461,9 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler // auto *pos = manager.get_position_buffer(); auto *attr = manager.get_attribute_buffer(); + bool supports_wave32 = device.supports_subgroup_size_log2(true, 5, 5, VK_SHADER_STAGE_MESH_BIT_EXT); + bool use_hierarchical = device.get_device_features().driver_id != VK_DRIVER_ID_NVIDIA_PROPRIETARY; + if (use_meshlets) { cmd->begin_render_pass(device.get_swapchain_render_pass(SwapchainRenderPass::Depth)); @@ -473,7 +480,6 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler // 0.5f * cmd->get_viewport().width, 0.5f * cmd->get_viewport().height) - vec4(1.0f, 1.0f, 0.0f, 0.0f); - bool use_hierarchical = device.get_device_features().driver_id != VK_DRIVER_ID_NVIDIA_PROPRIETARY; bool use_encoded = manager.get_mesh_encoding() == Vulkan::ResourceManager::MeshEncoding::MeshletEncoded; cmd->set_specialization_constant_mask(3); @@ -503,7 +509,11 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler // const char *mesh_path = use_encoded ? "assets://shaders/meshlet_debug.mesh" : "assets://shaders/meshlet_debug_plain.mesh"; - bool supports_wave32 = device.supports_subgroup_size_log2(true, 5, 5, VK_SHADER_STAGE_MESH_BIT_EXT); + if (const char *env = getenv("WAVE32")) + supports_wave32 = strtoul(env, nullptr, 0) != 0; + if (const char *hier = getenv("HIER_TASK")) + use_hierarchical = strtoul(hier, nullptr, 0) != 0; + bool supports_wg32 = supports_wave32 && target_meshlet_workgroup_size == 32; if (use_preculling) @@ -636,7 +646,7 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler // flat_renderer.begin(); flat_renderer.render_quad(vec3(0.0f, 0.0f, 0.5f), - vec2(350.0f, 100.0f), + vec2(450.0f, 120.0f), vec4(0.0f, 0.0f, 0.0f, 0.8f)); char text[256]; @@ -682,14 +692,19 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler // flat_renderer.render_text(GRANITE_UI_MANAGER()->get_font(UI::FontSize::Normal), text, vec3(10.0f, 30.0f, 0.0f), vec2(1000.0f)); + snprintf(text, sizeof(text), "ComputeCull %d | mesh wave32 %d | task hier %d", + int(use_preculling), int(supports_wave32), int(use_hierarchical)); + flat_renderer.render_text(GRANITE_UI_MANAGER()->get_font(UI::FontSize::Normal), + text, vec3(10.0f, 50.0f, 0.0f), vec2(1000.0f)); + if (use_meshlets) { snprintf(text, sizeof(text), "Primitives: %.3f M", 1e-6 * last_prim); flat_renderer.render_text(GRANITE_UI_MANAGER()->get_font(UI::FontSize::Normal), - text, vec3(10.0f, 50.0f, 0.0f), vec2(1000.0f)); + text, vec3(10.0f, 70.0f, 0.0f), vec2(1000.0f)); snprintf(text, sizeof(text), "Vertices: %.3f M", 1e-6 * last_vert); flat_renderer.render_text(GRANITE_UI_MANAGER()->get_font(UI::FontSize::Normal), - text, vec3(10.0f, 70.0f, 0.0f), vec2(1000.0f)); + text, vec3(10.0f, 90.0f, 0.0f), vec2(1000.0f)); } flat_renderer.flush(*cmd, vec3(0.0f), vec3(cmd->get_viewport().width, cmd->get_viewport().height, 1.0f)); @@ -782,7 +797,18 @@ Application *application_create(int argc, char **argv) { GRANITE_APPLICATION_SETUP_FILESYSTEM(); - if (argc != 2) + const char *path = nullptr; + + Util::CLICallbacks cbs; + cbs.add("--size", [](Util::CLIParser &parser) { setenv("MESHLET_SIZE", parser.next_string(), 1); }); + cbs.add("--encoding", [](Util::CLIParser &parser) { setenv("MESHLET_SIZE", parser.next_string(), 1); }); + cbs.add("--hier-task", [](Util::CLIParser &parser) { setenv("HIER_TASK", parser.next_string(), 1); }); + cbs.add("--wave32", [](Util::CLIParser &parser) { setenv("WAVE32", parser.next_string(), 1); }); + cbs.add("--precull", [](Util::CLIParser &parser) { setenv("PRECULL", parser.next_string(), 1); }); + cbs.default_handler = [&](const char *arg) { path = arg; }; + + Util::CLIParser parser(std::move(cbs), argc - 1, argv + 1); + if (!parser.parse() || parser.is_ended_state() || !path) { LOGE("Usage: meshlet-viewer path.msh1\n"); return nullptr; From e4edcbb5508d7c5ce3522bfb6ab673ac4e749210 Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Sun, 24 Dec 2023 15:36:33 +0100 Subject: [PATCH 48/59] Fix CLI override. --- tests/meshlet_viewer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/meshlet_viewer.cpp b/tests/meshlet_viewer.cpp index 2c731837..88fdba66 100644 --- a/tests/meshlet_viewer.cpp +++ b/tests/meshlet_viewer.cpp @@ -801,7 +801,7 @@ Application *application_create(int argc, char **argv) Util::CLICallbacks cbs; cbs.add("--size", [](Util::CLIParser &parser) { setenv("MESHLET_SIZE", parser.next_string(), 1); }); - cbs.add("--encoding", [](Util::CLIParser &parser) { setenv("MESHLET_SIZE", parser.next_string(), 1); }); + cbs.add("--encoding", [](Util::CLIParser &parser) { setenv("GRANITE_MESH_ENCODING", parser.next_string(), 1); }); cbs.add("--hier-task", [](Util::CLIParser &parser) { setenv("HIER_TASK", parser.next_string(), 1); }); cbs.add("--wave32", [](Util::CLIParser &parser) { setenv("WAVE32", parser.next_string(), 1); }); cbs.add("--precull", [](Util::CLIParser &parser) { setenv("PRECULL", parser.next_string(), 1); }); From 5d698679e790afc2a93a88603c777f1f162e8936 Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Mon, 25 Dec 2023 12:18:00 +0100 Subject: [PATCH 49/59] Hoist out environment handling to helper util. --- application/global/global_managers.cpp | 4 +- filesystem/filesystem.cpp | 25 +++---- renderer/post/aa.cpp | 16 +---- tests/meshlet_viewer.cpp | 24 +++---- tests/sampler_precision.cpp | 6 +- threading/thread_group.cpp | 8 ++- util/CMakeLists.txt | 1 + util/environment.cpp | 96 ++++++++++++++++++++++++++ util/environment.hpp | 35 ++++++++++ vulkan/context.cpp | 35 ++++------ vulkan/managers/resource_manager.cpp | 14 ++-- vulkan/wsi.cpp | 19 ++--- 12 files changed, 193 insertions(+), 90 deletions(-) create mode 100644 util/environment.cpp create mode 100644 util/environment.hpp diff --git a/application/global/global_managers.cpp b/application/global/global_managers.cpp index 64e54330..a5515ecb 100644 --- a/application/global/global_managers.cpp +++ b/application/global/global_managers.cpp @@ -21,6 +21,7 @@ */ #include "global_managers.hpp" +#include "environment.hpp" #include "logging.hpp" #include #include @@ -219,8 +220,7 @@ void init(Factory &factory, ManagerFeatureFlags flags, unsigned max_threads, flo if (cpu_threads > max_threads) cpu_threads = max_threads; - if (const char *env = getenv("GRANITE_NUM_WORKER_THREADS")) - cpu_threads = strtoul(env, nullptr, 0); + cpu_threads = Util::get_environment_uint("GRANITE_NUM_WORKER_THREADS", cpu_threads); unsigned background_cpu_threads = (cpu_threads + 1) / 2; diff --git a/filesystem/filesystem.cpp b/filesystem/filesystem.cpp index b740e23e..e0125718 100644 --- a/filesystem/filesystem.cpp +++ b/filesystem/filesystem.cpp @@ -26,6 +26,7 @@ #include "logging.hpp" #include "os_filesystem.hpp" #include "string_helpers.hpp" +#include "environment.hpp" #include #include #include @@ -72,28 +73,28 @@ Filesystem::Filesystem() register_protocol("file", std::unique_ptr(new OSFilesystem("."))); register_protocol("memory", std::unique_ptr(new ScratchFilesystem)); - const char *asset_dir = getenv("GRANITE_DEFAULT_ASSET_DIRECTORY"); #ifdef GRANITE_DEFAULT_ASSET_DIRECTORY - if (!asset_dir) - asset_dir = GRANITE_DEFAULT_ASSET_DIRECTORY; + auto asset_dir = Util::get_environment_string("GRANITE_DEFAULT_ASSET_DIRECTORY", GRANITE_DEFAULT_ASSET_DIRECTORY); +#else + auto asset_dir = Util::get_environment_string("GRANITE_DEFAULT_ASSET_DIRECTORY", ""); #endif - if (asset_dir) + if (!asset_dir.empty()) register_protocol("builtin", std::unique_ptr(new OSFilesystem(asset_dir))); - const char *builtin_dir = getenv("GRANITE_DEFAULT_BUILTIN_DIRECTORY"); #ifdef GRANITE_DEFAULT_BUILTIN_DIRECTORY - if (!builtin_dir) - builtin_dir = GRANITE_DEFAULT_BUILTIN_DIRECTORY; + auto builtin_dir = Util::get_environment_string("GRANITE_DEFAULT_BUILTIN_DIRECTORY", GRANITE_DEFAULT_BUILTIN_DIRECTORY); +#else + auto builtin_dir = Util::get_environment_string("GRANITE_DEFAULT_BUILTIN_DIRECTORY", ""); #endif - if (builtin_dir) + if (!builtin_dir.empty()) register_protocol("builtin", std::unique_ptr(new OSFilesystem(builtin_dir))); - const char *cache_dir = getenv("GRANITE_DEFAULT_CACHE_DIRECTORY"); #ifdef GRANITE_DEFAULT_CACHE_DIRECTORY - if (!cache_dir) - cache_dir = GRANITE_DEFAULT_CACHE_DIRECTORY; + auto cache_dir = Util::get_environment_string("GRANITE_DEFAULT_CACHE_DIRECTORY", GRANITE_DEFAULT_CACHE_DIRECTORY); +#else + auto cache_dir = Util::get_environment_string("GRANITE_DEFAULT_CACHE_DIRECTORY", ""); #endif - if (cache_dir) + if (!cache_dir.empty()) register_protocol("cache", std::unique_ptr(new OSFilesystem(cache_dir))); } diff --git a/renderer/post/aa.cpp b/renderer/post/aa.cpp index 9f406f8d..26b80780 100644 --- a/renderer/post/aa.cpp +++ b/renderer/post/aa.cpp @@ -25,6 +25,7 @@ #include "fxaa.hpp" #include "smaa.hpp" #include "muglm/muglm_impl.hpp" +#include "environment.hpp" #include namespace Granite @@ -115,20 +116,7 @@ bool setup_after_post_chain_upscaling(RenderGraph &graph, const std::string &inp const char *frag = "builtin://shaders/post/ffx-fsr/upscale.frag"; bool fp16 = cmd.get_device().get_device_features().vk12_features.shaderFloat16; - const char *fsr_fp16 = getenv("FIDELITYFX_FSR_FP16"); - if (fsr_fp16) - { - fp16 = strtoul(fsr_fp16, nullptr, 0) != 0; - static bool logged; - if (!logged) - { - if (fp16) - LOGI("Forcing FP16 for FidelityFX FSR path.\n"); - else - LOGI("Forcing FP32 for FidelityFX FSR path.\n"); - logged = true; - } - } + fp16 = Util::get_environment_bool("FIDELITYFX_FSR_FP16", fp16); Vulkan::CommandBufferUtil::draw_fullscreen_quad(cmd, vert, frag, {{ "TARGET_SRGB", srgb ? 1 : 0 }, diff --git a/tests/meshlet_viewer.cpp b/tests/meshlet_viewer.cpp index 88fdba66..ea66409a 100644 --- a/tests/meshlet_viewer.cpp +++ b/tests/meshlet_viewer.cpp @@ -40,6 +40,7 @@ #include "ui_manager.hpp" #include "gltf.hpp" #include "cli_parser.hpp" +#include "environment.hpp" #include #include #include @@ -343,8 +344,8 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler // const bool use_meshlets = indirect_rendering && manager.get_mesh_encoding() != ResourceManager::MeshEncoding::VBOAndIBOMDI; bool use_preculling = !use_meshlets && indirect_rendering; - if (const char *env = getenv("PRECULL")) - use_preculling = indirect_rendering && strtoul(env, nullptr, 0) != 0; + if (indirect_rendering) + use_preculling = Util::get_environment_bool("PRECULL", use_preculling); struct { @@ -356,8 +357,7 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler // push.camera_pos = render_context.get_render_parameters().camera_position; uint32_t target_meshlet_workgroup_size = 32; - if (const char *env = getenv("MESHLET_SIZE")) - target_meshlet_workgroup_size = strtoul(env, nullptr, 0); + target_meshlet_workgroup_size = Util::get_environment_uint("MESHLET_SIZE", target_meshlet_workgroup_size); target_meshlet_workgroup_size = max(32u, min(256u, target_meshlet_workgroup_size)); target_meshlet_workgroup_size = 1u << Util::floor_log2(target_meshlet_workgroup_size); @@ -509,10 +509,8 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler // const char *mesh_path = use_encoded ? "assets://shaders/meshlet_debug.mesh" : "assets://shaders/meshlet_debug_plain.mesh"; - if (const char *env = getenv("WAVE32")) - supports_wave32 = strtoul(env, nullptr, 0) != 0; - if (const char *hier = getenv("HIER_TASK")) - use_hierarchical = strtoul(hier, nullptr, 0) != 0; + supports_wave32 = Util::get_environment_bool("WAVE32", supports_wave32); + use_hierarchical = Util::get_environment_bool("HIER_TASK", use_hierarchical); bool supports_wg32 = supports_wave32 && target_meshlet_workgroup_size == 32; @@ -800,11 +798,11 @@ Application *application_create(int argc, char **argv) const char *path = nullptr; Util::CLICallbacks cbs; - cbs.add("--size", [](Util::CLIParser &parser) { setenv("MESHLET_SIZE", parser.next_string(), 1); }); - cbs.add("--encoding", [](Util::CLIParser &parser) { setenv("GRANITE_MESH_ENCODING", parser.next_string(), 1); }); - cbs.add("--hier-task", [](Util::CLIParser &parser) { setenv("HIER_TASK", parser.next_string(), 1); }); - cbs.add("--wave32", [](Util::CLIParser &parser) { setenv("WAVE32", parser.next_string(), 1); }); - cbs.add("--precull", [](Util::CLIParser &parser) { setenv("PRECULL", parser.next_string(), 1); }); + cbs.add("--size", [](Util::CLIParser &parser) { Util::set_environment("MESHLET_SIZE", parser.next_string()); }); + cbs.add("--encoding", [](Util::CLIParser &parser) { Util::set_environment("GRANITE_MESH_ENCODING", parser.next_string()); }); + cbs.add("--hier-task", [](Util::CLIParser &parser) { Util::set_environment("HIER_TASK", parser.next_string()); }); + cbs.add("--wave32", [](Util::CLIParser &parser) { Util::set_environment("WAVE32", parser.next_string()); }); + cbs.add("--precull", [](Util::CLIParser &parser) { Util::set_environment("PRECULL", parser.next_string()); }); cbs.default_handler = [&](const char *arg) { path = arg; }; Util::CLIParser parser(std::move(cbs), argc - 1, argv + 1); diff --git a/tests/sampler_precision.cpp b/tests/sampler_precision.cpp index b8f9bf7e..eba73090 100644 --- a/tests/sampler_precision.cpp +++ b/tests/sampler_precision.cpp @@ -26,6 +26,7 @@ #include "device.hpp" #include "thread_group.hpp" #include "context.hpp" +#include "environment.hpp" using namespace Granite; using namespace Vulkan; @@ -90,10 +91,7 @@ int main() Global::init(); #ifdef ASSET_DIRECTORY - const char *asset_dir = getenv("ASSET_DIRECTORY"); - if (!asset_dir) - asset_dir = ASSET_DIRECTORY; - + auto asset_dir = Util::get_environment_string("ASSET_DIRECTORY", ASSET_DIRECTORY); GRANITE_FILESYSTEM()->register_protocol("assets", std::unique_ptr(new OSFilesystem(asset_dir))); #endif int ret = main_inner(); diff --git a/threading/thread_group.cpp b/threading/thread_group.cpp index f9781f80..51bdaca5 100644 --- a/threading/thread_group.cpp +++ b/threading/thread_group.cpp @@ -30,6 +30,7 @@ #include "string_helpers.hpp" #include "timeline_trace_file.hpp" #include "thread_name.hpp" +#include "environment.hpp" namespace Granite { @@ -169,10 +170,11 @@ void ThreadGroup::start(unsigned num_threads_foreground, bg.thread_group.resize(num_threads_background); #ifndef GRANITE_SHIPPING - if (const char *env = getenv("GRANITE_TIMELINE_TRACE")) + std::string path; + if (Util::get_environment("GRANITE_TIMELINE_TRACE", path)) { - LOGI("Enabling JSON timeline tracing to %s.\n", env); - timeline_trace_file = std::make_unique(env); + LOGI("Enabling JSON timeline tracing to %s.\n", path.c_str()); + timeline_trace_file = std::make_unique(path); } #endif diff --git a/util/CMakeLists.txt b/util/CMakeLists.txt index 17c04488..57c914e6 100644 --- a/util/CMakeLists.txt +++ b/util/CMakeLists.txt @@ -32,6 +32,7 @@ add_granite_internal_lib(granite-util small_callable.hpp radix_sorter.hpp dynamic_array.hpp arena_allocator.hpp arena_allocator.cpp + environment.hpp environment.cpp no_init_pod.hpp) target_include_directories(granite-util PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) target_link_libraries(granite-util PUBLIC granite-application-global-interface) diff --git a/util/environment.cpp b/util/environment.cpp new file mode 100644 index 00000000..d2fb2bae --- /dev/null +++ b/util/environment.cpp @@ -0,0 +1,96 @@ +/* Copyright (c) 2017-2023 Hans-Kristian Arntzen + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifdef _WIN32 +#define WIN32_LEAN_AND_MEAN +#include +#endif + +#include "environment.hpp" +#include +#include + +namespace Util +{ +bool get_environment(const char *env, std::string &str) +{ +#ifdef _WIN32 + char buf[4096]; + DWORD count = GetEnvironmentVariableA(env, buf, sizeof(buf)); + if (count) + { + str = { buf, buf + count }; + return true; + } + else + return false; +#else + if (const char *v = getenv(env)) + { + str = v; + return true; + } + else + return false; +#endif +} + +void set_environment(const char *env, const char *value) +{ +#ifdef _WIN32 + SetEnvironmentVariableA(env, value); +#else + setenv(env, value, 1); +#endif +} + +std::string get_environment_string(const char *env, const char *default_value) +{ + std::string v; + if (!get_environment(env, v)) + v = default_value; + return v; +} + +unsigned get_environment_uint(const char *env, unsigned default_value) +{ + unsigned value = default_value; + std::string v; + if (get_environment(env, v)) + value = unsigned(std::stoul(v)); + return value; +} + +int get_environment_int(const char *env, int default_value) +{ + int value = default_value; + std::string v; + if (get_environment(env, v)) + value = int(std::stol(v)); + return value; +} + +bool get_environment_bool(const char *env, bool default_value) +{ + return get_environment_int(env, int(default_value)) != 0; +} +} diff --git a/util/environment.hpp b/util/environment.hpp new file mode 100644 index 00000000..de09b34d --- /dev/null +++ b/util/environment.hpp @@ -0,0 +1,35 @@ +/* Copyright (c) 2017-2023 Hans-Kristian Arntzen + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#pragma once + +#include + +namespace Util +{ +bool get_environment(const char *env, std::string &str); +std::string get_environment_string(const char *env, const char *default_value); +unsigned get_environment_uint(const char *env, unsigned default_value); +int get_environment_int(const char *env, int default_value); +bool get_environment_bool(const char *env, bool default_value); +void set_environment(const char *env, const char *value); +} diff --git a/vulkan/context.cpp b/vulkan/context.cpp index 7e6819e4..30cb49e1 100644 --- a/vulkan/context.cpp +++ b/vulkan/context.cpp @@ -24,6 +24,7 @@ #include "context.hpp" #include "limits.hpp" #include "small_vector.hpp" +#include "environment.hpp" #include #include #include @@ -217,9 +218,9 @@ bool Context::init_loader(PFN_vkGetInstanceProcAddr addr) static void *module; if (!module) { - const char *vulkan_path = getenv("GRANITE_VULKAN_LIBRARY"); - if (vulkan_path) - module = dlopen(vulkan_path, RTLD_LOCAL | RTLD_LAZY); + auto vulkan_path = Util::get_environment_string("GRANITE_VULKAN_LIBRARY", ""); + if (!vulkan_path.empty()) + module = dlopen(vulkan_path.c_str(), RTLD_LOCAL | RTLD_LAZY); #ifdef __APPLE__ if (!module) module = dlopen("libvulkan.1.dylib", RTLD_LOCAL | RTLD_LAZY); @@ -428,17 +429,12 @@ bool Context::init_profile() #ifdef GRANITE_VULKAN_PROFILES if (required_profile.empty()) { - if (const char *env = getenv("GRANITE_VULKAN_PROFILE")) - { - required_profile = env; - LOGI("Overriding profile: %s\n", env); - } + if (Util::get_environment("GRANITE_VULKAN_PROFILE", required_profile)) + LOGI("Overriding profile: %s\n", required_profile.c_str()); - if (const char *strict_env = getenv("GRANITE_VULKAN_PROFILE_STRICT")) - { - required_profile_strict = strtoul(strict_env, nullptr, 0) != 0; - LOGI("Overriding profile strictness: %u\n", required_profile_strict); - } + required_profile_strict = Util::get_environment_bool("GRANITE_VULKAN_PROFILE_STRICT", false); + if (required_profile_strict) + LOGI("Using profile strictness.\n"); } if (required_profile.empty()) @@ -639,8 +635,7 @@ bool Context::create_instance(const char * const *instance_ext, uint32_t instanc VkValidationFeaturesEXT validation_features = { VK_STRUCTURE_TYPE_VALIDATION_FEATURES_EXT }; - if (getenv("GRANITE_VULKAN_NO_VALIDATION")) - force_no_validation = true; + force_no_validation = Util::get_environment_bool("GRANITE_VULKAN_NO_VALIDATION", false); if (!force_no_validation && has_layer("VK_LAYER_KHRONOS_validation")) { @@ -858,13 +853,9 @@ bool Context::create_device(VkPhysicalDevice gpu_, VkSurfaceKHR surface, VK_VERSION_PATCH(props.driverVersion)); } - const char *gpu_index = getenv("GRANITE_VULKAN_DEVICE_INDEX"); - if (gpu_index) - { - unsigned index = strtoul(gpu_index, nullptr, 0); - if (index < gpu_count) - gpu = gpus[index]; - } + int gpu_index = Util::get_environment_int("GRANITE_VULKAN_DEVICE_INDEX", -1); + if (gpu_index >= 0 && gpu_index < int(gpu_count)) + gpu = gpus[gpu_index]; if (gpu != VK_NULL_HANDLE) { diff --git a/vulkan/managers/resource_manager.cpp b/vulkan/managers/resource_manager.cpp index 4dc5e7b2..c41ca35c 100644 --- a/vulkan/managers/resource_manager.cpp +++ b/vulkan/managers/resource_manager.cpp @@ -30,6 +30,7 @@ #include "thread_group.hpp" #include "meshlet.hpp" #include "aabb.hpp" +#include "environment.hpp" #include namespace Vulkan @@ -170,18 +171,19 @@ void ResourceManager::init() LOGI("Opting in to meshlet path.\n"); } - if (const char *env = getenv("GRANITE_MESH_ENCODING")) + std::string encoding; + if (Util::get_environment("GRANITE_MESH_ENCODING", encoding)) { - if (strcmp(env, "encoded") == 0) + if (encoding == "encoded") mesh_encoding = MeshEncoding::MeshletEncoded; - else if (strcmp(env, "decoded") == 0) + else if (encoding == "decoded") mesh_encoding = MeshEncoding::MeshletDecoded; - else if (strcmp(env, "mdi") == 0) + else if (encoding == "mdi") mesh_encoding = MeshEncoding::VBOAndIBOMDI; - else if (strcmp(env, "classic") == 0) + else if (encoding == "classic") mesh_encoding = MeshEncoding::Classic; else - LOGE("Unknown encoding: %s\n", env); + LOGE("Unknown encoding: %s\n", encoding.c_str()); } if (mesh_encoding != MeshEncoding::MeshletEncoded) diff --git a/vulkan/wsi.cpp b/vulkan/wsi.cpp index cef1e0e5..21097365 100644 --- a/vulkan/wsi.cpp +++ b/vulkan/wsi.cpp @@ -22,7 +22,7 @@ #define NOMINMAX #include "wsi.hpp" -#include "quirks.hpp" +#include "environment.hpp" #if defined(ANDROID) && defined(HAVE_SWAPPY) #include "swappy/swappyVk.h" @@ -46,12 +46,8 @@ WSI::WSI() present_frame_latency = 1; #endif - const char *env = getenv("GRANITE_VULKAN_PRESENT_WAIT_LATENCY"); - if (env) - { - present_frame_latency = uint32_t(strtoul(env, nullptr, 0)); - LOGI("Overriding VK_KHR_present_wait latency to %u frames.\n", present_frame_latency); - } + present_frame_latency = Util::get_environment_uint("GRANITE_VULKAN_PRESENT_WAIT_LATENCY", present_frame_latency); + LOGI("Targeting VK_KHR_present_wait latency to %u frames.\n", present_frame_latency); // Primaries are ST.2020 with D65 whitepoint as specified. hdr_metadata.displayPrimaryRed = { 0.708f, 0.292f }; @@ -1015,8 +1011,7 @@ static bool init_surface_info(Device &device, WSIPlatform &platform, else LOGI("Win32: Not running full-screen.\n"); - const char *exclusive = getenv("GRANITE_EXCLUSIVE_FULL_SCREEN"); - bool prefer_exclusive = (exclusive && strtoul(exclusive, nullptr, 0) != 0) || low_latency_mode_enable; + bool prefer_exclusive = Util::get_environment_bool("GRANITE_EXCLUSIVE_FULL_SCREEN", false) || low_latency_mode_enable; if (ext.driver_id == VK_DRIVER_ID_AMD_PROPRIETARY && format == BackbufferFormat::HDR10) { @@ -1390,12 +1385,8 @@ WSI::SwapchainError WSI::init_swapchain(unsigned width, unsigned height) uint32_t desired_swapchain_images = low_latency_mode_enable && current_present_mode == PresentMode::SyncToVBlank ? 2 : 3; - { - const char *num_images = getenv("GRANITE_VULKAN_SWAPCHAIN_IMAGES"); - if (num_images) - desired_swapchain_images = uint32_t(strtoul(num_images, nullptr, 0)); - } + desired_swapchain_images = Util::get_environment_uint("GRANITE_VULKAN_SWAPCHAIN_IMAGES", desired_swapchain_images); LOGI("Targeting %u swapchain images.\n", desired_swapchain_images); if (desired_swapchain_images < caps.minImageCount) From 36841a9759ddd6cc4eaee5b11d290570061f4a25 Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Mon, 25 Dec 2023 14:36:51 +0100 Subject: [PATCH 50/59] Begin hooking up UV/N/T encode/decode. --- scene-export/meshlet_export.cpp | 139 ++++++++++++-- tests/meshopt_sandbox.cpp | 317 +++++++++++++++++++++++++------- 2 files changed, 373 insertions(+), 83 deletions(-) diff --git a/scene-export/meshlet_export.cpp b/scene-export/meshlet_export.cpp index c2c88206..8e307a0b 100644 --- a/scene-export/meshlet_export.cpp +++ b/scene-export/meshlet_export.cpp @@ -171,7 +171,7 @@ struct NormalTangent static std::vector mesh_extract_normal_tangent_oct8(const SceneFormats::Mesh &mesh) { std::vector encoded_attributes; - std::vector normals; + std::vector normals; std::vector tangents; auto &normal = mesh.attribute_layout[Util::ecast(MeshAttribute::Normal)]; @@ -191,7 +191,10 @@ static std::vector mesh_extract_normal_tangent_oct8(const SceneFo } } else if (normal.format == VK_FORMAT_UNDEFINED) - return {}; + { + for (auto &n : normals) + n = {}; + } else { LOGE("Unexpected format %u.\n", normal.format); @@ -212,27 +215,30 @@ static std::vector mesh_extract_normal_tangent_oct8(const SceneFo { for (size_t i = 0; i < num_attrs; i++) { - memcpy(normals[i].data, + memcpy(tangents[i].data, mesh.attributes.data() + i * mesh.attribute_stride + tangent.offset, sizeof(float) * 4); } } else if (tangent.format == VK_FORMAT_UNDEFINED) - return {}; + { + for (auto &t : tangents) + t = {}; + } else { LOGE("Unexpected format %u.\n", tangent.format); return {}; } - encoded_attributes.resize(normals.size()); + encoded_attributes.reserve(normals.size()); - std::vector n(encoded_attributes.size()); - std::vector t(encoded_attributes.size()); + std::vector n(normals.size()); + std::vector t(normals.size()); meshopt_encodeFilterOct(n.data(), n.size(), sizeof(i8vec4), 8, normals[0].data); meshopt_encodeFilterOct(t.data(), t.size(), sizeof(i8vec4), 8, tangents[0].data); - for (size_t i = 0, size = encoded_attributes.size(); i < size; i++) + for (size_t i = 0, size = normals.size(); i < size; i++) encoded_attributes.push_back({ n[i].xy(), t[i].xy(), tangents[i].w < 0.0f }); return encoded_attributes; @@ -409,14 +415,61 @@ static void encode_bitplane_16_inner(std::vector &out_payload_buffe } } -static void encode_bitplane_16(std::vector &out_payload_buffer, - const u16vec3 *values, unsigned encoded_bits) +static void encode_bitplane(std::vector &out_payload_buffer, + const u8vec4 *values, unsigned encoded_bits) +{ + if (encoded_bits == 8) + { + // Plain write. + PayloadB128 p[8]; + memcpy(p, values, sizeof(p)); + out_payload_buffer.insert(out_payload_buffer.end(), p, p + 8); + } + else + { + unsigned bit_offset = 0; + PayloadB128 p[4]; + + for (int mask = 4; mask; mask >>= 1) + { + if (encoded_bits & mask) + { + uint32_t *words = &p[0].words[0]; + int bits = mask; + int num_words = bits; + + for (int i = 0; i < num_words; i++) + p[i] = {}; + + for (uint32_t i = 0; i < ElementsPerChunk; i++) + { + auto d = values[i]; + for (int c = 0; c < 4; c++) + { + for (int b = 0; b < bits; b++) + { + int word = c * bits + b; + words[word] |= ((d[c] >> (bit_offset + b)) & 1u) << i; + } + } + } + + for (int i = 0; i < num_words; i++) + out_payload_buffer.push_back(p[i]); + bit_offset += bits; + } + } + } +} + +static void encode_bitplane(std::vector &out_payload_buffer, + const u16vec3 *values, unsigned encoded_bits) { encode_bitplane_16_inner<3>(out_payload_buffer, values, encoded_bits); } -static void encode_bitplane_16(std::vector &out_payload_buffer, - const u16vec2 *values, unsigned encoded_bits) +static void encode_bitplane(std::vector &out_payload_buffer, + const u16vec2 *values, unsigned encoded_bits) { encode_bitplane_16_inner<2>(out_payload_buffer, values, encoded_bits); } @@ -428,6 +481,8 @@ template <> struct to_signed_vector { using type = i16vec3; }; template <> struct to_signed_vector { using type = i16vec2; }; template <> struct to_components { enum { components = 3 }; }; template <> struct to_components { enum { components = 2 }; }; +template <> struct to_signed_vector { using type = i8vec4; }; +template <> struct to_components { enum { components = 4 }; }; template static auto max_component(T value) -> std::remove_reference_t @@ -441,7 +496,7 @@ static auto max_component(T value) -> std::remove_reference_t static void encode_attribute_stream(std::vector &out_payload_buffer, Stream &stream, - const T *raw_positions, + const T *raw_attributes, uint32_t chunk_index, const uint32_t *vbo_remap, uint32_t num_attributes) { @@ -450,18 +505,18 @@ static void encode_attribute_stream(std::vector &out_payload_buffer using SignedScalar = std::remove_reference_t; static_assert(sizeof(T) == 4 || sizeof(T) == 6, "Encoded type must be 32 or 48 bits."); - T positions[ElementsPerChunk]; + T attributes[ElementsPerChunk]; for (uint32_t i = 0; i < num_attributes; i++) - positions[i] = raw_positions[vbo_remap[i]]; + attributes[i] = raw_attributes[vbo_remap ? vbo_remap[i] : i]; for (uint32_t i = num_attributes; i < ElementsPerChunk; i++) - positions[i] = positions[0]; + attributes[i] = attributes[0]; T ulo{std::numeric_limits::max()}; T uhi{std::numeric_limits::min()}; SignedT slo{std::numeric_limits::max()}; SignedT shi{std::numeric_limits::min()}; - for (auto &p : positions) + for (auto &p : attributes) { ulo = min(ulo, p); uhi = max(uhi, p); @@ -490,13 +545,13 @@ static void encode_attribute_stream(std::vector &out_payload_buffer if (to_components::components == 3 && bits_per_component == 16) { memcpy(reinterpret_cast(&stream.u.base_value[8]) + sizeof(uint16_t) * chunk_index, - &ulo.z, sizeof(uint16_t)); + &ulo[2], sizeof(uint16_t)); } - for (auto &p : positions) + for (auto &p : attributes) p -= ulo; - encode_bitplane_16(out_payload_buffer, positions, encoded_bits); + encode_bitplane(out_payload_buffer, attributes, encoded_bits); } static void encode_mesh(Encoded &encoded, @@ -583,6 +638,50 @@ static void encode_mesh(Encoded &encoded, chunk_index, meshlet.attribute_remap, meshlet.vertex_count); break; + case StreamType::UV: + encode_attribute_stream(encoded.payload, stream, + static_cast(pp_data[stream_index]), + chunk_index, meshlet.attribute_remap, meshlet.vertex_count); + break; + + case StreamType::NormalTangentOct8: + { + u8vec4 nts[ElementsPerChunk]{}; + uint32_t sign_mask = 0; + auto *nt = static_cast(pp_data[stream_index]); + for (unsigned i = 0; i < meshlet.vertex_count; i++) + { + const auto &mapped_nt = nt[meshlet.attribute_remap[i]]; + sign_mask |= uint32_t(mapped_nt.t_sign) << i; + nts[i] = u8vec4(u8vec2(mapped_nt.n), u8vec2(mapped_nt.t)); + } + + if (meshlet.vertex_count < ElementsPerChunk && sign_mask == (1u << meshlet.vertex_count) - 1) + sign_mask = UINT32_MAX; + + if (sign_mask == 0) + { + stream.aux |= 1 << (2 * chunk_index); + } + else if (sign_mask == UINT32_MAX) + { + stream.aux |= 2 << (2 * chunk_index); + } + else + { + stream.aux |= 3 << (2 * chunk_index); + for (unsigned i = 0; i < meshlet.vertex_count; i++) + { + nts[i].w &= ~1; + nts[i].w |= (sign_mask >> i) & 1u; + } + } + + encode_attribute_stream(encoded.payload, stream, nts, + chunk_index, nullptr, meshlet.vertex_count); + break; + } + default: break; } diff --git a/tests/meshopt_sandbox.cpp b/tests/meshopt_sandbox.cpp index 4e5bb53d..44857938 100644 --- a/tests/meshopt_sandbox.cpp +++ b/tests/meshopt_sandbox.cpp @@ -96,6 +96,38 @@ static void decode_bitfield_block_16(T *block, const PayloadB128 *&pdata, unsign } } +template +static void decode_bitfield_block_8(T *block, const PayloadB128 *&pdata, unsigned config) +{ + unsigned bit_offset = 0; + + for (int mask = 4; mask; mask >>= 1) + { + if (config & mask) + { + const uint32_t *words = &pdata->words[0]; + int bits = mask; + + for (uint32_t i = 0; i < ElementsPerChunk; i++) + { + T &d = block[i]; + for (int c = 0; c < Components; c++) + { + for (int b = 0; b < bits; b++) + { + int word = c * bits + b; + d[c] |= ((words[word] >> i) & 1u) << (bit_offset + b); + } + } + } + + int num_words = (bits * Components + 3) / 4; + pdata += num_words; + bit_offset += bits; + } + } +} + static void decode_attribute_buffer(std::vector &out_positions, const MeshView &mesh, uint32_t meshlet_index, StreamType type) { auto &meshlet = mesh.headers[meshlet_index]; @@ -148,14 +180,145 @@ static void decode_attribute_buffer(std::vector &out_positions, const Mesh } } +static void decode_attribute_buffer(std::vector &out_uvs, const MeshView &mesh, uint32_t meshlet_index, StreamType type) +{ + auto &meshlet = mesh.headers[meshlet_index]; + auto &index_stream = mesh.streams[meshlet_index * mesh.format_header->stream_count + int(StreamType::Primitive)]; + auto &stream = mesh.streams[meshlet_index * mesh.format_header->stream_count + int(type)]; + const auto *pdata = mesh.payload + stream.offset_in_b128; + + for (uint32_t chunk = 0; chunk < meshlet.num_chunks; chunk++) + { + u16vec2 uvs[ElementsPerChunk]{}; + unsigned config = (stream.bit_plane_config >> (4 * chunk)) & 0xf; + + if (config == 8) + { + for (uint32_t i = 0; i < ElementsPerChunk; i++) + memcpy(uvs[i].data, &pdata[i / 4].words[i % 4], 2 * sizeof(uint16_t)); + + pdata += 8; + } + else + { + decode_bitfield_block_16<2>(uvs, pdata, config); + } + + u16vec2 base; + memcpy(base.data, &stream.u.base_value[chunk], sizeof(uint16_t) * 2); + + for (auto &p : uvs) + p += base; + + uint32_t num_attributes_for_chunk = index_stream.u.offsets[chunk + 1].attr_offset - + index_stream.u.offsets[chunk].attr_offset; + + for (uint32_t i = 0; i < num_attributes_for_chunk; i++) + { + vec2 float_pos = vec2(i16vec2(uvs[i])); + float_pos.x = ldexpf(float_pos.x, stream.aux); + float_pos.y = ldexpf(float_pos.y, stream.aux); + out_uvs.push_back(0.5f * float_pos + 0.5f); + } + } +} + +static vec3 decode_oct8(i8vec2 payload) +{ + vec2 f = vec2(payload) * (1.0f / 127.0f); + vec3 n = vec3(f.x, f.y, 1.0f - abs(f.x) - abs(f.y)); + float t = max(-n.z, 0.0f); + + if (n.x > 0.0f) + n.x -= t; + else + n.x += t; + + if (n.y > 0.0f) + n.y -= t; + else + n.y += t; + + return normalize(n); +} + +static void decode_attribute_buffer(std::vector &out_normals, std::vector &out_tangents, + const MeshView &mesh, uint32_t meshlet_index, StreamType type) +{ + auto &meshlet = mesh.headers[meshlet_index]; + auto &index_stream = mesh.streams[meshlet_index * mesh.format_header->stream_count + int(StreamType::Primitive)]; + auto &stream = mesh.streams[meshlet_index * mesh.format_header->stream_count + int(type)]; + const auto *pdata = mesh.payload + stream.offset_in_b128; + + for (uint32_t chunk = 0; chunk < meshlet.num_chunks; chunk++) + { + u8vec4 nts[ElementsPerChunk]{}; + uint32_t t_signs = 0; + + unsigned config = (stream.bit_plane_config >> (4 * chunk)) & 0xf; + + if (config == 8) + { + memcpy(nts, pdata, sizeof(nts)); + pdata += 8; + } + else + { + decode_bitfield_block_8<4>(nts, pdata, config); + } + + int aux = (stream.aux >> (2 * chunk)) & 3; + + if (aux == 1) + { + t_signs = 0; + } + else if (aux == 2) + { + t_signs = UINT32_MAX; + } + + u8vec4 base; + memcpy(base.data, &stream.u.base_value[chunk], sizeof(base.data)); + + for (auto &p : nts) + p += base; + + if (aux == 3) + { + for (unsigned i = 0; i < ElementsPerChunk; i++) + { + t_signs |= (nts[i].w & 1u) << i; + nts[i].w &= ~1; + } + } + + uint32_t num_attributes_for_chunk = index_stream.u.offsets[chunk + 1].attr_offset - + index_stream.u.offsets[chunk].attr_offset; + + for (uint32_t i = 0; i < num_attributes_for_chunk; i++) + { + vec3 n = decode_oct8(i8vec2(nts[i].xy())); + vec3 t = decode_oct8(i8vec2(nts[i].zw())); + out_normals.push_back(n); + out_tangents.emplace_back(t, (t_signs & (1u << i)) != 0 ? -1.0f : 1.0f); + } + } +} + static void decode_mesh(std::vector &out_index_buffer, std::vector &out_positions, + std::vector &out_uvs, + std::vector &out_normals, + std::vector &out_tangents, const MeshView &mesh) { for (uint32_t meshlet_index = 0; meshlet_index < mesh.format_header->meshlet_count; meshlet_index++) { decode_mesh_index_buffer(out_index_buffer, mesh, meshlet_index); decode_attribute_buffer(out_positions, mesh, meshlet_index, StreamType::Position); + decode_attribute_buffer(out_uvs, mesh, meshlet_index, StreamType::UV); + decode_attribute_buffer(out_normals, out_tangents, mesh, meshlet_index, StreamType::NormalTangentOct8); } } @@ -195,6 +358,8 @@ static void decode_mesh_gpu( info.payload = payload_buffer.get(); info.flags = DECODE_MODE_UNROLLED_MESH; + info.target_style = MeshStyle::Wireframe; + decode_mesh(*cmd, info, mesh); cmd->barrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT, VK_PIPELINE_STAGE_HOST_BIT, VK_ACCESS_HOST_READ_BIT); @@ -213,7 +378,14 @@ static void decode_mesh_gpu( out_pos_buffer.size() * sizeof(vec3)); } -static void build_reference_mesh(std::vector &indices, std::vector &positions) +struct Attr +{ + vec2 uv; + vec3 n; + vec4 t; +}; + +static void build_reference_mesh(std::vector &indices, std::vector &positions, std::vector &attr) { for (unsigned i = 0; i < 256; i++) { @@ -229,6 +401,13 @@ static void build_reference_mesh(std::vector &indices, std::vector vec3 p = vec3(-40.0f + float(i)); #endif positions.push_back(p); + + Attr a = {}; + a.uv.x = 1.0f * float(i); + a.uv.y = a.uv.x * 1.5f; + a.n = normalize(vec3(1.0f)); + a.t = vec4(normalize(vec3(1.0f, -1.0f, 1.0f)), -1.0f); + attr.push_back(a); } for (unsigned i = 0; i < 254; i++) @@ -284,11 +463,51 @@ static bool validate_mesh(std::vector &reference_indices, return true; } -int main(int argc, char *argv[]) +template +static auto max_component(T value) -> std::remove_reference_t { - if (argc != 2) - return EXIT_FAILURE; + std::remove_reference_t val = 0; + for (auto v : value.data) + val = std::max(val, v); + return val; +} + +template +static bool validate_mesh_attribute(const std::vector &reference_indices, + const std::vector &reference_attr, + const std::vector &decoded_indices, + const std::vector &decoded_attr, ScalarT tolerance) +{ + if (reference_indices.size() != decoded_indices.size()) + { + LOGE("Mismatch in index buffer size.\n"); + return false; + } + for (size_t i = 0, n = decoded_indices.size(); i < n; i++) + { + uvec3 ref_i = reference_indices[i]; + uvec3 decode_i = decoded_indices[i]; + + for (int c = 0; c < 3; c++) + { + auto ref_attr = reference_attr[ref_i[c]]; + auto decode_attr = decoded_attr[decode_i[c]]; + auto d = abs(ref_attr - decode_attr); + auto max_d = max_component(d); + if (max_d > tolerance) + { + LOGE("Mismatch in primitive %zu, c = %d.\n", i, c); + return false; + } + } + } + + return true; +} + +int main() +{ Global::init(Global::MANAGER_FEATURE_FILESYSTEM_BIT); Filesystem::setup_default_filesystem(GRANITE_FILESYSTEM(), ASSET_DIRECTORY); @@ -296,7 +515,8 @@ int main(int argc, char *argv[]) std::vector reference_indices; std::vector reference_positions; - build_reference_mesh(reference_indices, reference_positions); + std::vector reference_attributes; + build_reference_mesh(reference_indices, reference_positions, reference_attributes); mesh.index_type = VK_INDEX_TYPE_UINT32; mesh.count = 3 * reference_indices.size(); @@ -309,7 +529,17 @@ int main(int argc, char *argv[]) mesh.positions.resize(reference_positions.size() * sizeof(vec3)); memcpy(mesh.positions.data(), reference_positions.data(), reference_positions.size() * sizeof(vec3)); - if (!Meshlet::export_mesh_to_meshlet("/tmp/export.msh2", std::move(mesh), MeshStyle::Wireframe)) + mesh.attribute_layout[int(MeshAttribute::UV)].format = VK_FORMAT_R32G32_SFLOAT; + mesh.attribute_layout[int(MeshAttribute::UV)].offset = offsetof(Attr, uv); + mesh.attribute_layout[int(MeshAttribute::Normal)].format = VK_FORMAT_R32G32B32_SFLOAT; + mesh.attribute_layout[int(MeshAttribute::Normal)].offset = offsetof(Attr, n); + mesh.attribute_layout[int(MeshAttribute::Tangent)].format = VK_FORMAT_R32G32B32A32_SFLOAT; + mesh.attribute_layout[int(MeshAttribute::Tangent)].offset = offsetof(Attr, t); + mesh.attribute_stride = sizeof(Attr); + mesh.attributes.resize(mesh.attribute_stride * reference_attributes.size()); + memcpy(mesh.attributes.data(), reference_attributes.data(), mesh.attributes.size()); + + if (!Meshlet::export_mesh_to_meshlet("/tmp/export.msh2", std::move(mesh), MeshStyle::Textured)) return EXIT_FAILURE; auto file = GRANITE_FILESYSTEM()->open("/tmp/export.msh2", FileMode::ReadOnly); @@ -322,8 +552,11 @@ int main(int argc, char *argv[]) std::vector decoded_index_buffer; std::vector decoded_positions; + std::vector decoded_uvs; + std::vector decoded_normals; + std::vector decoded_tangents; auto view = create_mesh_view(*mapped); - decode_mesh(decoded_index_buffer, decoded_positions, view); + decode_mesh(decoded_index_buffer, decoded_positions, decoded_uvs, decoded_normals, decoded_tangents, view); Vulkan::Context ctx; Vulkan::Device dev; @@ -349,67 +582,25 @@ int main(int argc, char *argv[]) decoded_index_buffer, decoded_positions, true)) return EXIT_FAILURE; - return 0; -#if 0 - GLTF::Parser parser(argv[1]); - - - dev.init_frame_contexts(4); - - auto mesh = parser.get_meshes().front(); - - if (!Meshlet::export_mesh_to_meshlet("export.msh1", - mesh, MeshStyle::Textured)) + std::vector reference_uvs; + std::vector reference_normals; + std::vector reference_tangents; + reference_uvs.reserve(reference_attributes.size()); + reference_normals.reserve(reference_attributes.size()); + reference_tangents.reserve(reference_attributes.size()); + for (auto &a : reference_attributes) { - return EXIT_FAILURE; + reference_uvs.push_back(a.uv); + reference_normals.push_back(a.n); + reference_tangents.push_back(a.t); } - auto file = GRANITE_FILESYSTEM()->open("export.msh1", FileMode::ReadOnly); - if (!file) + if (!validate_mesh_attribute(reference_indices, reference_uvs, decoded_index_buffer, decoded_uvs, 0.0f)) return EXIT_FAILURE; - - auto mapped = file->map(); - if (!mapped) + if (!validate_mesh_attribute(reference_indices, reference_normals, decoded_index_buffer, decoded_normals, 0.02f)) return EXIT_FAILURE; - - auto view = create_mesh_view(*mapped); - - std::vector reference_index_buffer; - std::vector reference_attributes; - std::vector gpu_index_buffer; - std::vector gpu_attributes; - - decode_mesh(reference_index_buffer, reference_attributes, view); - decode_mesh_gpu(dev, gpu_index_buffer, gpu_attributes, view); - - if (!validate_mesh_decode(gpu_index_buffer, gpu_attributes, - reference_index_buffer, reference_attributes, - view.format_header->u32_stream_count - 1)) - { + if (!validate_mesh_attribute(reference_indices, reference_tangents, decoded_index_buffer, decoded_tangents, 0.02f)) return EXIT_FAILURE; - } - - { - LOGI("Total primitives: %u\n", view.total_primitives); - LOGI("Total vertices: %u\n", view.total_vertices); - LOGI("Payload size: %llu bytes.\n", static_cast(view.format_header->payload_size_words * sizeof(uint32_t))); - - unsigned long long uncompressed_mesh_size = - view.total_primitives * sizeof(uint32_t) * 3 + - view.total_vertices * (view.format_header->u32_stream_count - 1) * sizeof(uint32_t); - unsigned long long uncompressed_payload_size = - view.total_primitives * sizeof(uint32_t) + - view.total_vertices * (view.format_header->u32_stream_count - 1) * sizeof(uint32_t); - LOGI("Uncompressed mesh size: %llu bytes.\n", uncompressed_mesh_size); - LOGI("Uncompressed payload size: %llu bytes.\n", uncompressed_payload_size); - } - { - file = GRANITE_FILESYSTEM()->open("export.bin", FileMode::WriteOnly); - mapped = file->map_write((reference_index_buffer.size() + reference_attributes.size()) * sizeof(uint32_t)); - auto *ptr = mapped->mutable_data(); - memcpy(ptr, reference_index_buffer.data(), reference_index_buffer.size() * sizeof(uint32_t)); - memcpy(ptr + reference_index_buffer.size(), reference_attributes.data(), reference_attributes.size() * sizeof(uint32_t)); - } -#endif + return 0; } \ No newline at end of file From 81d9ab4e1bf9afaf9347b56f18caad023b269b38 Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Mon, 25 Dec 2023 14:46:54 +0100 Subject: [PATCH 51/59] Test more interesting N/T patterns. --- scene-export/meshlet_export.cpp | 6 ------ tests/meshopt_sandbox.cpp | 6 ++++-- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/scene-export/meshlet_export.cpp b/scene-export/meshlet_export.cpp index 8e307a0b..e83a645f 100644 --- a/scene-export/meshlet_export.cpp +++ b/scene-export/meshlet_export.cpp @@ -65,12 +65,6 @@ struct Meshlet const uint32_t *attribute_remap; }; -struct PrimitiveAnalysisResult -{ - uint32_t num_primitives; - uint32_t num_attributes; -}; - static i16vec3 encode_vec3_to_snorm_exp(vec3 v, int scale_log2) { v.x = ldexpf(v.x, scale_log2); diff --git a/tests/meshopt_sandbox.cpp b/tests/meshopt_sandbox.cpp index 44857938..002c5df6 100644 --- a/tests/meshopt_sandbox.cpp +++ b/tests/meshopt_sandbox.cpp @@ -405,8 +405,10 @@ static void build_reference_mesh(std::vector &indices, std::vector Attr a = {}; a.uv.x = 1.0f * float(i); a.uv.y = a.uv.x * 1.5f; - a.n = normalize(vec3(1.0f)); - a.t = vec4(normalize(vec3(1.0f, -1.0f, 1.0f)), -1.0f); + a.n = normalize(vec3(1.0f + float(i), 1.0f, -0.3f)); + a.t = vec4(a.n.y, -a.n.z, a.n.x, +1.0f); + if (i & 1) + a.t.w = -1.0f; attr.push_back(a); } From b4290074221111076b249dcb45b95974f94d3f19 Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Mon, 25 Dec 2023 21:50:55 +0100 Subject: [PATCH 52/59] Actually use path. --- tests/meshlet_viewer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/meshlet_viewer.cpp b/tests/meshlet_viewer.cpp index ea66409a..5401d15f 100644 --- a/tests/meshlet_viewer.cpp +++ b/tests/meshlet_viewer.cpp @@ -814,7 +814,7 @@ Application *application_create(int argc, char **argv) try { - auto *app = new MeshletViewerApplication(argv[1]); + auto *app = new MeshletViewerApplication(path); return app; } catch (const std::exception &e) From 666f1ad7cddeaf46bcbe491e44ee7acbc7a261a6 Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Mon, 25 Dec 2023 22:41:05 +0100 Subject: [PATCH 53/59] Starting sketching out n/t/uv decoder. --- assets/shaders/decode/meshlet_decode.comp | 30 ++- .../shaders/inc/meshlet_payload_constants.h | 7 + assets/shaders/inc/meshlet_payload_decode.h | 191 ++++++++++++++++++ tests/meshopt_sandbox.cpp | 47 ++++- 4 files changed, 269 insertions(+), 6 deletions(-) diff --git a/assets/shaders/decode/meshlet_decode.comp b/assets/shaders/decode/meshlet_decode.comp index a9dbdf4a..21f2e5cf 100644 --- a/assets/shaders/decode/meshlet_decode.comp +++ b/assets/shaders/decode/meshlet_decode.comp @@ -150,7 +150,8 @@ void main() // Index if (chunk_index < meta.num_chunks && lane_index < chunk_info.primitive_count) { - uint decoded_index_buffer = meshlet_decode_index_buffer(meshlet_index * NUM_U32_STREAMS, + uint decoded_index_buffer = meshlet_decode_index_buffer( + meshlet_index * NUM_U32_STREAMS + MESHLET_STREAM_TYPE_PRIMITIVE, chunk_index, lane_index); if (RUNTIME_MESH) @@ -179,7 +180,10 @@ void main() if (chunk_index < meta.num_chunks && lane_index < chunk_info.vertex_count) { int exponent; - i16vec3 pos = meshlet_decode_snorm_scaled_i16x3(meshlet_index * NUM_U32_STREAMS + 1, chunk_index, lane_index, exponent); + i16vec3 pos = meshlet_decode_snorm_scaled_i16x3( + meshlet_index * NUM_U32_STREAMS + MESHLET_STREAM_TYPE_POSITION, + chunk_index, lane_index, exponent); + vec3 fp_pos = ldexp(vec3(pos), ivec3(exponent)); uint vertex_output_offset; if (RUNTIME_MESH) @@ -187,7 +191,27 @@ void main() else vertex_output_offset = registers.vertex_offset + meta.base_vertex_offset + chunk_info.vertex_offset; output_stream_pos.data[vertex_output_offset + lane_index] = fp_pos; + + if (TARGET_MESH_STYLE >= MESH_STYLE_TEXTURED) + { + bool t_sign; + u8vec4 nt = meshlet_decode_normal_tangent_oct8( + meshlet_index * NUM_U32_STREAMS + MESHLET_STREAM_TYPE_NORMAL_TANGENT_OCT8, t_sign); + i16vec2 uv = meshlet_decode_snorm_scaled_i16x2(meshlet_index * NUM_U32_STREAMS + MESHLET_STREAM_TYPE_UV, exponent); + vec2 fp_uv = ldexp(vec2(uv), ivec2(exponent)); + output_stream_textured_attr.data[vertex_output_offset + lane_index] = + TexturedAttr(pack_a2bgr10(nt[0]), pack_a2bgr10(nt[1]), fp_uv); + } } else if (RUNTIME_MESH) - output_stream_pos.data[gl_WorkGroupSize.x * meshlet_index + 32u * chunk_index + registers.vertex_offset + lane_index] = vec3(intBitsToFloat(-1)); + { + output_stream_pos.data[gl_WorkGroupSize.x * meshlet_index + 32u * chunk_index + registers.vertex_offset + lane_index] = + vec3(intBitsToFloat(-1)); + + if (TARGET_MESH_STYLE >= MESH_STYLE_TEXTURED) + { + output_stream_textured_attr.data[gl_WorkGroupSize.x * meshlet_index + 32u * chunk_index + registers.vertex_offset + lane_index] = + TexturedAttr(0, 0, vec2(0.0)); + } + } } diff --git a/assets/shaders/inc/meshlet_payload_constants.h b/assets/shaders/inc/meshlet_payload_constants.h index 2a91ff53..ae1c8daa 100644 --- a/assets/shaders/inc/meshlet_payload_constants.h +++ b/assets/shaders/inc/meshlet_payload_constants.h @@ -5,4 +5,11 @@ #define MESHLET_PAYLOAD_NUM_CHUNKS 8 #define MESHLET_PAYLOAD_MAX_STREAMS 16 +const int MESHLET_STERAM_TYPE_PRIMITIVE = 0; +const int MESHLET_STERAM_TYPE_POSITION = 1; +const int MESHLET_STERAM_TYPE_NORMAL_TANGENT_OCT8 = 2; +const int MESHLET_STERAM_TYPE_UV = 3; +const int MESHLET_STERAM_TYPE_BONE_INDICES = 4; +const int MESHLET_STERAM_TYPE_BONE_WEIGHTS = 5; + #endif \ No newline at end of file diff --git a/assets/shaders/inc/meshlet_payload_decode.h b/assets/shaders/inc/meshlet_payload_decode.h index 662902f1..db5a04ca 100644 --- a/assets/shaders/inc/meshlet_payload_decode.h +++ b/assets/shaders/inc/meshlet_payload_decode.h @@ -263,4 +263,195 @@ i16vec3 meshlet_decode_snorm_scaled_i16x3(uint stream_index, uint chunk_index, i return i16vec3(value); } +i16vec2 meshlet_decode_snorm_scaled_i16x2(uint stream_index, uint chunk_index, int lane_index, out int exponent) +{ + uint offset_in_b128 = meshlet_streams.data[stream_index].offset_in_b128; + uint bit_plane_config = meshlet_streams.data[stream_index].bit_plane_config; + exponent = meshlet_streams.data[stream_index].aux; + + // Scalar math. + if (chunk_index != 0) + { + uint prev_bit_mask = bitfieldExtract(bit_plane_config, 0, int(chunk_index) * 4); + offset_in_b128 += bitCount(prev_bit_mask & 0x88888888) * 8; + offset_in_b128 += bitCount(prev_bit_mask & 0x44444444) * 4; + offset_in_b128 += bitCount(prev_bit_mask & 0x22222222) * 2; + offset_in_b128 += bitCount(prev_bit_mask & 0x11111111) * 1; + } + + // Scalar math. + uint encoded_bits = bitfieldExtract(bit_plane_config, int(chunk_index * 4), 4); + uint base_value_xy = meshlet_streams.data[stream_index].base_value_or_offsets[chunk_index]; + uint base_value_z = meshlet_streams.data[stream_index].base_value_or_offsets[8 + chunk_index / 2]; + uint base_value_x = bitfieldExtract(base_value_xy, 0, 16); + uint base_value_y = bitfieldExtract(base_value_xy, 16, 16); + uvec2 base_value = uvec2(base_value_x, base_value_y); + + uvec2 value = uvec2(0); + + if (encoded_bits == 8) + { + // Vector loads. + uint value_xy = payload_u32.data[offset_in_b128 * 4 + lane_index]; + + value.x = bitfieldExtract(value_xy, 0, 16); + value.y = bitfieldExtract(value_xy, 16, 16); + } + else if (encoded_bits != 0) + { + uvec4 p0, p1, p2, p3; + + // Scalar loads, vector math. + // Preload early. Also helps compiler prove it can use common descriptor (RADV thing). + p0 = payload.data[offset_in_b128]; + offset_in_b128 += 1; + + int bit_offset = 0; + if ((encoded_bits & 4) != 0) + { + p1 = payload.data[offset_in_b128 + 0]; + p2 = payload.data[offset_in_b128 + 1]; + p3 = payload.data[offset_in_b128 + 2]; + + UNROLL_BITS_8(value.x, 0, p0, p1); + UNROLL_BITS_8(value.y, 0, p2, p3); + + // Preload for next iteration. + p0 = payload.data[offset_in_b128 + 3]; + + offset_in_b128 += 4; + bit_offset += 8; + } + + if ((encoded_bits & 2) != 0) + { + p1 = payload.data[offset_in_b128 + 0]; + + UNROLL_BITS_4(value.x, bit_offset, p0); + UNROLL_BITS_4(value.y, bit_offset, p1); + + // Preload for next iteration. + p0 = payload.data[offset_in_b128 + 1]; + offset_in_b128 += 2; + bit_offset += 4; + } + + if ((encoded_bits & 1) != 0) + { + value.x |= bitfieldExtract(p0.x, lane_index, 1) << (bit_offset + 0); + value.x |= bitfieldExtract(p0.y, lane_index, 1) << (bit_offset + 1); + value.y |= bitfieldExtract(p0.z, lane_index, 1) << (bit_offset + 0); + value.y |= bitfieldExtract(p0.w, lane_index, 1) << (bit_offset + 1); + } + } + + value += base_value; + return i16vec2(value); +} + +#undef UNROLL_BITS_4 +#undef UNROLL_BITS_8 + +u8vec4 meshlet_decode_normal_tangent_oct8(uint stream_index, uint chunk_index, int lane_index, out bool t_sign) +{ + uint offset_in_b128 = meshlet_streams.data[stream_index].offset_in_b128; + uint bit_plane_config = meshlet_streams.data[stream_index].bit_plane_config; + + // Scalar math. + if (chunk_index != 0) + { + uint prev_bit_mask = bitfieldExtract(bit_plane_config, 0, int(chunk_index) * 4); + offset_in_b128 += bitCount(prev_bit_mask & 0x88888888) * 8; + offset_in_b128 += bitCount(prev_bit_mask & 0x44444444) * 4; + offset_in_b128 += bitCount(prev_bit_mask & 0x22222222) * 2; + offset_in_b128 += bitCount(prev_bit_mask & 0x11111111) * 1; + } + + // Scalar math. + uint encoded_bits = bitfieldExtract(bit_plane_config, int(chunk_index * 4), 4); + uvec4 base_value = uvec4(unpack8(meshlet_streams.data[stream_index].base_value_or_offsets[chunk_index])); + uvec4 value = uvec4(0); + + if (encoded_bits == 8) + { + // Vector loads. + uint value_xyzw = payload_u32.data[offset_in_b128 * 4 + lane_index]; + value = uvec4(unpack8(value_xyzw)); + } + else if (encoded_bits != 0) + { + uvec4 p0, p1, p2, p3; + + // Scalar loads, vector math. + // Preload early. Also helps compiler prove it can use common descriptor (RADV thing). + p0 = payload.data[offset_in_b128]; + offset_in_b128 += 1; + +#define UNROLL_BITS_4(out_value, bit_offset, p) \ + out_value |= bitfieldExtract(p.x, lane_index, 1) << ((bit_offset) + 0); \ + out_value |= bitfieldExtract(p.y, lane_index, 1) << ((bit_offset) + 1); \ + out_value |= bitfieldExtract(p.z, lane_index, 1) << ((bit_offset) + 2); \ + out_value |= bitfieldExtract(p.w, lane_index, 1) << ((bit_offset) + 3) + + int bit_offset = 0; + if ((encoded_bits & 4) != 0) + { + p1 = payload.data[offset_in_b128 + 0]; + p2 = payload.data[offset_in_b128 + 1]; + p3 = payload.data[offset_in_b128 + 2]; + + UNROLL_BITS_4(value.x, 0, p0); + UNROLL_BITS_4(value.y, 0, p1); + UNROLL_BITS_4(value.z, 0, p2); + UNROLL_BITS_4(value.w, 0, p3); + + // Preload for next iteration. + p0 = payload.data[offset_in_b128 + 3]; + + offset_in_b128 += 4; + bit_offset += 4; + } + + if ((encoded_bits & 2) != 0) + { + p1 = payload.data[offset_in_b128 + 0]; + + value.x |= bitfieldExtract(p0.x, lane_index, 1) << (bit_offset + 0); + value.x |= bitfieldExtract(p0.y, lane_index, 1) << (bit_offset + 1); + value.y |= bitfieldExtract(p0.z, lane_index, 1) << (bit_offset + 0); + value.y |= bitfieldExtract(p0.w, lane_index, 1) << (bit_offset + 1); + value.z |= bitfieldExtract(p1.x, lane_index, 1) << (bit_offset + 0); + value.z |= bitfieldExtract(p1.y, lane_index, 1) << (bit_offset + 1); + value.w |= bitfieldExtract(p1.z, lane_index, 1) << (bit_offset + 0); + value.w |= bitfieldExtract(p1.w, lane_index, 1) << (bit_offset + 1); + + // Preload for next iteration. + p0 = payload.data[offset_in_b128 + 1]; + offset_in_b128 += 2; + bit_offset += 2; + } + + if ((encoded_bits & 1) != 0) + { + value.x |= bitfieldExtract(p0.x, lane_index, 1) << bit_offset; + value.y |= bitfieldExtract(p0.y, lane_index, 1) << bit_offset; + value.z |= bitfieldExtract(p0.z, lane_index, 1) << bit_offset; + value.w |= bitfieldExtract(p0.w, lane_index, 1) << bit_offset; + } + } + + value += base_value; + + uint aux = bitfieldExtract(uint(meshlet_streams.data[stream_index].aux), int(chunk_index * 2), 2); + if (aux == 3) + { + t_sign = bool(value.w & 1); + value.w &= ~1; + } + else + t_sign = aux == 2; + + return u8vec4(value); +} + #endif diff --git a/tests/meshopt_sandbox.cpp b/tests/meshopt_sandbox.cpp index 002c5df6..87f347c7 100644 --- a/tests/meshopt_sandbox.cpp +++ b/tests/meshopt_sandbox.cpp @@ -322,14 +322,32 @@ static void decode_mesh(std::vector &out_index_buffer, } } +static vec4 decode_bgr10a2(uint32_t v) +{ + vec4 fvalue = vec4(ivec4((uvec4(v) >> uvec4(0, 10, 20, 30)) & 0x3ffu) - ivec4(512, 512, 512, 2)) * + vec4(1.0f / 511.0f, 1.0f / 511.0f, 1.0f / 511.0f, 1.0f); + fvalue = clamp(fvalue, vec4(-1.0f), vec4(1.0f)); + return fvalue; +} + static void decode_mesh_gpu( Vulkan::Device &dev, std::vector &out_index_buffer, std::vector &out_pos_buffer, + std::vector &out_uvs, std::vector &out_normals, std::vector &out_tangents, const MeshView &mesh) { out_index_buffer.resize(mesh.total_primitives); out_pos_buffer.resize(mesh.total_vertices); + struct Attr + { + uint32_t n; + uint32_t t; + vec2 uv; + }; + + std::vector out_attr_buffer(mesh.total_vertices); + Vulkan::BufferCreateInfo buf_info = {}; buf_info.domain = Vulkan::BufferDomain::LinkedDeviceHost; buf_info.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; @@ -345,6 +363,10 @@ static void decode_mesh_gpu( buf_info.domain = Vulkan::BufferDomain::CachedHost; auto readback_decoded_pos_buffer = dev.create_buffer(buf_info); + buf_info.size = out_attr_buffer.size() * sizeof(Attr); + buf_info.domain = Vulkan::BufferDomain::CachedHost; + auto readback_decoded_attr_buffer = dev.create_buffer(buf_info); + bool has_renderdoc = Vulkan::Device::init_renderdoc_capture(); if (has_renderdoc) dev.begin_renderdoc_capture(); @@ -354,12 +376,11 @@ static void decode_mesh_gpu( DecodeInfo info = {}; info.ibo = readback_decoded_index_buffer.get(); info.streams[0] = readback_decoded_pos_buffer.get(); + info.streams[1] = readback_decoded_attr_buffer.get(); info.target_style = mesh.format_header->style; info.payload = payload_buffer.get(); info.flags = DECODE_MODE_UNROLLED_MESH; - info.target_style = MeshStyle::Wireframe; - decode_mesh(*cmd, info, mesh); cmd->barrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT, VK_PIPELINE_STAGE_HOST_BIT, VK_ACCESS_HOST_READ_BIT); @@ -376,6 +397,23 @@ static void decode_mesh_gpu( memcpy(out_pos_buffer.data(), dev.map_host_buffer(*readback_decoded_pos_buffer, Vulkan::MEMORY_ACCESS_READ_BIT), out_pos_buffer.size() * sizeof(vec3)); + + out_uvs.clear(); + out_normals.clear(); + out_tangents.clear(); + + out_uvs.reserve(mesh.total_vertices); + out_normals.reserve(mesh.total_vertices); + out_tangents.reserve(mesh.total_vertices); + + auto *attrs = static_cast(dev.map_host_buffer(*readback_decoded_attr_buffer, Vulkan::MEMORY_ACCESS_READ_BIT)); + for (size_t i = 0, n = mesh.total_vertices; i < n; i++) + { + auto &attr = attrs[i]; + out_uvs.push_back(attr.uv); + out_normals.push_back(decode_bgr10a2(attr.n).xyz()); + out_tangents.push_back(decode_bgr10a2(attr.t)); + } } struct Attr @@ -574,7 +612,10 @@ int main() std::vector gpu_index_buffer; std::vector gpu_positions; - decode_mesh_gpu(dev, gpu_index_buffer, gpu_positions, view); + std::vector gpu_uvs; + std::vector gpu_normals; + std::vector gpu_tangents; + decode_mesh_gpu(dev, gpu_index_buffer, gpu_positions, gpu_uvs, gpu_normals, gpu_tangents, view); if (!validate_mesh(decoded_index_buffer, decoded_positions, gpu_index_buffer, gpu_positions, false)) From 5c986b60e6807fa969291434a042442ec2484ed5 Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Tue, 26 Dec 2023 11:11:05 +0100 Subject: [PATCH 54/59] Decode shader compiles. --- assets/shaders/decode/meshlet_decode.comp | 12 ++++-- assets/shaders/inc/meshlet_attribute_decode.h | 42 ++++++++----------- .../shaders/inc/meshlet_payload_constants.h | 14 +++---- 3 files changed, 34 insertions(+), 34 deletions(-) diff --git a/assets/shaders/decode/meshlet_decode.comp b/assets/shaders/decode/meshlet_decode.comp index 21f2e5cf..db5fd6b0 100644 --- a/assets/shaders/decode/meshlet_decode.comp +++ b/assets/shaders/decode/meshlet_decode.comp @@ -17,6 +17,7 @@ layout(constant_id = 3) const bool RUNTIME_MESH = false; #define MESHLET_PAYLOAD_PAYLOAD_BINDING 2 #include "../inc/meshlet_payload_decode.h" #include "../inc/meshlet_attribute_decode.h" +#include "../inc/meshlet_payload_constants.h" const int MESH_STYLE_WIREFRAME = 0; const int MESH_STYLE_TEXTURED = 1; @@ -196,11 +197,16 @@ void main() { bool t_sign; u8vec4 nt = meshlet_decode_normal_tangent_oct8( - meshlet_index * NUM_U32_STREAMS + MESHLET_STREAM_TYPE_NORMAL_TANGENT_OCT8, t_sign); - i16vec2 uv = meshlet_decode_snorm_scaled_i16x2(meshlet_index * NUM_U32_STREAMS + MESHLET_STREAM_TYPE_UV, exponent); + meshlet_index * NUM_U32_STREAMS + MESHLET_STREAM_TYPE_NORMAL_TANGENT_OCT8, + chunk_index, lane_index, t_sign); + i16vec2 uv = meshlet_decode_snorm_scaled_i16x2( + meshlet_index * NUM_U32_STREAMS + MESHLET_STREAM_TYPE_UV, + chunk_index, lane_index, exponent); vec2 fp_uv = ldexp(vec2(uv), ivec2(exponent)); + + mediump mat2x4 NT = attribute_decode_oct8_normal_tangent(nt, t_sign); output_stream_textured_attr.data[vertex_output_offset + lane_index] = - TexturedAttr(pack_a2bgr10(nt[0]), pack_a2bgr10(nt[1]), fp_uv); + TexturedAttr(pack_a2bgr10(NT[0]), pack_a2bgr10(NT[1]), fp_uv); } } else if (RUNTIME_MESH) diff --git a/assets/shaders/inc/meshlet_attribute_decode.h b/assets/shaders/inc/meshlet_attribute_decode.h index ea7c2897..2db0f0ba 100644 --- a/assets/shaders/inc/meshlet_attribute_decode.h +++ b/assets/shaders/inc/meshlet_attribute_decode.h @@ -1,39 +1,33 @@ #ifndef MESHLET_ATTRIBUTE_DECODE_H_ #define MESHLET_ATTRIBUTE_DECODE_H_ -vec3 attribute_decode_snorm_exp_position(uvec2 payload) +vec3 attribute_decode_snorm_exp_position(i16vec3 payload, int exponent) { - ivec3 sint_value = ivec3( - bitfieldExtract(int(payload.x), 0, 16), - bitfieldExtract(int(payload.x), 16, 16), - bitfieldExtract(int(payload.y), 0, 16)); - int exp = bitfieldExtract(int(payload.y), 16, 16); - return vec3( - ldexp(float(sint_value.x), exp), - ldexp(float(sint_value.y), exp), - ldexp(float(sint_value.z), exp)); + vec3 fp_pos = ldexp(vec3(payload), ivec3(exponent)); + return fp_pos; } -vec2 attribute_decode_snorm_exp_uv(uvec2 payload) +vec2 attribute_decode_snorm_exp_uv(i16vec2 payload, int exponent) { - ivec2 sint_value = ivec2( - bitfieldExtract(int(payload.x), 0, 16), - bitfieldExtract(int(payload.x), 16, 16)); - int exp = bitfieldExtract(int(payload.y), 0, 16); - return 0.5 * vec2( - ldexp(float(sint_value.x), exp), - ldexp(float(sint_value.y), exp)) + 0.5; + return 0.5 * ldexp(vec2(payload), ivec2(exponent)) + 0.5; } -// Adapted from: https://knarkowicz.wordpress.com/2014/04/16/octahedron-normal-vector-encoding/ -// https://twitter.com/Stubbesaurus/status/9379947905532272640 -mediump vec4 attribute_decode_oct8_normal_tangent(uint payload) +mediump vec3 attribute_decode_oct_normal(mediump vec2 f) { - mediump vec4 f = unpackSnorm4x8(payload); mediump vec3 n = vec3(f.x, f.y, 1.0 - abs(f.x) - abs(f.y)); mediump float t = max(-n.z, 0.0); n.xy += mix(vec2(t), vec2(-t), greaterThanEqual(n.xy, vec2(0.0))); - return vec4(normalize(n), f.w != 0.0 ? -1.0 : 1.0); + return normalize(n); +} + +// Adapted from: https://knarkowicz.wordpress.com/2014/04/16/octahedron-normal-vector-encoding/ +// https://twitter.com/Stubbesaurus/status/9379947905532272640 +mediump mat2x4 attribute_decode_oct8_normal_tangent(u8vec4 payload, bool t_sign) +{ + mediump vec4 f = vec4(i8vec4(payload)) / 127.0; + mediump vec3 N = attribute_decode_oct_normal(f.xy); + mediump vec3 T = attribute_decode_oct_normal(f.zw); + return mat2x4(vec4(N, 0.0), vec4(T, t_sign ? -1.0 : 1.0)); } -#endif \ No newline at end of file +#endif diff --git a/assets/shaders/inc/meshlet_payload_constants.h b/assets/shaders/inc/meshlet_payload_constants.h index ae1c8daa..90a1743c 100644 --- a/assets/shaders/inc/meshlet_payload_constants.h +++ b/assets/shaders/inc/meshlet_payload_constants.h @@ -5,11 +5,11 @@ #define MESHLET_PAYLOAD_NUM_CHUNKS 8 #define MESHLET_PAYLOAD_MAX_STREAMS 16 -const int MESHLET_STERAM_TYPE_PRIMITIVE = 0; -const int MESHLET_STERAM_TYPE_POSITION = 1; -const int MESHLET_STERAM_TYPE_NORMAL_TANGENT_OCT8 = 2; -const int MESHLET_STERAM_TYPE_UV = 3; -const int MESHLET_STERAM_TYPE_BONE_INDICES = 4; -const int MESHLET_STERAM_TYPE_BONE_WEIGHTS = 5; +const int MESHLET_STREAM_TYPE_PRIMITIVE = 0; +const int MESHLET_STREAM_TYPE_POSITION = 1; +const int MESHLET_STREAM_TYPE_NORMAL_TANGENT_OCT8 = 2; +const int MESHLET_STREAM_TYPE_UV = 3; +const int MESHLET_STREAM_TYPE_BONE_INDICES = 4; +const int MESHLET_STREAM_TYPE_BONE_WEIGHTS = 5; -#endif \ No newline at end of file +#endif From bdf36bdd6f06b5af867ffab385512121ab68cca7 Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Tue, 26 Dec 2023 11:56:46 +0100 Subject: [PATCH 55/59] Decode attrs in "decode" mesh path. --- scene-export/meshlet_export.cpp | 5 ++- tests/assets/shaders/meshlet_debug.frag | 23 ++++++++---- tests/assets/shaders/meshlet_debug.mesh.frag | 25 ++++++++----- tests/assets/shaders/meshlet_debug.vert | 15 ++++++-- tests/assets/shaders/meshlet_debug_plain.mesh | 37 +++++++++++++++++-- tests/meshlet_viewer.cpp | 15 +------- 6 files changed, 82 insertions(+), 38 deletions(-) diff --git a/scene-export/meshlet_export.cpp b/scene-export/meshlet_export.cpp index e83a645f..9fa41e37 100644 --- a/scene-export/meshlet_export.cpp +++ b/scene-export/meshlet_export.cpp @@ -263,7 +263,10 @@ static std::vector mesh_extract_uv_snorm_scale(const SceneFormats::Mesh } } else if (fmt == VK_FORMAT_UNDEFINED) - return {}; + { + for (auto &uv : uvs) + uv = {}; + } else { LOGE("Unexpected format %u.\n", fmt); diff --git a/tests/assets/shaders/meshlet_debug.frag b/tests/assets/shaders/meshlet_debug.frag index a23559c2..8554a2b0 100644 --- a/tests/assets/shaders/meshlet_debug.frag +++ b/tests/assets/shaders/meshlet_debug.frag @@ -2,10 +2,12 @@ #extension GL_EXT_nonuniform_qualifier : require #extension GL_EXT_fragment_shader_barycentric : require -layout(location = 0) pervertexEXT in vec3 vWorldPos[]; +layout(location = 0) in mediump vec3 vNormal; +layout(location = 1) in mediump vec4 vTangent; +layout(location = 2) in vec2 vUV; #if !SINGLE_INSTANCE_RENDER -layout(location = 1) flat in uint vDrawID; +layout(location = 3) flat in uint vDrawID; #else struct CompactedDrawInfo { uint meshlet_index; uint node_offset; uint material_index; }; layout(push_constant) uniform Registers @@ -29,13 +31,18 @@ void main() float pixels_from_edge = l / max(d, 0.0001); float highlight = 1.0 - smoothstep(0.25, 0.75, pixels_from_edge); - vec3 normal = normalize(cross(vWorldPos[1] - vWorldPos[0], vWorldPos[2] - vWorldPos[0])); + vec3 normal = normalize(vNormal); + vec3 tangent = normalize(vTangent.xyz); - FragColor = 0.5 * (0.5 * normal + 0.5); + FragColor = 0.3 * (0.5 * (normal * tangent * vTangent.w) + 0.5); FragColor.rg += 0.05 * highlight; + FragColor.rg += vUV * 0.02; - uint hashed = vDrawID ^ (vDrawID * 23423465); - FragColor.r += 0.01 * float(hashed % 19) / 19.0; - FragColor.g += 0.01 * float(hashed % 29) / 29.0; - FragColor.b += 0.01 * float(hashed % 131) / 131.0; + FragColor = clamp(0.5 * normal + 0.5, vec3(0.0), vec3(1.0)); + FragColor = pow(FragColor, vec3(4.0)); + + //uint hashed = vDrawID ^ (vDrawID * 23423465); + //FragColor.r += 0.1 * float(hashed % 19) / 19.0; + //FragColor.g += 0.1 * float(hashed % 29) / 29.0; + //FragColor.b += 0.1 * float(hashed % 131) / 131.0; } diff --git a/tests/assets/shaders/meshlet_debug.mesh.frag b/tests/assets/shaders/meshlet_debug.mesh.frag index b785c964..b8b703a5 100644 --- a/tests/assets/shaders/meshlet_debug.mesh.frag +++ b/tests/assets/shaders/meshlet_debug.mesh.frag @@ -3,8 +3,10 @@ #extension GL_EXT_nonuniform_qualifier : require #extension GL_EXT_fragment_shader_barycentric : require -layout(location = 0) pervertexEXT in vec3 vWorldPos[]; -layout(location = 1) perprimitiveEXT flat in uint vDrawID; +layout(location = 0) in mediump vec3 vNormal; +layout(location = 1) in mediump vec4 vTangent; +layout(location = 2) in vec2 vUV; +layout(location = 3) perprimitiveEXT flat in uint vDrawID; layout(location = 0) out vec3 FragColor; @@ -17,13 +19,18 @@ void main() float pixels_from_edge = l / max(d, 0.0001); float highlight = 1.0 - smoothstep(0.25, 0.75, pixels_from_edge); - vec3 normal = normalize(cross(vWorldPos[1] - vWorldPos[0], vWorldPos[2] - vWorldPos[0])); + vec3 normal = normalize(vNormal); + vec3 tangent = normalize(vTangent.xyz); - FragColor = 0.5 * (0.5 * normal + 0.5); + FragColor = 0.3 * (0.5 * (normal * tangent * vTangent.w) + 0.5); FragColor.rg += 0.05 * highlight; + FragColor.rg += vUV * 0.02; - uint hashed = vDrawID ^ (vDrawID * 23423465); - FragColor.r += 0.02 * float(hashed % 19) / 19.0; - FragColor.g += 0.02 * float(hashed % 29) / 29.0; - FragColor.b += 0.02 * float(hashed % 131) / 131.0; -} \ No newline at end of file + FragColor = clamp(0.5 * normal + 0.5, vec3(0.0), vec3(1.0)); + FragColor = pow(FragColor, vec3(4.0)); + + //uint hashed = vDrawID ^ (vDrawID * 23423465); + //FragColor.r += 0.1 * float(hashed % 19) / 19.0; + //FragColor.g += 0.1 * float(hashed % 29) / 29.0; + //FragColor.b += 0.1 * float(hashed % 131) / 131.0; +} diff --git a/tests/assets/shaders/meshlet_debug.vert b/tests/assets/shaders/meshlet_debug.vert index 5ba65b65..2ff2e651 100644 --- a/tests/assets/shaders/meshlet_debug.vert +++ b/tests/assets/shaders/meshlet_debug.vert @@ -4,9 +4,16 @@ #include "meshlet_render_types.h" layout(location = 0) in vec3 POS; -layout(location = 0) out vec3 vWorldPos; +layout(location = 1) in mediump vec3 NORMAL; +layout(location = 2) in mediump vec4 TANGENT; +layout(location = 3) in vec2 UV; + +layout(location = 0) out mediump vec3 vNormal; +layout(location = 1) out mediump vec4 vTangent; +layout(location = 2) out vec2 vUV; + #if !SINGLE_INSTANCE_RENDER -layout(location = 1) flat out uint vDrawID; +layout(location = 3) flat out uint vDrawID; #endif layout(set = 1, binding = 0) uniform UBO @@ -37,7 +44,9 @@ void main() mat4 M = transforms.data[draw_info.data[gl_DrawIDARB].node_offset]; #endif vec3 world_pos = (M * vec4(POS, 1.0)).xyz; - vWorldPos = world_pos; + vNormal = mat3(M) * NORMAL; + vTangent = vec4(mat3(M) * TANGENT.xyz, TANGENT.w); + vUV = UV; #if !SINGLE_INSTANCE_RENDER vDrawID = draw_info.data[gl_DrawIDARB].meshlet_index; #endif diff --git a/tests/assets/shaders/meshlet_debug_plain.mesh b/tests/assets/shaders/meshlet_debug_plain.mesh index 73a241ed..1ecc6ec0 100644 --- a/tests/assets/shaders/meshlet_debug_plain.mesh +++ b/tests/assets/shaders/meshlet_debug_plain.mesh @@ -21,8 +21,10 @@ layout(local_size_x = 32, local_size_y_id = 0) in; #include "meshlet_render_types.h" #include "meshlet_primitive_cull.h" -layout(location = 0) out vec3 vWorldPos[]; -layout(location = 1) perprimitiveEXT out uint vDrawID[]; +layout(location = 0) out vec3 vNormal[]; +layout(location = 1) out vec4 vTangent[]; +layout(location = 2) out vec2 vUV[]; +layout(location = 3) perprimitiveEXT out uint vDrawID[]; layout(set = 1, binding = 0) uniform UBO { @@ -44,6 +46,18 @@ layout(set = 0, binding = 1, scalar) readonly buffer VBOPOS vec3 data[]; } pos; +struct TexturedAttr +{ + uint n; + uint t; + vec2 uv; +}; + +layout(set = 0, binding = 2, std430) readonly buffer VBOATTR +{ + TexturedAttr data[]; +} attr; + layout(set = 0, binding = 3, std430) readonly buffer IndirectCommands { IndirectDrawMesh draws[]; @@ -70,6 +84,16 @@ layout(set = 0, binding = 10) buffer Stats uint vert; } stats; +mediump vec4 unpack_bgr10a2(uint v) +{ + mediump ivec4 vs; + vs.x = bitfieldExtract(int(v), 0, 10); + vs.y = bitfieldExtract(int(v), 10, 10); + vs.z = bitfieldExtract(int(v), 20, 10); + vs.w = bitfieldExtract(int(v), 30, 3); + return vec4(vs) / vec4(511.0, 511.0, 511.0, 1.0); +} + void main() { uint compacted_meshlet_index = meshlet_get_meshlet_index(); @@ -108,7 +132,14 @@ void main() { uint out_vert_index = meshlet_compacted_vertex_output(); gl_MeshVerticesEXT[out_vert_index].gl_Position = clip_pos; - vWorldPos[out_vert_index] = world_pos; + + TexturedAttr a = attr.data[meshlet.vertex_offset + linear_index + 32u * base_chunk_index]; + + mediump vec3 n = unpack_bgr10a2(a.n).xyz; + mediump vec4 t = unpack_bgr10a2(a.t); + vUV[out_vert_index] = a.uv; + vNormal[out_vert_index] = mat3(M) * n; + vTangent[out_vert_index] = vec4(mat3(M) * t.xyz, t.w); } if (gl_LocalInvocationIndex == 0) diff --git a/tests/meshlet_viewer.cpp b/tests/meshlet_viewer.cpp index 5401d15f..848892d1 100644 --- a/tests/meshlet_viewer.cpp +++ b/tests/meshlet_viewer.cpp @@ -95,29 +95,16 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler // materials.push_back(GRANITE_MATERIAL_MANAGER()->register_material(&albedo, 1, nullptr, 0)); } -#if 1 unsigned count = 0; for (auto &mesh : parser.get_meshes()) { -#if 0 - if (!mesh.has_material || - mesh.attribute_layout[int(MeshAttribute::Normal)].format == VK_FORMAT_UNDEFINED || - mesh.attribute_layout[int(MeshAttribute::UV)].format == VK_FORMAT_UNDEFINED || - mesh.attribute_layout[int(MeshAttribute::Tangent)].format == VK_FORMAT_UNDEFINED) - { - mesh_assets.emplace_back(); - continue; - } -#endif - auto internal_path = std::string("memory://mesh") + std::to_string(count++); - if (!::Granite::Meshlet::export_mesh_to_meshlet(internal_path, mesh, MeshStyle::Wireframe)) + if (!::Granite::Meshlet::export_mesh_to_meshlet(internal_path, mesh, MeshStyle::Textured)) throw std::runtime_error("Failed to export meshlet."); mesh_assets.push_back(GRANITE_ASSET_MANAGER()->register_asset( *GRANITE_FILESYSTEM(), internal_path, Granite::AssetClass::Mesh)); } -#endif for (auto &node : parser.get_nodes()) { From d237aa96cf56bb51c40da75b6cefd55d48384a1b Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Tue, 26 Dec 2023 12:44:02 +0100 Subject: [PATCH 56/59] Add attr path to inline decode as well. --- assets/shaders/decode/meshlet_decode.comp | 4 +-- tests/assets/shaders/meshlet_debug.mesh | 30 +++++++++++++++---- tests/assets/shaders/meshlet_debug_plain.mesh | 8 ++--- 3 files changed, 30 insertions(+), 12 deletions(-) diff --git a/assets/shaders/decode/meshlet_decode.comp b/assets/shaders/decode/meshlet_decode.comp index db5fd6b0..5e36ba21 100644 --- a/assets/shaders/decode/meshlet_decode.comp +++ b/assets/shaders/decode/meshlet_decode.comp @@ -202,7 +202,7 @@ void main() i16vec2 uv = meshlet_decode_snorm_scaled_i16x2( meshlet_index * NUM_U32_STREAMS + MESHLET_STREAM_TYPE_UV, chunk_index, lane_index, exponent); - vec2 fp_uv = ldexp(vec2(uv), ivec2(exponent)); + vec2 fp_uv = attribute_decode_snorm_exp_uv(uv, exponent); mediump mat2x4 NT = attribute_decode_oct8_normal_tangent(nt, t_sign); output_stream_textured_attr.data[vertex_output_offset + lane_index] = @@ -217,7 +217,7 @@ void main() if (TARGET_MESH_STYLE >= MESH_STYLE_TEXTURED) { output_stream_textured_attr.data[gl_WorkGroupSize.x * meshlet_index + 32u * chunk_index + registers.vertex_offset + lane_index] = - TexturedAttr(0, 0, vec2(0.0)); + TexturedAttr(-1u, -1u, vec2(intBitsToFloat(-1))); } } } diff --git a/tests/assets/shaders/meshlet_debug.mesh b/tests/assets/shaders/meshlet_debug.mesh index b5311c66..f9310a3e 100644 --- a/tests/assets/shaders/meshlet_debug.mesh +++ b/tests/assets/shaders/meshlet_debug.mesh @@ -26,8 +26,10 @@ layout(local_size_x = 32, local_size_y_id = 0) in; #include "meshlet_render_types.h" #include "meshlet_primitive_cull.h" -layout(location = 0) out vec3 vWorldPos[]; -layout(location = 1) perprimitiveEXT out uint vDrawID[]; +layout(location = 0) out vec3 vNormal[]; +layout(location = 1) out vec4 vTangent[]; +layout(location = 2) out vec2 vUV[]; +layout(location = 3) perprimitiveEXT out uint vDrawID[]; layout(set = 1, binding = 0) uniform UBO { @@ -106,7 +108,9 @@ void main() if (lane_index < index_chunk_info.vertex_count) { int exponent; - i16vec3 ipos = meshlet_decode_snorm_scaled_i16x3(meta.stream_offset + 1, chunk_index, lane_index, exponent); + i16vec3 ipos = meshlet_decode_snorm_scaled_i16x3( + meta.stream_offset + MESHLET_STREAM_TYPE_POSITION, + chunk_index, lane_index, exponent); vec3 pos = ldexp(vec3(ipos), ivec3(exponent)); world_pos = (M * vec4(pos, 1.0)).xyz; clip_pos = VP * vec4(world_pos, 1.0); @@ -114,14 +118,28 @@ void main() } meshlet_emit_primitive(unpack8(decoded_index_buffer).xyz, clip_pos, viewport); - if (linear_index < shared_active_prim_count_total) - vDrawID[linear_index] = task.meshlet_index; + if (gl_LocalInvocationIndex < shared_active_prim_count_total) + vDrawID[gl_LocalInvocationIndex] = task.meshlet_index; if (meshlet_lane_has_active_vert()) { uint out_vert_index = meshlet_compacted_vertex_output(); gl_MeshVerticesEXT[out_vert_index].gl_Position = clip_pos; - vWorldPos[out_vert_index] = world_pos; + + int exponent; + bool t_sign; + u8vec4 nt = meshlet_decode_normal_tangent_oct8( + meta.stream_offset + MESHLET_STREAM_TYPE_NORMAL_TANGENT_OCT8, + chunk_index, lane_index, t_sign); + i16vec2 uv = meshlet_decode_snorm_scaled_i16x2( + meta.stream_offset + MESHLET_STREAM_TYPE_UV, + chunk_index, lane_index, exponent); + + vUV[out_vert_index] = attribute_decode_snorm_exp_uv(uv, exponent); + + mediump mat2x4 NT = attribute_decode_oct8_normal_tangent(nt, t_sign); + vNormal[out_vert_index] = mat3(M) * NT[0].xyz; + vTangent[out_vert_index] = vec4(mat3(M) * NT[1].xyz, NT[1].w); } if (gl_LocalInvocationIndex == 0) diff --git a/tests/assets/shaders/meshlet_debug_plain.mesh b/tests/assets/shaders/meshlet_debug_plain.mesh index 1ecc6ec0..5edea5a1 100644 --- a/tests/assets/shaders/meshlet_debug_plain.mesh +++ b/tests/assets/shaders/meshlet_debug_plain.mesh @@ -21,8 +21,8 @@ layout(local_size_x = 32, local_size_y_id = 0) in; #include "meshlet_render_types.h" #include "meshlet_primitive_cull.h" -layout(location = 0) out vec3 vNormal[]; -layout(location = 1) out vec4 vTangent[]; +layout(location = 0) out mediump vec3 vNormal[]; +layout(location = 1) out mediump vec4 vTangent[]; layout(location = 2) out vec2 vUV[]; layout(location = 3) perprimitiveEXT out uint vDrawID[]; @@ -125,8 +125,8 @@ void main() uvec3 prim = uvec3(ibo.data[meshlet.primitive_offset + linear_index + 32u * base_chunk_index]); meshlet_emit_primitive(prim, clip_pos, viewport); - if (linear_index < shared_active_prim_count_total) - vDrawID[linear_index] = task.meshlet_index; + if (gl_LocalInvocationIndex < shared_active_prim_count_total) + vDrawID[gl_LocalInvocationIndex] = task.meshlet_index; if (meshlet_lane_has_active_vert()) { From 176438e97fa97c9203beb3e1fbb415c4ab9a751d Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Tue, 26 Dec 2023 14:03:10 +0100 Subject: [PATCH 57/59] Add some paths that use local invocation indexed outputs. --- assets/shaders/inc/meshlet_primitive_cull.h | 8 +++- tests/assets/shaders/meshlet_debug_plain.mesh | 39 +++++++++++++++++-- tests/meshlet_viewer.cpp | 2 + 3 files changed, 44 insertions(+), 5 deletions(-) diff --git a/assets/shaders/inc/meshlet_primitive_cull.h b/assets/shaders/inc/meshlet_primitive_cull.h index f8675e0a..d1d654d5 100644 --- a/assets/shaders/inc/meshlet_primitive_cull.h +++ b/assets/shaders/inc/meshlet_primitive_cull.h @@ -297,7 +297,13 @@ void meshlet_emit_primitive(uvec3 prim, vec4 clip_pos, vec4 viewport) SetMeshOutputsEXT(shared_active_vert_count_total, shared_active_prim_count_total); if (is_active_prim) - gl_PrimitiveTriangleIndicesEXT[compacted_index_output()] = remap_index_buffer(prim); + { +#ifdef MESHLET_PRIMITIVE_CULL_SHARED_INDEX + MESHLET_PRIMITIVE_CULL_SHARED_INDEX[compacted_index_output()] = u8vec3(remap_index_buffer(prim)); +#else + gl_PrimitiveTriangleIndicesEXT[compacted_index_output()] = remap_index_buffer(prim); +#endif + } } #endif diff --git a/tests/assets/shaders/meshlet_debug_plain.mesh b/tests/assets/shaders/meshlet_debug_plain.mesh index 5edea5a1..1fce6442 100644 --- a/tests/assets/shaders/meshlet_debug_plain.mesh +++ b/tests/assets/shaders/meshlet_debug_plain.mesh @@ -15,6 +15,13 @@ #error "Must define MESHLET_SIZE" #endif +#if MESHLET_SIZE > 32 +shared uint shared_attr_index[MESHLET_SIZE]; +shared vec4 shared_clip_pos[MESHLET_SIZE]; +//#define MESHLET_PRIMITIVE_CULL_SHARED_INDEX shared_primitive +//shared u8vec3 shared_primitive[MESHLET_SIZE]; +#endif + layout(max_primitives = MESHLET_SIZE, max_vertices = MESHLET_SIZE, triangles) out; layout(local_size_x = 32, local_size_y_id = 0) in; @@ -125,14 +132,14 @@ void main() uvec3 prim = uvec3(ibo.data[meshlet.primitive_offset + linear_index + 32u * base_chunk_index]); meshlet_emit_primitive(prim, clip_pos, viewport); - if (gl_LocalInvocationIndex < shared_active_prim_count_total) - vDrawID[gl_LocalInvocationIndex] = task.meshlet_index; - if (meshlet_lane_has_active_vert()) { uint out_vert_index = meshlet_compacted_vertex_output(); +#if MESHLET_SIZE > 32 + shared_attr_index[out_vert_index] = meshlet.vertex_offset + linear_index + 32u * base_chunk_index; + shared_clip_pos[out_vert_index] = clip_pos; +#else gl_MeshVerticesEXT[out_vert_index].gl_Position = clip_pos; - TexturedAttr a = attr.data[meshlet.vertex_offset + linear_index + 32u * base_chunk_index]; mediump vec3 n = unpack_bgr10a2(a.n).xyz; @@ -140,6 +147,30 @@ void main() vUV[out_vert_index] = a.uv; vNormal[out_vert_index] = mat3(M) * n; vTangent[out_vert_index] = vec4(mat3(M) * t.xyz, t.w); +#endif + } + +#if MESHLET_SIZE > 32 + barrier(); + + if (gl_LocalInvocationIndex < shared_active_vert_count_total) + { + gl_MeshVerticesEXT[gl_LocalInvocationIndex].gl_Position = shared_clip_pos[gl_LocalInvocationIndex]; + TexturedAttr a = attr.data[shared_attr_index[gl_LocalInvocationIndex]]; + mediump vec3 n = unpack_bgr10a2(a.n).xyz; + mediump vec4 t = unpack_bgr10a2(a.t); + vUV[gl_LocalInvocationIndex] = a.uv; + vNormal[gl_LocalInvocationIndex] = mat3(M) * n; + vTangent[gl_LocalInvocationIndex] = vec4(mat3(M) * t.xyz, t.w); + } +#endif + + if (gl_LocalInvocationIndex < shared_active_prim_count_total) + { +#ifdef MESHLET_PRIMITIVE_CULL_SHARED_INDEX + gl_PrimitiveTriangleIndicesEXT[gl_LocalInvocationIndex] = uvec3(shared_primitive[gl_LocalInvocationIndex]); +#endif + vDrawID[gl_LocalInvocationIndex] = task.meshlet_index; } if (gl_LocalInvocationIndex == 0) diff --git a/tests/meshlet_viewer.cpp b/tests/meshlet_viewer.cpp index 848892d1..b41b13fc 100644 --- a/tests/meshlet_viewer.cpp +++ b/tests/meshlet_viewer.cpp @@ -75,6 +75,8 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler // { explicit MeshletViewerApplication(const char *path) { + get_wsi().set_present_mode(Vulkan::PresentMode::UnlockedMaybeTear); + GLTF::Parser parser{path}; std::vector mesh_assets; From 9e3cd052d7850d0e7c48f62ab942d2eaf01b5884 Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Tue, 26 Dec 2023 15:39:36 +0100 Subject: [PATCH 58/59] Experiment with vertexID passing. --- tests/assets/shaders/meshlet_debug.mesh.frag | 56 +++++++++++++------ tests/assets/shaders/meshlet_debug_plain.mesh | 13 +++++ 2 files changed, 53 insertions(+), 16 deletions(-) diff --git a/tests/assets/shaders/meshlet_debug.mesh.frag b/tests/assets/shaders/meshlet_debug.mesh.frag index b8b703a5..08a2b09d 100644 --- a/tests/assets/shaders/meshlet_debug.mesh.frag +++ b/tests/assets/shaders/meshlet_debug.mesh.frag @@ -3,34 +3,58 @@ #extension GL_EXT_nonuniform_qualifier : require #extension GL_EXT_fragment_shader_barycentric : require +#define MESHLET_VERTEX_ID + +#ifdef MESHLET_VERTEX_ID +layout(location = 0) pervertexEXT in uint vVertexID[]; +layout(location = 1) perprimitiveEXT flat in uint vDrawID; +#else layout(location = 0) in mediump vec3 vNormal; layout(location = 1) in mediump vec4 vTangent; layout(location = 2) in vec2 vUV; layout(location = 3) perprimitiveEXT flat in uint vDrawID; +#endif layout(location = 0) out vec3 FragColor; -void main() +struct TexturedAttr +{ + uint n; + uint t; + vec2 uv; +}; + +layout(set = 0, binding = 2, std430) readonly buffer VBOATTR +{ + TexturedAttr data[]; +} attr; + +mediump vec4 unpack_bgr10a2(uint v) { - vec3 dd = fwidth(gl_BaryCoordEXT); - float d = max(max(dd.x, dd.y), dd.z); - float l = min(min(gl_BaryCoordEXT.x, gl_BaryCoordEXT.y), gl_BaryCoordEXT.z); + mediump ivec4 vs; + vs.x = bitfieldExtract(int(v), 0, 10); + vs.y = bitfieldExtract(int(v), 10, 10); + vs.z = bitfieldExtract(int(v), 20, 10); + vs.w = bitfieldExtract(int(v), 30, 3); + return vec4(vs) / vec4(511.0, 511.0, 511.0, 1.0); +} - float pixels_from_edge = l / max(d, 0.0001); - float highlight = 1.0 - smoothstep(0.25, 0.75, pixels_from_edge); +void main() +{ + uint va = vVertexID[0]; + uint vb = vVertexID[1]; + uint vc = vVertexID[2]; - vec3 normal = normalize(vNormal); - vec3 tangent = normalize(vTangent.xyz); + TexturedAttr attr_a = attr.data[va]; + TexturedAttr attr_b = attr.data[vb]; + TexturedAttr attr_c = attr.data[vc]; - FragColor = 0.3 * (0.5 * (normal * tangent * vTangent.w) + 0.5); - FragColor.rg += 0.05 * highlight; - FragColor.rg += vUV * 0.02; + mediump vec3 coeff = gl_BaryCoordEXT; + mediump vec3 Na = unpack_bgr10a2(attr_a.n).xyz; + mediump vec3 Nb = unpack_bgr10a2(attr_b.n).xyz; + mediump vec3 Nc = unpack_bgr10a2(attr_c.n).xyz; + mediump vec3 normal = normalize(Na * coeff.x + Nb * coeff.y + Nc * coeff.z); FragColor = clamp(0.5 * normal + 0.5, vec3(0.0), vec3(1.0)); FragColor = pow(FragColor, vec3(4.0)); - - //uint hashed = vDrawID ^ (vDrawID * 23423465); - //FragColor.r += 0.1 * float(hashed % 19) / 19.0; - //FragColor.g += 0.1 * float(hashed % 29) / 29.0; - //FragColor.b += 0.1 * float(hashed % 131) / 131.0; } diff --git a/tests/assets/shaders/meshlet_debug_plain.mesh b/tests/assets/shaders/meshlet_debug_plain.mesh index 1fce6442..c51047b7 100644 --- a/tests/assets/shaders/meshlet_debug_plain.mesh +++ b/tests/assets/shaders/meshlet_debug_plain.mesh @@ -15,6 +15,8 @@ #error "Must define MESHLET_SIZE" #endif +#define MESHLET_VERTEX_ID + #if MESHLET_SIZE > 32 shared uint shared_attr_index[MESHLET_SIZE]; shared vec4 shared_clip_pos[MESHLET_SIZE]; @@ -28,10 +30,15 @@ layout(local_size_x = 32, local_size_y_id = 0) in; #include "meshlet_render_types.h" #include "meshlet_primitive_cull.h" +#ifdef MESHLET_VERTEX_ID +layout(location = 0) out uint vVertexID[]; +layout(location = 1) perprimitiveEXT out uint vDrawID[]; +#else layout(location = 0) out mediump vec3 vNormal[]; layout(location = 1) out mediump vec4 vTangent[]; layout(location = 2) out vec2 vUV[]; layout(location = 3) perprimitiveEXT out uint vDrawID[]; +#endif layout(set = 1, binding = 0) uniform UBO { @@ -144,9 +151,13 @@ void main() mediump vec3 n = unpack_bgr10a2(a.n).xyz; mediump vec4 t = unpack_bgr10a2(a.t); +#ifdef MESHLET_VERTEX_ID + vVertexID[out_vert_index] = meshlet.vertex_offset + linear_index + 32u * base_chunk_index; +#else vUV[out_vert_index] = a.uv; vNormal[out_vert_index] = mat3(M) * n; vTangent[out_vert_index] = vec4(mat3(M) * t.xyz, t.w); +#endif #endif } @@ -173,10 +184,12 @@ void main() vDrawID[gl_LocalInvocationIndex] = task.meshlet_index; } +#if 0 if (gl_LocalInvocationIndex == 0) { atomicAdd(stats.invocations, gl_WorkGroupSize.x * gl_WorkGroupSize.y); atomicAdd(stats.prim, shared_active_prim_count_total); atomicAdd(stats.vert, shared_active_vert_count_total); } +#endif } From 9bb9a6de7ec41725f5baddae31dddb7e5bfea06e Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Wed, 27 Dec 2023 10:26:38 +0100 Subject: [PATCH 59/59] Revert "Experiment with vertexID passing." This reverts commit 9e3cd052d7850d0e7c48f62ab942d2eaf01b5884. --- tests/assets/shaders/meshlet_debug.mesh.frag | 56 ++++++------------- tests/assets/shaders/meshlet_debug_plain.mesh | 13 ----- 2 files changed, 16 insertions(+), 53 deletions(-) diff --git a/tests/assets/shaders/meshlet_debug.mesh.frag b/tests/assets/shaders/meshlet_debug.mesh.frag index 08a2b09d..b8b703a5 100644 --- a/tests/assets/shaders/meshlet_debug.mesh.frag +++ b/tests/assets/shaders/meshlet_debug.mesh.frag @@ -3,58 +3,34 @@ #extension GL_EXT_nonuniform_qualifier : require #extension GL_EXT_fragment_shader_barycentric : require -#define MESHLET_VERTEX_ID - -#ifdef MESHLET_VERTEX_ID -layout(location = 0) pervertexEXT in uint vVertexID[]; -layout(location = 1) perprimitiveEXT flat in uint vDrawID; -#else layout(location = 0) in mediump vec3 vNormal; layout(location = 1) in mediump vec4 vTangent; layout(location = 2) in vec2 vUV; layout(location = 3) perprimitiveEXT flat in uint vDrawID; -#endif layout(location = 0) out vec3 FragColor; -struct TexturedAttr -{ - uint n; - uint t; - vec2 uv; -}; - -layout(set = 0, binding = 2, std430) readonly buffer VBOATTR -{ - TexturedAttr data[]; -} attr; - -mediump vec4 unpack_bgr10a2(uint v) -{ - mediump ivec4 vs; - vs.x = bitfieldExtract(int(v), 0, 10); - vs.y = bitfieldExtract(int(v), 10, 10); - vs.z = bitfieldExtract(int(v), 20, 10); - vs.w = bitfieldExtract(int(v), 30, 3); - return vec4(vs) / vec4(511.0, 511.0, 511.0, 1.0); -} - void main() { - uint va = vVertexID[0]; - uint vb = vVertexID[1]; - uint vc = vVertexID[2]; + vec3 dd = fwidth(gl_BaryCoordEXT); + float d = max(max(dd.x, dd.y), dd.z); + float l = min(min(gl_BaryCoordEXT.x, gl_BaryCoordEXT.y), gl_BaryCoordEXT.z); - TexturedAttr attr_a = attr.data[va]; - TexturedAttr attr_b = attr.data[vb]; - TexturedAttr attr_c = attr.data[vc]; + float pixels_from_edge = l / max(d, 0.0001); + float highlight = 1.0 - smoothstep(0.25, 0.75, pixels_from_edge); - mediump vec3 coeff = gl_BaryCoordEXT; - mediump vec3 Na = unpack_bgr10a2(attr_a.n).xyz; - mediump vec3 Nb = unpack_bgr10a2(attr_b.n).xyz; - mediump vec3 Nc = unpack_bgr10a2(attr_c.n).xyz; - mediump vec3 normal = normalize(Na * coeff.x + Nb * coeff.y + Nc * coeff.z); + vec3 normal = normalize(vNormal); + vec3 tangent = normalize(vTangent.xyz); + + FragColor = 0.3 * (0.5 * (normal * tangent * vTangent.w) + 0.5); + FragColor.rg += 0.05 * highlight; + FragColor.rg += vUV * 0.02; FragColor = clamp(0.5 * normal + 0.5, vec3(0.0), vec3(1.0)); FragColor = pow(FragColor, vec3(4.0)); + + //uint hashed = vDrawID ^ (vDrawID * 23423465); + //FragColor.r += 0.1 * float(hashed % 19) / 19.0; + //FragColor.g += 0.1 * float(hashed % 29) / 29.0; + //FragColor.b += 0.1 * float(hashed % 131) / 131.0; } diff --git a/tests/assets/shaders/meshlet_debug_plain.mesh b/tests/assets/shaders/meshlet_debug_plain.mesh index c51047b7..1fce6442 100644 --- a/tests/assets/shaders/meshlet_debug_plain.mesh +++ b/tests/assets/shaders/meshlet_debug_plain.mesh @@ -15,8 +15,6 @@ #error "Must define MESHLET_SIZE" #endif -#define MESHLET_VERTEX_ID - #if MESHLET_SIZE > 32 shared uint shared_attr_index[MESHLET_SIZE]; shared vec4 shared_clip_pos[MESHLET_SIZE]; @@ -30,15 +28,10 @@ layout(local_size_x = 32, local_size_y_id = 0) in; #include "meshlet_render_types.h" #include "meshlet_primitive_cull.h" -#ifdef MESHLET_VERTEX_ID -layout(location = 0) out uint vVertexID[]; -layout(location = 1) perprimitiveEXT out uint vDrawID[]; -#else layout(location = 0) out mediump vec3 vNormal[]; layout(location = 1) out mediump vec4 vTangent[]; layout(location = 2) out vec2 vUV[]; layout(location = 3) perprimitiveEXT out uint vDrawID[]; -#endif layout(set = 1, binding = 0) uniform UBO { @@ -151,13 +144,9 @@ void main() mediump vec3 n = unpack_bgr10a2(a.n).xyz; mediump vec4 t = unpack_bgr10a2(a.t); -#ifdef MESHLET_VERTEX_ID - vVertexID[out_vert_index] = meshlet.vertex_offset + linear_index + 32u * base_chunk_index; -#else vUV[out_vert_index] = a.uv; vNormal[out_vert_index] = mat3(M) * n; vTangent[out_vert_index] = vec4(mat3(M) * t.xyz, t.w); -#endif #endif } @@ -184,12 +173,10 @@ void main() vDrawID[gl_LocalInvocationIndex] = task.meshlet_index; } -#if 0 if (gl_LocalInvocationIndex == 0) { atomicAdd(stats.invocations, gl_WorkGroupSize.x * gl_WorkGroupSize.y); atomicAdd(stats.prim, shared_active_prim_count_total); atomicAdd(stats.vert, shared_active_vert_count_total); } -#endif }