Skip to content

Commit

Permalink
Merge pull request #126 from Themaister/meshlet-cull
Browse files: browse the repository at this point in the history
Iterate further on meshlet encoding scheme
  • Loading branch information
Themaister authored Dec 27, 2023
2 parents 67f573f + 9bb9a6d commit 4351239
Show file tree
Hide file tree
Showing 37 changed files with 3,184 additions and 1,715 deletions.
4 changes: 2 additions & 2 deletions application/global/global_managers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
*/

#include "global_managers.hpp"
#include "environment.hpp"
#include "logging.hpp"
#include <thread>
#include <assert.h>
Expand Down Expand Up @@ -219,8 +220,7 @@ void init(Factory &factory, ManagerFeatureFlags flags, unsigned max_threads, flo

if (cpu_threads > max_threads)
cpu_threads = max_threads;
if (const char *env = getenv("GRANITE_NUM_WORKER_THREADS"))
cpu_threads = strtoul(env, nullptr, 0);
cpu_threads = Util::get_environment_uint("GRANITE_NUM_WORKER_THREADS", cpu_threads);

unsigned background_cpu_threads = (cpu_threads + 1) / 2;

Expand Down
218 changes: 120 additions & 98 deletions assets/shaders/decode/meshlet_decode.comp
Original file line number Diff line number Diff line change
Expand Up @@ -2,28 +2,23 @@

#extension GL_EXT_scalar_block_layout : require
#include "../inc/meshlet_payload_constants.h"
#extension GL_KHR_shader_subgroup_basic : require

#define MESHLET_PAYLOAD_LARGE_WORKGROUP 1

#if MESHLET_PAYLOAD_LARGE_WORKGROUP
#define MESHLET_PAYLOAD_WG_Y MESHLET_PAYLOAD_NUM_CHUNKS
#else
#define MESHLET_PAYLOAD_WG_Y 1
#endif
layout(local_size_x = 32, local_size_y = MESHLET_PAYLOAD_WG_Y) in;
layout(local_size_x = 256) in;

layout(constant_id = 0) const uint NUM_U32_STREAMS = MESHLET_PAYLOAD_MAX_STREAMS;
layout(constant_id = 1) const uint NUM_OUTPUT_U32_STREAMS = 1;
layout(constant_id = 2) const bool RAW_PAYLOAD = false;
#define MESHLET_PAYLOAD_NUM_U32_STREAMS NUM_U32_STREAMS
layout(constant_id = 1) const bool UNROLLED_MESH = false;
layout(constant_id = 2) const uint TARGET_MESH_STYLE = 0;
layout(constant_id = 3) const bool RUNTIME_MESH = false;

#define MESHLET_PAYLOAD_DESCRIPTOR_SET 0
#define MESHLET_PAYLOAD_META_BINDING 0
#define MESHLET_PAYLOAD_STREAM_BINDING 1
#define MESHLET_PAYLOAD_PAYLOAD_BINDING 2
#include "../inc/meshlet_payload_decode.h"
#include "../inc/meshlet_attribute_decode.h"
#include "../inc/meshlet_payload_constants.h"

const int MESH_STYLE = int(NUM_OUTPUT_U32_STREAMS);
const int MESH_STYLE_WIREFRAME = 0;
const int MESH_STYLE_TEXTURED = 1;
const int MESH_STYLE_SKINNED = 2;
Expand All @@ -38,11 +33,6 @@ layout(set = 0, binding = 3, scalar) writeonly buffer OutputIndices8
u8vec3 data[];
} output_indices8;

layout(set = 0, binding = 4, std430) writeonly buffer OutputStream0
{
uint data[];
} output_stream_raw;

layout(set = 0, binding = 4, scalar) writeonly buffer OutputStreamPos
{
vec3 data[];
Expand All @@ -67,8 +57,19 @@ layout(set = 0, binding = 6, std430) writeonly buffer OutputStreamSkin

layout(set = 0, binding = 7, std430) readonly buffer OutputOffsets
{
uvec2 data[];
} output_offset_strides;
uint data[];
} primitive_output_offsets;

// Per-meshlet record written by main() on the RUNTIME_MESH path and stored in
// indirect_commands_mesh.draws[].
struct IndirectDrawMesh
{
// First primitive slot of this meshlet in the index output; main() sets it to
// primitive_output_offsets.data[meshlet_index] + registers.primitive_offset.
uint primitive_offset;
// First vertex slot of this meshlet in the attribute outputs; main() sets it to
// gl_WorkGroupSize.x * meshlet_index + registers.vertex_offset.
uint vertex_offset;
};

// Output buffer of IndirectDrawMesh records. main() stores one record at
// draws[meshlet_index + registers.meshlet_offset] from invocation 0 of the
// workgroup when UNROLLED_MESH is false and RUNTIME_MESH is true.
// NOTE(review): binding 8 of set 0 is also declared for the IndirectIndexedDraw
// command buffer in this shader; main() writes only one of the two depending on
// RUNTIME_MESH. Presumably the host binds the matching buffer - confirm.
layout(set = 0, binding = 8, std430) writeonly buffer IndirectCommandsMesh
{
IndirectDrawMesh draws[];
} indirect_commands_mesh;

struct IndirectIndexedDraw
{
Expand All @@ -79,10 +80,10 @@ struct IndirectIndexedDraw
uint firstInstance;
};

layout(set = 0, binding = 8, std430) writeonly buffer IndirectCommands
layout(set = 0, binding = 8, std430) writeonly buffer IndirectCommandsMDI
{
IndirectIndexedDraw draws[];
} indirect_commands;
} indirect_commands_mdi;

layout(push_constant, std430) uniform Registers
{
Expand All @@ -100,102 +101,123 @@ uint pack_a2bgr10(vec4 v)
void main()
{
uint meshlet_index = gl_WorkGroupID.x;
meshlet_init_workgroup(meshlet_index * NUM_U32_STREAMS);
MeshletMetaRaw meta = meshlet_metas_raw.data[meshlet_index];

if (!RAW_PAYLOAD)
int lane_index;
uint chunk_index;
uint linear_index;

if (gl_SubgroupSize == 32)
{
IndirectIndexedDraw draw;
draw.indexCount = 3 * (meta.num_primitives_minus_1 + 1);
draw.instanceCount = 1;
draw.vertexOffset = meta.base_vertex_offset + registers.vertex_offset;
draw.firstIndex = 3 * (output_offset_strides.data[meshlet_index].x + registers.primitive_offset);
draw.firstInstance = 0;
indirect_commands.draws[meshlet_index + registers.meshlet_offset] = draw;
chunk_index = gl_SubgroupID;
lane_index = int(gl_SubgroupInvocationID);
linear_index = chunk_index * gl_SubgroupSize + lane_index;
}

#define INDEX(linear_index, packed_indices) { \
uint output_offset; \
if (RAW_PAYLOAD) { \
uvec3 indices = uvec4(unpack8(packed_indices)).xyz; \
indices += meta.base_vertex_offset + registers.vertex_offset; \
output_offset = output_offset_strides.data[meshlet_index * NUM_OUTPUT_U32_STREAMS].x; \
output_offset += registers.primitive_offset; \
if (linear_index <= uint(meta.num_primitives_minus_1)) \
output_indices32.data[output_offset + linear_index] = indices; \
} else { \
output_offset = output_offset_strides.data[meshlet_index].x; \
output_offset += registers.primitive_offset; \
if (linear_index <= uint(meta.num_primitives_minus_1)) \
output_indices8.data[output_offset + linear_index] = unpack8(packed_indices).xyz; \
} \
}

else
{
MESHLET_DECODE_STREAM_32(meshlet_index * NUM_U32_STREAMS, 0, INDEX);
linear_index = gl_LocalInvocationIndex;
chunk_index = linear_index / 32u;
lane_index = int(linear_index & 31u);
}

if (RAW_PAYLOAD)
uint primitive_output_offset = primitive_output_offsets.data[meshlet_index] + registers.primitive_offset;

if (!UNROLLED_MESH && gl_LocalInvocationIndex == 0)
{
#define ATTR(linear_index, packed_decoded) { \
uvec2 output_offset_stride0 = output_offset_strides.data[meshlet_index * NUM_OUTPUT_U32_STREAMS + i]; \
output_offset_stride0.x += registers.vertex_offset; \
if (linear_index <= uint(meta.num_attributes_minus_1)) \
output_stream_raw.data[output_offset_stride0.x + linear_index * output_offset_stride0.y] = packed_decoded; \
}
MeshletInfo info = meshlet_get_meshlet_info(meshlet_index * NUM_U32_STREAMS);

for (uint i = 1; i < NUM_OUTPUT_U32_STREAMS; i++)
if (RUNTIME_MESH)
{
MESHLET_DECODE_STREAM_32(meshlet_index * NUM_U32_STREAMS, i, ATTR);
IndirectDrawMesh draw;
draw.primitive_offset = primitive_output_offset;
// Unrolled. Always allocate full 256 entries.
draw.vertex_offset = gl_WorkGroupSize.x * meshlet_index + registers.vertex_offset;
indirect_commands_mesh.draws[meshlet_index + registers.meshlet_offset] = draw;
}
else
{
IndirectIndexedDraw draw;
draw.indexCount = 3 * info.primitive_count;
draw.instanceCount = 1;
draw.vertexOffset = meta.base_vertex_offset + registers.vertex_offset;
draw.firstIndex = 3 * primitive_output_offset;
draw.firstInstance = 0;
indirect_commands_mdi.draws[meshlet_index + registers.meshlet_offset] = draw;
}
}
else
{
uint output_offset = output_offset_strides.data[meshlet_index].y;
output_offset += registers.vertex_offset;

#define POS(linear_index, packed_decoded) { \
if (linear_index <= uint(meta.num_attributes_minus_1)) \
output_stream_pos.data[output_offset + linear_index] = attribute_decode_snorm_exp_position(packed_decoded); \
}

#define NORMAL(linear_index, packed_decoded) { \
if (linear_index <= uint(meta.num_attributes_minus_1)) { \
output_stream_textured_attr.data[output_offset + linear_index].normal = pack_a2bgr10(attribute_decode_oct8_normal_tangent(packed_decoded)); \
} \
}

#define TANGENT(linear_index, packed_decoded) { \
if (linear_index <= uint(meta.num_attributes_minus_1)) { \
output_stream_textured_attr.data[output_offset + linear_index].tangent = pack_a2bgr10(attribute_decode_oct8_normal_tangent(packed_decoded)); \
} \
}

#define UV(linear_index, packed_decoded) { \
if (linear_index <= uint(meta.num_attributes_minus_1)) { \
output_stream_textured_attr.data[output_offset + linear_index].uv = attribute_decode_snorm_exp_uv(packed_decoded); \
} \
}
MeshletChunkInfo chunk_info = meshlet_get_chunk_info(meshlet_index * NUM_U32_STREAMS, chunk_index);

#define SKIN(linear_index, packed_decoded) { \
if (linear_index <= uint(meta.num_attributes_minus_1)) { \
output_stream_skin.data[output_offset + linear_index] = packed_decoded; \
} \
}
{
MESHLET_DECODE_STREAM_64(meshlet_index * NUM_U32_STREAMS, 1, POS);
}
// Index
if (chunk_index < meta.num_chunks && lane_index < chunk_info.primitive_count)
{
uint decoded_index_buffer = meshlet_decode_index_buffer(
meshlet_index * NUM_U32_STREAMS + MESHLET_STREAM_TYPE_PRIMITIVE,
chunk_index, lane_index);

if (RUNTIME_MESH)
primitive_output_offset += 32u * chunk_index;
else
primitive_output_offset += chunk_info.primitive_offset;

uvec3 indices;
indices.x = bitfieldExtract(decoded_index_buffer, 0, 8);
indices.y = bitfieldExtract(decoded_index_buffer, 8, 8);
indices.z = bitfieldExtract(decoded_index_buffer, 16, 8);
if (!RUNTIME_MESH)
indices += chunk_info.vertex_offset;

if (UNROLLED_MESH)
output_indices32.data[primitive_output_offset + lane_index] = indices + meta.base_vertex_offset;
else
output_indices8.data[primitive_output_offset + lane_index] = u8vec3(indices);
}
else if (RUNTIME_MESH)
{
output_indices8.data[primitive_output_offset + 32u * chunk_index + lane_index] = u8vec3(0);
}

if (MESH_STYLE >= MESH_STYLE_TEXTURED)
// Attributes
if (chunk_index < meta.num_chunks && lane_index < chunk_info.vertex_count)
{
int exponent;
i16vec3 pos = meshlet_decode_snorm_scaled_i16x3(
meshlet_index * NUM_U32_STREAMS + MESHLET_STREAM_TYPE_POSITION,
chunk_index, lane_index, exponent);

vec3 fp_pos = ldexp(vec3(pos), ivec3(exponent));
uint vertex_output_offset;
if (RUNTIME_MESH)
vertex_output_offset = gl_WorkGroupSize.x * meshlet_index + 32u * chunk_index + registers.vertex_offset;
else
vertex_output_offset = registers.vertex_offset + meta.base_vertex_offset + chunk_info.vertex_offset;
output_stream_pos.data[vertex_output_offset + lane_index] = fp_pos;

if (TARGET_MESH_STYLE >= MESH_STYLE_TEXTURED)
{
MESHLET_DECODE_STREAM_32(meshlet_index * NUM_U32_STREAMS, 3, NORMAL);
MESHLET_DECODE_STREAM_32(meshlet_index * NUM_U32_STREAMS, 4, TANGENT);
MESHLET_DECODE_STREAM_64(meshlet_index * NUM_U32_STREAMS, 5, UV);
bool t_sign;
u8vec4 nt = meshlet_decode_normal_tangent_oct8(
meshlet_index * NUM_U32_STREAMS + MESHLET_STREAM_TYPE_NORMAL_TANGENT_OCT8,
chunk_index, lane_index, t_sign);
i16vec2 uv = meshlet_decode_snorm_scaled_i16x2(
meshlet_index * NUM_U32_STREAMS + MESHLET_STREAM_TYPE_UV,
chunk_index, lane_index, exponent);
vec2 fp_uv = attribute_decode_snorm_exp_uv(uv, exponent);

mediump mat2x4 NT = attribute_decode_oct8_normal_tangent(nt, t_sign);
output_stream_textured_attr.data[vertex_output_offset + lane_index] =
TexturedAttr(pack_a2bgr10(NT[0]), pack_a2bgr10(NT[1]), fp_uv);
}
}
else if (RUNTIME_MESH)
{
output_stream_pos.data[gl_WorkGroupSize.x * meshlet_index + 32u * chunk_index + registers.vertex_offset + lane_index] =
vec3(intBitsToFloat(-1));

if (MESH_STYLE >= MESH_STYLE_SKINNED)
if (TARGET_MESH_STYLE >= MESH_STYLE_TEXTURED)
{
MESHLET_DECODE_STREAM_64(meshlet_index * NUM_U32_STREAMS, 7, SKIN);
output_stream_textured_attr.data[gl_WorkGroupSize.x * meshlet_index + 32u * chunk_index + registers.vertex_offset + lane_index] =
TexturedAttr(-1u, -1u, vec2(intBitsToFloat(-1)));
}
}
}
42 changes: 18 additions & 24 deletions assets/shaders/inc/meshlet_attribute_decode.h
Original file line number Diff line number Diff line change
@@ -1,39 +1,33 @@
#ifndef MESHLET_ATTRIBUTE_DECODE_H_
#define MESHLET_ATTRIBUTE_DECODE_H_

vec3 attribute_decode_snorm_exp_position(uvec2 payload)
vec3 attribute_decode_snorm_exp_position(i16vec3 payload, int exponent)
{
ivec3 sint_value = ivec3(
bitfieldExtract(int(payload.x), 0, 16),
bitfieldExtract(int(payload.x), 16, 16),
bitfieldExtract(int(payload.y), 0, 16));
int exp = bitfieldExtract(int(payload.y), 16, 16);
return vec3(
ldexp(float(sint_value.x), exp),
ldexp(float(sint_value.y), exp),
ldexp(float(sint_value.z), exp));
vec3 fp_pos = ldexp(vec3(payload), ivec3(exponent));
return fp_pos;
}

vec2 attribute_decode_snorm_exp_uv(uvec2 payload)
vec2 attribute_decode_snorm_exp_uv(i16vec2 payload, int exponent)
{
ivec2 sint_value = ivec2(
bitfieldExtract(int(payload.x), 0, 16),
bitfieldExtract(int(payload.x), 16, 16));
int exp = bitfieldExtract(int(payload.y), 0, 16);
return 0.5 * vec2(
ldexp(float(sint_value.x), exp),
ldexp(float(sint_value.y), exp)) + 0.5;
return 0.5 * ldexp(vec2(payload), ivec2(exponent)) + 0.5;
}

// Adapted from: https://knarkowicz.wordpress.com/2014/04/16/octahedron-normal-vector-encoding/
// https://twitter.com/Stubbesaurus/status/9379947905532272640
mediump vec4 attribute_decode_oct8_normal_tangent(uint payload)
mediump vec3 attribute_decode_oct_normal(mediump vec2 f)
{
mediump vec4 f = unpackSnorm4x8(payload);
mediump vec3 n = vec3(f.x, f.y, 1.0 - abs(f.x) - abs(f.y));
mediump float t = max(-n.z, 0.0);
n.xy += mix(vec2(t), vec2(-t), greaterThanEqual(n.xy, vec2(0.0)));
return vec4(normalize(n), f.w != 0.0 ? -1.0 : 1.0);
return normalize(n);
}

// Adapted from: https://knarkowicz.wordpress.com/2014/04/16/octahedron-normal-vector-encoding/
// https://twitter.com/Stubbesaurus/status/9379947905532272640
// Decodes a packed normal/tangent pair. Each payload byte is reinterpreted as a
// signed 8-bit value and scaled by 1/127; .xy feeds the normal decode and .zw
// the tangent decode. Column 0 of the result is the normal with w = 0.0, and
// column 1 is the tangent with w = -1.0 when t_sign is true, +1.0 otherwise.
mediump mat2x4 attribute_decode_oct8_normal_tangent(u8vec4 payload, bool t_sign)
{
	mediump vec4 oct = vec4(i8vec4(payload)) / 127.0;
	mediump float tangent_w = t_sign ? -1.0 : 1.0;

	mediump mat2x4 decoded;
	decoded[0] = vec4(attribute_decode_oct_normal(oct.xy), 0.0);
	decoded[1] = vec4(attribute_decode_oct_normal(oct.zw), tangent_w);
	return decoded;
}

#endif
#endif
9 changes: 8 additions & 1 deletion assets/shaders/inc/meshlet_payload_constants.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,11 @@
#define MESHLET_PAYLOAD_NUM_CHUNKS 8
#define MESHLET_PAYLOAD_MAX_STREAMS 16

#endif
// Stream slot indices within one meshlet. The decode shader adds one of these
// to meshlet_index * NUM_U32_STREAMS to pick the stream it decodes.
// Slot passed to meshlet_decode_index_buffer().
const int MESHLET_STREAM_TYPE_PRIMITIVE = 0;
// Slot passed to meshlet_decode_snorm_scaled_i16x3() for positions.
const int MESHLET_STREAM_TYPE_POSITION = 1;
// Slot passed to meshlet_decode_normal_tangent_oct8().
const int MESHLET_STREAM_TYPE_NORMAL_TANGENT_OCT8 = 2;
// Slot passed to meshlet_decode_snorm_scaled_i16x2() for UVs.
const int MESHLET_STREAM_TYPE_UV = 3;
// NOTE(review): the two bone slots are not referenced by the decode shader in
// this change; presumably reserved for skinned meshes - confirm against the encoder.
const int MESHLET_STREAM_TYPE_BONE_INDICES = 4;
const int MESHLET_STREAM_TYPE_BONE_WEIGHTS = 5;

#endif
Loading

0 comments on commit 4351239

Please sign in to comment.