-
Notifications
You must be signed in to change notification settings - Fork 573
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Support XFB in MoltenVK #2169
base: main
Are you sure you want to change the base?
Support XFB in MoltenVK #2169
Changes from all commits
10154f5
2127a3b
4a8543e
f195855
343ff6e
048ac2d
179c6e0
f1c0ad2
117eaa3
1e8cbe4
f1913aa
f8a27d9
cebb964
37c0972
aab161a
9d2329a
111cebb
e3cf900
562b959
d62fe77
36d39df
35858fb
3c427de
c352f94
001ff7d
28babde
fb520f4
556c9fa
b6279e5
579635a
b86f512
66ca6c4
948651b
e74800f
b352521
2798c48
b931900
ab2b37b
a1d92e7
2959f3a
16dd1f1
742f725
8f66f30
2e14c91
8dbf250
b020270
3521814
bf4f823
109959e
3bd855f
1154932
a547b52
64fa0b6
dada588
adb3a7b
fec7607
0393302
739a140
15a8b70
575e75d
8bcfd32
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
#include <metal_stdlib> | ||
#include <simd/simd.h> | ||
|
||
using namespace metal; | ||
|
||
struct VertOut | ||
{ | ||
float4 vBar; | ||
}; | ||
|
||
struct spvXfbBuffer1 | ||
{ | ||
char _m0_pad[4]; | ||
packed_float4 gl_Position; | ||
}; | ||
|
||
struct spvXfbBuffer2 | ||
{ | ||
char _m0_pad[16]; | ||
float4 vFoo; | ||
}; | ||
|
||
struct spvXfbBuffer3 | ||
{ | ||
float4 vBar; | ||
}; | ||
|
||
kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 spvStageInputSize [[grid_size]], device atomic_uint* spvXfbCounter1 [[buffer(17)]], device spvXfbBuffer1* spvXfb1 [[buffer(13)]], device atomic_uint* spvXfbCounter2 [[buffer(18)]], device spvXfbBuffer2* spvXfb2 [[buffer(14)]], device atomic_uint* spvXfbCounter3 [[buffer(19)]], device spvXfbBuffer3* spvXfb3 [[buffer(15)]]) | ||
{ | ||
spvXfbBuffer1 spvXfbOutput1 = {}; | ||
spvXfbBuffer2 spvXfbOutput2 = {}; | ||
spvXfbBuffer3 spvXfbOutput3 = {}; | ||
VertOut _20 = {}; | ||
if (any(gl_GlobalInvocationID >= spvStageInputSize)) | ||
return; | ||
spvXfbOutput1.gl_Position = float4(1.0); | ||
spvXfbOutput2.vFoo = float4(3.0); | ||
_20.vBar = float4(5.0); | ||
spvXfbOutput3.vBar = _20.vBar; | ||
uint spvXfbIndex = gl_GlobalInvocationID.y * (spvStageInputSize.x & ~1u) + gl_GlobalInvocationID.x; | ||
uint spvInitOffset1 = atomic_load_explicit(spvXfbCounter1, memory_order_relaxed); | ||
spvXfb1 = reinterpret_cast<device spvXfbBuffer1*>(reinterpret_cast<device char*>(spvXfb1) + spvInitOffset1); | ||
if ((gl_GlobalInvocationID.x & 1) || gl_GlobalInvocationID.x < spvStageInputSize.x - 1u) | ||
spvXfb1[spvXfbIndex] = spvXfbOutput1; | ||
uint spvInitOffset2 = atomic_load_explicit(spvXfbCounter2, memory_order_relaxed); | ||
spvXfb2 = reinterpret_cast<device spvXfbBuffer2*>(reinterpret_cast<device char*>(spvXfb2) + spvInitOffset2); | ||
if ((gl_GlobalInvocationID.x & 1) || gl_GlobalInvocationID.x < spvStageInputSize.x - 1u) | ||
spvXfb2[spvXfbIndex] = spvXfbOutput2; | ||
uint spvInitOffset3 = atomic_load_explicit(spvXfbCounter3, memory_order_relaxed); | ||
spvXfb3 = reinterpret_cast<device spvXfbBuffer3*>(reinterpret_cast<device char*>(spvXfb3) + spvInitOffset3); | ||
if ((gl_GlobalInvocationID.x & 1) || gl_GlobalInvocationID.x < spvStageInputSize.x - 1u) | ||
spvXfb3[spvXfbIndex] = spvXfbOutput3; | ||
threadgroup_barrier(mem_flags::mem_device); | ||
if (all(gl_GlobalInvocationID.xy == 0)) | ||
{ | ||
uint spvWritten = (spvStageInputSize.x & ~1u) * spvStageInputSize.y; | ||
atomic_store_explicit(spvXfbCounter1, spvInitOffset1 + sizeof(*spvXfb1) * spvWritten, memory_order_relaxed); | ||
atomic_store_explicit(spvXfbCounter2, spvInitOffset2 + sizeof(*spvXfb2) * spvWritten, memory_order_relaxed); | ||
atomic_store_explicit(spvXfbCounter3, spvInitOffset3 + sizeof(*spvXfb3) * spvWritten, memory_order_relaxed); | ||
} | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
#include <metal_stdlib> | ||
#include <simd/simd.h> | ||
|
||
using namespace metal; | ||
|
||
struct VertOut | ||
{ | ||
float4 vBar; | ||
}; | ||
|
||
struct spvXfbBuffer1 | ||
{ | ||
char _m0_pad[4]; | ||
packed_float4 gl_Position; | ||
}; | ||
|
||
struct spvXfbBuffer2 | ||
{ | ||
char _m0_pad[16]; | ||
float4 vFoo; | ||
}; | ||
|
||
struct spvXfbBuffer3 | ||
{ | ||
float4 vBar; | ||
}; | ||
|
||
kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 spvStageInputSize [[grid_size]], device atomic_uint* spvXfbCounter1 [[buffer(17)]], device spvXfbBuffer1* spvXfb1 [[buffer(13)]], device atomic_uint* spvXfbCounter2 [[buffer(18)]], device spvXfbBuffer2* spvXfb2 [[buffer(14)]], device atomic_uint* spvXfbCounter3 [[buffer(19)]], device spvXfbBuffer3* spvXfb3 [[buffer(15)]]) | ||
{ | ||
spvXfbBuffer1 spvXfbOutput1 = {}; | ||
spvXfbBuffer2 spvXfbOutput2 = {}; | ||
spvXfbBuffer3 spvXfbOutput3 = {}; | ||
VertOut _25 = {}; | ||
if (any(gl_GlobalInvocationID >= spvStageInputSize)) | ||
return; | ||
spvXfbOutput1.gl_Position = float4(1.0); | ||
spvXfbOutput2.vFoo = float4(3.0); | ||
_25.vBar = float4(5.0); | ||
spvXfbOutput3.vBar = _25.vBar; | ||
uint spvXfbIndex = 2 * gl_GlobalInvocationID.y * (spvStageInputSize.x - 1u) + 2 * gl_GlobalInvocationID.x; | ||
uint spvInitOffset1 = atomic_load_explicit(spvXfbCounter1, memory_order_relaxed); | ||
spvXfb1 = reinterpret_cast<device spvXfbBuffer1*>(reinterpret_cast<device char*>(spvXfb1) + spvInitOffset1); | ||
if (gl_GlobalInvocationID.x != spvStageInputSize.x - 1u) | ||
spvXfb1[spvXfbIndex] = spvXfbOutput1; | ||
if (gl_GlobalInvocationID.x != 0) | ||
spvXfb1[spvXfbIndex - 1u] = spvXfbOutput1; | ||
uint spvInitOffset2 = atomic_load_explicit(spvXfbCounter2, memory_order_relaxed); | ||
spvXfb2 = reinterpret_cast<device spvXfbBuffer2*>(reinterpret_cast<device char*>(spvXfb2) + spvInitOffset2); | ||
if (gl_GlobalInvocationID.x != spvStageInputSize.x - 1u) | ||
spvXfb2[spvXfbIndex] = spvXfbOutput2; | ||
if (gl_GlobalInvocationID.x != 0) | ||
spvXfb2[spvXfbIndex - 1u] = spvXfbOutput2; | ||
uint spvInitOffset3 = atomic_load_explicit(spvXfbCounter3, memory_order_relaxed); | ||
spvXfb3 = reinterpret_cast<device spvXfbBuffer3*>(reinterpret_cast<device char*>(spvXfb3) + spvInitOffset3); | ||
if (gl_GlobalInvocationID.x != spvStageInputSize.x - 1u) | ||
spvXfb3[spvXfbIndex] = spvXfbOutput3; | ||
if (gl_GlobalInvocationID.x != 0) | ||
spvXfb3[spvXfbIndex - 1u] = spvXfbOutput3; | ||
threadgroup_barrier(mem_flags::mem_device); | ||
if (all(gl_GlobalInvocationID.xy == 0)) | ||
{ | ||
uint spvWritten = 2 * (spvStageInputSize.x - 1u) * spvStageInputSize.y; | ||
atomic_store_explicit(spvXfbCounter1, spvInitOffset1 + sizeof(*spvXfb1) * spvWritten, memory_order_relaxed); | ||
atomic_store_explicit(spvXfbCounter2, spvInitOffset2 + sizeof(*spvXfb2) * spvWritten, memory_order_relaxed); | ||
atomic_store_explicit(spvXfbCounter3, spvInitOffset3 + sizeof(*spvXfb3) * spvWritten, memory_order_relaxed); | ||
} | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
#include <metal_stdlib> | ||
#include <simd/simd.h> | ||
|
||
using namespace metal; | ||
|
||
struct VertOut | ||
{ | ||
float4 vBar; | ||
}; | ||
|
||
struct spvXfbBuffer1 | ||
{ | ||
char _m0_pad[4]; | ||
packed_float4 gl_Position; | ||
}; | ||
|
||
struct spvXfbBuffer2 | ||
{ | ||
char _m0_pad[16]; | ||
float4 vFoo; | ||
}; | ||
|
||
struct spvXfbBuffer3 | ||
{ | ||
float4 vBar; | ||
}; | ||
|
||
kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 spvStageInputSize [[grid_size]], device atomic_uint* spvXfbCounter1 [[buffer(17)]], device spvXfbBuffer1* spvXfb1 [[buffer(13)]], device atomic_uint* spvXfbCounter2 [[buffer(18)]], device spvXfbBuffer2* spvXfb2 [[buffer(14)]], device atomic_uint* spvXfbCounter3 [[buffer(19)]], device spvXfbBuffer3* spvXfb3 [[buffer(15)]]) | ||
{ | ||
spvXfbBuffer1 spvXfbOutput1 = {}; | ||
spvXfbBuffer2 spvXfbOutput2 = {}; | ||
spvXfbBuffer3 spvXfbOutput3 = {}; | ||
VertOut _25 = {}; | ||
if (any(gl_GlobalInvocationID >= spvStageInputSize)) | ||
return; | ||
spvXfbOutput1.gl_Position = float4(1.0); | ||
spvXfbOutput2.vFoo = float4(3.0); | ||
_25.vBar = float4(5.0); | ||
spvXfbOutput3.vBar = _25.vBar; | ||
uint spvXfbIndex = gl_GlobalInvocationID.y * spvStageInputSize.x + gl_GlobalInvocationID.x; | ||
uint spvInitOffset1 = atomic_load_explicit(spvXfbCounter1, memory_order_relaxed); | ||
spvXfb1 = reinterpret_cast<device spvXfbBuffer1*>(reinterpret_cast<device char*>(spvXfb1) + spvInitOffset1); | ||
spvXfb1[spvXfbIndex] = spvXfbOutput1; | ||
uint spvInitOffset2 = atomic_load_explicit(spvXfbCounter2, memory_order_relaxed); | ||
spvXfb2 = reinterpret_cast<device spvXfbBuffer2*>(reinterpret_cast<device char*>(spvXfb2) + spvInitOffset2); | ||
spvXfb2[spvXfbIndex] = spvXfbOutput2; | ||
uint spvInitOffset3 = atomic_load_explicit(spvXfbCounter3, memory_order_relaxed); | ||
spvXfb3 = reinterpret_cast<device spvXfbBuffer3*>(reinterpret_cast<device char*>(spvXfb3) + spvInitOffset3); | ||
spvXfb3[spvXfbIndex] = spvXfbOutput3; | ||
threadgroup_barrier(mem_flags::mem_device); | ||
if (all(gl_GlobalInvocationID.xy == 0)) | ||
{ | ||
uint spvWritten = spvStageInputSize.x * spvStageInputSize.y; | ||
atomic_store_explicit(spvXfbCounter1, spvInitOffset1 + sizeof(*spvXfb1) * spvWritten, memory_order_relaxed); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How is XFB ordering maintained here? XFB data must be emitted in-order with input primitives. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The actual XFB buffers are indexed by the global invocation ID. |
||
atomic_store_explicit(spvXfbCounter2, spvInitOffset2 + sizeof(*spvXfb2) * spvWritten, memory_order_relaxed); | ||
atomic_store_explicit(spvXfbCounter3, spvInitOffset3 + sizeof(*spvXfb3) * spvWritten, memory_order_relaxed); | ||
} | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
#include <metal_stdlib> | ||
#include <simd/simd.h> | ||
|
||
using namespace metal; | ||
|
||
struct VertOut | ||
{ | ||
float4 vBar; | ||
}; | ||
|
||
struct spvXfbBuffer1 | ||
{ | ||
char _m0_pad[4]; | ||
packed_float4 gl_Position; | ||
}; | ||
|
||
struct spvXfbBuffer2 | ||
{ | ||
char _m0_pad[16]; | ||
float4 vFoo; | ||
}; | ||
|
||
struct spvXfbBuffer3 | ||
{ | ||
float4 vBar; | ||
}; | ||
|
||
kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 spvStageInputSize [[grid_size]], device atomic_uint* spvXfbCounter1 [[buffer(17)]], device spvXfbBuffer1* spvXfb1 [[buffer(13)]], device atomic_uint* spvXfbCounter2 [[buffer(18)]], device spvXfbBuffer2* spvXfb2 [[buffer(14)]], device atomic_uint* spvXfbCounter3 [[buffer(19)]], device spvXfbBuffer3* spvXfb3 [[buffer(15)]]) | ||
{ | ||
spvXfbBuffer1 spvXfbOutput1 = {}; | ||
spvXfbBuffer2 spvXfbOutput2 = {}; | ||
spvXfbBuffer3 spvXfbOutput3 = {}; | ||
VertOut _25 = {}; | ||
if (any(gl_GlobalInvocationID >= spvStageInputSize)) | ||
return; | ||
spvXfbOutput1.gl_Position = float4(1.0); | ||
spvXfbOutput2.vFoo = float4(3.0); | ||
_25.vBar = float4(5.0); | ||
spvXfbOutput3.vBar = _25.vBar; | ||
uint spvXfbBaseIndex = 3 * gl_GlobalInvocationID.y * subsat(spvStageInputSize.x, 2u); | ||
uint spvXfbIndex = spvXfbBaseIndex + 3 * gl_GlobalInvocationID.x - 2u; | ||
uint spvInitOffset1 = atomic_load_explicit(spvXfbCounter1, memory_order_relaxed); | ||
spvXfb1 = reinterpret_cast<device spvXfbBuffer1*>(reinterpret_cast<device char*>(spvXfb1) + spvInitOffset1); | ||
if (gl_GlobalInvocationID.x == 0) | ||
{ | ||
for (uint i = 0; i < subsat(spvStageInputSize.x, 2u); ++i) | ||
spvXfb1[spvXfbBaseIndex + 3 * i] = spvXfbOutput1; | ||
} | ||
else | ||
{ | ||
if (gl_GlobalInvocationID.x != spvStageInputSize.x - 1u) | ||
spvXfb1[spvXfbIndex] = spvXfbOutput1; | ||
if (gl_GlobalInvocationID.x != 1) | ||
spvXfb1[spvXfbIndex - 2u] = spvXfbOutput1; | ||
} | ||
uint spvInitOffset2 = atomic_load_explicit(spvXfbCounter2, memory_order_relaxed); | ||
spvXfb2 = reinterpret_cast<device spvXfbBuffer2*>(reinterpret_cast<device char*>(spvXfb2) + spvInitOffset2); | ||
if (gl_GlobalInvocationID.x == 0) | ||
{ | ||
for (uint i = 0; i < subsat(spvStageInputSize.x, 2u); ++i) | ||
spvXfb2[spvXfbBaseIndex + 3 * i] = spvXfbOutput2; | ||
} | ||
else | ||
{ | ||
if (gl_GlobalInvocationID.x != spvStageInputSize.x - 1u) | ||
spvXfb2[spvXfbIndex] = spvXfbOutput2; | ||
if (gl_GlobalInvocationID.x != 1) | ||
spvXfb2[spvXfbIndex - 2u] = spvXfbOutput2; | ||
} | ||
uint spvInitOffset3 = atomic_load_explicit(spvXfbCounter3, memory_order_relaxed); | ||
spvXfb3 = reinterpret_cast<device spvXfbBuffer3*>(reinterpret_cast<device char*>(spvXfb3) + spvInitOffset3); | ||
if (gl_GlobalInvocationID.x == 0) | ||
{ | ||
for (uint i = 0; i < subsat(spvStageInputSize.x, 2u); ++i) | ||
spvXfb3[spvXfbBaseIndex + 3 * i] = spvXfbOutput3; | ||
} | ||
else | ||
{ | ||
if (gl_GlobalInvocationID.x != spvStageInputSize.x - 1u) | ||
spvXfb3[spvXfbIndex] = spvXfbOutput3; | ||
if (gl_GlobalInvocationID.x != 1) | ||
spvXfb3[spvXfbIndex - 2u] = spvXfbOutput3; | ||
} | ||
threadgroup_barrier(mem_flags::mem_device); | ||
if (all(gl_GlobalInvocationID.xy == 0)) | ||
{ | ||
uint spvWritten = 3 * subsat(spvStageInputSize.x, 2u) * spvStageInputSize.y; | ||
atomic_store_explicit(spvXfbCounter1, spvInitOffset1 + sizeof(*spvXfb1) * spvWritten, memory_order_relaxed); | ||
atomic_store_explicit(spvXfbCounter2, spvInitOffset2 + sizeof(*spvXfb2) * spvWritten, memory_order_relaxed); | ||
atomic_store_explicit(spvXfbCounter3, spvInitOffset3 + sizeof(*spvXfb3) * spvWritten, memory_order_relaxed); | ||
} | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This breaks threadgroup_barrier.