vulkan: Use push constant offset to handle misaligned descriptors #10987

Merged · 1 commit · Dec 29, 2024
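
Summary of the change: instead of requiring every tensor's buffer offset to already be a multiple of minStorageBufferOffsetAlignment, descriptors are now bound at the offset rounded down to that alignment, and the leftover per-tensor misalignment (converted to elements) is passed to the shaders through the push constant field renamed from d_offset to misalign_offsets. Below is a standalone C++ sketch of the bit layout implied by the diff; the pack_binary/pack_unary helpers are invented for this illustration and are not part of ggml.

#include <cassert>
#include <cstdint>

// Binary ops (see generic_binary_head.comp): a_offset in bits 31..16,
// b_offset in bits 15..8, d_offset in bits 7..0.
static uint32_t pack_binary(uint32_t a, uint32_t b, uint32_t d) {
    return (a << 16) | (b << 8) | d;
}

// Unary ops (see generic_unary_head.comp): a_offset in bits 31..16, d_offset in bits 15..0.
static uint32_t pack_unary(uint32_t a, uint32_t d) {
    return (a << 16) | d;
}

int main() {
    const uint32_t bin = pack_binary(3, 2, 1);
    assert((bin >> 16) == 3);          // matches get_aoffset()
    assert(((bin >> 8) & 0xFF) == 2);  // matches get_boffset()
    assert((bin & 0xFF) == 1);         // matches get_doffset()

    const uint32_t un = pack_unary(5, 4);
    assert((un >> 16) == 5);           // matches get_aoffset()
    assert((un & 0xFFFF) == 4);        // matches get_doffset()
    return 0;
}

The fields stay in range because a misalignment is always smaller than the alignment itself (offset & (align - 1) < align), it only shrinks when divided by the element size, and Vulkan caps minStorageBufferOffsetAlignment at 256 bytes, so even the 8-bit b/d fields of the binary layout suffice.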
74 changes: 64 additions & 10 deletions ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -411,7 +411,7 @@ struct vk_op_unary_push_constants {
uint32_t ne;
uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
uint32_t d_offset;
uint32_t misalign_offsets;
float param1; float param2;
uint32_t ne0_012mp; uint32_t ne0_012L;
uint32_t ne0_01mp; uint32_t ne0_01L;
@@ -459,7 +459,7 @@ struct vk_op_binary_push_constants {
uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
uint32_t ne20; uint32_t ne21; uint32_t ne22; uint32_t ne23; uint32_t nb20; uint32_t nb21; uint32_t nb22; uint32_t nb23;
uint32_t d_offset;
uint32_t misalign_offsets;
float param1; float param2; int32_t param3;
};

@@ -546,7 +546,7 @@ struct vk_staging_memcpy {
};

struct vk_op_upscale_push_constants {
uint32_t ne; uint32_t d_offset;
uint32_t ne; uint32_t a_offset; uint32_t d_offset;
uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13;
float sf0; float sf1; float sf2; float sf3;
@@ -5076,6 +5076,57 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
}
}

static uint32_t get_misalign_bytes(ggml_backend_vk_context * ctx, const ggml_tensor * t)
{
return ((vk_tensor_offset(t) + t->view_offs) & (ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1));
}
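
For concreteness, a small worked example of the bit arithmetic above and of the complementary rounding-down applied later in ggml_vk_op_f32; the alignment of 64 and the byte offset 0x1234 are made-up values, the real ones come from the device limits and from vk_tensor_offset(t) + t->view_offs.

#include <cassert>
#include <cstdint>

int main() {
    const uint64_t align  = 64;      // illustrative minStorageBufferOffsetAlignment (always a power of two)
    const uint64_t offset = 0x1234;  // illustrative tensor byte offset within the buffer

    const uint64_t misalign_bytes = offset & (align - 1);   // what get_misalign_bytes() computes
    const uint64_t aligned_base   = offset & ~(align - 1);  // how the descriptor offset is aligned down

    assert(misalign_bytes == 0x34);
    assert(aligned_base == 0x1200);
    assert(aligned_base + misalign_bytes == offset);  // the shader-side offset restores the original address

    // The push constant carries the misalignment in elements, e.g. 13 for a 4-byte (F32) type:
    assert(misalign_bytes / 4 == 13);
    return 0;
}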

template <typename T> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, T &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
GGML_UNUSED(p);
GGML_UNUSED(src0);
GGML_UNUSED(src1);
GGML_UNUSED(src2);
GGML_UNUSED(dst);
static_assert(!std::is_const<T>::value, "unexpected type");
GGML_ASSERT(!src0 || get_misalign_bytes(ctx, src0) == 0);
GGML_ASSERT(!src1 || get_misalign_bytes(ctx, src1) == 0);
GGML_ASSERT(!src2 || get_misalign_bytes(ctx, src2) == 0);
GGML_ASSERT(!dst || get_misalign_bytes(ctx, dst) == 0);
}

template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_unary_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);

p.misalign_offsets = (a_offset << 16) | d_offset;

GGML_UNUSED(src1);
GGML_UNUSED(src2);
}

template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_binary_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
const uint32_t b_offset = get_misalign_bytes(ctx, src1) / ggml_type_size(src1->type);
const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);

GGML_ASSERT(dst->op != GGML_OP_GET_ROWS || (a_offset == 0 && b_offset == 0 && d_offset == 0));

p.misalign_offsets = (a_offset << 16) | (b_offset << 8) | d_offset;

GGML_UNUSED(src2);
}

template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_upscale_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);

p.a_offset = a_offset;
p.d_offset = d_offset;

GGML_UNUSED(src1);
GGML_UNUSED(src2);
}

template<typename PC>
static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, PC&& pc, bool dryrun = false) {
VK_LOG_DEBUG("ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
@@ -5179,8 +5230,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
}

GGML_ASSERT(d_D != nullptr);
uint64_t d_buf_offset = ((vk_tensor_offset(dst) + dst->view_offs) / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
GGML_ASSERT(d_buf_offset == vk_tensor_offset(dst) || op == GGML_OP_CPY); // NOLINT
uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs;
if(!src0_uma) {
d_X = src0_buf_ctx->dev_buffer;
x_buf_offset = vk_tensor_offset(src0) + src0->view_offs;
@@ -5196,6 +5246,12 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
z_buf_offset = vk_tensor_offset(src2) + src2->view_offs;
GGML_ASSERT(d_Z != nullptr);
}
// Compute misalignment offset for descriptors and store it in push constants, then align the descriptor offsets.
init_pushconst_tensor_offsets(ctx, pc, src0, src1, src2, dst);
x_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
y_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
z_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
d_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);

if (op_supports_incontiguous) {
x_sz = ggml_nbytes(src0);
@@ -5383,7 +5439,6 @@ static void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t src1_type_size = ggml_type_size(src1->type);
const uint32_t dst_type_size = ggml_type_size(dst->type);
const uint32_t d_offset = ((vk_tensor_offset(dst) + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;

int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
@@ -5395,7 +5450,7 @@ static void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t)src0->nb[3] / src0_type_size,
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t) dst->nb[3] / dst_type_size,
d_offset,
0,
0.0f, 0.0f, offset,
}, dryrun);
}
@@ -5599,7 +5654,7 @@ static void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, c
const float sf3 = (float)dst->ne[3] / src0->ne[3];

ggml_vk_op_f32<vk_op_upscale_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UPSCALE, {
(uint32_t)ggml_nelements(dst), 0,
(uint32_t)ggml_nelements(dst), 0, 0,
(uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
(uint32_t)dst->ne[0], (uint32_t)dst->ne[1], (uint32_t)dst->ne[2],(uint32_t)dst->ne[3],
sf0, sf1, sf2, sf3,
@@ -5709,13 +5764,12 @@ static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, co
static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t dst_type_size = ggml_type_size(dst->type);
const uint32_t d_offset = ((vk_tensor_offset(dst) + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;

ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, {
(uint32_t)ggml_nelements(src0),
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
d_offset,
0,
0.0f, 0.0f,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
}, dryrun);
4 changes: 2 additions & 2 deletions ggml/src/ggml-vulkan/vulkan-shaders/acc.comp
@@ -21,9 +21,9 @@ void main() {
get_indices(idx, i00, i01, i02, i03);

if (ox < p.ne10 && oy < p.ne11 && oz < p.ne12) {
data_d[p.d_offset + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(i00, i01, i02, i03)]) + FLOAT_TYPE(data_b[ox + oy * p.ne10 + oz * p.ne10 * p.ne11]));
data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]) + FLOAT_TYPE(data_b[get_boffset() + ox + oy * p.ne10 + oz * p.ne10 * p.ne11]));
} else {
data_d[p.d_offset + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(i00, i01, i02, i03)]));
data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]));
}
}

2 changes: 1 addition & 1 deletion ggml/src/ggml-vulkan/vulkan-shaders/add.comp
@@ -22,7 +22,7 @@ void main() {
uint i00, i01, i02, i03;
get_indices(idx, i00, i01, i02, i03);

data_d[p.d_offset + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(i00, i01, i02, i03)]) + FLOAT_TYPE(data_b[src1_idx(i00, i01, i02, i03)]));
data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]) + FLOAT_TYPE(data_b[get_boffset() + src1_idx(i00, i01, i02, i03)]));

idx += num_threads;
}
4 changes: 2 additions & 2 deletions ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp
@@ -12,6 +12,6 @@ void main() {
return;
}

const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(val < p.param1 ? p.param1 : (val > p.param2 ? p.param2 : val));
const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(val < p.param1 ? p.param1 : (val > p.param2 ? p.param2 : val));
}
6 changes: 3 additions & 3 deletions ggml/src/ggml-vulkan/vulkan-shaders/concat.comp
@@ -30,12 +30,12 @@ void main() {
const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03;

#ifndef OPTIMIZATION_ERROR_WORKAROUND
data_d[p.d_offset + dst_idx] = D_TYPE(is_src0 ? data_a[src0_idx] : data_b[src1_idx]);
data_d[get_doffset() + dst_idx] = D_TYPE(is_src0 ? data_a[get_aoffset() + src0_idx] : data_b[get_boffset() + src1_idx]);
#else
if (is_src0) {
data_d[p.d_offset + dst_idx] = data_a[src0_idx];
data_d[get_doffset() + dst_idx] = data_a[get_aoffset() + src0_idx];
} else {
data_d[p.d_offset + dst_idx] = data_b[src1_idx];
data_d[get_doffset() + dst_idx] = data_b[get_boffset() + src1_idx];
}
#endif
}
8 changes: 4 additions & 4 deletions ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp
@@ -19,9 +19,9 @@ void main() {
if (idx + (num_iter-1)*num_threads < p.ne) {
[[unroll]] for (uint i = 0; i < num_iter; ++i) {
#ifndef OPTIMIZATION_ERROR_WORKAROUND
data_d[p.d_offset + idx] = D_TYPE(data_a[idx]);
data_d[get_doffset() + idx] = D_TYPE(data_a[get_aoffset() + idx]);
#else
data_d[p.d_offset + idx] = data_a[idx];
data_d[get_doffset() + idx] = data_a[get_aoffset() + idx];
#endif
idx += num_threads;
}
@@ -32,9 +32,9 @@
}

#ifndef OPTIMIZATION_ERROR_WORKAROUND
data_d[p.d_offset + idx] = D_TYPE(data_a[idx]);
data_d[get_doffset() + idx] = D_TYPE(data_a[get_aoffset() + idx]);
#else
data_d[p.d_offset + idx] = data_a[idx];
data_d[get_doffset() + idx] = data_a[get_aoffset() + idx];
#endif
idx += num_threads;
}
4 changes: 2 additions & 2 deletions ggml/src/ggml-vulkan/vulkan-shaders/copy.comp
@@ -13,8 +13,8 @@ void main() {
}

#ifndef OPTIMIZATION_ERROR_WORKAROUND
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(data_a[src0_idx(idx)]);
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
#else
data_d[p.d_offset + dst_idx(idx)] = data_a[src0_idx(idx)];
data_d[get_doffset() + dst_idx(idx)] = data_a[get_aoffset() + src0_idx(idx)];
#endif
}
4 changes: 2 additions & 2 deletions ggml/src/ggml-vulkan/vulkan-shaders/cos.comp
@@ -12,6 +12,6 @@ void main() {
return;
}

const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(cos(val));
const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(cos(val));
}
2 changes: 1 addition & 1 deletion ggml/src/ggml-vulkan/vulkan-shaders/div.comp
@@ -20,7 +20,7 @@ void main() {
uint i00, i01, i02, i03;
get_indices(idx, i00, i01, i02, i03);

data_d[p.d_offset + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(i00, i01, i02, i03)]) / FLOAT_TYPE(data_b[src1_idx(i00, i01, i02, i03)]));
data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]) / FLOAT_TYPE(data_b[get_boffset() + src1_idx(i00, i01, i02, i03)]));

idx += num_threads;
}
6 changes: 5 additions & 1 deletion ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp
@@ -7,7 +7,7 @@ layout (push_constant) uniform parameter
uint ne00; uint ne01; uint ne02; uint ne03; uint nb00; uint nb01; uint nb02; uint nb03;
uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
uint ne20; uint ne21; uint ne22; uint ne23; uint nb20; uint nb21; uint nb22; uint nb23;
uint d_offset;
uint misalign_offsets;
float param1; float param2; int param3;
} p;

@@ -22,6 +22,10 @@ uint get_idx() {
return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
}

uint get_aoffset() { return p.misalign_offsets >> 16; }
uint get_boffset() { return (p.misalign_offsets >> 8) & 0xFF; }
uint get_doffset() { return p.misalign_offsets & 0xFF; }

// mod and div are expensive and coordinates/dimensions are often power of 2 or equal to 1
uint fastmod(uint a, uint b) {
if ((b & (b-1)) == 0) {
5 changes: 4 additions & 1 deletion ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.comp
@@ -6,7 +6,7 @@ layout (push_constant) uniform parameter
uint ne;
uint ne00; uint ne01; uint ne02; uint ne03; uint nb00; uint nb01; uint nb02; uint nb03;
uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
uint d_offset;
uint misalign_offsets;
float param1; float param2;

uint ne0_012mp; uint ne0_012L;
@@ -24,6 +24,9 @@ uint get_idx() {
return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
}

uint get_aoffset() { return p.misalign_offsets >> 16; }
uint get_doffset() { return p.misalign_offsets & 0xFFFF; }

// see init_fastdiv_values in ggml-vulkan.cpp
uint fastdiv(uint n, uint mp, uint L) {
uint msbs, lsbs;
6 changes: 3 additions & 3 deletions ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp
@@ -15,10 +15,10 @@ void main() {
return;
}

const uint i01 = data_b[i10*p.nb10 + i11*p.nb11 + i12*p.nb12];
const uint i01 = data_b[get_boffset() + i10*p.nb10 + i11*p.nb11 + i12*p.nb12];

const uint a_offset = i01*p.nb01 + i11*p.nb02 + i12*p.nb03;
const uint d_offset = i10*p.nb21 + i11*p.nb22 + i12*p.nb23;
const uint a_offset = get_aoffset() + i01*p.nb01 + i11*p.nb02 + i12*p.nb03;
const uint d_offset = get_doffset() + i10*p.nb21 + i11*p.nb22 + i12*p.nb23;

#ifndef OPTIMIZATION_ERROR_WORKAROUND
data_d[d_offset + i00] = D_TYPE(data_a[a_offset + i00]);
2 changes: 1 addition & 1 deletion ggml/src/ggml-vulkan/vulkan-shaders/mul.comp
@@ -20,7 +20,7 @@ void main() {
uint i00, i01, i02, i03;
get_indices(idx, i00, i01, i02, i03);

data_d[p.d_offset + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(i00, i01, i02, i03)]) * FLOAT_TYPE(data_b[src1_idx(i00, i01, i02, i03)]));
data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]) * FLOAT_TYPE(data_b[get_boffset() + src1_idx(i00, i01, i02, i03)]));

idx += num_threads;
}
2 changes: 1 addition & 1 deletion ggml/src/ggml-vulkan/vulkan-shaders/pad.comp
@@ -24,5 +24,5 @@ void main() {

const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03;

data_d[p.d_offset + dst_idx] = D_TYPE(is_src0 ? data_a[src0_idx] : 0.0f);
data_d[get_doffset() + dst_idx] = D_TYPE(is_src0 ? data_a[get_aoffset() + src0_idx] : 0.0f);
}
2 changes: 1 addition & 1 deletion ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp
@@ -22,5 +22,5 @@ void main() {
return;
}

data_d[p.d_offset + dst_idx(idx)] = D_TYPE(data_a[src0_idx_mod(idx)]);
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(data_a[get_aoffset() + src0_idx_mod(idx)]);
}
2 changes: 1 addition & 1 deletion ggml/src/ggml-vulkan/vulkan-shaders/scale.comp
@@ -18,7 +18,7 @@ void main() {
continue;
}

data_d[p.d_offset + idx] = D_TYPE(FLOAT_TYPE(data_a[idx]) * FLOAT_TYPE(p.param1));
data_d[get_doffset() + idx] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + idx]) * FLOAT_TYPE(p.param1));
idx += num_threads;
}
}
4 changes: 2 additions & 2 deletions ggml/src/ggml-vulkan/vulkan-shaders/sin.comp
@@ -12,6 +12,6 @@ void main() {
return;
}

const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(sin(val));
const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(sin(val));
}
4 changes: 2 additions & 2 deletions ggml/src/ggml-vulkan/vulkan-shaders/square.comp
@@ -12,6 +12,6 @@ void main() {
return;
}

const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(val * val);
const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(val * val);
}
4 changes: 2 additions & 2 deletions ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp
@@ -2,7 +2,7 @@

layout (push_constant) uniform parameter
{
uint ne; uint d_offset;
uint ne; uint a_offset; uint d_offset;
uint nb00; uint nb01; uint nb02; uint nb03;
uint ne10; uint ne11; uint ne12; uint ne13;
float sf0; float sf1; float sf2; float sf3;
@@ -32,5 +32,5 @@ void main() {
const uint i02 = uint(i12 / p.sf2);
const uint i03 = uint(i13 / p.sf3);

data_d[p.d_offset + idx] = D_TYPE(data_a[i03 * p.nb03 + i02 * p.nb02 + i01 * p.nb01 + i00 * p.nb00]);
data_d[p.d_offset + idx] = D_TYPE(data_a[p.a_offset + i03 * p.nb03 + i02 * p.nb02 + i01 * p.nb01 + i00 * p.nb00]);
}