Skip to content

Commit

Permalink
test: reducer CUDA kernel tests (#3162)
Browse files Browse the repository at this point in the history
* feat: add tree reduction implementation of argmin and argmax

* feat: add awkward_ListOffsetArray_reduce_local_outoffsets_64 kernel

* test: integration tests for cuda

* test: some more integration tests for cuda

* feat: add awkward_reduce_count_64 kernel

* fix: indexing and indentation

* feat: add awkward_reduce_countnonzero kernel

* feat: add reduce sum, min and max kernels

* feat: add reduce prod and sum_int_bool

* feat: add sum_bool and prod_bool kernels

* fix: use cpt.assert_allclose

* test: reducer integration tests

* fix: typr conversion

* fix: use atomic to avoid race conditions

* fix: remove unnessary variable

* fix: minor fixes

* fix: all reducer for atomics

* fix: missing template

* fix: remove complex

* fix: atomicMin() for float 32 and indentation

* fix: pass correct dtype of identity

* fix: remove combinations test

* fix: manage resources and disable failing test

* fix: uncomment fixed test for slicing

* fix: correctly interpret typetracer array for cuda backend

* fix: tests-spec error for bool

* fix: check for the backend of head

* test: reducer CUDAkernel tests

* test: add more reducer tests

* test: add more reducer tests 2

* fix: error for EmptyArray

* test: generic_reducer_operation and block_boundary

* Update dev/generate-tests.py

Co-authored-by: Ianna Osborne <[email protected]>

---------

Co-authored-by: Ianna Osborne <[email protected]>
  • Loading branch information
ManasviGoyal and ianna authored Jun 25, 2024
1 parent ae1ba10 commit a1da072
Show file tree
Hide file tree
Showing 15 changed files with 1,411 additions and 121 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -59,17 +59,17 @@ awkward_ListOffsetArray_reduce_local_outoffsets_64_b(
}
__syncthreads();

for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
int64_t val = 0;
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
val = temp[thread_id - stride];
if (thread_id < lenparents) {
for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
int64_t val = 0;
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
val = temp[thread_id - stride];
}
__syncthreads();
temp[thread_id] += val;
__syncthreads();
}
__syncthreads();
temp[thread_id] += val;
__syncthreads();
}

if (thread_id < lenparents) {
int64_t parent = parents[thread_id];
if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
atomicAdd(&scan_in_array[parent], temp[thread_id]);
Expand Down
22 changes: 11 additions & 11 deletions src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu
Original file line number Diff line number Diff line change
Expand Up @@ -59,19 +59,19 @@ awkward_reduce_argmax_b(
}
__syncthreads();

for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
int64_t index = -1;
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
index = temp[thread_id - stride];
}
if (index != -1 && (temp[thread_id] == -1 || fromptr[index] > fromptr[temp[thread_id]] ||
(fromptr[index] == fromptr[temp[thread_id]] && index < temp[thread_id]))) {
temp[thread_id] = index;
if (thread_id < lenparents) {
for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
int64_t index = -1;
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
index = temp[thread_id - stride];
}
if (index != -1 && (temp[thread_id] == -1 || fromptr[index] > fromptr[temp[thread_id]] ||
(fromptr[index] == fromptr[temp[thread_id]] && index < temp[thread_id]))) {
temp[thread_id] = index;
}
__syncthreads();
}
__syncthreads();
}

if (thread_id < lenparents) {
int64_t parent = parents[thread_id];
if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
atomicExch(&atomic_toptr[parent], temp[thread_id]);
Expand Down
22 changes: 11 additions & 11 deletions src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu
Original file line number Diff line number Diff line change
Expand Up @@ -59,19 +59,19 @@ awkward_reduce_argmin_b(
}
__syncthreads();

for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
int64_t index = -1;
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
index = temp[thread_id - stride];
}
if (index != -1 && (temp[thread_id] == -1 || fromptr[index] < fromptr[temp[thread_id]] ||
(fromptr[index] == fromptr[temp[thread_id]] && index < temp[thread_id]))) {
temp[thread_id] = index;
if (thread_id < lenparents) {
for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
int64_t index = -1;
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
index = temp[thread_id - stride];
}
if (index != -1 && (temp[thread_id] == -1 || fromptr[index] < fromptr[temp[thread_id]] ||
(fromptr[index] == fromptr[temp[thread_id]] && index < temp[thread_id]))) {
temp[thread_id] = index;
}
__syncthreads();
}
__syncthreads();
}

if (thread_id < lenparents) {
int64_t parent = parents[thread_id];
if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
atomicExch(&atomic_toptr[parent], temp[thread_id]);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,17 +52,17 @@ awkward_reduce_count_64_b(
}
__syncthreads();

for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
int64_t val = 0;
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
val = temp[thread_id - stride];
if (thread_id < lenparents) {
for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
int64_t val = 0;
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
val = temp[thread_id - stride];
}
__syncthreads();
temp[thread_id] += val;
__syncthreads();
}
__syncthreads();
temp[thread_id] += val;
__syncthreads();
}

if (thread_id < lenparents) {
int64_t parent = parents[thread_id];
if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
atomicAdd(&toptr[parent], temp[thread_id]);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,17 +54,17 @@ awkward_reduce_countnonzero_b(
}
__syncthreads();

for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
int64_t val = 0;
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
val = temp[thread_id - stride];
if (thread_id < lenparents) {
for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
int64_t val = 0;
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
val = temp[thread_id - stride];
}
__syncthreads();
temp[thread_id] += val;
__syncthreads();
}
__syncthreads();
temp[thread_id] += val;
__syncthreads();
}

if (thread_id < lenparents) {
int64_t parent = parents[thread_id];
if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
atomicAdd(&toptr[parent], temp[thread_id]);
Expand Down
18 changes: 9 additions & 9 deletions src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu
Original file line number Diff line number Diff line change
Expand Up @@ -55,18 +55,18 @@ awkward_reduce_max_b(
}
__syncthreads();

for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
T val = identity;
if (thread_id < lenparents) {
for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
T val = identity;

if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
val = temp[idx - stride];
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
val = temp[idx - stride];
}
__syncthreads();
temp[thread_id] = val > temp[thread_id] ? val : temp[thread_id];
__syncthreads();
}
__syncthreads();
temp[thread_id] = val > temp[thread_id] ? val : temp[thread_id];
__syncthreads();
}

if (thread_id < lenparents) {
int64_t parent = parents[thread_id];
if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
atomicMax(&toptr[parent], temp[thread_id]);
Expand Down
18 changes: 9 additions & 9 deletions src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_min.cu
Original file line number Diff line number Diff line change
Expand Up @@ -56,17 +56,17 @@ awkward_reduce_min_b(
}
__syncthreads();

for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
T val = identity;
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
val = temp[thread_id - stride];
if (thread_id < lenparents) {
for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
T val = identity;
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
val = temp[thread_id - stride];
}
__syncthreads();
temp[thread_id] = val < temp[thread_id] ? val : temp[thread_id];
__syncthreads();
}
__syncthreads();
temp[thread_id] = val < temp[thread_id] ? val : temp[thread_id];
__syncthreads();
}

if (thread_id < lenparents) {
int64_t parent = parents[thread_id];
if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
atomicMin(&toptr[parent], temp[thread_id]);
Expand Down
18 changes: 9 additions & 9 deletions src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod.cu
Original file line number Diff line number Diff line change
Expand Up @@ -59,17 +59,17 @@ awkward_reduce_prod_b(
}
__syncthreads();

for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
T val = 1;
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
val = temp[thread_id - stride];
if (thread_id < lenparents) {
for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
T val = 1;
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
val = temp[thread_id - stride];
}
__syncthreads();
temp[thread_id] *= val;
__syncthreads();
}
__syncthreads();
temp[thread_id] *= val;
__syncthreads();
}

if (thread_id < lenparents) {
int64_t parent = parents[thread_id];
if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
atomicMul(&atomic_toptr[parent], temp[thread_id]);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,17 +59,17 @@ awkward_reduce_prod_bool_b(
}
__syncthreads();

for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
T val = 1;
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
val = temp[thread_id - stride];
if (thread_id < lenparents) {
for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
T val = 1;
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
val = temp[thread_id - stride];
}
__syncthreads();
temp[thread_id] &= (val != 0);
__syncthreads();
}
__syncthreads();
temp[thread_id] &= (val != 0);
__syncthreads();
}

if (thread_id < lenparents) {
int64_t parent = parents[thread_id];
if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
atomicAnd(&atomic_toptr[parent], temp[thread_id]);
Expand Down
18 changes: 9 additions & 9 deletions src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum.cu
Original file line number Diff line number Diff line change
Expand Up @@ -54,17 +54,17 @@ awkward_reduce_sum_b(
}
__syncthreads();

for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
T val = 0;
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
val = temp[thread_id - stride];
if (thread_id < lenparents) {
for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
T val = 0;
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
val = temp[thread_id - stride];
}
__syncthreads();
temp[thread_id] += val;
__syncthreads();
}
__syncthreads();
temp[thread_id] += val;
__syncthreads();
}

if (thread_id < lenparents) {
int64_t parent = parents[thread_id];
if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
atomicAdd(&toptr[parent], temp[thread_id]);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,17 +59,17 @@ awkward_reduce_sum_bool_b(
}
__syncthreads();

for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
T val = 0;
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
val = temp[thread_id - stride];
if (thread_id < lenparents) {
for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
T val = 0;
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
val = temp[thread_id - stride];
}
__syncthreads();
temp[thread_id] |= (val != 0);
__syncthreads();
}
__syncthreads();
temp[thread_id] |= (val != 0);
__syncthreads();
}

if (thread_id < lenparents) {
int64_t parent = parents[thread_id];
if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
atomicOr(&atomic_toptr[parent], temp[thread_id]);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,17 +54,17 @@ awkward_reduce_sum_int32_bool_64_b(
}
__syncthreads();

for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
T val = 0;
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
val = temp[thread_id - stride];
if (thread_id < lenparents) {
for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
T val = 0;
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
val = temp[thread_id - stride];
}
__syncthreads();
temp[thread_id] += val;
__syncthreads();
}
__syncthreads();
temp[thread_id] += val;
__syncthreads();
}

if (thread_id < lenparents) {
int64_t parent = parents[thread_id];
if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
atomicAdd(&toptr[parent], temp[thread_id]);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,17 +54,17 @@ awkward_reduce_sum_int64_bool_64_b(
}
__syncthreads();

for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
T val = 0;
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
val = temp[thread_id - stride];
if (thread_id < lenparents) {
for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
T val = 0;
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
val = temp[thread_id - stride];
}
__syncthreads();
temp[thread_id] += val;
__syncthreads();
}
__syncthreads();
temp[thread_id] += val;
__syncthreads();
}

if (thread_id < lenparents) {
int64_t parent = parents[thread_id];
if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
atomicAdd(&toptr[parent], temp[thread_id]);
Expand Down
Loading

0 comments on commit a1da072

Please sign in to comment.